{"id":3774,"date":"2023-01-24T10:06:02","date_gmt":"2023-01-24T08:06:02","guid":{"rendered":"https:\/\/gaz-temporal.i3a.es\/?p=3774"},"modified":"2023-01-24T10:12:04","modified_gmt":"2023-01-24T08:12:04","slug":"dario-suarez-gracia","status":"publish","type":"post","link":"https:\/\/gaz.i3a.es\/es\/dario-suarez-gracia\/","title":{"rendered":"Dar\u00edo Su\u00e1rez Gracia"},"content":{"rendered":"<div id=\"pl-gb3774-6a30fb9894c0d\"  class=\"panel-layout\" ><div id=\"pg-gb3774-6a30fb9894c0d-0\"  class=\"panel-grid panel-has-style\" ><div class=\"siteorigin-panels-stretch panel-row-style panel-row-style-for-gb3774-6a30fb9894c0d-0\" data-stretch-type=\"full-width-stretch\" ><div id=\"pgc-gb3774-6a30fb9894c0d-0-0\"  class=\"panel-grid-cell\" ><div id=\"panel-gb3774-6a30fb9894c0d-0-0-0\" class=\"so-panel widget widget_sow-hero panel-first-child panel-last-child\" data-index=\"0\" ><div\n\t\t\t\n\t\t\tclass=\"so-widget-sow-hero so-widget-sow-hero-default-93415d0e2dbf-3774 so-widget-fittext-wrapper\"\n\t\t\t data-fit-text-compressor=\"0.85\"\n\t\t>\t\t\t\t<div class=\"sow-slider-base\" style=\"display: none\" tabindex=\"0\">\n\t\t\t\t\t<ul\n\t\t\t\t\tclass=\"sow-slider-images\"\n\t\t\t\t\tdata-settings=\"{&quot;pagination&quot;:true,&quot;speed&quot;:800,&quot;timeout&quot;:8000,&quot;paused&quot;:false,&quot;pause_on_hover&quot;:false,&quot;swipe&quot;:true,&quot;nav_always_show_desktop&quot;:&quot;&quot;,&quot;nav_always_show_mobile&quot;:&quot;&quot;,&quot;breakpoint&quot;:&quot;780px&quot;,&quot;unmute&quot;:false,&quot;anchor&quot;:null}\"\n\t\t\t\t\t\t\t\t\t\tdata-anchor-id=\"\"\n\t\t\t\t>\t\t<li class=\"sow-slider-image\" style=\"visibility: visible;;background-color: #1e73be\" >\n\t\t\t\t\t<div class=\"sow-slider-image-container\">\n\t\t\t<div class=\"sow-slider-image-wrapper\">\n\t\t\t\t<h3 style=\"text-align: center\"><a href=\"..\/team\/\">Investigadores<\/a><\/h3>\n<h1 style=\"text-align: center\"><strong>Dar\u00edo Su\u00e1rez 
Gracia<\/strong><\/h1>\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<\/li>\n\t\t<\/ul>\t\t\t\t<ol class=\"sow-slider-pagination\">\n\t\t\t\t\t\t\t\t\t\t\t<li><a href=\"#\" data-goto=\"0\" aria-label=\"mostrar diapositiva 1\"><\/a><\/li>\n\t\t\t\t\t\t\t\t\t<\/ol>\n\n\t\t\t\t<div class=\"sow-slide-nav sow-slide-nav-next\">\n\t\t\t\t\t<a href=\"#\" data-goto=\"next\" aria-label=\"diapositiva siguiente\" data-action=\"next\">\n\t\t\t\t\t\t<em class=\"sow-sld-icon-thin-right\"><\/em>\n\t\t\t\t\t<\/a>\n\t\t\t\t<\/div>\n\n\t\t\t\t<div class=\"sow-slide-nav sow-slide-nav-prev\">\n\t\t\t\t\t<a href=\"#\" data-goto=\"previous\" aria-label=\"diapositiva anterior\" data-action=\"prev\">\n\t\t\t\t\t\t<em class=\"sow-sld-icon-thin-left\"><\/em>\n\t\t\t\t\t<\/a>\n\t\t\t\t<\/div>\n\t\t\t\t<\/div><\/div><\/div><\/div><\/div><\/div><\/div>\n\n<div id=\"pl-gb3774-6a30fb989574b\"  class=\"panel-layout\" ><div id=\"pg-gb3774-6a30fb989574b-0\"  class=\"panel-grid panel-no-style\" ><div id=\"pgc-gb3774-6a30fb989574b-0-0\"  class=\"panel-grid-cell\" ><div id=\"panel-gb3774-6a30fb989574b-0-0-0\" class=\"so-panel widget widget_sow-image panel-first-child panel-last-child\" data-index=\"0\" ><div\n\t\t\t\n\t\t\tclass=\"so-widget-sow-image so-widget-sow-image-default-8b5b6f678277-3774\"\n\t\t\t\n\t\t>\n<div class=\"sow-image-container\">\n\t\t<img \n\tsrc=\"https:\/\/gaz.i3a.es\/wp-content\/uploads\/2023\/01\/Dario_Suarez_360x270-300x225.png\" width=\"300\" height=\"225\" srcset=\"https:\/\/gaz.i3a.es\/wp-content\/uploads\/2023\/01\/Dario_Suarez_360x270-300x225.png 300w, https:\/\/gaz.i3a.es\/wp-content\/uploads\/2023\/01\/Dario_Suarez_360x270-16x12.png 16w, https:\/\/gaz.i3a.es\/wp-content\/uploads\/2023\/01\/Dario_Suarez_360x270.png 360w\" sizes=\"(max-width: 300px) 100vw, 300px\" alt=\"\" \t\tclass=\"so-widget-image\"\/>\n\t<\/div>\n\n<\/div><\/div><\/div><div id=\"pgc-gb3774-6a30fb989574b-0-1\"  class=\"panel-grid-cell\" ><div id=\"panel-gb3774-6a30fb989574b-0-1-0\" class=\"so-panel widget 
widget_sow-image-grid panel-first-child\" data-index=\"1\" ><div class=\"panel-widget-style panel-widget-style-for-gb3774-6a30fb989574b-0-1-0\" ><div\n\t\t\t\n\t\t\tclass=\"so-widget-sow-image-grid so-widget-sow-image-grid-default-5ff4073610f5-3774\"\n\t\t\t\n\t\t><\/div><\/div><\/div><div id=\"panel-gb3774-6a30fb989574b-0-1-1\" class=\"so-panel widget widget_sow-editor panel-last-child\" data-index=\"2\" ><div\n\t\t\t\n\t\t\tclass=\"so-widget-sow-editor so-widget-sow-editor-base\"\n\t\t\t\n\t\t>\n<div class=\"siteorigin-widget-tinymce textwidget\">\n\t<p><strong>Senior Lecturer<\/strong><\/p>\n<p><strong>Email:<\/strong> <a href=\"mailto:dario@unizar.es\">dario@unizar.es<\/a><\/p>\n<p><strong>Address:<\/strong> Campus R\u00edo Ebro, University of Zaragoza<br \/>\nC\/Mar\u00eda de Luna 1, Ada Byron Building,<br \/>\n50018, Zaragoza, Spain<\/p>\n<\/div>\n<\/div><\/div><\/div><\/div><\/div>\n\n<div id=\"pl-gb3774-6a30fb9896656\"  class=\"panel-layout\" ><div id=\"pg-gb3774-6a30fb9896656-0\"  class=\"panel-grid panel-has-style\" ><div class=\"panel-row-style panel-row-style-for-gb3774-6a30fb9896656-0\" ><div id=\"pgc-gb3774-6a30fb9896656-0-0\"  class=\"panel-grid-cell\" ><div id=\"panel-gb3774-6a30fb9896656-0-0-0\" class=\"so-panel widget widget_sow-headline panel-first-child\" data-index=\"0\" ><div\n\t\t\t\n\t\t\tclass=\"so-widget-sow-headline so-widget-sow-headline-default-244eb6bef45a-3774\"\n\t\t\t\n\t\t><div class=\"sow-headline-container\">\n\t\t\t\t\t\t\t<h5 class=\"sow-headline\">\n\t\t\t\t\t\tABOUT ME\t\t\t\t\t\t<\/h5>\n\t\t\t\t\t\t\t\t\t\t\t<div class=\"decoration\">\n\t\t\t\t\t\t<div class=\"decoration-inside\"><\/div>\n\t\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n<\/div><\/div><div id=\"panel-gb3774-6a30fb9896656-0-0-1\" class=\"so-panel widget widget_sow-editor panel-last-child\" data-index=\"1\" ><div\n\t\t\t\n\t\t\tclass=\"so-widget-sow-editor so-widget-sow-editor-base\"\n\t\t\t\n\t\t>\n<div class=\"siteorigin-widget-tinymce 
textwidget\">\n\t<p>&#8212;<\/p>\n<\/div>\n<\/div><\/div><\/div><\/div><\/div><div id=\"pg-gb3774-6a30fb9896656-1\"  class=\"panel-grid panel-has-style\" ><div class=\"panel-row-style panel-row-style-for-gb3774-6a30fb9896656-1\" ><div id=\"pgc-gb3774-6a30fb9896656-1-0\"  class=\"panel-grid-cell\" ><div id=\"panel-gb3774-6a30fb9896656-1-0-0\" class=\"so-panel widget widget_sow-headline panel-first-child\" data-index=\"2\" ><div\n\t\t\t\n\t\t\tclass=\"so-widget-sow-headline so-widget-sow-headline-default-244eb6bef45a-3774\"\n\t\t\t\n\t\t><div class=\"sow-headline-container\">\n\t\t\t\t\t\t\t<h5 class=\"sow-headline\">\n\t\t\t\t\t\tPUBLICATIONS\t\t\t\t\t\t<\/h5>\n\t\t\t\t\t\t\t\t\t\t\t<div class=\"decoration\">\n\t\t\t\t\t\t<div class=\"decoration-inside\"><\/div>\n\t\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n<\/div><\/div><div id=\"panel-gb3774-6a30fb9896656-1-0-1\" class=\"so-panel widget widget_sow-editor panel-last-child\" data-index=\"3\" ><div\n\t\t\t\n\t\t\tclass=\"so-widget-sow-editor so-widget-sow-editor-base\"\n\t\t\t\n\t\t>\n<div class=\"siteorigin-widget-tinymce textwidget\">\n\t<div class=\"teachpress_pub_list\"><form name=\"tppublistform\" method=\"get\" action=\"\"><a name=\"tppubs\" id=\"tppubs\"><\/a><div class=\"teachpress_filter\"><select class=\"default\" name=\"yr\" id=\"yr\" tabindex=\"2\" onchange=\"teachpress_jumpMenu('parent',this, 'https:\/\/gaz.i3a.es\/es\/dario-suarez-gracia\/?')\">\r\n                   <option value=\"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=#tppubs\">Todos los a\u00f1os<\/option>\r\n                   <option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2026#tppubs\" >2026<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2025#tppubs\" >2025<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2024#tppubs\" >2024<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2023#tppubs\" >2023<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2022#tppubs\" 
>2022<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2021#tppubs\" >2021<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2020#tppubs\" >2020<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2019#tppubs\" >2019<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2018#tppubs\" >2018<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2017#tppubs\" >2017<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2016#tppubs\" >2016<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2015#tppubs\" >2015<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2014#tppubs\" >2014<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2013#tppubs\" >2013<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2012#tppubs\" >2012<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2011#tppubs\" >2011<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2010#tppubs\" >2010<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2009#tppubs\" >2009<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2008#tppubs\" >2008<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2007#tppubs\" >2007<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2006#tppubs\" >2006<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2005#tppubs\" >2005<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2004#tppubs\" >2004<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2003#tppubs\" >2003<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2002#tppubs\" >2002<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2001#tppubs\" >2001<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2000#tppubs\" >2000<\/option><option value = 
\"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1999#tppubs\" >1999<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1998#tppubs\" >1998<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1997#tppubs\" >1997<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1996#tppubs\" >1996<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1995#tppubs\" >1995<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1994#tppubs\" >1994<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1989#tppubs\" >1989<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1987#tppubs\" >1987<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1985#tppubs\" >1985<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1791#tppubs\" >1791<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=0000#tppubs\" >0000<\/option>\r\n                <\/select><select class=\"default\" name=\"type\" id=\"type\" tabindex=\"3\" onchange=\"teachpress_jumpMenu('parent',this, 'https:\/\/gaz.i3a.es\/es\/dario-suarez-gracia\/?')\">\r\n                   <option value=\"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=#tppubs\">Todas las tipolog\u00edas<\/option>\r\n                   <option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=article#tppubs\" >Art\u00edculos de revista<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=book#tppubs\" >Libros<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=incollection#tppubs\" >Book Sections<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=inproceedings#tppubs\" >Proceedings Articles<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=mastersthesis#tppubs\" >Tesis de m\u00e1ster o tesina<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=misc#tppubs\" >Miscel\u00e1nea<\/option><option value = 
\"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=phdthesis#tppubs\" >Tesis doctorales<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=proceedings#tppubs\" >Actas de congresos<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=techreport#tppubs\" >Informes t\u00e9cnicos<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=workshop#tppubs\" >Workshops<\/option>\r\n                <\/select><\/div><input type=\"hidden\" name=\"trp-form-language\" value=\"es\"\/><\/form><div class=\"tablenav\"><div class=\"tablenav-pages\"><span class=\"displaying-num\">66 registros<\/span> <a class=\"page-numbers button disabled\">&laquo;<\/a> <a class=\"page-numbers button disabled\">&lsaquo;<\/a> 1 de 14 <a href=\"https:\/\/gaz.i3a.es\/es\/dario-suarez-gracia\/?limit=2&amp;tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=&amp;tsr=#tppubs\" title=\"p\u00e1gina siguiente\" class=\"page-numbers button\">&rsaquo;<\/a> <a href=\"https:\/\/gaz.i3a.es\/es\/dario-suarez-gracia\/?limit=14&amp;tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=&amp;tsr=#tppubs\" title=\"\u00faltima p\u00e1gina\" class=\"page-numbers button\">&raquo;<\/a> <\/div><\/div><div class=\"teachpress_publication_list\"><h3 class=\"tp_h3\" id=\"tp_h3_2025\">2025<\/h3><h3 class=\"tp_h3\" id=\"tp_h3_article\">Art\u00edculos de revista<\/h3><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Pedrajas, Samuel P\u00e9rez;  Resano, Javier;  Gracia, Dar\u00edo Su\u00e1rez<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('866','tp_links')\" style=\"cursor:pointer;\">BnnRV: Hardware and Software Optimizations for Weight Sampling in Bayesian Neural Networks on Edge RISC-V Cores<\/a> <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions 
on Circuits and Systems for Artificial Intelligence, <\/span><span class=\"tp_pub_additional_pages\">pp. 1-12, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>, <span class=\"tp_pub_additional_issn\">ISSN: 2996-6647<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_866\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('866','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_866\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('866','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_866\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('866','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_866\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{11216142,<br \/>\r\ntitle = {BnnRV: Hardware and Software Optimizations for Weight Sampling in Bayesian Neural Networks on Edge RISC-V Cores},<br \/>\r\nauthor = {Samuel P\u00e9rez Pedrajas and Javier Resano and Dar\u00edo Su\u00e1rez Gracia},<br \/>\r\ndoi = {10.1109\/TCASAI.2025.3625517},<br \/>\r\nissn = {2996-6647},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\njournal = {IEEE Transactions on Circuits and Systems for Artificial Intelligence},<br \/>\r\npages = {1-12},<br \/>\r\nabstract = {Bayesian Neural Networks (BNN) allow prediction uncertainty estimation, making them a more suitable option for safety-critical applications. However, in BNNs, the forward-pass computational cost is significantly higher than in traditional neural networks (NN), due to the overhead generated by weight sampling. This limits their deployment in edge systems. 
This paper presents an optimization that allows using lower-cost Uniform distribution sampling instead of Gaussian sampling during BNN inference. Building upon this optimization, this paper proposes a lightweight RISC-V instruction set architecture extension that accelerates BNN inference by introducing fixed point arithmetic operations and an efficient Uniform random number generator. The flexibility of RISC-V enables such domain-specific acceleration, narrowing the performance gap between NNs and BNNs for edge machine learning workloads. The proposed software and hardware optimizations achieve an average speedup of 8.93\u00d7 while reducing energy consumption per forward pass by 87.12%, increasing image\/J efficiency by 8.19\u00d7. They have been designed to maintain accuracy, calibration, and uncertainty quality, while optimizing execution efficiency. This has been verified with an extensive validation process that considers relevant model architectures. Additionally, our results highlight that weight sampling is no longer the BNN inference performance bottleneck, shifting the primary limiting factor to control overhead.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('866','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_866\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Bayesian Neural Networks (BNN) allow prediction uncertainty estimation, making them a more suitable option for safety-critical applications. However, in BNNs, the forward-pass computational cost is significantly higher than in traditional neural networks (NN), due to the overhead generated by weight sampling. This limits their deployment in edge systems. 
This paper presents an optimization that allows using lower-cost Uniform distribution sampling instead of Gaussian sampling during BNN inference. Building upon this optimization, this paper proposes a lightweight RISC-V instruction set architecture extension that accelerates BNN inference by introducing fixed point arithmetic operations and an efficient Uniform random number generator. The flexibility of RISC-V enables such domain-specific acceleration, narrowing the performance gap between NNs and BNNs for edge machine learning workloads. The proposed software and hardware optimizations achieve an average speedup of 8.93\u00d7 while reducing energy consumption per forward pass by 87.12%, increasing image\/J efficiency by 8.19\u00d7. They have been designed to maintain accuracy, calibration, and uncertainty quality, while optimizing execution efficiency. This has been verified with an extensive validation process that considers relevant model architectures. Additionally, our results highlight that weight sampling is no longer the BNN inference performance bottleneck, shifting the primary limiting factor to control overhead.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('866','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_866\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1109\/TCASAI.2025.3625517\" title=\"DOI de seguimiento:10.1109\/TCASAI.2025.3625517\" target=\"_blank\">doi:10.1109\/TCASAI.2025.3625517<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('866','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_inproceedings\">Proceedings Articles<\/h3><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Soria-Pardos, 
V\u00edctor;  Armejach, Adri\u00e0;  Su\u00e1rez, Dar\u00edo;  Martinot, Didier;  Grasset, Arnaud;  Moret\u00f3, Miquel<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('895','tp_links')\" style=\"cursor:pointer;\">FLAMA: Architecting floating-point atomic memory operations for heterogeneous HPC systems<\/a> <span class=\"tp_pub_type tp_  inproceedings\">Proceedings Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_booktitle\">2025 28th Euromicro Conference on Digital System Design (DSD), <\/span><span class=\"tp_pub_additional_pages\">pp. 435\u2013442, <\/span><span class=\"tp_pub_additional_organization\">IEEE <\/span><span class=\"tp_pub_additional_publisher\">IEEE, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_895\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('895','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_895\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('895','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_895\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('895','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_895\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{soria2025flama,<br \/>\r\ntitle = {FLAMA: Architecting floating-point atomic memory operations for heterogeneous HPC systems},<br \/>\r\nauthor = {V\u00edctor Soria-Pardos and Adri\u00e0 Armejach and Dar\u00edo Su\u00e1rez and Didier Martinot and Arnaud Grasset and Miquel Moret\u00f3},<br \/>\r\nurl = 
{https:\/\/upcommons.upc.edu\/server\/api\/core\/bitstreams\/9199c411-ce89-4327-a06b-bf21838aa8db\/content},<br \/>\r\ndoi = {10.1109\/DSD67783.2025.00066},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\nurldate = {2025-01-01},<br \/>\r\nbooktitle = {2025 28th Euromicro Conference on Digital System Design (DSD)},<br \/>\r\npages = {435\u2013442},<br \/>\r\npublisher = {IEEE},<br \/>\r\norganization = {IEEE},<br \/>\r\nabstract = {Current heterogeneous systems integrate general-purpose Central Processing Units (CPUs), Graphics Processing Units (GPUs), and Neural Processing Units (NPUs). The efficient use of such systems requires a significant programming effort to distribute computation and synchronize across devices, which usually involves using Atomic Memory Operations (AMOs). Arm recently launched a floating-point Atomic Memory Operations (FAMOs) extension to perform atomic updates on floating-point data types specifically. This work characterizes and models heterogeneous architectures to understand how floating-point AMOs impact graph, Machine Learning (ML), and high-performance computing (HPC) workloads. Our analysis shows that many AMOs are performed on floating-point data, which modern systems execute using inefficient compare-and-swap (CAS) constructs. Therefore, replacing CAS-based constructs with FAMOs can improve a wide range of workloads. Moreover, we analyze the trade-offs of executing FAMOs at different memory hierarchy levels, either in private caches (near) or remotely in shared caches (far). We have extended the widely used AMBA CHI protocol to evaluate such FAMO support on a simulated chiplet-based heterogeneous architecture. While near FAMOs achieve an average 1.34\u00d7 speed-up, far FAMOs reach an average 1.58\u00d7 speed-up. 
We conclude that FAMOs can bridge the gap between CPU architecture and accelerators and enabling synchronization in key application domains.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('895','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_895\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Current heterogeneous systems integrate general-purpose Central Processing Units (CPUs), Graphics Processing Units (GPUs), and Neural Processing Units (NPUs). The efficient use of such systems requires a significant programming effort to distribute computation and synchronize across devices, which usually involves using Atomic Memory Operations (AMOs). Arm recently launched a floating-point Atomic Memory Operations (FAMOs) extension to perform atomic updates on floating-point data types specifically. This work characterizes and models heterogeneous architectures to understand how floating-point AMOs impact graph, Machine Learning (ML), and high-performance computing (HPC) workloads. Our analysis shows that many AMOs are performed on floating-point data, which modern systems execute using inefficient compare-and-swap (CAS) constructs. Therefore, replacing CAS-based constructs with FAMOs can improve a wide range of workloads. Moreover, we analyze the trade-offs of executing FAMOs at different memory hierarchy levels, either in private caches (near) or remotely in shared caches (far). We have extended the widely used AMBA CHI protocol to evaluate such FAMO support on a simulated chiplet-based heterogeneous architecture. While near FAMOs achieve an average 1.34\u00d7 speed-up, far FAMOs reach an average 1.58\u00d7 speed-up. 
We conclude that FAMOs can bridge the gap between CPU architecture and accelerators and enabling synchronization in key application domains.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('895','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_895\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/upcommons.upc.edu\/server\/api\/core\/bitstreams\/9199c411-ce89-4327-a06b-bf21838aa8db\/content\" title=\"https:\/\/upcommons.upc.edu\/server\/api\/core\/bitstreams\/9199c411-ce89-4327-a06b-bf2[...]\" target=\"_blank\">https:\/\/upcommons.upc.edu\/server\/api\/core\/bitstreams\/9199c411-ce89-4327-a06b-bf2[&#8230;]<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1109\/DSD67783.2025.00066\" title=\"DOI de seguimiento:10.1109\/DSD67783.2025.00066\" target=\"_blank\">doi:10.1109\/DSD67783.2025.00066<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('895','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Soria-Pardos, V\u00edctor;  Armejach, Adri\u00e0;  M\u00fcck, Tiago;  Gracia, Dar\u00edo Su\u00e1rez;  Joao, Jose;  Moret\u00f3, Miquel<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('892','tp_links')\" style=\"cursor:pointer;\">Delegato: Locality-Aware Atomic Memory Operations on Chiplets<\/a> <span class=\"tp_pub_type tp_  inproceedings\">Proceedings Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_booktitle\">Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture, <\/span><span class=\"tp_pub_additional_pages\">pp. 
1793\u20131808, <\/span><span class=\"tp_pub_additional_publisher\">ACM, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_892\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('892','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_892\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('892','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_892\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('892','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_892\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{soria2025delegato,<br \/>\r\ntitle = {Delegato: Locality-Aware Atomic Memory Operations on Chiplets},<br \/>\r\nauthor = {V\u00edctor Soria-Pardos and Adri\u00e0 Armejach and Tiago M\u00fcck and Dar\u00edo Su\u00e1rez Gracia and Jose Joao and Miquel Moret\u00f3},<br \/>\r\nurl = {https:\/\/dl.acm.org\/doi\/full\/10.1145\/3725843.3756030},<br \/>\r\ndoi = {10.1145\/3725843.3756030},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\nurldate = {2025-01-01},<br \/>\r\nbooktitle = {Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture},<br \/>\r\npages = {1793\u20131808},<br \/>\r\npublisher = {ACM},<br \/>\r\nabstract = {The irruption of chiplet-based architectures has been a game changer, enabling higher transistor integration and core counts in a single socket. However, chiplets impose higher and non-uniform memory access (NUMA) latencies than monolithic integration. 
This harms the efficiency of atomic memory operations (AMOs), which are fundamental to implementing fine-grained synchronization and concurrent data structures on large systems. AMOs are executed either near the core (near) or at a remote location within the cache hierarchy (far). On near AMOs, the core\u2019s private cache fetches the target cache line in exclusiveness to modify it locally. Near AMOs cause significant data movement between private caches, especially harming parallel applications\u2019 performance on chiplet-based architectures. Alternatively, far AMOs can alleviate the communication overhead by reducing data movement between processing elements. However, current multicore architectures only support one type of far AMO, which sends all updates to a single serialization point (centralized AMOs).<br \/>\r\nThis work introduces two new types of far AMOs, delegated and migrating, that execute AMOs remotely without centralizing updates in a single point of the cache hierarchy. Combining centralized, delegated, and migrating AMOs allows the directory to select the best location to execute AMOs. Moreover, we propose Delegato, a tracing optimization to effectively transport usage information from private caches to the directory to predict the best atomic type to issue accurately. Additionally, we design a simple predictor on top of Delegato that seamlessly selects the best placement to perform AMOs based on the data access pattern and usage activity of cores. 
Our evaluation using gem5 shows that Delegato can speed up applications on average by 1.07 \u00d7 over centralized AMOs and by 1.13 \u00d7 over the state-of-the-art AMO predictor.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('892','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_892\" style=\"display:none;\"><div class=\"tp_abstract_entry\">The irruption of chiplet-based architectures has been a game changer, enabling higher transistor integration and core counts in a single socket. However, chiplets impose higher and non-uniform memory access (NUMA) latencies than monolithic integration. This harms the efficiency of atomic memory operations (AMOs), which are fundamental to implementing fine-grained synchronization and concurrent data structures on large systems. AMOs are executed either near the core (near) or at a remote location within the cache hierarchy (far). On near AMOs, the core\u2019s private cache fetches the target cache line in exclusiveness to modify it locally. Near AMOs cause significant data movement between private caches, especially harming parallel applications\u2019 performance on chiplet-based architectures. Alternatively, far AMOs can alleviate the communication overhead by reducing data movement between processing elements. However, current multicore architectures only support one type of far AMO, which sends all updates to a single serialization point (centralized AMOs).<br \/>\r\nThis work introduces two new types of far AMOs, delegated and migrating, that execute AMOs remotely without centralizing updates in a single point of the cache hierarchy. Combining centralized, delegated, and migrating AMOs allows the directory to select the best location to execute AMOs. 
Moreover, we propose Delegato, a tracing optimization to effectively transport usage information from private caches to the directory to predict the best atomic type to issue accurately. Additionally, we design a simple predictor on top of Delegato that seamlessly selects the best placement to perform AMOs based on the data access pattern and usage activity of cores. Our evaluation using gem5 shows that Delegato can speed up applications on average by 1.07 \u00d7 over centralized AMOs and by 1.13 \u00d7 over the state-of-the-art AMO predictor.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('892','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_892\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dl.acm.org\/doi\/full\/10.1145\/3725843.3756030\" title=\"https:\/\/dl.acm.org\/doi\/full\/10.1145\/3725843.3756030\" target=\"_blank\">https:\/\/dl.acm.org\/doi\/full\/10.1145\/3725843.3756030<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1145\/3725843.3756030\" title=\"DOI de seguimiento:10.1145\/3725843.3756030\" target=\"_blank\">doi:10.1145\/3725843.3756030<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('892','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_2024\">2024<\/h3><h3 class=\"tp_h3\" id=\"tp_h3_inproceedings\">Proceedings Articles<\/h3><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> P\u00e9rez, Samuel;  Resano, Javier;  Gracia, Dar\u00edo Su\u00e1rez<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('872','tp_links')\" style=\"cursor:pointer;\">Accelerating Bayesian Neural Networks on Low-Power Edge RISC-V Processors<\/a> <span 
class=\"tp_pub_type tp_  inproceedings\">Proceedings Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_booktitle\">2024 IEEE 24th International Conference on Nanotechnology (NANO), <\/span><span class=\"tp_pub_additional_pages\">pp. 507-512, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>, <span class=\"tp_pub_additional_issn\">ISSN: 1944-9380<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_872\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('872','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_872\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('872','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_872\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('872','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_872\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{10628877,<br \/>\r\ntitle = {Accelerating Bayesian Neural Networks on Low-Power Edge RISC-V Processors},<br \/>\r\nauthor = {Samuel P\u00e9rez and Javier Resano and Dar\u00edo Su\u00e1rez Gracia},<br \/>\r\ndoi = {10.1109\/NANO61778.2024.10628877},<br \/>\r\nissn = {1944-9380},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-07-01},<br \/>\r\nbooktitle = {2024 IEEE 24th International Conference on Nanotechnology (NANO)},<br \/>\r\npages = {507-512},<br \/>\r\nabstract = {Neural Networks (NNs) are a very popular solution for classification tasks. As the combination of Internet of Things (IoT) with Machine Learning (ML), also known as TinyML, grows in popularity, more NNs are being executed on low-end edge systems. 
The reliability of the predictions is crucial for safety-critical applications. Bayesian Neural Networks (BNNs) address this issue by calculating uncertainty metrics with their predictions at the cost of increasing computing requirements. This work addresses the challenges of executing BNN inference on low-end systems. BNNs require multiple forward passes in which the weights are sampled from distributions. This sampling process can take up to 85.13% of execution time. This work optimizes the weight sampling and integrates it within a low cost custom extension for a RISC-V CPU, improving speedup up to 8.10\u00d7 and similar energy savings.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('872','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_872\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Neural Networks (NNs) are a very popular solution for classification tasks. As the combination of Internet of Things (IoT) with Machine Learning (ML), also known as TinyML, grows in popularity, more NNs are being executed on low-end edge systems. The reliability of the predictions is crucial for safety-critical applications. Bayesian Neural Networks (BNNs) address this issue by calculating uncertainty metrics with their predictions at the cost of increasing computing requirements. This work addresses the challenges of executing BNN inference on low-end systems. BNNs require multiple forward passes in which the weights are sampled from distributions. This sampling process can take up to 85.13% of execution time. 
This work optimizes the weight sampling and integrates it within a low cost custom extension for a RISC-V CPU, improving speedup up to 8.10\u00d7 and similar energy savings.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('872','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_872\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1109\/NANO61778.2024.10628877\" title=\"DOI de seguimiento:10.1109\/NANO61778.2024.10628877\" target=\"_blank\">doi:10.1109\/NANO61778.2024.10628877<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('872','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_2023\">2023<\/h3><h3 class=\"tp_h3\" id=\"tp_h3_inproceedings\">Proceedings Articles<\/h3><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Soria-Pardos, V\u00edctor;  Armejach, Adria;  M\u00fcck, Tiago;  Su\u00e1rez-Gracia, Dario;  Joao, Jos\u00e9;  Rico, Alejandro;  Moret\u00f3, Miquel<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('888','tp_links')\" style=\"cursor:pointer;\">DynAMO: Improving Parallelism Through Dynamic Placement of Atomic Memory Operations<\/a> <span class=\"tp_pub_type tp_  inproceedings\">Proceedings Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_booktitle\">Proceedings of the 50th Annual International Symposium on Computer Architecture, <\/span><span class=\"tp_pub_additional_pages\">pp. 
1\u201313, <\/span><span class=\"tp_pub_additional_publisher\">ACM, <\/span><span class=\"tp_pub_additional_year\">2023<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_888\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('888','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_888\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('888','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_888\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('888','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_888\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{soria2023dynamo,<br \/>\r\ntitle = {DynAMO: Improving Parallelism Through Dynamic Placement of Atomic Memory Operations},<br \/>\r\nauthor = {V\u00edctor Soria-Pardos and Adria Armejach and Tiago M\u00fcck and Dario Su\u00e1rez-Gracia and Jos\u00e9 Joao and Alejandro Rico and Miquel Moret\u00f3},<br \/>\r\nurl = {https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3579371.3589065},<br \/>\r\ndoi = {10.1145\/3579371.3589065},<br \/>\r\nyear  = {2023},<br \/>\r\ndate = {2023-01-01},<br \/>\r\nurldate = {2023-01-01},<br \/>\r\nbooktitle = {Proceedings of the 50th Annual International Symposium on Computer Architecture},<br \/>\r\npages = {1\u201313},<br \/>\r\npublisher = {ACM},<br \/>\r\nabstract = {With increasing core counts in modern multi-core designs, the overhead of synchronization jeopardizes the scalability and efficiency of parallel applications. 
To mitigate these overheads, modern cache-coherent protocols offer support for Atomic Memory Operations (AMOs) that can be executed near-core (near) or remotely in the on-chip memory hierarchy (far).<br \/>\r\nThis paper evaluates current available static AMO execution policies implemented in multi-core Systems-on-Chip (SoC) designs, which select AMOs&#039; execution placement (near or far) based on the cache block coherence state. We propose three static policies and show that the performance of static policies is application dependent. Moreover, we show that one of our proposed static policies outperforms currently available implementations.<br \/>\r\nFurthermore, we propose DynAMO, a predictor that selects the best location to execute the AMOs. DynAMO identifies the different locality patterns to make informed decisions, improving AMO latency and increasing overall throughput. DynAMO outperforms the best-performing static policy and provides geometric mean speed-ups of 1.09\u00d7 across all workloads and 1.31\u00d7 on AMO-intensive applications with respect to executing all AMOs near.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('888','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_888\" style=\"display:none;\"><div class=\"tp_abstract_entry\">With increasing core counts in modern multi-core designs, the overhead of synchronization jeopardizes the scalability and efficiency of parallel applications. 
To mitigate these overheads, modern cache-coherent protocols offer support for Atomic Memory Operations (AMOs) that can be executed near-core (near) or remotely in the on-chip memory hierarchy (far).<br \/>\r\nThis paper evaluates current available static AMO execution policies implemented in multi-core Systems-on-Chip (SoC) designs, which select AMOs&#039; execution placement (near or far) based on the cache block coherence state. We propose three static policies and show that the performance of static policies is application dependent. Moreover, we show that one of our proposed static policies outperforms currently available implementations.<br \/>\r\nFurthermore, we propose DynAMO, a predictor that selects the best location to execute the AMOs. DynAMO identifies the different locality patterns to make informed decisions, improving AMO latency and increasing overall throughput. DynAMO outperforms the best-performing static policy and provides geometric mean speed-ups of 1.09\u00d7 across all workloads and 1.31\u00d7 on AMO-intensive applications with respect to executing all AMOs near.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('888','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_888\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3579371.3589065\" title=\"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3579371.3589065\" target=\"_blank\">https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3579371.3589065<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1145\/3579371.3589065\" title=\"DOI de seguimiento:10.1145\/3579371.3589065\" target=\"_blank\">doi:10.1145\/3579371.3589065<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" 
onclick=\"teachpress_pub_showhide('888','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><\/div><div class=\"tablenav\"><div class=\"tablenav-pages\"><span class=\"displaying-num\">66 registros<\/span> <a class=\"page-numbers button disabled\">&laquo;<\/a> <a class=\"page-numbers button disabled\">&lsaquo;<\/a> 1 de 14 <a href=\"https:\/\/gaz.i3a.es\/es\/dario-suarez-gracia\/?limit=2&amp;tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=&amp;tsr=#tppubs\" title=\"p\u00e1gina siguiente\" class=\"page-numbers button\">&rsaquo;<\/a> <a href=\"https:\/\/gaz.i3a.es\/es\/dario-suarez-gracia\/?limit=14&amp;tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=&amp;tsr=#tppubs\" title=\"\u00faltima p\u00e1gina\" class=\"page-numbers button\">&raquo;<\/a> <\/div><\/div><\/div>\n<\/div>\n<\/div><\/div><\/div><\/div><\/div><\/div>\n\n\n<p><\/p>","protected":false},"excerpt":{"rendered":"","protected":false},"author":1,"featured_media":3779,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[238,239],"tags":[],"class_list":["post-3774","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-reseacher","category-team"],"_links":{"self":[{"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/posts\/3774","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/comments?post=3774"}],"version-history":[{"count":5,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/posts\/3774\/revisions"}],"predecessor-version":[{"id":3796,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/posts\/3774\/revisions\/3796"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\
/v2\/media\/3779"}],"wp:attachment":[{"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/media?parent=3774"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/categories?post=3774"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/tags?post=3774"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}