{"id":2608,"date":"2020-08-26T08:58:57","date_gmt":"2020-08-26T06:58:57","guid":{"rendered":"https:\/\/afrodita.i3a.es\/?page_id=2608"},"modified":"2023-01-23T15:40:51","modified_gmt":"2023-01-23T13:40:51","slug":"publications","status":"publish","type":"page","link":"https:\/\/gaz.i3a.es\/es\/publications\/","title":{"rendered":"Publicaciones"},"content":{"rendered":"<div id=\"pl-gb2608-6a072876312ab\"  class=\"panel-layout\" ><div id=\"pg-gb2608-6a072876312ab-0\"  class=\"panel-grid panel-has-style\" ><div class=\"siteorigin-panels-stretch panel-row-style panel-row-style-for-gb2608-6a072876312ab-0\" data-stretch-type=\"full-width-stretch\" ><div id=\"pgc-gb2608-6a072876312ab-0-0\"  class=\"panel-grid-cell\" ><div id=\"panel-gb2608-6a072876312ab-0-0-0\" class=\"so-panel widget widget_sow-hero panel-first-child panel-last-child\" data-index=\"0\" ><div\n\t\t\t\n\t\t\tclass=\"so-widget-sow-hero so-widget-sow-hero-default-93415d0e2dbf-2608 so-widget-fittext-wrapper\"\n\t\t\t data-fit-text-compressor=\"0.85\"\n\t\t>\t\t\t\t<div class=\"sow-slider-base\" style=\"display: none\" tabindex=\"0\">\n\t\t\t\t\t<ul\n\t\t\t\t\tclass=\"sow-slider-images\"\n\t\t\t\t\tdata-settings=\"{&quot;pagination&quot;:true,&quot;speed&quot;:800,&quot;timeout&quot;:8000,&quot;paused&quot;:false,&quot;pause_on_hover&quot;:false,&quot;swipe&quot;:true,&quot;nav_always_show_desktop&quot;:&quot;&quot;,&quot;nav_always_show_mobile&quot;:&quot;&quot;,&quot;breakpoint&quot;:&quot;780px&quot;,&quot;unmute&quot;:false,&quot;anchor&quot;:null}\"\n\t\t\t\t\t\t\t\t\t\tdata-anchor-id=\"\"\n\t\t\t\t>\t\t<li class=\"sow-slider-image  sow-slider-image-cover\" style=\"visibility: visible;;background-color: #333333\" >\n\t\t\t\t\t<div class=\"sow-slider-image-container\">\n\t\t\t<div class=\"sow-slider-image-wrapper\">\n\t\t\t\t<h1 class=\"entry-title\" style=\"text-align: center\"><strong>Publicaciones<\/strong><\/h1>\n\t\t\t<\/div>\n\t\t<\/div>\n\t\t<div class=\"sow-slider-image-overlay 
sow-slider-image-cover\" style=\"opacity: 0.5;background-image: url(https:\/\/gaz.i3a.es\/wp-content\/uploads\/2020\/08\/books_2-1.jpg)\"  ><\/div>\t\t<\/li>\n\t\t<\/ul>\t\t\t\t<ol class=\"sow-slider-pagination\">\n\t\t\t\t\t\t\t\t\t\t\t<li><a href=\"#\" data-goto=\"0\" aria-label=\"mostrar diapositiva 1\"><\/a><\/li>\n\t\t\t\t\t\t\t\t\t<\/ol>\n\n\t\t\t\t<div class=\"sow-slide-nav sow-slide-nav-next\">\n\t\t\t\t\t<a href=\"#\" data-goto=\"next\" aria-label=\"diapositiva siguiente\" data-action=\"next\">\n\t\t\t\t\t\t<em class=\"sow-sld-icon-thin-right\"><\/em>\n\t\t\t\t\t<\/a>\n\t\t\t\t<\/div>\n\n\t\t\t\t<div class=\"sow-slide-nav sow-slide-nav-prev\">\n\t\t\t\t\t<a href=\"#\" data-goto=\"previous\" aria-label=\"diapositiva anterior\" data-action=\"prev\">\n\t\t\t\t\t\t<em class=\"sow-sld-icon-thin-left\"><\/em>\n\t\t\t\t\t<\/a>\n\t\t\t\t<\/div>\n\t\t\t\t<\/div><\/div><\/div><\/div><\/div><\/div><\/div>\n\n<div id=\"pl-gb2608-6a07287631fdd\"  class=\"panel-layout\" ><div id=\"pg-gb2608-6a07287631fdd-0\"  class=\"panel-grid panel-no-style\" ><div id=\"pgc-gb2608-6a07287631fdd-0-0\"  class=\"panel-grid-cell\" ><div id=\"panel-gb2608-6a07287631fdd-0-0-0\" class=\"so-panel widget widget_sow-editor panel-first-child panel-last-child\" data-index=\"0\" ><div\n\t\t\t\n\t\t\tclass=\"so-widget-sow-editor so-widget-sow-editor-base\"\n\t\t\t\n\t\t>\n<div class=\"siteorigin-widget-tinymce textwidget\">\n\t<div class=\"teachpress_pub_list\"><form name=\"tppublistform\" method=\"get\" action=\"\"><a name=\"tppubs\" id=\"tppubs\"><\/a><div class=\"teachpress_filter\"><select class=\"default\" name=\"yr\" id=\"yr\" tabindex=\"2\" onchange=\"teachpress_jumpMenu('parent',this, 'https:\/\/gaz.i3a.es\/es\/publications\/?')\">\r\n                   <option value=\"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=#tppubs\">Todos los a\u00f1os<\/option>\r\n                   <option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2026#tppubs\" >2026<\/option><option value = 
\"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2025#tppubs\" >2025<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2024#tppubs\" >2024<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2023#tppubs\" >2023<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2022#tppubs\" >2022<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2021#tppubs\" >2021<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2020#tppubs\" >2020<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2019#tppubs\" >2019<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2018#tppubs\" >2018<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2017#tppubs\" >2017<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2016#tppubs\" >2016<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2015#tppubs\" >2015<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2014#tppubs\" >2014<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2013#tppubs\" >2013<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2012#tppubs\" >2012<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2011#tppubs\" >2011<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2010#tppubs\" >2010<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2009#tppubs\" >2009<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2008#tppubs\" >2008<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2007#tppubs\" >2007<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2006#tppubs\" >2006<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2005#tppubs\" >2005<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2004#tppubs\" >2004<\/option><option value = 
\"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2003#tppubs\" >2003<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2002#tppubs\" >2002<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2001#tppubs\" >2001<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=2000#tppubs\" >2000<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1999#tppubs\" >1999<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1998#tppubs\" >1998<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1997#tppubs\" >1997<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1996#tppubs\" >1996<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1995#tppubs\" >1995<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1994#tppubs\" >1994<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1989#tppubs\" >1989<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1987#tppubs\" >1987<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1985#tppubs\" >1985<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=1791#tppubs\" >1791<\/option><option value = \"tgid=&amp;type=&amp;auth=&amp;usr=&amp;yr=0000#tppubs\" >0000<\/option>\r\n                <\/select><select class=\"default\" name=\"type\" id=\"type\" tabindex=\"3\" onchange=\"teachpress_jumpMenu('parent',this, 'https:\/\/gaz.i3a.es\/es\/publications\/?')\">\r\n                   <option value=\"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=#tppubs\">Todas las tipolog\u00edas<\/option>\r\n                   <option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=article#tppubs\" >Art\u00edculos de revista<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=book#tppubs\" >Libros<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=incollection#tppubs\" >Book Sections<\/option><option value = 
\"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=inproceedings#tppubs\" >Proceedings Articles<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=mastersthesis#tppubs\" >Tesis de m\u00e1ster o tesina<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=misc#tppubs\" >Miscel\u00e1nea<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=phdthesis#tppubs\" >Tesis doctorales<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=proceedings#tppubs\" >Actas de congresos<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=techreport#tppubs\" >Informes t\u00e9cnicos<\/option><option value = \"tgid=&amp;yr=&amp;auth=&amp;usr=&amp;type=workshop#tppubs\" >Workshops<\/option>\r\n                <\/select><\/div><input type=\"hidden\" name=\"trp-form-language\" value=\"es\"\/><\/form><div class=\"tablenav\"><div class=\"tablenav-pages\"><span class=\"displaying-num\">405 registros<\/span> <a class=\"page-numbers button disabled\">&laquo;<\/a> <a class=\"page-numbers button disabled\">&lsaquo;<\/a> 1 de 21 <a href=\"https:\/\/gaz.i3a.es\/es\/publications\/?limit=2&amp;tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=&amp;tsr=#tppubs\" title=\"p\u00e1gina siguiente\" class=\"page-numbers button\">&rsaquo;<\/a> <a href=\"https:\/\/gaz.i3a.es\/es\/publications\/?limit=21&amp;tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=&amp;tsr=#tppubs\" title=\"\u00faltima p\u00e1gina\" class=\"page-numbers button\">&raquo;<\/a> <\/div><\/div><div class=\"teachpress_publication_list\"><h3 class=\"tp_h3\" id=\"tp_h3_2026\">2026<\/h3><h3 class=\"tp_h3\" id=\"tp_h3_article\">Art\u00edculos de revista<\/h3><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Jim\u00e9nez-Blanco, Albert;  L\u00f3pez-Villellas, Lori\u00e9n;  Moure, Juan Carlos;  Moreto, Miquel;  Marco-Sola, Santiago<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" 
onclick=\"teachpress_pub_showhide('896','tp_links')\" style=\"cursor:pointer;\">Theseus: Fast and Optimal Affine-Gap Sequence-to-Graph Alignment<\/a> <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_year\">2026<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_resource_link\"><a id=\"tp_links_sh_896\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('896','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_896\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('896','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_896\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{JimnezBlanco2026,<br \/>\r\ntitle = {Theseus: Fast and Optimal Affine-Gap Sequence-to-Graph Alignment},<br \/>\r\nauthor = {Albert Jim\u00e9nez-Blanco and Lori\u00e9n L\u00f3pez-Villellas and Juan Carlos Moure and Miquel Moreto and Santiago Marco-Sola},<br \/>\r\nurl = {http:\/\/dx.doi.org\/10.64898\/2026.02.12.705572},<br \/>\r\ndoi = {10.64898\/2026.02.12.705572},<br \/>\r\nyear  = {2026},<br \/>\r\ndate = {2026-02-01},<br \/>\r\npublisher = {openRxiv},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('896','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_896\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"http:\/\/dx.doi.org\/10.64898\/2026.02.12.705572\" title=\"http:\/\/dx.doi.org\/10.64898\/2026.02.12.705572\" 
target=\"_blank\">http:\/\/dx.doi.org\/10.64898\/2026.02.12.705572<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.64898\/2026.02.12.705572\" title=\"DOI de seguimiento:10.64898\/2026.02.12.705572\" target=\"_blank\">doi:10.64898\/2026.02.12.705572<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('896','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_inproceedings\">Proceedings Articles<\/h3><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Siracusa, Marco;  Hsu, Olivia;  Soria-Pardos, Victor;  Randall, Joshua;  Grasset, Arnaud;  Biscondi, Eric;  Joseph, Doug;  Allen, Randy;  Kjolstad, Fredrik;  Planas, Miquel Moret\u00f3;  Armejach, Adri\u00e0<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('890','tp_links')\" style=\"cursor:pointer;\">Ember: A Compiler for Efficient Embedding Operations on Decoupled Access-Execute Architectures<\/a> <span class=\"tp_pub_type tp_  inproceedings\">Proceedings Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_year\">2026<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_resource_link\"><a id=\"tp_links_sh_890\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('890','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_890\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('890','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_890\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{siracusa2025ember,<br \/>\r\ntitle = {Ember: A Compiler for Efficient Embedding Operations on Decoupled 
Access-Execute Architectures},<br \/>\r\nauthor = {Marco Siracusa and Olivia Hsu and Victor Soria-Pardos and Joshua Randall and Arnaud Grasset and Eric Biscondi and Doug Joseph and Randy Allen and Fredrik Kjolstad and Miquel Moret\u00f3 Planas and Adri\u00e0 Armejach},<br \/>\r\nurl = {https:\/\/arxiv.org\/pdf\/2504.09870},<br \/>\r\nyear  = {2026},<br \/>\r\ndate = {2026-01-01},<br \/>\r\nurldate = {2026-01-01},<br \/>\r\njournal = {Proceedings of the 22nd ACM International Symposium on Code Generation and Optimization, CGO },<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('890','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_890\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"ai ai-arxiv\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/arxiv.org\/pdf\/2504.09870\" title=\"https:\/\/arxiv.org\/pdf\/2504.09870\" target=\"_blank\">https:\/\/arxiv.org\/pdf\/2504.09870<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('890','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_2025\">2025<\/h3><h3 class=\"tp_h3\" id=\"tp_h3_article\">Art\u00edculos de revista<\/h3><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Valero, Alejandro;  Lorente, Vicente;  Petit, Salvador;  Sahuquillo, Julio<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('849','tp_links')\" style=\"cursor:pointer;\">Dual Fast-Track Cache: Organizing Ring-Shaped Racetracks to Work as L1 Caches<\/a> <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span 
class=\"tp_pub_additional_journal\">IEEE Transactions on Computers, <\/span><span class=\"tp_pub_additional_volume\">vol. 74, <\/span><span class=\"tp_pub_additional_number\">no 8, <\/span><span class=\"tp_pub_additional_pages\">pp. 2812-2826, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>, <span class=\"tp_pub_additional_issn\">ISSN: 0018-9340<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_849\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('849','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_849\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('849','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_849\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('849','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_849\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{Valero2025,<br \/>\r\ntitle = {Dual Fast-Track Cache: Organizing Ring-Shaped Racetracks to Work as L1 Caches},<br \/>\r\nauthor = {Alejandro Valero and Vicente Lorente and Salvador Petit and Julio Sahuquillo},<br \/>\r\nurl = {https:\/\/www.computer.org\/csdl\/journal\/tc\/2025\/08\/11022726\/27fzlt4rw88},<br \/>\r\ndoi = {10.1109\/TC.2025.3575909},<br \/>\r\nissn = {0018-9340},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-08-01},<br \/>\r\nurldate = {2025-08-01},<br \/>\r\njournal = {IEEE Transactions on Computers},<br \/>\r\nvolume = {74},<br \/>\r\nnumber = {8},<br \/>\r\npages = {2812-2826},<br \/>\r\nabstract = {Static Random-Access Memory (SRAM) is the fastest memory technology and has been the common design choice for implementing first-level (L1) caches in the processor pipeline, where speed is a key design issue that 
must be fulfilled. On the contrary, this technology offers much lower density compared to other technologies like Dynamic RAM, limiting L1 cache sizes of modern processors to a few tens of KB. This paper explores the use of slower but denser Domain Wall Memory (DWM) technology for L1 caches. This technology provides slow access times since it arranges multiple bits sequentially in a magnetic racetrack. To access these bits, they need to be shifted in order to place them under a header. A 1-bit shift usually takes one processor cycle, which can significantly hurt the application performance, making this working behavior inappropriate for L1 caches. Based on the locality (temporal and spatial) principles exploited by caches, this work proposes the Dual Fast-Track Cache (Dual FTC) design, a new approach to organizing a set of racetracks to build set-associative caches. Compared to a conventional SRAM cache, Dual FTC enhances storage capacity by 5\u00d7 while incurring minimal shifting overhead, thereby rendering it a practical and appealing solution for L1 cache implementations. Experimental results show that the devised cache organization is as fast as an SRAM cache for 78% and 86% of the L1 data cache hits and L1 instruction cache hits, respectively (i.e., no shift is required). 
Consequently, due to the larger L1 cache capacities, significant system performance gains (by 22% on average) are obtained under the same silicon area.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('849','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_849\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Static Random-Access Memory (SRAM) is the fastest memory technology and has been the common design choice for implementing first-level (L1) caches in the processor pipeline, where speed is a key design issue that must be fulfilled. On the contrary, this technology offers much lower density compared to other technologies like Dynamic RAM, limiting L1 cache sizes of modern processors to a few tens of KB. This paper explores the use of slower but denser Domain Wall Memory (DWM) technology for L1 caches. This technology provides slow access times since it arranges multiple bits sequentially in a magnetic racetrack. To access these bits, they need to be shifted in order to place them under a header. A 1-bit shift usually takes one processor cycle, which can significantly hurt the application performance, making this working behavior inappropriate for L1 caches. Based on the locality (temporal and spatial) principles exploited by caches, this work proposes the Dual Fast-Track Cache (Dual FTC) design, a new approach to organizing a set of racetracks to build set-associative caches. Compared to a conventional SRAM cache, Dual FTC enhances storage capacity by 5\u00d7 while incurring minimal shifting overhead, thereby rendering it a practical and appealing solution for L1 cache implementations. 
Experimental results show that the devised cache organization is as fast as an SRAM cache for 78% and 86% of the L1 data cache hits and L1 instruction cache hits, respectively (i.e., no shift is required). Consequently, due to the larger L1 cache capacities, significant system performance gains (by 22% on average) are obtained under the same silicon area.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('849','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_849\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/www.computer.org\/csdl\/journal\/tc\/2025\/08\/11022726\/27fzlt4rw88\" title=\"https:\/\/www.computer.org\/csdl\/journal\/tc\/2025\/08\/11022726\/27fzlt4rw88\" target=\"_blank\">https:\/\/www.computer.org\/csdl\/journal\/tc\/2025\/08\/11022726\/27fzlt4rw88<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1109\/TC.2025.3575909\" title=\"DOI de seguimiento:10.1109\/TC.2025.3575909\" target=\"_blank\">doi:10.1109\/TC.2025.3575909<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('849','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Navarro-Torres, Agust\u00edn;  Panda, Biswabandan;  Alastruey-Bened\u00e9, Jes\u00fas;  Ib\u00e1\u00f1ez, Pablo;  Vi\u00f1nals-Y\u00fafera, V\u00edctor;  Ros, Alberto<\/p><p class=\"tp_pub_title\">A Complexity-Effective Local Delta Prefetcher <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions on Computers, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p 
class=\"tp_pub_menu\"><span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_859\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('859','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_859\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{navarro2025complexity,<br \/>\r\ntitle = {A Complexity-Effective Local Delta Prefetcher},<br \/>\r\nauthor = {Agust\u00edn Navarro-Torres and Biswabandan Panda and Jes\u00fas Alastruey-Bened\u00e9 and Pablo Ib\u00e1\u00f1ez and V\u00edctor Vi\u00f1nals-Y\u00fafera and Alberto Ros},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\njournal = {IEEE Transactions on Computers},<br \/>\r\npublisher = {IEEE},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('859','tp_bibtex')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> L\u00f3pez-Villellas, Lori\u00e9n;  Mikkelsen, Carl Christian Kjelgaard;  Galano-Frutos, Juan Jos\u00e9;  Marco-Sola, Santiago;  Alastruey-Bened\u00e9, Jes\u00fas;  Ib\u00e1\u00f1ez, Pablo;  Echenique, Pablo;  Moret\u00f3, Miquel;  Rosa, Maria Cristina De;  Garc\u00eda-Risue\u00f1o, Pablo<\/p><p class=\"tp_pub_title\">ILVES: Accurate and Efficient Bond Length and Angle Constraints in Molecular Dynamics <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">Journal of Chemical Theory and Computation, <\/span><span class=\"tp_pub_additional_volume\">vol. 21, <\/span><span class=\"tp_pub_additional_number\">no 18, <\/span><span class=\"tp_pub_additional_pages\">pp. 
8711\u20138719, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_860\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('860','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_860\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{lopez2025ilves,<br \/>\r\ntitle = {ILVES: Accurate and Efficient Bond Length and Angle Constraints in Molecular Dynamics},<br \/>\r\nauthor = {Lori\u00e9n L\u00f3pez-Villellas and Carl Christian Kjelgaard Mikkelsen and Juan Jos\u00e9 Galano-Frutos and Santiago Marco-Sola and Jes\u00fas Alastruey-Bened\u00e9 and Pablo Ib\u00e1\u00f1ez and Pablo Echenique and Miquel Moret\u00f3 and Maria Cristina De Rosa and Pablo Garc\u00eda-Risue\u00f1o},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\njournal = {Journal of Chemical Theory and Computation},<br \/>\r\nvolume = {21},<br \/>\r\nnumber = {18},<br \/>\r\npages = {8711\u20138719},<br \/>\r\npublisher = {American Chemical Society},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('860','tp_bibtex')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Pedrajas, Samuel P\u00e9rez;  Resano, Javier;  Gracia, Dar\u00edo Su\u00e1rez<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('866','tp_links')\" style=\"cursor:pointer;\">BnnRV: Hardware and Software Optimizations for Weight Sampling in Bayesian Neural Networks on Edge RISC-V Cores<\/a> <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: 
<\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions on Circuits and Systems for Artificial Intelligence, <\/span><span class=\"tp_pub_additional_pages\">pp. 1-12, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>, <span class=\"tp_pub_additional_issn\">ISSN: 2996-6647<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_866\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('866','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_866\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('866','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_866\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('866','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_866\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{11216142,<br \/>\r\ntitle = {BnnRV: Hardware and Software Optimizations for Weight Sampling in Bayesian Neural Networks on Edge RISC-V Cores},<br \/>\r\nauthor = {Samuel P\u00e9rez Pedrajas and Javier Resano and Dar\u00edo Su\u00e1rez Gracia},<br \/>\r\ndoi = {10.1109\/TCASAI.2025.3625517},<br \/>\r\nissn = {2996-6647},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\njournal = {IEEE Transactions on Circuits and Systems for Artificial Intelligence},<br \/>\r\npages = {1-12},<br \/>\r\nabstract = {Bayesian Neural Networks (BNN) allow prediction uncertainty estimation, making them a more suitable option for safety-critical applications. However, in BNNs, the forward-pass computational cost is significantly higher than in traditional neural networks (NN), due to the overhead generated by weight sampling. This limits their deployment in edge systems. 
This paper presents an optimization that allows using lower-cost Uniform distribution sampling instead of Gaussian sampling during BNN inference. Building upon this optimization, this paper proposes a lightweight RISC-V instruction set architecture extension that accelerates BNN inference by introducing fixed point arithmetic operations and an efficient Uniform random number generator. The flexibility of RISC-V enables such domain-specific acceleration, narrowing the performance gap between NNs and BNNs for edge machine learning workloads. The proposed software and hardware optimizations achieve an average speedup of 8.93\u00d7 while reducing energy consumption per forward pass by 87.12%, increasing image\/J efficiency by 8.19\u00d7. They have been designed to maintain accuracy, calibration, and uncertainty quality, while optimizing execution efficiency. This has been verified with an extensive validation process that considers relevant model architectures. Additionally, our results highlight that weight sampling is no longer the BNN inference performance bottleneck, shifting the primary limiting factor to control overhead.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('866','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_866\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Bayesian Neural Networks (BNN) allow prediction uncertainty estimation, making them a more suitable option for safety-critical applications. However, in BNNs, the forward-pass computational cost is significantly higher than in traditional neural networks (NN), due to the overhead generated by weight sampling. This limits their deployment in edge systems. 
This paper presents an optimization that allows using lower-cost Uniform distribution sampling instead of Gaussian sampling during BNN inference. Building upon this optimization, this paper proposes a lightweight RISC-V instruction set architecture extension that accelerates BNN inference by introducing fixed point arithmetic operations and an efficient Uniform random number generator. The flexibility of RISC-V enables such domain-specific acceleration, narrowing the performance gap between NNs and BNNs for edge machine learning workloads. The proposed software and hardware optimizations achieve an average speedup of 8.93\u00d7 while reducing energy consumption per forward pass by 87.12%, increasing image\/J efficiency by 8.19\u00d7. They have been designed to maintain accuracy, calibration, and uncertainty quality, while optimizing execution efficiency. This has been verified with an extensive validation process that considers relevant model architectures. Additionally, our results highlight that weight sampling is no longer the BNN inference performance bottleneck, shifting the primary limiting factor to control overhead.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('866','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_866\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1109\/TCASAI.2025.3625517\" title=\"DOI de seguimiento:10.1109\/TCASAI.2025.3625517\" target=\"_blank\">doi:10.1109\/TCASAI.2025.3625517<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('866','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Bazo, Antonio;  L\u00f3pez-Villellas, Lori\u00e9n;  Mataloni, Matilde;  Bolea-Fernandez, Eduardo;  
Rua-Ibarz, Ana;  Grotti, Marco;  Aramend\u00eda, Maite;  Resano, Mart\u00edn<\/p><p class=\"tp_pub_title\">Improving detection and figures of merit in single-particle inductively coupled plasma-mass spectrometry via transient event heights <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">Analytica Chimica Acta, <\/span><span class=\"tp_pub_additional_pages\">pp. 344694, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_881\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('881','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_881\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{bazo2025improving,<br \/>\r\ntitle = {Improving detection and figures of merit in single-particle inductively coupled plasma-mass spectrometry via transient event heights},<br \/>\r\nauthor = {Antonio Bazo and Lori\u00e9n L\u00f3pez-Villellas and Matilde Mataloni and Eduardo Bolea-Fernandez and Ana Rua-Ibarz and Marco Grotti and Maite Aramend\u00eda and Mart\u00edn Resano},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\njournal = {Analytica Chimica Acta},<br \/>\r\npages = {344694},<br \/>\r\npublisher = {Elsevier},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('881','tp_bibtex')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> L\u00f3pez-Villellas, Lori\u00e9n;  I\u00f1iguez, Cristian;  Jim\u00e9nez-Blanco, Albert;  Aguado-Puig, Quim;  Moret\u00f3, Miquel;  
Alastruey-Bened\u00e9, Jes\u00fas;  Ib\u00e1\u00f1ez, Pablo;  Marco-Sola, Santiago<\/p><p class=\"tp_pub_title\">Singletrack: An Algorithm for Improving Memory Consumption and Performance of Gap-Affine Sequence Alignment <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">bioRxiv, <\/span><span class=\"tp_pub_additional_pages\">pp. 2025\u201310, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_882\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('882','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_882\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{lopez2025singletrack,<br \/>\r\ntitle = {Singletrack: An Algorithm for Improving Memory Consumption and Performance of Gap-Affine Sequence Alignment},<br \/>\r\nauthor = {Lori\u00e9n L\u00f3pez-Villellas and Cristian I\u00f1iguez and Albert Jim\u00e9nez-Blanco and Quim Aguado-Puig and Miquel Moret\u00f3 and Jes\u00fas Alastruey-Bened\u00e9 and Pablo Ib\u00e1\u00f1ez and Santiago Marco-Sola},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\njournal = {bioRxiv},<br \/>\r\npages = {2025\u201310},<br \/>\r\npublisher = {Cold Spring Harbor Laboratory},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('882','tp_bibtex')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Mikkelsen, Carl Christian Kjelgaard;  L\u00f3pez-Villellas, Lori\u00e9n<\/p><p class=\"tp_pub_title\">How Accurate is 
Richardson&#8217;s Error Estimate? <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">Concurrency and Computation: Practice and Experience, <\/span><span class=\"tp_pub_additional_volume\">vol. 37, <\/span><span class=\"tp_pub_additional_number\">no 27-28, <\/span><span class=\"tp_pub_additional_pages\">pp. e70305, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_883\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('883','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_883\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{kjelgaard2025accurate,<br \/>\r\ntitle = {How Accurate is Richardson's Error Estimate?},<br \/>\r\nauthor = {Carl Christian Kjelgaard Mikkelsen and Lori\u00e9n L\u00f3pez-Villellas},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\njournal = {Concurrency and Computation: Practice and Experience},<br \/>\r\nvolume = {37},<br \/>\r\nnumber = {27-28},<br \/>\r\npages = {e70305},<br \/>\r\npublisher = {John Wiley & Sons, Inc. 
Hoboken, USA},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('883','tp_bibtex')\">Cerrar<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_inproceedings\">Proceedings Articles<\/h3><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Soria-Pardos, V\u00edctor;  Armejach, Adri\u00e0;  M\u00fcck, Tiago;  Gracia, Dar\u00edo Su\u00e1rez;  Joao, Jose;  Moret\u00f3, Miquel<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('892','tp_links')\" style=\"cursor:pointer;\">Delegato: Locality-Aware Atomic Memory Operations on Chiplets<\/a> <span class=\"tp_pub_type tp_  inproceedings\">Proceedings Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_booktitle\">Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture, <\/span><span class=\"tp_pub_additional_pages\">pp. 
1793\u20131808, <\/span><span class=\"tp_pub_additional_publisher\">ACM, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_892\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('892','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_892\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('892','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_892\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('892','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_892\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{soria2025delegato,<br \/>\r\ntitle = {Delegato: Locality-Aware Atomic Memory Operations on Chiplets},<br \/>\r\nauthor = {V\u00edctor Soria-Pardos and Adri\u00e0 Armejach and Tiago M\u00fcck and Dar\u00edo Su\u00e1rez Gracia and Jose Joao and Miquel Moret\u00f3},<br \/>\r\nurl = {https:\/\/dl.acm.org\/doi\/full\/10.1145\/3725843.3756030},<br \/>\r\ndoi = {10.1145\/3725843.3756030},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\nurldate = {2025-01-01},<br \/>\r\nbooktitle = {Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture},<br \/>\r\npages = {1793\u20131808},<br \/>\r\npublisher = {ACM},<br \/>\r\nabstract = {The irruption of chiplet-based architectures has been a game changer, enabling higher transistor integration and core counts in a single socket. However, chiplets impose higher and non-uniform memory access (NUMA) latencies than monolithic integration. 
This harms the efficiency of atomic memory operations (AMOs), which are fundamental to implementing fine-grained synchronization and concurrent data structures on large systems. AMOs are executed either near the core (near) or at a remote location within the cache hierarchy (far). On near AMOs, the core\u2019s private cache fetches the target cache line in exclusiveness to modify it locally. Near AMOs cause significant data movement between private caches, especially harming parallel applications\u2019 performance on chiplet-based architectures. Alternatively, far AMOs can alleviate the communication overhead by reducing data movement between processing elements. However, current multicore architectures only support one type of far AMO, which sends all updates to a single serialization point (centralized AMOs).<br \/>\r\nThis work introduces two new types of far AMOs, delegated and migrating, that execute AMOs remotely without centralizing updates in a single point of the cache hierarchy. Combining centralized, delegated, and migrating AMOs allows the directory to select the best location to execute AMOs. Moreover, we propose Delegato, a tracing optimization to effectively transport usage information from private caches to the directory to predict the best atomic type to issue accurately. Additionally, we design a simple predictor on top of Delegato that seamlessly selects the best placement to perform AMOs based on the data access pattern and usage activity of cores. 
Our evaluation using gem5 shows that Delegato can speed up applications on average by 1.07 \u00d7 over centralized AMOs and by 1.13 \u00d7 over the state-of-the-art AMO predictor.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('892','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_892\" style=\"display:none;\"><div class=\"tp_abstract_entry\">The irruption of chiplet-based architectures has been a game changer, enabling higher transistor integration and core counts in a single socket. However, chiplets impose higher and non-uniform memory access (NUMA) latencies than monolithic integration. This harms the efficiency of atomic memory operations (AMOs), which are fundamental to implementing fine-grained synchronization and concurrent data structures on large systems. AMOs are executed either near the core (near) or at a remote location within the cache hierarchy (far). On near AMOs, the core\u2019s private cache fetches the target cache line in exclusiveness to modify it locally. Near AMOs cause significant data movement between private caches, especially harming parallel applications\u2019 performance on chiplet-based architectures. Alternatively, far AMOs can alleviate the communication overhead by reducing data movement between processing elements. However, current multicore architectures only support one type of far AMO, which sends all updates to a single serialization point (centralized AMOs).<br \/>\r\nThis work introduces two new types of far AMOs, delegated and migrating, that execute AMOs remotely without centralizing updates in a single point of the cache hierarchy. Combining centralized, delegated, and migrating AMOs allows the directory to select the best location to execute AMOs. 
Moreover, we propose Delegato, a tracing optimization to effectively transport usage information from private caches to the directory to predict the best atomic type to issue accurately. Additionally, we design a simple predictor on top of Delegato that seamlessly selects the best placement to perform AMOs based on the data access pattern and usage activity of cores. Our evaluation using gem5 shows that Delegato can speed up applications on average by 1.07 \u00d7 over centralized AMOs and by 1.13 \u00d7 over the state-of-the-art AMO predictor.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('892','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_892\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dl.acm.org\/doi\/full\/10.1145\/3725843.3756030\" title=\"https:\/\/dl.acm.org\/doi\/full\/10.1145\/3725843.3756030\" target=\"_blank\">https:\/\/dl.acm.org\/doi\/full\/10.1145\/3725843.3756030<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1145\/3725843.3756030\" title=\"DOI de seguimiento:10.1145\/3725843.3756030\" target=\"_blank\">doi:10.1145\/3725843.3756030<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('892','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Soria-Pardos, V\u00edctor;  Armejach, Adri\u00e0;  Su\u00e1rez, Dar\u00edo;  Martinot, Didier;  Grasset, Arnaud;  Moret\u00f3, Miquel<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('895','tp_links')\" style=\"cursor:pointer;\">FLAMA: Architecting floating-point atomic memory operations for heterogeneous HPC systems<\/a> <span class=\"tp_pub_type tp_  
inproceedings\">Proceedings Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_booktitle\">2025 28th Euromicro Conference on Digital System Design (DSD), <\/span><span class=\"tp_pub_additional_pages\">pp. 435\u2013442, <\/span><span class=\"tp_pub_additional_organization\">IEEE <\/span><span class=\"tp_pub_additional_publisher\">IEEE, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_895\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('895','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_895\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('895','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_895\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('895','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_895\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{soria2025flama,<br \/>\r\ntitle = {FLAMA: Architecting floating-point atomic memory operations for heterogeneous HPC systems},<br \/>\r\nauthor = {V\u00edctor Soria-Pardos and Adri\u00e0 Armejach and Dar\u00edo Su\u00e1rez and Didier Martinot and Arnaud Grasset and Miquel Moret\u00f3},<br \/>\r\nurl = {https:\/\/upcommons.upc.edu\/server\/api\/core\/bitstreams\/9199c411-ce89-4327-a06b-bf21838aa8db\/content},<br \/>\r\ndoi = {10.1109\/DSD67783.2025.00066},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-01},<br \/>\r\nurldate = {2025-01-01},<br \/>\r\nbooktitle = {2025 28th Euromicro Conference on Digital System Design (DSD)},<br \/>\r\npages = {435\u2013442},<br \/>\r\npublisher = {IEEE},<br \/>\r\norganization 
= {IEEE},<br \/>\r\nabstract = {Current heterogeneous systems integrate general-purpose Central Processing Units (CPUs), Graphics Processing Units (GPUs), and Neural Processing Units (NPUs). The efficient use of such systems requires a significant programming effort to distribute computation and synchronize across devices, which usually involves using Atomic Memory Operations (AMOs). Arm recently launched a floating-point Atomic Memory Operations (FAMOs) extension to perform atomic updates on floating-point data types specifically. This work characterizes and models heterogeneous architectures to understand how floating-point AMOs impact graph, Machine Learning (ML), and high-performance computing (HPC) workloads. Our analysis shows that many AMOs are performed on floating-point data, which modern systems execute using inefficient compare-and-swap (CAS) constructs. Therefore, replacing CAS-based constructs with FAMOs can improve a wide range of workloads. Moreover, we analyze the trade-offs of executing FAMOs at different memory hierarchy levels, either in private caches (near) or remotely in shared caches (far). We have extended the widely used AMBA CHI protocol to evaluate such FAMO support on a simulated chiplet-based heterogeneous architecture. While near FAMOs achieve an average 1.34\u00d7 speed-up, far FAMOs reach an average 1.58\u00d7 speed-up. 
We conclude that FAMOs can bridge the gap between CPU architecture and accelerators and enabling synchronization in key application domains.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('895','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_895\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Current heterogeneous systems integrate general-purpose Central Processing Units (CPUs), Graphics Processing Units (GPUs), and Neural Processing Units (NPUs). The efficient use of such systems requires a significant programming effort to distribute computation and synchronize across devices, which usually involves using Atomic Memory Operations (AMOs). Arm recently launched a floating-point Atomic Memory Operations (FAMOs) extension to perform atomic updates on floating-point data types specifically. This work characterizes and models heterogeneous architectures to understand how floating-point AMOs impact graph, Machine Learning (ML), and high-performance computing (HPC) workloads. Our analysis shows that many AMOs are performed on floating-point data, which modern systems execute using inefficient compare-and-swap (CAS) constructs. Therefore, replacing CAS-based constructs with FAMOs can improve a wide range of workloads. Moreover, we analyze the trade-offs of executing FAMOs at different memory hierarchy levels, either in private caches (near) or remotely in shared caches (far). We have extended the widely used AMBA CHI protocol to evaluate such FAMO support on a simulated chiplet-based heterogeneous architecture. While near FAMOs achieve an average 1.34\u00d7 speed-up, far FAMOs reach an average 1.58\u00d7 speed-up. 
We conclude that FAMOs can bridge the gap between CPU architecture and accelerators and enabling synchronization in key application domains.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('895','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_895\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/upcommons.upc.edu\/server\/api\/core\/bitstreams\/9199c411-ce89-4327-a06b-bf21838aa8db\/content\" title=\"https:\/\/upcommons.upc.edu\/server\/api\/core\/bitstreams\/9199c411-ce89-4327-a06b-bf2[...]\" target=\"_blank\">https:\/\/upcommons.upc.edu\/server\/api\/core\/bitstreams\/9199c411-ce89-4327-a06b-bf2[&#8230;]<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1109\/DSD67783.2025.00066\" title=\"DOI de seguimiento:10.1109\/DSD67783.2025.00066\" target=\"_blank\">doi:10.1109\/DSD67783.2025.00066<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('895','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_2024\">2024<\/h3><h3 class=\"tp_h3\" id=\"tp_h3_article\">Art\u00edculos de revista<\/h3><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Toca-D\u00edaz, Yamilka;  Tejero, Rub\u00e9n Gran;  Valero, Alejandro<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('850','tp_links')\" style=\"cursor:pointer;\">Shift-and-Safe: Addressing permanent faults in aggressively undervolted CNN accelerators<\/a> <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">Journal of Systems Architecture, <\/span><span 
class=\"tp_pub_additional_volume\">vol. 157, <\/span><span class=\"tp_pub_additional_pages\">pp. 1-13, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>, <span class=\"tp_pub_additional_issn\">ISSN: 1383-7621<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_850\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('850','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_850\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('850','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_850\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('850','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_850\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{Toca-D\u00edaz2024,<br \/>\r\ntitle = {Shift-and-Safe: Addressing permanent faults in aggressively undervolted CNN accelerators},<br \/>\r\nauthor = {Yamilka Toca-D\u00edaz and Rub\u00e9n Gran Tejero and Alejandro Valero},<br \/>\r\nurl = {https:\/\/www.sciencedirect.com\/science\/article\/pii\/S1383762124002297},<br \/>\r\ndoi = {https:\/\/doi.org\/10.1016\/j.sysarc.2024.103292},<br \/>\r\nissn = {1383-7621},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-12-01},<br \/>\r\nurldate = {2024-12-01},<br \/>\r\njournal = {Journal of Systems Architecture},<br \/>\r\nvolume = {157},<br \/>\r\npages = {1-13},<br \/>\r\nabstract = {Underscaling the supply voltage (Vdd) to ultra-low levels below the safe-operation threshold voltage (Vmin) holds promise for substantial power savings in digital CMOS circuits. 
However, these benefits come with pronounced challenges due to the heightened risk of bitcell permanent faults stemming from process variations in current technology node sizes. This work delves into the repercussions of such faults on the accuracy of a 16-bit fixed-point Convolutional Neural Network (CNN) inference accelerator powering on-chip activation memories at ultra-low Vdd voltages. Through an in-depth examination of fault patterns, memory usage, and statistical analysis of activation values, this paper introduces Shift-and-Safe: two novel and cost-effective microarchitectural techniques exploiting the presence of outlier activation values and the underutilization of activation memories. Particularly, activation outliers enable a shift-based data representation that reduces the impact of faults on the activation values, whereas the memory underutilization is exploited to maintain a safe replica of affected activations in idle memory regions. Remarkably, these mechanisms do not add any burden to the programmer and are independent of application characteristics, rendering them easily deployable across real-world CNN accelerators. Experimental results show that Shift-and-Safe maintains the CNN accuracy even in the presence of almost a quarter of the total activations with faults. 
In addition, average energy savings are by 5% and 11% compared to the state-of-the-art approach and a conventional accelerator supplied at Vmin, respectively.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('850','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_850\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Underscaling the supply voltage (Vdd) to ultra-low levels below the safe-operation threshold voltage (Vmin) holds promise for substantial power savings in digital CMOS circuits. However, these benefits come with pronounced challenges due to the heightened risk of bitcell permanent faults stemming from process variations in current technology node sizes. This work delves into the repercussions of such faults on the accuracy of a 16-bit fixed-point Convolutional Neural Network (CNN) inference accelerator powering on-chip activation memories at ultra-low Vdd voltages. Through an in-depth examination of fault patterns, memory usage, and statistical analysis of activation values, this paper introduces Shift-and-Safe: two novel and cost-effective microarchitectural techniques exploiting the presence of outlier activation values and the underutilization of activation memories. Particularly, activation outliers enable a shift-based data representation that reduces the impact of faults on the activation values, whereas the memory underutilization is exploited to maintain a safe replica of affected activations in idle memory regions. Remarkably, these mechanisms do not add any burden to the programmer and are independent of application characteristics, rendering them easily deployable across real-world CNN accelerators. 
Experimental results show that Shift-and-Safe maintains the CNN accuracy even in the presence of almost a quarter of the total activations with faults. In addition, average energy savings are by 5% and 11% compared to the state-of-the-art approach and a conventional accelerator supplied at Vmin, respectively.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('850','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_850\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/www.sciencedirect.com\/science\/article\/pii\/S1383762124002297\" title=\"https:\/\/www.sciencedirect.com\/science\/article\/pii\/S1383762124002297\" target=\"_blank\">https:\/\/www.sciencedirect.com\/science\/article\/pii\/S1383762124002297<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1016\/j.sysarc.2024.103292\" title=\"DOI de seguimiento:10.1016\/j.sysarc.2024.103292\" target=\"_blank\">doi:10.1016\/j.sysarc.2024.103292<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('850','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Toca-D\u00edaz, Yamilka;  Palacios, Reynier Hern\u00e1ndez;  Tejero, Ruben Gran;  Valero, Alejandro<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('851','tp_links')\" style=\"cursor:pointer;\">Flip-and-Patch: A fault-tolerant technique for on-chip memories of CNN accelerators at low supply voltage<\/a> <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span 
class=\"tp_pub_additional_journal\">Microprocessors and Microsystems, <\/span><span class=\"tp_pub_additional_volume\">vol. 106, <\/span><span class=\"tp_pub_additional_pages\">pp. 1-13, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>, <span class=\"tp_pub_additional_issn\">ISSN: 0141-9331<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_851\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('851','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_851\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('851','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_851\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('851','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_851\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{Toca-D\u00edaz2024b,<br \/>\r\ntitle = {Flip-and-Patch: A fault-tolerant technique for on-chip memories of CNN accelerators at low supply voltage},<br \/>\r\nauthor = {Yamilka Toca-D\u00edaz and Reynier Hern\u00e1ndez Palacios and Ruben Gran Tejero and Alejandro Valero},<br \/>\r\nurl = {https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0141933124000188},<br \/>\r\ndoi = {https:\/\/doi.org\/10.1016\/j.micpro.2024.105023},<br \/>\r\nissn = {0141-9331},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-04-01},<br \/>\r\nurldate = {2024-04-01},<br \/>\r\njournal = {Microprocessors and Microsystems},<br \/>\r\nvolume = {106},<br \/>\r\npages = {1-13},<br \/>\r\nabstract = {Aggressively reducing the supply voltage (Vdd) below the safe threshold voltage (Vmin) can effectively lead to significant energy savings in digital circuits. 
However, operating at such low supply voltages poses challenges due to a high occurrence of permanent faults resulting from manufacturing process variations in current technology nodes. This work addresses the impact of permanent faults on the accuracy of a Convolutional Neural Network (CNN) inference accelerator using on-chip activation memories supplied at low Vdd below Vmin. Based on a characterization study of fault patterns, this paper proposes two low-cost microarchitectural techniques, namely Flip-and-Patch, which maintain the original accuracy of CNN applications even in the presence of a high number of faults caused by operating at Vdd &lt; Vmin. Unlike existing techniques, Flip-and-Patch remains transparent to the programmer and does not rely on application characteristics, making it easily applicable to real CNN accelerators.<br \/>\r\nExperimental results show that Flip-and-Patch ensures the original CNN accuracy with a minimal impact on system performance (less than 0.05% for every application), while achieving average energy savings of 10.5% and 46.6% in activation memories compared to a conventional accelerator operating at safe and nominal supply voltages, respectively. Compared to the state-of-the-art ThUnderVolt technique, which dynamically adjusts the supply voltage at run time and discarding any energy overhead for such an approach, the average energy savings are by 3.2%.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('851','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_851\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Aggressively reducing the supply voltage (Vdd) below the safe threshold voltage (Vmin) can effectively lead to significant energy savings in digital circuits. 
However, operating at such low supply voltages poses challenges due to a high occurrence of permanent faults resulting from manufacturing process variations in current technology nodes. This work addresses the impact of permanent faults on the accuracy of a Convolutional Neural Network (CNN) inference accelerator using on-chip activation memories supplied at low Vdd below Vmin. Based on a characterization study of fault patterns, this paper proposes two low-cost microarchitectural techniques, namely Flip-and-Patch, which maintain the original accuracy of CNN applications even in the presence of a high number of faults caused by operating at Vdd &lt; Vmin. Unlike existing techniques, Flip-and-Patch remains transparent to the programmer and does not rely on application characteristics, making it easily applicable to real CNN accelerators.<br \/>\r\nExperimental results show that Flip-and-Patch ensures the original CNN accuracy with a minimal impact on system performance (less than 0.05% for every application), while achieving average energy savings of 10.5% and 46.6% in activation memories compared to a conventional accelerator operating at safe and nominal supply voltages, respectively. 
Compared to the state-of-the-art ThUnderVolt technique, which dynamically adjusts the supply voltage at run time and discarding any energy overhead for such an approach, the average energy savings are by 3.2%.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('851','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_851\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0141933124000188\" title=\"https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0141933124000188\" target=\"_blank\">https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0141933124000188<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1016\/j.micpro.2024.105023\" title=\"DOI de seguimiento:10.1016\/j.micpro.2024.105023\" target=\"_blank\">doi:10.1016\/j.micpro.2024.105023<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('851','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> L\u00f3pez-Villellas, Lori\u00e9n;  Langarita-Ben\u00edtez, Rub\u00e9n;  Badouh, Asaf;  Soria-Pardos, V\u00edctor;  Aguado-Puig, Quim;  L\u00f3pez-Parad\u00eds, Guillem;  Doblas, Max;  Setoain, Javier;  Kim, Chulho;  Ono, Makoto;  Armejach, Adri\u00e0;  Marco-Sola, Santiago;  Alastruey-Bened\u00e9, Jes\u00fas;  Ib\u00e1\u00f1ez, Pablo;  Moret\u00f3, Miquel<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('844','tp_links')\" style=\"cursor:pointer;\">GenArchBench: A genomics benchmark suite for arm HPC processors<\/a> <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> 
<\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">Future Generation Computer Systems, <\/span><span class=\"tp_pub_additional_volume\">vol. 157, <\/span><span class=\"tp_pub_additional_pages\">pp. 313-329, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>, <span class=\"tp_pub_additional_issn\">ISSN: 0167-739X<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_844\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('844','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_844\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('844','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_844\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('844','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_844\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{LOPEZVILLELLAS2024313,<br \/>\r\ntitle = {GenArchBench: A genomics benchmark suite for arm HPC processors},<br \/>\r\nauthor = {Lori\u00e9n L\u00f3pez-Villellas and Rub\u00e9n Langarita-Ben\u00edtez and Asaf Badouh and V\u00edctor Soria-Pardos and Quim Aguado-Puig and Guillem L\u00f3pez-Parad\u00eds and Max Doblas and Javier Setoain and Chulho Kim and Makoto Ono and Adri\u00e0 Armejach and Santiago Marco-Sola and Jes\u00fas Alastruey-Bened\u00e9 and Pablo Ib\u00e1\u00f1ez and Miquel Moret\u00f3},<br \/>\r\nurl = {https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167739X24001250},<br \/>\r\ndoi = {https:\/\/doi.org\/10.1016\/j.future.2024.03.050},<br \/>\r\nissn = {0167-739X},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-01-01},<br \/>\r\njournal = {Future Generation Computer 
Systems},<br \/>\r\nvolume = {157},<br \/>\r\npages = {313-329},<br \/>\r\nabstract = {Arm usage has substantially grown in the High-Performance Computing (HPC) community. Japanese supercomputer Fugaku, powered by Arm-based A64FX processors, held the top position on the Top500 list between June 2020 and June 2022, currently sitting in the fourth position. The recently released 7th generation of Amazon EC2 instances for compute-intensive workloads (C7 g) is also powered by Arm Graviton3 processors. Projects like European Mont-Blanc and U.S. DOE\/NNSA Astra are further examples of Arm irruption in HPC. In parallel, over the last decade, the rapid improvement of genomic sequencing technologies and the exponential growth of sequencing data has placed a significant bottleneck on the computational side. While most genomics applications have been thoroughly tested and optimized for x86 systems, just a few are prepared to perform efficiently on Arm machines. Moreover, these applications do not exploit the newly introduced Scalable Vector Extensions (SVE). This paper presents GenArchBench, the first genome analysis benchmark suite targeting Arm architectures. We have selected computationally demanding kernels from the most widely used tools in genome data analysis and ported them to Arm-based A64FX and Graviton3 processors. Overall, the GenArch benchmark suite comprises 13 multi-core kernels from critical stages of widely-used genome analysis pipelines, including base-calling, read mapping, variant calling, and genome assembly. Our benchmark suite includes different input data sets per kernel (small and large), each with a corresponding regression test to verify the correctness of each execution automatically. Moreover, the porting features the usage of the novel Arm SVE instructions, algorithmic and code optimizations, and the exploitation of Arm-optimized libraries. 
We present the optimizations implemented in each kernel and a detailed performance evaluation and comparison of their performance on four different HPC machines (i.e., A64FX, Graviton3, Intel Xeon Skylake Platinum, and AMD EPYC Rome). Overall, the experimental evaluation shows that Graviton3 outperforms other machines on average. Moreover, we observed that the performance of the A64FX is significantly constrained by its small memory hierarchy and latencies. Additionally, as proof of concept, we study the performance of a production-ready tool that exploits two of the ported and optimized genomic kernels.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('844','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_844\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Arm usage has substantially grown in the High-Performance Computing (HPC) community. Japanese supercomputer Fugaku, powered by Arm-based A64FX processors, held the top position on the Top500 list between June 2020 and June 2022, currently sitting in the fourth position. The recently released 7th generation of Amazon EC2 instances for compute-intensive workloads (C7 g) is also powered by Arm Graviton3 processors. Projects like European Mont-Blanc and U.S. DOE\/NNSA Astra are further examples of Arm irruption in HPC. In parallel, over the last decade, the rapid improvement of genomic sequencing technologies and the exponential growth of sequencing data has placed a significant bottleneck on the computational side. While most genomics applications have been thoroughly tested and optimized for x86 systems, just a few are prepared to perform efficiently on Arm machines. Moreover, these applications do not exploit the newly introduced Scalable Vector Extensions (SVE). 
This paper presents GenArchBench, the first genome analysis benchmark suite targeting Arm architectures. We have selected computationally demanding kernels from the most widely used tools in genome data analysis and ported them to Arm-based A64FX and Graviton3 processors. Overall, the GenArch benchmark suite comprises 13 multi-core kernels from critical stages of widely-used genome analysis pipelines, including base-calling, read mapping, variant calling, and genome assembly. Our benchmark suite includes different input data sets per kernel (small and large), each with a corresponding regression test to verify the correctness of each execution automatically. Moreover, the porting features the usage of the novel Arm SVE instructions, algorithmic and code optimizations, and the exploitation of Arm-optimized libraries. We present the optimizations implemented in each kernel and a detailed performance evaluation and comparison of their performance on four different HPC machines (i.e., A64FX, Graviton3, Intel Xeon Skylake Platinum, and AMD EPYC Rome). Overall, the experimental evaluation shows that Graviton3 outperforms other machines on average. Moreover, we observed that the performance of the A64FX is significantly constrained by its small memory hierarchy and latencies. 
Additionally, as proof of concept, we study the performance of a production-ready tool that exploits two of the ported and optimized genomic kernels.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('844','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_844\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167739X24001250\" title=\"https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167739X24001250\" target=\"_blank\">https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167739X24001250<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1016\/j.future.2024.03.050\" title=\"DOI de seguimiento:10.1016\/j.future.2024.03.050\" target=\"_blank\">doi:10.1016\/j.future.2024.03.050<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('844','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Torres-Mac\u00edas, A. G.;  Ram\u00edrez-Trevi\u00f1o, A.;  Briz, J. L.;  Segarra, J.;  Blanco-Alcaine, H.<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('873','tp_links')\" style=\"cursor:pointer;\">Modeling Time-Sensitive Networking Using Timed Continuous Petri Nets<\/a> <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">IFAC-PapersOnLine, <\/span><span class=\"tp_pub_additional_volume\">vol. 58, <\/span><span class=\"tp_pub_additional_number\">no 1, <\/span><span class=\"tp_pub_additional_pages\">pp. 
300-305, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_resource_link\"><a id=\"tp_links_sh_873\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('873','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_873\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('873','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_873\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{Tor:24a,<br \/>\r\ntitle = {Modeling Time-Sensitive Networking Using Timed Continuous Petri Nets},<br \/>\r\nauthor = {A. G. Torres-Mac\u00edas and A. Ram\u00edrez-Trevi\u00f1o and J. L. Briz and J. Segarra and H. Blanco-Alcaine},<br \/>\r\nurl = {https:\/\/doi.org\/10.1016\/j.ifacol.2024.07.051},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-01-01},<br \/>\r\nurldate = {2024-01-01},<br \/>\r\njournal = {IFAC-PapersOnLine},<br \/>\r\nvolume = {58},<br \/>\r\nnumber = {1},<br \/>\r\npages = {300-305},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('873','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_873\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/doi.org\/10.1016\/j.ifacol.2024.07.051\" title=\"https:\/\/doi.org\/10.1016\/j.ifacol.2024.07.051\" target=\"_blank\">https:\/\/doi.org\/10.1016\/j.ifacol.2024.07.051<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('873','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div 
class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Torres-Mac\u00edas, A. G.;  Segarra, J.;  Briz, J. L.;  Ram\u00edrez-Trevi\u00f1o, A.;  Blanco-Alcaine, H.<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('874','tp_links')\" style=\"cursor:pointer;\">Fast IEEE802.1Qbv Gate Scheduling Through Integer Linear Programming<\/a> <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">IEEE Access, <\/span><span class=\"tp_pub_additional_volume\">vol. 12, <\/span><span class=\"tp_pub_additional_pages\">pp. 111239-111250, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_resource_link\"><a id=\"tp_links_sh_874\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('874','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_874\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('874','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_874\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{Tor:24b,<br \/>\r\ntitle = {Fast IEEE802.1Qbv Gate Scheduling Through Integer Linear Programming},<br \/>\r\nauthor = {A. G. Torres-Mac\u00edas and J. Segarra and J. L. Briz and A. Ram\u00edrez-Trevi\u00f1o and H. 
Blanco-Alcaine},<br \/>\r\nurl = {https:\/\/doi.org\/10.1109\/ACCESS.2024.3440828},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-01-01},<br \/>\r\njournal = {IEEE Access},<br \/>\r\nvolume = {12},<br \/>\r\npages = {111239-111250},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('874','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_874\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/doi.org\/10.1109\/ACCESS.2024.3440828\" title=\"https:\/\/doi.org\/10.1109\/ACCESS.2024.3440828\" target=\"_blank\">https:\/\/doi.org\/10.1109\/ACCESS.2024.3440828<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('874','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Mikkelsen, Carl Christian Kjelgaard;  L\u00f3pez-Villellas, Lori\u00e9n;  Garc\u00eda-Risue\u00f1o, Pablo<\/p><p class=\"tp_pub_title\">Newton&#8217;s method revisited: How accurate do we have to be? <span class=\"tp_pub_type tp_  article\">Art\u00edculo de revista<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_journal\">Concurrency and Computation: Practice and Experience, <\/span><span class=\"tp_pub_additional_volume\">vol. 36, <\/span><span class=\"tp_pub_additional_number\">no 10, <\/span><span class=\"tp_pub_additional_pages\">pp. 
e7853, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_877\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('877','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_877\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{kjelgaard2024newton,<br \/>\r\ntitle = {Newton's method revisited: How accurate do we have to be?},<br \/>\r\nauthor = {Carl Christian Kjelgaard Mikkelsen and Lori\u00e9n L\u00f3pez-Villellas and Pablo Garc\u00eda-Risue\u00f1o},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-01-01},<br \/>\r\njournal = {Concurrency and Computation: Practice and Experience},<br \/>\r\nvolume = {36},<br \/>\r\nnumber = {10},<br \/>\r\npages = {e7853},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('877','tp_bibtex')\">Cerrar<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_inproceedings\">Proceedings Articles<\/h3><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Toca-D\u00edaz, Yamilka;  Tejero, Rub\u00e9n Gran;  Valero, Alejandro<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('852','tp_links')\" style=\"cursor:pointer;\">Ensuring the Accuracy of CNN Accelerators Supplied at Ultra-Low Voltage<\/a> <span class=\"tp_pub_type tp_  inproceedings\">Proceedings Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_pages\">pp. 
92-95, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>, <span class=\"tp_pub_additional_isbn\">ISBN: 979-8-3503-8040-8<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_852\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('852','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_852\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('852','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_852\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('852','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_852\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{Toca-D\u00edaz2024c,<br \/>\r\ntitle = {Ensuring the Accuracy of CNN Accelerators Supplied at Ultra-Low Voltage},<br \/>\r\nauthor = {Yamilka Toca-D\u00edaz and Rub\u00e9n Gran Tejero and Alejandro Valero},<br \/>\r\nurl = {https:\/\/ieeexplore.ieee.org\/document\/10817950},<br \/>\r\ndoi = {https:\/\/doi.org\/10.1109\/ICCD63220.2024.00024},<br \/>\r\nisbn = {979-8-3503-8040-8},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-11-18},<br \/>\r\nurldate = {2024-11-18},<br \/>\r\njournal = {Proceedings of the 42nd IEEE International Conference on Computer Design (ICCD 2024)},<br \/>\r\npages = {92-95},<br \/>\r\nabstract = {Underscaling the supply voltage (Vdd) to ultra-low levels below the safe-operation threshold voltage (Vmin) brings significant energy savings in digital CMOS circuits but introduces reliability challenges due to increased risk of bitcell permanent faults. This work explores the impact of such faults on the accuracy of a CNN inference accelerator supplying on-chip activation memories at ultra-low Vdd. 
By examining fault patterns, activation values, and memory usage, this paper proposes two microarchitectural techniques exploiting activation outliers and activation memory underutilization. These approaches are cost-effective, do not require programmer intervention, and are application-independent. Experimental results show that the proposed approaches maintain the original CNN accuracy and achieve energy savings by 2.1 % and 8.2 % compared to the state-of-the-art technique and a conventional accelerator supplied at Vmin, respectively, with a negligible impact on the system performance (less than 0.25 %).},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('852','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_852\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Underscaling the supply voltage (Vdd) to ultra-low levels below the safe-operation threshold voltage (Vmin) brings significant energy savings in digital CMOS circuits but introduces reliability challenges due to increased risk of bitcell permanent faults. This work explores the impact of such faults on the accuracy of a CNN inference accelerator supplying on-chip activation memories at ultra-low Vdd. By examining fault patterns, activation values, and memory usage, this paper proposes two microarchitectural techniques exploiting activation outliers and activation memory underutilization. These approaches are cost-effective, do not require programmer intervention, and are application-independent. 
Experimental results show that the proposed approaches maintain the original CNN accuracy and achieve energy savings by 2.1 % and 8.2 % compared to the state-of-the-art technique and a conventional accelerator supplied at Vmin, respectively, with a negligible impact on the system performance (less than 0.25 %).<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('852','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_852\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"fas fa-globe\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/ieeexplore.ieee.org\/document\/10817950\" title=\"https:\/\/ieeexplore.ieee.org\/document\/10817950\" target=\"_blank\">https:\/\/ieeexplore.ieee.org\/document\/10817950<\/a><\/li><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1109\/ICCD63220.2024.00024\" title=\"DOI de seguimiento:10.1109\/ICCD63220.2024.00024\" target=\"_blank\">doi:10.1109\/ICCD63220.2024.00024<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('852','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> P\u00e9rez, Samuel;  Resano, Javier;  Gracia, Dar\u00edo Su\u00e1rez<\/p><p class=\"tp_pub_title\"><a class=\"tp_title_link\" onclick=\"teachpress_pub_showhide('872','tp_links')\" style=\"cursor:pointer;\">Accelerating Bayesian Neural Networks on Low-Power Edge RISC-V Processors<\/a> <span class=\"tp_pub_type tp_  inproceedings\">Proceedings Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_booktitle\">2024 IEEE 24th International Conference on Nanotechnology (NANO), <\/span><span class=\"tp_pub_additional_pages\">pp. 
507-512, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>, <span class=\"tp_pub_additional_issn\">ISSN: 1944-9380<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_872\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('872','tp_abstract')\" title=\"Mostrar resumen\" style=\"cursor:pointer;\">Resumen<\/a><\/span> | <span class=\"tp_resource_link\"><a id=\"tp_links_sh_872\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('872','tp_links')\" title=\"Mostrar enlaces y recursos\" style=\"cursor:pointer;\">Enlaces<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_872\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('872','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_872\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{10628877,<br \/>\r\ntitle = {Accelerating Bayesian Neural Networks on Low-Power Edge RISC-V Processors},<br \/>\r\nauthor = {Samuel P\u00e9rez and Javier Resano and Dar\u00edo Su\u00e1rez Gracia},<br \/>\r\ndoi = {10.1109\/NANO61778.2024.10628877},<br \/>\r\nissn = {1944-9380},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-07-01},<br \/>\r\nbooktitle = {2024 IEEE 24th International Conference on Nanotechnology (NANO)},<br \/>\r\npages = {507-512},<br \/>\r\nabstract = {Neural Networks (NNs) are a very popular solution for classification tasks. As the combination of Internet of Things (IoT) with Machine Learning (ML), also known as TinyML, grows in popularity, more NNs are being executed on low-end edge systems. The reliability of the predictions is crucial for safety-critical applications. Bayesian Neural Networks (BNNs) address this issue by calculating uncertainty metrics with their predictions at the cost of increasing computing requirements. This work addresses the challenges of executing BNNs inference on low-end systems. 
BNNs require multiple forward passes in which the weights are sampled from distributions. This sampling process can take up to 85.13% of execution time. This work optimizes the weight sampling and integrates it within a low cost custom extension for a RISC-V CPU, improving speedup up to 8.10x and similar energy savings.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('872','tp_bibtex')\">Cerrar<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_872\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Neural Networks (NNs) are a very popular solution for classification tasks. As the combination of Internet of Things (IoT) with Machine Learning (ML), also known as TinyML, grows in popularity, more NNs are being executed on low-end edge systems. The reliability of the predictions is crucial for safety-critical applications. Bayesian Neural Networks (BNNs) address this issue by calculating uncertainty metrics with their predictions at the cost of increasing computing requirements. This work addresses the challenges of executing BNNs inference on low-end systems. BNNs require multiple forward passes in which the weights are sampled from distributions. This sampling process can take up to 85.13% of execution time. 
This work optimizes the weight sampling and integrates it within a low cost custom extension for a RISC-V CPU, improving speedup up to 8.10x and similar energy savings.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('872','tp_abstract')\">Cerrar<\/a><\/p><\/div><div class=\"tp_links\" id=\"tp_links_872\" style=\"display:none;\"><div class=\"tp_links_entry\"><ul class=\"tp_pub_list\"><li><i class=\"ai ai-doi\"><\/i><a class=\"tp_pub_list\" href=\"https:\/\/dx.doi.org\/10.1109\/NANO61778.2024.10628877\" title=\"DOI de seguimiento:10.1109\/NANO61778.2024.10628877\" target=\"_blank\">doi:10.1109\/NANO61778.2024.10628877<\/a><\/li><\/ul><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('872','tp_links')\">Cerrar<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\"> Mikkelsen, Carl Christian Kjelgaard;  L\u00f3pez-Villellas, Lori\u00e9n<\/p><p class=\"tp_pub_title\">The need for accuracy and smoothness in numerical simulations <span class=\"tp_pub_type tp_  inproceedings\">Proceedings Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">En: <\/span><span class=\"tp_pub_additional_booktitle\">International Conference on Parallel Processing and Applied Mathematics, <\/span><span class=\"tp_pub_additional_pages\">pp. 
3\u201316, <\/span><span class=\"tp_pub_additional_organization\">Springer <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_880\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('880','tp_bibtex')\" title=\"Mostrar entrada BibTeX \" style=\"cursor:pointer;\">BibTeX<\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_880\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{kjelgaard2024need,<br \/>\r\ntitle = {The need for accuracy and smoothness in numerical simulations},<br \/>\r\nauthor = {Carl Christian Kjelgaard Mikkelsen and Lori\u00e9n L\u00f3pez-Villellas},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-01-01},<br \/>\r\nbooktitle = {International Conference on Parallel Processing and Applied Mathematics},<br \/>\r\npages = {3\u201316},<br \/>\r\norganization = {Springer},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('880','tp_bibtex')\">Cerrar<\/a><\/p><\/div><\/div><\/div><\/div><div class=\"tablenav\"><div class=\"tablenav-pages\"><span class=\"displaying-num\">405 registros<\/span> <a class=\"page-numbers button disabled\">&laquo;<\/a> <a class=\"page-numbers button disabled\">&lsaquo;<\/a> 1 de 21 <a href=\"https:\/\/gaz.i3a.es\/es\/publications\/?limit=2&amp;tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=&amp;tsr=#tppubs\" title=\"p\u00e1gina siguiente\" class=\"page-numbers button\">&rsaquo;<\/a> <a href=\"https:\/\/gaz.i3a.es\/es\/publications\/?limit=21&amp;tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=&amp;tsr=#tppubs\" title=\"\u00faltima p\u00e1gina\" class=\"page-numbers button\">&raquo;<\/a> 
<\/div><\/div><\/div>\n<\/div>\n<\/div><\/div><\/div><\/div><\/div>","protected":false},"excerpt":{"rendered":"","protected":false},"author":1,"featured_media":0,"parent":0,"menu_order":0,"comment_status":"closed","ping_status":"closed","template":"","meta":{"footnotes":""},"class_list":["post-2608","page","type-page","status-publish"],"_links":{"self":[{"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/pages\/2608","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/pages"}],"about":[{"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/types\/page"}],"author":[{"embeddable":true,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/comments?post=2608"}],"version-history":[{"count":17,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/pages\/2608\/revisions"}],"predecessor-version":[{"id":3742,"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/pages\/2608\/revisions\/3742"}],"wp:attachment":[{"href":"https:\/\/gaz.i3a.es\/es\/wp-json\/wp\/v2\/media?parent=2608"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}