2025
Journal Articles
Valero, Alejandro; Lorente, Vicente; Petit, Salvador; Sahuquillo, Julio
Dual Fast-Track Cache: Organizing Ring-Shaped Racetracks to Work as L1 Caches Journal Article
In: IEEE Transactions on Computers, vol. 74, no. 8, pp. 2812-2826, 2025, ISSN: 0018-9340.
@article{Valero2025,
  title = {Dual Fast-Track Cache: Organizing Ring-Shaped Racetracks to Work as L1 Caches},
  author = {Alejandro Valero and Vicente Lorente and Salvador Petit and Julio Sahuquillo},
  url = {https://www.computer.org/csdl/journal/tc/2025/08/11022726/27fzlt4rw88},
  doi = {10.1109/TC.2025.3575909},
  issn = {0018-9340},
  year = {2025},
  date = {2025-08-01},
  urldate = {2025-08-01},
  journal = {IEEE Transactions on Computers},
  volume = {74},
  number = {8},
  pages = {2812--2826},
  abstract = {Static Random-Access Memory (SRAM) is the fastest memory technology and has been the common design choice for implementing first-level (L1) caches in the processor pipeline, where speed is a key design issue that must be fulfilled. On the contrary, this technology offers much lower density compared to other technologies like Dynamic RAM, limiting L1 cache sizes of modern processors to a few tens of KB. This paper explores the use of slower but denser Domain Wall Memory (DWM) technology for L1 caches. This technology provides slow access times since it arranges multiple bits sequentially in a magnetic racetrack. To access these bits, they need to be shifted in order to place them under a header. A 1-bit shift usually takes one processor cycle, which can significantly hurt the application performance, making this working behavior inappropriate for L1 caches. Based on the locality (temporal and spatial) principles exploited by caches, this work proposes the Dual Fast-Track Cache (Dual FTC) design, a new approach to organizing a set of racetracks to build set-associative caches. Compared to a conventional SRAM cache, Dual FTC enhances storage capacity by 5× while incurring minimal shifting overhead, thereby rendering it a practical and appealing solution for L1 cache implementations. Experimental results show that the devised cache organization is as fast as an SRAM cache for 78% and 86% of the L1 data cache hits and L1 instruction cache hits, respectively (i.e., no shift is required). Consequently, due to the larger L1 cache capacities, significant system performance gains (by 22% on average) are obtained under the same silicon area.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
2024
Journal Articles
Toca-Díaz, Yamilka; Tejero, Rubén Gran; Valero, Alejandro
Shift-and-Safe: Addressing permanent faults in aggressively undervolted CNN accelerators Journal Article
In: Journal of Systems Architecture, vol. 157, pp. 1-13, 2024, ISSN: 1383-7621.
@article{Toca-Díaz2024,
  title = {Shift-and-Safe: Addressing permanent faults in aggressively undervolted CNN accelerators},
  author = {Yamilka Toca-Díaz and Rubén Gran Tejero and Alejandro Valero},
  url = {https://www.sciencedirect.com/science/article/pii/S1383762124002297},
  doi = {10.1016/j.sysarc.2024.103292},
  issn = {1383-7621},
  year = {2024},
  date = {2024-12-01},
  urldate = {2024-12-01},
  journal = {Journal of Systems Architecture},
  volume = {157},
  pages = {1--13},
  abstract = {Underscaling the supply voltage (Vdd) to ultra-low levels below the safe-operation threshold voltage (Vmin) holds promise for substantial power savings in digital CMOS circuits. However, these benefits come with pronounced challenges due to the heightened risk of bitcell permanent faults stemming from process variations in current technology node sizes. This work delves into the repercussions of such faults on the accuracy of a 16-bit fixed-point Convolutional Neural Network (CNN) inference accelerator powering on-chip activation memories at ultra-low Vdd voltages. Through an in-depth examination of fault patterns, memory usage, and statistical analysis of activation values, this paper introduces Shift-and-Safe: two novel and cost-effective microarchitectural techniques exploiting the presence of outlier activation values and the underutilization of activation memories. Particularly, activation outliers enable a shift-based data representation that reduces the impact of faults on the activation values, whereas the memory underutilization is exploited to maintain a safe replica of affected activations in idle memory regions. Remarkably, these mechanisms do not add any burden to the programmer and are independent of application characteristics, rendering them easily deployable across real-world CNN accelerators. Experimental results show that Shift-and-Safe maintains the CNN accuracy even in the presence of almost a quarter of the total activations with faults. In addition, average energy savings are by 5% and 11% compared to the state-of-the-art approach and a conventional accelerator supplied at Vmin, respectively.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Toca-Díaz, Yamilka; Palacios, Reynier Hernández; Tejero, Rubén Gran; Valero, Alejandro
Flip-and-Patch: A fault-tolerant technique for on-chip memories of CNN accelerators at low supply voltage Journal Article
In: Microprocessors and Microsystems, vol. 106, pp. 1-13, 2024, ISSN: 0141-9331.
@article{Toca-Díaz2024b,
  title = {Flip-and-Patch: A fault-tolerant technique for on-chip memories of CNN accelerators at low supply voltage},
  author = {Yamilka Toca-Díaz and Reynier Hernández Palacios and Rubén Gran Tejero and Alejandro Valero},
  url = {https://www.sciencedirect.com/science/article/pii/S0141933124000188},
  doi = {10.1016/j.micpro.2024.105023},
  issn = {0141-9331},
  year = {2024},
  date = {2024-04-01},
  urldate = {2024-04-01},
  journal = {Microprocessors and Microsystems},
  volume = {106},
  pages = {1--13},
  abstract = {Aggressively reducing the supply voltage (Vdd) below the safe threshold voltage (Vmin) can effectively lead to significant energy savings in digital circuits. However, operating at such low supply voltages poses challenges due to a high occurrence of permanent faults resulting from manufacturing process variations in current technology nodes. This work addresses the impact of permanent faults on the accuracy of a Convolutional Neural Network (CNN) inference accelerator using on-chip activation memories supplied at low Vdd below Vmin. Based on a characterization study of fault patterns, this paper proposes two low-cost microarchitectural techniques, namely Flip-and-Patch, which maintain the original accuracy of CNN applications even in the presence of a high number of faults caused by operating at Vdd < Vmin. Unlike existing techniques, Flip-and-Patch remains transparent to the programmer and does not rely on application characteristics, making it easily applicable to real CNN accelerators.
Experimental results show that Flip-and-Patch ensures the original CNN accuracy with a minimal impact on system performance (less than 0.05% for every application), while achieving average energy savings of 10.5% and 46.6% in activation memories compared to a conventional accelerator operating at safe and nominal supply voltages, respectively. Compared to the state-of-the-art ThUnderVolt technique, which dynamically adjusts the supply voltage at run time and discarding any energy overhead for such an approach, the average energy savings are by 3.2%.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Experimental results show that Flip-and-Patch ensures the original CNN accuracy with a minimal impact on system performance (less than 0.05% for every application), while achieving average energy savings of 10.5% and 46.6% in activation memories compared to a conventional accelerator operating at safe and nominal supply voltages, respectively. Compared to the state-of-the-art ThUnderVolt technique, which dynamically adjusts the supply voltage at run time and discarding any energy overhead for such an approach, the average energy savings are by 3.2%.
López-Villellas, Lorién; Langarita-Benítez, Rubén; Badouh, Asaf; Soria-Pardos, Víctor; Aguado-Puig, Quim; López-Paradís, Guillem; Doblas, Max; Setoain, Javier; Kim, Chulho; Ono, Makoto; Armejach, Adrià; Marco-Sola, Santiago; Alastruey-Benedé, Jesús; Ibáñez, Pablo; Moretó, Miquel
GenArchBench: A genomics benchmark suite for arm HPC processors Journal Article
In: Future Generation Computer Systems, vol. 157, pp. 313-329, 2024, ISSN: 0167-739X.
@article{LOPEZVILLELLAS2024313,
  title = {GenArchBench: A genomics benchmark suite for arm HPC processors},
  author = {Lorién López-Villellas and Rubén Langarita-Benítez and Asaf Badouh and Víctor Soria-Pardos and Quim Aguado-Puig and Guillem López-Paradís and Max Doblas and Javier Setoain and Chulho Kim and Makoto Ono and Adrià Armejach and Santiago Marco-Sola and Jesús Alastruey-Benedé and Pablo Ibáñez and Miquel Moretó},
  url = {https://www.sciencedirect.com/science/article/pii/S0167739X24001250},
  doi = {10.1016/j.future.2024.03.050},
  issn = {0167-739X},
  year = {2024},
  date = {2024-01-01},
  journal = {Future Generation Computer Systems},
  volume = {157},
  pages = {313--329},
  abstract = {Arm usage has substantially grown in the High-Performance Computing (HPC) community. Japanese supercomputer Fugaku, powered by Arm-based A64FX processors, held the top position on the Top500 list between June 2020 and June 2022, currently sitting in the fourth position. The recently released 7th generation of Amazon EC2 instances for compute-intensive workloads (C7g) is also powered by Arm Graviton3 processors. Projects like European Mont-Blanc and U.S. DOE/NNSA Astra are further examples of Arm irruption in HPC. In parallel, over the last decade, the rapid improvement of genomic sequencing technologies and the exponential growth of sequencing data has placed a significant bottleneck on the computational side. While most genomics applications have been thoroughly tested and optimized for x86 systems, just a few are prepared to perform efficiently on Arm machines. Moreover, these applications do not exploit the newly introduced Scalable Vector Extensions (SVE). This paper presents GenArchBench, the first genome analysis benchmark suite targeting Arm architectures. We have selected computationally demanding kernels from the most widely used tools in genome data analysis and ported them to Arm-based A64FX and Graviton3 processors. Overall, the GenArch benchmark suite comprises 13 multi-core kernels from critical stages of widely-used genome analysis pipelines, including base-calling, read mapping, variant calling, and genome assembly. Our benchmark suite includes different input data sets per kernel (small and large), each with a corresponding regression test to verify the correctness of each execution automatically. Moreover, the porting features the usage of the novel Arm SVE instructions, algorithmic and code optimizations, and the exploitation of Arm-optimized libraries.
We present the optimizations implemented in each kernel and a detailed performance evaluation and comparison of their performance on four different HPC machines (i.e., A64FX, Graviton3, Intel Xeon Skylake Platinum, and AMD EPYC Rome). Overall, the experimental evaluation shows that Graviton3 outperforms other machines on average. Moreover, we observed that the performance of the A64FX is significantly constrained by its small memory hierarchy and latencies. Additionally, as proof of concept, we study the performance of a production-ready tool that exploits two of the ported and optimized genomic kernels.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Proceedings Articles
Toca-Díaz, Yamilka; Tejero, Rubén Gran; Valero, Alejandro
Ensuring the Accuracy of CNN Accelerators Supplied at Ultra-Low Voltage Proceedings Article
In: Proceedings of the 42nd IEEE International Conference on Computer Design (ICCD 2024), pp. 92-95, 2024, ISBN: 979-8-3503-8040-8.
@inproceedings{Toca-Díaz2024c,
  title = {Ensuring the Accuracy of CNN Accelerators Supplied at Ultra-Low Voltage},
  author = {Yamilka Toca-Díaz and Rubén Gran Tejero and Alejandro Valero},
  url = {https://ieeexplore.ieee.org/document/10817950},
  doi = {10.1109/ICCD63220.2024.00024},
  isbn = {979-8-3503-8040-8},
  year = {2024},
  date = {2024-11-18},
  urldate = {2024-11-18},
  booktitle = {Proceedings of the 42nd IEEE International Conference on Computer Design (ICCD 2024)},
  pages = {92--95},
  abstract = {Underscaling the supply voltage (Vdd) to ultra-low levels below the safe-operation threshold voltage (Vmin) brings significant energy savings in digital CMOS circuits but introduces reliability challenges due to increased risk of bitcell permanent faults. This work explores the impact of such faults on the accuracy of a CNN inference accelerator supplying on-chip activation memories at ultra-low Vdd. By examining fault patterns, activation values, and memory usage, this paper proposes two microarchitectural techniques exploiting activation outliers and activation memory underutilization. These approaches are cost-effective, do not require programmer intervention, and are application-independent. Experimental results show that the proposed approaches maintain the original CNN accuracy and achieve energy savings by 2.1 % and 8.2 % compared to the state-of-the-art technique and a conventional accelerator supplied at Vmin, respectively, with a negligible impact on the system performance (less than 0.25 %).},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
2023
Journal Articles
Mikkelsen, Carl Christian Kjelgaard; López‐Villellas, Lorién; García‐Risueño, Pablo
Newton’s method revisited: How accurate do we have to be? Journal Article
In: Concurrency and Computation: Practice and Experience, vol. 36, no. 10, 2023, ISSN: 1532-0634.
@article{KjelgaardMikkelsen2023,
  title = {Newton’s method revisited: How accurate do we have to be?},
  author = {Carl Christian Kjelgaard Mikkelsen and Lorién López-Villellas and Pablo García-Risueño},
  url = {https://doi.org/10.1002/cpe.7853},
  doi = {10.1002/cpe.7853},
  issn = {1532-0634},
  year = {2023},
  date = {2023-07-01},
  journal = {Concurrency and Computation: Practice and Experience},
  volume = {36},
  number = {10},
  publisher = {Wiley},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Navarro-Torres, Agustín; Alastruey-Benedé, Jesús; Ibáñez, Pablo; Viñals-Yúfera, Víctor
BALANCER: bandwidth allocation and cache partitioning for multicore processors Journal Article
In: The Journal of Supercomputing, pp. 1–25, 2023.
@article{navarro2023balancer,
  author     = {Agustín Navarro-Torres and Jesús Alastruey-Benedé and Pablo Ibáñez and Víctor Viñals-Yúfera},
  title      = {BALANCER: bandwidth allocation and cache partitioning for multicore processors},
  journal    = {The Journal of Supercomputing},
  pages      = {1--25},
  publisher  = {Springer},
  year       = {2023},
  date       = {2023-01-01},
  urldate    = {2023-01-01},
  doi        = {10.1007/s11227-023-05070-0},
  url        = {https://doi.org/10.1007/s11227-023-05070-0},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
López-Villellas, Lorién; Mikkelsen, Carl Christian Kjelgaard; Galano-Frutos, Juan José; Marco-Sola, Santiago; Alastruey-Benedé, Jesús; Ibáñez, Pablo; Moretó, Miquel; Sancho, Javier; García-Risueño, Pablo
Accurate and efficient constrained molecular dynamics of polymers using Newton’s method and special purpose code Journal Article
In: Computer Physics Communications, vol. 288, pp. 108742, 2023, ISSN: 0010-4655.
@article{LOPEZVILLELLAS2023108742,
  title = {Accurate and efficient constrained molecular dynamics of polymers using Newton's method and special purpose code},
  author = {Lorién López-Villellas and Carl Christian Kjelgaard Mikkelsen and Juan José Galano-Frutos and Santiago Marco-Sola and Jesús Alastruey-Benedé and Pablo Ibáñez and Miquel Moretó and Javier Sancho and Pablo García-Risueño},
  url = {https://www.sciencedirect.com/science/article/pii/S0010465523000875},
  doi = {10.1016/j.cpc.2023.108742},
  issn = {0010-4655},
  year = {2023},
  date = {2023-01-01},
  journal = {Computer Physics Communications},
  volume = {288},
  pages = {108742},
  abstract = {In molecular dynamics simulations we can often increase the time step by imposing constraints on bond lengths and bond angles. This allows us to extend the length of the time interval and therefore the range of physical phenomena that we can afford to simulate. We examine the existing algorithms and software for solving nonlinear constraint equations in parallel and we explain why it is necessary to advance the state-of-the-art. We present ILVES-PC, a new algorithm for imposing bond constraints on proteins accurately and efficiently. It solves the same system of differential algebraic equations as the celebrated SHAKE algorithm, but ILVES-PC solves the nonlinear constraint equations using Newton's method rather than the nonlinear Gauss-Seidel method. Moreover, ILVES-PC solves the necessary linear systems using a specialized linear solver that exploits the structure of the protein. ILVES-PC can rapidly solve constraint equations as accurately as the hardware will allow. The run-time of ILVES-PC is proportional to the number of constraints. We have integrated ILVES-PC into GROMACS and simulated proteins of different sizes. Compared with SHAKE, we have achieved speedups of up to 4.9× in single-threaded executions and up to 76× in shared-memory multi-threaded executions. Moreover, ILVES-PC is more accurate than P-LINCS algorithm. Our work is a proof-of-concept of the utility of software designed specifically for the simulation of polymers.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Langarita, Rubén; Armejach, Adrià; Ibáñez, Pablo; Alastruey-Benedé, Jesús; Moretó, Miquel
Porting and Optimizing BWA-MEM2 Using the Fujitsu A64FX Processor Journal Article
In: IEEE/ACM Transactions on Computational Biology and Bioinformatics, vol. 20, no. 5, pp. 3139-3153, 2023.
@article{10093071,
  title = {Porting and Optimizing BWA-MEM2 Using the Fujitsu A64FX Processor},
  author = {Rubén Langarita and Adrià Armejach and Pablo Ibáñez and Jesús Alastruey-Benedé and Miquel Moretó},
  doi = {10.1109/TCBB.2023.3264514},
  year = {2023},
  date = {2023-01-01},
  journal = {IEEE/ACM Transactions on Computational Biology and Bioinformatics},
  volume = {20},
  number = {5},
  pages = {3139--3153},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Proceedings Articles
Toca-Díaz, Yamilka; Muñoz, Nicolás Landeros; Tejero, Rubén Gran; Valero, Alejandro
On Fault-Tolerant Microarchitectural Techniques for Voltage Underscaling in On-Chip Memories of CNN Accelerators Proceedings Article
In: Proceedings of the 26th Euromicro Conference on Digital System Design (DSD 2023), pp. 138-145, 2023, ISBN: 979-8-3503-4419-6.
@inproceedings{Toca-Díaz2023,
  title = {On Fault-Tolerant Microarchitectural Techniques for Voltage Underscaling in On-Chip Memories of CNN Accelerators},
  author = {Yamilka Toca-Díaz and Nicolás Landeros Muñoz and Rubén Gran Tejero and Alejandro Valero},
  url = {https://ieeexplore.ieee.org/document/10456839},
  doi = {10.1109/DSD60849.2023.00029},
  isbn = {979-8-3503-4419-6},
  year = {2023},
  date = {2023-09-06},
  urldate = {2023-09-06},
  booktitle = {Proceedings of the 26th Euromicro Conference on Digital System Design (DSD 2023)},
  pages = {138--145},
  abstract = {Aggressively underscaling the supply voltage (Vdd) below the safe voltage (Vmin) margin is an effective solution to attain substantial energy savings. Unfortunately, operating at such low voltages is challenging due to the high number of permanent faults as a result of variations in the manufacturing process of current technology nodes. This work characterizes the impact of permanent faults on the accuracy of a Convolutional Neural Network (CNN) inference accelerator with on-chip activation memories supplied at low Vdd below Vmin. Based on these observations, this paper proposes a couple of low-cost microarchitectural techniques, referred to as flipping and patching, that ensure the accuracy of CNN applications despite the presence of permanent faults. Contrary to prior work, the proposed techniques are transparent to the programmer and do not depend on application characteristics. Experimental results show that the proposed techniques maintain the original CNN accuracy with a minimal impact on system performance (less than 0.05%), while reducing the energy consumption of activation memories by 11.2% and 46.7% compared to those of a conventional accelerator operating at safe and nominal supply voltages, respectively.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
López-Villellas, Lorién; Pineda-Sánchez, Esteve; Badouh, Asaf; Marco-Sola, Santiago; Ibáñez, Pablo; Alastruey-Benedé, Jesús; Moretó, Miquel
RISC-V for Genome Data Analysis: Opportunities and Challenges Proceedings Article
In: 2023 38th Conference on Design of Circuits and Integrated Systems (DCIS), pp. 1-6, 2023.
@inproceedings{10335997,
  title = {RISC-V for Genome Data Analysis: Opportunities and Challenges},
  author = {Lorién López-Villellas and Esteve Pineda-Sánchez and Asaf Badouh and Santiago Marco-Sola and Pablo Ibáñez and Jesús Alastruey-Benedé and Miquel Moretó},
  doi = {10.1109/DCIS58620.2023.10335997},
  year = {2023},
  date = {2023-01-01},
  booktitle = {2023 38th Conference on Design of Circuits and Integrated Systems (DCIS)},
  pages = {1--6},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Mikkelsen, Carl Christian Kjelgaard; López-Villellas, Lorién; García-Risueño, Pablo
How Accurate Does Newton Have to Be? Proceedings Article
In: Wyrzykowski, Roman; Dongarra, Jack; Deelman, Ewa; Karczewski, Konrad (Ed.): Parallel Processing and Applied Mathematics, pp. 3–15, Springer International Publishing, Cham, 2023, ISBN: 978-3-031-30442-2.
@inproceedings{10.1007/978-3-031-30442-2_1,
  title = {How Accurate Does Newton Have to Be?},
  author = {Carl Christian Kjelgaard Mikkelsen and Lorién López-Villellas and Pablo García-Risueño},
  editor = {Roman Wyrzykowski and Jack Dongarra and Ewa Deelman and Konrad Karczewski},
  isbn = {978-3-031-30442-2},
  year = {2023},
  date = {2023-01-01},
  booktitle = {Parallel Processing and Applied Mathematics},
  pages = {3--15},
  publisher = {Springer International Publishing},
  address = {Cham},
  abstract = {We analyze the convergence of quasi-Newton methods in exact and finite precision arithmetic. In particular, we derive an upper bound for the stagnation level and we show that any sufficiently exact quasi-Newton method will converge quadratically until stagnation. In the absence of sufficient accuracy, we are likely to retain rapid linear convergence. We confirm our analysis by computing square roots and solving bond constraint equations in the context of molecular dynamics. We briefly discuss implications for parallel solvers.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
2022
Journal Articles
Muñoz, Nicolás Landeros; Valero, Alejandro; Tejero, Rubén Gran; Zoni, Davide
Gated-CNN: Combating NBTI and HCI aging effects in on-chip activation memories of Convolutional Neural Network accelerators Journal Article
In: Journal of Systems Architecture, vol. 128, pp. 1-13, 2022, ISSN: 1383-7621.
@article{Muñoz2022,
  title = {Gated-CNN: Combating NBTI and HCI aging effects in on-chip activation memories of Convolutional Neural Network accelerators},
  author = {Nicolás Landeros Muñoz and Alejandro Valero and Rubén Gran Tejero and Davide Zoni},
  url = {https://www.sciencedirect.com/science/article/pii/S1383762122001072},
  doi = {10.1016/j.sysarc.2022.102553},
  issn = {1383-7621},
  year = {2022},
  date = {2022-07-01},
  urldate = {2022-07-01},
  journal = {Journal of Systems Architecture},
  volume = {128},
  pages = {1--13},
  abstract = {Negative Bias Temperature Instability (NBTI) and Hot Carrier Injection (HCI) are two of the main reliability threats in current technology nodes. These aging phenomena degrade the transistor’s threshold voltage (Vth) over the lifetime of a digital circuit, resulting in slower transistors that eventually lead to a faulty operation when the critical paths become longer than the processor cycle time. Among all the transistors on a chip, the most vulnerable transistors to such wearout effects are those used to implement SRAM storage, since memory cells are continuously degrading. In particular, NBTI ages PMOS cell transistors when a given logic value is stored for a long period (i.e., a long duty cycle), whereas HCI ages NMOS cell transistors not only when the stored value flips but also when it is accessed. This work focuses on mitigating aging in the on-chip SRAM memories of Convolutional Neural Network (CNN) accelerators storing activations. This paper makes two main contributions. At the software level, we quantify the aging induced by current CNN benchmarks with a characterization study of duty cycle, flip, and access patterns in every activation memory cell. Based on the insights from this study, this work proposes a novel microarchitectural technique, Gated-CNN, that ensures a uniform aging degradation of every memory cell. To do so, Gated-CNN exploits power-gating and address rotation techniques tailored to the memory demands and temporal/spatial localities exhibited by CNN applications, as well as the memory organization and management of CNN accelerators. Experimental results show that, compared to a conventional design, the average Vth degradation savings are at least as much as 49% depending on the type of transistor.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Langarita, Rubén; Armejach, Adrià; Setoain, Javier; Ibáñez-Marín, Pablo; Alastruey-Benedé, Jesús; Moretó, Miquel
Compressed Sparse FM-Index: Fast Sequence Alignment Using Large K-Steps Journal Article
In: IEEE ACM Trans. Comput. Biol. Bioinform., vol. 19, no. 1, pp. 355–368, 2022.
@article{DBLP:journals/tcbb/LangaritaASIAM22,
  title = {Compressed Sparse FM-Index: Fast Sequence Alignment Using Large K-Steps},
  author = {Rubén Langarita and Adrià Armejach and Javier Setoain and Pablo Ibáñez-Marín and Jesús Alastruey-Benedé and Miquel Moretó},
  url = {https://doi.org/10.1109/TCBB.2020.3000253},
  doi = {10.1109/TCBB.2020.3000253},
  year = {2022},
  date = {2022-01-01},
  journal = {IEEE/ACM Transactions on Computational Biology and Bioinformatics},
  volume = {19},
  number = {1},
  pages = {355--368},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Proceedings Articles
Gracia, Darío Suárez; Valero, Alejandro; Tejero, Rubén Gran; Villarroya-Gaudó, María; Viñals, Víctor
peRISCVcope: A Tiny Teaching-Oriented RISC-V Interpreter Proceedings Article
In: Proceedings of the 37th Conference on Design of Circuits and Integrated Systems (DCIS 2022), pp. 1-6, 2022, ISBN: 978-1-6654-5950-1.
@inproceedings{Gracia2022,
  title = {peRISCVcope: A Tiny Teaching-Oriented RISC-V Interpreter},
  author = {Darío Suárez Gracia and Alejandro Valero and Rubén Gran Tejero and María Villarroya-Gaudó and Víctor Viñals},
  url = {https://ieeexplore.ieee.org/document/9970050},
  doi = {10.1109/DCIS55711.2022.9970050},
  isbn = {978-1-6654-5950-1},
  year = {2022},
  date = {2022-11-16},
  urldate = {2022-11-16},
  booktitle = {Proceedings of the 37th Conference on Design of Circuits and Integrated Systems (DCIS 2022)},
  pages = {1--6},
  abstract = {The fast advances of computer systems translate into a growing demand of methodologies and tools to introduce those novelties into classes. Among the plethora of those advances, virtualization has become an essential technology in almost every relevant system stack, from connected cars to hyperscaled cloud servers. However, introducing those technologies into the classroom remains a challenging task because of the huge complexity of their software components that may hinder the learning process of students. peRISCVcope aims to help in this area by proposing a tiny yet powerful interpreter to dig into virtualization technologies, such as the implementation of trap&emulate hypervisors. With less than 2,000 lines of code, and thanks to the conciseness of the RV32I base instruction set of RISC-V, peRISCVcope enables students to make virtualization knowledge their own. This paper presents our experiences developing and testing a virtualization laboratory where students implement parts of an interpreter. After the practical experience, peRISCVcope has been proved as a useful pedagogical tool, and, most importantly, students have positively rated the experience.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Tárrega, Hugo; Valero, Alejandro; Lorente, Vicente; Petit, Salvador; Sahuquillo, Julio
Fast-Track Cache: a huge racetrack memory L1 data cache Proceedings Article
In: Proceedings of the 36th ACM International Conference on Supercomputing (ICS 2022), pp. 1-12, ACM, 2022, ISBN: 978-1-4503-9281-5.
@inproceedings{Tárrega2022,
  title = {Fast-Track Cache: a huge racetrack memory L1 data cache},
  author = {Hugo Tárrega and Alejandro Valero and Vicente Lorente and Salvador Petit and Julio Sahuquillo},
  url = {https://dl.acm.org/doi/10.1145/3524059.3532383},
  doi = {10.1145/3524059.3532383},
  isbn = {978-1-4503-9281-5},
  year = {2022},
  date = {2022-06-28},
  urldate = {2022-06-28},
  booktitle = {Proceedings of the 36th ACM International Conference on Supercomputing (ICS 2022)},
  pages = {1--12},
  publisher = {ACM},
  abstract = {First-level (L1) caches have been traditionally implemented with Static Random-Access Memory (SRAM) technology, since it is the fastest memory technology, and L1 caches call for tight timing constraints in the processor pipeline. However, one of the main downsides of SRAM is its low density, which prevents L1 caches to improve their storage capacity beyond a few tens of KB. On the other hand, the recent Domain Wall Memory (DWM) technology overcomes such a constraint by arranging multiple bits in a magnetic racetrack, and sharing a header to access those bits. Accessing a bit requires a shift operation to align the target bit under the header. Such shifts increase the final access latency, which is the main reason why DWM has been mostly used to implement slow last-level caches. This paper proposes a novel DWM-based L1 cache data array design, namely Fast-Track Cache (FTC), that allows L1 caches with bigger storage capacities while reducing the shift overhead thanks to an enhanced exploitation of spatial and temporal localities. Experimental results show that most FTC accesses do not require shifts. As a consequence, and due to its larger capacity, FTC improves the processor performance on average by 15% over a conventional SRAM memory subsystem and the state-of-the-art TapeCache architecture based on DWM. At the same time, energy savings are improved on average by 34% over the conventional design.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Navarro-Torres, Agustín; Panda, Biswabandan; Alastruey-Benedé, Jesús; Ibáñez, Pablo; Yúfera, Víctor Viñals; Ros, Alberto
Berti: an Accurate Local-Delta Data Prefetcher Proceedings Article
In: 55th IEEE/ACM International Symposium on Microarchitecture, MICRO 2022, Chicago, IL, USA, October 1-5, 2022, pp. 975–991, IEEE, 2022.
@inproceedings{DBLP:conf/micro/Navarro-TorresP22,
  title = {Berti: an Accurate Local-Delta Data Prefetcher},
  author = {Agustín Navarro-Torres and Biswabandan Panda and Jesús Alastruey-Benedé and Pablo Ibáñez and Víctor Viñals-Yúfera and Alberto Ros},
  url = {https://doi.org/10.1109/MICRO56248.2022.00072},
  doi = {10.1109/MICRO56248.2022.00072},
  year = {2022},
  date = {2022-01-01},
  booktitle = {55th IEEE/ACM International Symposium on Microarchitecture, MICRO 2022, Chicago, IL, USA, October 1-5, 2022},
  pages = {975--991},
  publisher = {IEEE},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Escuin, Carlos; Khan, Asif Ali; Ibáñez, Pablo; Monreal, Teresa; Viñals, Víctor; Castrillón, Jerónimo
HyCSim: A rapid design space exploration tool for emerging hybrid last-level caches Proceedings Article
In: DroneSE and RAPIDO ’22: System Engineering for constrained embedded systems, Budapest Hungary, January 17 – 19, 2022, pp. 53–58, ACM, 2022.
@inproceedings{DBLP:conf/hipeac/EscuinKIMVC22,
  author     = {Carlos Escuin and Asif Ali Khan and Pablo Ibáñez and Teresa Monreal and Víctor Viñals and Jerónimo Castrillón},
  title      = {HyCSim: A rapid design space exploration tool for emerging hybrid last-level caches},
  booktitle  = {DroneSE and RAPIDO '22: System Engineering for constrained embedded systems, Budapest Hungary, January 17 - 19, 2022},
  pages      = {53--58},
  publisher  = {ACM},
  year       = {2022},
  date       = {2022-01-01},
  doi        = {10.1145/3522784.3522801},
  url        = {https://doi.org/10.1145/3522784.3522801},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
2021
Journal Articles
Valero, Alejandro; Tejero, Rubén Gran; Gracia, Darío Suárez; Georgescu, Emanuel A.; Ezpeleta, Joaquín; Álvarez, Pedro; Muñoz, Adolfo; Ramos, Luis M.; Ibáñez, Pablo
A learning experience toward the understanding of abstraction-level interactions in parallel applications Journal Article
In: J. Parallel Distributed Comput., vol. 156, pp. 38–52, 2021.
@article{DBLP:journals/jpdc/ValeroTGGEAMRI21,
  title = {A learning experience toward the understanding of abstraction-level interactions in parallel applications},
  author = {Alejandro Valero and Rubén Gran Tejero and Darío Suárez Gracia and Emanuel A. Georgescu and Joaquín Ezpeleta and Pedro Álvarez and Adolfo Muñoz and Luis M. Ramos and Pablo Ibáñez},
  url = {https://doi.org/10.1016/j.jpdc.2021.05.008},
  doi = {10.1016/j.jpdc.2021.05.008},
  year = {2021},
  date = {2021-01-01},
  journal = {Journal of Parallel and Distributed Computing},
  volume = {156},
  pages = {38--52},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Díaz, Javier; Ibáñez, Pablo; Monreal, Teresa; Viñals, Víctor; Llabería, José M.
Near-optimal replacement policies for shared caches in multicore processors Journal Article
In: J. Supercomput., vol. 77, no. 10, pp. 11756–11785, 2021.
@article{DBLP:journals/tjs/DiazIMVL21,
  title = {Near-optimal replacement policies for shared caches in multicore processors},
  author = {Javier Díaz and Pablo Ibáñez and Teresa Monreal and Víctor Viñals and José M. Llabería},
  url = {https://doi.org/10.1007/s11227-021-03736-1},
  doi = {10.1007/s11227-021-03736-1},
  year = {2021},
  date = {2021-01-01},
  journal = {The Journal of Supercomputing},
  volume = {77},
  number = {10},
  pages = {11756--11785},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}