@inproceedings{KVCacheProfile-IPDPS25,
  author = {Ye, Jie and Cernuda, Jaime and Maurya, Avinash and Sun, Xian-He and Kougkas, Anthony and Nicolae, Bogdan},
  title = {Characterizing the Behavior and Impact of {KV} Caching on Transformer Inferences under Concurrency},
  booktitle = {IPDPS'25: The 39th IEEE International Parallel and Distributed Processing Symposium},
  year = {2025},
  address = {Milan, Italy},
  url = {https://hal.inria.fr/hal-04984000},
  keywords = {LLM inference, KV cache profiling, access pattern characterization}
}
% NOTE(review): the citation key reads "EIARA" while the title acronym is "EAIRA";
% the key is left unchanged so that existing citations keep resolving.
@misc{EIARA25,
  author = {Cappello, Franck and Madireddy, Sandeep and Underwood, Robert and Getty, Neil and Chia, Nicholas Lee-Ping and Ramachandra, Nesar and Nguyen, Josh and Keceli, Murat and Mallick, Tanwi and Li, Zilinghan and Ngom, Marieme and Zhang, Chenhui and Yanguas-Gil, Angel and Antoniuk, Evan and Kailkhura, Bhavya and Tian, Minyang and Du, Yufeng and Ting, Yuan-Sen and Wells, Azton and Nicolae, Bogdan and Maurya, Avinash and Rafique, M. Mustafa and Huerta, Eliu and Li, Bo and Foster, Ian and Stevens, Rick},
  title = {{EAIRA}: Establishing a Methodology for Evaluating {AI} Models as Scientific Research Assistants},
  year = {2025},
  eprint = {2502.20309},
  archiveprefix = {arXiv},
  primaryclass = {cs.AI},
  doi = {10.48550/arXiv.2502.20309},
  url = {https://arxiv.org/abs/2502.20309},
  keywords = {AI, large language models, evaluation methodology, scientific case studies}
}
@article{Wilkins-FHPC25,
  author = {Yildiz, Orcun and Morozov, Dmitriy and Nigmetov, Arnur and Nicolae, Bogdan and Peterka, Tom},
  title = {{Wilkins}: {HPC} in situ workflows made easy},
  journal = {Frontiers in High Performance Computing},
  volume = {2},
  year = {2024},
  issn = {2813-7337},
  doi = {10.3389/fhpcp.2024.1472719},
  url = {https://www.frontiersin.org/journals/high-performance-computing/articles/10.3389/fhpcp.2024.1472719},
  keywords = {HPC, in situ workflows, usability, ensembles, data transport, flow control}
}
@inproceedings{DeepOptStates-Middleware24,
  author = {Maurya, Avinash and Ye, Jie and Rafique, M. Mustafa and Cappello, Franck and Nicolae, Bogdan},
  title = {Deep Optimizer States: Towards Scalable Training of Transformer Models using Interleaved Offloading},
  booktitle = {MIDDLEWARE'24: The 25th International Middleware Conference},
  pages = {404--416},
  numpages = {13},
  year = {2024},
  address = {Hong Kong, China},
  isbn = {9798400706233},
  doi = {10.1145/3652892.3700781},
  url = {https://doi.org/10.1145/3652892.3700781},
  keywords = {scalable training of large language models, hybrid CPU-GPU I/O performance tuning and middleware, data management for hybrid LLM training, scalable optimization methods for ML}
}
@inproceedings{CkptComparison-Middleware24,
  author = {Tan, Nigel and Assogba, Kevin and Ashworth, Walter J. and Bogale, Befikir and Cappello, Franck and Rafique, M. Mustafa and Taufer, Michela and Nicolae, Bogdan},
  title = {Towards Affordable Reproducibility Using Scalable Capture and Comparison of Intermediate Multi-Run Results},
  booktitle = {MIDDLEWARE'24: The 25th International Middleware Conference},
  pages = {392--403},
  numpages = {12},
  year = {2024},
  address = {Hong Kong, China},
  isbn = {9798400706233},
  doi = {10.1145/3652892.3700780},
  url = {https://doi.org/10.1145/3652892.3700780},
  keywords = {results reproducibility, checkpoint analysis, high-performance computing, error-bounded hashing}
}
@inproceedings{Diaspora-Overview-eScience24,
  author = {Nicolae, Bogdan and Wozniak, Justin and Bicer, Tekin and Nguyen, Hai and Patel, Parth and Pan, Haochen and Gueroudji, Amal and Gonthier, Maxime and Hayot-Sasson, Valerie and Huerta, Eliu and Chard, Kyle and Chard, Ryan and Dorier, Matthieu and Rao, Nageswara S. V. and Al-Najjar, Anees and Corsi, Alessandra and Foster, Ian},
  title = {{Diaspora}: Resilience-Enabling Services for Real-Time Distributed Workflows},
  booktitle = {NRDPISI'24: The 1st International Workshop on Near Real-time Data Processing for Interconnected Scientific Instruments (co-located with eScience'24)},
  year = {2024},
  address = {Osaka, Japan},
  doi = {10.1109/e-Science62913.2024.10678669},
  url = {https://hal.inria.fr/hal-04819775},
  keywords = {real-time distributed HPC workflows, resilience, high-availability, data streaming, elasticity, anomaly detection and prediction}
}
@inproceedings{Viper-ICPP24,
  author = {Ye, Jie and Cernuda, Jaime and Rajesh, Neeraj and Bateman, Keith and Yildiz, Orcun and Peterka, Tom and Nigmetov, Arnur and Morozov, Dmitriy and Sun, Xian-He and Kougkas, Anthony and Nicolae, Bogdan},
  title = {{Viper}: A High-Performance {I/O} Framework for Transparently Updating, Storing, and Transferring Deep Neural Network Models},
  booktitle = {ICPP'24: The 53rd International Conference on Parallel Processing},
  year = {2024},
  address = {Gotland, Sweden},
  doi = {10.1145/3673038.3673070},
  url = {https://hal.inria.fr/hal-04664225},
  keywords = {AI Workflows, Coupled Training and Inferences, Adaptive AI Model Checkpointing, Inferences During Partial Training}
}
@article{RehearsalBuff-FGCS24,
  author = {Bouvier, Thomas and Nicolae, Bogdan and Costan, Alexandru and Bicer, Tekin and Foster, Ian and Antoniu, Gabriel},
  title = {Efficient Distributed Continual Learning for Steering Experiments in Real-Time},
  journal = {Future Generation Computer Systems},
  year = {2024},
  issn = {0167-739X},
  doi = {10.1016/j.future.2024.07.016},
  url = {https://hal.inria.fr/hal-04664176},
  keywords = {continual learning, data-parallel training, experience replay, distributed rehearsal buffers, asynchronous data management, scalability, streaming, generative AI}
}
@article{VELOC-FGCS24,
  author = {Gossman, Mikaila J. and Nicolae, Bogdan and Calhoun, Jon C.},
  title = {Scalable {I/O} aggregation for asynchronous multi-level checkpointing},
  journal = {Future Generation Computer Systems},
  volume = {160},
  pages = {420--432},
  year = {2024},
  issn = {0167-739X},
  doi = {10.1016/j.future.2024.06.003},
  url = {https://www.sciencedirect.com/science/article/pii/S0167739X24002929},
  keywords = {checkpoint-restart, asynchronous I/O, distributed I/O aggregation}
}
@inproceedings{DataStatesLLM-HPDC24,
  author = {Maurya, Avinash and Underwood, Robert and Rafique, M. Mustafa and Cappello, Franck and Nicolae, Bogdan},
  title = {{DataStates-LLM}: Lazy Asynchronous Checkpointing for Large Language Models},
  booktitle = {HPDC'24: The 33rd International Symposium on High-Performance Parallel and Distributed Computing},
  year = {2024},
  address = {Pisa, Italy},
  doi = {10.1145/3625549.3658685},
  url = {https://hal.science/hal-04614247},
  keywords = {scalable checkpointing, asynchronous multi-level I/O, machine learning and AI, LLMs and transformers}
}
@inproceedings{EvoStore-HPDC24,
  author = {Underwood, Robert and Madhyastha, Meghana and Burns, Randal and Nicolae, Bogdan},
  title = {{EvoStore}: Towards Scalable Storage of Evolving Learning Models},
  booktitle = {HPDC'24: The 33rd International Symposium on High-Performance Parallel and Distributed Computing},
  year = {2024},
  address = {Pisa, Italy},
  doi = {10.1145/3625549.3658679},
  url = {https://hal.science/hal-04617763},
  keywords = {AI, Model Repository, Network Architecture Search, Regularized Evolution, Distributed, AI for HPC}
}
@inproceedings{CCGrid24-RehearsalBuffers,
  author = {Bouvier, Thomas and Nicolae, Bogdan and Chaugier, Hugo and Costan, Alexandru and Foster, Ian and Antoniu, Gabriel},
  title = {Efficient Data-Parallel Continual Learning with Asynchronous Distributed Rehearsal Buffers},
  booktitle = {CCGrid'24: The 24th IEEE International Symposium on Cluster, Cloud and Internet Computing},
  pages = {1--10},
  year = {2024},
  address = {Philadelphia, USA},
  doi = {10.1109/CCGrid59990.2024.00036},
  url = {https://inria.hal.science/hal-04600107},
  keywords = {continual learning, data-parallel training, experience replay, distributed rehearsal buffers, asynchronous data management, scalability}
}
% NOTE(review): the keywords below are identical to those of AI-FlexScience22 and do not
% mention offloaded optimizers or GPU memory; they look copy-pasted -- confirm against the paper.
@inproceedings{FlexScience24-MemWall,
  author = {Maurya, Avinash and Ye, Jie and Rafique, M. Mustafa and Cappello, Franck and Nicolae, Bogdan},
  title = {Breaking the Memory Wall: A Study of {I/O} Patterns and {GPU} Memory Utilization for Hybrid {CPU-GPU} Offloaded Optimizers},
  booktitle = {FlexScience'24: The 14th IEEE/ACM Workshop on AI and Scientific Computing at Scale using Flexible Computing Infrastructures},
  year = {2024},
  address = {Pisa, Italy},
  doi = {10.1145/3659995.3660038},
  url = {https://arxiv.org/pdf/2406.10728},
  keywords = {deep learning, distributed caching, data pipelines, reuse of training data}
}
@inproceedings{AICache-HIPC23,
  author = {Assogba, Kevin and Rafique, M. Mustafa and Nicolae, Bogdan},
  title = {Optimizing the Training of Co-Located Deep Learning Models Using Cache-Aware Staggering},
  booktitle = {HiPC'23: 30th IEEE International Conference on High Performance Computing, Data, and Analytics},
  year = {2023},
  address = {Goa, India},
  doi = {10.1109/HiPC58850.2023.00042},
  url = {https://hal.inria.fr/hal-04343672},
  keywords = {Deep Learning, Caching and Reuse of Training Data, Co-Located Training, Performance Modeling}
}
@inproceedings{LineageComp-HIPC23,
  author = {Maurya, Avinash and Rafique, M. Mustafa and Cappello, Franck and Nicolae, Bogdan},
  title = {Towards Efficient {I/O} Pipelines using Accumulated Compression},
  booktitle = {HiPC'23: 30th IEEE International Conference on High Performance Computing, Data, and Analytics},
  year = {2023},
  address = {Goa, India},
  doi = {10.1109/HiPC58850.2023.00043},
  url = {https://hal.inria.fr/hal-04343670},
  keywords = {GPU compression and checkpointing, data accumulation, fast compression}
}
@inproceedings{NASPatterns-HIPC23,
  author = {Underwood, Robert and Madhyastha, Meghana and Burns, Randal and Nicolae, Bogdan},
  title = {Understanding Patterns of Deep Learning Model Evolution in Network Architecture Search},
  booktitle = {HiPC'23: 30th IEEE International Conference on High Performance Computing, Data, and Analytics},
  year = {2023},
  address = {Goa, India},
  doi = {10.48550/arXiv.2309.12576},
  url = {https://arxiv.org/pdf/2309.12576.pdf},
  keywords = {Transfer Learning, AI, Network Architecture Search, Regularized Evolution, Characterization Study}
}
@inproceedings{ElasticHorovod-AI4S23,
  author = {Li, Jiali and Bouteiller, Aur{\'e}lien and Bosilca, George and Nicolae, Bogdan},
  title = {Elastic deep learning through resilient collective operations},
  booktitle = {AI4S'23: 4th Workshop on Artificial Intelligence and Machine Learning for Scientific Applications (with SC'23)},
  pages = {44--50},
  year = {2023},
  address = {Denver, USA},
  doi = {10.1145/3624062.3626080},
  url = {https://hal.inria.fr/hal-04343677},
  keywords = {Distributed deep learning, fault tolerance, elastic training, resilient collective communication}
}
@inproceedings{VELOCReprod-SuperCheck23,
  author = {Assogba, Kevin and Van Dam, Huub and Rafique, M. Mustafa and Nicolae, Bogdan},
  title = {Asynchronous Multi-Level Checkpointing: An Enabler of Reproducibility using Checkpoint History Analytics},
  booktitle = {SuperCheck'23: The 4th International Symposium on Checkpointing for Supercomputing (with SC'23)},
  pages = {1748--1756},
  year = {2023},
  address = {Denver, USA},
  doi = {10.1145/3624062.3624256},
  url = {https://hal.inria.fr/hal-04343694},
  keywords = {result reproducibility, checkpoint analysis, high performance computing, asynchronous multi-level checkpointing}
}
@inproceedings{RECUP-REWORDS23,
  author = {Nicolae, Bogdan and Islam, Tanzima and Ross, Robert and Van Dam, Huub and Assogba, Kevin and Shpilker, Polina and Titov, Mikhail and Turilli, Matteo and Wang, Tianle and Kilic, Ozgur and Jha, Shantenu and Pouchard, Line},
  title = {Building the {I} ({Interoperability}) of {FAIR} for performance reproducibility of large-scale composable workflows in {RECUP}},
  booktitle = {REWORDS'23: The 3rd Workshop on Reproducible Workflows, Data Management, and Security (with eScience'23)},
  pages = {1--7},
  year = {2023},
  address = {Limassol, Cyprus},
  doi = {10.1109/e-Science58273.2023.10254808},
  url = {https://hal.inria.fr/hal-04343665},
  keywords = {High performance computing, HPC, performance reproducibility, workflow execution patterns, workflow execution provenance, metadata capture, research software engineering, FAIR4RS, FAIR4HPC, RO-Crate}
}
@inproceedings{IOAgg-ISPDC23,
  author = {Gossman, Mikaila J. and Nicolae, Bogdan and Calhoun, Jon C.},
  title = {Modeling Multi-Threaded Aggregated {I/O} for Asynchronous Checkpointing on {HPC} Systems},
  booktitle = {ISPDC'23: The 22nd IEEE International Conference on Parallel and Distributed Computing},
  pages = {101--105},
  year = {2023},
  address = {Bucharest, Romania},
  doi = {10.1109/ISPDC59212.2023.00021},
  url = {https://hal.inria.fr/hal-04343661},
  keywords = {performance evaluation, checkpointing, parallel file systems, multi-threaded I/O}
}
@inproceedings{GPUDedup-ICPP23,
  author = {Tan, Nigel and Luettgau, Jakob and Marquez, Jack and Teranishi, Keita and Morales, Nicolas and Bhowmick, Sanjukta and Cappello, Franck and Taufer, Michela and Nicolae, Bogdan},
  title = {Scalable Incremental Checkpointing using {GPU}-Accelerated De-Duplication},
  booktitle = {ICPP'23: The 52nd International Conference on Parallel Processing},
  pages = {665--674},
  year = {2023},
  address = {Salt Lake City, USA},
  doi = {10.1145/3605573.3605639},
  url = {https://hal.inria.fr/hal-04173764},
  keywords = {checkpointing, data versioning, incremental storage, de-duplication, GPU parallelization}
}
@inproceedings{MPIGDB-FlexScience23,
  author = {Underwood, Robert and Nicolae, Bogdan},
  title = {{MPIGDB}: A Flexible Debugging Infrastructure for {MPI} Programs},
  booktitle = {FlexScience'23: The 13th Workshop on AI and Scientific Computing at Scale using Flexible Computing Infrastructures (with HPDC'23)},
  pages = {11--18},
  year = {2023},
  address = {Orlando, USA},
  doi = {10.1145/3589013.3596675},
  url = {https://hal.inria.fr/hal-04343674},
  keywords = {MPI, debugging, distributed state}
}
@inproceedings{GPUPrefetch-HPDC23,
  author = {Maurya, Avinash and Rafique, M. Mustafa and Tonellot, Thierry and AlSalem, Hussain and Cappello, Franck and Nicolae, Bogdan},
  title = {{GPU}-Enabled Asynchronous Multi-level Checkpoint Caching and Prefetching},
  booktitle = {HPDC'23: The 32nd International Symposium on High-Performance Parallel and Distributed Computing},
  pages = {73--85},
  year = {2023},
  address = {Orlando, USA},
  doi = {10.1145/3588195.3592987},
  url = {https://hal.inria.fr/hal-04119928},
  keywords = {High-Performance Computing (HPC), Graphics Processing Unit (GPU), asynchronous multi-level checkpointing, hierarchical cache management, prefetching}
}
@inproceedings{DStore-ICS23,
  author = {Madhyastha, Meghana and Underwood, Robert and Burns, Randal and Nicolae, Bogdan},
  title = {{DStore}: A Lightweight Scalable Learning Model Repository with Fine-Grained Tensor-Level Access},
  booktitle = {ICS'23: The 2023 International Conference on Supercomputing},
  pages = {133--143},
  year = {2023},
  address = {Orlando, USA},
  doi = {10.1145/3577193.3593730},
  url = {https://hal.inria.fr/hal-04119926},
  keywords = {DL model repository, fine-grained tensor storage and access, benchmarking}
}
@inproceedings{LowFive-IPDPS23,
  author = {Peterka, Tom and Morozov, Dmitriy and Nigmetov, Arnur and Yildiz, Orcun and Nicolae, Bogdan and Davis, Philip E.},
  title = {{LowFive}: In Situ Data Transport for High-Performance Workflows},
  booktitle = {IPDPS'23: The 37th IEEE International Parallel and Distributed Processing Symposium},
  pages = {985--995},
  year = {2023},
  address = {St. Petersburg, USA},
  doi = {10.1109/IPDPS54959.2023.00102},
  url = {https://hal.inria.fr/hal-04119925},
  keywords = {workflow, data model, data transport, in situ}
}
@inproceedings{RECUP-WORKS22,
  author = {Abhinit, Ishan and Adams, Emily K. and Alam, Khairul and Chase, Brian and Deelman, Ewa and Gorenstein, Lev and Hudson, Stephen and Islam, Tanzima and Larson, Jeffrey and Lentner, Geoffrey and Mandal, Anirban and Navarro, John-Luke and Nicolae, Bogdan and Pouchard, Line and Ross, Robert and Roy, Banani and Rynge, Mats and Serebrenik, Alexander and Vahi, Karan and Wild, Stefan and Xin, Yufeng and {Ferreira da Silva}, Rafael and Filgueira, Rosa},
  title = {Novel Proposals for {FAIR}, Automated, Recommendable, and Robust Workflows},
  booktitle = {WORKS'22: 17th Workshop on Workflows in Support of Large-Scale Science (in conjunction with SC'22)},
  pages = {84--92},
  year = {2022},
  address = {Dallas, USA},
  doi = {10.1109/WORKS56498.2022.00016},
  url = {https://www.osti.gov/biblio/1958760},
  keywords = {reproducibility, scalable data collection, metadata aggregation and indexing}
}
@inproceedings{VELOCGPU-HiPC22,
  author = {Maurya, Avinash and Nicolae, Bogdan and Rafique, M. Mustafa and Elsayed, Amr M. and Tonellot, Thierry and Cappello, Franck},
  title = {Towards Efficient Cache Allocation for High-Frequency Checkpointing},
  booktitle = {HiPC'22: The 29th IEEE International Conference on High Performance Computing, Data, and Analytics},
  pages = {262--271},
  year = {2022},
  address = {Bangalore, India},
  doi = {10.1109/HiPC56025.2022.00043},
  url = {https://hal.inria.fr/hal-03799226/},
  keywords = {GPU checkpointing, multi-level caching, fast initialization}
}
@inproceedings{Kokkos-CLUSTER22,
  author = {Whitlock, Matthew and Morales, Nicolas and Bosilca, George and Bouteiller, Aur{\'e}lien and Nicolae, Bogdan and Teranishi, Keita and Giem, Elisabeth and Sarkar, Vivek},
  title = {Integrating process, control-flow, and data resiliency layers using a hybrid {Fenix/Kokkos} approach},
  booktitle = {CLUSTER'22: The 2022 IEEE International Conference on Cluster Computing},
  pages = {418--428},
  year = {2022},
  address = {Heidelberg, Germany},
  doi = {10.1109/CLUSTER51413.2022.00052},
  url = {https://hal.inria.fr/hal-03772536},
  keywords = {Fault Tolerance, Resilience, Checkpointing, MPI-ULFM, Kokkos, Fenix, HPC}
}
@inproceedings{LobsterAI-ICPP22,
  author = {Liu, Jie and Nicolae, Bogdan and Li, Dong},
  title = {{Lobster}: Load Balance-Aware {I/O} for Distributed {DNN} Training},
  booktitle = {ICPP'22: The 51st International Conference on Parallel Processing},
  pages = {26:1--26:11},
  year = {2022},
  address = {Bordeaux, France},
  doi = {10.1145/3545008.3545090},
  url = {https://hal.inria.fr/hal-03718681},
  keywords = {deep learning, data pipelines, collaborative caching, prefetching, load balancing}
}
@inproceedings{AI-FlexScience22,
  author = {Liu, Jie and Nicolae, Bogdan and Li, Dong and Wozniak, Justin M. and Bicer, Tekin and Liu, Zhengchun and Foster, Ian},
  title = {Large Scale Caching and Streaming of Training Data for Online Deep Learning},
  booktitle = {FlexScience'22: The 12th IEEE/ACM Workshop on AI and Scientific Computing at Scale using Flexible Computing Infrastructures},
  pages = {19--26},
  year = {2022},
  address = {Minneapolis, USA},
  doi = {10.1145/3526058.3535453},
  url = {https://hal.inria.fr/hal-03694669},
  keywords = {deep learning, distributed caching, data pipelines, reuse of training data}
}
@inproceedings{DLFT-CCGrid22,
  author = {Nicolae, Bogdan and Hobson, Tanner and Yildiz, Orcun and Peterka, Tom and Morozov, Dmitriy},
  title = {Towards Low-Overhead Resilience for Data Parallel Deep Learning},
  booktitle = {CCGrid'22: The 22nd IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing},
  pages = {336--345},
  year = {2022},
  address = {Messina, Italy},
  doi = {10.1109/CCGrid54584.2022.00043},
  url = {https://hal.inria.fr/hal-03631882},
  keywords = {deep learning, data-parallel training, failure simulation, performance model, trade-off analysis}
}
@inproceedings{DataStates-IPDPS22,
  author = {Nicolae, Bogdan},
  title = {Scalable Multi-Versioning Ordered Key-Value Stores with Persistent Memory Support},
  booktitle = {IPDPS'22: The 36th IEEE International Parallel and Distributed Processing Symposium},
  pages = {93--103},
  year = {2022},
  address = {Lyon, France},
  doi = {10.1109/IPDPS53621.2022.00018},
  url = {https://hal.inria.fr/hal-03598396},
  keywords = {key-value store, ordered dictionary, versioning control, scalable access under concurrency, persistent memory}
}
@inproceedings{BraidDB21,
  author = {Wozniak, Justin M. and Liu, Zhengchun and Vescovi, Rafael and Chard, Ryan and Nicolae, Bogdan and Foster, Ian T.},
  title = {{Braid-DB}: Toward {AI}-Driven Science with Machine Learning Provenance},
  booktitle = {SMC'21: The 2021 Smoky Mountains Computational Sciences and Engineering Conference},
  pages = {247--261},
  year = {2021},
  address = {Virtual Event},
  doi = {10.1007/978-3-030-96498-6_14},
  url = {https://doi.org/10.1007/978-3-030-96498-6_14},
  keywords = {provenance, machine learning, database}
}
@inproceedings{VELOC-MASCOTS21,
  author = {Maurya, Avinash and Nicolae, Bogdan and Rafique, M. Mustafa and Tonellot, Thierry and Cappello, Franck},
  title = {Towards Efficient {I/O} Scheduling for Collaborative Multi-Level Checkpointing},
  booktitle = {MASCOTS'21: The 29th IEEE International Symposium on the Modeling, Analysis, and Simulation of Computer and Telecommunication Systems},
  pages = {1--8},
  year = {2021},
  address = {Virtual, Portugal},
  doi = {10.1109/MASCOTS53633.2021.9614284},
  url = {https://hal.inria.fr/hal-03344362},
  keywords = {GPU checkpointing, asynchronous I/O, peer-to-peer collaborative caching, multi-level checkpointing}
}
@inproceedings{DataStates-CLUSTER21,
  author = {Liu, Hongyuan and Nicolae, Bogdan and Di, Sheng and Cappello, Franck and Jog, Adwait},
  title = {Accelerating {DNN} Architecture Search at Scale Using Selective Weight Transfer},
  booktitle = {CLUSTER'21: The 2021 IEEE International Conference on Cluster Computing},
  pages = {82--93},
  year = {2021},
  address = {Portland, USA},
  doi = {10.1109/Cluster48925.2021.00051},
  url = {https://hal.inria.fr/hal-03341805},
  keywords = {deep learning, neural architecture search, checkpointing, reuse of intermediate data states}
}
@inproceedings{KerA-CLUSTER21,
  author = {Marcu, Ovidiu and Costan, Alexandru and Nicolae, Bogdan and Antoniu, Gabriel},
  title = {Virtual Log-Structured Storage for High-Performance Streaming},
  booktitle = {CLUSTER'21: The 2021 IEEE International Conference on Cluster Computing},
  pages = {135--145},
  year = {2021},
  address = {Portland, USA},
  doi = {10.1109/Cluster48925.2021.00046},
  url = {https://hal.inria.fr/hal-03300796},
  keywords = {replicated virtual log, stream storage, log structured, durability, consistent stream ordering}
}
@inproceedings{BRAID-SMC21,
  author = {Bicer, Tekin and Yu, Xiaodong and Ching, Daniel J. and Chard, Ryan and Cherukara, Mathew J. and Nicolae, Bogdan and Kettimuthu, Rajkumar},
  title = {High-Performance Ptychographic Reconstruction with Federated Facilities},
  booktitle = {SMC'21: The 2021 Smoky Mountains Computational Sciences and Engineering Conference},
  pages = {173--189},
  year = {2021},
  address = {Kingsport, United States},
  doi = {10.1007/978-3-030-96498-6_10},
  url = {https://arxiv.org/pdf/2111.11330},
  keywords = {ptychography, high-performance computing, synchrotron light source, scientific computing, federation}
}
% NOTE(review): doi and pages below are identical to those of NetPredEuroPar19, a 2019 paper;
% at least one of the two entries is presumably wrong -- verify against the publisher record.
@inproceedings{PortResEuroPar21,
  author = {Morales, Nicolas and Teranishi, Keita and Nicolae, Bogdan and Trott, Christian and Cappello, Franck},
  title = {Towards High Performance Resilience Using Performance Portable Abstractions},
  booktitle = {EuroPar'21: 27th International European Conference on Parallel and Distributed Systems},
  pages = {47--60},
  year = {2021},
  address = {Lisbon, Portugal},
  doi = {10.1007/978-3-030-29400-7_4},
  url = {https://hal.inria.fr/hal-03260432},
  keywords = {Performance Portability, Resilience, Fault Tolerance, Checkpointing, Programming Models}
}
@article{AsyncInterferenceJHPCA-21,
  author = {Tseng, Shu-Mei and Nicolae, Bogdan and Cappello, Franck and Chandramowlishwaran, Aparna},
  title = {Demystifying asynchronous {I/O} Interference in {HPC} applications},
  journal = {The International Journal of High Performance Computing Applications},
  volume = {35},
  number = {4},
  pages = {391--412},
  year = {2021},
  doi = {10.1177/10943420211016511},
  url = {https://doi.org/10.1177/10943420211016511},
  keywords = {asynchronous I/O, interference study}
}
@inproceedings{Dhmem-CCGrid21,
  author = {Hobson, Tanner and Yildiz, Orcun and Nicolae, Bogdan and Huang, Jian and Peterka, Tom},
  title = {Shared-Memory Communication for Containerized Workflows},
  booktitle = {CCGrid'21: The 21st IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing},
  pages = {123--132},
  year = {2021},
  address = {Virtual, Australia},
  doi = {10.1109/CCGrid51090.2021.00022},
  url = {https://hal.inria.fr/hal-03200931},
  keywords = {shared memory, workflow systems, containers}
}
@inproceedings{VELOC-SuperCheck21,
  author = {Nicolae, Bogdan and Moody, Adam and Kosinovsky, Greg and Mohror, Kathryn and Cappello, Franck},
  title = {{VELOC}: {VEry Low Overhead Checkpointing} in the Age of Exascale},
  booktitle = {SuperCheck'21: The First International Symposium on Checkpointing for Supercomputing},
  year = {2021},
  address = {Virtual Event},
  url = {https://arxiv.org/pdf/2103.02131.pdf},
  keywords = {HPC, checkpoint-restart, state preservation, resilience}
}
@inproceedings{MLHPC20-CANDLE,
  author = {Wozniak, Justin and Yoo, Hyunseung and Mohd-Yusof, Jamaludin and Nicolae, Bogdan and Collier, Nicholson and Ozik, Jonathan and Brettin, Thomas and Stevens, Rick},
  title = {High-bypass Learning: Automated Detection of Tumor Cells That Significantly Impact Drug Response},
  booktitle = {MLHPC'20: The 2020 IEEE/ACM Workshop on Machine Learning in High Performance Computing Environments (in conjunction with SC'20)},
  pages = {1--10},
  year = {2020},
  address = {Virtual Event},
  doi = {10.1109/MLHPCAI4S51975.2020.00012},
  url = {https://www.mcs.anl.gov/~wozniak/papers/CANDLE_2020.pdf},
  keywords = {deep learning, sensitivity analysis, outlier detection, ensembles, workflows}
}
@inproceedings{DataStates20,
  author = {Nicolae, Bogdan},
  title = {{DataStates}: Towards Lightweight Data Models for Deep Learning},
  booktitle = {SMC'20: The 2020 Smoky Mountains Computational Sciences and Engineering Conference},
  pages = {117--129},
  year = {2020},
  address = {Nashville, United States},
  doi = {10.1007/978-3-030-63393-6_8},
  url = {https://hal.inria.fr/hal-02941295},
  keywords = {deep learning, state preservation, clone, model reuse}
}
@inproceedings{CoSim20,
  author = {Maurya, Avinash and Nicolae, Bogdan and Guliani, Ishan and Rafique, M. Mustafa},
  title = {{CoSim}: A Simulator for Co-Scheduling of Batch and On-Demand Jobs in {HPC} Datacenters},
  booktitle = {DS-RT'20: The 24th IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
  pages = {167--174},
  year = {2020},
  address = {Prague, Czech Republic},
  doi = {10.1109/DS-RT50469.2020.9213578},
  url = {https://hal.inria.fr/hal-02925237},
  keywords = {high performance computing, job scheduling, checkpointing strategies}
}
@inproceedings{DeepClone20,
  author = {Nicolae, Bogdan and Wozniak, Justin M. and Dorier, Matthieu and Cappello, Franck},
  title = {{DeepClone}: Lightweight State Replication of Deep Learning Models for Data Parallel Training},
  booktitle = {CLUSTER'20: The 2020 IEEE International Conference on Cluster Computing},
  pages = {226--236},
  year = {2020},
  address = {Kobe, Japan},
  doi = {10.1109/CLUSTER49012.2020.00033},
  url = {https://hal.archives-ouvertes.fr/hal-02914545},
  keywords = {deep learning, data-parallel training, layer-wise parallelism, state cloning and replication, large-scale AI}
}
@inproceedings{MLCkpt20,
  author = {Dey, Tonmoy and Sato, Kento and Nicolae, Bogdan and Guo, Jian and Domke, Jens and Yu, Weikuan and Cappello, Franck and Mohror, Kathryn},
  title = {Optimizing Asynchronous Multi-Level Checkpoint/Restart Configurations with Machine Learning},
  booktitle = {HPS'20: The 2020 IEEE International Workshop on High-Performance Storage},
  pages = {1036--1043},
  year = {2020},
  address = {New Orleans, USA},
  doi = {10.1109/IPDPSW50202.2020.00174},
  url = {https://hal.archives-ouvertes.fr/hal-02914478},
  keywords = {high performance computing, checkpoint-restart, machine learning optimization}
}
@inproceedings{DeepFreeze20,
  author = {Nicolae, Bogdan and Li, Jiali and Wozniak, Justin and Bosilca, George and Dorier, Matthieu and Cappello, Franck},
  title = {{DeepFreeze}: Towards Scalable Asynchronous Checkpointing of Deep Learning Models},
  booktitle = {CCGrid'20: 20th IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing},
  pages = {172--181},
  year = {2020},
  address = {Melbourne, Australia},
  doi = {10.1109/CCGrid49817.2020.00-76},
  url = {https://hal.inria.fr/hal-02543977},
  keywords = {deep learning, checkpointing, state preservation, multi-level data persistence, fine-grain asynchronous I/O}
}
@inproceedings{VeloCIPDPS19,
  author = {Nicolae, Bogdan and Moody, Adam and Gonsiorowski, Elsa and Mohror, Kathryn and Cappello, Franck},
  title = {{VeloC}: Towards High Performance Adaptive Asynchronous Checkpointing at Large Scale},
  booktitle = {IPDPS'19: The 2019 IEEE International Parallel and Distributed Processing Symposium},
  pages = {911--920},
  year = {2019},
  address = {Rio de Janeiro, Brazil},
  doi = {10.1109/IPDPS.2019.00099},
  url = {https://hal.inria.fr/hal-02184203},
  keywords = {parallel I/O, checkpoint-restart, immutable data, adaptive multilevel asynchronous I/O}
}
@inproceedings{NetPredEuroPar19,
  author = {Tseng, Shu-Mei and Nicolae, Bogdan and Bosilca, George and Jeannot, Emmanuel and Chandramowlishwaran, Aparna and Cappello, Franck},
  title = {Towards Portable Online Prediction of Network Utilization using {MPI}-level Monitoring},
  booktitle = {EuroPar'19: 25th International European Conference on Parallel and Distributed Systems},
  pages = {47--60},
  year = {2019},
  address = {Goettingen, Germany},
  doi = {10.1007/978-3-030-29400-7_4},
  url = {https://hal.inria.fr/hal-02184204},
  keywords = {Work stealing, Prediction of resource utilization, Timeseries forecasting, Network monitoring, Online learning}
}
@inproceedings{SZ-SC19,
  author = {Liang, Xin and Di, Sheng and Li, Sihuan and Tao, Dingwen and Nicolae, Bogdan and Chen, Zizhong and Cappello, Franck},
  title = {Significantly Improving Lossy Compression Quality Based on an Optimized Hybrid Prediction Model},
  booktitle = {SC'19: 32nd International Conference for High Performance Computing, Networking, Storage and Analytics},
  pages = {1--26},
  year = {2019},
  address = {Denver, USA},
  doi = {10.1145/3295500.3356193},
  url = {http://tao.cs.ua.edu/paper/SC19-HybridModel.pdf},
  keywords = {Error-Bounded Lossy Compression, Rate Distortion, Data Dumping/Loading, Compression Performance}
}
@inproceedings{SZ-CLUSTER19,
  author = {Liang, Xin and Di, Sheng and Tao, Dingwen and Li, Sihuan and Nicolae, Bogdan and Chen, Zizhong and Cappello, Franck},
  title = {Improving Performance of Data Dumping with Lossy Compression for Scientific Simulation},
  booktitle = {CLUSTER'19: IEEE International Conference on Cluster Computing},
  pages = {1--11},
  year = {2019},
  address = {Albuquerque, USA},
  doi = {10.1109/CLUSTER.2019.8891037},
  url = {http://tao.cs.ua.edu/paper/CLUSTER19-IOAwareLossy.pdf},
  keywords = {lossy compression, efficient data flush, parallel file systems}
}
@incollection{ElasticStoreIGI19,
  author = {Nicolae, Bogdan and Riteau, Pierre and Zhen, Zhuo and Keahey, Kate},
  title = {Transparent Throughput Elasticity for Modern Cloud Storage: An Adaptive Block-Level Caching Proposal},
  booktitle = {Applying Integration Techniques and Methods in Distributed Systems and Technologies},
  pages = {156--191},
  publisher = {IGI Global},
  year = {2019},
  isbn = {9781522582953},
  doi = {10.4018/978-1-5225-8295-3.ch007},
  keywords = {cloud computing, storage elasticity, adaptive I/O throughput, block-level caching}
}
@article{SparkDIYIEEE19,
  author = {Caino-Lores, Silvina and Carretero, Jesus and Nicolae, Bogdan and Yildiz, Orcun and Peterka, Tom},
  title = {Toward High-Performance Computing and Big Data Analytics Convergence: The Case of {Spark-DIY}},
  journal = {IEEE Access},
  volume = {7},
  pages = {156929--156955},
  year = {2019},
  doi = {10.1109/ACCESS.2019.2949836},
  url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8884083},
  keywords = {big data, HPC, convergence, data model, Spark, DIY}
}
@inproceedings{DLAsyncMLHPC19,
title = {Understanding Scalability and Fine-Grain Parallelism of Synchronous Data Parallel Training},
year = {2019},
author = {Li, Jiali and Nicolae, Bogdan and Wozniak, Justin and Bosilca, George},
booktitle = {MLHPC'19: The 2019 IEEE/ACM Workshop on Machine Learning in High Performance Computing Environments (in conjunction with SC'19)},
pages = {1--8},
address = {Denver, USA},
doi = {10.1109/MLHPC49564.2019.00006},
url = {https://hal.inria.fr/hal-02570148},
keywords = {deep learning, behavior analysis, Tensorflow, data-parallel learning, tensor parallelism}
}
@inproceedings{VeloC-SC18,
author = {Nicolae, Bogdan and Cappello, Franck and Moody, Adam and Gonsiorowski, Elsa and Mohror, Kathryn},
title = {VeloC: Very Low Overhead Checkpointing System},
booktitle = {SC '18: 31st International Conference for High Performance Computing, Networking, Storage and Analysis},
year = {2018},
address = {Dallas, USA},
url = {https://sc18.supercomputing.org/proceedings/tech_poster/poster_files/post230s2-file3.pdf},
note = {Poster Session},
keywords = {HPC, resilience, checkpoint-restart}
}
@article{MapRedTPDS18,
title = {Performance Model of MapReduce Iterative Applications for Hybrid Cloud Bursting},
year = {2018},
author = {Clemente-Castello, Francisco J. and Nicolae, Bogdan and Mayo, Rafael and Fernandez, Juan Carlos},
journal = {IEEE Transactions on Parallel and Distributed Systems},
volume = {29},
number = {8},
pages = {1794--1807},
doi = {10.1109/TPDS.2018.2802932},
url = {https://hal.archives-ouvertes.fr/hal-01999033/en},
keywords = {cloud computing, hybrid cloud, bursting, MapReduce}
}
@inproceedings{SparkDIY18,
title = {Spark-DIY: A Framework for Interoperable Spark Operations with High Performance Block-Based Data Models},
year = {2018},
author = {Caino-Lores, Silvina and Carretero, Jesus and Nicolae, Bogdan and Yildiz, Orcun and Peterka, Tom},
booktitle = {BDCAT'18: 5th IEEE/ACM International Conference on Big Data Computing Applications and Technologies},
pages = {1--10},
address = {Zurich, Switzerland},
doi = {10.1109/BDCAT.2018.00010},
keywords = {big data, Spark, high performance computing, convergence}
}
@inproceedings{KeraICDCS18,
title = {KerA: Scalable Data Ingestion for Stream Processing},
year = {2018},
author = {Marcu, Ovidiu-Cristian and Costan, Alexandru and Antoniu, Gabriel and Perez-Hernandez, Maria S. and Nicolae, Bogdan and Tudoran, Radu and Bortoli, Stefano},
booktitle = {ICDCS'18: 38th IEEE International Conference on Distributed Computing Systems},
pages = {1480--1485},
address = {Vienna, Austria},
doi = {10.1109/ICDCS.2018.00152},
url = {https://hal.archives-ouvertes.fr/hal-01773799/en},
keywords = {big data, stream computing, data ingestion}
}
@article{ShuffleTPDS16,
title = {Leveraging Adaptive I/O to Optimize Collective Data Shuffling Patterns for Big Data Analytics},
year = {2017},
author = {Nicolae, Bogdan and Costa, Carlos and Misale, Claudia and Katrinis, Kostas and Park, Yonhoo},
journal = {IEEE Transactions on Parallel and Distributed Systems},
volume = {28},
number = {6},
pages = {1663--1674},
doi = {10.1109/TPDS.2016.2627558},
url = {https://hal.archives-ouvertes.fr/hal-01531374v1/en},
keywords = {elastic buffering, big data analytics, data shuffling, memory-efficient I/O, Spark}
}
@inproceedings{HybridCCGrid17,
title = {Evaluation of Data Locality Strategies for Hybrid Cloud Bursting of Iterative MapReduce},
year = {2017},
author = {Clemente-Castello, Francisco J. and Nicolae, Bogdan and Rafique, M. Mustafa and Mayo, Rafael and Fernandez, Juan Carlos},
booktitle = {CCGrid'17: 17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing},
pages = {181--185},
address = {Madrid, Spain},
doi = {10.1109/CCGRID.2017.96},
url = {https://hal.inria.fr/hal-01469991},
keywords = {hybrid cloud, big data analytics, data locality, data management, scheduling, MapReduce, iterative}
}
@inproceedings{KeraBigData17,
title = {Towards a unified storage and ingestion architecture for stream processing},
year = {2017},
author = {Marcu, Ovidiu-Cristian and Costan, Alexandru and Antoniu, Gabriel and Perez-Hernandez, Maria S. and Tudoran, Radu and Bortoli, Stefano and Nicolae, Bogdan},
booktitle = {BigData'17: 2017 IEEE International Conference on Big Data},
pages = {2402--2407},
address = {Boston, USA},
doi = {10.1109/BigData.2017.8258196},
url = {https://hal.inria.fr/hal-01649207/},
keywords = {Big Data, Streaming, Storage, Ingestion, Unified Architecture}
}
@inproceedings{KeraEBMA17,
title = {Exploring Shared State in Key-Value Store for Window-Based Multi-Pattern Streaming Analytics},
year = {2017},
author = {Marcu, Ovidiu-Cristian and Tudoran, Radu and Nicolae, Bogdan and Costan, Alexandru and Antoniu, Gabriel and Perez-Hernandez, Maria S.},
booktitle = {EBDMA'17: 1st Workshop on the Integration of Extreme Scale Computing and Big Data Management and Analytics},
pages = {1044--1052},
address = {Madrid, Spain},
doi = {10.1109/CCGRID.2017.126},
url = {https://hal.inria.fr/hal-01530744},
keywords = {Big Data, sliding-window aggregations, memory deduplication, Apache Flink, streaming analytics}
}
@inproceedings{ShuffleCCGrid16,
title = {Towards Memory-Optimized Data Shuffling Patterns for Big Data Analytics},
year = {2016},
author = {Nicolae, Bogdan and Costa, Carlos and Misale, Claudia and Katrinis, Kostas and Park, Yonhoo},
booktitle = {CCGrid'16: 16th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing},
pages = {409--412},
address = {Cartagena, Colombia},
doi = {10.1109/CCGrid.2016.85},
url = {http://ieeexplore.ieee.org/iel7/7510545/7515592/07515716.pdf},
note = {Short Paper},
keywords = {elastic buffering, Big data analytics, data shuffling, memory-efficient I/O, Spark}
}
@inproceedings{HybridMapRed16,
title = {On Exploiting Data Locality for Iterative MapReduce Applications in Hybrid Clouds},
year = {2016},
author = {Clemente-Castello, Francisco J. and Nicolae, Bogdan and Mayo, Rafael and Fernandez, Juan Carlos and Rafique, Mustafa},
booktitle = {BDCAT'16: 3rd IEEE/ACM International Conference on Big Data Computing, Applications and Technologies},
pages = {118--122},
address = {Shanghai, China},
doi = {10.1145/3006299.3006329},
url = {https://hal.archives-ouvertes.fr/hal-01476052v1/en},
keywords = {hybrid cloud, bursting, big data analytics, iterative, MapReduce, data locality, data management, scheduling}
}
@article{ACEStoreJPDC16,
title = {Towards Scalable On-Demand Collective Data Access in IaaS Clouds: An Adaptive Collaborative Content Exchange Proposal},
year = {2016},
author = {Nicolae, Bogdan and Kochut, Andrzej and Karve, Alexei},
journal = {Journal of Parallel and Distributed Computing},
volume = {87},
pages = {67--79},
doi = {10.1016/j.jpdc.2015.09.006},
url = {https://hal.inria.fr/hal-01355213/en},
keywords = {IaaS, scalable content dissemination, collective I/O, on-demand data access, high throughput, collaborative I/O, adaptive prefetching}
}
@inproceedings{DBLP:conf/cost/TudoranNB16,
title = {Data Multiverse: The Uncertainty Challenge of Future Big Data Analytics},
year = {2016},
author = {Tudoran, Radu and Nicolae, Bogdan and Brasche, Gotz},
booktitle = {IKC'16: 2nd International KEYSTONE Conference},
pages = {17--22},
address = {Cluj-Napoca, Romania},
doi = {10.1007/978-3-319-53640-8_2},
url = {https://hal.archives-ouvertes.fr/hal-01480509v1/en},
keywords = {big data analytics, large scale data processing, data access model, data uncertainty, approximate computing}
}
@inproceedings{CollCkptHPCS15,
title = {Techniques to improve the scalability of collective checkpointing at large scale},
year = {2015},
author = {Nicolae, Bogdan},
booktitle = {HPCS'15: The 2015 International Conference on High Performance Computing and Simulation},
pages = {660--661},
address = {Amsterdam, The Netherlands},
doi = {10.1109/HPCSim.2015.7237113},
url = {http://ieeexplore.ieee.org/document/7237113/},
keywords = {checkpointing, checkpoint restart, redundancy, scalability, data resilience, high performance computing, adaptive I/O, collective I/O, deduplication}
}
@inproceedings{HybridMapRedBDAC15,
  author    = {Roman, Roxana-Ioana and Nicolae, Bogdan and Costan, Alexandru and Antoniu, Gabriel},
  title     = {Understanding Spark Performance in Hybrid and Multi-Site Clouds},
  booktitle = {BDAC-15: 6th International Workshop on Big Data Analytics: Challenges and Opportunities},
  address   = {Austin, USA},
  year      = {2015},
  url       = {https://hal.inria.fr/hal-01239140/en/},
  keywords  = {big data, Spark, hybrid cloud, network bottleneck}
}
@article{ElasticBWIJDST15,
title = {Towards Transparent Throughput Elasticity for IaaS Cloud Storage: Exploring the Benefits of Adaptive Block-Level Caching},
year = {2015},
author = {Nicolae, Bogdan and Riteau, Pierre and Keahey, Kate},
journal = {International Journal of Distributed Systems and Technologies},
volume = {6},
number = {4},
pages = {21--44},
doi = {10.4018/IJDST.2015100102},
url = {https://hal.inria.fr/hal-01199464/en/},
keywords = {IaaS, cloud computing, storage elasticity, adaptive I/O, virtual disk, block-level caching, performance prediction, cost prediction, modeling}
}
@inproceedings{HybridMapRed15,
title = {Enabling Big Data Analytics in the Hybrid Cloud Using Iterative MapReduce},
year = {2015},
author = {Clemente-Castello, Francisco J. and Nicolae, Bogdan and Katrinis, Kostas and Rafique, M. Mustafa and Mayo, Rafael and Fernandez, Juan Carlos and Loreti, Daniela},
booktitle = {UCC'15: 8th IEEE/ACM International Conference on Utility and Cloud Computing},
pages = {290--299},
address = {Limassol, Cyprus},
doi = {10.1109/UCC.2015.47},
url = {https://hal.inria.fr/hal-01207186/en},
keywords = {hybrid cloud, big data analytics, iterative, MapReduce, data locality, performance prediction}
}
@inproceedings{VMRuntimeIO-IM15,
title = {Towards Efficient On-demand VM Provisioning: Study of VM Runtime I/O Access Patterns to Shared Image Content},
year = {2015},
author = {Kochut, Andrzej and Karve, Alexei and Nicolae, Bogdan},
booktitle = {IM'15: 13th IFIP/IEEE International Symposium on Integrated Network Management},
pages = {321--329},
address = {Ottawa, Canada},
doi = {10.1109/INM.2015.7140307},
url = {https://hal.inria.fr/hal-01138689/en},
keywords = {cloud computing, IaaS, content similarity, deduplication, correlations, I/O access pattern, virtual disk}
}
@inproceedings{DedupRepIPDPS15,
title = {Leveraging naturally distributed data redundancy to reduce collective I/O replication overhead},
year = {2015},
author = {Nicolae, Bogdan},
booktitle = {IPDPS '15: 29th IEEE International Parallel and Distributed Processing Symposium},
pages = {1023--1032},
address = {Hyderabad, India},
doi = {10.1109/IPDPS.2015.82},
url = {https://hal.inria.fr/hal-01115700/en},
keywords = {scalable I/O, checkpoint restart, checkpointing, data replication, deduplication, collective I/O, redundancy, data resilience, high availability}
}
@inproceedings{CollDedupCCGrid15,
title = {Discovering and Leveraging Content Similarity to Optimize Collective On-Demand Data Access to IaaS Cloud Storage},
year = {2015},
author = {Nicolae, Bogdan and Karve, Alexei and Kochut, Andrzej},
booktitle = {CCGrid'15: 15th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing},
pages = {211--220},
address = {Shenzhen, China},
doi = {10.1109/CCGrid.2015.156},
url = {https://hal.inria.fr/hal-01138684/en},
keywords = {collective I/O, content similarity, deduplication, cloud storage, on-demand data access}
}
@inproceedings{ElasticStoreIPDPS14,
title = {Bursting the Cloud Data Bubble: Towards Transparent Storage Elasticity in IaaS Clouds},
year = {2014},
author = {Nicolae, Bogdan and Riteau, Pierre and Keahey, Kate},
booktitle = {IPDPS '14: Proc. 28th IEEE International Parallel and Distributed Processing Symposium},
pages = {135--144},
address = {Phoenix, USA},
doi = {10.1109/IPDPS.2014.25},
url = {https://hal.inria.fr/hal-00947599/en},
keywords = {adaptive I/O, cloud computing, elastic storage, utilization prediction}
}
@inproceedings{ElasticBW-UCC14,
title = {Transparent Throughput Elasticity for IaaS Cloud Storage Using Guest-Side Block-Level Caching},
year = {2014},
author = {Nicolae, Bogdan and Riteau, Pierre and Keahey, Kate},
booktitle = {UCC'14: 7th IEEE/ACM International Conference on Utility and Cloud Computing},
pages = {186--195},
address = {London, UK},
doi = {10.1109/UCC.2014.27},
url = {https://hal.inria.fr/hal-01070227/en},
keywords = {adaptive I/O, block-level caching, cloud computing, elastic storage, virtual disk, utilization prediction}
}
@inproceedings{CHAMP-DIHC14,
title = {Next Generation HPC Clouds: A View for Large-Scale Scientific and Data-Intensive Applications},
year = {2014},
author = {Petcu, Dana and Gonzalez-Velez, Horacio and Nicolae, Bogdan and Garcia-Gomez, Juan Miguel and Fuster-Garcia, Elies and Sheridan, Craig},
booktitle = {DIHC'14: The 2nd Workshop on Dependability and Interoperability in Heterogeneous Clouds},
pages = {26--37},
address = {Porto, Portugal},
doi = {10.1007/978-3-319-14313-2_3},
url = {http://link.springer.com/chapter/10.1007/978-3-319-14313-2_3},
keywords = {cloud storage, data analytics, heterogeneous clouds, high performance computing}
}
@inproceedings{OverlapMapRed14,
title = {To Overlap or Not to Overlap: Optimizing Incremental MapReduce Computations for On-Demand Data Upload},
year = {2014},
author = {Ene, Stefan and Nicolae, Bogdan and Costan, Alexandru and Antoniu, Gabriel},
booktitle = {DataCloud '14: The 5th International Workshop on Data-Intensive Computing in the Clouds},
pages = {9--16},
address = {New Orleans, USA},
doi = {10.1109/DataCloud.2014.7},
url = {https://hal.inria.fr/hal-01094609/en},
keywords = {big data, data management, incremental processing, MapReduce}
}
@inproceedings{CollDedupRep-SC14,
title = {Leveraging Naturally Distributed Data Redundancy to Optimize Collective Replication},
year = {2014},
author = {Nicolae, Bogdan and Lemarinier, Pierre and Meneghin, Massimiliano},
booktitle = {SC '14: 27th International Conference for High Performance Computing, Networking, Storage and Analysis},
address = {New Orleans, USA},
url = {http://sc14.supercomputing.org/sites/all/themes/sc14/files/archive/tech_poster/poster_files/post286s2-file3.pdf},
keywords = {high performance computing, data resilience, high availability, replication, deduplication, collective I/O, redundancy, fault tolerance}
}
@inproceedings{VIOBigDataCloud13,
title = {Understanding Vertical Scalability of I/O Virtualization for MapReduce Workloads: Challenges and Opportunities},
year = {2013},
author = {Nicolae, Bogdan},
booktitle = {BigDataCloud '13: 2nd Workshop on Big Data Management in Clouds (held in conjunction with EuroPar'13)},
address = {Aachen, Germany},
doi = {10.1007/978-3-642-54420-0_1},
url = {https://hal.inria.fr/hal-00856877/en},
keywords = {I/O virtualization, big data, vertical I/O scalability, IaaS, cloud computing}
}
@article{MapRedIJCC13,
title = {Scalable Data Management for Map-Reduce-based Data-Intensive Applications: A View for Cloud and Hybrid Infrastructures},
year = {2013},
author = {Antoniu, Gabriel and Bigot, Julien and Blanchet, Christophe and Bouge, Luc and Briant, Francois and Cappello, Franck and Costan, Alexandru and Desprez, Frederic and Fedak, Gilles and Gault, Sylvain and Keahey, Kate and Nicolae, Bogdan and Perez, Christian and Simonet, Anthony and Suter, Frederic and Tang, Bing and Terreux, Raphael},
journal = {International Journal of Cloud Computing},
volume = {2},
pages = {150--170},
doi = {10.1504/IJCC.2013.055265},
url = {https://hal.inria.fr/hal-00684866/en},
keywords = {MapReduce, cloud computing, desktop grids, hybrid infrastructures, bioinformatics, task scheduling, fault tolerance, scalable data management, data-intensive, scalable storage, massive data, concurrency control, volatility}
}
@inproceedings{AICkptIPDPS13,
title = {Towards Scalable Checkpoint Restart: A Collective Inline Memory Contents Deduplication Proposal},
year = {2013},
author = {Nicolae, Bogdan},
booktitle = {IPDPS '13: The 27th IEEE International Parallel and Distributed Processing Symposium},
pages = {19--28},
address = {Boston, USA},
doi = {10.1109/IPDPS.2013.14},
url = {https://hal.inria.fr/hal-00781532/en},
keywords = {I/O load balancing, checkpoint restart, deduplication, fault tolerance, high performance computing, checkpointing}
}
@article{BlobCRJPDC13,
title = {BlobCR: Virtual disk based checkpoint-restart for HPC applications on IaaS clouds},
year = {2013},
author = {Nicolae, Bogdan and Cappello, Franck},
journal = {Journal of Parallel and Distributed Computing},
volume = {73},
number = {5},
pages = {698--711},
doi = {10.1016/j.jpdc.2013.01.013},
url = {https://hal.inria.fr/hal-00857964/en},
keywords = {checkpoint restart, high performance computing, IaaS, cloud computing, snapshotting, fault tolerance, file system rollback, virtual disk},
issn = {0743-7315}
}
@inproceedings{AICkptHPDC13,
title = {AI-Ckpt: Leveraging Memory Access Patterns for Adaptive Asynchronous Incremental Checkpointing},
year = {2013},
author = {Nicolae, Bogdan and Cappello, Franck},
booktitle = {HPDC '13: 22nd International ACM Symposium on High-Performance Parallel and Distributed Computing},
pages = {155--166},
address = {New York, USA},
doi = {10.1145/2462902.2462918},
url = {https://hal.archives-ouvertes.fr/hal-00809847/en},
keywords = {scientific computing, high performance computing, cloud computing, fault tolerance, checkpoint restart, checkpointing, adaptive I/O}
}
@inproceedings{AStoreVD-EUROPAR13,
title = {Leveraging Collaborative Content Exchange for On-Demand VM Multi-Deployments in IaaS Clouds},
year = {2013},
author = {Nicolae, Bogdan and Rafique, Mustafa},
booktitle = {Euro-Par '13: 19th International Euro-Par Conference on Parallel Processing},
address = {Aachen, Germany},
doi = {10.1007/978-3-642-40047-6_32},
url = {https://hal.inria.fr/hal-00835432/en},
keywords = {IaaS, cloud computing, multi-deployment, VM provisioning, collaborative content exchange}
}
@inproceedings{ANRMapRed-ICACON12,
title = {Towards Scalable Data Management for Map-Reduce-based Data-Intensive Applications on Cloud and Hybrid Infrastructures},
year = {2012},
author = {Antoniu, Gabriel and Bigot, Julien and Blanchet, Christophe and Bouge, Luc and Briant, Francois and Cappello, Franck and Costan, Alexandru and Desprez, Frederic and Fedak, Gilles and Gault, Sylvain and Keahey, Kate and Nicolae, Bogdan and Perez, Christian and Simonet, Anthony and Suter, Frederic and Tang, Bing and Terreux, Raphael},
booktitle = {ICACON '12: 1st International IBM Cloud Academy Conference},
address = {Research Triangle Park, USA},
url = {http://hal.inria.fr/hal-00684866/en},
keywords = {MapReduce, cloud computing, data-intensive computing, hybrid infrastructures, BlobSeer, BitDew, Nimbus, HLCM, Grid'5000}
}
@inproceedings{RSEnc-EUROPAR12,
title = {Scalable Reed-Solomon-based Reliable Local Storage for HPC Applications on IaaS Clouds},
year = {2012},
author = {Bautista Gomez, Leonardo and Nicolae, Bogdan and Maruyama, Naoya and Cappello, Franck and Matsuoka, Satoshi},
booktitle = {Euro-Par '12: 18th International Euro-Par Conference on Parallel Processing},
address = {Rhodes, Greece},
doi = {10.1007/978-3-642-32820-6_32},
url = {http://hal.inria.fr/hal-00703119/en},
keywords = {Cloud computing, IaaS, storage systems, virtual disk, erasure codes, Reed Solomon}
}
@article{Pyramid-OSR12,
title = {Towards scalable array-oriented active storage: the Pyramid approach},
year = {2012},
author = {Tran, Viet-Trung and Nicolae, Bogdan and Antoniu, Gabriel},
journal = {ACM SIGOPS Operating Systems Review},
volume = {46},
number = {1},
pages = {19--25},
doi = {10.1145/2146382.2146387},
url = {https://hal.inria.fr/hal-00640900/en},
keywords = {large scale data management, multi-dimensional I/O, concurrency control, parallel array processing, versioning},
publisher = {ACM},
address = {New York, NY, USA},
issn = {0163-5980}
}
@inproceedings{LiveMigrHPDC12,
title = {A hybrid local storage transfer scheme for live migration of I/O intensive workloads},
year = {2012},
author = {Nicolae, Bogdan and Cappello, Franck},
booktitle = {HPDC '12: 21st International ACM Symposium on High-Performance Parallel and Distributed Computing},
pages = {85--96},
address = {Delft, The Netherlands},
doi = {10.1145/2287076.2287088},
url = {https://hal.inria.fr/hal-00686654/en/},
keywords = {virtualization, live migration, block migration, local storage transfer, I/O intensive workloads, IaaS, cloud computing, data intensive applications}
}
@article{Compression11,
title = {On the Benefits of Transparent Compression for Cost-Effective Cloud Data Storage},
year = {2011},
author = {Nicolae, Bogdan},
journal = {Transactions on Large-Scale Data- and Knowledge-Centered Systems},
volume = {3},
number = {3},
pages = {167--184},
doi = {10.1007/978-3-642-23074-5},
url = {https://hal.inria.fr/inria-00613583},
keywords = {IaaS, cloud computing, scalable storage, high throughput, compression},
publisher = {Springer Berlin / Heidelberg}
}
@article{BlobSeerJPDC11,
title = {BlobSeer: Next-generation data management for large scale infrastructures},
year = {2011},
author = {Nicolae, Bogdan and Antoniu, Gabriel and Bouge, Luc and Moise, Diana and Carpen-Amarie, Alexandra},
journal = {Journal of Parallel and Distributed Computing},
volume = {71},
number = {2},
pages = {169--184},
doi = {10.1016/j.jpdc.2010.08.004},
url = {https://hal.inria.fr/inria-00511414/en/},
keywords = {scalable storage, data management, high throughput, versioning, decentralized metadata, concurrency control, data model, BlobSeer},
publisher = {Academic Press, Inc.},
address = {Orlando, FL, USA},
issn = {0743-7315}
}
@inproceedings{BlobSeerHPDC11,
title = {Going Back and Forth: Efficient Multideployment and Multisnapshotting on Clouds},
year = {2011},
author = {Nicolae, Bogdan and Bresnahan, John and Keahey, Kate and Antoniu, Gabriel},
booktitle = {HPDC '11: 20th International ACM Symposium on High-Performance Parallel and Distributed Computing},
pages = {147--158},
address = {San Jos{\'e}, USA},
doi = {10.1145/1996130.1996152},
url = {https://hal.inria.fr/inria-00570682/en},
keywords = {Nimbus, Grid'5000, cloud computing, BlobSeer, VM storage, IaaS, multi-snapshotting, multi-deployment, large scale provisioning}
}
@inproceedings{BlobCR-SC11,
title = {BlobCR: Efficient Checkpoint-Restart for HPC Applications on IaaS Clouds using Virtual Disk Image Snapshots},
year = {2011},
author = {Nicolae, Bogdan and Cappello, Franck},
booktitle = {SC '11: 24th International Conference for High Performance Computing, Networking, Storage and Analysis},
pages = {34:1--34:12},
address = {Seattle, USA},
doi = {10.1145/2063384.2063429},
url = {http://hal.inria.fr/inria-00601865/en/},
keywords = {IaaS, cloud computing, large scale multi-deployment, checkpoint restart, fault tolerance, virtual disk snapshots, BlobSeer}
}
@inproceedings{BlobVM-EUROPAR11,
title = {Optimizing multi-deployment on clouds by means of self-adaptive prefetching},
year = {2011},
author = {Nicolae, Bogdan and Cappello, Franck and Antoniu, Gabriel},
booktitle = {Euro-Par '11: 17th International Euro-Par Conference on Parallel Processing},
pages = {503--513},
address = {Bordeaux, France},
doi = {10.1007/978-3-642-23400-2_46},
url = {http://hal.inria.fr/inria-00594406/en/},
keywords = {IaaS, cloud computing, large scale multi-deployment, provisioning, adaptive I/O}
}
@inproceedings{BlobSeerCCGRID11,
title = {Efficient support for MPI-I/O atomicity based on versioning},
year = {2011},
author = {Tran, Viet-Trung and Nicolae, Bogdan and Antoniu, Gabriel and Bouge, Luc},
booktitle = {CCGRID '11: 11th IEEE/ACM International Symposium on Cluster, Cloud, and Grid Computing},
pages = {514--523},
address = {Newport Beach, USA},
doi = {10.1109/CCGrid.2011.60},
url = {http://hal.inria.fr/inria-00565358/en/},
keywords = {large scale, storage, MPI-IO, atomicity, non-contiguous I/O, versioning}
}
@inproceedings{Pyramid-LADIS11,
title = {Pyramid: A large-scale array-oriented active storage system},
year = {2011},
author = {Tran, Viet-Trung and Nicolae, Bogdan and Antoniu, Gabriel and Bouge, Luc},
booktitle = {LADIS '11: Proceedings of the 5th Workshop on Large-Scale Distributed Systems and Middleware},
address = {Newport Beach, USA},
url = {https://hal.inria.fr/inria-00627665/en},
keywords = {large scale data management, multi-dimensional I/O, concurrency control, parallel array processing, versioning}
}
@article{BlobRand10,
title = {Gathering Entropy at Large Scale with HAVEGE and BlobSeer},
year = {2010},
author = {Suciu, Alin and Nicolae, Bogdan and Antoniu, Gabriel and Istvan, Zsolt and Szakats, Istvan},
journal = {Automat. Comput. Appl. Math.},
volume = {19},
pages = {3--11},
url = {https://hal.inria.fr/hal-00803430/en},
keywords = {random number generation, large scale, high throughput, high entropy, BlobSeer, HAVEGE}
}
@inproceedings{BlobSeerCloudCom,
title = {Using Global Behavior Modeling to Improve QoS in Cloud Data Storage Services},
year = {2010},
author = {Montes, Jesus and Nicolae, Bogdan and Antoniu, Gabriel and Sanchez, Alberto and Perez, Maria},
booktitle = {CloudCom '10: Proc. 2nd IEEE International Conference on Cloud Computing Technology and Science},
pages = {304--311},
address = {Indianapolis, USA},
doi = {10.1109/CloudCom.2010.33},
url = {https://hal.inria.fr/inria-00527650v1},
keywords = {QoS, cloud computing, data storage, behavioral modeling, throughput stabilization, GloBeM, BlobSeer, MapReduce}
}
@inproceedings{BlobSeerCompression,
title = {High Throughput Data-Compression for Cloud Storage},
year = {2010},
author = {Nicolae, Bogdan},
booktitle = {Globe '10: Proc. 3rd International Conference on Data Management in Grid and P2P Systems},
pages = {1--12},
address = {Bilbao, Spain},
doi = {10.1007/978-3-642-15108-8_1},
url = {https://hal.inria.fr/inria-00490541},
keywords = {cloud computing, distributed data storage, high throughput, adaptive I/O, data intensive applications}
}
@inproceedings{BlobSeerPhdForum,
title = {BlobSeer: Efficient Data Management for Data-Intensive Applications Distributed at Large-Scale},
year = {2010},
author = {Nicolae, Bogdan},
booktitle = {IPDPS '10: Proc. 24th IEEE International Symposium on Parallel and Distributed Processing: Workshops and Phd Forum},
pages = {1--4},
address = {Atlanta, USA},
doi = {10.1109/IPDPSW.2010.5470802},
url = {https://hal.inria.fr/inria-00457809/en/},
note = {Best Poster Award},
keywords = {data intensive applications, large scale, distributed data storage, high throughput, heavy access concurrency, versioning, efficient concurrency control, data striping, distributed metadata management}
}
@phdthesis{BlobSeerThesis,
  author   = {Nicolae, Bogdan},
  title    = {BlobSeer: Towards Efficient Data Storage Management for Large-Scale, Distributed Systems},
  school   = {University of Rennes 1},
  address  = {Rennes, France},
  month    = nov,
  year     = {2010},
  url      = {https://hal.inria.fr/tel-00552271/en},
  keywords = {large scale data storage, cloud storage, versioning, decentralized metadata management, high throughput, heavy access concurrency}
}
@inproceedings{BlobSeerMapRed,
title = {BlobSeer: Bringing High Throughput under Heavy Concurrency to Hadoop Map/Reduce Applications},
year = {2010},
author = {Nicolae, Bogdan and Moise, Diana and Antoniu, Gabriel and Bouge, Luc and Dorier, Matthieu},
booktitle = {IPDPS '10: Proc. 24th IEEE International Parallel and Distributed Processing Symposium},
pages = {1--12},
address = {Atlanta, USA},
doi = {10.1109/IPDPS.2010.5470433},
url = {https://hal.inria.fr/inria-00456801/en},
keywords = {large-scale distributed computing, data-intensive, MapReduce, distributed file systems, high throughput, heavy access concurrency, Hadoop, BlobSeer}
}
@inproceedings{BlobSeerDAMAP,
title = {BlobSeer: How to Enable Efficient Versioning for Large Object Storage under Heavy Access Concurrency},
year = {2009},
author = {Nicolae, Bogdan and Antoniu, Gabriel and Bouge, Luc},
booktitle = {EDBT/ICDT '09 Workshops},
pages = {18--25},
address = {Saint-Petersburg, Russia},
doi = {10.1145/1698790.1698796},
url = {https://hal.inria.fr/inria-00382354v1},
keywords = {large scale data storage, concurrency control, versioning, decentralized metadata}
}
@inproceedings{BlobSeerEuroPar09,
title = {Enabling High Data Throughput in Desktop Grids Through Decentralized Data and Metadata Management: The BlobSeer Approach},
year = {2009},
author = {Nicolae, Bogdan and Antoniu, Gabriel and Bouge, Luc},
booktitle = {Euro-Par '09: Proc. 15th International Euro-Par Conference on Parallel Processing},
pages = {404--416},
address = {Delft, The Netherlands},
doi = {10.1007/978-3-642-03869-3_40},
url = {https://hal.inria.fr/inria-00410956v2},
keywords = {desktop grids, distributed metadata management, data intensive applications, large data size, heavy access concurrency, high speed writes}
}
@inproceedings{BlobSeerGfarm,
title = {Towards A Grid File System Based On A Large-Scale BLOB Management Service},
year = {2009},
author = {Tran, Viet-Trung and Antoniu, Gabriel and Nicolae, Bogdan and Bouge, Luc},
booktitle = {Grids, P2P and Service Computing},
pages = {7--19},
address = {Delft, The Netherlands},
doi = {10.1007/978-1-4419-6794-7_2},
url = {https://hal.inria.fr/inria-00425232},
keywords = {data intensive applications, large scale, distributed data storage, high throughput, heavy access concurrency, versioning, efficient concurrency control, data striping, distributed metadata management},
internal-note = {The doi and url of this entry previously duplicated the identifiers of the IPDPS 2010 PhD Forum paper; the replacement values should be confirmed against the publisher and HAL records}
}
@inproceedings{BlobSeerCLUSTER,
title = {Enabling lock-free concurrent fine-grain access to massive distributed data: Application to supernovae detection},
year = {2008},
author = {Nicolae, Bogdan and Antoniu, Gabriel and Bouge, Luc},
booktitle = {Cluster '08: Proc. IEEE International Conference on Cluster Computing: Poster Session},
pages = {310--315},
address = {Tsukuba, Japan},
doi = {10.1109/CLUSTR.2008.4663787},
url = {https://hal.inria.fr/inria-00329698},
keywords = {large scale data management, object storage, huge file, versioning, heavy access concurrency}
}
@inproceedings{BlobSeerVECPAR,
title = {Distributed Management of Massive Data: An Efficient Fine-Grain Data Access Scheme},
year = {2008},
author = {Nicolae, Bogdan and Antoniu, Gabriel and Bouge, Luc},
booktitle = {VECPAR '08: Proc. 8th International Meeting on High Performance Computing for Computational Science},
pages = {532--543},
address = {Toulouse, France},
doi = {10.1007/978-3-540-92859-1_47},
url = {https://hal.inria.fr/inria-00323248v1},
keywords = {high performance distributed computing, large scale data sharing, distributed data management, lock-free, fine grain access}
}