% HeteroCheckpoint paper, DSN 2014 (conference paper; key kept from the original export).
% Review notes:
%  - The exported "series" field duplicated "booktitle" verbatim and was dropped so that
%    styles printing both do not repeat the venue name.
%  - month/day are the values delivered by the export; the note field gives the
%    conference dates as 23-26 June 2014. TODO confirm which date the bibliography should show.
%  - address holds a country rather than a publisher city. TODO confirm the city.
@inproceedings{685c797faf5e4c779d04f5d4d04e2112,
  author    = {Kannan, Sudarsun and Farooqui, Naila and Gavrilovska, Ada and Schwan, Karsten},
  title     = {{HeteroCheckpoint}: Efficient checkpointing for accelerator-based systems},
  booktitle = {Proceedings of the International Conference on Dependable Systems and Networks},
  pages     = {738--743},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2014},
  month     = sep,
  day       = {18},
  doi       = {10.1109/DSN.2014.76},
  language  = {English (US)},
  keywords  = {Checkpoint, GPUs, NVM, Pre-Copy},
  note      = {Publisher Copyright: {\textcopyright} 2014 IEEE; 44th Annual IEEE/IFIP International Conference on Dependable Systems and Networks, DSN 2014; Conference date: 23-06-2014 through 26-06-2014},
  abstract  = {Moving toward exascale, the number of GPUs in HPC machines is bound to increase, and applications will spend increasing amounts of time running on those GPU devices. While GPU usage has already led to substantial speedup for HPC codes, their failure rates due to overheating are at least 10 times higher than those seen for the CPUs now commonly used on HPC machines. This makes it increasingly important for GPUs to have robust checkpoint/restart mechanisms. This paper introduces a unified CPU-GPU checkpoint mechanism, which can efficiently checkpoint the combined GPU-CPU memory state resident on machine nodes. Efficiency is gained in part by addressing the end-to-end data movements required for checkpointing - from GPU to storage - by introducing novel pre-copy and checksum methods. These methods reduce checkpoint data movement cost seen by HPC applications, with initial measurements using different benchmark applications showing up to 60\% reduced checkpoint overhead. Additional exploration of the use of next-generation storage, like NVM, shows further promise of reduced checkpointing overheads.},
}