% Workshop paper published in the JSSPP 2012 revised selected papers
% (Springer, Lecture Notes in Computer Science).
% NOTE(review): the LNCS volume number is not recorded in this entry; add a
%   `volume` field once it has been confirmed against the publisher record.
% NOTE(review): `address` was exported as "Germany" (a country); it is set to
%   the publisher's usual imprint city below -- confirm against the book.
@inproceedings{f10242f3d6134d619bcdf1db402da9ad,
  author    = {Niu, Shuangcheng and Zhai, Jidong and Ma, Xiaosong and Liu, Mingliang and Zhai, Yan and Chen, Wenguang and Zheng, Weimin},
  title     = {Employing Checkpoint to Improve Job Scheduling in Large-Scale Systems},
  booktitle = {Job Scheduling Strategies for Parallel Processing - 16th International Workshop, {JSSPP} 2012, Revised Selected Papers},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer},
  address   = {Berlin, Heidelberg},
  year      = {2013},
  pages     = {36--55},
  doi       = {10.1007/978-3-642-35867-8_3},
  isbn      = {978-3-642-35866-1},
  language  = {English},
  keywords  = {backfill algorithm, check-point/restart, job scheduling, runtime estimate},
  note      = {16th Workshop on Job Scheduling Strategies for Parallel Processing, {JSSPP} 2012; conference date: 25 May 2012},
  abstract  = {The FCFS-based backfill algorithm is widely used in scheduling high-performance computer systems. The algorithm relies on runtime estimate of jobs which is provided by users. However, statistics show the accuracy of user-provided estimate is poor. Users are very likely to provide a much longer runtime estimate than its real execution time. In this paper, we propose an aggressive backfilling approach with checkpoint based preemption to address the inaccuracy in user-provided runtime estimate. The approach is evaluated with real workload traces. The results show that compared with the FCFS-based backfill algorithm, our scheme improves the job scheduling performance in waiting time, slowdown and mean queue length by up to 40\%. Meanwhile, only 4\% of the jobs need to perform checkpoints.},
}