Wubiao Xu, Xin Huang, Shiman Meng, Weiping Zhang, Luanzheng Guo, Kento Sato. An Efficient Checkpointing System for Large Machine Learning Model Training. In SC24-W: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis, Atlanta, GA, USA, November 17-22, 2024. Pages 896-900, IEEE, 2024. doi:10.1109/SCW63240.2024.00127
@comment{XuHMZGS24: SC24-W workshop paper. researchr, cites and citedby are non-standard fields carried over from the export; standard styles ignore them.}
@inproceedings{XuHMZGS24,
  author    = {Xu, Wubiao and Huang, Xin and Meng, Shiman and Zhang, Weiping and Guo, Luanzheng and Sato, Kento},
  title     = {An Efficient Checkpointing System for Large Machine Learning Model Training},
  booktitle = {{SC24-W}: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis, Atlanta, GA, USA, November 17-22, 2024},
  pages     = {896--900},
  publisher = {IEEE},
  year      = {2024},
  isbn      = {979-8-3503-5554-3},
  doi       = {10.1109/SCW63240.2024.00127},
  url       = {https://doi.org/10.1109/SCW63240.2024.00127},
  researchr = {https://researchr.org/publication/XuHMZGS24},
  cites     = {0},
  citedby   = {0},
}