Enabling Parallelism Hot Switching for Efficient Training of Large Language Models

Hao Ge, Fangcheng Fu, Haoyang Li, Xuanyu Wang, Sheng Lin, Yujie Wang, Xiaonan Nie, Hailin Zhang, Xupeng Miao, Bin Cui. Enabling Parallelism Hot Switching for Efficient Training of Large Language Models. In Emmett Witchel, Christopher J. Rossbach, Andrea C. Arpaci-Dusseau, Kimberly Keeton, editors, Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles, SOSP 2024, Austin, TX, USA, November 4-6, 2024, pages 178-194. ACM, 2024. doi:10.1145/3694715.3695969

@inproceedings{GeFLWLWN0M024,
  title = {Enabling Parallelism Hot Switching for Efficient Training of Large Language Models},
  author = {Hao Ge and Fangcheng Fu and Haoyang Li and Xuanyu Wang and Sheng Lin and Yujie Wang and Xiaonan Nie and Hailin Zhang and Xupeng Miao and Bin Cui},
  year = {2024},
  doi = {10.1145/3694715.3695969},
  url = {https://doi.org/10.1145/3694715.3695969},
  pages = {178-194},
  booktitle = {Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles, SOSP 2024, Austin, TX, USA, November 4-6, 2024},
  editor = {Emmett Witchel and Christopher J. Rossbach and Andrea C. Arpaci-Dusseau and Kimberly Keeton},
  publisher = {ACM},
  isbn = {979-8-4007-1251-7},
}