Understanding and Improving Failure Tolerant Training for Deep Learning Recommendation with Partial Recovery

Kiwan Maeng, Shivam Bharuka, Isabel Gao, Mark C. Jeffrey, Vikram Saraph, Bor-Yiing Su, Caroline Trippel, Jiyan Yang, Mike Rabbat, Brandon Lucia, Carole-Jean Wu. Understanding and Improving Failure Tolerant Training for Deep Learning Recommendation with Partial Recovery. In Alex Smola, Alex Dimakis, Ion Stoica, editors, Proceedings of Machine Learning and Systems 2021, MLSys 2021, virtual, April 5-9, 2021. mlsys.org, 2021. [doi]

@inproceedings{MaengBGJSSTYRLW21,
  author    = {Maeng, Kiwan and Bharuka, Shivam and Gao, Isabel and Jeffrey, Mark C. and Saraph, Vikram and Su, Bor-Yiing and Trippel, Caroline and Yang, Jiyan and Rabbat, Mike and Lucia, Brandon and Wu, Carole-Jean},
  title     = {Understanding and Improving Failure Tolerant Training for Deep Learning Recommendation with Partial Recovery},
  booktitle = {Proceedings of Machine Learning and Systems 2021, {MLSys} 2021, virtual, April 5-9, 2021},
  editor    = {Smola, Alex and Dimakis, Alex and Stoica, Ion},
  publisher = {mlsys.org},
  year      = {2021},
  url       = {https://proceedings.mlsys.org/paper/2021/hash/b73ce398c39f506af761d2277d853a92-Abstract.html},
  researchr = {https://researchr.org/publication/MaengBGJSSTYRLW21},
  cites     = {0},
  citedby   = {0},
}