Flash-LLM: Enabling Low-Cost and Highly-Efficient Large Generative Model Inference With Unstructured Sparsity

Haojun Xia, Zhen Zheng, Yuchao Li, Donglin Zhuang, Zhongzhu Zhou, Xiafei Qiu, Yong Li, Wei Lin, Shuaiwen Leon Song. Flash-LLM: Enabling Low-Cost and Highly-Efficient Large Generative Model Inference With Unstructured Sparsity. PVLDB, 17(2):211-224, 2023.

@article{XiaZLZZQL0S23,
  title = {Flash-LLM: Enabling Low-Cost and Highly-Efficient Large Generative Model Inference With Unstructured Sparsity},
  author = {Haojun Xia and Zhen Zheng and Yuchao Li and Donglin Zhuang and Zhongzhu Zhou and Xiafei Qiu and Yong Li and Wei Lin and Shuaiwen Leon Song},
  year = {2023},
  url = {https://www.vldb.org/pvldb/vol17/p211-xia.pdf},
  researchr = {https://researchr.org/publication/XiaZLZZQL0S23},
  journal = {PVLDB},
  volume = {17},
  number = {2},
  pages = {211-224},
}