Zihao Wang, Bin Cui, Shaoduo Gan. SqueezeAttention: 2D Management of KV-Cache in LLM Inference via Layer-wise Optimal Budget. In The Thirteenth International Conference on Learning Representations, ICLR 2025, Singapore, April 24-28, 2025. OpenReview.net, 2025. https://openreview.net/forum?id=9HK2rHNAhd
@comment{ICLR 2025 conference paper. Case-sensitive words in the title and the venue acronym are brace-protected so that sentence-casing styles keep their capitalization. The researchr, cites and citedby fields are non-standard and are ignored by standard styles.}
@inproceedings{WangCG25-0,
  author    = {Wang, Zihao and Cui, Bin and Gan, Shaoduo},
  title     = {{SqueezeAttention}: {2D} Management of {KV-Cache} in {LLM} Inference via Layer-wise Optimal Budget},
  booktitle = {The Thirteenth International Conference on Learning Representations, {ICLR} 2025, Singapore, April 24-28, 2025},
  publisher = {OpenReview.net},
  year      = {2025},
  url       = {https://openreview.net/forum?id=9HK2rHNAhd},
  researchr = {https://researchr.org/publication/WangCG25-0},
  cites     = {0},
  citedby   = {0},
}