Mixhead: Breaking the low-rank bottleneck in multi-head attention language models

Zhong Zhang, Nian Shao, Chongming Gao, Rui Miao, Qinli Yang, Junming Shao. Mixhead: Breaking the low-rank bottleneck in multi-head attention language models. Knowl.-Based Syst., 240:108075, 2022. doi:10.1016/j.knosys.2021.108075

@article{0004SGMYS22,
  title = {Mixhead: Breaking the low-rank bottleneck in multi-head attention language models},
  author = {Zhong Zhang and Nian Shao and Chongming Gao and Rui Miao and Qinli Yang and Junming Shao},
  year = {2022},
  doi = {10.1016/j.knosys.2021.108075},
  url = {https://doi.org/10.1016/j.knosys.2021.108075},
  researchr = {https://researchr.org/publication/0004SGMYS22},
  journal = {Knowl.-Based Syst.},
  volume = {240},
  pages = {108075},
}