Ji Lin, Hongxu Yin, Wei Ping, Pavlo Molchanov, Mohammad Shoeybi, Song Han. VILA: On Pre-training for Visual Language Models. In IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024, Seattle, WA, USA, June 16-22, 2024. Pages 26679-26689, IEEE, 2024. doi:10.1109/CVPR52733.2024.02520
% Conference paper "VILA: On Pre-training for Visual Language Models" (CVPR 2024).
% Notes for maintainers:
%   - Authors are stored as "Last, First" so name parsing is unambiguous.
%   - Acronyms and proper nouns are brace-protected against style recasing.
%   - researchr / cites / citedby are non-standard bookkeeping fields kept from
%     the original record; standard styles ignore unknown fields.
@inproceedings{LinYP0SH24,
  author    = {Lin, Ji and Yin, Hongxu and Ping, Wei and Molchanov, Pavlo and Shoeybi, Mohammad and Han, Song},
  title     = {{VILA}: On Pre-training for Visual Language Models},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2024, {Seattle}, {WA}, {USA}, June 16--22, 2024},
  pages     = {26679--26689},
  publisher = {IEEE},
  year      = {2024},
  isbn      = {979-8-3503-5300-6},
  doi       = {10.1109/CVPR52733.2024.02520},
  url       = {https://doi.org/10.1109/CVPR52733.2024.02520},
  researchr = {https://researchr.org/publication/LinYP0SH24},
  cites     = {0},
  citedby   = {0},
}