Rui Su, Qian Yu, Dong Xu. STVGBert: A Visual-linguistic Transformer based Framework for Spatio-temporal Video Grounding. In 2021 IEEE/CVF International Conference on Computer Vision, ICCV 2021, Montreal, QC, Canada, October 10-17, 2021. Pages 1513-1522, IEEE, 2021. doi:10.1109/ICCV48922.2021.00156
% ICCV 2021 paper on spatio-temporal video grounding (STVGBert).
% The DBLP homonym suffix "0001" has been removed from the third author so it is
% not parsed as a surname; researchr/cites/citedby are exporter-specific fields
% that standard styles ignore.
@inproceedings{SuY021,
  author    = {Su, Rui and Yu, Qian and Xu, Dong},
  title     = {{STVGBert}: A Visual-linguistic Transformer based Framework for Spatio-temporal Video Grounding},
  booktitle = {2021 {IEEE/CVF} International Conference on Computer Vision, {ICCV} 2021, Montreal, QC, Canada, October 10--17, 2021},
  pages     = {1513--1522},
  publisher = {IEEE},
  year      = {2021},
  isbn      = {978-1-6654-2812-5},
  doi       = {10.1109/ICCV48922.2021.00156},
  url       = {https://doi.org/10.1109/ICCV48922.2021.00156},
  researchr = {https://researchr.org/publication/SuY021},
  cites     = {0},
  citedby   = {0},
}