Zekun Yang, Jiajun He, Tomoki Toda. Multi-Modal Video Summarization Based on Two-Stage Fusion of Audio, Visual, and Recognized Text Information. In Asia Pacific Signal and Information Processing Association Annual Summit and Conference, APSIPA ASC 2024, Macau, December 3-6, 2024. Pages 1-6, IEEE, 2024. doi:10.1109/APSIPAASC63619.2025.10849046
% Conference paper in the APSIPA ASC 2024 proceedings (IEEE).
% Standard fields come first; researchr, cites and citedby are non-standard
% bookkeeping fields that bibliography styles ignore.
@inproceedings{YangHT24,
  author    = {Yang, Zekun and He, Jiajun and Toda, Tomoki},
  title     = {Multi-Modal Video Summarization Based on Two-Stage Fusion of Audio, Visual, and Recognized Text Information},
  booktitle = {Asia Pacific Signal and Information Processing Association Annual Summit and Conference, {APSIPA ASC} 2024, {Macau}, December 3-6, 2024},
  pages     = {1--6},
  publisher = {IEEE},
  year      = {2024},
  isbn      = {979-8-3503-6733-1},
  doi       = {10.1109/APSIPAASC63619.2025.10849046},
  url       = {https://doi.org/10.1109/APSIPAASC63619.2025.10849046},
  researchr = {https://researchr.org/publication/YangHT24},
  cites     = {0},
  citedby   = {0},
}