Hao-Wen Dong, Xiaoyu Liu, Jordi Pons, Gautam Bhattacharya, Santiago Pascual, Joan Serrà, Taylor Berg-Kirkpatrick, Julian J. McAuley. CLIPSonic: Text-to-Audio Synthesis with Unlabeled Videos and Pretrained Language-Vision Models. In IEEE Workshop on Applications of Signal Processing to Audio and Acoustics, WASPAA 2023, New Paltz, NY, USA, October 22-25, 2023. pages 1-5, IEEE, 2023. [doi]
% WASPAA 2023 workshop paper (CLIPSonic). The accented surname is written as a
% BibTeX special character so it sorts and renders correctly under classic
% BibTeX as well as Biber. researchr/cites/citedby are non-standard fields
% carried over from the exporting site; standard styles ignore them.
@inproceedings{DongLPBPSBM23,
  author    = {Dong, Hao-Wen and
               Liu, Xiaoyu and
               Pons, Jordi and
               Bhattacharya, Gautam and
               Pascual, Santiago and
               Serr{\`a}, Joan and
               Berg-Kirkpatrick, Taylor and
               McAuley, Julian J.},
  title     = {{CLIPSonic}: Text-to-Audio Synthesis with Unlabeled Videos and Pretrained Language-Vision Models},
  booktitle = {{IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics, {WASPAA} 2023, New Paltz, {NY}, {USA}, October 22--25, 2023},
  pages     = {1--5},
  publisher = {IEEE},
  year      = {2023},
  month     = oct,
  doi       = {10.1109/WASPAA58266.2023.10248160},
  url       = {https://doi.org/10.1109/WASPAA58266.2023.10248160},
  isbn      = {979-8-3503-2372-6},
  researchr = {https://researchr.org/publication/DongLPBPSBM23},
  cites     = {0},
  citedby   = {0},
}