Thuat Nguyen, Chien Van Nguyen, Viet Dac Lai, Hieu Man, Nghia Trung Ngo, Franck Dernoncourt, Ryan A. Rossi, Thien Huu Nguyen. CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages. In Nicoletta Calzolari, Min-Yen Kan, Véronique Hoste, Alessandro Lenci, Sakriani Sakti, Nianwen Xue, editors, Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation, LREC/COLING 2024, 20–25 May, 2024, Torino, Italy, pages 4226–4237. ELRA and ICCL, 2024. Available at https://aclanthology.org/2024.lrec-main.377
@comment{CulturaX dataset paper in the LREC/COLING 2024 proceedings. The researchr, cites and citedby fields are non-standard researchr.org export metadata; standard styles ignore them.}
@inproceedings{NguyenNLMNDRN24,
  author    = {Nguyen, Thuat and Nguyen, Chien Van and Lai, Viet Dac and Man, Hieu and Ngo, Nghia Trung and Dernoncourt, Franck and Rossi, Ryan A. and Nguyen, Thien Huu},
  title     = {{CulturaX}: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation, {LREC/COLING} 2024, 20--25 May, 2024, {Torino}, {Italy}},
  editor    = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, V{\'e}ronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  publisher = {ELRA and ICCL},
  year      = {2024},
  pages     = {4226--4237},
  isbn      = {978-2-493814-10-4},
  url       = {https://aclanthology.org/2024.lrec-main.377},
  researchr = {https://researchr.org/publication/NguyenNLMNDRN24},
  cites     = {0},
  citedby   = {0},
}