Quality at a Glance: An Audit of Web-Crawled Multilingual Datasets

Julia Kreutzer, Isaac Caswell, Lisa Wang, Ahsan Wahab, Daan van Esch, Nasanbayar Ulzii-Orshikh, Allahsera Tapo, Nishant Subramani, Artem Sokolov, Claytone Sikasote, Monang Setyawan, Supheakmungkol Sarin, Sokhar Samb, Benoît Sagot, Clara Rivera, Annette Rios, Isabel Papadimitriou, Salomey Osei, Pedro Javier Ortiz Suárez, Iroro Orife, Kelechi Ogueji, Andre Niyongabo Rubungo, Toan Q. Nguyen, Mathias Müller, André Müller, Shamsuddeen Hassan Muhammad, Nanda Muhammad, Ayanda Mnyakeni, Jamshidbek Mirzakhalov, Tapiwanashe Matangira, Colin Leong, Nze Lawson, Sneha Kudugunta, Yacine Jernite, Mathias Jenny, Orhan Firat, Bonaventure F. P. Dossou, Sakhile Dlamini, Nisansa de Silva, Sakine Çabuk Balli, Stella Biderman, Alessia Battisti, Ahmed Baruwa, Ankur Bapna, Pallavi Baljekar, Israel Abebe Azime, Ayodele Awokoya, Duygu Ataman, Orevaoghene Ahia, Oghenefego Ahia, Sweta Agrawal, Mofetoluwa Adeyemi. Quality at a Glance: An Audit of Web-Crawled Multilingual Datasets. TACL, 10:50–72, 2022. doi:10.1162/tacl_a_00447

@comment{
  Kreutzer et al. 2022, journal article, TACL volume 10.
  Maintenance notes:
  - Authors are stored in "von Last, First" form so that particles (van, de)
    and compound surnames are parsed unambiguously.
  - The numeric homonym suffix "0002" (DBLP-style disambiguation) that
    followed "Mathias Müller" was removed; it is not part of the name and
    would otherwise be parsed as the surname.
  - NOTE(review): "Rubungo, Andre Niyongabo" and "Balli, Sakine Çabuk" keep
    the surname split that the previous "First Last" form produced; confirm
    whether these should be compound surnames.
  - researchr, cites and citedby are non-standard fields carried over from
    the export; standard styles ignore them.
  - Non-ASCII characters are stored as UTF-8, as in the original export.
}
@article{KreutzerCWWEUTS22,
  author    = {Kreutzer, Julia
               and Caswell, Isaac
               and Wang, Lisa
               and Wahab, Ahsan
               and van Esch, Daan
               and Ulzii-Orshikh, Nasanbayar
               and Tapo, Allahsera
               and Subramani, Nishant
               and Sokolov, Artem
               and Sikasote, Claytone
               and Setyawan, Monang
               and Sarin, Supheakmungkol
               and Samb, Sokhar
               and Sagot, Benoît
               and Rivera, Clara
               and Rios, Annette
               and Papadimitriou, Isabel
               and Osei, Salomey
               and Ortiz Suárez, Pedro Javier
               and Orife, Iroro
               and Ogueji, Kelechi
               and Rubungo, Andre Niyongabo
               and Nguyen, Toan Q.
               and Müller, Mathias
               and Müller, André
               and Muhammad, Shamsuddeen Hassan
               and Muhammad, Nanda
               and Mnyakeni, Ayanda
               and Mirzakhalov, Jamshidbek
               and Matangira, Tapiwanashe
               and Leong, Colin
               and Lawson, Nze
               and Kudugunta, Sneha
               and Jernite, Yacine
               and Jenny, Mathias
               and Firat, Orhan
               and Dossou, Bonaventure F. P.
               and Dlamini, Sakhile
               and de Silva, Nisansa
               and Balli, Sakine Çabuk
               and Biderman, Stella
               and Battisti, Alessia
               and Baruwa, Ahmed
               and Bapna, Ankur
               and Baljekar, Pallavi
               and Azime, Israel Abebe
               and Awokoya, Ayodele
               and Ataman, Duygu
               and Ahia, Orevaoghene
               and Ahia, Oghenefego
               and Agrawal, Sweta
               and Adeyemi, Mofetoluwa},
  title     = {Quality at a Glance: An Audit of Web-Crawled Multilingual Datasets},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {10},
  pages     = {50--72},
  year      = {2022},
  doi       = {10.1162/tacl_a_00447},
  url       = {https://doi.org/10.1162/tacl_a_00447},
  researchr = {https://researchr.org/publication/KreutzerCWWEUTS22},
  cites     = {0},
  citedby   = {0},
}