% Conference paper by Boughorbel, Parvez and Hawasly in the ArabicNLP 2024 proceedings.
% The abstract, language and copyright fields are kept for reference only; standard
% BibTeX styles do not print them. The percent sign in the abstract is escaped so that
% toolchains which do emit the abstract (e.g. biblatex/Biber) do not treat it as a comment.
% NOTE(review): address holds a country rather than the publisher's city -- confirm and replace.
% NOTE(review): the editor surname is entered as the compound "Abu Farha" -- confirm against
% the proceedings front matter.
@inproceedings{8bc5f2f0c561476b80b5fe38f2bf9b05,
  author    = {Boughorbel, Sabri and Parvez, Md Rizwan and Hawasly, Majd},
  title     = {Improving Language Models Trained on Translated Data with Continual Pre-Training and Dictionary Learning Analysis},
  booktitle = {{ArabicNLP} 2024 - 2nd {Arabic} Natural Language Processing Conference, Proceedings of the Conference},
  editor    = {Habash, Nizar and Bouamor, Houda and Eskander, Ramy and Tomeh, Nadi and Abu Farha, Ibrahim and Abdelali, Ahmed and Touileb, Samia and Hamed, Injy and Onaizan, Yaser and Alhafni, Bashar and Antoun, Wissam and Khalifa, Salam and Haddad, Hatem and Zitouni, Imed and AlKhamissi, Badr and Almatham, Rawan and Mrini, Khalil},
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {United States},
  year      = {2024},
  month     = aug,
  pages     = {73--88},
  note      = {2nd Arabic Natural Language Processing Conference, ArabicNLP 2024; conference date: 16 August 2024},
  language  = {English},
  copyright = {Publisher Copyright: {\textcopyright}2024 Association for Computational Linguistics.},
  abstract  = {Training LLMs in low resources languages usually utilizes machine translation (MT) data augmentation from English language. However, translation brings a number of challenges: there are large costs attached to translating and curating huge amounts of content with high-end machine translation solutions; the translated content carries over cultural biases; and if the translation is not faithful and accurate, the quality of the data degrades causing issues in the trained model. In this work, we investigate the role of translation and synthetic data in training language models. We translate TinyStories, a dataset of 2.2M short stories for 3-4 year old children, from English to Arabic using the open NLLB-3B MT model. We train a number of story generation models of size 1M-33M parameters using this data. We identify a number of quality and task-specific issues in the resulting models. To rectify these issues, we further pre-train the models with a small dataset of synthesized high-quality stories generated by a capable LLM in Arabic, representing 1\% of the original training data. We show, using GPT-4 as a judge and dictionary learning analysis from mechanistic interpretability, that the suggested approach is a practical means to resolve some of the translation pitfalls. We illustrate the improvement through case studies of linguistic and cultural bias issues.},
}