% Darwish, Sajjad and Mubarak (2014): EMNLP 2014 proceedings paper on Arabic dialect identification.
% NOTE(review): `address` below holds a country rather than a publisher city -- confirm the intended value.
@inproceedings{69172c84e6aa47f09bf74a756bf741d2,
  author    = {Darwish, Kareem and Sajjad, Hassan and Mubarak, Hamdy},
  title     = {Verifiably Effective {Arabic} Dialect Identification},
  booktitle = {{EMNLP} 2014 - 2014 Conference on Empirical Methods in Natural Language Processing, Proceedings of the Conference},
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {United States},
  year      = {2014},
  month     = oct,
  pages     = {1465--1468},
  doi       = {10.3115/v1/d14-1154},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2014 Association for Computational Linguistics. 2014 Conference on Empirical Methods in Natural Language Processing, EMNLP 2014. Conference date: 25-10-2014 through 29-10-2014.},
  abstract  = {Several recent papers on Arabic dialect identification have hinted that using a word unigram model is sufficient and effective for the task. However, most previous work was done on a standard fairly homogeneous dataset of dialectal user comments. In this paper, we show that training on the standard dataset does not generalize, because a unigram model may be tuned to topics in the comments and does not capture the distinguishing features of dialects. We show that effective dialect identification requires that we account for the distinguishing lexical, morphological, and phonological phenomena of dialects. We show that accounting for such can improve dialect detection accuracy by nearly 10\% absolute.},
}