% Workshop paper (ANLP 2015, held at ACL 2015): pilot study on diacritization
% annotation of a multi-genre Arabic corpus.
% The former `series` field repeated `booktitle` verbatim and was dropped so
% the proceedings title is not printed twice.
% NOTE(review): `address` holds a country name; BibTeX convention is the
% publisher's city -- confirm and narrow if known.
@inproceedings{2d29ea15c7f74396804e89c3aba4cd51,
  author    = {Bouamor, Houda and Zaghouani, Wajdi and Diab, Mona and Obeid, Ossama and Oflazer, Kemal and Ghoneim, Mahmoud and Hawwari, Abdelati},
  title     = {A Pilot Study on {Arabic} Multi-Genre Corpus Diacritization Annotation},
  booktitle = {2nd Workshop on Arabic Natural Language Processing, ANLP 2015 - held at 53rd Annual Meeting of the Association for Computational Linguistics, ACL 2015 - Proceedings},
  editor    = {Habash, Nizar and Vogel, Stephan and Darwish, Kareem},
  pages     = {80--88},
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {United States},
  year      = {2015},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} ACL 2015. All rights reserved.; 2nd Workshop on Arabic Natural Language Processing, ANLP 2015 ; Conference date: 30-07-2015},
  abstract  = {Arabic script writing is typically underspecified for short vowels and other mark up, referred to as diacritics. Apart from the lexical ambiguity found in words, similar to that exhibited in other languages, the lack of diacritics in written Arabic script adds another layer of ambiguity which is an artifact of the orthography. Diacritization of written text has a significant impact on Arabic NLP applications. In this paper, we present a pilot study on building a diacritized multi-genre corpus in Arabic. We annotate a sample of nondiacritized words extracted from five text genres. We explore different annotation strategies: Basic where we present only the bare undiacritized forms to the annotators, Intermediate (Basic forms+their POS tags), and Advanced (automatically diacritized words). We present the impact of the annotation strategy on annotation quality. Moreover, we study different diacritization schemes in the process.},
}