1
2
3
4
5
6
7
8
9 """
Read tokens from the UDHR Corpus.

This corpus contains examples of text in over 300 language/encoding combinations,
taken from the Universal Declaration of Human Rights.
14 """
15
16 import os
17 from nltk_lite import tokenize
18 from nltk_lite.corpora import *
19
20
# Names of the individual UDHR corpus items.  The module docstring describes
# the corpus as "language/encoding combinations", and most entries follow a
# <language>-<encoding> pattern (e.g. 'English-Latin1'); a few, such as
# 'Amahuaca', carry no encoding suffix.
# NOTE(review): 'Hindi-UFT8' and 'Hindi_web-UFT8' look like misspellings of
# "UTF8", but presumably mirror the actual file names in the corpus data --
# confirm against the corpus directory before "correcting" them.
items = ['Abkhaz-Cyrillic+Abkh', 'Abkhaz-UTF8', 'Achehnese-Latin1', 'Achuar-Shiwiar-Latin1', 'Adja-UTF8',
'Afaan_Oromo_Oromiffa-Latin1', 'Afrikaans-Latin1', 'Aguaruna-Latin1', 'Akuapem_Twi-UTF8',
'Albanian_Shqip-Latin1', 'Amahuaca-Latin1', 'Amahuaca', 'Amarakaeri-Latin1',
'Amharic-Afenegus6..60375', 'Amuesha-Yanesha-UTF8', 'Arabela-Latin1', 'Arabic_Alarabia-Arabic',
'Armenian-DallakHelv', 'Asante-UTF8', 'Ashaninca-Latin1', 'Asheninca-Latin1', 'Asturian_Bable-Latin1',
'Aymara-Latin1', 'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
'Azeri_Azerbaijani_Latin-Az.Times.Lat0117', 'Balinese-Latin1', 'Bambara-UTF8',
'Baoule-UTF8', 'Basque_Euskara-Latin1', 'Batonu_Bariba-UTF8', 'Belorus_Belaruski-Cyrillic',
'Belorus_Belaruski-UTF8', 'Bemba-Latin1', 'Bengali-UTF8', 'Beti-UTF8', 'Bhojpuri-Agra',
'Bichelamar-Latin1', 'Bikol_Bicolano-Latin1', 'Bora-Latin1', 'Bosnian_Bosanski-Cyrillic',
'Bosnian_Bosanski-Latin2', 'Bosnian_Bosanski-UTF8', 'Breton-Latin1', 'Bugisnese-Latin1',
'Bulgarian_Balgarski-Cyrillic', 'Bulgarian_Balgarski-UTF8', 'Burmese_Myanmar-UTF8',
'Burmese_Myanmar-WinResearcher', 'Cakchiquel-Latin1', 'Campa_Pajonalino-Latin1',
'Candoshi-Shapra-Latin1', 'Caquinte-Latin1', 'Cashibo-Cacataibo-Latin1', 'Cashinahua-Latin1',
'Catalan_Catala-Latin1', 'Catalan-Latin1', 'Cebuano-Latin1', 'Chamorro-Latin1', 'Chayahuita-Latin1',
'Chechewa_Nyanja-Latin1', 'Chickasaw-Latin1', 'Chinanteco-Ajitlan-Latin1', 'Chinanteco-UTF8',
'Chinese_Mandarin-GB2312', 'Chinese_Mandarin-HZ', 'Chinese_Mandarin-UTF8', 'Chuuk_Trukese-Latin1',
'Cokwe-Latin1', 'Corsican-Latin1', 'Croatian_Hrvatski-Latin2', 'Czech_Cesky-Latin2', 'Czech_Cesky-UTF8',
'Czech-Latin2-err', 'Czech-Latin2', 'Czech-UTF8', 'Dagaare-UTF8', 'Dagbani-UTF8', 'Dangme-UTF8',
'Danish_Dansk-Latin1', 'Dendi-UTF8', 'Ditammari-UTF8', 'Dutch_Nederlands-Latin1', 'Edo-Latin1',
'English-Latin1', 'Esperanto-T61', 'Esperanto-UTF8', 'Estonian_Eesti-Latin1', 'Ewe_Eve-UTF8',
'Fante-UTF8', 'Faroese-Latin1', 'Farsi_Persian-UTF8', 'Farsi_Persian-v2-UTF8', 'Fijian-Latin1',
'Filipino_Tagalog-Latin1', 'Finnish_Suomi-Latin1', 'Fon-UTF8', 'French_Francais-Latin1',
'Frisian-Latin1', 'Friulian_Friulano-Latin1', 'Gagauz_Gagauzi-UTF8', 'Galician_Galego-Latin1',
'Garifuna_Garifuna-Latin1', 'Ga-UTF8', 'German_Deutsch-Latin1', 'Gonja-UTF8', 'Greek_Ellinika-Greek',
'Greek_Ellinika-UTF8', 'Greenlandic_Inuktikut-Latin1', 'Guarani-Latin1', 'Guen_Mina-UTF8',
'Gujarati-UTF8', 'HaitianCreole_Kreyol-Latin1', 'HaitianCreole_Popular-Latin1', 'Hani-Latin1',
'Hausa_Haoussa-Latin1', 'Hawaiian-UTF8', 'Hebrew_Ivrit-Hebrew', 'Hebrew_Ivrit-UTF8', 'Hiligaynon-Latin1',
'Hindi-UFT8', 'Hindi_web-UFT8', 'Hmong_Miao_Northern-East-Guizhou-Latin1',
'Hmong_Miao-Sichuan-Guizhou-Yunnan-Latin1', 'Hmong_Miao-SouthernEast-Guizhou-Latin1',
'Hrvatski_Croatian-Latin2', 'Huasteco-Latin1', 'Huitoto_Murui-Latin1', 'Hungarian_Magyar-Latin1',
'Hungarian_Magyar-Latin2', 'Hungarian_Magyar-Unicode', 'Hungarian_Magyar-UTF8', 'Ibibio_Efik-Latin1',
'Icelandic_Yslenska-Latin1', 'Ido-Latin1', 'Igbo-UTF8', 'Iloko_Ilocano-Latin1', 'Indonesian-Latin1',
'Interlingua-Latin1', 'Inuktikut_Greenlandic-Latin1', 'IrishGaelic_Gaeilge-Latin1', 'Italian_Italiano-Latin1',
'Italian-Latin1', 'Japanese_Nihongo-EUC', 'Japanese_Nihongo-JIS', 'Japanese_Nihongo-SJIS',
'Japanese_Nihongo-UTF8', 'Javanese-Latin1', 'Jola-Fogny_Diola-UTF8', 'Kabye-UTF8', 'Kannada-UTF8',
'Kaonde-Latin1', 'Kapampangan-Latin1', 'Kasem-UTF8', 'Kazakh-Cyrillic', 'Kazakh-UTF8', 'Kiche_Quiche-Latin1',
'Kicongo-Latin1', 'Kimbundu_Mbundu-Latin1', 'Kinyamwezi_Nyamwezi-Latin1', 'Kinyarwanda-Latin1', 'Kituba-Latin1',
'Korean_Hankuko-UTF8', 'Kpelewo-UTF8', 'Krio-UTF8', 'Kurdish-UTF8', 'Lamnso_Lam-nso-UTF8', 'Lao-UTF8',
'Latin_Latina-Latin1', 'Latin_Latina-v2-Latin1', 'Latvian-Latin1', 'Limba-UTF8', 'Lingala-Latin1',
'Lithuanian_Lietuviskai-Baltic', 'Lozi-Latin1', 'Luba-Kasai_Tshiluba-Latin1', 'Luganda_Ganda-Latin1',
'Lunda_Chokwe-lunda-Latin1', 'Luvale-Latin1', 'Luxembourgish_Letzebuergeusch-Latin1', 'Macedonian-UTF8',
'Madurese-Latin1', 'Magahi-Agra', 'Magahi-UTF8', 'Makonde-Latin1', 'Malagasy-Latin1',
'Malay_BahasaMelayu-Latin1', 'Maltese-UTF8', 'Mam-Latin1', 'Maninka-UTF8', 'Maori-Latin1',
'Mapudungun_Mapuzgun-Latin1', 'Mapudungun_Mapuzgun-UTF8', 'Marathi-UTF8', 'Marshallese-Latin1',
'Matses-Latin1', 'Mayan_Yucateco-Latin1', 'Mazahua_Jnatrjo-UTF8', 'Mazateco-Latin1', 'Mende-UTF8',
'Mikmaq_Micmac-Mikmaq-Latin1', 'Minangkabau-Latin1', 'Miskito_Miskito-Latin1', 'Mixteco-Latin1',
'Mongolian_Khalkha-Cyrillic', 'Mongolian_Khalkha-UTF8', 'Moore_More-UTF8', 'Nahuatl-Latin1',
'Navaho_Dine-Navajo-Navaho-font', 'Ndebele-Latin1', 'Nepali-UTF8', 'Ngangela_Nyemba-Latin1',
'NigerianPidginEnglish-Latin1', 'Nomatsiguenga-Latin1', 'NorthernSotho_Pedi-Sepedi-Latin1',
'Norwegian-Latin1', 'Norwegian_Norsk-Bokmal-Latin1', 'Norwegian_Norsk-Nynorsk-Latin1', 'Nyanja_Chechewa-Latin1',
'Nyanja_Chinyanja-Latin1', 'Nzema-UTF8', 'OccitanAuvergnat-Latin1', 'OccitanLanguedocien-Latin1',
'Oromiffa_AfaanOromo-Latin1', 'Osetin_Ossetian-UTF8', 'Oshiwambo_Ndonga-Latin1', 'Otomi_Nahnu-Latin1',
'Paez-Latin1', 'Palauan-Latin1', 'Peuhl-UTF8', 'Picard-Latin1', 'Pipil-Latin1', 'Polish-Latin2',
'Polish_Polski-Latin2', 'Ponapean-Latin1', 'Portuguese_Portugues-Latin1', 'Pulaar-UTF8',
'Punjabi_Panjabi-UTF8', 'Purhepecha-UTF8', 'Qechi_Kekchi-Latin1', 'Quechua-Latin1', 'Quichua-Latin1',
'Rarotongan_MaoriCookIslands-Latin1', 'Rhaeto-Romance_Rumantsch-Latin1', 'Romanian-Latin2',
'Romanian_Romana-Latin2', 'Romani-Latin1', 'Romani-UTF8', 'Rukonzo_Konjo-Latin1', 'Rundi_Kirundi-Latin1',
'Runyankore-rukiga_Nkore-kiga-Latin1', 'Russian-Cyrillic', 'Russian_Russky-Cyrillic', 'Russian_Russky-UTF8',
'Russian-UTF8', 'Sami_Lappish-UTF8', 'Sammarinese-Latin1', 'Samoan-Latin1', 'Sango_Sangho-Latin1',
'Sanskrit-UTF8', 'Saraiki-UTF8', 'Sardinian-Latin1', 'ScottishGaelic_GaidhligAlbanach-Latin1',
'Seereer-UTF8', 'Serbian_Srpski-Cyrillic', 'Serbian_Srpski-Latin2', 'Serbian_Srpski-UTF8',
'Sharanahua-Latin1', 'Shipibo-Conibo-Latin1', 'Shona-Latin1', 'Sinhala-UTF8', 'Siswati-Latin1',
'Slovak-Latin2', 'Slovak_Slovencina-Latin2', 'Slovenian_Slovenscina-Latin2', 'SolomonsPidgin_Pijin-Latin1',
'Somali-Latin1', 'Soninke_Soninkanxaane-UTF8', 'Sorbian-Latin2', 'SouthernSotho_Sotho-Sesotho-Sutu-Sesutu-Latin1',
'Spanish_Espanol-Latin1', 'Spanish-Latin1', 'Sukuma-Latin1', 'Sundanese-Latin1',
'Sussu_Soussou-Sosso-Soso-Susu-UTF8', 'Swaheli-Latin1', 'Swahili_Kiswahili-Latin1', 'Swedish_Svenska-Latin1',
'Tahitian-UTF8', 'Tamil-UTF8', 'Tenek_Huasteco-Latin1', 'Tetum-Latin1', 'Themne_Temne-UTF8',
'Tigrinya_Tigrigna-VG2Main', 'Tiv-Latin1', 'Toba-UTF8', 'Tojol-abal-Latin1', 'TokPisin-Latin1',
'Tonga-Latin1', 'Tongan_Tonga-Latin1', 'Totonaco-Latin1', 'Trukese_Chuuk-Latin1', 'Turkish_Turkce-Turkish',
'Turkish_Turkce-UTF8', 'Tzeltal-Latin1', 'Tzotzil-Latin1', 'Uighur_Uyghur-Latin1', 'Uighur_Uyghur-UTF8',
'Ukrainian-Cyrillic', 'Ukrainian-UTF8', 'Umbundu-Latin1', 'Urarina-Latin1', 'Uzbek-Latin1',
'Vietnamese-ALRN-UTF8', 'Vietnamese-TCVN', 'Vietnamese-UTF8', 'Vietnamese-VIQR', 'Vietnamese-VPS',
'Vlach-Latin1', 'Walloon_Wallon-Latin1', 'Wama-UTF8', 'Waray-Latin1', 'Wayuu-Latin1', 'Welsh_Cymraeg-Latin1',
'WesternSotho_Tswana-Setswana-Latin1', 'Wolof-Latin1', 'Xhosa-Latin1', 'Yagua-Latin1', 'Yao-Latin1',
'Yapese-Latin1', 'Yoruba-UTF8', 'Zapoteco-Latin1', 'Zapoteco-SanLucasQuiavini-Latin1', 'Zhuang-Latin1',
'Zulu-Latin1']
98
# Module-level dictionary, created empty.  Presumably intended to map corpus
# item identifiers to descriptive names -- nothing in the visible code
# populates it; TODO confirm against the other corpus modules.
item_name = dict()
100
101 -def raw(files = 'English-Latin1'):
109
117
119 from nltk_lite.corpora import udhr
120 from itertools import islice
121
122 print "English-Latin1"
123 for word in islice(udhr.raw('English-Latin1'), 27):
124 print word,
125 print
126
127 print "Italian-Latin1"
128 for word in islice(udhr.raw('Italian-Latin1'), 27):
129 print word,
130 print
131
132 print "English-Latin1, Italian-Latin1"
133 data = udhr.langs(files = ('English-Latin1', 'Italian-Latin1'))
134
135 print data["English-Latin1"]
136 print data["Italian-Latin1"]
137
# Script entry point: run the demonstration only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    demo()
140