1 """
2 Sam Huston 2007
3
4 This is a simulation of the article:
5 "Evaluation of a language identification system for mono- and multilingual text documents"
6 by Artemenko, O; Mandl, T; Shramko, M; Womser-Hacker, C.
7 presented at: Applied Computing 2006, 21st Annual ACM Symposium on Applied Computing; 23-27 April 2006
8
9 This implementation is intended for monolingual documents only,
10 however it is performed over a much larger range of languages.
11 Additionally three supervised methods of classification are explored:
12 Cosine distance, NaiveBayes, and Spearman-rho
13
14 """
15
16 from nltk_lite.contrib import classify
17 from nltk_lite import detect
18 from nltk_lite.corpora import udhr
19 import string
20
21 -def run(classifier, training_data, gold_data):
29
30
31 fd = detect.feature({"char-bigrams" : lambda t: [string.join(t)[n:n+2] for n in range(len(t)-1)]})
32
33 training_data = udhr.langs(['English-Latin1', 'French_Francais-Latin1', 'Indonesian-Latin1', 'Zapoteco-Latin1'])
34 gold_data = {}
35 for lang in training_data:
36 gold_data[lang] = training_data[lang][:50]
37 training_data[lang] = training_data[lang][100:200]
38
39 print "Cosine classifier: ",
40 run(classify.Cosine(fd), training_data, gold_data)
41
42 print "Naivebayes classifier: ",
43 run(classify.NaiveBayes(fd), training_data, gold_data)
44
45 print "Spearman classifier: ",
46 run(classify.Spearman(fd), training_data, gold_data)
47