Package nltk_lite :: Package contrib :: Module langid
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.langid

 1  """ 
 2  Sam Huston 2007 
 3   
 4  This is a simulation of the article: 
 5  "Evaluation of a language identification system for mono- and multilingual text documents" 
 6  by Artemenko, O; Mandl, T; Shramko, M; Womser-Hacker, C. 
 7  presented at: Applied Computing 2006, 21st Annual ACM Symposium on Applied Computing; 23-27 April 2006 
 8   
 9  This implementation is intended for monolingual documents only, 
10  however it is performed over a much larger range of languages. 
11  Additionally three supervised methods of classification are explored: 
12  Cosine distance, NaiveBayes, and Spearman-rho 
13   
14  """ 
15   
16  from nltk_lite.contrib import classify 
17  from nltk_lite import detect 
18  from nltk_lite.corpora import udhr 
19  import string 
20   
def run(classifier, training_data, gold_data):
    """Train ``classifier`` and print how many gold samples it labels correctly.

    classifier    -- object providing ``train(data)`` and ``get_class(sample)``.
    training_data -- handed unchanged to ``classifier.train``.
    gold_data     -- mapping from the expected label to the sample to classify.

    Prints ``<correct> in <total> correct`` and returns nothing.
    """
    classifier.train(training_data)

    # A sample counts as correct when the predicted class equals the key it
    # is stored under in gold_data.
    correct = sum(1 for lang in gold_data
                  if classifier.get_class(gold_data[lang]) == lang)

    # One pre-formatted argument: the Python 2 print statement emits exactly
    # the text the original multi-item print produced, and the line is also
    # valid under the Python 3 print function.
    print("%s in %s correct" % (correct, len(gold_data)))
29 30 # features: character bigrams 31 fd = detect.feature({"char-bigrams" : lambda t: [string.join(t)[n:n+2] for n in range(len(t)-1)]}) 32 33 training_data = udhr.langs(['English-Latin1', 'French_Francais-Latin1', 'Indonesian-Latin1', 'Zapoteco-Latin1']) 34 gold_data = {} 35 for lang in training_data: 36 gold_data[lang] = training_data[lang][:50] 37 training_data[lang] = training_data[lang][100:200] 38 39 print "Cosine classifier: ", 40 run(classify.Cosine(fd), training_data, gold_data) 41 42 print "Naivebayes classifier: ", 43 run(classify.NaiveBayes(fd), training_data, gold_data) 44 45 print "Spearman classifier: ", 46 run(classify.Spearman(fd), training_data, gold_data) 47