"""
Classes and interfaces for tagging each token of a document with
supplementary information, such as its part of speech or its WordNet
synset tag. This task, which is known as X{tagging}, is defined by
the L{TagI} interface.
"""
15
import yaml
import string
18
class TagI(yaml.YAMLObject):
    """
    A processing interface for assigning a tag to each token in a list.
    Tags are case sensitive strings that identify some property of each
    token, such as its part of speech or its sense.

    Subclasses must override L{tag()}; this base class only defines the
    interface (and inherits YAML serializability from C{yaml.YAMLObject}).
    """
    def tag(self, tokens):
        """
        Assign a tag to each token in C{tokens}, and yield a tagged token
        of the form (token, tag).

        @param tokens: The sequence of tokens to be tagged.
        @raise NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError()
31
33 """
34 A tagger that tags words sequentially, left to right.
35 """
36 - def tag(self, tokens, verbose=False):
48
52
54 if self._backoff:
55 return self._backoff.tag_one(token, history)
56 else:
57 return None
58
60 """
61 A tagger that assigns the same tag to every token.
62 """
63 yaml_tag = '!tag.Default'
65 """
66 Construct a new default tagger.
67
68 @type tag: C{string}
69 @param tag: The tag that should be assigned to every token.
70 """
71 self._tag = tag
72 self._backoff = None
73 self._history = None
74
75 - def tag_one(self, token, history=None):
77
79 return '<DefaultTagger: tag=%s>' % self._tag
80
81
82
83
84
85
86 from nltk_lite import tokenize
87
89 loc = s.rfind(sep)
90 if loc >= 0:
91 return (s[:loc], s[loc+1:])
92 else:
93 return (s, None)
94
def untag(tagged_sentence):
    """
    Strip the tags from a tagged sentence.

    @param tagged_sentence: A sequence of C{(word, tag)} pairs.
    @return: A generator over the words, with the tags discarded.
    """
    return (w for (w, t) in tagged_sentence)
97
100
103
106
107
108
109
110
111 from nltk_lite import evaluate
113 """
114 Score the accuracy of the tagger against the gold standard.
115 Strip the tags from the gold standard text, retag it using
116 the tagger, then compute the accuracy score.
117
118 @type tagger: C{TagI}
119 @param tagger: The tagger being evaluated.
120 @type gold: C{list} of C{Token}
121 @param gold: The list of tagged tokens to score the tagger on.
122 @rtype: C{float}
123 """
124
125 gold_tokens = []
126 test_tokens = []
127 for sent in gold:
128 sent = list(sent)
129 gold_tokens += sent
130 test_tokens += list(tagger.tag(untag(sent)))
131
132
133
134 return evaluate.accuracy(gold_tokens, test_tokens)
135
136
137
138 from unigram import *
139 from ngram import *
140 from brill import *
141