1
2
3
4
5
6
7
8
9
10
11 """
12 This module provides functionality for reading language settings files for
13 Toolbox.
14 """
15
16 from nltk_lite.etree.ElementTree import TreeBuilder
17 from nltk_lite.contrib.toolbox.settings import ToolboxSettings
18 import re
19
# Attribute slots of a per-letter case record: the upper-case and lower-case
# forms of a letter. Presumably this belongs to the Letter class that the
# case-pair loop instantiates with Letter() -- the class statement itself is
# not visible in this listing, so confirm against the full file.
21 __slots__ = ('upper', 'lower')
22
25
26
28 """Class for Toolbox Language settings.
29 """
30 - def __init__(self, fname, encoding=None):
42
# NOTE(review): the statements between the signature and this point are not
# visible in this listing; `case_pairs` is presumably bound there -- confirm.
# Build self.case, a dict mapping each letter to a Letter record that holds
# both of its case forms.
44 self.case = case = {}
# One case association per line of case_pairs.
45 for c in case_pairs.splitlines():
46 val = c.split()
# A valid association is exactly two whitespace-separated items.
47 if len(val) != 2:
48 raise ValueError, '"%s" is not a valid case association' % c
# First item is the upper-case form, second item the lower-case form.
49 u, l = val
# Each form gets its own record so the dict can be indexed by either case;
# both records carry the same upper/lower pair.
50 let_u = case[u] = Letter()
51 let_l = case[l] = Letter()
52 let_u.upper = let_l.upper = u
53 let_u.lower = let_l.lower = l
54
# NOTE(review): the def line of this method is not visible in this listing;
# the body reads a name `let`, presumably the method's parameter -- confirm.
56 """return the lower case form of the letter.
57
58 @rtype: string
59 """
# self.case is a plain dict keyed by letter, so a letter without a case
# association raises KeyError here.
60 return self.case[let].lower
61
# NOTE(review): the def line of this method is not visible in this listing;
# the body reads a name `let`, presumably the method's parameter -- confirm.
63 """return the upper case form of the letter.
64
65 @rtype: string
66 """
# self.case is a plain dict keyed by letter, so a letter without a case
# association raises KeyError here.
67 return self.case[let].upper
68
69
# Attribute slots of a sort-order entry. Presumably this belongs to the Graph
# class instantiated with Graph() while the sort order is built (the class
# statement is not visible in this listing -- confirm). In that code `type` is
# set to 'p' for primaries and 's' for secondaries, and `order` is set to an
# (i, j, unmarked) tuple for primaries and a single integer for secondaries.
71 """"""
72 __slots__ = ('order', 'type')
73
76
77
79 """Class for Shoebox sort orders
80
81 """
82
# NOTE(review): the def line for this body is not visible in this listing; the
# body reads `srt_order`, presumably an element-tree node passed in as the
# method's argument -- confirm.
# Name and description of the sort order come from the element's own text and
# from its 'desc' child.
84 self.name = srt_order.text
85 self.desc = srt_order.findtext('desc')
86
# Each optional child is read as text and split. The AttributeError fallback
# presumably covers findtext() returning None when the child is missing.
# 'primary' is split into lines; the other fields are split on whitespace.
87 try:
88 primary = srt_order.findtext('primary').splitlines()
89 except AttributeError:
90 primary = []
91 try:
92 sec_pre = srt_order.findtext('SecPreceding').split()
93 except AttributeError:
94 sec_pre = []
95 try:
96 sec_fol = srt_order.findtext('SecFollowing').split()
97 except AttributeError:
98 sec_fol = []
# NOTE(review): `ignore` is parsed here but not used anywhere in the visible
# part of this body.
99 try:
100 ignore = srt_order.findtext('ignore').split()
101 except AttributeError:
102 ignore = []
# True when a 'SecAfterBase' child element is present.
103 self.sec_after = srt_order.find('SecAfterBase') is not None
104
# Each line of 'primary' becomes a list of whitespace-separated graphs.
105 primaries = [p.split() for p in primary]
106
# self.graphs maps every graph string to its Graph entry.
107 self.graphs = graphs = {}
# Secondary rank recorded on primaries: one past the preceding secondaries,
# which are numbered 1..len(sec_pre) below.
108 unmarked = len(sec_pre) + 1
# Put the space character first. It is inserted as a plain string, so the
# inner loop iterates its single character and ' ' gets order (1, 1, unmarked).
109 primaries[0:0] = [' ']
# i numbers the primary lines, j the position of a graph within its line.
110 i = 1
111 for p in primaries:
112 j = 1
113 for m in p:
114 if m in graphs:
115 raise ValueError, 'primary "%s" already in sort order' % m
116 graphs[m] = g = Graph()
117 g.type = 'p'
118 g.order = (i, j, unmarked)
119 j += 1
120 i += 1
# At this point graphs holds only primaries. The letter pattern is built from
# them without the space. Calling .remove() requires keys() to return a list,
# as it does in Python 2.
121 prims = graphs.keys()
122 prims.remove(' ')
123 self.letter_pat = self.make_pattern(prims)
124
# Preceding secondaries are ranked 1, 2, ... in the order listed.
125 i = 1
126 for s in sec_pre:
127 if s in graphs:
128 raise ValueError, 'secondary preceding "%s" already in sort order' % s
129 graphs[s] = g = Graph()
130 g.type = 's'
131 g.order = i
132 i += 1
133
134
# i now equals `unmarked`; skip that rank so following secondaries start one
# past it.
135 i += 1
136 for s in sec_fol:
137 if s in graphs:
138 raise ValueError, 'secondary following "%s" already in sort order' % s
139 graphs[s] = g = Graph()
140 g.type = 's'
141 g.order = i
142 i += 1
143
# Pattern over every graph: primaries (including the space) and secondaries.
144 self.graph_pat = self.make_pattern(graphs.keys())
145
146
147
148
149
150
151
152
153
def make_pattern(self, slist):
    """Return a compiled regular expression matching any string in slist.

    Every string is matched literally (regex metacharacters are escaped).
    Alternatives are ordered longest first so that, at a given position, a
    multi-character string is tried before a shorter string that is its
    prefix (e.g. "ng" before "n").

    @param slist: the literal strings the pattern should match
    @type slist: iterable of strings
    @return: the compiled pattern; for an empty slist, a pattern that
        never matches
    @rtype: compiled regular expression object
    """
    # Same order as sorting (length, string) pairs ascending and reversing.
    longest_first = sorted(slist, key=lambda g: (len(g), g), reverse=True)
    if not longest_first:
        # Joining no alternatives gives '', which matches the empty string at
        # every position; an empty negative lookahead can never match.
        return re.compile('(?!)')
    return re.compile('|'.join([re.escape(g) for g in longest_first]))
164
# NOTE(review): the def line of this method is not visible in this listing;
# the body reads a name `s`, presumably the method's parameter -- confirm.
166 """return the first primary in the string s"""
# self.letter_pat is the pattern built from the primaries (space excluded);
# search() returns the leftmost match in s, or None when there is none.
167 match = self.letter_pat.search(s)
168 if match is not None:
169 return match.group()
170 else:
171 raise ValueError, 'no primary found in "%s"' % s
172
199