1
2
3
4
5
6
7
8
9
10
11
12
13 """
14 Module for reading, writing and manipulating Toolbox databases.
15 """
16
17 import os, re
18 from nltk_lite.corpora import get_basedir
19 from string import split
20 from itertools import imap
21 from StringIO import StringIO
22 from nltk_lite.etree.ElementTree import TreeBuilder, Element
23
127
210
211
213 """
214 Return an element tree resulting from parsing the toolbox datafile.
215
216 A convenience function that creates a C{ToolboxData} object, opens and
217 parses the toolbox data file. The data file is assumed to be in the toolbox
218 subdirectory of the directory where NLTK looks for corpora,
219 see L{corpora.get_basedir()}.
220 @param file_name: Name of file in toolbox corpus directory
221 @type file_name: string
222 @param key: marker at the start of each record
223 @type key: string
224 @param kwargs: Keyword arguments passed to L{ToolboxData.parse()}
225 @type kwargs: keyword arguments dictionary
226 @rtype: ElementTree._ElementInterface
227 @return: contents of toolbox data divided into header and records
228 """
229 db = ToolboxData()
230 db.open(os.path.join(get_basedir(), 'toolbox', file_name))
231 return db.parse(key, **kwargs)
232
233 import re
234
# Matches any non-whitespace character.  Used to decide whether a field has a
# value that must be separated from its marker by a space.
_is_value = re.compile(r"\S")


def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """Return a string with a standard format representation of the toolbox
    data in tree (tree can be a toolbox database or a single record).

    Every field is written as a backslash, the marker, a space and the
    value, terminated by a newline.  The space is left out when the value
    contains no non-whitespace character.  A field whose text is C{None} is
    written like a field with an empty value.  Consecutive records are
    separated by an empty line.

    @param tree: flat representation of toolbox data (whole database or single record)
    @type tree: ElementTree._ElementInterface
    @param encoding: Name of an encoding to use.  When given, every field is
        encoded before being joined; when C{None} the fields are joined
        without any encoding step.
    @type encoding: string
    @param errors: Error handling scheme for codec. Same as the C{encode}
        inbuilt string method.
    @type errors: string
    @param unicode_fields: markers of the fields that are encoded as utf8
        instead of C{encoding}.  Membership is tested with C{in}.
    @type unicode_fields: container of strings
    @rtype: string
    @return: string using standard format markup
    @raise ValueError: if C{tree} is neither a C{record} nor a
        C{toolbox_data} element, or if C{unicode_fields} is given without
        C{encoding}.
    """
    if tree.tag == 'record':
        # Wrap a lone record so that it can be handled like a database.
        root = Element('toolbox_data')
        root.append(tree)
        tree = root

    if tree.tag != 'toolbox_data':
        raise ValueError("not a toolbox_data element structure")
    if encoding is None and unicode_fields is not None:
        raise ValueError(
            "if encoding is not specified then neither should unicode_fields")

    if encoding is None:
        record_sep, joiner = '\n', ''
    else:
        # Encoded fields are byte strings, so the separator and the joiner
        # have to be byte strings as well.
        record_sep, joiner = '\n'.encode('ascii'), ''.encode('ascii')

    pieces = []
    for rec in tree:
        pieces.append(record_sep)
        for field in rec:
            mkr = field.tag
            value = field.text
            if value is None:
                # An element without text carries no value at all.
                value = ''
            if _is_value.search(value):
                gap = ' '
            else:
                gap = ''
            if encoding is None:
                pieces.append("\\%s%s%s\n" % (mkr, gap, value))
            else:
                if unicode_fields is not None and mkr in unicode_fields:
                    cur_encoding = 'utf8'
                else:
                    cur_encoding = encoding
                text = u"\\%s%s%s\n" % (mkr, gap, value)
                pieces.append(text.encode(cur_encoding, errors))
    # The first piece is the separator in front of the first record.
    return joiner.join(pieces[1:])
284
def _parse_record(s):
    """
    Deprecated: use C{StandardFormat.fields()}

    Split a toolbox record into its fields.  A field starts with a
    backslash at the beginning of a line; the marker is everything up to
    the first space and the value is the remainder of the field.  A field
    without a space gets the empty string as its value.

    @param s: toolbox record as a string
    @type s: L{string}
    @rtype: iterator over L{tuple(string, string)}
    @return: the (marker, value) pairs of the record, in order
    """
    # Prefix a newline so that the first field is introduced by "\n\\" too.
    s = "\n" + s
    if s.endswith("\n"):
        s = s[:-1]
    # Everything in front of the first marker is dropped.
    for field in s.split("\n\\")[1:]:
        parts = field.split(" ", 1)
        if len(parts) == 2:
            yield (parts[0], parts[1])
        else:
            yield (parts[0], '')
302
303
def raw(files='rotokas.dic', include_header=False, head_field_marker=None):
    """
    Deprecated: use C{StandardFormat.fields()}

    Read each file from the C{toolbox} subdirectory of the corpus base
    directory and yield its records one at a time.  If the stripped file
    content starts with C{\\_} the text up to the first empty line is taken
    as the header and the rest as the body.  The body is cut into records
    at every empty line that is followed by the head field marker.

    @param files: One or more toolbox files to be processed
    @type files: L{string} or L{tuple(string)}
    @param include_header: flag that determines whether to treat header as record (default is no)
    @type include_header: boolean
    @param head_field_marker: option for explicitly setting which marker to use as the head field
                              when parsing the file (default is automatically determining it from
                              the first field of the first record)
    @type head_field_marker: string
    @rtype: iterator over L{list(string)}
    @return: one list of (marker, value) pairs per record
    """
    if isinstance(files, str):
        files = (files,)

    for file_name in files:
        path = os.path.join(get_basedir(), "toolbox", file_name)
        # "U" is Python 2 universal-newline mode, so the "\n"-based
        # splitting below also sees other line ending conventions as "\n".
        stream = open(path, "U")
        try:
            contents = stream.read()
        finally:
            stream.close()

        if contents.strip().startswith(r"\_"):
            (header, body) = contents.split("\n\n", 1)
            if include_header:
                yield list(_parse_record(header))
        else:
            body = contents

        # Work out the marker that starts every record.
        if head_field_marker:
            hfm_with_backslash = "\\" + head_field_marker
        else:
            first_line = body.split("\n", 1)[0]
            hfm_with_backslash = first_line.split(" ", 1)[0]
        recordsep = "\n\n" + hfm_with_backslash

        # The separator swallows the head marker, so it is put back in
        # front of every record before the record is parsed.
        for record in ("\n\n" + body).split(recordsep)[1:]:
            yield list(_parse_record(hfm_with_backslash + record))
343
344
def dictionary(files='rotokas.dic', include_header=False):
    """
    Deprecated: use C{ToolboxData.parse()}

    Lazily convert every record produced by C{raw()} into a dictionary by
    passing the record to C{dict}.

    @param files: One or more toolbox files to be processed
    @type files: L{string} or L{tuple(string)}
    @param include_header: treat header as entry?
    @type include_header: boolean
    @rtype: iterator over L{dict}
    """
    records = raw(files, include_header)
    return (dict(record) for record in records)
356
def _dict_list_entry(entry):
    """
    Group the values of an entry by field name.

    Items of C{entry} whose length is not 2 are ignored.

    @param entry: the fields of one entry
    @type entry: iterable of (name, value) pairs
    @rtype: L{dict}
    @return: mapping from each field name to the list of its values, in
        the order in which they occur in C{entry}
    """
    d = {}
    for field in entry:
        if len(field) == 2:
            name, value = field
            d.setdefault(name, []).append(value)
    return d
366
367
def dict_list(files='rotokas.dic', include_header=False):
    """
    Deprecated: use C{ToolboxData.parse()}

    Yield one dictionary per record produced by C{raw()}.  Each dictionary
    is built by C{_dict_list_entry()} from the fields of the record.

    @param files: One or more toolbox files to be processed
    @type files: L{string} or L{tuple(string)}
    @param include_header: treat header as entry?
    @type include_header: boolean
    @rtype: iterator over L{dict}
    """
    # A single file name is turned into a one-element tuple.
    if isinstance(files, str):
        files = (files,)

    for entry in raw(files, include_header):
        yield _dict_list_entry(entry)
384
def demo():
    """
    Print sample output of C{raw()}, C{dictionary()} and C{dict_list()}.

    The first three items of each function are shown for the default file,
    followed by the complete output for C{test.dic}, without and with its
    header.
    """
    from nltk_lite.corpora import toolbox
    from itertools import islice
    from pprint import pprint

    print('Raw:')
    pprint(list(islice(toolbox.raw(), 3)))

    print('Dictionary:')
    pprint(list(islice(toolbox.dictionary(), 3)))

    print('Dictionary-List:')
    pprint(list(islice(toolbox.dict_list(), 3)))

    print('Complex test cases, no header')
    pprint(list(toolbox.raw("test.dic")))

    print('Complex test cases, no header, dictionary')
    pprint(list(toolbox.dictionary("test.dic")))

    print('Complex test cases, no header, dictionary list')
    pprint(list(toolbox.dict_list("test.dic")))

    print('Complex test cases, with header')
    pprint(list(toolbox.raw("test.dic", include_header=True)))


if __name__ == '__main__':
    demo()
413