Package nltk_lite :: Package corpora :: Module shakespeare
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.shakespeare

 1  # Natural Language Toolkit: Shakespeare XML Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@csse.unimelb.edu.au> 
 5  # URL: <http://nltk.sf.net> 
 6  # For license information, see LICENSE.TXT 
 7   
 8  """ 
 9  Read from the Shakespeare XML Corpus Sample 
10   
11  http://www.andrew.cmu.edu/user/akj/shakespeare/ 
12   
13  Marked up in XML by Jon Bosak, CSS stylesheet by Ajay Juneja. 
14  """        
15   
16  import os 
17  from nltk_lite.corpora import get_basedir 
18  from nltk_lite.etree import ElementTree 
19   
# Identifiers of the plays included in the corpus sample.  Each one is
# passed to xml() as the base name of the play's file (no ".xml" suffix).
items = [
    'a_and_c',   # Antony and Cleopatra
    'dream',     # A Midsummer Night's Dream
    'hamlet',    # Hamlet
    'j_caesar',  # Julius Caesar
    'macbeth',   # Macbeth
    'merchant',  # The Merchant of Venice
    'othello',   # Othello
    'r_and_j',   # Romeo and Juliet
]
29   
def xml(file):
    """
    Parse one play from the Shakespeare XML corpus sample.

    The play is read from C{<basedir>/shakespeare/<file>.xml}, where
    C{<basedir>} is whatever C{get_basedir()} returns.

    @param file: The name of the play to load, without the C{.xml}
        extension (for example C{'dream'}).
    @type file: L{string}
    @return: The root element of the parsed XML document.
    @rtype: C{Element}
    """

    filename = file + ".xml"
    tree = ElementTree.parse(os.path.join(get_basedir(), "shakespeare", filename))
    return tree.getroot()
39
def demo():
    """
    Demonstrate the corpus reader on A Midsummer Night's Dream.

    Loads the play with C{shakespeare.xml('dream')} and prints, in order:
    the root element's children, the text of its first child, the list of
    personae, the speakers whose names do not match any persona, and a
    mapping from each speaker to the set of speakers who follow them
    within a scene.
    """
    from nltk_lite.corpora import shakespeare
    from pprint import pprint
    import re

    play = shakespeare.xml('dream')

    # Single-argument print(...) and print("") produce exactly the same
    # output as the print statement and a bare "print" under Python 2.
    print("Access the subelements")
    print(play.getchildren())
    print("")

    print("Access the text content of the first subelement")
    print(play[0].text)
    print("")

    print("Persona")
    personae = [node.text for node in play.findall('PERSONAE/PERSONA')]
    print(personae)
    print("")

    print("Are any speakers not identified as personae?")
    # A persona's "name" is the leading run of capital letters in its text.
    leading_caps = re.compile(r'[A-Z]*')
    names = set(leading_caps.match(text).group() for text in personae)
    speakers = set(node.text for node in play.findall('*/*/*/SPEAKER'))
    print(speakers.difference(names))
    print("")

    print("who responds to whom?")
    responds_to = {}
    for scene in play.findall('ACT/SCENE'):
        # Reset at each scene so that the first speaker of a scene is not
        # recorded as responding to the last speaker of the previous one.
        previous = None
        for node in scene.findall('SPEECH/SPEAKER'):
            current = node.text
            if previous:
                responds_to.setdefault(previous, set()).add(current)
            previous = current
    pprint(responds_to)
    print("")

# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()