Package nltk_lite :: Package contrib :: Module ieer_rels
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.ieer_rels

  1  # Natural Language Toolkit: Relation Extraction 
  2  # 
  3  # Author: Ewan Klein <ewan@inf.ed.ac.uk> 
  4  # URL: <http://nltk.sf.net> 
  5  # For license information, see LICENSE.TXT 
  6   
  7  """ 
  8  Code for extracting triples of the form C{(subj, filler, obj)} from the ieer corpus, 
  9  after the latter has been converted to chunk format. 
  10  C{subj} and C{obj} are a pair of Named Entities, and C{filler} is the string of words occurring between C{subj} and C{obj} (with no intervening NEs). 
 11  Subsequent processing can try to identify interesting relations expressed in  
 12  C{filler}. 
 13  """ 
 14   
 15  from nltk_lite.corpora import ieer, conll2002 
 16  from nltk_lite.parse import tree, Tree 
 17  from nltk_lite.tag import tag2tuple 
 18  from string import join 
 19  import re 
 20  from itertools import islice 
 21   
 22  ne_types = {'ieer': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',  
 23                      'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'], 
 24              'conll2002': ['LOC', 'PER', 'ORG'], 
 25              'conll2002-ned': ['LOC', 'PER', 'ORG'], 
 26              'conll2002-esp': ['LOC', 'PER', 'ORG'] 
 27                  } 
 28                       
 29   
 30  short2long = dict(LOC = 'LOCATION', ORG = 'ORGANIZATION', PER = 'PERSON') 
 31  long2short = dict(LOCATION ='LOC', ORGANIZATION = 'ORG', PERSON = 'PER') 
 32   
 33  corpora = { 
 34      'ieer': (d[key] for key in ['text','headline'] for d in ieer.dictionary()), 
 35      'conll2002': (tree for tree in conll2002.ne_chunked()), 
 36      'conll2002-ned': (tree for tree in conll2002.ne_chunked(files = ['ned.train'])), 
 37      'conll2002-esp': (tree for tree in conll2002.ne_chunked(files = ['esp.train'])) 
 38  } 
 39   
def check_words(s):
    """
    Reject candidate filler strings that contain noisy punctuation.

    A string containing a full stop, an underscore or a hyphen anywhere
    is discarded; any other string is handed back unchanged.

    @param s: The string to be filtered
    @type s: C{string}
    @return: C{s} itself when it is acceptable, otherwise C{None}
    @rtype: C{string} or C{None}
    """
    if re.search(r'[._-]', s) is not None:
        return None
    return s
53
def check_type(tree, type=None):
    """
    Decide whether a Named Entity (represented as a C{Tree}) carries the
    requested label, by comparing the tree's root node with C{type}.

    @param tree: The candidate Named Entity
    @type tree: C{Tree}
    @param type: The required root label; C{None} accepts every tree
        without inspecting it.
    @type type: C{string} or C{None}
    @rtype: C{bool}
    """
    return type is None or tree.node == type
67
def _tuple2tag(item):
    """
    Render a C{(token, tag)} pair as the string C{'token/tag'}.

    Anything that is not a tuple is handed back unchanged.
    """
    if not isinstance(item, tuple):
        return item
    word, label = item
    return word + "/" + str(label)
73
def ne_fillers(t, stype=None, otype=None):
    """
    Search through a chunk structure, looking for relational triples.
    These consist of
      - a Named Entity (i.e. subtree), called the 'subject' of the triple,
      - a string of words (i.e. leaves), called the 'filler' of the triple,
      - another Named Entity, called the 'object' of the triple.

    To help in data analysis, we also identify a fourth item, C{rcon},
    i.e., up to four items of right context immediately following the
    second Named Entity.

    The object is always the first Named Entity that follows the subject.
    A candidate is dropped when the two entities are adjacent (empty
    filler), when more than ten leaves separate them, when the filler is
    rejected by C{check_words}, or when the object has the wrong type.
    Apart from the first and last, every Named Entity can occur as both
    the subject and the object of a triple.

    The parameters C{stype} and C{otype} can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').

    @param t: a chunk structured portion of the C{ieer} corpus.
    @type t: C{Tree}
    @param stype: the type of the subject Named Entity (by default, all
        types are admissible).
    @type stype: C{string} or C{None}.
    @param otype: the type of the object Named Entity (by default, all
        types are admissible).
    @type otype: C{string} or C{None}.
    @return: a list of 4-tuples C{(subj, filler, obj, rcon)}, in the order
        in which the subjects occur in C{t}.
    @rtype: C{list}
    """
    window = 10        # maximum number of leaves allowed in a filler
    items = list(t)
    triples = []
    # Iterate rather than recurse once per Named Entity: a long document
    # cannot exhaust the recursion limit, and errors are never swallowed.
    for i, subj in enumerate(items):
        if not (isinstance(subj, Tree) and check_type(subj, stype)):
            continue
        words = []
        for j in range(i + 1, len(items)):
            item = items[j]
            if not isinstance(item, Tree):
                # accumulate some words
                words.append(_tuple2tag(item))
                continue
            # item is the next NE; it is the only potential object for subj
            if len(words) <= window:
                filler = check_words(" ".join(words))
            else:
                filler = None
            if filler and check_type(item, otype):
                rcon = [_tuple2tag(x) for x in items[j + 1:j + 5]]
                triples.append((subj, filler, item, rcon))
            break
    return triples
142
def _expand(type):
    """
    Map an abbreviated Named Entity label to its long form via
    C{short2long}; labels with no entry there map to the empty string.
    """
    return short2long.get(type, '')
148
def _resolve_ne_type(label, known, role):
    """
    Return C{label} in the spelling used by C{known}, expanding a short
    form if necessary; raise C{ValueError} when it cannot be resolved.
    """
    if label in known:
        return label
    expanded = _expand(label)
    if expanded in known:
        return expanded
    raise ValueError("your value for the %s type has not been recognized: %s"
                     % (role, label))


def relextract(stype, otype, corpus='ieer', pattern=None, rcontext=None):
    """
    Extract a relation by filtering the results of C{ne_fillers}.

    This is a generator function, so the arguments are only validated
    (and the exceptions below only raised) once iteration starts.

    @param stype: the type of the subject Named Entity; a short form is
        expanded when the corpus uses the long form.
    @type stype: C{string}
    @param otype: the type of the object Named Entity; a short form is
        expanded when the corpus uses the long form.
    @type otype: C{string}
    @param corpus: the key in C{corpora} of the corpus to be processed.
    @type corpus: C{string}
    @param pattern: a regular expression for filtering the fillers of
        retrieved triples; it is applied with C{match}, i.e. anchored at
        the start of the filler.
    @type pattern: C{SRE_Pattern}
    @param rcontext: if C{True}, a few words of right context are added
        to the output triples.
    @type rcontext: C{bool}
    @return: generates 3-tuples or 4-tuples <subj, filler, obj, rcontext>.
    @rtype: C{generator}
    @raise KeyError: if C{corpus} is not a recognized corpus name.
    @raise ValueError: if C{stype} or C{otype} is not a Named Entity type
        of the chosen corpus.
    """
    if corpus not in corpora:
        # Fail with an explicit message rather than printing and then
        # crashing on the next dictionary lookup.
        raise KeyError("corpus not recognized: '%s'" % corpus)
    trees = corpora[corpus]
    known = ne_types[corpus]

    stype = _resolve_ne_type(stype, known, 'subject')
    otype = _resolve_ne_type(otype, known, 'object')

    for tree in trees:
        for (subj, filler, obj, rcon) in ne_fillers(tree, stype=stype,
                                                    otype=otype):
            if pattern and not pattern.match(filler):
                continue
            if rcontext:
                yield subj, filler, obj, rcon
            else:
                yield subj, filler, obj
193
def _shorten(type):
    """
    Map a long Named Entity label to its abbreviation via C{long2short};
    labels with no entry there are handed back unchanged.
    """
    return long2short.get(type, type)
199
def _show(item, tags=None):
    """
    Produce a compact display string for one component of a triple.

      - A C{Tree} is shown as C{'[LABEL: word word ...]'}, where the label
        is the tree's root node shortened by C{_shorten}.
      - A list is shown as its elements, each rendered by C{_show} and
        joined by single spaces.
      - Anything else is returned as is when C{tags} is true; otherwise it
        is passed to C{tag2tuple} and the first element of the result is
        returned (presumably the bare word of a C{'word/TAG'} string --
        confirm against C{nltk_lite.tag}).

    @param item: a Named Entity, a list of context items, or a single leaf.
    @param tags: if true, leaves keep their tag.
    @type tags: C{bool} or C{None}
    @rtype: C{string}
    """
    if isinstance(item, Tree):
        label = _shorten(item.node)
        # Test the leaf type explicitly: unpacking a two-character string
        # leaf such as "US" as a (word, tag) pair would succeed and
        # silently truncate it to "U".
        words = [leaf[0] if isinstance(leaf, tuple) else leaf
                 for leaf in item.leaves()]
        return '[%s: %s]' % (label, " ".join(words))
    if isinstance(item, list):
        return " ".join([_show(e, tags) for e in item])
    if tags:
        return item
    return tag2tuple(item)[0]
217
def show_tuple(t):
    """
    Format a relation tuple as a single succinct line.

    The subject and object are rendered by C{_show}; the filler is used
    verbatim.  When a fourth item is present it is rendered as well and
    appended as trailing right context.

    @param t: a (subj, filler, obj) tuple (possibly with right context as
        a fourth item).
    @type t: C{tuple}
    @rtype: C{string}
    """
    fields = (_show(t[0]), t[1], _show(t[2]))
    if len(t) <= 3:
        return '%s %s %s' % fields
    return '%s %s %s (%s...' % (fields + (_show(t[3]),))
230
def demo():
    """
    A demonstration of relations extracted by simple regexps:
      - in(ORG, LOC) and has_role(PER, ORG) from the ieer corpus,
      - all candidate triples found in the ieer headlines,
      - van(PER, ORG) from the Dutch CoNLL2002 data, and
      - de(ORG, LOC) from the Spanish CoNLL2002 data.

    Results are printed to standard output.
    """
    ############################################
    # Example of in(ORG, LOC)
    ############################################
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

    print("in(ORG, LOC):")
    print("=" * 30)
    for r in islice(relextract('ORG', 'LOC', pattern=IN), 29, 39):
        print(show_tuple(r))
    print('')

    ############################################
    # Example of has_role(PER, ORG)
    ############################################
    # The role group is closed after the LAST role, so that every role may
    # occur anywhere in the filler (relextract applies the pattern with
    # match(), which is anchored at the start).  "the" is optional as a
    # whole word, as the trailing comment describes.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer).*)|
    ,\sof\s(the\s)?        # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print("has_role(PER, ORG):")
    print("=" * 30)
    for r in islice(relextract('PER', 'ORG', pattern=ROLES, rcontext=True), 10):
        print(show_tuple(r))
    print('')

    ############################################
    # Show what's in the IEER Headlines
    ############################################

    print("NER in Headlines")
    print("=" * 30)
    for d in ieer.dictionary():
        tree = d['headline']
        for r in ne_fillers(tree):
            # drop the right context before display
            print(show_tuple(r[:-1]))
    print('')

    ############################################
    # Dutch CONLL2002: van(PER, ORG)
    ############################################

    vnv = """
    (
    is/V|
    was/V|
    werd/V|
    wordt/V
    )
    .*
    van/Prep
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print("van(PER, ORG):")
    print("=" * 30)
    for r in relextract('PER', 'ORG', corpus='conll2002-ned', pattern=VAN):
        print(show_tuple(r))
    print('')

    ############################################
    # Spanish CONLL2002: de(ORG, LOC)
    ############################################

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print("de(ORG, LOC):")
    print("=" * 30)
    for r in islice(relextract('ORG', 'LOC', corpus='conll2002-esp', pattern=DE), 10):
        print(show_tuple(r))
    print('')
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()