Package nltk_lite :: Package corpora :: Module state_union
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.state_union

  1  # Natural Language Toolkit: Presidential State of the Union Address Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Steven Bird <sb@ldc.upenn.edu> 
  5  # URL: <http://nltk.sf.net> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  C-Span State of the Union Address Corpus 
 10   
 11  Annual US presidential addresses 1945-2005 
 12   
 13  http://www.c-span.org/executive/stateoftheunion.asp 
 14  """        
 15   
 16  from nltk_lite.corpora import get_basedir 
 17  from nltk_lite import tokenize 
 18  import os, re 
 19   
 20  items = [ 
 21      '1945-Truman', 
 22      '1946-Truman', 
 23      '1947-Truman', 
 24      '1948-Truman', 
 25      '1949-Truman', 
 26      '1950-Truman', 
 27      '1951-Truman', 
 28      '1953-Eisenhower', 
 29      '1954-Eisenhower', 
 30      '1955-Eisenhower', 
 31      '1956-Eisenhower', 
 32      '1957-Eisenhower', 
 33      '1958-Eisenhower', 
 34      '1959-Eisenhower', 
 35      '1960-Eisenhower', 
 36      '1961-Kennedy', 
 37      '1962-Kennedy', 
 38      '1963-Johnson', 
 39      '1963-Kennedy', 
 40      '1964-Johnson', 
 41      '1965-Johnson-1', 
 42      '1965-Johnson-2', 
 43      '1966-Johnson', 
 44      '1967-Johnson', 
 45      '1968-Johnson', 
 46      '1969-Johnson', 
 47      '1970-Nixon', 
 48      '1971-Nixon', 
 49      '1972-Nixon', 
 50      '1973-Nixon', 
 51      '1974-Nixon', 
 52      '1975-Ford', 
 53      '1976-Ford', 
 54      '1977-Ford', 
 55      '1978-Carter', 
 56      '1979-Carter', 
 57      '1980-Carter', 
 58      '1981-Reagan', 
 59      '1982-Reagan', 
 60      '1983-Reagan', 
 61      '1984-Reagan', 
 62      '1985-Reagan', 
 63      '1986-Reagan', 
 64      '1987-Reagan', 
 65      '1988-Reagan', 
 66      '1989-Bush', 
 67      '1990-Bush', 
 68      '1991-Bush-1', 
 69      '1991-Bush-2', 
 70      '1992-Bush', 
 71      '1993-Clinton', 
 72      '1994-Clinton', 
 73      '1995-Clinton', 
 74      '1996-Clinton', 
 75      '1997-Clinton', 
 76      '1998-Clinton', 
 77      '1999-Clinton', 
 78      '2000-Clinton', 
 79      '2001-Bush-1', 
 80      '2001-Bush-2', 
 81      '2002-Bush', 
 82      '2003-Bush', 
 83      '2004-Bush', 
 84      '2005-Bush' 
 85  ] 
 86   
def raw(files = items):
    """
    Generate the tokens of one or more State of the Union speeches.

    @param files: a single item name (such as C{'1945-Truman'}) or a
        sequence of item names; defaults to every entry of C{items}.
    @return: a generator that yields, for each requested file in order,
        the tokens produced by C{tokenize.wordpunct} from the file's text.
    @raise IOError: propagated from C{open} when a corpus file cannot be
        opened (the error surfaces when the generator is first advanced
        onto that file, not when C{raw} is called).
    """
    # A lone string would otherwise be iterated character by character.
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "state_union", name + ".txt")
        # Read the whole speech and release the handle before yielding, so
        # no file stays open while the generator is suspended or abandoned.
        with open(path) as stream:
            text = stream.read()
        for token in tokenize.wordpunct(text):
            yield token
97
def demo():
    """
    Print, for every speech listed in C{state_union.items}, the first four
    characters of its name (the year) followed by the number of tokens
    equal to C{'men'} in that speech.
    """
    from nltk_lite.corpora import state_union

    for speech in state_union.items:
        year = speech[:4]
        # Count while streaming; there is no need to hold every token of
        # the speech in a list just to count one word.
        freq = sum(1 for token in state_union.raw(speech) if token == 'men')
        # A single pre-formatted argument prints "year freq" identically
        # under the print statement and the print() function.
        print("%s %s" % (year, freq))
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()