Package nltk_lite :: Package corpora :: Module web
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.web

 1  # Natural Language Toolkit: Webpage reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  # URL: <http://nltk.sf.net> 
 6  # For license information, see LICENSE.TXT 
 7   
 8  """ 
 9  Read tokens from a webpage 
10  """ 
11   
12  # to do: check html comments are being ignored properly 
13  # to do: add support for a cache directory 
14   
15  from urllib import urlopen 
16  from HTMLParser import HTMLParser 
17  from nltk_lite import tokenize 
18  import string 
19   
20  skip = ['script']   # non-nesting tags to skip 
21   
class MarkupCleaner(HTMLParser):
    """
    HTML parser that keeps the character data of a document and throws
    away the markup, along with the contents of any "skip" elements.

    Text chunks are appended to self.fed in the order they are parsed;
    clean_text() returns them concatenated into a single string.
    """

    def __init__(self, skip_tags=None):
        """
        Create a cleaner.

        skip_tags -- optional collection of non-nesting tag names whose
            contents are dropped.  When omitted, the module-level `skip`
            list is consulted each time a tag is seen.
        """
        self._skip_tags = skip_tags
        # Go through the base initializer instead of calling reset()
        # directly, so that whatever state the parser sets up in its
        # constructor exists.  The base initializer calls reset(), which
        # creates the text buffer and the skip flag below.
        HTMLParser.__init__(self)

    def reset(self):
        """Forget all collected text and restore the initial parser state."""
        self.fed = []        # text chunks collected so far
        self._flag = True    # False while inside a skipped element
        HTMLParser.reset(self)

    def _is_skipped(self, tag):
        """Return True if the contents of `tag` elements are dropped."""
        if self._skip_tags is None:
            # Looked up on every call so that changes to the module-level
            # list keep taking effect, as they did before.
            return tag in skip
        return tag in self._skip_tags

    def handle_data(self, d):
        """Keep a chunk of character data unless a skipped element is open."""
        if self._flag:
            self.fed.append(d)

    def handle_starttag(self, tag, attrs):
        """Stop collecting text when a skipped element opens."""
        if self._is_skipped(tag):
            self._flag = False

    def handle_endtag(self, tag):
        """Resume collecting text when a skipped element closes."""
        if self._is_skipped(tag):
            self._flag = True

    def clean_text(self):
        """Return all text collected so far as one string."""
        return ''.join(self.fed)
38
def raw(urls):
    """
    Generate the tokens of the text found on one or more web pages.

    urls -- a single URL string, or an iterable of URL strings.

    Each page is downloaded, stripped of markup with MarkupCleaner and
    split with tokenize.wordpunct; the tokens are yielded page by page,
    in the order the URLs were given.
    """
    # Accept a lone URL, whether it is a byte string or a unicode string.
    if isinstance(urls, (str, type(u''))):
        urls = (urls,)

    for url in urls:
        page = urlopen(url)
        try:
            html = page.read()
        finally:
            # Release the connection instead of waiting for garbage
            # collection to do it.
            page.close()

        # A fresh cleaner per page: clean_text() returns everything an
        # instance has collected, so sharing one cleaner would yield the
        # tokens of earlier pages again for every later URL.
        cleaner = MarkupCleaner()
        cleaner.feed(html)
        for token in tokenize.wordpunct(cleaner.clean_text()):
            yield token
49
def demo():
    """
    Demonstrate the reader: fetch the page at the hard-coded
    archives.gov URL, join its tokens with single spaces and print the
    result wrapped by textwrap.wrap with its default settings.
    """
    from textwrap import wrap
    from nltk_lite.corpora import web

    constitution = "http://www.archives.gov/national-archives-experience/charters/constitution_transcript.html"

    words = web.raw(constitution)
    text = ' '.join(words)
    wrapped_lines = wrap(text)
    # Parenthesised single argument: prints the same string whether
    # print is a statement or a function.
    print("\n".join(wrapped_lines))
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()