1
2
3
4
5
6
7
8 """
9 Read tokens from a webpage
10 """
11
12
13
14
15 from urllib import urlopen
16 from HTMLParser import HTMLParser
17 from nltk_lite import tokenize
18 import string
19
# Tag names that toggle text collection in the markup parser below: the
# visible handler fragments clear the collecting flag when a tag is in this
# list and set it again on a later match (presumably the start-tag and
# end-tag handlers respectively -- their headers are not visible here).
skip = [
    'script',
]
21
24 self.reset()
25 self.fed = []
26 self._flag = True
28 if self._flag:
29 self.fed.append(d)
31 if tag in skip:
32 self._flag = False
34 if tag in skip:
35 self._flag = True
def clean_text(self):
    """Return the collected text fragments as a single string.

    The fragments are the items accumulated in ``self.fed``; they are
    concatenated in order with no separator.  An empty ``self.fed``
    yields the empty string.
    """
    return ''.join(self.fed)
38
49
51 from nltk_lite.corpora import web
52 from textwrap import wrap
53
54 constitution = "http://www.archives.gov/national-archives-experience/charters/constitution_transcript.html"
55
56 text = string.join(web.raw(constitution))
57 print "\n".join(wrap(text))
58
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    demo()
61