18 r"""highlight.py: Highlight and summarise text.
19
20 """
21 __docformat__ = "restructuredtext en"
22
23 import re
24 import xapian
27 """Class for highlighting text and creating contextual summaries.
28
29 >>> hl = Highlighter("en")
30 >>> hl.makeSample('Hello world.', ['world'])
31 'Hello world.'
32 >>> hl.highlight('Hello world', ['world'], ('<', '>'))
33 'Hello <world>'
34
35 """
36
37
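    # Regular expression used to tokenise text: it matches, in this order,
    # opening markup tags, closing markup tags, words (keeping apostrophes),
    # runs of whitespace, and runs of other punctuation.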
    _split_re = re.compile(r'<\w+[^>]*>|</\w+>|[\w\']+|\s+|[^\w\'\s<>/]+')

    def __init__(self, language_code='en', stemmer=None):
        """Create a new highlighter for the specified language.

        - `language_code` is the language code passed to xapian.Stem when no
          `stemmer` is supplied.
        - `stemmer` is an optional stemming object (such as a xapian.Stem
          instance); if supplied, it is used instead of creating a new one.

        """
        if stemmer is not None:
            self.stem = stemmer
        else:
            self.stem = xapian.Stem(language_code)

    def _split_text(self, text, strip_tags=False):
        """Split some text into words and non-words.

        - `text` is the text to process.  It may be a unicode object or a utf-8
          encoded simple string.
        - `strip_tags` is a flag - False to keep tags, True to strip all tags
          from the output.

        Returns a list of utf-8 encoded simple strings.

        """
        if isinstance(text, unicode):
            text = text.encode('utf-8')

        words = self._split_re.findall(text)
        if strip_tags:
            return [w for w in words if w[0] != '<']
        else:
            return words

70 """Strip the prefix off a term.
71
72 Prefixes are any initial capital letters, with the exception that R always
73 ends a prefix, even if followed by capital letters.
74
75 >>> hl = Highlighter("en")
76 >>> print hl._strip_prefix('hello')
77 hello
78 >>> print hl._strip_prefix('Rhello')
79 hello
80 >>> print hl._strip_prefix('XARHello')
81 Hello
82 >>> print hl._strip_prefix('XAhello')
83 hello
84 >>> print hl._strip_prefix('XAh')
85 h
86 >>> print hl._strip_prefix('XA')
87 <BLANKLINE>
88
89 """
90 for p in xrange(len(term)):
91 if term[p].islower():
92 return term[p:]
93 elif term[p] == 'R':
94 return term[p+1:]
95 return ''
96
98 """Convert a query to a list of stemmed words.
99
100 - `query` is the query to parse: it may be xapian.Query object, or a
101 sequence of terms.
102
103 """
        if isinstance(query, xapian.Query):
            return [self._strip_prefix(t) for t in query]
        else:
            return [self.stem(q.lower()) for q in query]

    def makeSample(self, text, query, maxlen=600, hl=None):
        """Make a contextual summary from the supplied text.

        This works by splitting the text into phrases, counting the query
        terms in each, and keeping those with the most matches.

        Any markup tags in the text will be stripped.

        `text` is the source text to summarise.
        `query` is either a Xapian query object or a list of (unstemmed) term strings.
        `maxlen` is the maximum length of the generated summary.
        `hl` is a pair of strings to insert around highlighted terms, e.g. ('<b>', '</b>')

        """

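        # Make sure maxlen is an integer, since it is compared against
        # character counts below.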
        maxlen = int(maxlen)

        words = self._split_text(text, True)
        terms = self._query_to_stemmed_words(query)

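        # Split the text into blocks delimited by punctuation, counting the
        # matching query terms in each.  Each block is a list:
        # [start_word, end_word, char_count, matching_term_count, selected].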
        blocks = []
        start = end = count = blockchars = 0

        while end < len(words):
            blockchars += len(words[end])
            if words[end].isalnum():
                if self.stem(words[end].lower()) in terms:
                    count += 1
                end += 1
            elif words[end] in ',.;:?!\n':
                end += 1
                blocks.append([start, end, blockchars, count, False])
                start = end
                blockchars = 0
                count = 0
            else:
                end += 1
        if start != end:
            blocks.append([start, end, blockchars, count, False])
        if len(blocks) == 0:
            return ''

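        # Pick the blocks to show: prefer blocks containing the most query
        # terms, and keep selecting until at least maxlen characters are
        # covered (or we run out of blocks).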
        chars = 0
        for count in xrange(3, -1, -1):
            for b in blocks:
                if b[3] >= count:
                    b[4] = True
                    chars += b[2]
                    if chars >= maxlen: break
            if chars >= maxlen: break

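        # Assemble the selected blocks into a word list, adding '..' wherever
        # non-selected text has been skipped.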
        words2 = []
        lastblock = -1
        for i, b in enumerate(blocks):
            if b[4]:
                if i != lastblock + 1:
                    words2.append('..')
                words2.extend(words[b[0]:b[1]])
                lastblock = i

        if not blocks[-1][4]:
            words2.append('..')

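        # Truncate to about maxlen characters, replacing anything cut off
        # with '..'.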
        l = 0
        for i in xrange(len(words2)):
            l += len(words2[i])
            if l >= maxlen:
                words2[i:] = ['..']
                break

        if hl is None:
            return ''.join(words2)
        else:
            return self._hl(words2, terms, hl)

    def highlight(self, text, query, hl, strip_tags=False):
        """Add highlights (string prefix/postfix) to a string.

        `text` is the source to highlight.
        `query` is either a Xapian query object or a list of (unstemmed) term strings.
        `hl` is a pair of highlight strings, e.g. ('<i>', '</i>')
        `strip_tags` strips HTML markup if True.

        >>> hl = Highlighter()
        >>> qp = xapian.QueryParser()
        >>> q = qp.parse_query('cat dog')
        >>> tags = ('[[', ']]')
        >>> hl.highlight('The cat went Dogging; but was <i>dog tired</i>.', q, tags)
        'The [[cat]] went [[Dogging]]; but was <i>[[dog]] tired</i>.'

        """
        words = self._split_text(text, strip_tags)
        terms = self._query_to_stemmed_words(query)
        return self._hl(words, terms, hl)

    def _hl(self, words, terms, hl):
        """Add highlights to a list of words.

        `words` is the list of words and non-words to be highlighted.
        `terms` is the list of stemmed words to look for.
        `hl` is a pair of strings to wrap around each highlighted word.

        """
        for i, w in enumerate(words):
            # Highlight a word if either its exact lowercased form or its
            # stem matches one of the query terms.
            wl = w.lower()
            if wl in terms or self.stem(wl) in terms:
                words[i] = ''.join((hl[0], w, hl[1]))

        return ''.join(words)


__test__ = {
    'no_punc': r'''

Test the highlighter's behaviour when there is no punctuation in the sample
text (regression test - used to return no output):
>>> hl = Highlighter("en")
>>> hl.makeSample('Hello world', ['world'])
'Hello world'

''',

    'stem_levels': r'''

Test highlighting of words, and how it works with stemming:
>>> hl = Highlighter("en")

# "word" and "wording" stem to "word", so the following 4 calls all return
# the same thing
>>> hl.makeSample('Hello. word. wording. wordinging.', ['word'], hl='<>')
'Hello. <word>. <wording>. wordinging.'
>>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>')
'Hello. <word>. <wording>. wordinging.'
>>> hl.makeSample('Hello. word. wording. wordinging.', ['wording'], hl='<>')
'Hello. <word>. <wording>. wordinging.'
>>> hl.highlight('Hello. word. wording. wordinging.', ['wording'], '<>')
'Hello. <word>. <wording>. wordinging.'

# "wordinging" stems to "wording", so only the last two words are
# highlighted for this one.
>>> hl.makeSample('Hello. word. wording. wordinging.', ['wordinging'], hl='<>')
'Hello. word. <wording>. <wordinging>.'
>>> hl.highlight('Hello. word. wording. wordinging.', ['wordinging'], '<>')
'Hello. word. <wording>. <wordinging>.'
''',

    'supplied_stemmer': r'''

Test behaviour if we pass in our own stemmer:
>>> stem = xapian.Stem('en')
>>> hl = Highlighter(stemmer=stem)
>>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>')
'Hello. <word>. <wording>. wordinging.'

''',

    'unicode': r'''

Test behaviour if we pass in unicode input:
>>> hl = Highlighter('en')
>>> hl.highlight(u'Hello\xf3. word. wording. wordinging.', ['word'], '<>')
'Hello\xc3\xb3. <word>. <wording>. wordinging.'

''',

    'no_sample': r'''

Test behaviour if we pass in an empty string to summarise:
>>> hl = Highlighter('en')
>>> hl.makeSample(u'', ['word'])
''

''',

    'short_samples': r'''

>>> hl = Highlighter('en')
>>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['hello'], 20, ('<', '>'))
'.. <Hello> world ..'
>>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['hello'], 40, ('<', '>'))
'A boring start. <Hello> world indeed...'
>>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['boring'], 40, ('<', '>'))
'A <boring> start... A <boring> end.'

''',

    'apostrophes': r'''

>>> hl = Highlighter('en')
>>> hl.makeSample("A boring start. Hello world's indeed. A boring end.", ['world'], 40, ('<', '>'))
"A boring start. Hello <world's> indeed..."

''',

}

if __name__ == '__main__':
    import doctest, sys
    doctest.testmod(sys.modules[__name__])