Package xappy :: Module highlight
[frames] | [no frames]

Source Code for Module xappy.highlight

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright (C) 2007 Lemur Consulting Ltd 
  4  # 
  5  # This program is free software; you can redistribute it and/or modify 
  6  # it under the terms of the GNU General Public License as published by 
  7  # the Free Software Foundation; either version 2 of the License, or 
  8  # (at your option) any later version. 
  9  # 
 10  # This program is distributed in the hope that it will be useful, 
 11  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13  # GNU General Public License for more details. 
 14  # 
 15  # You should have received a copy of the GNU General Public License along 
 16  # with this program; if not, write to the Free Software Foundation, Inc., 
 17  # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
 18  r"""highlight.py: Highlight and summarise text. 
 19   
 20  """ 
 21  __docformat__ = "restructuredtext en" 
 22   
 23  import re 
 24  import xapian 
 25   
class Highlighter(object):
    """Class for highlighting text and creating contextual summaries.

    >>> hl = Highlighter("en")
    >>> hl.makeSample('Hello world.', ['world'])
    'Hello world.'
    >>> hl.highlight('Hello world', ['world'], ('<', '>'))
    'Hello <world>'

    """

    # Split a string into words, spaces, punctuation and markup tags.
    # Alternatives are tried in order: opening tag, closing tag, word
    # (apostrophes are kept inside words), whitespace run, punctuation run.
    _split_re = re.compile(r'<\w+[^>]*>|</\w+>|[\w\']+|\s+|[^\w\'\s<>/]+')
40 - def __init__(self, language_code='en', stemmer=None):
41 """Create a new highlighter for the specified language. 42 43 """ 44 if stemmer is not None: 45 self.stem = stemmer 46 else: 47 self.stem = xapian.Stem(language_code)
48
49 - def _split_text(self, text, strip_tags=False):
50 """Split some text into words and non-words. 51 52 - `text` is the text to process. It may be a unicode object or a utf-8 53 encoded simple string. 54 - `strip_tags` is a flag - False to keep tags, True to strip all tags 55 from the output. 56 57 Returns a list of utf-8 encoded simple strings. 58 59 """ 60 if isinstance(text, unicode): 61 text = text.encode('utf-8') 62 63 words = self._split_re.findall(text) 64 if strip_tags: 65 return [w for w in words if w[0] != '<'] 66 else: 67 return words
68
69 - def _strip_prefix(self, term):
70 """Strip the prefix off a term. 71 72 Prefixes are any initial capital letters, with the exception that R always 73 ends a prefix, even if followed by capital letters. 74 75 >>> hl = Highlighter("en") 76 >>> print hl._strip_prefix('hello') 77 hello 78 >>> print hl._strip_prefix('Rhello') 79 hello 80 >>> print hl._strip_prefix('XARHello') 81 Hello 82 >>> print hl._strip_prefix('XAhello') 83 hello 84 >>> print hl._strip_prefix('XAh') 85 h 86 >>> print hl._strip_prefix('XA') 87 <BLANKLINE> 88 89 """ 90 for p in xrange(len(term)): 91 if term[p].islower(): 92 return term[p:] 93 elif term[p] == 'R': 94 return term[p+1:] 95 return ''
96
97 - def _query_to_stemmed_words(self, query):
98 """Convert a query to a list of stemmed words. 99 100 - `query` is the query to parse: it may be xapian.Query object, or a 101 sequence of terms. 102 103 """ 104 if isinstance(query, xapian.Query): 105 return [self._strip_prefix(t) for t in query] 106 else: 107 return [self.stem(q.lower()) for q in query]
108 109
110 - def makeSample(self, text, query, maxlen=600, hl=None):
111 """Make a contextual summary from the supplied text. 112 113 This basically works by splitting the text into phrases, counting the query 114 terms in each, and keeping those with the most. 115 116 Any markup tags in the text will be stripped. 117 118 `text` is the source text to summarise. 119 `query` is either a Xapian query object or a list of (unstemmed) term strings. 120 `maxlen` is the maximum length of the generated summary. 121 `hl` is a pair of strings to insert around highlighted terms, e.g. ('<b>', '</b>') 122 123 """ 124 125 # coerce maxlen into an int, otherwise truncation doesn't happen 126 maxlen = int(maxlen) 127 128 words = self._split_text(text, True) 129 terms = self._query_to_stemmed_words(query) 130 131 # build blocks delimited by puncuation, and count matching words in each block 132 # blocks[n] is a block [firstword, endword, charcount, termcount, selected] 133 blocks = [] 134 start = end = count = blockchars = 0 135 136 while end < len(words): 137 blockchars += len(words[end]) 138 if words[end].isalnum(): 139 if self.stem(words[end].lower()) in terms: 140 count += 1 141 end += 1 142 elif words[end] in ',.;:?!\n': 143 end += 1 144 blocks.append([start, end, blockchars, count, False]) 145 start = end 146 blockchars = 0 147 count = 0 148 else: 149 end += 1 150 if start != end: 151 blocks.append([start, end, blockchars, count, False]) 152 if len(blocks) == 0: 153 return '' 154 155 # select high-scoring blocks first, down to zero-scoring 156 chars = 0 157 for count in xrange(3, -1, -1): 158 for b in blocks: 159 if b[3] >= count: 160 b[4] = True 161 chars += b[2] 162 if chars >= maxlen: break 163 if chars >= maxlen: break 164 165 # assemble summary 166 words2 = [] 167 lastblock = -1 168 for i, b in enumerate(blocks): 169 if b[4]: 170 if i != lastblock + 1: 171 words2.append('..') 172 words2.extend(words[b[0]:b[1]]) 173 lastblock = i 174 175 if not blocks[-1][4]: 176 words2.append('..') 177 178 # trim down to maxlen 179 l = 0 180 for i in xrange 
(len (words2)): 181 l += len (words2[i]) 182 if l >= maxlen: 183 words2[i:] = ['..'] 184 break 185 186 if hl is None: 187 return ''.join(words2) 188 else: 189 return self._hl(words2, terms, hl)
190
191 - def highlight(self, text, query, hl, strip_tags=False):
192 """Add highlights (string prefix/postfix) to a string. 193 194 `text` is the source to highlight. 195 `query` is either a Xapian query object or a list of (unstemmed) term strings. 196 `hl` is a pair of highlight strings, e.g. ('<i>', '</i>') 197 `strip_tags` strips HTML markout iff True 198 199 >>> hl = Highlighter() 200 >>> qp = xapian.QueryParser() 201 >>> q = qp.parse_query('cat dog') 202 >>> tags = ('[[', ']]') 203 >>> hl.highlight('The cat went Dogging; but was <i>dog tired</i>.', q, tags) 204 'The [[cat]] went [[Dogging]]; but was <i>[[dog]] tired</i>.' 205 206 """ 207 words = self._split_text(text, strip_tags) 208 terms = self._query_to_stemmed_words(query) 209 return self._hl(words, terms, hl)
210
211 - def _hl(self, words, terms, hl):
212 """Add highlights to a list of words. 213 214 `words` is the list of words and non-words to be highlighted.. 215 `terms` is the list of stemmed words to look for. 216 217 """ 218 for i, w in enumerate(words): 219 # HACK - more forgiving about stemmed terms 220 wl = w.lower() 221 if wl in terms or self.stem (wl) in terms: 222 words[i] = ''.join((hl[0], w, hl[1])) 223 224 return ''.join(words)
# Extra doctests, collected by doctest.testmod() below.  The keys are just
# descriptive names; each value is a doctest string that is executed.
# NOTE(review): the 'no_sample' entry's description says "unicode input" but
# the test actually exercises empty input; the text lives inside the doctest
# string, so it is left untouched here.
__test__ = {
    'no_punc': r'''

Test the highlighter's behaviour when there is no punctuation in the sample
text (regression test - used to return no output):
>>> hl = Highlighter("en")
>>> hl.makeSample('Hello world', ['world'])
'Hello world'

''',

    'stem_levels': r'''

Test highlighting of words, and how it works with stemming:
>>> hl = Highlighter("en")

# "word" and "wording" stem to "word", so the following 4 calls all return
# the same thing
>>> hl.makeSample('Hello. word. wording. wordinging.', ['word'], hl='<>')
'Hello. <word>. <wording>. wordinging.'
>>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>')
'Hello. <word>. <wording>. wordinging.'
>>> hl.makeSample('Hello. word. wording. wordinging.', ['wording'], hl='<>')
'Hello. <word>. <wording>. wordinging.'
>>> hl.highlight('Hello. word. wording. wordinging.', ['wording'], '<>')
'Hello. <word>. <wording>. wordinging.'

# "wordinging" stems to "wording", so only the last two words are
# highlighted for this one.
>>> hl.makeSample('Hello. word. wording. wordinging.', ['wordinging'], hl='<>')
'Hello. word. <wording>. <wordinging>.'
>>> hl.highlight('Hello. word. wording. wordinging.', ['wordinging'], '<>')
'Hello. word. <wording>. <wordinging>.'
''',

    'supplied_stemmer': r'''

Test behaviour if we pass in our own stemmer:
>>> stem = xapian.Stem('en')
>>> hl = Highlighter(stemmer=stem)
>>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>')
'Hello. <word>. <wording>. wordinging.'

''',

    'unicode': r'''

Test behaviour if we pass in unicode input:
>>> hl = Highlighter('en')
>>> hl.highlight(u'Hello\xf3. word. wording. wordinging.', ['word'], '<>')
'Hello\xc3\xb3. <word>. <wording>. wordinging.'

''',

    'no_sample': r'''

Test behaviour if we pass in unicode input:
>>> hl = Highlighter('en')
>>> hl.makeSample(u'', ['word'])
''

''',

    'short_samples': r'''

>>> hl = Highlighter('en')
>>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['hello'], 20, ('<', '>'))
'.. <Hello> world ..'
>>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['hello'], 40, ('<', '>'))
'A boring start. <Hello> world indeed...'
>>> hl.makeSample("A boring start. Hello world indeed. A boring end.", ['boring'], 40, ('<', '>'))
'A <boring> start... A <boring> end.'

''',

    'apostrophes': r'''

>>> hl = Highlighter('en')
>>> hl.makeSample("A boring start. Hello world's indeed. A boring end.", ['world'], 40, ('<', '>'))
"A boring start. Hello <world's> indeed..."

''',

}

# Run the module's doctests (docstrings plus the __test__ dict above).
if __name__ == '__main__':
    import doctest, sys
    doctest.testmod (sys.modules[__name__])