Package xappy :: Module datastructures
[frames | no frames]

Source Code for Module xappy.datastructures

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright (C) 2007 Lemur Consulting Ltd 
  4  # 
  5  # This program is free software; you can redistribute it and/or modify 
  6  # it under the terms of the GNU General Public License as published by 
  7  # the Free Software Foundation; either version 2 of the License, or 
  8  # (at your option) any later version. 
  9  # 
 10  # This program is distributed in the hope that it will be useful, 
 11  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13  # GNU General Public License for more details. 
 14  #  
 15  # You should have received a copy of the GNU General Public License along 
 16  # with this program; if not, write to the Free Software Foundation, Inc., 
 17  # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
 18  r"""datastructures.py: Datastructures for search engine core. 
 19   
 20  """ 
 21  __docformat__ = "restructuredtext en" 
 22   
 23  import errors 
 24  from replaylog import log 
 25  import xapian 
 26  import cPickle 
 27   
class Field(object):
    """A single named value: a ``name`` paired with a ``value``."""

    # Very many Field objects are expected in typical usage, so __slots__
    # is used to avoid a per-instance __dict__.
    __slots__ = ('name', 'value')

    def __init__(self, name, value):
        """Store `name` and `value` on the new field."""
        self.value = value
        self.name = name

    def __repr__(self):
        """Return a ``Field(name, value)`` style representation."""
        parts = (self.name, self.value)
        return 'Field(%r, %r)' % parts
39
class UnprocessedDocument(object):
    """An unprocessed document to be passed to the indexer.

    This represents an item to be processed and stored in the search engine.
    The indexer processes each such document to generate a ProcessedDocument,
    which can then be stored in the search engine index.

    Some information in an UnprocessedDocument is not represented in the
    ProcessedDocument, so an UnprocessedDocument cannot be retrieved back
    from the search engine index.

    An unprocessed document is a simple container with two attributes:

     - `fields` is a list of Field objects, or an iterator returning Field
       objects.
     - `id` is a string holding a unique identifier for the document (or
       None to get the database to allocate a unique identifier
       automatically when the document is added).

    """

    __slots__ = ('id', 'fields')

    def __init__(self, id=None, fields=None):
        """Create the document; a missing `fields` becomes a new empty list."""
        if fields is None:
            # Build a fresh list per instance rather than sharing one.
            fields = []
        self.id = id
        self.fields = fields

    def __repr__(self):
        """Return an ``UnprocessedDocument(id, fields)`` style representation."""
        parts = (self.id, self.fields)
        return 'UnprocessedDocument(%r, %r)' % parts
71
class ProcessedDocument(object):
    """A processed document, as stored in the index.

    This represents an item which is ready to be stored in the search engine,
    or which has been returned by the search engine.

    """

    __slots__ = ('_doc', '_fieldmappings', '_data')

    def __init__(self, fieldmappings, xapdoc=None):
        """Create a ProcessedDocument.

        `fieldmappings` is the configuration from a database connection, used
        to look up the configuration to use to store each field.

        If supplied, `xapdoc` is a Xapian document to store in the processed
        document.  Otherwise, a new Xapian document is created.

        """
        if xapdoc is not None:
            self._doc = xapdoc
        else:
            self._doc = log(xapian.Document)
        self._fieldmappings = fieldmappings
        # Unpickled document data; None until it is first read or assigned.
        self._data = None

    def add_term(self, field, term, wdfinc=1, positions=None):
        """Add a term to the document.

        Terms are the main unit of information used for performing searches.

         - `field` is the field to add the term to.
         - `term` is the term to add.
         - `wdfinc` is the value to increase the within-document-frequency
           measure for the term by.
         - `positions` is the positional information to add for the term.
           This may be None to indicate that there is no positional
           information, or may be an integer to specify one position, or may
           be a sequence of integers to specify several positions.  (Note
           that the wdf is not increased automatically for each position: if
           you add a term at 7 positions, and the wdfinc value is 2, the
           total wdf for the term will only be increased by 2, not by 14.)

        Raises `errors.IndexerError` if the prefixed term is longer than 220
        characters.

        """
        prefix = self._fieldmappings.get_prefix(field)
        # Compare ordinals rather than calling "isupper()", so that this
        # matches the check performed by the queryparser regardless of
        # locale.
        if term and ord('A') <= ord(term[0]) <= ord('Z'):
            prefix = prefix + ':'
        full_term = prefix + term

        # Note - xapian currently restricts term lengths to about 248
        # characters - except that zero bytes are encoded in two bytes, so
        # in practice a term of length 125 characters could be too long.
        # Xapian will give an error when commit() is called after such
        # documents have been added to the database.
        # As a simple workaround, we give an error here for terms over 220
        # characters, which will catch most occurrences of the error early.
        #
        # In future, it might be good to change to a hashing scheme in this
        # situation (or for terms over, say, 64 characters), where the
        # characters after position 64 are hashed (we obviously need to do
        # this hashing at search time, too).
        full_length = len(full_term)
        if full_length > 220:
            raise errors.IndexerError("Field %r is too long: maximum length "
                                      "220 - was %d (%r)" %
                                      (field, full_length, full_term))

        if isinstance(positions, int):
            # A single position: the posting itself carries the wdf increase.
            self._doc.add_posting(full_term, positions, wdfinc)
            return

        # No position, or several: apply the wdf increase exactly once, then
        # record each position with a zero increment.
        self._doc.add_term(full_term, wdfinc)
        if positions is not None:
            for position in positions:
                self._doc.add_posting(full_term, position, 0)

    def add_value(self, field, value, purpose=''):
        """Add a value to the document.

        Values are additional units of information used when performing
        searches.  Note that values are _not_ intended to be used to store
        information for display in the search results - use the document data
        for that.  The intention is that as little information as possible is
        stored in values, so that they can be accessed as quickly as possible
        during the search operation.

        Unlike terms, each document may have at most one value in each field
        (whereas there may be an arbitrary number of terms in a given field).
        If an attempt to add multiple values to a single field is made, only
        the last value added will be stored.

        """
        self._doc.add_value(self._fieldmappings.get_slot(field, purpose),
                            value)

    def get_value(self, field, purpose=''):
        """Get a value from the document.

        The slot to read is looked up from `field` and `purpose` through the
        field mappings.

        """
        return self._doc.get_value(
            self._fieldmappings.get_slot(field, purpose))

    def prepare(self):
        """Prepare the document for adding to a xapian database.

        This updates the internal xapian document with any changes which have
        been made, and then returns it.

        """
        pending = self._data
        if pending is not None:
            # Serialise with pickle protocol 2, then drop the cached copy so
            # that a later read reloads it from the xapian document.
            self._doc.set_data(cPickle.dumps(pending, 2))
            self._data = None
        return self._doc

    def _get_data(self):
        """Return the data dictionary, loading it on first access."""
        if self._data is not None:
            return self._data
        stored = self._doc.get_data()
        if stored == '':
            self._data = {}
        else:
            # NOTE(review): this unpickles whatever the xapian document
            # holds - presumably always written by prepare(); confirm the
            # index contents are trusted.
            self._data = cPickle.loads(stored)
        return self._data

    def _set_data(self, data):
        """Replace the data dictionary; only a dict is accepted."""
        if isinstance(data, dict):
            self._data = data
            return
        raise TypeError("Cannot set data to any type other than a dict")

    data = property(_get_data, _set_data, doc=
    """The data stored in this processed document.

    This data is a dictionary of entries, where the key is a fieldname, and the
    value is a list of strings.

    """)

    def _get_id(self):
        """Return the document ID, or None if no 'Q' term is present."""
        terms = self._doc.termlist()
        try:
            found = terms.skip_to('Q').term
        except StopIteration:
            return None
        if found[:1] != 'Q':
            return None
        # The ID is the term with its leading 'Q' stripped.
        return found[1:]

    def _set_id(self, id):
        """Remove any existing 'Q' term, then add one for `id` unless None."""
        terms = self._doc.termlist()
        try:
            current = terms.skip_to('Q').term
        except StopIteration:
            current = ''
        if current[:1] == 'Q':
            self._doc.remove_term(current)
        if id is not None:
            self._doc.add_term('Q' + id, 0)

    id = property(_get_id, _set_id, doc=
    """The unique ID for this document.

    """)

    def __repr__(self):
        """Return a ``<ProcessedDocument(id)>`` style representation."""
        doc_id = self.id
        return '<ProcessedDocument(%r)>' % (doc_id,)
# When executed as a script, run the doctests found in this module.
if __name__ == '__main__':
    import doctest
    import sys
    doctest.testmod(sys.modules[__name__])