Package xappy :: Module indexerconnection
[frames] | [no frames]

Source Code for Module xappy.indexerconnection

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright (C) 2007 Lemur Consulting Ltd 
  4  # 
  5  # This program is free software; you can redistribute it and/or modify 
  6  # it under the terms of the GNU General Public License as published by 
  7  # the Free Software Foundation; either version 2 of the License, or 
  8  # (at your option) any later version. 
  9  # 
 10  # This program is distributed in the hope that it will be useful, 
 11  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13  # GNU General Public License for more details. 
 14  #  
 15  # You should have received a copy of the GNU General Public License along 
 16  # with this program; if not, write to the Free Software Foundation, Inc., 
 17  # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
 18  r"""indexerconnection.py: A connection to the search engine for indexing. 
 19   
 20  """ 
 21  __docformat__ = "restructuredtext en" 
 22   
 23  import _checkxapian 
 24  import cPickle 
 25  import xapian 
 26   
 27  from datastructures import * 
 28  import errors 
 29  from fieldactions import * 
 30  import fieldmappings 
 31  import memutils 
 32  from replaylog import log 
 33   
class IndexerConnection(object):
    """A connection to the search engine for indexing.

    """
    def __init__(self, indexpath):
        """Create a new connection to the index.

        There may only be one indexer connection for a particular database open
        at a given time. Therefore, if a connection to the database is already
        open, this will raise a xapian.DatabaseLockError.

        If the database doesn't already exist, it will be created.

        - `indexpath`: path of the xapian database to create or open.

        """
        # log() records the call in the replay log before delegating to xapian.
        self._index = log(xapian.WritableDatabase, indexpath, xapian.DB_CREATE_OR_OPEN)
        self._indexpath = indexpath

        # Read existing actions.
        self._field_actions = {}
        self._field_mappings = fieldmappings.FieldMappings()
        self._facet_hierarchy = {}
        self._facet_query_table = {}
        self._next_docid = 0
        self._config_modified = False
        # Overwrites the defaults above with any configuration previously
        # stored in the database.
        self._load_config()

        # Set management of the memory used.
        # This can be removed once Xapian implements this itself.
        self._mem_buffered = 0
        self.set_max_mem_use()
65
    def set_max_mem_use(self, max_mem=None, max_mem_proportion=None):
        """Set the maximum memory to use.

        This call allows the amount of memory to use to buffer changes to be
        set.  This will affect the speed of indexing, but should not result in
        other changes to the indexing.

        Note: this is an approximate measure - the actual amount of memory used
        may exceed the specified amount.  Also, note that future versions of
        xapian are likely to implement this differently, so this setting may be
        entirely ignored.

        The absolute amount of memory to use (in bytes) may be set by setting
        max_mem.  Alternatively, the proportion of the available memory may be
        set by setting max_mem_proportion (this should be a value between 0 and
        1).

        Setting too low a value will result in excessive flushing, and very
        slow indexing.  Setting too high a value will result in excessive
        buffering, leading to swapping, and very slow indexing.

        A reasonable default for max_mem_proportion for a system which is
        dedicated to indexing is probably 0.5: if other tasks are also being
        performed on the system, the value should be lowered.

        Raises errors.IndexerError if the connection has been closed, or if
        both max_mem and max_mem_proportion are supplied.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if max_mem is not None and max_mem_proportion is not None:
            raise errors.IndexerError("Only one of max_mem and "
                                      "max_mem_proportion may be specified")

        if max_mem is None and max_mem_proportion is None:
            # Neither specified: disable the memory-based auto-flush entirely.
            self._max_mem = None

        if max_mem_proportion is not None:
            physmem = memutils.get_physical_memory()
            # If the physical memory size cannot be determined, max_mem stays
            # None, which also disables the auto-flush.
            if physmem is not None:
                max_mem = int(physmem * max_mem_proportion)

        self._max_mem = max_mem
107
    def _store_config(self):
        """Store the configuration for the database.

        The configuration (field actions, field mappings, facet hierarchy,
        facet query table and the next document ID) is pickled and stored in
        the database's metadata under the key '_xappy_config'.  Changes to it
        are therefore only persisted by an explicit flush.

        """
        assert self._index is not None

        # Protocol 2 is the most compact pickle protocol available in
        # Python 2.
        config_str = cPickle.dumps((
            self._field_actions,
            self._field_mappings.serialise(),
            self._facet_hierarchy,
            self._facet_query_table,
            self._next_docid,
        ), 2)
        log(self._index.set_metadata, '_xappy_config', config_str)

        self._config_modified = False
129
    def _load_config(self):
        """Load the configuration for the database.

        Reads the pickled configuration stored by _store_config() from the
        '_xappy_config' metadata key and unpacks it into the corresponding
        attributes.  If no configuration has been stored yet (fresh
        database), the attribute defaults set by __init__ are left in place.

        """
        assert self._index is not None

        config_str = log(self._index.get_metadata, '_xappy_config')
        if len(config_str) == 0:
            # No configuration stored yet.
            return

        try:
            (self._field_actions, mappings, self._facet_hierarchy, self._facet_query_table, self._next_docid) = cPickle.loads(config_str)
        except ValueError:
            # Backwards compatibility - configuration used to lack
            # _facet_hierarchy and _facet_query_table; unpacking the old
            # 3-tuple into 5 names raises ValueError, so fall back here.
            (self._field_actions, mappings, self._next_docid) = cPickle.loads(config_str)
            self._facet_hierarchy = {}
            self._facet_query_table = {}
        self._field_mappings = fieldmappings.FieldMappings(mappings)

        self._config_modified = False
150
151 - def _allocate_id(self):
152 """Allocate a new ID. 153 154 """ 155 while True: 156 idstr = "%x" % self._next_docid 157 self._next_docid += 1 158 if not self._index.term_exists('Q' + idstr): 159 break 160 self._config_modified = True 161 return idstr
162
163 - def add_field_action(self, fieldname, fieldtype, **kwargs):
164 """Add an action to be performed on a field. 165 166 Note that this change to the configuration will not be preserved on 167 disk until the next call to flush(). 168 169 """ 170 if self._index is None: 171 raise errors.IndexerError("IndexerConnection has been closed") 172 if fieldname in self._field_actions: 173 actions = self._field_actions[fieldname] 174 else: 175 actions = FieldActions(fieldname) 176 self._field_actions[fieldname] = actions 177 actions.add(self._field_mappings, fieldtype, **kwargs) 178 self._config_modified = True
179
180 - def clear_field_actions(self, fieldname):
181 """Clear all actions for the specified field. 182 183 This does not report an error if there are already no actions for the 184 specified field. 185 186 Note that this change to the configuration will not be preserved on 187 disk until the next call to flush(). 188 189 """ 190 if self._index is None: 191 raise errors.IndexerError("IndexerConnection has been closed") 192 if fieldname in self._field_actions: 193 del self._field_actions[fieldname] 194 self._config_modified = True
195
196 - def get_fields_with_actions(self):
197 """Get a list of field names which have actions defined. 198 199 """ 200 if self._index is None: 201 raise errors.IndexerError("IndexerConnection has been closed") 202 return self._field_actions.keys()
203
    def process(self, document):
        """Process an UnprocessedDocument with the settings in this database.

        The resulting ProcessedDocument is returned.

        Note that this processing will be automatically performed if an
        UnprocessedDocument is supplied to the add() or replace() methods of
        IndexerConnection.  This method is exposed to allow the processing to
        be performed separately, which may be desirable if you wish to manually
        modify the processed document before adding it to the database, or if
        you want to split processing of documents from adding documents to the
        database for performance reasons.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        result = ProcessedDocument(self._field_mappings)
        result.id = document.id
        # A single context is shared by all the field actions applied to this
        # document.
        context = ActionContext(self._index)

        for field in document.fields:
            try:
                actions = self._field_actions[field.name]
            except KeyError:
                # If no actions are defined, just ignore the field.
                continue
            actions.perform(result, field.value, context)

        return result
233
234 - def _get_bytes_used_by_doc_terms(self, xapdoc):
235 """Get an estimate of the bytes used by the terms in a document. 236 237 (This is a very rough estimate.) 238 239 """ 240 count = 0 241 for item in xapdoc.termlist(): 242 # The term may also be stored in the spelling correction table, so 243 # double the amount used. 244 count += len(item.term) * 2 245 246 # Add a few more bytes for holding the wdf, and other bits and 247 # pieces. 248 count += 8 249 250 # Empirical observations indicate that about 5 times as much memory as 251 # the above calculation predicts is used for buffering in practice. 252 return count * 5
253
    def add(self, document):
        """Add a new document to the search engine index.

        If the document has a id set, and the id already exists in
        the database, an exception will be raised.  Use the replace() method
        instead if you wish to overwrite documents.

        Returns the id of the newly added document (making up a new
        unique ID if no id was set).

        The supplied document may be an instance of UnprocessedDocument, or an
        instance of ProcessedDocument.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if not hasattr(document, '_doc'):
            # It's not a processed document.
            document = self.process(document)

        # Ensure that we have a id
        orig_id = document.id
        if orig_id is None:
            id = self._allocate_id()
            document.id = id
        else:
            id = orig_id
            # Unique IDs are indexed as terms with a 'Q' prefix.
            if self._index.term_exists('Q' + id):
                raise errors.IndexerError("Document ID of document supplied to add() is not unique.")

        # Add the document.
        xapdoc = document.prepare()
        self._index.add_document(xapdoc)

        if self._max_mem is not None:
            # Track the approximate memory buffered by xapian, and flush once
            # the limit configured via set_max_mem_use() is exceeded.
            self._mem_buffered += self._get_bytes_used_by_doc_terms(xapdoc)
            if self._mem_buffered > self._max_mem:
                self.flush()

        # If an ID was freshly allocated above, restore the caller's document
        # to its original (unset) id so the passed-in object is not left
        # modified.  (`is not` is safe: id is the very same object as orig_id
        # unless it came from _allocate_id().)
        if id is not orig_id:
            document.id = orig_id
        return id
296
    def replace(self, document):
        """Replace a document in the search engine index.

        If the document does not have a id set, an exception will be
        raised.

        If the document has a id set, and the id does not already
        exist in the database, this method will have the same effect as add().

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if not hasattr(document, '_doc'):
            # It's not a processed document.
            document = self.process(document)

        # Ensure that we have a id
        id = document.id
        if id is None:
            raise errors.IndexerError("No document ID set for document supplied to replace().")

        xapdoc = document.prepare()
        # 'Q' + id is the unique-ID term; xapian replaces whatever document
        # is currently indexed by that term (or adds if there is none).
        self._index.replace_document('Q' + id, xapdoc)

        if self._max_mem is not None:
            # Track approximate buffered memory; flush when over the limit
            # configured via set_max_mem_use().
            self._mem_buffered += self._get_bytes_used_by_doc_terms(xapdoc)
            if self._mem_buffered > self._max_mem:
                self.flush()
325
326 - def _make_synonym_key(self, original, field):
327 """Make a synonym key (ie, the term or group of terms to store in 328 xapian). 329 330 """ 331 if field is not None: 332 prefix = self._field_mappings.get_prefix(field) 333 else: 334 prefix = '' 335 original = original.lower() 336 # Add the prefix to the start of each word. 337 return ' '.join((prefix + word for word in original.split(' ')))
338
    def add_synonym(self, original, synonym, field=None,
                    original_field=None, synonym_field=None):
        """Add a synonym to the index.

        - `original` is the word or words which will be synonym expanded in
          searches (if multiple words are specified, each word should be
          separated by a single space).
        - `synonym` is a synonym for `original`.
        - `field` is the field which the synonym is specific to.  If no field
          is specified, the synonym will be used for searches which are not
          specific to any particular field.
        - `original_field` and `synonym_field` each default to `field`, and
          allow a different field to be used for the original words and for
          the synonym.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if original_field is None:
            original_field = field
        if synonym_field is None:
            synonym_field = field
        key = self._make_synonym_key(original, original_field)
        # FIXME - this only works for exact fields which have no upper case
        # characters, or single words
        value = self._make_synonym_key(synonym, synonym_field)
        self._index.add_synonym(key, value)
363
    def remove_synonym(self, original, synonym, field=None):
        """Remove a synonym from the index.

        - `original` is the word or words which will be synonym expanded in
          searches (if multiple words are specified, each word should be
          separated by a single space).
        - `synonym` is a synonym for `original`.
        - `field` is the field which this synonym is specific to.  If no field
          is specified, the synonym will be used for searches which are not
          specific to any particular field.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        key = self._make_synonym_key(original, field)
        # The stored synonym value is lowercased (see add_synonym, which
        # lowercases via _make_synonym_key), so lowercase here to match.
        self._index.remove_synonym(key, synonym.lower())
380
    def clear_synonyms(self, original, field=None):
        """Remove all synonyms for a word (or phrase).

        - `original` is the word or words whose synonyms are to be removed.
        - `field` is the field which this synonym is specific to.  If no field
          is specified, the synonym will be used for searches which are not
          specific to any particular field.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        key = self._make_synonym_key(original, field)
        self._index.clear_synonyms(key)
393
394 - def _assert_facet(self, facet):
395 """Raise an error if facet is not a declared facet field. 396 397 """ 398 for action in self._field_actions[facet]._actions: 399 if action == FieldActions.FACET: 400 return 401 raise errors.IndexerError("Field %r is not indexed as a facet" % facet)
402
    def add_subfacet(self, subfacet, facet):
        """Add a subfacet-facet relationship to the facet hierarchy.

        Any existing relationship for that subfacet is replaced.

        Raises a KeyError if either facet or subfacet is not a field,
        and an IndexerError if either facet or subfacet is not a facet field.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        self._assert_facet(facet)
        self._assert_facet(subfacet)
        # Each subfacet maps to at most one parent facet, so a plain dict
        # keyed on subfacet suffices (and replaces any previous parent).
        self._facet_hierarchy[subfacet] = facet
        self._config_modified = True
417
418 - def remove_subfacet(self, subfacet):
419 """Remove any existing facet hierarchy relationship for a subfacet. 420 421 """ 422 if self._index is None: 423 raise errors.IndexerError("IndexerConnection has been closed") 424 if subfacet in self._facet_hierarchy: 425 del self._facet_hierarchy[subfacet] 426 self._config_modified = True
427
428 - def get_subfacets(self, facet):
429 """Get a list of subfacets of a facet. 430 431 """ 432 if self._index is None: 433 raise errors.IndexerError("IndexerConnection has been closed") 434 return [k for k, v in self._facet_hierarchy.iteritems() if v == facet]
435 436 FacetQueryType_Preferred = 1; 437 FacetQueryType_Never = 2;
    def set_facet_for_query_type(self, query_type, facet, association):
        """Set the association between a query type and a facet.

        The value of `association` must be one of
        IndexerConnection.FacetQueryType_Preferred,
        IndexerConnection.FacetQueryType_Never or None.  A value of None removes
        any previously set association.

        Raises errors.IndexerError if the connection has been closed, if
        `query_type` is None, or if `facet` is not indexed as a facet.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if query_type is None:
            raise errors.IndexerError("Cannot set query type information for None")
        self._assert_facet(facet)
        if query_type not in self._facet_query_table:
            self._facet_query_table[query_type] = {}
        if association is None:
            # Remove any existing association for this facet.
            if facet in self._facet_query_table[query_type]:
                del self._facet_query_table[query_type][facet]
        else:
            self._facet_query_table[query_type][facet] = association;
        # Drop empty per-query-type tables so the stored config stays small
        # and get_facets_for_query_type() returns None for unknown types.
        if self._facet_query_table[query_type] == {}:
            del self._facet_query_table[query_type]
        self._config_modified = True
462
463 - def get_facets_for_query_type(self, query_type, association):
464 """Get the set of facets associated with a query type. 465 466 Only those facets associated with the query type in the specified 467 manner are returned; `association` must be one of 468 IndexerConnection.FacetQueryType_Preferred or 469 IndexerConnection.FacetQueryType_Never. 470 471 If the query type has no facets associated with it, None is returned. 472 473 """ 474 if self._index is None: 475 raise errors.IndexerError("IndexerConnection has been closed") 476 if query_type not in self._facet_query_table: 477 return None 478 facet_dict = self._facet_query_table[query_type] 479 return set([facet for facet, assoc in facet_dict.iteritems() if assoc == association])
480
    def set_metadata(self, key, value):
        """Set an item of metadata stored in the connection.

        The value supplied will be returned by subsequent calls to
        get_metadata() which use the same key.

        Keys with a leading underscore are reserved for internal use - you
        should not use such keys unless you really know what you are doing.

        This will store the value supplied in the database.  It will not be
        visible to readers (ie, search connections) until after the next flush.

        The key is limited to about 200 characters (the same length as a term
        is limited to).  The value can be several megabytes in size.

        To remove an item of metadata, simply call this with a `value`
        parameter containing an empty string.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Metadata support was only added to xapian in later releases, so
        # check that the binding actually exposes it.
        if not hasattr(self._index, 'set_metadata'):
            raise errors.IndexerError("Version of xapian in use does not support metadata")
        log(self._index.set_metadata, key, value)
505
    def get_metadata(self, key):
        """Get an item of metadata stored in the connection.

        This returns a value stored by a previous call to set_metadata.

        If the value is not found, this will return the empty string.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Metadata support was only added to xapian in later releases, so
        # check that the binding actually exposes it.
        if not hasattr(self._index, 'get_metadata'):
            raise errors.IndexerError("Version of xapian in use does not support metadata")
        return log(self._index.get_metadata, key)
519
    def delete(self, id):
        """Delete a document from the search engine index.

        If the id does not already exist in the database, this method
        will have no effect (and will not report an error).

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Unique IDs are indexed as terms with a 'Q' prefix; deleting by term
        # removes whatever document is indexed by that term.
        self._index.delete_document('Q' + id)
530
    def flush(self):
        """Apply recent changes to the database.

        If an exception occurs, any changes since the last call to flush() may
        be lost.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if self._config_modified:
            # Persist field actions, mappings, facet tables and the next
            # docid alongside the document changes.
            self._store_config()
        self._index.flush()
        # Everything buffered has now been written out.
        self._mem_buffered = 0
544
    def close(self):
        """Close the connection to the database.

        It is important to call this method before allowing the class to be
        garbage collected, because it will ensure that any un-flushed changes
        will be flushed.  It also ensures that the connection is cleaned up
        promptly.

        No other methods may be called on the connection after this has been
        called.  (It is permissible to call close() multiple times, but
        only the first call will have any effect.)

        If an exception occurs, the database will be closed, but changes since
        the last call to flush may be lost.

        """
        if self._index is None:
            # Already closed; calling close() again is a no-op.
            return
        try:
            self.flush()
        finally:
            # There is currently no "close()" method for xapian databases, so
            # we have to rely on the garbage collector.  Since we never copy
            # the _index property out of this class, there should be no cycles,
            # so the standard python implementation should garbage collect
            # _index straight away.  A close() method is planned to be added to
            # xapian at some point - when it is, we should call it here to make
            # the code more robust.
            self._index = None
            self._indexpath = None
            self._field_actions = None
            self._config_modified = False
577
    def get_doccount(self):
        """Count the number of documents in the database.

        This count will include documents which have been added or removed but
        not yet flushed().

        Raises errors.IndexerError if the connection has been closed.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        return self._index.get_doccount()
588
    def iterids(self):
        """Get an iterator which returns all the ids in the database.

        The unique IDs are currently returned in binary lexicographical sort
        order, but this should not be relied on.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Unique IDs are stored as terms with a 'Q' prefix; the iterator
        # strips the prefix before yielding each ID.
        return PrefixedTermIter('Q', self._index.allterms())
599
    def get_document(self, id):
        """Get the document with the specified unique ID.

        Raises a KeyError if there is no such document.  Otherwise, it returns
        a ProcessedDocument.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Look up the unique-ID term ('Q' prefix); its postlist holds every
        # document indexed by that ID.
        postlist = self._index.postlist('Q' + id)
        try:
            plitem = postlist.next()
        except StopIteration:
            # Unique ID not found
            raise KeyError('Unique ID %r not found' % id)
        try:
            # A second entry in the postlist would mean the uniqueness
            # invariant has been violated.
            postlist.next()
            raise errors.IndexerError("Multiple documents " #pragma: no cover
                                      "found with same unique ID")
        except StopIteration:
            # Only one instance of the unique ID found, as it should be.
            pass

        result = ProcessedDocument(self._field_mappings)
        result.id = id
        result._doc = self._index.get_document(plitem.docid)
        return result
627
    def iter_synonyms(self, prefix=""):
        """Get an iterator over the synonyms.

        - `prefix`: if specified, only synonym keys with this prefix will be
          returned.

        The iterator returns 2-tuples, in which the first item is the key (ie,
        a 2-tuple holding the term or terms which will be synonym expanded,
        followed by the fieldname specified (or None if no fieldname)), and the
        second item is a tuple of strings holding the synonyms for the first
        item.

        These return values are suitable for the dict() builtin, so you can
        write things like:

         >>> conn = IndexerConnection('foo')
         >>> conn.add_synonym('foo', 'bar')
         >>> conn.add_synonym('foo bar', 'baz')
         >>> conn.add_synonym('foo bar', 'foo baz')
         >>> dict(conn.iter_synonyms())
         {('foo', None): ('bar',), ('foo bar', None): ('baz', 'foo baz')}

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        return SynonymIter(self._index, self._field_mappings, prefix)
654
    def iter_subfacets(self):
        """Get an iterator over the facet hierarchy.

        The iterator returns 2-tuples, in which the first item is the
        subfacet and the second item is its parent facet.

        The return values are suitable for the dict() builtin, for example:

         >>> conn = IndexerConnection('db')
         >>> conn.add_field_action('foo', FieldActions.FACET)
         >>> conn.add_field_action('bar', FieldActions.FACET)
         >>> conn.add_field_action('baz', FieldActions.FACET)
         >>> conn.add_subfacet('foo', 'bar')
         >>> conn.add_subfacet('baz', 'bar')
         >>> dict(conn.iter_subfacets())
         {'foo': 'bar', 'baz': 'bar'}

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Facet support depends on the installed xapian release.
        if 'facets' in _checkxapian.missing_features:
            raise errors.IndexerError("Facets unsupported with this release of xapian")
        return self._facet_hierarchy.iteritems()
678
    def iter_facet_query_types(self, association):
        """Get an iterator over query types and their associated facets.

        Only facets associated with the query types in the specified manner
        are returned; `association` must be one of IndexerConnection.FacetQueryType_Preferred
        or IndexerConnection.FacetQueryType_Never.

        The iterator returns 2-tuples, in which the first item is the query
        type and the second item is the associated set of facets.

        The return values are suitable for the dict() builtin, for example:

         >>> conn = IndexerConnection('db')
         >>> conn.add_field_action('foo', FieldActions.FACET)
         >>> conn.add_field_action('bar', FieldActions.FACET)
         >>> conn.add_field_action('baz', FieldActions.FACET)
         >>> conn.set_facet_for_query_type('type1', 'foo', conn.FacetQueryType_Preferred)
         >>> conn.set_facet_for_query_type('type1', 'bar', conn.FacetQueryType_Never)
         >>> conn.set_facet_for_query_type('type1', 'baz', conn.FacetQueryType_Never)
         >>> conn.set_facet_for_query_type('type2', 'bar', conn.FacetQueryType_Preferred)
         >>> dict(conn.iter_facet_query_types(conn.FacetQueryType_Preferred))
         {'type1': set(['foo']), 'type2': set(['bar'])}
         >>> dict(conn.iter_facet_query_types(conn.FacetQueryType_Never))
         {'type1': set(['bar', 'baz'])}

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Facet support depends on the installed xapian release.
        if 'facets' in _checkxapian.missing_features:
            raise errors.IndexerError("Facets unsupported with this release of xapian")
        return FacetQueryTypeIter(self._facet_query_table, association)
710
class PrefixedTermIter(object):
    """Iterate through all the terms with a given prefix.

    """
    def __init__(self, prefix, termiter):
        """Initialise the prefixed term iterator.

        - `prefix` is the prefix to return terms for.
        - `termiter` is a xapian TermIterator, which should be at its start.

        """

        # The algorithm used in next() currently only works for single
        # character prefixes, so assert that the prefix is single character.
        # To deal with multicharacter prefixes, we need to check for terms
        # which have a starting prefix equal to that given, but then have a
        # following uppercase alphabetic character, indicating that the actual
        # prefix is longer than the target prefix.  We then need to skip over
        # these.  Not too hard to implement, but we don't need it yet.
        assert(len(prefix) == 1)

        # _started becomes True after the first call to next(), which seeks
        # to the prefix instead of advancing.
        self._started = False
        self._prefix = prefix
        self._prefixlen = len(prefix)
        self._termiter = termiter

    def __iter__(self):
        return self

    def next(self):
        """Get the next term with the specified prefix.

        Raises StopIteration as soon as a term without the prefix is seen:
        terms sharing the prefix are contiguous in the underlying (sorted)
        term iterator.

        """
        if not self._started:
            # Seek directly to the first term >= the prefix.
            term = self._termiter.skip_to(self._prefix).term
            self._started = True
        else:
            term = self._termiter.next().term
        if len(term) < self._prefixlen or term[:self._prefixlen] != self._prefix:
            raise StopIteration
        # Yield the term with the prefix stripped off.
        return term[self._prefixlen:]
752 753
class SynonymIter(object):
    """Iterate through a list of synonyms.

    """
    def __init__(self, index, field_mappings, prefix):
        """Initialise the synonym iterator.

        - `index` is the index to get the synonyms from.
        - `field_mappings` is the FieldMappings object for the iterator.
        - `prefix` is the prefix to restrict the returned synonyms to.

        """
        self._index = index
        self._field_mappings = field_mappings
        self._syniter = self._index.synonym_keys(prefix)

    def __iter__(self):
        return self

    def next(self):
        """Get the next synonym.

        Returns ((terms, fieldname), synonyms): `terms` is the synonym key
        with any field prefix stripped from each word, `fieldname` is the
        field corresponding to that prefix (or None if there was none), and
        `synonyms` is a tuple of the stored synonym strings.

        """
        synkey = self._syniter.next()
        # Count the leading uppercase characters: these make up the field
        # prefix (term prefixes are uppercase, while the stored words are
        # lowercased - see IndexerConnection._make_synonym_key).
        pos = 0
        for char in synkey:
            if char.isupper(): pos += 1
            else: break
        if pos == 0:
            # No prefix - the synonym is not field-specific.
            fieldname = None
            terms = synkey
        else:
            prefix = synkey[:pos]
            fieldname = self._field_mappings.get_fieldname_from_prefix(prefix)
            # Strip the prefix from each word in the key.
            terms = ' '.join((term[pos:] for term in synkey.split(' ')))
        synval = tuple(self._index.synonyms(synkey))
        return ((terms, fieldname), synval)
791
792 -class FacetQueryTypeIter(object):
793 """Iterate through all the query types and their associated facets. 794 795 """
796 - def __init__(self, facet_query_table, association):
797 """Initialise the query type facet iterator. 798 799 Only facets associated with each query type in the specified 800 manner are returned (`association` must be one of 801 IndexerConnection.FacetQueryType_Preferred or 802 IndexerConnection.FacetQueryType_Never). 803 804 """ 805 self._table_iter = facet_query_table.iteritems() 806 self._association = association
807
808 - def __iter__(self):
809 return self
810
811 - def next(self):
812 """Get the next (query type, facet set) 2-tuple. 813 814 """ 815 query_type, facet_dict = self._table_iter.next() 816 facet_list = [facet for facet, association in facet_dict.iteritems() if association == self._association] 817 if len(facet_list) == 0: 818 return self.next() 819 return (query_type, set(facet_list))
if __name__ == '__main__':
    # Run the doctests embedded in this module's docstrings when the module
    # is executed directly.
    import doctest, sys
    doctest.testmod(sys.modules[__name__])