Object
IndexReader is used for reading data from the index. This class is usually used directly for more advanced tasks like iterating through terms in an index, accessing term-vectors or deleting documents by document id. It is also used internally by IndexSearcher.
Create a new IndexReader. You can either pass a string path to a file-system directory or an actual Ferret::Store::Directory object. For example;
dir = RAMDirectory.new() iw = IndexReader.new(dir) dir = FSDirectory.new("/path/to/index") iw = IndexReader.new(dir) iw = IndexReader.new("/path/to/index")
You can also create what used to be known as a MultiReader by passing an array of IndexReader objects, Ferret::Store::Directory objects or file-system paths;
iw = IndexReader.new([dir, dir2, dir3]) iw = IndexReader.new([reader1, reader2, reader3]) iw = IndexReader.new(["/path/to/index1", "/path/to/index2"])
/*
 * Initialise the IndexReader wrapped by +self+ from +rdir+.
 *
 * +rdir+ is either
 *   - an Array whose elements are IndexReader objects, Directory objects or
 *     String paths; the resulting sub-readers are combined with mr_open, or
 *   - a single T_DATA object (its data pointer is used as the Store) or a
 *     String path that is opened with open_fs_store.
 *
 * Any other argument raises ArgumentError. Once the reader is open, a Hash
 * mapping every field name to its field number is stored in the
 * id_fld_num_map instance variable of +self+. Returns +self+.
 */
static VALUE frb_ir_init(VALUE self, VALUE rdir)
{
    Store *store = NULL;
    IndexReader *ir;
    int i;
    FieldInfos *fis;
    VALUE rfield_num_map = rb_hash_new();

    if (TYPE(rdir) == T_ARRAY) {
        /* Multi-reader case: build one sub-reader per array element. */
        VALUE rdirs = rdir;
        const int reader_cnt = RARRAY_LEN(rdir);
        /* NOTE(review): sub_readers does not appear to be released if one of
         * the rb_raise calls below fires - confirm whether this leaks. */
        IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
        int i; /* NOTE(review): shadows the function-level i declared above */

        for (i = 0; i < reader_cnt; i++) {
            /* rdir is reused to hold the current array element */
            rdir = RARRAY_PTR(rdirs)[i];
            switch (TYPE(rdir)) {
                case T_DATA:
                    if (CLASS_OF(rdir) == cIndexReader) {
                        /* Reuse the existing reader as-is; REF presumably
                         * bumps its reference count (TODO confirm). The
                         * continue skips the ir_open call below. */
                        Data_Get_Struct(rdir, IndexReader, sub_readers[i]);
                        REF(sub_readers[i]);
                        continue;
                    } else if (RTEST(rb_obj_is_kind_of(rdir, cDirectory))) {
                        store = DATA_PTR(rdir);
                    } else {
                        rb_raise(rb_eArgError, "A Multi-IndexReader can only "
                                 "be created from other IndexReaders, "
                                 "Directory objects or file-system paths. "
                                 "Not %s",
                                 rs2s(rb_obj_as_string(rdir)));
                    }
                    break;
                case T_STRING:
                    frb_create_dir(rdir);
                    store = open_fs_store(rs2s(rdir));
                    /* NOTE(review): presumably drops the reference taken by
                     * open_fs_store so that the reader opened below ends up
                     * as the only holder - confirm against ir_open. */
                    DEREF(store);
                    break;
                default:
                    rb_raise(rb_eArgError, "%s isn't a valid directory "
                             "argument. You should use either a String or "
                             "a Directory",
                             rs2s(rb_obj_as_string(rdir)));
                    break;
            }
            sub_readers[i] = ir_open(store);
        }
        ir = mr_open(sub_readers, reader_cnt);
        /* the multi-reader is wrapped with its own mark function */
        Frt_Wrap_Struct(self, &frb_mr_mark, &frb_ir_free, ir);
    } else {
        /* Single directory case. */
        switch (TYPE(rdir)) {
            case T_DATA:
                /* NOTE(review): unlike the array branch, the class of the
                 * object is not checked before its data pointer is used. */
                store = DATA_PTR(rdir);
                break;
            case T_STRING:
                frb_create_dir(rdir);
                store = open_fs_store(rs2s(rdir));
                DEREF(store);
                break;
            default:
                rb_raise(rb_eArgError, "%s isn't a valid directory argument. "
                         "You should use either a String or a Directory",
                         rs2s(rb_obj_as_string(rdir)));
                break;
        }
        ir = ir_open(store);
        Frt_Wrap_Struct(self, &frb_ir_mark, &frb_ir_free, ir);
    }
    object_add(ir, self);

    /* Record field name => field number for every field the reader knows. */
    fis = ir->fis;
    for (i = 0; i < fis->size; i++) {
        FieldInfo *fi = fis->fields[i];
        rb_hash_aset(rfield_num_map,
                     FSYM2SYM(fi->name),
                     INT2FIX(fi->number));
    }
    rb_ivar_set(self, id_fld_num_map, rfield_num_map);

    return self;
}
Retrieve a document from the index. See LazyDoc for more details on the document returned. Documents are referenced internally by document ids which are returned by the Searcher's search methods.
/*
 * Retrieve one document, or a range of documents, from the index.
 *
 * With a single argument the argument is either
 *   - a Fixnum document id; negative ids count back from max_doc, or
 *   - a Range, which is resolved against max_doc with rb_range_beg_len.
 * With two arguments they are taken as a start position and a length.
 *
 * Returns the lazy document for a single id, the result of
 * frb_get_doc_range for a range / (start, length) pair, or nil when
 * rb_range_beg_len returns nil for the range.
 *
 * Raises ArgumentError when a Fixnum id lies outside 0...max_doc or when the
 * single argument is neither a Fixnum nor a Range. Raises TypeError when the
 * two-argument form is given values that cannot be converted to integers.
 */
static VALUE
frb_ir_get_doc(int argc, VALUE *argv, VALUE self)
{
    IndexReader *ir = (IndexReader *)DATA_PTR(self);
    VALUE arg1, arg2;
    long pos, len;
    long max = ir->max_doc(ir);

    rb_scan_args(argc, argv, "11", &arg1, &arg2);

    if (argc == 1) {
        if (FIXNUM_P(arg1)) {
            pos = FIX2INT(arg1);
            pos = (pos < 0) ? (max + pos) : pos;
            if (pos < 0 || pos >= max) {
                /* pos and max are longs, so they need %ld; the last valid
                 * index is max - 1 */
                rb_raise(rb_eArgError, "index %ld is out of range [%d..%ld] "
                         "for IndexReader#[]", pos, 0, max - 1);
            }
            return frb_get_lazy_doc(ir->get_lazy_doc(ir, pos));
        }

        /* check if idx is Range */
        switch (rb_range_beg_len(arg1, &pos, &len, max, 0)) {
            case Qfalse: {
                /* arg1 is not a Range and may be any kind of object, so
                 * describe it with inspect rather than assuming a Symbol
                 * (a Symbol still renders as ":name") */
                VALUE rdesc = rb_inspect(arg1);
                rb_raise(rb_eArgError, "%s isn't a valid argument for "
                         "IndexReader.get_document(index)", rs2s(rdesc));
            }
            case Qnil:
                return Qnil;
            default:
                return frb_get_doc_range(ir, pos, len, max);
        }
    }

    /* (start, length) form: NUM2LONG type-checks where FIX2LONG would
     * silently decode a non-Fixnum VALUE into a garbage number */
    pos = NUM2LONG(arg1);
    len = NUM2LONG(arg2);
    return frb_get_doc_range(ir, pos, len, max);
}
Close the IndexReader. This method also commits any deletions made by this IndexReader. This method will be called automatically by the garbage collector, but you should call it explicitly to commit any changes as soon as possible and to release any locks held by the object to prevent locking errors.
/*
 * Close the reader wrapped by +self+.
 *
 * The reader is passed to object_del, detached from +self+ with
 * Frt_Unwrap_Struct and then handed to ir_close. Returns +self+.
 */
static VALUE
frb_ir_close(VALUE self)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);

    object_del(reader);
    Frt_Unwrap_Struct(self);
    ir_close(reader);

    return self;
}
Commit any deletes made by this particular IndexReader to the index. This will open a Commit lock.
/*
 * Pass the reader wrapped by +self+ to ir_commit. Returns +self+.
 */
static VALUE
frb_ir_commit(VALUE self)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    ir_commit(reader);

    return self;
}
Delete document referenced internally by document id doc_id. The document_id is the number used to reference documents in the index and is returned by search methods.
/*
 * Delete the document whose id is the Fixnum +rdoc_id+ by calling
 * ir_delete_doc on the wrapped reader. Returns +self+.
 */
static VALUE
frb_ir_delete(VALUE self, VALUE rdoc_id)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);
    const int doc_id = FIX2INT(rdoc_id);

    ir_delete_doc(reader, doc_id);

    return self;
}
Returns true if the document at doc_id has been deleted.
/*
 * Ask the wrapped reader whether the document with id +rdoc_id+ is deleted.
 * Returns true or false.
 */
static VALUE
frb_ir_is_deleted(VALUE self, VALUE rdoc_id)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);

    if (reader->is_deleted(reader, FIX2INT(rdoc_id))) {
        return Qtrue;
    }
    return Qfalse;
}
Return the number of documents in which the term term appears in the field field.
/*
 * Return, as a Fixnum, the result of ir_doc_freq for the term +rterm+ in
 * the field +rfield+.
 */
static VALUE
frb_ir_doc_freq(VALUE self, VALUE rfield, VALUE rterm)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);
    VALUE rfreq;

    rfreq = INT2FIX(ir_doc_freq(reader,
                                frb_field(rfield),
                                StringValuePtr(rterm)));
    return rfreq;
}
Get the FieldInfos object for this IndexReader.
/*
 * Return the Ruby object that frb_get_field_infos produces for the wrapped
 * reader's FieldInfos.
 */
static VALUE
frb_ir_field_infos(VALUE self)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);
    FieldInfos *infos = reader->fis;

    return frb_get_field_infos(infos);
}
Returns an array of field names in the index. This can be used to pass to the QueryParser so that the QueryParser knows how to expand the “*” wild-card to all fields in the index. A list of field names can also be gathered from the FieldInfos object.
/*
 * Return a new Array holding the name of every field in the reader's
 * FieldInfos, in FieldInfos order.
 */
static VALUE
frb_ir_fields(VALUE self)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);
    FieldInfos *infos = reader->fis;
    VALUE rnames = rb_ary_new();
    int idx = 0;

    while (idx < infos->size) {
        FieldInfo *info = infos->fields[idx];
        rb_ary_push(rnames, FSYM2SYM(info->name));
        idx++;
    }

    return rnames;
}
Returns an array of field names in the index. This can be used to pass to the QueryParser so that the QueryParser knows how to expand the “*” wild-card to all fields in the index. A list of field names can also be gathered from the FieldInfos object.
/*
 * Return a new Array holding the name of every field in the reader's
 * FieldInfos, in FieldInfos order.
 */
static VALUE
frb_ir_fields(VALUE self)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);
    FieldInfos *infos = reader->fis;
    VALUE rnames = rb_ary_new();
    int idx = 0;

    while (idx < infos->size) {
        FieldInfo *info = infos->fields[idx];
        rb_ary_push(rnames, FSYM2SYM(info->name));
        idx++;
    }

    return rnames;
}
Retrieve a document from the index. See LazyDoc for more details on the document returned. Documents are referenced internally by document ids which are returned by the Searcher's search methods.
/*
 * Retrieve one document, or a range of documents, from the index.
 *
 * With a single argument the argument is either
 *   - a Fixnum document id; negative ids count back from max_doc, or
 *   - a Range, which is resolved against max_doc with rb_range_beg_len.
 * With two arguments they are taken as a start position and a length.
 *
 * Returns the lazy document for a single id, the result of
 * frb_get_doc_range for a range / (start, length) pair, or nil when
 * rb_range_beg_len returns nil for the range.
 *
 * Raises ArgumentError when a Fixnum id lies outside 0...max_doc or when the
 * single argument is neither a Fixnum nor a Range. Raises TypeError when the
 * two-argument form is given values that cannot be converted to integers.
 */
static VALUE
frb_ir_get_doc(int argc, VALUE *argv, VALUE self)
{
    IndexReader *ir = (IndexReader *)DATA_PTR(self);
    VALUE arg1, arg2;
    long pos, len;
    long max = ir->max_doc(ir);

    rb_scan_args(argc, argv, "11", &arg1, &arg2);

    if (argc == 1) {
        if (FIXNUM_P(arg1)) {
            pos = FIX2INT(arg1);
            pos = (pos < 0) ? (max + pos) : pos;
            if (pos < 0 || pos >= max) {
                /* pos and max are longs, so they need %ld; the last valid
                 * index is max - 1 */
                rb_raise(rb_eArgError, "index %ld is out of range [%d..%ld] "
                         "for IndexReader#[]", pos, 0, max - 1);
            }
            return frb_get_lazy_doc(ir->get_lazy_doc(ir, pos));
        }

        /* check if idx is Range */
        switch (rb_range_beg_len(arg1, &pos, &len, max, 0)) {
            case Qfalse: {
                /* arg1 is not a Range and may be any kind of object, so
                 * describe it with inspect rather than assuming a Symbol
                 * (a Symbol still renders as ":name") */
                VALUE rdesc = rb_inspect(arg1);
                rb_raise(rb_eArgError, "%s isn't a valid argument for "
                         "IndexReader.get_document(index)", rs2s(rdesc));
            }
            case Qnil:
                return Qnil;
            default:
                return frb_get_doc_range(ir, pos, len, max);
        }
    }

    /* (start, length) form: NUM2LONG type-checks where FIX2LONG would
     * silently decode a non-Fixnum VALUE into a garbage number */
    pos = NUM2LONG(arg1);
    len = NUM2LONG(arg2);
    return frb_get_doc_range(ir, pos, len, max);
}
Expert: Get the norm values into a string buffer starting at offset.
/*
 * Expert: write the norms of field +rfield+ into the String +rnorms+,
 * starting +roffset+ bytes into its buffer.
 *
 * +rnorms+ must be a String at least offset + max_doc bytes long and
 * +roffset+ must be a non-negative Fixnum. Returns +rnorms+.
 *
 * Raises TypeError if +rnorms+ is not a String, ArgumentError if the offset
 * is negative or the string is too short, and whatever rb_str_modify raises
 * if the string may not be modified (e.g. it is frozen).
 */
static VALUE
frb_ir_get_norms_into(VALUE self, VALUE rfield, VALUE rnorms, VALUE roffset)
{
    IndexReader *ir = (IndexReader *)DATA_PTR(self);
    /* kept as longs so that offset + max_doc cannot overflow an int and so
     * that the values match the %ld conversions below */
    long offset = FIX2INT(roffset);
    long norms_len;
    long max_doc;

    Check_Type(rnorms, T_STRING);

    if (offset < 0) {
        /* a negative offset would write in front of the string buffer */
        rb_raise(rb_eArgError, "supplied a negative offset:%ld to "
                 "IndexReader#get_norms_into", offset);
    }

    norms_len = RSTRING_LEN(rnorms);
    max_doc = ir->max_doc(ir);
    if (norms_len < offset + max_doc) {
        rb_raise(rb_eArgError, "supplied a string of length:%ld to "
                 "IndexReader#get_norms_into but needed a string of length "
                 "offset:%ld + maxdoc:%ld",
                 norms_len, offset, max_doc);
    }

    /* the buffer is written in place, so make sure the string is writable
     * and not sharing its buffer before taking the pointer */
    rb_str_modify(rnorms);

    ir_get_norms_into(ir, frb_field(rfield),
                      (uchar *)rs2s(rnorms) + offset);
    return rnorms;
}
Return true if the index has any deletions, either uncommitted by this IndexReader or committed by any other IndexReader.
/*
 * Return true if the wrapped reader's has_deletions callback reports any
 * deletions, false otherwise.
 */
static VALUE
frb_ir_has_deletions(VALUE self)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);

    if (reader->has_deletions(reader)) {
        return Qtrue;
    }
    return Qfalse;
}
Return true if the index version referenced by this IndexReader is the latest version of the index. If it isn’t you should close and reopen the index to search the latest documents added to the index.
/*
 * Return true if ir_is_latest reports a non-zero value for the wrapped
 * reader, false otherwise.
 */
static VALUE
frb_ir_is_latest(VALUE self)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);

    if (ir_is_latest(reader)) {
        return Qtrue;
    }
    return Qfalse;
}
Returns 1 + the maximum document id in the index. It is the document_id that will be used by the next document added to the index. If there are no deletions, this number also refers to the number of documents in the index.
/*
 * Return the value of the wrapped reader's max_doc callback as a Fixnum.
 */
static VALUE
frb_ir_max_doc(VALUE self)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    return INT2FIX(reader->max_doc(reader));
}
Expert: Returns a string containing the norm values for a field. The string length will be equal to the number of documents in the index and it could have null bytes.
/*
 * Expert: return the norms of field +rfield+ as a new String of max_doc
 * bytes, or nil when ir_get_norms yields no norms for the field.
 */
static VALUE
frb_ir_norms(VALUE self, VALUE rfield)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);
    uchar *field_norms = ir_get_norms(reader, frb_field(rfield));

    if (!field_norms) {
        return Qnil;
    }
    return rb_str_new((char *)field_norms, reader->max_doc(reader));
}
Returns the number of accessible (not deleted) documents in the index. This will be equal to IndexReader#max_doc if there have been no documents deleted from the index.
/*
 * Return the value of the wrapped reader's num_docs callback as a Fixnum.
 */
static VALUE
frb_ir_num_docs(VALUE self)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    return INT2FIX(reader->num_docs(reader));
}
Expert: change the boost value for a field in document at doc_id. val should be an integer in the range 0..255 which corresponds to an encoded float value.
/*
 * Expert: set the norm of field +rfield+ in document +rdoc_id+ to +rval+
 * (converted with NUM2CHR and cast to uchar) via ir_set_norm.
 * Returns +self+.
 */
static VALUE
frb_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    ir_set_norm(reader,
                FIX2INT(rdoc_id),
                frb_field(rfield),
                (uchar)NUM2CHR(rval));

    return self;
}
Returns a count of the number of terms in the field.
/*
 * Count the terms in field +rfield+ by stepping a term enumerator obtained
 * from ir_terms until its next callback returns a false value. The
 * enumerator is closed before the count is returned as a Fixnum.
 */
static VALUE
frb_ir_term_count(VALUE self, VALUE rfield)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);
    TermEnum *terms = ir_terms(reader, frb_field(rfield));
    int total;

    for (total = 0; terms->next(terms); total++) {
        /* every successful step is one more term */
    }
    terms->close(terms);

    return INT2FIX(total);
}
Builds a TermDocEnum (term-document enumerator) for the index. You can use this object to iterate through the documents in which certain terms occur. See TermDocEnum for more info.
/*
 * Wrap the enumerator produced by the reader's term_docs callback with
 * frb_get_tde and return it.
 */
static VALUE
frb_ir_term_docs(VALUE self)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    return frb_get_tde(self, reader->term_docs(reader));
}
Builds a TermDocEnum to iterate through the documents that contain the term term in the field field. See TermDocEnum for more info.
/*
 * Wrap the enumerator that ir_term_docs_for returns for the term +rterm+ in
 * field +rfield+ with frb_get_tde and return it.
 */
static VALUE
frb_ir_term_docs_for(VALUE self, VALUE rfield, VALUE rterm)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    return frb_get_tde(self,
                       ir_term_docs_for(reader,
                                        frb_field(rfield),
                                        StringValuePtr(rterm)));
}
Same as IndexReader#term_docs except the TermDocEnum will also allow you to scan through the positions at which a term occurs. See TermDocEnum for more info.
/*
 * Wrap the enumerator produced by the reader's term_positions callback with
 * frb_get_tde and return it.
 */
static VALUE
frb_ir_term_positions(VALUE self)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    return frb_get_tde(self, reader->term_positions(reader));
}
Same as IndexReader#term_docs_for(field, term) except the TermDocEnum will also allow you to scan through the positions at which a term occurs. See TermDocEnum for more info.
/*
 * Wrap the enumerator that ir_term_positions_for returns for the term
 * +rterm+ in field +rfield+ with frb_get_tde and return it.
 */
static VALUE
frb_ir_t_pos_for(VALUE self, VALUE rfield, VALUE rterm)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    return frb_get_tde(self,
                       ir_term_positions_for(reader,
                                             frb_field(rfield),
                                             StringValuePtr(rterm)));
}
Return the TermVector for the field field in the document at doc_id in the index. Return nil if no such term_vector exists. See TermVector.
/*
 * Return the Ruby term vector for field +rfield+ of document +rdoc_id+, or
 * nil when the reader's term_vector callback returns no vector.
 *
 * The C term vector is destroyed once frb_get_tv has built the Ruby object
 * from it.
 */
static VALUE
frb_ir_term_vector(VALUE self, VALUE rdoc_id, VALUE rfield)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);
    TermVector *vector;
    VALUE rvector;

    vector = reader->term_vector(reader, FIX2INT(rdoc_id), frb_field(rfield));
    if (!vector) {
        return Qnil;
    }

    rvector = frb_get_tv(vector);
    tv_destroy(vector);
    return rvector;
}
Return the TermVectors for the document at doc_id in the index. The value returned is a hash of the TermVectors for each field in the document and they are referenced by field names (as symbols).
/*
 * Return a new Ruby Hash filled from the term vectors of document
 * +rdoc_id+.
 *
 * Each entry of the C hash returned by the reader's term_vectors callback
 * is passed to frb_add_each_tv together with the Ruby Hash; the C hash is
 * destroyed afterwards.
 */
static VALUE
frb_ir_term_vectors(VALUE self, VALUE rdoc_id)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);
    Hash *vectors;
    VALUE rvectors;

    vectors = reader->term_vectors(reader, FIX2INT(rdoc_id));
    rvectors = rb_hash_new();

    h_each(vectors, &frb_add_each_tv, (void *)rvectors);
    h_destroy(vectors);

    return rvectors;
}
Returns a term enumerator which allows you to iterate through all the terms in the field field in the index.
/*
 * Wrap the term enumerator that ir_terms returns for field +rfield+ with
 * frb_get_te and return it.
 */
static VALUE
frb_ir_terms(VALUE self, VALUE rfield)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    return frb_get_te(self, ir_terms(reader, frb_field(rfield)));
}
Same as IndexReader#terms(fields) except that it starts the enumerator off at term term.
/*
 * Wrap the term enumerator that ir_terms_from returns for field +rfield+
 * and starting term +rterm+ with frb_get_te and return it.
 */
static VALUE
frb_ir_terms_from(VALUE self, VALUE rfield, VALUE rterm)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    return frb_get_te(self,
                      ir_terms_from(reader,
                                    frb_field(rfield),
                                    StringValuePtr(rterm)));
}
Returns an array of field names of all of the tokenized fields in the index. This can be used to pass to the QueryParser so that the QueryParser knows how to expand the “*” wild-card to all fields in the index. A list of field names can also be gathered from the FieldInfos object.
/*
 * Return a new Array holding the names of those fields in the reader's
 * FieldInfos for which fi_is_tokenized is true, in FieldInfos order.
 */
static VALUE
frb_ir_tk_fields(VALUE self)
{
    IndexReader *reader = (IndexReader *)DATA_PTR(self);
    FieldInfos *infos = reader->fis;
    VALUE rnames = rb_ary_new();
    int idx = 0;

    while (idx < infos->size) {
        if (fi_is_tokenized(infos->fields[idx])) {
            rb_ary_push(rnames, FSYM2SYM(infos->fields[idx]->name));
        }
        idx++;
    }

    return rnames;
}
Undelete all deleted documents in the index. This is kind of like a rollback feature. Note that once an index is committed or a merge happens during indexing, deletions will be committed and undelete_all will have no effect on these documents.
/*
 * Pass the reader wrapped by +self+ to ir_undelete_all. Returns +self+.
 */
static VALUE
frb_ir_undelete_all(VALUE self)
{
    IndexReader *reader;

    reader = (IndexReader *)DATA_PTR(self);
    ir_undelete_all(reader);

    return self;
}
Generated with the Darkfish Rdoc Generator 2.