core/Lucy/Index/LexiconReader.c - lucy - Git at Google

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #define C_LUCY_LEXICONREADER
 #define C_LUCY_POLYLEXICONREADER
 #define C_LUCY_DEFAULTLEXICONREADER
 #include "Lucy/Util/ToolSet.h"

 #include "Lucy/Index/LexiconReader.h"
 #include "Lucy/Plan/FieldType.h"
 #include "Lucy/Plan/Schema.h"
 #include "Lucy/Index/PolyLexicon.h"
 #include "Lucy/Index/SegLexicon.h"
 #include "Lucy/Index/Segment.h"
 #include "Lucy/Index/Snapshot.h"
 #include "Lucy/Index/TermInfo.h"
 #include "Lucy/Store/Folder.h"

 // Initialize the LexiconReader layer of `self`: hand every argument to
 // DataReader_init, then run the abstract-class check against LEXICONREADER.
 // Returns `self`.
 LexiconReader*
 LexReader_init(LexiconReader *self, Schema *schema, Folder *folder,
                Snapshot *snapshot, VArray *segments, int32_t seg_tick) {
     DataReader *const base = (DataReader*)self;
     DataReader_init(base, schema, folder, snapshot, segments, seg_tick);
     ABSTRACT_CLASS_CHECK(self, LEXICONREADER);
     return self;
 }

 // Wrap `readers` and `offsets` in a newly created PolyLexiconReader and
 // return it as a LexiconReader.  The receiver itself is not consulted.
 LexiconReader*
 LexReader_aggregator(LexiconReader *self, VArray *readers, I32Array *offsets) {
     PolyLexiconReader *aggregate = PolyLexReader_new(readers, offsets);
     UNUSED_VAR(self);
     return (LexiconReader*)aggregate;
 }

 // Constructor: make a blank object from the POLYLEXICONREADER VTable and
 // pass it straight to PolyLexReader_init.
 PolyLexiconReader*
 PolyLexReader_new(VArray *readers, I32Array *offsets) {
     return PolyLexReader_init(
                (PolyLexiconReader*)VTable_Make_Obj(POLYLEXICONREADER),
                readers, offsets);
 }

 // Initialize a PolyLexiconReader over `readers`.
 //
 // Every element of `readers` is run through CERTIFY as a LEXICONREADER.
 // The schema handed to the parent initializer is the first non-NULL schema
 // reported by those readers (NULL if there is none).  The object keeps
 // INCREF'd references to both `readers` and `offsets`.  Returns `self`.
 PolyLexiconReader*
 PolyLexReader_init(PolyLexiconReader *self, VArray *readers,
                    I32Array *offsets) {
     Schema   *schema      = NULL;
     uint32_t  num_readers = VA_Get_Size(readers);
     uint32_t  tick        = 0;

     while (tick < num_readers) {
         LexiconReader *sub
             = (LexiconReader*)CERTIFY(VA_Fetch(readers, tick), LEXICONREADER);
         if (schema == NULL) {
             schema = LexReader_Get_Schema(sub);
         }
         tick++;
     }

     // Folder, snapshot and segments are passed as NULL, seg_tick as -1.
     LexReader_init((LexiconReader*)self, schema, NULL, NULL, NULL, -1);
     self->readers = (VArray*)INCREF(readers);
     self->offsets = (I32Array*)INCREF(offsets);
     return self;
 }

 // Close each non-NULL sub-reader, then empty the readers array.  Does
 // nothing when the readers array is NULL.
 void
 PolyLexReader_close(PolyLexiconReader *self) {
     if (self->readers == NULL) { return; }

     uint32_t count = VA_Get_Size(self->readers);
     for (uint32_t tick = 0; tick < count; tick++) {
         LexiconReader *sub = (LexiconReader*)VA_Fetch(self->readers, tick);
         if (sub != NULL) {
             LexReader_Close(sub);
         }
     }
     VA_Clear(self->readers);
 }

 // Destructor: drop the references held on the readers array and the
 // offsets array, then chain to the parent class destructor.
 void
 PolyLexReader_destroy(PolyLexiconReader *self) {
     VArray   *held_readers = self->readers;
     I32Array *held_offsets = self->offsets;
     DECREF(held_readers);
     DECREF(held_offsets);
     SUPER_DESTROY(self, POLYLEXICONREADER);
 }

 // Produce a PolyLexicon for `field` spanning this reader's sub-readers.
 //
 // Returns NULL when `field` is NULL, when the schema has no type for the
 // field, or when the new PolyLexicon reports zero segment lexicons.  When
 // `term` is non-NULL the lexicon is sought to it before being returned.
 Lexicon*
 PolyLexReader_lexicon(PolyLexiconReader *self, const CharBuf *field,
                       Obj *term) {
     if (field == NULL) { return NULL; }

     Schema *schema = PolyLexReader_Get_Schema(self);
     if (Schema_Fetch_Type(schema, field) == NULL) { return NULL; }

     PolyLexicon *merged = PolyLex_new(field, self->readers);
     if (PolyLex_Get_Num_Seg_Lexicons(merged) == 0) {
         // Nothing to iterate over: discard the empty lexicon.
         DECREF(merged);
         return NULL;
     }

     if (term != NULL) {
         PolyLex_Seek(merged, term);
     }
     return (Lexicon*)merged;
 }

 // Sum LexReader_Doc_Freq(field, term) over every non-NULL sub-reader.
 uint32_t
 PolyLexReader_doc_freq(PolyLexiconReader *self, const CharBuf *field,
                        Obj *term) {
     uint32_t total = 0;
     uint32_t count = VA_Get_Size(self->readers);

     for (uint32_t tick = 0; tick < count; tick++) {
         LexiconReader *sub = (LexiconReader*)VA_Fetch(self->readers, tick);
         if (sub == NULL) { continue; }
         total += LexReader_Doc_Freq(sub, field, term);
     }

     return total;
 }

 // Constructor: make a blank object from the DEFAULTLEXICONREADER VTable and
 // pass it, with all arguments, to DefLexReader_init.
 DefaultLexiconReader*
 DefLexReader_new(Schema *schema, Folder *folder, Snapshot *snapshot,
                  VArray *segments, int32_t seg_tick) {
     return DefLexReader_init(
                (DefaultLexiconReader*)VTable_Make_Obj(DEFAULTLEXICONREADER),
                schema, folder, snapshot, segments, seg_tick);
 }

 // Report whether a SegLexicon can safely be built for `field` in `segment`.
 // The answer is false when the schema has no type for the field, when that
 // type is not indexed, or when the segment has no lexicon file for the
 // field (i.e. no terms are present for it in this segment).
 static bool_t
 S_has_data(Schema *schema, Folder *folder, Segment *segment, CharBuf *field) {
     FieldType *type = Schema_Fetch_Type(schema, field);
     if (type == NULL || !FType_Indexed(type)) {
         // Unknown or unindexed field.
         return false;
     }

     // Look for "<segment name>/lexicon-<field number>.dat" in the folder.
     int32_t  number = Seg_Field_Num(segment, field);
     CharBuf *name   = Seg_Get_Name(segment);
     CharBuf *path   = CB_newf("%o/lexicon-%i32.dat", name, number);
     bool_t   found  = Folder_Exists(folder, path);
     DECREF(path);
     return found;
 }

 // Initialize a DefaultLexiconReader.
 //
 // After the parent initializer runs, a SegLexicon is created for every
 // field number from 1 through Schema_Num_Fields(schema) whose field name
 // is known to the segment and for which S_has_data() is true.  Each one is
 // stored in self->lexicons at the index equal to its field number.
 // Returns `self`.
 DefaultLexiconReader*
 DefLexReader_init(DefaultLexiconReader *self, Schema *schema, Folder *folder,
                   Snapshot *snapshot, VArray *segments, int32_t seg_tick) {
     LexReader_init((LexiconReader*)self, schema, folder, snapshot, segments,
                    seg_tick);
     Segment *segment = DefLexReader_Get_Segment(self);

     self->lexicons = VA_new(Schema_Num_Fields(schema));
     for (uint32_t num = 1, limit = Schema_Num_Fields(schema) + 1;
          num < limit;
          num++
         ) {
         CharBuf *field = Seg_Field_Name(segment, num);
         if (field == NULL) { continue; }
         if (!S_has_data(schema, folder, segment, field)) { continue; }

         SegLexicon *seg_lex = SegLex_new(schema, folder, segment, field);
         VA_Store(self->lexicons, num, (Obj*)seg_lex);
     }

     return self;
 }

 // Release the array of per-field lexicons and clear the member so that it
 // no longer points at the released array.
 void
 DefLexReader_close(DefaultLexiconReader *self) {
     VArray *released = self->lexicons;
     DECREF(released);
     self->lexicons = NULL;
 }

 // Destructor: drop the reference held on the lexicons array, then chain to
 // the parent class destructor.
 void
 DefLexReader_destroy(DefaultLexiconReader *self) {
     VArray *held_lexicons = self->lexicons;
     DECREF(held_lexicons);
     SUPER_DESTROY(self, DEFAULTLEXICONREADER);
 }

 // Return a newly created SegLexicon for `field`, sought to `term`.
 //
 // The stored lexicon for the field is used only as an indicator that the
 // field has data; the caller receives a separate SegLexicon built from this
 // reader's schema, folder and segment.  Returns NULL when nothing is stored
 // for the field.
 Lexicon*
 DefLexReader_lexicon(DefaultLexiconReader *self, const CharBuf *field,
                      Obj *term) {
     int32_t field_num = Seg_Field_Num(self->segment, field);
     if (VA_Fetch(self->lexicons, field_num) == NULL) {
         // No data for this field.
         return NULL;
     }

     SegLexicon *fresh
         = SegLex_new(self->schema, self->folder, self->segment, field);
     SegLex_Seek(fresh, term);
     return (Lexicon*)fresh;
 }

 // Locate the TermInfo for an exact match of `target` within `field`.
 //
 // Returns NULL if either argument is NULL, if no lexicon is stored for the
 // field, or if the term reached by seeking does not equal `target`.
 // Otherwise returns whatever SegLex_Get_Term_Info() yields for the stored
 // lexicon.
 static TermInfo*
 S_find_tinfo(DefaultLexiconReader *self, const CharBuf *field, Obj *target) {
     if (field == NULL || target == NULL) { return NULL; }

     int32_t     field_num = Seg_Field_Num(self->segment, field);
     SegLexicon *stored
         = (SegLexicon*)VA_Fetch(self->lexicons, field_num);
     if (stored == NULL) { return NULL; }

     // Seeking advances until the current term is >= target, so an exact
     // hit still has to be confirmed with an equality test.
     SegLex_Seek(stored, target);
     Obj *landed = SegLex_Get_Term(stored);
     if (landed == NULL || !Obj_Equals(target, landed)) {
         return NULL;
     }

     return SegLex_Get_Term_Info(stored);
 }

 // Return a clone of the TermInfo matching `target` in `field`, or NULL when
 // S_find_tinfo() finds no exact match.
 TermInfo*
 DefLexReader_fetch_term_info(DefaultLexiconReader *self,
                              const CharBuf *field, Obj *target) {
     TermInfo *match = S_find_tinfo(self, field, target);
     if (match == NULL) {
         return NULL;
     }
     return TInfo_Clone(match);
 }

 // Return the doc freq recorded in the TermInfo matching `term` in `field`,
 // or 0 when S_find_tinfo() finds no exact match.
 uint32_t
 DefLexReader_doc_freq(DefaultLexiconReader *self, const CharBuf *field,
                       Obj *term) {
     TermInfo *match = S_find_tinfo(self, field, term);
     if (match != NULL) {
         return TInfo_Get_Doc_Freq(match);
     }
     return 0;
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#define C_LUCY_LEXICONREADER
	#define C_LUCY_POLYLEXICONREADER
	#define C_LUCY_DEFAULTLEXICONREADER
	#include "Lucy/Util/ToolSet.h"

	#include "Lucy/Index/LexiconReader.h"
	#include "Lucy/Plan/FieldType.h"
	#include "Lucy/Plan/Schema.h"
	#include "Lucy/Index/PolyLexicon.h"
	#include "Lucy/Index/SegLexicon.h"
	#include "Lucy/Index/Segment.h"
	#include "Lucy/Index/Snapshot.h"
	#include "Lucy/Index/TermInfo.h"
	#include "Lucy/Store/Folder.h"

	// Initialize the LexiconReader layer of `self`: forward every argument to
	// DataReader_init, then run the abstract-class check against
	// LEXICONREADER.  Returns `self`.
	LexiconReader*
	LexReader_init(LexiconReader *self, Schema *schema, Folder *folder,
	               Snapshot *snapshot, VArray *segments, int32_t seg_tick) {
	    DataReader_init((DataReader*)self, schema, folder, snapshot, segments,
	                    seg_tick);
	    ABSTRACT_CLASS_CHECK(self, LEXICONREADER);
	    return self;
	}

	// Wrap `readers` and `offsets` in a newly created PolyLexiconReader and
	// return it as a LexiconReader.  The receiver itself is not consulted.
	LexiconReader*
	LexReader_aggregator(LexiconReader *self, VArray *readers, I32Array *offsets) {
	    UNUSED_VAR(self);
	    return (LexiconReader*)PolyLexReader_new(readers, offsets);
	}

	// Constructor: make a blank object from the POLYLEXICONREADER VTable and
	// pass it to PolyLexReader_init.
	PolyLexiconReader*
	PolyLexReader_new(VArray *readers, I32Array *offsets) {
	    PolyLexiconReader *self
	        = (PolyLexiconReader*)VTable_Make_Obj(POLYLEXICONREADER);
	    return PolyLexReader_init(self, readers, offsets);
	}

	// Initialize a PolyLexiconReader over `readers`.
	//
	// Every element of `readers` is run through CERTIFY as a LEXICONREADER.
	// The schema handed to the parent initializer is the first non-NULL schema
	// reported by those readers (NULL if there is none).  The object keeps
	// INCREF'd references to both `readers` and `offsets`.  Returns `self`.
	PolyLexiconReader*
	PolyLexReader_init(PolyLexiconReader *self, VArray *readers,
	                   I32Array *offsets) {
	    Schema *schema = NULL;
	    for (uint32_t i = 0, max = VA_Get_Size(readers); i < max; i++) {
	        LexiconReader *reader
	            = (LexiconReader*)CERTIFY(VA_Fetch(readers, i), LEXICONREADER);
	        if (!schema) { schema = LexReader_Get_Schema(reader); }
	    }
	    // Folder, snapshot and segments are passed as NULL, seg_tick as -1.
	    LexReader_init((LexiconReader*)self, schema, NULL, NULL, NULL, -1);
	    self->readers = (VArray*)INCREF(readers);
	    self->offsets = (I32Array*)INCREF(offsets);
	    return self;
	}

	// Close each non-NULL sub-reader, then empty the readers array.  Does
	// nothing when the readers array is NULL.
	void
	PolyLexReader_close(PolyLexiconReader *self) {
	    if (self->readers == NULL) { return; }

	    uint32_t count = VA_Get_Size(self->readers);
	    for (uint32_t tick = 0; tick < count; tick++) {
	        LexiconReader *sub = (LexiconReader*)VA_Fetch(self->readers, tick);
	        if (sub != NULL) {
	            LexReader_Close(sub);
	        }
	    }
	    VA_Clear(self->readers);
	}

	// Destructor: drop the references held on the readers array and the
	// offsets array, then chain to the parent class destructor.
	void
	PolyLexReader_destroy(PolyLexiconReader *self) {
	    VArray   *held_readers = self->readers;
	    I32Array *held_offsets = self->offsets;
	    DECREF(held_readers);
	    DECREF(held_offsets);
	    SUPER_DESTROY(self, POLYLEXICONREADER);
	}

	// Produce a PolyLexicon for `field` spanning this reader's sub-readers.
	//
	// Returns NULL when `field` is NULL, when the schema has no type for the
	// field, or when the new PolyLexicon reports zero segment lexicons.  When
	// `term` is non-NULL the lexicon is sought to it before being returned.
	Lexicon*
	PolyLexReader_lexicon(PolyLexiconReader *self, const CharBuf *field,
	                      Obj *term) {
	    PolyLexicon *lexicon = NULL;

	    if (field != NULL) {
	        Schema *schema = PolyLexReader_Get_Schema(self);
	        FieldType *type = Schema_Fetch_Type(schema, field);
	        if (type != NULL) {
	            lexicon = PolyLex_new(field, self->readers);
	            if (!PolyLex_Get_Num_Seg_Lexicons(lexicon)) {
	                // Nothing to iterate over: discard the empty lexicon.
	                DECREF(lexicon);
	                return NULL;
	            }
	            if (term) { PolyLex_Seek(lexicon, term); }
	        }
	    }

	    return (Lexicon*)lexicon;
	}

	// Sum LexReader_Doc_Freq(field, term) over every non-NULL sub-reader.
	uint32_t
	PolyLexReader_doc_freq(PolyLexiconReader *self, const CharBuf *field,
	                       Obj *term) {
	    uint32_t doc_freq = 0;
	    for (uint32_t i = 0, max = VA_Get_Size(self->readers); i < max; i++) {
	        LexiconReader *reader = (LexiconReader*)VA_Fetch(self->readers, i);
	        if (reader) {
	            doc_freq += LexReader_Doc_Freq(reader, field, term);
	        }
	    }
	    return doc_freq;
	}

	// Constructor: make a blank object from the DEFAULTLEXICONREADER VTable
	// and pass it, with all arguments, to DefLexReader_init.
	DefaultLexiconReader*
	DefLexReader_new(Schema *schema, Folder *folder, Snapshot *snapshot,
	                 VArray *segments, int32_t seg_tick) {
	    DefaultLexiconReader *self
	        = (DefaultLexiconReader*)VTable_Make_Obj(DEFAULTLEXICONREADER);
	    return DefLexReader_init(self, schema, folder, snapshot, segments,
	                             seg_tick);
	}

	// Indicate whether it is safe to build a SegLexicon using the given
	// parameters.  Returns false if the schema has no type for the field, if
	// the field is not indexed, or if no terms are present for this field in
	// this segment (its lexicon file is absent from the folder).
	static bool_t
	S_has_data(Schema *schema, Folder *folder, Segment *segment, CharBuf *field) {
	    FieldType *type = Schema_Fetch_Type(schema, field);

	    if (!type || !FType_Indexed(type)) {
	        // If the field isn't indexed, bail out.
	        return false;
	    }
	    else {
	        // Bail out if there are no terms for this field in this segment:
	        // look for "<segment name>/lexicon-<field number>.dat".
	        int32_t  field_num = Seg_Field_Num(segment, field);
	        CharBuf *seg_name  = Seg_Get_Name(segment);
	        CharBuf *file = CB_newf("%o/lexicon-%i32.dat", seg_name, field_num);
	        bool_t retval = Folder_Exists(folder, file);
	        DECREF(file);
	        return retval;
	    }
	}

	// Initialize a DefaultLexiconReader.
	//
	// After the parent initializer runs, a SegLexicon is created for every
	// field number from 1 through Schema_Num_Fields(schema) whose field name
	// is known to the segment and for which S_has_data() is true.  Each one
	// is stored in self->lexicons at the index equal to its field number.
	// Returns `self`.
	DefaultLexiconReader*
	DefLexReader_init(DefaultLexiconReader *self, Schema *schema, Folder *folder,
	                  Snapshot *snapshot, VArray *segments, int32_t seg_tick) {

	    // Init.
	    LexReader_init((LexiconReader*)self, schema, folder, snapshot, segments,
	                   seg_tick);
	    Segment *segment = DefLexReader_Get_Segment(self);

	    // Build an array of SegLexicon objects.
	    self->lexicons = VA_new(Schema_Num_Fields(schema));
	    for (uint32_t i = 1, max = Schema_Num_Fields(schema) + 1; i < max; i++) {
	        CharBuf *field = Seg_Field_Name(segment, i);
	        if (field && S_has_data(schema, folder, segment, field)) {
	            SegLexicon *lexicon = SegLex_new(schema, folder, segment, field);
	            VA_Store(self->lexicons, i, (Obj*)lexicon);
	        }
	    }

	    return self;
	}

	// Release the array of per-field lexicons and clear the member so that it
	// no longer points at the released array.
	void
	DefLexReader_close(DefaultLexiconReader *self) {
	    VArray *released = self->lexicons;
	    DECREF(released);
	    self->lexicons = NULL;
	}

	// Destructor: drop the reference held on the lexicons array, then chain
	// to the parent class destructor.
	void
	DefLexReader_destroy(DefaultLexiconReader *self) {
	    VArray *held_lexicons = self->lexicons;
	    DECREF(held_lexicons);
	    SUPER_DESTROY(self, DEFAULTLEXICONREADER);
	}

	// Return a newly created SegLexicon for `field`, sought to `term`.
	//
	// The stored lexicon for the field is used only as an indicator that the
	// field has data; the caller receives a separate SegLexicon built from
	// this reader's schema, folder and segment.  Returns NULL when nothing is
	// stored for the field.
	Lexicon*
	DefLexReader_lexicon(DefaultLexiconReader *self, const CharBuf *field,
	                     Obj *term) {
	    int32_t     field_num = Seg_Field_Num(self->segment, field);
	    SegLexicon *orig      = (SegLexicon*)VA_Fetch(self->lexicons, field_num);
	    SegLexicon *lexicon   = NULL;

	    if (orig) { // i.e. has data
	        lexicon
	            = SegLex_new(self->schema, self->folder, self->segment, field);
	        SegLex_Seek(lexicon, term);
	    }

	    return (Lexicon*)lexicon;
	}

	// Locate the TermInfo for an exact match of `target` within `field`.
	//
	// Returns NULL if either argument is NULL, if no lexicon is stored for
	// the field, or if the term reached by seeking does not equal `target`.
	// Otherwise returns whatever SegLex_Get_Term_Info() yields for the stored
	// lexicon.
	static TermInfo*
	S_find_tinfo(DefaultLexiconReader *self, const CharBuf *field, Obj *target) {
	    if (field != NULL && target != NULL) {
	        int32_t field_num = Seg_Field_Num(self->segment, field);
	        SegLexicon *lexicon
	            = (SegLexicon*)VA_Fetch(self->lexicons, field_num);

	        if (lexicon) {
	            // Iterate until the result is ge the term.
	            SegLex_Seek(lexicon, target);

	            // If the term found matches target, return its info;
	            // otherwise fall through to NULL.
	            Obj *found = SegLex_Get_Term(lexicon);
	            if (found && Obj_Equals(target, found)) {
	                return SegLex_Get_Term_Info(lexicon);
	            }
	        }
	    }
	    return NULL;
	}

	// Return a clone of the TermInfo matching `target` in `field`, or NULL
	// when S_find_tinfo() finds no exact match.
	TermInfo*
	DefLexReader_fetch_term_info(DefaultLexiconReader *self,
	                             const CharBuf *field, Obj *target) {
	    TermInfo *tinfo = S_find_tinfo(self, field, target);
	    return tinfo ? TInfo_Clone(tinfo) : NULL;
	}

	// Return the doc freq recorded in the TermInfo matching `term` in
	// `field`, or 0 when S_find_tinfo() finds no exact match.
	uint32_t
	DefLexReader_doc_freq(DefaultLexiconReader *self, const CharBuf *field,
	                      Obj *term) {
	    TermInfo *tinfo = S_find_tinfo(self, field, term);
	    return tinfo ? TInfo_Get_Doc_Freq(tinfo) : 0;
	}