/* core/Lucy/Index/SegLexicon.c - lucy - Git at Google */

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #define C_LUCY_SEGLEXICON
 #include "Lucy/Util/ToolSet.h"

 #include "Lucy/Index/SegLexicon.h"
 #include "Lucy/Index/Segment.h"
 #include "Lucy/Index/PostingList.h"
 #include "Lucy/Index/TermInfo.h"
 #include "Lucy/Index/LexIndex.h"
 #include "Lucy/Index/LexiconWriter.h"
 #include "Lucy/Index/Posting/MatchPosting.h"
 #include "Lucy/Index/SegPostingList.h"
 #include "Lucy/Index/TermStepper.h"
 #include "Lucy/Plan/Architecture.h"
 #include "Lucy/Plan/FieldType.h"
 #include "Lucy/Plan/Schema.h"
 #include "Lucy/Store/Folder.h"
 #include "Lucy/Store/InStream.h"

 // Iterate until the state is greater than or equal to the target.
 // Forward declaration; the definition appears later in this file and is
 // called from SegLex_seek() after the LexIndex has chosen a starting point.
 static void
 S_scan_to(SegLexicon *self, Obj *target);

 // Constructor: allocate a blank SegLexicon through the SEGLEXICON vtable and
 // pass it, along with the caller's arguments, straight to SegLex_init().
 // Returns whatever SegLex_init() returns.
 SegLexicon*
 SegLex_new(Schema *schema, Folder *folder, Segment *segment,
            const CharBuf *field) {
     return SegLex_init((SegLexicon*)VTable_Make_Obj(SEGLEXICON),
                        schema, folder, segment, field);
 }

 // Initialize a SegLexicon for `field` within `segment`.
 //
 // Reads the segment's "lexicon" metadata, checks its "format" entry against
 // LexWriter_current_file_format, takes the term count for `field` from its
 // "counts" entry, builds a LexIndex, and opens the data file
 // "<seg_name>/lexicon-<field_num>.dat" from `folder`.
 //
 // Throws if "format" or "counts" is missing from the metadata, or if the
 // format number is greater than LexWriter_current_file_format.  If the data
 // file cannot be opened, `self` is released and the error obtained from
 // Err_get_error() is rethrown.
 //
 // Returns `self`.
 SegLexicon*
 SegLex_init(SegLexicon *self, Schema *schema, Folder *folder,
             Segment *segment, const CharBuf *field) {
     Hash *metadata = (Hash*)CERTIFY(
                          Seg_Fetch_Metadata_Str(segment, "lexicon", 7),
                          HASH);
     Architecture *arch      = Schema_Get_Architecture(schema);
     Hash         *counts    = (Hash*)Hash_Fetch_Str(metadata, "counts", 6);
     Obj          *format    = Hash_Fetch_Str(metadata, "format", 6);
     CharBuf      *seg_name  = Seg_Get_Name(segment);
     int32_t       field_num = Seg_Field_Num(segment, field);
     FieldType    *type      = Schema_Fetch_Type(schema, field);
     CharBuf      *filename  = NULL;

     Lex_init((Lexicon*)self, field);

     // Check format.
     if (!format) { THROW(ERR, "Missing 'format'"); }
     else {
         const int64_t format_num = Obj_To_I64(format);
         if (format_num > LexWriter_current_file_format) {
             THROW(ERR, "Unsupported lexicon format: %i64", format_num);
         }
     }

     // Extract count from metadata.
     if (!counts) { THROW(ERR, "Failed to extract 'counts'"); }
     else {
         Obj *count = CERTIFY(Hash_Fetch(counts, (Obj*)field), OBJ);
         self->size = (int32_t)Obj_To_I64(count);
     }

     // Assign.
     self->segment        = (Segment*)INCREF(segment);

     // Derive.
     self->lex_index      = LexIndex_new(schema, folder, segment, field);
     self->field_num      = field_num;
     self->index_interval = Arch_Index_Interval(arch);
     self->skip_interval  = Arch_Skip_Interval(arch);

     // Build the filename only now that it is needed, so that the error
     // paths above cannot exit with the string still allocated.
     filename = CB_newf("%o/lexicon-%i32.dat", seg_name, field_num);
     self->instream = Folder_Open_In(folder, filename);
     if (!self->instream) {
         // Grab the error before releasing anything, then clean up.
         Err *error = (Err*)INCREF(Err_get_error());
         DECREF(filename);
         DECREF(self);
         RETHROW(error);
     }
     DECREF(filename);

     // Define the term_num as "not yet started".
     self->term_num = -1;

     // Get steppers.
     self->term_stepper  = FType_Make_Term_Stepper(type);
     self->tinfo_stepper = (TermStepper*)MatchTInfoStepper_new(schema);

     return self;
 }

 // Destructor.  Drops this object's references to the segment (INCREF'd in
 // SegLex_init), both steppers, the LexIndex and the input stream, then
 // chains to the parent class's destructor via SUPER_DESTROY.
 void
 SegLex_destroy(SegLexicon *self) {
     DECREF(self->segment);
     DECREF(self->term_stepper);
     DECREF(self->tinfo_stepper);
     DECREF(self->lex_index);
     DECREF(self->instream);
     SUPER_DESTROY(self, SEGLEXICON);
 }

 // Seek toward `target`.  A NULL target resets the lexicon.  Otherwise the
 // LexIndex supplies a starting term, term info and file position, and
 // S_scan_to() then steps forward from there.
 void
 SegLex_seek(SegLexicon *self, Obj *target) {
     LexIndex *const index = self->lex_index;
     TermInfo *index_tinfo;
     TermInfo *own_tinfo;
     Obj      *index_term;

     // NULL target: rewind and stop.
     if (!target) {
         SegLex_Reset(self);
         return;
     }

     // Let the LexIndex find a nearby entry.
     LexIndex_Seek(index, target);
     index_tinfo = LexIndex_Get_Term_Info(index);
     own_tinfo   = (TermInfo*)TermStepper_Get_Value(self->tinfo_stepper);
     index_term  = Obj_Clone(LexIndex_Get_Term(index));

     // Copy the index's term info into our own, install a clone of its term
     // in our term stepper, and move the stream to the recorded position.
     TInfo_Mimic(own_tinfo, (Obj*)index_tinfo);
     TermStepper_Set_Value(self->term_stepper, index_term);
     DECREF(index_term);
     InStream_Seek(self->instream, TInfo_Get_Lex_FilePos(index_tinfo));
     self->term_num = LexIndex_Get_Term_Num(index);

     // Step forward to the precise location.
     S_scan_to(self, target);
 }

 // Return the lexicon to its "not yet started" state: term_num goes back to
 // -1, the input stream is repositioned to offset 0, and both steppers are
 // reset.
 void
 SegLex_reset(SegLexicon* self) {
     self->term_num = -1;
     InStream_Seek(self->instream, 0);
     TermStepper_Reset(self->term_stepper);
     TermStepper_Reset(self->tinfo_stepper);
 }

 // Accessor for the field number stored by SegLex_init().
 int32_t
 SegLex_get_field_num(SegLexicon *self) {
     const int32_t field_num = self->field_num;
     return field_num;
 }

 // Return the term stepper's current value.  No extra reference is taken
 // here.
 Obj*
 SegLex_get_term(SegLexicon *self) {
     Obj *const term = TermStepper_Get_Value(self->term_stepper);
     return term;
 }

 // Return the doc freq from the current term info, or 0 when the term-info
 // stepper has no value.
 int32_t
 SegLex_doc_freq(SegLexicon *self) {
     TermInfo *const tinfo
         = (TermInfo*)TermStepper_Get_Value(self->tinfo_stepper);
     if (tinfo == NULL) { return 0; }
     return TInfo_Get_Doc_Freq(tinfo);
 }

 // Return the term-info stepper's current value, cast to TermInfo*.  No
 // extra reference is taken here.
 TermInfo*
 SegLex_get_term_info(SegLexicon *self) {
     TermInfo *const tinfo
         = (TermInfo*)TermStepper_Get_Value(self->tinfo_stepper);
     return tinfo;
 }

 // Accessor for the Segment this lexicon holds.  No extra reference is
 // taken here.
 Segment*
 SegLex_get_segment(SegLexicon *self) {
     Segment *const segment = self->segment;
     return segment;
 }

 // Advance to the next term.  Returns true after reading one term and its
 // term info from the stream; returns false, with both steppers reset, once
 // term_num has reached `size`.
 bool_t
 SegLex_next(SegLexicon *self) {
     self->term_num++;

     if (self->term_num < self->size) {
         // Still inside the lexicon: decode the next term, then its info.
         TermStepper_Read_Delta(self->term_stepper, self->instream);
         TermStepper_Read_Delta(self->tinfo_stepper, self->instream);
         return true;
     }

     // Out of terms.  Pin term_num at `size` so repeated calls can't keep
     // growing it, and null out the current state.
     self->term_num = self->size;
     TermStepper_Reset(self->term_stepper);
     TermStepper_Reset(self->tinfo_stepper);
     return false;
 }

 // Step forward with SegLex_Next() until the current term compares greater
 // than or equal to `target` (and iteration has started, i.e. term_num is
 // not -1), or until SegLex_Next() reports that the terms are exhausted.
 // Throws if `target` is not an instance of the current term's class.
 static void
 S_scan_to(SegLexicon *self, Obj *target) {
     // (mildly evil encapsulation violation, since value can be null)
     // NOTE(review): the value is fetched once and reused across iterations,
     // which presumably relies on the stepper updating it in place -- confirm.
     Obj *current = TermStepper_Get_Value(self->term_stepper);

     if (!Obj_Is_A(target, Obj_Get_VTable(current))) {
         THROW(ERR, "Target is a %o, and not comparable to a %o",
               Obj_Get_Class_Name(target), Obj_Get_Class_Name(current));
     }

     for (;;) {
         // Stop once the term text is ge target.
         if (Obj_Compare_To(current, target) >= 0
             && self->term_num != -1
            ) {
             break;
         }
         // Otherwise advance; stop if there is nothing left.
         if (!SegLex_Next(self)) { break; }
     }
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#define C_LUCY_SEGLEXICON
	#include "Lucy/Util/ToolSet.h"

	#include "Lucy/Index/SegLexicon.h"
	#include "Lucy/Index/Segment.h"
	#include "Lucy/Index/PostingList.h"
	#include "Lucy/Index/TermInfo.h"
	#include "Lucy/Index/LexIndex.h"
	#include "Lucy/Index/LexiconWriter.h"
	#include "Lucy/Index/Posting/MatchPosting.h"
	#include "Lucy/Index/SegPostingList.h"
	#include "Lucy/Index/TermStepper.h"
	#include "Lucy/Plan/Architecture.h"
	#include "Lucy/Plan/FieldType.h"
	#include "Lucy/Plan/Schema.h"
	#include "Lucy/Store/Folder.h"
	#include "Lucy/Store/InStream.h"

	// Iterate until the state is greater than or equal to the target.
	// Both arguments are pointers; the definition dereferences `self`.
	static void
	S_scan_to(SegLexicon *self, Obj *target);

	// Constructor: allocate a blank SegLexicon through the SEGLEXICON vtable
	// and initialize it with SegLex_init().  Returns whatever SegLex_init()
	// returns.
	SegLexicon*
	SegLex_new(Schema *schema, Folder *folder, Segment *segment,
	           const CharBuf *field) {
	    SegLexicon *self = (SegLexicon*)VTable_Make_Obj(SEGLEXICON);
	    return SegLex_init(self, schema, folder, segment, field);
	}

	// Initialize a SegLexicon for `field` within `segment`.
	//
	// Reads the segment's "lexicon" metadata, checks its "format" entry
	// against LexWriter_current_file_format, takes the term count for `field`
	// from its "counts" entry, builds a LexIndex, and opens the data file
	// "<seg_name>/lexicon-<field_num>.dat" from `folder`.
	//
	// Throws if "format" or "counts" is missing from the metadata, or if the
	// format number is greater than LexWriter_current_file_format.  If the
	// data file cannot be opened, `self` is released and the error obtained
	// from Err_get_error() is rethrown.
	//
	// Returns `self`.
	SegLexicon*
	SegLex_init(SegLexicon *self, Schema *schema, Folder *folder,
	            Segment *segment, const CharBuf *field) {
	    Hash *metadata = (Hash*)CERTIFY(
	                         Seg_Fetch_Metadata_Str(segment, "lexicon", 7),
	                         HASH);
	    Architecture *arch      = Schema_Get_Architecture(schema);
	    Hash         *counts    = (Hash*)Hash_Fetch_Str(metadata, "counts", 6);
	    Obj          *format    = Hash_Fetch_Str(metadata, "format", 6);
	    CharBuf      *seg_name  = Seg_Get_Name(segment);
	    int32_t       field_num = Seg_Field_Num(segment, field);
	    FieldType    *type      = Schema_Fetch_Type(schema, field);
	    CharBuf      *filename  = NULL;

	    Lex_init((Lexicon*)self, field);

	    // Check format.
	    if (!format) { THROW(ERR, "Missing 'format'"); }
	    else {
	        const int64_t format_num = Obj_To_I64(format);
	        if (format_num > LexWriter_current_file_format) {
	            THROW(ERR, "Unsupported lexicon format: %i64", format_num);
	        }
	    }

	    // Extract count from metadata.
	    if (!counts) { THROW(ERR, "Failed to extract 'counts'"); }
	    else {
	        Obj *count = CERTIFY(Hash_Fetch(counts, (Obj*)field), OBJ);
	        self->size = (int32_t)Obj_To_I64(count);
	    }

	    // Assign.
	    self->segment = (Segment*)INCREF(segment);

	    // Derive.
	    self->lex_index      = LexIndex_new(schema, folder, segment, field);
	    self->field_num      = field_num;
	    self->index_interval = Arch_Index_Interval(arch);
	    self->skip_interval  = Arch_Skip_Interval(arch);

	    // Build the filename only now that it is needed, so that the error
	    // paths above cannot exit with the string still allocated.
	    filename = CB_newf("%o/lexicon-%i32.dat", seg_name, field_num);
	    self->instream = Folder_Open_In(folder, filename);
	    if (!self->instream) {
	        // Grab the error before releasing anything, then clean up.
	        Err *error = (Err*)INCREF(Err_get_error());
	        DECREF(filename);
	        DECREF(self);
	        RETHROW(error);
	    }
	    DECREF(filename);

	    // Define the term_num as "not yet started".
	    self->term_num = -1;

	    // Get steppers.
	    self->term_stepper  = FType_Make_Term_Stepper(type);
	    self->tinfo_stepper = (TermStepper*)MatchTInfoStepper_new(schema);

	    return self;
	}

	// Destructor.  Drops this object's references to the segment (INCREF'd in
	// SegLex_init), both steppers, the LexIndex and the input stream, then
	// chains to the parent class's destructor via SUPER_DESTROY.
	void
	SegLex_destroy(SegLexicon *self) {
	DECREF(self->segment);
	DECREF(self->term_stepper);
	DECREF(self->tinfo_stepper);
	DECREF(self->lex_index);
	DECREF(self->instream);
	SUPER_DESTROY(self, SEGLEXICON);
	}

	// Seek toward `target`.  A NULL target resets the lexicon.  Otherwise the
	// LexIndex supplies a starting term, term info and file position, and
	// S_scan_to() then steps forward from there.
	void
	SegLex_seek(SegLexicon *self, Obj *target) {
	    LexIndex *const lex_index = self->lex_index;

	    // Reset upon null term.
	    if (target == NULL) {
	        SegLex_Reset(self);
	        return;
	    }

	    // Use the LexIndex to get in the ballpark.
	    LexIndex_Seek(lex_index, target);
	    {
	        TermInfo *target_tinfo = LexIndex_Get_Term_Info(lex_index);
	        TermInfo *my_tinfo
	            = (TermInfo*)TermStepper_Get_Value(self->tinfo_stepper);
	        Obj *lex_index_term = Obj_Clone(LexIndex_Get_Term(lex_index));
	        TInfo_Mimic(my_tinfo, (Obj*)target_tinfo);
	        TermStepper_Set_Value(self->term_stepper, lex_index_term);
	        // Release our reference to the clone after handing it over.
	        DECREF(lex_index_term);
	        InStream_Seek(self->instream, TInfo_Get_Lex_FilePos(target_tinfo));
	    }
	    self->term_num = LexIndex_Get_Term_Num(lex_index);

	    // Scan to the precise location.
	    S_scan_to(self, target);
	}

	// Return the lexicon to its "not yet started" state: term_num goes back to
	// -1, the input stream is repositioned to offset 0, and both steppers are
	// reset.
	void
	SegLex_reset(SegLexicon* self) {
	self->term_num = -1;
	InStream_Seek(self->instream, 0);
	TermStepper_Reset(self->term_stepper);
	TermStepper_Reset(self->tinfo_stepper);
	}

	// Accessor for the field number stored by SegLex_init().
	int32_t
	SegLex_get_field_num(SegLexicon *self) {
	    const int32_t field_num = self->field_num;
	    return field_num;
	}

	// Return the term stepper's current value.  No extra reference is taken
	// here.
	Obj*
	SegLex_get_term(SegLexicon *self) {
	    Obj *const term = TermStepper_Get_Value(self->term_stepper);
	    return term;
	}

	// Return the doc freq from the current term info, or 0 when the term-info
	// stepper has no value.
	int32_t
	SegLex_doc_freq(SegLexicon *self) {
	    TermInfo *tinfo = (TermInfo*)TermStepper_Get_Value(self->tinfo_stepper);
	    return tinfo ? TInfo_Get_Doc_Freq(tinfo) : 0;
	}

	// Return the term-info stepper's current value, cast to TermInfo*.  No
	// extra reference is taken here.
	TermInfo*
	SegLex_get_term_info(SegLexicon *self) {
	    TermInfo *const tinfo
	        = (TermInfo*)TermStepper_Get_Value(self->tinfo_stepper);
	    return tinfo;
	}

	// Accessor for the Segment this lexicon holds.  No extra reference is
	// taken here.
	Segment*
	SegLex_get_segment(SegLexicon *self) {
	    Segment *const segment = self->segment;
	    return segment;
	}

	// Advance to the next term.  Returns true after reading one term and its
	// term info from the stream; returns false, with both steppers reset, once
	// term_num has reached `size`.
	bool_t
	SegLex_next(SegLexicon *self) {
	    self->term_num++;

	    if (self->term_num < self->size) {
	        // Still inside the lexicon: decode the next term, then its info.
	        TermStepper_Read_Delta(self->term_stepper, self->instream);
	        TermStepper_Read_Delta(self->tinfo_stepper, self->instream);
	        return true;
	    }

	    // Out of terms.  Pin term_num at `size` so repeated calls can't keep
	    // growing it, and null out the current state.
	    self->term_num = self->size;
	    TermStepper_Reset(self->term_stepper);
	    TermStepper_Reset(self->tinfo_stepper);
	    return false;
	}

	// Step forward with SegLex_Next() until the current term compares greater
	// than or equal to `target` (and iteration has started, i.e. term_num is
	// not -1), or until SegLex_Next() reports that the terms are exhausted.
	// Throws if `target` is not an instance of the current term's class.
	static void
	S_scan_to(SegLexicon *self, Obj *target) {
	    // (mildly evil encapsulation violation, since value can be null)
	    Obj *current = TermStepper_Get_Value(self->term_stepper);
	    if (!Obj_Is_A(target, Obj_Get_VTable(current))) {
	        THROW(ERR, "Target is a %o, and not comparable to a %o",
	              Obj_Get_Class_Name(target), Obj_Get_Class_Name(current));
	    }

	    // Keep looping until the term text is ge target.
	    do {
	        const int32_t comparison = Obj_Compare_To(current, target);
	        if (comparison >= 0 && self->term_num != -1) { break; }
	    } while (SegLex_Next(self));
	}