blob: 43087aad1b9663609842c72bb1b04d6bb39f37ca [file] [log] [blame]
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_LUCY_SEGLEXICON
#include "Lucy/Util/ToolSet.h"
#include "Lucy/Index/SegLexicon.h"
#include "Lucy/Index/Segment.h"
#include "Lucy/Index/PostingList.h"
#include "Lucy/Index/TermInfo.h"
#include "Lucy/Index/LexIndex.h"
#include "Lucy/Index/LexiconWriter.h"
#include "Lucy/Index/Posting/MatchPosting.h"
#include "Lucy/Index/SegPostingList.h"
#include "Lucy/Index/TermStepper.h"
#include "Lucy/Plan/Architecture.h"
#include "Lucy/Plan/FieldType.h"
#include "Lucy/Plan/Schema.h"
#include "Lucy/Store/Folder.h"
#include "Lucy/Store/InStream.h"
// Advance the lexicon one term at a time until its current term compares
// greater than or equal to `target`, or until the terms run out.
static void
S_scan_to(SegLexicon *self, Obj *target);
// Constructor: allocate a blank SEGLEXICON object and pass it, together with
// all arguments, to SegLex_init().  Returns whatever SegLex_init() returns.
SegLexicon*
SegLex_new(Schema *schema, Folder *folder, Segment *segment,
           const CharBuf *field) {
    SegLexicon *lexicon = (SegLexicon*)VTable_Make_Obj(SEGLEXICON);
    lexicon = SegLex_init(lexicon, schema, folder, segment, field);
    return lexicon;
}
// Initialize a SegLexicon for `field` within `segment`.
//
// Reads the segment's "lexicon" metadata, checks that its "format" is not
// newer than LexWriter_current_file_format, takes the term count for `field`
// from the "counts" hash, builds a LexIndex, and opens the per-field data
// file "<segment name>/lexicon-<field number>.dat" from `folder`.
//
// Throws if "format" or "counts" is missing from the metadata or if the
// format is unsupported.  If the data file cannot be opened, `self` is
// released and the error reported by Err_get_error() is rethrown.
//
// Returns `self`.
SegLexicon*
SegLex_init(SegLexicon *self, Schema *schema, Folder *folder,
            Segment *segment, const CharBuf *field) {
    Hash *metadata = (Hash*)CERTIFY(
                         Seg_Fetch_Metadata_Str(segment, "lexicon", 7),
                         HASH);
    Architecture *arch      = Schema_Get_Architecture(schema);
    Hash         *counts    = (Hash*)Hash_Fetch_Str(metadata, "counts", 6);
    Obj          *format    = Hash_Fetch_Str(metadata, "format", 6);
    CharBuf      *seg_name  = Seg_Get_Name(segment);
    int32_t       field_num = Seg_Field_Num(segment, field);
    FieldType    *type      = Schema_Fetch_Type(schema, field);
    CharBuf      *filename  = NULL;

    Lex_init((Lexicon*)self, field);

    // Check format.
    if (!format) { THROW(ERR, "Missing 'format'"); }
    else {
        const int64_t format_val = Obj_To_I64(format);
        if (format_val > LexWriter_current_file_format) {
            THROW(ERR, "Unsupported lexicon format: %i64", format_val);
        }
    }

    // Extract count from metadata.
    if (!counts) { THROW(ERR, "Failed to extract 'counts'"); }
    else {
        Obj *count = CERTIFY(Hash_Fetch(counts, (Obj*)field), OBJ);
        self->size = (int32_t)Obj_To_I64(count);
    }

    // Assign.
    self->segment = (Segment*)INCREF(segment);

    // Derive.
    self->lex_index      = LexIndex_new(schema, folder, segment, field);
    self->field_num      = field_num;
    self->index_interval = Arch_Index_Interval(arch);
    self->skip_interval  = Arch_Skip_Interval(arch);

    // Build the filename only now, after every validation step above has
    // passed, so that a validation failure cannot leak the CharBuf.
    filename = CB_newf("%o/lexicon-%i32.dat", seg_name, field_num);
    self->instream = Folder_Open_In(folder, filename);
    if (!self->instream) {
        // Hold on to the error before releasing anything else.
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(filename);
        DECREF(self);
        RETHROW(error);
    }
    DECREF(filename);

    // Define the term_num as "not yet started".
    self->term_num = -1;

    // Get steppers.
    self->term_stepper  = FType_Make_Term_Stepper(type);
    self->tinfo_stepper = (TermStepper*)MatchTInfoStepper_new(schema);

    return self;
}
// Destructor: drop the references held on both steppers, the LexIndex, the
// input stream and the Segment, then chain to the parent class destructor.
void
SegLex_destroy(SegLexicon *self) {
    // Steppers first, then the index and stream, and the segment last.
    DECREF(self->term_stepper);
    DECREF(self->tinfo_stepper);
    DECREF(self->lex_index);
    DECREF(self->instream);
    DECREF(self->segment);
    SUPER_DESTROY(self, SEGLEXICON);
}
// Position the lexicon at the first term that compares greater than or equal
// to `target`.  A NULL `target` resets the lexicon instead.
void
SegLex_seek(SegLexicon *self, Obj *target) {
    LexIndex *const index = self->lex_index;
    TermInfo *index_tinfo;
    TermInfo *own_tinfo;
    Obj      *index_term_copy;

    // A null term means "start over".
    if (target == NULL) {
        SegLex_Reset(self);
        return;
    }

    // Let the LexIndex find a nearby starting point.
    LexIndex_Seek(index, target);

    // Copy the index's term info and term into our own steppers, and move
    // the stream to the file position recorded in the index's term info.
    index_tinfo     = LexIndex_Get_Term_Info(index);
    own_tinfo       = (TermInfo*)TermStepper_Get_Value(self->tinfo_stepper);
    index_term_copy = Obj_Clone(LexIndex_Get_Term(index));
    TInfo_Mimic(own_tinfo, (Obj*)index_tinfo);
    TermStepper_Set_Value(self->term_stepper, index_term_copy);
    DECREF(index_term_copy);
    InStream_Seek(self->instream, TInfo_Get_Lex_FilePos(index_tinfo));
    self->term_num = LexIndex_Get_Term_Num(index);

    // Walk forward from there to the exact term.
    S_scan_to(self, target);
}
// Return the lexicon to its "not yet started" state: term_num becomes -1,
// the input stream is rewound to offset 0, and both steppers are reset.
void
SegLex_reset(SegLexicon* self) {
    self->term_num = -1;
    InStream_Seek(self->instream, 0);

    // The two steppers are separate objects, so they can be reset in
    // either order.
    TermStepper_Reset(self->tinfo_stepper);
    TermStepper_Reset(self->term_stepper);
}
// Accessor for the field number stored on this lexicon.
int32_t
SegLex_get_field_num(SegLexicon *self) {
    const int32_t field_num = self->field_num;
    return field_num;
}
// Return the term stepper's current value, i.e. the current term.  No new
// reference is taken here.
Obj*
SegLex_get_term(SegLexicon *self) {
    Obj *term = TermStepper_Get_Value(self->term_stepper);
    return term;
}
// Return the doc freq recorded in the current TermInfo, or 0 when the
// term-info stepper has no value.
int32_t
SegLex_doc_freq(SegLexicon *self) {
    TermInfo *info = (TermInfo*)TermStepper_Get_Value(self->tinfo_stepper);
    if (!info) {
        return 0;
    }
    return TInfo_Get_Doc_Freq(info);
}
// Return the term-info stepper's current value, cast to TermInfo*.  No new
// reference is taken here.
TermInfo*
SegLex_get_term_info(SegLexicon *self) {
    TermInfo *info = (TermInfo*)TermStepper_Get_Value(self->tinfo_stepper);
    return info;
}
// Accessor for the Segment this lexicon was initialized with.  No new
// reference is taken here.
Segment*
SegLex_get_segment(SegLexicon *self) {
    Segment *segment = self->segment;
    return segment;
}
// Advance to the next term.
//
// Returns true after reading the next term and term-info deltas from the
// input stream.  Returns false once term_num reaches `size`; in that case
// both steppers are reset and term_num is pinned at `size`.
bool_t
SegLex_next(SegLexicon *self) {
    self->term_num++;

    if (self->term_num < self->size) {
        // Still inside the lexicon: decode the next term and its term info.
        TermStepper_Read_Delta(self->term_stepper, self->instream);
        TermStepper_Read_Delta(self->tinfo_stepper, self->instream);
        return true;
    }

    // Out of terms.  Clamp the counter so repeated calls don't keep
    // growing it, and clear the steppers.
    self->term_num = self->size;
    TermStepper_Reset(self->term_stepper);
    TermStepper_Reset(self->tinfo_stepper);
    return false;
}
// Step forward with SegLex_Next() until the current term compares greater
// than or equal to `target`, or until SegLex_Next() reports no more terms.
//
// Throws if `target` is not an instance of the current term's class.
static void
S_scan_to(SegLexicon *self, Obj *target) {
    // (mildly evil encapsulation violation, since value can be null)
    // The stepper's value is fetched once and reused for every comparison.
    // NOTE(review): this presumably relies on the stepper updating that
    // object in place during SegLex_Next() -- confirm.
    Obj *term = TermStepper_Get_Value(self->term_stepper);

    if (!Obj_Is_A(target, Obj_Get_VTable(term))) {
        THROW(ERR, "Target is a %o, and not comparable to a %o",
              Obj_Get_Class_Name(target), Obj_Get_Class_Name(term));
    }

    for (;;) {
        const int32_t order = Obj_Compare_To(term, target);

        // A term_num of -1 never satisfies the stop condition, so at least
        // one SegLex_Next() call is made from that state.
        if (self->term_num != -1 && order >= 0) {
            return;
        }
        if (!SegLex_Next(self)) {
            return;
        }
    }
}