blob: dca9adfcbbdddf66c328bd42f52b81faf46b40c3 [file] [log] [blame]
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_LUCY_LEXINDEX
#define C_LUCY_TERMINFO
#include "Lucy/Util/ToolSet.h"
#include "Lucy/Index/LexIndex.h"
#include "Lucy/Index/Segment.h"
#include "Lucy/Index/TermInfo.h"
#include "Lucy/Index/TermStepper.h"
#include "Lucy/Plan/Architecture.h"
#include "Lucy/Plan/FieldType.h"
#include "Lucy/Plan/Schema.h"
#include "Lucy/Store/Folder.h"
#include "Lucy/Store/InStream.h"
// Read the data we've arrived at after a seek operation.
//
// Forward declaration: the definition appears later in this file.  It
// loads the index entry selected by self->tick into self->term_stepper
// and self->tinfo.
static void
S_read_entry(LexIndex *self);
// Constructor: allocate a blank LEXINDEX object and pass it straight to
// LexIndex_init() along with the caller's arguments.  Returns whatever
// LexIndex_init() returns.
LexIndex*
LexIndex_new(Schema *schema, Folder *folder, Segment *segment,
             const CharBuf *field) {
    return LexIndex_init((LexIndex*)VTable_Make_Obj(LEXINDEX), schema,
                         folder, segment, field);
}
// Initialize a LexIndex for `field` within `segment`.
//
// Opens "<seg_name>/lexicon-<field_num>.ixix" and
// "<seg_name>/lexicon-<field_num>.ix" through `folder`, looks up the
// field's type in `schema`, and caches the index/skip intervals from the
// schema's Architecture.
//
// Throws if `field` has no type in `schema`; rethrows the current error
// (Err_get_error()) if either file cannot be opened.  On every failure
// path the temporary filenames and `self` are released before throwing.
//
// Returns `self`.
LexIndex*
LexIndex_init(LexIndex *self, Schema *schema, Folder *folder,
              Segment *segment, const CharBuf *field) {
    int32_t       field_num = Seg_Field_Num(segment, field);
    CharBuf      *seg_name  = Seg_Get_Name(segment);
    CharBuf      *ixix_path
        = CB_newf("%o/lexicon-%i32.ixix", seg_name, field_num);
    CharBuf      *ix_path
        = CB_newf("%o/lexicon-%i32.ix", seg_name, field_num);
    Architecture *architecture = Schema_Get_Architecture(schema);

    // Base-class init and starting state.
    Lex_init((Lexicon*)self, field);
    self->tinfo = TInfo_new(0);
    self->tick  = 0;

    // Resolve the field type; an unknown field is fatal.
    self->field_type = Schema_Fetch_Type(schema, field);
    if (self->field_type == NULL) {
        CharBuf *message = MAKE_MESS("Unknown field: '%o'", field);
        DECREF(ix_path);
        DECREF(ixix_path);
        DECREF(self);
        Err_throw_mess(ERR, message);
    }
    // Take our own reference on the fetched type; LexIndex_destroy()
    // releases it.
    INCREF(self->field_type);
    self->term_stepper = FType_Make_Term_Stepper(self->field_type);

    // Open the .ixix stream.
    self->ixix_in = Folder_Open_In(folder, ixix_path);
    if (self->ixix_in == NULL) {
        Err *open_err = (Err*)INCREF(Err_get_error());
        DECREF(ix_path);
        DECREF(ixix_path);
        DECREF(self);
        RETHROW(open_err);
    }

    // Open the .ix stream.
    self->ix_in = Folder_Open_In(folder, ix_path);
    if (self->ix_in == NULL) {
        Err *open_err = (Err*)INCREF(Err_get_error());
        DECREF(ix_path);
        DECREF(ixix_path);
        DECREF(self);
        RETHROW(open_err);
    }

    self->index_interval = Arch_Index_Interval(architecture);
    self->skip_interval  = Arch_Skip_Interval(architecture);

    // The entry count is the .ixix length divided by sizeof(int64_t);
    // `offsets` points at the buffer InStream_Buf() returns for the whole
    // stream length.
    const int64_t ixix_len = InStream_Length(self->ixix_in);
    self->size    = (int32_t)(ixix_len / sizeof(int64_t));
    self->offsets = (int64_t*)InStream_Buf(self->ixix_in, (size_t)ixix_len);

    // The filenames were only needed to open the streams.
    DECREF(ixix_path);
    DECREF(ix_path);

    return self;
}
// Destructor: release every object reference held in the members below,
// then hand off to the parent class's destructor via SUPER_DESTROY.
// self->offsets is not released here.
void
LexIndex_destroy(LexIndex *self) {
DECREF(self->field_type);
DECREF(self->ixix_in);
DECREF(self->ix_in);
DECREF(self->term_stepper);
DECREF(self->tinfo);
// Must come last: the members above are read from `self`.
SUPER_DESTROY(self, LEXINDEX);
}
// Return the term number for the current position, computed as
// (index_interval * tick) - 1.  With tick == 0 this yields -1.
int32_t
LexIndex_get_term_num(LexIndex *self) {
    const int32_t term_num = (self->index_interval * self->tick) - 1;
    return term_num;
}
// Return the term stepper's current value, i.e. whatever
// TermStepper_Get_Value() reports for self->term_stepper.
Obj*
LexIndex_get_term(LexIndex *self) {
return TermStepper_Get_Value(self->term_stepper);
}
// Return the TermInfo held in self->tinfo.  The pointer is returned
// as-is; no INCREF is performed here.
TermInfo*
LexIndex_get_term_info(LexIndex *self) {
return self->tinfo;
}
// Load the index entry selected by self->tick.
//
// The entry's file position is decoded as a big-endian u64 from
// self->offsets[self->tick].  After seeking self->ix_in there, the key
// frame is read into self->term_stepper and the remaining fields into
// self->tinfo.
static void
S_read_entry(LexIndex *self) {
    InStream *instream = self->ix_in;
    TermInfo *info     = self->tinfo;
    int64_t   filepos
        = (int64_t)NumUtil_decode_bigend_u64(self->offsets + self->tick);

    InStream_Seek(instream, filepos);
    TermStepper_Read_Key_Frame(self->term_stepper, instream);

    info->doc_freq     = InStream_Read_C32(instream);
    info->post_filepos = InStream_Read_C64(instream);

    // A skip file position is read only when doc_freq has reached
    // skip_interval; otherwise it is set to 0.
    if (info->doc_freq >= self->skip_interval) {
        info->skip_filepos = InStream_Read_C64(instream);
    }
    else {
        info->skip_filepos = 0;
    }

    info->lex_filepos = InStream_Read_C64(instream);
}
// Position the index at `target` using a binary search over the entries.
//
// - If `target` is NULL or the index is empty, tick is reset to 0 and no
//   entry is read.
// - Throws if `target` is not a CHARBUF.
// - Otherwise tick becomes the index of an exactly matching entry; if
//   there is no exact match, it becomes the upper bound left by the
//   search, or 0 when that bound has dropped to -1 (target sorts before
//   the first entry).  The chosen entry is then loaded via S_read_entry().
void
LexIndex_seek(LexIndex *self, Obj *target) {
    TermStepper *stepper    = self->term_stepper;
    InStream    *instream   = self->ix_in;
    FieldType   *field_type = self->field_type;
    int32_t      low        = 0;
    int32_t      high       = self->size - 1;
    int32_t      match      = -100;  // Sentinel: no exact match found.

    // Nothing to search for, or nothing to search in.
    if (target == NULL || self->size == 0) {
        self->tick = 0;
        return;
    }

    if (!Obj_Is_A(target, CHARBUF)) {
        THROW(ERR, "Target is a %o, and not comparable to a %o",
              Obj_Get_Class_Name(target), VTable_Get_Name(CHARBUF));
    }
    // TODO: compare the target's class against the class of the first
    // stored term rather than hard-coding CHARBUF.

    // Divide and conquer.
    while (low <= high) {
        const int32_t mid = low + ((high - low) / 2);
        const int64_t filepos
            = (int64_t)NumUtil_decode_bigend_u64(self->offsets + mid);
        InStream_Seek(instream, filepos);
        TermStepper_Read_Key_Frame(stepper, instream);

        // Compare values.  There is no need for a NULL-check because the
        // term number is always between 0 and self->size - 1.
        Obj *candidate = TermStepper_Get_Value(stepper);
        int32_t comparison
            = FType_Compare_Values(field_type, target, candidate);
        if (comparison == 0) {
            match = mid;
            break;
        }
        if (comparison < 0) {
            high = mid - 1;
        }
        else {
            low = mid + 1;
        }
    }

    // Record the index of the entry we've seeked to, then read entry.
    if (high == -1) {
        // Target is less than the first entry.
        self->tick = 0;
    }
    else if (match == -100) {
        // The sentinel is untouched, so there was no exact match.
        self->tick = high;
    }
    else {
        self->tick = match;
    }
    S_read_entry(self);
}