blob: d0195c8084e5ba85f2c5a4c66b1f205dfe5be798 [file] [log] [blame]
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_LUCY_SEGPOSTINGLIST
#define C_LUCY_POSTING
#define C_LUCY_SKIPSTEPPER
#include "Lucy/Util/ToolSet.h"
#include "Lucy/Index/SegPostingList.h"
#include "Lucy/Index/Posting.h"
#include "Lucy/Index/Posting/RawPosting.h"
#include "Lucy/Index/PostingListReader.h"
#include "Lucy/Index/Segment.h"
#include "Lucy/Index/SkipStepper.h"
#include "Lucy/Index/TermInfo.h"
#include "Lucy/Index/SegLexicon.h"
#include "Lucy/Index/LexiconReader.h"
#include "Lucy/Index/Similarity.h"
#include "Lucy/Plan/Architecture.h"
#include "Lucy/Plan/FieldType.h"
#include "Lucy/Plan/Schema.h"
#include "Lucy/Search/Compiler.h"
#include "Lucy/Search/Matcher.h"
#include "Lucy/Store/InStream.h"
#include "Lucy/Store/Folder.h"
#include "Lucy/Util/MemoryPool.h"
// Low-level seek shared by SegPList_seek() and SegPList_seek_lex(): position
// the posting list using an already-resolved TermInfo.  A NULL `tinfo` is
// accepted and leaves the list with a doc_freq of 0.
static void
S_seek_tinfo(SegPostingList *self, TermInfo *tinfo);
// Allocate a fresh SegPostingList object and hand it to SegPList_init(),
// returning whatever the initializer returns.
SegPostingList*
SegPList_new(PostingListReader *plist_reader, const CharBuf *field) {
    SegPostingList *const fresh
        = (SegPostingList*)VTable_Make_Obj(SEGPOSTINGLIST);
    SegPostingList *const initialized
        = SegPList_init(fresh, plist_reader, field);
    return initialized;
}
/* Initialize a SegPostingList for `field` within the segment served by
 * `plist_reader`.
 *
 * The per-field postings file ("<seg>/postings-<field_num>.dat") and the
 * skip file ("<seg>/postings.skip") are opened only when the postings file
 * exists; otherwise both streams are left NULL and the list behaves as
 * empty.  If either open fails, the temporary path strings and `self` are
 * released and the pending error is rethrown.
 *
 * Returns `self`.
 */
SegPostingList*
SegPList_init(SegPostingList *self, PostingListReader *plist_reader,
              const CharBuf *field) {
    Schema       *const schema   = PListReader_Get_Schema(plist_reader);
    Folder       *const folder   = PListReader_Get_Folder(plist_reader);
    Segment      *const segment  = PListReader_Get_Segment(plist_reader);
    Architecture *const arch     = Schema_Get_Architecture(schema);
    CharBuf      *const seg_name = Seg_Get_Name(segment);
    int32_t field_num            = Seg_Field_Num(segment, field);
    CharBuf *postings_path = CB_newf("%o/postings-%i32.dat",
                                     seg_name, field_num);
    CharBuf *skip_path     = CB_newf("%o/postings.skip", seg_name);

    // Iteration state starts out empty until a seek supplies a term.
    self->doc_freq = 0;
    self->count    = 0;

    // Skip-list state.
    self->skip_stepper = SkipStepper_new();
    self->skip_count   = 0;
    self->num_skips    = 0;

    // Members taken from the arguments.
    self->plist_reader  = (PostingListReader*)INCREF(plist_reader);
    self->field         = CB_Clone(field);
    self->skip_interval = Arch_Skip_Interval(arch);

    // Members derived from the schema and segment.
    Similarity *similarity = Schema_Fetch_Sim(schema, field);
    self->posting   = Sim_Make_Posting(similarity);
    self->field_num = field_num;

    if (!Folder_Exists(folder, postings_path)) {
        // No postings for this field, so there is nothing to open.
        self->post_stream = NULL;
        self->skip_stream = NULL;
    }
    else {
        // The skip stream is only attempted once the main stream is open.
        self->post_stream = Folder_Open_In(folder, postings_path);
        if (self->post_stream != NULL) {
            self->skip_stream = Folder_Open_In(folder, skip_path);
        }

        // Either failure takes the same exit: clean up and rethrow.
        if (self->post_stream == NULL || self->skip_stream == NULL) {
            Err *error = (Err*)INCREF(Err_get_error());
            DECREF(postings_path);
            DECREF(skip_path);
            DECREF(self);
            RETHROW(error);
        }
    }

    DECREF(postings_path);
    DECREF(skip_path);
    return self;
}
/* Release everything owned by the posting list and chain to the parent
 * class destructor.
 *
 * The streams are only touched when `post_stream` was opened.  Within that
 * case `skip_stream` can still be NULL: SegPList_init() opens `post_stream`
 * first, and if opening `skip_stream` then fails it destroys the object with
 * only the main stream set.  Closing is therefore guarded per stream so that
 * no method is invoked on a NULL skip stream.
 */
void
SegPList_destroy(SegPostingList *self) {
    DECREF(self->plist_reader);
    DECREF(self->posting);
    DECREF(self->skip_stepper);
    DECREF(self->field);

    if (self->post_stream != NULL) {
        InStream_Close(self->post_stream);
        if (self->skip_stream != NULL) {
            InStream_Close(self->skip_stream);
        }
        DECREF(self->post_stream);
        DECREF(self->skip_stream);
    }

    SUPER_DESTROY(self, SEGPOSTINGLIST);
}
// Accessor for the Posting object this list reads records into.  The
// pointer is returned as-is, without incrementing its refcount.
Posting*
SegPList_get_posting(SegPostingList *self) {
    Posting *const current = self->posting;
    return current;
}
// Accessor for `doc_freq`, which is 0 after init and is replaced on each
// seek.
uint32_t
SegPList_get_doc_freq(SegPostingList *self) {
    const uint32_t doc_freq = self->doc_freq;
    return doc_freq;
}
// Accessor for the doc id currently held by the list's Posting object.
int32_t
SegPList_get_doc_id(SegPostingList *self) {
    Posting *const current = self->posting;
    return current->doc_id;
}
// Accessor for `count`, the iteration counter that SegPList_next() bumps
// for each record it reads and that a seek resets to 0.
uint32_t
SegPList_get_count(SegPostingList *self) {
    const uint32_t consumed = self->count;
    return consumed;
}
// Accessor for the main postings stream.  This is NULL when the field had
// no postings file at init time.  The pointer is returned without
// incrementing its refcount.
InStream*
SegPList_get_post_stream(SegPostingList *self) {
    InStream *const stream = self->post_stream;
    return stream;
}
/* Step to the next posting.
 *
 * While fewer than `doc_freq` records have been consumed, read one more
 * record from the postings stream into the Posting object and return its
 * doc id.  Once the list is exhausted, reset the Posting and return 0.
 */
int32_t
SegPList_next(SegPostingList *self) {
    Posting *const current = self->posting;

    if (self->count < self->doc_freq) {
        self->count++;
        Post_Read_Record(current, self->post_stream);
        return current->doc_id;
    }

    // Out of docs.
    Post_Reset(current);
    return 0;
}
/* Advance to the first posting whose doc id is greater than or equal to
 * `target`.
 *
 * When the term has at least `skip_interval` docs, skip records are read to
 * find the furthest skip entry lying before `target`, and the postings
 * stream is seeked there if that is ahead of the current position.  The
 * remaining distance is covered by a linear scan with SegPList_Next().
 *
 * Returns the doc id landed on, or 0 if the list is exhausted first.
 */
int32_t
SegPList_advance(SegPostingList *self, int32_t target) {
Posting *posting = self->posting;
const uint32_t skip_interval = self->skip_interval;
// Skip data is only consulted for terms with at least skip_interval docs.
if (self->doc_freq >= skip_interval) {
InStream *post_stream = self->post_stream;
InStream *skip_stream = self->skip_stream;
SkipStepper *const skip_stepper = self->skip_stepper;
// Candidate jump destination; starts as "stay where we are".
uint32_t new_doc_id = skip_stepper->doc_id;
int64_t new_filepos = InStream_Tell(post_stream);
/* Assuming the default skip_interval of 16...
*
* Say we're currently on the 5th doc matching this term, and we get a
* request to skip to the 18th doc matching it. We won't have skipped
* yet, but we'll have already gone past 5 of the 16 skip docs --
* ergo, the modulus in the following formula.
*/
// NOTE(review): the right-hand sides below are computed in unsigned
// arithmetic and then stored into a signed int32_t -- confirm the
// wrap-around conversion is intended on all supported platforms.
int32_t num_skipped = 0 - (self->count % skip_interval);
// Sitting exactly on an interval boundary (but not at the start) means a
// whole interval has already been consumed since the last skip point.
if (num_skipped == 0 && self->count != 0) {
num_skipped = 0 - skip_interval;
}
// See if there's anything to skip.
// NOTE(review): `target` is signed while skip_stepper->doc_id appears to be
// unsigned (it is copied into a uint32_t above) -- confirm negative targets
// cannot reach this comparison.
while (target > skip_stepper->doc_id) {
// The stepper's current entry is still before target: remember it as the
// best jump destination found so far.
new_doc_id = skip_stepper->doc_id;
new_filepos = skip_stepper->filepos;
// Only count an interval as skipped when the entry is a real skip point
// (non-zero doc id) that is not behind the current posting.
if (skip_stepper->doc_id != 0
&& skip_stepper->doc_id >= posting->doc_id
) {
num_skipped += skip_interval;
}
// Stop once every skip record for this term has been read.
if (self->skip_count >= self->num_skips) {
break;
}
SkipStepper_Read_Record(skip_stepper, skip_stream);
self->skip_count++;
}
// If we found something to skip, skip it.
if (new_filepos > InStream_Tell(post_stream)) {
// Move the postings filepointer up.
InStream_Seek(post_stream, new_filepos);
// Jump to the new doc id.
posting->doc_id = new_doc_id;
// Increase count by the number of docs we skipped over.
self->count += num_skipped;
}
}
// Done skipping, so scan.
while (1) {
int32_t doc_id = SegPList_Next(self);
// 0 signals exhaustion; otherwise stop at the first doc at or past target.
if (doc_id == 0 || doc_id >= target) {
return doc_id;
}
}
}
/* Seek the posting list to `target`.
 *
 * The term is resolved to a TermInfo through the LexiconReader obtained
 * from the owning PostingListReader, the list is positioned from that
 * TermInfo (which may be NULL), and the TermInfo is then released.
 */
void
SegPList_seek(SegPostingList *self, Obj *target) {
    LexiconReader *const lexicon_reader
        = PListReader_Get_Lex_Reader(self->plist_reader);
    TermInfo *term_info
        = LexReader_Fetch_Term_Info(lexicon_reader, self->field, target);

    S_seek_tinfo(self, term_info);
    DECREF(term_info);
}
/* Seek the posting list to the term `lexicon` is currently positioned on.
 *
 * Fast path: when `lexicon` is a SegLexicon over the same segment as this
 * posting list, its TermInfo is used directly.  Otherwise the term is
 * fetched from the lexicon and resolved through SegPList_Seek(), which
 * costs an extra LexReader_Fetch_Term_Info() lookup.
 */
void
SegPList_seek_lex(SegPostingList *self, Lexicon *lexicon) {
    if (Obj_Is_A((Obj*)lexicon, SEGLEXICON)) {
        // The cast is only known to be valid once the type check has passed.
        SegLexicon *const seg_lex = (SegLexicon*)lexicon;
        if (SegLex_Get_Segment(seg_lex)
            == PListReader_Get_Segment(self->plist_reader)) {
            S_seek_tinfo(self, SegLex_Get_Term_Info(seg_lex));
            return;
        }
    }

    // Punt: different lexicon class or different segment.
    Obj *current_term = Lex_Get_Term(lexicon);
    SegPList_Seek(self, current_term);
}
/* Position the posting list from an already-resolved TermInfo.
 *
 * `count` is always reset.  With a NULL `tinfo`, `doc_freq` becomes 0 so
 * that Next() reports exhaustion; nothing else is touched.  Otherwise
 * `doc_freq` is copied from the TermInfo, both streams are seeked to the
 * term's file positions, the Posting is reset and the skip state is
 * re-primed.
 */
static void
S_seek_tinfo(SegPostingList *self, TermInfo *tinfo) {
    self->count = 0;

    if (tinfo == NULL) {
        // Unknown term: Next() will return 0; other methods invalid now.
        self->doc_freq = 0;
        return;
    }

    // Main stream.
    int64_t postings_start = TInfo_Get_Post_FilePos(tinfo);
    self->doc_freq = TInfo_Get_Doc_Freq(tinfo);
    InStream_Seek(self->post_stream, postings_start);

    // Start from a clean Posting.
    Post_Reset(self->posting);

    // Skip machinery: one skip entry per full interval of docs.
    self->skip_count = 0;
    self->num_skips  = self->doc_freq / self->skip_interval;
    SkipStepper_Set_ID_And_Filepos(self->skip_stepper, 0, postings_start);
    InStream_Seek(self->skip_stream, TInfo_Get_Skip_FilePos(tinfo));
}
// Build a Matcher for this posting list by delegating to the list's Posting
// object, passing this list (as a PostingList) along with the caller's
// similarity, compiler and scoring flag.
Matcher*
SegPList_make_matcher(SegPostingList *self, Similarity *sim,
                      Compiler *compiler, bool_t need_score) {
    Posting     *const prototype = self->posting;
    PostingList *const plist     = (PostingList*)self;
    return Post_Make_Matcher(prototype, sim, plist, compiler, need_score);
}
// Read one RawPosting from the main postings stream by delegating to the
// list's Posting object; `last_doc_id`, `term_text` and `mem_pool` are
// forwarded unchanged.
RawPosting*
SegPList_read_raw(SegPostingList *self, int32_t last_doc_id, CharBuf *term_text,
                  MemoryPool *mem_pool) {
    Posting  *const reader = self->posting;
    InStream *const source = self->post_stream;
    return Post_Read_Raw(reader, source, last_doc_id, term_text, mem_pool);
}