| /* Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #define C_LUCY_SEGPOSTINGLIST |
| #define C_LUCY_POSTING |
| #define C_LUCY_SKIPSTEPPER |
| #include "Lucy/Util/ToolSet.h" |
| |
| #include "Lucy/Index/SegPostingList.h" |
| #include "Lucy/Index/Posting.h" |
| #include "Lucy/Index/Posting/RawPosting.h" |
| #include "Lucy/Index/PostingListReader.h" |
| #include "Lucy/Index/Segment.h" |
| #include "Lucy/Index/SkipStepper.h" |
| #include "Lucy/Index/TermInfo.h" |
| #include "Lucy/Index/SegLexicon.h" |
| #include "Lucy/Index/LexiconReader.h" |
| #include "Lucy/Index/Similarity.h" |
| #include "Lucy/Plan/Architecture.h" |
| #include "Lucy/Plan/FieldType.h" |
| #include "Lucy/Plan/Schema.h" |
| #include "Lucy/Search/Compiler.h" |
| #include "Lucy/Search/Matcher.h" |
| #include "Lucy/Store/InStream.h" |
| #include "Lucy/Store/Folder.h" |
| #include "Lucy/Util/MemoryPool.h" |
| |
| // Low level seek call. |
| static void |
| S_seek_tinfo(SegPostingList *self, TermInfo *tinfo); |
| |
| SegPostingList* |
| SegPList_new(PostingListReader *plist_reader, const CharBuf *field) { |
| SegPostingList *self = (SegPostingList*)VTable_Make_Obj(SEGPOSTINGLIST); |
| return SegPList_init(self, plist_reader, field); |
| } |
| |
| SegPostingList* |
| SegPList_init(SegPostingList *self, PostingListReader *plist_reader, |
| const CharBuf *field) { |
| Schema *const schema = PListReader_Get_Schema(plist_reader); |
| Folder *const folder = PListReader_Get_Folder(plist_reader); |
| Segment *const segment = PListReader_Get_Segment(plist_reader); |
| Architecture *const arch = Schema_Get_Architecture(schema); |
| CharBuf *const seg_name = Seg_Get_Name(segment); |
| int32_t field_num = Seg_Field_Num(segment, field); |
| CharBuf *post_file = CB_newf("%o/postings-%i32.dat", |
| seg_name, field_num); |
| CharBuf *skip_file = CB_newf("%o/postings.skip", seg_name); |
| |
| // Init. |
| self->doc_freq = 0; |
| self->count = 0; |
| |
| // Init skipping vars. |
| self->skip_stepper = SkipStepper_new(); |
| self->skip_count = 0; |
| self->num_skips = 0; |
| |
| // Assign. |
| self->plist_reader = (PostingListReader*)INCREF(plist_reader); |
| self->field = CB_Clone(field); |
| self->skip_interval = Arch_Skip_Interval(arch); |
| |
| // Derive. |
| Similarity *sim = Schema_Fetch_Sim(schema, field); |
| self->posting = Sim_Make_Posting(sim); |
| self->field_num = field_num; |
| |
| // Open both a main stream and a skip stream if the field exists. |
| if (Folder_Exists(folder, post_file)) { |
| self->post_stream = Folder_Open_In(folder, post_file); |
| if (!self->post_stream) { |
| Err *error = (Err*)INCREF(Err_get_error()); |
| DECREF(post_file); |
| DECREF(skip_file); |
| DECREF(self); |
| RETHROW(error); |
| } |
| self->skip_stream = Folder_Open_In(folder, skip_file); |
| if (!self->skip_stream) { |
| Err *error = (Err*)INCREF(Err_get_error()); |
| DECREF(post_file); |
| DECREF(skip_file); |
| DECREF(self); |
| RETHROW(error); |
| } |
| } |
| else { |
| // Empty, so don't bother with these. |
| self->post_stream = NULL; |
| self->skip_stream = NULL; |
| } |
| DECREF(post_file); |
| DECREF(skip_file); |
| |
| return self; |
| } |
| |
| void |
| SegPList_destroy(SegPostingList *self) { |
| DECREF(self->plist_reader); |
| DECREF(self->posting); |
| DECREF(self->skip_stepper); |
| DECREF(self->field); |
| |
| if (self->post_stream != NULL) { |
| InStream_Close(self->post_stream); |
| InStream_Close(self->skip_stream); |
| DECREF(self->post_stream); |
| DECREF(self->skip_stream); |
| } |
| |
| SUPER_DESTROY(self, SEGPOSTINGLIST); |
| } |
| |
| Posting* |
| SegPList_get_posting(SegPostingList *self) { |
| return self->posting; |
| } |
| |
| uint32_t |
| SegPList_get_doc_freq(SegPostingList *self) { |
| return self->doc_freq; |
| } |
| |
| int32_t |
| SegPList_get_doc_id(SegPostingList *self) { |
| return self->posting->doc_id; |
| } |
| |
| uint32_t |
| SegPList_get_count(SegPostingList *self) { |
| return self->count; |
| } |
| |
| InStream* |
| SegPList_get_post_stream(SegPostingList *self) { |
| return self->post_stream; |
| } |
| |
| int32_t |
| SegPList_next(SegPostingList *self) { |
| InStream *const post_stream = self->post_stream; |
| Posting *const posting = self->posting; |
| |
| // Bail if we're out of docs. |
| if (self->count >= self->doc_freq) { |
| Post_Reset(posting); |
| return 0; |
| } |
| self->count++; |
| |
| Post_Read_Record(posting, post_stream); |
| |
| return posting->doc_id; |
| } |
| |
| int32_t |
| SegPList_advance(SegPostingList *self, int32_t target) { |
| Posting *posting = self->posting; |
| const uint32_t skip_interval = self->skip_interval; |
| |
| if (self->doc_freq >= skip_interval) { |
| InStream *post_stream = self->post_stream; |
| InStream *skip_stream = self->skip_stream; |
| SkipStepper *const skip_stepper = self->skip_stepper; |
| uint32_t new_doc_id = skip_stepper->doc_id; |
| int64_t new_filepos = InStream_Tell(post_stream); |
| |
| /* Assuming the default skip_interval of 16... |
| * |
| * Say we're currently on the 5th doc matching this term, and we get a |
| * request to skip to the 18th doc matching it. We won't have skipped |
| * yet, but we'll have already gone past 5 of the 16 skip docs -- |
| * ergo, the modulus in the following formula. |
| */ |
| int32_t num_skipped = 0 - (self->count % skip_interval); |
| if (num_skipped == 0 && self->count != 0) { |
| num_skipped = 0 - skip_interval; |
| } |
| |
| // See if there's anything to skip. |
| while (target > skip_stepper->doc_id) { |
| new_doc_id = skip_stepper->doc_id; |
| new_filepos = skip_stepper->filepos; |
| |
| if (skip_stepper->doc_id != 0 |
| && skip_stepper->doc_id >= posting->doc_id |
| ) { |
| num_skipped += skip_interval; |
| } |
| |
| if (self->skip_count >= self->num_skips) { |
| break; |
| } |
| |
| SkipStepper_Read_Record(skip_stepper, skip_stream); |
| self->skip_count++; |
| } |
| |
| // If we found something to skip, skip it. |
| if (new_filepos > InStream_Tell(post_stream)) { |
| |
| // Move the postings filepointer up. |
| InStream_Seek(post_stream, new_filepos); |
| |
| // Jump to the new doc id. |
| posting->doc_id = new_doc_id; |
| |
| // Increase count by the number of docs we skipped over. |
| self->count += num_skipped; |
| } |
| } |
| |
| // Done skipping, so scan. |
| while (1) { |
| int32_t doc_id = SegPList_Next(self); |
| if (doc_id == 0 || doc_id >= target) { |
| return doc_id; |
| } |
| } |
| } |
| |
| void |
| SegPList_seek(SegPostingList *self, Obj *target) { |
| LexiconReader *lex_reader = PListReader_Get_Lex_Reader(self->plist_reader); |
| TermInfo *tinfo = LexReader_Fetch_Term_Info(lex_reader, |
| self->field, target); |
| S_seek_tinfo(self, tinfo); |
| DECREF(tinfo); |
| } |
| |
| void |
| SegPList_seek_lex(SegPostingList *self, Lexicon *lexicon) { |
| // Maybe true, maybe not. |
| SegLexicon *const seg_lexicon = (SegLexicon*)lexicon; |
| |
| // Optimized case. |
| if (Obj_Is_A((Obj*)lexicon, SEGLEXICON) |
| && (SegLex_Get_Segment(seg_lexicon) |
| == PListReader_Get_Segment(self->plist_reader)) // i.e. same segment |
| ) { |
| S_seek_tinfo(self, SegLex_Get_Term_Info(seg_lexicon)); |
| } |
| // Punt case. This is more expensive because of the call to |
| // LexReader_Fetch_Term_Info() in Seek(). |
| else { |
| Obj *term = Lex_Get_Term(lexicon); |
| SegPList_Seek(self, term); |
| } |
| } |
| |
| static void |
| S_seek_tinfo(SegPostingList *self, TermInfo *tinfo) { |
| self->count = 0; |
| |
| if (tinfo == NULL) { |
| // Next will return false; other methods invalid now. |
| self->doc_freq = 0; |
| } |
| else { |
| // Transfer doc_freq, seek main stream. |
| int64_t post_filepos = TInfo_Get_Post_FilePos(tinfo); |
| self->doc_freq = TInfo_Get_Doc_Freq(tinfo); |
| InStream_Seek(self->post_stream, post_filepos); |
| |
| // Prepare posting. |
| Post_Reset(self->posting); |
| |
| // Prepare to skip. |
| self->skip_count = 0; |
| self->num_skips = self->doc_freq / self->skip_interval; |
| SkipStepper_Set_ID_And_Filepos(self->skip_stepper, 0, post_filepos); |
| InStream_Seek(self->skip_stream, TInfo_Get_Skip_FilePos(tinfo)); |
| } |
| } |
| |
| Matcher* |
| SegPList_make_matcher(SegPostingList *self, Similarity *sim, |
| Compiler *compiler, bool_t need_score) { |
| return Post_Make_Matcher(self->posting, sim, (PostingList*)self, compiler, |
| need_score); |
| } |
| |
| RawPosting* |
| SegPList_read_raw(SegPostingList *self, int32_t last_doc_id, CharBuf *term_text, |
| MemoryPool *mem_pool) { |
| return Post_Read_Raw(self->posting, self->post_stream, |
| last_doc_id, term_text, mem_pool); |
| } |
| |
| |
| |