blob: 5d91283fcc9129177ac61a76ccab4632ab4c446d [file] [log] [blame]
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_LUCY_MATCHPOSTING
#define C_LUCY_MATCHPOSTINGMATCHER
#define C_LUCY_MATCHPOSTINGWRITER
#define C_LUCY_MATCHTERMINFOSTEPPER
#define C_LUCY_RAWPOSTING
#define C_LUCY_TERMINFO
#define C_LUCY_TOKEN
#include "Lucy/Util/ToolSet.h"
#include "Lucy/Index/Posting/MatchPosting.h"
#include "Lucy/Analysis/Inversion.h"
#include "Lucy/Analysis/Token.h"
#include "Lucy/Index/Posting/RawPosting.h"
#include "Lucy/Index/PostingList.h"
#include "Lucy/Index/PostingPool.h"
#include "Lucy/Index/PolyReader.h"
#include "Lucy/Index/Segment.h"
#include "Lucy/Index/Similarity.h"
#include "Lucy/Index/Snapshot.h"
#include "Lucy/Index/TermInfo.h"
#include "Lucy/Plan/Architecture.h"
#include "Lucy/Plan/FieldType.h"
#include "Lucy/Plan/Schema.h"
#include "Lucy/Search/Compiler.h"
#include "Lucy/Store/Folder.h"
#include "Lucy/Store/InStream.h"
#include "Lucy/Store/OutStream.h"
#include "Lucy/Util/MemoryPool.h"
/* Number of bytes requested from the MemoryPool for a RawPosting whose term
 * text is `_text_len` bytes long: the RawPosting struct itself, the text,
 * and one additional byte.
 *
 * The argument is wrapped in parentheses so that expression arguments
 * (e.g. `a & b`, `c ? x : y`) expand with the intended precedence.
 */
#define MAX_RAW_POSTING_LEN(_text_len) \
    (              sizeof(RawPosting) \
                   + (_text_len) + 1 /* term text content */ \
    )
/* Constructor: obtains a blank object from the MATCHPOSTING VTable and hands
 * it to MatchPost_init() together with `sim`.
 */
MatchPosting*
MatchPost_new(Similarity *sim) {
    MatchPosting *posting = (MatchPosting*)VTable_Make_Obj(MATCHPOSTING);
    return MatchPost_init(posting, sim);
}
/* Initializer: stores a new reference to `sim` on the object, then delegates
 * to Post_init() and returns its result cast to MatchPosting*.
 */
MatchPosting*
MatchPost_init(MatchPosting *self, Similarity *sim) {
    Similarity *retained = (Similarity*)INCREF(sim);
    self->sim = retained;
    return (MatchPosting*)Post_init((Posting*)self);
}
/* Destructor: drops the reference held on the Similarity, then chains to the
 * parent destructor through SUPER_DESTROY.
 */
void
MatchPost_destroy(MatchPosting *self) {
    // Balance the INCREF performed in MatchPost_init().
    DECREF(self->sim);

    SUPER_DESTROY(self, MATCHPOSTING);
}
/* Accessor: returns the value currently stored in the `freq` member. */
int32_t
MatchPost_get_freq(MatchPosting *self) {
    const int32_t current_freq = self->freq;
    return current_freq;
}
/* Sets doc_id back to 0.  MatchPost_read_record() adds each delta onto
 * doc_id, so this restarts that running sum.
 */
void
MatchPost_reset(MatchPosting *self) {
    self->doc_id = 0;
}
/* Reads one posting record from `instream` into `self`.
 *
 * The first C32 value packs two things: its low bit flags a frequency of 1,
 * and the remaining bits (value >> 1) are the delta to add to doc_id.  When
 * the flag bit is clear, the frequency follows as a second C32 value.
 */
void
MatchPost_read_record(MatchPosting *self, InStream *instream) {
    const uint32_t packed = InStream_Read_C32(instream);

    // Advance the running doc id by the encoded delta.
    self->doc_id += packed >> 1;

    // Low bit set => freq is implicitly 1; otherwise it is stored explicitly.
    self->freq = (packed & 1) ? 1 : InStream_Read_C32(instream);
}
/* Decodes one posting record from `instream` and builds a RawPosting for it
 * in memory grabbed from `mem_pool`.
 *
 * The doc id is `last_doc_id` plus the delta held in the upper bits of the
 * first C32 value; the frequency is 1 when that value's low bit is set and is
 * otherwise read as a second C32 value.  The bytes and size of `term_text`
 * are passed through to RawPost_new().  `self` is not used.
 */
RawPosting*
MatchPost_read_raw(MatchPosting *self, InStream *instream, int32_t last_doc_id,
                   CharBuf *term_text, MemoryPool *mem_pool) {
    char *const    term_ptr = (char*)CB_Get_Ptr8(term_text);
    const size_t   term_len = CB_Get_Size(term_text);
    const uint32_t packed   = InStream_Read_C32(instream);
    const int32_t  doc_id   = last_doc_id + (packed >> 1);
    uint32_t       freq     = 1;
    void          *slot;
    UNUSED_VAR(self);

    // A clear low bit means the frequency is stored as its own value.
    if (!(packed & 1)) {
        freq = InStream_Read_C32(instream);
    }

    slot = MemPool_Grab(mem_pool, MAX_RAW_POSTING_LEN(term_len));
    return RawPost_new(slot, doc_id, freq, term_ptr, term_len);
}
/* Feeds one RawPosting per token cluster of `inversion` into `post_pool`.
 *
 * The inversion is reset, then iterated with Inversion_Next_Cluster(), which
 * is also handed `&freq`.  For every cluster a RawPosting is built from the
 * first Token of the cluster (its text and len), `doc_id`, and `freq`, using
 * memory grabbed from the pool's MemoryPool, and is passed to
 * PostPool_Feed().
 *
 * `self`, `type`, `doc_boost` and `length_norm` are not used.
 */
void
MatchPost_add_inversion_to_pool(MatchPosting *self, PostingPool *post_pool,
                                Inversion *inversion, FieldType *type,
                                int32_t doc_id, float doc_boost,
                                float length_norm) {
    MemoryPool  *mem_pool = PostPool_Get_Mem_Pool(post_pool);
    Token      **tokens;
    uint32_t     freq;
    UNUSED_VAR(self);
    UNUSED_VAR(type);
    UNUSED_VAR(doc_boost);
    UNUSED_VAR(length_norm);

    Inversion_Reset(inversion);
    while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token *token = *tokens;
        // The macro yields a size_t (it is built on sizeof); keep it as a
        // size_t, matching MatchPost_read_raw(), rather than narrowing it.
        const size_t raw_post_bytes = MAX_RAW_POSTING_LEN(token->len);
        RawPosting *raw_posting
            = RawPost_new(MemPool_Grab(mem_pool, raw_post_bytes), doc_id,
                          freq, token->text, token->len);
        PostPool_Feed(post_pool, &raw_posting);
    }
}
/* Factory: creates a MatchPostingMatcher from its VTable and initializes it
 * with `sim`, `plist` and `compiler`.  `self` and `need_score` are not used.
 */
MatchPostingMatcher*
MatchPost_make_matcher(MatchPosting *self, Similarity *sim,
                       PostingList *plist, Compiler *compiler,
                       bool_t need_score) {
    MatchPostingMatcher *fresh
        = (MatchPostingMatcher*)VTable_Make_Obj(MATCHPOSTINGMATCHER);
    UNUSED_VAR(need_score);
    UNUSED_VAR(self);
    return MatchPostMatcher_init(fresh, sim, plist, compiler);
}
/***************************************************************************/
/* Initializer: all setup is delegated to TermMatcher_init(); returns `self`.
 */
MatchPostingMatcher*
MatchPostMatcher_init(MatchPostingMatcher *self, Similarity *sim,
                      PostingList *plist, Compiler *compiler) {
    // No state of its own to set up beyond what the parent initializer does.
    TermMatcher_init((TermMatcher*)self, sim, plist, compiler);

    return self;
}
/* Returns the matcher's `weight` member as the score; nothing from the
 * current posting is consulted.
 */
float
MatchPostMatcher_score(MatchPostingMatcher* self) {
    const float score = self->weight;
    return score;
}
/***************************************************************************/
/* Constructor: obtains a blank object from the MATCHPOSTINGWRITER VTable and
 * forwards every argument to MatchPostWriter_init().
 */
MatchPostingWriter*
MatchPostWriter_new(Schema *schema, Snapshot *snapshot, Segment *segment,
                    PolyReader *polyreader, int32_t field_num) {
    MatchPostingWriter *writer
        = (MatchPostingWriter*)VTable_Make_Obj(MATCHPOSTINGWRITER);
    return MatchPostWriter_init(writer, schema, snapshot, segment, polyreader,
                                field_num);
}
/* Initializer: runs the parent PostWriter_init(), then opens the output
 * stream "<segment name>/postings-<field_num>.dat" in the PolyReader's
 * Folder and stores it in `self->outstream`.
 *
 * If Folder_Open_Out() yields no stream, the error reported by
 * Err_get_error() is rethrown.  Returns `self` on success.
 */
MatchPostingWriter*
MatchPostWriter_init(MatchPostingWriter *self, Schema *schema,
                     Snapshot *snapshot, Segment *segment,
                     PolyReader *polyreader, int32_t field_num) {
    Folder  *folder = PolyReader_Get_Folder(polyreader);
    CharBuf *filename
        = CB_newf("%o/postings-%i32.dat", Seg_Get_Name(segment), field_num);

    PostWriter_init((PostingWriter*)self, schema, snapshot, segment,
                    polyreader, field_num);
    self->outstream = Folder_Open_Out(folder, filename);

    // The filename is no longer needed once the open has been attempted.
    // Release it before the error check so it is not leaked if RETHROW
    // transfers control out of this function.
    DECREF(filename);

    if (!self->outstream) { RETHROW(INCREF(Err_get_error())); }
    return self;
}
/* Destructor: drops the reference held on the output stream, then chains to
 * the parent destructor through SUPER_DESTROY.
 */
void
MatchPostWriter_destroy(MatchPostingWriter *self) {
    // Give up our hold on the stream opened in MatchPostWriter_init().
    DECREF(self->outstream);

    SUPER_DESTROY(self, MATCHPOSTINGWRITER);
}
/* Writes one RawPosting to the writer's output stream.
 *
 * The doc id is stored as a delta from `self->last_doc_id`, shifted left by
 * one bit.  When the posting's freq is 1 the low bit of that value is set and
 * nothing more is written for the frequency; otherwise the low bit is clear
 * and freq follows as a second C32 value.  Afterwards, `aux_len` bytes
 * located `content_len` bytes into the posting's blob are appended, and
 * `last_doc_id` is updated to this posting's doc id.
 */
void
MatchPostWriter_write_posting(MatchPostingWriter *self, RawPosting *posting) {
    OutStream *const out         = self->outstream;
    const int32_t    doc_id      = posting->doc_id;
    const uint32_t   delta_doc   = doc_id - self->last_doc_id;
    const uint32_t   shifted     = delta_doc << 1;
    char *const      aux_start   = posting->blob + posting->content_len;
    const int        single_freq = (posting->freq == 1);

    // Doc code: delta in the upper bits, "freq is 1" flag in the low bit.
    OutStream_Write_C32(out, single_freq ? (shifted | 1) : shifted);
    if (!single_freq) {
        OutStream_Write_C32(out, posting->freq);
    }

    OutStream_Write_Bytes(out, aux_start, posting->aux_len);

    // Remember this doc id as the base for the next delta.
    self->last_doc_id = doc_id;
}
/* Prepares for writing a new term's postings: records the output stream's
 * current position in `tinfo->post_filepos` and restarts doc id delta
 * encoding from 0.
 */
void
MatchPostWriter_start_term(MatchPostingWriter *self, TermInfo *tinfo) {
    // Where this term's postings begin in the output stream.
    tinfo->post_filepos = OutStream_Tell(self->outstream);

    // Deltas written by MatchPostWriter_write_posting() are relative to this.
    self->last_doc_id = 0;
}
/* Stores the output stream's current position in `tinfo->post_filepos`. */
void
MatchPostWriter_update_skip_info(MatchPostingWriter *self, TermInfo *tinfo) {
    OutStream *const out = self->outstream;
    tinfo->post_filepos = OutStream_Tell(out);
}
/***************************************************************************/
/* Constructor: obtains a blank object from the MATCHTERMINFOSTEPPER VTable
 * and initializes it with `schema`.
 */
MatchTermInfoStepper*
MatchTInfoStepper_new(Schema *schema) {
    MatchTermInfoStepper *stepper
        = (MatchTermInfoStepper*)VTable_Make_Obj(MATCHTERMINFOSTEPPER);
    return MatchTInfoStepper_init(stepper, schema);
}
/* Initializer: runs TermStepper_init(), installs a new TermInfo (created with
 * TInfo_new(0)) as the stepper's `value`, and copies the skip interval from
 * the schema's Architecture.  Returns `self`.
 */
MatchTermInfoStepper*
MatchTInfoStepper_init(MatchTermInfoStepper *self, Schema *schema) {
    Architecture *architecture = Schema_Get_Architecture(schema);

    TermStepper_init((TermStepper*)self);

    // The stepper's value is the TermInfo that the read/write routines
    // update in place.
    self->value         = (Obj*)TInfo_new(0);
    self->skip_interval = Arch_Skip_Interval(architecture);

    return self;
}
/* Resets the TermInfo held in the stepper's `value` via TInfo_Reset(). */
void
MatchTInfoStepper_reset(MatchTermInfoStepper *self) {
    TermInfo *const current = (TermInfo*)self->value;
    TInfo_Reset(current);
}
/* Writes `value` (which must be a TermInfo) to `outstream` as a key frame.
 *
 * Layout: doc_freq as C32, post_filepos as C64, and -- only when doc_freq is
 * at least the skip interval -- skip_filepos as C64.  The stepper's own
 * TermInfo is then made to mimic `value`.
 */
void
MatchTInfoStepper_write_key_frame(MatchTermInfoStepper *self,
                                  OutStream *outstream, Obj *value) {
    TermInfo *const incoming  = (TermInfo*)CERTIFY(value, TERMINFO);
    const int32_t   doc_freq  = TInfo_Get_Doc_Freq(incoming);
    const int       has_skip  = (doc_freq >= self->skip_interval);

    // Document frequency, then the absolute postings file pointer.
    OutStream_Write_C32(outstream, doc_freq);
    OutStream_Write_C64(outstream, incoming->post_filepos);

    // Skip file pointer is present only for sufficiently frequent terms.
    if (has_skip) {
        OutStream_Write_C64(outstream, incoming->skip_filepos);
    }

    // Remember what was written as the stepper's current value.
    TInfo_Mimic((TermInfo*)self->value, (Obj*)incoming);
}
/* Writes `value` (which must be a TermInfo) to `outstream` relative to the
 * stepper's current TermInfo.
 *
 * Layout: doc_freq as C32, the difference between the new and the current
 * post_filepos as C64, and -- only when doc_freq is at least the skip
 * interval -- skip_filepos (not a delta) as C64.  The stepper's own TermInfo
 * is then made to mimic `value`.
 */
void
MatchTInfoStepper_write_delta(MatchTermInfoStepper *self,
                              OutStream *outstream, Obj *value) {
    TermInfo *const incoming = (TermInfo*)CERTIFY(value, TERMINFO);
    TermInfo *const previous = (TermInfo*)self->value;
    const int32_t   doc_freq = TInfo_Get_Doc_Freq(incoming);
    const int64_t   post_delta
        = incoming->post_filepos - previous->post_filepos;
    const int       has_skip = (doc_freq >= self->skip_interval);

    // Document frequency, then the postings file pointer as a delta.
    OutStream_Write_C32(outstream, doc_freq);
    OutStream_Write_C64(outstream, post_delta);

    // Skip file pointer is present only for sufficiently frequent terms.
    if (has_skip) {
        OutStream_Write_C64(outstream, incoming->skip_filepos);
    }

    // The value just written becomes the base for the next delta.
    TInfo_Mimic(previous, (Obj*)incoming);
}
/* Reads a key frame from `instream` into the stepper's TermInfo.
 *
 * Mirrors MatchTInfoStepper_write_key_frame(): doc_freq (C32), post_filepos
 * (C64), then skip_filepos (C64) only when doc_freq is at least the skip
 * interval; otherwise skip_filepos is set to 0.
 */
void
MatchTInfoStepper_read_key_frame(MatchTermInfoStepper *self,
                                 InStream *instream) {
    TermInfo *const current = (TermInfo*)self->value;

    current->doc_freq     = InStream_Read_C32(instream);
    current->post_filepos = InStream_Read_C64(instream);

    // The skip pointer is stored only for sufficiently frequent terms.
    current->skip_filepos = (current->doc_freq >= self->skip_interval)
                            ? InStream_Read_C64(instream)
                            : 0;
}
/* Reads a delta record from `instream` and applies it to the stepper's
 * TermInfo.
 *
 * Mirrors MatchTInfoStepper_write_delta(): doc_freq (C32) replaces the
 * current value, the next C64 is added onto post_filepos, and skip_filepos
 * (C64) is read only when doc_freq is at least the skip interval; otherwise
 * skip_filepos is set to 0.
 */
void
MatchTInfoStepper_read_delta(MatchTermInfoStepper *self, InStream *instream) {
    TermInfo *const current = (TermInfo*)self->value;

    current->doc_freq      = InStream_Read_C32(instream);
    current->post_filepos += InStream_Read_C64(instream);

    // The skip pointer is stored only for sufficiently frequent terms.
    current->skip_filepos = (current->doc_freq >= self->skip_interval)
                            ? InStream_Read_C64(instream)
                            : 0;
}