/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_LUCY_TERMQUERY
#define C_LUCY_TERMCOMPILER
#include "Lucy/Util/ToolSet.h"
#include "Lucy/Search/TermQuery.h"
#include "Lucy/Index/DocVector.h"
#include "Lucy/Index/SegReader.h"
#include "Lucy/Index/PostingList.h"
#include "Lucy/Index/PostingListReader.h"
#include "Lucy/Index/Similarity.h"
#include "Lucy/Index/TermVector.h"
#include "Lucy/Plan/Schema.h"
#include "Lucy/Search/Compiler.h"
#include "Lucy/Search/Searcher.h"
#include "Lucy/Search/Span.h"
#include "Lucy/Search/TermMatcher.h"
#include "Lucy/Store/InStream.h"
#include "Lucy/Store/OutStream.h"
#include "Lucy/Util/Freezer.h"
TermQuery*
TermQuery_new(const CharBuf *field, const Obj *term) {
TermQuery *self = (TermQuery*)VTable_Make_Obj(TERMQUERY);
return TermQuery_init(self, field, term);
}
TermQuery*
TermQuery_init(TermQuery *self, const CharBuf *field, const Obj *term) {
Query_init((Query*)self, 1.0f);
self->field = CB_Clone(field);
self->term = Obj_Clone(term);
return self;
}
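/* Illustrative usage (a sketch, not part of the shipped API docs): a
 * TermQuery is built from a field name and a term value, then handed to a
 * Searcher.  The locals below are hypothetical, and the search call itself
 * is omitted.  Since TermQuery_init clones both arguments, the caller keeps
 * ownership of its own refcounts.
 *
 *     CharBuf   *field = CB_newf("content");
 *     Obj       *term  = (Obj*)CB_newf("foo");
 *     TermQuery *query = TermQuery_new(field, term);
 *     ...
 *     DECREF(query);
 *     DECREF(term);
 *     DECREF(field);
 */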
void
TermQuery_destroy(TermQuery *self) {
DECREF(self->field);
DECREF(self->term);
SUPER_DESTROY(self, TERMQUERY);
}
void
TermQuery_serialize(TermQuery *self, OutStream *outstream) {
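    // Write order -- field, frozen term, boost -- must mirror the read
    // order in TermQuery_deserialize below.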
CB_Serialize(self->field, outstream);
FREEZE(self->term, outstream);
OutStream_Write_F32(outstream, self->boost);
}
TermQuery*
TermQuery_deserialize(TermQuery *self, InStream *instream) {
self = self ? self : (TermQuery*)VTable_Make_Obj(TERMQUERY);
self->field = CB_deserialize(NULL, instream);
self->term = (Obj*)THAW(instream);
self->boost = InStream_Read_F32(instream);
return self;
}
CharBuf*
TermQuery_get_field(TermQuery *self) {
return self->field;
}
Obj*
TermQuery_get_term(TermQuery *self) {
return self->term;
}
bool_t
TermQuery_equals(TermQuery *self, Obj *other) {
TermQuery *twin = (TermQuery*)other;
if (twin == self) { return true; }
if (!Obj_Is_A(other, TERMQUERY)) { return false; }
if (self->boost != twin->boost) { return false; }
if (!CB_Equals(self->field, (Obj*)twin->field)) { return false; }
if (!Obj_Equals(self->term, twin->term)) { return false; }
return true;
}
CharBuf*
TermQuery_to_string(TermQuery *self) {
CharBuf *term_str = Obj_To_String(self->term);
CharBuf *retval = CB_newf("%o:%o", self->field, term_str);
DECREF(term_str);
return retval;
}
Compiler*
TermQuery_make_compiler(TermQuery *self, Searcher *searcher, float boost) {
return (Compiler*)TermCompiler_new((Query*)self, searcher, boost);
}
/******************************************************************/
TermCompiler*
TermCompiler_new(Query *parent, Searcher *searcher, float boost) {
TermCompiler *self = (TermCompiler*)VTable_Make_Obj(TERMCOMPILER);
return TermCompiler_init(self, parent, searcher, boost);
}
TermCompiler*
TermCompiler_init(TermCompiler *self, Query *parent, Searcher *searcher,
float boost) {
Schema *schema = Searcher_Get_Schema(searcher);
TermQuery *tparent = (TermQuery*)parent;
Similarity *sim = Schema_Fetch_Sim(schema, tparent->field);
// Try harder to get a Similarity if necessary.
if (!sim) { sim = Schema_Get_Similarity(schema); }
// Init.
Compiler_init((Compiler*)self, parent, searcher, sim, boost);
self->normalized_weight = 0.0f;
self->query_norm_factor = 0.0f;
// Derive.
int32_t doc_max = Searcher_Doc_Max(searcher);
int32_t doc_freq = Searcher_Doc_Freq(searcher, tparent->field,
tparent->term);
self->idf = Sim_IDF(sim, doc_freq, doc_max);
/* The score of any document is approximately equal to:
*
* (tf_d * idf_t / norm_d) * (tf_q * idf_t / norm_q)
*
* Here we add in the first IDF, plus user-supplied boost.
*
* The second clause is factored in by the call to Normalize().
*
* tf_d and norm_d can only be added by the Matcher, since they are
* per-document.
*/
self->raw_weight = self->idf * self->boost;
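    /* Illustrative numbers only (the exact IDF formula belongs to the
     * Similarity class, not to this file): with doc_max = 1000,
     * doc_freq = 9, and an IDF of the common 1 + ln(doc_max / (doc_freq + 1))
     * form, idf comes out near 5.6; with the default boost of 1.0,
     * raw_weight is likewise near 5.6.
     */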
// Make final preparations.
TermCompiler_Normalize(self);
return self;
}
bool_t
TermCompiler_equals(TermCompiler *self, Obj *other) {
TermCompiler *twin = (TermCompiler*)other;
if (!Compiler_equals((Compiler*)self, other)) { return false; }
if (!Obj_Is_A(other, TERMCOMPILER)) { return false; }
if (self->idf != twin->idf) { return false; }
if (self->raw_weight != twin->raw_weight) { return false; }
if (self->query_norm_factor != twin->query_norm_factor) { return false; }
if (self->normalized_weight != twin->normalized_weight) { return false; }
return true;
}
void
TermCompiler_serialize(TermCompiler *self, OutStream *outstream) {
Compiler_serialize((Compiler*)self, outstream);
OutStream_Write_F32(outstream, self->idf);
OutStream_Write_F32(outstream, self->raw_weight);
OutStream_Write_F32(outstream, self->query_norm_factor);
OutStream_Write_F32(outstream, self->normalized_weight);
}
TermCompiler*
TermCompiler_deserialize(TermCompiler *self, InStream *instream) {
self = self ? self : (TermCompiler*)VTable_Make_Obj(TERMCOMPILER);
Compiler_deserialize((Compiler*)self, instream);
self->idf = InStream_Read_F32(instream);
self->raw_weight = InStream_Read_F32(instream);
self->query_norm_factor = InStream_Read_F32(instream);
self->normalized_weight = InStream_Read_F32(instream);
return self;
}
float
TermCompiler_sum_of_squared_weights(TermCompiler *self) {
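    // An enclosing compiler (e.g. for a multi-clause query) sums these
    // squared raw weights and derives a query norm factor from the total --
    // commonly 1 / sqrt(sum) -- which is fed back via Apply_Norm_Factor().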
return self->raw_weight * self->raw_weight;
}
void
TermCompiler_apply_norm_factor(TermCompiler *self, float query_norm_factor) {
self->query_norm_factor = query_norm_factor;
/* Multiply raw weight by the idf and norm_q factors in this:
*
* (tf_q * idf_q / norm_q)
*
* Note: factoring in IDF a second time is correct. See formula.
*/
self->normalized_weight
= self->raw_weight * self->idf * query_norm_factor;
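    /* Continuing the illustrative numbers above: a raw_weight near 5.6
     * squares to roughly 31.4; if the caller supplies a norm factor of
     * 1 / sqrt(31.4), about 0.18, the normalized_weight works out to roughly
     * 5.6 * 5.6 * 0.18, or about 5.6, for a lone TermQuery.  (The factor
     * itself is supplied by the caller of Apply_Norm_Factor(), not computed
     * here.)
     */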
}
float
TermCompiler_get_weight(TermCompiler *self) {
return self->normalized_weight;
}
Matcher*
TermCompiler_make_matcher(TermCompiler *self, SegReader *reader,
bool_t need_score) {
TermQuery *tparent = (TermQuery*)self->parent;
PostingListReader *plist_reader
= (PostingListReader*)SegReader_Fetch(
reader, VTable_Get_Name(POSTINGLISTREADER));
PostingList *plist = plist_reader
? PListReader_Posting_List(plist_reader, tparent->field, tparent->term)
: NULL;
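    // No posting list, or one with zero postings, means this segment cannot
    // match the term -- signal that by returning NULL rather than a Matcher.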
if (plist == NULL || PList_Get_Doc_Freq(plist) == 0) {
DECREF(plist);
return NULL;
}
else {
Matcher *retval = PList_Make_Matcher(plist, self->sim,
(Compiler*)self, need_score);
DECREF(plist);
return retval;
}
}
VArray*
TermCompiler_highlight_spans(TermCompiler *self, Searcher *searcher,
DocVector *doc_vec, const CharBuf *field) {
TermQuery *const parent = (TermQuery*)self->parent;
VArray *spans = VA_new(0);
TermVector *term_vector;
I32Array *starts, *ends;
uint32_t i, max;
UNUSED_VAR(searcher);
if (!CB_Equals(parent->field, (Obj*)field)) { return spans; }
// Add all starts and ends.
term_vector = DocVec_Term_Vector(doc_vec, field, (CharBuf*)parent->term);
if (!term_vector) { return spans; }
starts = TV_Get_Start_Offsets(term_vector);
ends = TV_Get_End_Offsets(term_vector);
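    // Turn each (start, end) offset pair recorded in the term vector into a
    // Span scored with this compiler's normalized weight.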
for (i = 0, max = I32Arr_Get_Size(starts); i < max; i++) {
int32_t start = I32Arr_Get(starts, i);
int32_t length = I32Arr_Get(ends, i) - start;
VA_Push(spans,
(Obj*)Span_new(start, length, TermCompiler_Get_Weight(self)));
}
DECREF(term_vector);
return spans;
}