blob: 92f916298c1e938e35412fde35a49c74e4bdb65a [file] [log] [blame]
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_LUCY_PROXIMITYMATCHER
#define C_LUCY_POSTING
#define C_LUCY_SCOREPOSTING
#include "Lucy/Util/ToolSet.h"
#include "LucyX/Search/ProximityMatcher.h"
#include "Lucy/Index/Posting/ScorePosting.h"
#include "Lucy/Index/PostingList.h"
#include "Lucy/Index/Similarity.h"
#include "Lucy/Search/Compiler.h"
ProximityMatcher*
ProximityMatcher_new(Similarity *sim, VArray *plists, Compiler *compiler,
uint32_t within) {
ProximityMatcher *self =
(ProximityMatcher*)VTable_Make_Obj(PROXIMITYMATCHER);
return ProximityMatcher_init(self, sim, plists, compiler, within);
}
ProximityMatcher*
ProximityMatcher_init(ProximityMatcher *self, Similarity *similarity,
VArray *plists, Compiler *compiler, uint32_t within) {
Matcher_init((Matcher*)self);
// Init.
self->anchor_set = BB_new(0);
self->proximity_freq = 0.0;
self->proximity_boost = 0.0;
self->first_time = true;
self->more = true;
self->within = within;
// Extract PostingLists out of VArray into local C array for quick access.
self->num_elements = VA_Get_Size(plists);
self->plists = (PostingList**)MALLOCATE(
self->num_elements * sizeof(PostingList*));
for (size_t i = 0; i < self->num_elements; i++) {
PostingList *const plist
= (PostingList*)CERTIFY(VA_Fetch(plists, i), POSTINGLIST);
if (plist == NULL) {
THROW(ERR, "Missing element %u32", i);
}
self->plists[i] = (PostingList*)INCREF(plist);
}
// Assign.
self->sim = (Similarity*)INCREF(similarity);
self->compiler = (Compiler*)INCREF(compiler);
self->weight = Compiler_Get_Weight(compiler);
return self;
}
void
ProximityMatcher_destroy(ProximityMatcher *self) {
if (self->plists) {
for (size_t i = 0; i < self->num_elements; i++) {
DECREF(self->plists[i]);
}
FREEMEM(self->plists);
}
DECREF(self->sim);
DECREF(self->anchor_set);
DECREF(self->compiler);
SUPER_DESTROY(self, PROXIMITYMATCHER);
}
int32_t
ProximityMatcher_next(ProximityMatcher *self) {
if (self->first_time) {
return ProximityMatcher_Advance(self, 1);
}
else if (self->more) {
const int32_t target = PList_Get_Doc_ID(self->plists[0]) + 1;
return ProximityMatcher_Advance(self, target);
}
else {
return 0;
}
}
int32_t
ProximityMatcher_advance(ProximityMatcher *self, int32_t target) {
PostingList **const plists = self->plists;
const uint32_t num_elements = self->num_elements;
int32_t highest = 0;
// Reset match variables to indicate no match. New values will be
// assigned if a match succeeds.
self->proximity_freq = 0.0;
self->doc_id = 0;
// Find the lowest possible matching doc ID greater than the current doc
// ID. If any one of the PostingLists is exhausted, we're done.
if (self->first_time) {
self->first_time = false;
// On the first call to Advance(), advance all PostingLists.
for (size_t i = 0, max = self->num_elements; i < max; i++) {
int32_t candidate = PList_Advance(plists[i], target);
if (!candidate) {
self->more = false;
return 0;
}
else if (candidate > highest) {
// Remember the highest doc ID so far.
highest = candidate;
}
}
}
else {
// On subsequent iters, advance only one PostingList. Its new doc ID
// becomes the minimum target which all the others must move up to.
highest = PList_Advance(plists[0], target);
if (highest == 0) {
self->more = false;
return 0;
}
}
// Find a doc which contains all the terms.
while (1) {
bool_t agreement = true;
// Scoot all posting lists up to at least the current minimum.
for (uint32_t i = 0; i < num_elements; i++) {
PostingList *const plist = plists[i];
int32_t candidate = PList_Get_Doc_ID(plist);
// Is this PostingList already beyond the minimum? Then raise the
// bar for everyone else.
if (highest < candidate) { highest = candidate; }
if (target < highest) { target = highest; }
// Scoot this posting list up.
if (candidate < target) {
candidate = PList_Advance(plist, target);
// If this PostingList is exhausted, we're done.
if (candidate == 0) {
self->more = false;
return 0;
}
// After calling PList_Advance(), we are guaranteed to be
// either at or beyond the minimum, so we can assign without
// checking and the minumum will either go up or stay the
// same.
highest = candidate;
}
}
// See whether all the PostingLists have managed to converge on a
// single doc ID.
for (uint32_t i = 0; i < num_elements; i++) {
const int32_t candidate = PList_Get_Doc_ID(plists[i]);
if (candidate != highest) { agreement = false; }
}
// If we've found a doc with all terms in it, see if they form a
// phrase.
if (agreement && highest >= target) {
self->proximity_freq = ProximityMatcher_Calc_Proximity_Freq(self);
if (self->proximity_freq == 0.0) {
// No phrase. Move on to another doc.
target += 1;
}
else {
// Success!
self->doc_id = highest;
return highest;
}
}
}
}
static INLINE uint32_t
SI_winnow_anchors(uint32_t *anchors_start, const uint32_t *const anchors_end,
const uint32_t *candidates, const uint32_t *const candidates_end,
uint32_t offset, uint32_t within) {
uint32_t *anchors = anchors_start;
uint32_t *anchors_found = anchors_start;
uint32_t target_anchor;
uint32_t target_candidate;
// Safety check, so there's no chance of a bad dereference.
if (anchors_start == anchors_end || candidates == candidates_end) {
return 0;
}
/* This function is a loop that finds terms that can continue a phrase.
* It overwrites the anchors in place, and returns the number remaining.
* The basic algorithm is to alternately increment the candidates' pointer
* until it is at or beyond its target position, and then increment the
* anchors' pointer until it is at or beyond its target. The non-standard
* form is to avoid unnecessary comparisons. This loop has not been
* tested for speed, but glancing at the object code produced (objdump -S)
* it appears to be significantly faster than the nested loop alternative.
* But given the vagaries of modern processors, it merits actual
* testing.*/
SPIN_CANDIDATES:
target_candidate = *anchors + offset;
while (*candidates < target_candidate) {
if (++candidates == candidates_end) { goto DONE; }
}
if ((*candidates - target_candidate) < within) { goto MATCH; }
goto SPIN_ANCHORS;
SPIN_ANCHORS:
target_anchor = *candidates - offset;
while (*anchors < target_anchor) {
if (++anchors == anchors_end) { goto DONE; }
};
if (*anchors == target_anchor) { goto MATCH; }
goto SPIN_CANDIDATES;
MATCH:
*anchors_found++ = *anchors;
if (++anchors == anchors_end) { goto DONE; }
goto SPIN_CANDIDATES;
DONE:
// Return number of anchors remaining.
return anchors_found - anchors_start;
}
float
ProximityMatcher_calc_proximity_freq(ProximityMatcher *self) {
PostingList **const plists = self->plists;
/* Create a overwriteable "anchor set" from the first posting.
*
* Each "anchor" is a position, measured in tokens, corresponding to a a
* term which might start a phrase. We start off with an "anchor set"
* comprised of all positions at which the first term in the phrase occurs
* in the field.
*
* There can never be more proximity matches than instances of this first
* term. There may be fewer however, which we will determine by seeing
* whether all the other terms line up at subsequent position slots.
*
* Every time we eliminate an anchor from the anchor set, we splice it out
* of the array. So if we begin with an anchor set of (15, 51, 72) and we
* discover that matches occur at the first and last instances of the
* first term but not the middle one, the final array will be (15, 72).
*
* The number of elements in the anchor set when we are finished winnowing
* is our proximity freq.
*/
ScorePosting *posting = (ScorePosting*)PList_Get_Posting(plists[0]);
uint32_t anchors_remaining = posting->freq;
if (!anchors_remaining) { return 0.0f; }
size_t amount = anchors_remaining * sizeof(uint32_t);
uint32_t *anchors_start = (uint32_t*)BB_Grow(self->anchor_set, amount);
uint32_t *anchors_end = anchors_start + anchors_remaining;
memcpy(anchors_start, posting->prox, amount);
// Match the positions of other terms against the anchor set.
for (uint32_t i = 1, max = self->num_elements; i < max; i++) {
// Get the array of positions for the next term. Unlike the anchor
// set (which is a copy), these won't be overwritten.
ScorePosting *posting = (ScorePosting*)PList_Get_Posting(plists[i]);
uint32_t *candidates_start = posting->prox;
uint32_t *candidates_end = candidates_start + posting->freq;
// Splice out anchors that don't match the next term. Bail out if
// we've eliminated all possible anchors.
if (self->within == 1) { // exact phrase match
anchors_remaining = SI_winnow_anchors(anchors_start, anchors_end,
candidates_start,
candidates_end, i, 1);
}
else { // fuzzy-phrase match
anchors_remaining = SI_winnow_anchors(anchors_start, anchors_end,
candidates_start,
candidates_end, i,
self->within);
}
if (!anchors_remaining) { return 0.0f; }
// Adjust end for number of anchors that remain.
anchors_end = anchors_start + anchors_remaining;
}
// The number of anchors left is the proximity freq.
return (float)anchors_remaining;
}
int32_t
ProximityMatcher_get_doc_id(ProximityMatcher *self) {
return self->doc_id;
}
float
ProximityMatcher_score(ProximityMatcher *self) {
ScorePosting *posting = (ScorePosting*)PList_Get_Posting(self->plists[0]);
float score = Sim_TF(self->sim, self->proximity_freq)
* self->weight
* posting->weight;
return score;
}