core/LucyX/Search/ProximityMatcher.c - lucy - Git at Google

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #define C_LUCY_PROXIMITYMATCHER
 #define C_LUCY_POSTING
 #define C_LUCY_SCOREPOSTING
 #include "Lucy/Util/ToolSet.h"

 #include "LucyX/Search/ProximityMatcher.h"
 #include "Lucy/Index/Posting/ScorePosting.h"
 #include "Lucy/Index/PostingList.h"
 #include "Lucy/Index/Similarity.h"
 #include "Lucy/Search/Compiler.h"


 ProximityMatcher*
 ProximityMatcher_new(Similarity *sim, VArray *plists, Compiler *compiler,
                      uint32_t within) {
     ProximityMatcher *self =
         (ProximityMatcher*)VTable_Make_Obj(PROXIMITYMATCHER);
     return ProximityMatcher_init(self, sim, plists, compiler, within);

 }

 ProximityMatcher*
 ProximityMatcher_init(ProximityMatcher *self, Similarity *similarity,
                       VArray *plists, Compiler *compiler, uint32_t within) {
     Matcher_init((Matcher*)self);

     // Init.
     self->anchor_set       = BB_new(0);
     self->proximity_freq   = 0.0;
     self->proximity_boost  = 0.0;
     self->first_time       = true;
     self->more             = true;
     self->within           = within;

     // Extract PostingLists out of VArray into local C array for quick access.
     self->num_elements = VA_Get_Size(plists);
     self->plists = (PostingList**)MALLOCATE(
                        self->num_elements * sizeof(PostingList*));
     for (size_t i = 0; i < self->num_elements; i++) {
         PostingList *const plist
             = (PostingList*)CERTIFY(VA_Fetch(plists, i), POSTINGLIST);
         if (plist == NULL) {
             THROW(ERR, "Missing element %u32", i);
         }
         self->plists[i] = (PostingList*)INCREF(plist);
     }

     // Assign.
     self->sim       = (Similarity*)INCREF(similarity);
     self->compiler  = (Compiler*)INCREF(compiler);
     self->weight    = Compiler_Get_Weight(compiler);

     return self;
 }

 void
 ProximityMatcher_destroy(ProximityMatcher *self) {
     if (self->plists) {
         for (size_t i = 0; i < self->num_elements; i++) {
             DECREF(self->plists[i]);
         }
         FREEMEM(self->plists);
     }
     DECREF(self->sim);
     DECREF(self->anchor_set);
     DECREF(self->compiler);
     SUPER_DESTROY(self, PROXIMITYMATCHER);
 }

 int32_t
 ProximityMatcher_next(ProximityMatcher *self) {
     if (self->first_time) {
         return ProximityMatcher_Advance(self, 1);
     }
     else if (self->more) {
         const int32_t target = PList_Get_Doc_ID(self->plists[0]) + 1;
         return ProximityMatcher_Advance(self, target);
     }
     else {
         return 0;
     }
 }

 int32_t
 ProximityMatcher_advance(ProximityMatcher *self, int32_t target) {
     PostingList **const plists       = self->plists;
     const uint32_t      num_elements = self->num_elements;
     int32_t             highest      = 0;

     // Reset match variables to indicate no match.  New values will be
     // assigned if a match succeeds.
     self->proximity_freq = 0.0;
     self->doc_id         = 0;

     // Find the lowest possible matching doc ID greater than the current doc
     // ID.  If any one of the PostingLists is exhausted, we're done.
     if (self->first_time) {
         self->first_time = false;

         // On the first call to Advance(), advance all PostingLists.
         for (size_t i = 0, max = self->num_elements; i < max; i++) {
             int32_t candidate = PList_Advance(plists[i], target);
             if (!candidate) {
                 self->more = false;
                 return 0;
             }
             else if (candidate > highest) {
                 // Remember the highest doc ID so far.
                 highest = candidate;
             }
         }
     }
     else {
         // On subsequent iters, advance only one PostingList.  Its new doc ID
         // becomes the minimum target which all the others must move up to.
         highest = PList_Advance(plists[0], target);
         if (highest == 0) {
             self->more = false;
             return 0;
         }
     }

     // Find a doc which contains all the terms.
     while (1) {
         bool_t agreement = true;

         // Scoot all posting lists up to at least the current minimum.
         for (uint32_t i = 0; i < num_elements; i++) {
             PostingList *const plist = plists[i];
             int32_t candidate = PList_Get_Doc_ID(plist);

             // Is this PostingList already beyond the minimum?  Then raise the
             // bar for everyone else.
             if (highest < candidate) { highest = candidate; }
             if (target < highest)    { target = highest; }

             // Scoot this posting list up.
             if (candidate < target) {
                 candidate = PList_Advance(plist, target);

                 // If this PostingList is exhausted, we're done.
                 if (candidate == 0) {
                     self->more = false;
                     return 0;
                 }

                 // After calling PList_Advance(), we are guaranteed to be
                 // either at or beyond the minimum, so we can assign without
                 // checking and the minumum will either go up or stay the
                 // same.
                 highest = candidate;
             }
         }

         // See whether all the PostingLists have managed to converge on a
         // single doc ID.
         for (uint32_t i = 0; i < num_elements; i++) {
             const int32_t candidate = PList_Get_Doc_ID(plists[i]);
             if (candidate != highest) { agreement = false; }
         }

         // If we've found a doc with all terms in it, see if they form a
         // phrase.
         if (agreement && highest >= target) {
             self->proximity_freq = ProximityMatcher_Calc_Proximity_Freq(self);
             if (self->proximity_freq == 0.0) {
                 // No phrase.  Move on to another doc.
                 target += 1;
             }
             else {
                 // Success!
                 self->doc_id = highest;
                 return highest;
             }
         }
     }
 }


 static INLINE uint32_t
 SI_winnow_anchors(uint32_t *anchors_start, const uint32_t *const anchors_end,
                   const uint32_t *candidates, const uint32_t *const candidates_end,
                   uint32_t offset, uint32_t within) {
     uint32_t *anchors = anchors_start;
     uint32_t *anchors_found = anchors_start;
     uint32_t target_anchor;
     uint32_t target_candidate;

     // Safety check, so there's no chance of a bad dereference.
     if (anchors_start == anchors_end || candidates == candidates_end) {
         return 0;
     }

     /* This function is a loop that finds terms that can continue a phrase.
      * It overwrites the anchors in place, and returns the number remaining.
      * The basic algorithm is to alternately increment the candidates' pointer
      * until it is at or beyond its target position, and then increment the
      * anchors' pointer until it is at or beyond its target.  The non-standard
      * form is to avoid unnecessary comparisons.  This loop has not been
      * tested for speed, but glancing at the object code produced (objdump -S)
      * it appears to be significantly faster than the nested loop alternative.
      * But given the vagaries of modern processors, it merits actual
      * testing.*/

 SPIN_CANDIDATES:
     target_candidate = *anchors + offset;
     while (*candidates < target_candidate) {
         if (++candidates == candidates_end) { goto DONE; }
     }
     if ((*candidates - target_candidate) < within) { goto MATCH; }
     goto SPIN_ANCHORS;

 SPIN_ANCHORS:
     target_anchor = *candidates - offset;
     while (*anchors < target_anchor) {
         if (++anchors == anchors_end) { goto DONE; }
     };
     if (*anchors == target_anchor) { goto MATCH; }
     goto SPIN_CANDIDATES;

 MATCH:
     *anchors_found++ = *anchors;
     if (++anchors == anchors_end) { goto DONE; }
     goto SPIN_CANDIDATES;

 DONE:
     // Return number of anchors remaining.
     return anchors_found - anchors_start;
 }

 float
 ProximityMatcher_calc_proximity_freq(ProximityMatcher *self) {
     PostingList **const plists   = self->plists;

     /* Create a overwriteable "anchor set" from the first posting.
      *
      * Each "anchor" is a position, measured in tokens, corresponding to a a
      * term which might start a phrase.  We start off with an "anchor set"
      * comprised of all positions at which the first term in the phrase occurs
      * in the field.
      *
      * There can never be more proximity matches than instances of this first
      * term.  There may be fewer however, which we will determine by seeing
      * whether all the other terms line up at subsequent position slots.
      *
      * Every time we eliminate an anchor from the anchor set, we splice it out
      * of the array.  So if we begin with an anchor set of (15, 51, 72) and we
      * discover that matches occur at the first and last instances of the
      * first term but not the middle one, the final array will be (15, 72).
      *
      * The number of elements in the anchor set when we are finished winnowing
      * is our proximity freq.
      */
     ScorePosting *posting = (ScorePosting*)PList_Get_Posting(plists[0]);
     uint32_t anchors_remaining = posting->freq;
     if (!anchors_remaining) { return 0.0f; }

     size_t    amount        = anchors_remaining * sizeof(uint32_t);
     uint32_t *anchors_start = (uint32_t*)BB_Grow(self->anchor_set, amount);
     uint32_t *anchors_end   = anchors_start + anchors_remaining;
     memcpy(anchors_start, posting->prox, amount);

     // Match the positions of other terms against the anchor set.
     for (uint32_t i = 1, max = self->num_elements; i < max; i++) {
         // Get the array of positions for the next term.  Unlike the anchor
         // set (which is a copy), these won't be overwritten.
         ScorePosting *posting = (ScorePosting*)PList_Get_Posting(plists[i]);
         uint32_t *candidates_start = posting->prox;
         uint32_t *candidates_end   = candidates_start + posting->freq;

         // Splice out anchors that don't match the next term.  Bail out if
         // we've eliminated all possible anchors.
         if (self->within == 1) { // exact phrase match
             anchors_remaining = SI_winnow_anchors(anchors_start, anchors_end,
                                                   candidates_start,
                                                   candidates_end, i, 1);
         }
         else {  // fuzzy-phrase match
             anchors_remaining = SI_winnow_anchors(anchors_start, anchors_end,
                                                   candidates_start,
                                                   candidates_end, i,
                                                   self->within);
         }
         if (!anchors_remaining) { return 0.0f; }

         // Adjust end for number of anchors that remain.
         anchors_end = anchors_start + anchors_remaining;
     }

     // The number of anchors left is the proximity freq.
     return (float)anchors_remaining;
 }

 int32_t
 ProximityMatcher_get_doc_id(ProximityMatcher *self) {
     return self->doc_id;
 }

 float
 ProximityMatcher_score(ProximityMatcher *self) {
     ScorePosting *posting = (ScorePosting*)PList_Get_Posting(self->plists[0]);
     float score = Sim_TF(self->sim, self->proximity_freq)
                   * self->weight
                   * posting->weight;
     return score;
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#define C_LUCY_PROXIMITYMATCHER
	#define C_LUCY_POSTING
	#define C_LUCY_SCOREPOSTING
	#include "Lucy/Util/ToolSet.h"

	#include "LucyX/Search/ProximityMatcher.h"
	#include "Lucy/Index/Posting/ScorePosting.h"
	#include "Lucy/Index/PostingList.h"
	#include "Lucy/Index/Similarity.h"
	#include "Lucy/Search/Compiler.h"


	ProximityMatcher*
	ProximityMatcher_new(Similarity sim, VArray plists, Compiler *compiler,
	uint32_t within) {
	ProximityMatcher *self =
	(ProximityMatcher*)VTable_Make_Obj(PROXIMITYMATCHER);
	return ProximityMatcher_init(self, sim, plists, compiler, within);

	}

	ProximityMatcher*
	ProximityMatcher_init(ProximityMatcher self, Similarity similarity,
	VArray plists, Compiler compiler, uint32_t within) {
	Matcher_init((Matcher*)self);

	// Init.
	self->anchor_set = BB_new(0);
	self->proximity_freq = 0.0;
	self->proximity_boost = 0.0;
	self->first_time = true;
	self->more = true;
	self->within = within;

	// Extract PostingLists out of VArray into local C array for quick access.
	self->num_elements = VA_Get_Size(plists);
	self->plists = (PostingList**)MALLOCATE(
	self->num_elements * sizeof(PostingList*));
	for (size_t i = 0; i < self->num_elements; i++) {
	PostingList *const plist
	= (PostingList*)CERTIFY(VA_Fetch(plists, i), POSTINGLIST);
	if (plist == NULL) {
	THROW(ERR, "Missing element %u32", i);
	}
	self->plists[i] = (PostingList*)INCREF(plist);
	}

	// Assign.
	self->sim = (Similarity*)INCREF(similarity);
	self->compiler = (Compiler*)INCREF(compiler);
	self->weight = Compiler_Get_Weight(compiler);

	return self;
	}

	void
	ProximityMatcher_destroy(ProximityMatcher *self) {
	if (self->plists) {
	for (size_t i = 0; i < self->num_elements; i++) {
	DECREF(self->plists[i]);
	}
	FREEMEM(self->plists);
	}
	DECREF(self->sim);
	DECREF(self->anchor_set);
	DECREF(self->compiler);
	SUPER_DESTROY(self, PROXIMITYMATCHER);
	}

	int32_t
	ProximityMatcher_next(ProximityMatcher *self) {
	if (self->first_time) {
	return ProximityMatcher_Advance(self, 1);
	}
	else if (self->more) {
	const int32_t target = PList_Get_Doc_ID(self->plists[0]) + 1;
	return ProximityMatcher_Advance(self, target);
	}
	else {
	return 0;
	}
	}

	int32_t
	ProximityMatcher_advance(ProximityMatcher *self, int32_t target) {
	PostingList **const plists = self->plists;
	const uint32_t num_elements = self->num_elements;
	int32_t highest = 0;

	// Reset match variables to indicate no match. New values will be
	// assigned if a match succeeds.
	self->proximity_freq = 0.0;
	self->doc_id = 0;

	// Find the lowest possible matching doc ID greater than the current doc
	// ID. If any one of the PostingLists is exhausted, we're done.
	if (self->first_time) {
	self->first_time = false;

	// On the first call to Advance(), advance all PostingLists.
	for (size_t i = 0, max = self->num_elements; i < max; i++) {
	int32_t candidate = PList_Advance(plists[i], target);
	if (!candidate) {
	self->more = false;
	return 0;
	}
	else if (candidate > highest) {
	// Remember the highest doc ID so far.
	highest = candidate;
	}
	}
	}
	else {
	// On subsequent iters, advance only one PostingList. Its new doc ID
	// becomes the minimum target which all the others must move up to.
	highest = PList_Advance(plists[0], target);
	if (highest == 0) {
	self->more = false;
	return 0;
	}
	}

	// Find a doc which contains all the terms.
	while (1) {
	bool_t agreement = true;

	// Scoot all posting lists up to at least the current minimum.
	for (uint32_t i = 0; i < num_elements; i++) {
	PostingList *const plist = plists[i];
	int32_t candidate = PList_Get_Doc_ID(plist);

	// Is this PostingList already beyond the minimum? Then raise the
	// bar for everyone else.
	if (highest < candidate) { highest = candidate; }
	if (target < highest) { target = highest; }

	// Scoot this posting list up.
	if (candidate < target) {
	candidate = PList_Advance(plist, target);

	// If this PostingList is exhausted, we're done.
	if (candidate == 0) {
	self->more = false;
	return 0;
	}

	// After calling PList_Advance(), we are guaranteed to be
	// either at or beyond the minimum, so we can assign without
	// checking and the minumum will either go up or stay the
	// same.
	highest = candidate;
	}
	}

	// See whether all the PostingLists have managed to converge on a
	// single doc ID.
	for (uint32_t i = 0; i < num_elements; i++) {
	const int32_t candidate = PList_Get_Doc_ID(plists[i]);
	if (candidate != highest) { agreement = false; }
	}

	// If we've found a doc with all terms in it, see if they form a
	// phrase.
	if (agreement && highest >= target) {
	self->proximity_freq = ProximityMatcher_Calc_Proximity_Freq(self);
	if (self->proximity_freq == 0.0) {
	// No phrase. Move on to another doc.
	target += 1;
	}
	else {
	// Success!
	self->doc_id = highest;
	return highest;
	}
	}
	}
	}


	static INLINE uint32_t
	SI_winnow_anchors(uint32_t anchors_start, const uint32_t const anchors_end,
	const uint32_t candidates, const uint32_t const candidates_end,
	uint32_t offset, uint32_t within) {
	uint32_t *anchors = anchors_start;
	uint32_t *anchors_found = anchors_start;
	uint32_t target_anchor;
	uint32_t target_candidate;

	// Safety check, so there's no chance of a bad dereference.
	if (anchors_start == anchors_end \|\| candidates == candidates_end) {
	return 0;
	}

	/* This function is a loop that finds terms that can continue a phrase.
	* It overwrites the anchors in place, and returns the number remaining.
	* The basic algorithm is to alternately increment the candidates' pointer
	* until it is at or beyond its target position, and then increment the
	* anchors' pointer until it is at or beyond its target. The non-standard
	* form is to avoid unnecessary comparisons. This loop has not been
	* tested for speed, but glancing at the object code produced (objdump -S)
	* it appears to be significantly faster than the nested loop alternative.
	* But given the vagaries of modern processors, it merits actual
	* testing.*/

	SPIN_CANDIDATES:
	target_candidate = *anchors + offset;
	while (*candidates < target_candidate) {
	if (++candidates == candidates_end) { goto DONE; }
	}
	if ((*candidates - target_candidate) < within) { goto MATCH; }
	goto SPIN_ANCHORS;

	SPIN_ANCHORS:
	target_anchor = *candidates - offset;
	while (*anchors < target_anchor) {
	if (++anchors == anchors_end) { goto DONE; }
	};
	if (*anchors == target_anchor) { goto MATCH; }
	goto SPIN_CANDIDATES;

	MATCH:
	anchors_found++ = anchors;
	if (++anchors == anchors_end) { goto DONE; }
	goto SPIN_CANDIDATES;

	DONE:
	// Return number of anchors remaining.
	return anchors_found - anchors_start;
	}

	float
	ProximityMatcher_calc_proximity_freq(ProximityMatcher *self) {
	PostingList **const plists = self->plists;

	/* Create a overwriteable "anchor set" from the first posting.
	*
	* Each "anchor" is a position, measured in tokens, corresponding to a a
	* term which might start a phrase. We start off with an "anchor set"
	* comprised of all positions at which the first term in the phrase occurs
	* in the field.
	*
	* There can never be more proximity matches than instances of this first
	* term. There may be fewer however, which we will determine by seeing
	* whether all the other terms line up at subsequent position slots.
	*
	* Every time we eliminate an anchor from the anchor set, we splice it out
	* of the array. So if we begin with an anchor set of (15, 51, 72) and we
	* discover that matches occur at the first and last instances of the
	* first term but not the middle one, the final array will be (15, 72).
	*
	* The number of elements in the anchor set when we are finished winnowing
	* is our proximity freq.
	*/
	ScorePosting posting = (ScorePosting)PList_Get_Posting(plists[0]);
	uint32_t anchors_remaining = posting->freq;
	if (!anchors_remaining) { return 0.0f; }

	size_t amount = anchors_remaining * sizeof(uint32_t);
	uint32_t anchors_start = (uint32_t)BB_Grow(self->anchor_set, amount);
	uint32_t *anchors_end = anchors_start + anchors_remaining;
	memcpy(anchors_start, posting->prox, amount);

	// Match the positions of other terms against the anchor set.
	for (uint32_t i = 1, max = self->num_elements; i < max; i++) {
	// Get the array of positions for the next term. Unlike the anchor
	// set (which is a copy), these won't be overwritten.
	ScorePosting posting = (ScorePosting)PList_Get_Posting(plists[i]);
	uint32_t *candidates_start = posting->prox;
	uint32_t *candidates_end = candidates_start + posting->freq;

	// Splice out anchors that don't match the next term. Bail out if
	// we've eliminated all possible anchors.
	if (self->within == 1) { // exact phrase match
	anchors_remaining = SI_winnow_anchors(anchors_start, anchors_end,
	candidates_start,
	candidates_end, i, 1);
	}
	else { // fuzzy-phrase match
	anchors_remaining = SI_winnow_anchors(anchors_start, anchors_end,
	candidates_start,
	candidates_end, i,
	self->within);
	}
	if (!anchors_remaining) { return 0.0f; }

	// Adjust end for number of anchors that remain.
	anchors_end = anchors_start + anchors_remaining;
	}

	// The number of anchors left is the proximity freq.
	return (float)anchors_remaining;
	}

	int32_t
	ProximityMatcher_get_doc_id(ProximityMatcher *self) {
	return self->doc_id;
	}

	float
	ProximityMatcher_score(ProximityMatcher *self) {
	ScorePosting posting = (ScorePosting)PList_Get_Posting(self->plists[0]);
	float score = Sim_TF(self->sim, self->proximity_freq)
	* self->weight
	* posting->weight;
	return score;
	}