core/Lucy/Index/Similarity.cfh - lucy - Git at Google

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 parcel Lucy;

 /** Judge how well a document matches a query.
  *
  * After determining whether a document matches a given query, a score must be
  * calculated which indicates how I<well> the document matches the query.  The
  * Similarity class is used to judge how "similar" the query and the document
  * are to each other; the closer the resemblance, the higher the document
  * scores.
  *
  * The default implementation uses Lucene's modified cosine similarity
  * measure.  Subclasses might tweak the existing algorithms, or might be used
  * in conjunction with custom Query subclasses to implement arbitrary scoring
  * schemes.
  *
  * Most of the methods operate on single fields, but some are used to combine
  * scores from multiple fields.
  */

 class Lucy::Index::Similarity cnick Sim
     inherits Lucy::Object::Obj : dumpable {

     /* NOTE(review): presumably the table handed out by Get_Norm_Decoder()
      * and consulted when decoding norms -- confirm in the implementation.
      */
     float  *norm_decoder;

     /** Create a new Similarity.  Takes no arguments.
      *
      * @return a new Similarity; declared `incremented`, so the caller owns
      * a reference to it.
      */
     inert incremented Similarity*
     new();

     /** Constructor. Takes no arguments.
      */
     public inert Similarity*
     init(Similarity *self);

     /** Factory method for creating a Posting.
      *
      * @return a Posting; declared `incremented`, so the caller owns a
      * reference to it.
      */
     public incremented Posting*
     Make_Posting(Similarity *self);

     /** Factory method for creating a PostingWriter.
      *
      * @return a PostingWriter; declared `incremented`, so the caller owns a
      * reference to it.
      */
     incremented PostingWriter*
     Make_Posting_Writer(Similarity *self, Schema *schema, Snapshot *snapshot,
                         Segment *segment, PolyReader *polyreader,
                         int32_t field_num);

     /** Return a score factor based on the frequency of a term in a given
      * document.  The default implementation is sqrt(freq).  Other
      * implementations typically produce ascending scores with ascending
      * freqs, since the more times a doc matches, the more relevant it is
      * likely to be.
      *
      * @param freq The frequency of the term in the document.
      */
     public float
     TF(Similarity *self, float freq);

     /** Calculate the Inverse Document Frequency for a term in a given
      * collection.
      *
      * @param doc_freq The number of documents that the term appears in.
      * @param total_docs The number of documents in the collection.
      */
     public float
     IDF(Similarity *self, int64_t doc_freq, int64_t total_docs);

     /** Calculate a score factor based on the number of terms which match.
      */
     /* NOTE(review): `overlap` is presumably the count of matching terms and
      * `max_overlap` the largest possible count -- confirm against the
      * implementation before documenting them publicly.
      */
     public float
     Coord(Similarity *self, uint32_t overlap, uint32_t max_overlap);

     /** Dampen the scores of long documents.
      *
      * After a field is broken up into terms at index-time, each term must be
      * assigned a weight.  One of the factors in calculating this weight is
      * the number of tokens that the original field was broken into.
      *
      * Typically, we assume that the more tokens in a field, the less
      * important any one of them is -- so that, e.g. 5 mentions of "Kafka" in
      * a short article are given more heft than 5 mentions of "Kafka" in an
      * entire book.  The default implementation of length_norm expresses this
      * using an inverted square root.
      *
      * However, the inverted square root has a tendency to reward very short
      * fields highly, which isn't always appropriate for fields you expect to
      * have a lot of tokens on average.
      *
      * @param num_tokens The number of tokens that the field was broken into.
      */
     public float
     Length_Norm(Similarity *self, uint32_t num_tokens);

     /** Normalize a Query's weight so that it is comparable to other Queries.
      */
     public float
     Query_Norm(Similarity *self, float sum_of_squared_weights);

     /** encode_norm and decode_norm encode and decode between 32-bit IEEE
      * floating point numbers and a 5-bit exponent, 3-bit mantissa float.  The
      * range covered by the single-byte encoding is 7x10^9 to 2x10^-9.  The
      * accuracy is about one significant decimal digit.
      *
      * @param f The 32-bit float to encode.
      */
     uint32_t
     Encode_Norm(Similarity *self, float f);

     /** See encode_norm.
      *
      * @param input The encoded value to decode.
      */
     float
     Decode_Norm(Similarity *self, uint32_t input);

     /* NOTE(review): presumably an accessor for the `norm_decoder` member;
      * the return type is not marked `incremented` -- confirm ownership in
      * the implementation.
      */
     float*
     Get_Norm_Decoder(Similarity *self);

     /* NOTE(review): the remaining methods are undocumented here.  They look
      * like overrides of methods inherited from Lucy::Object::Obj (the class
      * is declared `dumpable`) -- confirm against the parent class.
      */
     public void
     Destroy(Similarity *self);

     public incremented Obj*
     Dump(Similarity *self);

     public incremented Similarity*
     Load(Similarity *self, Obj *dump);

     public bool_t
     Equals(Similarity *self, Obj *other);

     public void
     Serialize(Similarity *self, OutStream *outstream);

     public incremented Similarity*
     Deserialize(Similarity *self, InStream *instream);
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	parcel Lucy;

	/** Judge how well a document matches a query.
	*
	* After determining whether a document matches a given query, a score must be
	* calculated which indicates how I<well> the document matches the query. The
	* Similarity class is used to judge how "similar" the query and the document
	* are to each other; the closer the resemblance, the higher the document
	* scores.
	*
	* The default implementation uses Lucene's modified cosine similarity
	* measure. Subclasses might tweak the existing algorithms, or might be used
	* in conjunction with custom Query subclasses to implement arbitrary scoring
	* schemes.
	*
	* Most of the methods operate on single fields, but some are used to combine
	* scores from multiple fields.
	*/

	class Lucy::Index::Similarity cnick Sim
	    inherits Lucy::Object::Obj : dumpable {

	    /* NOTE(review): this file also contains an earlier declaration of
	     * Lucy::Index::Similarity with the same members; one of the two
	     * copies should be removed.
	     */

	    /* NOTE(review): presumably the table handed out by Get_Norm_Decoder()
	     * and consulted when decoding norms -- confirm in the implementation.
	     */
	    float *norm_decoder;

	    /** Create a new Similarity.  Takes no arguments.
	     *
	     * @return a new Similarity; declared `incremented`, so the caller owns
	     * a reference to it.
	     */
	    inert incremented Similarity*
	    new();

	    /** Constructor. Takes no arguments.
	     */
	    public inert Similarity*
	    init(Similarity *self);

	    /** Factory method for creating a Posting.
	     *
	     * @return a Posting; declared `incremented`, so the caller owns a
	     * reference to it.
	     */
	    public incremented Posting*
	    Make_Posting(Similarity *self);

	    /** Factory method for creating a PostingWriter.
	     *
	     * All object arguments are passed by pointer, as everywhere else in
	     * this class.
	     *
	     * @return a PostingWriter; declared `incremented`, so the caller owns a
	     * reference to it.
	     */
	    incremented PostingWriter*
	    Make_Posting_Writer(Similarity *self, Schema *schema, Snapshot *snapshot,
	                        Segment *segment, PolyReader *polyreader,
	                        int32_t field_num);

	    /** Return a score factor based on the frequency of a term in a given
	     * document. The default implementation is sqrt(freq). Other
	     * implementations typically produce ascending scores with ascending
	     * freqs, since the more times a doc matches, the more relevant it is
	     * likely to be.
	     *
	     * @param freq The frequency of the term in the document.
	     */
	    public float
	    TF(Similarity *self, float freq);

	    /** Calculate the Inverse Document Frequency for a term in a given
	     * collection.
	     *
	     * @param doc_freq The number of documents that the term appears in.
	     * @param total_docs The number of documents in the collection.
	     */
	    public float
	    IDF(Similarity *self, int64_t doc_freq, int64_t total_docs);

	    /** Calculate a score factor based on the number of terms which match.
	     */
	    /* NOTE(review): `overlap` is presumably the count of matching terms and
	     * `max_overlap` the largest possible count -- confirm against the
	     * implementation before documenting them publicly.
	     */
	    public float
	    Coord(Similarity *self, uint32_t overlap, uint32_t max_overlap);

	    /** Dampen the scores of long documents.
	     *
	     * After a field is broken up into terms at index-time, each term must be
	     * assigned a weight. One of the factors in calculating this weight is
	     * the number of tokens that the original field was broken into.
	     *
	     * Typically, we assume that the more tokens in a field, the less
	     * important any one of them is -- so that, e.g. 5 mentions of "Kafka" in
	     * a short article are given more heft than 5 mentions of "Kafka" in an
	     * entire book. The default implementation of length_norm expresses this
	     * using an inverted square root.
	     *
	     * However, the inverted square root has a tendency to reward very short
	     * fields highly, which isn't always appropriate for fields you expect to
	     * have a lot of tokens on average.
	     *
	     * @param num_tokens The number of tokens that the field was broken into.
	     */
	    public float
	    Length_Norm(Similarity *self, uint32_t num_tokens);

	    /** Normalize a Query's weight so that it is comparable to other Queries.
	     */
	    public float
	    Query_Norm(Similarity *self, float sum_of_squared_weights);

	    /** encode_norm and decode_norm encode and decode between 32-bit IEEE
	     * floating point numbers and a 5-bit exponent, 3-bit mantissa float. The
	     * range covered by the single-byte encoding is 7x10^9 to 2x10^-9. The
	     * accuracy is about one significant decimal digit.
	     *
	     * @param f The 32-bit float to encode.
	     */
	    uint32_t
	    Encode_Norm(Similarity *self, float f);

	    /** See encode_norm.
	     *
	     * @param input The encoded value to decode.
	     */
	    float
	    Decode_Norm(Similarity *self, uint32_t input);

	    /* NOTE(review): presumably an accessor for the `norm_decoder` member;
	     * the return type is not marked `incremented` -- confirm ownership in
	     * the implementation.
	     */
	    float*
	    Get_Norm_Decoder(Similarity *self);

	    /* NOTE(review): the remaining methods are undocumented here.  They look
	     * like overrides of methods inherited from Lucy::Object::Obj (the class
	     * is declared `dumpable`) -- confirm against the parent class.  Each
	     * takes its object arguments by pointer.
	     */
	    public void
	    Destroy(Similarity *self);

	    public incremented Obj*
	    Dump(Similarity *self);

	    public incremented Similarity*
	    Load(Similarity *self, Obj *dump);

	    public bool_t
	    Equals(Similarity *self, Obj *other);

	    public void
	    Serialize(Similarity *self, OutStream *outstream);

	    public incremented Similarity*
	    Deserialize(Similarity *self, InStream *instream);
	}