/* core/Lucy/Analysis/Token.cfh - lucy - Git at Google */

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 parcel Lucy;

 /** Unit of text.
  *
  * Token is the fundamental unit used by Apache Lucy's Analyzer subclasses.
  * Each Token has 5 attributes: <code>text</code>, <code>start_offset</code>,
  * <code>end_offset</code>, <code>boost</code>, and <code>pos_inc</code>.
  *
  * The <code>text</code> attribute is a Unicode string encoded as UTF-8.
  *
  * <code>start_offset</code> is the start point of the token text, measured in
  * Unicode code points from the top of the stored field;
  * <code>end_offset</code> delimits the corresponding closing boundary.
  * <code>start_offset</code> and <code>end_offset</code> locate the Token
  * within a larger context, even if the Token's text attribute gets modified
  * -- by stemming, for instance.  The Token for "beating" in the text "beating
  * a dead horse" begins life with a start_offset of 0 and an end_offset of 7;
  * after stemming, the text is "beat", but the start_offset is still 0 and the
  * end_offset is still 7.  This allows "beating" to be highlighted correctly
  * after a search matches "beat".
  *
  * <code>boost</code> is a per-token weight.  Use this when you want to assign
  * more or less importance to a particular token, as you might for emboldened
  * text within an HTML document, for example.  (Note: The field this token
  * belongs to must be spec'd to use a posting of type
  * L<Lucy::Index::Posting::RichPosting>.)
  *
  * <code>pos_inc</code> is the POSition INCrement, measured in Tokens.  This
  * attribute, which defaults to 1, is an advanced tool for manipulating
  * phrase matching.  Ordinarily, Tokens are assigned consecutive position
  * numbers: 0, 1, and 2 for <code>"three blind mice"</code>.  However, if you
  * set the position increment for "blind" to, say, 1000, then the three tokens
  * will end up assigned to positions 0, 1, and 1001 -- and will no longer
  * produce a phrase match for the query <code>"three blind mice"</code>.
  */
 class Lucy::Analysis::Token inherits Lucy::Object::Obj {

     /* Token text plus the length value that accompanies it in new(),
      * init() and Set_Text().  NOTE(review): `len` is presumably a byte
      * count for the UTF-8 text -- confirm against the implementation.
      */
     char *text;
     size_t len;

     /* Boundaries locating the token within its larger context. */
     uint32_t start_offset;
     uint32_t end_offset;

     /* Per-token weight. */
     float boost;

     /* Position increment. */
     int32_t pos_inc;

     /* NOTE(review): presumably the position number assigned to this
      * token; no declaration here sets or reads it -- confirm in the
      * implementation.
      */
     int32_t pos;

     /* Constructor.  Callers may omit `boost` (defaults to 1.0) and
      * `pos_inc` (defaults to 1).
      */
     inert incremented Token* new(const char *text, size_t len,
                                  uint32_t start_offset, uint32_t end_offset,
                                  float boost = 1.0, int32_t pos_inc = 1);

     /* Initializer: same arguments and defaults as new(), preceded by
      * the Token to set up.
      */
     inert Token* init(Token *self, const char *text, size_t len,
                       uint32_t start_offset, uint32_t end_offset,
                       float boost = 1.0, int32_t pos_inc = 1);

     /** Comparison routine whose signature is compatible with
      * Sort_quicksort.
      */
     inert int compare(void *context, const void *va, const void *vb);

     /* Attribute getters; each one takes only the Token itself. */
     uint32_t Get_Start_Offset(Token *self);
     uint32_t Get_End_Offset(Token *self);
     float Get_Boost(Token *self);
     int32_t Get_Pos_Inc(Token *self);
     char* Get_Text(Token *self);
     size_t Get_Len(Token *self);

     /* Setter taking a text pointer together with its length. */
     void Set_Text(Token *self, char *text, size_t len);

     /* The only method in this class declared `public`. */
     public void Destroy(Token *self);
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	parcel Lucy;

	/** Unit of text.
	*
	* Token is the fundamental unit used by Apache Lucy's Analyzer subclasses.
	* Each Token has 5 attributes: <code>text</code>, <code>start_offset</code>,
	* <code>end_offset</code>, <code>boost</code>, and <code>pos_inc</code>.
	*
	* The <code>text</code> attribute is a Unicode string encoded as UTF-8.
	*
	* <code>start_offset</code> is the start point of the token text, measured in
	* Unicode code points from the top of the stored field;
	* <code>end_offset</code> delimits the corresponding closing boundary.
	* <code>start_offset</code> and <code>end_offset</code> locate the Token
	* within a larger context, even if the Token's text attribute gets modified
	* -- by stemming, for instance. The Token for "beating" in the text "beating
	* a dead horse" begins life with a start_offset of 0 and an end_offset of 7;
	* after stemming, the text is "beat", but the start_offset is still 0 and the
	* end_offset is still 7. This allows "beating" to be highlighted correctly
	* after a search matches "beat".
	*
	* <code>boost</code> is a per-token weight. Use this when you want to assign
	* more or less importance to a particular token, as you might for emboldened
	* text within an HTML document, for example. (Note: The field this token
	* belongs to must be spec'd to use a posting of type
	* L<Lucy::Index::Posting::RichPosting>.)
	*
	* <code>pos_inc</code> is the POSition INCrement, measured in Tokens. This
	* attribute, which defaults to 1, is an advanced tool for manipulating
	* phrase matching. Ordinarily, Tokens are assigned consecutive position
	* numbers: 0, 1, and 2 for <code>"three blind mice"</code>. However, if you
	* set the position increment for "blind" to, say, 1000, then the three tokens
	* will end up assigned to positions 0, 1, and 1001 -- and will no longer
	* produce a phrase match for the query <code>"three blind mice"</code>.
	*/
	class Lucy::Analysis::Token inherits Lucy::Object::Obj {

	    /* NOTE(review): this declaration appears to duplicate the
	     * Lucy::Analysis::Token class declared earlier in this file --
	     * confirm whether both copies are meant to be present.
	     */

	    /* Token text and the length value passed alongside it.
	     * NOTE(review): `len` is presumably a byte count for the UTF-8
	     * text -- confirm against the implementation.
	     */
	    char *text;
	    size_t len;

	    /* Boundaries locating the token within its larger context. */
	    uint32_t start_offset;
	    uint32_t end_offset;

	    /* Per-token weight. */
	    float boost;

	    /* Position increment. */
	    int32_t pos_inc;

	    /* NOTE(review): presumably the position number assigned to this
	     * token; no declaration here sets or reads it -- confirm in the
	     * implementation.
	     */
	    int32_t pos;

	    /* Constructor.  `boost` defaults to 1.0 and `pos_inc` to 1 when
	     * the caller omits them.
	     */
	    inert incremented Token*
	    new(const char *text, size_t len, uint32_t start_offset,
	        uint32_t end_offset, float boost = 1.0, int32_t pos_inc = 1);

	    /* Initializer: takes the Token to set up, followed by the same
	     * arguments and defaults as new().
	     */
	    inert Token*
	    init(Token *self, const char *text, size_t len,
	         uint32_t start_offset, uint32_t end_offset,
	         float boost = 1.0, int32_t pos_inc = 1);

	    /** Sort_quicksort-compatible comparison routine.
	     */
	    inert int
	    compare(void *context, const void *va, const void *vb);

	    /* Attribute getters; each one takes only the Token itself. */
	    uint32_t
	    Get_Start_Offset(Token *self);

	    uint32_t
	    Get_End_Offset(Token *self);

	    float
	    Get_Boost(Token *self);

	    int32_t
	    Get_Pos_Inc(Token *self);

	    char*
	    Get_Text(Token *self);

	    size_t
	    Get_Len(Token *self);

	    /* Setter taking a text pointer together with its length. */
	    void
	    Set_Text(Token *self, char *text, size_t len);

	    /* The only method in this class declared `public`. */
	    public void
	    Destroy(Token *self);
	}