/* core/Lucy/Analysis/RegexTokenizer.cfh - lucy - Git at Google */

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 parcel Lucy;

 /** Split a string into tokens.
  *
  * Generically, "tokenizing" is a process of breaking up a string into an
  * array of "tokens".  For instance, the string "three blind mice" might be
  * tokenized into "three", "blind", "mice".
  *
  * Lucy::Analysis::RegexTokenizer decides where it should break up the text
  * based on a regular expression compiled from a supplied `pattern`
  * matching one token.  If our source string is...
  *
  *     "Eats, Shoots and Leaves."
  *
  * ... then a "whitespace tokenizer" with a `pattern` of
  * `"\\S+"` produces...
  *
  *     Eats,
  *     Shoots
  *     and
  *     Leaves.
  *
  * ... while a "word character tokenizer" with a `pattern` of
  * `"\\w+"` produces...
  *
  *     Eats
  *     Shoots
  *     and
  *     Leaves
  *
  * ... the difference being that the word character tokenizer skips over
  * punctuation as well as whitespace when determining token boundaries.
  */
 public class Lucy::Analysis::RegexTokenizer
     inherits Lucy::Analysis::Analyzer {

     /* Token pattern string.  NOTE(review): presumably the `pattern` handed
      * to new()/init() -- confirm in the implementation. */
     String *pattern;
     /* Opaque handle, typed as void*.  NOTE(review): presumably the regular
      * expression compiled from `pattern`; the concrete type is not visible
      * in this header. */
     void   *token_re;

     /**
      * @return true if RegexTokenizer is available in this build.
      */
     inert bool
     is_available();

     /** Create a new RegexTokenizer.
      *
      * @param pattern A string specifying a Perl-syntax regular expression
      * which should match one token.  The default value is
      * `\w+(?:[\x{2019}']\w+)*`, which matches "it's" as well as
      * "it" and "O'Henry's" as well as "Henry".
      */
     public inert incremented RegexTokenizer*
     new(String *pattern = NULL);

     /** Initialize a RegexTokenizer.
      *
      * @param pattern A string specifying a Perl-syntax regular expression
      * which should match one token.  The default value is
      * `\w+(?:[\x{2019}']\w+)*`, which matches "it's" as well as
      * "it" and "O'Henry's" as well as "Henry".
      */
     public inert RegexTokenizer*
     init(RegexTokenizer *self, String *pattern = NULL);

     /* Takes an Inversion and returns an Inversion declared `incremented`.
      * NOTE(review): presumably overrides the Transform method of the
      * parent Analyzer class -- confirm against Analyzer. */
     public incremented Inversion*
     Transform(RegexTokenizer *self, Inversion *inversion);

     /* Takes a String and returns an Inversion declared `incremented`.
      * NOTE(review): presumably the tokens found in `text` -- confirm in
      * the implementation. */
     public incremented Inversion*
     Transform_Text(RegexTokenizer *self, String *text);

     /** Tokenize the supplied string and add any Tokens generated to the
      * supplied Inversion.
      */
     void
     Tokenize_Utf8(RegexTokenizer *self, const char *text, size_t len,
                   Inversion *inversion);

     /* Returns an Obj declared `incremented`.  NOTE(review): presumably a
      * serializable representation that Load() accepts -- confirm. */
     public incremented Obj*
     Dump(RegexTokenizer *self);

     /* Takes an Obj `dump` and returns a RegexTokenizer declared
      * `incremented`.  NOTE(review): presumably the inverse of Dump() --
      * confirm. */
     public incremented RegexTokenizer*
     Load(RegexTokenizer *self, Obj *dump);

     /* Compares this tokenizer against an arbitrary Obj and reports the
      * result as a bool.  The comparison criteria are not visible in this
      * header. */
     public bool
     Equals(RegexTokenizer *self, Obj *other);

     /* NOTE(review): presumably releases `pattern` and `token_re` --
      * confirm in the implementation. */
     public void
     Destroy(RegexTokenizer *self);
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	parcel Lucy;

	/** Split a string into tokens.
	*
	* Generically, "tokenizing" is a process of breaking up a string into an
	* array of "tokens". For instance, the string "three blind mice" might be
	* tokenized into "three", "blind", "mice".
	*
	* Lucy::Analysis::RegexTokenizer decides where it should break up the text
	* based on a regular expression compiled from a supplied `pattern`
	* matching one token. If our source string is...
	*
	*     "Eats, Shoots and Leaves."
	*
	* ... then a "whitespace tokenizer" with a `pattern` of
	* `"\\S+"` produces...
	*
	*     Eats,
	*     Shoots
	*     and
	*     Leaves.
	*
	* ... while a "word character tokenizer" with a `pattern` of
	* `"\\w+"` produces...
	*
	*     Eats
	*     Shoots
	*     and
	*     Leaves
	*
	* ... the difference being that the word character tokenizer skips over
	* punctuation as well as whitespace when determining token boundaries.
	*/
	public class Lucy::Analysis::RegexTokenizer
	    inherits Lucy::Analysis::Analyzer {

	    /* Token pattern string.  NOTE(review): presumably the `pattern` handed
	     * to new()/init() -- confirm in the implementation. */
	    String *pattern;
	    /* Opaque handle, typed as void*.  NOTE(review): presumably the regular
	     * expression compiled from `pattern`; the concrete type is not visible
	     * in this header. */
	    void   *token_re;

	    /**
	     * @return true if RegexTokenizer is available in this build.
	     */
	    inert bool
	    is_available();

	    /** Create a new RegexTokenizer.
	     *
	     * @param pattern A string specifying a Perl-syntax regular expression
	     * which should match one token.  The default value is
	     * `\w+(?:[\x{2019}']\w+)*`, which matches "it's" as well as
	     * "it" and "O'Henry's" as well as "Henry".
	     */
	    public inert incremented RegexTokenizer*
	    new(String *pattern = NULL);

	    /** Initialize a RegexTokenizer.
	     *
	     * @param pattern A string specifying a Perl-syntax regular expression
	     * which should match one token.  The default value is
	     * `\w+(?:[\x{2019}']\w+)*`, which matches "it's" as well as
	     * "it" and "O'Henry's" as well as "Henry".
	     */
	    public inert RegexTokenizer*
	    init(RegexTokenizer *self, String *pattern = NULL);

	    /* Takes an Inversion and returns an Inversion declared `incremented`.
	     * NOTE(review): presumably overrides the Transform method of the
	     * parent Analyzer class -- confirm against Analyzer. */
	    public incremented Inversion*
	    Transform(RegexTokenizer *self, Inversion *inversion);

	    /* Takes a String and returns an Inversion declared `incremented`.
	     * NOTE(review): presumably the tokens found in `text` -- confirm in
	     * the implementation. */
	    public incremented Inversion*
	    Transform_Text(RegexTokenizer *self, String *text);

	    /** Tokenize the supplied string and add any Tokens generated to the
	     * supplied Inversion.
	     *
	     * `text` is passed as a pointer together with `len`, and `inversion`
	     * is passed by pointer.
	     */
	    void
	    Tokenize_Utf8(RegexTokenizer *self, const char *text, size_t len,
	                  Inversion *inversion);

	    /* Returns an Obj declared `incremented`.  NOTE(review): presumably a
	     * serializable representation that Load() accepts -- confirm. */
	    public incremented Obj*
	    Dump(RegexTokenizer *self);

	    /* Takes an Obj `dump` and returns a RegexTokenizer declared
	     * `incremented`.  NOTE(review): presumably the inverse of Dump() --
	     * confirm. */
	    public incremented RegexTokenizer*
	    Load(RegexTokenizer *self, Obj *dump);

	    /* Compares this tokenizer against an arbitrary Obj and reports the
	     * result as a bool.  The comparison criteria are not visible in this
	     * header. */
	    public bool
	    Equals(RegexTokenizer *self, Obj *other);

	    /* NOTE(review): presumably releases `pattern` and `token_re` --
	     * confirm in the implementation. */
	    public void
	    Destroy(RegexTokenizer *self);
	}