src/java/org/apache/solr/analysis/PatternTokenizerFactory.java - lucene - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.solr.analysis;

 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.solr.common.SolrException;


 /**
  * This tokenizer uses regex pattern matching to construct distinct tokens
  * for the input stream.  It takes two arguments:  "pattern" and "group".
  * <p/>
  * <ul>
  * <li>"pattern" is the regular expression.</li>
  * <li>"group" says which group to extract into tokens.</li>
  *  </ul>
  * <p>
  * group=-1 (the default) is equivalent to "split".  In this case, the tokens will
  * be equivalent to the output from (without empty tokens):
  * {@link String#split(java.lang.String)}
  * </p>
  * <p>
  * Using group >= 0 selects the matching group as the token.  For example, if you have:<br/>
  * <pre>
  *  pattern = \'([^\']+)\'
  *  group = 0
  *  input = aaa 'bbb' 'ccc'
  *</pre>
  * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks).  With the same input
  * but using group=1, the output would be: bbb and ccc (no ' marks)
  * </p>
  * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
  *
  * @see PatternTokenizer
  * @since solr1.2
  * @version $Id:$
  */
 public class PatternTokenizerFactory extends BaseTokenizerFactory
 {
   public static final String PATTERN = "pattern";
   public static final String GROUP = "group";

   protected Map<String,String> args;
   protected Pattern pattern;
   protected int group;

   /**
    * Require a configured pattern
    */
   @Override
   public void init(Map<String,String> args)
   {
     this.args = args;
     String regex = args.get( PATTERN );
     if( regex == null ) {
       throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "missing required argument: "+PATTERN );
     }
     int flags = 0; // TODO? -- read flags from config CASE_INSENSITIVE, etc
     pattern = Pattern.compile( regex, flags );

     group = -1;  // use 'split'
     String g = args.get( GROUP );
     if( g != null ) {
       try {
         group = Integer.parseInt( g );
       }
       catch( Exception ex ) {
         throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "invalid group argument: "+g );
       }
     }
   }

   /**
    * Split the input using configured pattern
    */
   public Tokenizer create(final Reader in) {
     try {
       return new PatternTokenizer(in, pattern, group);
     } catch( IOException ex ) {
       throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
     }
   }

   /**
    * This behaves just like String.split( ), but returns a list of Tokens
    * rather then an array of strings
    * NOTE: This method is not used in 1.4.
    * @deprecated
    */
   @Deprecated
   public static List<Token> split( Matcher matcher, String input )
   {
     int index = 0;
     int lastNonEmptySize = Integer.MAX_VALUE;
     ArrayList<Token> matchList = new ArrayList<Token>();

     // Add segments before each match found
     while(matcher.find()) {
       String match = input.subSequence(index, matcher.start()).toString();
       matchList.add( new Token( match, index, matcher.start()) );
       index = matcher.end();
       if( match.length() > 0 ) {
         lastNonEmptySize = matchList.size();
       }
     }

     // If no match is found, return the full string
     if (index == 0) {
       matchList.add( new Token( input, 0, input.length()) );
     }
     else {
       String match = input.subSequence(index, input.length()).toString();
       matchList.add( new Token( match, index, input.length()) );
       if( match.length() > 0 ) {
         lastNonEmptySize = matchList.size();
       }
     }

     // Don't use trailing empty strings.  This behavior matches String.split();
     if( lastNonEmptySize < matchList.size() ) {
       return matchList.subList( 0, lastNonEmptySize );
     }
     return matchList;
   }

   /**
    * Create tokens from the matches in a matcher
    * NOTE: This method is not used in 1.4.
    * @deprecated
    */
   @Deprecated
   public static List<Token> group( Matcher matcher, String input, int group )
   {
     ArrayList<Token> matchList = new ArrayList<Token>();
     while(matcher.find()) {
       Token t = new Token(
         matcher.group(group),
         matcher.start(group),
         matcher.end(group) );
       matchList.add( t );
     }
     return matchList;
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.solr.analysis;

	import java.io.IOException;
	import java.io.Reader;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.Map;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import org.apache.lucene.analysis.Token;
	import org.apache.lucene.analysis.Tokenizer;
	import org.apache.solr.common.SolrException;


	/**
	* This tokenizer uses regex pattern matching to construct distinct tokens
	* for the input stream. It takes two arguments: "pattern" and "group".
	* <p/>
	* <ul>
	* <li>"pattern" is the regular expression.</li>
	* <li>"group" says which group to extract into tokens.</li>
	* </ul>
	* <p>
	* group=-1 (the default) is equivalent to "split". In this case, the tokens will
	* be equivalent to the output from (without empty tokens):
	* {@link String#split(java.lang.String)}
	* </p>
	* <p>
	* Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
	* <pre>
	* pattern = \'([^\']+)\'
	* group = 0
	* input = aaa 'bbb' 'ccc'
	*</pre>
	* the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
	* but using group=1, the output would be: bbb and ccc (no ' marks)
	* </p>
	* <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
	*
	* @see PatternTokenizer
	* @since solr1.2
	* @version $Id:$
	*/
	public class PatternTokenizerFactory extends BaseTokenizerFactory
	{
	public static final String PATTERN = "pattern";
	public static final String GROUP = "group";

	protected Map<String,String> args;
	protected Pattern pattern;
	protected int group;

	/**
	* Require a configured pattern
	*/
	@Override
	public void init(Map<String,String> args)
	{
	this.args = args;
	String regex = args.get( PATTERN );
	if( regex == null ) {
	throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "missing required argument: "+PATTERN );
	}
	int flags = 0; // TODO? -- read flags from config CASE_INSENSITIVE, etc
	pattern = Pattern.compile( regex, flags );

	group = -1; // use 'split'
	String g = args.get( GROUP );
	if( g != null ) {
	try {
	group = Integer.parseInt( g );
	}
	catch( Exception ex ) {
	throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "invalid group argument: "+g );
	}
	}
	}

	/**
	* Split the input using configured pattern
	*/
	public Tokenizer create(final Reader in) {
	try {
	return new PatternTokenizer(in, pattern, group);
	} catch( IOException ex ) {
	throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
	}
	}

	/**
	* This behaves just like String.split( ), but returns a list of Tokens
	* rather then an array of strings
	* NOTE: This method is not used in 1.4.
	* @deprecated
	*/
	@Deprecated
	public static List<Token> split( Matcher matcher, String input )
	{
	int index = 0;
	int lastNonEmptySize = Integer.MAX_VALUE;
	ArrayList<Token> matchList = new ArrayList<Token>();

	// Add segments before each match found
	while(matcher.find()) {
	String match = input.subSequence(index, matcher.start()).toString();
	matchList.add( new Token( match, index, matcher.start()) );
	index = matcher.end();
	if( match.length() > 0 ) {
	lastNonEmptySize = matchList.size();
	}
	}

	// If no match is found, return the full string
	if (index == 0) {
	matchList.add( new Token( input, 0, input.length()) );
	}
	else {
	String match = input.subSequence(index, input.length()).toString();
	matchList.add( new Token( match, index, input.length()) );
	if( match.length() > 0 ) {
	lastNonEmptySize = matchList.size();
	}
	}

	// Don't use trailing empty strings. This behavior matches String.split();
	if( lastNonEmptySize < matchList.size() ) {
	return matchList.subList( 0, lastNonEmptySize );
	}
	return matchList;
	}

	/**
	* Create tokens from the matches in a matcher
	* NOTE: This method is not used in 1.4.
	* @deprecated
	*/
	@Deprecated
	public static List<Token> group( Matcher matcher, String input, int group )
	{
	ArrayList<Token> matchList = new ArrayList<Token>();
	while(matcher.find()) {
	Token t = new Token(
	matcher.group(group),
	matcher.start(group),
	matcher.end(group) );
	matchList.add( t );
	}
	return matchList;
	}
	}