solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.solr.schema;

 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;

 import org.apache.commons.io.IOUtils;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.analysis.util.ResourceLoader;
 import org.apache.lucene.collation.ICUCollationKeyAnalyzer;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.util.BytesRef;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.response.TextResponseWriter;
 import org.apache.solr.search.QParser;
 import org.apache.solr.uninverting.UninvertingReader.Type;

 import com.ibm.icu.text.Collator;
 import com.ibm.icu.text.RuleBasedCollator;
 import com.ibm.icu.util.ULocale;

 /**
  * Field for collated sort keys.
  * These can be used for locale-sensitive sort and range queries.
  * <p>
  * This field can be created in two ways:
  * <ul>
  *  <li>Based upon a system collator associated with a Locale.
  *  <li>Based upon a tailored ruleset.
  * </ul>
  * <p>
  * Using a System collator:
  * <ul>
  *  <li>locale: RFC 3066 locale ID (mandatory)
  *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
  *  <li>decomposition: 'no', or 'canonical' (optional)
  * </ul>
  * <p>
  * Using a Tailored ruleset:
  * <ul>
  *  <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
  *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
  *  <li>decomposition: 'no' or 'canonical' (optional)
  * </ul>
  * <p>
  * Expert options:
  * <ul>
  *  <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
  *  <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
  *  <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
  *  <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
  *  <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
  * </ul>
  *
  * @see Collator
  * @see ULocale
  * @see RuleBasedCollator
  */
 public class ICUCollationField extends FieldType {
   /** Collation-key analyzer built by {@code setup}; returned for both indexing and querying. */
   private Analyzer analyzer;

   /**
    * Initializes the field type: marks it tokenized, builds the collation analyzer from
    * {@code args}, then delegates to the superclass.
    *
    * @param schema schema whose resource loader is handed to {@code setup}
    * @param args field type arguments; the collation-specific entries are removed by
    *             {@code setup} before the map is passed to {@code super.init}
    */
   @Override
   protected void init(IndexSchema schema, Map<String,String> args) {
     properties |= TOKENIZED; // this ensures our analyzer gets hit
     setup(schema.getResourceLoader(), args);
     super.init(schema, args);
   }

   /**
    * Setup the field according to the provided parameters.
    * <p>
    * Consumes (removes) the collation arguments from {@code args}, creates the collator
    * from either {@code locale} or {@code custom}, applies the optional settings and
    * stores the resulting analyzer in {@link #analyzer}.
    *
    * @param loader used to open the {@code custom} rules file
    * @param args field type arguments; recognized keys are removed
    * @throws SolrException if neither or both of {@code custom}/{@code locale} are given,
    *         or if {@code strength}, {@code decomposition}, {@code alternate} or
    *         {@code caseFirst} has an unrecognized value
    */
   private void setup(ResourceLoader loader, Map<String,String> args) {
     String custom = args.remove("custom");
     String localeID = args.remove("locale");
     String strength = args.remove("strength");
     String decomposition = args.remove("decomposition");

     String alternate = args.remove("alternate");
     String caseLevel = args.remove("caseLevel");
     String caseFirst = args.remove("caseFirst");
     String numeric = args.remove("numeric");
     String variableTop = args.remove("variableTop");

     if (custom == null && localeID == null) {
       throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
     }

     if (custom != null && localeID != null) {
       throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
           + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
           + "Then save the entire customized ruleset to a file, and use with the custom parameter");
     }

     final Collator collator;

     if (localeID != null) {
       // create from a system collator, based on Locale.
       collator = createFromLocale(localeID);
     } else {
       // create from a custom ruleset
       collator = createFromRules(custom, loader);
     }

     // set the strength flag, otherwise it will be the default.
     if (strength != null) {
       if (strength.equalsIgnoreCase("primary")) {
         collator.setStrength(Collator.PRIMARY);
       } else if (strength.equalsIgnoreCase("secondary")) {
         collator.setStrength(Collator.SECONDARY);
       } else if (strength.equalsIgnoreCase("tertiary")) {
         collator.setStrength(Collator.TERTIARY);
       } else if (strength.equalsIgnoreCase("quaternary")) {
         collator.setStrength(Collator.QUATERNARY);
       } else if (strength.equalsIgnoreCase("identical")) {
         collator.setStrength(Collator.IDENTICAL);
       } else {
         throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
       }
     }

     // set the decomposition flag, otherwise it will be the default.
     if (decomposition != null) {
       if (decomposition.equalsIgnoreCase("no")) {
         collator.setDecomposition(Collator.NO_DECOMPOSITION);
       } else if (decomposition.equalsIgnoreCase("canonical")) {
         collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
       } else {
         throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
       }
     }

     // expert options: concrete subclasses are always a RuleBasedCollator
     RuleBasedCollator rbc = (RuleBasedCollator) collator;
     if (alternate != null) {
       if (alternate.equalsIgnoreCase("shifted")) {
         rbc.setAlternateHandlingShifted(true);
       } else if (alternate.equalsIgnoreCase("non-ignorable")) {
         rbc.setAlternateHandlingShifted(false);
       } else {
         throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
       }
     }
     if (caseLevel != null) {
       // Boolean.parseBoolean: anything other than "true" (ignoring case) means false
       rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
     }
     if (caseFirst != null) {
       if (caseFirst.equalsIgnoreCase("lower")) {
         rbc.setLowerCaseFirst(true);
       } else if (caseFirst.equalsIgnoreCase("upper")) {
         rbc.setUpperCaseFirst(true);
       } else {
         throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
       }
     }
     if (numeric != null) {
       // Boolean.parseBoolean: anything other than "true" (ignoring case) means false
       rbc.setNumericCollation(Boolean.parseBoolean(numeric));
     }
     if (variableTop != null) {
       rbc.setVariableTop(variableTop);
     }

     analyzer = new ICUCollationKeyAnalyzer(collator);
   }

   /**
    * Create a locale from localeID.
    * Then return the appropriate collator for the locale.
    *
    * @param localeID locale identifier passed to the {@link ULocale} constructor
    * @return the collator returned by {@link Collator#getInstance(ULocale)}
    */
   private Collator createFromLocale(String localeID) {
     return Collator.getInstance(new ULocale(localeID));
   }

   /**
    * Read custom rules from a file, and create a RuleBasedCollator
    * The file cannot support comments, as # might be in the rules!
    *
    * @param fileName resource name of the UTF-8 rules file
    * @param loader loader used to open {@code fileName}
    * @return a {@link RuleBasedCollator} built from the file contents
    * @throws SolrException (a {@code RuntimeException}) wrapping any failure to open or
    *         read the resource, or to build the collator from the rules
    */
   static Collator createFromRules(String fileName, ResourceLoader loader) {
     InputStream input = null;
     try {
       input = loader.openResource(fileName);
       String rules = IOUtils.toString(input, "UTF-8");
       return new RuleBasedCollator(rules);
     } catch (Exception e) {
       // io error or invalid rules; name the file so a misconfigured schema is diagnosable
       throw new SolrException(ErrorCode.SERVER_ERROR,
           "Unable to create collator from custom rules file: " + fileName, e);
     } finally {
       IOUtils.closeQuietly(input);
     }
   }

   /**
    * Writes the field's string value to the response under {@code name}.
    */
   @Override
   public void write(TextResponseWriter writer, String name, IndexableField f) throws IOException {
     writer.writeStr(name, f.stringValue(), true);
   }

   /**
    * Returns the sort field produced by {@code getStringSort} for this field.
    */
   @Override
   public SortField getSortField(SchemaField field, boolean top) {
     return getStringSort(field, top);
   }

   /**
    * Returns {@code SORTED_SET_BINARY} for multi-valued fields and {@code SORTED} otherwise.
    */
   @Override
   public Type getUninversionType(SchemaField sf) {
     if (sf.multiValued()) {
       return Type.SORTED_SET_BINARY;
     } else {
       return Type.SORTED;
     }
   }

   /** Returns the collation analyzer created during {@code init}. */
   @Override
   public Analyzer getIndexAnalyzer() {
     return analyzer;
   }

   /** Returns the same collation analyzer that is used for indexing. */
   @Override
   public Analyzer getQueryAnalyzer() {
     return analyzer;
   }

   /**
    * analyze the text with the analyzer, instead of the collator.
    * because icu collators are not thread safe, this keeps things
    * simple (we already have a threadlocal clone in the reused TS)
    *
    * @param field field name passed to the analyzer
    * @param text text to turn into a collation key
    * @return a private copy of the bytes of the single term the analyzer produced
    * @throws IllegalArgumentException if the analyzer yields no term, or more than one
    * @throws RuntimeException wrapping an {@link IOException} raised during analysis
    */
   private BytesRef getCollationKey(String field, String text) {
     try (TokenStream source = analyzer.tokenStream(field, text)) {
       source.reset();

       TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);

       // we control the analyzer here: most errors are impossible
       if (!source.incrementToken()) {
         throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
       }
       // copy: the attribute's BytesRef is read before the stream is advanced again
       BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
       // Always exhaust the stream before end(); doing this inside an assert would make
       // the sequence of calls on the stream depend on whether -ea is set.
       if (source.incrementToken()) {
         throw new IllegalArgumentException("analyzer returned too many terms for text: " + text);
       }

       source.end();
       return bytes;
     } catch (IOException e) {
       throw new RuntimeException("Unable to analyze text: " + text, e);
     }
   }

   /**
    * Builds a range query over collation keys.
    * <p>
    * Each non-null endpoint is converted to its collation key; a null endpoint stays null.
    * Fields that are not indexed but have DocValues get a
    * {@code SortedSetDocValuesField} slow range query; all others get a {@link TermRangeQuery}.
    */
   @Override
   protected Query getSpecializedRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
     String f = field.getName();
     BytesRef low = part1 == null ? null : getCollationKey(f, part1);
     BytesRef high = part2 == null ? null : getCollationKey(f, part2);
     if (!field.indexed() && field.hasDocValues()) {
       return SortedSetDocValuesField.newSlowRangeQuery(f, low, high, minInclusive, maxInclusive);
     } else {
       return new TermRangeQuery(f, low, high, minInclusive, maxInclusive);
     }
   }

   /** Intentionally empty: this field type supports DocValues. */
   @Override
   protected void checkSupportsDocValues() { // we support DocValues
   }

   /**
    * Creates the fields to add to a document for {@code value}.
    * <p>
    * Without DocValues this is just the result of {@code createField}. With DocValues a
    * second field carrying the collation key is appended: a {@link SortedSetDocValuesField}
    * for multi-valued fields, a {@link SortedDocValuesField} otherwise.
    */
   @Override
   public List<IndexableField> createFields(SchemaField field, Object value) {
     if (field.hasDocValues()) {
       List<IndexableField> fields = new ArrayList<>();
       fields.add(createField(field, value));
       final BytesRef bytes = getCollationKey(field.getName(), value.toString());
       if (field.multiValued()) {
         fields.add(new SortedSetDocValuesField(field.getName(), bytes));
       } else {
         fields.add(new SortedDocValuesField(field.getName(), bytes));
       }
       return fields;
     } else {
       return Collections.singletonList(createField(field, value));
     }
   }

   /** Delegates to {@code marshalBase64SortValue}. */
   @Override
   public Object marshalSortValue(Object value) {
     return marshalBase64SortValue(value);
   }

   /** Delegates to {@code unmarshalBase64SortValue}. */
   @Override
   public Object unmarshalSortValue(Object value) {
     return unmarshalBase64SortValue(value);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.solr.schema;

	import java.io.IOException;
	import java.io.InputStream;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;
	import java.util.Map;

	import org.apache.commons.io.IOUtils;
	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
	import org.apache.lucene.analysis.util.ResourceLoader;
	import org.apache.lucene.collation.ICUCollationKeyAnalyzer;
	import org.apache.lucene.document.SortedDocValuesField;
	import org.apache.lucene.document.SortedSetDocValuesField;
	import org.apache.lucene.index.IndexableField;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.SortField;
	import org.apache.lucene.search.TermRangeQuery;
	import org.apache.lucene.util.BytesRef;
	import org.apache.solr.common.SolrException.ErrorCode;
	import org.apache.solr.common.SolrException;
	import org.apache.solr.response.TextResponseWriter;
	import org.apache.solr.search.QParser;
	import org.apache.solr.uninverting.UninvertingReader.Type;

	import com.ibm.icu.text.Collator;
	import com.ibm.icu.text.RuleBasedCollator;
	import com.ibm.icu.util.ULocale;

	/**
	* Field for collated sort keys.
	* These can be used for locale-sensitive sort and range queries.
	* <p>
	* This field can be created in two ways:
	* <ul>
	* <li>Based upon a system collator associated with a Locale.
	* <li>Based upon a tailored ruleset.
	* </ul>
	* <p>
	* Using a System collator:
	* <ul>
	* <li>locale: RFC 3066 locale ID (mandatory)
	* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
	* <li>decomposition: 'no', or 'canonical' (optional)
	* </ul>
	* <p>
	* Using a Tailored ruleset:
	* <ul>
	* <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
	* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
	* <li>decomposition: 'no' or 'canonical' (optional)
	* </ul>
	* <p>
	* Expert options:
	* <ul>
	* <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
	* <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
	* <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
	* <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
	* <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
	* </ul>
	*
	* @see Collator
	* @see ULocale
	* @see RuleBasedCollator
	*/
	public class ICUCollationField extends FieldType {
	private Analyzer analyzer;

	@Override
	protected void init(IndexSchema schema, Map<String,String> args) {
	properties \|= TOKENIZED; // this ensures our analyzer gets hit
	setup(schema.getResourceLoader(), args);
	super.init(schema, args);
	}

	/**
	* Setup the field according to the provided parameters
	*/
	private void setup(ResourceLoader loader, Map<String,String> args) {
	String custom = args.remove("custom");
	String localeID = args.remove("locale");
	String strength = args.remove("strength");
	String decomposition = args.remove("decomposition");

	String alternate = args.remove("alternate");
	String caseLevel = args.remove("caseLevel");
	String caseFirst = args.remove("caseFirst");
	String numeric = args.remove("numeric");
	String variableTop = args.remove("variableTop");

	if (custom == null && localeID == null)
	throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");

	if (custom != null && localeID != null)
	throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
	+ "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
	+ "Then save the entire customized ruleset to a file, and use with the custom parameter");

	final Collator collator;

	if (localeID != null) {
	// create from a system collator, based on Locale.
	collator = createFromLocale(localeID);
	} else {
	// create from a custom ruleset
	collator = createFromRules(custom, loader);
	}

	// set the strength flag, otherwise it will be the default.
	if (strength != null) {
	if (strength.equalsIgnoreCase("primary"))
	collator.setStrength(Collator.PRIMARY);
	else if (strength.equalsIgnoreCase("secondary"))
	collator.setStrength(Collator.SECONDARY);
	else if (strength.equalsIgnoreCase("tertiary"))
	collator.setStrength(Collator.TERTIARY);
	else if (strength.equalsIgnoreCase("quaternary"))
	collator.setStrength(Collator.QUATERNARY);
	else if (strength.equalsIgnoreCase("identical"))
	collator.setStrength(Collator.IDENTICAL);
	else
	throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
	}

	// set the decomposition flag, otherwise it will be the default.
	if (decomposition != null) {
	if (decomposition.equalsIgnoreCase("no"))
	collator.setDecomposition(Collator.NO_DECOMPOSITION);
	else if (decomposition.equalsIgnoreCase("canonical"))
	collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
	else
	throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
	}

	// expert options: concrete subclasses are always a RuleBasedCollator
	RuleBasedCollator rbc = (RuleBasedCollator) collator;
	if (alternate != null) {
	if (alternate.equalsIgnoreCase("shifted")) {
	rbc.setAlternateHandlingShifted(true);
	} else if (alternate.equalsIgnoreCase("non-ignorable")) {
	rbc.setAlternateHandlingShifted(false);
	} else {
	throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
	}
	}
	if (caseLevel != null) {
	rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
	}
	if (caseFirst != null) {
	if (caseFirst.equalsIgnoreCase("lower")) {
	rbc.setLowerCaseFirst(true);
	} else if (caseFirst.equalsIgnoreCase("upper")) {
	rbc.setUpperCaseFirst(true);
	} else {
	throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
	}
	}
	if (numeric != null) {
	rbc.setNumericCollation(Boolean.parseBoolean(numeric));
	}
	if (variableTop != null) {
	rbc.setVariableTop(variableTop);
	}

	analyzer = new ICUCollationKeyAnalyzer(collator);
	}

	/**
	* Create a locale from localeID.
	* Then return the appropriate collator for the locale.
	*/
	private Collator createFromLocale(String localeID) {
	return Collator.getInstance(new ULocale(localeID));
	}

	/**
	* Read custom rules from a file, and create a RuleBasedCollator
	* The file cannot support comments, as # might be in the rules!
	*/
	static Collator createFromRules(String fileName, ResourceLoader loader) {
	InputStream input = null;
	try {
	input = loader.openResource(fileName);
	String rules = IOUtils.toString(input, "UTF-8");
	return new RuleBasedCollator(rules);
	} catch (Exception e) {
	// io error or invalid rules
	throw new RuntimeException(e);
	} finally {
	IOUtils.closeQuietly(input);
	}
	}

	@Override
	public void write(TextResponseWriter writer, String name, IndexableField f) throws IOException {
	writer.writeStr(name, f.stringValue(), true);
	}

	@Override
	public SortField getSortField(SchemaField field, boolean top) {
	return getStringSort(field, top);
	}

	@Override
	public Type getUninversionType(SchemaField sf) {
	if (sf.multiValued()) {
	return Type.SORTED_SET_BINARY;
	} else {
	return Type.SORTED;
	}
	}

	@Override
	public Analyzer getIndexAnalyzer() {
	return analyzer;
	}

	@Override
	public Analyzer getQueryAnalyzer() {
	return analyzer;
	}

	/**
	* analyze the text with the analyzer, instead of the collator.
	* because icu collators are not thread safe, this keeps things
	* simple (we already have a threadlocal clone in the reused TS)
	*/
	private BytesRef getCollationKey(String field, String text) {
	try (TokenStream source = analyzer.tokenStream(field, text)) {
	source.reset();

	TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);


	// we control the analyzer here: most errors are impossible
	if (!source.incrementToken())
	throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
	BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
	assert !source.incrementToken();

	source.end();
	return bytes;
	} catch (IOException e) {
	throw new RuntimeException("Unable to analyze text: " + text, e);
	}
	}

	@Override
	protected Query getSpecializedRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
	String f = field.getName();
	BytesRef low = part1 == null ? null : getCollationKey(f, part1);
	BytesRef high = part2 == null ? null : getCollationKey(f, part2);
	if (!field.indexed() && field.hasDocValues()) {
	return SortedSetDocValuesField.newSlowRangeQuery(
	field.getName(), low, high, minInclusive, maxInclusive);
	} else {
	return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive);
	}
	}

	@Override
	protected void checkSupportsDocValues() { // we support DocValues
	}

	@Override
	public List<IndexableField> createFields(SchemaField field, Object value) {
	if (field.hasDocValues()) {
	List<IndexableField> fields = new ArrayList<>();
	fields.add(createField(field, value));
	final BytesRef bytes = getCollationKey(field.getName(), value.toString());
	if (field.multiValued()) {
	fields.add(new SortedSetDocValuesField(field.getName(), bytes));
	} else {
	fields.add(new SortedDocValuesField(field.getName(), bytes));
	}
	return fields;
	} else {
	return Collections.singletonList(createField(field, value));
	}
	}

	@Override
	public Object marshalSortValue(Object value) {
	return marshalBase64SortValue(value);
	}

	@Override
	public Object unmarshalSortValue(Object value) {
	return unmarshalBase64SortValue(value);
	}
	}