| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.synonym; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.LineNumberReader; |
| import java.io.Reader; |
| import java.text.ParseException; |
| import java.util.ArrayList; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.CharsRefBuilder; |
| |
| /** |
| * Parser for the Solr synonyms format. |
| * |
| * <ol> |
| * <li>Blank lines and lines starting with '#' are comments. |
| * <li>Explicit mappings match any token sequence on the LHS of "=>" and replace with all |
| * alternatives on the RHS. These types of mappings ignore the expand parameter in the |
| * constructor. Example: |
| * <blockquote> |
| * i-pod, i pod => ipod |
| * </blockquote> |
| * <li>Equivalent synonyms may be separated with commas and give no explicit mapping. In this case |
| * the mapping behavior will be taken from the expand parameter in the constructor. This |
| * allows the same synonym file to be used in different synonym handling strategies. Example: |
| * <blockquote> |
| * ipod, i-pod, i pod |
| * </blockquote> |
| * <li>Multiple synonym mapping entries are merged. Example: |
| * <blockquote> |
| * foo => foo bar<br> |
| * foo => baz<br> |
| * <br> |
| * is equivalent to<br> |
| * <br> |
| * foo => foo bar, baz |
| * </blockquote> |
| * </ol> |
| * |
| * @lucene.experimental |
| */ |
| public class SolrSynonymParser extends SynonymMap.Parser { |
| private final boolean expand; |
| |
| public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { |
| super(dedup, analyzer); |
| this.expand = expand; |
| } |
| |
| @Override |
| public void parse(Reader in) throws IOException, ParseException { |
| LineNumberReader br = new LineNumberReader(in); |
| try { |
| addInternal(br); |
| } catch (IllegalArgumentException e) { |
| ParseException ex = |
| new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0); |
| ex.initCause(e); |
| throw ex; |
| } finally { |
| br.close(); |
| } |
| } |
| |
| private void addInternal(BufferedReader in) throws IOException { |
| String line = null; |
| while ((line = in.readLine()) != null) { |
| if (line.length() == 0 || line.charAt(0) == '#') { |
| continue; // ignore empty lines and comments |
| } |
| |
| // TODO: we could process this more efficiently. |
| String sides[] = split(line, "=>"); |
| if (sides.length > 1) { // explicit mapping |
| if (sides.length != 2) { |
| throw new IllegalArgumentException( |
| "more than one explicit mapping specified on the same line"); |
| } |
| String inputStrings[] = split(sides[0], ","); |
| CharsRef[] inputs = new CharsRef[inputStrings.length]; |
| for (int i = 0; i < inputs.length; i++) { |
| inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRefBuilder()); |
| } |
| |
| String outputStrings[] = split(sides[1], ","); |
| CharsRef[] outputs = new CharsRef[outputStrings.length]; |
| for (int i = 0; i < outputs.length; i++) { |
| outputs[i] = analyze(unescape(outputStrings[i]).trim(), new CharsRefBuilder()); |
| } |
| // these mappings are explicit and never preserve original |
| for (int i = 0; i < inputs.length; i++) { |
| for (int j = 0; j < outputs.length; j++) { |
| add(inputs[i], outputs[j], false); |
| } |
| } |
| } else { |
| String inputStrings[] = split(line, ","); |
| CharsRef[] inputs = new CharsRef[inputStrings.length]; |
| for (int i = 0; i < inputs.length; i++) { |
| inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRefBuilder()); |
| } |
| if (expand) { |
| // all pairs |
| for (int i = 0; i < inputs.length; i++) { |
| for (int j = 0; j < inputs.length; j++) { |
| if (i != j) { |
| add(inputs[i], inputs[j], true); |
| } |
| } |
| } |
| } else { |
| // all subsequent inputs map to first one; we also add inputs[0] here |
| // so that we "effectively" (because we remove the original input and |
| // add back a synonym with the same text) change that token's type to |
| // SYNONYM (matching legacy behavior): |
| for (int i = 0; i < inputs.length; i++) { |
| add(inputs[i], inputs[0], false); |
| } |
| } |
| } |
| } |
| } |
| |
| private static String[] split(String s, String separator) { |
| ArrayList<String> list = new ArrayList<>(2); |
| StringBuilder sb = new StringBuilder(); |
| int pos = 0, end = s.length(); |
| while (pos < end) { |
| if (s.startsWith(separator, pos)) { |
| if (sb.length() > 0) { |
| list.add(sb.toString()); |
| sb = new StringBuilder(); |
| } |
| pos += separator.length(); |
| continue; |
| } |
| |
| char ch = s.charAt(pos++); |
| if (ch == '\\') { |
| sb.append(ch); |
| if (pos >= end) break; // ERROR, or let it go? |
| ch = s.charAt(pos++); |
| } |
| |
| sb.append(ch); |
| } |
| |
| if (sb.length() > 0) { |
| list.add(sb.toString()); |
| } |
| |
| return list.toArray(new String[list.size()]); |
| } |
| |
| private String unescape(String s) { |
| if (s.indexOf("\\") >= 0) { |
| StringBuilder sb = new StringBuilder(); |
| for (int i = 0; i < s.length(); i++) { |
| char ch = s.charAt(i); |
| if (ch == '\\' && i < s.length() - 1) { |
| sb.append(s.charAt(++i)); |
| } else { |
| sb.append(ch); |
| } |
| } |
| return sb.toString(); |
| } |
| return s; |
| } |
| } |