| using Lucene.Net.Util; |
| using System; |
| using System.Collections.Generic; |
| using System.IO; |
| using System.Text; |
| |
| namespace Lucene.Net.Analysis.Synonym |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// Parser for the Solr synonyms format.
/// <list type="bullet">
///     <item><description> Blank lines and lines starting with '#' are comments.</description></item>
///     <item><description> Explicit mappings match any token sequence on the LHS of "=>"
///         and replace with all alternatives on the RHS.  These types of mappings
///         ignore the expand parameter in the constructor.
///         Example:
///         <code>i-pod, i pod => ipod</code>
///     </description></item>
///     <item><description> Equivalent synonyms may be separated with commas and give
///         no explicit mapping.  In this case the mapping behavior will
///         be taken from the expand parameter in the constructor.  This allows
///         the same synonym file to be used in different synonym handling strategies.
///         Example:
///         <code>ipod, i-pod, i pod</code>
///     </description></item>
///     <item><description> Multiple synonym mapping entries are merged.
///         Example:
///         <code>
///             foo => foo bar
///             foo => baz
///             is equivalent to
///             foo => foo bar, baz
///         </code>
///     </description></item>
/// </list>
/// @lucene.experimental
/// </summary>
public class SolrSynonymParser : SynonymMap.Parser
{
    // When true, a line without "=>" maps every term to every term on that line;
    // when false, every term on the line maps to the first term only.
    private readonly bool expand;

    /// <summary>
    /// Creates a parser for the Solr synonyms format.
    /// </summary>
    /// <param name="dedup">Forwarded unchanged to the <see cref="SynonymMap.Parser"/> base constructor.</param>
    /// <param name="expand">
    /// Controls lines that have no explicit "=>" mapping: <c>true</c> maps every term on the line
    /// to every term on the line, <c>false</c> maps every term on the line to the first term only.
    /// </param>
    /// <param name="analyzer">Forwarded unchanged to the <see cref="SynonymMap.Parser"/> base constructor.</param>
    public SolrSynonymParser(bool dedup, bool expand, Analyzer analyzer)
        : base(dedup, analyzer)
    {
        this.expand = expand;
    }

    /// <summary>
    /// Reads synonym rules line by line from <paramref name="in"/> and registers every
    /// resulting input/output pair through <c>Add</c>. The reader is disposed before this
    /// method returns, whether parsing succeeded or failed.
    /// </summary>
    /// <param name="in">Source of the synonym rules; must not be <c>null</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="in"/> is <c>null</c>.</exception>
    /// <exception cref="Exception">
    /// An <see cref="ArgumentException"/> was raised while processing a line. The message carries the
    /// 1-based line number and the original exception is available as the inner exception.
    /// </exception>
    public override void Parse(TextReader @in)
    {
        if (@in == null)
        {
            throw new ArgumentNullException("in");
        }

        int lineNumber = 0;
        try
        {
            string line;
            while ((line = @in.ReadLine()) != null)
            {
                lineNumber++;
                if (line.Length == 0 || line[0] == '#')
                {
                    continue; // ignore empty lines and comments
                }

                CharsRef[] inputs;
                CharsRef[] outputs;

                // TODO: we could process this more efficiently.
                string[] sides = Split(line, "=>");
                if (sides.Length > 1) // explicit mapping
                {
                    if (sides.Length != 2)
                    {
                        throw new ArgumentException("more than one explicit mapping specified on the same line");
                    }
                    inputs = AnalyzeTerms(sides[0]);
                    outputs = AnalyzeTerms(sides[1]);
                }
                else
                {
                    inputs = AnalyzeTerms(line);
                    if (expand || inputs.Length == 0)
                    {
                        // Either every term maps to every term, or the line held no terms at all
                        // (e.g. only commas). In the latter case nothing is added, so the same file
                        // behaves the same way regardless of the expand setting.
                        outputs = inputs;
                    }
                    else
                    {
                        // Collapse every term on the line to the first one.
                        outputs = new CharsRef[] { inputs[0] };
                    }
                }

                // currently we include the term itself in the map,
                // and use includeOrig = false always.
                // this is how the existing filter does it, but it's actually a bug,
                // especially if combined with ignoreCase = true
                for (int i = 0; i < inputs.Length; i++)
                {
                    for (int j = 0; j < outputs.Length; j++)
                    {
                        Add(inputs[i], outputs[j], false);
                    }
                }
            }
        }
        catch (ArgumentException e)
        {
            // Re-throw with the offending line number; the cause is preserved as InnerException.
            throw new Exception("Invalid synonym rule at line " + lineNumber, e);
        }
        finally
        {
            @in.Dispose();
        }
    }

    /// <summary>
    /// Splits <paramref name="commaSeparated"/> on unescaped commas, then unescapes, trims and
    /// analyzes each piece, passing a fresh <see cref="CharsRef"/> to <c>Analyze</c> for every term.
    /// </summary>
    /// <param name="commaSeparated">One side of a rule, or a whole line of equivalent synonyms.</param>
    /// <returns>One analyzed term per non-empty piece, in source order; empty if there were none.</returns>
    private CharsRef[] AnalyzeTerms(string commaSeparated)
    {
        string[] terms = Split(commaSeparated, ",");
        CharsRef[] analyzed = new CharsRef[terms.Length];
        for (int i = 0; i < analyzed.Length; i++)
        {
            analyzed[i] = Analyze(Unescape(terms[i]).Trim(), new CharsRef());
        }
        return analyzed;
    }

    /// <summary>
    /// Splits <paramref name="s"/> on every unescaped occurrence of <paramref name="separator"/>.
    /// Empty pieces are dropped. A backslash and the character following it are copied to the
    /// output verbatim (the backslash is kept), so an escaped separator does not split.
    /// A lone trailing backslash is kept as-is.
    /// </summary>
    /// <param name="s">The text to split.</param>
    /// <param name="separator">The separator to split on; compared ordinally.</param>
    /// <returns>The non-empty pieces, in order of appearance.</returns>
    private static string[] Split(string s, string separator)
    {
        List<string> list = new List<string>(2);
        StringBuilder sb = new StringBuilder();
        int pos = 0, end = s.Length;
        while (pos < end)
        {
            // Ordinal "starts with separator at pos", done in place: no substring is allocated
            // per character. If fewer than separator.Length chars remain the comparison is non-zero.
            if (string.CompareOrdinal(s, pos, separator, 0, separator.Length) == 0)
            {
                if (sb.Length > 0)
                {
                    list.Add(sb.ToString());
                    sb.Clear();
                }
                pos += separator.Length;
                continue;
            }

            char ch = s[pos++];
            if (ch == '\\')
            {
                // Keep the backslash; Unescape removes it later.
                sb.Append(ch);
                if (pos >= end) // ERROR, or let it go?
                {
                    break;
                }
                // The escaped character is copied without being checked against the separator.
                ch = s[pos++];
            }

            sb.Append(ch);
        }

        if (sb.Length > 0)
        {
            list.Add(sb.ToString());
        }

        return list.ToArray();
    }

    /// <summary>
    /// Removes each escaping backslash from <paramref name="s"/>, keeping the character it escapes.
    /// A backslash that is the very last character is kept unchanged.
    /// </summary>
    /// <param name="s">The text to unescape.</param>
    /// <returns>The unescaped text, or <paramref name="s"/> itself when it contains no backslash.</returns>
    private static string Unescape(string s)
    {
        if (s.IndexOf('\\') < 0)
        {
            return s;
        }

        StringBuilder sb = new StringBuilder(s.Length);
        for (int i = 0; i < s.Length; i++)
        {
            char ch = s[i];
            if (ch == '\\' && i < s.Length - 1)
            {
                // Skip the backslash and emit the escaped character.
                sb.Append(s[++i]);
            }
            else
            {
                sb.Append(ch);
            }
        }
        return sb.ToString();
    }
}
| } |