| using Lucene.Net.Analysis.CharFilters; |
| using Lucene.Net.Analysis.CommonGrams; |
| using Lucene.Net.Analysis.Miscellaneous; |
| using Lucene.Net.Analysis.NGram; |
| using Lucene.Net.Analysis.Shingle; |
| using Lucene.Net.Analysis.Util; |
| using Lucene.Net.Analysis.Wikipedia; |
| using NUnit.Framework; |
| using System; |
| using System.IO; |
| using Console = Lucene.Net.Util.SystemConsole; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Analysis.Core |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| [SuppressCodecs("Direct")] |
| public class TestBugInSomething : BaseTokenStreamTestCase |
| { |
| [Test] |
| public virtual void Test() |
| { |
| CharArraySet cas = new CharArraySet(TEST_VERSION_CURRENT, 3, false); |
            cas.Add("jjp");
            cas.Add("wlmwoknt");
            cas.Add("tcgyreo");
| |
| NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
| builder.Add("mtqlpi", ""); |
| builder.Add("mwoknt", "jjp"); |
| builder.Add("tcgyreo", "zpfpajyws"); |
| NormalizeCharMap map = builder.Build(); |
| |
| Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => |
| { |
| Tokenizer t = new MockTokenizer(new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader), MockTokenFilter.ENGLISH_STOPSET, false, -65); |
| TokenFilter f = new CommonGramsFilter(TEST_VERSION_CURRENT, t, cas); |
| return new TokenStreamComponents(t, f); |
| }, initReader: (fieldName, reader) => |
| { |
| reader = new MockCharFilter(reader, 0); |
| reader = new MappingCharFilter(map, reader); |
| return reader; |
| }); |
| CheckAnalysisConsistency(Random, a, false, "wmgddzunizdomqyj"); |
| } |
| |
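        // A deliberately unusable CharFilter: every member throws NotSupportedException
        // whose message names the member, so TestWrapping can verify that the wrapper
        // under test delegates each call instead of reading ahead or hiding it.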
| internal CharFilter wrappedStream = new CharFilterAnonymousClass(new StringReader("bogus")); |
| |
| private sealed class CharFilterAnonymousClass : CharFilter |
| { |
            public CharFilterAnonymousClass(StringReader input) : base(input)
| { |
| } |
| |
| |
| public override void Mark(int readAheadLimit) |
| { |
| throw new NotSupportedException("Mark(int)"); |
| } |
| |
| public override bool IsMarkSupported => throw new NotSupportedException("IsMarkSupported"); |
| |
| public override int Read() |
| { |
| throw new NotSupportedException("Read()"); |
| } |
| |
| // LUCENENET: We don't support these overloads in .NET |
            // public override int Read(char[] cbuf)
            // {
            //     throw new NotSupportedException("Read(char[])");
            // }

            // public override int Read(CharBuffer target)
            // {
            //     throw new NotSupportedException("Read(CharBuffer)");
            // }
| |
            public override bool IsReady => throw new NotSupportedException("IsReady");
| |
| public override void Reset() |
| { |
| throw new NotSupportedException("Reset()"); |
| } |
| |
| public override long Skip(int n) |
| { |
| throw new NotSupportedException("Skip(long)"); |
| } |
| |
| protected override int Correct(int currentOff) |
| { |
| throw new NotSupportedException("Correct(int)"); |
| } |
| |
| protected override void Dispose(bool disposing) |
| { |
| throw new NotSupportedException("Close()"); |
| } |
| |
| public override int Read(char[] arg0, int arg1, int arg2) |
| { |
| throw new NotSupportedException("Read(char[], int, int)"); |
| } |
| } |
| |
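        // Verifies that CheckThatYouDidntReadAnythingReaderWrapper forwards every member
        // straight through to the wrapped reader: each call below must surface the inner
        // NotSupportedException with the exact member-name message.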
| [Test] |
| public virtual void TestWrapping() |
| { |
| CharFilter cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream); |
| try |
| { |
| cs.Mark(1); |
| fail(); |
| } |
            catch (NotSupportedException e)
| { |
| assertEquals("Mark(int)", e.Message); |
| } |
| |
| try |
| { |
                _ = cs.IsMarkSupported; // the property getter itself should throw
| fail(); |
| } |
            catch (NotSupportedException e)
| { |
| assertEquals("IsMarkSupported", e.Message); |
| } |
| |
| try |
| { |
| cs.Read(); |
| fail(); |
| } |
            catch (NotSupportedException e)
| { |
| assertEquals("Read()", e.Message); |
| } |
| |
| try |
| { |
| cs.read(new char[0]); |
| fail(); |
| } |
            catch (NotSupportedException e)
| { |
                // LUCENENET NOTE: TextReader doesn't have a Read overload that omits the
                // index and count. The test environment provides an extension method that
                // does, but it cascades to Read(char[], int, int), so that's the message
                // we expect here.
                //assertEquals("Read(char[])", e.Message);
| assertEquals("Read(char[], int, int)", e.Message); |
| } |
| |
| // LUCENENET NOTE: We don't have a CharBuffer type in Lucene.Net, |
| // nor do we have an overload that accepts it. |
| //try |
| //{ |
| // cs.read(CharBuffer.wrap(new char[0])); |
| // fail(); |
| //} |
| //catch (Exception e) |
| //{ |
| // assertEquals("Read(CharBuffer)", e.Message); |
| //} |
| |
| try |
| { |
| cs.Reset(); |
| fail(); |
| } |
            catch (NotSupportedException e)
| { |
| assertEquals("Reset()", e.Message); |
| } |
| |
| try |
| { |
| cs.Skip(1); |
| fail(); |
| } |
            catch (NotSupportedException e)
            {
                assertEquals("Skip(int)", e.Message);
| } |
| |
| try |
| { |
| cs.CorrectOffset(1); |
| fail(); |
| } |
            catch (NotSupportedException e)
| { |
| assertEquals("Correct(int)", e.Message); |
| } |
| |
| try |
| { |
| cs.Dispose(); |
| fail(); |
| } |
            catch (NotSupportedException e)
            {
                assertEquals("Dispose(bool)", e.Message);
| } |
| |
| try |
| { |
| cs.Read(new char[0], 0, 0); |
| fail(); |
| } |
            catch (NotSupportedException e)
| { |
| assertEquals("Read(char[], int, int)", e.Message); |
| } |
| } |
| |
        // TODO: move SopTokenFilter into the test framework?
| |
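        // A pass-through TokenFilter that logs each token (via ReflectAsString) and each
        // lifecycle call (End/Dispose/Reset) to the console; handy for watching what an
        // analysis chain does, as in TestCuriousWikipediaString below.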
| internal sealed class SopTokenFilter : TokenFilter |
| { |
| |
| internal SopTokenFilter(TokenStream input) : base(input) |
| { |
| } |
| |
| public override bool IncrementToken() |
| { |
| if (m_input.IncrementToken()) |
| { |
| Console.WriteLine(m_input.GetType().Name + "->" + this.ReflectAsString(false)); |
| return true; |
| } |
| else |
| { |
| return false; |
| } |
| } |
| |
| public override void End() |
| { |
| base.End(); |
| Console.WriteLine(m_input.GetType().Name + ".end()"); |
| } |
| |
| protected override void Dispose(bool disposing) |
| { |
| base.Dispose(disposing); |
| if (disposing) |
| { |
| Console.WriteLine(m_input.GetType().Name + ".close()"); |
| } |
| } |
| |
| public override void Reset() |
| { |
| base.Reset(); |
| Console.WriteLine(m_input.GetType().Name + ".reset()"); |
| } |
| } |
| |
        // LUCENE-5269: shingle + n-gram chain over random Unicode input
| [Test] |
| [Slow] |
| public virtual void TestUnicodeShinglesAndNgrams() |
| { |
| Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => |
| { |
| Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94); |
| //TokenStream stream = new SopTokenFilter(tokenizer); |
| TokenStream stream = new ShingleFilter(tokenizer, 5); |
| //stream = new SopTokenFilter(stream); |
| stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83); |
| //stream = new SopTokenFilter(stream); |
| return new TokenStreamComponents(tokenizer, stream); |
| }); |
| CheckRandomData(Random, analyzer, 2000); |
| } |
| |
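        // Runs a WikipediaTokenizer + WordDelimiterFilter chain with a hard-coded
        // char-type table and flag set against input containing a surrogate pair and
        // "</p>" markup, checking that analysis stays consistent.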
| [Test] |
| public virtual void TestCuriousWikipediaString() |
| { |
| CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new JCG.HashSet<string> { "rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha" }, false); |
            // The char-type table below is signed byte data from the ported Java test;
            // the CLR allows the sbyte[] to be reinterpreted as byte[] by casting
            // through Array, even though C# disallows that conversion directly.
            byte[] table = (byte[])(Array)new sbyte[] { -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63, 5, 28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106, -22, -51, 65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71, -98, 44, 33, 86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104, -24, 106, -16, 126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64, -119, 0, 92, 94, -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20 };
| Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => |
| { |
| Tokenizer tokenizer = new WikipediaTokenizer(reader); |
| TokenStream stream = new SopTokenFilter(tokenizer); |
                stream = new WordDelimiterFilter(TEST_VERSION_CURRENT, stream, table, (WordDelimiterFlags)(-50), protWords);
| stream = new SopTokenFilter(stream); |
| return new TokenStreamComponents(tokenizer, stream); |
| }); |
| CheckAnalysisConsistency(Random, a, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p> jb"); |
| } |
| } |
| } |