Ported Analysis.Miscellaneous.PatternAnalyzer + tests

commit: 7f877fdfc2ba25a7c1b0386795b4f83b46f50767 [log] [tgz]
author: Shad Storhaug <shad@shadstorhaug.com> Sun Aug 28 00:11:18 2016 +0700
committer: Shad Storhaug <shad@shadstorhaug.com> Sun Aug 28 11:24:43 2016 +0700
tree: 2b29a81186bf36b9be484427d4adc0a21b4461f8
parent: 8a05b1682b9d1405ed71d42e75d06a296736641d [diff]
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzer.cs
index 6c28927..933e714 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzer.cs

@@ -1,13 +1,14 @@
-using System;
-using System.IO;
-using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Tokenattributes;
 using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
 using Lucene.Net.Util;
+using System;
+using System.IO;
+using System.Text.RegularExpressions;
 
 namespace Lucene.Net.Analysis.Miscellaneous
 {
-
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
@@ -24,6 +25,7 @@
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */
+
     /// <summary>
     /// Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
     /// <seealso cref="TextReader"/>, that can flexibly separate text into terms via a regular expression <seealso cref="Pattern"/>
@@ -61,19 +63,65 @@
 
         /// <summary>
         /// <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) </summary>
-        public static readonly Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
+        public static readonly Regex NON_WORD_PATTERN = new Regex("\\W+", RegexOptions.Compiled);
 
         /// <summary>
         /// <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) </summary>
-        public static readonly Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
+        public static readonly Regex WHITESPACE_PATTERN = new Regex("\\s+", RegexOptions.Compiled);
 
-        private static readonly CharArraySet EXTENDED_ENGLISH_STOP_WORDS = CharArraySet.UnmodifiableSet(new CharArraySet(LuceneVersion.LUCENE_CURRENT, Arrays.asList("a", "about", "above", "across", "adj", "after", "afterwards", "again", "against", "albeit", "all", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anywhere", "are", "around", "as", "at", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "co", "could", "down", "during", "each", "eg", "either", "else", "elsewhere", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "first", "for", "former", "formerly", "from", "further", "had", "has", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last", "latter", "latterly", "least", "less", "ltd", "many", "may", "me", "meanwhile", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "namely", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps", "rather", "s", "same", "seem", "seemed", "seeming", "seems", "several", "she", "should", "since", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "t", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefor", "therein", "thereupon", "these", "they", "this", "those", "though", "through", "throughout", "thru", "thus", "to", "together", "too", "toward", "towards", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", "where", "whereafter", "whereas", "whereat", "whereby", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "whereto", "whereunto", "whereupon", "wherever", "wherewith", "whether", "which", "whichever", "whichsoever", "while", "whilst", "whither", "who", "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever", "why", "will", "with", "within", "without", "would", "xsubj", "xcal", "xauthor", "xother ", "xnote", "yet", "you", "your", "yours", "yourself", "yourselves"), true));
+        private static readonly CharArraySet EXTENDED_ENGLISH_STOP_WORDS = 
+            CharArraySet.UnmodifiableSet(new CharArraySet(LuceneVersion.LUCENE_CURRENT, 
+                Arrays.AsList(
+                    "a", "about", "above", "across", "adj", "after", "afterwards",
+                    "again", "against", "albeit", "all", "almost", "alone", "along",
+                    "already", "also", "although", "always", "among", "amongst", "an",
+                    "and", "another", "any", "anyhow", "anyone", "anything",
+                    "anywhere", "are", "around", "as", "at", "be", "became", "because",
+                    "become", "becomes", "becoming", "been", "before", "beforehand",
+                    "behind", "being", "below", "beside", "besides", "between",
+                    "beyond", "both", "but", "by", "can", "cannot", "co", "could",
+                    "down", "during", "each", "eg", "either", "else", "elsewhere",
+                    "enough", "etc", "even", "ever", "every", "everyone", "everything",
+                    "everywhere", "except", "few", "first", "for", "former",
+                    "formerly", "from", "further", "had", "has", "have", "he", "hence",
+                    "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
+                    "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
+                    "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
+                    "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
+                    "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
+                    "must", "my", "myself", "namely", "neither", "never",
+                    "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
+                    "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
+                    "once one", "only", "onto", "or", "other", "others", "otherwise",
+                    "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
+                    "rather", "s", "same", "seem", "seemed", "seeming", "seems",
+                    "several", "she", "should", "since", "so", "some", "somehow",
+                    "someone", "something", "sometime", "sometimes", "somewhere",
+                    "still", "such", "t", "than", "that", "the", "their", "them",
+                    "themselves", "then", "thence", "there", "thereafter", "thereby",
+                    "therefor", "therein", "thereupon", "these", "they", "this",
+                    "those", "though", "through", "throughout", "thru", "thus", "to",
+                    "together", "too", "toward", "towards", "under", "until", "up",
+                    "upon", "us", "very", "via", "was", "we", "well", "were", "what",
+                    "whatever", "whatsoever", "when", "whence", "whenever",
+                    "whensoever", "where", "whereafter", "whereas", "whereat",
+                    "whereby", "wherefrom", "wherein", "whereinto", "whereof",
+                    "whereon", "whereto", "whereunto", "whereupon", "wherever",
+                    "wherewith", "whether", "which", "whichever", "whichsoever",
+                    "while", "whilst", "whither", "who", "whoever", "whole", "whom",
+                    "whomever", "whomsoever", "whose", "whosoever", "why", "will",
+                    "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
+                    "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
+                    "yourselves"
+                
+                    ), true));
 
         /// <summary>
         /// A lower-casing word analyzer with English stop words (can be shared
         /// freely across threads without harm); global per class loader.
         /// </summary>
-        public static readonly PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(LuceneVersion.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        public static readonly PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
+            LuceneVersion.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
 
         /// <summary>
         /// A lower-casing word analyzer with <b>extended </b> English stop words
@@ -82,9 +130,10 @@
         /// http://thomas.loc.gov/home/stopwords.html, see
         /// http://thomas.loc.gov/home/all.about.inquery.html
         /// </summary>
-        public static readonly PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(LuceneVersion.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
+        public static readonly PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
+            LuceneVersion.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
 
-        private readonly Pattern pattern;
+        private readonly Regex pattern;
         private readonly bool toLowerCase;
         private readonly CharArraySet stopWords;
 
@@ -108,23 +157,23 @@
         ///            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")</code>
         ///            or <a href="http://www.unine.ch/info/clef/">other stop words
         ///            lists </a>. </param>
-        public PatternAnalyzer(LuceneVersion matchVersion, Pattern pattern, bool toLowerCase, CharArraySet stopWords)
+        public PatternAnalyzer(LuceneVersion matchVersion, Regex pattern, bool toLowerCase, CharArraySet stopWords)
         {
             if (pattern == null)
             {
                 throw new System.ArgumentException("pattern must not be null");
             }
 
-            if (eqPattern(NON_WORD_PATTERN, pattern))
+            if (EqPattern(NON_WORD_PATTERN, pattern))
             {
                 pattern = NON_WORD_PATTERN;
             }
-            else if (eqPattern(WHITESPACE_PATTERN, pattern))
+            else if (EqPattern(WHITESPACE_PATTERN, pattern))
             {
                 pattern = WHITESPACE_PATTERN;
             }
 
-            if (stopWords != null && stopWords.Size == 0)
+            if (stopWords != null && stopWords.Count == 0)
             {
                 stopWords = null;
             }
@@ -146,7 +195,7 @@
         /// <param name="text">
         ///            the string to tokenize </param>
         /// <returns> a new token stream </returns>
-        public TokenStreamComponents createComponents(string fieldName, TextReader reader, string text)
+        public TokenStreamComponents CreateComponents(string fieldName, TextReader reader, string text)
         {
             // Ideally the Analyzer superclass should have a method with the same signature, 
             // with a default impl that simply delegates to the StringReader flavour. 
@@ -165,7 +214,7 @@
             }
 
             Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
-            TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
+            TokenStream result = (stopWords != null) ? (TokenStream)new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
             return new TokenStreamComponents(tokenizer, result);
         }
 
@@ -181,7 +230,7 @@
         /// <returns> a new token stream </returns>
         public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
         {
-            return createComponents(fieldName, reader, null);
+            return CreateComponents(fieldName, reader, null);
         }
 
         /// <summary>
@@ -208,7 +257,7 @@
             var p2 = other as PatternAnalyzer;
             if (p2 != null)
             {
-                return toLowerCase == p2.toLowerCase && eqPattern(pattern, p2.pattern) && eq(stopWords, p2.stopWords);
+                return toLowerCase == p2.toLowerCase && EqPattern(pattern, p2.pattern) && Eq(stopWords, p2.stopWords);
             }
             return false;
         }
@@ -229,8 +278,8 @@
             }
 
             int h = 1;
-            h = 31 * h + pattern.pattern().GetHashCode();
-            h = 31 * h + pattern.flags();
+            h = 31 * h + pattern.ToString().GetHashCode();
+            h = 31 * h + (int)pattern.Options;
             h = 31 * h + (toLowerCase ? 1231 : 1237);
             h = 31 * h + (stopWords != null ? stopWords.GetHashCode() : 0);
             return h;
@@ -238,16 +287,16 @@
 
         /// <summary>
         /// equality where o1 and/or o2 can be null </summary>
-        private static bool eq(object o1, object o2)
+        private static bool Eq(object o1, object o2)
         {
             return (o1 == o2) || (o1 != null ? o1.Equals(o2) : false);
         }
 
         /// <summary>
         /// assumes p1 and p2 are not null </summary>
-        private static bool eqPattern(Pattern p1, Pattern p2)
+        private static bool EqPattern(Regex p1, Regex p2)
         {
-            return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().Equals(p2.pattern()));
+            return p1 == p2 || (p1.Options == p2.Options && p1.ToString().Equals(p2.ToString()));
         }
 
         /// <summary>
@@ -271,7 +320,7 @@
 
                 len = 0;
                 int n;
-                while ((n = input.Read(buffer)) >= 0)
+                while ((n = input.Read(buffer, 0, buffer.Length)) > 0)
                 {
                     if (len + n > output.Length) // grow capacity
                     {
@@ -306,23 +355,23 @@
         /// </summary>
         private sealed class PatternTokenizer : Tokenizer
         {
-            private readonly Pattern pattern;
+            private readonly Regex pattern;
             private string str;
             private readonly bool toLowerCase;
-            private Matcher matcher;
+            private Match matcher;
             private int pos = 0;
             private bool initialized = false;
-            private static readonly Locale locale = Locale.Default;
+            private bool isReset = false; // Flag to keep track of the first match vs subsequent matches
             private readonly ICharTermAttribute termAtt;
             private readonly IOffsetAttribute offsetAtt;
 
-            public PatternTokenizer(TextReader input, Pattern pattern, bool toLowerCase)
+            public PatternTokenizer(TextReader input, Regex pattern, bool toLowerCase)
                 : base(input)
             {
                 termAtt = AddAttribute<ICharTermAttribute>();
                 offsetAtt = AddAttribute<IOffsetAttribute>();
                 this.pattern = pattern;
-                this.matcher = pattern.matcher("");
+                this.matcher = pattern.Match("");
                 this.toLowerCase = toLowerCase;
             }
 
@@ -340,28 +389,33 @@
                 while (true) // loop takes care of leading and trailing boundary cases
                 {
                     int start = pos;
-                    int end_Renamed;
-                    bool isMatch = matcher.find();
+                    int end;
+                    if (!isReset)
+                    {
+                        matcher = matcher.NextMatch();
+                    }
+                    isReset = false;
+                    bool isMatch = matcher.Success;
                     if (isMatch)
                     {
-                        end_Renamed = matcher.start();
-                        pos = matcher.end();
+                        end = matcher.Index;
+                        pos = matcher.Index + matcher.Length;
                     }
                     else
                     {
-                        end_Renamed = str.Length;
+                        end = str.Length;
                         matcher = null; // we're finished
                     }
 
-                    if (start != end_Renamed) // non-empty match (header/trailer)
+                    if (start != end) // non-empty match (header/trailer)
                     {
-                        string text = str.Substring(start, end_Renamed - start);
+                        string text = str.Substring(start, end - start);
                         if (toLowerCase)
                         {
-                            text = text.ToLower(locale);
+                            text = text.ToLower();
                         }
                         termAtt.SetEmpty().Append(text);
-                        offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end_Renamed));
+                        offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
                         return true;
                     }
                     if (!isMatch)
@@ -389,7 +443,18 @@
             {
                 base.Reset();
                 this.str = PatternAnalyzer.ToString(input);
-                this.matcher = pattern.matcher(this.str);
+
+                // LUCENENET: Since we need to "reset" the Match
+                // object, we also need an "isReset" flag to indicate
+                // whether we are at the head of the match and to 
+                // take the appropriate measures to ensure we don't 
+                // overwrite our matcher variable with 
+                // matcher = matcher.NextMatch();
+                // before it is time. A string could potentially
+                // match on index 0, so we need another variable to
+                // manage this state.
+                this.matcher = pattern.Match(this.str);
+                this.isReset = true;
                 this.pos = 0;
                 this.initialized = true;
             }
@@ -410,7 +475,6 @@
             private readonly bool isLetter;
             private readonly bool toLowerCase;
             private readonly CharArraySet stopWords;
-            private static readonly Locale locale = Locale.Default;
             private readonly ICharTermAttribute termAtt;
             private readonly IOffsetAttribute offsetAtt;
 
@@ -444,7 +508,7 @@
                 {
                     // find beginning of token
                     text = null;
-                    while (i < len && !isTokenChar(s[i], letter))
+                    while (i < len && !IsTokenChar(s[i], letter))
                     {
                         i++;
                     }
@@ -452,7 +516,7 @@
                     if (i < len) // found beginning; now find end of token
                     {
                         start = i;
-                        while (i < len && isTokenChar(s[i], letter))
+                        while (i < len && IsTokenChar(s[i], letter))
                         {
                             i++;
                         }
@@ -460,7 +524,7 @@
                         text = s.Substring(start, i - start);
                         if (toLowerCase)
                         {
-                            text = text.ToLower(locale);
+                            text = text.ToLower();
                         }
                         //          if (toLowerCase) {            
                         ////            use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
@@ -473,7 +537,7 @@
                         //            text = s.substring(start, i);
                         //          }
                     }
-                } while (text != null && isStopWord(text));
+                } while (text != null && IsStopWord(text));
 
                 pos = i;
                 if (text == null)
@@ -493,12 +557,12 @@
                 this.offsetAtt.SetOffset(CorrectOffset(finalOffset), CorrectOffset(finalOffset));
             }
 
-            private bool isTokenChar(char c, bool isLetter)
+            private bool IsTokenChar(char c, bool isLetter)
             {
                 return isLetter ? char.IsLetter(c) : !char.IsWhiteSpace(c);
             }
 
-            private bool isStopWord(string text)
+            private bool IsStopWord(string text)
             {
                 return stopWords != null && stopWords.Contains(text);
             }
@@ -544,6 +608,5 @@
                 }
             }
         }
-
     }
 }
\ No newline at end of file

diff --git a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
index 0679473..475338b 100644
--- a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
+++ b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj

@@ -256,6 +256,7 @@
     <Compile Include="Analysis\Miscellaneous\LimitTokenPositionFilter.cs" />
     <Compile Include="Analysis\Miscellaneous\LimitTokenPositionFilterFactory.cs" />
     <Compile Include="Analysis\Miscellaneous\Lucene47WordDelimiterFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\PatternAnalyzer.cs" />
     <Compile Include="Analysis\Miscellaneous\PatternKeywordMarkerFilter.cs" />
     <Compile Include="Analysis\Miscellaneous\PerFieldAnalyzerWrapper.cs" />
     <Compile Include="Analysis\Miscellaneous\PrefixAndSuffixAwareTokenFilter.cs" />

diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzerTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzerTest.cs
index de1db38..85a9632 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzerTest.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzerTest.cs

@@ -1,11 +1,13 @@
-using System;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System.IO;
 using System.Text;
-using System.Threading;
+using System.Text.RegularExpressions;
 
-namespace org.apache.lucene.analysis.miscellaneous
+namespace Lucene.Net.Analysis.Miscellaneous
 {
-
-	/*
+    /*
 	 * Licensed to the Apache Software Foundation (ASF) under one or more
 	 * contributor license agreements.  See the NOTICE file distributed with
 	 * this work for additional information regarding copyright ownership.
@@ -22,193 +24,120 @@
 	 * limitations under the License.
 	 */
 
-	using UncaughtExceptionHandler = Thread.UncaughtExceptionHandler;
+    /// <summary>
+    /// Verifies the behavior of PatternAnalyzer.
+    /// </summary>
+    public class PatternAnalyzerTest : BaseTokenStreamTestCase
+    {
 
-	using StopAnalyzer = org.apache.lucene.analysis.core.StopAnalyzer;
+        /// <summary>
+        /// Test PatternAnalyzer when it is configured with a non-word pattern.
+        /// Behavior can be similar to SimpleAnalyzer (depending upon options)
+        /// </summary>
+        [Test]
+        public virtual void TestNonWordPattern()
+        {
+            // Split on non-letter pattern, do not lowercase, no stopwords
+            PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, false, null);
+            Check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] { "The", "quick", "brown", "Fox", "the", "abcd", "dc" });
 
-	/// <summary>
-	/// Verifies the behavior of PatternAnalyzer.
-	/// </summary>
-	public class PatternAnalyzerTest : BaseTokenStreamTestCase
-	{
+            // split on non-letter pattern, lowercase, english stopwords
+            PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+            Check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] { "quick", "brown", "fox", "abcd", "dc" });
+        }
 
-	  /// <summary>
-	  /// Test PatternAnalyzer when it is configured with a non-word pattern.
-	  /// Behavior can be similar to SimpleAnalyzer (depending upon options)
-	  /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testNonWordPattern() throws java.io.IOException
-	  public virtual void testNonWordPattern()
-	  {
-		// Split on non-letter pattern, do not lowercase, no stopwords
-		PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, false, null);
-		check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] {"The", "quick", "brown", "Fox", "the", "abcd", "dc"});
+        /// <summary>
+        /// Test PatternAnalyzer when it is configured with a whitespace pattern.
+        /// Behavior can be similar to WhitespaceAnalyzer (depending upon options)
+        /// </summary>
+        [Test]
+        public virtual void TestWhitespacePattern()
+        {
+            // Split on whitespace patterns, do not lowercase, no stopwords
+            PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, false, null);
+            Check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] { "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });
 
-		// split on non-letter pattern, lowercase, english stopwords
-		PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-		check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] {"quick", "brown", "fox", "abcd", "dc"});
-	  }
+            // Split on whitespace patterns, lowercase, english stopwords
+            PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+            Check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] { "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
+        }
 
-	  /// <summary>
-	  /// Test PatternAnalyzer when it is configured with a whitespace pattern.
-	  /// Behavior can be similar to WhitespaceAnalyzer (depending upon options)
-	  /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testWhitespacePattern() throws java.io.IOException
-	  public virtual void testWhitespacePattern()
-	  {
-		// Split on whitespace patterns, do not lowercase, no stopwords
-		PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, false, null);
-		check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] {"The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc."});
+        /// <summary>
+        /// Test PatternAnalyzer when it is configured with a custom pattern. In this
+        /// case, text is tokenized on the comma ","
+        /// </summary>
+        [Test]
+        public virtual void TestCustomPattern()
+        {
+            // Split on comma, do not lowercase, no stopwords
+            PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, new Regex(",", RegexOptions.Compiled), false, null);
+            Check(a, "Here,Are,some,Comma,separated,words,", new string[] { "Here", "Are", "some", "Comma", "separated", "words" });
 
-		// Split on whitespace patterns, lowercase, english stopwords
-		PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-		check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] {"quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc."});
-	  }
+            // split on comma, lowercase, english stopwords
+            PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, new Regex(",", RegexOptions.Compiled), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+            Check(b, "Here,Are,some,Comma,separated,words,", new string[] { "here", "some", "comma", "separated", "words" });
+        }
 
-	  /// <summary>
-	  /// Test PatternAnalyzer when it is configured with a custom pattern. In this
-	  /// case, text is tokenized on the comma ","
-	  /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testCustomPattern() throws java.io.IOException
-	  public virtual void testCustomPattern()
-	  {
-		// Split on comma, do not lowercase, no stopwords
-		PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), false, null);
-		check(a, "Here,Are,some,Comma,separated,words,", new string[] {"Here", "Are", "some", "Comma", "separated", "words"});
+        /// <summary>
+        /// Test PatternAnalyzer against a large document.
+        /// </summary>
+        [Test]
+        public virtual void TestHugeDocument()
+        {
+            StringBuilder document = new StringBuilder();
+            // 5000 a's
+            char[] largeWord = new char[5000];
+            Arrays.Fill(largeWord, 'a');
+            document.Append(largeWord);
 
-		// split on comma, lowercase, english stopwords
-		PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-		check(b, "Here,Are,some,Comma,separated,words,", new string[] {"here", "some", "comma", "separated", "words"});
-	  }
+            // a space
+            document.Append(' ');
 
-	  /// <summary>
-	  /// Test PatternAnalyzer against a large document.
-	  /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testHugeDocument() throws java.io.IOException
-	  public virtual void testHugeDocument()
-	  {
-		StringBuilder document = new StringBuilder();
-		// 5000 a's
-		char[] largeWord = new char[5000];
-		Arrays.fill(largeWord, 'a');
-		document.Append(largeWord);
+            // 2000 b's
+            char[] largeWord2 = new char[2000];
+            Arrays.Fill(largeWord2, 'b');
+            document.Append(largeWord2);
 
-		// a space
-		document.Append(' ');
+            // Split on whitespace patterns, do not lowercase, no stopwords
+            PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, false, null);
+            Check(a, document.ToString(), new string[]
+            {
+            new string(largeWord),
+            new string(largeWord2)
+            });
+        }
 
-		// 2000 b's
-		char[] largeWord2 = new char[2000];
-		Arrays.fill(largeWord2, 'b');
-		document.Append(largeWord2);
+        /// <summary>
+        /// Verify the analyzer analyzes to the expected contents. For PatternAnalyzer,
+        /// several methods are verified:
+        /// <ul>
+        /// <li>Analysis with a normal Reader
+        /// <li>Analysis with a FastStringReader
+        /// <li>Analysis with a String
+        /// </ul>
+        /// </summary>
+        private void Check(PatternAnalyzer analyzer, string document, string[] expected)
+        {
+            // ordinary analysis of a Reader
+            AssertAnalyzesTo(analyzer, document, expected);
 
-		// Split on whitespace patterns, do not lowercase, no stopwords
-		PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, false, null);
-		check(a, document.ToString(), new string[]
-		{
-			new string(largeWord),
-			new string(largeWord2)
-		});
-	  }
+            // analysis with a "FastStringReader"
+            TokenStream ts = analyzer.TokenStream("dummy", new PatternAnalyzer.FastStringReader(document));
+            AssertTokenStreamContents(ts, expected);
 
-	  /// <summary>
-	  /// Verify the analyzer analyzes to the expected contents. For PatternAnalyzer,
-	  /// several methods are verified:
-	  /// <ul>
-	  /// <li>Analysis with a normal Reader
-	  /// <li>Analysis with a FastStringReader
-	  /// <li>Analysis with a String
-	  /// </ul>
-	  /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void check(PatternAnalyzer analyzer, String document, String expected[]) throws java.io.IOException
-	  private void check(PatternAnalyzer analyzer, string document, string[] expected)
-	  {
-		// ordinary analysis of a Reader
-		assertAnalyzesTo(analyzer, document, expected);
+            // analysis of a String, uses PatternAnalyzer.tokenStream(String, String)
+            TokenStream ts2 = analyzer.TokenStream("dummy", new StringReader(document));
+            AssertTokenStreamContents(ts2, expected);
+        }
 
-		// analysis with a "FastStringReader"
-		TokenStream ts = analyzer.tokenStream("dummy", new PatternAnalyzer.FastStringReader(document));
-		assertTokenStreamContents(ts, expected);
+        /// <summary>
+        /// blast some random strings through the analyzer </summary>
+        [Test]
+        public virtual void TestRandomStrings()
+        {
+            Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, new Regex(",", RegexOptions.Compiled), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
 
-		// analysis of a String, uses PatternAnalyzer.tokenStream(String, String)
-		TokenStream ts2 = analyzer.tokenStream("dummy", new StringReader(document));
-		assertTokenStreamContents(ts2, expected);
-	  }
-
-	  /// <summary>
-	  /// blast some random strings through the analyzer </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testRandomStrings() throws Exception
-	  public virtual void testRandomStrings()
-	  {
-		Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-
-		// dodge jre bug http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7104012
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final Thread.UncaughtExceptionHandler savedHandler = Thread.getDefaultUncaughtExceptionHandler();
-		UncaughtExceptionHandler savedHandler = Thread.DefaultUncaughtExceptionHandler;
-		Thread.DefaultUncaughtExceptionHandler = new UncaughtExceptionHandlerAnonymousInnerClassHelper(this, savedHandler);
-
-		try
-		{
-		  Thread.DefaultUncaughtExceptionHandler;
-		  checkRandomData(random(), a, 10000 * RANDOM_MULTIPLIER);
-		}
-		catch (System.IndexOutOfRangeException ex)
-		{
-		  assumeTrue("not failing due to jre bug ", !isJREBug7104012(ex));
-		  throw ex; // otherwise rethrow
-		}
-		finally
-		{
-		  Thread.DefaultUncaughtExceptionHandler = savedHandler;
-		}
-	  }
-
-	  private class UncaughtExceptionHandlerAnonymousInnerClassHelper : UncaughtExceptionHandler
-	  {
-		  private readonly PatternAnalyzerTest outerInstance;
-
-		  private UncaughtExceptionHandler savedHandler;
-
-		  public UncaughtExceptionHandlerAnonymousInnerClassHelper(PatternAnalyzerTest outerInstance, UncaughtExceptionHandler savedHandler)
-		  {
-			  this.outerInstance = outerInstance;
-			  this.savedHandler = savedHandler;
-		  }
-
-		  public override void uncaughtException(Thread thread, Exception throwable)
-		  {
-			assumeTrue("not failing due to jre bug ", !isJREBug7104012(throwable));
-			// otherwise its some other bug, pass to default handler
-			savedHandler.uncaughtException(thread, throwable);
-		  }
-	  }
-
-	  internal static bool isJREBug7104012(Exception t)
-	  {
-		if (!(t is System.IndexOutOfRangeException))
-		{
-		  // BaseTokenStreamTestCase now wraps exc in a new RuntimeException:
-		  t = t.InnerException;
-		  if (!(t is System.IndexOutOfRangeException))
-		  {
-			return false;
-		  }
-		}
-		StackTraceElement[] trace = t.StackTrace;
-		foreach (StackTraceElement st in trace)
-		{
-		  if ("java.text.RuleBasedBreakIterator".Equals(st.ClassName) || "sun.util.locale.provider.RuleBasedBreakIterator".Equals(st.ClassName) && "lookupBackwardState".Equals(st.MethodName))
-		  {
-			return true;
-		  }
-		}
-		return false;
-	  }
-	}
-
+            CheckRandomData(Random(), a, 10000 * RANDOM_MULTIPLIER);
+        }
+    }
 }
\ No newline at end of file

diff --git a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
index 6d6c668..029a40f 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
+++ b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj

@@ -194,6 +194,7 @@
     <Compile Include="Analysis\Lv\TestLatvianAnalyzer.cs" />
     <Compile Include="Analysis\Lv\TestLatvianStemFilterFactory.cs" />
     <Compile Include="Analysis\Lv\TestLatvianStemmer.cs" />
+    <Compile Include="Analysis\Miscellaneous\PatternAnalyzerTest.cs" />
     <Compile Include="Analysis\Miscellaneous\TestASCIIFoldingFilter.cs" />
     <Compile Include="Analysis\Miscellaneous\TestCapitalizationFilter.cs" />
     <Compile Include="Analysis\Miscellaneous\TestCapitalizationFilterFactory.cs" />
commit	7f877fdfc2ba25a7c1b0386795b4f83b46f50767	[log] [tgz]
author	Shad Storhaug <shad@shadstorhaug.com>	Sun Aug 28 00:11:18 2016 +0700
committer	Shad Storhaug <shad@shadstorhaug.com>	Sun Aug 28 11:24:43 2016 +0700
tree	2b29a81186bf36b9be484427d4adc0a21b4461f8
parent	8a05b1682b9d1405ed71d42e75d06a296736641d [diff]