solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java - lucene-solr - Git at Google

 /*
  * This software was produced for the U. S. Government
  * under Contract No. W15P7T-11-C-F600, and is
  * subject to the Rights in Noncommercial Computer Software
  * and Noncommercial Computer Software Documentation
  * Clause 252.227-7014 (JUN 1995)
  *
  * Copyright 2013 The MITRE Corporation. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.solr.handler.tagger;

 import java.nio.charset.StandardCharsets;

 import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;

 /**
  * Test the {@link TaggerRequestHandler}.
  */
 public class Tagger2Test extends TaggerTestCase {

   @BeforeClass
   public static void beforeClass() throws Exception {
     initCore("solrconfig-tagger.xml", "schema-tagger.xml");
   }

   @Override
   public void setUp() throws Exception {
     super.setUp();
     baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
   }

   /** whole matching, no sub-tags */
   @Test
   public void testLongestDominantRight() throws Exception {
     buildNames("in", "San", "in San", "Francisco", "San Francisco",
         "San Francisco State College", "College of California",
         "Clayton", "Clayton North", "North Carolina");

     assertTags("He lived in San Francisco.",
         "in", "San Francisco");

     assertTags("He enrolled in San Francisco State College of California",
         "in", "San Francisco State College");

     assertTags("He lived in Clayton North Carolina",
         "in", "Clayton", "North Carolina");

   }

   // As of Lucene/Solr 4.9, StandardTokenizer never does this anymore (reported to Lucene dev-list,
   // Jan 26th 2015.  Honestly it's not particularly important to us but it renders this test
   // pointless.
   /** Orig issue https://github.com/OpenSextant/SolrTextTagger/issues/2  related: #13 */
   @Test
   @Ignore
   public void testVeryLongWord() throws Exception {
     String SANFRAN = "San Francisco";
     buildNames(SANFRAN);

     // exceeds default 255 max token length which means it in-effect becomes a stop-word
     StringBuilder STOP = new StringBuilder(260);//>255
     for (int i = 0; i < STOP.capacity(); i++) {
       STOP.append((char) ('0' + (i % 10)));
     }

     String doc = "San " + STOP + " Francisco";
     assertTags(doc);//no match due to default stop word handling
     //and we find it when we ignore stop words
     assertTags(reqDoc(doc, "ignoreStopwords", "true"), new TestTag(0, doc.length(), doc, lookupByName(SANFRAN)));
   }

   /** Support for stopwords (posInc &gt; 1);
    * discussion: https://github.com/OpenSextant/SolrTextTagger/issues/13 */
   @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8344")
   @Test
   public void testStopWords() throws Exception {
     baseParams.set("field", "name_tagStop");//stop filter (pos inc enabled) index & query

     String SOUTHOFWALES = "South of Wales";//'of' is stop word index time & query
     String ACITYA = "A City A";

     buildNames(SOUTHOFWALES, ACITYA);

     //round-trip works
     assertTags(reqDoc(SOUTHOFWALES), new TestTag(0, SOUTHOFWALES.length(), SOUTHOFWALES,
             lookupByName(SOUTHOFWALES)));
     //  but offsets doesn't include stopword when leading or trailing...
     assertTags(reqDoc(ACITYA), new TestTag(2, 6, "City",
             lookupByName(ACITYA)));
     //break on stop words
     assertTags(reqDoc(SOUTHOFWALES, "ignoreStopwords", "false"));//match nothing
   }

   /** Tests WordDelimiterGraphFilter, stacked/synonymous tokens at index time (catenate options) */
   @Test
   public void testWDF() throws Exception {
     baseParams.set("field", "name_tagWDF");

     final String WINSTONSALEM = "City of Winston-Salem";//hyphen
     final String BOSTONHARBOR = "Boston Harbor";//space
     buildNames(WINSTONSALEM, BOSTONHARBOR);

     //round-trip works
     assertTags(reqDoc(WINSTONSALEM), new TestTag(0, WINSTONSALEM.length(), WINSTONSALEM,
         lookupByName(WINSTONSALEM)));

     // space separated works
     final String WS_SPACE = WINSTONSALEM.replace('-', ' ');
     assertTags(reqDoc(WS_SPACE),
         new TestTag(0, WS_SPACE.length(), WS_SPACE,
         lookupByName(WINSTONSALEM)));

     //must be full match
     assertTags(reqDoc("Winston"));//match nothing
     assertTags(reqDoc("Salem"));//match nothing

     // round-trip works
     assertTags(reqDoc(BOSTONHARBOR), new TestTag(0, BOSTONHARBOR.length(), BOSTONHARBOR,
         lookupByName(BOSTONHARBOR)));

     // hyphen separated works
     final String BH_HYPHEN = BOSTONHARBOR.replace(' ', '-');
     assertTags(reqDoc(BH_HYPHEN),
         new TestTag(0, BH_HYPHEN.length(), BH_HYPHEN,
             lookupByName(BOSTONHARBOR)));
     //must be full match
     assertTags(reqDoc("Boston"));//match nothing
     assertTags(reqDoc("Harbor"));//match nothing
   }

   /** Ensure character offsets work for multi-byte characters */
   @Test
   public void testMultibyteChar() throws Exception {
     //  https://unicode-table.com/en/2019/
     //             0         1         2         3         4
     //             01234567890123456789012345678901234567890
     String TEXT = "He mentionned ’Obama’ in the White House";
     assertEquals(40, TEXT.length()); // char length (in Java, UTF16)

     String QUOTE = TEXT.substring(14, 15);
     assertEquals(8217, QUOTE.codePointAt(0));

     //UTF8
     assertEquals(3, QUOTE.getBytes(StandardCharsets.UTF_8).length);
     assertEquals(1, "a".getBytes(StandardCharsets.UTF_8).length);
     assertEquals(40 + 2*2, TEXT.getBytes(StandardCharsets.UTF_8).length);

     //UTF16 big endian    (by specifying big/little endian, there is no "byte order mark")
     assertEquals(2, QUOTE.getBytes(StandardCharsets.UTF_16BE).length);
     assertEquals(2, "a".getBytes(StandardCharsets.UTF_16BE).length);
     assertEquals(40 * 2, TEXT.getBytes(StandardCharsets.UTF_16BE).length);


     buildNames("Obama");

     assertTags(TEXT, "Obama");

     // TODO test surrogate pairs (i.e. code points not in the BMP)
   }

 }
	/*
	* This software was produced for the U. S. Government
	* under Contract No. W15P7T-11-C-F600, and is
	* subject to the Rights in Noncommercial Computer Software
	* and Noncommercial Computer Software Documentation
	* Clause 252.227-7014 (JUN 1995)
	*
	* Copyright 2013 The MITRE Corporation. All Rights Reserved.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.solr.handler.tagger;

	import java.nio.charset.StandardCharsets;

	import org.junit.BeforeClass;
	import org.junit.Ignore;
	import org.junit.Test;

	/**
	 * Tests for the {@link TaggerRequestHandler}; {@link #setUp()} sets the {@code overlaps}
	 * base parameter to {@code LONGEST_DOMINANT_RIGHT}.
	 */
	public class Tagger2Test extends TaggerTestCase {

		/** Initializes the core from the tagger-specific solrconfig and schema files. */
		@BeforeClass
		public static void beforeClass() throws Exception {
			initCore("solrconfig-tagger.xml", "schema-tagger.xml");
		}

		/** Runs the parent set-up, then selects the overlap mode used by the tests in this class. */
		@Override
		public void setUp() throws Exception {
			super.setUp();
			baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
		}

		/** Whole matching only; no sub-tags are expected. */
		@Test
		public void testLongestDominantRight() throws Exception {
			buildNames(
				"in", "San", "in San",
				"Francisco", "San Francisco", "San Francisco State College",
				"College of California",
				"Clayton", "Clayton North", "North Carolina");

			final String sanFranciscoDoc = "He lived in San Francisco.";
			assertTags(sanFranciscoDoc, "in", "San Francisco");

			final String collegeDoc = "He enrolled in San Francisco State College of California";
			assertTags(collegeDoc, "in", "San Francisco State College");

			final String carolinaDoc = "He lived in Clayton North Carolina";
			assertTags(carolinaDoc, "in", "Clayton", "North Carolina");
		}

		// As of Lucene/Solr 4.9, StandardTokenizer never does this anymore (reported to the Lucene
		// dev-list on Jan 26th 2015). It is not particularly important to us, but it renders this
		// test pointless.
		/** Original issue: https://github.com/OpenSextant/SolrTextTagger/issues/2 (related: #13). */
		@Ignore
		@Test
		public void testVeryLongWord() throws Exception {
			final String cityName = "San Francisco";
			buildNames(cityName);

			// A word longer than the default max token length of 255 in effect becomes a stop word.
			final int longWordLength = 260; // > 255
			final StringBuilder longWord = new StringBuilder(longWordLength);
			for (int pos = 0; pos < longWordLength; pos++) {
				longWord.append((char) ('0' + (pos % 10)));
			}

			final String doc = "San " + longWord + " Francisco";
			// no match, due to the default stop word handling
			assertTags(doc);
			// ...but it is found once stop words are ignored
			assertTags(
				reqDoc(doc, "ignoreStopwords", "true"),
				new TestTag(0, doc.length(), doc, lookupByName(cityName)));
		}

		/**
		 * Support for stop words ({@code posInc > 1}); discussion:
		 * https://github.com/OpenSextant/SolrTextTagger/issues/13
		 */
		@Test
		@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8344")
		public void testStopWords() throws Exception {
			// stop filter (position increments enabled) at index & query time
			baseParams.set("field", "name_tagStop");

			final String walesName = "South of Wales"; // 'of' is a stop word at index & query time
			final String cityName = "A City A";
			buildNames(walesName, cityName);

			// round-trip works
			assertTags(
				reqDoc(walesName),
				new TestTag(0, walesName.length(), walesName, lookupByName(walesName)));
			// ...but the offsets don't include a stop word when it is leading or trailing
			assertTags(
				reqDoc(cityName),
				new TestTag(2, 6, "City", lookupByName(cityName)));
			// break on stop words: match nothing
			assertTags(reqDoc(walesName, "ignoreStopwords", "false"));
		}

		/** Tests WordDelimiterGraphFilter, stacked/synonymous tokens at index time (catenate options). */
		@Test
		public void testWDF() throws Exception {
			baseParams.set("field", "name_tagWDF");

			final String hyphenatedName = "City of Winston-Salem"; // hyphen
			final String spacedName = "Boston Harbor"; // space
			buildNames(hyphenatedName, spacedName);

			// round-trip works
			assertTags(
				reqDoc(hyphenatedName),
				new TestTag(0, hyphenatedName.length(), hyphenatedName, lookupByName(hyphenatedName)));

			// the space-separated variant works too
			final String hyphenReplacedBySpace = hyphenatedName.replace('-', ' ');
			assertTags(
				reqDoc(hyphenReplacedBySpace),
				new TestTag(
					0, hyphenReplacedBySpace.length(), hyphenReplacedBySpace,
					lookupByName(hyphenatedName)));

			// must be a full match: neither part matches on its own
			assertTags(reqDoc("Winston"));
			assertTags(reqDoc("Salem"));

			// round-trip works
			assertTags(
				reqDoc(spacedName),
				new TestTag(0, spacedName.length(), spacedName, lookupByName(spacedName)));

			// the hyphen-separated variant works too
			final String spaceReplacedByHyphen = spacedName.replace(' ', '-');
			assertTags(
				reqDoc(spaceReplacedByHyphen),
				new TestTag(
					0, spaceReplacedByHyphen.length(), spaceReplacedByHyphen,
					lookupByName(spacedName)));

			// must be a full match: neither part matches on its own
			assertTags(reqDoc("Boston"));
			assertTags(reqDoc("Harbor"));
		}

		/** Ensures character offsets work for multi-byte characters. */
		@Test
		public void testMultibyteChar() throws Exception {
			// The curly quote is U+2019: https://unicode-table.com/en/2019/
			// The spelling "mentionned" is kept as-is: the length and offset checks below depend on it.
			//               0         1         2         3         4
			//               01234567890123456789012345678901234567890
			String sample = "He mentionned ’Obama’ in the White House";
			assertEquals(40, sample.length()); // length in chars (Java strings are UTF-16)

			final String curlyQuote = sample.substring(14, 15);
			assertEquals(8217, curlyQuote.codePointAt(0));

			// UTF-8: the quote takes 3 bytes and an ASCII letter 1, so each of the two quotes adds 2
			assertEquals(3, curlyQuote.getBytes(StandardCharsets.UTF_8).length);
			assertEquals(1, "a".getBytes(StandardCharsets.UTF_8).length);
			assertEquals(40 + 2 * 2, sample.getBytes(StandardCharsets.UTF_8).length);

			// UTF-16 big endian (with an explicit endianness there is no "byte order mark")
			assertEquals(2, curlyQuote.getBytes(StandardCharsets.UTF_16BE).length);
			assertEquals(2, "a".getBytes(StandardCharsets.UTF_16BE).length);
			assertEquals(40 * 2, sample.getBytes(StandardCharsets.UTF_16BE).length);

			buildNames("Obama");

			assertTags(sample, "Obama");

			// TODO test surrogate pairs (i.e. code points not in the BMP)
		}

	}