solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java - lucene-solr - Git at Google

 /*
  * This software was produced for the U. S. Government
  * under Contract No. W15P7T-11-C-F600, and is
  * subject to the Rights in Noncommercial Computer Software
  * and Noncommercial Computer Software Documentation
  * Clause 252.227-7014 (JUN 1995)
  *
  * Copyright 2013 The MITRE Corporation. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.solr.handler.tagger;

 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Locale;

 import org.apache.commons.io.IOUtils;
 import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.xml.sax.InputSource;

 public class XmlInterpolationTest extends TaggerTestCase {

   private static DocumentBuilder xmlDocBuilder;


   @BeforeClass
   public static void beforeClass() throws Exception {
     DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance();
     xmlDocBuilderFactory.setValidating(true);
     xmlDocBuilderFactory.setNamespaceAware(true);
     xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder();

     initCore("solrconfig-tagger.xml", "schema-tagger.xml");
   }

   @AfterClass
   public static void cleanUpAfterClass() throws Exception {
     xmlDocBuilder = null;
   }

   @Override
   public void setUp() throws Exception {
     super.setUp();
     baseParams.set("field", "name_tagXml");
     baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
     baseParams.set("xmlOffsetAdjust", "true");
   }

   @Test
   public void test() throws Exception {
     buildNames("start end");

     assertXmlTag("<doc>before start <!-- c --> end after</doc>", true);
     assertXmlTag("<doc>before start <br/> end after</doc>", true);
     assertXmlTag("<doc>before <em>start</em> <b>end</b> after</doc>", true);
     assertXmlTag("<doc>before <em>start</em> end after</doc>", true);
     assertXmlTag("<doc>before start end<em> after</em></doc>", true);
     assertXmlTag("<doc><em>before </em>start end after</doc>", true);//adjacent tags
     assertXmlTag("<doc>before <b> <em>start</em> </b> end after</doc>", true);
     assertXmlTag("<doc>before <b> <em>start</em> </b> <em>  end  </em> after</doc>", true);

     assertXmlTag("<doc><p>before start</p> end after</doc>", false);
     assertXmlTag("<doc>before start <p>end after</p> </doc>", false);

     assertXmlTag("<doc>before <em a='A' b='B'>start</em> <b a='A' b='B'>end</b> after</doc>", true);
   }

   @Test(expected = SolrException.class)
   public void testInvalidXml() throws Exception {
     assertXmlTag("notXml", false);
   }

   @Test(expected = Exception.class)
   public void testValidatingXml() throws Exception {
     validateXml("foo");
   }

   protected void assertXmlTag(String docText, boolean expected) throws Exception {
     final SolrQueryRequest req = reqDoc(docText);
     try { // 5.4 and beyond we can use try-with-resources
       final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req);
       final TestTag[] testTags = pullTagsFromResponse(req, rsp);
       if (!expected) {
         assertEquals(0, testTags.length);
       } else {
         assertEquals(1, testTags.length);
         final TestTag tag = testTags[0];
         validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName));
       }
     } finally {
       req.close();
     }
   }

   protected void validateXml(String xml) throws Exception {
     // the "parse" method also validates XML, will throw an exception if mis-formatted
     xmlDocBuilder.parse(new InputSource(new StringReader(xml)));
   }


   @Test
   public void testLuceneHtmlFilterBehavior() {
     String docText;

     //Close tag adjacent to start & end results in end offset including the close tag. LUCENE-5734
     docText = "<doc><a><b>start</b> end</a></doc>";
     assertArrayEquals(tagExpect(docText, "start", "end</a>"), analyzeTagOne(docText, "start", "end"));

     //Space after "end" means offset doesn't include </a>
     docText = "<doc><a><b>start</b> end </a></doc>";
     assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end"));

     //Matches entity at end
     final String endStr = String.format(Locale.ROOT, "en&#x%02x;", (int) 'd');
     docText = "<doc>start " + endStr + "</doc>";
     assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end"));
     //... and at start
     final String startStr = String.format(Locale.ROOT, "&#x%02x;tart", (int) 's');
     docText = "<doc>" + startStr + " end</doc>";
     assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end"));

     //Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start".
     docText = "<!DOCTYPE start [ "
             + "<!ENTITY start \"start\">"
             + "]><start><?start start ?><!-- start --><start/>&start;</start>";
     assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start"));

     //Test entity behavior
     docText =                " &mdash; &ndash; &amp; &foo; &#xA0; a&nbsp;b";
     assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"},
             analyzeReturnTokens(docText));

     //Observe offset adjustment of trailing entity to end tag
     docText = "foo&nbsp;bar";
     assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo"));
   }

   private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) {
     String insertStart = "<A id='"+ id +"'>";// (normally we'd escape id)
     String insertEnd = "</A>";
     return docText.substring(0, startOffset)
             + insertStart
             + docText.substring(startOffset, endOffset)
             + insertEnd
             + docText.substring(endOffset);
   }

   private int[] tagExpect(String docText, String start, String end) {
     return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()};
   }

   private int[] analyzeTagOne(String docText, String start, String end) {
     int[] result = {-1, -1};

     Reader filter = new HTMLStripCharFilter(new StringReader(docText));

     WhitespaceTokenizer ts = new WhitespaceTokenizer();
     final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
     final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
     try {
       ts.setReader(filter);
       ts.reset();
       while (ts.incrementToken()) {
         final String termString = termAttribute.toString();
         if (termString.equals(start))
           result[0] = offsetAttribute.startOffset();
         if (termString.equals(end)) {
           result[1] = offsetAttribute.endOffset();
           return result;
         }
       }
       ts.end();
     } catch (IOException e) {
       throw new RuntimeException(e);
     } finally {
       IOUtils.closeQuietly(ts);
     }
     return result;
   }

   private String[] analyzeReturnTokens(String docText) {
     List<String> result = new ArrayList<>();

     Reader filter = new HTMLStripCharFilter(new StringReader(docText),
             Collections.singleton("unescaped"));
     WhitespaceTokenizer ts = new WhitespaceTokenizer();
     final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
     try {
       ts.setReader(filter);
       ts.reset();
       while (ts.incrementToken()) {
         result.add(termAttribute.toString());
       }
       ts.end();
     } catch (IOException e) {
       throw new RuntimeException(e);
     } finally {
       IOUtils.closeQuietly(ts);
     }
     return result.toArray(new String[result.size()]);
   }

 }
	/*
	* This software was produced for the U. S. Government
	* under Contract No. W15P7T-11-C-F600, and is
	* subject to the Rights in Noncommercial Computer Software
	* and Noncommercial Computer Software Documentation
	* Clause 252.227-7014 (JUN 1995)
	*
	* Copyright 2013 The MITRE Corporation. All Rights Reserved.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.solr.handler.tagger;

	import javax.xml.parsers.DocumentBuilder;
	import javax.xml.parsers.DocumentBuilderFactory;
	import java.io.IOException;
	import java.io.Reader;
	import java.io.StringReader;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;
	import java.util.Locale;

	import org.apache.commons.io.IOUtils;
	import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
	import org.apache.lucene.analysis.core.WhitespaceTokenizer;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.solr.common.SolrException;
	import org.apache.solr.request.SolrQueryRequest;
	import org.apache.solr.response.SolrQueryResponse;
	import org.junit.AfterClass;
	import org.junit.BeforeClass;
	import org.junit.Test;
	import org.xml.sax.InputSource;

	public class XmlInterpolationTest extends TaggerTestCase {

	private static DocumentBuilder xmlDocBuilder;


	@BeforeClass
	public static void beforeClass() throws Exception {
	DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance();
	xmlDocBuilderFactory.setValidating(true);
	xmlDocBuilderFactory.setNamespaceAware(true);
	xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder();

	initCore("solrconfig-tagger.xml", "schema-tagger.xml");
	}

	@AfterClass
	public static void cleanUpAfterClass() throws Exception {
	xmlDocBuilder = null;
	}

	@Override
	public void setUp() throws Exception {
	super.setUp();
	baseParams.set("field", "name_tagXml");
	baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
	baseParams.set("xmlOffsetAdjust", "true");
	}

	@Test
	public void test() throws Exception {
	buildNames("start end");

	assertXmlTag("<doc>before start <!-- c --> end after</doc>", true);
	assertXmlTag("<doc>before start <br/> end after</doc>", true);
	assertXmlTag("<doc>before <em>start</em> <b>end</b> after</doc>", true);
	assertXmlTag("<doc>before <em>start</em> end after</doc>", true);
	assertXmlTag("<doc>before start end<em> after</em></doc>", true);
	assertXmlTag("<doc><em>before </em>start end after</doc>", true);//adjacent tags
	assertXmlTag("<doc>before <b> <em>start</em> </b> end after</doc>", true);
	assertXmlTag("<doc>before <b> <em>start</em> </b> <em> end </em> after</doc>", true);

	assertXmlTag("<doc><p>before start</p> end after</doc>", false);
	assertXmlTag("<doc>before start <p>end after</p> </doc>", false);

	assertXmlTag("<doc>before <em a='A' b='B'>start</em> <b a='A' b='B'>end</b> after</doc>", true);
	}

	@Test(expected = SolrException.class)
	public void testInvalidXml() throws Exception {
	assertXmlTag("notXml", false);
	}

	@Test(expected = Exception.class)
	public void testValidatingXml() throws Exception {
	validateXml("foo");
	}

	protected void assertXmlTag(String docText, boolean expected) throws Exception {
	final SolrQueryRequest req = reqDoc(docText);
	try { // 5.4 and beyond we can use try-with-resources
	final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req);
	final TestTag[] testTags = pullTagsFromResponse(req, rsp);
	if (!expected) {
	assertEquals(0, testTags.length);
	} else {
	assertEquals(1, testTags.length);
	final TestTag tag = testTags[0];
	validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName));
	}
	} finally {
	req.close();
	}
	}

	protected void validateXml(String xml) throws Exception {
	// the "parse" method also validates XML, will throw an exception if mis-formatted
	xmlDocBuilder.parse(new InputSource(new StringReader(xml)));
	}


	@Test
	public void testLuceneHtmlFilterBehavior() {
	String docText;

	//Close tag adjacent to start & end results in end offset including the close tag. LUCENE-5734
	docText = "<doc><a><b>start</b> end</a></doc>";
	assertArrayEquals(tagExpect(docText, "start", "end</a>"), analyzeTagOne(docText, "start", "end"));

	//Space after "end" means offset doesn't include </a>
	docText = "<doc><a><b>start</b> end </a></doc>";
	assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end"));

	//Matches entity at end
	final String endStr = String.format(Locale.ROOT, "en&#x%02x;", (int) 'd');
	docText = "<doc>start " + endStr + "</doc>";
	assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end"));
	//... and at start
	final String startStr = String.format(Locale.ROOT, "&#x%02x;tart", (int) 's');
	docText = "<doc>" + startStr + " end</doc>";
	assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end"));

	//Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start".
	docText = "<!DOCTYPE start [ "
	+ "<!ENTITY start \"start\">"
	+ "]><start><?start start ?><!-- start --><start/>&start;</start>";
	assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start"));

	//Test entity behavior
	docText = " — – & &foo;   a b";
	assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"},
	analyzeReturnTokens(docText));

	//Observe offset adjustment of trailing entity to end tag
	docText = "foo bar";
	assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo"));
	}

	private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) {
	String insertStart = "<A id='"+ id +"'>";// (normally we'd escape id)
	String insertEnd = "</A>";
	return docText.substring(0, startOffset)
	+ insertStart
	+ docText.substring(startOffset, endOffset)
	+ insertEnd
	+ docText.substring(endOffset);
	}

	private int[] tagExpect(String docText, String start, String end) {
	return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()};
	}

	private int[] analyzeTagOne(String docText, String start, String end) {
	int[] result = {-1, -1};

	Reader filter = new HTMLStripCharFilter(new StringReader(docText));

	WhitespaceTokenizer ts = new WhitespaceTokenizer();
	final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
	final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
	try {
	ts.setReader(filter);
	ts.reset();
	while (ts.incrementToken()) {
	final String termString = termAttribute.toString();
	if (termString.equals(start))
	result[0] = offsetAttribute.startOffset();
	if (termString.equals(end)) {
	result[1] = offsetAttribute.endOffset();
	return result;
	}
	}
	ts.end();
	} catch (IOException e) {
	throw new RuntimeException(e);
	} finally {
	IOUtils.closeQuietly(ts);
	}
	return result;
	}

	private String[] analyzeReturnTokens(String docText) {
	List<String> result = new ArrayList<>();

	Reader filter = new HTMLStripCharFilter(new StringReader(docText),
	Collections.singleton("unescaped"));
	WhitespaceTokenizer ts = new WhitespaceTokenizer();
	final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
	try {
	ts.setReader(filter);
	ts.reset();
	while (ts.incrementToken()) {
	result.add(termAttribute.toString());
	}
	ts.end();
	} catch (IOException e) {
	throw new RuntimeException(e);
	} finally {
	IOUtils.closeQuietly(ts);
	}
	return result.toArray(new String[result.size()]);
	}

	}