blob: e43212119bd025b46da81d2f4f0de6cd39abbe19 [file] [log] [blame]
/*
* This software was produced for the U. S. Government
* under Contract No. W15P7T-11-C-F600, and is
* subject to the Rights in Noncommercial Computer Software
* and Noncommercial Computer Software Documentation
* Clause 252.227-7014 (JUN 1995)
*
* Copyright 2013 The MITRE Corporation. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.tagger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.solr.common.SolrException;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.xml.sax.InputSource;
public class XmlInterpolationTest extends TaggerTestCase {
private static DocumentBuilder xmlDocBuilder;
@BeforeClass
public static void beforeClass() throws Exception {
DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance();
xmlDocBuilderFactory.setValidating(true);
xmlDocBuilderFactory.setNamespaceAware(true);
xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder();
initCore("solrconfig-tagger.xml", "schema-tagger.xml");
}
@AfterClass
public static void cleanUpAfterClass() throws Exception {
xmlDocBuilder = null;
}
@Override
public void setUp() throws Exception {
super.setUp();
baseParams.set("field", "name_tagXml");
baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
baseParams.set("xmlOffsetAdjust", "true");
}
@Test
public void test() throws Exception {
buildNames("start end");
assertXmlTag("<doc>before start <!-- c --> end after</doc>", true);
assertXmlTag("<doc>before start <br/> end after</doc>", true);
assertXmlTag("<doc>before <em>start</em> <b>end</b> after</doc>", true);
assertXmlTag("<doc>before <em>start</em> end after</doc>", true);
assertXmlTag("<doc>before start end<em> after</em></doc>", true);
assertXmlTag("<doc><em>before </em>start end after</doc>", true);//adjacent tags
assertXmlTag("<doc>before <b> <em>start</em> </b> end after</doc>", true);
assertXmlTag("<doc>before <b> <em>start</em> </b> <em> end </em> after</doc>", true);
assertXmlTag("<doc><p>before start</p> end after</doc>", false);
assertXmlTag("<doc>before start <p>end after</p> </doc>", false);
assertXmlTag("<doc>before <em a='A' b='B'>start</em> <b a='A' b='B'>end</b> after</doc>", true);
}
@Test(expected = SolrException.class)
public void testInvalidXml() throws Exception {
assertXmlTag("notXml", false);
}
@Test(expected = Exception.class)
public void testValidatingXml() throws Exception {
validateXml("foo");
}
protected void assertXmlTag(String docText, boolean expected) throws Exception {
final SolrQueryRequest req = reqDoc(docText);
try { // 5.4 and beyond we can use try-with-resources
final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req);
final TestTag[] testTags = pullTagsFromResponse(req, rsp);
if (!expected) {
assertEquals(0, testTags.length);
} else {
assertEquals(1, testTags.length);
final TestTag tag = testTags[0];
validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName));
}
} finally {
req.close();
}
}
protected void validateXml(String xml) throws Exception {
// the "parse" method also validates XML, will throw an exception if mis-formatted
xmlDocBuilder.parse(new InputSource(new StringReader(xml)));
}
@Test
public void testLuceneHtmlFilterBehavior() {
String docText;
//Close tag adjacent to start & end results in end offset including the close tag. LUCENE-5734
docText = "<doc><a><b>start</b> end</a></doc>";
assertArrayEquals(tagExpect(docText, "start", "end</a>"), analyzeTagOne(docText, "start", "end"));
//Space after "end" means offset doesn't include </a>
docText = "<doc><a><b>start</b> end </a></doc>";
assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end"));
//Matches entity at end
final String endStr = String.format(Locale.ROOT, "en&#x%02x;", (int) 'd');
docText = "<doc>start " + endStr + "</doc>";
assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end"));
//... and at start
final String startStr = String.format(Locale.ROOT, "&#x%02x;tart", (int) 's');
docText = "<doc>" + startStr + " end</doc>";
assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end"));
//Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start".
docText = "<!DOCTYPE start [ "
+ "<!ENTITY start \"start\">"
+ "]><start><?start start ?><!-- start --><start/>&start;</start>";
assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start"));
//Test entity behavior
docText = " &mdash; &ndash; &amp; &foo; &#xA0; a&nbsp;b";
assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"},
analyzeReturnTokens(docText));
//Observe offset adjustment of trailing entity to end tag
docText = "foo&nbsp;bar";
assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo"));
}
private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) {
String insertStart = "<A id='"+ id +"'>";// (normally we'd escape id)
String insertEnd = "</A>";
return docText.substring(0, startOffset)
+ insertStart
+ docText.substring(startOffset, endOffset)
+ insertEnd
+ docText.substring(endOffset);
}
private int[] tagExpect(String docText, String start, String end) {
return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()};
}
private int[] analyzeTagOne(String docText, String start, String end) {
int[] result = {-1, -1};
Reader filter = new HTMLStripCharFilter(new StringReader(docText));
WhitespaceTokenizer ts = new WhitespaceTokenizer();
final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
try {
ts.setReader(filter);
ts.reset();
while (ts.incrementToken()) {
final String termString = termAttribute.toString();
if (termString.equals(start))
result[0] = offsetAttribute.startOffset();
if (termString.equals(end)) {
result[1] = offsetAttribute.endOffset();
return result;
}
}
ts.end();
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
IOUtils.closeQuietly(ts);
}
return result;
}
private String[] analyzeReturnTokens(String docText) {
List<String> result = new ArrayList<>();
Reader filter = new HTMLStripCharFilter(new StringReader(docText),
Collections.singleton("unescaped"));
WhitespaceTokenizer ts = new WhitespaceTokenizer();
final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
try {
ts.setReader(filter);
ts.reset();
while (ts.incrementToken()) {
result.add(termAttribute.toString());
}
ts.end();
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
IOUtils.closeQuietly(ts);
}
return result.toArray(new String[result.size()]);
}
}