/*
 * This software was produced for the U. S. Government
 * under Contract No. W15P7T-11-C-F600, and is
 * subject to the Rights in Noncommercial Computer Software
 * and Noncommercial Computer Software Documentation
 * Clause 252.227-7014 (JUN 1995)
 *
 * Copyright 2013 The MITRE Corporation. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| |
| package org.apache.solr.handler.tagger; |
| |
| import javax.xml.parsers.DocumentBuilder; |
| import javax.xml.parsers.DocumentBuilderFactory; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.List; |
| import java.util.Locale; |
| |
| import org.apache.commons.io.IOUtils; |
| import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; |
| import org.apache.lucene.analysis.core.WhitespaceTokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.solr.common.SolrException; |
| import org.apache.solr.request.SolrQueryRequest; |
| import org.apache.solr.response.SolrQueryResponse; |
| import org.junit.AfterClass; |
| import org.junit.BeforeClass; |
| import org.junit.Test; |
| import org.xml.sax.InputSource; |
| |
| public class XmlInterpolationTest extends TaggerTestCase { |
| |
| private static DocumentBuilder xmlDocBuilder; |
| |
| |
| @BeforeClass |
| public static void beforeClass() throws Exception { |
| DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance(); |
| xmlDocBuilderFactory.setValidating(true); |
| xmlDocBuilderFactory.setNamespaceAware(true); |
| xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder(); |
| |
| initCore("solrconfig-tagger.xml", "schema-tagger.xml"); |
| } |
| |
| @AfterClass |
| public static void cleanUpAfterClass() throws Exception { |
| xmlDocBuilder = null; |
| } |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| baseParams.set("field", "name_tagXml"); |
| baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); |
| baseParams.set("xmlOffsetAdjust", "true"); |
| } |
| |
| @Test |
| public void test() throws Exception { |
| buildNames("start end"); |
| |
| assertXmlTag("<doc>before start <!-- c --> end after</doc>", true); |
| assertXmlTag("<doc>before start <br/> end after</doc>", true); |
| assertXmlTag("<doc>before <em>start</em> <b>end</b> after</doc>", true); |
| assertXmlTag("<doc>before <em>start</em> end after</doc>", true); |
| assertXmlTag("<doc>before start end<em> after</em></doc>", true); |
| assertXmlTag("<doc><em>before </em>start end after</doc>", true);//adjacent tags |
| assertXmlTag("<doc>before <b> <em>start</em> </b> end after</doc>", true); |
| assertXmlTag("<doc>before <b> <em>start</em> </b> <em> end </em> after</doc>", true); |
| |
| assertXmlTag("<doc><p>before start</p> end after</doc>", false); |
| assertXmlTag("<doc>before start <p>end after</p> </doc>", false); |
| |
| assertXmlTag("<doc>before <em a='A' b='B'>start</em> <b a='A' b='B'>end</b> after</doc>", true); |
| } |
| |
| @Test(expected = SolrException.class) |
| public void testInvalidXml() throws Exception { |
| assertXmlTag("notXml", false); |
| } |
| |
| @Test(expected = Exception.class) |
| public void testValidatingXml() throws Exception { |
| validateXml("foo"); |
| } |
| |
| protected void assertXmlTag(String docText, boolean expected) throws Exception { |
| final SolrQueryRequest req = reqDoc(docText); |
| try { // 5.4 and beyond we can use try-with-resources |
| final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req); |
| final TestTag[] testTags = pullTagsFromResponse(req, rsp); |
| if (!expected) { |
| assertEquals(0, testTags.length); |
| } else { |
| assertEquals(1, testTags.length); |
| final TestTag tag = testTags[0]; |
| validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName)); |
| } |
| } finally { |
| req.close(); |
| } |
| } |
| |
| protected void validateXml(String xml) throws Exception { |
| // the "parse" method also validates XML, will throw an exception if mis-formatted |
| xmlDocBuilder.parse(new InputSource(new StringReader(xml))); |
| } |
| |
| |
| @Test |
| public void testLuceneHtmlFilterBehavior() { |
| String docText; |
| |
| //Close tag adjacent to start & end results in end offset including the close tag. LUCENE-5734 |
| docText = "<doc><a><b>start</b> end</a></doc>"; |
| assertArrayEquals(tagExpect(docText, "start", "end</a>"), analyzeTagOne(docText, "start", "end")); |
| |
| //Space after "end" means offset doesn't include </a> |
| docText = "<doc><a><b>start</b> end </a></doc>"; |
| assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end")); |
| |
| //Matches entity at end |
| final String endStr = String.format(Locale.ROOT, "en&#x%02x;", (int) 'd'); |
| docText = "<doc>start " + endStr + "</doc>"; |
| assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end")); |
| //... and at start |
| final String startStr = String.format(Locale.ROOT, "&#x%02x;tart", (int) 's'); |
| docText = "<doc>" + startStr + " end</doc>"; |
| assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end")); |
| |
| //Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start". |
| docText = "<!DOCTYPE start [ " |
| + "<!ENTITY start \"start\">" |
| + "]><start><?start start ?><!-- start --><start/>&start;</start>"; |
| assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start")); |
| |
| //Test entity behavior |
| docText = " — – & &foo;   a b"; |
| assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"}, |
| analyzeReturnTokens(docText)); |
| |
| //Observe offset adjustment of trailing entity to end tag |
| docText = "foo bar"; |
| assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo")); |
| } |
| |
| private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) { |
| String insertStart = "<A id='"+ id +"'>";// (normally we'd escape id) |
| String insertEnd = "</A>"; |
| return docText.substring(0, startOffset) |
| + insertStart |
| + docText.substring(startOffset, endOffset) |
| + insertEnd |
| + docText.substring(endOffset); |
| } |
| |
| private int[] tagExpect(String docText, String start, String end) { |
| return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()}; |
| } |
| |
| private int[] analyzeTagOne(String docText, String start, String end) { |
| int[] result = {-1, -1}; |
| |
| Reader filter = new HTMLStripCharFilter(new StringReader(docText)); |
| |
| WhitespaceTokenizer ts = new WhitespaceTokenizer(); |
| final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); |
| final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class); |
| try { |
| ts.setReader(filter); |
| ts.reset(); |
| while (ts.incrementToken()) { |
| final String termString = termAttribute.toString(); |
| if (termString.equals(start)) |
| result[0] = offsetAttribute.startOffset(); |
| if (termString.equals(end)) { |
| result[1] = offsetAttribute.endOffset(); |
| return result; |
| } |
| } |
| ts.end(); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } finally { |
| IOUtils.closeQuietly(ts); |
| } |
| return result; |
| } |
| |
| private String[] analyzeReturnTokens(String docText) { |
| List<String> result = new ArrayList<>(); |
| |
| Reader filter = new HTMLStripCharFilter(new StringReader(docText), |
| Collections.singleton("unescaped")); |
| WhitespaceTokenizer ts = new WhitespaceTokenizer(); |
| final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); |
| try { |
| ts.setReader(filter); |
| ts.reset(); |
| while (ts.incrementToken()) { |
| result.add(termAttribute.toString()); |
| } |
| ts.end(); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } finally { |
| IOUtils.closeQuietly(ts); |
| } |
| return result.toArray(new String[result.size()]); |
| } |
| |
| } |