/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.extraction;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.BufferingRequestProcessor;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests for the {@link ExtractingRequestHandler} ("/update/extract"), which uses Apache Tika
 * to extract content and metadata from binary and structured documents.
 */
public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
// Is the JDK/env affected by a known bug?
final String tzDisplayName = TimeZone.getDefault().getDisplayName(false, TimeZone.SHORT, Locale.US);
if (!tzDisplayName.matches("[A-Za-z]{3,}([+-]\\d\\d(:\\d\\d)?)?")) {
assertTrue("Is some other JVM affected? Or bad regex? TzDisplayName: " + tzDisplayName,
System.getProperty("java.version").startsWith("11"));
assumeTrue("SOLR-12759 JDK 11 (1st release) and Tika 1.x can result in extracting dates in a bad format.", false);
}
initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr").getAbsolutePath());
}
@Override
@Before
public void setUp() throws Exception {
super.setUp();
clearIndex();
assertU(commit());
}
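/**
 * Loads a variety of file types (PDF, HTML, XML, Word, gzipped text, ODT) through the extract
 * handler and verifies field mapping (fmap.*), unknown-field prefixing (uprefix), lowernames,
 * attribute capture and commit behaviour.
 */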
@Test
public void testExtraction() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
loadLocal("extraction/solr-word.pdf",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "extractedContent",
"literal.id", "one",
"fmap.Last-Modified", "extractedDate"
);
assertQ(req("title:solr-word"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("title:solr-word"), "//*[@numFound='1']");
loadLocal("extraction/simple.html", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"fmap.language", "extractedLanguage",
"literal.id", "two",
"uprefix", "ignored_",
"fmap.content", "extractedContent",
"fmap.Last-Modified", "extractedDate"
);
assertQ(req("title:Welcome"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("title:Welcome"), "//*[@numFound='1']");
assertQ(req("extractedContent:distinctwords"), "//*[@numFound='0']");
assertQ(req("extractedContent:distinct"), "//*[@numFound='1']");
assertQ(req("extractedContent:words"), "//*[@numFound='2']");
assertQ(req("extractedContent:\"distinct words\""), "//*[@numFound='1']");
loadLocal("extraction/simple.html",
"literal.id","simple2",
"uprefix", "t_",
"lowernames", "true",
"captureAttr", "true",
"fmap.a","t_href",
"fmap.content_type", "abcxyz", // test that lowernames is applied before mapping, and uprefix is applied after mapping
"commit", "true" // test immediate commit
);
// a test that purposely causes a failure, so the doc is printed out for debugging
// assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']");
// test both lowernames and unknown field mapping
//assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']");
assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
assertQ(req("+id:simple2 +t_content:serif"), "//*[@numFound='0']"); // make sure <style> content is excluded
assertQ(req("+id:simple2 +t_content:blur"), "//*[@numFound='0']"); // make sure <script> content is excluded
// make sure the presence of an index-time boost does not cause parsing to fail
loadLocal("extraction/simple.html",
"literal.id","simple3",
"uprefix", "t_",
"lowernames", "true",
"captureAttr", "true", "fmap.a","t_href",
"commit", "true"
,"boost.t_href", "100.0"
);
assertQ(req("t_href:http"), "//*[@numFound='2']");
assertQ(req("t_href:http"), "//doc[2]/str[.='simple3']");
assertQ(req("+id:simple3 +t_content_type:[* TO *]"), "//*[@numFound='1']"); // test lowernames followed by uprefix
loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "three",
"uprefix", "ignored_",
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Last-Modified", "extractedDate"
);
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
loadLocal("extraction/word2003.doc", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "four",
"uprefix", "ignored_",
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Last-Modified", "extractedDate"
);
assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='0']");
// There is already a PDF file with this content:
assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='1']");
assertU(commit());
assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='1']");
// now 2 of them:
assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='2']");
// compressed file
loadLocal("extraction/tiny.txt.gz",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"uprefix", "ignored_",
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Last-Modified", "extractedDate",
"literal.id", "tiny.txt.gz");
assertU(commit());
assertQ(req("id:tiny.txt.gz")
, "//*[@numFound='1']"
, "//*/arr[@name='stream_name']/str[.='tiny.txt.gz']"
);
// OpenDocument (ODT) file, a zip container
loadLocal("extraction/open-document.odt",
"uprefix", "ignored_",
"fmap.content", "extractedContent",
"literal.id", "open-document");
assertU(commit());
assertQ(req("extractedContent:\"Práctica sobre GnuPG\"")
, "//*[@numFound='1']"
, "//*/arr[@name='stream_name']/str[.='open-document.odt']"
);
}
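/** Verifies the capture and captureAttr parameters: captured elements (div, a) are indexed into their own mapped fields. */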
@Test
public void testCapture() throws Exception {
loadLocal("extraction/simple.html",
"literal.id","capture1",
"uprefix","t_",
"capture","div",
"fmap.div", "foo_t",
"commit", "true"
);
assertQ(req("+id:capture1 +t_content:Solr"), "//*[@numFound='1']");
assertQ(req("+id:capture1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
loadLocal("extraction/simple.html",
"literal.id", "capture2",
"captureAttr", "true",
"defaultField", "text",
"fmap.div", "div_t",
"fmap.a", "anchor_t",
"capture", "div",
"capture", "a",
"commit", "true"
);
assertQ(req("+id:capture2 +text:Solr"), "//*[@numFound='1']");
assertQ(req("+id:capture2 +div_t:\"here is some text in a div\""), "//*[@numFound='1']");
assertQ(req("+id:capture2 +anchor_t:http\\://www.apache.org"), "//*[@numFound='1']");
assertQ(req("+id:capture2 +anchor_t:link"), "//*[@numFound='1']");
}
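/**
 * Verifies that unknown fields cause a failure when neither uprefix nor a default field is given,
 * and that unmapped content is routed to the configured default field otherwise.
 */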
@Test
public void testDefaultField() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertNotNull("handler is null and it shouldn't be", handler);
try {
ignoreException("unknown field 'a'");
ignoreException("unknown field 'meta'"); // TODO: should this exception be happening?
expectThrows(SolrException.class, () -> {
loadLocal("extraction/simple.html",
"literal.id", "simple2",
"lowernames", "true",
"captureAttr", "true",
//"fmap.content_type", "abcxyz",
"commit", "true" // test immediate commit
);
});
} finally {
resetExceptionIgnores();
}
loadLocal("extraction/simple.html",
"literal.id","simple2",
ExtractingParams.DEFAULT_FIELD, "defaultExtr", // test that unmapped fields go to the default field when no uprefix is specified
"lowernames", "true",
"captureAttr", "true",
//"fmap.content_type", "abcxyz",
"commit", "true" // test immediate commit
);
assertQ(req("id:simple2"), "//*[@numFound='1']");
assertQ(req("defaultExtr:http\\:\\/\\/www.apache.org"), "//*[@numFound='1']");
//Test when both uprefix and default are specified.
loadLocal("extraction/simple.html",
"literal.id","simple2",
ExtractingParams.DEFAULT_FIELD, "defaultExtr", // default field for unmapped content, this time combined with uprefix
ExtractingParams.UNKNOWN_FIELD_PREFIX, "t_",
"lowernames", "true",
"captureAttr", "true",
"fmap.a","t_href",
//"fmap.content_type", "abcxyz",
"commit", "true" // test immediate commit
);
assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
}
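/** Verifies literal.* parameters, including multi-valued literals, in combination with the usual field mappings. */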
@Test
public void testLiterals() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
//test literal
loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"fmap.content", "extractedContent",
"literal.id", "one",
"uprefix", "ignored_",
"fmap.language", "extractedLanguage",
"literal.extractionLiteralMV", "one",
"literal.extractionLiteralMV", "two",
"fmap.Last-Modified", "extractedDate"
);
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
assertQ(req("extractionLiteralMV:one"), "//*[@numFound='1']");
assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']");
try {
// TODO: original author did not specify why an exception should be thrown... how to fix?
loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"fmap.content", "extractedContent",
"literal.id", "two",
"fmap.language", "extractedLanguage",
"literal.extractionLiteral", "one",
"literal.extractionLiteral", "two",
"fmap.X-Parsed-By", "ignored_parser",
"fmap.Last-Modified", "extractedDate"
);
// TODO: original author did not specify why an exception should be thrown... how to fix?
// assertTrue("Exception should have been thrown", false);
} catch (SolrException e) {
//nothing to see here, move along
}
loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"fmap.content", "extractedContent",
"literal.id", "three",
"fmap.language", "extractedLanguage",
"literal.extractionLiteral", "one",
"fmap.X-Parsed-By", "ignored_parser",
"fmap.Last-Modified", "extractedDate"
);
assertU(commit());
assertQ(req("extractionLiteral:one"), "//*[@numFound='1']");
}
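/** Verifies how literal.* request parameters interact with the defaults, appends and invariants configured on the /update/extract/lit-def handler. */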
public void testLiteralDefaults() throws Exception {
// sanity check config
loadLocalFromHandler("/update/extract/lit-def",
"extraction/simple.html",
"literal.id", "lit-def-simple");
assertU(commit());
assertQ(req("q", "id:lit-def-simple")
, "//*[@numFound='1']"
, "count(//arr[@name='foo_s']/str)=1"
, "//arr[@name='foo_s']/str[.='x']"
, "count(//arr[@name='bar_s']/str)=1"
, "//arr[@name='bar_s']/str[.='y']"
, "count(//arr[@name='zot_s']/str)=1"
, "//arr[@name='zot_s']/str[.='z']"
);
// override the default foo_s
loadLocalFromHandler("/update/extract/lit-def",
"extraction/simple.html",
"literal.foo_s", "1111",
"literal.id", "lit-def-simple");
assertU(commit());
assertQ(req("q", "id:lit-def-simple")
, "//*[@numFound='1']"
, "count(//arr[@name='foo_s']/str)=1"
, "//arr[@name='foo_s']/str[.='1111']"
, "count(//arr[@name='bar_s']/str)=1"
, "//arr[@name='bar_s']/str[.='y']"
, "count(//arr[@name='zot_s']/str)=1"
, "//arr[@name='zot_s']/str[.='z']"
);
// pre-pend the bar_s
loadLocalFromHandler("/update/extract/lit-def",
"extraction/simple.html",
"literal.bar_s", "2222",
"literal.id", "lit-def-simple");
assertU(commit());
assertQ(req("q", "id:lit-def-simple")
, "//*[@numFound='1']"
, "count(//arr[@name='foo_s']/str)=1"
, "//arr[@name='foo_s']/str[.='x']"
, "count(//arr[@name='bar_s']/str)=2"
, "//arr[@name='bar_s']/str[.='2222']"
, "//arr[@name='bar_s']/str[.='y']"
, "count(//arr[@name='zot_s']/str)=1"
, "//arr[@name='zot_s']/str[.='z']"
);
// invariant zot_s cannot be changed
loadLocalFromHandler("/update/extract/lit-def",
"extraction/simple.html",
"literal.zot_s", "3333",
"literal.id", "lit-def-simple");
assertU(commit());
assertQ(req("q", "id:lit-def-simple")
, "//*[@numFound='1']"
, "count(//arr[@name='foo_s']/str)=1"
, "//arr[@name='foo_s']/str[.='x']"
, "count(//arr[@name='bar_s']/str)=1"
, "//arr[@name='bar_s']/str[.='y']"
, "count(//arr[@name='zot_s']/str)=1"
, "//arr[@name='zot_s']/str[.='z']"
);
}
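/** Loads a plain text file with an explicit stream type of text/plain and verifies its content is extracted. */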
@Test
public void testPlainTextSpecifyingMimeType() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
// Load plain text specifying MIME type:
loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "one",
"fmap.language", "extractedLanguage",
"fmap.X-Parsed-By", "ignored_parser",
"fmap.content", "extractedContent",
ExtractingParams.STREAM_TYPE, "text/plain"
);
assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
}
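/** Loads a plain text file with an explicit resource name so the content type can be detected from the file name. */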
@Test
public void testPlainTextSpecifyingResourceName() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
// Load plain text specifying filename
loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "one",
"fmap.language", "extractedLanguage",
"fmap.X-Parsed-By", "ignored_parser",
"fmap.content", "extractedContent",
ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt"
);
assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
}
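/** Verifies that the commitWithin request parameter is passed through to the resulting AddUpdateCommand. */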
@Test
public void testCommitWithin() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
SolrQueryRequest req = req("literal.id", "one",
ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt",
"commitWithin", "200"
);
SolrQueryResponse rsp = new SolrQueryResponse();
BufferingRequestProcessor p = new BufferingRequestProcessor(null);
ExtractingDocumentLoader loader = (ExtractingDocumentLoader) handler.newLoader(req, p);
loader.load(req, rsp, new ContentStreamBase.FileStream(getFile("extraction/version_control.txt")),p);
AddUpdateCommand add = p.addCommands.get(0);
assertEquals(200, add.commitWithin);
req.close();
}
// Note: if you load a plain text file specifying neither MIME type nor filename, extraction will silently fail. This is because Tika's
// automatic MIME type detection will fail, and it will fall back to a default parser that returns an empty string.
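/**
 * Verifies that extractOnly=true returns the extracted content (XHTML by default, plain text when
 * the text format is requested) plus the document metadata in the response, instead of indexing the document.
 */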
@Test
public void testExtractOnly() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
SolrQueryResponse rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true");
assertTrue("rsp is null and it shouldn't be", rsp != null);
@SuppressWarnings({"rawtypes"})
NamedList list = rsp.getValues();
String extraction = (String) list.get("solr-word.pdf");
assertTrue("extraction is null and it shouldn't be", extraction != null);
assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
@SuppressWarnings({"rawtypes"})
NamedList nl = (NamedList) list.get("solr-word.pdf_metadata");
assertTrue("nl is null and it shouldn't be", nl != null);
Object title = nl.get("title");
assertTrue("title is null and it shouldn't be", title != null);
assertTrue(extraction.indexOf("<?xml") != -1);
rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true",
ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
assertTrue("rsp is null and it shouldn't be", rsp != null);
list = rsp.getValues();
extraction = (String) list.get("solr-word.pdf");
assertTrue("extraction is null and it shouldn't be", extraction != null);
assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
assertTrue(extraction.indexOf("<?xml") == -1);
nl = (NamedList) list.get("solr-word.pdf_metadata");
assertTrue("nl is null and it shouldn't be", nl != null);
title = nl.get("title");
assertTrue("title is null and it shouldn't be", title != null);
}
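/** Verifies that an XPath expression restricts which parts of the parsed XHTML are extracted, both in extract-only mode and when indexing. */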
@Test
public void testXPath() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
SolrQueryResponse rsp = loadLocal("extraction/example.html",
ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant::node()",
ExtractingParams.EXTRACT_ONLY, "true"
);
assertTrue("rsp is null and it shouldn't be", rsp != null);
@SuppressWarnings({"rawtypes"})
NamedList list = rsp.getValues();
String val = (String) list.get("example.html");
assertEquals("News", val.trim()); //there is only one matching <a> tag
loadLocal("extraction/example.html",
"literal.id", "example1",
"captureAttr", "true",
"defaultField", "text",
"capture", "div",
"fmap.div", "foo_t",
"boost.foo_t", "3",
"xpath", "/xhtml:html/xhtml:body/xhtml:div//node()",
"commit", "true"
);
assertQ(req("+id:example1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
}
/** Test that Arabic PDF extraction is functional. */
@Test
public void testArabicPDF() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
loadLocal("extraction/arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"fmap.Author", "extractedAuthor",
"uprefix", "ignored_",
"fmap.content", "wdf_nocase",
"literal.id", "one",
"fmap.Last-Modified", "extractedDate");
assertQ(req("wdf_nocase:السلم"), "//result[@numFound=0]");
assertU(commit());
assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
}
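/** Verifies that a Tika parse failure propagates as an exception unless ignoreTikaException=true is set. */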
@Test
public void testTikaExceptionHandling() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
expectThrows(Exception.class, () -> {
loadLocal("extraction/password-is-solrcell.docx", "literal.id", "one");
});
assertU(commit());
assertQ(req("*:*"), "//result[@numFound=0]");
try{
loadLocal("extraction/password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "one",
"ignoreTikaException", "true", // set ignore flag
"fmap.Last-Modified", "extractedDate");
}
catch(Exception e){
fail("TikaException should be ignored.");
}
assertU(commit());
assertQ(req("*:*"), "//result[@numFound=1]");
}
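/** Verifies that supplying a wrong or unknown stream type causes the request to fail. */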
@Test
public void testWrongStreamType() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
expectThrows(Exception.class, () -> {
// Load plain text specifying the wrong MIME type; should fail
loadLocal("extraction/version_control.txt",
"literal.id", "one",
ExtractingParams.STREAM_TYPE, "application/pdf"
);
});
expectThrows(Exception.class, () -> {
// Load plain text specifying a non-existent MIME type; should fail
loadLocal("extraction/version_control.txt",
"literal.id", "one",
ExtractingParams.STREAM_TYPE, "foo/bar"
);
});
}
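/** Verifies that literal values override Tika-extracted metadata by default, and are added alongside it when literalsOverride=false. */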
public void testLiteralsOverride() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
assertQ(req("*:*"), "//*[@numFound='0']");
// Here Tika should parse out a title for this document:
loadLocal("extraction/solr-word.pdf",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "three",
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Last-Modified", "extractedDate");
// Here the literal value should override the Tika-parsed title:
loadLocal("extraction/solr-word.pdf",
"literal.title", "wolf-man",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "four",
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Last-Modified", "extractedDate");
// Here we mimic the old behaviour where literals are added, not overridden
loadLocal("extraction/solr-word.pdf",
"literalsOverride", "false",
// Trick: first map the metadata title to an ignored field, then replace it with the literal title
"fmap.title", "ignored_a",
"literal.title", "old-behaviour",
"literal.extractedKeywords", "literalkeyword",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "five",
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Last-Modified", "extractedDate");
assertU(commit());
assertQ(req("title:solr-word"), "//*[@numFound='1']");
assertQ(req("title:wolf-man"), "//*[@numFound='1']");
assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
}
@Test
public void testPdfWithImages() throws Exception {
// Tests the ability to configure the ParseContext (for example, to extract embedded images from a PDF)
loadLocal("extraction/pdf-with-image.pdf",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "pdfWithImage",
"resource.name", "pdf-with-image.pdf",
"resource.password", "solrRules",
"fmap.Last-Modified", "extractedDate");
assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
assertU(commit());
assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
}
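/** Verifies extraction from password-protected PDF and DOCX files using resource.password and a passwords file. */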
@Test
public void testPasswordProtected() throws Exception {
// PDF, Passwords from resource.password
loadLocal("extraction/encrypted-password-is-solrRules.pdf",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "pdfpwliteral",
"resource.name", "encrypted-password-is-solrRules.pdf",
"resource.password", "solrRules",
"fmap.Last-Modified", "extractedDate");
// PDF, Passwords from passwords property file
loadLocal("extraction/encrypted-password-is-solrRules.pdf",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "pdfpwfile",
"resource.name", "encrypted-password-is-solrRules.pdf",
"passwordsFile", "passwordRegex.properties", // Passwords-file
"fmap.Last-Modified", "extractedDate");
// DOCX, Explicit password
loadLocal("extraction/password-is-Word2010.docx",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"uprefix", "ignored_",
"literal.id", "docxpwliteral",
"resource.name", "password-is-Word2010.docx",
"resource.password", "Word2010", // Explicit password
"fmap.Last-Modified", "extractedDate");
// DOCX, Passwords from file
loadLocal("extraction/password-is-Word2010.docx",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "docxpwfile",
"resource.name", "password-is-Word2010.docx",
"passwordsFile", "passwordRegex.properties", // Passwords-file
"fmap.Last-Modified", "extractedDate");
assertU(commit());
Thread.sleep(100);
assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']");
assertQ(req("wdf_nocase:\"Test password protected word doc\""), "//*[@numFound='2']");
}
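/** Sends the given file to the named request handler as a content stream and returns the response. */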
SolrQueryResponse loadLocalFromHandler(String handler, String filename,
String... args) throws Exception {
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
try {
// TODO: stop using locally defined streams once stream.file and
// stream.body work everywhere
List<ContentStream> cs = new ArrayList<>();
cs.add(new ContentStreamBase.FileStream(getFile(filename)));
req.setContentStreams(cs);
return h.queryAndResponse(handler, req);
} finally {
req.close();
}
}
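/** Sends the given file to the default /update/extract handler. */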
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
return loadLocalFromHandler("/update/extract", filename, args);
}
}