Merge pull request #462 from sebastian-nagel/NUTCH-2729-protocol-okhttp-mark-truncated
NUTCH-2729 protocol-okhttp: fix marking of truncated content
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 41a337a..fd201c7 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1314,6 +1314,20 @@
</description>
</property>
+<property>
+ <name>indexer.indexwriters.file</name>
+ <value>index-writers.xml</value>
+ <description>Name of the index writers configuration file, loaded as a configuration resource. Defaults to index-writers.xml.</description>
+</property>
+
+<!-- Exchanges properties -->
+
+<property>
+ <name>exchanges.exchanges.file</name>
+ <value>exchanges.xml</value>
+ <description>Name of the exchanges configuration file used by the Exchange component, loaded as a configuration resource. Defaults to exchanges.xml.</description>
+</property>
+
<!-- URL normalizer properties -->
<property>
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index e753c6f..2ffeac4 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -80,12 +80,11 @@
<exclude module="hadoop-client" />
</dependency>
- <!--dependency org="org.apache.cxf" name="cxf" rev="3.0.4" conf="*->default"/-->
- <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.2.7" conf="*->default"/>
- <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.2.7" conf="*->default"/>
- <dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.2.7" conf="*->default"/>
- <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.2.7" conf="*->default"/>
- <dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.2.7" conf="test->default"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.3.3" conf="*->default"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.3.3" conf="*->default"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.3.3" conf="*->default"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.3.3" conf="*->default"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.3.3" conf="test->default"/>
<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.9" conf="*->default"/>
<dependency org="com.fasterxml.jackson.core" name="jackson-annotations" rev="2.9.9" conf="*->default"/>
<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.9" conf="*->default"/>
diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml
index 7a3b949..18038a5 100644
--- a/ivy/ivysettings.xml
+++ b/ivy/ivysettings.xml
@@ -29,14 +29,6 @@
value="[organisation]/[module]/[revision]/[module]-[revision](-[classifier])"/>
<property name="maven2.pattern.ext"
value="${maven2.pattern}.[ext]"/>
- <!-- define packaging.type=jar to work around the failing dependency download of
- javax.ws.rs-api.jar
- required by Tika (1.19 and higher), cf.
- https://github.com/eclipse-ee4j/jaxrs-api/issues/572
- https://github.com/jax-rs/api/pull/576
- -->
- <property name="packaging.type"
- value="jar"/>
<!-- pull in the local repository -->
<include url="${ivy.default.conf.dir}/ivyconf-local.xml"/>
<settings defaultResolver="default"/>
diff --git a/src/java/org/apache/nutch/exchange/Exchanges.java b/src/java/org/apache/nutch/exchange/Exchanges.java
index 1f443d4..1e0518b 100644
--- a/src/java/org/apache/nutch/exchange/Exchanges.java
+++ b/src/java/org/apache/nutch/exchange/Exchanges.java
@@ -96,8 +96,10 @@
* @return An array with each exchange's configuration.
*/
private ExchangeConfig[] loadConfigurations(Configuration conf) {
+ String filename = conf.get("exchanges.exchanges.file",
+ "exchanges.xml");
InputSource inputSource = new InputSource(
- conf.getConfResourceAsInputStream("exchanges.xml"));
+ conf.getConfResourceAsInputStream(filename));
final List<ExchangeConfig> configList = new LinkedList<>();
@@ -120,7 +122,7 @@
}
} catch (SAXException | IOException | ParserConfigurationException e) {
- LOG.warn(e.toString());
+ LOG.error(e.toString());
}
return configList.toArray(new ExchangeConfig[0]);
diff --git a/src/java/org/apache/nutch/indexer/IndexWriters.java b/src/java/org/apache/nutch/indexer/IndexWriters.java
index 9fac2e2..5778997 100644
--- a/src/java/org/apache/nutch/indexer/IndexWriters.java
+++ b/src/java/org/apache/nutch/indexer/IndexWriters.java
@@ -16,7 +16,6 @@
*/
package org.apache.nutch.indexer;
-import de.vandermeer.asciitable.AT_ColumnWidthCalculator;
import de.vandermeer.asciitable.AT_Row;
import de.vandermeer.asciitable.AsciiTable;
import de.vandermeer.skb.interfaces.document.TableRowType;
@@ -115,8 +114,10 @@
* @param conf Nutch configuration instance.
*/
private IndexWriterConfig[] loadWritersConfiguration(Configuration conf) {
+ String filename = conf.get("indexer.indexwriters.file",
+ "index-writers.xml");
InputStream ssInputStream = conf
- .getConfResourceAsInputStream("index-writers.xml");
+ .getConfResourceAsInputStream(filename);
InputSource inputSource = new InputSource(ssInputStream);
try {
@@ -136,7 +137,7 @@
return indexWriterConfigs;
} catch (SAXException | IOException | ParserConfigurationException e) {
- LOG.warn(e.toString());
+ LOG.error(e.toString());
return new IndexWriterConfig[0];
}
}
@@ -218,6 +219,10 @@
public void write(NutchDocument doc) throws IOException {
for (String indexWriterId : getIndexWriters(doc)) {
+ if (!this.indexWriters.containsKey(indexWriterId)) {
+ LOG.warn("Index writer {} is not present. Maybe the plugin is not in plugin.includes or there is a misspelling.", indexWriterId);
+ continue;
+ }
NutchDocument mappedDocument = mapDocument(doc,
this.indexWriters.get(indexWriterId).getIndexWriterConfig()
.getMapping());
@@ -228,6 +233,10 @@
public void update(NutchDocument doc) throws IOException {
for (String indexWriterId : getIndexWriters(doc)) {
+ if (!this.indexWriters.containsKey(indexWriterId)) {
+ LOG.warn("Index writer {} is not present. Maybe the plugin is not in plugin.includes or there is a misspelling.", indexWriterId);
+ continue;
+ }
NutchDocument mappedDocument = mapDocument(doc,
this.indexWriters.get(indexWriterId).getIndexWriterConfig()
.getMapping());
diff --git a/src/plugin/parse-tika/build-ivy.xml b/src/plugin/parse-tika/build-ivy.xml
index a8a0fe9..738f041 100644
--- a/src/plugin/parse-tika/build-ivy.xml
+++ b/src/plugin/parse-tika/build-ivy.xml
@@ -25,13 +25,6 @@
<property name="ivy.checksums" value="" />
<property name="ivy.jar.dir" value="${ivy.home}/lib" />
<property name="ivy.jar.file" value="${ivy.jar.dir}/ivy-${ivy.install.version}.jar" />
- <!-- define packaging.type=jar to work around the failing dependency download of
- javax.ws.rs-api.jar
- required by Tika (1.19 and higher), cf.
- https://github.com/eclipse-ee4j/jaxrs-api/issues/572
- https://github.com/jax-rs/api/pull/576
- -->
- <property name="packaging.type" value="jar"/>
<target name="download-ivy" unless="offline">