Merge pull request #567 from sebastian-nagel/NUTCH-2847-http-date-format-new-api
NUTCH-2847 HttpDateFormat: Simplify based on new Java 8 DateTime API
diff --git a/.gitignore b/.gitignore
index 249ca77..0612a99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@
ivy/ivy-2.4.0.jar
ivy/ivy-2.5.0-rc1.jar
ivy/ivy-2.5.0.jar
+ivy/spotbugs-*/
naivebayes-model
.naivebayes-model.crc
.gitconfig
@@ -24,3 +25,5 @@
*.iml
*.swp
csvindexwriter
+lib/spotbugs-*
+ivy/dependency-check-ant/*
diff --git a/build.xml b/build.xml
index 68a0f44..57ec4fa 100644
--- a/build.xml
+++ b/build.xml
@@ -37,12 +37,14 @@
<property name="maven-javadoc-jar" value="${release.dir}/${artifactId}-${version}-javadoc.jar" />
<property name="maven-sources-jar" value="${release.dir}/${artifactId}-${version}-sources.jar" />
+ <property name="dependency-check-ant.version" value="6.1.0" />
+ <property name="dependency-check-ant.home" value="${ivy.dir}/dependency-check-ant" />
+ <property name="dependency-check-ant.jar" value="${dependency-check-ant.home}/dependency-check-ant.jar" />
+
<property environment="env"/>
- <property name="dependency-check.home" value="${ivy.dir}/dependency-check-ant/"/>
-
- <property name="spotbugs.version" value="4.1.1" />
- <property name="spotbugs.home" value="${basedir}/lib/spotbugs-${spotbugs.version}" />
+ <property name="spotbugs.version" value="4.2.0" />
+ <property name="spotbugs.home" value="${ivy.dir}/spotbugs-${spotbugs.version}" />
<property name="spotbugs.jar" value="${spotbugs.home}/lib/spotbugs-ant.jar" />
<property name="apache-rat.version" value="0.13" />
@@ -241,6 +243,7 @@
<packageset dir="${plugins.dir}/scoring-opic/src/java"/>
<packageset dir="${plugins.dir}/scoring-orphan/src/java"/>
<packageset dir="${plugins.dir}/scoring-similarity/src/java"/>
+ <packageset dir="${plugins.dir}/scoring-metadata/src/java"/>
<packageset dir="${plugins.dir}/subcollection/src/java"/>
<packageset dir="${plugins.dir}/tld/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/>
@@ -646,24 +649,38 @@
</target>
<!-- Check dependencies for security vulnerabilities -->
- <!-- requires installation of OWASP dependency check tool, see -->
- <!-- https://jeremylong.github.io/DependencyCheck/dependency-check-ant/index.html -->
- <!-- get http://dl.bintray.com/jeremy-long/owasp/dependency-check-ant-3.3.2-release.zip -->
- <!-- and unzip in directory ./ivy/ -->
- <path id="dependency-check.path">
- <pathelement location="${dependency-check.home}/dependency-check-ant.jar"/>
- <fileset dir="${dependency-check.home}/lib" erroronmissingdir="false">
+ <target name="dependency-check-ant-download" description="--> download dependency-check-ant jar">
+ <available file="${dependency-check-ant.jar}" property="dependency-check-ant.jar.found"/>
+ <antcall target="dependency-check-ant-download-unchecked"/>
+ </target>
+
+ <target name="dependency-check-ant-download-unchecked" unless="dependency-check-ant.jar.found"
+ description="--> downloads the dependency-check-ant binary (dependency-check-ant-*.zip).">
+ <get src="https://github.com/jeremylong/DependencyCheck/releases/download/v${dependency-check-ant.version}/dependency-check-ant-${dependency-check-ant.version}-release.zip"
+ dest="${ivy.dir}/dependency-check-ant-${dependency-check-ant.version}-release.zip" usetimestamp="false" />
+
+ <unzip src="${ivy.dir}/dependency-check-ant-${dependency-check-ant.version}-release.zip"
+ dest="${ivy.dir}">
+ </unzip>
+
+ <delete file="${ivy.dir}/dependency-check-ant-${dependency-check-ant.version}-release.zip" />
+ </target>
+
+ <path id="dependency-check-ant.path">
+ <pathelement location="${dependency-check-ant.home}/dependency-check-ant.jar"/>
+ <fileset dir="${dependency-check-ant.home}/lib">
<include name="*.jar"/>
</fileset>
</path>
- <taskdef resource="dependency-check-taskdefs.properties" onerror="ignore">
- <classpath refid="dependency-check.path" />
- </taskdef>
- <target name="report-vulnerabilities" description="--> check dependencies for security vulnerabilities">
+
+ <target name="report-vulnerabilities" depends="jar, compile-plugins, dependency-check-ant-download" description="--> check dependencies for security vulnerabilities">
+ <taskdef resource="dependency-check-taskdefs.properties">
+ <classpath refid="dependency-check-ant.path" />
+ </taskdef>
<dependency-check projectname="${name}"
- reportoutputdirectory="${build.dir}"
+ reportoutputdirectory="${dependency-check-ant.home}"
reportformat="ALL">
- <suppressionfile path="${dependency-check.home}/dependency-check-suppressions.xml" />
+ <suppressionfile path="${dependency-check-ant.home}/dependency-check-suppressions.xml" />
<retirejsFilter regex="copyright.*jeremy long" />
<fileset dir="${build.dir}">
<include name="lib/*.jar"/>
@@ -754,6 +771,7 @@
<packageset dir="${plugins.dir}/scoring-opic/src/java"/>
<packageset dir="${plugins.dir}/scoring-orphan/src/java"/>
<packageset dir="${plugins.dir}/scoring-similarity/src/java"/>
+ <packageset dir="${plugins.dir}/scoring-metadata/src/java"/>
<packageset dir="${plugins.dir}/subcollection/src/java"/>
<packageset dir="${plugins.dir}/tld/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/>
@@ -1066,20 +1084,19 @@
<target name="spotbugs-download-unchecked" unless="spotbugs.jar.found"
description="--> downloads the spotbugs binary (spotbugs-*.tgz).">
<get src="https://github.com/spotbugs/spotbugs/releases/download/${spotbugs.version}/spotbugs-${spotbugs.version}.tgz "
- dest="${basedir}/lib/spotbugs-${spotbugs.version}.tgz" usetimestamp="false" />
+ dest="${ivy.dir}/spotbugs-${spotbugs.version}.tgz" usetimestamp="false" />
- <untar src="${basedir}/lib/spotbugs-${spotbugs.version}.tgz"
- dest="${basedir}/lib/" compression="gzip">
+ <untar src="${ivy.dir}/spotbugs-${spotbugs.version}.tgz"
+ dest="${ivy.dir}" compression="gzip">
</untar>
- <delete file="${basedir}/lib/spotbugs-${spotbugs.version}.tgz" />
+ <delete file="${ivy.dir}/spotbugs-${spotbugs.version}.tgz" />
</target>
- <taskdef
- resource="edu/umd/cs/findbugs/anttask/tasks.properties"
- classpath="${spotbugs.jar}" />
-
<target name="spotbugs" depends="jar, compile-plugins, spotbugs-download" description="--> runs spotbugs source code analysis.">
+ <taskdef
+ resource="edu/umd/cs/findbugs/anttask/tasks.properties"
+ classpath="${spotbugs.jar}" />
<spotbugs home="${spotbugs.home}"
output="html"
outputFile="${build.dir}/nutch-spotbugs.html"
@@ -1262,6 +1279,8 @@
<source path="${plugins.dir}/scoring-orphan/src/java"/>
<source path="${plugins.dir}/scoring-orphan/src/test"/>
<source path="${plugins.dir}/scoring-similarity/src/java/" />
+ <source path="${plugins.dir}/scoring-metadata/src/java/" />
+ <source path="${plugins.dir}/scoring-metadata/src/test" />
<source path="${plugins.dir}/subcollection/src/java/" />
<source path="${plugins.dir}/subcollection/src/test/" />
<source path="${plugins.dir}/tld/src/java/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 36c6f86..5548a30 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1871,6 +1871,37 @@
</description>
</property>
+
+<!-- scoring metadata properties
+Add scoring-metadata to the list of active plugins
+ in the parameter 'plugin.includes' in order to use it.
+ -->
+<property>
+ <name>scoring.db.md</name>
+ <value></value>
+ <description>
+ Comma-separated list of keys to be taken from crawldb metadata of a url to the fetched content metadata.
+ </description>
+</property>
+
+<property>
+ <name>scoring.content.md</name>
+ <value></value>
+ <description>
+ Comma-separated list of keys to be taken from content metadata of a url and put as metadata in the parse data.
+ </description>
+</property>
+
+<property>
+ <name>scoring.parse.md</name>
+ <value></value>
+ <description>
+ Comma-separated list of keys to be taken from metadata of the parse data of a url and propagated as metadata to the url outlinks.
+ </description>
+</property>
+
+
+
<!-- language-identifier plugin properties -->
<property>
diff --git a/conf/suffix-urlfilter.txt.template b/conf/suffix-urlfilter.txt.template
index 6f02aed..e329f3c 100644
--- a/conf/suffix-urlfilter.txt.template
+++ b/conf/suffix-urlfilter.txt.template
@@ -19,13 +19,18 @@
### prohibit these
# pictures
.gif
+.gifv
.jpg
.jpeg
+.jp2
+.jpf
+.jpx
.bmp
.png
.tif
.tiff
.ico
+.icns
.eps
.ps
.wmf
@@ -38,13 +43,19 @@
.psp
.psd
.tga
+.webp
.xbm
.xpm
+.kdc
+.svg
+.svgz
# web-formats
.css
+.js
# archives/packages
+.apk
.arj
.arc
.7z
@@ -52,14 +63,25 @@
.lzw
.lha
.lzh
+.mar
.zip
.gz
.tar
.tgz
+.rar
.sit
.rpm
.deb
+.udeb
.pkg
+.bz2
+.dmg
+.lzma
+.xz
+.ipk
+.whl
+.egg
+.crx
# audio/video
.mid
@@ -68,11 +90,19 @@
.mpeg
.mpg
.mpe
+.mp4
.mp3
.mp2
.aac
.mov
+.m4a
+.m4r
+.m4v
+.mp4a
+.mpga
+.f4v
.fla
+.flac
.flv
.ra
.ram
@@ -82,14 +112,41 @@
.wmv
.wav
.wave
+.oga
.ogg
+.webm
.avi
+.avif
.au
.snd
+.3gp
+.3g2
+.qt
+.mka
+.mks
+.mkv
+.mk3d
+.opus
+.xm
+.m3u8
+.movie
+.aif
+.aiff
+.gblorb
+.xhr
-# executables
+# fonts
+.ttf
+.otf
+.pfb
+.afm
+.woff
+.woff2
+
+# executables and shared libraries
.exe
.com
+.dll
# windows links
.lnk
diff --git a/default.properties b/default.properties
index a675853..48bdb43 100644
--- a/default.properties
+++ b/default.properties
@@ -134,7 +134,8 @@
org.apache.nutch.scoring.orphan*:\
org.apache.nutch.scoring.similarity*:\
org.apache.nutch.scoring.tld*:\
- org.apache.nutch.scoring.urlmeta*
+ org.apache.nutch.scoring.urlmeta*:\
+ org.apache.nutch.scoring.metadata*
#
# Parse Plugins
diff --git a/ivy/dependency-check-ant/lib/.gitignore b/ivy/dependency-check-ant/lib/.gitignore
new file mode 100644
index 0000000..e2dec72
--- /dev/null
+++ b/ivy/dependency-check-ant/lib/.gitignore
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/src/java/org/apache/nutch/crawl/CrawlDatum.java b/src/java/org/apache/nutch/crawl/CrawlDatum.java
index 5159bdb..b18eda3 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDatum.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -524,7 +524,8 @@
int res = 0;
if (signature != null) {
for (int i = 0; i < signature.length / 4; i += 4) {
- res ^= (signature[i] << 24 + signature[i + 1] << 16 + signature[i + 2] << 8 + signature[i + 3]);
+ res ^= ((signature[i] << 24) + (signature[i + 1] << 16)
+ + (signature[i + 2] << 8) + signature[i + 3]);
}
}
if (metaData != null) {
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 6d4c195..568bf8e 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -337,7 +337,7 @@
int averageBdwPerThread = 0;
if (activeThreads.get() > 0)
- averageBdwPerThread = Math.round(bpsSinceLastCheck
+ averageBdwPerThread = (int) (bpsSinceLastCheck
/ activeThreads.get());
LOG.info("averageBdwPerThread : {} kbps", (averageBdwPerThread / 1000));
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 6cd1772..40b7201 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -386,7 +386,7 @@
if (pstatus != null && pstatus.isSuccess()
&& pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
- int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
+ int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
Text redirUrl = handleRedirect(fit, newUrl,
refreshTime < Fetcher.PERM_REFRESH_TIME,
Fetcher.CONTENT_REDIR);
diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
index fcaa1d1..d47043c 100644
--- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java
+++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -271,7 +271,7 @@
if (pstatus != null && pstatus.isSuccess()
&& pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
- int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
+ int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
newUrl = filterNormalize(fromUrl, newUrl, origin,
ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers,
URLNormalizers.SCOPE_FETCHER);
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index 76dcef9..bcbacdd 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -487,7 +487,7 @@
LOG.info("SitemapProcessor: sitemap urls dir: {}", urlDir);
}
else if (args[i].equals("-threads")) {
- threads = Integer.valueOf(args[++i]);
+ threads = Integer.parseInt(args[++i]);
LOG.info("SitemapProcessor: threads: {}", threads);
}
else if (args[i].equals("-noStrict")) {
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index dd2a507..95d7a16 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -84,6 +84,7 @@
<ant dir="scoring-opic" target="deploy"/>
<ant dir="scoring-orphan" target="deploy"/>
<ant dir="scoring-similarity" target="deploy"/>
+ <ant dir="scoring-metadata" target="deploy"/>
<ant dir="subcollection" target="deploy"/>
<ant dir="tld" target="deploy"/>
<ant dir="urlfilter-automaton" target="deploy"/>
@@ -142,6 +143,7 @@
<ant dir="protocol-httpclient" target="test"/>
<ant dir="protocol-okhttp" target="test"/>
<ant dir="scoring-orphan" target="test"/>
+ <ant dir="scoring-metadata" target="test"/>
<ant dir="subcollection" target="test"/>
<ant dir="urlfilter-automaton" target="test"/>
<ant dir="urlfilter-domain" target="test"/>
@@ -230,6 +232,7 @@
<ant dir="scoring-opic" target="clean"/>
<ant dir="scoring-orphan" target="clean"/>
<ant dir="scoring-similarity" target="clean"/>
+ <ant dir="scoring-metadata" target="clean"/>
<ant dir="subcollection" target="clean"/>
<ant dir="tld" target="clean"/>
<ant dir="urlfilter-automaton" target="clean"/>
diff --git a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
index c98a843..9bad065 100644
--- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
+++ b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
@@ -70,15 +70,15 @@
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
fs.open(new Path("naivebayes-model"))));
- uniquewords_size = Integer.valueOf(bufferedReader.readLine());
+ uniquewords_size = Integer.parseInt(bufferedReader.readLine());
bufferedReader.readLine();
- numof_ir = Integer.valueOf(bufferedReader.readLine());
- numwords_ir = Integer.valueOf(bufferedReader.readLine());
+ numof_ir = Integer.parseInt(bufferedReader.readLine());
+ numwords_ir = Integer.parseInt(bufferedReader.readLine());
wordfreq_ir = unflattenToHashmap(bufferedReader.readLine());
bufferedReader.readLine();
- numof_r = Integer.valueOf(bufferedReader.readLine());
- numwords_r = Integer.valueOf(bufferedReader.readLine());
+ numof_r = Integer.parseInt(bufferedReader.readLine());
+ numwords_r = Integer.parseInt(bufferedReader.readLine());
wordfreq_r = unflattenToHashmap(bufferedReader.readLine());
ismodel = true;
diff --git a/src/plugin/scoring-metadata/build.xml b/src/plugin/scoring-metadata/build.xml
new file mode 100644
index 0000000..4f62ed1
--- /dev/null
+++ b/src/plugin/scoring-metadata/build.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-metadata" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+</project>
diff --git a/src/plugin/scoring-metadata/ivy.xml b/src/plugin/scoring-metadata/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/src/plugin/scoring-metadata/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
diff --git a/src/plugin/scoring-metadata/plugin.xml b/src/plugin/scoring-metadata/plugin.xml
new file mode 100644
index 0000000..ca47e37
--- /dev/null
+++ b/src/plugin/scoring-metadata/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="scoring-metadata"
+ name="Metadata Scoring Filter"
+ version="1.0.0"
+ provider-name="nutch">
+
+
+ <runtime>
+ <library name="scoring-metadata.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.scoring.metadata"
+ name="Metadata Scoring Filter"
+ point="org.apache.nutch.scoring.ScoringFilter">
+ <implementation id="scoring-metadata"
+ class="org.apache.nutch.scoring.metadata.MetadataScoringFilter" />
+ </extension>
+</plugin>
diff --git a/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java
new file mode 100644
index 0000000..e3ad56e
--- /dev/null
+++ b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.metadata;
+
+import java.util.Collection;
+import java.util.Map.Entry;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.AbstractScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+
+/**
+ * For documentation:
+ *
+ * {@link org.apache.nutch.scoring.metadata}
+ */
+public class MetadataScoringFilter extends AbstractScoringFilter {
+
+  public static final String METADATA_DATUM = "scoring.db.md";
+  public static final String METADATA_CONTENT = "scoring.content.md";
+  public static final String METADATA_PARSED = "scoring.parse.md";
+  // Key lists are instance state: filters created from different
+  // Configurations must not overwrite each other's settings.
+  private String[] datumMetadata;
+  private String[] contentMetadata;
+  private String[] parseMetadata;
+
+  /**
+   * Looks up every key listed in the "scoring.parse.md" property in the
+   * parseData object and, if a value exists, puts it into the metadata of
+   * each outlink datum in <code>targets</code>. Does nothing if no keys have
+   * been loaded or if <code>targets</code> or <code>parseData</code> is null.
+   *
+   * @return the <code>adjust</code> datum that was passed in
+   * @see ScoringFilter#distributeScoreToOutlinks
+   */
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    if (parseMetadata == null || targets == null || parseData == null) {
+      return adjust;
+    }
+
+    for (Entry<Text, CrawlDatum> target : targets) {
+      for (String meta : parseMetadata) {
+        String metaFromParse = parseData.getMeta(meta);
+        if (metaFromParse == null) {
+          continue;
+        }
+        target.getValue().getMetaData()
+            .put(new Text(meta), new Text(metaFromParse));
+      }
+    }
+    return adjust;
+  }
+
+  /**
+   * Copies every key listed in the "scoring.db.md" property that has a value
+   * in the metadata of the datum object to the metadata of the content.
+   * There passScoreAfterParsing can pick it up for the parse data.
+   *
+   * @see ScoringFilter#passScoreBeforeParsing
+   * @see MetadataScoringFilter#passScoreAfterParsing
+   */
+  @Override
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
+    if (datumMetadata == null || content == null || datum == null) {
+      return;
+    }
+
+    for (String meta : datumMetadata) {
+      Text metaFromDatum = (Text) datum.getMetaData().get(new Text(meta));
+      if (metaFromDatum != null) {
+        content.getMetadata().set(meta, metaFromDatum.toString());
+      }
+    }
+  }
+
+  /**
+   * Copies every key listed in the "scoring.content.md" property that has a
+   * value in the metadata of the content to the parse metadata of the parse.
+   *
+   * @see MetadataScoringFilter#passScoreBeforeParsing
+   * @see ScoringFilter#passScoreAfterParsing
+   */
+  @Override
+  public void passScoreAfterParsing(Text url, Content content, Parse parse) {
+    if (contentMetadata == null || content == null || parse == null) {
+      return;
+    }
+
+    for (String meta : contentMetadata) {
+      String metaFromContent = content.getMetadata().get(meta);
+      if (metaFromContent != null) {
+        parse.getData().getParseMeta().set(meta, metaFromContent);
+      }
+    }
+  }
+
+  /**
+   * Handles conf assignment and reads the key lists from the
+   * "scoring.db.md", "scoring.content.md" and "scoring.parse.md" properties.
+   * A <code>null</code> configuration leaves the key lists untouched.
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null) {
+      return;
+    }
+
+    datumMetadata = conf.getStrings(METADATA_DATUM);
+    contentMetadata = conf.getStrings(METADATA_CONTENT);
+    parseMetadata = conf.getStrings(METADATA_PARSED);
+  }
+}
diff --git a/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/package.html b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/package.html
new file mode 100644
index 0000000..0356152
--- /dev/null
+++ b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/package.html
@@ -0,0 +1,33 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+ <body>
+ <p>
+ Metadata Scoring Plugin
+ </p>
+ <p>
+ Propagates Metadata from an injected or outlink url in the crawldb to the url's different processed objects. In moving any metadata item, you need to copy metadata in three steps:
+ <ul>
+ <li>Crawldb to content: Copy a metadata entry stored in the crawldb record of the url to the url's fetched content object. You need to specify the entry in the <b>scoring.db.md</b> property</li>
+ <li>Content to parsedData: Copy a metadata entry stored in the Content object of a crawled url to its parsedData. You need to specify the entry in the <b>scoring.content.md</b> property</li>
+ <li>ParsedData to outlink objects: Copy a metadata entry stored in the parsedData of a crawl item to the crawldb records of the url's outlinks. You need to specify the entry in the <b>scoring.parse.md</b> property</li>
+ </ul>
+
+ Note that you can not move data directly from a crawldb record to parseData or outlink objects. The sequence of moving the metadata should be crawldb -> content -> parsedData -> outlink objects.
+ </p>
+ </body>
+</html>
diff --git a/src/plugin/scoring-metadata/src/test/org/apache/nutch/scoring/metadata/TestMetadataScoringFilter.java b/src/plugin/scoring-metadata/src/test/org/apache/nutch/scoring/metadata/TestMetadataScoringFilter.java
new file mode 100644
index 0000000..8683cec
--- /dev/null
+++ b/src/plugin/scoring-metadata/src/test/org/apache/nutch/scoring/metadata/TestMetadataScoringFilter.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.metadata;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class TestMetadataScoringFilter {
+
+  private static final String PARENT = "parent";
+  private static final String DEPTH = "depth";
+  private static final String PARENT_MD = "https://nutch.apache.org/";
+  private static final String DEPTH_MD = "1";
+
+  /** Creates a filter with each given property set to "parent,depth". */
+  private static MetadataScoringFilter createFilter(String... properties) {
+    Configuration conf = NutchConfiguration.create();
+    for (String property : properties) {
+      conf.set(property, PARENT + "," + DEPTH);
+    }
+    MetadataScoringFilter filter = new MetadataScoringFilter();
+    filter.setConf(conf);
+    return filter;
+  }
+
+  /** Creates a datum whose metadata holds a value for both test keys. */
+  private static CrawlDatum createDatumWithMetadata() {
+    CrawlDatum datum = new CrawlDatum();
+    datum.getMetaData().put(new Text(PARENT), new Text(PARENT_MD));
+    datum.getMetaData().put(new Text(DEPTH), new Text(DEPTH_MD));
+    return datum;
+  }
+
+  /**
+   * Keys listed in "scoring.parse.md" must be copied from the parse data to
+   * the metadata of every outlink datum.
+   */
+  @Test
+  public void distributeScoreToOutlinks() throws ScoringFilterException {
+    MetadataScoringFilter filter =
+        createFilter(MetadataScoringFilter.METADATA_PARSED);
+
+    Text from = new Text("https://nutch.apache.org/");
+    ParseData parseData = new ParseData();
+    parseData.getParseMeta().add(PARENT, PARENT_MD);
+    parseData.getParseMeta().add(DEPTH, DEPTH_MD);
+
+    HashMap<Text, CrawlDatum> targets = new HashMap<>();
+    targets.put(new Text("https://nutch.apache.org/downloads.html"), new CrawlDatum());
+    targets.put(new Text("https://wiki.apache.org/nutch"), new CrawlDatum());
+
+    filter.distributeScoreToOutlinks(from, parseData, targets.entrySet(),
+        new CrawlDatum(), 2);
+
+    for (CrawlDatum outlink : targets.values()) {
+      Text parent = (Text) outlink.getMetaData().get(new Text(PARENT));
+      Text depth = (Text) outlink.getMetaData().get(new Text(DEPTH));
+      Assert.assertEquals(PARENT_MD, parent.toString());
+      Assert.assertEquals(DEPTH_MD, depth.toString());
+    }
+  }
+
+  /**
+   * Keys listed in "scoring.db.md" must be copied from the datum metadata to
+   * the content metadata.
+   */
+  @Test
+  public void passScoreBeforeParsing() {
+    MetadataScoringFilter filter =
+        createFilter(MetadataScoringFilter.METADATA_DATUM);
+    Text from = new Text("https://nutch.apache.org/");
+    Content content = new Content();
+
+    filter.passScoreBeforeParsing(from, createDatumWithMetadata(), content);
+
+    Assert.assertEquals(PARENT_MD, content.getMetadata().get(PARENT));
+    Assert.assertEquals(DEPTH_MD, content.getMetadata().get(DEPTH));
+  }
+
+  /**
+   * Keys listed in "scoring.content.md" must be copied from the content
+   * metadata (filled here by passScoreBeforeParsing) to the parse data.
+   */
+  @Test
+  public void passScoreAfterParsing() {
+    MetadataScoringFilter filter = createFilter(
+        MetadataScoringFilter.METADATA_DATUM,
+        MetadataScoringFilter.METADATA_CONTENT);
+    Text from = new Text("https://nutch.apache.org/");
+    Content content = new Content();
+    filter.passScoreBeforeParsing(from, createDatumWithMetadata(), content);
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, null,
+        null, content.getMetadata());
+    Parse parse = new ParseImpl(from.toString(), parseData);
+    filter.passScoreAfterParsing(from, content, parse);
+
+    Assert.assertEquals(PARENT_MD, parse.getData().getMeta(PARENT));
+    Assert.assertEquals(DEPTH_MD, parse.getData().getMeta(DEPTH));
+  }
+}
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 5479882..48c4a66 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -150,9 +150,6 @@
private boolean hostASCIItoIDN;
private boolean hostTrimTrailingDot;
- public void BasicUrlNormalizer() {
- }
-
@Override
public Configuration getConf() {
return conf;