Add 'webindex/' from commit '91dc7cb6fc72c79a53c6b7d0a6c0599cd8eacb9b'
git-subtree-dir: webindex
git-subtree-mainline: f762da6d8f93dec655741632dd534d1287d1a6ec
git-subtree-split: 91dc7cb6fc72c79a53c6b7d0a6c0599cd8eacb9b
diff --git a/webindex/.gitignore b/webindex/.gitignore
new file mode 100644
index 0000000..54549dd
--- /dev/null
+++ b/webindex/.gitignore
@@ -0,0 +1,9 @@
+*.class
+.idea/
+*.iml
+.classpath
+.project
+.settings
+target/
+/logs/
+/data/
diff --git a/webindex/.travis.yml b/webindex/.travis.yml
new file mode 100644
index 0000000..fb79ab6
--- /dev/null
+++ b/webindex/.travis.yml
@@ -0,0 +1,17 @@
+# Copyright 2015 Webindex authors (see AUTHORS)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+language: java
+jdk:
+ - openjdk8
+script: mvn -U clean verify
diff --git a/webindex/AUTHORS b/webindex/AUTHORS
new file mode 100644
index 0000000..3b81590
--- /dev/null
+++ b/webindex/AUTHORS
@@ -0,0 +1,5 @@
+AUTHORS
+-------
+
+Mike Walch - Peterson Technologies
+Keith Turner - Peterson Technologies
diff --git a/webindex/LICENSE b/webindex/LICENSE
new file mode 100644
index 0000000..8f71f43
--- /dev/null
+++ b/webindex/LICENSE
@@ -0,0 +1,202 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright {yyyy} {name of copyright owner}
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
diff --git a/webindex/README.md b/webindex/README.md
new file mode 100644
index 0000000..f869d20
--- /dev/null
+++ b/webindex/README.md
@@ -0,0 +1,76 @@
+![Webindex][logo]
+---
+[![Build Status][ti]][tl] [![Apache License][li]][ll]
+
+Webindex is an example [Apache Fluo][fluo] application that incrementally indexes links to web pages
+in multiple ways. If you are new to Fluo, you may want to start with the [Fluo tour][tour] as the
+WebIndex application is more complicated. For more information on how the WebIndex application
+works, view the [tables](docs/tables.md) and [code](docs/code-guide.md) documentation.
+
+Webindex utilizes multiple projects. [Common Crawl][cc] web crawl data is used as the input.
+[Apache Spark][spark] is used to initialize Fluo and incrementally load data into Fluo. [Apache
+Accumulo][accumulo] is used to hold the indexes and Fluo's data. Fluo is used to continuously
+combine new and historical information about web pages and update an external index when changes
+occur. Webindex has a simple UI built using [Spark Java][sparkjava] that allows querying the indexes.
+
+Below is a video of stackoverflow.com being queried repeatedly while Webindex was running for three
+days on EC2. The video was made by querying the Webindex instance periodically and taking a
+screenshot. More details about this video are available in this [blog post][bp].
+
+[![Querying stackoverflow.com](http://img.youtube.com/vi/mJJNJbPN2EI/0.jpg)](http://www.youtube.com/watch?v=mJJNJbPN2EI)
+
+## Running WebIndex
+
+If you are new to WebIndex, the simplest way to run the application is to run the development
+server. First, clone the WebIndex repo:
+
+ git clone https://github.com/astralway/webindex.git
+
+Next, on a machine where Java and Maven are installed, run the development server using the
+`webindex` command:
+
+ cd webindex/
+ ./bin/webindex dev
+
+This will build and start the development server which will log to the console. This 'dev' command
+has several command line options which can be viewed by running with `-h`. When you want to
+terminate the server, press `CTRL-c`.
+
+The development server starts a MiniAccumuloCluster and runs MiniFluo on top of it. It parses a
+CommonCrawl data file and creates a file at `data/1000-pages.txt` with 1000 pages that are loaded
+into MiniFluo. The number of pages loaded can be changed to 5000 by using the command below:
+
+ ./bin/webindex dev --pages 5000
+
+The pages are processed by Fluo which exports indexes to Accumulo. The development server also
+starts a web application at [http://localhost:4567](http://localhost:4567) that queries indexes in
+Accumulo.
+
+If you would like to run WebIndex on a cluster, follow the [install] instructions.
+
+### Viewing metrics
+
+Metrics can be sent from the development server to InfluxDB and viewed in Grafana. You can either
+set up InfluxDB+Grafana on your own or use the [Uno] command `uno setup metrics`. After a metrics server
+is started, start the development server with the option `--metrics` to start sending metrics:
+
+ ./bin/webindex dev --metrics
+
+Fluo metrics can be viewed in Grafana. To view application-specific metrics for Webindex, import
+the WebIndex Grafana dashboard located at `contrib/webindex-dashboard.json`.
+
+[tour]: https://fluo.apache.org/tour/
+[sparkjava]: http://sparkjava.com/
+[spark]: https://spark.apache.org/
+[accumulo]: https://accumulo.apache.org/
+[fluo]: https://fluo.apache.org/
+[pc]: https://github.com/astralway/phrasecount
+[Uno]: https://github.com/astralway/uno
+[cc]: https://commoncrawl.org/
+[install]: docs/install.md
+[ti]: https://travis-ci.org/astralway/webindex.svg?branch=master
+[tl]: https://travis-ci.org/astralway/webindex
+[li]: http://img.shields.io/badge/license-ASL-blue.svg
+[ll]: https://github.com/astralway/webindex/blob/master/LICENSE
+[logo]: contrib/webindex.png
+[bp]: https://fluo.apache.org/blog/2016/01/11/webindex-long-run/#videos-from-run
diff --git a/webindex/bin/impl/base.sh b/webindex/bin/impl/base.sh
new file mode 100755
index 0000000..af089f5
--- /dev/null
+++ b/webindex/bin/impl/base.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# Copyright 2015 Webindex authors (see AUTHORS)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+: "${WI_HOME?"WI_HOME must be set"}"  # abort with a message when a required variable is unset
+: "${WI_CONFIG?"WI_CONFIG must be set"}"
+: "${SPARK_HOME?"SPARK_HOME must be set"}"
+
+function get_prop {  # print the value of key $1 from $WI_CONFIG (lines of the form "key: value")
+  echo "$(grep "^$1:" "$WI_CONFIG" | cut -d ' ' -f 2)"  # anchored: skip comments that mention the key
+}
+
+: "${HADOOP_CONF_DIR?"HADOOP_CONF_DIR must be set in bash env or conf/webindex-env.sh"}"
+if [ ! -d "$HADOOP_CONF_DIR" ]; then  # quoted so an empty value is rejected instead of slipping through
+  echo "HADOOP_CONF_DIR=$HADOOP_CONF_DIR does not exist"
+  exit 1
+fi
+: "${FLUO_HOME?"FLUO_HOME must be set in bash env or conf/webindex-env.sh"}"
+if [ ! -d "$FLUO_HOME" ]; then  # same reasoning as above
+  echo "FLUO_HOME=$FLUO_HOME does not exist"
+  exit 1
+fi
+
+: "${WI_EXECUTOR_INSTANCES?"WI_EXECUTOR_INSTANCES must be set in bash env or conf/webindex-env.sh"}"
+: "${WI_EXECUTOR_MEMORY?"WI_EXECUTOR_MEMORY must be set in bash env or conf/webindex-env.sh"}"
+export COMMON_SPARK_OPTS="--master yarn-client --num-executors $WI_EXECUTOR_INSTANCES --executor-memory $WI_EXECUTOR_MEMORY"
+# spark-submit must exist under SPARK_HOME; the path is quoted so that spaces cannot break the test
+export SPARK_SUBMIT=$SPARK_HOME/bin/spark-submit
+if [ ! -f "$SPARK_SUBMIT" ]; then
+  echo "The spark-submit command cannot be found in SPARK_HOME=$SPARK_HOME. Please set SPARK_HOME in conf/webindex-env.sh"
+  exit 1
+fi
+# Maven is needed to build the jars
+hash mvn 2>/dev/null || { echo >&2 "Maven must be installed & mvn command must be on path. Aborting."; exit 1; }
+
+# Stop if any command after this fails
+set -e
+: "${WI_VERSION?"WI_VERSION must be set"}"  # the jar names below are derived from it
+export WI_DATA_JAR=$WI_HOME/modules/data/target/webindex-data-$WI_VERSION.jar
+export WI_DATA_DEP_JAR=$WI_HOME/modules/data/target/webindex-data-$WI_VERSION-shaded.jar
+if [ ! -f "$WI_DATA_DEP_JAR" ]; then
+  echo "Building $WI_DATA_DEP_JAR"
+  cd "$WI_HOME"
+  # The shaded jar is built against the Fluo, Accumulo and Thrift versions supplied by the environment
+  : "${ACCUMULO_VERSION?"ACCUMULO_VERSION must be set in bash env or conf/webindex-env.sh"}"
+  : "${FLUO_VERSION?"FLUO_VERSION must be set in bash env or conf/webindex-env.sh"}"
+  : "${THRIFT_VERSION?"THRIFT_VERSION must be set in bash env or conf/webindex-env.sh"}"
+  mvn clean package -Pcreate-shade-jar -DskipTests -Dfluo.version="$FLUO_VERSION" -Daccumulo.version="$ACCUMULO_VERSION" -Dthrift.version="$THRIFT_VERSION"
+fi
diff --git a/webindex/bin/impl/init.sh b/webindex/bin/impl/init.sh
new file mode 100755
index 0000000..52fd3cd
--- /dev/null
+++ b/webindex/bin/impl/init.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# Copyright 2015 Webindex authors (see AUTHORS)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+: "${WI_HOME?"WI_HOME must be set"}"
+
+. "$WI_HOME/bin/impl/base.sh"
+
+if [[ "$OSTYPE" == "darwin"* ]]; then
+ export SED="sed -i .bak"
+else
+ export SED="sed -i"
+fi
+
+# stop if any command fails
+set -e
+
+: "${SPARK_SUBMIT?"SPARK_SUBMIT must be set"}"
+: "${WI_DATA_JAR?"WI_DATA_JAR must be set"}"
+: "${WI_DATA_DEP_JAR?"WI_DATA_DEP_JAR must be set"}"
+
+fluo_app=$(get_prop fluoApp)
+fluo_cmd=$FLUO_HOME/bin/fluo
+if [ ! -f "$fluo_cmd" ]; then
+ echo "Fluo command script does not exist at $fluo_cmd"
+ exit 1
+fi
+
+app_lib=$WI_HOME/target/lib
+mkdir -p "$app_lib"
+cp "$WI_DATA_JAR" "$app_lib"
+mvn -f "$WI_HOME/pom.xml" package -Pcopy-dependencies -DskipTests -DoutputDirectory="$app_lib"  # -f: do not depend on the caller's working directory
+# Add webindex core and its dependencies
+cp "$WI_HOME/modules/core/target/webindex-core-$WI_VERSION.jar" "$app_lib"
+
+app_props=$WI_HOME/target/fluo-app.properties
+cp "$FLUO_HOME/conf/fluo-app.properties" "$app_props"
+$SED "s#^.*fluo.observer.init.dir=[^ ]*#fluo.observer.init.dir=${app_lib}#" "$app_props"
+
+java -cp "$app_lib/*:$("$fluo_cmd" classpath)" webindex.data.Configure "$WI_CONFIG" "$app_props"
+
+"$fluo_cmd" init -a "$fluo_app" -p "$app_props" --force
+
+"$SPARK_SUBMIT" --class webindex.data.Init $COMMON_SPARK_OPTS \
+ --conf spark.shuffle.service.enabled=true \
+ --conf spark.executor.extraJavaOptions=-XX:+UseCompressedOops \
+ $WI_DATA_DEP_JAR $1
+
+echo "Webindex init has completed successfully."
diff --git a/webindex/bin/webindex b/webindex/bin/webindex
new file mode 100755
index 0000000..35c2f07
--- /dev/null
+++ b/webindex/bin/webindex
@@ -0,0 +1,201 @@
+#! /usr/bin/env bash
+
+# Copyright 2015 Webindex authors (see AUTHORS)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+export WI_HOME=$( cd "$( dirname "$BIN_DIR" )" && pwd )
+export WI_VERSION=0.0.1-SNAPSHOT
+# conf/webindex-env.sh must exist and is sourced below; the path is quoted so spaces cannot break the test
+if [ ! -f "$WI_HOME/conf/webindex-env.sh" ]; then
+  echo "webindex-env.sh must exist in $WI_HOME/conf"
+  exit 1
+fi
+. "$WI_HOME/conf/webindex-env.sh"
+
+mkdir -p "$WI_HOME/logs"
+
+export WI_CONFIG=$WI_HOME/conf/webindex.yml
+if [ ! -f "$WI_CONFIG" ]; then
+ echo "webindex.yml must exist in $WI_HOME/conf"
+ exit 1
+fi
+
+log4j_config=$WI_HOME/conf/log4j.properties
+if [ ! -f "$log4j_config" ]; then
+ echo "logj4.properties must exist in $WI_HOME/conf"
+ exit 1
+fi
+
+conn_props=$FLUO_HOME/conf/fluo-conn.properties
+if [ ! -f "$conn_props" ]; then
+ echo "fluo-conn.properties must exist in $FLUO_HOME/conf"
+ exit 1
+fi
+
+function get_prop {  # print the value of key $1 from $WI_CONFIG (lines of the form "key: value")
+  echo "$(grep "^$1:" "$WI_CONFIG" | cut -d ' ' -f 2)"  # anchored: skip comments that mention the key
+}
+
+COMMAND_LOGFILE=$WI_HOME/logs/${1}_$(date +%s).log  # one log per invocation: <command>_<epoch seconds>.log
+DATA_DIR=$WI_HOME/data
+mkdir -p "$DATA_DIR"
+
+case "$1" in
+dev)  # Kill any running dev server, then start a new one in the foreground with the remaining arguments
+  pkill -9 -f webindex-dev-server
+  cd "$WI_HOME"
+  dev_args="${*:2}"
+  mvn -q compile -P webindex-dev-server -Dlog4j.configuration="file:$log4j_config" -Dexec.args="$dev_args"
+  ;;
+getpaths)
+  [ -n "$2" ] || { echo "Usage: webindex getpaths <DATE>"; exit 1; }  # DATA_DIR was already created above
+  PATHS_FILE="$2".wat.paths
+  if [ ! -f "$DATA_DIR/$PATHS_FILE" ]; then
+    rm -f "$DATA_DIR/wat.paths.gz"
+    PATHS_URL=https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-$2/wat.paths.gz
+    if wget -S --spider "$PATHS_URL" 2>&1 | grep 'HTTP/1.1 200 OK' > /dev/null; then  # probe before downloading
+      wget -P "$DATA_DIR" "$PATHS_URL"
+      gzip -d "$DATA_DIR/wat.paths.gz"
+      mv "$DATA_DIR/wat.paths" "$DATA_DIR/$PATHS_FILE"
+      echo "Downloaded paths file to $DATA_DIR/$PATHS_FILE"
+    else
+      echo "Crawl paths file for date $2 does not exist at $PATHS_URL"
+      exit 1
+    fi
+  else
+    echo "Crawl paths file already exists at $DATA_DIR/$PATHS_FILE"
+  fi
+  ;;
+copy)  # Spark Copy job; COMMAND is an array so paths and arguments keep their quoting when executed
+  if [ "$#" -lt 4 ] || [ "$#" -gt 5 ]; then
+    echo "Usage: webindex copy <DATE> <RANGE> <DEST> [-fg]"
+    exit 1
+  fi
+  . "$BIN_DIR/impl/base.sh"
+  COMMAND=("$SPARK_SUBMIT" --class webindex.data.Copy $COMMON_SPARK_OPTS
+    "$WI_DATA_DEP_JAR" "$DATA_DIR/$2.wat.paths" "$3" "$4")
+  if [ "$5" != "-fg" ]; then
+    nohup "${COMMAND[@]}" &> "$COMMAND_LOGFILE" &
+    echo "Started copy. Logs are being output to $COMMAND_LOGFILE"
+  else
+    "${COMMAND[@]}"
+  fi
+  ;;
+init)  # Run impl/init.sh with an optional <SRC>; -fg is accepted as the 2nd or 3rd argument
+  if [ "$#" -gt 3 ]; then  # "$#" is always >= 1 here because $1 selected this branch
+    echo "Usage: webindex init [<SRC>] [-fg]"
+    exit 1
+  fi
+  . "$BIN_DIR/impl/base.sh"
+  COMMAND=("$BIN_DIR/impl/init.sh" ${2:+"$2"})
+  if [ "$2" == "-fg" ]; then
+    COMMAND=("$BIN_DIR/impl/init.sh")
+  fi
+  if [ "$2" != "-fg" ] && [ "$3" != "-fg" ]; then
+    nohup "${COMMAND[@]}" &> "$COMMAND_LOGFILE" &
+    echo "Started init. Logs are being output to $COMMAND_LOGFILE"
+  else
+    "${COMMAND[@]}"
+  fi
+  ;;
+load-hdfs)  # Spark LoadHdfs job; COMMAND is an array so paths and arguments keep their quoting
+  if [ "$#" -lt 2 ] || [ "$#" -gt 3 ]; then
+    echo "Usage: webindex load-hdfs <SRC> [-fg]"
+    exit 1
+  fi
+  . "$BIN_DIR/impl/base.sh"
+  COMMAND=("$SPARK_SUBMIT" --class webindex.data.LoadHdfs $COMMON_SPARK_OPTS
+    --files "$conn_props" "$WI_DATA_DEP_JAR" "$2")
+  if [ "$3" != "-fg" ]; then
+    nohup "${COMMAND[@]}" &> "$COMMAND_LOGFILE" &
+    echo "Started load-hdfs. Logs are being output to $COMMAND_LOGFILE"
+  else
+    "${COMMAND[@]}"
+  fi
+  ;;
+load-s3)  # Spark LoadS3 job; COMMAND is an array so paths and arguments keep their quoting
+  if [ "$#" -lt 3 ] || [ "$#" -gt 4 ]; then
+    echo "Usage: webindex load-s3 <DATE> <RANGE> [-fg]"
+    exit 1
+  fi
+  . "$BIN_DIR/impl/base.sh"
+  COMMAND=("$SPARK_SUBMIT" --class webindex.data.LoadS3 $COMMON_SPARK_OPTS
+    --files "$conn_props" "$WI_DATA_DEP_JAR" "$DATA_DIR/$2.wat.paths" "$3")
+  if [ "$4" != "-fg" ]; then
+    nohup "${COMMAND[@]}" &> "$COMMAND_LOGFILE" &
+    echo "Started load-s3. Logs are being output to $COMMAND_LOGFILE"
+  else
+    "${COMMAND[@]}"
+  fi
+  ;;
+test-parser)  # Spark TestParser job; COMMAND is an array so paths and arguments keep their quoting
+  if [ "$#" -lt 3 ] || [ "$#" -gt 4 ]; then
+    echo "Usage: webindex test-parser <DATE> <RANGE> [-fg]"
+    exit 1
+  fi
+  . "$BIN_DIR/impl/base.sh"
+  COMMAND=("$SPARK_SUBMIT" --class webindex.data.TestParser $COMMON_SPARK_OPTS
+    "$WI_DATA_DEP_JAR" "$DATA_DIR/$2.wat.paths" "$3")
+  if [ "$4" != "-fg" ]; then
+    nohup "${COMMAND[@]}" &> "$COMMAND_LOGFILE" &
+    echo "Started test-parser. Logs are being output to $COMMAND_LOGFILE"
+  else
+    "${COMMAND[@]}"
+  fi
+  ;;
+ui)  # Kill any running web server, then start a new one; COMMAND is an array so the log4j path stays intact
+  pkill -9 -f webindex-web-server
+  cd "$WI_HOME"
+  COMMAND=(mvn -q compile -P webindex-web-server "-Dlog4j.configuration=file:$log4j_config")
+  if [ "$2" != "-fg" ]; then
+    nohup "${COMMAND[@]}" &> "$COMMAND_LOGFILE" &
+    echo "Started UI. Logs are being output to $COMMAND_LOGFILE"
+  else
+    "${COMMAND[@]}"
+  fi
+  ;;
+splits)  # Spark CalcSplits job on <SRC>; -fg is read from the 3rd argument because <SRC> occupies the 2nd
+  if [ "$#" -lt 2 ] || [ "$#" -gt 3 ]; then
+    echo "Usage: webindex splits <SRC> [-fg]"; exit 1
+  fi
+  . "$BIN_DIR/impl/base.sh"
+  COMMAND=("$SPARK_SUBMIT" --class webindex.data.CalcSplits $COMMON_SPARK_OPTS
+    --conf spark.shuffle.service.enabled=true "$WI_DATA_DEP_JAR" "$2")
+  if [ "$3" != "-fg" ]; then
+    nohup "${COMMAND[@]}" &> "$COMMAND_LOGFILE" &
+    echo "Started splits calculation. Logs are being output to $COMMAND_LOGFILE"
+  else
+    "${COMMAND[@]}"
+  fi
+  ;;
+*)  # Unknown or missing command: print the help text and exit non-zero
+ echo -e "Usage: webindex <command> (<argument>)\n"
+ echo -e "Possible commands:\n"
+ echo " dev Runs WebIndex development server"
+ echo " getpaths <DATE> Retrieves paths file for given crawl <DATE> (e.g. 2015-18) and stores file in the 'data/' directory"
+ echo " See https://commoncrawl.org/the-data/get-started/ for possible crawl dates"
+ echo " copy <DATE> <RANGE> <DEST> Copies CommonCrawl data files from S3 given a <DATE> and <RANGE> (e.g. 0-8) into HDFS <DEST> directory"
+ echo " init [<SRC>] Initializes and starts the WebIndex application. Optionally, a <SRC> HDFS directory can be added"
+ echo " to the command to initialize Fluo's table in Accumulo with data before starting the application"
+ echo " load-hdfs <SRC> Loads data from the HDFS <SRC> directory into Fluo"
+ echo " load-s3 <DATE> <RANGE> Loads data from S3 into Fluo. Data is selected using a paths file <DATE> and file <RANGE> (e.g. 5-7)"
+ echo " ui Starts the webindex UI"
+ echo " splits <SRC> Calculate splits using data in HDFS <SRC> directory"
+ echo " test-parser <DATE> <RANGE> Tests parser on data loaded from S3. Data is selected using a paths file <DATE> and file <RANGE> (e.g. 5-7)"
+ echo " "
+ echo "NOTE: All commands except dev and getpaths will run in background and output to a log by default. Add -fg to end of these commands"
+ echo "to run them in the foreground."
+ echo " "
+ exit 1
+esac
diff --git a/webindex/conf/.gitignore b/webindex/conf/.gitignore
new file mode 100644
index 0000000..40c8d7c
--- /dev/null
+++ b/webindex/conf/.gitignore
@@ -0,0 +1,3 @@
+webindex.yml
+webindex-env.sh
+log4j.properties
diff --git a/webindex/conf/examples/log4j.properties b/webindex/conf/examples/log4j.properties
new file mode 100644
index 0000000..694c884
--- /dev/null
+++ b/webindex/conf/examples/log4j.properties
@@ -0,0 +1,29 @@
+# Copyright 2016 Webindex authors (see AUTHORS)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=INFO, CA
+log4j.appender.CA=org.apache.log4j.ConsoleAppender
+log4j.appender.CA.layout=org.apache.log4j.PatternLayout
+log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c] %-5p: %m%n
+
+log4j.logger.org.apache.accumulo=WARN
+log4j.logger.org.apache.curator=ERROR
+log4j.logger.org.apache.fluo=WARN
+log4j.logger.org.apache.hadoop=WARN
+log4j.logger.org.apache.hadoop.mapreduce=ERROR
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
+log4j.logger.org.apache.zookeeper=ERROR
+log4j.logger.org.eclipse.jetty=WARN
+log4j.logger.org.spark-project=WARN
+log4j.logger.webindex=INFO
diff --git a/webindex/conf/examples/webindex-env.sh b/webindex/conf/examples/webindex-env.sh
new file mode 100644
index 0000000..2363583
--- /dev/null
+++ b/webindex/conf/examples/webindex-env.sh
@@ -0,0 +1,42 @@
+# Copyright 2015 Webindex authors (see AUTHORS)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Set environment variables if they are not already set. Please modify the
+# export statement to use the correct directory. Remove the test statement
+# to override any previously set environment.
+
+## Installation directories
+test -z "$HADOOP_PREFIX" && export HADOOP_PREFIX=/path/to/hadoop
+test -z "$HADOOP_CONF_DIR" && export HADOOP_CONF_DIR=/path/to/hadoop/etc/hadoop
+test -z "$FLUO_HOME" && export FLUO_HOME=/path/to/fluo
+test -z "$SPARK_HOME" && export SPARK_HOME=/path/to/spark
+
+## Accumulo and Fluo versions that should be included in the shaded jar created for Spark.
+export FLUO_VERSION=`$FLUO_HOME/bin/fluo version`
+export ACCUMULO_VERSION=`accumulo version`
+
+## Accumulo client will likely not work without correct thrift version
+if [[ $ACCUMULO_VERSION < "1.8" ]]; then
+ THRIFT_VERSION="0.9.1"
+elif [[ $ACCUMULO_VERSION < "2.0" ]]; then
+ THRIFT_VERSION="0.9.3"
+else
+ THRIFT_VERSION="0.10.0"
+fi
+
+## Spark
+# Number of Spark executor instances
+export WI_EXECUTOR_INSTANCES=2
+# Amount of memory given to each Spark executor
+export WI_EXECUTOR_MEMORY=512m
diff --git a/webindex/conf/examples/webindex.yml b/webindex/conf/examples/webindex.yml
new file mode 100644
index 0000000..9599207
--- /dev/null
+++ b/webindex/conf/examples/webindex.yml
@@ -0,0 +1,33 @@
+# Copyright 2015 Webindex authors (see AUTHORS)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Accumulo table where indexes are exported for search
+accumuloIndexTable: webindex_search
+# Fluo Application Name
+fluoApp: webindex
+# Webindex builds multiple data sets for its computation. Each of these data
+# sets needs to be spread across the cluster. The setting below determines how
+# much each dataset will be split up. Ideally this would be a small multiple of
+# the number of Accumulo tablet servers.
+numTablets: 20
+# Number of buckets for collision free maps and export queue. This setting is
+# used during initialization of the Fluo table and when Spark loads the initial
+# data. The value of numBuckets must be the same for these two tasks.
+numBuckets: 100
+# This determines how fast each Spark load task will load documents. Set to 0
+# for no limit. Setting this to 50 and running 10 concurrent load tasks would
+# limit the load rate to 500 documents/sec.
+loadRateLimit: 0
+# HDFS temporary directory
+hdfsTempDir: /cc/temp
diff --git a/webindex/contrib/webindex-dashboard.json b/webindex/contrib/webindex-dashboard.json
new file mode 100644
index 0000000..0a55505
--- /dev/null
+++ b/webindex/contrib/webindex-dashboard.json
@@ -0,0 +1,645 @@
+{
+ "id": null,
+ "title": "Webindex",
+ "originalTitle": "Webindex",
+ "tags": [],
+ "style": "dark",
+ "timezone": "browser",
+ "editable": true,
+ "hideControls": false,
+ "sharedCrosshair": false,
+ "rows": [
+ {
+ "collapse": false,
+ "editable": true,
+ "height": "250px",
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "datasource": null,
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {
+ "leftLogBase": 1,
+ "leftMax": null,
+ "leftMin": null,
+ "rightLogBase": 1,
+ "rightMax": null,
+ "rightMin": null,
+ "threshold1": null,
+ "threshold1Color": "rgba(216, 200, 27, 0.27)",
+ "threshold2": null,
+ "threshold2Color": "rgba(234, 112, 112, 0.22)"
+ },
+ "id": 3,
+ "interval": "30s",
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "alias": "ingested / sec",
+ "fields": [
+ {
+ "func": "sum",
+ "name": "value"
+ }
+ ],
+ "groupBy": [
+ {
+ "interval": "auto",
+ "type": "time"
+ }
+ ],
+ "measurement": "webindex_pages_ingested",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_pages_ingested\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "rawQuery": false,
+ "refId": "A",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ]
+ },
+ {
+ "alias": "changed / sec",
+ "fields": [
+ {
+ "func": "sum",
+ "name": "value"
+ }
+ ],
+ "groupBy": [
+ {
+ "interval": "auto",
+ "type": "time"
+ }
+ ],
+ "measurement": "webindex_pages_changed",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_pages_changed\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "refId": "B",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ]
+ },
+ {
+ "refId": "C",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ],
+ "groupBy": [
+ {
+ "type": "time",
+ "interval": "auto"
+ }
+ ],
+ "fields": [
+ {
+ "name": "value",
+ "func": "sum"
+ }
+ ],
+ "measurement": "webindex_pages_exported",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_pages_exported\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "alias": "exported / sec"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Pages",
+ "tooltip": {
+ "shared": true,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "x-axis": true,
+ "y-axis": true,
+ "y_formats": [
+ "short",
+ "short"
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "datasource": null,
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {
+ "leftLogBase": 1,
+ "leftMax": null,
+ "leftMin": null,
+ "rightLogBase": 1,
+ "rightMax": null,
+ "rightMin": null,
+ "threshold1": null,
+ "threshold1Color": "rgba(216, 200, 27, 0.27)",
+ "threshold2": null,
+ "threshold2Color": "rgba(234, 112, 112, 0.22)"
+ },
+ "id": 8,
+ "interval": "30s",
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "alias": "new / sec",
+ "fields": [
+ {
+ "func": "sum",
+ "name": "value"
+ }
+ ],
+ "groupBy": [
+ {
+ "interval": "auto",
+ "type": "time"
+ }
+ ],
+ "measurement": "webindex_domains_new",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_domains_new\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "refId": "A",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ]
+ },
+ {
+ "alias": "changed / sec",
+ "fields": [
+ {
+ "func": "sum",
+ "name": "value"
+ }
+ ],
+ "groupBy": [
+ {
+ "interval": "auto",
+ "type": "time"
+ }
+ ],
+ "measurement": "webindex_domains_changed",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_domains_changed\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "refId": "B",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ]
+ },
+ {
+ "refId": "C",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ],
+ "groupBy": [
+ {
+ "type": "time",
+ "interval": "auto"
+ }
+ ],
+ "fields": [
+ {
+ "name": "value",
+ "func": "sum"
+ }
+ ],
+ "measurement": "webindex_domains_exported",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_domains_exported\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "alias": "exported / sec"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Domains",
+ "tooltip": {
+ "shared": true,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "x-axis": true,
+ "y-axis": true,
+ "y_formats": [
+ "short",
+ "short"
+ ]
+ }
+ ],
+ "title": "Row"
+ },
+ {
+ "collapse": false,
+ "editable": true,
+ "height": "250px",
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "datasource": null,
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {
+ "leftLogBase": 1,
+ "leftMax": null,
+ "leftMin": null,
+ "rightLogBase": 1,
+ "rightMax": null,
+ "rightMin": null,
+ "threshold1": null,
+ "threshold1Color": "rgba(216, 200, 27, 0.27)",
+ "threshold2": null,
+ "threshold2Color": "rgba(234, 112, 112, 0.22)"
+ },
+ "id": 6,
+ "interval": "30s",
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "alias": "ingested / sec",
+ "fields": [
+ {
+ "func": "sum",
+ "name": "value"
+ }
+ ],
+ "groupBy": [
+ {
+ "interval": "auto",
+ "type": "time"
+ }
+ ],
+ "measurement": "webindex_links_ingested",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_links_ingested\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "refId": "A",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ]
+ },
+ {
+ "alias": "new / sec",
+ "fields": [
+ {
+ "func": "sum",
+ "name": "value"
+ }
+ ],
+ "groupBy": [
+ {
+ "interval": "auto",
+ "type": "time"
+ }
+ ],
+ "measurement": "webindex_links_new",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_links_new\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "refId": "B",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ]
+ },
+ {
+ "alias": "changed / sec",
+ "fields": [
+ {
+ "func": "sum",
+ "name": "value"
+ }
+ ],
+ "groupBy": [
+ {
+ "interval": "auto",
+ "type": "time"
+ }
+ ],
+ "measurement": "webindex_links_changed",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_links_changed\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "refId": "C",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ]
+ },
+ {
+ "refId": "D",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ],
+ "groupBy": [
+ {
+ "type": "time",
+ "interval": "auto"
+ }
+ ],
+ "fields": [
+ {
+ "name": "value",
+ "func": "sum"
+ }
+ ],
+ "measurement": "webindex_links_exported",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_links_exported\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "alias": "exported / sec"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Links",
+ "tooltip": {
+ "shared": true,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "x-axis": true,
+ "y-axis": true,
+ "y_formats": [
+ "short",
+ "short"
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "datasource": null,
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {
+ "leftLogBase": 1,
+ "leftMax": null,
+ "leftMin": null,
+ "rightLogBase": 1,
+ "rightMax": null,
+ "rightMin": null,
+ "threshold1": null,
+ "threshold1Color": "rgba(216, 200, 27, 0.27)",
+ "threshold2": null,
+ "threshold2Color": "rgba(234, 112, 112, 0.22)"
+ },
+ "id": 9,
+ "interval": "30s",
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "alias": "pages / sec",
+ "fields": [
+ {
+ "func": "sum",
+ "name": "value"
+ }
+ ],
+ "groupBy": [
+ {
+ "interval": "auto",
+ "type": "time"
+ }
+ ],
+ "measurement": "webindex_pages_exported",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_pages_exported\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "refId": "A",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ]
+ },
+ {
+ "alias": "links / sec",
+ "fields": [
+ {
+ "func": "sum",
+ "name": "value"
+ }
+ ],
+ "groupBy": [
+ {
+ "interval": "auto",
+ "type": "time"
+ }
+ ],
+ "measurement": "webindex_links_exported",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_links_exported\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "refId": "B",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ]
+ },
+ {
+ "alias": "domains / sec",
+ "fields": [
+ {
+ "func": "sum",
+ "name": "value"
+ }
+ ],
+ "groupBy": [
+ {
+ "interval": "auto",
+ "type": "time"
+ }
+ ],
+ "measurement": "webindex_domains_exported",
+ "query": "SELECT sum(\"value\") AS \"value\" FROM \"webindex_domains_exported\" WHERE \"field\" = 'm1_rate' AND $timeFilter GROUP BY time($interval)",
+ "refId": "C",
+ "tags": [
+ {
+ "key": "field",
+ "operator": "=",
+ "value": "m1_rate"
+ }
+ ]
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Exported Comparison",
+ "tooltip": {
+ "shared": true,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "x-axis": true,
+ "y-axis": true,
+ "y_formats": [
+ "short",
+ "short"
+ ]
+ }
+ ],
+ "title": "New row"
+ },
+ {
+ "collapse": false,
+ "editable": true,
+ "height": "250px",
+ "panels": [],
+ "title": "New row"
+ },
+ {
+ "collapse": false,
+ "editable": true,
+ "height": "250px",
+ "panels": [],
+ "title": "New row"
+ }
+ ],
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {
+ "now": true,
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "templating": {
+ "list": []
+ },
+ "annotations": {
+ "list": []
+ },
+ "refresh": "30s",
+ "schemaVersion": 7,
+ "version": 1,
+ "links": []
+}
\ No newline at end of file
diff --git a/webindex/contrib/webindex.png b/webindex/contrib/webindex.png
new file mode 100644
index 0000000..1f1cf4f
--- /dev/null
+++ b/webindex/contrib/webindex.png
Binary files differ
diff --git a/webindex/contrib/webindex.svg b/webindex/contrib/webindex.svg
new file mode 100644
index 0000000..c9e3e98
--- /dev/null
+++ b/webindex/contrib/webindex.svg
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+ Copyright 2016 Webindex authors (see AUTHORS)
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- This logo was created using:
+ - Flow-merge icon by Stephen Hutchings (CC BY-SA license)
+ https://github.com/stephenhutchings/typicons.font
+ - Audiowide Font (SIL Open Font License 1.1)
+ https://www.google.com/fonts/specimen/Audiowide
+-->
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="744.09448819"
+ height="1052.3622047"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.47 r22583"
+ sodipodi:docname="webindex.svg">
+ <defs
+ id="defs4">
+ <inkscape:perspective
+ sodipodi:type="inkscape:persp3d"
+ inkscape:vp_x="0 : 526.18109 : 1"
+ inkscape:vp_y="0 : 1000 : 0"
+ inkscape:vp_z="744.09448 : 526.18109 : 1"
+ inkscape:persp3d-origin="372.04724 : 350.78739 : 1"
+ id="perspective10" />
+ <inkscape:perspective
+ id="perspective9120"
+ inkscape:persp3d-origin="12 : 8 : 1"
+ inkscape:vp_z="24 : 12 : 1"
+ inkscape:vp_y="0 : 1000 : 0"
+ inkscape:vp_x="0 : 12 : 1"
+ sodipodi:type="inkscape:persp3d" />
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="0.58630005"
+ inkscape:cx="493.67526"
+ inkscape:cy="526.18109"
+ inkscape:document-units="px"
+ inkscape:current-layer="layer1"
+ showgrid="false"
+ inkscape:window-width="1670"
+ inkscape:window-height="872"
+ inkscape:window-x="97"
+ inkscape:window-y="25"
+ inkscape:window-maximized="0" />
+ <metadata
+ id="metadata7">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer"
+ id="layer1">
+ <flowRoot
+ xml:space="preserve"
+ id="flowRoot2816"
+ style="font-size:40px;font-style:normal;font-weight:normal;fill:#ff6600;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+ transform="translate(100.05314,168.95898)"><flowRegion
+ id="flowRegion2818"><rect
+ id="rect2820"
+ width="457.14285"
+ height="217.14285"
+ x="120"
+ y="315.21933"
+ style="fill:#ff6600" /></flowRegion><flowPara
+ id="flowPara2822"
+ style="font-size:72px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#ff6600;font-family:Audiowide;-inkscape-font-specification:Audiowide">WebIndex</flowPara></flowRoot> <path
+ id="path9112"
+ d="m 209.78738,539.25646 0,-4.97635 c 0,-5.18875 -7.72387,-9.40964 -17.2188,-9.40964 -4.06856,0 -7.37949,-1.80934 -7.37949,-4.0327 l 0,-4.07842 c 5.71172,-1.11571 9.83932,-4.07033 9.83932,-7.57073 0,-4.44672 -6.62187,-8.0654 -14.75899,-8.0654 -8.13712,0 -14.75897,3.61868 -14.75897,8.0654 0,3.5004 4.12758,6.45502 9.83932,7.56804 l 0,4.08111 c 0,2.22336 -3.31094,4.0327 -7.3795,4.0327 -9.49494,0 -17.21881,4.22089 -17.21881,9.40964 l 0,4.97635 c -5.71172,1.11572 -9.83932,4.07035 -9.83932,7.57073 0,4.44673 6.62187,8.06542 14.75899,8.06542 8.1371,0 14.75897,-3.61869 14.75897,-8.06542 0,-3.50038 -4.12759,-6.45501 -9.83932,-7.57073 l 0,-4.97635 c 0,-2.22337 3.31093,-4.0327 7.37949,-4.0327 4.81635,0 9.17024,-1.09421 12.29915,-2.8444 3.12891,1.75019 7.4828,2.8444 12.29916,2.8444 4.06855,0 7.37948,1.80933 7.37948,4.0327 l 0,4.97635 c -5.71172,1.11572 -9.83932,4.07035 -9.83932,7.57073 0,4.44673 6.62187,8.06542 14.75899,8.06542 8.13712,0 14.75897,-3.61869 14.75897,-8.06542 0,-3.50038 -4.12759,-6.45501 -9.83932,-7.57073 z m -54.11625,10.25921 c -2.71566,0 -4.91967,-1.20713 -4.91967,-2.68848 0,-1.48134 2.20401,-2.68846 4.91967,-2.68846 2.71565,0 4.91965,1.20712 4.91965,2.68846 0,1.48135 -2.204,2.68848 -4.91965,2.68848 z m 24.59829,-43.01551 c 2.71565,0 4.91967,1.20712 4.91967,2.68846 0,1.48135 -2.20402,2.68848 -4.91967,2.68848 -2.71565,0 -4.91965,-1.20713 -4.91965,-2.68848 0,-1.48134 2.204,-2.68846 4.91965,-2.68846 z m 24.59831,43.01551 c -2.71566,0 -4.91967,-1.20713 -4.91967,-2.68848 0,-1.48134 2.20401,-2.68846 4.91967,-2.68846 2.71565,0 4.91965,1.20712 4.91965,2.68846 0,1.48135 -2.204,2.68848 -4.91965,2.68848 z"
+ style="fill:#0055d4" />
+ </g>
+</svg>
diff --git a/webindex/docs/code-guide.md b/webindex/docs/code-guide.md
new file mode 100644
index 0000000..5c1df10
--- /dev/null
+++ b/webindex/docs/code-guide.md
@@ -0,0 +1,84 @@
+
+# Code Guide
+
+The Webindex example has three major code components.
+
+ * Spark component : Generates initial Fluo and Query [tables].
+ * Fluo component : Updates the [Query table][qt] as web pages are added, removed, and updated.
+ * Web component : Web application that uses the [Query table][qt].
+
+Since all of these components either read or write the Query table, you may
+want to read about the [Query Table][qt] before reading about the code.
+
+## Guide to Fluo Component.
+
+The following image shows a high level view of how data flows through the Fluo
+Webindex code.
+
+<center><img src="webindex_graphic.png" alt="Observer Map"></center>
+<!--
+The image was produced using Google Docs. A link to the source is here.
+https://docs.google.com/drawings/d/1vl26uXtScXn1ssj3WEb-qskuH-15OOmWul1B562oWDc/edit?usp=sharing
+-->
+
+
+### Page Loader
+
+The [PageLoader] queues updated page content for processing by the [PageObserver].
+
+### Observer Provider
+
+All Observers are set up by [WebindexObservers]. This class wires up everything discussed below.
+
+### Page Observer
+
+The [PageObserver] computes changes to links in a page. It queues `+1` and `-1` for new and
+deleted URIs to the uriQ. It also queues up changes in URIs to the export queue.
+
+### URI Combine Queue
+
+A CombineQueue is set up to track the number of pages linking to a URI. The `reduce()` function in
+[UriInfo] combines multiple updates into a single value.
+[UriCombineQ.UriUpdateObserver][UriCombineQ] is called when a key's value changes. The update
+observer queues `+1` and `-1` to the domain map. The update observer also queues changes in URI
+inbound link counts to the export queue.
+
+### Domain Combine Queue
+
+A CombineQueue is set up to track the number of unique URIs observed in each domain. The
+SummingCombiner from Fluo Recipes combines updates.
+[DomainCombineQ.DomainUpdateObserver][DomainCombineQ] is called when
+a key's value changes and it queues the changes on the export queue.
+
+### Export Queue
+
+All other observers place [IndexUpdate] objects on the export queue. [IndexUpdateTranslator] is a
+function that translates [IndexUpdate]s to Accumulo Mutations. This function is passed to the Fluo
+Recipe that exports to Accumulo tables.
+
+[IndexUpdate] is implemented by the following classes:
+
+1. [DomainUpdate] - Updates information related to domain (like page count).
+
+2. [PageUpdate] - Updates information related to page (like links being added or deleted).
+
+3. [UriUpdate] - Updates information related to URI.
+
+These objects are translated to mutations using code in the [IndexClient].
+
+
+[PageLoader]: ../modules/data/src/main/java/webindex/data/fluo/PageLoader.java
+[PageObserver]: ../modules/data/src/main/java/webindex/data/fluo/PageObserver.java
+[WebindexObservers]: ../modules/data/src/main/java/webindex/data/fluo/WebindexObservers.java
+[UriCombineQ]: ../modules/data/src/main/java/webindex/data/fluo/UriCombineQ.java
+[DomainCombineQ]: ../modules/data/src/main/java/webindex/data/fluo/DomainCombineQ.java
+[IndexUpdateTranslator]: ../modules/data/src/main/java/webindex/data/fluo/IndexUpdateTranslator.java
+[IndexUpdate]: ../modules/core/src/main/java/webindex/core/models/export/IndexUpdate.java
+[DomainUpdate]: ../modules/core/src/main/java/webindex/core/models/export/DomainUpdate.java
+[PageUpdate]: ../modules/core/src/main/java/webindex/core/models/export/PageUpdate.java
+[UriUpdate]: ../modules/core/src/main/java/webindex/core/models/export/UriUpdate.java
+[UriInfo]: ../modules/core/src/main/java/webindex/core/models/UriInfo.java
+[IndexClient]: ../modules/core/src/main/java/webindex/core/IndexClient.java
+[qt]: tables.md#query-table-schema
+[tables]: tables.md
+
diff --git a/webindex/docs/install.md b/webindex/docs/install.md
new file mode 100644
index 0000000..207fa41
--- /dev/null
+++ b/webindex/docs/install.md
@@ -0,0 +1,142 @@
+# WebIndex Install
+
+Below are instructions for installing WebIndex on a cluster.
+
+## Requirements
+
+To run WebIndex, you need the following installed on your cluster:
+
+* Java
+* Hadoop (HDFS & YARN)
+* Accumulo
+* Fluo
+* Maven
+
+Hadoop & Accumulo should be running before starting these instructions. Fluo and Maven only need to
+be installed on the machine where you run the `webindex` command. Consider using [Uno] to setup
+Hadoop, Accumulo & Fluo if you are running on a single node.
+
+## Configure your environment
+
+First, clone the WebIndex repo:
+
+ git clone https://github.com/astralway/webindex.git
+
+Copy the configuration files in `conf/examples` to `conf/` and edit for your environment:
+
+ cd webindex/conf/
+ cp examples/* .
+ vim webindex.yml
+ vim webindex-env.sh
+
+## Download the paths file for a crawl
+
+For each crawl of the web, Common Crawl produces a file containing a list of paths to the data
+files produced by that crawl. The webindex `copy` and `load-s3` commands use this file to
+retrieve Common Crawl data stored in S3. The `getpaths` command below downloads this paths
+file for the April 2015 crawl (identified by `2015-18`) to the `paths/` directory as it will
+be necessary for future commands. If you would like to use a different crawl, the
+[Common Crawl website][cdata] has a list of possible crawls which are identified by the
+`YEAR-WEEK` (e.g. `2015-18`) of the time the crawl occurred.
+
+ ./bin/webindex getpaths 2015-18
+
+Take a look at the paths file that was just retrieved.
+
+ $ less paths/2015-18.wat.paths
+
+Each line in the paths file contains a path to a different common crawl data file. In later
+commands, you will select paths by specifying a range (in the format of `START-END`). Ranges
+are zero-based and their start/end points are inclusive. Therefore, a range of `4-6` would
+select the 3 paths at indexes 4, 5, and 6 (the 5th through 7th lines of the file). Use the
+command below to count the lines in a paths file; the max endpoint is that count minus one.
+
+ $ wc -l paths/2015-18.wat.paths
+ 38609 paths/2015-18.wat.paths
+
+The 2015-18 paths file has 38609 different paths. A range of `0-38608` would select all
+paths in the file.
+
+## Copy Common Crawl data from AWS into HDFS
+
+After retrieving a paths file, the command below runs a Spark job that copies data files from S3
+to HDFS. The command below will copy 3 files in the file range of `4-6` of the `2015-18` paths
+file into the HDFS directory `/cc/data/a`. Common Crawl data files are large (~330 MB each) so
+be mindful of how many you copy.
+
+ ./bin/webindex copy 2015-18 4-6 /cc/data/a
+
+To create multiple data sets, run the command with different range and HDFS directory.
+
+ ./bin/webindex copy 2015-18 7-8 /cc/data/b
+
+## Initialize the webindex Fluo application
+
+After copying data into HDFS, run the following to initialize and start the webindex
+Fluo application.
+
+ ./bin/webindex init
+
+Optionally, add an HDFS directory (with previously copied data) to the end of the command.
+When a directory is specified, `init` will run a Spark job that initializes the webindex
+Fluo application with data before starting it.
+
+ ./bin/webindex init /cc/data/a
+
+## Start the webindex Fluo application
+
+After the Fluo application has been initialized, pick a method below to run the application:
+
+1. Run local processes:
+
+        fluo oracle -a webindex &> oracle.log &
+        fluo worker -a webindex &> worker.log &
+
+1. Run in YARN:
+
+ fluo-yarn start webindex /path/to/fluo-yarn.properties
+
+1. [Run in Docker](https://fluo.apache.org/docs/fluo/1.2/administration/run-fluo-in-docker)
+
+
+## Load data into the webindex Fluo application
+
+The `init` command should only be run on an empty cluster. To add more data, run the
+`load-hdfs` or `load-s3` commands. Both start a Spark job that parses Common Crawl data
+and inserts this data into the Fluo table of the webindex application. The webindex Fluo
+observers will incrementally process this data and export indexes to Accumulo.
+
+The `load-hdfs` command below loads data stored in the HDFS directory `/cc/data/b` into
+Fluo.
+
+ ./bin/webindex load-hdfs /cc/data/b
+
+The `load-s3` command below loads data hosted on S3 into Fluo. It selects files in the
+`9-10` range of the `2015-18` paths file.
+
+ ./bin/webindex load-s3 2015-18 9-10
+
+## Compact Transient Ranges
+
+For long runs, this example has [transient ranges][transient] that need to be
+periodically compacted. This can be accomplished with the following command.
+
+```bash
+nohup fluo exec webindex org.apache.fluo.recipes.accumulo.cmds.CompactTransient 600 &> your_log_file.log &
+```
+
+As long as this command is running, it will initiate a compaction of all transient
+ranges every 10 minutes.
+
+## Run the webindex UI
+
+Run the following command to run the webindex UI which can be viewed at
+[http://localhost:4567/](http://localhost:4567/).
+
+ ./bin/webindex ui
+
+The UI queries indexes stored in Accumulo that were exported by Fluo.
+
+[Uno]: https://github.com/astralway/uno
+[transient]: https://github.com/apache/fluo-recipes/blob/master/docs/transient.md
+[cdata]: https://commoncrawl.org/the-data/get-started/
diff --git a/webindex/docs/tables.md b/webindex/docs/tables.md
new file mode 100644
index 0000000..cc9bdb8
--- /dev/null
+++ b/webindex/docs/tables.md
@@ -0,0 +1,109 @@
+# Webindex tables
+
+ The example uses two tables in Accumulo. One table is used by the
+Fluo component of the example. The second table stores an index used by the
+web application to service queries.
+
+Web pages are the input for the Webindex example and a query table is the
+output. The Fluo table is an intermediate table needed to incrementally keep
+the query table up to date.
+
+## Fluo table Schema
+
+Data is stored in the Fluo table using the following row prefixes.
+
+ Row Prefix | Description
+:----------:|------------
+ dm: | Under this prefix a `CollisionFreeMap<String, Long>` is stored. Domains are used for the map key. The map keeps track of the total number of unique URIs seen in each domain.
+ eq: | Under this prefix an `ExportQueue<String, Transmutable<String>>` is stored. This export queue is used to push one of three update types to the query table.
+ p: | Rows in this range contain information about individual pages content. Only new and current page content is stored per URI here.
+ um: | Under this prefix a `CollisionFreeMap<String, UriInfo>` is stored. URIs are used for the map key. The map keeps track of the number of URIs referencing a URI. It also keeps track of whether or not content for a URI was seen.
+
+## Query Table Schema
+
+The data in the query table is structured so that the web application can efficiently answer the following questions.
+
+ * Which web page is referenced by the most web pages?
+ * For a given domain, which pages in that domain are referenced by the most web pages?
+ * For a given web page, what pages reference it?
+
+To answer these questions the query table has three top level row prefixes :
+
+ Row Prefix | Description
+:----------:|------------
+ d: | Rows in this range contain domain information
+ p: | Rows in this range contain information about individual pages
+ t: | Rows in this range sort all URIs by reference count
+
+### Domain Row Range
+
+In the domain section of the table all rows are of the form `d:<domain>`. The
+following table shows the possible columns for domain rows.
+
+| Family | Qualifier | Timestamp | Value | Description
+|---------|----------------|-----------|---------------|-------------
+| domain | pagecount | \<eseq\> | \<#pages\> | Count of the number of rank columns in the row
+| rank | \<urc>:\<uri\> | \<eseq\> | \<urc\> | Count of how many times a URI in the domain is referenced. The count is encoded in the qualifier so that the URI with the most references sorts first.
+
+Legend :
+
+ * \<eseq\> : export sequence number
+ * \<urc\> : URI reference count
+
+### Page Row range
+
+In the page section of the table all rows are of the form `p:<uri>`. The
+following table shows the possible columns for page rows.
+
+| Family | Qualifier | Timestamp | Value | Description
+|---------|-------------------|-----------|-----------|------------
+| page | cur | \<eseq\> | \<json\> | The value contains information about a web page's outgoing links encoded in JSON.
+| page | incount | \<eseq\> | \<urc\> | A count of the number of pages that reference this URI. This is also the number of `inlinks` columns in this row.
+| inlinks | \<uri\> | \<eseq\> | \<anchor\> | A URI that references this URI/page. The value contains the anchor text from the referencing link.
+
+### Total row range
+
+All rows in this range are of the form `t:<uri-ref-count>:<uri>` and there are
+no columns. This row range contains all URIs sorted from most referenced to
+least referenced. The URI reference count in the row is encoded in a special
+way to achieve this sort order.
+
+### Example
+
+Input Data :
+
+ a.com/page1 links to c.com, b.com
+ b.com links to c.com/page1, c.com
+ d.com links to c.com
+
+Resulting Accumulo Table :
+
+ row cf cq value
+ --------------------------------------------------
+ d:com.a domain pagecount 1
+ rank 1:com.a/page1 1
+ d:com.b domain pagecount 1
+ rank 2:com.b 2
+ d:com.c domain pagecount 2
+ rank 3:com.c 3
+ 1:com.c/page1 1
+ d:com.d domain pagecount 1
+ rank 1:com.d 1
+ p:com.a/page1 page cur {"outlinkcount": 2, "outlinks":[c.com, b.com]}
+ incount 0
+ p:com.b inlinks com.a/page1 anchorText
+ page cur {"outlinkcount": 2, "outlinks":[c.com/page1, c.com]}
+ incount 1
+ p:com.c inlinks com.a/page1 anchorText
+ com.b anchorText
+ com.d anchorText
+ page incount 3
+ p:com.c/page1 inlinks com.b anchorText
+ page incount 1
+ p:com.d page cur {"outlinkcount": 1, "outlinks":[c.com]}
+ incount 0
+ t:3:com.c 3
+ t:2:com.b 2
+ t:1:com.c/page1 1
+ t:0:com.a/page1 0
+ t:0:com.d 0
diff --git a/webindex/docs/webindex_graphic.png b/webindex/docs/webindex_graphic.png
new file mode 100644
index 0000000..24ea450
--- /dev/null
+++ b/webindex/docs/webindex_graphic.png
Binary files differ
diff --git a/webindex/modules/core/pom.xml b/webindex/modules/core/pom.xml
new file mode 100644
index 0000000..25c7a63
--- /dev/null
+++ b/webindex/modules/core/pom.xml
@@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2015 Webindex authors (see AUTHORS)
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-parent</artifactId>
+ <version>0.0.1-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+ <artifactId>webindex-core</artifactId>
+ <name>WebIndex Core</name>
+ <dependencies>
+ <dependency>
+ <groupId>com.esotericsoftware.yamlbeans</groupId>
+ <artifactId>yamlbeans</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-validator</groupId>
+ <artifactId>commons-validator</artifactId>
+ <version>1.4.1</version> <!-- NOTE(review): only dependency in this POM with a locally pinned version; the others omit it. Confirm this is intentional. -->
+ </dependency>
+ <dependency>
+ <groupId>org.apache.accumulo</groupId>
+ <artifactId>accumulo-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-accumulo</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+ <!-- Test Dependencies -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/webindex/modules/core/src/main/java/webindex/core/Constants.java b/webindex/modules/core/src/main/java/webindex/core/Constants.java
new file mode 100644
index 0000000..3597456
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/Constants.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core;
+
+import org.apache.fluo.api.data.Column;
+import org.apache.fluo.recipes.core.types.StringEncoder;
+import org.apache.fluo.recipes.core.types.TypeLayer;
+
+public class Constants {
+
+ // Column Families
+ // for page
+ public static final String PAGE = "page";
+ public static final String INLINKS = "inlinks";
+ // for domains
+ public static final String DOMAIN = "domain";
+ public static final String PAGES = "pages";
+ public static final String RANK = "rank";
+
+ // Column Qualifiers
+ // for page
+ public static final String INCOUNT = "incount";
+ public static final String NEW = "new";
+ public static final String CUR = "cur";
+ // for domains
+ public static final String PAGECOUNT = "pagecount";
+
+ // Columns
+ public static final Column PAGE_NEW_COL = new Column(PAGE, NEW);
+ public static final Column PAGE_CUR_COL = new Column(PAGE, CUR);
+ public static final Column PAGE_INCOUNT_COL = new Column(PAGE, INCOUNT);
+ public static final Column PAGECOUNT_COL = new Column(DOMAIN, PAGECOUNT);
+
+ public static final TypeLayer TYPEL = new TypeLayer(new StringEncoder());
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/IndexClient.java b/webindex/modules/core/src/main/java/webindex/core/IndexClient.java
new file mode 100644
index 0000000..ba38bf0
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/IndexClient.java
@@ -0,0 +1,320 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.function.Consumer;
+
+import com.google.gson.Gson;
+import org.apache.accumulo.core.client.Connector;
+import org.apache.accumulo.core.client.Scanner;
+import org.apache.accumulo.core.client.TableNotFoundException;
+import org.apache.accumulo.core.client.lexicoder.Lexicoder;
+import org.apache.accumulo.core.client.lexicoder.ReverseLexicoder;
+import org.apache.accumulo.core.client.lexicoder.ULongLexicoder;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.security.Authorizations;
+import org.apache.commons.codec.binary.Hex;
+import org.apache.fluo.api.data.Bytes;
+import org.apache.fluo.api.data.Column;
+import org.apache.fluo.api.data.RowColumn;
+import org.apache.fluo.recipes.accumulo.export.function.AccumuloTranslator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.models.DomainStats;
+import webindex.core.models.Link;
+import webindex.core.models.Links;
+import webindex.core.models.Page;
+import webindex.core.models.Pages;
+import webindex.core.models.TopResults;
+import webindex.core.models.URL;
+import webindex.core.models.UriInfo;
+import webindex.core.models.export.DomainUpdate;
+import webindex.core.models.export.PageUpdate;
+import webindex.core.models.export.UriUpdate;
+import webindex.core.util.Pager;
+
+public class IndexClient {
+
+ private static final Logger log = LoggerFactory.getLogger(IndexClient.class);
+ private static final int PAGE_SIZE = 25;
+
+ private Connector conn;
+ private String accumuloIndexTable;
+ private Gson gson = new Gson();
+
+ public IndexClient(String accumuloIndexTable, Connector conn) {
+ this.accumuloIndexTable = accumuloIndexTable;
+ this.conn = conn;
+ }
+
+ public TopResults getTopResults(String next, int pageNum) {
+ // Reads the "t:" row range through a Pager built with PAGE_SIZE and gathers URL/count pairs.
+ TopResults results = new TopResults();
+ // An empty next reads by pageNum; otherwise reading starts at the Key built from next.
+ results.setPageNum(pageNum);
+ try {
+ Scanner scanner = conn.createScanner(accumuloIndexTable, Authorizations.EMPTY);
+ Pager pager = Pager.build(scanner, Range.prefix("t:"), PAGE_SIZE, entry -> {
+ String row = entry.getKey().getRow().toString();
+ if (entry.isNext()) {
+ results.setNext(row); // NOTE(review): presumably the row where the following page starts; confirm in Pager
+ } else {
+ String url = URL.fromUri(row.split(":", 3)[2]).toString(); // third ':'-separated field of the row
+ Long num = Long.parseLong(entry.getValue().toString());
+ results.addResult(url, num);
+ }
+ });
+ if (next.isEmpty()) {
+ pager.read(pageNum);
+ } else {
+ pager.read(new Key(next));
+ }
+ } catch (TableNotFoundException e) { // only logged; whatever was gathered is still returned
+ log.error("Table {} not found", accumuloIndexTable);
+ }
+ return results;
+ }
+
+ private static Long getLongValue(Map.Entry<Key, Value> entry) {
+ return Long.parseLong(entry.getValue().toString());
+ }
+
+  /** Returns the indexed page for {@code rawUrl}, or null when the URL cannot be parsed. */
+  public Page getPage(String rawUrl) {
+    URL url;
+    try {
+      url = URL.from(rawUrl);
+    } catch (Exception e) {
+      log.error("Failed to parse URL {}", rawUrl);
+      return null;
+    }
+    Page page = null;
+    Long incount = (long) 0;
+    try {
+      Scanner scanner = conn.createScanner(accumuloIndexTable, Authorizations.EMPTY);
+      scanner.setRange(Range.exact("p:" + url.toUri(), Constants.PAGE));
+      for (Map.Entry<Key, Value> entry : scanner) {
+        switch (entry.getKey().getColumnQualifier().toString()) {
+          case Constants.INCOUNT:
+            incount = getLongValue(entry);
+            break;
+          case Constants.CUR:
+            page = gson.fromJson(entry.getValue().toString(), Page.class);
+            break;
+          default:
+            log.error("Unknown page stat {}", entry.getKey().getColumnQualifier());
+        }
+      }
+    } catch (TableNotFoundException e) {
+      // Log through slf4j, as the other queries in this class do, rather than to stderr.
+      log.error("Table {} not found", accumuloIndexTable);
+    }
+    if (page == null) {
+      page = new Page(url.toUri()); // no stored page JSON: return a bare page with its count
+    }
+    page.setNumInbound(incount);
+    return page;
+  }
+
+ public DomainStats getDomainStats(String domain) {
+ DomainStats stats = new DomainStats(domain);
+ Scanner scanner;
+ try {
+ scanner = conn.createScanner(accumuloIndexTable, Authorizations.EMPTY);
+ scanner.setRange(Range.exact("d:" + URL.reverseHost(domain), Constants.DOMAIN));
+ for (Map.Entry<Key, Value> entry : scanner) {
+ switch (entry.getKey().getColumnQualifier().toString()) {
+ case Constants.PAGECOUNT:
+ stats.setTotal(getLongValue(entry));
+ break;
+ default:
+ log.error("Unknown page domain {}", entry.getKey().getColumnQualifier());
+ }
+ }
+ } catch (TableNotFoundException e) {
+ e.printStackTrace();
+ }
+ return stats;
+ }
+
+ public Pages getPages(String domain, String next, int pageNum) {
+ DomainStats stats = getDomainStats(domain);
+ Pages pages = new Pages(domain, pageNum);
+ pages.setTotal(stats.getTotal());
+ String row = "d:" + URL.reverseHost(domain);
+ String cf = Constants.RANK;
+ try {
+ Scanner scanner = conn.createScanner(accumuloIndexTable, Authorizations.EMPTY);
+ Pager pager =
+ Pager.build(scanner, Range.prefix(row + ":"), PAGE_SIZE, entry -> {
+ if (entry.isNext()) {
+ pages.setNext(entry.getKey().getRowData().toString().split(":", 3)[2]);
+ } else {
+ String url =
+ URL.fromUri(entry.getKey().getRowData().toString().split(":", 4)[3]).toString();
+ Long count = Long.parseLong(entry.getValue().toString());
+ pages.addPage(url, count);
+ }
+ });
+ if (next.isEmpty()) {
+ pager.read(pageNum);
+ } else {
+ pager.read(new Key(row + ":" + next, cf, ""));
+
+ }
+ } catch (TableNotFoundException e) {
+ log.error("Table {} not found", accumuloIndexTable);
+ }
+ return pages;
+ }
+
+ public Links getLinks(String rawUrl, String linkType, String next, int pageNum) {
+
+ Links links = new Links(rawUrl, linkType, pageNum);
+
+ URL url;
+ try {
+ url = URL.from(rawUrl);
+ } catch (Exception e) {
+ log.error("Failed to parse URL: " + rawUrl);
+ return links;
+ }
+
+ try {
+ Scanner scanner = conn.createScanner(accumuloIndexTable, Authorizations.EMPTY);
+ String row = "p:" + url.toUri();
+ if (linkType.equals("in")) {
+ Page page = getPage(rawUrl);
+ String cf = Constants.INLINKS;
+ links.setTotal(page.getNumInbound());
+ Pager pager = Pager.build(scanner, Range.exact(row, cf), PAGE_SIZE, entry -> {
+ String uri = entry.getKey().getColumnQualifier().toString();
+ if (entry.isNext()) {
+ links.setNext(uri);
+ } else {
+ String anchorText = entry.getValue().toString();
+ links.addLink(Link.of(uri, anchorText));
+ }
+ });
+ if (next.isEmpty()) {
+ pager.read(pageNum);
+ } else {
+ pager.read(new Key(row, cf, next));
+ }
+ } else {
+ scanner.setRange(Range.exact(row, Constants.PAGE, Constants.CUR));
+ Iterator<Map.Entry<Key, Value>> iter = scanner.iterator();
+ if (iter.hasNext()) {
+ Page curPage = gson.fromJson(iter.next().getValue().toString(), Page.class);
+ links.setTotal(curPage.getNumOutbound());
+ int skip = 0;
+ int add = 0;
+ for (Link l : curPage.getOutboundLinks()) {
+ if (skip < (pageNum * PAGE_SIZE)) {
+ skip++;
+ } else if (add < PAGE_SIZE) {
+ links.addLink(l);
+ add++;
+ } else {
+ links.setNext(l.getUri());
+ break;
+ }
+ }
+ }
+ }
+ } catch (TableNotFoundException e) {
+ log.error("Table {} not found", accumuloIndexTable);
+ }
+ return links;
+ }
+
+ public static void genDomainMutations(DomainUpdate update, long seq, Consumer<Mutation> consumer) {
+ Map<RowColumn, Bytes> oldData = genDomainData(update.getDomain(), update.getOldPageCount());
+ Map<RowColumn, Bytes> newData = genDomainData(update.getDomain(), update.getNewPageCount());
+ AccumuloTranslator.generateMutations(seq, oldData, newData, consumer);
+ }
+
+ public static Map<RowColumn, Bytes> genDomainData(String domain, Long pageCount) {
+ if (pageCount == 0) {
+ return Collections.emptyMap();
+ }
+ return Collections.singletonMap(new RowColumn("d:" + domain, Constants.PAGECOUNT_COL),
+ Bytes.of(pageCount + ""));
+ }
+
+ public static void genPageMutations(PageUpdate update, long seq, Consumer<Mutation> consumer) {
+ Mutation jsonMutation = new Mutation("p:" + update.getUri());
+ if (update.getJson().equals(Page.DELETE_JSON)) {
+ jsonMutation.putDelete(Constants.PAGE, Constants.CUR, seq);
+ } else {
+ jsonMutation.put(Constants.PAGE, Constants.CUR, seq, update.getJson());
+ }
+ consumer.accept(jsonMutation);
+
+ // invert links on export
+ for (Link link : update.getAddedLinks()) {
+ Mutation m = new Mutation("p:" + link.getUri());
+ m.put(Constants.INLINKS, update.getUri(), seq, link.getAnchorText());
+ consumer.accept(m);
+ }
+
+ for (Link link : update.getDeletedLinks()) {
+ Mutation m = new Mutation("p:" + link.getUri());
+ m.putDelete(Constants.INLINKS, update.getUri(), seq);
+ consumer.accept(m);
+ }
+ }
+
+ public static void genUriMutations(UriUpdate update, long seq, Consumer<Mutation> consumer) {
+ Map<RowColumn, Bytes> oldData = genUriData(update.getUri(), update.getOldInfo());
+ Map<RowColumn, Bytes> newData = genUriData(update.getUri(), update.getNewInfo());
+ AccumuloTranslator.generateMutations(seq, oldData, newData, consumer);
+ }
+
+ public static Map<RowColumn, Bytes> genUriData(String uri, UriInfo info) {
+ if (info.equals(UriInfo.ZERO)) {
+ return Collections.emptyMap();
+ }
+
+ Map<RowColumn, Bytes> rcMap = new HashMap<>();
+ Bytes linksTo = Bytes.of("" + info.linksTo);
+ rcMap.put(new RowColumn(createTotalRow(uri, info.linksTo), Column.EMPTY), linksTo);
+ String domain = URL.fromUri(uri).getReverseDomain();
+ String domainRow = encodeDomainRankUri(domain, info.linksTo, uri);
+ rcMap.put(new RowColumn(domainRow, new Column(Constants.RANK, "")), linksTo);
+ rcMap.put(new RowColumn("p:" + uri, Constants.PAGE_INCOUNT_COL), linksTo);
+ return rcMap;
+ }
+
+ public static String revEncodeLong(Long num) {
+ Lexicoder<Long> lexicoder = new ReverseLexicoder<>(new ULongLexicoder());
+ return Hex.encodeHexString(lexicoder.encode(num));
+ }
+
+ public static String encodeDomainRankUri(String domain, long linksTo, String uri) {
+ return "d:" + domain + ":" + revEncodeLong(linksTo) + ":" + uri;
+ }
+
+ private static String createTotalRow(String uri, long curr) {
+ return "t:" + revEncodeLong(curr) + ":" + uri;
+ }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/WebIndexConfig.java b/webindex/modules/core/src/main/java/webindex/core/WebIndexConfig.java
new file mode 100644
index 0000000..2f44cdd
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/WebIndexConfig.java
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core;
+
+import java.io.File;
+import java.io.FileReader;
+
+import com.esotericsoftware.yamlbeans.YamlReader;
+import com.google.common.base.Preconditions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class WebIndexConfig {
+
+ private static final Logger log = LoggerFactory.getLogger(WebIndexConfig.class);
+
+ public static String CC_URL_PREFIX = "https://commoncrawl.s3.amazonaws.com/";
+ public static final String WI_EXECUTOR_INSTANCES = "WI_EXECUTOR_INSTANCES";
+
+ public String fluoHome;
+ public String hadoopConfDir;
+ public String accumuloIndexTable;
+ public String fluoApp;
+ public int numTablets = -1;
+ public int numBuckets = -1;
+ public String hdfsTempDir;
+ public int loadRateLimit = 0;
+
+ public String getConnPropsPath() {
+ return addSlash(fluoHome) + "conf/fluo-conn.properties";
+ }
+
+  public int getNumExecutorInstances() { // value is read from the WI_EXECUTOR_INSTANCES env var
+    String numInstances = getEnv(WI_EXECUTOR_INSTANCES);
+    try {
+      return Integer.parseInt(numInstances);
+    } catch (NumberFormatException e) { // chain the cause so the parse failure stays traceable
+      throw new IllegalStateException(
+          "Failed to parse value of " + numInstances + " for " + WI_EXECUTOR_INSTANCES, e);
+    }
+  }
+
+ public int getLoadRateLimit() {
+ return loadRateLimit;
+ }
+
+ public static String getEnv(String name) {
+ String value = System.getenv(name);
+ if (value == null) {
+ throw new IllegalStateException(name + " must be set in environment!");
+ }
+ return value;
+ }
+
+ public static String getEnvPath(String name) {
+ String path = getEnv(name);
+ if (!(new File(path).exists())) {
+ throw new IllegalStateException("Directory set by " + name + "=" + path + " does not exist");
+ }
+ return path;
+ }
+
+ public static WebIndexConfig load() {
+ final String homePath = getEnvPath("WI_HOME");
+ final String userPath = homePath + "/conf/webindex.yml";
+ final String defaultPath = homePath + "/conf/examples/webindex.yml";
+ if ((new File(userPath).exists())) {
+ log.info("Using user config at {}", userPath);
+ return load(userPath);
+ } else {
+ log.info("Using default config at {}", defaultPath);
+ return load(defaultPath);
+ }
+ }
+
+ public static WebIndexConfig load(String configPath) {
+ return load(configPath, true);
+ }
+
+  protected static WebIndexConfig load(String configPath, boolean useEnv) {
+    Preconditions.checkArgument(new File(configPath).exists(), "Config does not exist at "
+        + configPath);
+    // try-with-resources closes the config file once parsing finishes or fails.
+    try (FileReader fileReader = new FileReader(configPath)) {
+      WebIndexConfig config = new YamlReader(fileReader).read(WebIndexConfig.class);
+      if (useEnv) { // HADOOP_CONF_DIR / FLUO_HOME from the environment replace file values
+        config.hadoopConfDir = getEnvPath("HADOOP_CONF_DIR");
+        config.fluoHome = getEnvPath("FLUO_HOME");
+      }
+      return config;
+    } catch (Exception e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  /** Returns {@code prefix} with a trailing "/" appended unless one is already present. */
+  public static String addSlash(String prefix) {
+    final boolean alreadyTerminated = prefix.endsWith("/");
+    // Append only when needed so the result never gains a doubled separator.
+    return alreadyTerminated ? prefix : prefix + "/";
+  }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/DomainStats.java b/webindex/modules/core/src/main/java/webindex/core/models/DomainStats.java
new file mode 100644
index 0000000..7459f4f
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/DomainStats.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+public class DomainStats {
+
+ private String domain;
+ private Long total = (long) 0;
+
+ public DomainStats(String domain) {
+ this.domain = domain;
+ }
+
+ public String getDomain() {
+ return domain;
+ }
+
+ public Long getTotal() {
+ return total;
+ }
+
+ public void setTotal(Long total) {
+ this.total = total;
+ }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/Link.java b/webindex/modules/core/src/main/java/webindex/core/models/Link.java
new file mode 100644
index 0000000..5d8ed5e
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/Link.java
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2016 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+public class Link implements Serializable, Comparable<Link> {
+
+ private static final long serialVersionUID = 1L;
+
+ private String url;
+ private String uri;
+ private String anchorText;
+
+ public Link() {}
+
+ public Link(String uri, String anchorText) {
+ Objects.requireNonNull(uri);
+ Objects.requireNonNull(anchorText);
+ this.url = URL.fromUri(uri).toString();
+ this.uri = uri;
+ this.anchorText = anchorText;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public String getUri() {
+ return uri;
+ }
+
+ public String getAnchorText() {
+ return anchorText;
+ }
+
+
+ public static Link of(String uri, String anchorText) {
+ return new Link(uri, anchorText);
+ }
+
+ public static Link of(String uri) {
+ return new Link(uri, "");
+ }
+
+ public static Link of(URL url, String anchorText) {
+ return new Link(url.toUri(), anchorText);
+ }
+
+ public static Link of(URL url) {
+ return new Link(url.toUri(), "");
+ }
+
+  @Override
+  public boolean equals(Object o) {
+    if (o instanceof Link) {
+      Link other = (Link) o;
+      // url/uri are null after the no-arg constructor, so compare null-safely.
+      return Objects.equals(url, other.url) && Objects.equals(uri, other.uri);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    // Same value as before for non-null fields; a null field contributes 0 instead of throwing.
+    return 31 * Objects.hashCode(url) + Objects.hashCode(uri);
+  }
+
+ @Override
+ public int compareTo(Link o) {
+ int c = uri.compareTo(o.uri);
+ if (c == 0) {
+ c = url.compareTo(o.url);
+ }
+
+ return c;
+ }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/Links.java b/webindex/modules/core/src/main/java/webindex/core/models/Links.java
new file mode 100644
index 0000000..c870852
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/Links.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class Links {
+
+ private String url;
+ private String linkType;
+ private String next = "";
+ private Integer pageNum;
+ private Long total;
+ private List<Link> links = new ArrayList<>();
+
+ public Links() {
+ // Jackson deserialization
+ }
+
+ public Links(String url, String linkType, Integer pageNum) {
+ this.url = url;
+ this.linkType = linkType;
+ this.pageNum = pageNum;
+ }
+
+ public Long getTotal() {
+ return total;
+ }
+
+ public void setTotal(Long total) {
+ this.total = total;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public List<Link> getLinks() {
+ return links;
+ }
+
+ public void addLink(Link link) {
+ links.add(link);
+ }
+
+ public String getLinkType() {
+ return linkType;
+ }
+
+ public Integer getPageNum() {
+ return pageNum;
+ }
+
+ public String getNext() {
+ return next;
+ }
+
+ public void setNext(String next) {
+ this.next = next;
+ }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/Page.java b/webindex/modules/core/src/main/java/webindex/core/models/Page.java
new file mode 100644
index 0000000..3f8117a
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/Page.java
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+import java.io.Serializable;
+import java.util.Objects;
+import java.util.Set;
+import java.util.TreeSet;
+
+import com.google.gson.Gson;
+
+public class Page implements Serializable {
+
+ private static final long serialVersionUID = 1L;
+
+ public static final Page EMPTY = new Page();
+ public static final Page DELETE = new Page(true);
+ public static final String DELETE_JSON = "delete";
+
+ private String url;
+ private String uri;
+ private Long numInbound;
+ private Long numOutbound = 0L;
+ private String crawlDate;
+ private String server;
+ private String title;
+ // This is a tree set so that json serializes consistently. Wanted to use hashset and sort on
+ // serialization, but could not figure out how to do that.
+ private Set<Link> outboundLinks = new TreeSet<>();
+ private transient boolean isDelete = false;
+
+ private Page() {}
+
+ private Page(boolean isDelete) {
+ this.isDelete = isDelete;
+ }
+
+ public Page(String uri) {
+ Objects.requireNonNull(uri);
+ this.url = URL.fromUri(uri).toString();
+ this.uri = uri;
+ }
+
+ public String getServer() {
+ return server;
+ }
+
+ public void setServer(String server) {
+ this.server = server;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public String getUri() {
+ return uri;
+ }
+
+ public Set<Link> getOutboundLinks() {
+ return outboundLinks;
+ }
+
+ /**
+ * @return True if page did not already contain link
+ */
+ public boolean addOutbound(Link link) {
+ boolean added = outboundLinks.add(link);
+ if (added) {
+ numOutbound++;
+ }
+ return added;
+ }
+
+ /**
+ * @return True if link was removed
+ */
+ public boolean removeOutbound(Link link) {
+ boolean removed = outboundLinks.remove(link);
+ if (removed) {
+ numOutbound--;
+ }
+ return removed;
+ }
+
+ public boolean isEmpty() {
+ return url == null && outboundLinks.isEmpty();
+ }
+
+ public String getDomain() {
+ return URL.fromUri(uri).getDomain();
+ }
+
+ public Long getNumInbound() {
+ return numInbound;
+ }
+
+ public void setNumInbound(Long numInbound) {
+ this.numInbound = numInbound;
+ }
+
+ public Long getNumOutbound() {
+ return numOutbound;
+ }
+
+ public String getCrawlDate() {
+ return crawlDate;
+ }
+
+ public void setCrawlDate(String crawlDate) {
+ this.crawlDate = crawlDate;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public void setTitle(String title) {
+ this.title = title;
+ }
+
+ public boolean isDelete() {
+ return isDelete;
+ }
+
+ public static Page fromJson(Gson gson, String pageJson) {
+ if (pageJson.isEmpty()) {
+ return Page.EMPTY;
+ }
+
+ if (pageJson.equals(DELETE_JSON)) {
+ return Page.DELETE;
+ }
+
+ return gson.fromJson(pageJson, Page.class);
+ }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/Pages.java b/webindex/modules/core/src/main/java/webindex/core/models/Pages.java
new file mode 100644
index 0000000..de06db3
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/Pages.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class Pages {
+
+ private String domain;
+ private String next = "";
+ private Integer pageNum;
+ private Long total;
+ private List<PageScore> pages = new ArrayList<>();
+
+ public Pages() {
+ // Jackson deserialization
+ }
+
+ public Pages(String domain, Integer pageNum) {
+ this.domain = domain;
+ this.pageNum = pageNum;
+ }
+
+ public Long getTotal() {
+ return total;
+ }
+
+ public void setTotal(Long total) {
+ this.total = total;
+ }
+
+ public String getDomain() {
+ return domain;
+ }
+
+ public List<PageScore> getPages() {
+ return pages;
+ }
+
+ public String getNext() {
+ return next;
+ }
+
+ public void setNext(String next) {
+ this.next = next;
+ }
+
+ public Integer getPageNum() {
+ return pageNum;
+ }
+
+ public void addPage(PageScore pc) {
+ pages.add(pc);
+ }
+
+ public void addPage(String url, Long score) {
+ pages.add(new PageScore(url, score));
+ }
+
+ public class PageScore {
+
+ private String url;
+ private Long score;
+
+ public PageScore(String url, Long score) {
+ this.url = url;
+ this.score = score;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public Long getScore() {
+ return score;
+ }
+ }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/TopResults.java b/webindex/modules/core/src/main/java/webindex/core/models/TopResults.java
new file mode 100644
index 0000000..6171b2e
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/TopResults.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class TopResults {
+
+ private String next = "";
+ private Integer pageNum;
+ private List<Result> results = new ArrayList<>();
+
+ public Integer getPageNum() {
+ return pageNum;
+ }
+
+ public void setPageNum(Integer pageNum) {
+ this.pageNum = pageNum;
+ }
+
+ public String getNext() {
+ return next;
+ }
+
+ public void addResult(String key, Long value) {
+ results.add(new Result(key, value));
+ }
+
+ public List<Result> getResults() {
+ return results;
+ }
+
+ public void setNext(String next) {
+ this.next = next;
+ }
+
+ public class Result {
+
+ private String key;
+ private Long value;
+
+ Result(String key, Long value) {
+ this.key = key;
+ this.value = value;
+ }
+
+ public String getKey() {
+ return key;
+ }
+
+ public Long getValue() {
+ return value;
+ }
+ }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/URL.java b/webindex/modules/core/src/main/java/webindex/core/models/URL.java
new file mode 100644
index 0000000..98722b6
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/URL.java
@@ -0,0 +1,353 @@
+/*
+ * Copyright 2016 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+
+import com.google.common.net.HostSpecifier;
+import com.google.common.net.InternetDomainName;
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.validator.routines.InetAddressValidator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A parsed http/https URL split into domain, host, port and path. Besides the usual string form
+ * ({@link #toString()}) it has a URI form ({@link #toUri()}) made of four parts separated by
+ * {@code >}: the reversed domain, the rest of the reversed host, {@code s} or {@code o} followed
+ * by any non-default port, and the path. Example: {@code https://a.b.c.com:8329/} has the URI
+ * form {@code com.c>.b.a>s8329>/}.
+ */
+public class URL implements Serializable {
+
+  private static final Logger log = LoggerFactory.getLogger(URL.class);
+
+  private static final String URL_SEP_REGEX = "[/?#]";
+  private static final String HTTP_PROTO = "http://";
+  private static final String HTTPS_PROTO = "https://";
+  private static final String URI_SEP = ">";
+  private static final int MAX_PORT = 65535;
+  // Compiled once rather than on every isImage() call.
+  private static final Pattern IMAGE_PATH =
+      Pattern.compile("([^\\s]+(\\.(?i)(jpeg|jpg|png|gif|bmp))$)");
+  public static final InetAddressValidator validator = InetAddressValidator.getInstance();
+
+  private static final long serialVersionUID = 1L;
+
+  private String domain;
+  private String host;
+  private String path;
+  private int port;
+  private boolean secure;
+  private boolean ipHost;
+
+  /**
+   * Builds a URL from already-parsed parts. Only null checks are performed here.
+   *
+   * @throws NullPointerException if domain, host or path is null
+   */
+  public URL(String domain, String host, String path, int port, boolean secure, boolean ipHost) {
+    Objects.requireNonNull(domain);
+    Objects.requireNonNull(host);
+    Objects.requireNonNull(path);
+    this.domain = domain;
+    this.host = host;
+    this.path = path;
+    this.port = port;
+    this.secure = secure;
+    this.ipHost = ipHost;
+  }
+
+  /**
+   * Logs {@code msg} (error level when {@code logError} is true, debug level otherwise) and then
+   * throws.
+   *
+   * @throws IllegalArgumentException always, with {@code msg} as its message
+   */
+  public static void badUrl(boolean logError, String msg) {
+    if (logError) {
+      log.error(msg);
+    } else {
+      log.debug(msg);
+    }
+    throw new IllegalArgumentException(msg);
+  }
+
+  public static String domainFromHost(String host) {
+    return InternetDomainName.from(host).topPrivateDomain().name();
+  }
+
+  public static boolean isValidHost(String host) {
+    return HostSpecifier.isValid(host) && InternetDomainName.isValid(host)
+        && InternetDomainName.from(host).isUnderPublicSuffix();
+  }
+
+  /** Parses {@code rawUrl} with {@link #domainFromHost} and {@link #isValidHost}. */
+  public static URL from(String rawUrl) {
+    return URL.from(rawUrl, URL::domainFromHost, URL::isValidHost);
+  }
+
+  /**
+   * Parses a raw http:// or https:// URL. Surrounding whitespace is ignored, the host is
+   * lower-cased, a missing path becomes "/", and the port defaults to 80 (http) or 443 (https).
+   *
+   * @param rawUrl text to parse
+   * @param domainFromHost maps a non-IP host to its domain
+   * @param isValidHost decides whether a non-IP host is acceptable
+   * @throws IllegalArgumentException if the URL contains {@code >}, lacks an http/https prefix,
+   *         has an empty or rejected host, or has a port that is not a number in 0..65535
+   */
+  public static URL from(String rawUrl, Function<String, String> domainFromHost,
+      Function<String, Boolean> isValidHost) {
+
+    if (rawUrl.contains(URI_SEP)) {
+      badUrl(false, "Skipping raw URL as it contains '" + URI_SEP + "':" + rawUrl);
+    }
+
+    String trimUrl = rawUrl.trim();
+    if (trimUrl.length() < 8) {
+      badUrl(false, "Raw URL is too short to start with valid protocol: " + rawUrl);
+    }
+
+    String urlNoProto = "";
+    boolean secure = false;
+    int port = 80;
+    if (trimUrl.substring(0, 7).equalsIgnoreCase(HTTP_PROTO)) {
+      urlNoProto = trimUrl.substring(7);
+    } else if (trimUrl.substring(0, 8).equalsIgnoreCase(HTTPS_PROTO)) {
+      urlNoProto = trimUrl.substring(8);
+      secure = true;
+      port = 443;
+    } else {
+      badUrl(false, "Raw URL does not start with valid protocol: " + rawUrl);
+    }
+
+    // Everything from the first '/', '?' or '#' onward is the path, separator included.
+    String hostPort;
+    String[] args = urlNoProto.split(URL_SEP_REGEX, 2);
+    String path;
+    if (args.length == 2) {
+      // Locale.ROOT keeps host lower-casing independent of the JVM's default locale.
+      hostPort = args[0].toLowerCase(Locale.ROOT);
+      int sepIndex = args[0].length();
+      path = urlNoProto.substring(sepIndex, sepIndex + 1) + args[1];
+    } else {
+      hostPort = urlNoProto.toLowerCase(Locale.ROOT);
+      path = "/";
+    }
+
+    args = hostPort.split(":", 2);
+    String host;
+    if (args.length == 2) {
+      host = args[0];
+      try {
+        port = Integer.parseInt(args[1]);
+      } catch (NumberFormatException e) {
+        badUrl(false, "Raw URL (" + rawUrl + ") has invalid port: " + args[1]);
+      }
+      // parseInt accepts values such as -1 or 99999, which are not usable port numbers.
+      if (port < 0 || port > MAX_PORT) {
+        badUrl(false, "Raw URL (" + rawUrl + ") has invalid port: " + args[1]);
+      }
+    } else {
+      host = hostPort;
+    }
+
+    if (host.isEmpty()) {
+      badUrl(false, "Raw URL cannot have empty host: " + rawUrl);
+    }
+
+    String domain = host;
+    boolean ipHost = isValidIP(host);
+    if (!ipHost) {
+      if (!isValidHost.apply(host)) {
+        badUrl(false, "Raw URL (" + rawUrl + ") has invalid host: " + host);
+      }
+      domain = domainFromHost.apply(host);
+    }
+
+    return new URL(domain, host, path, port, secure, ipHost);
+  }
+
+  public static boolean isValid(String rawUrl) {
+    return URL.isValid(rawUrl, URL::domainFromHost, URL::isValidHost);
+  }
+
+  /** Returns true when {@link #from(String, Function, Function)} accepts {@code rawUrl}. */
+  public static boolean isValid(String rawUrl, Function<String, String> domainFromHost,
+      Function<String, Boolean> isValidHost) {
+    try {
+      from(rawUrl, domainFromHost, isValidHost);
+      return true;
+    } catch (Exception e) {
+      return false;
+    }
+  }
+
+  public static boolean isValidIP(String host) {
+    return validator.isValid(host);
+  }
+
+  /**
+   * Reverses the dot-separated labels of {@code host}, e.g. {@code a.b.com} becomes
+   * {@code com.b.a}. A trailing dot on the input is kept at the end of the result.
+   */
+  public static String reverseHost(String host) {
+    String[] hostArgs = host.split("\\.");
+    ArrayUtils.reverse(hostArgs);
+    // join() also handles the empty array that split() yields for a host made only of dots.
+    String reversed = String.join(".", hostArgs);
+    return host.endsWith(".") ? reversed + "." : reversed;
+  }
+
+  public boolean hasIPHost() {
+    return ipHost;
+  }
+
+  public String getHost() {
+    return host;
+  }
+
+  /** Returns the host with its labels reversed; an IP host is returned unchanged. */
+  public String getReverseHost() {
+    if (hasIPHost()) {
+      return host;
+    }
+    return reverseHost(host);
+  }
+
+  public String getPath() {
+    return path;
+  }
+
+  public boolean isSecure() {
+    return secure;
+  }
+
+  public int getPort() {
+    return port;
+  }
+
+  /**
+   * Returns true when the path contains no whitespace and ends in .jpeg, .jpg, .png, .gif or
+   * .bmp, in any letter case.
+   */
+  public boolean isImage() {
+    return IMAGE_PATH.matcher(path).matches();
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder url = new StringBuilder();
+    url.append("http");
+    if (secure) {
+      url.append("s");
+    }
+    url.append("://");
+    url.append(host);
+    if (!(port == 80 && !secure) && !(port == 443 && secure)) {
+      url.append(":");
+      url.append(port);
+    }
+    url.append(path);
+    return url.toString();
+  }
+
+  /**
+   * Returns the URI form described in the class comment. The port is written only when it differs
+   * from the default for the scheme (80 for http, 443 for https).
+   */
+  public String toUri() {
+    String reverseDomain = getReverseDomain();
+    String nonDomain = getReverseHost().substring(reverseDomain.length());
+    String portStr = "";
+    if ((!secure && port != 80) || (secure && port != 443)) {
+      portStr = Integer.toString(port);
+    }
+    return reverseDomain + URI_SEP + nonDomain + URI_SEP + (secure ? "s" : "o") + portStr + URI_SEP
+        + path;
+  }
+
+  /**
+   * Parses the form produced by {@link #toUri()}.
+   *
+   * @throws IllegalArgumentException if the input does not split into four parts, the third part
+   *         does not start with 's' or 'o', or the port that follows that letter is not a number
+   */
+  public static URL fromUri(String uri) {
+    String[] idArgs = uri.split(URI_SEP);
+    if (idArgs.length != 4) {
+      throw new IllegalArgumentException("Page ID has too few or many parts: " + uri);
+    }
+    String domain = idArgs[0];
+    String host = idArgs[0] + idArgs[1];
+    boolean ipHost = isValidIP(host);
+    if (!ipHost) {
+      domain = reverseHost(domain);
+      host = reverseHost(host);
+    }
+    boolean secure = false;
+    int port = 80;
+    if (idArgs[2].startsWith("s")) {
+      secure = true;
+      port = 443;
+    } else if (!idArgs[2].startsWith("o")) {
+      throw new IllegalArgumentException("Page ID does not have port info beg with 's' or 'o': "
+          + uri);
+    }
+    if (idArgs[2].length() > 1) {
+      port = Integer.parseInt(idArgs[2].substring(1));
+    }
+    String path = idArgs[3];
+    return new URL(domain, host, path, port, secure, ipHost);
+  }
+
+  public String getDomain() {
+    return domain;
+  }
+
+  /** Returns the domain with its labels reversed; for an IP host it is returned unchanged. */
+  public String getReverseDomain() {
+    if (hasIPHost()) {
+      return domain;
+    }
+    return reverseHost(domain);
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    // ipHost takes no part in equality or in hashCode().
+    if (o instanceof URL) {
+      URL other = (URL) o;
+      return domain.equals(other.domain) && host.equals(other.host) && path.equals(other.path)
+          && port == other.port && secure == other.secure;
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    int result = domain.hashCode();
+    result = 31 * result + host.hashCode();
+    result = 31 * result + path.hashCode();
+    result = 31 * result + port;
+    result = 31 * result + (secure ? 1 : 0);
+    return result;
+  }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/UriInfo.java b/webindex/modules/core/src/main/java/webindex/core/models/UriInfo.java
new file mode 100644
index 0000000..6cfec28
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/UriInfo.java
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2016 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+import java.io.Serializable;
+import java.util.Optional;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Per-URI counters: how many documents link to the URI and how many documents have it. Used by
+ * URI collision free map.
+ */
+public class UriInfo implements Serializable {
+
+  private static final long serialVersionUID = 1L;
+
+  /** Shared all-zero value; {@link #add(UriInfo)} refuses to be invoked on it. */
+  public static final UriInfo ZERO = new UriInfo(0, 0);
+
+  // the number of documents that link to this URI
+  public long linksTo;
+
+  // the number of documents with this URI. Should be 0 or 1
+  public int docs;
+
+  /** Creates an instance with both counters at zero. */
+  public UriInfo() {}
+
+  public UriInfo(long linksTo, int docs) {
+    this.docs = docs;
+    this.linksTo = linksTo;
+  }
+
+  /**
+   * Adds the counters of {@code other} into this instance.
+   *
+   * @throws IllegalArgumentException if invoked on {@link #ZERO}
+   */
+  public void add(UriInfo other) {
+    Preconditions.checkArgument(this != ZERO);
+    linksTo += other.linksTo;
+    docs += other.docs;
+  }
+
+  @Override
+  public String toString() {
+    return linksTo + " " + docs;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof UriInfo)) {
+      return false;
+    }
+    UriInfo that = (UriInfo) o;
+    return linksTo == that.linksTo && docs == that.docs;
+  }
+
+  @Override
+  public int hashCode() {
+    return docs + (int) linksTo;
+  }
+
+  /** Returns a new instance holding the field-wise sum of {@code u1} and {@code u2}. */
+  public static UriInfo merge(UriInfo u1, UriInfo u2) {
+    return new UriInfo(u1.linksTo + u2.linksTo, u1.docs + u2.docs);
+  }
+
+  /**
+   * Sums all of the given values.
+   *
+   * @return the sum, or an empty Optional when the sum equals {@link #ZERO}
+   */
+  public static Optional<UriInfo> reduce(Iterable<UriInfo> uriInfos) {
+    UriInfo total = new UriInfo();
+    for (UriInfo info : uriInfos) {
+      total.add(info);
+    }
+    return total.equals(ZERO) ? Optional.empty() : Optional.of(total);
+  }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/export/DomainUpdate.java b/webindex/modules/core/src/main/java/webindex/core/models/export/DomainUpdate.java
new file mode 100644
index 0000000..a3732ed
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/export/DomainUpdate.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2016 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models.export;
+
+/**
+ * Index update for a domain: holds the domain together with its old and new page counts.
+ */
+public class DomainUpdate implements IndexUpdate {
+
+ private String domain;
+ private Long oldPageCount;
+ private Long newPageCount;
+
+ public DomainUpdate() {} // For serialization; leaves every field null
+
+ public DomainUpdate(String domain, Long oldPageCount, Long newPageCount) {
+ this.domain = domain;
+ this.oldPageCount = oldPageCount;
+ this.newPageCount = newPageCount;
+ }
+
+ public String getDomain() {
+ return domain;
+ }
+
+ public Long getOldPageCount() {
+ return oldPageCount;
+ }
+
+ public Long getNewPageCount() {
+ return newPageCount;
+ }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/export/IndexUpdate.java b/webindex/modules/core/src/main/java/webindex/core/models/export/IndexUpdate.java
new file mode 100644
index 0000000..cec6e4f
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/export/IndexUpdate.java
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2016 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models.export;
+
+/**
+ * Marker interface (declares no methods) implemented by the index update types
+ */
+public interface IndexUpdate {
+
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/export/PageUpdate.java b/webindex/modules/core/src/main/java/webindex/core/models/export/PageUpdate.java
new file mode 100644
index 0000000..f171ccb
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/export/PageUpdate.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2016 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models.export;
+
+import java.util.List;
+
+import webindex.core.models.Link;
+
+/**
+ * Index update for a page: holds the page URI, a JSON string, and the added and deleted links
+ */
+public class PageUpdate implements IndexUpdate {
+
+ private String uri;
+ private String json; // NOTE(review): presumably the serialized page - confirm against caller
+ private List<Link> addedLinks;
+ private List<Link> deletedLinks;
+
+ public PageUpdate() {} // For serialization; leaves every field null
+
+ public PageUpdate(String uri, String json, List<Link> addedLinks, List<Link> deletedLinks) {
+ this.uri = uri;
+ this.json = json;
+ this.addedLinks = addedLinks; // stored by reference, not copied
+ this.deletedLinks = deletedLinks; // stored by reference, not copied
+ }
+
+ public String getUri() {
+ return uri;
+ }
+
+ public String getJson() {
+ return json;
+ }
+
+ public List<Link> getAddedLinks() {
+ return addedLinks;
+ }
+
+ public List<Link> getDeletedLinks() {
+ return deletedLinks;
+ }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/models/export/UriUpdate.java b/webindex/modules/core/src/main/java/webindex/core/models/export/UriUpdate.java
new file mode 100644
index 0000000..0974c45
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/models/export/UriUpdate.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2016 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models.export;
+
+import webindex.core.models.UriInfo;
+
+/**
+ * Index update for a URI: holds the URI together with its old and new {@link UriInfo}
+ */
+public class UriUpdate implements IndexUpdate {
+
+ private String uri;
+ private UriInfo oldInfo;
+ private UriInfo newInfo;
+
+ public UriUpdate() {} // For serialization; leaves every field null
+
+ public UriUpdate(String uri, UriInfo oldInfo, UriInfo newInfo) {
+ this.uri = uri;
+ this.oldInfo = oldInfo;
+ this.newInfo = newInfo;
+ }
+
+ public String getUri() {
+ return uri;
+ }
+
+ public UriInfo getOldInfo() {
+ return oldInfo;
+ }
+
+ public UriInfo getNewInfo() {
+ return newInfo;
+ }
+}
diff --git a/webindex/modules/core/src/main/java/webindex/core/util/Pager.java b/webindex/modules/core/src/main/java/webindex/core/util/Pager.java
new file mode 100644
index 0000000..27b4cf8
--- /dev/null
+++ b/webindex/modules/core/src/main/java/webindex/core/util/Pager.java
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.util;
+
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Consumer;
+
+import org.apache.accumulo.core.client.Scanner;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+
+/**
+ * Reads a single page of entries from an Accumulo scanner and passes each one to a callback. After
+ * the {@code pageSize} entries of the page, one more entry (when available) is delivered with
+ * {@link PageEntry#isNext()} set, so the caller can see that another page follows. A Pager can be
+ * read only once.
+ */
+public class Pager {
+
+  private Scanner scanner;
+  private int pageSize;
+  private Range pageRange;
+  private Consumer<PageEntry> entryHandler;
+  // Flipped to true by the first read(...) call; any later call is rejected.
+  private AtomicBoolean pageRead = new AtomicBoolean(false);
+
+  /** One scanned key/value pair plus a flag marking the look-ahead entry. */
+  public class PageEntry {
+
+    private Key key;
+    private Value value;
+    private boolean isNext;
+
+    public PageEntry(Key key, Value value, boolean isNext) {
+      this.key = key;
+      this.value = value;
+      this.isNext = isNext;
+    }
+
+    public Key getKey() {
+      return key;
+    }
+
+    public Value getValue() {
+      return value;
+    }
+
+    /** True for the look-ahead entry that follows a full page. */
+    public boolean isNext() {
+      return isNext;
+    }
+  }
+
+  private Pager(Scanner scanner, Range pageRange, int pageSize, Consumer<PageEntry> entryHandler) {
+    this.scanner = scanner;
+    this.pageRange = pageRange;
+    this.pageSize = pageSize;
+    this.entryHandler = entryHandler;
+  }
+
+  /**
+   * Reads one page beginning at {@code startKey} and bounded by the end key of the page range.
+   *
+   * @throws IllegalStateException if this Pager has already been read
+   */
+  public void read(Key startKey) {
+    markRead();
+    scanner.setRange(new Range(startKey, pageRange.getEndKey()));
+    handleStart(scanner.iterator());
+  }
+
+  /**
+   * Reads the page with the given zero-based number by skipping {@code pageNum * pageSize} entries
+   * from the start of the page range. If the range holds no more entries than that, nothing is
+   * passed to the handler.
+   *
+   * @throws IllegalStateException if this Pager has already been read
+   */
+  public void read(int pageNum) {
+    markRead();
+    scanner.setRange(pageRange);
+    Iterator<Map.Entry<Key, Value>> iterator = scanner.iterator();
+    // Multiply as long so that a large page number cannot overflow int and shorten the skip.
+    long toSkip = (long) pageNum * pageSize;
+    for (long skipped = 0; skipped < toSkip && iterator.hasNext(); skipped++) {
+      iterator.next();
+    }
+    handleStart(iterator);
+  }
+
+  // Records that a read has started and rejects every read after the first.
+  private void markRead() {
+    if (pageRead.getAndSet(true)) {
+      throw new IllegalStateException("Pager.read() cannot be called twice");
+    }
+  }
+
+  // Delivers up to pageSize entries plus one look-ahead entry flagged as "next".
+  private void handleStart(Iterator<Map.Entry<Key, Value>> iterator) {
+    long num = 0;
+    while (iterator.hasNext() && (num < (pageSize + 1))) {
+      Map.Entry<Key, Value> entry = iterator.next();
+      entryHandler.accept(new PageEntry(entry.getKey(), entry.getValue(), num == pageSize));
+      num++;
+    }
+  }
+
+  /** Creates a Pager over {@code pageRange} that emits pages of {@code pageSize} entries. */
+  public static Pager build(Scanner scanner, Range pageRange, int pageSize,
+      Consumer<PageEntry> entryHandler) {
+    return new Pager(scanner, pageRange, pageSize, entryHandler);
+  }
+}
diff --git a/webindex/modules/core/src/test/java/webindex/core/WebIndexConfigTest.java b/webindex/modules/core/src/test/java/webindex/core/WebIndexConfigTest.java
new file mode 100644
index 0000000..bb031ab
--- /dev/null
+++ b/webindex/modules/core/src/test/java/webindex/core/WebIndexConfigTest.java
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class WebIndexConfigTest { // checks values loaded from the example webindex.yml
+
+ @Test
+ public void testBasic() throws Exception {
+ WebIndexConfig config = WebIndexConfig.load("../../conf/examples/webindex.yml", false); // path is relative to the working directory
+ Assert.assertEquals("webindex_search", config.accumuloIndexTable);
+ Assert.assertEquals("webindex", config.fluoApp);
+ Assert.assertEquals("/cc/temp", config.hdfsTempDir);
+ }
+}
diff --git a/webindex/modules/core/src/test/java/webindex/core/models/LinkTest.java b/webindex/modules/core/src/test/java/webindex/core/models/LinkTest.java
new file mode 100644
index 0000000..433e383
--- /dev/null
+++ b/webindex/modules/core/src/test/java/webindex/core/models/LinkTest.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2016 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class LinkTest { // exercises Link.of with a URI string and with a URL
+
+ @Test
+ public void testBasic() {
+ Link link1 = Link.of("com.a>>o>/", "anchor text"); // built from the URI form
+ Assert.assertEquals("http://a.com/", link1.getUrl());
+ Assert.assertEquals("com.a>>o>/", link1.getUri());
+ Assert.assertEquals("anchor text", link1.getAnchorText());
+
+ Link link2 = Link.of("com.a>>o>/", "other text"); // same URI, different anchor text
+ Assert.assertEquals(link1, link2); // expected equal although the anchor text differs
+
+ Link link3 = Link.of(URLTest.from("http://a.com"), "more other text"); // built from a parsed URL
+ Assert.assertEquals("com.a>>o>/", link3.getUri());
+ Assert.assertEquals(link1, link3);
+ }
+
+}
diff --git a/webindex/modules/core/src/test/java/webindex/core/models/PageTest.java b/webindex/modules/core/src/test/java/webindex/core/models/PageTest.java
new file mode 100644
index 0000000..3ea7b2b
--- /dev/null
+++ b/webindex/modules/core/src/test/java/webindex/core/models/PageTest.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+import com.google.gson.Gson;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class PageTest { // exercises outbound-link counting and a Gson round trip of Page
+
+ @Test
+ public void testBasic() {
+
+ Page page = new Page(URLTest.from("http://example.com").toUri());
+ Assert.assertEquals("http://example.com/", page.getUrl());
+ Assert.assertEquals("com.example>>o>/", page.getUri());
+ Assert.assertEquals(Long.valueOf(0), page.getNumOutbound());
+ Assert.assertTrue(page.addOutbound(Link.of(URLTest.from("http://test1.com"), "test1")));
+ Assert.assertEquals(Long.valueOf(1), page.getNumOutbound());
+ Assert.assertTrue(page.addOutbound(Link.of(URLTest.from("http://test2.com"), "test2")));
+ Assert.assertEquals(Long.valueOf(2), page.getNumOutbound());
+ Assert.assertFalse(page.addOutbound(Link.of(URLTest.from("http://test2.com"), "test1234"))); // same URL again: expected to be rejected
+ Assert.assertEquals(Long.valueOf(2), page.getNumOutbound()); // count is expected to stay at 2
+
+ Gson gson = new Gson();
+ String json = gson.toJson(page);
+ Assert.assertNotNull(json);
+ Assert.assertFalse(json.isEmpty());
+
+ Page after = gson.fromJson(json, Page.class); // round trip through JSON
+ Assert.assertEquals(page.getUrl(), after.getUrl());
+ Assert.assertEquals(page.getOutboundLinks().size(), after.getOutboundLinks().size());
+ }
+}
diff --git a/webindex/modules/core/src/test/java/webindex/core/models/URLTest.java b/webindex/modules/core/src/test/java/webindex/core/models/URLTest.java
new file mode 100644
index 0000000..1a9abb3
--- /dev/null
+++ b/webindex/modules/core/src/test/java/webindex/core/models/URLTest.java
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2016 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.core.models;
+
+import java.text.ParseException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class URLTest { // tests for URL; its static helpers are also called by LinkTest and PageTest
+
+ public static URL from(String rawUrl) { // shorthand for URL.from(rawUrl)
+ return URL.from(rawUrl);
+ }
+
+ public static String toID(String rawUrl) { // parses rawUrl and returns its URI form
+ return from(rawUrl).toUri();
+ }
+
+
+ public static URL url80(String host, String path) { // non-secure URL on port 80
+ return new URL(URL.domainFromHost(host), host, path, 80, false, URL.isValidIP(host));
+ }
+
+ public static URL url443(String host, String path) { // secure URL on port 443
+ return new URL(URL.domainFromHost(host), host, path, 443, true, URL.isValidIP(host));
+ }
+
+ public static URL urlOpen(String host, String path, int port) { // non-secure URL on the given port
+ return new URL(URL.domainFromHost(host), host, path, port, false, URL.isValidIP(host));
+ }
+
+ public static URL urlSecure(String host, String path, int port) { // secure URL on the given port
+ return new URL(URL.domainFromHost(host), host, path, port, true, URL.isValidIP(host));
+ }
+
+ @Test
+ public void testBasic() throws ParseException {
+
+ String[] validUrls =
+ {"http://ab.com/", "https://ab.com/1/2/3", "https://ab.com:8080?1/2/3",
+ "http://ab.com#1/2/3", "https://ab.com/", "https://h.d.ab.com/1/2/3"};
+
+ for (String rawUrl : validUrls) {
+ Assert.assertTrue(URL.isValid(rawUrl));
+ Assert.assertEquals(rawUrl, from(rawUrl).toString()); // these inputs must round-trip unchanged
+ }
+
+ String[] failureUrls =
+ {"ab.com", "ab.com/1/2/3", "htttp://ab.com/", "httpss://ab.com/", "http:/ab.com/",
+ "http::/ab.com/", "http:///ab.com/", "hhttp://ab.com/", "http://a.com:/test/",
+ "http://a.com:"};
+
+ for (String rawUrl : failureUrls) {
+ Assert.assertFalse(URL.isValid(rawUrl));
+ }
+ }
+
+ @Test
+ public void testClean() { // whitespace is trimmed, scheme and host are lower-cased, path case is kept
+ Assert.assertEquals("http://example.com/", from("Http://example.com ").toString());
+ Assert.assertEquals("https://example.com/", from(" HTTPS://example.com/ ").toString());
+ Assert.assertEquals("http://a.b.com:281/a/b", from("http://A.B.Com:281/a/b").toString());
+ Assert.assertEquals("http://a.b.com:281/A/b", from("http://A.b.Com:281/A/b").toString());
+ Assert.assertEquals("http://a.b.com?A/b/C", from("http://a.B.Com?A/b/C").toString());
+ Assert.assertEquals("http://a.be.com/", from("http://A.Be.COM").toString());
+ }
+
+ @Test
+ public void testPort() { // default ports are dropped from toString(), other ports are kept
+ Assert.assertEquals(80, from("http://www.ab.com:80/").getPort());
+ Assert.assertEquals("www.ab.com", from("http://www.ab.com:80/").getHost());
+ Assert.assertEquals("http://www.ab.com/", from("http://www.ab.com:80/").toString());
+ Assert.assertEquals(443, from("https://ab.com/").getPort());
+ Assert.assertTrue(from("https://ab.com/").isSecure());
+ Assert.assertEquals("www.ab.com", from("https://www.ab.com:443/").getHost());
+ Assert.assertEquals("https://www.ab.com/", from("https://www.ab.com:443/").toString());
+ Assert.assertEquals(8888, from("https://ab.com:8888/").getPort());
+ Assert.assertEquals("www.ab.com", from("https://www.ab.com:8888/").getHost());
+ Assert.assertEquals("http://www.ab.com:8888/", from("http://www.ab.com:8888/").toString());
+ }
+
+ @Test
+ public void testHost() { // host and domain, plain and reversed
+ URL u = from("http://a.b.c.d.com/1/2/3");
+ Assert.assertEquals("a.b.c.d.com", u.getHost());
+ Assert.assertEquals("com.d.c.b.a", u.getReverseHost());
+ Assert.assertEquals("d.com", u.getDomain());
+ Assert.assertEquals("com.d", u.getReverseDomain());
+ }
+
+ @Test
+ public void testAdvanced() { // parsed URLs must equal URLs built directly from their parts
+ Assert.assertEquals(urlOpen("example.com", "?A&B", 83), from("http://EXAMPLE.COM:83?A&B"));
+ Assert.assertEquals(urlOpen("example.com", "#a&b", 83), from("http://example.com:83#a&b"));
+ Assert.assertEquals(url80("a.b.example.com", "/page?1&2"),
+ from("http://a.b.example.com/page?1&2"));
+ Assert.assertEquals(url80("example.com", "/1/2/3?c&d&e"),
+ from("http://example.com/1/2/3?c&d&e"));
+ Assert.assertEquals(url80("a.b.example.com", "/"), from("http://a.b.example.com"));
+ Assert.assertEquals(url443("a.b.example.com", "/"), from("https://A.b.example.com/"));
+ Assert.assertEquals(urlSecure("a.b.example.com", "/", 8329),
+ from("https://a.b.Example.com:8329/"));
+ Assert
+ .assertEquals(urlOpen("a.b.example.com", "/", 8333), from("http://a.B.example.com:8333/"));
+ Assert.assertEquals(url443("example.com", "/"), from("https://example.com/"));
+ Assert.assertEquals(url80("example.com", "/b?1#2&3#4"), from("http://example.com/b?1#2&3#4"));
+ Assert.assertEquals(urlOpen("example.com", "/b", 8080), from("http://example.com:8080/b"));
+ }
+
+ @Test
+ public void testId() { // toUri() output and the fromUri() round trip
+ URL u1 = urlSecure("a.b.c.com", "/", 8329);
+ URL u2 = from("https://a.b.C.com:8329");
+ String r1 = u2.toUri();
+ Assert.assertEquals("com.c>.b.a>s8329>/", r1);
+ URL u3 = URL.fromUri(r1);
+ Assert.assertEquals(u1, u2);
+ Assert.assertEquals(u1, u3);
+ Assert.assertEquals(u2, u3);
+
+ URL u4 = url80("d.com", "/a/b/c");
+ String id4 = u4.toUri();
+ Assert.assertEquals("com.d>>o>/a/b/c", id4);
+ Assert.assertEquals(u4, URL.fromUri(id4));
+
+ URL u5 = from("http://1.2.3.4/a/b/c"); // IP host: the URI form keeps it unreversed
+ String id5 = u5.toUri();
+ Assert.assertEquals("1.2.3.4>>o>/a/b/c", id5);
+ Assert.assertEquals(u5, URL.fromUri(id5));
+
+ Assert.assertEquals("com.b>.a>s80>/", from("https://a.b.com:80").toUri()); // 80 is not the https default, so it is written
+ }
+
+ @Test
+ public void testMore() throws Exception {
+
+ // valid urls
+ Assert.assertTrue(URL.isValid(" \thttp://example.com/ \t\n\r\n"));
+ Assert.assertTrue(URL.isValid("http://1.2.3.4:80/test?a=b&c=d"));
+ Assert.assertTrue(URL.isValid("http://1.2.3.4/"));
+ Assert.assertTrue(URL.isValid("http://a.b.c.d.com/1/2/3/4/5"));
+ Assert.assertTrue(URL.isValid("http://a.b.com:281/1/2"));
+ Assert.assertTrue(URL.isValid("http://A.B.Com:281/a/b"));
+ Assert.assertTrue(URL.isValid("http://A.b.Com:281/A/b"));
+ Assert.assertTrue(URL.isValid("http://a.B.Com?A/b/C"));
+ Assert.assertTrue(URL.isValid("http://A.Be.COM"));
+ Assert.assertTrue(URL.isValid("http://1.2.3.4:281/1/2"));
+
+ // invalid urls
+ Assert.assertFalse(URL.isValid("http://a.com:/test"));
+ Assert.assertFalse(URL.isValid("http://z.com:"));
+ Assert.assertFalse(URL.isValid("http://1.2.3:80/test?a=b&c=d"));
+ Assert.assertFalse(URL.isValid("http://1.2.3/"));
+ Assert.assertFalse(URL.isValid("http://com/"));
+ Assert.assertFalse(URL.isValid("http://a.b.c.com/bad>et"));
+ Assert.assertFalse(URL.isValid("http://test"));
+ Assert.assertFalse(URL.isValid("http://co.uk"));
+ Assert.assertFalse(URL.isValid("http:///example.com/"));
+ Assert.assertFalse(URL.isValid("http:://example.com/"));
+ Assert.assertFalse(URL.isValid("example.com"));
+ Assert.assertFalse(URL.isValid("127.0.0.1"));
+ Assert.assertFalse(URL.isValid("http://ab@example.com"));
+ Assert.assertFalse(URL.isValid("ftp://example.com"));
+
+ Assert.assertEquals("example.com", from("http://example.com:281/1/2").getHost());
+ Assert.assertEquals("a.b.example.com", from("http://a.b.example.com/1/2").getHost());
+ Assert.assertEquals("a.b.example.com", from("http://A.B.Example.Com/1/2").getHost());
+ Assert.assertEquals("1.2.3.4", from("http://1.2.3.4:89/1/2").getHost());
+
+ Assert.assertEquals("/A/b/C", from("http://A.B.Example.Com/A/b/C").getPath());
+ Assert.assertEquals("?D/E/f", from("http://A.B.Example.Com?D/E/f").getPath());
+
+ URL u = from("http://a.b.c.d.com/1/2/3");
+ Assert.assertEquals("a.b.c.d.com", u.getHost());
+ Assert.assertEquals("com.d.c.b.a", u.getReverseHost());
+ Assert.assertEquals("d.com", u.getDomain());
+ Assert.assertEquals("com.d", u.getReverseDomain());
+
+ Assert.assertEquals("com.example", from("http://example.com:281/1").getReverseHost());
+ Assert.assertEquals("com.example.b.a", from("http://a.b.example.com/1/2").getReverseHost());
+ Assert.assertEquals("1.2.3.4", from("http://1.2.3.4:89/1/2").getReverseHost());
+
+ Assert.assertTrue(from("http://a.com/a.jpg").isImage());
+ Assert.assertTrue(from("http://a.com/a.JPEG").isImage());
+ Assert.assertTrue(from("http://a.com/c/b/a.png").isImage());
+
+ Assert.assertEquals("c.com", from("http://a.b.c.com").getDomain());
+ Assert.assertEquals("com.c", from("http://a.b.c.com").getReverseDomain());
+ Assert.assertEquals("c.co.uk", from("http://a.b.c.co.uk").getDomain()); // multi-label public suffix
+ Assert.assertEquals("uk.co.c", from("http://a.b.c.co.uk").getReverseDomain());
+ Assert.assertEquals("d.com.au", from("http://www.d.com.au").getDomain());
+ Assert.assertEquals("au.com.d", from("http://www.d.com.au").getReverseDomain());
+
+ u = from("https://www.d.com.au:9443/a/bc");
+ Assert.assertEquals("au.com.d>.www>s9443>/a/bc", u.toUri());
+ Assert.assertEquals("https://www.d.com.au:9443/a/bc", u.toString());
+ URL u2 = URL.fromUri(u.toUri());
+ Assert.assertEquals("https://www.d.com.au:9443/a/bc", u2.toString());
+ Assert.assertEquals("d.com.au", u2.getDomain());
+ Assert.assertEquals("www.d.com.au", u2.getHost());
+ }
+}
diff --git a/webindex/modules/core/src/test/resources/log4j.properties b/webindex/modules/core/src/test/resources/log4j.properties
new file mode 100644
index 0000000..7add931
--- /dev/null
+++ b/webindex/modules/core/src/test/resources/log4j.properties
@@ -0,0 +1,20 @@
+# Copyright 2014 Webindex authors (see AUTHORS)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=INFO, CA
+log4j.appender.CA=org.apache.log4j.ConsoleAppender
+log4j.appender.CA.layout=org.apache.log4j.PatternLayout
+log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c] %-5p: %m%n
+
+log4j.logger.webindex=WARN
diff --git a/webindex/modules/data/pom.xml b/webindex/modules/data/pom.xml
new file mode 100644
index 0000000..3005c9a
--- /dev/null
+++ b/webindex/modules/data/pom.xml
@@ -0,0 +1,227 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2015 Webindex authors (see AUTHORS)
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-parent</artifactId>
+ <version>0.0.1-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+ <artifactId>webindex-data</artifactId>
+ <name>WebIndex Data</name>
+ <dependencies>
+ <dependency>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>2.4</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.accumulo</groupId>
+ <artifactId>accumulo-core</artifactId>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.zookeeper</groupId>
+ <artifactId>zookeeper</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-accumulo</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-kryo</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-spark</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.netpreserve.commons</groupId>
+ <artifactId>webarchive-commons</artifactId>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>logback-classic</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>logback-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.hadoop.thirdparty.guava</groupId>
+ <artifactId>guava</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </dependency>
+    <!-- Provided scope is used so Hadoop and Spark do not end up in the shaded jar used for Spark. It is assumed the Spark runtime environment will provide these. -->
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <scope>provided</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>javax.servlet</groupId>
+ <artifactId>servlet-api</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-core_2.10</artifactId>
+ <scope>provided</scope>
+ <exclusions>
+ <!-- Excluded as we only want Kryo's asm dependency -->
+ <exclusion>
+ <groupId>asm</groupId>
+ <artifactId>asm</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <!-- Test Dependencies -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-mini</artifactId>
+ <scope>test</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>javax.servlet</groupId>
+ <artifactId>servlet-api</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-test</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ <profiles>
+ <profile>
+ <id>copy-dependencies</id>
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <version>2.10</version>
+ <executions>
+ <execution>
+ <id>copy</id>
+ <goals>
+ <goal>copy-dependencies</goal>
+ </goals>
+ <phase>package</phase>
+ <configuration>
+ <!--define the specific dependencies to copy into the Fluo application dir-->
+ <includeArtifactIds>fluo-recipes-core,fluo-recipes-accumulo,fluo-recipes-kryo,kryo,minlog,reflectasm,asm,objenesis,commons-validator,yamlbeans</includeArtifactIds>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ <profile>
+ <id>create-shade-jar</id>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-shade-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>spark-shade-jar</id>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <phase>package</phase>
+ <configuration>
+ <shadedArtifactAttached>true</shadedArtifactAttached>
+ <shadedClassifierName>shaded</shadedClassifierName>
+ <!-- Relocate Thrift because Accumulo 1.8 uses Thrift 0.9.3 and Spark uses 0.9.1. -->
+ <relocations>
+ <relocation>
+ <pattern>org.apache.thrift</pattern>
+ <shadedPattern>webindex.org.apache.thrift</shadedPattern>
+ </relocation>
+ </relocations>
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*.SF</exclude>
+ <exclude>META-INF/*.DSA</exclude>
+ <exclude>META-INF/*.RSA</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ </profiles>
+</project>
diff --git a/webindex/modules/data/src/main/java/webindex/data/CalcSplits.java b/webindex/modules/data/src/main/java/webindex/data/CalcSplits.java
new file mode 100644
index 0000000..5f76624
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/CalcSplits.java
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data;
+
+import java.util.SortedSet;
+
+import org.apache.fluo.api.data.Bytes;
+import org.apache.fluo.api.data.RowColumn;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.archive.io.ArchiveReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.models.Page;
+import webindex.core.models.UriInfo;
+import webindex.data.spark.IndexEnv;
+import webindex.data.spark.IndexStats;
+import webindex.data.spark.IndexUtil;
+import webindex.data.util.WARCFileInputFormat;
+
+/**
+ * Spark job that builds the Accumulo index RDD from the WARC files in a data directory and prints
+ * the table splits calculated from it to standard output.
+ */
+public class CalcSplits {
+
+  private static final Logger log = LoggerFactory.getLogger(CalcSplits.class);
+
+  /**
+   * Runs the job.
+   *
+   * @param args exactly one element, the data directory; any other length logs a usage message
+   *        and exits with status 1
+   */
+  public static void main(String[] args) {
+    if (args.length != 1) {
+      log.error("Usage: CalcSplits <dataDir>");
+      System.exit(1);
+    }
+    final String dataDir = args[0];
+    IndexEnv.validateDataDir(dataDir);
+
+    try (JavaSparkContext ctx =
+        new JavaSparkContext(new SparkConf().setAppName("webindex-calcsplits"))) {
+
+      IndexStats indexStats = new IndexStats(ctx);
+
+      // Read the archives under dataDir and build pages from them.
+      final JavaPairRDD<Text, ArchiveReader> warcFiles = ctx.newAPIHadoopFile(dataDir,
+          WARCFileInputFormat.class, Text.class, ArchiveReader.class, new Configuration());
+      JavaRDD<Page> pages = IndexUtil.createPages(warcFiles);
+
+      // Build the URI and domain maps, then the index the splits are calculated from.
+      JavaPairRDD<String, UriInfo> uris = IndexUtil.createUriMap(pages);
+      JavaPairRDD<String, Long> domains = IndexUtil.createDomainMap(uris);
+      JavaPairRDD<RowColumn, Bytes> index =
+          IndexUtil.createAccumuloIndex(indexStats, pages, uris, domains);
+
+      SortedSet<Text> splits = IndexUtil.calculateSplits(index, 100);
+      log.info("Accumulo splits:");
+      for (Text split : splits) {
+        System.out.println(split);
+      }
+    }
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/Configure.java b/webindex/modules/data/src/main/java/webindex/data/Configure.java
new file mode 100644
index 0000000..691762f
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/Configure.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.PrintWriter;
+import java.util.Iterator;
+
+import com.google.common.base.Preconditions;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.WebIndexConfig;
+import webindex.data.spark.IndexEnv;
+
+/**
+ * Command-line tool that initializes the Accumulo index table and appends the generated WebIndex
+ * application settings to a Fluo application properties file.
+ */
+public class Configure {
+
+  private static final Logger log = LoggerFactory.getLogger(Configure.class);
+
+  /**
+   * Runs the tool.
+   *
+   * @param args the WebIndex config path followed by the path of an existing Fluo application
+   *        properties file; any other argument count logs a usage message and exits with status 1
+   * @throws Exception if loading configuration, initializing the table or writing the file fails
+   */
+  public static void main(String[] args) throws Exception {
+
+    if (args.length != 2) {
+      log.error("Usage: Configure <webindexConfigPath> <fluoAppProps>");
+      System.exit(1);
+    }
+    WebIndexConfig webIndexConfig = WebIndexConfig.load(args[0]);
+    final String appPropsPath = args[1];
+    final File appPropsFile = new File(appPropsPath);
+    Preconditions.checkArgument(appPropsFile.exists(), "File does not exist: " + appPropsPath);
+
+    // Start from the connection properties, then load the application properties into the config.
+    File connPropsFile = new File(webIndexConfig.getConnPropsPath());
+    FluoConfiguration fluoConfig = new FluoConfiguration(connPropsFile);
+    fluoConfig.load(appPropsFile);
+
+    IndexEnv env = new IndexEnv(webIndexConfig, fluoConfig);
+    env.initAccumuloIndexTable();
+
+    FluoConfiguration appConfig = new FluoConfiguration();
+    env.configureApplication(fluoConfig, appConfig);
+    Iterator<String> keys = appConfig.getKeys();
+
+    // The FileWriter is opened in append mode, so the existing lines of the file are kept.
+    try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(appPropsPath, true)))) {
+      keys.forEachRemaining(key -> out.println(key + " = " + appConfig.getRawString(key)));
+    }
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/Copy.java b/webindex/modules/data/src/main/java/webindex/data/Copy.java
new file mode 100644
index 0000000..4e71908
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/Copy.java
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.URL;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.WebIndexConfig;
+import webindex.data.spark.IndexEnv;
+
+/**
+ * Spark job that downloads a range of the files listed in a paths file from AWS and stores them in
+ * an HDFS directory.
+ */
+public class Copy {
+
+  private static final Logger log = LoggerFactory.getLogger(Copy.class);
+
+  /**
+   * Returns the last segment of a '/'-separated path.
+   *
+   * @param fullPath path to take the file name from
+   * @return the text after the last '/', or {@code fullPath} itself when it contains no '/'
+   */
+  public static String getFilename(String fullPath) {
+    // lastIndexOf returns -1 when no '/' is present, which makes this substring(0).
+    return fullPath.substring(fullPath.lastIndexOf('/') + 1);
+  }
+
+  /**
+   * Streams the content of a URL into a new HDFS file. When the transfer fails, the partially
+   * written file is deleted so that a later run does not mistake it for a completed copy.
+   *
+   * @param fs file system to write to
+   * @param urlToCopy URL to download
+   * @param dfsPath destination file
+   * @throws IOException if the download or the write fails
+   */
+  private static void copyUrl(FileSystem fs, String urlToCopy, Path dfsPath) throws IOException {
+    try (OutputStream out = fs.create(dfsPath);
+        BufferedInputStream in = new BufferedInputStream(new URL(urlToCopy).openStream())) {
+      IOUtils.copy(in, out);
+    } catch (IOException e) {
+      try {
+        fs.delete(dfsPath, false);
+      } catch (IOException deleteFailure) {
+        e.addSuppressed(deleteFailure);
+      }
+      throw e;
+    }
+  }
+
+  /**
+   * Runs the job.
+   *
+   * @param args the paths file, the range of its entries to copy, and the HDFS destination
+   *        directory; any other argument count logs a usage message and exits with status 1
+   * @throws Exception if the job cannot be set up or run
+   */
+  public static void main(String[] args) throws Exception {
+
+    if (args.length != 3) {
+      log.error("Usage: Copy <pathsFile> <range> <dest>");
+      System.exit(1);
+    }
+    final String hadoopConfDir = IndexEnv.getHadoopConfDir();
+    final List<String> copyList = IndexEnv.getPathsRange(args[0], args[1]);
+    if (copyList.isEmpty()) {
+      log.error("No files to copy given {} {}", args[0], args[1]);
+      System.exit(1);
+    }
+
+    WebIndexConfig webIndexConfig = WebIndexConfig.load();
+
+    SparkConf sparkConf = new SparkConf().setAppName("webindex-copy");
+    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
+
+      FileSystem hdfs = FileSystem.get(ctx.hadoopConfiguration());
+      Path destPath = new Path(args[2]);
+      if (!hdfs.exists(destPath)) {
+        hdfs.mkdirs(destPath);
+      }
+
+      log.info("Copying {} files (Range {} of paths file {}) from AWS to HDFS {}", copyList.size(),
+          args[1], args[0], destPath.toString());
+
+      JavaRDD<String> copyRDD = ctx.parallelize(copyList, webIndexConfig.getNumExecutorInstances());
+
+      final String prefix = WebIndexConfig.CC_URL_PREFIX;
+      final String destDir = destPath.toString();
+
+      copyRDD.foreachPartition(iter -> {
+        // The FileSystem is obtained inside the partition function, not captured from the driver.
+        FileSystem fs = IndexEnv.getHDFS(hadoopConfDir);
+        iter.forEachRemaining(ccPath -> {
+          try {
+            Path dfsPath = new Path(destDir + "/" + getFilename(ccPath));
+            if (fs.exists(dfsPath)) {
+              log.error("File {} exists in HDFS and should have been previously filtered",
+                  dfsPath.getName());
+            } else {
+              String urlToCopy = prefix + ccPath;
+              log.info("Starting copy of {} to {}", urlToCopy, destDir);
+              copyUrl(fs, urlToCopy, dfsPath);
+              log.info("Created {}", dfsPath.getName());
+            }
+          } catch (IOException e) {
+            // A failed file is logged and skipped so the remaining files are still copied.
+            log.error("Exception while copying {}", ccPath, e);
+          }
+        });
+      });
+    }
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/FluoApp.java b/webindex/modules/data/src/main/java/webindex/data/FluoApp.java
new file mode 100644
index 0000000..7d1354e
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/FluoApp.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data;
+
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.recipes.accumulo.export.function.AccumuloExporter;
+import org.apache.fluo.recipes.core.data.RowHasher;
+import org.apache.fluo.recipes.core.export.ExportQueue;
+import org.apache.fluo.recipes.kryo.KryoSimplerSerializer;
+import webindex.core.models.export.IndexUpdate;
+import webindex.data.fluo.DomainCombineQ;
+import webindex.data.fluo.PageObserver;
+import webindex.data.fluo.UriCombineQ;
+import webindex.data.fluo.WebindexObservers;
+import webindex.serialization.WebindexKryoFactory;
+
+/**
+ * Static configuration of the WebIndex Fluo application.
+ */
+public class FluoApp {
+
+  public static final String EXPORT_QUEUE_ID = "eq";
+
+  /**
+   * Saves the WebIndex settings to {@code appConfig}: observer provider, Kryo factory, URI and
+   * domain combine queues, export queue, Accumulo exporter and page row hasher.
+   *
+   * @param connectionConfig supplies the Accumulo instance, zookeepers, user and password given
+   *        to the exporter
+   * @param appConfig configuration that every setting is saved to
+   * @param exportTable table name given to the Accumulo exporter
+   * @param numBuckets bucket count passed to the combine queues and the export queue
+   * @param numTablets tablet count passed to the combine queues and the row hasher; also the
+   *        divisor of the export queue's buckets-per-tablet value
+   */
+  public static void configureApplication(FluoConfiguration connectionConfig,
+      FluoConfiguration appConfig, String exportTable, int numBuckets, int numTablets) {
+
+    appConfig.setObserverProvider(WebindexObservers.class);
+    KryoSimplerSerializer.setKryoFactory(appConfig, WebindexKryoFactory.class);
+
+    UriCombineQ.configure(appConfig, numBuckets, numTablets);
+    DomainCombineQ.configure(appConfig, numBuckets, numTablets);
+
+    // bucketsPerTablet is computed with integer division, so any remainder is discarded.
+    ExportQueue.configure(EXPORT_QUEUE_ID)
+        .keyType(String.class)
+        .valueType(IndexUpdate.class)
+        .buckets(numBuckets)
+        .bucketsPerTablet(numBuckets / numTablets)
+        .save(appConfig);
+
+    AccumuloExporter.configure(EXPORT_QUEUE_ID)
+        .instance(connectionConfig.getAccumuloInstance(), connectionConfig.getAccumuloZookeepers())
+        .credentials(connectionConfig.getAccumuloUser(), connectionConfig.getAccumuloPassword())
+        .table(exportTable)
+        .save(appConfig);
+
+    RowHasher.configure(appConfig, PageObserver.getPageRowHasher().getPrefix(), numTablets);
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/Init.java b/webindex/modules/data/src/main/java/webindex/data/Init.java
new file mode 100644
index 0000000..a3c7f4c
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/Init.java
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.archive.io.ArchiveReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.WebIndexConfig;
+import webindex.core.models.Page;
+import webindex.data.spark.IndexEnv;
+import webindex.data.spark.IndexStats;
+import webindex.data.spark.IndexUtil;
+import webindex.data.util.WARCFileInputFormat;
+
+/**
+ * Sets the Fluo table splits and, when a data directory is given, runs a Spark job that
+ * initializes the indexes from the WARC files in that directory.
+ */
+public class Init {
+
+  private static final Logger log = LoggerFactory.getLogger(Init.class);
+
+  /**
+   * Runs the initialization.
+   *
+   * @param args empty, or a single data directory; more than one argument logs a usage message
+   *        and exits with status 1
+   * @throws Exception if configuration loading or index initialization fails
+   */
+  public static void main(String[] args) throws Exception {
+
+    if (args.length > 1) {
+      log.error("Usage: Init [<dataDir>]");
+      System.exit(1);
+    }
+
+    IndexEnv env = new IndexEnv(WebIndexConfig.load());
+    env.setFluoTableSplits();
+    log.info("Initialized Fluo table splits");
+
+    // Without a data directory only the table splits are set.
+    if (args.length != 1) {
+      log.info("An init data dir was not specified");
+      return;
+    }
+
+    final String dataDir = args[0];
+    IndexEnv.validateDataDir(dataDir);
+
+    try (JavaSparkContext ctx =
+        new JavaSparkContext(new SparkConf().setAppName("webindex-init"))) {
+      IndexStats indexStats = new IndexStats(ctx);
+
+      final JavaPairRDD<Text, ArchiveReader> warcFiles = ctx.newAPIHadoopFile(dataDir,
+          WARCFileInputFormat.class, Text.class, ArchiveReader.class, new Configuration());
+      JavaRDD<Page> pages = IndexUtil.createPages(warcFiles);
+
+      env.initializeIndexes(ctx, pages, indexStats);
+      indexStats.print();
+    }
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/LoadHdfs.java b/webindex/modules/data/src/main/java/webindex/data/LoadHdfs.java
new file mode 100644
index 0000000..5f28af7
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/LoadHdfs.java
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.google.common.util.concurrent.RateLimiter;
+import org.apache.fluo.api.client.FluoClient;
+import org.apache.fluo.api.client.FluoFactory;
+import org.apache.fluo.api.client.LoaderExecutor;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.warc.WARCReaderFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.WebIndexConfig;
+import webindex.core.models.Page;
+import webindex.data.fluo.PageLoader;
+import webindex.data.spark.IndexEnv;
+import webindex.data.util.ArchiveUtil;
+
+/**
+ * Spark job that loads the pages of every WARC file under an HDFS directory into Fluo.
+ */
+public class LoadHdfs {
+
+  private static final Logger log = LoggerFactory.getLogger(LoadHdfs.class);
+
+  /**
+   * Loads every page of one WARC file that has at least one outbound link. The input stream and
+   * the archive reader are closed when the file has been processed, whether or not it succeeded.
+   *
+   * @param fs file system that holds the file
+   * @param filePath file to read; nothing is done when it does not exist
+   * @param le executor the page loaders are submitted to
+   * @param rateLimiter limits how fast pages are submitted, or {@code null} for no limit
+   * @throws IOException if the file cannot be opened, read or closed
+   */
+  private static void loadFile(FileSystem fs, Path filePath, LoaderExecutor le,
+      RateLimiter rateLimiter) throws IOException {
+    if (!fs.exists(filePath)) {
+      return;
+    }
+    try (FSDataInputStream fsin = fs.open(filePath);
+        ArchiveReader reader = WARCReaderFactory.get(filePath.getName(), fsin, true)) {
+      for (ArchiveRecord record : reader) {
+        Page page = ArchiveUtil.buildPageIgnoreErrors(record);
+        if (page.getOutboundLinks().size() > 0) {
+          log.info("Loading page {} with {} links", page.getUrl(),
+              page.getOutboundLinks().size());
+          if (rateLimiter != null) {
+            rateLimiter.acquire();
+          }
+          le.execute(PageLoader.updatePage(page));
+        }
+      }
+    }
+  }
+
+  /**
+   * Runs the job.
+   *
+   * @param args exactly one element, the HDFS data directory; any other length logs a usage
+   *        message and exits with status 1
+   * @throws Exception if the directory cannot be listed or the job fails
+   */
+  public static void main(String[] args) throws Exception {
+
+    if (args.length != 1) {
+      log.error("Usage: LoadHdfs <dataDir>");
+      System.exit(1);
+    }
+    final String dataDir = args[0];
+    IndexEnv.validateDataDir(dataDir);
+
+    final String hadoopConfDir = IndexEnv.getHadoopConfDir();
+    final WebIndexConfig webIndexConfig = WebIndexConfig.load();
+    final int rateLimit = webIndexConfig.getLoadRateLimit();
+    final String appName = webIndexConfig.fluoApp;
+
+    List<String> loadPaths = new ArrayList<>();
+    FileSystem hdfs = IndexEnv.getHDFS();
+    RemoteIterator<LocatedFileStatus> listIter = hdfs.listFiles(new Path(dataDir), true);
+    while (listIter.hasNext()) {
+      LocatedFileStatus status = listIter.next();
+      if (status.isFile()) {
+        loadPaths.add(status.getPath().toString());
+      }
+    }
+    // The list size is used as the partition count below, which Spark requires to be positive.
+    if (loadPaths.isEmpty()) {
+      log.error("No files to load found in {}", dataDir);
+      System.exit(1);
+    }
+
+    log.info("Loading {} files into Fluo from {}", loadPaths.size(), dataDir);
+
+    SparkConf sparkConf = new SparkConf().setAppName("webindex-load-hdfs");
+    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
+
+      // One partition per file.
+      JavaRDD<String> paths = ctx.parallelize(loadPaths, loadPaths.size());
+
+      paths.foreachPartition(iter -> {
+        final FluoConfiguration fluoConfig =
+            new FluoConfiguration(new File("fluo-conn.properties"));
+        fluoConfig.setApplicationName(appName);
+        final RateLimiter rateLimiter = rateLimit > 0 ? RateLimiter.create(rateLimit) : null;
+        FileSystem fs = IndexEnv.getHDFS(hadoopConfDir);
+        try (FluoClient client = FluoFactory.newClient(fluoConfig);
+            LoaderExecutor le = client.newLoaderExecutor()) {
+          iter.forEachRemaining(path -> {
+            try {
+              loadFile(fs, new Path(path), le, rateLimiter);
+            } catch (IOException e) {
+              // A failed file is logged and skipped so the remaining files are still loaded.
+              log.error("Exception while processing {}", path, e);
+            }
+          });
+        }
+      });
+    }
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/LoadS3.java b/webindex/modules/data/src/main/java/webindex/data/LoadS3.java
new file mode 100644
index 0000000..d30f68a
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/LoadS3.java
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data;
+
+import java.io.File;
+import java.net.URL;
+import java.util.List;
+
+import com.google.common.util.concurrent.RateLimiter;
+import org.apache.fluo.api.client.FluoClient;
+import org.apache.fluo.api.client.FluoFactory;
+import org.apache.fluo.api.client.LoaderExecutor;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.warc.WARCReaderFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.WebIndexConfig;
+import webindex.core.models.Page;
+import webindex.data.fluo.PageLoader;
+import webindex.data.spark.IndexEnv;
+import webindex.data.util.ArchiveUtil;
+
+/**
+ * Spark job that reads a range of the WARC files listed in a paths file from AWS and loads their
+ * pages into Fluo.
+ */
+public class LoadS3 {
+
+  private static final Logger log = LoggerFactory.getLogger(LoadS3.class);
+
+  /**
+   * Runs the job.
+   *
+   * @param args the paths file followed by the range of its entries to load; any other argument
+   *        count logs a usage message and exits with status 1
+   * @throws Exception if the paths or configuration cannot be read or the job fails
+   */
+  public static void main(String[] args) throws Exception {
+
+    if (args.length != 2) {
+      log.error("Usage: LoadS3 <pathsFile> <range>");
+      System.exit(1);
+    }
+    final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
+    if (loadList.isEmpty()) {
+      log.error("No files to load given {} {}", args[0], args[1]);
+      System.exit(1);
+    }
+
+    final WebIndexConfig webIndexConfig = WebIndexConfig.load();
+
+    final int rateLimit = webIndexConfig.getLoadRateLimit();
+    final String appName = webIndexConfig.fluoApp;
+
+    SparkConf sparkConf = new SparkConf().setAppName("webindex-load-s3");
+    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
+
+      log.info("Loading {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
+          args[0]);
+
+      // One partition per file.
+      JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());
+
+      final String prefix = WebIndexConfig.CC_URL_PREFIX;
+
+      loadRDD.foreachPartition(iter -> {
+        final FluoConfiguration fluoConfig =
+            new FluoConfiguration(new File("fluo-conn.properties"));
+        fluoConfig.setApplicationName(appName);
+        // A non-positive rate limit disables rate limiting.
+        final RateLimiter rateLimiter = rateLimit > 0 ? RateLimiter.create(rateLimit) : null;
+        try (FluoClient client = FluoFactory.newClient(fluoConfig);
+            LoaderExecutor le = client.newLoaderExecutor()) {
+          iter.forEachRemaining(path -> {
+            String urlToCopy = prefix + path;
+            log.info("Loading {} to Fluo", urlToCopy);
+            // try-with-resources closes the reader even when reading a record fails.
+            try (ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0)) {
+              for (ArchiveRecord record : reader) {
+                Page page = ArchiveUtil.buildPageIgnoreErrors(record);
+                if (page.getOutboundLinks().size() > 0) {
+                  log.info("Loading page {} with {} links", page.getUrl(),
+                      page.getOutboundLinks().size());
+                  if (rateLimiter != null) {
+                    rateLimiter.acquire();
+                  }
+                  le.execute(PageLoader.updatePage(page));
+                }
+              }
+            } catch (Exception e) {
+              log.error("Exception while processing {}", path, e);
+            }
+          });
+        }
+      });
+    }
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/TestParser.java b/webindex/modules/data/src/main/java/webindex/data/TestParser.java
new file mode 100644
index 0000000..30fb024
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/TestParser.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data;
+
+import java.net.URL;
+import java.util.List;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.warc.WARCReaderFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.WebIndexConfig;
+import webindex.data.spark.IndexEnv;
+import webindex.data.util.ArchiveUtil;
+
+/**
+ * Spark job that parses a range of the WARC files listed in a paths file from AWS without
+ * storing the result, logging any file that fails.
+ */
+public class TestParser {
+
+  private static final Logger log = LoggerFactory.getLogger(TestParser.class);
+
+  /**
+   * Runs the job.
+   *
+   * @param args the paths file followed by the range of its entries to parse; any other argument
+   *        count logs a usage message and exits with status 1
+   * @throws Exception if the paths or configuration cannot be read or the job fails
+   */
+  public static void main(String[] args) throws Exception {
+
+    if (args.length != 2) {
+      log.error("Usage: TestParser <pathsFile> <range>");
+      System.exit(1);
+    }
+    final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
+    if (loadList.isEmpty()) {
+      log.error("No files to load given {} {}", args[0], args[1]);
+      System.exit(1);
+    }
+
+    WebIndexConfig.load();
+
+    SparkConf sparkConf = new SparkConf().setAppName("webindex-test-parser");
+    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
+
+      log.info("Parsing {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
+          args[0]);
+
+      JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());
+
+      final String prefix = WebIndexConfig.CC_URL_PREFIX;
+
+      loadRDD.foreachPartition(iter -> iter.forEachRemaining(path -> {
+        String urlToCopy = prefix + path;
+        log.info("Parsing {}", urlToCopy);
+        // try-with-resources closes the reader even when parsing a record fails.
+        try (ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0)) {
+          for (ArchiveRecord record : reader) {
+            ArchiveUtil.buildPageIgnoreErrors(record);
+          }
+        } catch (Exception e) {
+          log.error("Exception while processing {}", path, e);
+        }
+      }));
+    }
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/fluo/DomainCombineQ.java b/webindex/modules/data/src/main/java/webindex/data/fluo/DomainCombineQ.java
new file mode 100644
index 0000000..69a8b7f
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/fluo/DomainCombineQ.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.fluo;
+
+import org.apache.fluo.api.client.TransactionBase;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.api.metrics.Meter;
+import org.apache.fluo.api.metrics.MetricsReporter;
+import org.apache.fluo.recipes.core.combine.ChangeObserver;
+import org.apache.fluo.recipes.core.combine.CombineQueue;
+import org.apache.fluo.recipes.core.export.ExportQueue;
+import webindex.core.models.export.DomainUpdate;
+import webindex.core.models.export.IndexUpdate;
+
+public class DomainCombineQ {
+
+  public static final String DOMAIN_COMBINE_Q_ID = "dm";
+
+  /**
+   * Change observer of the domain combine queue: meters each change and queues it for export.
+   */
+  public static class DomainUpdateObserver implements ChangeObserver<String, Long> {
+
+    private ExportQueue<String, IndexUpdate> exportQ;
+    private Meter domainsNew;
+    private Meter domainsChanged;
+
+    DomainUpdateObserver(ExportQueue<String, IndexUpdate> exportQ, MetricsReporter reporter) {
+      this.exportQ = exportQ;
+      this.domainsNew = reporter.meter("webindex_domains_new");
+      this.domainsChanged = reporter.meter("webindex_domains_changed");
+    }
+
+    @Override
+    public void process(TransactionBase tx, Iterable<Change<String, Long>> updates) {
+      updates.forEach(change -> {
+        final String domain = change.getKey();
+        // An absent old or new value is treated as a count of zero.
+        final Long before = change.getOldValue().orElse(0L);
+        final Long after = change.getNewValue().orElse(0L);
+        if (before == 0L && after > 0L) {
+          domainsNew.mark();
+        }
+        exportQ.add(tx, domain, new DomainUpdate(domain, before, after));
+        domainsChanged.mark();
+      });
+    }
+  }
+
+  /** Saves the settings of the domain combine queue into the given Fluo configuration. */
+  public static void configure(FluoConfiguration config, int numBuckets, int numTablets) {
+    final int bucketsPerTablet = numBuckets / numTablets; // integer division (truncates)
+    CombineQueue.configure(DOMAIN_COMBINE_Q_ID).keyType(String.class).valueType(Long.class)
+        .buckets(numBuckets).bucketsPerTablet(bucketsPerTablet).save(config);
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/fluo/IndexUpdateTranslator.java b/webindex/modules/data/src/main/java/webindex/data/fluo/IndexUpdateTranslator.java
new file mode 100644
index 0000000..be820d5
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/fluo/IndexUpdateTranslator.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.fluo;
+
+import java.util.function.Consumer;
+
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.fluo.api.metrics.Meter;
+import org.apache.fluo.api.metrics.MetricsReporter;
+import org.apache.fluo.recipes.accumulo.export.function.AccumuloTranslator;
+import org.apache.fluo.recipes.core.export.SequencedExport;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.IndexClient;
+import webindex.core.models.export.DomainUpdate;
+import webindex.core.models.export.IndexUpdate;
+import webindex.core.models.export.PageUpdate;
+import webindex.core.models.export.UriUpdate;
+
+public class IndexUpdateTranslator implements AccumuloTranslator<String, IndexUpdate> {
+  private static final Logger log = LoggerFactory.getLogger(IndexUpdateTranslator.class);
+
+  private Meter pagesExported;
+  private Meter linksExported;
+  private Meter domainsExported;
+
+  public IndexUpdateTranslator(MetricsReporter reporter) {
+    this.pagesExported = reporter.meter("webindex_pages_exported");
+    this.linksExported = reporter.meter("webindex_links_exported");
+    this.domainsExported = reporter.meter("webindex_domains_exported");
+  }
+
+  /** Meters the export by its update type and hands consumer to the matching generator. */
+  @Override
+  public void translate(SequencedExport<String, IndexUpdate> export, Consumer<Mutation> consumer) {
+    final IndexUpdate update = export.getValue();
+    if (update instanceof DomainUpdate) {
+      domainsExported.mark();
+      IndexClient.genDomainMutations((DomainUpdate) update, export.getSequence(), consumer);
+    } else if (update instanceof PageUpdate) {
+      pagesExported.mark();
+      IndexClient.genPageMutations((PageUpdate) update, export.getSequence(), consumer);
+    } else if (update instanceof UriUpdate) {
+      linksExported.mark();
+      IndexClient.genUriMutations((UriUpdate) update, export.getSequence(), consumer);
+    } else {
+      // Not one of the three known update types: log and fail instead of dropping the export.
+      String msg = "An object with an IndexUpdate class (" + update.getClass().toString()
+          + ") was placed on the export queue";
+      log.error(msg);
+      throw new IllegalStateException(msg);
+    }
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/fluo/PageLoader.java b/webindex/modules/data/src/main/java/webindex/data/fluo/PageLoader.java
new file mode 100644
index 0000000..deb01e7
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/fluo/PageLoader.java
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.fluo;
+
+import java.net.MalformedURLException;
+import java.util.Objects;
+
+import com.google.common.base.Preconditions;
+import com.google.gson.Gson;
+import org.apache.fluo.api.client.Loader;
+import org.apache.fluo.api.client.TransactionBase;
+import org.apache.fluo.recipes.core.data.RowHasher;
+import org.apache.fluo.recipes.core.types.TypedTransactionBase;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.Constants;
+import webindex.core.models.Page;
+import webindex.core.models.URL;
+
+public class PageLoader implements Loader {
+
+  private static final Logger log = LoggerFactory.getLogger(PageLoader.class);
+  private Action action;
+  private Page page;
+  private URL delUrl;
+
+  private PageLoader() {}
+
+  /** Returns a loader that writes the given page; an empty page is rejected. */
+  public static PageLoader updatePage(Page page) {
+    Preconditions.checkArgument(!page.isEmpty(), "Page cannot be empty");
+    final PageLoader loader = new PageLoader();
+    loader.page = page;
+    loader.action = Action.UPDATE;
+    return loader;
+  }
+
+  /** Returns a loader that writes the delete marker for the given url; null is rejected. */
+  public static PageLoader deletePage(URL url) throws MalformedURLException {
+    final PageLoader loader = new PageLoader();
+    loader.delUrl = Objects.requireNonNull(url, "Url cannot be null");
+    loader.action = Action.DELETE;
+    return loader;
+  }
+
+  /** Sets the new-page column of the hashed page row to the page JSON or the delete marker. */
+  @Override
+  public void load(TransactionBase tx, Context context) throws Exception {
+    final TypedTransactionBase ttx = Constants.TYPEL.wrap(tx);
+    final RowHasher hasher = PageObserver.getPageRowHasher();
+
+    switch (action) {
+      case DELETE:
+        ttx.mutate().row(hasher.addHash(delUrl.toUri())).col(Constants.PAGE_NEW_COL)
+            .set(Page.DELETE_JSON);
+        break;
+      case UPDATE:
+        // Gson is only needed on this path, so it is created here.
+        final String json = new Gson().toJson(page);
+        ttx.mutate().row(hasher.addHash(page.getUri())).col(Constants.PAGE_NEW_COL).set(json);
+        break;
+      default:
+        log.error("PageUpdate called with no action");
+    }
+  }
+
+  private enum Action {
+    UPDATE, DELETE,
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/fluo/PageObserver.java b/webindex/modules/data/src/main/java/webindex/data/fluo/PageObserver.java
new file mode 100644
index 0000000..c3cff55
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/fluo/PageObserver.java
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.fluo;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Sets;
+import com.google.gson.Gson;
+import org.apache.fluo.api.client.TransactionBase;
+import org.apache.fluo.api.data.Bytes;
+import org.apache.fluo.api.data.Column;
+import org.apache.fluo.api.metrics.Meter;
+import org.apache.fluo.api.metrics.MetricsReporter;
+import org.apache.fluo.api.observer.Observer;
+import org.apache.fluo.recipes.core.combine.CombineQueue;
+import org.apache.fluo.recipes.core.data.RowHasher;
+import org.apache.fluo.recipes.core.export.ExportQueue;
+import org.apache.fluo.recipes.core.types.TypedSnapshotBase.Value;
+import org.apache.fluo.recipes.core.types.TypedTransactionBase;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.Constants;
+import webindex.core.models.Link;
+import webindex.core.models.Page;
+import webindex.core.models.UriInfo;
+import webindex.core.models.export.IndexUpdate;
+import webindex.core.models.export.PageUpdate;
+/** Observer that applies the page JSON written to the new-page column of a page row. */
+public class PageObserver implements Observer {
+
+ private static final Logger log = LoggerFactory.getLogger(PageObserver.class);
+ private static final Gson gson = new Gson();
+
+ private CombineQueue<String, UriInfo> uriQ;
+ private ExportQueue<String, IndexUpdate> exportQ;
+ private Meter pagesIngested;
+ private Meter linksIngested;
+ private Meter pagesChanged;
+
+ private static final RowHasher PAGE_ROW_HASHER = new RowHasher("p");
+ /** Returns the shared RowHasher (prefix "p") used to hash and unhash page rows. */
+ public static RowHasher getPageRowHasher() {
+ return PAGE_ROW_HASHER;
+ }
+
+ PageObserver(CombineQueue<String, UriInfo> uriQ, ExportQueue<String, IndexUpdate> exportQ,
+ MetricsReporter reporter) {
+ this.uriQ = uriQ;
+ this.exportQ = exportQ;
+ pagesIngested = reporter.meter("webindex_pages_ingested");
+ linksIngested = reporter.meter("webindex_links_ingested");
+ pagesChanged = reporter.meter("webindex_pages_changed");
+
+ }
+ /** Applies the pending page of row: updates the current page, URI deltas and the export. */
+ @Override
+ public void process(TransactionBase tx, Bytes row, Column col) throws Exception {
+
+ TypedTransactionBase ttx = Constants.TYPEL.wrap(tx);
+ // Read the pending (new) and the current page JSON of the row in one call.
+ Map<Column, Value> pages =
+ ttx.get().row(row).columns(Constants.PAGE_NEW_COL, Constants.PAGE_CUR_COL);
+
+ String nextJson = pages.get(Constants.PAGE_NEW_COL).toString("");
+ if (nextJson.isEmpty()) {
+ log.error("An empty page was set at row {} col {}", row.toString(), col.toString());
+ return;
+ }
+ // NOTE(review): a missing current page is parsed from ""; presumably that is an empty page.
+ Page curPage = Page.fromJson(gson, pages.get(Constants.PAGE_CUR_COL).toString(""));
+ Set<Link> curLinks = curPage.getOutboundLinks();
+
+ Map<String, UriInfo> updates = new HashMap<>();
+ String pageUri = getPageRowHasher().removeHash(row).toString();
+
+ Page nextPage = Page.fromJson(gson, nextJson);
+ if (nextPage.isDelete()) {
+ ttx.mutate().row(row).col(Constants.PAGE_CUR_COL).delete();
+ updates.put(pageUri, new UriInfo(0, -1));
+ } else {
+ ttx.mutate().row(row).col(Constants.PAGE_CUR_COL).set(nextJson);
+ if (curPage.isEmpty()) {
+ updates.put(pageUri, new UriInfo(0, 1));
+ }
+ pagesIngested.mark();
+ }
+ // NOTE(review): the put() calls below replace the page's own entry if a link URI equals pageUri.
+ Set<Link> nextLinks = nextPage.getOutboundLinks();
+ // Links only in the new page are queued as UriInfo(1, 0), links only in the old as (-1, 0).
+ List<Link> addLinks = new ArrayList<>(Sets.difference(nextLinks, curLinks));
+ for (Link link : addLinks) {
+ updates.put(link.getUri(), new UriInfo(1, 0));
+ }
+ linksIngested.mark(addLinks.size());
+
+ List<Link> delLinks = new ArrayList<>(Sets.difference(curLinks, nextLinks));
+ for (Link link : delLinks) {
+ updates.put(link.getUri(), new UriInfo(-1, 0));
+ }
+
+ uriQ.addAll(tx, updates);
+
+ exportQ.add(tx, pageUri, new PageUpdate(pageUri, nextJson, addLinks, delLinks));
+ pagesChanged.mark();
+
+ // clean up: remove the pending page now that it has been applied
+ ttx.mutate().row(row).col(Constants.PAGE_NEW_COL).delete();
+ }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/fluo/UriCombineQ.java b/webindex/modules/data/src/main/java/webindex/data/fluo/UriCombineQ.java
new file mode 100644
index 0000000..cf0ef2f
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/fluo/UriCombineQ.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.fluo;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.fluo.api.client.TransactionBase;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.api.metrics.Meter;
+import org.apache.fluo.api.metrics.MetricsReporter;
+import org.apache.fluo.recipes.core.combine.ChangeObserver;
+import org.apache.fluo.recipes.core.combine.CombineQueue;
+import org.apache.fluo.recipes.core.export.ExportQueue;
+import webindex.core.models.URL;
+import webindex.core.models.UriInfo;
+import webindex.core.models.export.IndexUpdate;
+import webindex.core.models.export.UriUpdate;
+
+/**
+ * Groups the identifier, change observer and configuration helper of the CombineQueue that
+ * accumulates a {@link UriInfo} for each URI.
+ */
+public class UriCombineQ {
+
+  public static final String URI_COMBINE_Q_ID = "um";
+
+  /**
+   * Exports every URI change and turns URIs appearing or vanishing into per-domain deltas.
+   */
+  public static class UriUpdateObserver implements ChangeObserver<String, UriInfo> {
+
+    private ExportQueue<String, IndexUpdate> exportQ;
+    private CombineQueue<String, Long> domainQ;
+    private Meter linksNew;
+    private Meter linksChanged;
+
+    public UriUpdateObserver(ExportQueue<String, IndexUpdate> exportQ,
+        CombineQueue<String, Long> domainMap, MetricsReporter metricsReporter) {
+      this.exportQ = exportQ;
+      this.domainQ = domainMap;
+      this.linksNew = metricsReporter.meter("webindex_links_new");
+      this.linksChanged = metricsReporter.meter("webindex_links_changed");
+    }
+
+    @Override
+    public void process(TransactionBase tx, Iterable<Change<String, UriInfo>> updates) {
+      // Net number of URIs that appeared (+1) or vanished (-1), keyed by reversed domain.
+      final Map<String, Long> deltas = new HashMap<>();
+      for (Change<String, UriInfo> change : updates) {
+        final String uri = change.getKey();
+        final UriInfo before = change.getOldValue().orElse(UriInfo.ZERO);
+        final UriInfo after = change.getNewValue().orElse(UriInfo.ZERO);
+
+        exportQ.add(tx, uri, new UriUpdate(uri, before, after));
+        linksChanged.mark();
+
+        final String domain = URL.fromUri(uri).getReverseDomain();
+        final boolean wasZero = before.equals(UriInfo.ZERO);
+        final boolean isZero = after.equals(UriInfo.ZERO);
+        if (wasZero && !isZero) {
+          deltas.merge(domain, 1L, Long::sum);
+          linksNew.mark();
+        } else if (isZero && !wasZero) {
+          deltas.merge(domain, -1L, Long::sum);
+        }
+      }
+
+      domainQ.addAll(tx, deltas);
+    }
+  }
+
+  /** Saves the settings of the URI combine queue into the given Fluo configuration. */
+  public static void configure(FluoConfiguration config, int numBuckets, int numTablets) {
+    final int bucketsPerTablet = numBuckets / numTablets; // integer division (truncates)
+    CombineQueue.configure(URI_COMBINE_Q_ID).keyType(String.class).valueType(UriInfo.class)
+        .buckets(numBuckets).bucketsPerTablet(bucketsPerTablet).save(config);
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/fluo/WebindexObservers.java b/webindex/modules/data/src/main/java/webindex/data/fluo/WebindexObservers.java
new file mode 100644
index 0000000..fa8ed79
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/fluo/WebindexObservers.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.fluo;
+
+import org.apache.fluo.api.config.SimpleConfiguration;
+import org.apache.fluo.api.metrics.MetricsReporter;
+import org.apache.fluo.api.observer.ObserverProvider;
+import org.apache.fluo.api.observer.Observer.NotificationType;
+import org.apache.fluo.recipes.accumulo.export.function.AccumuloExporter;
+import org.apache.fluo.recipes.core.combine.CombineQueue;
+import org.apache.fluo.recipes.core.combine.SummingCombiner;
+import org.apache.fluo.recipes.core.export.ExportQueue;
+
+import webindex.core.Constants;
+import webindex.core.models.UriInfo;
+import webindex.core.models.export.IndexUpdate;
+import webindex.data.FluoApp;
+
+/**
+ * Registers every observer that the webindex application needs.
+ */
+public class WebindexObservers implements ObserverProvider {
+
+  @Override
+  public void provide(Registry obsRegistry, Context ctx) {
+    final SimpleConfiguration appConfig = ctx.getAppConfiguration();
+    final MetricsReporter metrics = ctx.getMetricsReporter();
+
+    // Export queue that carries all updates to the query table.
+    final ExportQueue<String, IndexUpdate> exports =
+        ExportQueue.getInstance(FluoApp.EXPORT_QUEUE_ID, appConfig);
+
+    // Combine queue that tracks the number of pages linking to a URI.
+    final CombineQueue<String, UriInfo> uris =
+        CombineQueue.getInstance(UriCombineQ.URI_COMBINE_Q_ID, appConfig);
+
+    // Combine queue that tracks the number of unique URIs observed per domain.
+    final CombineQueue<String, Long> domains =
+        CombineQueue.getInstance(DomainCombineQ.DOMAIN_COMBINE_Q_ID, appConfig);
+
+    // Page content changes: strong notifications on the new-page column.
+    obsRegistry.forColumn(Constants.PAGE_NEW_COL, NotificationType.STRONG).withId("PageObserver")
+        .useObserver(new PageObserver(uris, exports, metrics));
+
+    // Queued export data: processed by an Accumulo exporter using the index translator.
+    exports.registerObserver(obsRegistry, new AccumuloExporter<>(FluoApp.EXPORT_QUEUE_ID,
+        appConfig, new IndexUpdateTranslator(metrics)));
+
+    // URI map updates: combined with UriInfo::reduce, then observed.
+    uris.registerObserver(obsRegistry, UriInfo::reduce,
+        new UriCombineQ.UriUpdateObserver(exports, domains, metrics));
+
+    // Domain map updates: summed, then observed.
+    domains.registerObserver(obsRegistry, new SummingCombiner<>(),
+        new DomainCombineQ.DomainUpdateObserver(exports, metrics));
+  }
+
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/spark/IndexEnv.java b/webindex/modules/data/src/main/java/webindex/data/spark/IndexEnv.java
new file mode 100644
index 0000000..066659b
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/spark/IndexEnv.java
@@ -0,0 +1,273 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.spark;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import com.google.common.base.Preconditions;
+import org.apache.accumulo.core.client.AccumuloException;
+import org.apache.accumulo.core.client.AccumuloSecurityException;
+import org.apache.accumulo.core.client.Connector;
+import org.apache.accumulo.core.client.TableExistsException;
+import org.apache.accumulo.core.client.TableNotFoundException;
+import org.apache.fluo.api.client.FluoAdmin;
+import org.apache.fluo.api.client.FluoFactory;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.api.data.Bytes;
+import org.apache.fluo.api.data.RowColumn;
+import org.apache.fluo.core.util.AccumuloUtil;
+import org.apache.fluo.recipes.accumulo.ops.TableOperations;
+import org.apache.fluo.recipes.spark.FluoSparkHelper;
+import org.apache.fluo.recipes.spark.FluoSparkHelper.BulkImportOptions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.WebIndexConfig;
+import webindex.core.models.Page;
+import webindex.core.models.UriInfo;
+import webindex.data.FluoApp;
+
+public class IndexEnv {
+
+ private static final Logger log = LoggerFactory.getLogger(IndexEnv.class);
+
+ private final String accumuloTable;
+ private Connector conn;
+ private FluoConfiguration fluoConfig;
+ private Path accumuloTempDir;
+ private Path fluoTempDir;
+ private int numTablets;
+ private int numBuckets;
+ /** Builds the environment from webindex settings, loading the Fluo configuration itself. */
+ public IndexEnv(WebIndexConfig webIndexConfig) {
+ this(webIndexConfig, getFluoConfig(webIndexConfig));
+ }
+ /** Builds the environment from webindex settings and an already loaded Fluo configuration. */
+ public IndexEnv(WebIndexConfig webIndexConfig, FluoConfiguration fluoConfig) {
+ this(fluoConfig, webIndexConfig.accumuloIndexTable, webIndexConfig.hdfsTempDir,
+ webIndexConfig.numBuckets, webIndexConfig.numTablets);
+ }
+ /** Stores the settings, gets an Accumulo connector and derives both temp dirs. */
+ public IndexEnv(FluoConfiguration fluoConfig, String accumuloTable, String hdfsTempDir,
+ int numBuckets, int numTablets) {
+ this.fluoConfig = fluoConfig;
+ this.accumuloTable = accumuloTable;
+ this.numBuckets = numBuckets;
+ this.numTablets = numTablets;
+ conn = AccumuloUtil.getConnector(fluoConfig);
+ fluoTempDir = new Path(hdfsTempDir + "/fluo");
+ accumuloTempDir = new Path(hdfsTempDir + "/accumulo");
+ }
+
+ public static String getHadoopConfDir() {
+ final String hadoopConfDir = System.getenv("HADOOP_CONF_DIR");
+ if (hadoopConfDir == null) {
+ log.error("HADOOP_CONF_DIR must be set in environment!");
+ System.exit(1);
+ }
+ if (!(new File(hadoopConfDir).exists())) {
+ log.error("Directory set by HADOOP_CONF_DIR={} does not exist", hadoopConfDir);
+ System.exit(1);
+ }
+ return hadoopConfDir;
+ }
+
+ private static FluoConfiguration getFluoConfig(WebIndexConfig webIndexConfig) {
+ File connPropsFile = new File(webIndexConfig.getConnPropsPath());
+ Preconditions.checkArgument(connPropsFile.exists(),
+ "fluoPropsPath must be set in webindex.yml and exist");
+ FluoConfiguration fluoConfig = new FluoConfiguration(connPropsFile);
+ Preconditions.checkArgument(!webIndexConfig.fluoApp.isEmpty(), "app name is empty");
+ fluoConfig.setApplicationName(webIndexConfig.fluoApp);
+ try (FluoAdmin admin = FluoFactory.newAdmin(fluoConfig)) {
+ for (Map.Entry<String, String> entry : admin.getApplicationConfig().toMap().entrySet()) {
+ fluoConfig.setProperty(entry.getKey(), entry.getValue());
+ }
+ }
+ return fluoConfig;
+ }
+ /** Returns the Fluo configuration held by this environment. */
+ public FluoConfiguration getFluoConfig() {
+ return fluoConfig;
+ }
+
+ private static SortedSet<Text> getSplits(String filename) {
+ SortedSet<Text> splits = new TreeSet<>();
+ InputStream is = IndexEnv.class.getClassLoader().getResourceAsStream("splits/" + filename);
+ try {
+ try (BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
+ String line;
+ while ((line = br.readLine()) != null) {
+ splits.add(new Text(line));
+ }
+ }
+ } catch (IOException e) {
+ log.error("Failed to read splits/accumulo-default.txt resource", e);
+ System.exit(-1);
+ }
+ return splits;
+ }
+ /** Returns the splits read from the bundled resource splits/accumulo-default.txt. */
+ public static SortedSet<Text> getAccumuloDefaultSplits() {
+ return getSplits("accumulo-default.txt");
+ }
+  /** Returns the file system for the Hadoop configuration found in $HADOOP_CONF_DIR. */
+  public static FileSystem getHDFS() throws IOException {
+    return getHDFS(getHadoopConfDir());
+  }
+  /** Loads core-site.xml and hdfs-site.xml from hadoopConfDir and returns the file system. */
+  public static FileSystem getHDFS(String hadoopConfDir) throws IOException {
+    Configuration config = new Configuration();
+    config.addResource(new Path(hadoopConfDir, "core-site.xml")); // a Path is read from disk
+    config.addResource(new Path(hadoopConfDir, "hdfs-site.xml")); // a String is a classpath name
+    return FileSystem.get(config);
+  }
+  /** Exits with status -1 unless dataDir exists in HDFS and contains a file at any depth. */
+  public static void validateDataDir(String dataDir) {
+    try {
+      final FileSystem fs = getHDFS();
+      final Path dir = new Path(dataDir);
+      if (!fs.exists(dir)) {
+        log.error("HDFS data directory {} does not exist", dataDir);
+        System.exit(-1);
+      }
+      final RemoteIterator<LocatedFileStatus> files = fs.listFiles(dir, true);
+      while (files.hasNext()) {
+        if (files.next().isFile()) {
+          return;
+        }
+      }
+      log.error("HDFS data directory {} has no files", dataDir);
+      System.exit(-1);
+    } catch (IOException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+ /** Drops the index table if it exists, then recreates it and adds the default splits. */
+ public void initAccumuloIndexTable() {
+ if (conn.tableOperations().exists(accumuloTable)) {
+ try {
+ conn.tableOperations().delete(accumuloTable);
+ } catch (TableNotFoundException | AccumuloSecurityException | AccumuloException e) {
+ throw new IllegalStateException("Failed to delete Accumulo table " + accumuloTable, e);
+ }
+ }
+ try {
+ conn.tableOperations().create(accumuloTable);
+ } catch (AccumuloException | AccumuloSecurityException | TableExistsException e) {
+ throw new IllegalStateException("Failed to create Accumulo table " + accumuloTable, e);
+ }
+ // Pre-split the fresh table with the split points bundled as a classpath resource.
+ try {
+ conn.tableOperations().addSplits(accumuloTable, IndexEnv.getAccumuloDefaultSplits());
+ } catch (AccumuloException | AccumuloSecurityException | TableNotFoundException e) {
+ throw new IllegalStateException("Failed to add splits to Accumulo table " + accumuloTable, e);
+ }
+ }
+ /** Runs TableOperations.optimizeTable with this environment's Fluo configuration. */
+ public void setFluoTableSplits() {
+ final String table = fluoConfig.getAccumuloTable(); // only used in the error message
+ try {
+ TableOperations.optimizeTable(getFluoConfig());
+ } catch (Exception e) {
+ throw new IllegalStateException("Failed to add splits to Fluo's Accumulo table " + table, e);
+ }
+ }
+ /** Delegates to FluoApp.configureApplication with this environment's table and sizing. */
+ public void configureApplication(FluoConfiguration connectionConfig, FluoConfiguration appConfig) {
+ FluoApp
+ .configureApplication(connectionConfig, appConfig, accumuloTable, numBuckets, numTablets);
+ }
+ /** Builds the Accumulo and Fluo index data from pages and bulk imports both. */
+ public void initializeIndexes(JavaSparkContext ctx, JavaRDD<Page> pages, IndexStats stats)
+ throws Exception {
+ // Derive the URI map from the pages, then the domain map from the URI map.
+ JavaPairRDD<String, UriInfo> uriMap = IndexUtil.createUriMap(pages);
+ JavaPairRDD<String, Long> domainMap = IndexUtil.createDomainMap(uriMap);
+
+ // Create the Accumulo index from pages RDD
+ JavaPairRDD<RowColumn, Bytes> accumuloIndex =
+ IndexUtil.createAccumuloIndex(stats, pages, uriMap, domainMap);
+
+ // Create the Fluo table data from the same pages and maps (accumuloIndex is not an input)
+ JavaPairRDD<RowColumn, Bytes> fluoIndex =
+ IndexUtil.createFluoTable(pages, uriMap, domainMap, numBuckets);
+
+ // Load the indexes into Fluo and Accumulo
+ saveRowColBytesToFluo(ctx, fluoIndex);
+ saveRowColBytesToAccumulo(ctx, accumuloIndex);
+ }
+ /** Bulk imports data into Fluo through a FluoSparkHelper created with fluoTempDir. */
+ public void saveRowColBytesToFluo(JavaSparkContext ctx, JavaPairRDD<RowColumn, Bytes> data)
+ throws Exception {
+ new FluoSparkHelper(fluoConfig, ctx.hadoopConfiguration(), fluoTempDir).bulkImportRcvToFluo(
+ data, new BulkImportOptions().setAccumuloConnector(conn));
+ }
+ /** Bulk imports data into the index table through a helper created with accumuloTempDir. */
+ public void saveRowColBytesToAccumulo(JavaSparkContext ctx, JavaPairRDD<RowColumn, Bytes> data)
+ throws Exception {
+ new FluoSparkHelper(fluoConfig, ctx.hadoopConfiguration(), accumuloTempDir)
+ .bulkImportRcvToAccumulo(data, accumuloTable,
+ new BulkImportOptions().setAccumuloConnector(conn));
+ }
+  /** Returns lines start..end (0-based, inclusive) of file ccPaths; range is "start-end". */
+  public static List<String> getPathsRange(String ccPaths, String range) {
+    if (!(new File(ccPaths).exists())) {
+      log.error("CC paths file {} does not exist", ccPaths);
+      System.exit(1);
+    }
+    int start = 0;
+    int end = 0;
+    try {
+      start = Integer.parseInt(range.split("-")[0]);
+      end = Integer.parseInt(range.split("-")[1]);
+    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) { // bad or missing bound
+      log.error("Invalid range: {}", range);
+      System.exit(1);
+    }
+    if (start > end) {
+      log.error("Invalid range: {}", range);
+      System.exit(1);
+    }
+    try (Stream<String> lines = Files.lines(Paths.get(ccPaths))) {
+      return lines.skip(start).limit(end - start + 1).collect(Collectors.toList());
+    } catch (IOException e) {
+      log.error("Failed to read CC paths file {}", ccPaths, e);
+      System.exit(1);
+    }
+    return Collections.emptyList();
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/spark/IndexStats.java b/webindex/modules/data/src/main/java/webindex/data/spark/IndexStats.java
new file mode 100644
index 0000000..0f73795
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/spark/IndexStats.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.spark;
+
+import java.io.Serializable;
+
+import org.apache.spark.Accumulator;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+/** Spark accumulators that count the pages, empty pages and external links seen so far. */
+public class IndexStats implements Serializable {
+
+  private static final long serialVersionUID = 1L;
+  // Named after this class so that the output of print() is attributed to IndexStats.
+  private static final Logger log = LoggerFactory.getLogger(IndexStats.class);
+
+  private Accumulator<Integer> numPages;
+  private Accumulator<Integer> numEmpty;
+  private Accumulator<Integer> numExternalLinks;
+
+  public IndexStats(JavaSparkContext ctx) {
+    numPages = ctx.accumulator(0);
+    numEmpty = ctx.accumulator(0);
+    numExternalLinks = ctx.accumulator(0);
+  }
+
+  public void addPage(Integer num) {
+    numPages.add(num);
+  }
+
+  public void addEmpty(Integer num) {
+    numEmpty.add(num);
+  }
+
+  public void addExternalLinks(Integer num) {
+    numExternalLinks.add(num);
+  }
+
+  public void print() {
+    log.info("Num empty = {}", numEmpty.value());
+    log.info("Num pages = {}", numPages.value());
+    log.info("Num external links = {}", numExternalLinks.value());
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/spark/IndexUtil.java b/webindex/modules/data/src/main/java/webindex/data/spark/IndexUtil.java
new file mode 100644
index 0000000..4201ad9
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/spark/IndexUtil.java
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.spark;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import com.google.gson.Gson;
+import org.apache.fluo.api.data.Bytes;
+import org.apache.fluo.api.data.Column;
+import org.apache.fluo.api.data.RowColumn;
+import org.apache.fluo.api.data.RowColumnValue;
+import org.apache.fluo.recipes.core.combine.CombineQueue;
+import org.apache.fluo.recipes.core.combine.CombineQueue.Initializer;
+import org.apache.fluo.recipes.kryo.KryoSimplerSerializer;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.storage.StorageLevel;
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+import scala.Tuple2;
+import webindex.core.Constants;
+import webindex.core.IndexClient;
+import webindex.core.models.Link;
+import webindex.core.models.Page;
+import webindex.core.models.URL;
+import webindex.core.models.UriInfo;
+import webindex.data.fluo.DomainCombineQ;
+import webindex.data.fluo.PageObserver;
+
+import webindex.data.fluo.UriCombineQ;
+
+import webindex.data.util.ArchiveUtil;
+import webindex.serialization.WebindexKryoFactory;
+
+public class IndexUtil {
+
+ private static Gson gson = new Gson();
+
+ // Appends the cell (r, c) -> v to tuples; the Long overload stores v as its decimal string.
+ private static void addRCV(List<Tuple2<RowColumn, Bytes>> tuples, String r, Column c, Long v) {
+   addRCV(tuples, r, c, Long.toString(v));
+ }
+ private static void addRCV(List<Tuple2<RowColumn, Bytes>> tuples, String r, Column c, String v) {
+   tuples.add(new Tuple2<RowColumn, Bytes>(new RowColumn(r, c), Bytes.of(v)));
+ }
+
+ /**
+  * Parses every archive record into a Page, repartitioned to 50 partitions per archive entry.
+  */
+ public static JavaRDD<Page> createPages(JavaPairRDD<Text, ArchiveReader> archives) {
+   int partitionsWanted = 50 * (int) archives.count();
+   JavaRDD<ArchiveRecord> records = archives.flatMap(Tuple2::_2);
+   JavaRDD<Page> pages = records.map(ArchiveUtil::buildPageIgnoreErrors);
+   return pages.repartition(partitionsWanted).persist(StorageLevel.DISK_ONLY_2());
+ }
+
+ // Maps each URI to a UriInfo: new UriInfo(0, 1) for every non-empty page and new UriInfo(1, 0)
+ // for every outbound link target; entries sharing a URI are combined with UriInfo::merge.
+ public static JavaPairRDD<String, UriInfo> createUriMap(JavaRDD<Page> pages) {
+   JavaPairRDD<String, UriInfo> merged = pages.flatMapToPair(page -> {
+     List<Tuple2<String, UriInfo>> entries = new ArrayList<>();
+     if (page.isEmpty()) {
+       return entries;
+     }
+     entries.add(new Tuple2<>(page.getUri(), new UriInfo(0, 1)));
+     for (Link outbound : page.getOutboundLinks()) {
+       entries.add(new Tuple2<>(outbound.getUri(), new UriInfo(1, 0)));
+     }
+     return entries;
+   }).reduceByKey(UriInfo::merge);
+
+   merged.persist(StorageLevel.DISK_ONLY());
+   return merged;
+ }
+
+ // Counts the entries of uriMap per reversed domain (every URI key contributes 1).
+ public static JavaPairRDD<String, Long> createDomainMap(JavaPairRDD<String, UriInfo> uriMap) {
+   JavaPairRDD<String, Long> onePerUri =
+       uriMap.mapToPair(entry -> new Tuple2<>(URL.fromUri(entry._1()).getReverseDomain(), 1L));
+   JavaPairRDD<String, Long> counts = onePerUri.reduceByKey(Long::sum);
+
+   counts.persist(StorageLevel.DISK_ONLY());
+
+   return counts;
+ }
+
+ /**
+ * Creates initial data for external Accumulo index table as the union of page, URI and domain cells
+ */
+ public static JavaPairRDD<RowColumn, Bytes> createAccumuloIndex(IndexStats stats,
+ JavaRDD<Page> pages, JavaPairRDD<String, UriInfo> uriMap, JavaPairRDD<String, Long> domainMap) {
+ // NOTE(review): stats is updated inside a lazy transformation; counts may repeat if recomputed.
+ JavaPairRDD<RowColumn, Bytes> accumuloIndex =
+ pages.flatMapToPair(page -> {
+ if (page.isEmpty()) {
+ stats.addEmpty(1);
+ return new ArrayList<>();
+ }
+ stats.addPage(1);
+ Set<Link> links1 = page.getOutboundLinks();
+ stats.addExternalLinks(links1.size());
+ // page JSON is stored only for pages with links; each link adds an inlink cell on its target
+ List<Tuple2<RowColumn, Bytes>> ret = new ArrayList<>();
+ String uri = page.getUri();
+ if (links1.size() > 0) {
+ addRCV(ret, "p:" + uri, Constants.PAGE_CUR_COL, gson.toJson(page));
+ }
+ for (Link link : links1) {
+ addRCV(ret, "p:" + link.getUri(), new Column(Constants.INLINKS, uri),
+ link.getAnchorText());
+ }
+ return ret;
+ });
+ // per URI: a "t:" row keyed by the encoded linksTo value, a domain rank row and a "p:" row cell
+ accumuloIndex =
+ accumuloIndex.union(uriMap.flatMapToPair(t -> {
+ List<Tuple2<RowColumn, Bytes>> ret = new ArrayList<>();
+ String uri = t._1();
+ UriInfo uriInfo = t._2();
+ addRCV(ret, "t:" + IndexClient.revEncodeLong(uriInfo.linksTo) + ":" + uri, Column.EMPTY,
+ uriInfo.linksTo);
+ String domain = URL.fromUri(t._1()).getReverseDomain();
+ String domainRow = IndexClient.encodeDomainRankUri(domain, uriInfo.linksTo, uri);
+ addRCV(ret, domainRow, new Column(Constants.RANK, ""), uriInfo.linksTo);
+ addRCV(ret, "p:" + uri, Constants.PAGE_INCOUNT_COL, uriInfo.linksTo);
+ return ret;
+ }));
+ // per domain: a "d:" row whose value is the count taken from domainMap
+ accumuloIndex =
+ accumuloIndex.union(domainMap.mapToPair(t -> new Tuple2<>(new RowColumn("d:" + t._1(),
+ new Column(Constants.DOMAIN, Constants.PAGECOUNT)), Bytes.of(t._2() + ""))));
+
+ accumuloIndex.persist(StorageLevel.DISK_ONLY());
+
+ return accumuloIndex;
+ }
+
+ /**
+  * Creates initial data for the Fluo table: one cell per page that has outbound links, plus the
+  * entries produced by the URI and domain combine queue initializers.
+  *
+  * @param numBuckets bucket count handed to both combine queue initializers
+  */
+ public static JavaPairRDD<RowColumn, Bytes> createFluoTable(JavaRDD<Page> pages,
+     JavaPairRDD<String, UriInfo> uriMap, JavaPairRDD<String, Long> domainMap, int numBuckets) {
+
+   KryoSimplerSerializer kryo = new KryoSimplerSerializer(new WebindexKryoFactory());
+
+   // Empty pages and pages without outbound links contribute no cells.
+   JavaPairRDD<RowColumn, Bytes> pageCells = pages.flatMapToPair(page -> {
+     List<Tuple2<RowColumn, Bytes>> cells = new ArrayList<>();
+     if (!page.isEmpty() && page.getOutboundLinks().size() > 0) {
+       String hashedRow = PageObserver.getPageRowHasher().addHash(page.getUri()).toString();
+       addRCV(cells, hashedRow, new Column(Constants.PAGE, Constants.CUR), gson.toJson(page));
+     }
+     return cells;
+   });
+
+   // Each URI entry becomes the row/column/value that its combine queue initializer produces.
+   Initializer<String, UriInfo> uriInit =
+       CombineQueue.getInitializer(UriCombineQ.URI_COMBINE_Q_ID, numBuckets, kryo);
+   JavaPairRDD<RowColumn, Bytes> uriCells = uriMap.mapToPair(entry -> {
+     RowColumnValue rcv = uriInit.convert(entry._1(), entry._2());
+     return new Tuple2<>(new RowColumn(rcv.getRow(), rcv.getColumn()), rcv.getValue());
+   });
+
+   // Domain counts are converted the same way through the domain combine queue initializer.
+   Initializer<String, Long> domainInit =
+       CombineQueue.getInitializer(DomainCombineQ.DOMAIN_COMBINE_Q_ID, numBuckets, kryo);
+   JavaPairRDD<RowColumn, Bytes> domainCells = domainMap.mapToPair(entry -> {
+     RowColumnValue rcv = domainInit.convert(entry._1(), entry._2());
+     return new Tuple2<>(new RowColumn(rcv.getRow(), rcv.getColumn()), rcv.getValue());
+   });
+
+   // Union order: page cells, then URI cells, then domain cells.
+   JavaPairRDD<RowColumn, Bytes> fluoIndex = pageCells.union(uriCells).union(domainCells);
+   fluoIndex.persist(StorageLevel.DISK_ONLY());
+
+   return fluoIndex;
+ }
+
+ /**
+  * Derives table split points from a sample of numSplits index cells. Each sampled row is cut to
+  * at most 29 bytes; identical results collapse because they are collected in a sorted set.
+  */
+ public static SortedSet<Text> calculateSplits(JavaPairRDD<RowColumn, Bytes> accumuloIndex,
+     int numSplits) {
+   SortedSet<Text> splitPoints = new TreeSet<>();
+   for (Tuple2<RowColumn, Bytes> cell : accumuloIndex.takeSample(false, numSplits)) {
+     Bytes row = cell._1().getRow();
+     Bytes prefix = row.length() < 29 ? row : row.subSequence(0, 29);
+     splitPoints.add(new Text(prefix.toArray()));
+   }
+
+   return splitPoints;
+ }
+
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/util/ArchiveUtil.java b/webindex/modules/data/src/main/java/webindex/data/util/ArchiveUtil.java
new file mode 100644
index 0000000..1491d6e
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/util/ArchiveUtil.java
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.util;
+
+import java.io.IOException;
+import java.text.ParseException;
+
+import org.apache.commons.io.IOUtils;
+import org.archive.io.ArchiveRecord;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.models.Link;
+import webindex.core.models.Page;
+import webindex.core.models.URL;
+
+public class ArchiveUtil {
+
+ private static final Logger log = LoggerFactory.getLogger(ArchiveUtil.class);
+
+ /**
+  * Builds a Page from an archive record: crawl date, links to other domains ("A@/href" entries
+  * only) and, when present, title and server. The body is decoded as UTF-8. Returns Page.EMPTY
+  * for a missing or non-JSON MIME type (case-insensitive), an empty body or a bad page URL.
+  * @throws IOException if the record body cannot be read
+  * @throws ParseException if the record body is not valid JSON
+  */
+ public static Page buildPage(ArchiveRecord archiveRecord) throws IOException, ParseException {
+   if (!"application/json".equalsIgnoreCase(archiveRecord.getHeader().getMimetype())) {
+     return Page.EMPTY;
+   }
+   byte[] rawData = IOUtils.toByteArray(archiveRecord, archiveRecord.available());
+   String jsonString = new String(rawData, java.nio.charset.StandardCharsets.UTF_8);
+   if (jsonString.isEmpty()) {
+     return Page.EMPTY;
+   }
+   JSONObject json;
+   try {
+     json = new JSONObject(jsonString);
+   } catch (JSONException e) {
+     throw new ParseException(e.getMessage(), 0);
+   }
+   String rawPageUrl = archiveRecord.getHeader().getUrl();
+   URL pageUrl;
+   try {
+     pageUrl = URL.from(rawPageUrl);
+   } catch (IllegalArgumentException e) {
+     return Page.EMPTY;
+   } catch (Exception e) {
+     log.error("Unexpected exception while parsing raw page URL: " + rawPageUrl, e);
+     return Page.EMPTY;
+   }
+   Page page = new Page(pageUrl.toUri());
+   page.setCrawlDate(archiveRecord.getHeader().getDate());
+   try {
+     JSONObject responseMeta =
+         json.getJSONObject("Envelope").getJSONObject("Payload-Metadata")
+             .getJSONObject("HTTP-Response-Metadata");
+     try {
+       JSONArray links = responseMeta.getJSONObject("HTML-Metadata").getJSONArray("Links");
+       for (int i = 0; i < links.length(); i++) {
+         JSONObject link = links.getJSONObject(i);
+         if (link.has("path") && link.get("path").equals("A@/href") && link.has("url")) {
+           String anchorText = "";
+           if (link.has("text")) {
+             anchorText = link.getString("text");
+           } else if (link.has("title")) {
+             anchorText = link.getString("title");
+           }
+           String rawLinkUrl = link.getString("url");
+           try {
+             URL linkUrl = URL.from(rawLinkUrl);
+             if (!page.getDomain().equals(linkUrl.getDomain())) {
+               page.addOutbound(Link.of(linkUrl, anchorText));
+             }
+           } catch (IllegalArgumentException e) {
+             log.debug("Failed to parse link: " + rawLinkUrl);
+           } catch (Exception e) {
+             log.error("Unexpected exception while parsing link URL: " + rawLinkUrl, e);
+           }
+         }
+       }
+     } catch (JSONException e) {
+       log.debug("Exception trying retrieve links", e);
+     }
+     try {
+       page.setTitle(responseMeta.getJSONObject("HTML-Metadata").getJSONObject("Head")
+           .getString("Title"));
+     } catch (JSONException e) {
+       log.debug("Failed to retrieve title", e);
+     }
+     try {
+       page.setServer(responseMeta.getJSONObject("Headers").getString("Server"));
+     } catch (JSONException e) {
+       log.debug("Failed to retrieve server", e);
+     }
+   } catch (JSONException e) {
+     log.debug("Exception trying retrieve responseMeta", e);
+   }
+   return page;
+ }
+
+ public static Page buildPageIgnoreErrors(ArchiveRecord record) {
+ try {
+ return buildPage(record);
+ } catch (Exception e) {
+ log.info("Exception parsing ArchiveRecord with url {} due to {}",
+ record.getHeader().getUrl(), e.getMessage());
+ return Page.EMPTY;
+ }
+ }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/util/WARCFileInputFormat.java b/webindex/modules/data/src/main/java/webindex/data/util/WARCFileInputFormat.java
new file mode 100644
index 0000000..bdb63df
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/util/WARCFileInputFormat.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.util;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.archive.io.ArchiveReader;
+
+/**
+ * Minimal FileInputFormat for WARC files: each file is given whole to a WARCFileRecordReader.
+ * Hadoop is told that the files cannot be split.
+ *
+ * @author Stephen Merity (Smerity)
+ */
+public class WARCFileInputFormat extends FileInputFormat<Text, ArchiveReader> {
+  /** Always false: as these are compressed files, they cannot be (sanely) split. */
+  @Override
+  protected boolean isSplitable(JobContext context, Path filename) {
+    return false;
+  }
+
+  /** Returns a new WARCFileRecordReader; the split and context are not used here. */
+  @Override
+  public RecordReader<Text, ArchiveReader> createRecordReader(InputSplit split,
+      TaskAttemptContext context) throws IOException, InterruptedException {
+    return new WARCFileRecordReader();
+  }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/data/util/WARCFileRecordReader.java b/webindex/modules/data/src/main/java/webindex/data/util/WARCFileRecordReader.java
new file mode 100644
index 0000000..994b304
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/data/util/WARCFileRecordReader.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.util;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.archive.io.ArchiveReader;
+import org.archive.io.warc.WARCReaderFactory;
+
+/**
+ * The WARC File Record Reader processes a single compressed input. The Record Reader returns a
+ * single WARC ArchiveReader that can contain numerous individual documents, each document handled
+ * in a single mapper.
+ *
+ * @author Stephen Merity (Smerity)
+ */
+public class WARCFileRecordReader extends RecordReader<Text, ArchiveReader> {
+
+ private String arPath;
+ private ArchiveReader ar;
+ private FSDataInputStream fsin;
+ private boolean hasBeenRead = false;
+
+ @Override
+ public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException,
+ InterruptedException {
+ FileSplit split = (FileSplit) inputSplit;
+ Configuration conf = context.getConfiguration();
+ Path path = split.getPath();
+ FileSystem fs = path.getFileSystem(conf);
+ fsin = fs.open(path);
+ arPath = path.getName();
+ ar = WARCReaderFactory.get(path.getName(), fsin, true);
+ }
+
+ @Override
+ public void close() throws IOException {
+ fsin.close();
+ ar.close();
+ }
+
+ @Override
+ public Text getCurrentKey() throws IOException, InterruptedException {
+ // Provide the path used for the compressed file as the key
+ return new Text(arPath);
+ }
+
+ @Override
+ public ArchiveReader getCurrentValue() throws IOException, InterruptedException {
+ // We only ever have one value to give -- the output of the compressed file
+ return ar;
+ }
+
+ @Override
+ public float getProgress() throws IOException, InterruptedException {
+ // Progress of reader through the data as a float
+ // As each file only produces one ArchiveReader, this will be one immediately
+ return hasBeenRead ? 1 : 0;
+ }
+
+ @Override
+ public boolean nextKeyValue() throws IOException, InterruptedException {
+ // As each file only produces one ArchiveReader, if it has been read, there are no more
+ if (hasBeenRead) {
+ return false;
+ }
+ hasBeenRead = true;
+ return true;
+ }
+}
diff --git a/webindex/modules/data/src/main/java/webindex/serialization/WebindexKryoFactory.java b/webindex/modules/data/src/main/java/webindex/serialization/WebindexKryoFactory.java
new file mode 100644
index 0000000..d07f6e2
--- /dev/null
+++ b/webindex/modules/data/src/main/java/webindex/serialization/WebindexKryoFactory.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.serialization;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.pool.KryoFactory;
+import webindex.core.models.Link;
+import webindex.core.models.UriInfo;
+import webindex.core.models.export.DomainUpdate;
+import webindex.core.models.export.IndexUpdate;
+import webindex.core.models.export.PageUpdate;
+import webindex.core.models.export.UriUpdate;
+
+/** Creates Kryo instances with the webindex model classes registered under fixed ids. */
+public class WebindexKryoFactory implements KryoFactory, Serializable {
+
+  private static final long serialVersionUID = 1L;
+
+  @Override
+  public Kryo create() {
+    Kryo serializer = new Kryo();
+
+    // Ids are assigned explicitly. Relying on registration order alone once led to Spark and
+    // Fluo code using different ids for some reason.
+    serializer.register(UriInfo.class, 9);
+    serializer.register(IndexUpdate.class, 10);
+    serializer.register(DomainUpdate.class, 11);
+    serializer.register(PageUpdate.class, 12);
+    serializer.register(UriUpdate.class, 13);
+    serializer.register(ArrayList.class, 14);
+    serializer.register(Link.class, 15);
+
+    // Only registered classes may be serialized.
+    serializer.setRegistrationRequired(true);
+    return serializer;
+  }
+
+}
diff --git a/webindex/modules/data/src/main/resources/splits/accumulo-default.txt b/webindex/modules/data/src/main/resources/splits/accumulo-default.txt
new file mode 100644
index 0000000..2cfc925
--- /dev/null
+++ b/webindex/modules/data/src/main/resources/splits/accumulo-default.txt
@@ -0,0 +1,76 @@
+d:com.blogg
+d:com.dd
+d:com.fe
+d:com.hg
+d:com.mar
+d:com.p
+d:com.sh
+d:com.tu
+d:com.y
+d:j
+d:org.h
+d:us.i
+p:ca.h
+p:com.af
+p:com.applec
+p:com.beaut
+p:com.blogger.www/delete-comment.g?blogID=24
+p:com.blogger.www/profile/067
+p:com.blogspot.ben
+p:com.blogspot.in
+p:com.blogspot.sm
+p:com.buf
+p:com.chick
+p:com.cru
+p:com.detroitnews.www/article/2014
+p:com.ebe
+p:com.facebook.www/a
+p:com.facebook.www:s/c
+p:com.fir
+p:com.gee
+p:com.google.plus:s/+N
+p:com.gotethnicfoods.secure:s/Indian%20Foods%20Company/Store/Login.cfm?Logout=&cfid=18
+p:com.homet
+p:com.inm
+p:com.kay
+p:com.linkedin.www:
+p:com.mel
+p:com.moms
+p:com.neimanmarcus.www/p
+p:com.ohio/
+p:com.pie
+p:com.pro
+p:com.rivals.o
+p:com.sho
+p:com.sportsf
+p:com.stun
+p:com.thecl
+p:com.toy
+p:com.twitter/E
+p:com.twitter:s/B
+p:com.uni
+p:com.w
+p:com.wordpress.du
+p:com.y
+p:com.youtube/
+p:edu.p
+p:gov.ni
+p:jp.n
+p:net.doubleclick.g.pubads/gampad/j
+p:net.to
+p:org.cro
+p:org.li
+p:org.scp
+p:pl.z
+p:uk.co.r
+t:fefdfaff:o
+t:fefdfefdff:com.am
+t:fefdfefdff:com.blogger.www/profile/16
+t:fefdfefdff:com.facebook.www/s
+t:fefdfefdff:com.in
+t:fefdfefdff:com.ph
+t:fefdfefdff:com.tumblr.www:
+t:fefdfefdff:e
+t:fefdfefdff:org.s
+t:fefeff:com.g
+t:fefeff:d
diff --git a/webindex/modules/data/src/test/java/webindex/data/SparkTestUtil.java b/webindex/modules/data/src/test/java/webindex/data/SparkTestUtil.java
new file mode 100644
index 0000000..24ebf4a
--- /dev/null
+++ b/webindex/modules/data/src/test/java/webindex/data/SparkTestUtil.java
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+
+/** Test helper that creates local Spark contexts. */
+public class SparkTestUtil {
+
+  /** Returns a context on the "local" master named appName, with spark.ui.port set to 4444. */
+  public static JavaSparkContext getSparkContext(String appName) {
+    SparkConf conf = new SparkConf().setMaster("local").setAppName(appName);
+    conf.set("spark.app.id", appName);
+    conf.set("spark.ui.port", "4444");
+    return new JavaSparkContext(conf);
+  }
+
+}
diff --git a/webindex/modules/data/src/test/java/webindex/data/fluo/it/IndexIT.java b/webindex/modules/data/src/test/java/webindex/data/fluo/it/IndexIT.java
new file mode 100644
index 0000000..2315352
--- /dev/null
+++ b/webindex/modules/data/src/test/java/webindex/data/fluo/it/IndexIT.java
@@ -0,0 +1,258 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.fluo.it;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+import org.apache.fluo.api.client.FluoClient;
+import org.apache.fluo.api.client.FluoFactory;
+import org.apache.fluo.api.client.LoaderExecutor;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.api.data.Bytes;
+import org.apache.fluo.api.data.RowColumn;
+import org.apache.fluo.api.data.RowColumnValue;
+import org.apache.fluo.recipes.test.AccumuloExportITBase;
+import org.apache.fluo.recipes.test.FluoITHelper;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.warc.WARCReaderFactory;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import scala.Tuple2;
+import webindex.core.models.Link;
+import webindex.core.models.Page;
+import webindex.core.models.URL;
+import webindex.core.models.UriInfo;
+import webindex.data.SparkTestUtil;
+import webindex.data.fluo.PageLoader;
+import webindex.data.spark.Hex;
+import webindex.data.spark.IndexEnv;
+import webindex.data.spark.IndexStats;
+import webindex.data.spark.IndexUtil;
+import webindex.data.util.ArchiveUtil;
+
+/** Checks that the tables maintained by Fluo match the ones recomputed from scratch by Spark. */
+public class IndexIT extends AccumuloExportITBase {
+  private static final Logger log = LoggerFactory.getLogger(IndexIT.class);
+  private transient JavaSparkContext ctx;
+  private IndexEnv env;
+  private String exportTable;
+  private static final int TEST_SPLITS = 119;
+
+  @Override
+  protected void preFluoInitHook() throws Exception {
+    FluoConfiguration config = getFluoConfiguration();
+    config.setApplicationName("lit");
+    config.setWorkerThreads(5);
+
+    // create and configure export table
+    exportTable = "export" + tableCounter.getAndIncrement();
+    ctx = SparkTestUtil.getSparkContext(getClass().getSimpleName());
+    env = new IndexEnv(config, exportTable, "/tmp", TEST_SPLITS, TEST_SPLITS);
+    env.initAccumuloIndexTable();
+    env.configureApplication(config, config);
+  }
+
+  @Override
+  protected void postFluoInitHook() throws Exception {
+    env.setFluoTableSplits();
+  }
+
+  @After
+  public void tearCloseContext() throws Exception {
+    // ctx is still null if preFluoInitHook failed before creating it; avoid a secondary NPE.
+    if (ctx != null) {
+      ctx.close();
+      ctx = null;
+    }
+  }
+
+  /** Reads a WARC file into a map keyed by page URL, skipping empty and link-less pages. */
+  public static Map<URL, Page> readPages(File input) throws Exception {
+    Map<URL, Page> pageMap = new HashMap<>();
+    ArchiveReader ar = WARCReaderFactory.get(input);
+    try {
+      for (ArchiveRecord r : ar) {
+        Page p = ArchiveUtil.buildPage(r);
+        if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
+          continue;
+        }
+        pageMap.put(URL.fromUri(p.getUri()), p);
+      }
+    } finally {
+      ar.close();
+    }
+    return pageMap;
+  }
+
+  /** Recomputes the expected tables from pages with Spark and compares them to the live tables. */
+  private void assertOutput(Collection<Page> pages) throws Exception {
+    JavaRDD<Page> pagesRDD = ctx.parallelize(new ArrayList<>(pages));
+    Assert.assertEquals(pages.size(), pagesRDD.count());
+
+    // Create expected output using spark
+    IndexStats stats = new IndexStats(ctx);
+    JavaPairRDD<String, UriInfo> uriMap = IndexUtil.createUriMap(pagesRDD);
+    JavaPairRDD<String, Long> domainMap = IndexUtil.createDomainMap(uriMap);
+    JavaPairRDD<RowColumn, Bytes> accumuloIndex =
+        IndexUtil.createAccumuloIndex(stats, pagesRDD, uriMap, domainMap).sortByKey();
+    JavaPairRDD<RowColumn, Bytes> fluoIndex =
+        IndexUtil.createFluoTable(pagesRDD, uriMap, domainMap, TEST_SPLITS).sortByKey();
+
+    // Compare against actual
+    try (FluoClient client = FluoFactory.newClient(getMiniFluo().getClientConfiguration())) {
+      boolean foundDiff =
+          !FluoITHelper.verifyAccumuloTable(getAccumuloConnector(), exportTable,
+              tuples2rcv(accumuloIndex.collect()));
+      foundDiff |= !FluoITHelper.verifyFluoTable(client, tuples2rcv(fluoIndex.collect()));
+      if (foundDiff) {
+        FluoITHelper.printFluoTable(client);
+        FluoITHelper.printAccumuloTable(getAccumuloConnector(), exportTable);
+        printRDD(accumuloIndex.collect());
+        printRDD(fluoIndex.collect());
+      }
+      Assert.assertFalse(foundDiff);
+    }
+  }
+
+  public static Link newLink(String url) {
+    return Link.of(URL.from(url));
+  }
+
+  public static Link newLink(String url, String anchorText) {
+    return Link.of(URL.from(url), anchorText);
+  }
+
+  @Test
+  public void testFluoIndexing() throws Exception {
+    Map<URL, Page> pages = readPages(new File("src/test/resources/wat-18.warc"));
+    try (FluoClient client = FluoFactory.newClient(getMiniFluo().getClientConfiguration())) {
+      try (LoaderExecutor le = client.newLoaderExecutor()) {
+        for (Page page : pages.values()) {
+          log.debug("Loading page {} with {} links", page.getUrl(), page.getOutboundLinks().size());
+          le.execute(PageLoader.updatePage(page));
+        }
+      }
+
+      getMiniFluo().waitForObservers();
+      assertOutput(pages.values());
+
+      URL deleteUrl = URL.from("http://1000games.me/games/gametion/");
+      log.debug("Deleting page {}", deleteUrl);
+      try (LoaderExecutor le = client.newLoaderExecutor()) {
+        le.execute(PageLoader.deletePage(deleteUrl));
+      }
+      getMiniFluo().waitForObservers();
+
+      int numPages = pages.size();
+      Assert.assertNotNull(pages.remove(deleteUrl));
+      Assert.assertEquals(numPages - 1, pages.size());
+      assertOutput(pages.values());
+
+      URL updateUrl = URL.from("http://100zone.blogspot.com/2013/03/please-memp3-4shared.html");
+      Page updatePage = pages.get(updateUrl);
+      long numLinks = updatePage.getNumOutbound();
+      Assert.assertTrue(updatePage.addOutbound(newLink("http://example.com", "Example")));
+      Assert.assertEquals(numLinks + 1, (long) updatePage.getNumOutbound());
+      Assert.assertTrue(updatePage.removeOutbound(newLink("http://www.blogger.com")));
+      Assert.assertEquals(numLinks, (long) updatePage.getNumOutbound());
+
+      try (LoaderExecutor le = client.newLoaderExecutor()) {
+        le.execute(PageLoader.updatePage(updatePage));
+      }
+      getMiniFluo().waitForObservers();
+
+      // create a URL that has an inlink count of 2
+      URL updateUrl2 = URL.from("http://00assclown.newgrounds.com/");
+      Page updatePage2 = pages.get(updateUrl2);
+      long numLinks2 = updatePage2.getNumOutbound();
+      Assert.assertTrue(updatePage2.addOutbound(newLink("http://example.com", "Example")));
+      Assert.assertEquals(numLinks2 + 1, (long) updatePage2.getNumOutbound());
+
+      try (LoaderExecutor le = client.newLoaderExecutor()) {
+        le.execute(PageLoader.updatePage(updatePage2));
+      }
+      getMiniFluo().waitForObservers();
+
+      Assert.assertNotNull(pages.put(updateUrl, updatePage));
+      Assert.assertNotNull(pages.put(updateUrl2, updatePage2));
+      assertOutput(pages.values());
+
+      // completely remove link that had an inlink count of 2
+      updatePage = pages.get(updateUrl);
+      numLinks = updatePage.getNumOutbound();
+      Assert.assertTrue(updatePage.removeOutbound(newLink("http://example.com")));
+      Assert.assertEquals(numLinks - 1, (long) updatePage.getNumOutbound());
+
+      updatePage2 = pages.get(updateUrl2);
+      numLinks2 = updatePage2.getNumOutbound();
+      Assert.assertTrue(updatePage2.removeOutbound(newLink("http://example.com")));
+      Assert.assertEquals(numLinks2 - 1, (long) updatePage2.getNumOutbound());
+
+      try (LoaderExecutor le = client.newLoaderExecutor()) {
+        le.execute(PageLoader.updatePage(updatePage));
+        le.execute(PageLoader.updatePage(updatePage2));
+      }
+      getMiniFluo().waitForObservers();
+
+      Assert.assertNotNull(pages.put(updateUrl, updatePage));
+      Assert.assertNotNull(pages.put(updateUrl2, updatePage2));
+      assertOutput(pages.values());
+    }
+  }
+
+  @Test
+  public void testSparkThenFluoIndexing() throws Exception {
+    Map<URL, Page> pageMap = readPages(new File("src/test/resources/wat-18.warc"));
+    List<Page> pages = new ArrayList<>(pageMap.values());
+    // the first two pages are indexed in bulk through env.initializeIndexes
+    env.initializeIndexes(ctx, ctx.parallelize(pages.subList(0, 2)), new IndexStats(ctx));
+    assertOutput(pages.subList(0, 2));
+    // the remaining pages go through Fluo; the result must match indexing all pages at once
+    try (FluoClient client = FluoFactory.newClient(getMiniFluo().getClientConfiguration());
+        LoaderExecutor le = client.newLoaderExecutor()) {
+      for (Page page : pages.subList(2, pages.size())) {
+        log.debug("Loading page {} with {} links", page.getUrl(), page.getOutboundLinks().size());
+        le.execute(PageLoader.updatePage(page));
+      }
+    }
+    getMiniFluo().waitForObservers();
+
+    assertOutput(pages);
+  }
+
+  private void printRDD(List<Tuple2<RowColumn, Bytes>> rcvRDD) {
+    System.out.println("== RDD start ==");
+    rcvRDD.forEach(t -> System.out.println("rc " + Hex.encNonAscii(t, " ")));
+    System.out.println("== RDD end ==");
+  }
+
+  private static List<RowColumnValue> tuples2rcv(List<Tuple2<RowColumn, Bytes>> linkIndex) {
+    return Lists.transform(linkIndex, t -> new RowColumnValue(t._1().getRow(), t._1().getColumn(),
+        t._2()));
+  }
+}
diff --git a/webindex/modules/data/src/test/java/webindex/data/spark/Hex.java b/webindex/modules/data/src/test/java/webindex/data/spark/Hex.java
new file mode 100644
index 0000000..2e47f29
--- /dev/null
+++ b/webindex/modules/data/src/test/java/webindex/data/spark/Hex.java
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.spark;
+
+import java.io.ByteArrayOutputStream;
+
+import org.apache.fluo.api.data.Bytes;
+import org.apache.fluo.api.data.Column;
+import org.apache.fluo.api.data.RowColumn;
+import scala.Tuple2;
+
+/** Test helper that renders byte sequences as printable ASCII, escaping the rest as \xNN. */
+public class Hex {
+  /** Appends bytes to sb; a backslash or any byte outside 32-126 is written as \xNN (hex). */
+  public static void encNonAscii(StringBuilder sb, Bytes bytes) {
+    for (int i = 0; i < bytes.length(); i++) {
+      byte b = bytes.byteAt(i);
+      if (b >= 32 && b <= 126 && b != '\\') {
+        sb.append((char) b);
+      } else {
+        sb.append(String.format("\\x%02x", b & 0xff));
+      }
+    }
+  }
+
+  /** Returns the escaped form of bytes as a new String. */
+  public static String encNonAscii(Bytes bytes) {
+    StringBuilder sb = new StringBuilder();
+    encNonAscii(sb, bytes);
+    return sb.toString();
+  }
+
+  /** Appends the escaped column family and qualifier to sb, separated by sep. */
+  public static void encNonAscii(StringBuilder sb, Column c, String sep) {
+    encNonAscii(sb, c.getFamily());
+    sb.append(sep);
+    encNonAscii(sb, c.getQualifier());
+  }
+
+  /** Appends the escaped row, column family and qualifier to sb, separated by sep. */
+  public static void encNonAscii(StringBuilder sb, RowColumn rc, String sep) {
+    encNonAscii(sb, rc.getRow());
+    sb.append(sep);
+    encNonAscii(sb, rc.getColumn(), sep);
+  }
+
+  /** Returns row, family, qualifier and value of t, each escaped and joined by sep. */
+  public static String encNonAscii(Tuple2<RowColumn, Bytes> t, String sep) {
+    StringBuilder sb = new StringBuilder();
+    encNonAscii(sb, t._1(), sep);
+    sb.append(sep);
+    encNonAscii(sb, t._2());
+    return sb.toString();
+  }
+
+  /**
+   * Reverses {@link #encNonAscii(Bytes)}: printable ASCII is copied and \xNN becomes one byte.
+   * @throws IllegalArgumentException on a truncated or non-hex escape or a non-printable char
+   */
+  static byte[] decode(String s) {
+    // the next best thing to a StringBuilder for bytes
+    ByteArrayOutputStream baos = new ByteArrayOutputStream(s.length());
+    for (int i = 0; i < s.length(); i++) {
+      char c = s.charAt(i);
+      // Bounds are checked before indexing; Character.digit rejects the signs parseInt allowed.
+      boolean escape = c == '\\' && i + 3 < s.length() && s.charAt(i + 1) == 'x';
+      int hi = escape ? Character.digit(s.charAt(i + 2), 16) : -1;
+      int lo = escape ? Character.digit(s.charAt(i + 3), 16) : -1;
+      if (hi >= 0 && lo >= 0) {
+        baos.write((hi << 4) | lo);
+        i += 3;
+      } else if (c >= 32 && c <= 126 && c != '\\') {
+        baos.write(c);
+      } else {
+        throw new IllegalArgumentException("Malformed input at index " + i + ": " + s);
+      }
+    }
+    return baos.toByteArray();
+  }
+}
diff --git a/webindex/modules/data/src/test/java/webindex/data/spark/IndexEnvTest.java b/webindex/modules/data/src/test/java/webindex/data/spark/IndexEnvTest.java
new file mode 100644
index 0000000..7d19d63
--- /dev/null
+++ b/webindex/modules/data/src/test/java/webindex/data/spark/IndexEnvTest.java
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.spark;
+
+import java.util.SortedSet;
+
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class IndexEnvTest {
+
+ @Test
+ public void testGetSplits() throws Exception {
+ SortedSet<Text> splits = IndexEnv.getAccumuloDefaultSplits();
+
+ Assert.assertEquals(76, splits.size());
+ Assert.assertEquals(new Text("d:com.blogg"), splits.first());
+ Assert.assertEquals(new Text("t:fefeff:d"), splits.last());
+ }
+}
diff --git a/webindex/modules/data/src/test/java/webindex/data/spark/IndexUtilTest.java b/webindex/modules/data/src/test/java/webindex/data/spark/IndexUtilTest.java
new file mode 100644
index 0000000..3ab30e6
--- /dev/null
+++ b/webindex/modules/data/src/test/java/webindex/data/spark/IndexUtilTest.java
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.spark;
+
+import java.io.BufferedReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.fluo.api.data.Bytes;
+import org.apache.fluo.api.data.RowColumn;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import scala.Tuple2;
+import webindex.core.models.Link;
+import webindex.core.models.Page;
+import webindex.core.models.URL;
+import webindex.core.models.UriInfo;
+import webindex.data.SparkTestUtil;
+
+/** Checks the index RDDs that IndexUtil builds for a small page set against fixture files. */
+public class IndexUtilTest {
+  private transient JavaSparkContext sc;
+
+  @Before
+  public void setUp() {
+    sc = SparkTestUtil.getSparkContext(getClass().getSimpleName());
+  }
+
+  @After
+  public void tearDown() {
+    sc.close();
+    sc = null;
+  }
+
+  @Test
+  public void testDataSet1() throws Exception {
+    // Create pages
+    JavaRDD<Page> pages = sc.parallelize(getPagesSet1());
+    IndexStats stats = new IndexStats(sc);
+
+    // Create an Accumulo index from pages and verify
+    JavaPairRDD<String, UriInfo> uriMap = IndexUtil.createUriMap(pages);
+    JavaPairRDD<String, Long> domainMap = IndexUtil.createDomainMap(uriMap);
+    JavaPairRDD<RowColumn, Bytes> accumuloIndex =
+        IndexUtil.createAccumuloIndex(stats, pages, uriMap, domainMap).sortByKey();
+    verifyRDD("data/set1/accumulo-data.txt", accumuloIndex);
+
+    // Create the Fluo table data from the same pages and maps, then verify
+    JavaPairRDD<RowColumn, Bytes> fluoIndex =
+        IndexUtil.createFluoTable(pages, uriMap, domainMap, 119).sortByKey();
+    verifyRDD("data/set1/fluo-data.txt", fluoIndex);
+
+    // Disabled: rebuild the Accumulo index from the Fluo index and verify it again
+    // JavaPairRDD<RowColumn, Bytes> recreated = IndexUtil.createAccumuloIndex(fluoIndex);
+    // verifyRDD("data/set1/accumulo-data.txt", recreated);
+  }
+
+  /** Prints each entry of rcb via System.out, formatted the way verifyRDD compares entries. */
+  public void dump(JavaPairRDD<RowColumn, Bytes> rcb) {
+    rcb.foreach(t -> System.out.println(Hex.encNonAscii(t, "|")));
+  }
+
+  /** Asserts that actual, rendered by Hex.encNonAscii with "|", equals the resource line by line. */
+  public void verifyRDD(String expectedFilename, JavaPairRDD<RowColumn, Bytes> actual)
+      throws Exception {
+    InputStream is = getClass().getClassLoader().getResourceAsStream(expectedFilename);
+    // Name the missing fixture instead of failing with a bare NPE inside InputStreamReader.
+    Assert.assertNotNull("Missing test resource: " + expectedFilename, is);
+    List<String> expectedList = new ArrayList<>();
+    try (BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
+      String line;
+      while ((line = br.readLine()) != null) {
+        expectedList.add(line);
+      }
+    }
+
+    List<Tuple2<RowColumn, Bytes>> actualList = actual.collect();
+    Assert.assertEquals(expectedList.size(), actualList.size());
+    Iterator<Tuple2<RowColumn, Bytes>> actualIter = actualList.iterator();
+    Iterator<String> expectedIter = expectedList.iterator();
+    while (actualIter.hasNext() && expectedIter.hasNext()) {
+      Assert.assertEquals(expectedIter.next(), Hex.encNonAscii(actualIter.next(), "|"));
+    }
+  }
+
+  /** Builds two pages, http://a.com/1 and http://b.com, each with three outbound links. */
+  private List<Page> getPagesSet1() {
+    List<Page> pages = new ArrayList<>();
+    Page pageA = new Page(URL.from("http://a.com/1").toUri());
+    pageA.addOutbound(Link.of(URL.from("http://b.com/1"), "b1"));
+    pageA.addOutbound(Link.of(URL.from("http://b.com/3"), "b3"));
+    pageA.addOutbound(Link.of(URL.from("http://c.com/1"), "c1"));
+    Page pageB = new Page(URL.from("http://b.com").toUri());
+    pageB.addOutbound(Link.of(URL.from("http://c.com/1"), "c1"));
+    pageB.addOutbound(Link.of(URL.from("http://b.com/2"), "b2"));
+    pageB.addOutbound(Link.of(URL.from("http://b.com/3"), "b3"));
+    pages.add(pageA);
+    pages.add(pageB);
+    return pages;
+  }
+}
diff --git a/webindex/modules/data/src/test/java/webindex/data/util/ArchiveUtilTest.java b/webindex/modules/data/src/test/java/webindex/data/util/ArchiveUtilTest.java
new file mode 100644
index 0000000..243fd2f
--- /dev/null
+++ b/webindex/modules/data/src/test/java/webindex/data/util/ArchiveUtilTest.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.data.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.text.ParseException;
+import java.util.Iterator;
+
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.warc.WARCReaderFactory;
+import org.junit.Assert;
+import org.junit.Test;
+import webindex.core.models.Page;
+
+/** Exercises ArchiveUtil.buildPage against the WARC fixtures in src/test/resources. */
+public class ArchiveUtilTest {
+
+  @Test
+  public void testBasic() throws IOException, ParseException {
+    // Each reader keeps its fixture file open, so it is scoped with try-with-resources.
+    try (ArchiveReader archiveReader =
+        WARCReaderFactory.get(new File("src/test/resources/wat.warc"))) {
+      Page page = ArchiveUtil.buildPage(archiveReader.get());
+      Assert.assertNotNull(page);
+      Assert.assertFalse(page.isEmpty());
+
+      Assert.assertEquals(
+          "http://1079ishot.com/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/",
+          page.getUrl());
+      Assert.assertEquals(
+          "com.1079ishot>>o>/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/",
+          page.getUri());
+
+      Assert.assertEquals("2015-04-18T03:35:13Z", page.getCrawlDate());
+      Assert.assertEquals("nginx/1.6.2", page.getServer());
+      Assert.assertEquals(
+          "Presale Password – Trey Songz & Young Jeezy Pre-Christmas Bash Screen shot 2011-10-27 at ",
+          page.getTitle());
+      Assert.assertEquals(0, page.getOutboundLinks().size());
+    }
+
+    // All 18 records of wat-18.warc are expected to build without a ParseException.
+    int valid = 0;
+    int invalid = 0;
+    try (ArchiveReader ar2 = WARCReaderFactory.get(new File("src/test/resources/wat-18.warc"))) {
+      Iterator<ArchiveRecord> records = ar2.iterator();
+      while (records.hasNext()) {
+        try {
+          ArchiveUtil.buildPage(records.next());
+          valid++;
+        } catch (ParseException e) {
+          invalid++;
+        }
+      }
+    }
+    Assert.assertEquals(18, valid);
+    Assert.assertEquals(0, invalid);
+  }
+}
diff --git a/webindex/modules/data/src/test/resources/data/set1/accumulo-data.txt b/webindex/modules/data/src/test/resources/data/set1/accumulo-data.txt
new file mode 100644
index 0000000..e82938b
--- /dev/null
+++ b/webindex/modules/data/src/test/resources/data/set1/accumulo-data.txt
@@ -0,0 +1,29 @@
+d:com.a|domain|pagecount|1
+d:com.a:fefeff:com.a>>o>/1|rank||0
+d:com.b|domain|pagecount|4
+d:com.b:fefdfdff:com.b>>o>/3|rank||2
+d:com.b:fefdfefdff:com.b>>o>/1|rank||1
+d:com.b:fefdfefdff:com.b>>o>/2|rank||1
+d:com.b:fefeff:com.b>>o>/|rank||0
+d:com.c|domain|pagecount|1
+d:com.c:fefdfdff:com.c>>o>/1|rank||2
+p:com.a>>o>/1|page|cur|{"url":"http://a.com/1","uri":"com.a\x5cu003e\x5cu003eo\x5cu003e/1","numOutbound":3,"outboundLinks":[{"url":"http://b.com/1","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"b1"},{"url":"http://b.com/3","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/3","anchorText":"b3"},{"url":"http://c.com/1","uri":"com.c\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"c1"}]}
+p:com.a>>o>/1|page|incount|0
+p:com.b>>o>/|page|cur|{"url":"http://b.com/","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/","numOutbound":3,"outboundLinks":[{"url":"http://b.com/2","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/2","anchorText":"b2"},{"url":"http://b.com/3","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/3","anchorText":"b3"},{"url":"http://c.com/1","uri":"com.c\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"c1"}]}
+p:com.b>>o>/|page|incount|0
+p:com.b>>o>/1|inlinks|com.a>>o>/1|b1
+p:com.b>>o>/1|page|incount|1
+p:com.b>>o>/2|inlinks|com.b>>o>/|b2
+p:com.b>>o>/2|page|incount|1
+p:com.b>>o>/3|inlinks|com.a>>o>/1|b3
+p:com.b>>o>/3|inlinks|com.b>>o>/|b3
+p:com.b>>o>/3|page|incount|2
+p:com.c>>o>/1|inlinks|com.a>>o>/1|c1
+p:com.c>>o>/1|inlinks|com.b>>o>/|c1
+p:com.c>>o>/1|page|incount|2
+t:fefdfdff:com.b>>o>/3|||2
+t:fefdfdff:com.c>>o>/1|||2
+t:fefdfefdff:com.b>>o>/1|||1
+t:fefdfefdff:com.b>>o>/2|||1
+t:fefeff:com.a>>o>/1|||0
+t:fefeff:com.b>>o>/|||0
diff --git a/webindex/modules/data/src/test/resources/data/set1/fluo-data.txt b/webindex/modules/data/src/test/resources/data/set1/fluo-data.txt
new file mode 100644
index 0000000..e085c81
--- /dev/null
+++ b/webindex/modules/data/src/test/resources/data/set1/fluo-data.txt
@@ -0,0 +1,11 @@
+dm:d:28:\x03\x01com.\xe3|data|current|\x09\x02
+dm:d:57:\x03\x01com.\xe1|data|current|\x09\x02
+dm:d:5a:\x03\x01com.\xe2|data|current|\x09\x08
+p:saxb:com.a>>o>/1|page|cur|{"url":"http://a.com/1","uri":"com.a\x5cu003e\x5cu003eo\x5cu003e/1","numOutbound":3,"outboundLinks":[{"url":"http://b.com/1","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"b1"},{"url":"http://b.com/3","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/3","anchorText":"b3"},{"url":"http://c.com/1","uri":"com.c\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"c1"}]}
+p:xdjd:com.b>>o>/|page|cur|{"url":"http://b.com/","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/","numOutbound":3,"outboundLinks":[{"url":"http://b.com/2","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/2","anchorText":"b2"},{"url":"http://b.com/3","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/3","anchorText":"b3"},{"url":"http://c.com/1","uri":"com.c\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"c1"}]}
+um:d:06:\x03\x01com.b>>o>/\xb3|data|current|\x0b\x01\x00\x04
+um:d:2d:\x03\x01com.a>>o>/\xb1|data|current|\x0b\x01\x02\x00
+um:d:3c:\x03\x01com.c>>o>/\xb1|data|current|\x0b\x01\x00\x04
+um:d:43:\x03\x01com.b>>o>\xaf|data|current|\x0b\x01\x02\x00
+um:d:59:\x03\x01com.b>>o>/\xb1|data|current|\x0b\x01\x00\x02
+um:d:76:\x03\x01com.b>>o>/\xb2|data|current|\x0b\x01\x00\x02
diff --git a/webindex/modules/data/src/test/resources/log4j.properties b/webindex/modules/data/src/test/resources/log4j.properties
new file mode 100644
index 0000000..c80a759
--- /dev/null
+++ b/webindex/modules/data/src/test/resources/log4j.properties
@@ -0,0 +1,32 @@
+# Copyright 2014 Webindex authors (see AUTHORS)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=INFO, CA
+log4j.appender.CA=org.apache.log4j.ConsoleAppender
+log4j.appender.CA.layout=org.apache.log4j.PatternLayout
+log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c] %-5p: %m%n
+
+log4j.logger.akka=WARN
+log4j.logger.org.apache.accumulo=WARN
+log4j.logger.org.apache.curator=ERROR
+log4j.logger.org.apache.fluo=WARN
+log4j.logger.org.apache.hadoop=WARN
+log4j.logger.org.apache.hadoop.mapreduce=ERROR
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
+log4j.logger.org.apache.spark=WARN
+log4j.logger.org.apache.zookeeper=WARN
+log4j.logger.org.apache.zookeeper.ClientCnxn=ERROR
+log4j.logger.org.spark-project=WARN
+log4j.logger.webindex=WARN
+log4j.logger.Remoting=WARN
diff --git a/webindex/modules/data/src/test/resources/wat-18.warc b/webindex/modules/data/src/test/resources/wat-18.warc
new file mode 100644
index 0000000..aacd5b4
--- /dev/null
+++ b/webindex/modules/data/src/test/resources/wat-18.warc
@@ -0,0 +1,200 @@
+WARC/1.0
+WARC-Type: warcinfo
+WARC-Date: 2015-05-21T01:22:33Z
+WARC-Filename: CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz
+WARC-Record-ID: <urn:uuid:5bb856f0-bd0d-49f6-827c-0c4060260e35>
+Content-Type: application/warc-fields
+Content-Length: 108
+
+Software-Info: ia-web-commons.1.0-SNAPSHOT-20150415075912
+Extracted-Date: Thu, 21 May 2015 01:22:33 GMT
+
+
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz
+WARC-Date: 2015-05-15T10:18:43Z
+WARC-Record-ID: <urn:uuid:1b315711-e568-453b-bd1d-3d9fb5e7e960>
+WARC-Refers-To: <urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>
+Content-Type: application/json
+Content-Length: 1149
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"273","Block-Digest":"sha1:IHT665GEELC5ENYTJAE2HR2NGRCMQEE2","Actual-Content-Length":"341","WARC-Header-Metadata":{"WARC-Type":"warcinfo","WARC-Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz","WARC-Date":"2015-05-15T10:18:43Z","Content-Length":"341","WARC-Record-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Type":"application/warc-fields"},"Payload-Metadata":{"Trailing-Slop-Length":"0","Actual-Content-Type":"application/warc-fields","Actual-Content-Length":"341","Headers-Corrupt":true,"WARC-Info-Metadata":{"robots":"classic","software":"Nutch 1.6 (CC)/CC WarcExport 1.0","description":"Wide crawl of the web for April 2015","hostname":"ip-10-235-10-82.ec2.internal","format":"WARC File Format 1.0","isPartOf":"CC-MAIN-2015-18","operator":"CommonCrawl Admin","publisher":"CommonCrawl"}}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"428","Header-Length":"10","Inflated-CRC":"-1148418993","Inflated-Length":"618"},"Offset":"0","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://0.r.msn.com/?ld=7vbVzbpUDAd0qm2bWud6cigjVUCUxlyL-YoTXA18CeXPypaUenoNPYlOR6Q9zg-o5JFSqn-aZqW7vfaeEOcx9MskmI91bEJ8AgoKeDmZZL97q7gL1BYKUywdhKxTmPDqnqGE0q5Q&u=www.jamesallen.com%2F
+WARC-Date: 2015-04-18T03:30:14Z
+WARC-Record-ID: <urn:uuid:e278c6a0-1dc5-4045-b9ea-e268c9d84112>
+WARC-Refers-To: <urn:uuid:9a25a871-ab38-4925-b4c3-1dd43322c93e>
+Content-Type: application/json
+Content-Length: 1678
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"499","Block-Digest":"sha1:V7WYE32GO5UNRRMUJQBABLYUBBV5GCNA","Actual-Content-Length":"414","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2015-04-18T03:30:14Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"414","WARC-Record-ID":"<urn:uuid:9a25a871-ab38-4925-b4c3-1dd43322c93e>","WARC-Target-URI":"http://0.r.msn.com/?ld=7vbVzbpUDAd0qm2bWud6cigjVUCUxlyL-YoTXA18CeXPypaUenoNPYlOR6Q9zg-o5JFSqn-aZqW7vfaeEOcx9MskmI91bEJ8AgoKeDmZZL97q7gL1BYKUywdhKxTmPDqnqGE0q5Q&u=www.jamesallen.com%2F","WARC-IP-Address":"65.52.108.2","Content-Type":"application/http; msgtype=request"},"Payload-Metadata":{"Trailing-Slop-Length":"4","HTTP-Request-Metadata":{"Headers":{"Accept-Language":"en-us,en-gb,en;q=0.7,*;q=0.3","Host":"0.r.msn.com","Accept-Encoding":"x-gzip, gzip, deflate","User-Agent":"CCBot/2.0 (http://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},"Headers-Length":"412","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Request-Message":{"Method":"GET","Version":"HTTP/1.0","Path":"/?ld=7vbVzbpUDAd0qm2bWud6cigjVUCUxlyL-YoTXA18CeXPypaUenoNPYlOR6Q9zg-o5JFSqn-aZqW7vfaeEOcx9MskmI91bEJ8AgoKeDmZZL97q7gL1BYKUywdhKxTmPDqnqGE0q5Q&u=www.jamesallen.com%2F"},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"},"Actual-Content-Type":"application/http; msgtype=request"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"550","Header-Length":"10","Inflated-CRC":"594922758","Inflated-Length":"917"},"Offset":"428","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://0.r.msn.com/?ld=7vbVzbpUDAd0qm2bWud6cigjVUCUxlyL-YoTXA18CeXPypaUenoNPYlOR6Q9zg-o5JFSqn-aZqW7vfaeEOcx9MskmI91bEJ8AgoKeDmZZL97q7gL1BYKUywdhKxTmPDqnqGE0q5Q&u=www.jamesallen.com%2F
+WARC-Date: 2015-04-18T03:30:14Z
+WARC-Record-ID: <urn:uuid:b2492a1c-e9ee-423b-b7e9-e3e926289151>
+WARC-Refers-To: <urn:uuid:7472bd8c-66b5-495d-bedb-b885a9bf9eb3>
+Content-Type: application/json
+Content-Length: 1639
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"688","Block-Digest":"sha1:GBR6ZWRVDDCT6ASYZXA7YATY52UTEMVJ","Actual-Content-Length":"174","WARC-Header-Metadata":{"WARC-Type":"response","WARC-Date":"2015-04-18T03:30:14Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"174","WARC-Record-ID":"<urn:uuid:7472bd8c-66b5-495d-bedb-b885a9bf9eb3>","WARC-Block-Digest":"sha1:GBR6ZWRVDDCT6ASYZXA7YATY52UTEMVJ","WARC-Payload-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ","WARC-Target-URI":"http://0.r.msn.com/?ld=7vbVzbpUDAd0qm2bWud6cigjVUCUxlyL-YoTXA18CeXPypaUenoNPYlOR6Q9zg-o5JFSqn-aZqW7vfaeEOcx9MskmI91bEJ8AgoKeDmZZL97q7gL1BYKUywdhKxTmPDqnqGE0q5Q&u=www.jamesallen.com%2F","WARC-IP-Address":"65.52.108.2","WARC-Concurrent-To":"<urn:uuid:9a25a871-ab38-4925-b4c3-1dd43322c93e>","Content-Type":"application/http; msgtype=response"},"Payload-Metadata":{"Trailing-Slop-Length":"4","Actual-Content-Type":"application/http; msgtype=response","HTTP-Response-Metadata":{"Headers":{"p3p":"CP=BUS CUR CONo FIN IVDo ONL OUR PHY SAMo TELo","Date":"Sat, 18 Apr 2015 03:30:14 GMT","Content-Length":"0","Connection":"close","Server":"Microsoft-IIS/8.0"},"Headers-Length":"174","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Response-Message":{"Status":"200","Version":"HTTP/1.1","Reason":"OK"},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"}}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"630","Header-Length":"10","Inflated-CRC":"-764833871","Inflated-Length":"866"},"Offset":"978","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://0.r.msn.com/?ld=7vbVzbpUDAd0qm2bWud6cigjVUCUxlyL-YoTXA18CeXPypaUenoNPYlOR6Q9zg-o5JFSqn-aZqW7vfaeEOcx9MskmI91bEJ8AgoKeDmZZL97q7gL1BYKUywdhKxTmPDqnqGE0q5Q&u=www.jamesallen.com%2F
+WARC-Date: 2015-04-18T03:30:14Z
+WARC-Record-ID: <urn:uuid:3979b3b0-7e72-46a4-b2c2-1476aff29f37>
+WARC-Refers-To: <urn:uuid:ee94cc39-7d42-4bc3-a092-855806de67d2>
+Content-Type: application/json
+Content-Length: 1200
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"528","Block-Digest":"sha1:Y2G6LJYB53BFFLHBZQGLKYVVYDPOUPML","Actual-Content-Length":"19","WARC-Header-Metadata":{"WARC-Type":"metadata","WARC-Date":"2015-04-18T03:30:14Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"19","WARC-Record-ID":"<urn:uuid:ee94cc39-7d42-4bc3-a092-855806de67d2>","WARC-Target-URI":"http://0.r.msn.com/?ld=7vbVzbpUDAd0qm2bWud6cigjVUCUxlyL-YoTXA18CeXPypaUenoNPYlOR6Q9zg-o5JFSqn-aZqW7vfaeEOcx9MskmI91bEJ8AgoKeDmZZL97q7gL1BYKUywdhKxTmPDqnqGE0q5Q&u=www.jamesallen.com%2F","WARC-Concurrent-To":"<urn:uuid:7472bd8c-66b5-495d-bedb-b885a9bf9eb3>","Content-Type":"application/warc-fields"},"Payload-Metadata":{"Trailing-Slop-Length":"4","WARC-Metadata-Metadata":{"Trailing-Slop-Length":"0","Metadata-Records":[{"Name":"fetchTimeMs","Value":"17"}],"Actual-Content-Length":"19"},"Actual-Content-Type":"application/metadata-fields"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"428","Header-Length":"10","Inflated-CRC":"-1542478084","Inflated-Length":"551"},"Offset":"1608","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://00assclown.newgrounds.com/
+WARC-Date: 2015-04-18T03:18:39Z
+WARC-Record-ID: <urn:uuid:2b042822-d74d-451d-b724-aecc4a6fa17b>
+WARC-Refers-To: <urn:uuid:34e0aac9-004e-406f-9eff-362c54d73f09>
+Content-Type: application/json
+Content-Length: 1384
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"353","Block-Digest":"sha1:NGAN4WTLGPL6356BH7IXCEAVNQJNJHXN","Actual-Content-Length":"264","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2015-04-18T03:18:39Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"264","WARC-Record-ID":"<urn:uuid:34e0aac9-004e-406f-9eff-362c54d73f09>","WARC-Target-URI":"http://00assclown.newgrounds.com/","WARC-IP-Address":"162.159.254.106","Content-Type":"application/http; msgtype=request"},"Payload-Metadata":{"Trailing-Slop-Length":"4","HTTP-Request-Metadata":{"Headers":{"Accept-Language":"en-us,en-gb,en;q=0.7,*;q=0.3","Host":"00assclown.newgrounds.com","Accept-Encoding":"x-gzip, gzip, deflate","User-Agent":"CCBot/2.0 (http://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},"Headers-Length":"262","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Request-Message":{"Method":"GET","Version":"HTTP/1.0","Path":"/"},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"},"Actual-Content-Type":"application/http; msgtype=request"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"419","Header-Length":"10","Inflated-CRC":"1175582372","Inflated-Length":"621"},"Offset":"2036","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://00assclown.newgrounds.com/
+WARC-Date: 2015-04-18T03:18:39Z
+WARC-Record-ID: <urn:uuid:47b8a150-9026-4f02-b86c-9c89fcdb76a8>
+WARC-Refers-To: <urn:uuid:90f14ce3-a38b-4638-a076-ec88e16168f5>
+Content-Type: application/json
+Content-Length: 16282
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"568","Block-Digest":"sha1:5MMYKWJTBGI624TG7C25DM4WSK73LIJI","Actual-Content-Length":"29678","WARC-Header-Metadata":{"WARC-Type":"response","WARC-Truncated":"length","WARC-Date":"2015-04-18T03:18:39Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"29678","WARC-Record-ID":"<urn:uuid:90f14ce3-a38b-4638-a076-ec88e16168f5>","WARC-Block-Digest":"sha1:5MMYKWJTBGI624TG7C25DM4WSK73LIJI","WARC-Payload-Digest":"sha1:3XLT4X6EBBBAKGG67Z7M5LAIAWTPYXX5","WARC-Target-URI":"http://00assclown.newgrounds.com/","WARC-IP-Address":"162.159.254.106","WARC-Concurrent-To":"<urn:uuid:34e0aac9-004e-406f-9eff-362c54d73f09>","Content-Type":"application/http; msgtype=response"},"Payload-Metadata":{"Trailing-Slop-Length":"4","Actual-Content-Type":"application/http; msgtype=response","HTTP-Response-Metadata":{"Headers":{"Vary":"Host,Accept-Encoding","Access-Control-Allow-Origin":"http://www.newgrounds.com","Date":"Sat, 18 Apr 2015 03:18:39 GMT","Content-Encoding":"gzip","CF-RAY":"1d8d39415d70020d-IAD","Set-Cookie":"ng_user0=a%3A1%3A%7Bs%3A7%3A%22default%22%3Ba%3A0%3A%7B%7D%7D; path=/; domain=.newgrounds.com","Connection":"close","Content-Type":"text/html","Server":"cloudflare-nginx","X-Powered-By":"PHP/5.4.39-0+deb7u2"},"Headers-Length":"1613","Entity-Length":"28065","Entity-Trailing-Slop-Bytes":"0","Response-Message":{"Status":"200","Version":"HTTP/1.1","Reason":"OK"},"HTML-Metadata":{"Links":[{"text":"Skip to content.","path":"A@/href","url":"#main"},{"text":"Newgrounds.com — Everything, By 
Everyone.","path":"A@/href","url":"http://www.newgrounds.com"},{"text":"Games","path":"A@/href","url":"http://www.newgrounds.com/games"},{"text":"Latest","path":"A@/href","url":"http://www.newgrounds.com/games/browse"},{"text":"Greatest","path":"A@/href","url":"http://www.newgrounds.com/games/browse/sort/score/interval/month"},{"text":"Popular","path":"A@/href","url":"http://www.newgrounds.com/games/browse/sort/views/interval/week"},{"text":"Under Judgment","path":"A@/href","url":"http://www.newgrounds.com/games/under_judgment"},{"text":"Classic Portal","path":"A@/href","url":"http://www.newgrounds.com/portal"},{"text":"Submit Yours!","path":"A@/href","url":"http://www.newgrounds.com/projects/games"},{"text":"Team Up!","path":"A@/href","url":"http://www.newgrounds.com/collab/browse/programmer"},{"text":"Movies","path":"A@/href","url":"http://www.newgrounds.com/movies"},{"text":"Latest","path":"A@/href","url":"http://www.newgrounds.com/movies/browse"},{"text":"Greatest","path":"A@/href","url":"http://www.newgrounds.com/movies/browse/sort/score/interval/month"},{"text":"Popular","path":"A@/href","url":"http://www.newgrounds.com/movies/browse/sort/views/interval/week"},{"text":"Under Judgment","path":"A@/href","url":"http://www.newgrounds.com/movies/under_judgment"},{"text":"Classic Portal","path":"A@/href","url":"http://www.newgrounds.com/portal"},{"text":"Submit Yours!","path":"A@/href","url":"http://www.newgrounds.com/projects/movies"},{"text":"Team Up!","path":"A@/href","url":"http://www.newgrounds.com/collab/browse/artist"},{"text":"Audio","path":"A@/href","url":"http://www.newgrounds.com/audio"},{"text":"Latest","path":"A@/href","url":"http://www.newgrounds.com/audio/browse/sort/date"},{"text":"Greatest","path":"A@/href","url":"http://www.newgrounds.com/audio/browse/sort/score/interval/month"},{"text":"Popular","path":"A@/href","url":"http://www.newgrounds.com/audio/browse/sort/views/interval/week"},{"text":"Audio 
Forum","path":"A@/href","url":"http://www.newgrounds.com/bbs/forum/13"},{"text":"Submit Yours!","path":"A@/href","url":"http://www.newgrounds.com/projects/audio"},{"text":"Team Up!","path":"A@/href","url":"http://www.newgrounds.com/collab/browse/musician"},{"text":"Art","path":"A@/href","url":"http://www.newgrounds.com/art"},{"text":"Latest","path":"A@/href","url":"http://www.newgrounds.com/art/browse"},{"text":"Greatest","path":"A@/href","url":"http://www.newgrounds.com/art/browse/sort/score/interval/month"},{"text":"Popular","path":"A@/href","url":"http://www.newgrounds.com/art/browse/sort/views/interval/week"},{"text":"Art Forum","path":"A@/href","url":"http://www.newgrounds.com/bbs/forum/14"},{"text":"Submit Yours!","path":"A@/href","url":"http://www.newgrounds.com/art/submit/create"},{"text":"Team Up!","path":"A@/href","url":"http://www.newgrounds.com/collab/browse/artist"},{"text":"Channels","path":"A@/href","url":"http://www.newgrounds.com/collection"},{"text":"Series","path":"A@/href","url":"http://www.newgrounds.com/collection/series"},{"text":"Collections","path":"A@/href","url":"http://www.newgrounds.com/collection"},{"text":"Judgment","path":"A@/href","url":"http://www.newgrounds.com/movies/under_judgment"},{"text":"Playlists","path":"A@/href","url":"http://www.newgrounds.com/playlists"},{"text":"Community","path":"A@/href","url":"http://www.newgrounds.com/bbs"},{"text":"Forums","path":"A@/href","url":"http://www.newgrounds.com/bbs"},{"text":"Calendar","path":"A@/href","url":"http://www.newgrounds.com/calendar"},{"text":"Artist News","path":"A@/href","url":"http://www.newgrounds.com/news/artists"},{"text":"Rankings","path":"A@/href","url":"http://www.newgrounds.com/rankings"},{"text":"Downloads","path":"A@/href","url":"http://www.newgrounds.com/downloads"},{"text":"Wiki","path":"A@/href","url":"http://www.newgrounds.com/wiki"},{"text":"Feeds","path":"A@/href","url":"http://www.newgrounds.com/wiki/help-information/user-accounts/feeds"},{"text":"About 
Feeds","path":"A@/href","url":"http://www.newgrounds.com/wiki/help-information/user-accounts/feeds"},{"text":"Wall Artist","path":"A@/href","url":"http://keepwalking.newgrounds.com"},{"text":"Login / Sign Up","path":"A@/href","url":"//www.newgrounds.com/passport"},{"path":"FORM@/action","method":"get","url":"http://www.newgrounds.com/search"},{"text":"Latest News","path":"A@/href","url":"http://www.newgrounds.com/bbs/topic/1389193"},{"text":"Lakeview Cabin Fan Art Challenge!","path":"A@/href","url":"http://www.newgrounds.com/bbs/topic/1389193"},{"alt":"Be a Supporter!","path":"IMG@/src","url":"http://www.newgrounds.com/img/ads/ad-support0.gif"},{"path":"A@/href","url":"http://www.newgrounds.com/supporter"},{"title":"","alt":"00Assclown","path":"IMG@/src","url":"http://img.ngfiles.com/defaults/icon-user-smallest.gif"},{"text":"Main","path":"A@/href","url":"/"},{"text":"News","path":"A@/href","url":"/news/"},{"text":"Movies","path":"A@/href","url":"/movies/"},{"text":"Favorites","path":"A@/href","url":"/favorites/"},{"text":"Reviews","path":"A@/href","url":"/reviews/"},{"text":"Stats","path":"A@/href","url":"/stats/"},{"path":"FORM@/action","method":"post","url":"/favorites/add"},{"title":"","alt":"00Assclown","path":"IMG@/src","url":"http://img.ngfiles.com/defaults/image-user.gif"},{"text":"Send a Private Message (PM)","path":"A@/href","url":"http://www.newgrounds.com/pm/send/00assclown"},{"text":"Add Friend","path":"A@/href","url":"/addfriend"},{"text":"All Stats >","path":"A@/href","url":"/stats"},{"text":"All Stats","path":"A@/href","url":"/stats"},{"text":"Send a Private Message (PM)","path":"A@/href","url":"http://www.newgrounds.com/pm/send/00assclown"},{"text":"View All","path":"A@/href","url":"/news/"},{"text":"This would be the greatest rock band in the world...","path":"A@/href","url":"http://00assclown.newgrounds.com/news/post/241931"},{"text":"00Assclown","path":"A@/href","url":"http://00assclown.newgrounds.com/"},{"text":"1 
comment","path":"A@/href","url":"http://00assclown.newgrounds.com/news/post/241931#comments"},{"text":"Facebook","target":"_blank","path":"A@/href","url":"http://www.facebook.com/sharer.php?u=http%3A%2F%2F00assclown.newgrounds.com%2Fnews%2Fpost%2F241931&t=This+would+be+the+greatest+rock+band+in+the+world..."},{"text":"Twitter","target":"_blank","path":"A@/href","url":"http://twitter.com/home?status=Check+out+http%3A%2F%2F00assclown.newgrounds.com%2Fnews%2Fpost%2F241931"},{"text":"Reddit","target":"_blank","path":"A@/href","url":"http://reddit.com/submit?url=http%3A%2F%2F00assclown.newgrounds.com%2Fnews%2Fpost%2F241931&title=This+would+be+the+greatest+rock+band+in+the+world..."},{"text":"View all Favorite Movies","path":"A@/href","url":"favorites/movies"},{"title":"","alt":"Salad Fingers","path":"IMG@/src","url":"http://picon.ngfiles.com/178000/flash_178546.gif"},{"text":"Salad Fingers Rated 4.2315 Stars Salad Fingers is a gentle creature Other","path":"A@/href","url":"http://www.newgrounds.com/portal/view/178546"},{"title":"","alt":"Salad Fingers Episode 2","path":"IMG@/src","url":"http://picon.ngfiles.com/181000/flash_181169.jpg"},{"text":"Salad Fingers Episode 2 Rated 4.2736 Stars Salad Fingers has some friends over for tea. Comed","path":"A@/href","url":"http://www.newgrounds.com/portal/view/181169"},{"title":"","alt":"Salad Fingers Episode 3","path":"IMG@/src","url":"http://picon.ngfiles.com/184000/flash_184511.jpg"},{"text":"Salad Fingers Episode 3 Rated 4.2551 Stars Salad Fingers Episode 3 "Nettles" Comedy","path":"A@/href","url":"http://www.newgrounds.com/portal/view/184511"},{"title":"","alt":"Salad Fingers Episode 4","path":"IMG@/src","url":"http://picon.ngfiles.com/187000/flash_187864.gif"},{"text":"Salad Fingers Episode 4 Rated 4.2665 Stars Episode 4: "Cage" Guess what.... 
Comedy","path":"A@/href","url":"http://www.newgrounds.com/portal/view/187864"},{"text":"View all Favorite Games","path":"A@/href","url":"favorites/games"},{"title":"","alt":"Portal Defenders","path":"IMG@/src","url":"http://picon.ngfiles.com/484000/flash_484931.gif"},{"text":"Portal Defenders Rated 4.4301 Stars Defend the NG portal through waves of SPAM Action - Fight","path":"A@/href","url":"http://www.newgrounds.com/portal/view/484931"},{"title":"","alt":"How to draw Salad Fingers","path":"IMG@/src","url":"http://picon.ngfiles.com/272000/flash_272117.jpeg"},{"text":"How to draw Salad Fingers Rated 2.7370 Stars Learn to draw Salad Fingers and the Gang Tutoria","path":"A@/href","url":"http://www.newgrounds.com/portal/view/272117"},{"text":"Subscribe to RSS Feed","path":"A@/href","url":"http://rss.ngfiles.com/users/2619000/00assclown/flash/"},{"title":"","alt":"Suicidal Advert (Maybe)","path":"IMG@/src","url":"http://img.ngfiles.com/defaults/icon-portal.gif"},{"text":"Suicidal Advert (Maybe) Rated Stars First flash so it's not up to much. 
About a man and his pho","path":"A@/href","url":"http://www.newgrounds.com/portal/view/467033"},{"text":"Games","path":"A@/href","url":"/ajax/footer_feature.php?footer_feature=games"},{"text":"Movies","path":"A@/href","url":"/ajax/footer_feature.php?footer_feature=movies"},{"text":"Audio","path":"A@/href","url":"/ajax/footer_feature.php?footer_feature=audio"},{"text":"Art","path":"A@/href","url":"/ajax/footer_feature.php?footer_feature=art"},{"text":"Channels","path":"A@/href","url":"/ajax/footer_feature.php?footer_feature=channels"},{"text":"Users","path":"A@/href","url":"/ajax/footer_feature.php?footer_feature=users"},{"text":"Previous Section","path":"A@/href","url":"/ajax/footer_feature.php?footer_feature=channels"},{"text":"ManaSakura First time (Paki-improve po)","path":"A@/href","url":"http://manasakura.newgrounds.com/news/post/927704"},{"text":"Viper Playing at a benefit thing for my aunt","path":"A@/href","url":"http://viper.newgrounds.com/news/post/927703"},{"text":"moonvamp I'm a waste of time and space","path":"A@/href","url":"http://moonvamp.newgrounds.com/news/post/927702"},{"text":"jacklehamster Oozie plays with himself (Dave's Dream 2) is released!","path":"A@/href","url":"http://jacklehamster.newgrounds.com/news/post/927699"},{"text":"Hydecka I have a goal now. 
I know what I want to make.","path":"A@/href","url":"http://hydecka.newgrounds.com/news/post/927698"},{"text":"Mekkatech Themes and other stuff","path":"A@/href","url":"http://mekkatech.newgrounds.com/news/post/927696"},{"text":"Next Section","path":"A@/href","url":"/ajax/footer_feature.php?footer_feature=games"},{"alt":"Become a Supporter!","path":"IMG@/src","url":"http://www.newgrounds.com/img/ads/supporter-foot.jpg"},{"path":"A@/href","url":"http://www.newgrounds.com/supporter"},{"text":"Privacy Policy","path":"A@/href","url":"http://www.newgrounds.com/wiki/help-information/privacy-policy"},{"text":"Terms of Use","path":"A@/href","url":"http://www.newgrounds.com/wiki/help-information/terms-of-use"},{"text":"Games","path":"A@/href","url":"http://www.newgrounds.com/games"},{"text":"Movies","path":"A@/href","url":"http://www.newgrounds.com/movies"},{"text":"Art","path":"A@/href","url":"http://www.newgrounds.com/art"},{"text":"Audio","path":"A@/href","url":"http://www.newgrounds.com/audio"},{"text":"Series","path":"A@/href","url":"http://www.newgrounds.com/collection/series"},{"text":"Collections","path":"A@/href","url":"http://www.newgrounds.com/collection"},{"text":"Game Judging","path":"A@/href","url":"http://www.newgrounds.com/games/under_judgment"},{"text":"Movie Judging","path":"A@/href","url":"http://www.newgrounds.com/movies/under_judgment"},{"text":"Classic Portal","path":"A@/href","url":"http://www.newgrounds.com/portal"},{"text":"Downloads","path":"A@/href","url":"http://www.newgrounds.com/downloads"},{"text":"Creator Resources","path":"A@/href","url":"http://www.newgrounds.com/wiki/creator-resources"},{"text":"Forums","path":"A@/href","url":"http://www.newgrounds.com/bbs"},{"text":"Calendar","path":"A@/href","url":"http://www.newgrounds.com/calendar"},{"text":"Artist News","path":"A@/href","url":"http://www.newgrounds.com/news/artists"},{"text":"Rankings","path":"A@/href","url":"http://www.newgrounds.com/rankings"},{"text":"NG 
Wiki","path":"A@/href","url":"http://www.newgrounds.com/wiki"},{"text":"About NG","path":"A@/href","url":"http://www.newgrounds.com/wiki/about-newgrounds"},{"text":"Support","path":"A@/href","url":"http://www.newgrounds.com/wiki/faq"},{"text":"The Staff","path":"A@/href","url":"http://www.newgrounds.com/wiki/about-newgrounds/staff"},{"text":"NG History","path":"A@/href","url":"http://www.newgrounds.com/wiki/about-newgrounds/history"},{"text":"RSS","path":"A@/href","url":"http://www.newgrounds.com/wiki/help-information/rss"}],"Head":{"Link":[{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://css.ngfiles.com/ng_publish.css?1427947206"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://css.ngfiles.com/iphone.css"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://css.ngfiles.com/print.css"},{"path":"LINK@/href","rel":"icon","type":"image/png","url":"http://www.newgrounds.com/img/icons/favicon.png"},{"path":"LINK@/href","rel":"shortcut 
icon","type":"image/vnd.microsoft.icon","url":"http://www.newgrounds.com/favicon.ico"},{"path":"LINK@/href","rel":"apple-touch-icon","url":"http://img.ngfiles.com/misc/newgrounds_webclip.png"}],"Scripts":[{"path":"SCRIPT@/src","type":"text/javascript","url":"http://ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://js.ngfiles.com/misc/postscribe.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://js.ngfiles.com/ng_publish.js?1427221156"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://js.ngfiles.com/prototype.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://js.ngfiles.com/css_browser_selector.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://js.ngfiles.com/global/passport.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://js.ngfiles.com/global/favorites.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://js.ngfiles.com/prototype.js"}],"Metas":[{"content":"text/html; charset=utf-8","http-equiv":"content-type"},{"content":"width=976","name":"viewport"},{"content":"http://www.newgrounds.com"},{"content":"http://img.ngfiles.com/misc/tank_logo_medium.gif"},{"content":"Newgrounds.com"}],"Title":"00Assclown"}},"Entity-Digest":"sha1:3XLT4X6EBBBAKGG67Z7M5LAIAWTPYXX5"}}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"8158","Header-Length":"10","Inflated-CRC":"948536856","Inflated-Length":"30250"},"Offset":"2455","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://00assclown.newgrounds.com/
+WARC-Date: 2015-04-18T03:18:39Z
+WARC-Record-ID: <urn:uuid:410d37b4-2155-479d-89de-eb979be39d7e>
+WARC-Refers-To: <urn:uuid:65825f3c-e5d4-454b-bf00-f05b8ac7ee1a>
+Content-Type: application/json
+Content-Length: 1051
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"378","Block-Digest":"sha1:X6NEMEQ57QVQVNE5DLFIOS3HUHG4FMCY","Actual-Content-Length":"20","WARC-Header-Metadata":{"WARC-Type":"metadata","WARC-Date":"2015-04-18T03:18:39Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"20","WARC-Record-ID":"<urn:uuid:65825f3c-e5d4-454b-bf00-f05b8ac7ee1a>","WARC-Target-URI":"http://00assclown.newgrounds.com/","WARC-Concurrent-To":"<urn:uuid:90f14ce3-a38b-4638-a076-ec88e16168f5>","Content-Type":"application/warc-fields"},"Payload-Metadata":{"Trailing-Slop-Length":"4","WARC-Metadata-Metadata":{"Trailing-Slop-Length":"0","Metadata-Records":[{"Name":"fetchTimeMs","Value":"242"}],"Actual-Content-Length":"20"},"Actual-Content-Type":"application/metadata-fields"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"295","Header-Length":"10","Inflated-CRC":"1000063254","Inflated-Length":"402"},"Offset":"10613","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://0x20.be/smw/index.php?title=Special:WhatLinksHere/Game_Night&hideredirs=1
+WARC-Date: 2015-04-18T03:28:35Z
+WARC-Record-ID: <urn:uuid:32654b0b-9a27-4326-b11a-1acaaa5fbdd5>
+WARC-Refers-To: <urn:uuid:deb789b8-987d-47c1-81c3-5ad590a4432a>
+Content-Type: application/json
+Content-Length: 1478
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"399","Block-Digest":"sha1:MLDIIMD3YUKFLXIRSXQMKB2VUND54K43","Actual-Content-Length":"311","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2015-04-18T03:28:35Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"311","WARC-Record-ID":"<urn:uuid:deb789b8-987d-47c1-81c3-5ad590a4432a>","WARC-Target-URI":"http://0x20.be/smw/index.php?title=Special:WhatLinksHere/Game_Night&hideredirs=1","WARC-IP-Address":"88.151.247.138","Content-Type":"application/http; msgtype=request"},"Payload-Metadata":{"Trailing-Slop-Length":"4","HTTP-Request-Metadata":{"Headers":{"Accept-Language":"en-us,en-gb,en;q=0.7,*;q=0.3","Host":"0x20.be","Accept-Encoding":"x-gzip, gzip, deflate","User-Agent":"CCBot/2.0 (http://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},"Headers-Length":"309","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Request-Message":{"Method":"GET","Version":"HTTP/1.0","Path":"/smw/index.php?title=Special:WhatLinksHere/Game_Night&hideredirs=1"},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"},"Actual-Content-Type":"application/http; msgtype=request"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"460","Header-Length":"10","Inflated-CRC":"-433525199","Inflated-Length":"714"},"Offset":"10908","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://0x20.be/smw/index.php?title=Special:WhatLinksHere/Game_Night&hideredirs=1
+WARC-Date: 2015-04-18T03:28:35Z
+WARC-Record-ID: <urn:uuid:1d99b532-c497-4c45-a62d-fa53422041fe>
+WARC-Refers-To: <urn:uuid:e75577ec-5e09-4e38-acd3-00a9693be7c8>
+Content-Type: application/json
+Content-Length: 8182
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"614","Block-Digest":"sha1:SLISO4Z6UE5BBII4ISWRTEXISMJ2BLC3","Actual-Content-Length":"17341","WARC-Header-Metadata":{"WARC-Type":"response","WARC-Truncated":"length","WARC-Date":"2015-04-18T03:28:35Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"17341","WARC-Record-ID":"<urn:uuid:e75577ec-5e09-4e38-acd3-00a9693be7c8>","WARC-Block-Digest":"sha1:SLISO4Z6UE5BBII4ISWRTEXISMJ2BLC3","WARC-Payload-Digest":"sha1:T7BEAGSIHQVH6QQHCQ6WU26AJ2HPPDCM","WARC-Target-URI":"http://0x20.be/smw/index.php?title=Special:WhatLinksHere/Game_Night&hideredirs=1","WARC-IP-Address":"88.151.247.138","WARC-Concurrent-To":"<urn:uuid:deb789b8-987d-47c1-81c3-5ad590a4432a>","Content-Type":"application/http; msgtype=response"},"Payload-Metadata":{"Trailing-Slop-Length":"4","Actual-Content-Type":"application/http; msgtype=response","HTTP-Response-Metadata":{"Headers":{"X-Frame-Options":"SAMEORIGIN","Content-language":"en","Vary":"Accept-Encoding,Cookie","Date":"Sat, 18 Apr 2015 03:23:33 GMT","Expires":"Thu, 01 Jan 1970 00:00:00 GMT","Content-Encoding":"gzip","Content-Type":"text/html; charset=UTF-8","Connection":"close","X-Powered-By":"PHP/5.3.10-1ubuntu3.17","Server":"Apache/2.2.22 (Ubuntu)","Cache-Control":"private, must-revalidate, max-age=0","X-Content-Type-Options":"nosniff"},"Headers-Length":"415","Entity-Length":"16926","Entity-Trailing-Slop-Bytes":"0","Response-Message":{"Status":"200","Version":"HTTP/1.1","Reason":"OK"},"HTML-Metadata":{"Links":[{"text":"Game Night","title":"Game Night","path":"A@/href","url":"/Game_Night"},{"text":"navigation","path":"A@/href","url":"#mw-navigation"},{"text":"search","path":"A@/href","url":"#p-search"},{"path":"FORM@/action","url":"/smw/index.php"},{"text":"Hide","title":"Special:WhatLinksHere/Game Night","path":"A@/href","url":"/smw/index.php?title=Special:WhatLinksHere/Game_Night&hideredirs=1&hidetrans=1"},{"text":"Hide","title":"Special:WhatLinksHere/Game 
Night","path":"A@/href","url":"/smw/index.php?title=Special:WhatLinksHere/Game_Night&hideredirs=1&hidelinks=1"},{"text":"Show","title":"Special:WhatLinksHere/Game Night","path":"A@/href","url":"/Special:WhatLinksHere/Game_Night"},{"text":"Game Night","title":"Game Night","path":"A@/href","url":"/Game_Night"},{"text":"http://0x20.be/Special:WhatLinksHere/Game_Night","path":"A@/href","url":"http://0x20.be/Special:WhatLinksHere/Game_Night"},{"text":"Create account","path":"A@/href","url":"/smw/index.php?title=Special:UserLogin&returnto=Special%3AWhatLinksHere%2FGame+Night&returntoquery=hideredirs%3D1&type=signup"},{"text":"Log in","title":"You are encouraged to log in; however, it is not mandatory [o]","path":"A@/href","url":"/smw/index.php?title=Special:UserLogin&returnto=Special%3AWhatLinksHere%2FGame+Night&returntoquery=hideredirs%3D1"},{"text":"Page","title":"View the content page [c]","path":"A@/href","url":"/Game_Night"},{"text":"Discussion","title":"Discussion about the content page [t]","path":"A@/href","url":"/smw/index.php?title=Talk:Game_Night&action=edit&redlink=1"},{"path":"A@/href","url":"#"},{"text":"Read","path":"A@/href","url":"/Game_Night"},{"text":"Edit","title":"You can edit this page. 
Please use the preview button before saving [e]","path":"A@/href","url":"/smw/index.php?title=Game_Night&action=edit"},{"text":"View history","title":"Past revisions of this page [h]","path":"A@/href","url":"/smw/index.php?title=Game_Night&action=history"},{"path":"A@/href","url":"#"},{"path":"FORM@/action","url":"/smw/index.php"},{"alt":"Search","path":"IMG@/src","url":"/smw/skins/vector/images/search-ltr.png?303"},{"title":"Visit the main page","path":"A@/href","url":"/Main_Page"},{"text":"Main page","title":"Visit the main page [z]","path":"A@/href","url":"/Main_Page"},{"text":"Recent changes","title":"A list of recent changes in the wiki [r]","path":"A@/href","url":"/Special:RecentChanges"},{"text":"Infrastructure","path":"A@/href","url":"/Infrastructure"},{"text":"Membership","path":"A@/href","url":"/Membership"},{"text":"FAQ","path":"A@/href","url":"/FAQ"},{"text":"Contact","path":"A@/href","url":"/Contact"},{"text":"Wanted","path":"A@/href","url":"/StuffWeNeed"},{"text":"Google+ page","path":"A@/href","url":"https://plus.google.com/111440983315819252895"},{"text":"G+ community","path":"A@/href","url":"https://plus.google.com/communities/109536490791438420196"},{"text":"Twitter","path":"A@/href","url":"https://twitter.com/hsghent"},{"text":"Facebook","path":"A@/href","url":"https://www.facebook.com/0x20.be"},{"text":"Tumblr","path":"A@/href","url":"http://blog.0x20.be/"},{"text":"Events","path":"A@/href","url":"/Form:Event"},{"text":"Meetings","path":"A@/href","url":"/Form:Meeting"},{"text":"Event images","path":"A@/href","url":"/Special:FormEdit/EventImage"},{"text":"Projects","path":"A@/href","url":"/Form:Project"},{"text":"Locations","path":"A@/href","url":"/Form:Location"},{"text":"People","path":"A@/href","url":"/Form:Person"},{"text":"Documentation","path":"A@/href","url":"/Documentation"},{"text":"List of spaces","path":"A@/href","url":"http://hackerspaces.org/wiki/List_of_Hacker_Spaces"},{"text":"Belgian 
spaces","path":"A@/href","url":"http://hackerspaces.be/"},{"text":"Special pages","title":"A list of all special pages [q]","path":"A@/href","url":"/Special:SpecialPages"},{"text":"Printable version","title":"Printable version of this page [p]","path":"A@/href","url":"/smw/index.php?title=Special:WhatLinksHere/Game_Night&hideredirs=1&printable=yes"},{"text":"Privacy policy","title":"Whitespace (Hackerspace Gent):Privacy policy","path":"A@/href","url":"/Whitespace_(Hackerspace_Gent):Privacy_policy"},{"text":"About Whitespace (Hackerspace Gent)","title":"Whitespace (Hackerspace Gent):About","path":"A@/href","url":"/Whitespace_(Hackerspace_Gent):About"},{"text":"Disclaimers","title":"Whitespace (Hackerspace Gent):General disclaimer","path":"A@/href","url":"/Whitespace_(Hackerspace_Gent):General_disclaimer"},{"alt":"Powered by MediaWiki","path":"IMG@/src","url":"/smw/skins/common/images/poweredby_mediawiki_88x31.png"},{"path":"A@/href","url":"//www.mediawiki.org/"},{"alt":"Powered by Semantic MediaWiki","path":"IMG@/src","url":"/smw/extensions/SemanticMediaWiki/includes/../resources/images/smw_button.png"},{"path":"A@/href","url":"https://www.semantic-mediawiki.org/wiki/Semantic_MediaWiki"}],"Head":{"Link":[{"path":"LINK@/href","rel":"shortcut 
icon","url":"/favicon.ico"},{"path":"LINK@/href","rel":"search","type":"application/opensearchdescription+xml","url":"/smw/opensearch_desc.php"},{"path":"LINK@/href","rel":"EditURI","type":"application/rsd+xml","url":"http://0x20.be/smw/api.php?action=rsd"},{"path":"LINK@/href","rel":"alternate","type":"application/atom+xml","url":"/smw/index.php?title=Special:RecentChanges&feed=atom"},{"path":"LINK@/href","rel":"stylesheet","url":"http://0x20.be/smw/load.php?debug=false&lang=en&modules=mediawiki.legacy.commonPrint%2Cshared%7Cskins.common.interface%7Cskins.vector.styles&only=styles&skin=vector&*"},{"path":"LINK@/href","rel":"stylesheet","url":"http://0x20.be/smw/load.php?debug=false&lang=en&modules=site&only=styles&skin=vector&*"}],"Scripts":[{"path":"SCRIPT@/src","url":"http://0x20.be/smw/load.php?debug=false&lang=en&modules=startup&only=scripts&skin=vector&*"},{"path":"SCRIPT@/src","url":"http://0x20.be/smw/load.php?debug=false&lang=en&modules=site&only=scripts&skin=vector&*"}],"Metas":[{"content":"IE=EDGE","http-equiv":"X-UA-Compatible"},{"content":"MediaWiki 1.23alpha","name":"generator"},{"content":"noindex,nofollow","name":"robots"},{"content":"","name":"ResourceLoaderDynamicStyles"}],"Title":"Pages that link to \"Game Night\" - Whitespace (Hackerspace Gent)"}},"Entity-Digest":"sha1:T7BEAGSIHQVH6QQHCQ6WU26AJ2HPPDCM"}}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"5813","Header-Length":"10","Inflated-CRC":"-1871682579","Inflated-Length":"17959"},"Offset":"11368","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://0x20.be/smw/index.php?title=Special:WhatLinksHere/Game_Night&hideredirs=1
+WARC-Date: 2015-04-18T03:28:35Z
+WARC-Record-ID: <urn:uuid:0e63fbc1-b974-4a70-a51d-300d310daaf6>
+WARC-Refers-To: <urn:uuid:fbf7bdc0-3a71-4ad5-82a4-15160900d320>
+Content-Type: application/json
+Content-Length: 1098
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"425","Block-Digest":"sha1:ZIBFO65UTA75T63SYHC6EJ5ZSCFDAKJ5","Actual-Content-Length":"20","WARC-Header-Metadata":{"WARC-Type":"metadata","WARC-Date":"2015-04-18T03:28:35Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"20","WARC-Record-ID":"<urn:uuid:fbf7bdc0-3a71-4ad5-82a4-15160900d320>","WARC-Target-URI":"http://0x20.be/smw/index.php?title=Special:WhatLinksHere/Game_Night&hideredirs=1","WARC-Concurrent-To":"<urn:uuid:e75577ec-5e09-4e38-acd3-00a9693be7c8>","Content-Type":"application/warc-fields"},"Payload-Metadata":{"Trailing-Slop-Length":"4","WARC-Metadata-Metadata":{"Trailing-Slop-Length":"0","Metadata-Records":[{"Name":"fetchTimeMs","Value":"417"}],"Actual-Content-Length":"20"},"Actual-Content-Type":"application/metadata-fields"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"332","Header-Length":"10","Inflated-CRC":"2117109200","Inflated-Length":"449"},"Offset":"17181","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://1000games.me/games/gametion/
+WARC-Date: 2015-04-18T03:21:15Z
+WARC-Record-ID: <urn:uuid:da432585-d488-49c1-b0c0-689bd9b4ec45>
+WARC-Refers-To: <urn:uuid:696c8ab2-0646-4f77-b363-7ed9c13e70e4>
+Content-Type: application/json
+Content-Length: 1387
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"353","Block-Digest":"sha1:XVZV6JG2UP3E7SJHWGUVNKLG53QHE664","Actual-Content-Length":"266","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2015-04-18T03:21:15Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"266","WARC-Record-ID":"<urn:uuid:696c8ab2-0646-4f77-b363-7ed9c13e70e4>","WARC-Target-URI":"http://1000games.me/games/gametion/","WARC-IP-Address":"104.18.55.184","Content-Type":"application/http; msgtype=request"},"Payload-Metadata":{"Trailing-Slop-Length":"4","HTTP-Request-Metadata":{"Headers":{"Accept-Language":"en-us,en-gb,en;q=0.7,*;q=0.3","Host":"1000games.me","Accept-Encoding":"x-gzip, gzip, deflate","User-Agent":"CCBot/2.0 (http://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},"Headers-Length":"264","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Request-Message":{"Method":"GET","Version":"HTTP/1.0","Path":"/games/gametion/"},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"},"Actual-Content-Type":"application/http; msgtype=request"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"413","Header-Length":"10","Inflated-CRC":"1487536538","Inflated-Length":"623"},"Offset":"17513","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://1000games.me/games/gametion/
+WARC-Date: 2015-04-18T03:21:15Z
+WARC-Record-ID: <urn:uuid:4577a099-5324-48a1-8538-942f34fa7d9f>
+WARC-Refers-To: <urn:uuid:3ace6a03-bbf2-4efa-9805-b1ce0232b4c6>
+Content-Type: application/json
+Content-Length: 36087
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"568","Block-Digest":"sha1:SSZDUZX5H5BP2W3XJPFKFE7G5YPRPAIR","Actual-Content-Length":"50282","WARC-Header-Metadata":{"WARC-Type":"response","WARC-Truncated":"length","WARC-Date":"2015-04-18T03:21:15Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"50282","WARC-Record-ID":"<urn:uuid:3ace6a03-bbf2-4efa-9805-b1ce0232b4c6>","WARC-Block-Digest":"sha1:SSZDUZX5H5BP2W3XJPFKFE7G5YPRPAIR","WARC-Payload-Digest":"sha1:ZW2OA5GZLKAGX7Y66P5VAPV5442FQSTH","WARC-Target-URI":"http://1000games.me/games/gametion/","WARC-IP-Address":"104.18.55.184","WARC-Concurrent-To":"<urn:uuid:696c8ab2-0646-4f77-b363-7ed9c13e70e4>","Content-Type":"application/http; msgtype=response"},"Payload-Metadata":{"Trailing-Slop-Length":"4","Actual-Content-Type":"application/http; msgtype=response","HTTP-Response-Metadata":{"Headers":{"Last-Modified":"Sun, 16 Mar 2014 12:28:22 GMT","X-Cacheable":"YES","Set-Cookie":"__cfduid=da2c077fa3e77d0f9964571f7aa5dbf531429327275; expires=Sun, 17-Apr-16 03:21:15 GMT; path=/; domain=.1000games.me; HttpOnly","Connection":"close","X-Cache":"MISS","Server":"cloudflare-nginx","X-Varnish":"967374763","Date":"Sat, 18 Apr 2015 03:21:15 GMT","Vary":"Accept-Encoding,Cookie","age":"0","Content-Encoding":"gzip","CF-RAY":"1d8d3d0d54250f21-IAD","Via":"1.1 varnish","Content-Type":"text/html; 
charset=UTF-8"},"Headers-Length":"495","Entity-Length":"49787","Entity-Trailing-Slop-Bytes":"0","Response-Message":{"Status":"200","Version":"HTTP/1.1","Reason":"OK"},"HTML-Metadata":{"Links":[{"path":"STYLE/#text","href":"http://1000games.me/wp-content/themes/1000games/style.css"},{"alt":"","path":"IMG@/src","url":"http://xboxc.com/www/delivery/avw.php?zoneid=31&n=76ca1bd"},{"target":"_blank","path":"A@/href","url":"http://xboxc.com/www/delivery/ck.php?n=76ca1bd"},{"alt":"","path":"IMG@/src","url":"http://xboxc.com/www/delivery/avw.php?zoneid=9&n=1bfc21c"},{"target":"_blank","path":"A@/href","url":"http://xboxc.com/www/delivery/ck.php?n=1bfc21c"},{"path":"FORM@/action","url":"http://1000games.me/search"},{"text":"1000 games","title":"1000 games","path":"A@/href","url":"http://1000games.me"},{"text":"Popular","title":"Popular games","path":"A@/href","url":"http://1000games.me/popular/"},{"text":"Featured","title":"Featured games","path":"A@/href","url":"http://1000games.me/featured/"},{"text":"New","title":"New games","path":"A@/href","url":"http://1000games.me/newgames/"},{"text":"Home","path":"A@/href","url":"http://1000games.me"},{"text":"Action","title":"View all posts filed under Action","path":"A@/href","url":"http://1000games.me/games/category/action/"},{"text":"Defense","title":"View all posts filed under Defense","path":"A@/href","url":"http://1000games.me/games/category/tower-defense/"},{"text":"Driving","title":"View all posts filed under Driving","path":"A@/href","url":"http://1000games.me/games/category/driving/"},{"text":"Fighting","title":"View all posts filed under Fighting","path":"A@/href","url":"http://1000games.me/games/category/fighting/"},{"text":"Puzzles","title":"View all posts filed under Puzzles","path":"A@/href","url":"http://1000games.me/games/category/puzzles/"},{"text":"Shooting","title":"View all posts filed under Shooting","path":"A@/href","url":"http://1000games.me/games/category/shooting/"},{"text":"Sports","title":"View all posts 
filed under Sports","path":"A@/href","url":"http://1000games.me/games/category/sports/"},{"text":"Strategy","title":"View all posts filed under Strategy","path":"A@/href","url":"http://1000games.me/games/category/strategy/"},{"text":"More","path":"A@/href","url":"http://1000games.me/allcategories/"},{"text":"Home","path":"A@/href","url":"http://1000games.me"},{"text":"gametion","path":"A@/href","url":""},{"text":"1000 gametion Games","path":"A@/href","url":"http://1000games.me/games/gametion/"},{"path":"A@/href","url":"/games/gametion/"},{"path":"A@/href","url":"/games/gametion/?lang=ar"},{"path":"A@/href","url":"/games/gametion/?lang=bg"},{"path":"A@/href","url":"/games/gametion/?lang=ca"},{"path":"A@/href","url":"/games/gametion/?lang=zh"},{"path":"A@/href","url":"/games/gametion/?lang=zh-tw"},{"path":"A@/href","url":"/games/gametion/?lang=cs"},{"path":"A@/href","url":"/games/gametion/?lang=da"},{"path":"A@/href","url":"/games/gametion/?lang=nl"},{"path":"A@/href","url":"/games/gametion/?lang=et"},{"path":"A@/href","url":"/games/gametion/?lang=fi"},{"path":"A@/href","url":"/games/gametion/?lang=fr"},{"path":"A@/href","url":"/games/gametion/?lang=de"},{"path":"A@/href","url":"/games/gametion/?lang=el"},{"path":"A@/href","url":"/games/gametion/?lang=ht"},{"path":"A@/href","url":"/games/gametion/?lang=he"},{"path":"A@/href","url":"/games/gametion/?lang=hi"},{"path":"A@/href","url":"/games/gametion/?lang=hu"},{"path":"A@/href","url":"/games/gametion/?lang=id"},{"path":"A@/href","url":"/games/gametion/?lang=it"},{"path":"A@/href","url":"/games/gametion/?lang=ja"},{"path":"A@/href","url":"/games/gametion/?lang=ko"},{"path":"A@/href","url":"/games/gametion/?lang=lv"},{"path":"A@/href","url":"/games/gametion/?lang=lt"},{"path":"A@/href","url":"/games/gametion/?lang=ms"},{"path":"A@/href","url":"/games/gametion/?lang=no"},{"path":"A@/href","url":"/games/gametion/?lang=pl"},{"path":"A@/href","url":"/games/gametion/?lang=pt"},{"path":"A@/href","url":"/games/gametion/?lang=ro
"},{"path":"A@/href","url":"/games/gametion/?lang=ru"},{"path":"A@/href","url":"/games/gametion/?lang=sk"},{"path":"A@/href","url":"/games/gametion/?lang=sl"},{"path":"A@/href","url":"/games/gametion/?lang=es"},{"path":"A@/href","url":"/games/gametion/?lang=sv"},{"path":"A@/href","url":"/games/gametion/?lang=th"},{"path":"A@/href","url":"/games/gametion/?lang=tr"},{"path":"A@/href","url":"/games/gametion/?lang=uk"},{"path":"A@/href","url":"/games/gametion/?lang=ur"},{"path":"A@/href","url":"/games/gametion/?lang=vi"},{"path":"A@/href","url":"/games/gametion/?lang=mw"},{"title":"Transposh - translation plugin for wordpress","alt":"Transposh - translation plugin for wordpress","path":"IMG@/src","url":"/wp-content/plugins/transposh-translation-filter-for-wordpress/img/tplogo.png"},{"path":"A@/href","url":"http://transposh.org/"},{"alt":"","path":"IMG@/src","url":"http://xboxc.com/www/delivery/avw.php?zoneid=6&n=4d8ba39"},{"target":"_blank","path":"A@/href","url":"http://xboxc.com/www/delivery/ck.php?n=4d8ba39"},{"alt":"Play FrogFly","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://1pd.org/thumbs/7867.png&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/frogfly/"},{"text":"FrogFly","path":"A@/href","url":"http://1000games.me/play/frogfly/"},{"title":"0 votes, average: 0.00 out of 5","alt":"0 votes, average: 0.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_off.gif"},{"title":"0 votes, average: 0.00 out of 5","alt":"0 votes, average: 0.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_off.gif"},{"title":"0 votes, average: 0.00 out of 5","alt":"0 votes, average: 0.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_off.gif"},{"title":"0 votes, average: 0.00 out of 5","alt":"0 votes, average: 
0.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_off.gif"},{"title":"0 votes, average: 0.00 out of 5","alt":"0 votes, average: 0.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_off.gif"},{"alt":"","path":"IMG@/src","url":"http://xboxc.com/www/delivery/avw.php?zoneid=3&n=ee44c81"},{"target":"_blank","path":"A@/href","url":"http://xboxc.com/www/delivery/ck.php?n=ee44c81"},{"alt":"Play FrogFly","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://1pd.org/thumbs/7867.png&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/frogfly/"},{"text":"FrogFly","path":"A@/href","url":"http://1000games.me/play/frogfly/"},{"title":"0 votes, average: 0.00 out of 5","alt":"0 votes, average: 0.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_off.gif"},{"title":"0 votes, average: 0.00 out of 5","alt":"0 votes, average: 0.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_off.gif"},{"title":"0 votes, average: 0.00 out of 5","alt":"0 votes, average: 0.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_off.gif"},{"title":"0 votes, average: 0.00 out of 5","alt":"0 votes, average: 0.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_off.gif"},{"title":"0 votes, average: 0.00 out of 5","alt":"0 votes, average: 0.00 out of 
5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_off.gif"},{"alt":"","path":"IMG@/src","url":"http://xboxc.com/www/delivery/avw.php?zoneid=4&n=e0aaa36"},{"target":"_blank","path":"A@/href","url":"http://xboxc.com/www/delivery/ck.php?n=e0aaa36"},{"alt":"Play FPA: World 2","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://1pd.org/thumbs/10637.png&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/fpa-world-2/"},{"text":"FPA: World 2","path":"A@/href","url":"http://1000games.me/play/fpa-world-2/"},{"title":"18 votes, average: 594.11 out of 5","alt":"18 votes, average: 594.11 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"18 votes, average: 594.11 out of 5","alt":"18 votes, average: 594.11 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"18 votes, average: 594.11 out of 5","alt":"18 votes, average: 594.11 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"18 votes, average: 594.11 out of 5","alt":"18 votes, average: 594.11 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"18 votes, average: 594.11 out of 5","alt":"18 votes, average: 594.11 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"alt":"Play Hero Fighter","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://1pd.org/thumbs/1369.png&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/hero-fighter/"},{"text":"Hero Fighter","path":"A@/href","url":"http://1000games.me/play/hero-fighter/"},{"title":"10 
votes, average: 5.00 out of 5","alt":"10 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"10 votes, average: 5.00 out of 5","alt":"10 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"10 votes, average: 5.00 out of 5","alt":"10 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"10 votes, average: 5.00 out of 5","alt":"10 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"10 votes, average: 5.00 out of 5","alt":"10 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"alt":"Play Comic Stars Fighting 3","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://gamerbullet.com/wp-content/gamesfeed/images/gf-comic-stars-fighting-3-icon-2.png&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/comic-stars-fighting-3/"},{"text":"Comic Stars Fighting 3","path":"A@/href","url":"http://1000games.me/play/comic-stars-fighting-3/"},{"title":"6 votes, average: 5.00 out of 5","alt":"6 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"6 votes, average: 5.00 out of 5","alt":"6 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"6 votes, average: 5.00 out of 5","alt":"6 votes, average: 5.00 out of 
5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"6 votes, average: 5.00 out of 5","alt":"6 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"6 votes, average: 5.00 out of 5","alt":"6 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"alt":"Play Sift Heads 4","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://1pd.org/thumbs/4338.png&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/sift-heads-4/"},{"text":"Sift Heads 4","path":"A@/href","url":"http://1000games.me/play/sift-heads-4/"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"alt":"Play Sim Taxi 
2","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://1pd.org/thumbs/11797.png&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/sim-taxi-2/"},{"text":"Sim Taxi 2","path":"A@/href","url":"http://1000games.me/play/sim-taxi-2/"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"alt":"Play Sara 2","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://1pd.org/games/Sara-2.png&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/sara-2/"},{"text":"Sara 2","path":"A@/href","url":"http://1000games.me/play/sara-2/"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 
5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"5 votes, average: 5.00 out of 5","alt":"5 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"alt":"Play Street Fighter II’ Champion Edition","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://1pd.org/thumbs/100.png&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/street-fighter-ii-champion-edition/"},{"text":"Street Fighter II’ Champion Edition","path":"A@/href","url":"http://1000games.me/play/street-fighter-ii-champion-edition/"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 
5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"alt":"Play Monkey Go Happy 4","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://www.gamerbullet.com/wp-content/uploads/2011/09/monkey_go_happy_4.gif&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/monkey-go-happy-4/"},{"text":"Monkey Go Happy 4","path":"A@/href","url":"http://1000games.me/play/monkey-go-happy-4/"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"alt":"Play Tribe Boy Vs Monsters","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://www.gamerbullet.com/wp-content/uploads/2011/10/tribeboy_vs_monsters.gif&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/tribe-boy-vs-monsters/"},{"text":"Tribe Boy Vs Monsters","path":"A@/href","url":"http://1000games.me/play/tribe-boy-vs-monsters/"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 
5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"alt":"Play Draw And Fly","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://1pd.org/games/Draw-and-Fly.jpg&w=100&h=100&q=20"},{"path":"A@/href","url":"http://1000games.me/play/draw-and-fly/"},{"text":"Draw And Fly","path":"A@/href","url":"http://1000games.me/play/draw-and-fly/"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 
5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"title":"4 votes, average: 5.00 out of 5","alt":"4 votes, average: 5.00 out of 5","path":"IMG@/src","url":"http://1000games.me/wp-content/plugins/wp-postratings/images/stars_crystal/rating_on.gif"},{"alt":"Play FrogFly","path":"IMG@/src","url":"http://1000games.me/wp-content/themes/1000games/thumbs/timthumb.php?src=http://1pd.org/thumbs/7867.png&w=56&h=56&q=20"},{"path":"A@/href","url":"http://1000games.me/play/frogfly/"},{"text":"FrogFly","path":"A@/href","url":"http://1000games.me/play/frogfly/"},{"text":"Privacy policy","path":"A@/href","url":"http://1000games.me/privacy/"},{"text":"Contact Us","path":"A@/href","url":"http://1000games.me/contact-us/"},{"text":"1000 car games","path":"A@/href","url":"http://1000games.me/games/car/"},{"text":"Ben 1000 games","path":"A@/href","url":"http://1000games.me/games/ben10/"},{"alt":"","path":"IMG@/src","url":"http://xboxc.com/www/delivery/avw.php?zoneid=7&n=9c7fd06"},{"target":"_blank","path":"A@/href","url":"http://xboxc.com/www/delivery/ck.php?n=9c7fd06"}],"Head":{"Link":[{"path":"LINK@/href","rel":"alternate","type":"application/rss+xml","url":"http://1000games.me/feed/"},{"path":"LINK@/href","rel":"alternate","type":"text/xml","url":"http://1000games.me/feed/rss/"},{"path":"LINK@/href","rel":"alternate","type":"application/atom+xml","url":"http://1000games.me/feed/atom/"},{"path":"LINK@/href","rel":"pingback","url":"http://1000games.me/xmlrpc.php"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2014/03/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/12/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/11/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/10/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/09/"},{"path":"LINK@/href","rel":"archives","url":"http://1000g
ames.me/play/2013/08/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/07/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/06/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/05/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/04/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/03/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/02/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2013/01/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/12/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/11/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/10/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/09/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/08/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/07/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/06/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/05/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/04/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/03/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/02/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2012/01/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2011/12/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2011/11/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2011/10/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2011/09/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2011/08/"},{"path":"LINK@/href","rel":"arch
ives","url":"http://1000games.me/play/2011/06/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2011/05/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2011/03/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2011/02/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2011/01/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/12/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/11/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/10/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/09/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/08/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/07/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/06/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/05/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/04/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/03/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/02/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2010/01/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/12/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/11/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/10/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/09/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/08/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/07/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/06/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/05/"},{"path"
:"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/04/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/03/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/02/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2009/01/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2008/12/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2008/11/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2008/10/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2008/09/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2008/08/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2008/07/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2008/06/"},{"path":"LINK@/href","rel":"archives","url":"http://1000games.me/play/2008/05/"},{"path":"LINK@/href","rel":"alternate","type":"application/rss+xml","url":"http://1000games.me/games/gametion/feed/"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://1000games.me/wp-content/plugins/jetpack/modules/subscriptions/subscriptions.css?ver=3.8.1"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://1000games.me/wp-content/plugins/wp-postratings/postratings-css.css?ver=1.63"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://1000games.me/wp-content/plugins/jetpack/modules/widgets/widgets.css?ver=20121003"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"//1000games.me/wp-content/plugins/transposh-translation-filter-for-wordpress/widgets/flags/tpw_flags_css.css?ver=0.9.5.1"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://1000games.me/wp-content/plugins/wp-pagenavi/pagenavi-css.css?ver=2.70"},{"path":"LINK@/href","rel":"EditURI","type":"application/rsd+xml","url":"http://1000games.me/xmlrpc.php?rsd"},{"path
":"LINK@/href","rel":"wlwmanifest","type":"application/wlwmanifest+xml","url":"http://1000games.me/wp-includes/wlwmanifest.xml"},{"path":"LINK@/href","rel":"canonical","url":"http://1000games.me/games/gametion/"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=ar"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=bg"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=ca"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=zh"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=zh-tw"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=cs"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=da"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=nl"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=et"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=fi"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=fr"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=de"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=el"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=ht"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=he"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=hi"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=hu"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=id"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=it"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=ja"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=ko"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=lv"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=lt"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=ms"},{"path":"LINK@/href","rel":"alte
rnate","url":"/games/gametion/?lang=no"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=pl"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=pt"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=ro"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=ru"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=sk"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=sl"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=es"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=sv"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=th"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=tr"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=uk"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=ur"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=vi"},{"path":"LINK@/href","rel":"alternate","url":"/games/gametion/?lang=mw"}],"Scripts":[{"path":"SCRIPT@/src","type":"text/javascript","url":"//ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://1000games.me/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://1000games.me/wp-content/themes/1000games/js/script_new2.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://xboxc.com/www/delivery/spcjs.php?id=1&target=_blank"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://apis.google.com/js/plusone.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://www.google.com/coop/cse/brand?form=cse-search-box&lang=en"},{"path":"SCRIPT@/src","url":"http://www.stumbleupon.com/hostedbadge.php?s=1"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://s0.wp.com/wp-content/js/devicepx-jetpack.js?ver=201411"},{"path":"SCRIPT@/src","type":"te
xt/javascript","url":"http://s.gravatar.com/js/gprofiles.js?ver=2014Maraa"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://1000games.me/wp-content/plugins/jetpack/modules/wpgroho.js?ver=3.8.1"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://stats.wordpress.com/e-201411.js"}],"Metas":[{"content":"text/html; charset=UTF-8","http-equiv":"Content-Type"},{"content":"global","name":"distribution"},{"content":"follow, all","name":"robots"},{"content":"en,es","name":"language"},{"content":"1000 games","name":"hover"},{"content":"hello@1000games.me","name":"email"},{"content":"requiresActiveX=true","http-equiv":"X-UA-Compatible"},{"content":"WordPress 3.8.1","name":"generator"},{"content":"index,follow,noodp,noydir","name":"robots"},{"content":"1000 games"},{"content":"128010180569309"},{"content":"en_US"},{"content":"1000 games"},{"content":"Gametion | 1000 games"},{"content":"over 1000 games to play."},{"content":"website"},{"content":"http://1000games.me/games/gametion/"},{"content":"en_us"}],"Title":"Gametion | 1000 games"}},"Entity-Digest":"sha1:ZW2OA5GZLKAGX7Y66P5VAPV5442FQSTH"}}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"9531","Header-Length":"10","Inflated-CRC":"-1740978128","Inflated-Length":"50854"},"Offset":"17926","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://1000games.me/games/gametion/
+WARC-Date: 2015-04-18T03:21:15Z
+WARC-Record-ID: <urn:uuid:a02f23b9-a646-4de2-b948-d3191772cda0>
+WARC-Refers-To: <urn:uuid:1a2a5eb8-1e34-414c-99b8-340dd293b1d0>
+Content-Type: application/json
+Content-Length: 1053
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"380","Block-Digest":"sha1:WYHNJMPIE2MQCLNSM3HI4XSUM357AGXN","Actual-Content-Length":"20","WARC-Header-Metadata":{"WARC-Type":"metadata","WARC-Date":"2015-04-18T03:21:15Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"20","WARC-Record-ID":"<urn:uuid:1a2a5eb8-1e34-414c-99b8-340dd293b1d0>","WARC-Target-URI":"http://1000games.me/games/gametion/","WARC-Concurrent-To":"<urn:uuid:3ace6a03-bbf2-4efa-9805-b1ce0232b4c6>","Content-Type":"application/warc-fields"},"Payload-Metadata":{"Trailing-Slop-Length":"4","WARC-Metadata-Metadata":{"Trailing-Slop-Length":"0","Metadata-Records":[{"Name":"fetchTimeMs","Value":"170"}],"Actual-Content-Length":"20"},"Actual-Content-Type":"application/metadata-fields"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"289","Header-Length":"10","Inflated-CRC":"1013923283","Inflated-Length":"404"},"Offset":"27457","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://100zone.blogspot.com/2013/03/please-memp3-4shared.html
+WARC-Date: 2015-04-18T03:20:36Z
+WARC-Record-ID: <urn:uuid:e0d4817f-cf72-497f-9c35-85ee53e73a9d>
+WARC-Refers-To: <urn:uuid:177f1a0d-a84f-4aac-a4c2-d34562e2526d>
+Content-Type: application/json
+Content-Length: 1440
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"380","Block-Digest":"sha1:UQD7V3VJND4YRUWV55KC5QMYRJULIGAU","Actual-Content-Length":"292","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2015-04-18T03:20:36Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"292","WARC-Record-ID":"<urn:uuid:177f1a0d-a84f-4aac-a4c2-d34562e2526d>","WARC-Target-URI":"http://100zone.blogspot.com/2013/03/please-memp3-4shared.html","WARC-IP-Address":"216.58.217.129","Content-Type":"application/http; msgtype=request"},"Payload-Metadata":{"Trailing-Slop-Length":"4","HTTP-Request-Metadata":{"Headers":{"Accept-Language":"en-us,en-gb,en;q=0.7,*;q=0.3","Host":"100zone.blogspot.com","Accept-Encoding":"x-gzip, gzip, deflate","User-Agent":"CCBot/2.0 (http://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},"Headers-Length":"290","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Request-Message":{"Method":"GET","Version":"HTTP/1.0","Path":"/2013/03/please-memp3-4shared.html"},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"},"Actual-Content-Type":"application/http; msgtype=request"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"440","Header-Length":"10","Inflated-CRC":"-322512595","Inflated-Length":"676"},"Offset":"27746","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://100zone.blogspot.com/2013/03/please-memp3-4shared.html
+WARC-Date: 2015-04-18T03:20:36Z
+WARC-Record-ID: <urn:uuid:ae89a176-2a4d-495b-8f61-757a9b9664b0>
+WARC-Refers-To: <urn:uuid:6a2d0bfa-c6eb-49b4-99dd-e1708960e2a7>
+Content-Type: application/json
+Content-Length: 38629
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"596","Block-Digest":"sha1:JZXV6OKBRI44RMAEM6FB7LSKI2P6SDNI","Actual-Content-Length":"119155","WARC-Header-Metadata":{"WARC-Type":"response","WARC-Truncated":"length","WARC-Date":"2015-04-18T03:20:36Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"119155","WARC-Record-ID":"<urn:uuid:6a2d0bfa-c6eb-49b4-99dd-e1708960e2a7>","WARC-Block-Digest":"sha1:JZXV6OKBRI44RMAEM6FB7LSKI2P6SDNI","WARC-Payload-Digest":"sha1:B7ZYUGWISVE3PJLCKTJ3Y6XLJ4PDGXP3","WARC-Target-URI":"http://100zone.blogspot.com/2013/03/please-memp3-4shared.html","WARC-IP-Address":"216.58.217.129","WARC-Concurrent-To":"<urn:uuid:177f1a0d-a84f-4aac-a4c2-d34562e2526d>","Content-Type":"application/http; msgtype=response"},"Payload-Metadata":{"Trailing-Slop-Length":"4","Actual-Content-Type":"application/http; msgtype=response","HTTP-Response-Metadata":{"Headers":{"Vary":"Accept-Encoding","Date":"Sat, 18 Apr 2015 03:20:36 GMT","X-XSS-Protection":"1; mode=block","Expires":"Sat, 18 Apr 2015 03:20:36 GMT","Alternate-Protocol":"80:quic,p=1","Last-Modified":"Sun, 05 Oct 2014 04:40:47 GMT","Accept-Ranges":"none","Content-Type":"text/html; charset=UTF-8","Server":"GSE","X-Content-Type-Options":"nosniff","Cache-Control":"private, 
max-age=0"},"Headers-Length":"373","Entity-Length":"118782","Entity-Trailing-Slop-Bytes":"0","Response-Message":{"Status":"200","Version":"HTTP/1.0","Reason":"OK"},"HTML-Metadata":{"Links":[{"path":"STYLE/#text","href":"http://1.bp.blogspot.com/-gSuyWawcDQI/T_8E2MHTq4I/AAAAAAAAHh8/28wngpY_1VE/s1600/body1.jpg"},{"path":"STYLE/#text","href":"http://1.bp.blogspot.com/-GjygTCuQHIo/UAALHd3yfMI/AAAAAAAAHkU/JL9YDC5C7i8/s1600/header+maskolis.png"},{"path":"STYLE/#text","href":"http://2.bp.blogspot.com/-B1EJ_8T064o/UAAJaabsGXI/AAAAAAAAHkM/ESPCL1q_zDs/s1600/toppic.png"},{"path":"STYLE/#text","href":"http://4.bp.blogspot.com/-nYOCVDgfjs0/T__7tfORxaI/AAAAAAAAHjg/34w55pkOsRI/s1600/nasi+goreng.png"},{"path":"STYLE/#text","href":"http://3.bp.blogspot.com/-h2VKwi0QJ8o/UABDnt7g2NI/AAAAAAAAHlE/ZONRawNK674/s1600/mas+goreng.png"},{"path":"STYLE/#text","href":"http://3.bp.blogspot.com/-nYU6zP4yUeQ/T_8Nq6peTkI/AAAAAAAAHiY/7hIWjzI7fU0/s1600/newsf.png"},{"path":"STYLE/#text","href":"http://3.bp.blogspot.com/-UxcaQB5vgbE/UABZdfOoaYI/AAAAAAAAHmA/ak31LT2Mrmw/s1600/anonym+maskolis.png"},{"path":"STYLE/#text","href":"http://4.bp.blogspot.com/-jWA-qoWbav0/T__pNZq6fSI/AAAAAAAAHiw/MJW57BCArbE/s1600/sidebar.png"},{"path":"STYLE/#text","href":"http://2.bp.blogspot.com/-Pra74ieAyu0/T_7zUVLremI/AAAAAAAAHg8/ojy_B2rUVRs/s1600/split.png"},{"path":"STYLE/#text","href":"http://2.bp.blogspot.com/-7BeF7FZiHo0/T6vZzVSzTzI/AAAAAAAABJs/FlrWN7ZRxmk/s1600/drid.gif"},{"path":"STYLE/#text","href":"http://4.bp.blogspot.com/-bL05vrENgGI/T_8GYosuCCI/AAAAAAAAHiE/kHkfWzgszgY/s1600/outer.png"},{"path":"STYLE/#text","href":"http://2.bp.blogspot.com/-3m9ZaGhFRqM/T-OcKK0I31I/AAAAAAAAG-U/0U2O4f5HYIw/s1600/video.png"},{"path":"STYLE/#text","href":"http://4.bp.blogspot.com/-tk5hQcNMq6M/T8zPEwjH-RI/AAAAAAAAGm0/t8xkrJitkxg/s1600/batas.gif"},{"path":"STYLE/#text","href":"http://4.bp.blogspot.com/-Bt0JYGRHfpk/T7ZpN5RNSQI/AAAAAAAAGJQ/zQtrWVZwgHA/s1600/bullet.png"},{"path":"STYLE/#text","href":"http://4.bp.blogspot.com/-nYOCVDgfjs0/
T__7tfORxaI/AAAAAAAAHjg/34w55pkOsRI/s1600/nasi+goreng.png"},{"path":"STYLE/#text","href":"http://2.bp.blogspot.com/-luBF-v-UnRI/T_7Cyy0FBeI/AAAAAAAAHfc/ehPm3zH9tJQ/s1600/slider-bg.png"},{"path":"STYLE/#text","href":"http://4.bp.blogspot.com/-cCewNgDnUGs/T_8Dxak-MTI/AAAAAAAAHh0/od90YXiAMwc/s1600/previous.png"},{"path":"STYLE/#text","href":"http://2.bp.blogspot.com/-1O5rFDTbamk/T_8DwrGUnyI/AAAAAAAAHhs/GXyMGqtNYoA/s1600/next.png"},{"path":"STYLE/#text","href":"http://3.bp.blogspot.com/-isErZdr-e3w/T_7Lgjhr5OI/AAAAAAAAHf0/tjJYvmFYxS8/s1600/sliderbg.png"},{"path":"STYLE/#text","href":"http://1.bp.blogspot.com/-ZGWuQXg-PUM/T__vVbeakhI/AAAAAAAAHjI/N1aq06no9KU/s1600/horiz-menu-bg.png"},{"path":"STYLE/#text","href":"http://1.bp.blogspot.com/-ZGWuQXg-PUM/T__vVbeakhI/AAAAAAAAHjI/N1aq06no9KU/s1600/horiz-menu-bg.png"},{"path":"STYLE/#text","href":"http://4.bp.blogspot.com/-P_-9gmcXjRs/UAABz0ySIGI/AAAAAAAAHkA/QvinOFBRrqA/s1600/mas+goreng.png"},{"path":"STYLE/#text","href":"http://3.bp.blogspot.com/-h2VKwi0QJ8o/UABDnt7g2NI/AAAAAAAAHlE/ZONRawNK674/s1600/mas+goreng.png"},{"path":"STYLE/#text","href":"http://1.bp.blogspot.com/-jxemfWZZ39E/T_65sv622uI/AAAAAAAAHfQ/vaP9YXA0P5w/s1600/main-shadow.png"},{"path":"STYLE/#text","href":"http://1.bp.blogspot.com/-Tg7BikflCm8/T4_x9T1aflI/AAAAAAAAAoo/jV53zVYzVDY/s1600/social+letter.png"},{"title":"Subscribe via RSS","target":"_blank","path":"A@/href","url":"#"},{"title":"Become a Fan","target":"_blank","path":"A@/href","url":"#"},{"title":"Follow Us","target":"_blank","path":"A@/href","url":"#"},{"title":"Google Plus Profile","target":"_blank","path":"A@/href","url":"#"},{"title":"Watch The Video","target":"_blank","path":"A@/href","url":"#"},{"text":"100zone","path":"A@/href","url":"http://100zone.blogspot.com/"},{"text":"Home","path":"A@/href","url":"/"},{"text":"Drop menu","path":"A@/href","url":"#"},{"text":"Menu 1","path":"A@/href","url":"#"},{"text":"Menu 2","path":"A@/href","url":"#"},{"text":"Menu 
3","path":"A@/href","url":"#"},{"text":"Menu 4","path":"A@/href","url":"#"},{"text":"Menu 5","path":"A@/href","url":"#"},{"text":"Drop menu 2","path":"A@/href","url":"#"},{"text":"Menu 1","path":"A@/href","url":"#"},{"text":"Menu 2","path":"A@/href","url":"#"},{"text":"Menu 3","path":"A@/href","url":"#"},{"text":"Menu 3.1","path":"A@/href","url":"#"},{"text":"Menu 3.2","path":"A@/href","url":"#"},{"text":"Menu 3.3","path":"A@/href","url":"#"},{"text":"Menu 3.4","path":"A@/href","url":"#"},{"text":"Menu 4","path":"A@/href","url":"#"},{"text":"Menu 5","path":"A@/href","url":"#"},{"text":"Drop menu 3","path":"A@/href","url":"#"},{"text":"Menu 1","path":"A@/href","url":"#"},{"text":"Menu 2","path":"A@/href","url":"#"},{"text":"Menu 3","path":"A@/href","url":"#"},{"text":"Menu 4","path":"A@/href","url":"#"},{"text":"Menu 5","path":"A@/href","url":"#"},{"text":"Instruction to use","path":"A@/href","url":"http://creatingwebsite-maskolis.blogspot.com/2012/07/johny-joss-banget-template-joss-untuk.html"},{"target":"_blank","path":"FORM@/action","method":"get","url":"http://www.google.com/search"},{"path":"INPUT@/src","url":"http://2.bp.blogspot.com/-BmpnVNvHgs0/T94HNdsyt8I/AAAAAAAAG50/Jn2uw4OJOUk/s1600/search_btn.png"},{"text":"skip to main","path":"A@/href","url":"#main"},{"text":"skip to sidebar","path":"A@/href","url":"#sidebar"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/please-memp3-4shared.html"},{"path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 4shared","path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 
4shared","path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"path":"IMG@/src","url":"http://4.bp.blogspot.com/--7bZ84tpPkw/UTiyJdrxaKI/AAAAAAAAEbE/0U4JK0BQgLU/s200/%E0%B9%80%E0%B8%95%E0%B9%8B%E0%B8%B2+%E0%B9%80%E0%B8%A8%E0%B8%A3%E0%B8%A9%E0%B8%90%E0%B8%9E%E0%B8%87%E0%B8%A8%E0%B9%8C+%E0%B9%80%E0%B8%9E%E0%B8%B5%E0%B8%A2%E0%B8%87%E0%B8%9E%E0%B8%AD+-+%E0%B8%AD%E0%B8%A2%E0%B9%88%E0%B8%B2%E0%B8%97%E0%B8%B3%E0%B9%83%E0%B8%AB%E0%B9%89%E0%B8%A3%E0%B8%B1%E0%B8%81+(Please+me).mp3+4shared.jpeg"},{"path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"path":"A@/href","url":"http://3.bp.blogspot.com/-mygVRa1B4XA/TzcdSaF0L_I/AAAAAAAAANE/cpb41SgGnIc/s1600/%E0%B9%81%E0%B8%88%E0%B8%81%E0%B9%80%E0%B8%9E%E0%B8%A5%E0%B8%87++%E0%B9%80%E0%B8%9E%E0%B8%A3%E0%B8%B2%E0%B8%B0%E0%B8%A7%E0%B9%88%E0%B8%B2...%E0%B8%A3%E0%B8%B1%E0%B8%81+%28Ost.%E0%B8%A3%E0%B8%B1%E0%B8%81+An+Ordinary+Love+Story%29+-+Johnnie+Runner+4shared.jpg"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 4shared","path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"text":"---Download Click---","path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"path":"IMG@/src","url":"http://1.bp.blogspot.com/-8Gqvqp33o_A/TzOH5vO5u3I/AAAAAAAAALc/OzjfhzuHEu0/s640/%E0%B8%9B%E0%B9%89%E0%B8%B2%E0%B8%A2%E0%B8%94%E0%B8%B2%E0%B8%A7%E0%B8%99%E0%B9%8C%E0%B9%82%E0%B8%AB%E0%B8%A5%E0%B8%94555.jpg"},{"path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"text":"Download Now","path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 4shared","path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 4shared","path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 4shared","path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก 
(Please me).mp3 4shared","path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 4shared","path":"A@/href","url":"http://1b841c5a.linkbucks.com/"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/search/label/%E0%B9%80%E0%B8%95%E0%B9%8B%E0%B8%B2%20%E0%B9%80%E0%B8%A8%E0%B8%A3%E0%B8%A9%E0%B8%90%E0%B8%9E%E0%B8%87%E0%B8%A8%E0%B9%8C%20%E0%B9%80%E0%B8%9E%E0%B8%B5%E0%B8%A2%E0%B8%87%E0%B8%9E%E0%B8%AD%20-%20%E0%B8%AD%E0%B8%A2%E0%B9%88%E0%B8%B2%E0%B8%97%E0%B8%B3%E0%B9%83%E0%B8%AB%E0%B9%89%E0%B8%A3%E0%B8%B1%E0%B8%81%20%28Please%20me%29.mp3%204shared"},{"path":"A@/href","url":"https://www.blogger.com/comment-iframe.g?blogID=5508487274257568142&postID=4500939284242849003"},{"path":"IFRAME@/src","url":""},{"text":"« Prev Movie","title":"Previous Product","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/etcmp3-4shared.html"},{"text":"Next Movie »","title":"Next Product","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/how-long-38-years-agomp3-4shared.html"},{"text":"Home","path":"A@/href","url":"http://100zone.blogspot.com/"},{"alt":"","path":"IMG@/src","url":"http://1.bp.blogspot.com/-N2yx1at1tMc/UeGZEst7foI/AAAAAAAAFM0/oXdOUtpBdoQ/s72-c/Fah+Demo+Project+-+%E0%B9%81%E0%B8%9F%E0%B8%99%E0%B9%80%E0%B8%81%E0%B9%88%E0%B8%B2%E0%B8%81%E0%B9%87%E0%B9%80%E0%B8%AB%E0%B8%87%E0%B8%B2%E0%B9%80%E0%B8%9B%E0%B9%87%E0%B8%99.mp3+4shared.jpg"},{"target":"_blank","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/fah-demo-project-mp3-4shared.html"},{"text":"Fah Demo Project - แฟนเก่าก็เหงาเป็น.mp3 
4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/fah-demo-project-mp3-4shared.html"},{"alt":"","path":"IMG@/src","url":"http://4.bp.blogspot.com/-kaJUHVqK2-w/UdKVJYxykuI/AAAAAAAAFG4/kv7QnVtQfzw/s72-c/%E0%B8%9F%E0%B8%B1%E0%B8%87%E0%B9%80%E0%B8%82%E0%B9%89%E0%B8%B2%E0%B9%83%E0%B8%88%E0%B9%81%E0%B8%95%E0%B9%88%E0%B9%84%E0%B8%A1%E0%B9%88%E0%B8%A3%E0%B8%B9%E0%B9%89%E0%B8%AA%E0%B8%B6%E0%B8%81+-+7+DAYS+CRAZY.mp3+4shared.jpg"},{"target":"_blank","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/7-days-crazymp3-4shared.html"},{"text":"ฟังเข้าใจแต่ไม่รู้สึก - 7 DAYS CRAZY.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/7-days-crazymp3-4shared.html"},{"alt":"","path":"IMG@/src","url":"http://1.bp.blogspot.com/-7qZ6ewYTplI/Ud2ABZ5RBLI/AAAAAAAAFKE/-1qKuzTOti0/s72-c/%E0%B8%97%E0%B8%B0%E0%B9%80%E0%B8%A5%E0%B8%A5%E0%B8%B4%E0%B8%9F%E0%B8%97%E0%B9%8C+-+%E0%B8%A7%E0%B8%B1%E0%B8%8A%E0%B8%A3%E0%B8%B2%E0%B8%A7%E0%B8%A5%E0%B8%B5.mp3+4shared.jpg"},{"target":"_blank","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/mp3-4shared_10.html"},{"text":"ทะเลลิฟท์ - วัชราวลี.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/mp3-4shared_10.html"},{"alt":"","path":"IMG@/src","url":"http://2.bp.blogspot.com/-Oa4-2ggytyw/Ua6pAHcN0LI/AAAAAAAAE3Y/b93JxB_gpVo/s72-c/%E0%B9%80%E0%B8%A3%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B8%87%E0%B8%88%E0%B8%A3%E0%B8%B4%E0%B8%87%E0%B9%80%E0%B8%A3%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B8%87%E0%B8%AA%E0%B8%B8%E0%B8%94%E0%B8%97%E0%B9%89%E0%B8%B2%E0%B8%A2+-+%E0%B9%82%E0%B8%94%E0%B8%A1+%E0%B8%88%E0%B8%B2%E0%B8%A3%E0%B8%B8%E0%B8%A7%E0%B8%B1%E0%B8%92%E0%B8%99%E0%B9%8C.mp3+4shared.jpg"},{"target":"_blank","path":"A@/href","url":"http://100zone.blogspot.com/2013/06/mp3-4shared_4.html"},{"text":"เรื่องจริงเรื่องสุดท้าย - โดม 
จารุวัฒน์.","path":"A@/href","url":"http://100zone.blogspot.com/2013/06/mp3-4shared_4.html"},{"alt":"","path":"IMG@/src","url":"http://3.bp.blogspot.com/-OHIez-FX39E/UeggLm5CwMI/AAAAAAAAFPM/3NolsZXul_Q/s72-c/%E0%B8%A1%E0%B8%99%E0%B8%B8%E0%B8%A9%E0%B8%A2%E0%B9%8C%E0%B8%A5%E0%B9%88%E0%B8%AD%E0%B8%87%E0%B8%AB%E0%B8%99+-+Yes%27sir+days.mp3+4shared.jpg"},{"target":"_blank","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/yessir-daysmp3-4shared.html"},{"text":"มนุษย์ล่องหน - Yes'sir days.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/yessir-daysmp3-4shared.html"},{"alt":"","path":"IMG@/src","url":"http://1.bp.blogspot.com/-lxOkPz7DX_A/Ud1tNWKKGUI/AAAAAAAAFJk/on7w0fUYVNo/s72-c/%E0%B9%80%E0%B8%AB%E0%B8%99%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B8%A2%E0%B9%84%E0%B8%AB%E0%B8%A1%E0%B8%AB%E0%B8%B1%E0%B8%A7%E0%B9%83%E0%B8%88+-+Retrospect.mp3+4shared.jpg"},{"target":"_blank","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/retrospectmp3-4shared.html"},{"text":"เหนื่อยไหมหัวใจ - Retrospect.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/retrospectmp3-4shared.html"},{"alt":"","path":"IMG@/src","url":"http://4.bp.blogspot.com/-Veq2bbeo-58/UdKSHFovjUI/AAAAAAAAFGQ/5AxACeYuzSk/s72-c/%E0%B9%83%E0%B8%88%E0%B9%80%E0%B8%AD%E0%B8%A2+-+%E0%B8%AA%E0%B8%99+%E0%B8%A2%E0%B8%B8%E0%B8%81%E0%B8%95%E0%B9%8C+%E0%B9%80%E0%B8%9E%E0%B8%A5%E0%B8%87%E0%B8%9B%E0%B8%A3%E0%B8%B0%E0%B8%81%E0%B8%AD%E0%B8%9A%E0%B8%A5%E0%B8%B0%E0%B8%84%E0%B8%A3+%E0%B9%81%E0%B8%84%E0%B9%89%E0%B8%99%E0%B9%80%E0%B8%AA%E0%B8%99%E0%B9%88%E0%B8%AB%E0%B8%B2.mp3+4shared.jpg"},{"target":"_blank","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/mp3-4shared_6841.html"},{"text":"ใจเอย - สน ยุกต์ เพลงประกอบละคร แค้นเสน่หา.mp3 
4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/mp3-4shared_6841.html"},{"alt":"","path":"IMG@/src","url":"http://4.bp.blogspot.com/-3o3flJnEtRI/UbHQaeZWRHI/AAAAAAAAE6o/57GKJUf1KlU/s72-c/%E0%B8%99%E0%B8%B2%E0%B8%99%E0%B9%80%E0%B8%97%E0%B9%88%E0%B8%B2%E0%B8%99%E0%B8%B2%E0%B8%99+%28%E0%B9%80%E0%B8%9E%E0%B8%A5%E0%B8%87%E0%B8%9B%E0%B8%A3%E0%B8%B0%E0%B8%81%E0%B8%AD%E0%B8%9A%E0%B8%A5%E0%B8%B0%E0%B8%84%E0%B8%A3+%E0%B8%AB%E0%B8%B1%E0%B8%A7%E0%B9%83%E0%B8%88%E0%B9%80%E0%B8%A3%E0%B8%B7%E0%B8%AD%E0%B8%9E%E0%B9%88%E0%B8%A7%E0%B8%87%29+-+%E0%B8%AB%E0%B8%99%E0%B8%B8%E0%B9%88%E0%B8%A1+KALA.mp3+4shared.jpg"},{"target":"_blank","path":"A@/href","url":"http://100zone.blogspot.com/2013/06/kalamp3-4shared.html"},{"text":"นานเท่านาน (เพลงประกอบละคร หัวใจเรือพ่วง) - หนุ่ม KALA.mp3 4shar","path":"A@/href","url":"http://100zone.blogspot.com/2013/06/kalamp3-4shared.html"},{"alt":"","path":"IMG@/src","url":"http://4.bp.blogspot.com/-ikU6wNsjky0/Ud2Eb5sbzpI/AAAAAAAAFKU/a7BFjVkanEs/s72-c/%E0%B9%81%E0%B8%81%E0%B8%87%E0%B8%AA%E0%B9%89%E0%B8%A1+-+%E0%B8%84%E0%B8%B8%E0%B8%93%E0%B9%81%E0%B8%A5%E0%B8%B0%E0%B8%84%E0%B8%B8%E0%B8%93%E0%B9%80%E0%B8%97%E0%B9%88%E0%B8%B2%E0%B8%99%E0%B8%B1%E0%B9%89%E0%B8%99.mp3+4shared.jpg"},{"target":"_blank","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/mp3-4shared_6607.html"},{"text":"แกงส้ม - คุณและคุณเท่านั้น.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/mp3-4shared_6607.html"},{"alt":"","path":"IMG@/src","url":"http://2.bp.blogspot.com/-GKiybo5ctaw/UegiVT4uS4I/AAAAAAAAFPo/taYmRpr6pdg/s72-c/%E0%B9%84%E0%B8%A1%E0%B9%88%E0%B8%A1%E0%B8%B5%E0%B8%AD%E0%B8%B2%E0%B8%A3%E0%B8%A1%E0%B8%93%E0%B9%8C%E0%B8%AB%E0%B8%B2%E0%B8%A2%E0%B9%83%E0%B8%88+-+Dr.Fuu.jpg"},{"target":"_blank","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/drfuump3-4shared.html"},{"text":"ไม่มีอารมณ์หายใจ - Dr.Fuu.mp3 
4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/07/drfuump3-4shared.html"},{"alt":"","path":"IMG@/src","url":"http://img1.blogblog.com/img/icon18_wrench_allbkg.png"},{"title":"Edit","target":"configPopularPosts1","path":"A@/href","url":"//www.blogger.com/rearrange?blogID=5508487274257568142&widgetType=PopularPosts&widgetId=PopularPosts1&action=editWidget§ionId=sidebar"},{"text":"▼ ","path":"A@/href","url":"javascript:void(0)"},{"text":"2013","path":"A@/href","url":"http://100zone.blogspot.com/search?updated-min=2013-01-01T00:00:00-08:00&updated-max=2014-01-01T00:00:00-08:00&max-results=50"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"August","path":"A@/href","url":"http://100zone.blogspot.com/2013_08_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"July","path":"A@/href","url":"http://100zone.blogspot.com/2013_07_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"June","path":"A@/href","url":"http://100zone.blogspot.com/2013_06_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"May","path":"A@/href","url":"http://100zone.blogspot.com/2013_05_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"April","path":"A@/href","url":"http://100zone.blogspot.com/2013_04_01_archive.html"},{"text":"▼ ","path":"A@/href","url":"javascript:void(0)"},{"text":"March","path":"A@/href","url":"http://100zone.blogspot.com/2013_03_01_archive.html"},{"text":"คืนนี้อยากได้กี่ครั้ง (เพลงประกอบภาพยนตร์ Smal","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/small-rule-kala-mp3-4shared.html"},{"text":"Almost Love Mr.Lazy feat.P's Voice (Ploy Chava) - ...","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/almost-love-mrlazy-featp-voice-ploy.html"},{"text":"Love Call - T.C. Ft. 
วุ้น ภัทรดา จันทรางกรู.mp3 4s...","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/love-call-tc-ft-mp3-4shared.html"},{"text":"ขาหัก(Remix) - Coconut Sunday.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/remix-coconut-sundaymp3-4shared.html"},{"text":"หากฉัน - Under 18+.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/under-18mp3-4shared.html"},{"text":"เธอ เธอ เธอ Mr.Lazy feat.ที JETSET’ER - Mr.Lazy.mp...","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mrlazy-feat-jetseter-mrlazymp3-4shared.html"},{"text":"มะนาวลูกนั้น (เพลงประกอบภาพยนตร์ พี่มาก..พระโขนง) ...","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared.html"},{"text":"กามเทพ - แก้ม วิชญาณี.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_27.html"},{"text":"Natthew - She's Bad.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/natthew-she-badmp3-4shared.html"},{"text":"มาได้จังหวะ (In Time) - Timethai.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/in-time-timethaimp3-4shared.html"},{"text":"คนที่ฉันรักคือเธอ - Sweets.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/sweetsmp3-4shared.html"},{"text":"หัวใจไม่ให้ความร่วมมือ - PREEN (Ost.แผนร้ายพ่ายรัก..","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/preen-ostmp3-4shared.html"},{"text":"[ALBUM] Super Junior M - Break Down.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/album-super-junior-m-break-downmp3.html"},{"text":"เอก สุระเชษฐ์ - ที่รัก (เธอ).mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_22.html"},{"text":"ค้นหา - ปันปัน.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_1800.html"},{"text":"คิดถึง...ซึ้งป่ะ - ริท เรืองฤทธิ์.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_21.html"},{"text":"นาทีที่ไม่มีฉัน - 
ต้อล วันธงชัย.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_4869.html"},{"text":"ฉันดีใจที่มีเธอ (Ost.รักข้ามเส้น) - The Begins.mp3..","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/ost-beginsmp3-4shared.html"},{"text":"Hari (하리) - kwi­yomi song _ cut­ie song (귀요미송).mp3...","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/hari-kwiyomi-song-cutie-song-mp3-4shared.html"},{"text":"รอ - O-PAVEE.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/o-paveemp3-4shared.html"},{"text":"เท่าตัว - แอน ณัฎฐ์ณัชชา นําเจริญสมบัติ.mp3 4s","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_616.html"},{"text":"Paradox - ไม่มีเธอ (กล่องดวงใจ).mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/paradox-mp3-4shared.html"},{"text":"ผจญภัย - Mrs.Slave.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mrsslavemp3-4shared.html"},{"text":"Thaitanium Feat. Big Ron - Love for my city.mp3 4s...","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/thaitanium-feat-big-ron-love-for-my.html"},{"text":"แชมป์ ศุภวัฒน์ - ขอให้เธอผิดหวัง.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_20.html"},{"text":"Can You Hear Me - โดม ปกรณ์ ลัม.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/can-you-hear-me-mp3-4shared.html"},{"text":"will.i.am - #thatPOWER (feat. 
Justin Bieber).mp3 4...","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/william-thatpower-feat-justin-biebermp3.html"},{"text":"Jay Sean - Where You Are.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/jay-sean-where-you-aremp3-4shared.html"},{"text":"Lights & Owl City - Cactus In the Valley (Acoustic...","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/lights-owl-city-cactus-in-valley.html"},{"text":"แฟนคันแรก feat.The Richman Toy - FOUR-MOD","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/featthe-richman-toy-four-mod.html"},{"text":"สมเกียรติ - คนที่ยังไม่พร้อม.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_15.html"},{"text":"ฟิล์ม รัฐภูมิ - เหงา...เข้าใจ (Ost. แผนร้ายพ่ายร$","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/ost-mp3-4shared.html"},{"text":"ขอบคุณที่มารักกัน - กบ ช่างศิลป์.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_4026.html"},{"text":"เหตุผลใดถึงรักเธอ - BlackHead.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/blackheadmp3-4shared.html"},{"text":"C-Quint - ยังเป็นความลับ (Invisible).mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/c-quint-invisiblemp3-4shared.html"},{"text":"บ่องตง - คาวบอย.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_7947.html"},{"text":"วัชราวลี - ผมคือเวลา.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_3861.html"},{"text":"ฝาก - Etc.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/etcmp3-4shared.html"},{"text":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me)....","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/please-memp3-4shared.html"},{"text":"How Long - The 38 Years Ago.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/how-long-38-years-agomp3-4shared.html"},{"text":"จับที่หัวใจ(15 ปี 
ไบเทค)-ตู่ ภพธร, มิ้นท์","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/15-buddha-blessmp3-4shared.html"},{"text":"Jamezpat - ไกลแค่ไหนคือใกล้.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/jamezpat-mp3-4shared.html"},{"text":"คนด้านมืด - ไอเฟล Feat.ไทเทเนี่ยม.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/featmp3-4shared.html"},{"text":"TOR+ Saksit - กั้๊ก (Gugg).mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/tor-saksit-guggmp3-4shared.html"},{"text":"ความรักมีค่าเมื่อมีเธอ - Boyz Joyboy.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/boyz-joyboymp3-4shared.html"},{"text":"เธอเป็นเธอ - Lipta.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/liptamp3-4shared.html"},{"text":"เพื่อดาวดวงนั้น - The Star.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/starmp3-4shared.html"},{"text":"Painting - เตเต.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/painting-mp3-4shared.html"},{"text":"จะเป็นจะตาย - Sweet Mullet.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/sweet-mulletmp3-4shared.html"},{"text":"อยากให้เธอได้ยินหัวใจ feat. ฟิล์ม บงกช (Ost. 
พรพรห...","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/feat-ost-yes-days-mp3-4shared.html"},{"text":"ของตาย - อ๊อฟ ปองศักดิ์.mp3 4shared","path":"A@/href","url":"http://100zone.blogspot.com/2013/03/mp3-4shared_1.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"February","path":"A@/href","url":"http://100zone.blogspot.com/2013_02_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"January","path":"A@/href","url":"http://100zone.blogspot.com/2013_01_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"2012","path":"A@/href","url":"http://100zone.blogspot.com/search?updated-min=2012-01-01T00:00:00-08:00&updated-max=2013-01-01T00:00:00-08:00&max-results=50"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"December","path":"A@/href","url":"http://100zone.blogspot.com/2012_12_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"November","path":"A@/href","url":"http://100zone.blogspot.com/2012_11_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"October","path":"A@/href","url":"http://100zone.blogspot.com/2012_10_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"September","path":"A@/href","url":"http://100zone.blogspot.com/2012_09_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"August","path":"A@/href","url":"http://100zone.blogspot.com/2012_08_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"July","path":"A@/href","url":"http://100zone.blogspot.com/2012_07_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"June","path":"A@/href","url":"http://100zone.blogspot.com/2012_06_01_archive.html"},{"text":"► 
","path":"A@/href","url":"javascript:void(0)"},{"text":"May","path":"A@/href","url":"http://100zone.blogspot.com/2012_05_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"April","path":"A@/href","url":"http://100zone.blogspot.com/2012_04_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"March","path":"A@/href","url":"http://100zone.blogspot.com/2012_03_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"February","path":"A@/href","url":"http://100zone.blogspot.com/2012_02_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"January","path":"A@/href","url":"http://100zone.blogspot.com/2012_01_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"2011","path":"A@/href","url":"http://100zone.blogspot.com/search?updated-min=2011-01-01T00:00:00-08:00&updated-max=2012-01-01T00:00:00-08:00&max-results=50"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"November","path":"A@/href","url":"http://100zone.blogspot.com/2011_11_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"July","path":"A@/href","url":"http://100zone.blogspot.com/2011_07_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"March","path":"A@/href","url":"http://100zone.blogspot.com/2011_03_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"February","path":"A@/href","url":"http://100zone.blogspot.com/2011_02_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"January","path":"A@/href","url":"http://100zone.blogspot.com/2011_01_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"2010","path":"A@/href","url":"http://100zone.blogspot.com/search?updated-min=2010-01-01T00:00:00-08:00&updated-max=2011-01-01T00:00:00-08:00&max-results=50"},{"text":"► 
","path":"A@/href","url":"javascript:void(0)"},{"text":"December","path":"A@/href","url":"http://100zone.blogspot.com/2010_12_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"November","path":"A@/href","url":"http://100zone.blogspot.com/2010_11_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"October","path":"A@/href","url":"http://100zone.blogspot.com/2010_10_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"September","path":"A@/href","url":"http://100zone.blogspot.com/2010_09_01_archive.html"},{"text":"► ","path":"A@/href","url":"javascript:void(0)"},{"text":"August","path":"A@/href","url":"http://100zone.blogspot.com/2010_08_01_archive.html"},{"alt":"","path":"IMG@/src","url":"http://img1.blogblog.com/img/icon18_wrench_allbkg.png"},{"title":"Edit","target":"configBlogArchive1","path":"A@/href","url":"//www.blogger.com/rearrange?blogID=5508487274257568142&widgetType=BlogArchive&widgetId=BlogArchive1&action=editWidget§ionId=sidebar"},{"text":"Creating Website","path":"A@/href","url":"http://www.maskolis.com/"},{"text":"Johny Template","path":"A@/href","url":"http://johnytemplate.blogspot.com/"},{"text":"Mas Template","path":"A@/href","url":"http://www.mastemplate.com/"},{"text":"100zone","title":"100zone","path":"A@/href","url":"http://100zone.blogspot.com/"},{"text":"Creating Website","path":"A@/href","url":"http://www.maskolis.com/"},{"text":"Mas 
Template","path":"A@/href","url":"http://www.mastemplate.com/"},{"text":"Blogger","path":"A@/href","url":"http://www.blogger.com"}],"Head":{"Link":[{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://fonts.googleapis.com/css?family=Oswald"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://fonts.googleapis.com/css?family=PT+Sans+Narrow"},{"path":"LINK@/href","rel":"icon","type":"image/x-icon","url":"http://100zone.blogspot.com/favicon.ico"},{"path":"LINK@/href","rel":"canonical","url":"http://100zone.blogspot.com/2013/03/please-memp3-4shared.html"},{"path":"LINK@/href","rel":"alternate","type":"application/atom+xml","url":"http://100zone.blogspot.com/feeds/posts/default"},{"path":"LINK@/href","rel":"alternate","type":"application/rss+xml","url":"http://100zone.blogspot.com/feeds/posts/default?alt=rss"},{"path":"LINK@/href","rel":"service.post","type":"application/atom+xml","url":"http://www.blogger.com/feeds/5508487274257568142/posts/default"},{"path":"LINK@/href","rel":"alternate","type":"application/atom+xml","url":"http://100zone.blogspot.com/feeds/4500939284242849003/comments/default"},{"path":"LINK@/href","rel":"image_src","url":"http://4.bp.blogspot.com/--7bZ84tpPkw/UTiyJdrxaKI/AAAAAAAAEbE/0U4JK0BQgLU/s200/%E0%B9%80%E0%B8%95%E0%B9%8B%E0%B8%B2+%E0%B9%80%E0%B8%A8%E0%B8%A3%E0%B8%A9%E0%B8%90%E0%B8%9E%E0%B8%87%E0%B8%A8%E0%B9%8C+%E0%B9%80%E0%B8%9E%E0%B8%B5%E0%B8%A2%E0%B8%87%E0%B8%9E%E0%B8%AD+-+%E0%B8%AD%E0%B8%A2%E0%B9%88%E0%B8%B2%E0%B8%97%E0%B8%B3%E0%B9%83%E0%B8%AB%E0%B9%89%E0%B8%A3%E0%B8%B1%E0%B8%81+(Please+me).mp3+4shared.jpeg"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"https://www.blogger.com/static/v1/widgets/728935430-widget_css_bundle.css"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"https://www.blogger.com/dyn-css/authorization.css?targetBlogID=5508487274257568142&zx=f8704c09-085e-4ba3-888d-570da86c13dd"}],"Scripts":[{"path":"SCRIPT@/src","type":"text/javascript","url":"http
://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"https://www.google.com/jsapi?key=ABQIAAAAlQIoliUVPjZwD8UDgw_U3RTUhB4JyH-ajz-fA9t4yePPPdGAfRTC_mtuh6Iq1MLEipD0I2rCi30Png"},{"path":"SCRIPT@/src","type":"text/javascript","url":"https://apis.google.com/js/plusone.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://s7.addthis.com/js/250/addthis_widget.js#pubid=4rifin"},{"path":"SCRIPT@/src","type":"text/javascript","url":"/feeds/posts/default/-/เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 4shared?alt=json-in-script&callback=related_results_labels_thumbs&max-results=12"},{"path":"SCRIPT@/src","type":"text/javascript","url":"https://www.blogger.com/static/v1/jsbin/666035058-comment_from_post_iframe.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"https://www.blogger.com/static/v1/widgets/849756096-widgets.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"https://apis.google.com/js/plusone.js"}],"Metas":[{"content":"text/html; charset=UTF-8","http-equiv":"Content-Type"},{"content":"blogger","name":"generator"},{"content":"Your Blog Description here!","name":"description"},{"content":"Your Keywords here!","name":"keywords"},{"content":"Author Name here!","name":"Author"},{"content":"Author Email Address here!","name":"Email"},{"content":"document","name":"resource-type"},{"content":"all","name":"audience"},{"content":"general","name":"rating"},{"content":"all","name":"robots"},{"content":"index, follow","name":"robots"},{"content":"id","name":"language"},{"content":"id","name":"geo.country"},{"content":"global","name":"distribution"},{"content":"1 days","name":"revisit-after"},{"content":"Indonesia","name":"geo.placename"}],"Title":"เต๋า เศรษฐพงศ์ เพียงพอ - อย่าทำให้รัก (Please me).mp3 4shared - 
"}},"Entity-Digest":"sha1:B7ZYUGWISVE3PJLCKTJ3Y6XLJ4PDGXP3"}}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"27915","Header-Length":"10","Inflated-CRC":"-709990921","Inflated-Length":"119755"},"Offset":"28186","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://100zone.blogspot.com/2013/03/please-memp3-4shared.html
+WARC-Date: 2015-04-18T03:20:36Z
+WARC-Record-ID: <urn:uuid:12702325-a4b9-4247-a146-c9c1b91f224d>
+WARC-Refers-To: <urn:uuid:a39be72a-3a45-4007-8f89-85aeb7db716c>
+Content-Type: application/json
+Content-Length: 1078
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"406","Block-Digest":"sha1:5ZDPMWG5PEUVMPLSDCD27D24JP52X2LL","Actual-Content-Length":"20","WARC-Header-Metadata":{"WARC-Type":"metadata","WARC-Date":"2015-04-18T03:20:36Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"20","WARC-Record-ID":"<urn:uuid:a39be72a-3a45-4007-8f89-85aeb7db716c>","WARC-Target-URI":"http://100zone.blogspot.com/2013/03/please-memp3-4shared.html","WARC-Concurrent-To":"<urn:uuid:6a2d0bfa-c6eb-49b4-99dd-e1708960e2a7>","Content-Type":"application/warc-fields"},"Payload-Metadata":{"Trailing-Slop-Length":"4","WARC-Metadata-Metadata":{"Trailing-Slop-Length":"0","Metadata-Records":[{"Name":"fetchTimeMs","Value":"345"}],"Actual-Content-Length":"20"},"Actual-Content-Type":"application/metadata-fields"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"314","Header-Length":"10","Inflated-CRC":"-42288549","Inflated-Length":"430"},"Offset":"56101","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://1019ampradio.cbslocal.com/tag/Daft-Punk/
+WARC-Date: 2015-04-18T03:29:19Z
+WARC-Record-ID: <urn:uuid:89c55285-2540-4149-a667-dac51453f008>
+WARC-Refers-To: <urn:uuid:7cf4bd15-7d03-46b0-9ed3-a4510628061d>
+Content-Type: application/json
+Content-Length: 1408
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"363","Block-Digest":"sha1:AAQSDACJCYB4LPE666YXF3QG5HD4SSL7","Actual-Content-Length":"278","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2015-04-18T03:29:19Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"278","WARC-Record-ID":"<urn:uuid:7cf4bd15-7d03-46b0-9ed3-a4510628061d>","WARC-Target-URI":"http://1019ampradio.cbslocal.com/tag/Daft-Punk/","WARC-IP-Address":"192.0.79.32","Content-Type":"application/http; msgtype=request"},"Payload-Metadata":{"Trailing-Slop-Length":"4","HTTP-Request-Metadata":{"Headers":{"Accept-Language":"en-us,en-gb,en;q=0.7,*;q=0.3","Host":"1019ampradio.cbslocal.com","Accept-Encoding":"x-gzip, gzip, deflate","User-Agent":"CCBot/2.0 (http://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},"Headers-Length":"276","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Request-Message":{"Method":"GET","Version":"HTTP/1.0","Path":"/tag/Daft-Punk/"},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"},"Actual-Content-Type":"application/http; msgtype=request"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"429","Header-Length":"10","Inflated-CRC":"232263512","Inflated-Length":"645"},"Offset":"56415","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
+
diff --git a/webindex/modules/data/src/test/resources/wat.warc b/webindex/modules/data/src/test/resources/wat.warc
new file mode 100644
index 0000000..02781c5
--- /dev/null
+++ b/webindex/modules/data/src/test/resources/wat.warc
@@ -0,0 +1,10 @@
+WARC/1.0
+WARC-Type: metadata
+WARC-Target-URI: http://1079ishot.com/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/
+WARC-Date: 2015-04-18T03:35:13Z
+WARC-Record-ID: <urn:uuid:4780362b-9e27-4380-a1fe-41dabcc2727c>
+WARC-Refers-To: <urn:uuid:c3f6e826-b2ad-4c52-96cd-ef5146acaa5a>
+Content-Type: application/json
+Content-Length: 7978
+
+{"Envelope":{"Format":"WARC","WARC-Header-Length":"627","Block-Digest":"sha1:MBRGC3JDYCRJL3HGQNO4SWPM7QHS442T","Actual-Content-Length":"74695","WARC-Header-Metadata":{"WARC-Type":"response","WARC-Date":"2015-04-18T03:35:13Z","WARC-Warcinfo-ID":"<urn:uuid:f67c933a-afe2-4ea3-87e5-ceddabb61ef1>","Content-Length":"74695","WARC-Record-ID":"<urn:uuid:c3f6e826-b2ad-4c52-96cd-ef5146acaa5a>","WARC-Block-Digest":"sha1:MBRGC3JDYCRJL3HGQNO4SWPM7QHS442T","WARC-Payload-Digest":"sha1:F2QC6EH57LBLLBAJYYL7YBH7M5FJWGPN","WARC-Target-URI":"http://1079ishot.com/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/","WARC-IP-Address":"66.155.35.250","WARC-Concurrent-To":"<urn:uuid:7a84e335-ad70-4561-9544-e423c669dd94>","Content-Type":"application/http; msgtype=response"},"Payload-Metadata":{"Trailing-Slop-Length":"4","Actual-Content-Type":"application/http; msgtype=response","HTTP-Response-Metadata":{"Headers":{"X-Pingback":"http://1079ishot.com/xmlrpc.php","Age":"0","Content-Length":"16283","Connection":"close","X-Cache":"MISS","Server":"nginx/1.6.2","X-Powered-By":"W3 Total Cache/0.9.1.3","X-Varnish":"2385998252","Date":"Sat, 18 Apr 2015 03:35:13 GMT","Vary":"Accept-Encoding","Content-Encoding":"gzip","X-Device":"desktop","Via":"1.1 varnish","Content-Type":"text/html; charset=UTF-8","Accept-Ranges":"bytes"},"Headers-Length":"394","Entity-Length":"16283","Entity-Trailing-Slop-Bytes":"58018","Response-Message":{"Status":"200","Version":"HTTP/1.1","Reason":"OK"},"HTML-Metadata":{"Links":[{"path":"STYLE/#text","href":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/wp-table-reloaded/css/plugin.css?ver=1.9.3"},{"path":"IMG@/src","url":"http://b.scorecardresearch.com/p?c1=2&c2=6665296&cv=2.0&cj=1"},{"alt":"HOT 107.9","path":"IMG@/src","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/files/2013/11/1079-new-logo.png"},{"title":"HOT 
107.9","path":"A@/href","url":"http://1079ishot.com"},{"path":"FORM@/action","method":"get","url":"http://1079ishot.com/"},{"text":"Contact Us","path":"A@/href","url":"/help/"},{"text":"Sign In","path":"A@/href","url":"#fb-auth-login"},{"text":"Sign Up","path":"A@/href","url":"#fb-auth-registration"},{"text":"Home","path":"A@/href","url":"/"},{"text":"On Air","path":"A@/href","url":"/djs/"}],"Head":{"Link":[{"path":"LINK@/href","rel":"shortcut icon","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/files/2013/11/favicon.ico"},{"path":"LINK@/href","rel":"alternate","type":"application/rss+xml","url":"http://1079ishot.com/feed/"},{"path":"LINK@/href","rel":"profile","url":"http://gmpg.org/xfn/11"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/themes/townsquare3-music/style.css"},{"path":"LINK@/href","rel":"alternate","type":"application/rss+xml","url":"http://1079ishot.com/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/feed/"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/social-sharing/resources/css/social-overlay.css?ver=111d0646fbe1e27e2a5487caef68b66267e70ad7"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/weather-widget/resources/style.css?ver=3.4.2"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/loyalty-alert-system/resources/css/admin.css?ver=3.4.2"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/gallery/resources/sp-gallery.css?ver=3.4.2"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-conte
nt/mu-plugins/fancybox/vendor/jquery.fancybox-1.3.3.css?ver=111d0646fbe1e27e2a5487caef68b66267e70ad7"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/css/ts3.css?ver=1404462291111d0646fb"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/facebook-apps/css/facebook-auth.css?ver=111d0646fbe1e27e2a5487caef68b66267e70ad7"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/radio-players/resources/css/radio-players.css?ver=111d0646fbe1e27e2a5487caef68b66267e70ad7"},{"path":"LINK@/href","rel":"EditURI","type":"application/rsd+xml","url":"http://1079ishot.com/xmlrpc.php?rsd"},{"path":"LINK@/href","rel":"wlwmanifest","type":"application/wlwmanifest+xml","url":"http://1079ishot.com/wp-includes/wlwmanifest.xml"},{"path":"LINK@/href","rel":"canonical","url":"http://1079ishot.com/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/"}],"Scripts":[{"path":"SCRIPT@/src","type":"text/javascript","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/jquery-updater/js/jquery-1.7.2.min.js?ver=1.7.2"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/mu-plugins/handlebarsjs/resources/js/handlebars.min.js?ver=111d0646fbe1e27e2a5487caef68b66267e70ad7"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/radio-players/resources/js/radio-players.js?ver=111d0646fbe1e27e2a5487caef68b66267e70ad7"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/tsq-sailthru-newsletter/resources/js/horizon.js?ver=111d0646fbe1e27e2a5487caef68b66267e70ad7"},{"path":"SCRIPT@/src","type":"
text/javascript","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/themes/townsquare3/resources/js/html5.js?ver=111d0646fbe1e27e2a5487caef68b66267e70ad7"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/plugins/google-analyticator/external-tracking.min.js?ver=6.1.1.d"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://c.amazon-adsystem.com/aax2/amzn_ads.js"},{"path":"SCRIPT@/src","type":"text/javascript","url":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/wp-content/mu-plugins/google-ads/js/tsmadhelper.js?v=111d0646fbe1e27e2a5487caef68b66267e70ad7"}],"Metas":[{"content":"width=1020","name":"viewport"},{"content":"149749458418013"},{"content":"1141488271"},{"content":"583052867"},{"content":"100002025987268"},{"content":"732998853"},{"content":"Presale Password – Trey Songz & Young Jeezy Pre-Christmas Bash Screen shot 2011-10-27 at 11.12.06 AM \u2013 HOT 107.9"},{"content":"http://1079ishot.com/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/"},{"content":"HOT 107.9"},{"content":"blog"},{"content":""},{"content":"http://wac.450F.edgecastcdn.net/80450F/1079ishot.com/files/2013/11/1079-new-logo.png?w=250&zc=1&s=0&a=t&q=90"},{"content":"summary","name":"twitter:card"},{"content":"2011-10-27 11:12:33","name":"sailthru.date"},{"content":"Screen shot 2011-10-27 at 11.12.06 AM","name":"sailthru.title"},{"content":"tsmsite-khxt, tsmmarket-lafayette, tsmtype-local","name":"sailthru.tags"},{"content":"astroderd","name":"sailthru.author"}],"Title":"Presale Password – Trey Songz & Young Jeezy Pre-Christmas Bash Screen shot 2011-10-27 at 
"}},"Entity-Digest":"sha1:F6TCAM4VXWWB2JTQFQ427CGKBT273C7B"}}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"16844","Header-Length":"10","Inflated-CRC":"-1339801538","Inflated-Length":"75326"},"Offset":"195779","Filename":"CC-MAIN-20150417045713-00000-ip-10-235-10-82.ec2.internal.warc.gz"}}
diff --git a/webindex/modules/integration/pom.xml b/webindex/modules/integration/pom.xml
new file mode 100644
index 0000000..890489f
--- /dev/null
+++ b/webindex/modules/integration/pom.xml
@@ -0,0 +1,138 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2015 Webindex authors (see AUTHORS)
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <!-- Build descriptor for the webindex-integration module: combines the webindex core, data and ui modules with Accumulo/Fluo mini-cluster artifacts and offers an opt-in profile that runs webindex.integration.DevServer. -->
+ <modelVersion>4.0.0</modelVersion>
+ <parent> <!-- groupId and version of this module are not declared locally, so they are inherited from this parent -->
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-parent</artifactId>
+ <version>0.0.1-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath> <!-- parent pom lives two directories above this module -->
+ </parent>
+ <artifactId>webindex-integration</artifactId>
+ <name>WebIndex Integration</name>
+ <dependencies> <!-- No dependency below declares a version; presumably versions come from the parent's dependencyManagement (TODO confirm). NOTE(review): DevServer.java imports com.beust.jcommander, but jcommander is not declared here; presumably it arrives transitively, verify with dependency:analyze. -->
+ <dependency>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.sparkjava</groupId>
+ <artifactId>spark-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-data</artifactId>
+ <exclusions> <!-- keeps asm:asm from being pulled in through webindex-data; NOTE(review): the reason is not recorded here, presumably a version conflict, confirm -->
+ <exclusion>
+ <groupId>asm</groupId>
+ <artifactId>asm</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-ui</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.accumulo</groupId>
+ <artifactId>accumulo-minicluster</artifactId>
+ <exclusions> <!-- wildcard artifactId: every org.eclipse.jetty artifact reachable through accumulo-minicluster is excluded; presumably to avoid clashing with another Jetty version on the classpath (TODO confirm) -->
+ <exclusion>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-mini</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-test</artifactId> <!-- declared without test scope, so it is on the main classpath; DevServer.java imports org.apache.fluo.recipes.test.AccumuloExportITBase from main sources -->
+ </dependency>
+ <dependency>
+ <groupId>org.netpreserve.commons</groupId>
+ <artifactId>webarchive-commons</artifactId>
+ <exclusions> <!-- wildcard artifactId: every ch.qos.logback artifact is excluded; this module declares slf4j-log4j12 further down, presumably as the single SLF4J binding (confirm) -->
+ <exclusion>
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </dependency>
+ <!-- Test dependencies: the two entries below are the only ones declared with test scope -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ <profiles>
+ <profile> <!-- no activation element is declared, so this profile only applies when selected explicitly (e.g. -Pwebindex-dev-server) -->
+ <id>webindex-dev-server</id>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>exec-maven-plugin</artifactId> <!-- no plugin version here; presumably managed by the parent pom (TODO confirm) -->
+ <executions>
+ <execution>
+ <goals>
+ <goal>java</goal>
+ </goals>
+ <phase>compile</phase> <!-- exec:java is bound to the compile phase, so with this profile active any build that reaches compile launches the main class below -->
+ <configuration>
+ <mainClass>webindex.integration.DevServer</mainClass> <!-- entry point added by this same change under src/main/java/webindex/integration/DevServer.java -->
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ </profiles>
+</project>
diff --git a/webindex/modules/integration/src/main/java/webindex/integration/DevServer.java b/webindex/modules/integration/src/main/java/webindex/integration/DevServer.java
new file mode 100644
index 0000000..7f71c38
--- /dev/null
+++ b/webindex/modules/integration/src/main/java/webindex/integration/DevServer.java
@@ -0,0 +1,189 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.integration;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import com.beust.jcommander.JCommander;
+import com.beust.jcommander.ParameterException;
+import com.google.gson.Gson;
+import org.apache.accumulo.minicluster.MiniAccumuloCluster;
+import org.apache.accumulo.minicluster.MiniAccumuloConfig;
+import org.apache.fluo.api.client.FluoAdmin;
+import org.apache.fluo.api.client.FluoClient;
+import org.apache.fluo.api.client.FluoFactory;
+import org.apache.fluo.api.client.LoaderExecutor;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.api.config.SimpleConfiguration;
+import org.apache.fluo.api.mini.MiniFluo;
+import org.apache.fluo.recipes.test.AccumuloExportITBase;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.IndexClient;
+import webindex.core.models.Page;
+import webindex.data.fluo.PageLoader;
+import webindex.data.spark.IndexEnv;
+import webindex.ui.WebServer;
+
+public class DevServer {
+
+ private static final Logger log = LoggerFactory.getLogger(DevServer.class);
+ private static final int TEST_SPLITS = 119; // passed twice to the IndexEnv constructor in start()
+
+ private Path dataPath; // file whose lines start() parses as Page JSON and loads
+ private int webPort; // port handed to WebServer.start()
+ private Path templatePath; // handed to WebServer.start(); may be null
+ private MiniAccumuloCluster cluster; // created and started in start(), stopped in stop()
+ private MiniFluo miniFluo; // created in start() after the web server is up, closed in stop()
+ private WebServer webServer; // instantiated in the constructor, started in start()
+ private IndexClient client; // created in start(); exposed through getIndexClient()
+ private AtomicBoolean running = new AtomicBoolean(false); // set to true in start() just before pages are loaded
+ private Path baseDir; // directory given to MiniAccumuloConfig
+ private boolean metrics; // when true, configureMetrics() adds Graphite reporter properties
+
+ public DevServer(Path dataPath, int webPort, Path templatePath, Path baseDir, boolean metrics) {
+   this.webServer = new WebServer(); // only instantiated here; it is started from start()
+   this.baseDir = baseDir;
+   this.dataPath = dataPath;
+   this.templatePath = templatePath;
+   this.webPort = webPort;
+   this.metrics = metrics;
+ }
+
+ public IndexClient getIndexClient() {
+   if (running.get()) {
+     return client; // the flag is raised in start() after the client has been created
+   }
+   throw new IllegalStateException("DevServer must be running before retrieving index client");
+ }
+
+ public SimpleConfiguration configureMetrics(SimpleConfiguration config) { // mutates config in place
+ if (metrics) { // leave config untouched unless metrics were enabled in the constructor
+ config.setProperty("fluo.metrics.reporter.graphite.enable", true);
+ config.setProperty("fluo.metrics.reporter.graphite.host", "localhost");
+ config.setProperty("fluo.metrics.reporter.graphite.port", 2003);
+ config.setProperty("fluo.metrics.reporter.graphite.frequency", 30); // NOTE(review): unit not visible here; presumably seconds - confirm
+ }
+ return config; // the same instance that was passed in, so callers may ignore the result
+ }
+
+ public void start() throws Exception {
+ log.info("Starting WebIndex development server...");
+
+ log.info("Starting MiniAccumuloCluster at {}", baseDir);
+
+ MiniAccumuloConfig cfg = new MiniAccumuloConfig(baseDir.toFile(), "secret");
+ cluster = new MiniAccumuloCluster(cfg);
+ cluster.start();
+
+ FluoConfiguration config = new FluoConfiguration();
+ AccumuloExportITBase.configureFromMAC(config, cluster);
+ config.setApplicationName("webindex");
+ config.setAccumuloTable("webindex");
+ configureMetrics(config);
+
+ String exportTable = "webindex_search";
+
+ log.info("Initializing Accumulo & Fluo");
+ IndexEnv env = new IndexEnv(config, exportTable, "/tmp", TEST_SPLITS, TEST_SPLITS);
+ env.initAccumuloIndexTable();
+ env.configureApplication(config, config);
+
+ FluoFactory.newAdmin(config).initialize(
+ new FluoAdmin.InitializationOptions().setClearTable(true).setClearZookeeper(true));
+
+ env.setFluoTableSplits();
+
+ log.info("Starting web server");
+ client = new IndexClient(exportTable, cluster.getConnector("root", "secret"));
+ webServer.start(client, webPort, templatePath);
+
+ log.info("Loading data from {}", dataPath);
+ Gson gson = new Gson();
+ miniFluo = FluoFactory.newMiniFluo(config);
+
+ running.set(true);
+
+ try (FluoClient client =
+ FluoFactory.newClient(configureMetrics(miniFluo.getClientConfiguration()))) {
+
+ try (LoaderExecutor le = client.newLoaderExecutor()) {
+
+ Files
+ .lines(dataPath)
+ .map(json -> Page.fromJson(gson, json))
+ .forEach(
+ page -> {
+ log.debug("Loading page {} with {} links", page.getUrl(), page.getOutboundLinks()
+ .size());
+ le.execute(PageLoader.updatePage(page));
+ });
+ }
+
+ log.info("Finished loading data. Waiting for observers to finish...");
+ miniFluo.waitForObservers();
+ log.info("Observers finished");
+ }
+ }
+
+ public void stop() { // NOTE(review): 'running' is never reset here, so getIndexClient() still succeeds after stop()
+ miniFluo.close(); // NOTE(review): miniFluo is assigned late in start(); this throws NPE if start() failed before that
+ webServer.stop();
+ try {
+ cluster.stop();
+ } catch (Exception e) {
+ throw new IllegalStateException(e); // wrap so that stop() itself declares no checked exception
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+
+ DevServerOpts opts = new DevServerOpts();
+ JCommander commander = new JCommander(opts);
+ commander.setProgramName("webindex dev");
+ try {
+ commander.parse(args);
+ } catch (ParameterException e) {
+ System.out.println(e.getMessage() + "\n");
+ commander.usage();
+ System.exit(1);
+ }
+
+ if (opts.help) {
+ commander.usage();
+ System.exit(1);
+ }
+
+ Path dataPath = Paths.get(String.format("data/%d-pages.txt", opts.numPages));
+ if (Files.notExists(dataPath)) {
+ log.info("Generating sample data at {} for dev server", dataPath);
+ SampleData.generate(dataPath, opts.numPages);
+ }
+ log.info("Loading data at {}", dataPath);
+
+ Path templatePath = Paths.get(opts.templateDir);
+ if (Files.notExists(templatePath)) {
+ log.info("Template location {} does not exits", templatePath);
+ throw new IllegalArgumentException("Template location does not exist");
+ }
+
+ Path baseDir = Files.createTempDirectory(Paths.get("target"), "webindex-dev-");
+ DevServer devServer = new DevServer(dataPath, 4567, templatePath, baseDir, opts.metrics);
+ devServer.start();
+ }
+}
diff --git a/webindex/modules/integration/src/main/java/webindex/integration/DevServerOpts.java b/webindex/modules/integration/src/main/java/webindex/integration/DevServerOpts.java
new file mode 100644
index 0000000..4bd601e
--- /dev/null
+++ b/webindex/modules/integration/src/main/java/webindex/integration/DevServerOpts.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.integration;
+
+import com.beust.jcommander.Parameter;
+
+public class DevServerOpts {
+  // Options of the dev server; DevServer.main has JCommander fill these fields from its args.
+  @Parameter(names = {"--metrics", "-m"}, description = "Enables sending metrics to localhost:2003")
+  boolean metrics = false; // DevServer.configureMetrics reports to Graphite on localhost:2003
+
+  @Parameter(names = {"--pages", "-p"}, description = "Number of pages to load")
+  int numPages = 1000;
+
+  @Parameter(names = {"--templateDir", "-t"}, description = "Specifies template directory")
+  String templateDir = "modules/ui/src/main/resources/spark/template/freemarker";
+
+  @Parameter(names = {"--help", "-h"}, description = "Prints usage", help = true)
+  boolean help;
+}
diff --git a/webindex/modules/integration/src/main/java/webindex/integration/SampleData.java b/webindex/modules/integration/src/main/java/webindex/integration/SampleData.java
new file mode 100644
index 0000000..a67c861
--- /dev/null
+++ b/webindex/modules/integration/src/main/java/webindex/integration/SampleData.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.integration;
+
+import java.io.BufferedWriter;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import com.google.gson.Gson;
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.warc.WARCReaderFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import webindex.core.models.Page;
+import webindex.data.util.ArchiveUtil;
+
+public class SampleData {
+
+ private static final Logger log = LoggerFactory.getLogger(SampleData.class);
+
+ private static final String sourceURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/"
+ + "CC-MAIN-2015-32/segments/1438042981460.12/wat/"
+ + "CC-MAIN-20150728002301-00043-ip-10-236-191-2.ec2.internal.warc.wat.gz";
+
+ public static void generate(Path path, int numPages) throws Exception {
+
+ Gson gson = new Gson();
+ long count = 0;
+ try (BufferedWriter writer = Files.newBufferedWriter(path)) {
+ ArchiveReader ar = WARCReaderFactory.get(new URL(sourceURL), 0);
+ for (ArchiveRecord r : ar) {
+ Page p = ArchiveUtil.buildPage(r);
+ if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
+ log.debug("Skipping {}", p.getUrl());
+ continue;
+ }
+ log.debug("Found {} {}", p.getUrl(), p.getNumOutbound());
+ String json = gson.toJson(p);
+ writer.write(json);
+ writer.newLine();
+ count++;
+ if (count == numPages) {
+ break;
+ } else if ((count % 1000) == 0) {
+ log.info("Wrote {} of {} pages to {}", count, numPages, path);
+ }
+ }
+ }
+ log.info("Wrote {} pages to {}", numPages, path);
+ }
+}
diff --git a/webindex/modules/integration/src/test/java/webindex/integration/DevServerIT.java b/webindex/modules/integration/src/test/java/webindex/integration/DevServerIT.java
new file mode 100644
index 0000000..32b164d
--- /dev/null
+++ b/webindex/modules/integration/src/test/java/webindex/integration/DevServerIT.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.integration;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.commons.io.FileUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import webindex.core.IndexClient;
+import webindex.core.models.Pages;
+
+public class DevServerIT {
+  // Shared by every test in this class: one dev server backed by one scratch directory.
+  static DevServer devServer;
+  static Path tempPath;
+
+  @BeforeClass
+  public static void init() throws Exception {
+    Path fixture = Paths.get("src/test/resources/5-pages.txt");
+    tempPath = Files.createTempDirectory(Paths.get("target/"), "webindex-dev-");
+    devServer = new DevServer(fixture, 24567, null, tempPath, false);
+    devServer.start();
+  }
+
+  @Test
+  public void basic() throws Exception {
+    Document home = Jsoup.connect("http://localhost:24567/").get();
+    Assert.assertTrue(home.text().contains("Enter a domain to view known webpages in that domain"));
+    // The fixture pages must be queryable by domain through the index client.
+    IndexClient index = devServer.getIndexClient();
+    Pages found = index.getPages("stackoverflow.com", "", 0);
+    Assert.assertEquals(4, found.getTotal().intValue());
+    // The first entry returned for the domain is the attribution page, with a score of 4.
+    Pages.PageScore first = found.getPages().get(0);
+    Assert.assertEquals("http://blog.stackoverflow.com/2009/06/attribution-required/",
+        first.getUrl());
+    Assert.assertEquals(4, first.getScore().intValue());
+  }
+
+  @AfterClass
+  public static void destroy() throws IOException {
+    devServer.stop();
+    FileUtils.deleteDirectory(tempPath.toFile());
+  }
+}
diff --git a/webindex/modules/integration/src/test/resources/5-pages.txt b/webindex/modules/integration/src/test/resources/5-pages.txt
new file mode 100644
index 0000000..81316e0
--- /dev/null
+++ b/webindex/modules/integration/src/test/resources/5-pages.txt
@@ -0,0 +1,5 @@
+{"url":"http://app.cheezburger.com/Rokas08/TrophyDetails/13f82307-8f12-402e-a544-76db8a2dc19c","uri":"com.cheezburger\u003e.app\u003eo\u003e/Rokas08/TrophyDetails/13f82307-8f12-402e-a544-76db8a2dc19c","numOutbound":19,"crawlDate":"2015-07-28T03:06:17Z","title":"Rokas08\u0026#39;s Profile - Trophy Details - Cheezburger.com","outboundLinks":[{"url":"https://www.facebook.com/Cheezburger","uri":"com.facebook\u003e.www\u003es\u003e/Cheezburger","anchorText":"Facebook"},{"url":"https://plus.google.com/105247221600709734681","uri":"com.google\u003e.plus\u003es\u003e/105247221600709734681","anchorText":"Google+"},{"url":"http://knowyourmeme.com/forums","uri":"com.knowyourmeme\u003e\u003eo\u003e/forums","anchorText":"Forums"},{"url":"http://knowyourmeme.com/memes/popular","uri":"com.knowyourmeme\u003e\u003eo\u003e/memes/popular","anchorText":"Popular Memes"},{"url":"http://knowyourmeme.com/photos/most-viewed","uri":"com.knowyourmeme\u003e\u003eo\u003e/photos/most-viewed","anchorText":"All Images"},{"url":"http://knowyourmeme.com/search?q\u003dcategory%3Aevent\u0026amp;sort\u003dnewest","uri":"com.knowyourmeme\u003e\u003eo\u003e/search?q\u003dcategory%3Aevent\u0026amp;sort\u003dnewest","anchorText":"New Events"},{"url":"http://knowyourmeme.com/search?q\u003dcategory%3Aperson\u0026amp;sort\u003dnewest","uri":"com.knowyourmeme\u003e\u003eo\u003e/search?q\u003dcategory%3Aperson\u0026amp;sort\u003dnewest","anchorText":"New People"},{"url":"http://knowyourmeme.com/search?q\u003dcategory%3Asite\u0026amp;sort\u003dnewest","uri":"com.knowyourmeme\u003e\u003eo\u003e/search?q\u003dcategory%3Asite\u0026amp;sort\u003dnewest","anchorText":"New Sites"},{"url":"http://knowyourmeme.com/search?q\u003dcategory%3Asubculture\u0026amp;sort\u003dnewest","uri":"com.knowyourmeme\u003e\u003eo\u003e/search?q\u003dcategory%3Asubculture\u0026amp;sort\u003dnewest","anchorText":"New 
Subcultures"},{"url":"http://knowyourmeme.com/search?utf8\u003d%E2%9C%93\u0026amp;context\u003dentries\u0026amp;q\u003dstatus%3Aconfirmed+category%3Ameme","uri":"com.knowyourmeme\u003e\u003eo\u003e/search?utf8\u003d%E2%9C%93\u0026amp;context\u003dentries\u0026amp;q\u003dstatus%3Aconfirmed+category%3Ameme","anchorText":"All Memes"},{"url":"http://knowyourmeme.com/videos/most-viewed","uri":"com.knowyourmeme\u003e\u003eo\u003e/videos/most-viewed","anchorText":"All Videos"},{"url":"http://knowyourmeme.com?ref\u003dnavbar","uri":"com.knowyourmeme\u003e\u003eo\u003e?ref\u003dnavbar","anchorText":"KYM Wiki"},{"url":"https://twitter.com/Cheezburger","uri":"com.twitter\u003e\u003es\u003e/Cheezburger","anchorText":"Follow"},{"url":"http://chzb.gr/1riG0EZ?ref\u003dfooternav","uri":"gr.chzb\u003e\u003eo\u003e/1riG0EZ?ref\u003dfooternav","anchorText":"Videos"},{"url":"http://chzb.gr/1riG0EZ?ref\u003dnavbar","uri":"gr.chzb\u003e\u003eo\u003e/1riG0EZ?ref\u003dnavbar","anchorText":"Videos Find all our FAIL videos here!"},{"url":"http://chzb.gr/1riGhru?ref\u003dfooternav","uri":"gr.chzb\u003e\u003eo\u003e/1riGhru?ref\u003dfooternav","anchorText":"Videos"},{"url":"http://chzb.gr/1riGhru?ref\u003dnavbar","uri":"gr.chzb\u003e\u003eo\u003e/1riGhru?ref\u003dnavbar","anchorText":"Videos See all our Geek videos here!"},{"url":"http://chzb.gr/1riGzi6?ref\u003dfooternav","uri":"gr.chzb\u003e\u003eo\u003e/1riGzi6?ref\u003dfooternav","anchorText":"Videos"},{"url":"http://chzb.gr/1riGzi6?ref\u003dnavbar","uri":"gr.chzb\u003e\u003eo\u003e/1riGzi6?ref\u003dnavbar","anchorText":"Videos Watch and learn from all of our trolling videos here!"}]}
+{"url":"http://apple.stackexchange.com/help/badges/9/autobiographer?userid\u003d796","uri":"com.stackexchange\u003e.apple\u003eo\u003e/help/badges/9/autobiographer?userid\u003d796","numOutbound":4,"crawlDate":"2015-07-28T01:32:26Z","server":"cloudflare-nginx","title":"Autobiographer - Badge - Ask Different","outboundLinks":[{"url":"http://apple.blogoverflow.com/","uri":"com.blogoverflow\u003e.apple\u003eo\u003e/","anchorText":"blog"},{"url":"http://apple.blogoverflow.com?blb\u003d1","uri":"com.blogoverflow\u003e.apple\u003eo\u003e?blb\u003d1","anchorText":"blog"},{"url":"http://blog.stackoverflow.com/2009/06/attribution-required/","uri":"com.stackoverflow\u003e.blog\u003eo\u003e/2009/06/attribution-required/","anchorText":"attribution required"},{"url":"http://creativecommons.org/licenses/by-sa/3.0/","uri":"org.creativecommons\u003e\u003eo\u003e/licenses/by-sa/3.0/","anchorText":"cc by-sa 3.0"}]}
+{"url":"http://apple.stackexchange.com/questions/15006/spotlight-sometimes-cant-find-a-file-that-actually-exists","uri":"com.stackexchange\u003e.apple\u003eo\u003e/questions/15006/spotlight-sometimes-cant-find-a-file-that-actually-exists","numOutbound":6,"crawlDate":"2015-07-28T01:58:50Z","server":"cloudflare-nginx","title":"Spotlight sometimes can\u0026#39;t find a file. (that actually exists) - Ask Different","outboundLinks":[{"url":"http://askubuntu.com/questions/653335/using-sed-how-could-we-cut-a-specific-string-from-a-line-of-text","uri":"com.askubuntu\u003e\u003eo\u003e/questions/653335/using-sed-how-could-we-cut-a-specific-string-from-a-line-of-text","anchorText":"Using sed, how could we cut a specific string from a line of text?"},{"url":"http://apple.blogoverflow.com/","uri":"com.blogoverflow\u003e.apple\u003eo\u003e/","anchorText":"blog"},{"url":"http://apple.blogoverflow.com?blb\u003d1","uri":"com.blogoverflow\u003e.apple\u003eo\u003e?blb\u003d1","anchorText":"blog"},{"url":"http://blog.stackoverflow.com/2009/06/attribution-required/","uri":"com.stackoverflow\u003e.blog\u003eo\u003e/2009/06/attribution-required/","anchorText":"attribution required"},{"url":"http://stackoverflow.com/questions/31654274/is-it-ever-justified-to-have-an-object-which-has-itself-as-a-field","uri":"com.stackoverflow\u003e\u003eo\u003e/questions/31654274/is-it-ever-justified-to-have-an-object-which-has-itself-as-a-field","anchorText":"Is it ever justified to have an object which has itself as a field?"},{"url":"http://creativecommons.org/licenses/by-sa/3.0/","uri":"org.creativecommons\u003e\u003eo\u003e/licenses/by-sa/3.0/","anchorText":"cc by-sa 3.0"}]}
+{"url":"http://apple.stackexchange.com/users/208/john-allers","uri":"com.stackexchange\u003e.apple\u003eo\u003e/users/208/john-allers","numOutbound":8,"crawlDate":"2015-07-28T01:40:51Z","server":"cloudflare-nginx","title":"User John Allers - Ask Different","outboundLinks":[{"url":"http://apple.blogoverflow.com/","uri":"com.blogoverflow\u003e.apple\u003eo\u003e/","anchorText":"blog"},{"url":"http://apple.blogoverflow.com?blb\u003d1","uri":"com.blogoverflow\u003e.apple\u003eo\u003e?blb\u003d1","anchorText":"blog"},{"url":"http://serverfault.com/users/2870/","uri":"com.serverfault\u003e\u003eo\u003e/users/2870/","anchorText":"Server Fault 111 111 3"},{"url":"http://blog.stackoverflow.com/2009/06/attribution-required/","uri":"com.stackoverflow\u003e.blog\u003eo\u003e/2009/06/attribution-required/","anchorText":"attribution required"},{"url":"http://stackoverflow.com/users/73986/","uri":"com.stackoverflow\u003e\u003eo\u003e/users/73986/","anchorText":"Stack Overflow 2.2k 2.2k 11828"},{"url":"http://superuser.com/users/3552/","uri":"com.superuser\u003e\u003eo\u003e/users/3552/","anchorText":"Super User 231 231 26"},{"url":"http://www.zooplet.com/","uri":"com.zooplet\u003e.www\u003eo\u003e/","anchorText":"zooplet.com"},{"url":"http://creativecommons.org/licenses/by-sa/3.0/","uri":"org.creativecommons\u003e\u003eo\u003e/licenses/by-sa/3.0/","anchorText":"cc by-sa 3.0"}]}
+{"url":"http://apple.stackexchange.com/users/3126/mjb?tab\u003dsummary","uri":"com.stackexchange\u003e.apple\u003eo\u003e/users/3126/mjb?tab\u003dsummary","numOutbound":7,"crawlDate":"2015-07-28T01:53:49Z","server":"cloudflare-nginx","title":"User mjb - Ask Different","outboundLinks":[{"url":"http://apple.blogoverflow.com/","uri":"com.blogoverflow\u003e.apple\u003eo\u003e/","anchorText":"blog"},{"url":"http://apple.blogoverflow.com?blb\u003d1","uri":"com.blogoverflow\u003e.apple\u003eo\u003e?blb\u003d1","anchorText":"blog"},{"url":"http://serverfault.com/users/117061/","uri":"com.serverfault\u003e\u003eo\u003e/users/117061/","anchorText":"Server Fault"},{"url":"http://blog.stackoverflow.com/2009/06/attribution-required/","uri":"com.stackoverflow\u003e.blog\u003eo\u003e/2009/06/attribution-required/","anchorText":"attribution required"},{"url":"http://stackoverflow.com/users/581665/","uri":"com.stackoverflow\u003e\u003eo\u003e/users/581665/","anchorText":"Stack Overflow"},{"url":"http://superuser.com/users/63808/","uri":"com.superuser\u003e\u003eo\u003e/users/63808/","anchorText":"Super User"},{"url":"http://creativecommons.org/licenses/by-sa/3.0/","uri":"org.creativecommons\u003e\u003eo\u003e/licenses/by-sa/3.0/","anchorText":"cc by-sa 3.0"}]}
diff --git a/webindex/modules/integration/src/test/resources/log4j.properties b/webindex/modules/integration/src/test/resources/log4j.properties
new file mode 100644
index 0000000..c18c21a
--- /dev/null
+++ b/webindex/modules/integration/src/test/resources/log4j.properties
@@ -0,0 +1,31 @@
+# Copyright 2014 Webindex authors (see AUTHORS)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=INFO, CA
+log4j.appender.CA=org.apache.log4j.ConsoleAppender
+log4j.appender.CA.layout=org.apache.log4j.PatternLayout
+log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c] %-5p: %m%n
+
+log4j.logger.org.apache.accumulo=WARN
+log4j.logger.org.apache.curator=ERROR
+log4j.logger.org.apache.fluo=WARN
+log4j.logger.org.apache.hadoop=WARN
+log4j.logger.org.apache.hadoop.mapreduce=ERROR
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
+log4j.logger.org.apache.spark=WARN
+log4j.logger.org.apache.zookeeper=ERROR
+log4j.logger.org.eclipse.jetty=WARN
+log4j.logger.org.spark-project=WARN
+log4j.logger.webindex=WARN
+log4j.logger.spark=WARN
diff --git a/webindex/modules/ui/.gitignore b/webindex/modules/ui/.gitignore
new file mode 100644
index 0000000..916e17c
--- /dev/null
+++ b/webindex/modules/ui/.gitignore
@@ -0,0 +1 @@
+dependency-reduced-pom.xml
diff --git a/webindex/modules/ui/pom.xml b/webindex/modules/ui/pom.xml
new file mode 100644
index 0000000..bf9e1e3
--- /dev/null
+++ b/webindex/modules/ui/pom.xml
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2015 Webindex authors (see AUTHORS)
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-parent</artifactId>
+ <version>0.0.1-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+ <artifactId>webindex-ui</artifactId>
+ <name>WebIndex UI</name>
+ <dependencies>
+ <dependency>
+ <groupId>com.sparkjava</groupId>
+ <artifactId>spark-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.sparkjava</groupId>
+ <artifactId>spark-template-freemarker</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.accumulo</groupId>
+ <artifactId>accumulo-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </dependency>
+ </dependencies>
+ <profiles>
+ <profile>
+ <id>webindex-web-server</id>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>exec-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals>
+ <goal>java</goal>
+ </goals>
+ <phase>compile</phase>
+ <configuration>
+ <mainClass>webindex.ui.WebServer</mainClass>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ </profiles>
+</project>
diff --git a/webindex/modules/ui/src/main/java/webindex/ui/WebServer.java b/webindex/modules/ui/src/main/java/webindex/ui/WebServer.java
new file mode 100644
index 0000000..f13526f
--- /dev/null
+++ b/webindex/modules/ui/src/main/java/webindex/ui/WebServer.java
@@ -0,0 +1,154 @@
+/*
+ * Copyright 2015 Webindex authors (see AUTHORS)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package webindex.ui;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Optional;
+
+import com.google.gson.Gson;
+import freemarker.template.Configuration;
+import org.apache.accumulo.core.client.Connector;
+import org.apache.fluo.api.client.FluoAdmin;
+import org.apache.fluo.api.client.FluoFactory;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.core.util.AccumuloUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import spark.ModelAndView;
+import spark.Request;
+import spark.Spark;
+import spark.template.freemarker.FreeMarkerEngine;
+import webindex.core.IndexClient;
+import webindex.core.WebIndexConfig;
+import webindex.core.models.Links;
+import webindex.core.models.Page;
+import webindex.core.models.Pages;
+import webindex.core.models.TopResults;
+
+import static spark.Spark.get;
+import static spark.Spark.halt;
+import static spark.Spark.staticFiles;
+
+public class WebServer {
+
+  private static final Logger log = LoggerFactory.getLogger(WebServer.class);
+  private IndexClient client; // assigned in start(); every route handler queries through it
+
+  public WebServer() {}
+
+  /** Reads the optional "pageNum" query parameter (default 0); halts with 400 if malformed. */
+  private static int pageNum(Request req) {
+    String raw = Optional.ofNullable(req.queryParams("pageNum")).orElse("0");
+    try {
+      return Integer.parseInt(raw);
+    } catch (NumberFormatException e) {
+      halt(400, "Bad request: pageNum parameter must be an integer");
+      throw e; // fallback only: halt() is expected to throw before this line is reached
+    }
+  }
+
+  private TopResults getTop(Request req) {
+    String next = Optional.ofNullable(req.queryParams("next")).orElse("");
+    return client.getTopResults(next, pageNum(req));
+  }
+
+  private Page getPage(Request req) {
+    String url = req.queryParams("url");
+    if (url == null) {
+      halt(400, "Bad request: url parameter was not set");
+    }
+    return client.getPage(url);
+  }
+
+  private Pages getPages(Request req) {
+    String domain = req.queryParams("domain");
+    if (domain == null) {
+      halt(400, "Bad request: domain parameter was not set");
+    }
+    String next = Optional.ofNullable(req.queryParams("next")).orElse("");
+    return client.getPages(domain, next, pageNum(req));
+  }
+
+  private Links getLinks(Request req) {
+    String rawUrl = req.queryParams("url");
+    if (rawUrl == null) {
+      halt(400, "Bad request: url parameter was not set");
+    }
+    String linkType = req.queryParams("linkType");
+    if (linkType == null) {
+      halt(400, "Bad request: linkType parameter was not set");
+    }
+    String next = Optional.ofNullable(req.queryParams("next")).orElse("");
+    return client.getLinks(rawUrl, linkType, next, pageNum(req));
+  }
+
+  public void start(IndexClient client, int port, Path templatePath) {
+    this.client = client;
+    Spark.port(port);
+    staticFiles.location("/assets");
+
+    FreeMarkerEngine freeMarkerEngine = new FreeMarkerEngine();
+    if (templatePath != null && Files.exists(templatePath)) { // else keep the engine's own setup
+      log.info("Serving freemarker templates from {}", templatePath.toAbsolutePath());
+      Configuration freeMarkerConfig = new Configuration();
+      try {
+        freeMarkerConfig.setDirectoryForTemplateLoading(templatePath.toFile());
+      } catch (IOException e) {
+        throw new IllegalStateException(e);
+      }
+      freeMarkerEngine.setConfiguration(freeMarkerConfig);
+    }
+
+    Gson gson = new Gson(); // renders the /api/* responses as JSON
+    get("/", (req, res) -> new ModelAndView(null, "home.ftl"), freeMarkerEngine);
+    get("/top", (req, res) -> new ModelAndView(Collections.singletonMap("top", getTop(req)),
+        "top.ftl"), freeMarkerEngine);
+    get("/api/top", (req, res) -> getTop(req), gson::toJson);
+    get("/page", (req, res) -> new ModelAndView(Collections.singletonMap("page", getPage(req)),
+        "page.ftl"), freeMarkerEngine);
+    get("/api/page", (req, res) -> getPage(req), gson::toJson);
+    get("/pages", (req, res) -> new ModelAndView(Collections.singletonMap("pages", getPages(req)),
+        "pages.ftl"), freeMarkerEngine);
+    get("/api/pages", (req, res) -> getPages(req), gson::toJson);
+    get("/links", (req, res) -> new ModelAndView(Collections.singletonMap("links", getLinks(req)),
+        "links.ftl"), freeMarkerEngine);
+    get("/api/links", (req, res) -> getLinks(req), gson::toJson);
+  }
+
+  public void stop() {
+    Spark.stop();
+  }
+
+  public static void main(String[] args) throws Exception {
+    WebIndexConfig webIndexConfig = WebIndexConfig.load();
+    File connPropsFile = new File(webIndexConfig.getConnPropsPath());
+    FluoConfiguration fluoConfig = new FluoConfiguration(connPropsFile);
+    fluoConfig.setApplicationName(webIndexConfig.fluoApp);
+    try (FluoAdmin admin = FluoFactory.newAdmin(fluoConfig)) { // copy the stored app config over
+      for (Map.Entry<String, String> entry : admin.getApplicationConfig().toMap().entrySet()) {
+        fluoConfig.setProperty(entry.getKey(), entry.getValue());
+      }
+    }
+    Connector conn = AccumuloUtil.getConnector(fluoConfig);
+    IndexClient client = new IndexClient(webIndexConfig.accumuloIndexTable, conn);
+    WebServer webServer = new WebServer();
+    webServer.start(client, 4567, null);
+  }
+}
diff --git a/webindex/modules/ui/src/main/resources/assets/img/webindex.png b/webindex/modules/ui/src/main/resources/assets/img/webindex.png
new file mode 100644
index 0000000..1f1cf4f
--- /dev/null
+++ b/webindex/modules/ui/src/main/resources/assets/img/webindex.png
Binary files differ
diff --git a/webindex/modules/ui/src/main/resources/spark/template/freemarker/404.ftl b/webindex/modules/ui/src/main/resources/spark/template/freemarker/404.ftl
new file mode 100644
index 0000000..24753a1
--- /dev/null
+++ b/webindex/modules/ui/src/main/resources/spark/template/freemarker/404.ftl
@@ -0,0 +1,10 @@
+<html>
+<#include "common/head.ftl"><#-- static "page not found" view; head.ftl supplies the shared head element -->
+<body>
+<div class="container" style="margin-top: 20px"><#-- left open here; common/footer.ftl closes it together with body and html -->
+<div class="row">
+ <div class="col-md-6 col-md-offset-3" style="margin-top: 200px">
+ <h2>404: Page not found</h2>
+ </div>
+</div>
+<#include "common/footer.ftl">
diff --git a/webindex/modules/ui/src/main/resources/spark/template/freemarker/common/footer.ftl b/webindex/modules/ui/src/main/resources/spark/template/freemarker/common/footer.ftl
new file mode 100644
index 0000000..9943ff0
--- /dev/null
+++ b/webindex/modules/ui/src/main/resources/spark/template/freemarker/common/footer.ftl
@@ -0,0 +1,3 @@
+</div><#-- closes the "container" div that the including template (or common/header.ftl) left open -->
+</body>
+</html>
diff --git a/webindex/modules/ui/src/main/resources/spark/template/freemarker/common/head.ftl b/webindex/modules/ui/src/main/resources/spark/template/freemarker/common/head.ftl
new file mode 100644
index 0000000..39b4c11
--- /dev/null
+++ b/webindex/modules/ui/src/main/resources/spark/template/freemarker/common/head.ftl
@@ -0,0 +1,5 @@
+<head><#-- Shared head element: Bootstrap 3.3.5 CSS, theme and JS from the CDN, each pinned with an integrity hash. NOTE(review): Bootstrap 3 JS expects jQuery, which is not loaded here; confirm whether the script is needed. -->
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" crossorigin="anonymous">
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap-theme.min.css" integrity="sha384-aUGj/X2zp5rLCbBxumKTCw2Z50WgIr1vs/PFN4praOTvYXWlVyh2UtNUU0KAUhAX" crossorigin="anonymous">
+ <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/js/bootstrap.min.js" integrity="sha512-K1qjQ+NcF2TYO/eI3M6v8EiNYZfA95pQumfvcVrTHtwQVDG+aHRqLi/ETn2uB+1JqwYqVG3LIvdm9lj6imS/pQ==" crossorigin="anonymous"></script>
+</head>
diff --git a/webindex/modules/ui/src/main/resources/spark/template/freemarker/common/header.ftl b/webindex/modules/ui/src/main/resources/spark/template/freemarker/common/header.ftl
new file mode 100644
index 0000000..0163231
--- /dev/null
+++ b/webindex/modules/ui/src/main/resources/spark/template/freemarker/common/header.ftl
@@ -0,0 +1,10 @@
+<#setting url_escaping_charset='ISO-8859-1'><#-- charset the ?url built-in uses when templates including this header build query strings -->
+<html>
+<#include "head.ftl">
+<body>
+<div class="container" style="margin-top: 20px"><#-- html, body and this container stay open; common/footer.ftl closes them -->
+<div class="row" style="margin-bottom: 10px">
+ <div class="col-md-6">
+ <a href="/"><img src="/img/webindex.png" alt="WebIndex Home" style="height:30px;"></a>
+ </div>
+</div>
diff --git a/webindex/modules/ui/src/main/resources/spark/template/freemarker/home.ftl b/webindex/modules/ui/src/main/resources/spark/template/freemarker/home.ftl
new file mode 100644
index 0000000..c3d9f20
--- /dev/null
+++ b/webindex/modules/ui/src/main/resources/spark/template/freemarker/home.ftl
@@ -0,0 +1,26 @@
+<html>
+<#include "common/head.ftl"><#-- Landing page: domain search form plus a link to the list of top pages. -->
+<body>
+<div class="container" style="margin-top: 20px"><#-- closed by common/footer.ftl -->
+<div class="row">
+ <div class="col-md-6 col-md-offset-3" style="margin-top: 200px">
+ <img src="/img/webindex.png" alt="WebIndex">
+ <div style="margin-top: 25px;">
+ <h4>Enter a domain to view known webpages in that domain:</h4>
+ </div>
+ <form action="pages" method="get"><#-- sends the text box as the "domain" query parameter -->
+ <div class="input-group" style="margin-top: 25px">
+ <input type="text" class="form-control" name="domain" placeholder="Example: apache.org">
+ <span class="input-group-btn">
+ <button class="btn btn-default" type="submit">Search</button>
+ </span>
+ </div>
+ </form>
+ </div>
+</div>
+<div class="row">
+ <div class="col-md-6 col-md-offset-3" style="margin-top: 20px">
+ <p><b>Or view the webpages with the most inbound links for <a href="/top">all processed data</a>.</b></p>
+ </div>
+</div>
+<#include "common/footer.ftl">
diff --git a/webindex/modules/ui/src/main/resources/spark/template/freemarker/links.ftl b/webindex/modules/ui/src/main/resources/spark/template/freemarker/links.ftl
new file mode 100644
index 0000000..c14ff74
--- /dev/null
+++ b/webindex/modules/ui/src/main/resources/spark/template/freemarker/links.ftl
@@ -0,0 +1,45 @@
+<#include "common/header.ftl"><#-- Paged inbound/outbound link list for one URL; reads the "links" model (url, linkType, links, pageNum, total, next). -->
+<#if links.links?has_content>
+<div class="row">
+ <div class="col-md-12">
+ <#if links.linkType == "in">
+ <h4>Webpages that link to <a href="/page?url=${links.url?url}">${links.url?html}</a></h4>
+ <#else>
+ <h4>Outbound links from <a href="/page?url=${links.url?url}">${links.url?html}</a></h4>
+ </#if>
+ </div>
+</div>
+<div class="row">
+ <div class="col-md-6">
+ <h4>Page ${links.pageNum+1} of ${links.total} results</h4>
+ </div>
+ <div class="col-md-6">
+ <#if (links.next?length > 0)><#-- ?c below writes plain digits into the URL instead of locale-grouped numbers such as 1,000 -->
+ <a class="btn btn-default pull-right" href="/links?url=${links.url?url}&linkType=${links.linkType?url}&next=${links.next?url}&pageNum=${(links.pageNum+1)?c}">Next</a>
+ </#if>
+ <#if (links.pageNum - 1 >= 0)>
+ <a class="btn btn-default pull-right" href="/links?url=${links.url?url}&linkType=${links.linkType?url}&pageNum=${(links.pageNum - 1)?c}">Previous</a>
+ </#if>
+ </div>
+</div>
+<div class="row">
+ <div class="col-md-12">
+ <table class="table table-striped">
+ <thead><th>URL</th><th>Anchor Text</th></thead>
+ <#list links.links as link>
+ <tr>
+ <td><a href="/page?url=${link.url?url}">${link.url?html}</a></td>
+ <td>${link.anchorText?html}</td>
+ </tr>
+ </#list>
+ </table>
+ </div>
+</div>
+<#else><#-- linkType is escaped below because any value other than "in" reaches this template unchanged -->
+<div class="row">
+ <div class="col-md-12">
+ <h3>No ${links.linkType?cap_first?html}bound links to page: ${links.url?html}</h3>
+ </div>
+</div>
+</#if>
+<#include "common/footer.ftl">
diff --git a/webindex/modules/ui/src/main/resources/spark/template/freemarker/page.ftl b/webindex/modules/ui/src/main/resources/spark/template/freemarker/page.ftl
new file mode 100644
index 0000000..5db7610
--- /dev/null
+++ b/webindex/modules/ui/src/main/resources/spark/template/freemarker/page.ftl
@@ -0,0 +1,20 @@
+<#include "common/header.ftl"><#-- Detail view for a single page; reads the "page" model (url, domain, numInbound, numOutbound, title, server, crawlDate). -->
+<div class="row">
+ <div class="col-md-8 col-md-offset-4">
+ <h3>Page Info</h3>
+ <table class="table table-striped">
+ <#if page.crawlDate??><#-- title, outbound links, server and crawl date are shown only when a crawl date exists -->
+ <tr><td>Title<td>${(page.title!'')?html}</tr>
+ </#if>
+ <tr><td>URL<td>${page.url?html} - <a href="${page.url?html}">Go to page</a></tr>
+ <tr><td>Domain<td><a href="pages?domain=${page.domain?url}">${page.domain?html}</a></tr>
+ <tr><td>Inbound links<td><a href="/links?url=${page.url?url}&linkType=in">${page.numInbound}</a></tr>
+ <#if page.crawlDate??><#-- defaults are parenthesized: ?html binds tighter than !, so x!''?html would leave x unescaped -->
+ <tr><td>Outbound links<td><a href="/links?url=${page.url?url}&linkType=out">${page.numOutbound}</a></tr>
+ <tr><td>Server<td>${(page.server!'')?html}</tr>
+ <tr><td>Last Crawled<td>${(page.crawlDate!'')?html}</tr>
+ </#if>
+ </table>
+ </div>
+</div>
+<#include "common/footer.ftl">
diff --git a/webindex/modules/ui/src/main/resources/spark/template/freemarker/pages.ftl b/webindex/modules/ui/src/main/resources/spark/template/freemarker/pages.ftl
new file mode 100644
index 0000000..8c1a614
--- /dev/null
+++ b/webindex/modules/ui/src/main/resources/spark/template/freemarker/pages.ftl
@@ -0,0 +1,41 @@
+<#include "common/header.ftl"><#-- Paged list of pages in one domain; reads the "pages" model (domain, pages, pageNum, total, next). -->
+<#if pages.pages?has_content>
+<div class="row">
+ <div class="col-md-12">
+ <h3>Webpages in <b>${pages.domain?html}</b> domain (ordered by number of inbound links)</h3>
+ </div>
+</div>
+<div class="row">
+ <div class="col-md-6">
+ <h4>Page ${pages.pageNum+1} of ${pages.total} results</h4>
+ </div>
+ <div class="col-md-6">
+ <#if (pages.next?length > 0)><#-- ?c below writes plain digits into the URL instead of locale-grouped numbers such as 1,000 -->
+ <a class="btn btn-default pull-right" href="/pages?domain=${pages.domain?url}&next=${pages.next?url}&pageNum=${(pages.pageNum+1)?c}">Next</a>
+ </#if>
+ <#if (pages.pageNum - 1 >= 0)>
+ <a class="btn btn-default pull-right" href="/pages?domain=${pages.domain?url}&pageNum=${(pages.pageNum - 1)?c}">Previous</a>
+ </#if>
+ </div>
+</div>
+<div class="row">
+ <div class="col-md-12">
+ <table class="table table-striped">
+ <thead><th>Inbound Links</th><th>URL</th></thead>
+ <#list pages.pages as page>
+ <tr>
+ <td class="col-md-2">${page.score?html}</td>
+ <td class="col-md-10"><a href="/page?url=${page.url?url}">${page.url?html}</a></td>
+ </tr>
+ </#list>
+ </table>
+ </div>
+</div>
+<#else>
+<div class="row">
+ <div class="col-md-12">
+ <h3>No results for ${pages.domain?html}</h3>
+ </div>
+</div>
+</#if>
+<#include "common/footer.ftl">
diff --git a/webindex/modules/ui/src/main/resources/spark/template/freemarker/top.ftl b/webindex/modules/ui/src/main/resources/spark/template/freemarker/top.ftl
new file mode 100644
index 0000000..34bd312
--- /dev/null
+++ b/webindex/modules/ui/src/main/resources/spark/template/freemarker/top.ftl
@@ -0,0 +1,41 @@
+<#include "common/header.ftl"><#-- Paged list of the pages with the most inbound links; reads the "top" model (results, pageNum, next). -->
+<#if top.results?has_content>
+<div class="row">
+ <div class="col-md-12">
+ <h3>Webpages with the most inbound links for <b>all processed data</b></h3>
+ </div>
+</div>
+<div class="row">
+ <div class="col-md-6">
+ <h4>Page ${top.pageNum+1}</h4>
+ </div>
+ <div class="col-md-6">
+ <#if top.next??><#-- ?c below writes plain digits into the URL instead of locale-grouped numbers such as 1,000 -->
+ <a class="btn btn-default pull-right" href="/top?next=${top.next?url}&pageNum=${(top.pageNum+1)?c}">Next</a>
+ </#if>
+ <#if (top.pageNum - 1 >= 0)>
+ <a class="btn btn-default pull-right" href="/top?pageNum=${(top.pageNum-1)?c}">Previous</a>
+ </#if>
+ </div>
+</div>
+<div class="row">
+ <div class="col-md-12">
+ <table class="table table-striped">
+ <thead><th>Inbound Links</th><th>URL</th></thead>
+ <#list top.results as result>
+ <tr>
+ <td class="col-md-2">${result.value?html}</td>
+ <td class="col-md-10"><a href="/page?url=${result.key?url}">${result.key?html}</a></td>
+ </tr>
+ </#list>
+ </table>
+ </div>
+</div>
+<#else>
+<div class="row">
+ <div class="col-md-12">
+ <h3>No results found</h3>
+ </div>
+</div>
+</#if>
+<#include "common/footer.ftl">
diff --git a/webindex/pom.xml b/webindex/pom.xml
new file mode 100644
index 0000000..93838f9
--- /dev/null
+++ b/webindex/pom.xml
@@ -0,0 +1,270 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2015 Webindex authors (see AUTHORS)
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>io.fluo</groupId>
+ <artifactId>fluo-io-parent</artifactId>
+ <version>2</version>
+ </parent>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-parent</artifactId>
+ <version>0.0.1-SNAPSHOT</version>
+ <packaging>pom</packaging>
+ <name>WebIndex Parent</name>
+ <description>Example Fluo application that creates web index using CommonCrawl data.</description>
+ <url>https://github.com/astralway/webindex</url>
+ <modules>
+ <module>modules/core</module>
+ <module>modules/data</module>
+ <module>modules/ui</module>
+ <module>modules/integration</module>
+ </modules>
+ <properties> <!-- version pins; the *.version entries are referenced as ${...} from dependencyManagement in this POM -->
+ <accumulo.version>1.7.3</accumulo.version>
+ <fluo-recipes.version>1.1.0-incubating</fluo-recipes.version>
+ <fluo.version>1.2.0</fluo.version>
+ <hadoop.version>2.6.3</hadoop.version>
+ <maven.compiler.source>1.8</maven.compiler.source> <!-- standard maven-compiler-plugin user properties: Java 8 source and target -->
+ <maven.compiler.target>1.8</maven.compiler.target>
+ <spark.version>1.6.2</spark.version> <!-- Apache Spark (spark-core_2.10), not the com.sparkjava web framework -->
+ <thrift.version>0.9.1</thrift.version>
+ </properties>
+ <dependencyManagement>
+ <dependencies>
+ <dependency>
+ <groupId>com.esotericsoftware.yamlbeans</groupId>
+ <artifactId>yamlbeans</artifactId>
+ <version>1.09</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ <version>2.3.1</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>14.0.1</version>
+ </dependency>
+ <dependency>
+ <groupId>com.sparkjava</groupId>
+ <artifactId>spark-core</artifactId>
+ <version>2.5</version>
+ </dependency>
+ <dependency>
+ <groupId>com.sparkjava</groupId>
+ <artifactId>spark-template-freemarker</artifactId>
+ <version>2.3</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>2.4</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ <version>2.6</version>
+ </dependency>
+ <dependency>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-data</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>io.github.astralway</groupId>
+ <artifactId>webindex-ui</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.12</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.accumulo</groupId>
+ <artifactId>accumulo-core</artifactId>
+ <version>${accumulo.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.accumulo</groupId>
+ <artifactId>accumulo-minicluster</artifactId>
+ <version>${accumulo.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.accumulo</groupId>
+ <artifactId>accumulo-test</artifactId>
+ <version>${accumulo.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-api</artifactId>
+ <version>${fluo.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-core</artifactId>
+ <version>${fluo.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-integration</artifactId>
+ <version>${fluo.version}</version>
+ <type>test-jar</type>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-mapreduce</artifactId>
+ <version>${fluo.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-mini</artifactId>
+ <version>${fluo.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-accumulo</artifactId>
+ <version>${fluo-recipes.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-core</artifactId>
+ <version>${fluo-recipes.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-kryo</artifactId>
+ <version>${fluo-recipes.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-spark</artifactId>
+ <version>${fluo-recipes.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-test</artifactId>
+ <version>${fluo-recipes.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-core_2.10</artifactId>
+ <version>${spark.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.thrift</groupId>
+ <artifactId>libthrift</artifactId>
+ <version>${thrift.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.zookeeper</groupId>
+ <artifactId>zookeeper</artifactId>
+ <version>3.4.6</version>
+ </dependency>
+ <dependency>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
+ <version>1.9.2</version>
+ </dependency>
+ <dependency>
+ <groupId>org.netpreserve.commons</groupId>
+ <artifactId>webarchive-commons</artifactId>
+ <version>1.1.7</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>1.7.12</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <version>1.7.12</version>
+ </dependency>
+ </dependencies>
+ </dependencyManagement>
+ <build>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifest>
+ <addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
+ <addDefaultImplementationEntries>true</addDefaultImplementationEntries>
+ </manifest>
+ <manifestEntries>
+ <!-- sealing breaks ITs with shaded jar, which is used by this example -->
+ <Sealed>false</Sealed>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <excludes> <!-- files the RAT license-header audit should skip: docs, test data, templates, generated output -->
+ <exclude>README.md</exclude>
+ <exclude>contrib/webindex-dashboard.json</exclude>
+ <exclude>docs/**/*.md</exclude> <!-- "**" only recurses as a whole path segment; "docs/**.md" matched docs/*.md only -->
+ <exclude>conf/webindex-tests.txt</exclude>
+ <exclude>src/test/resources/5-pages.txt</exclude>
+ <exclude>src/test/resources/*.warc</exclude>
+ <exclude>src/test/resources/data/set1/*.txt</exclude>
+ <exclude>src/main/resources/splits/*.txt</exclude>
+ <exclude>src/main/resources/spark/template/freemarker/*.ftl</exclude>
+ <exclude>src/main/resources/spark/template/freemarker/common/*.ftl</exclude>
+ <exclude>logs/*</exclude>
+ <exclude>data/*</exclude>
+ <exclude>dependency-reduced-pom.xml</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <configuration>
+ <systemPropertyVariables>
+ <fluo.it.instance.name>it-instance-maven</fluo.it.instance.name>
+ <fluo.it.instance.clear>false</fluo.it.instance.clear>
+ </systemPropertyVariables>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>exec-maven-plugin</artifactId>
+ <version>1.5.0</version>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
+</project>