Add 'phrasecount/' from commit '64bf7e524073e5997fdc7cd2e72bc0f2343cbeff'
git-subtree-dir: phrasecount
git-subtree-mainline: 6ebcf9b448b25cf927a14f38ad4b6223e84760bb
git-subtree-split: 64bf7e524073e5997fdc7cd2e72bc0f2343cbeff
diff --git a/phrasecount/.gitignore b/phrasecount/.gitignore
new file mode 100644
index 0000000..93eea5d
--- /dev/null
+++ b/phrasecount/.gitignore
@@ -0,0 +1,6 @@
+.classpath
+.project
+.settings
+target
+.idea
+*.iml
diff --git a/phrasecount/.travis.yml b/phrasecount/.travis.yml
new file mode 100644
index 0000000..e36964e
--- /dev/null
+++ b/phrasecount/.travis.yml
@@ -0,0 +1,12 @@
+language: java
+jdk:
+ - oraclejdk8
+script: mvn verify
+notifications:
+ irc:
+ channels:
+ - "chat.freenode.net#fluo"
+ on_success: always
+ on_failure: always
+ use_notice: true
+ skip_join: true
diff --git a/phrasecount/LICENSE b/phrasecount/LICENSE
new file mode 100644
index 0000000..e06d208
--- /dev/null
+++ b/phrasecount/LICENSE
@@ -0,0 +1,202 @@
+Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright {yyyy} {name of copyright owner}
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
diff --git a/phrasecount/README.md b/phrasecount/README.md
new file mode 100644
index 0000000..74a6509
--- /dev/null
+++ b/phrasecount/README.md
@@ -0,0 +1,164 @@
+# Phrase Count
+
+[![Build Status](https://travis-ci.org/astralway/phrasecount.svg?branch=master)](https://travis-ci.org/astralway/phrasecount)
+
+An example application that computes phrase counts for unique documents using Apache Fluo. Each
+unique document that is added causes phrase counts to be incremented. Unique documents have
+reference counts based on the number of locations that point to them. When a unique document is no
+longer referenced by any location, then the phrase counts will be decremented appropriately.
+
+After phrase counts are incremented, export transactions send phrase counts to an Accumulo table.
+The purpose of exporting data is to make it available for query. Percolator is not designed to
+support queries, because its transactions are designed for throughput and not responsiveness.
+
+This example uses the Collision Free Map and Export Queue from [Apache Fluo Recipes][11]. A
+Collision Free Map is used to calculate phrase counts. An Export Queue is used to update the
+external Accumulo table in a fault tolerant manner. Before using Fluo Recipes, this example was
+quite complex. Switching to Fluo Recipes dramatically simplified this example.
+
+## Schema
+
+### Fluo Table Schema
+
+This example uses the following schema for the table used by Apache Fluo.
+
+Row | Column | Value | Purpose
+-------------|---------------|-------------------|---------------------------------------------------------------------
+uri:\<uri\> | doc:hash | \<hash\> | Contains the hash of the document found at the URI
+doc:\<hash\> | doc:content | \<document\> | The contents of the document
+doc:\<hash\> | doc:refCount | \<int\> | The number of URIs that reference this document
+doc:\<hash\> | index:check | empty | Setting this column triggers the observer that indexes the document
+doc:\<hash\> | index:status | INDEXED or empty | Used to track the status of whether this document was indexed
+
+Additionally the two recipes used by the example store their data in the table
+under two row prefixes. Nothing else should be stored within these prefixes.
+The collision free map used to compute phrasecounts stores data within the row
+prefix `pcm:`. The export queue stores data within the row prefix `aeq:`.
+
+### External Table Schema
+
+This example uses the following schema for the external Accumulo table.
+
+Row | Column | Value | Purpose
+-----------|-----------------|------------|---------------------------------------------------------------------
+\<phrase\> | stat:totalCount | \<count\> | For a given phrase, the value is the total number of times that phrase occurred in all documents.
+\<phrase\> | stat:docCount | \<count\> | For a given phrase, the value is the number of documents in which that phrase occurred.
+
+[PhraseCountTable][14] encapsulates all of the code for interacting with this
+external table.
+
+## Code Overview
+
+Documents are loaded into the Fluo table by [DocumentLoader][1] which is
+executed by [Load][2]. [DocumentLoader][1] handles reference counting of
+unique documents and may set a notification for [DocumentObserver][3].
+[DocumentObserver][3] increments or decrements global phrase counts by
+inserting `+1` or `-1` into a collision free map for each phrase in a document.
+[PhraseMap][4] contains the code called by the collision free map recipe. The
+code in [PhraseMap][4] does two things. First it computes the phrase counts by
+summing the updates. Second it places the newly computed phrase count on an
+export queue. [PhraseExporter][5] is called by the export queue recipe to
+generate mutations to update the external Accumulo table.
+
+All observers and recipes are configured by code in [Application][10]. All
+observers are run by the Fluo worker processes when notifications trigger them.
+
+## Building
+
+After cloning this repository, build with the following command.
+
+```
+mvn package
+```
+
+## Running via Maven
+
+If you do not have Accumulo, Hadoop, Zookeeper, and Fluo set up, then you can
+start a MiniFluo instance with the [mini.sh](bin/mini.sh) script. This script
+will run [Mini.java][12] using Maven. The command will create a
+`fluo.properties` file that can be used by the other commands in this section.
+
+```bash
+./bin/mini.sh /tmp/mac fluo.properties
+```
+
+After the mini command prints out `Wrote : fluo.properties` then it's ready to
+use. Run `tail -f mini.log` and look for the message about writing
+fluo.properties.
+
+This command will automatically configure [PhraseExporter][5] to export phrases
+to an Accumulo table named `pcExport`.
+
+The scripts in `bin` pass `-Dexec.classpathScope=test` to Maven because it adds the test
+[log4j.properties][7] file to the classpath.
+
+### Adding documents
+
+The [load.sh](bin/load.sh) runs [Load.java][2] which scans the directory
+`$TXT_DIR` looking for .txt files to add. The scan is recursive.
+
+```bash
+./bin/load.sh fluo.properties $TXT_DIR
+```
+
+### Printing phrases
+
+After documents are added, [print.sh](bin/print.sh) will run [Print.java][13]
+which prints out phrase counts. Try modifying a document you added and running
+the load command again; you should eventually see the phrase counts change.
+
+```bash
+./bin/print.sh fluo.properties pcExport
+```
+
+The command will print out the number of unique documents and the number
+of processed documents. If the number of processed documents is less than the
+number of unique documents, then there is still work to do. After the load
+command runs, the documents will have been added or updated. However the
+phrase counts will not update until the Observer runs in the background.
+
+### Killing mini
+
+Make sure to kill mini when finished testing. The following command will kill it.
+
+```bash
+pkill -f phrasecount.cmd.Mini
+```
+
+## Deploying example
+
+The following script can run this example on a cluster using the Fluo
+distribution and serves as executable documentation for deployment. The
+previous maven commands using the exec plugin are convenient for a development
+environment, whereas the following script shows how things would work in a
+production environment.
+
+ * [run.sh](bin/run.sh) : Runs this example with YARN using the Fluo tar
+ distribution. Running in this way requires setting up Hadoop, Zookeeper,
+ and Accumulo instances separately. The [Uno][8] and [Muchos][9]
+ projects were created to ease setting up these external dependencies.
+
+## Generating data
+
+Need some data? Use `elinks` to generate text files from web pages.
+
+```
+mkdir data
+elinks -dump 1 -no-numbering -no-references http://accumulo.apache.org > data/accumulo.txt
+elinks -dump 1 -no-numbering -no-references http://hadoop.apache.org > data/hadoop.txt
+elinks -dump 1 -no-numbering -no-references http://zookeeper.apache.org > data/zookeeper.txt
+```
+
+[1]: src/main/java/phrasecount/DocumentLoader.java
+[2]: src/main/java/phrasecount/cmd/Load.java
+[3]: src/main/java/phrasecount/DocumentObserver.java
+[4]: src/main/java/phrasecount/PhraseMap.java
+[5]: src/main/java/phrasecount/PhraseExporter.java
+[7]: src/test/resources/log4j.properties
+[8]: https://github.com/astralway/uno
+[9]: https://github.com/astralway/muchos
+[10]: src/main/java/phrasecount/Application.java
+[11]: https://github.com/apache/fluo-recipes
+[12]: src/main/java/phrasecount/cmd/Mini.java
+[13]: src/main/java/phrasecount/cmd/Print.java
+[14]: src/main/java/phrasecount/query/PhraseCountTable.java
diff --git a/phrasecount/bin/copy-jars.sh b/phrasecount/bin/copy-jars.sh
new file mode 100755
index 0000000..a92ac5f
--- /dev/null
+++ b/phrasecount/bin/copy-jars.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Builds the phrase count jar and copies it and its dependencies to the Fluo
+# application lib dir.
+
+# stop if any command fails, so a broken build is never copied
+set -e
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage : $0 <FLUO HOME> <PHRASECOUNT_HOME>"
+  exit 1
+fi
+
+FLUO_HOME=$1
+PC_HOME=$2
+PC_JAR="$PC_HOME/target/phrasecount-0.0.1-SNAPSHOT.jar"
+# honor $FLUO_APP_NAME like run.sh does, defaulting to phrasecount
+FLUO_APP_LIB="$FLUO_HOME/apps/${FLUO_APP_NAME:-phrasecount}/lib/"
+
+# build the jar, then copy it and its dependencies
+(cd "$PC_HOME" && mvn package -DskipTests)
+[ -d "$FLUO_APP_LIB" ] || { echo "Fluo application lib dir $FLUO_APP_LIB does not exist"; exit 1; }
+cp "$PC_JAR" "$FLUO_APP_LIB"
+(cd "$PC_HOME" && mvn dependency:copy-dependencies -DoutputDirectory="$FLUO_APP_LIB")
diff --git a/phrasecount/bin/load.sh b/phrasecount/bin/load.sh
new file mode 100755
index 0000000..4c9a904
--- /dev/null
+++ b/phrasecount/bin/load.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Runs phrasecount.cmd.Load through Maven, forwarding all script arguments.
+mvn exec:java -Dexec.mainClass=phrasecount.cmd.Load -Dexec.classpathScope=test -Dexec.args="$*"
diff --git a/phrasecount/bin/mini.sh b/phrasecount/bin/mini.sh
new file mode 100755
index 0000000..b8b60a4
--- /dev/null
+++ b/phrasecount/bin/mini.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Launches phrasecount.cmd.Mini through Maven in the background; all output goes to mini.log.
+mvn exec:java -Dexec.mainClass=phrasecount.cmd.Mini -Dexec.classpathScope=test -Dexec.args="$*" &>mini.log &
+echo "Started Mini in background. Writing output to mini.log."
diff --git a/phrasecount/bin/print.sh b/phrasecount/bin/print.sh
new file mode 100755
index 0000000..198fad9
--- /dev/null
+++ b/phrasecount/bin/print.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Runs phrasecount.cmd.Print through Maven, forwarding all script arguments.
+mvn exec:java -Dexec.mainClass=phrasecount.cmd.Print -Dexec.classpathScope=test -Dexec.args="$*"
+
diff --git a/phrasecount/bin/run.sh b/phrasecount/bin/run.sh
new file mode 100755
index 0000000..8f6e46a
--- /dev/null
+++ b/phrasecount/bin/run.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# Deploys phrasecount to a Fluo install, loads the given text files and prints phrase counts.
+
+BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+PC_HOME=$( cd "$( dirname "$BIN_DIR" )" && pwd )
+
+# stop if any command fails
+set -e
+
+if [ "$#" -ne 1 ]; then
+  echo "Usage : $0 <TXT FILES DIR>"
+  exit 1
+fi
+
+#set the following to a directory containing text files
+TXT_DIR=$1
+if [ ! -d "$TXT_DIR" ]; then
+  echo "Document directory $TXT_DIR does not exist"
+  exit 1
+fi
+
+#ensure $FLUO_HOME is set
+if [ -z "$FLUO_HOME" ]; then
+  echo '$FLUO_HOME must be set!'
+  exit 1
+fi
+
+#Set application name. $FLUO_APP_NAME is set by fluo-dev and zetten
+APP=${FLUO_APP_NAME:-phrasecount}
+#derived variables
+APP_PROPS="$FLUO_HOME/apps/$APP/conf/fluo.properties"
+
+if [ ! -f "$FLUO_HOME/conf/fluo.properties" ]; then
+  echo "Fluo is not configured, exiting."
+  exit 1
+fi
+
+#remove application if it exists
+if [ -d "$FLUO_HOME/apps/$APP" ]; then
+  echo "Restarting '$APP' application. Errors may be printed if it's not running..."
+  "$FLUO_HOME/bin/fluo" kill "$APP" || true
+  rm -rf "$FLUO_HOME/apps/$APP"
+fi
+
+#create new application dir
+"$FLUO_HOME/bin/fluo" new "$APP"
+
+#copy phrasecount jars to Fluo application lib dir
+"$PC_HOME/bin/copy-jars.sh" "$FLUO_HOME" "$PC_HOME"
+
+#Create export table and append the generated Fluo configuration to the application properties
+"$FLUO_HOME/bin/fluo" exec "$APP" phrasecount.cmd.Setup "$APP_PROPS" pcExport >> "$APP_PROPS"
+
+"$FLUO_HOME/bin/fluo" init "$APP" -f
+"$FLUO_HOME/bin/fluo" exec "$APP" org.apache.fluo.recipes.accumulo.cmds.OptimizeTable
+"$FLUO_HOME/bin/fluo" start "$APP"
+"$FLUO_HOME/bin/fluo" info "$APP"
+
+#Load data
+"$FLUO_HOME/bin/fluo" exec "$APP" phrasecount.cmd.Load "$APP_PROPS" "$TXT_DIR"
+
+#wait for all notifications to be processed.
+"$FLUO_HOME/bin/fluo" wait "$APP"
+
+#print phrase counts
+"$FLUO_HOME/bin/fluo" exec "$APP" phrasecount.cmd.Print "$APP_PROPS" pcExport
+
+"$FLUO_HOME/bin/fluo" stop "$APP"
diff --git a/phrasecount/pom.xml b/phrasecount/pom.xml
new file mode 100644
index 0000000..bb9afde
--- /dev/null
+++ b/phrasecount/pom.xml
@@ -0,0 +1,98 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>io.github.astralway</groupId>
+ <artifactId>phrasecount</artifactId>
+ <version>0.0.1-SNAPSHOT</version>
+ <packaging>jar</packaging>
+
+ <name>phrasecount</name>
+ <url>https://github.com/astralway/phrasecount</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <accumulo.version>1.7.2</accumulo.version>
+ <fluo.version>1.0.0-incubating</fluo.version>
+ <fluo-recipes.version>1.0.0-incubating</fluo-recipes.version>
+ </properties>
+
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.1</version>
+ <configuration>
+ <source>1.8</source>
+ <target>1.8</target>
+ <optimize>true</optimize>
+ <encoding>UTF-8</encoding>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <version>2.10</version>
+ <configuration>
+ <!--define the specific dependencies to copy into the Fluo application dir-->
+ <includeArtifactIds>fluo-recipes-core,fluo-recipes-accumulo,fluo-recipes-kryo,kryo,minlog,reflectasm,objenesis</includeArtifactIds>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.11</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.beust</groupId>
+ <artifactId>jcommander</artifactId>
+ <version>1.32</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-api</artifactId>
+ <version>${fluo.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-core</artifactId>
+ <version>${fluo.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-core</artifactId>
+ <version>${fluo-recipes.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-accumulo</artifactId>
+ <version>${fluo-recipes.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-recipes-kryo</artifactId>
+ <version>${fluo-recipes.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.accumulo</groupId>
+ <artifactId>accumulo-core</artifactId>
+ <version>${accumulo.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.fluo</groupId>
+ <artifactId>fluo-mini</artifactId>
+ <version>${fluo.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.accumulo</groupId>
+ <artifactId>accumulo-minicluster</artifactId>
+ <version>${accumulo.version}</version>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/phrasecount/src/main/java/phrasecount/Application.java b/phrasecount/src/main/java/phrasecount/Application.java
new file mode 100644
index 0000000..30d7c3a
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/Application.java
@@ -0,0 +1,71 @@
+package phrasecount;
+
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.api.config.ObserverSpecification;
+import org.apache.fluo.recipes.accumulo.export.AccumuloExporter;
+import org.apache.fluo.recipes.core.export.ExportQueue;
+import org.apache.fluo.recipes.core.map.CollisionFreeMap;
+import org.apache.fluo.recipes.kryo.KryoSimplerSerializer;
+import phrasecount.pojos.Counts;
+import phrasecount.pojos.PcKryoFactory;
+
+import static phrasecount.Constants.EXPORT_QUEUE_ID;
+import static phrasecount.Constants.PCM_ID;
+
+public class Application {
+
+  /** Bucket counts and Accumulo connection settings consumed by {@link #configure}. */
+  public static class Options {
+    // bucket counts for the phrase count map and the export queue
+    public int phraseCountMapBuckets;
+    public int exportQueueBuckets;
+
+    // Accumulo connection details and the table that phrase counts are exported to
+    public String instance;
+    public String zookeepers;
+    public String user;
+    public String password;
+    public String exportTable;
+
+    public Options(int pcmBuckets, int eqBuckets, String instance, String zooKeepers, String user,
+        String password, String eTable) {
+      this.phraseCountMapBuckets = pcmBuckets;
+      this.exportQueueBuckets = eqBuckets;
+      this.instance = instance;
+      this.zookeepers = zooKeepers;
+      this.user = user;
+      this.password = password;
+      this.exportTable = eTable;
+    }
+  }
+
+  /**
+   * Adds everything the phrase count application needs to a Fluo configuration: the document
+   * observer, the Kryo factory, the phrase count map and the Accumulo export queue.
+   *
+   * @param fluoConfig configuration to populate
+   * @param opts bucket counts and Accumulo export settings
+   */
+  public static void configure(FluoConfiguration fluoConfig, Options opts) {
+    // DocumentObserver watches document reference counts; when a document is referenced or
+    // dereferenced it adds or subtracts phrase counts in a collision free map
+    fluoConfig.addObserver(new ObserverSpecification(DocumentObserver.class.getName()));
+
+    // configure which KryoFactory recipes should use
+    KryoSimplerSerializer.setKryoFactory(fluoConfig, PcKryoFactory.class);
+
+    // collision free map that combines phrase counts
+    CollisionFreeMap.Options pcmOpts = new CollisionFreeMap.Options(PCM_ID,
+        PhraseMap.PcmCombiner.class, PhraseMap.PcmUpdateObserver.class, String.class,
+        Counts.class, opts.phraseCountMapBuckets);
+    CollisionFreeMap.configure(fluoConfig, pcmOpts);
+
+    // export queue that sends phrase count updates to an Accumulo table
+    AccumuloExporter.Configuration accumuloConfig = new AccumuloExporter.Configuration(
+        opts.instance, opts.zookeepers, opts.user, opts.password, opts.exportTable);
+    ExportQueue.Options exportQueueOpts = new ExportQueue.Options(EXPORT_QUEUE_ID,
+        PhraseExporter.class.getName(), String.class.getName(), Counts.class.getName(),
+        opts.exportQueueBuckets).setExporterConfiguration(accumuloConfig);
+    ExportQueue.configure(fluoConfig, exportQueueOpts);
+  }
+}
diff --git a/phrasecount/src/main/java/phrasecount/Constants.java b/phrasecount/src/main/java/phrasecount/Constants.java
new file mode 100644
index 0000000..1f73bee
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/Constants.java
@@ -0,0 +1,21 @@
+package phrasecount;
+
+import org.apache.fluo.api.data.Column;
+import org.apache.fluo.recipes.core.types.StringEncoder;
+import org.apache.fluo.recipes.core.types.TypeLayer;
+
+public class Constants {
+
+ // set the encoder to use in one place
+ public static final TypeLayer TYPEL = new TypeLayer(new StringEncoder());
+ // columns of the Fluo table; the README schema section describes what each one holds
+ public static final Column INDEX_CHECK_COL = TYPEL.bc().fam("index").qual("check").vis();
+ public static final Column INDEX_STATUS_COL = TYPEL.bc().fam("index").qual("status").vis();
+ public static final Column DOC_CONTENT_COL = TYPEL.bc().fam("doc").qual("content").vis();
+ public static final Column DOC_HASH_COL = TYPEL.bc().fam("doc").qual("hash").vis();
+ public static final Column DOC_REF_COUNT_COL = TYPEL.bc().fam("doc").qual("refCount").vis();
+ // id of the export queue that sends phrase count updates to the Accumulo table
+ public static final String EXPORT_QUEUE_ID = "aeq";
+ // id of the phrase count collision free map
+ public static final String PCM_ID = "pcm";
+}
diff --git a/phrasecount/src/main/java/phrasecount/DocumentLoader.java b/phrasecount/src/main/java/phrasecount/DocumentLoader.java
new file mode 100644
index 0000000..8384b35
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/DocumentLoader.java
@@ -0,0 +1,73 @@
+package phrasecount;
+
+import org.apache.fluo.api.client.Loader;
+import org.apache.fluo.api.client.TransactionBase;
+import org.apache.fluo.recipes.core.types.TypedTransactionBase;
+import phrasecount.pojos.Document;
+
+import static phrasecount.Constants.DOC_CONTENT_COL;
+import static phrasecount.Constants.DOC_HASH_COL;
+import static phrasecount.Constants.DOC_REF_COUNT_COL;
+import static phrasecount.Constants.INDEX_CHECK_COL;
+import static phrasecount.Constants.TYPEL;
+
+/**
+ * Executes document load transactions which dedupe and reference count documents. If needed, the
+ * observer that updates phrase counts is triggered.
+ */
+public class DocumentLoader implements Loader {
+  private final Document document;
+
+  public DocumentLoader(Document doc) {
+    this.document = doc;
+  }
+
+  /** Repoints the document's URI at its current hash and adjusts both reference counts. */
+  @Override
+  public void load(TransactionBase tx, Context context) throws Exception {
+    // TODO Need a strategy for dealing w/ large documents. If a worker processes many large
+    // documents concurrently, it could cause memory exhaustion. Could break up large documents
+    // into pieces, however, not sure if the example should be complicated with this.
+    TypedTransactionBase ttx = TYPEL.wrap(tx);
+    String uriRow = "uri:" + document.getURI();
+    String storedHash = ttx.get().row(uriRow).col(DOC_HASH_COL).toString();
+    if (storedHash != null && storedHash.equals(document.getHash())) {
+      return; // the URI already references this exact content, nothing to do
+    }
+    ttx.mutate().row(uriRow).col(DOC_HASH_COL).set(document.getHash());
+    Integer refCount =
+        ttx.get().row("doc:" + document.getHash()).col(DOC_REF_COUNT_COL).toInteger();
+    if (refCount == null) {
+      // this document was never seen before
+      addNewDocument(ttx, document);
+    } else {
+      setRefCount(ttx, document.getHash(), refCount, refCount + 1);
+    }
+    if (storedHash != null) {
+      // the URI used to reference other content, so release that reference
+      decrementRefCount(ttx, storedHash);
+    }
+  }
+
+  /** Writes the count; triggers DocumentObserver on a drop to zero or a rise from zero/absent. */
+  private void setRefCount(TypedTransactionBase tx, String hash, Integer prevRc, int rc) {
+    tx.mutate().row("doc:" + hash).col(DOC_REF_COUNT_COL).set(rc);
+    if (rc == 0 || (rc == 1 && (prevRc == null || prevRc == 0))) {
+      // setting this triggers DocumentObserver
+      tx.mutate().row("doc:" + hash).col(INDEX_CHECK_COL).set();
+    }
+  }
+
+  /** Releases one reference on the document stored under the given hash. */
+  private void decrementRefCount(TypedTransactionBase tx, String hash) {
+    int rc = tx.get().row("doc:" + hash).col(DOC_REF_COUNT_COL).toInteger();
+    // the previous count must be this document's own count, not that of the newly loaded one
+    setRefCount(tx, hash, rc, rc - 1);
+  }
+
+  /** Registers never-seen content with a reference count of one. */
+  private void addNewDocument(TypedTransactionBase tx, Document doc) {
+    setRefCount(tx, doc.getHash(), null, 1);
+    tx.mutate().row("doc:" + doc.getHash()).col(DOC_CONTENT_COL).set(doc.getContent());
+  }
+}
diff --git a/phrasecount/src/main/java/phrasecount/DocumentObserver.java b/phrasecount/src/main/java/phrasecount/DocumentObserver.java
new file mode 100644
index 0000000..1c50bfc
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/DocumentObserver.java
@@ -0,0 +1,102 @@
+package phrasecount;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.fluo.api.client.TransactionBase;
+import org.apache.fluo.api.data.Bytes;
+import org.apache.fluo.api.data.Column;
+import org.apache.fluo.api.observer.AbstractObserver;
+import org.apache.fluo.recipes.core.map.CollisionFreeMap;
+import org.apache.fluo.recipes.core.types.TypedTransactionBase;
+import phrasecount.pojos.Counts;
+import phrasecount.pojos.Document;
+
+import static phrasecount.Constants.DOC_CONTENT_COL;
+import static phrasecount.Constants.DOC_REF_COUNT_COL;
+import static phrasecount.Constants.INDEX_CHECK_COL;
+import static phrasecount.Constants.INDEX_STATUS_COL;
+import static phrasecount.Constants.PCM_ID;
+import static phrasecount.Constants.TYPEL;
+
+/**
+ * An Observer that updates phrase counts when a document is added or removed.
+ */
+public class DocumentObserver extends AbstractObserver {
+
+ private CollisionFreeMap<String, Counts> pcMap;
+
+ private enum IndexStatus {
+ INDEXED, UNINDEXED
+ }
+
+ @Override
+ public void init(Context context) throws Exception {
+ pcMap = CollisionFreeMap.getInstance(PCM_ID, context.getAppConfiguration());
+ }
+
+ @Override
+ public void process(TransactionBase tx, Bytes row, Column col) throws Exception {
+
+ TypedTransactionBase ttx = TYPEL.wrap(tx);
+
+ IndexStatus status = getStatus(ttx, row);
+ int refCount = ttx.get().row(row).col(DOC_REF_COUNT_COL).toInteger(0);
+
+ if (status == IndexStatus.UNINDEXED && refCount > 0) {
+ updatePhraseCounts(ttx, row, 1);
+ ttx.mutate().row(row).col(INDEX_STATUS_COL).set(IndexStatus.INDEXED.name());
+ } else if (status == IndexStatus.INDEXED && refCount == 0) {
+ updatePhraseCounts(ttx, row, -1);
+ deleteDocument(ttx, row);
+ }
+
+ // TODO modifying the trigger is currently broken, enable more than one observer to commit for a
+ // notification
+ // tx.delete(row, col);
+
+ }
+
+ @Override
+ public ObservedColumn getObservedColumn() {
+ return new ObservedColumn(INDEX_CHECK_COL, NotificationType.STRONG);
+ }
+
+ private void deleteDocument(TypedTransactionBase tx, Bytes row) {
+ // TODO it would probably be useful to have a deleteRow method on Transaction... this method
+ // could start off w/ a simple implementation and later be
+ // optimized... or could have a delete range option
+
+ // TODO this is brittle, this code assumes it knows all possible columns
+ tx.delete(row, DOC_CONTENT_COL);
+ tx.delete(row, DOC_REF_COUNT_COL);
+ tx.delete(row, INDEX_STATUS_COL);
+ }
+
+ private void updatePhraseCounts(TypedTransactionBase ttx, Bytes row, int multiplier) {
+ String content = ttx.get().row(row).col(Constants.DOC_CONTENT_COL).toString();
+
+ // this makes the assumption that the implementation of getPhrases is invariant. This is
+ // probably a bad assumption. A possible way to make this more robust
+ // is to store the output of getPhrases when indexing and use the stored output when unindexing.
+ // Alternatively, could store the version of Document used for
+ // indexing.
+ Map<String, Integer> phrases = new Document(null, content).getPhrases();
+ Map<String, Counts> updates = new HashMap<>(phrases.size());
+ for (Entry<String, Integer> entry : phrases.entrySet()) {
+ updates.put(entry.getKey(), new Counts(multiplier, entry.getValue() * multiplier));
+ }
+
+ pcMap.update(ttx, updates);
+ }
+
+ private IndexStatus getStatus(TypedTransactionBase tx, Bytes row) {
+ String status = tx.get().row(row).col(INDEX_STATUS_COL).toString();
+
+ if (status == null)
+ return IndexStatus.UNINDEXED;
+
+ return IndexStatus.valueOf(status);
+ }
+}
diff --git a/phrasecount/src/main/java/phrasecount/PhraseExporter.java b/phrasecount/src/main/java/phrasecount/PhraseExporter.java
new file mode 100644
index 0000000..5aec44a
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/PhraseExporter.java
@@ -0,0 +1,24 @@
+package phrasecount;
+
+import java.util.function.Consumer;
+
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.fluo.recipes.accumulo.export.AccumuloExporter;
+import org.apache.fluo.recipes.core.export.SequencedExport;
+import phrasecount.pojos.Counts;
+import phrasecount.query.PhraseCountTable;
+
+/**
+ * Export code that converts {@link Counts} objects from the export queue to Mutations that are
+ * written to Accumulo.
+ */
+public class PhraseExporter extends AccumuloExporter<String, Counts> {
+
+ @Override
+ protected void translate(SequencedExport<String, Counts> export, Consumer<Mutation> consumer) {
+ String phrase = export.getKey();
+ long seq = export.getSequence();
+ Counts counts = export.getValue();
+ consumer.accept(PhraseCountTable.createMutation(phrase, seq, counts));
+ }
+}
diff --git a/phrasecount/src/main/java/phrasecount/PhraseMap.java b/phrasecount/src/main/java/phrasecount/PhraseMap.java
new file mode 100644
index 0000000..01c3bfb
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/PhraseMap.java
@@ -0,0 +1,63 @@
+package phrasecount;
+
+import java.util.Iterator;
+import java.util.Optional;
+
+import com.google.common.collect.Iterators;
+import org.apache.fluo.api.client.TransactionBase;
+import org.apache.fluo.api.observer.Observer.Context;
+import org.apache.fluo.recipes.core.export.Export;
+import org.apache.fluo.recipes.core.export.ExportQueue;
+import org.apache.fluo.recipes.core.map.CollisionFreeMap;
+import org.apache.fluo.recipes.core.map.Combiner;
+import org.apache.fluo.recipes.core.map.Update;
+import org.apache.fluo.recipes.core.map.UpdateObserver;
+import phrasecount.pojos.Counts;
+
+import static phrasecount.Constants.EXPORT_QUEUE_ID;
+
+/**
+ * This class contains all of the code related to the {@link CollisionFreeMap} that keeps track of
+ * phrase counts.
+ */
+public class PhraseMap {
+
+ /**
+ * A combiner for the {@link CollisionFreeMap} that stores phrase counts. The
+ * {@link CollisionFreeMap} calls this combiner when it lazily updates the counts for a phrase.
+ */
+ public static class PcmCombiner implements Combiner<String, Counts> {
+
+ @Override
+ public Optional<Counts> combine(String key, Iterator<Counts> updates) {
+ Counts sum = new Counts(0, 0);
+ while (updates.hasNext()) {
+ sum = sum.add(updates.next());
+ }
+ return Optional.of(sum);
+ }
+ }
+
+ /**
+ * This class is notified when the {@link CollisionFreeMap} used to store phrase counts updates a
+ * phrase count. Updates are placed an Accumulo export queue to be exported to the table storing
+ * phrase counts for query.
+ */
+ public static class PcmUpdateObserver extends UpdateObserver<String, Counts> {
+
+ private ExportQueue<String, Counts> pcEq;
+
+ @Override
+ public void init(String mapId, Context observerContext) throws Exception {
+ pcEq = ExportQueue.getInstance(EXPORT_QUEUE_ID, observerContext.getAppConfiguration());
+ }
+
+ @Override
+ public void updatingValues(TransactionBase tx, Iterator<Update<String, Counts>> updates) {
+ Iterator<Export<String, Counts>> exports =
+ Iterators.transform(updates, u -> new Export<>(u.getKey(), u.getNewValue().get()));
+ pcEq.addAll(tx, exports);
+ }
+ }
+
+}
diff --git a/phrasecount/src/main/java/phrasecount/cmd/Load.java b/phrasecount/src/main/java/phrasecount/cmd/Load.java
new file mode 100644
index 0000000..82e4e75
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/cmd/Load.java
@@ -0,0 +1,51 @@
+package phrasecount.cmd;
+
+import java.io.File;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Files;
+import org.apache.fluo.api.client.FluoClient;
+import org.apache.fluo.api.client.FluoFactory;
+import org.apache.fluo.api.client.LoaderExecutor;
+import org.apache.fluo.api.config.FluoConfiguration;
+import phrasecount.DocumentLoader;
+import phrasecount.pojos.Document;
+
+ /**
+  * Command line tool that loads every ".txt" file of a directory into Fluo as a document.
+  */
+ public class Load {
+
+   public static void main(String[] args) throws Exception {
+
+     if (args.length != 2) {
+       System.err.println("Usage : " + Load.class.getName() + " <fluo props file> <txt file dir>");
+       System.exit(-1);
+     }
+
+     FluoConfiguration config = new FluoConfiguration(new File(args[0]));
+     config.setLoaderThreads(20);
+     config.setLoaderQueueSize(40);
+
+     try (FluoClient fluoClient = FluoFactory.newClient(config);
+         LoaderExecutor le = fluoClient.newLoaderExecutor()) {
+       loadDirectory(le, args[1]);
+     }
+
+     // TODO figure what threads are hanging around
+     System.exit(0);
+   }
+
+   /**
+    * Submits a DocumentLoader for each ".txt" file directly inside the given directory and
+    * reports every other entry as ignored.
+    */
+   private static void loadDirectory(LoaderExecutor executor, String dirName)
+       throws java.io.IOException {
+     File[] files = new File(dirName).listFiles();
+
+     if (files == null) {
+       System.out.println("Text file dir does not exist: " + dirName);
+       return;
+     }
+
+     for (File txtFile : files) {
+       if (!txtFile.getName().endsWith(".txt")) {
+         System.out.println("Ignoring : " + txtFile.toURI());
+         continue;
+       }
+
+       String uri = txtFile.toURI().toString();
+       String content = Files.toString(txtFile, Charsets.UTF_8);
+
+       System.out.println("Processing : " + txtFile.toURI());
+       executor.execute(new DocumentLoader(new Document(uri, content)));
+     }
+   }
+ }
diff --git a/phrasecount/src/main/java/phrasecount/cmd/Mini.java b/phrasecount/src/main/java/phrasecount/cmd/Mini.java
new file mode 100644
index 0000000..e43c1f5
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/cmd/Mini.java
@@ -0,0 +1,97 @@
+package phrasecount.cmd;
+
+import java.io.File;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.beust.jcommander.JCommander;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.ParameterException;
+import org.apache.accumulo.core.conf.Property;
+import org.apache.accumulo.minicluster.MemoryUnit;
+import org.apache.accumulo.minicluster.MiniAccumuloCluster;
+import org.apache.accumulo.minicluster.MiniAccumuloConfig;
+import org.apache.accumulo.minicluster.ServerType;
+import org.apache.fluo.api.client.FluoAdmin.InitializationOptions;
+import org.apache.fluo.api.client.FluoFactory;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.api.mini.MiniFluo;
+import phrasecount.Application;
+
+ // Command line tool that starts a MiniAccumuloCluster, initializes the phrasecount Fluo
+ // application on it, starts MiniFluo and writes the resulting client configuration to a file.
+ public class Mini {
+
+ // Command line options parsed by JCommander.
+ static class Parameters {
+ @Parameter(names = {"-m", "--moreMemory"}, description = "Use more memory")
+ boolean moreMemory = false;
+
+ @Parameter(names = {"-w", "--workerThreads"}, description = "Number of worker threads")
+ int workerThreads = 5;
+
+ @Parameter(names = {"-t", "--tabletServers"}, description = "Number of tablet servers")
+ int tabletServers = 2;
+
+ // NOTE(review): 0 presumably lets MiniAccumulo choose a free port - confirm
+ @Parameter(names = {"-z", "--zookeeperPort"}, description = "Port to use for zookeeper")
+ int zookeeperPort = 0;
+
+ // positional arguments: args.get(0) is the MAC dir, args.get(1) the output props file
+ @Parameter(description = "<MAC dir> <output props file>")
+ List<String> args;
+ }
+
+ public static void main(String[] args) throws Exception {
+
+ Parameters params = new Parameters();
+ JCommander jc = new JCommander(params);
+
+ // print the problem plus usage and exit unless exactly two positional arguments were given
+ try {
+ jc.parse(args);
+ if (params.args == null || params.args.size() != 2)
+ throw new ParameterException("Expected two arguments");
+ } catch (ParameterException pe) {
+ System.out.println(pe.getMessage());
+ jc.setProgramName(Mini.class.getSimpleName());
+ jc.usage();
+ System.exit(-1);
+ }
+
+ // "secret" is the root password; the Fluo configuration below uses the same value
+ MiniAccumuloConfig cfg = new MiniAccumuloConfig(new File(params.args.get(0)), "secret");
+ cfg.setZooKeeperPort(params.zookeeperPort);
+ cfg.setNumTservers(params.tabletServers);
+ if (params.moreMemory) {
+ // give tablet servers 2 GB and set explicit data/index cache sizes
+ cfg.setMemory(ServerType.TABLET_SERVER, 2, MemoryUnit.GIGABYTE);
+ Map<String, String> site = new HashMap<>();
+ site.put(Property.TSERV_DATACACHE_SIZE.getKey(), "768M");
+ site.put(Property.TSERV_INDEXCACHE_SIZE.getKey(), "256M");
+ cfg.setSiteConfig(site);
+ }
+
+ MiniAccumuloCluster cluster = new MiniAccumuloCluster(cfg);
+ cluster.start();
+
+ FluoConfiguration fluoConfig = new FluoConfiguration();
+
+ // the cluster was started above, so MiniFluo must not start its own Accumulo
+ fluoConfig.setMiniStartAccumulo(false);
+ fluoConfig.setAccumuloInstance(cluster.getInstanceName());
+ fluoConfig.setAccumuloUser("root");
+ fluoConfig.setAccumuloPassword("secret");
+ fluoConfig.setAccumuloZookeepers(cluster.getZooKeepers());
+ fluoConfig.setInstanceZookeepers(cluster.getZooKeepers() + "/fluo");
+
+ fluoConfig.setAccumuloTable("data");
+ fluoConfig.setWorkerThreads(params.workerThreads);
+
+ fluoConfig.setApplicationName("phrasecount");
+
+ // "pcExport" is passed as the last option, where Setup passes its export table name
+ Application.configure(fluoConfig, new Application.Options(17, 17, cluster.getInstanceName(),
+ cluster.getZooKeepers(), "root", "secret", "pcExport"));
+
+ FluoFactory.newAdmin(fluoConfig).initialize(new InitializationOptions());
+
+ // NOTE(review): miniFluo and cluster are never closed/stopped here; presumably they are
+ // meant to keep running until the process is killed - confirm
+ MiniFluo miniFluo = FluoFactory.newMiniFluo(fluoConfig);
+
+ miniFluo.getClientConfiguration().save(new File(params.args.get(1)));
+
+ System.out.println();
+ System.out.println("Wrote : " + params.args.get(1));
+ }
+ }
diff --git a/phrasecount/src/main/java/phrasecount/cmd/Print.java b/phrasecount/src/main/java/phrasecount/cmd/Print.java
new file mode 100644
index 0000000..79819b2
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/cmd/Print.java
@@ -0,0 +1,55 @@
+package phrasecount.cmd;
+
+import java.io.File;
+
+import com.google.common.collect.Iterables;
+import org.apache.fluo.api.client.FluoClient;
+import org.apache.fluo.api.client.FluoFactory;
+import org.apache.fluo.api.client.Snapshot;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.api.data.Column;
+import org.apache.fluo.api.data.Span;
+import phrasecount.Constants;
+import phrasecount.pojos.PhraseAndCounts;
+import phrasecount.query.PhraseCountTable;
+
+ /**
+  * Command line tool that prints every phrase with its counts from the export table, followed by
+  * a few row counts taken from a Fluo snapshot.
+  */
+ public class Print {
+
+   public static void main(String[] args) throws Exception {
+     if (args.length != 2) {
+       String usage =
+           "Usage : " + Print.class.getName() + " <fluo props file> <export table name>";
+       System.err.println(usage);
+       System.exit(-1);
+     }
+
+     FluoConfiguration fluoConfig = new FluoConfiguration(new File(args[0]));
+
+     printPhraseCounts(new PhraseCountTable(fluoConfig, args[1]));
+
+     try (FluoClient fluoClient = FluoFactory.newClient(fluoConfig);
+         Snapshot snap = fluoClient.newSnapshot()) {
+       printDocumentStats(snap);
+     }
+
+     // TODO figure what threads are hanging around
+     System.exit(0);
+   }
+
+   /** Prints one line per phrase: document count, total count and the phrase itself. */
+   private static void printPhraseCounts(PhraseCountTable pcTable) {
+     for (PhraseAndCounts entry : pcTable) {
+       System.out.printf("%7d %7d '%s'\n", entry.docPhraseCount, entry.totalPhraseCount,
+           entry.phrase);
+     }
+   }
+
+   /** Counts uri rows, document rows and document rows with an index status, then prints them. */
+   private static void printDocumentStats(Snapshot snap) {
+     // TODO could precompute this using observers
+     int uriCount = count(snap, "uri:", Constants.DOC_HASH_COL);
+     int documentCount = count(snap, "doc:", Constants.DOC_REF_COUNT_COL);
+     int numIndexedDocs = count(snap, "doc:", Constants.INDEX_STATUS_COL);
+
+     System.out.println();
+     System.out.printf("# uris : %,d\n", uriCount);
+     System.out.printf("# unique documents : %,d\n", documentCount);
+     System.out.printf("# processed documents : %,d\n", numIndexedDocs);
+     System.out.println();
+   }
+
+   /** Returns the number of rows under the given prefix that have a value in the given column. */
+   private static int count(Snapshot snap, String prefix, Column col) {
+     Span span = Span.prefix(prefix);
+     return Iterables.size(snap.scanner().over(span).fetch(col).byRow().build());
+   }
+ }
diff --git a/phrasecount/src/main/java/phrasecount/cmd/Setup.java b/phrasecount/src/main/java/phrasecount/cmd/Setup.java
new file mode 100644
index 0000000..9d27917
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/cmd/Setup.java
@@ -0,0 +1,38 @@
+package phrasecount.cmd;
+
+import java.io.File;
+
+import org.apache.accumulo.core.client.Connector;
+import org.apache.accumulo.core.client.TableNotFoundException;
+import org.apache.accumulo.core.client.ZooKeeperInstance;
+import org.apache.accumulo.core.client.security.tokens.PasswordToken;
+import org.apache.fluo.api.config.FluoConfiguration;
+import phrasecount.Application;
+import phrasecount.Application.Options;
+
+ /**
+  * Command line tool that (re)creates the Accumulo export table and prints the phrasecount
+  * application configuration to standard out.
+  */
+ public class Setup {
+
+   /**
+    * @param args the Fluo properties file followed by the name of the export table
+    */
+   public static void main(String[] args) throws Exception {
+     // fail with a usage message, like the other commands, instead of an index exception
+     if (args.length != 2) {
+       System.err
+           .println("Usage : " + Setup.class.getName() + " <fluo props file> <export table name>");
+       System.exit(-1);
+     }
+
+     FluoConfiguration config = new FluoConfiguration(new File(args[0]));
+
+     String exportTable = args[1];
+
+     // connect with the credentials from the properties file; the same values are handed to
+     // Options below, so a hard coded user/password could disagree with them
+     Connector conn =
+         new ZooKeeperInstance(config.getAccumuloInstance(), config.getAccumuloZookeepers())
+             .getConnector(config.getAccumuloUser(),
+                 new PasswordToken(config.getAccumuloPassword()));
+     try {
+       conn.tableOperations().delete(exportTable);
+     } catch (TableNotFoundException ignored) {
+       // ignore if table not found, it is created next either way
+     }
+
+     conn.tableOperations().create(exportTable);
+
+     Options opts = new Options(103, 103, config.getAccumuloInstance(),
+         config.getAccumuloZookeepers(), config.getAccumuloUser(), config.getAccumuloPassword(),
+         exportTable);
+
+     // only the application specific settings are written out, not the input configuration
+     FluoConfiguration observerConfig = new FluoConfiguration();
+     Application.configure(observerConfig, opts);
+     observerConfig.save(System.out);
+   }
+ }
diff --git a/phrasecount/src/main/java/phrasecount/cmd/Split.java b/phrasecount/src/main/java/phrasecount/cmd/Split.java
new file mode 100644
index 0000000..cc9d145
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/cmd/Split.java
@@ -0,0 +1,40 @@
+package phrasecount.cmd;
+
+import java.io.File;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.accumulo.core.client.Connector;
+import org.apache.accumulo.core.client.ZooKeeperInstance;
+import org.apache.accumulo.core.client.security.tokens.PasswordToken;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Utility to add splits to the Accumulo table used by Fluo.
+ */
+public class Split {
+ public static void main(String[] args) throws Exception {
+ if (args.length != 2) {
+ System.err.println("Usage : " + Split.class.getName() + " <fluo props file> <table name>");
+ System.exit(-1);
+ }
+
+ FluoConfiguration fluoConfig = new FluoConfiguration(new File(args[0]));
+ ZooKeeperInstance zki =
+ new ZooKeeperInstance(fluoConfig.getAccumuloInstance(), fluoConfig.getAccumuloZookeepers());
+ Connector conn = zki.getConnector(fluoConfig.getAccumuloUser(),
+ new PasswordToken(fluoConfig.getAccumuloPassword()));
+
+ SortedSet<Text> splits = new TreeSet<>();
+
+ for (char c = 'b'; c < 'z'; c++) {
+ splits.add(new Text("phrase:" + c));
+ }
+
+ conn.tableOperations().addSplits(args[1], splits);
+
+ // TODO figure what threads are hanging around
+ System.exit(0);
+ }
+}
diff --git a/phrasecount/src/main/java/phrasecount/pojos/Counts.java b/phrasecount/src/main/java/phrasecount/pojos/Counts.java
new file mode 100644
index 0000000..d8e0829
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/pojos/Counts.java
@@ -0,0 +1,44 @@
+package phrasecount.pojos;
+
+import com.google.common.base.Objects;
+
+public class Counts {
+ // number of documents a phrase was seen in
+ public final long docPhraseCount;
+ // total times a phrase was seen in all documents
+ public final long totalPhraseCount;
+
+ public Counts() {
+ docPhraseCount = 0;
+ totalPhraseCount = 0;
+ }
+
+ public Counts(long docPhraseCount, long totalPhraseCount) {
+ this.docPhraseCount = docPhraseCount;
+ this.totalPhraseCount = totalPhraseCount;
+ }
+
+ public Counts add(Counts other) {
+ return new Counts(this.docPhraseCount + other.docPhraseCount, this.totalPhraseCount + other.totalPhraseCount);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o instanceof Counts) {
+ Counts opc = (Counts) o;
+ return opc.docPhraseCount == docPhraseCount && opc.totalPhraseCount == totalPhraseCount;
+ }
+
+ return false;
+ }
+
+ @Override
+ public int hashCode() {
+ return (int) (993 * totalPhraseCount + 17 * docPhraseCount);
+ }
+
+ @Override
+ public String toString() {
+ return Objects.toStringHelper(this).add("documents", docPhraseCount).add("total", totalPhraseCount).toString();
+ }
+}
diff --git a/phrasecount/src/main/java/phrasecount/pojos/Document.java b/phrasecount/src/main/java/phrasecount/pojos/Document.java
new file mode 100644
index 0000000..5fc0e70
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/pojos/Document.java
@@ -0,0 +1,59 @@
+package phrasecount.pojos;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import com.google.common.hash.Hasher;
+import com.google.common.hash.Hashing;
+
+ /**
+  * A text document together with the URI it was loaded from. Provides a content hash and the
+  * four word phrases found in the text.
+  */
+ public class Document {
+   // the location where the document came from. This is needed in order to detect when a
+   // document changes.
+   private final String uri;
+
+   // the text of a document.
+   private final String content;
+
+   // computed on the first call to getHash()
+   private String hash = null;
+
+   public Document(String uri, String content) {
+     this.content = content;
+     this.uri = uri;
+   }
+
+   public String getURI() {
+     return uri;
+   }
+
+   /**
+    * Returns the SHA-1 hash of the lower cased alphanumeric tokens of the content. The value is
+    * computed once and then reused.
+    */
+   public String getHash() {
+     if (hash != null) {
+       return hash;
+     }
+
+     Hasher hasher = Hashing.sha1().newHasher();
+     // NOTE(review): tokens are fed to the hasher without a separator, so token boundaries
+     // presumably do not influence the hash - confirm whether that is acceptable
+     for (String token : tokenize()) {
+       hasher.putString(token);
+     }
+
+     return hash = hasher.hash().toString();
+   }
+
+   /**
+    * Returns every phrase of four consecutive tokens in the content, mapped to the number of
+    * times it occurs. Content with fewer than four tokens yields an empty map.
+    */
+   public Map<String, Integer> getPhrases() {
+     String[] tokens = tokenize();
+
+     // String.split() produces an empty first token when the content starts with a delimiter.
+     // Skip it, otherwise the first phrase would start with an empty word.
+     int first = (tokens.length > 0 && tokens[0].isEmpty()) ? 1 : 0;
+
+     Map<String, Integer> phrases = new HashMap<>();
+     for (int i = first + 3; i < tokens.length; i++) {
+       String phrase = tokens[i - 3] + " " + tokens[i - 2] + " " + tokens[i - 1] + " " + tokens[i];
+       phrases.merge(phrase, 1, Integer::sum);
+     }
+
+     return phrases;
+   }
+
+   public String getContent() {
+     return content;
+   }
+
+   /** Lower cases the content and splits it on runs of non alphanumeric characters. */
+   private String[] tokenize() {
+     return content.toLowerCase().split("[^\\p{Alnum}]+");
+   }
+ }
diff --git a/phrasecount/src/main/java/phrasecount/pojos/PcKryoFactory.java b/phrasecount/src/main/java/phrasecount/pojos/PcKryoFactory.java
new file mode 100644
index 0000000..3158f00
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/pojos/PcKryoFactory.java
@@ -0,0 +1,13 @@
+package phrasecount.pojos;
+
+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.pool.KryoFactory;
+
+ /**
+  * Factory for Kryo instances that have {@link Counts} registered under id 9.
+  */
+ public class PcKryoFactory implements KryoFactory {
+   @Override
+   public Kryo create() {
+     final Kryo serializer = new Kryo();
+     // NOTE(review): the id presumably has to stay stable for already serialized data - confirm
+     serializer.register(Counts.class, 9);
+     return serializer;
+   }
+ }
diff --git a/phrasecount/src/main/java/phrasecount/pojos/PhraseAndCounts.java b/phrasecount/src/main/java/phrasecount/pojos/PhraseAndCounts.java
new file mode 100644
index 0000000..d6ddc33
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/pojos/PhraseAndCounts.java
@@ -0,0 +1,24 @@
+package phrasecount.pojos;
+
+public class PhraseAndCounts extends Counts {
+ public String phrase;
+
+ public PhraseAndCounts(String phrase, int docPhraseCount, int totalPhraseCount) {
+ super(docPhraseCount, totalPhraseCount);
+ this.phrase = phrase;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o instanceof PhraseAndCounts) {
+ PhraseAndCounts op = (PhraseAndCounts) o;
+ return phrase.equals(op.phrase) && super.equals(op);
+ }
+ return false;
+ }
+
+ @Override
+ public int hashCode() {
+ return super.hashCode() + 31 * phrase.hashCode();
+ }
+}
diff --git a/phrasecount/src/main/java/phrasecount/query/PhraseCountTable.java b/phrasecount/src/main/java/phrasecount/query/PhraseCountTable.java
new file mode 100644
index 0000000..f5f670a
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/query/PhraseCountTable.java
@@ -0,0 +1,107 @@
+package phrasecount.query;
+
+import java.util.Iterator;
+import java.util.Map.Entry;
+
+import com.google.common.collect.Iterators;
+import org.apache.accumulo.core.client.ClientConfiguration;
+import org.apache.accumulo.core.client.Connector;
+import org.apache.accumulo.core.client.RowIterator;
+import org.apache.accumulo.core.client.Scanner;
+import org.apache.accumulo.core.client.ZooKeeperInstance;
+import org.apache.accumulo.core.client.security.tokens.PasswordToken;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.security.Authorizations;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.hadoop.io.Text;
+import phrasecount.pojos.Counts;
+import phrasecount.pojos.PhraseAndCounts;
+
+/**
+ * All of the code for dealing with the Accumulo table that Fluo is exporting to
+ */
+public class PhraseCountTable implements Iterable<PhraseAndCounts> {
+
+ static final String STAT_CF = "stat";
+
+ //name of column qualifier used to store phrase count across all documents
+ static final String TOTAL_PC_CQ = "totalCount";
+
+ //name of column qualifier used to store number of documents containing a phrase
+ static final String DOC_PC_CQ = "docCount";
+
+ public static Mutation createMutation(String phrase, long seq, Counts pc) {
+ Mutation mutation = new Mutation(phrase);
+
+ // use the sequence number for the Accumulo timestamp, this will cause older updates to fall
+ // behind newer ones
+ if (pc.totalPhraseCount == 0)
+ mutation.putDelete(STAT_CF, TOTAL_PC_CQ, seq);
+ else
+ mutation.put(STAT_CF, TOTAL_PC_CQ, seq, pc.totalPhraseCount + "");
+
+ if (pc.docPhraseCount == 0)
+ mutation.putDelete(STAT_CF, DOC_PC_CQ, seq);
+ else
+ mutation.put(STAT_CF, DOC_PC_CQ, seq, pc.docPhraseCount + "");
+
+ return mutation;
+ }
+
+ private Connector conn;
+ private String table;
+
+ public PhraseCountTable(FluoConfiguration fluoConfig, String table) throws Exception {
+ ZooKeeperInstance zki = new ZooKeeperInstance(
+ new ClientConfiguration().withZkHosts(fluoConfig.getAccumuloZookeepers())
+ .withInstance(fluoConfig.getAccumuloInstance()));
+ this.conn = zki.getConnector(fluoConfig.getAccumuloUser(),
+ new PasswordToken(fluoConfig.getAccumuloPassword()));
+ this.table = table;
+ }
+
+ public PhraseCountTable(Connector conn, String table) {
+ this.conn = conn;
+ this.table = table;
+ }
+
+
+ public Counts getPhraseCounts(String phrase) throws Exception {
+ Scanner scanner = conn.createScanner(table, Authorizations.EMPTY);
+ scanner.setRange(new Range(phrase));
+
+ int sum = 0;
+ int docCount = 0;
+
+ for (Entry<Key, Value> entry : scanner) {
+ String cq = entry.getKey().getColumnQualifierData().toString();
+ if (cq.equals(TOTAL_PC_CQ)) {
+ sum = Integer.valueOf(entry.getValue().toString());
+ }
+
+ if (cq.equals(DOC_PC_CQ)) {
+ docCount = Integer.valueOf(entry.getValue().toString());
+ }
+ }
+
+ return new Counts(docCount, sum);
+ }
+
+ @Override
+ public Iterator<PhraseAndCounts> iterator() {
+ try {
+ Scanner scanner = conn.createScanner(table, Authorizations.EMPTY);
+ scanner.fetchColumn(new Text(STAT_CF), new Text(TOTAL_PC_CQ));
+ scanner.fetchColumn(new Text(STAT_CF), new Text(DOC_PC_CQ));
+
+ return Iterators.transform(new RowIterator(scanner), new RowTransform());
+ } catch (RuntimeException e) {
+ throw e;
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+}
diff --git a/phrasecount/src/main/java/phrasecount/query/RowTransform.java b/phrasecount/src/main/java/phrasecount/query/RowTransform.java
new file mode 100644
index 0000000..e86439c
--- /dev/null
+++ b/phrasecount/src/main/java/phrasecount/query/RowTransform.java
@@ -0,0 +1,34 @@
+package phrasecount.query;
+
+import java.util.Iterator;
+import java.util.Map.Entry;
+
+import com.google.common.base.Function;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Value;
+import phrasecount.pojos.PhraseAndCounts;
+
+ /**
+  * Converts the column entries of one export table row into a {@link PhraseAndCounts}.
+  */
+ public class RowTransform implements Function<Iterator<Entry<Key, Value>>, PhraseAndCounts> {
+   @Override
+   public PhraseAndCounts apply(Iterator<Entry<Key, Value>> input) {
+     String phrase = null;
+     int docs = 0;
+     int total = 0;
+
+     while (input.hasNext()) {
+       Entry<Key, Value> column = input.next();
+       Key key = column.getKey();
+       String qualifier = key.getColumnQualifierData().toString();
+       int count = Integer.parseInt(column.getValue().toString());
+
+       // any column other than the total count is taken to be the document count
+       if (PhraseCountTable.TOTAL_PC_CQ.equals(qualifier)) {
+         total = count;
+       } else {
+         docs = count;
+       }
+
+       // the row id is the phrase, take it from the first entry seen
+       if (phrase == null) {
+         phrase = key.getRowData().toString();
+       }
+     }
+
+     return new PhraseAndCounts(phrase, docs, total);
+   }
+ }
diff --git a/phrasecount/src/test/java/phrasecount/PhraseCounterTest.java b/phrasecount/src/test/java/phrasecount/PhraseCounterTest.java
new file mode 100644
index 0000000..5815883
--- /dev/null
+++ b/phrasecount/src/test/java/phrasecount/PhraseCounterTest.java
@@ -0,0 +1,215 @@
+package phrasecount;
+
+import java.util.Random;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.accumulo.core.client.Connector;
+import org.apache.accumulo.core.client.security.tokens.PasswordToken;
+import org.apache.accumulo.minicluster.MiniAccumuloCluster;
+import org.apache.accumulo.minicluster.MiniAccumuloConfig;
+import org.apache.fluo.api.client.FluoAdmin.InitializationOptions;
+import org.apache.fluo.api.client.FluoClient;
+import org.apache.fluo.api.client.FluoFactory;
+import org.apache.fluo.api.client.LoaderExecutor;
+import org.apache.fluo.api.config.FluoConfiguration;
+import org.apache.fluo.api.mini.MiniFluo;
+import org.apache.fluo.recipes.core.types.TypedSnapshot;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import phrasecount.pojos.Counts;
+import phrasecount.pojos.Document;
+import phrasecount.query.PhraseCountTable;
+
+import static phrasecount.Constants.DOC_CONTENT_COL;
+import static phrasecount.Constants.DOC_REF_COUNT_COL;
+import static phrasecount.Constants.TYPEL;
+
+// TODO make this an integration test
+
+public class PhraseCounterTest {
+ public static TemporaryFolder folder = new TemporaryFolder();
+ public static MiniAccumuloCluster cluster;
+ private static FluoConfiguration props;
+ private static MiniFluo miniFluo;
+ private static final PasswordToken password = new PasswordToken("secret");
+ private static AtomicInteger tableCounter = new AtomicInteger(1);
+ private PhraseCountTable pcTable;
+
+ @BeforeClass
+ public static void setUpBeforeClass() throws Exception {
+ folder.create();
+ MiniAccumuloConfig cfg = new MiniAccumuloConfig(folder.newFolder("miniAccumulo"),
+ new String(password.getPassword()));
+ cluster = new MiniAccumuloCluster(cfg);
+ cluster.start();
+ }
+
+ @AfterClass
+ public static void tearDownAfterClass() throws Exception {
+ cluster.stop();
+ folder.delete();
+ }
+
+ @Before
+ public void setUpFluo() throws Exception {
+
+ // configure Fluo to use mini instance. Could avoid all of this code and let MiniFluo create a
+ // MiniAccumulo instance. However we need access to the MiniAccumulo instance in order to create
+ // the export/query table.
+ props = new FluoConfiguration();
+ props.setMiniStartAccumulo(false);
+ props.setApplicationName("phrasecount");
+ props.setAccumuloInstance(cluster.getInstanceName());
+ props.setAccumuloUser("root");
+ props.setAccumuloPassword("secret");
+ props.setInstanceZookeepers(cluster.getZooKeepers() + "/fluo");
+ props.setAccumuloZookeepers(cluster.getZooKeepers());
+ props.setAccumuloTable("data" + tableCounter.getAndIncrement());
+ props.setWorkerThreads(5);
+
+ // create the export/query table
+ String queryTable = "pcq" + tableCounter.getAndIncrement();
+ Connector conn = cluster.getConnector("root", "secret");
+ conn.tableOperations().create(queryTable);
+ pcTable = new PhraseCountTable(conn, queryTable);
+
+ // configure phrase count observers
+ Application.configure(props, new Application.Options(13, 13, cluster.getInstanceName(),
+ cluster.getZooKeepers(), "root", "secret", queryTable));
+
+ FluoFactory.newAdmin(props)
+ .initialize(new InitializationOptions().setClearTable(true).setClearZookeeper(true));
+
+ miniFluo = FluoFactory.newMiniFluo(props);
+ }
+
+ @After
+ public void tearDownFluo() throws Exception {
+ miniFluo.close();
+ }
+
+ private void loadDocument(FluoClient fluoClient, String uri, String content) {
+ try (LoaderExecutor le = fluoClient.newLoaderExecutor()) {
+ Document doc = new Document(uri, content);
+ le.execute(new DocumentLoader(doc));
+ }
+ miniFluo.waitForObservers();
+ }
+
+ /**
+  * Walks the URIs /foo1, /foo2 and /foo3 through a series of content changes and checks the
+  * counts returned by the query table after every load. Near the end it also inspects the row
+  * {@code "doc:" + hash} of the old /foo3 document: present with a reference count of 1 before
+  * /foo3 is re-pointed, absent afterwards.
+  */
+ @Test
+ public void test1() throws Exception {
+   final String shortDoc = "This is only a test";
+   final String overDoc = "The test is over, for now.";
+
+   try (FluoClient fluoClient = FluoFactory.newClient(props)) {
+
+     loadDocument(fluoClient, "/foo1", "This is only a test. Do not panic. This is only a test.");
+     expectCounts(1, 2, "is only a test");
+     expectCounts(1, 1, "test do not panic");
+
+     // a second document with different content but an overlapping phrase: some counts change
+     loadDocument(fluoClient, "/foo2", shortDoc);
+     expectCounts(2, 3, "is only a test");
+     expectCounts(1, 1, "test do not panic");
+
+     // a third URI carrying the same content as /foo2: no counts change
+     loadDocument(fluoClient, "/foo3", shortDoc);
+     expectCounts(2, 3, "is only a test");
+     expectCounts(1, 1, "test do not panic");
+
+     // replace the content of /foo1: counts change
+     loadDocument(fluoClient, "/foo1", overDoc);
+     expectCounts(1, 1, "the test is over");
+     expectCounts(1, 1, "is only a test");
+     expectCounts(0, 0, "test do not panic");
+
+     // give /foo2 the same new content: nothing changes
+     loadDocument(fluoClient, "/foo2", overDoc);
+     expectCounts(1, 1, "the test is over");
+     expectCounts(1, 1, "is only a test");
+     expectCounts(0, 0, "test do not panic");
+
+     // /foo3 still carries the old content, so its document row must exist with one reference
+     String oldHash = new Document("/foo3", shortDoc).getHash();
+     String oldDocRow = "doc:" + oldHash;
+     try (TypedSnapshot snapshot = TYPEL.wrap(fluoClient.newSnapshot())) {
+       Assert.assertNotNull(snapshot.get().row(oldDocRow).col(DOC_CONTENT_COL).toString());
+       Assert.assertEquals(1, snapshot.get().row(oldDocRow).col(DOC_REF_COUNT_COL).toInteger(0));
+     }
+
+     // dereference the document that /foo3 was referencing
+     loadDocument(fluoClient, "/foo3", overDoc);
+     expectCounts(1, 1, "the test is over");
+     expectCounts(0, 0, "is only a test");
+     expectCounts(0, 0, "test do not panic");
+
+     // both columns of the old document row should now be gone
+     try (TypedSnapshot snapshot = TYPEL.wrap(fluoClient.newSnapshot())) {
+       Assert.assertNull(snapshot.get().row(oldDocRow).col(DOC_CONTENT_COL).toString());
+       Assert.assertNull(snapshot.get().row(oldDocRow).col(DOC_REF_COUNT_COL).toInteger());
+     }
+   }
+ }
+
+ /**
+  * Asserts that the query table returns {@code new Counts(first, second)} for {@code phrase}.
+  * NOTE(review): the expectations in test1 suggest the arguments are (documents containing the
+  * phrase, total occurrences) - confirm against the Counts class.
+  */
+ private void expectCounts(int first, int second, String phrase) throws Exception {
+   Assert.assertEquals(new Counts(first, second), pcTable.getPhraseCounts(phrase));
+ }
+
+ /**
+  * Repeatedly overwrites the documents /foo0../foo99 with content that starts with one of two
+  * phrases followed by random words, and checks the counts of the leading four-word phrases
+  * after each batch.
+  */
+ @Test
+ public void testHighCardinality() throws Exception {
+   try (FluoClient fluoClient = FluoFactory.newClient(props)) {
+
+     // Fixed seed: the random words only serve to make each document's content distinct, so
+     // seeding loses nothing and makes the generated documents identical from run to run,
+     // which lets a failure be reproduced.
+     Random rand = new Random(42);
+
+     // /foo0../foo99 all start with the "only" phrase
+     loadDocsWithRandomWords(fluoClient, rand, "This is only a test", 0, 100);
+
+     Assert.assertEquals(new Counts(100, 100), pcTable.getPhraseCounts("this is only a"));
+     Assert.assertEquals(new Counts(100, 100), pcTable.getPhraseCounts("is only a test"));
+
+     // overwrite /foo0 and /foo1 with the "not" phrase
+     loadDocsWithRandomWords(fluoClient, rand, "This is not a test", 0, 2);
+
+     Assert.assertEquals(new Counts(2, 2), pcTable.getPhraseCounts("this is not a"));
+     Assert.assertEquals(new Counts(2, 2), pcTable.getPhraseCounts("is not a test"));
+     Assert.assertEquals(new Counts(98, 98), pcTable.getPhraseCounts("this is only a"));
+     Assert.assertEquals(new Counts(98, 98), pcTable.getPhraseCounts("is only a test"));
+
+     // overwrite the remaining /foo2../foo99 with the "not" phrase
+     loadDocsWithRandomWords(fluoClient, rand, "This is not a test", 2, 100);
+
+     Assert.assertEquals(new Counts(100, 100), pcTable.getPhraseCounts("this is not a"));
+     Assert.assertEquals(new Counts(100, 100), pcTable.getPhraseCounts("is not a test"));
+     Assert.assertEquals(new Counts(0, 0), pcTable.getPhraseCounts("this is only a"));
+     Assert.assertEquals(new Counts(0, 0), pcTable.getPhraseCounts("is only a test"));
+
+     // switch /foo0../foo49 back to the "only" phrase
+     loadDocsWithRandomWords(fluoClient, rand, "This is only a test", 0, 50);
+
+     Assert.assertEquals(new Counts(50, 50), pcTable.getPhraseCounts("this is not a"));
+     Assert.assertEquals(new Counts(50, 50), pcTable.getPhraseCounts("is not a test"));
+     Assert.assertEquals(new Counts(50, 50), pcTable.getPhraseCounts("this is only a"));
+     Assert.assertEquals(new Counts(50, 50), pcTable.getPhraseCounts("is only a test"));
+
+   }
+ }
+
+ /**
+  * Loads one document for every index in {@code [start, end)} under the URI
+  * {@code "/foo" + index}, all through a single LoaderExecutor, and then calls
+  * {@code miniFluo.waitForObservers()}. Each document's content is {@code phrase} followed by
+  * 20 random words.
+  *
+  * @param fluoClient client used to create the loader executor
+  * @param rand source of the random words
+  * @param phrase text every generated document starts with
+  * @param start first document index, inclusive
+  * @param end last document index, exclusive
+  */
+ void loadDocsWithRandomWords(FluoClient fluoClient, Random rand, String phrase, int start,
+     int end) {
+
+   try (LoaderExecutor executor = fluoClient.newLoaderExecutor()) {
+     int docIndex = start;
+     while (docIndex < end) {
+       Document doc = new Document("/foo" + docIndex, randomContent(rand, phrase));
+       executor.execute(new DocumentLoader(doc));
+       docIndex++;
+     }
+   }
+   miniFluo.waitForObservers();
+ }
+
+ /** Returns {@code phrase} followed by 20 random base-36 words, each preceded by a space. */
+ private static String randomContent(Random rand, String phrase) {
+   StringBuilder text = new StringBuilder(phrase);
+   for (int remaining = 20; remaining > 0; remaining--) {
+     int value = rand.nextInt(10000);
+     text.append(' ').append(Integer.toString(value, 36));
+   }
+   return text.toString();
+ }
+}
+
diff --git a/phrasecount/src/test/resources/log4j.properties b/phrasecount/src/test/resources/log4j.properties
new file mode 100644
index 0000000..1ed12ff
--- /dev/null
+++ b/phrasecount/src/test/resources/log4j.properties
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=INFO, CA
+log4j.appender.CA=org.apache.log4j.ConsoleAppender
+log4j.appender.CA.layout=org.apache.log4j.PatternLayout
+log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c{2}] %-5p: %m%n
+
+#Uncomment to see debugging output for Fluo.
+#log4j.logger.org.apache.fluo=DEBUG
+
+#Uncomment the following to see all transaction activity.
+#log4j.logger.fluo.tx=TRACE
+
+log4j.logger.org.apache.zookeeper.ClientCnxn=FATAL
+log4j.logger.org.apache.zookeeper.ZooKeeper=WARN
+log4j.logger.org.apache.curator=WARN