Make RdfFileInputTool accept multiple input paths. Doc improvements
diff --git a/extras/rya.manual/src/site/markdown/loaddata.md b/extras/rya.manual/src/site/markdown/loaddata.md
index e5c7bd2..9d43edd 100644
--- a/extras/rya.manual/src/site/markdown/loaddata.md
+++ b/extras/rya.manual/src/site/markdown/loaddata.md
@@ -21,7 +21,7 @@
 -->
 # Load Data
 
-There are a few mechanisms to load data
+There are a few mechanisms to load data.
 
 ## Web REST endpoint
 
@@ -92,29 +92,55 @@
 
 ## Bulk Loading data
 
-Bulk loading data is done through Map Reduce jobs
+Bulk loading data is done through Map Reduce jobs.
 
 ### Bulk Load RDF data
 
-This Map Reduce job will read files into memory and parse them into statements. The statements are saved into the store. Here is an example for storing in Accumulo:
+This Map Reduce job will read files into memory and parse them into statements. The statements are saved into the triplestore.
+Here are the steps to prepare and run the job:
+
+  * Load the RDF data into HDFS. The input can be a single file, or multiple files and directories spread across one or more volumes (see the staging sketch after the sample command below).
+  * Also load the `mapreduce/target/rya.mapreduce-<version>-shaded.jar` executable jar file into HDFS.
+  * Run the following sample command:
 
 ```
-hadoop jar target/rya.mapreduce-3.2.10-SNAPSHOT-shaded.jar org.apache.rya.accumulo.mr.RdfFileInputTool -Dac.zk=localhost:2181 -Dac.instance=accumulo -Dac.username=root -Dac.pwd=secret -Drdf.tablePrefix=triplestore_ -Drdf.format=N-Triples /tmp/temp.ntrips
+hadoop jar hdfs://volume/rya.mapreduce-<version>-shaded.jar org.apache.rya.accumulo.mr.tools.RdfFileInputTool -Dac.zk=localhost:2181 -Dac.instance=accumulo -Dac.username=root -Dac.pwd=secret -Drdf.tablePrefix=triplestore_ -Drdf.format=N-Triples hdfs://volume/dir1,hdfs://volume/dir2,hdfs://volume/file1.nt
 ```
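+
+Before running the job, the data and the jar can be staged in HDFS with
+commands like the following (a minimal sketch; the directory layout and
+paths are illustrative, not required by the tool):
+
+```
+hdfs dfs -mkdir -p /volume/dir1 /volume/dir2
+hdfs dfs -put localdata/*.nt /volume/dir1
+hdfs dfs -put mapreduce/target/rya.mapreduce-<version>-shaded.jar /volume/
+```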
 
 Options:
 
-- rdf.tablePrefix : The tables (spo, po, osp) are prefixed with this qualifier. The tables become: (rdf.tablePrefix)spo,(rdf.tablePrefix)po,(rdf.tablePrefix)osp
-- ac.* : Accumulo connection parameters
-- rdf.format : See RDFFormat from RDF4J, samples include (Trig, N-Triples, RDF/XML)
-- sc.use_freetext, sc.use_geo, sc.use_temporal, sc.use_entity : If any of these are set to true, statements will also be
+- **rdf.tablePrefix** - The tables (spo, po, osp) are prefixed with this qualifier.
+    The tables become: (rdf.tablePrefix)spo, (rdf.tablePrefix)po, (rdf.tablePrefix)osp
+- **ac.*** - Accumulo connection parameters
+- **rdf.format** - See RDFFormat from RDF4J; examples include TriG, N-Triples, and RDF/XML
+- **sc.use_freetext, sc.use_geo, sc.use_temporal, sc.use_entity** - If any of these are set to true, statements will also be
     added to the enabled secondary indices.
-- sc.freetext.predicates, sc.geo.predicates, sc.temporal.predicates: If the associated indexer is enabled, these options specify
+- **sc.freetext.predicates, sc.geo.predicates, sc.temporal.predicates** - If the associated indexer is enabled, these options specify
     which statements should be sent to that indexer (based on the predicate). If not given, all indexers will attempt to index
     all statements.
 
-The argument is the directory/file to load. This file needs to be loaded into HDFS before running. If loading a directory, all files should have the same RDF
-format.
+The positional argument is a comma-separated list of directories/files to load.
+They need to be loaded into HDFS before running. If loading a directory,
+all of its files should have the same RDF format.
+
+Once the data is loaded, it is good practice to compact your tables.
+You can do this by opening the Accumulo shell and running the compact
+command on the generated tables. Remember that the generated tables are
+prefixed by the rdf.tablePrefix property you assigned above.
+The default tablePrefix is `rts`.
+Here is a sample Accumulo shell command:
+
+```
+compact -p triplestore_(.*)
+```
+
+### Generate Prospects table
+
+For the best query performance, it is recommended to run the job that
+creates the Prospects table. This job will read through your data and
+gather statistics on the distribution of the dataset. This table is then
+queried before query execution to reorder queries based on the data
+distribution. See the [Prospects Table](eval.md) section on how to do this.
 
 ## Direct RDF4J API
 
diff --git a/extras/rya.manual/src/site/markdown/quickstart.md b/extras/rya.manual/src/site/markdown/quickstart.md
index f0d76a8..7a93cda 100644
--- a/extras/rya.manual/src/site/markdown/quickstart.md
+++ b/extras/rya.manual/src/site/markdown/quickstart.md
@@ -56,7 +56,7 @@
 
 ## Usage
 
-First, we need to load data. See the [Load Data Section] (loaddata.md)
+First, we need to load data. See the [Load Data](loaddata.md) section.
 
-Second, we need to query that data. See the [Query Data Section](querydata.md)
+Second, we need to query that data. See the [Query Data](querydata.md) section.
 
diff --git a/mapreduce/pom.xml b/mapreduce/pom.xml
index dc3cec4..bc019da 100644
--- a/mapreduce/pom.xml
+++ b/mapreduce/pom.xml
@@ -88,6 +88,35 @@
     </dependencies>
 
     <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
         <pluginManagement>
             <plugins>
                 <plugin>
@@ -101,33 +130,6 @@
                         </excludes>
                     </configuration>
                 </plugin>
-                <plugin>
-                    <groupId>org.apache.maven.plugins</groupId>
-                    <artifactId>maven-shade-plugin</artifactId>
-                    <executions>
-                        <execution>
-                            <phase>package</phase>
-                            <goals>
-                                <goal>shade</goal>
-                            </goals>
-                            <configuration>
-                                <filters>
-                                    <filter>
-                                        <artifact>*:*</artifact>
-                                        <excludes>
-                                            <exclude>META-INF/*.SF</exclude>
-                                            <exclude>META-INF/*.DSA</exclude>
-                                            <exclude>META-INF/*.RSA</exclude>
-                                        </excludes>
-                                    </filter>
-                                </filters>
-                                <transformers>
-                                    <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
-                                </transformers>
-                            </configuration>
-                        </execution>
-                    </executions>
-                </plugin>
             </plugins>
         </pluginManagement>
     </build>
diff --git a/mapreduce/src/main/java/org/apache/rya/accumulo/mr/AbstractAccumuloMRTool.java b/mapreduce/src/main/java/org/apache/rya/accumulo/mr/AbstractAccumuloMRTool.java
index 7489391..cd29e1e 100644
--- a/mapreduce/src/main/java/org/apache/rya/accumulo/mr/AbstractAccumuloMRTool.java
+++ b/mapreduce/src/main/java/org/apache/rya/accumulo/mr/AbstractAccumuloMRTool.java
@@ -209,18 +209,18 @@
      * ({@link org.apache.hadoop.io.LongWritable}, {@link RyaStatementWritable})
      * pairs from RDF file(s) found at the specified path.
      * @param   job   Job to configure
-     * @param   inputPath     File or directory name
+     * @param   commaSeparatedPaths a comma-separated list of files or directories
      * @param   defaultFormat  Default RDF serialization format, can be
      *                         overridden by {@link MRUtils#FORMAT_PROP}
      * @throws  IOException if there's an error interacting with the
      *          {@link org.apache.hadoop.fs.FileSystem}.
      */
-    protected void setupFileInput(Job job, String inputPath, RDFFormat defaultFormat) throws IOException {
+    protected void setupFileInputs(Job job, String commaSeparatedPaths, RDFFormat defaultFormat) throws IOException {
         RDFFormat format = MRUtils.getRDFFormat(conf);
         if (format == null) {
             format = defaultFormat;
         }
-        RdfFileInputFormat.addInputPath(job, new Path(inputPath));
+        RdfFileInputFormat.addInputPaths(job, commaSeparatedPaths);
         RdfFileInputFormat.setRDFFormat(job, format);
         job.setInputFormatClass(RdfFileInputFormat.class);
     }
diff --git a/mapreduce/src/main/java/org/apache/rya/accumulo/mr/tools/RdfFileInputTool.java b/mapreduce/src/main/java/org/apache/rya/accumulo/mr/tools/RdfFileInputTool.java
index c004f4e..5d7209a 100644
--- a/mapreduce/src/main/java/org/apache/rya/accumulo/mr/tools/RdfFileInputTool.java
+++ b/mapreduce/src/main/java/org/apache/rya/accumulo/mr/tools/RdfFileInputTool.java
@@ -65,7 +65,7 @@
         job.setJarByClass(RdfFileInputTool.class);
 
         String inputPath = conf.get(MRUtils.INPUT_PATH, args[0]);
-        setupFileInput(job, inputPath, RDFFormat.RDFXML);
+        setupFileInputs(job, inputPath, RDFFormat.RDFXML);
         setupRyaOutput(job);
         job.setNumReduceTasks(0);
 
diff --git a/mapreduce/src/test/java/org/apache/rya/accumulo/mr/tools/RdfFileInputToolTest.java b/mapreduce/src/test/java/org/apache/rya/accumulo/mr/tools/RdfFileInputToolTest.java
index 8f92cf1..020122b 100644
--- a/mapreduce/src/test/java/org/apache/rya/accumulo/mr/tools/RdfFileInputToolTest.java
+++ b/mapreduce/src/test/java/org/apache/rya/accumulo/mr/tools/RdfFileInputToolTest.java
@@ -19,7 +19,6 @@
  * under the License.
  */
 
-import junit.framework.TestCase;
 import org.apache.accumulo.core.client.Connector;
 import org.apache.accumulo.core.client.admin.SecurityOperations;
 import org.apache.accumulo.core.client.mock.MockInstance;
@@ -29,10 +28,12 @@
 import org.apache.rya.accumulo.AccumuloRdfConfiguration;
 import org.apache.rya.accumulo.mr.TestUtils;
 import org.apache.rya.api.RdfCloudTripleStoreConstants;
+import org.apache.rya.api.domain.RyaIRI;
 import org.apache.rya.api.domain.RyaStatement;
 import org.apache.rya.api.domain.RyaType;
-import org.apache.rya.api.domain.RyaIRI;
 import org.eclipse.rdf4j.rio.RDFFormat;
+import org.junit.After;
+import org.junit.Before;
 import org.junit.Test;
 
 /**
@@ -41,7 +42,7 @@
  * Time: 10:51 AM
  * To change this template use File | Settings | File Templates.
  */
-public class RdfFileInputToolTest extends TestCase {
+public class RdfFileInputToolTest {
 
     private String user = "user";
     private String pwd = "pwd";
@@ -50,9 +51,8 @@
     private Authorizations auths = new Authorizations("test_auths");
     private Connector connector;
 
-    @Override
+    @Before
     public void setUp() throws Exception {
-        super.setUp();
         connector = new MockInstance(instance).getConnector(user, new PasswordToken(pwd));
         connector.tableOperations().create(tablePrefix + RdfCloudTripleStoreConstants.TBL_SPO_SUFFIX);
         connector.tableOperations().create(tablePrefix + RdfCloudTripleStoreConstants.TBL_PO_SUFFIX);
@@ -70,9 +70,8 @@
         secOps.grantTablePermission(user, tablePrefix + RdfCloudTripleStoreConstants.TBL_EVAL_SUFFIX, TablePermission.WRITE);
     }
 
-    @Override
+    @After
     public void tearDown() throws Exception {
-        super.tearDown();
         connector.tableOperations().delete(tablePrefix + RdfCloudTripleStoreConstants.TBL_SPO_SUFFIX);
         connector.tableOperations().delete(tablePrefix + RdfCloudTripleStoreConstants.TBL_PO_SUFFIX);
         connector.tableOperations().delete(tablePrefix + RdfCloudTripleStoreConstants.TBL_OSP_SUFFIX);
@@ -104,6 +103,33 @@
     }
 
     @Test
+    public void testMultipleNTriplesInputs() throws Exception {
+        RdfFileInputTool.main(new String[]{
+                "-Dac.mock=true",
+                "-Dac.instance=" + instance,
+                "-Dac.username=" + user,
+                "-Dac.pwd=" + pwd,
+                "-Dac.auth=" + auths.toString(),
+                "-Dac.cv=" + auths.toString(),
+                "-Drdf.tablePrefix=" + tablePrefix,
+                "-Drdf.format=" + RDFFormat.NTRIPLES.getName(),
+                "src/test/resources/test.ntriples,src/test/resources/test2.ntriples",
+        });
+        RyaStatement rs1 = new RyaStatement(new RyaIRI("urn:lubm:rdfts#GraduateStudent01"),
+                new RyaIRI("urn:lubm:rdfts#hasFriend"),
+                new RyaIRI("urn:lubm:rdfts#GraduateStudent02"));
+        RyaStatement rs2 = new RyaStatement(new RyaIRI("urn:lubm:rdfts#GraduateStudent05"),
+                new RyaIRI("urn:lubm:rdfts#hasFriend"),
+                new RyaIRI("urn:lubm:rdfts#GraduateStudent07"));
+        rs1.setColumnVisibility(auths.toString().getBytes());
+        rs2.setColumnVisibility(auths.toString().getBytes());
+        AccumuloRdfConfiguration conf = new AccumuloRdfConfiguration();
+        conf.setTablePrefix(tablePrefix);
+        conf.setAuths(auths.toString());
+        TestUtils.verify(connector, conf, rs1, rs2);
+    }
+
+    @Test
     public void testInputContext() throws Exception {
         RdfFileInputTool.main(new String[]{
                 "-Dac.mock=true",
diff --git a/mapreduce/src/test/resources/test2.ntriples b/mapreduce/src/test/resources/test2.ntriples
new file mode 100644
index 0000000..692f66a
--- /dev/null
+++ b/mapreduce/src/test/resources/test2.ntriples
@@ -0,0 +1,3 @@
+<urn:lubm:rdfts#GraduateStudent05> <urn:lubm:rdfts#hasFriend> <urn:lubm:rdfts#GraduateStudent07> .
+<urn:lubm:rdfts#GraduateStudent06> <urn:lubm:rdfts#hasFriend> <urn:lubm:rdfts#GraduateStudent06> .
+<urn:lubm:rdfts#GraduateStudent07> <urn:lubm:rdfts#hasFriend> <urn:lubm:rdfts#GraduateStudent05> .