jena-elephas/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java - jena - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.jena.hadoop.rdf.stats.jobs;

 import java.io.IOException;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.SequenceFile.CompressionType;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.jena.hadoop.rdf.io.input.QuadsInputFormat;
 import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat;
 import org.apache.jena.hadoop.rdf.io.input.TriplesOrQuadsInputFormat;
 import org.apache.jena.hadoop.rdf.io.input.nquads.NQuadsInputFormat;
 import org.apache.jena.hadoop.rdf.io.input.ntriples.NTriplesInputFormat;
 import org.apache.jena.hadoop.rdf.io.output.nquads.NQuadsOutputFormat;
 import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesNodeOutputFormat;
 import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesOutputFormat;
 import org.apache.jena.hadoop.rdf.mapreduce.KeyMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.RdfMapReduceConstants;
 import org.apache.jena.hadoop.rdf.mapreduce.TextCountReducer;
 import org.apache.jena.hadoop.rdf.mapreduce.characteristics.CharacteristicSetReducer;
 import org.apache.jena.hadoop.rdf.mapreduce.characteristics.QuadCharacteristicSetGeneratingReducer;
 import org.apache.jena.hadoop.rdf.mapreduce.characteristics.TripleCharacteristicSetGeneratingReducer;
 import org.apache.jena.hadoop.rdf.mapreduce.count.NodeCountReducer;
 import org.apache.jena.hadoop.rdf.mapreduce.count.QuadNodeCountMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.count.TripleNodeCountMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.QuadDataTypeCountMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.TripleDataTypeCountMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.QuadNamespaceCountMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.TripleNamespaceCountMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.count.positional.QuadGraphCountMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.count.positional.QuadObjectCountMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.count.positional.TripleObjectCountMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.QuadFilterByPredicateMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.TripleFilterByPredicateUriMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.group.QuadGroupBySubjectMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.group.TripleGroupBySubjectMapper;
 import org.apache.jena.hadoop.rdf.mapreduce.transform.TriplesToQuadsConstantGraphMapper;
 import org.apache.jena.hadoop.rdf.types.CharacteristicSetWritable;
 import org.apache.jena.hadoop.rdf.types.NodeWritable;
 import org.apache.jena.hadoop.rdf.types.QuadWritable;
 import org.apache.jena.hadoop.rdf.types.TripleWritable;
 import org.apache.jena.vocabulary.RDF ;

 /**
  * Factory that can produce {@link Job} instances for computing various RDF
  * statistics
  *
  *
  *
  */
 public class JobFactory {

     /**
      * Private constructor prevents instantiation
      */
     private JobFactory() {
     }

     /**
      * Gets a job for computing node counts on RDF triple inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job getTripleNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Triples Node Usage Count");

         // Map/Reduce classes
         job.setMapperClass(TripleNodeCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(TriplesInputFormat.class);
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     /**
      * Gets a job for computing node counts on RDF quad inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job getQuadNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Quads Node Usage Count");

         // Map/Reduce classes
         job.setMapperClass(QuadNodeCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(QuadsInputFormat.class);
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     /**
      * Gets a job for computing node counts on RDF triple and/or quad inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job getNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Node Usage Count");

         // Map/Reduce classes
         job.setMapperClass(QuadNodeCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     public static Job getTripleGraphSizesJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Triples Graph Sizes");

         // Map/Reduce classes
         ChainMapper.addMapper(job, TriplesToQuadsConstantGraphMapper.class, LongWritable.class, TripleWritable.class, LongWritable.class, QuadWritable.class, config);
         ChainMapper.addMapper(job, QuadGraphCountMapper.class, LongWritable.class, QuadWritable.class, NodeWritable.class, LongWritable.class, config);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(TriplesInputFormat.class);
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     public static Job getQuadGraphSizesJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Quads Graph Sizes");

         // Map/Reduce classes
         job.setMapperClass(QuadGraphCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(QuadsInputFormat.class);
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     public static Job getGraphSizesJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Graph Sizes");

         // Map/Reduce classes
         job.setMapperClass(QuadGraphCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     /**
      * Gets a sequence of jobs that can be used to compute characteristic sets
      * for RDF triples
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param intermediateOutputPath
      *            Intermediate output path
      * @param outputPath
      *            Final output path
      * @return Sequence of jobs
      * @throws IOException
      */
     public static Job[] getTripleCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
             String outputPath) throws IOException {
         Job[] jobs = new Job[2];

         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Triples Characteristic Set (Generation)");

         // Map/Reduce classes
         job.setMapperClass(TripleGroupBySubjectMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(TripleWritable.class);
         job.setReducerClass(TripleCharacteristicSetGeneratingReducer.class);
         job.setOutputKeyClass(CharacteristicSetWritable.class);
         job.setOutputValueClass(NullWritable.class);

         // Input and Output
         job.setInputFormatClass(TriplesInputFormat.class);
         job.setOutputFormatClass(SequenceFileOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
         SequenceFileOutputFormat.setCompressOutput(job, true);
         FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
         SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

         jobs[0] = job;

         job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Triples Characteristic Set (Reduction)");

         // Map/Reduce classes
         job.setMapperClass(KeyMapper.class);
         job.setMapOutputKeyClass(CharacteristicSetWritable.class);
         job.setMapOutputValueClass(CharacteristicSetWritable.class);
         job.setReducerClass(CharacteristicSetReducer.class);
         job.setOutputKeyClass(CharacteristicSetWritable.class);
         job.setOutputValueClass(CharacteristicSetWritable.class);

         // Input and Output
         job.setInputFormatClass(SequenceFileInputFormat.class);
         job.setOutputFormatClass(TextOutputFormat.class);
         FileInputFormat.setInputPaths(job, intermediateOutputPath);
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         jobs[1] = job;
         return jobs;
     }

     /**
      * Gets a sequence of jobs that can be used to compute characteristic sets
      * for RDF quads
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param intermediateOutputPath
      *            Intermediate output path
      * @param outputPath
      *            Final output path
      * @return Sequence of jobs
      * @throws IOException
      */
     public static Job[] getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
             String outputPath) throws IOException {
         Job[] jobs = new Job[2];

         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Quads Characteristic Set (Generation)");

         // Map/Reduce classes
         job.setMapperClass(QuadGroupBySubjectMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(QuadWritable.class);
         job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
         job.setOutputKeyClass(CharacteristicSetWritable.class);
         job.setOutputValueClass(NullWritable.class);

         // Input and Output
         job.setInputFormatClass(QuadsInputFormat.class);
         job.setOutputFormatClass(SequenceFileOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
         SequenceFileOutputFormat.setCompressOutput(job, true);
         FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
         SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

         jobs[0] = job;

         job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Quads Characteristic Set (Reduction)");

         // Map/Reduce classes
         job.setMapperClass(KeyMapper.class);
         job.setMapOutputKeyClass(CharacteristicSetWritable.class);
         job.setMapOutputValueClass(CharacteristicSetWritable.class);
         job.setReducerClass(CharacteristicSetReducer.class);
         job.setOutputKeyClass(CharacteristicSetWritable.class);
         job.setOutputValueClass(CharacteristicSetWritable.class);

         // Input and Output
         job.setInputFormatClass(SequenceFileInputFormat.class);
         job.setOutputFormatClass(TextOutputFormat.class);
         FileInputFormat.setInputPaths(job, intermediateOutputPath);
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         jobs[1] = job;
         return jobs;
     }

     /**
      * Gets a sequence of jobs that can be used to compute characteristic sets
      * for RDF triple and/or quad inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param intermediateOutputPath
      *            Intermediate output path
      * @param outputPath
      *            Final output path
      * @return Sequence of jobs
      * @throws IOException
      */
     public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
             String outputPath) throws IOException {
         Job[] jobs = new Job[2];

         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Characteristic Set (Generation)");

         // Map/Reduce classes
         job.setMapperClass(QuadGroupBySubjectMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(QuadWritable.class);
         job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
         job.setOutputKeyClass(CharacteristicSetWritable.class);
         job.setOutputValueClass(NullWritable.class);

         // Input and Output
         job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
         job.setOutputFormatClass(SequenceFileOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
         SequenceFileOutputFormat.setCompressOutput(job, true);
         FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
         SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

         jobs[0] = job;

         job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Characteristic Set (Reduction)");

         // Map/Reduce classes
         job.setMapperClass(KeyMapper.class);
         job.setMapOutputKeyClass(CharacteristicSetWritable.class);
         job.setMapOutputValueClass(CharacteristicSetWritable.class);
         job.setReducerClass(CharacteristicSetReducer.class);
         job.setOutputKeyClass(CharacteristicSetWritable.class);
         job.setOutputValueClass(CharacteristicSetWritable.class);

         // Input and Output
         job.setInputFormatClass(SequenceFileInputFormat.class);
         job.setOutputFormatClass(TextOutputFormat.class);
         FileInputFormat.setInputPaths(job, intermediateOutputPath);
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         jobs[1] = job;
         return jobs;
     }

     /**
      * Gets a job for computing type counts on RDF triple inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param intermediateOutputPath
      *            Path for intermediate output which will be all the type
      *            declaration triples present in the inputs
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job[] getTripleTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
             String outputPath) throws IOException {
         Job[] jobs = new Job[2];

         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Type Triples Extraction");

         // Map/Reduce classes
         job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
         job.setMapperClass(TripleFilterByPredicateUriMapper.class);
         job.setMapOutputKeyClass(LongWritable.class);
         job.setMapOutputValueClass(TripleWritable.class);

         // Input and Output Format
         job.setInputFormatClass(TriplesInputFormat.class);
         job.setOutputFormatClass(NTriplesOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));

         jobs[0] = job;

         // Object Node Usage count job
         job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Triples Type Usage Count");

         // Map/Reduce classes
         job.setMapperClass(TripleObjectCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(NTriplesInputFormat.class);
         NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be
                                                           // better if this was
                                                           // intelligently
                                                           // configured
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, intermediateOutputPath);
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         jobs[1] = job;

         return jobs;
     }

     /**
      * Gets a job for computing type counts on RDF quad inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param intermediateOutputPath
      *            Path for intermediate output which will be all the type
      *            declaration quads present in the inputs
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job[] getQuadTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
             String outputPath) throws IOException {
         Job[] jobs = new Job[2];

         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Type Quads Extraction");

         // Map/Reduce classes
         job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
         job.setMapperClass(QuadFilterByPredicateMapper.class);
         job.setMapOutputKeyClass(LongWritable.class);
         job.setMapOutputValueClass(QuadWritable.class);

         // Input and Output Format
         job.setInputFormatClass(QuadsInputFormat.class);
         job.setOutputFormatClass(NQuadsOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));

         jobs[0] = job;

         // Object Node Usage count job
         job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Quads Type Usage Count");

         // Map/Reduce classes
         job.setMapperClass(QuadObjectCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(NQuadsInputFormat.class);
         NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be
                                                           // better if this was
                                                           // intelligently
                                                           // configured
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, intermediateOutputPath);
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         jobs[1] = job;

         return jobs;
     }

     /**
      * Gets a job for computing type counts on RDF triple and/or quad inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param intermediateOutputPath
      *            Path for intermediate output which will be all the type
      *            declaration quads present in the inputs
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job[] getTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
             String outputPath) throws IOException {
         Job[] jobs = new Job[2];

         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Type Extraction");

         // Map/Reduce classes
         job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
         job.setMapperClass(QuadFilterByPredicateMapper.class);
         job.setMapOutputKeyClass(LongWritable.class);
         job.setMapOutputValueClass(QuadWritable.class);

         // Input and Output Format
         job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
         job.setOutputFormatClass(NQuadsOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));

         jobs[0] = job;

         // Object Node Usage count job
         job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Type Usage Count");

         // Map/Reduce classes
         job.setMapperClass(QuadObjectCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(NQuadsInputFormat.class);
         NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be
                                                           // better if this was
                                                           // intelligently
                                                           // configured
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, intermediateOutputPath);
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         jobs[1] = job;

         return jobs;
     }

     /**
      * Gets a job for computing literal data type counts on RDF triple inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job getTripleDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Triples Literal Data Type Usage Count");

         // Map/Reduce classes
         job.setMapperClass(TripleDataTypeCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(TriplesInputFormat.class);
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     /**
      * Gets a job for computing literal data type counts on RDF quad inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job getQuadDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Quads Literal Data Type Usage Count");

         // Map/Reduce classes
         job.setMapperClass(QuadDataTypeCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(QuadsInputFormat.class);
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     /**
      * Gets a job for computing literal data type counts on RDF triple and/or
      * quad inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job getDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Literal Data Type Usage Count");

         // Map/Reduce classes
         job.setMapperClass(QuadDataTypeCountMapper.class);
         job.setMapOutputKeyClass(NodeWritable.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(NodeCountReducer.class);

         // Input and Output
         job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
         job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     /**
      * Gets a job for computing literal data type counts on RDF triple inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job getTripleNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Triples Namespace Usage Count");

         // Map/Reduce classes
         job.setMapperClass(TripleNamespaceCountMapper.class);
         job.setMapOutputKeyClass(Text.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(TextCountReducer.class);

         // Input and Output
         job.setInputFormatClass(TriplesInputFormat.class);
         job.setOutputFormatClass(TextOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     /**
      * Gets a job for computing literal data type counts on RDF quad inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job getQuadNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Quads Namespace Usage Count");

         // Map/Reduce classes
         job.setMapperClass(QuadNamespaceCountMapper.class);
         job.setMapOutputKeyClass(Text.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(TextCountReducer.class);

         // Input and Output
         job.setInputFormatClass(QuadsInputFormat.class);
         job.setOutputFormatClass(TextOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }

     /**
      * Gets a job for computing literal data type counts on RDF triple and/or
      * quad inputs
      *
      * @param config
      *            Configuration
      * @param inputPaths
      *            Input paths
      * @param outputPath
      *            Output path
      * @return Job
      * @throws IOException
      */
     public static Job getNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
         Job job = Job.getInstance(config);
         job.setJarByClass(JobFactory.class);
         job.setJobName("RDF Namespace Usage Count");

         // Map/Reduce classes
         job.setMapperClass(QuadNamespaceCountMapper.class);
         job.setMapOutputKeyClass(Text.class);
         job.setMapOutputValueClass(LongWritable.class);
         job.setReducerClass(TextCountReducer.class);

         // Input and Output
         job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
         job.setOutputFormatClass(TextOutputFormat.class);
         FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
         FileOutputFormat.setOutputPath(job, new Path(outputPath));

         return job;
     }
 }