/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.examples.complete;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.Set;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringDelegateCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.Validation;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Distinct;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.transforms.Keys;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.transforms.WithKeys;
import org.apache.beam.sdk.transforms.join.CoGbkResult;
import org.apache.beam.sdk.transforms.join.CoGroupByKey;
import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple;
import org.apache.beam.sdk.util.GcsUtil;
import org.apache.beam.sdk.util.gcsfs.GcsPath;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.PDone;
import org.apache.beam.sdk.values.TupleTag;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* An example that computes a basic TF-IDF search table for a directory or GCS prefix.
*
* <p>Concepts: joining data; side inputs; logging
*
* <p>To execute this pipeline locally, specify a local output file or output prefix on GCS:
* <pre>{@code
* --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
* }</pre>
*
* <p>To change the runner, specify:
* <pre>{@code
* --runner=YOUR_SELECTED_RUNNER
* }
* </pre>
* See examples/java/README.md for instructions about how to configure different runners.
*
* <p>The default input is {@code gs://apache-beam-samples/shakespeare/} and can be overridden with
* {@code --input}.
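*
* <p>For example, a purely local run might use flags like the following (paths are
* illustrative only):
* <pre>{@code
* --input=/path/to/text/files/ --output=/tmp/tfidf/output
* }</pre>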
*/
public class TfIdf {
/**
* Options supported by {@link TfIdf}.
*
* <p>Inherits standard configuration options.
*/
public interface Options extends PipelineOptions {
@Description("Path to the directory or GCS prefix containing files to read from")
@Default.String("gs://apache-beam-samples/shakespeare/")
String getInput();
void setInput(String value);
@Description("Prefix of output URI to write to")
@Validation.Required
String getOutput();
void setOutput(String value);
}
/**
* Lists documents contained beneath the {@code options.input} prefix/directory.
*/
public static Set<URI> listInputDocuments(Options options)
throws URISyntaxException, IOException {
URI baseUri = new URI(options.getInput());
// List all documents in the directory or GCS prefix.
URI absoluteUri;
if (baseUri.getScheme() != null) {
absoluteUri = baseUri;
} else {
absoluteUri = new URI(
"file",
baseUri.getAuthority(),
baseUri.getPath(),
baseUri.getQuery(),
baseUri.getFragment());
}
Set<URI> uris = new HashSet<>();
if (absoluteUri.getScheme().equals("file")) {
File directory = new File(absoluteUri);
for (String entry : directory.list()) {
File path = new File(directory, entry);
uris.add(path.toURI());
}
} else if (absoluteUri.getScheme().equals("gs")) {
GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil();
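// Turn the prefix into a glob by appending "*"; for example, the default input
// gs://apache-beam-samples/shakespeare/ expands to gs://apache-beam-samples/shakespeare/*.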
URI gcsUriGlob = new URI(
absoluteUri.getScheme(),
absoluteUri.getAuthority(),
absoluteUri.getPath() + "*",
absoluteUri.getQuery(),
absoluteUri.getFragment());
for (GcsPath entry : gcsUtil.expand(GcsPath.fromUri(gcsUriGlob))) {
uris.add(entry.toUri());
}
}
return uris;
}
/**
* Reads the documents at the provided URIs and returns all lines from the documents,
* each tagged with the URI of the document it came from.
*/
public static class ReadDocuments
extends PTransform<PBegin, PCollection<KV<URI, String>>> {
private Iterable<URI> uris;
public ReadDocuments(Iterable<URI> uris) {
this.uris = uris;
}
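// java.net.URI has no coder registered by default, so declare the output coder
// explicitly: keys are URIs encoded via their string form, values are UTF-8 strings.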
@Override
public Coder<?> getDefaultOutputCoder() {
return KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of());
}
@Override
public PCollection<KV<URI, String>> expand(PBegin input) {
Pipeline pipeline = input.getPipeline();
// Create one TextIO.Read transform for each document
// and add its output to a PCollectionList
PCollectionList<KV<URI, String>> urisToLines =
PCollectionList.empty(pipeline);
// TextIO.Read supports:
// - file: URIs and local paths
// - gs: URIs on Google Cloud Storage
for (final URI uri : uris) {
String uriString;
if (uri.getScheme().equals("file")) {
uriString = new File(uri).getPath();
} else {
uriString = uri.toString();
}
PCollection<KV<URI, String>> oneUriToLines = pipeline
.apply("TextIO.Read(" + uriString + ")", TextIO.read().from(uriString))
.apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri));
urisToLines = urisToLines.and(oneUriToLines);
}
return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
}
}
/**
* A transform containing a basic TF-IDF pipeline. The input consists of KV objects
* where the key is the document's URI and the value is a piece
* of the document's content. The output is a mapping from terms to
* scores for each document URI.
*/
public static class ComputeTfIdf
extends PTransform<PCollection<KV<URI, String>>, PCollection<KV<String, KV<URI, Double>>>> {
public ComputeTfIdf() { }
@Override
public PCollection<KV<String, KV<URI, Double>>> expand(
PCollection<KV<URI, String>> uriToContent) {
// Compute the total number of documents, and
// prepare this singleton PCollectionView for
// use as a side input.
final PCollectionView<Long> totalDocuments =
uriToContent
.apply("GetURIs", Keys.<URI>create())
.apply("DistinctDocs", Distinct.<URI>create())
.apply(Count.<URI>globally())
.apply(View.<Long>asSingleton());
// Create a collection of pairs mapping a URI to each
// of the words in the document associated with that URI.
PCollection<KV<URI, String>> uriToWords = uriToContent
.apply("SplitWords", ParDo.of(
new DoFn<KV<URI, String>, KV<URI, String>>() {
@ProcessElement
public void processElement(ProcessContext c) {
URI uri = c.element().getKey();
String line = c.element().getValue();
for (String word : line.split("\\W+")) {
// Log INFO messages when the word "love" is found.
if (word.toLowerCase().equals("love")) {
LOG.info("Found {}", word.toLowerCase());
}
if (!word.isEmpty()) {
c.output(KV.of(uri, word.toLowerCase()));
}
}
}
}));
// Compute a mapping from each word to the total
// number of documents in which it appears.
PCollection<KV<String, Long>> wordToDocCount = uriToWords
.apply("DistinctWords", Distinct.<KV<URI, String>>create())
.apply(Values.<String>create())
.apply("CountDocs", Count.<String>perElement());
// Compute a mapping from each URI to the total
// number of words in the document associated with that URI.
PCollection<KV<URI, Long>> uriToWordTotal = uriToWords
.apply("GetURIs2", Keys.<URI>create())
.apply("CountWords", Count.<URI>perElement());
// Count, for each (URI, word) pair, the number of
// occurrences of that word in the document associated
// with the URI.
PCollection<KV<KV<URI, String>, Long>> uriAndWordToCount = uriToWords
.apply("CountWordDocPairs", Count.<KV<URI, String>>perElement());
// Reshape the above mapping from (URI, word) pairs to counts
// into an isomorphic mapping from URI to (word, count) pairs,
// to prepare for a join by the URI key.
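// For example, an element ((uri, "love"), 2) becomes (uri, ("love", 2)).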
PCollection<KV<URI, KV<String, Long>>> uriToWordAndCount = uriAndWordToCount
.apply("ShiftKeys", ParDo.of(
new DoFn<KV<KV<URI, String>, Long>, KV<URI, KV<String, Long>>>() {
@ProcessElement
public void processElement(ProcessContext c) {
URI uri = c.element().getKey().getKey();
String word = c.element().getKey().getValue();
Long occurrences = c.element().getValue();
c.output(KV.of(uri, KV.of(word, occurrences)));
}
}));
// Prepare to join the mapping of URI to (word, count) pairs with
// the mapping of URI to total word counts, by associating
// each of the input PCollection<KV<URI, ...>> with
// a tuple tag. Each input must have the same key type, URI
// in this case. The type parameter of the tuple tag matches
// the types of the values for each collection.
final TupleTag<Long> wordTotalsTag = new TupleTag<Long>();
final TupleTag<KV<String, Long>> wordCountsTag = new TupleTag<KV<String, Long>>();
KeyedPCollectionTuple<URI> coGbkInput = KeyedPCollectionTuple
.of(wordTotalsTag, uriToWordTotal)
.and(wordCountsTag, uriToWordAndCount);
// Perform a CoGroupByKey (a sort of pre-join) on the prepared
// inputs. This yields a mapping from URI to a CoGbkResult
// (CoGroupByKey Result). The CoGbkResult is a mapping
// from the above tuple tags to the values in each input
// associated with a particular URI. In this case, each
// KV<URI, CoGbkResult> groups a URI with the total number of
// words in that document as well as all the (word, count)
// pairs for particular words.
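// For example (illustrative values), a 200-word document in which "love" appears
// twice yields a group like: uri -> {wordTotalsTag: [200], wordCountsTag: [("love", 2), ...]}.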
PCollection<KV<URI, CoGbkResult>> uriToWordAndCountAndTotal = coGbkInput
.apply("CoGroupByUri", CoGroupByKey.<URI>create());
// Compute a mapping from each word to a (URI, term frequency)
// pair for each URI. A word's term frequency for a document
// is simply the number of times that word occurs in the document
// divided by the total number of words in the document.
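// For example (illustrative numbers): a word occurring 3 times in a 300-word
// document has a term frequency of 3 / 300 = 0.01.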
PCollection<KV<String, KV<URI, Double>>> wordToUriAndTf = uriToWordAndCountAndTotal
.apply("ComputeTermFrequencies", ParDo.of(
new DoFn<KV<URI, CoGbkResult>, KV<String, KV<URI, Double>>>() {
@ProcessElement
public void processElement(ProcessContext c) {
URI uri = c.element().getKey();
Long wordTotal = c.element().getValue().getOnly(wordTotalsTag);
for (KV<String, Long> wordAndCount
: c.element().getValue().getAll(wordCountsTag)) {
String word = wordAndCount.getKey();
Long wordCount = wordAndCount.getValue();
Double termFrequency = wordCount.doubleValue() / wordTotal.doubleValue();
c.output(KV.of(word, KV.of(uri, termFrequency)));
}
}
}));
// Compute a mapping from each word to its document frequency.
// A word's document frequency in a corpus is the number of
// documents in which the word appears divided by the total
// number of documents in the corpus. Note how the total number of
// documents is passed as a side input; the same value is
// presented to each invocation of the DoFn.
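// For example (illustrative numbers): a word appearing in 2 of 10 documents has a
// document frequency of 2 / 10 = 0.2.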
PCollection<KV<String, Double>> wordToDf = wordToDocCount
.apply("ComputeDocFrequencies", ParDo
.of(new DoFn<KV<String, Long>, KV<String, Double>>() {
@ProcessElement
public void processElement(ProcessContext c) {
String word = c.element().getKey();
Long documentCount = c.element().getValue();
Long documentTotal = c.sideInput(totalDocuments);
Double documentFrequency = documentCount.doubleValue()
/ documentTotal.doubleValue();
c.output(KV.of(word, documentFrequency));
}
}).withSideInputs(totalDocuments));
// Join the term frequency and document frequency
// collections, each keyed on the word.
final TupleTag<KV<URI, Double>> tfTag = new TupleTag<KV<URI, Double>>();
final TupleTag<Double> dfTag = new TupleTag<Double>();
PCollection<KV<String, CoGbkResult>> wordToUriAndTfAndDf = KeyedPCollectionTuple
.of(tfTag, wordToUriAndTf)
.and(dfTag, wordToDf)
.apply(CoGroupByKey.<String>create());
// Compute a mapping from each word to a (URI, TF-IDF) score
// for each URI. There are a variety of definitions of TF-IDF
// ("term frequency - inverse document frequency") score;
// here we use a basic version: the term frequency
// multiplied by the log of the inverse document frequency.
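// For example (illustrative numbers): with tf = 0.01 and df = 0.2,
// the score is 0.01 * Math.log(1 / 0.2) = 0.01 * ln(5), roughly 0.016.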
PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf = wordToUriAndTfAndDf
.apply("ComputeTfIdf", ParDo.of(
new DoFn<KV<String, CoGbkResult>, KV<String, KV<URI, Double>>>() {
@ProcessElement
public void processElement(ProcessContext c) {
String word = c.element().getKey();
Double df = c.element().getValue().getOnly(dfTag);
for (KV<URI, Double> uriAndTf : c.element().getValue().getAll(tfTag)) {
URI uri = uriAndTf.getKey();
Double tf = uriAndTf.getValue();
Double tfIdf = tf * Math.log(1 / df);
c.output(KV.of(word, KV.of(uri, tfIdf)));
}
}
}));
return wordToUriAndTfIdf;
}
// Instantiate the Logger, named after the containing class
// (in this case ComputeTfIdf), as is conventional.
private static final Logger LOG = LoggerFactory.getLogger(ComputeTfIdf.class);
}
/**
* A {@link PTransform} to write, in CSV format, a mapping from term and URI
* to score.
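*
* <p>Each output line is formatted as {@code "%s,\t%s,\t%f"}: the word, the document URI,
* and the TF-IDF score, separated by a comma and a tab.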
*/
public static class WriteTfIdf
extends PTransform<PCollection<KV<String, KV<URI, Double>>>, PDone> {
private String output;
public WriteTfIdf(String output) {
this.output = output;
}
@Override
public PDone expand(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
return wordToUriAndTfIdf
.apply("Format", ParDo.of(new DoFn<KV<String, KV<URI, Double>>, String>() {
@ProcessElement
public void processElement(ProcessContext c) {
c.output(String.format("%s,\t%s,\t%f",
c.element().getKey(),
c.element().getValue().getKey(),
c.element().getValue().getValue()));
}
}))
.apply(TextIO.write()
.to(output)
.withSuffix(".csv"));
}
}
public static void main(String[] args) throws Exception {
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
Pipeline pipeline = Pipeline.create(options);
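// Register a coder for java.net.URI so it can be used as a key type; it is
// encoded and decoded via its string representation.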
pipeline.getCoderRegistry().registerCoderForClass(URI.class, StringDelegateCoder.of(URI.class));
pipeline
.apply(new ReadDocuments(listInputDocuments(options)))
.apply(new ComputeTfIdf())
.apply(new WriteTfIdf(options.getOutput()));
pipeline.run().waitUntilFinish();
}
}