/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.examples.complete;
import static com.google.datastore.v1.client.DatastoreHelper.makeKey;
import static com.google.datastore.v1.client.DatastoreHelper.makeValue;
import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.datastore.v1.Entity;
import com.google.datastore.v1.Key;
import com.google.datastore.v1.Value;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.beam.examples.common.ExampleBigQueryTableOptions;
import org.apache.beam.examples.common.ExampleOptions;
import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.datastore.DatastoreIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.StreamingOptions;
import org.apache.beam.sdk.options.Validation;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Partition;
import org.apache.beam.sdk.transforms.Partition.PartitionFn;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.transforms.Top;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.SlidingWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.transforms.windowing.WindowFn;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects;
import org.joda.time.Duration;
/**
* An example that computes the most popular hash tags for every prefix, which can be used for
* auto-completion.
*
* <p>Concepts: Using the same pipeline in both streaming and batch, combiners, composite
* transforms.
*
* <p>To execute this pipeline in streaming mode, specify:
*
* <pre>{@code
* --streaming
* }</pre>
*
* <p>To change the runner, specify:
*
* <pre>{@code
* --runner=YOUR_SELECTED_RUNNER
* }</pre>
*
* See examples/java/README.md for instructions about how to configure different runners.
*
* <p>In streaming mode this will recompute the top completions every 5 seconds over the last 30
* minutes of data received.
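*
* <p>For example, a batch run over a text file might be invoked with flags along these lines
* (the project ID, input path, and runner below are placeholders):
*
* <pre>{@code
* --project=YOUR_PROJECT_ID
* --inputFile=gs://YOUR_BUCKET/path/to/input.txt
* --runner=YOUR_SELECTED_RUNNER
* }</pre>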
*/
public class AutoComplete {
/**
* A PTransform that takes as input a PCollection of tokens and returns the most common tokens per
* prefix.
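*
* <p>For example (a sketch; this exact snippet is not wired into the pipeline below):
*
* <pre>{@code
* PCollection<String> tags = ...;
* PCollection<KV<String, List<CompletionCandidate>>> topCompletions =
*     tags.apply(ComputeTopCompletions.top(10, true));
* }</pre>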
*/
public static class ComputeTopCompletions
extends PTransform<PCollection<String>, PCollection<KV<String, List<CompletionCandidate>>>> {
private final int candidatesPerPrefix;
private final boolean recursive;
protected ComputeTopCompletions(int candidatesPerPrefix, boolean recursive) {
this.candidatesPerPrefix = candidatesPerPrefix;
this.recursive = recursive;
}
public static ComputeTopCompletions top(int candidatesPerPrefix, boolean recursive) {
return new ComputeTopCompletions(candidatesPerPrefix, recursive);
}
@Override
public PCollection<KV<String, List<CompletionCandidate>>> expand(PCollection<String> input) {
PCollection<CompletionCandidate> candidates =
input
// First count how often each token appears.
.apply(Count.perElement())
// Map the KV outputs of Count into our own CompletionCandidate class.
.apply(
"CreateCompletionCandidates",
ParDo.of(
new DoFn<KV<String, Long>, CompletionCandidate>() {
@ProcessElement
public void processElement(ProcessContext c) {
c.output(
new CompletionCandidate(
c.element().getKey(), c.element().getValue()));
}
}));
// Compute the top via either a flat or recursive algorithm.
if (recursive) {
return candidates
.apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
.apply(Flatten.pCollections());
} else {
return candidates.apply(new ComputeTopFlat(candidatesPerPrefix, 1));
}
}
}
/** Lower latency, but more expensive. */
private static class ComputeTopFlat
extends PTransform<
PCollection<CompletionCandidate>, PCollection<KV<String, List<CompletionCandidate>>>> {
private final int candidatesPerPrefix;
private final int minPrefix;
public ComputeTopFlat(int candidatesPerPrefix, int minPrefix) {
this.candidatesPerPrefix = candidatesPerPrefix;
this.minPrefix = minPrefix;
}
@Override
public PCollection<KV<String, List<CompletionCandidate>>> expand(
PCollection<CompletionCandidate> input) {
return input
// For each completion candidate, map it to all prefixes.
.apply(ParDo.of(new AllPrefixes(minPrefix)))
// Find and return the top candidates for each prefix.
.apply(
Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)
.withHotKeyFanout(new HotKeyFanout()));
}
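// Shorter prefixes match far more candidates and are therefore the hottest keys, so give
// them a larger combiner fanout: 4^(5 - length), e.g. 256 for single-character prefixes,
// tapering off for longer ones.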
private static class HotKeyFanout implements SerializableFunction<String, Integer> {
@Override
public Integer apply(String input) {
return (int) Math.pow(4, 5 - input.length());
}
}
}
/**
* Cheaper but higher latency.
*
* <p>Returns two PCollections: the first contains the top candidates for prefixes longer than
* minPrefix, and the second contains the top candidates for prefixes of exactly minPrefix
* characters.
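*
* <p>For example, with {@code minPrefix = 1} the first PCollection covers every prefix of two
* or more characters and the second covers the single-character prefixes.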
*/
private static class ComputeTopRecursive
extends PTransform<
PCollection<CompletionCandidate>,
PCollectionList<KV<String, List<CompletionCandidate>>>> {
private final int candidatesPerPrefix;
private final int minPrefix;
public ComputeTopRecursive(int candidatesPerPrefix, int minPrefix) {
this.candidatesPerPrefix = candidatesPerPrefix;
this.minPrefix = minPrefix;
}
private class KeySizePartitionFn implements PartitionFn<KV<String, List<CompletionCandidate>>> {
@Override
public int partitionFor(KV<String, List<CompletionCandidate>> elem, int numPartitions) {
return elem.getKey().length() > minPrefix ? 0 : 1;
}
}
private static class FlattenTops
extends DoFn<KV<String, List<CompletionCandidate>>, CompletionCandidate> {
@ProcessElement
public void processElement(ProcessContext c) {
for (CompletionCandidate cc : c.element().getValue()) {
c.output(cc);
}
}
}
@Override
public PCollectionList<KV<String, List<CompletionCandidate>>> expand(
PCollection<CompletionCandidate> input) {
if (minPrefix > 10) {
// Base case, partitioning to return the output in the expected format.
return input
.apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
.apply(Partition.of(2, new KeySizePartitionFn()));
} else {
// If a candidate is in the top N for a prefix a...b, it must also be in the top N for the
// longer prefix a...bX formed by its own next character, so the top N for a...b can be
// recovered from the per-(a...bX) top-N lists, which are typically much smaller sets to
// consider.
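// For example, the top 10 completions for "f" can be assembled from the top-10 lists already
// computed for each two-character extension of "f", plus any candidates whose value is
// exactly "f".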
// First, compute the top candidates for prefixes of size at least minPrefix + 1.
PCollectionList<KV<String, List<CompletionCandidate>>> larger =
input.apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
// Consider the top candidates for each prefix of length minPrefix + 1...
PCollection<KV<String, List<CompletionCandidate>>> small =
PCollectionList.of(larger.get(1).apply(ParDo.of(new FlattenTops())))
// ...together with those (previously excluded) candidates of length
// exactly minPrefix...
.and(input.apply(Filter.by(c -> c.getValue().length() == minPrefix)))
.apply("FlattenSmall", Flatten.pCollections())
// ...set the key to be the minPrefix-length prefix...
.apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
// ...and (re)apply the Top operator to all of them together.
.apply(Top.largestPerKey(candidatesPerPrefix));
PCollection<KV<String, List<CompletionCandidate>>> flattenLarger =
larger.apply("FlattenLarge", Flatten.pCollections());
return PCollectionList.of(flattenLarger).and(small);
}
}
}
/** A DoFn that keys each candidate by all its prefixes. */
private static class AllPrefixes
extends DoFn<CompletionCandidate, KV<String, CompletionCandidate>> {
private final int minPrefix;
private final int maxPrefix;
public AllPrefixes(int minPrefix) {
this(minPrefix, Integer.MAX_VALUE);
}
public AllPrefixes(int minPrefix, int maxPrefix) {
this.minPrefix = minPrefix;
this.maxPrefix = maxPrefix;
}
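// For example, with minPrefix = 1 a candidate with value "beam" is emitted under the keys
// "b", "be", "bea", and "beam".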
@ProcessElement
public void processElement(ProcessContext c) {
String word = c.element().value;
for (int i = minPrefix; i <= Math.min(word.length(), maxPrefix); i++) {
c.output(KV.of(word.substring(0, i), c.element()));
}
}
}
/** Class used to store tag-count pairs. */
@DefaultCoder(AvroCoder.class)
static class CompletionCandidate implements Comparable<CompletionCandidate> {
private long count;
private String value;
public CompletionCandidate(String value, long count) {
this.value = value;
this.count = count;
}
public long getCount() {
return count;
}
public String getValue() {
return value;
}
// Empty constructor required for Avro decoding.
public CompletionCandidate() {}
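// Order by count, breaking ties lexicographically by value, so Top.largestPerKey keeps the
// most frequent tags for each prefix.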
@Override
public int compareTo(CompletionCandidate o) {
if (this.count < o.count) {
return -1;
} else if (this.count == o.count) {
return this.value.compareTo(o.value);
} else {
return 1;
}
}
@Override
public boolean equals(Object other) {
if (other instanceof CompletionCandidate) {
CompletionCandidate that = (CompletionCandidate) other;
return this.count == that.count && this.value.equals(that.value);
} else {
return false;
}
}
@Override
public int hashCode() {
return Long.hashCode(count) ^ value.hashCode();
}
@Override
public String toString() {
return "CompletionCandidate[" + value + ", " + count + "]";
}
}
/** Takes as input a set of strings, and emits each #hashtag found therein. */
static class ExtractHashtags extends DoFn<String, String> {
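// For example, "I love #hashtags and #beam" emits "hashtags" and "beam".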
@ProcessElement
public void processElement(ProcessContext c) {
Matcher m = Pattern.compile("#\\S+").matcher(c.element());
while (m.find()) {
c.output(m.group().substring(1));
}
}
}
static class FormatForBigquery extends DoFn<KV<String, List<CompletionCandidate>>, TableRow> {
@ProcessElement
public void processElement(ProcessContext c) {
List<TableRow> completions = new ArrayList<>();
for (CompletionCandidate cc : c.element().getValue()) {
completions.add(new TableRow().set("count", cc.getCount()).set("tag", cc.getValue()));
}
TableRow row = new TableRow().set("prefix", c.element().getKey()).set("tags", completions);
c.output(row);
}
/** Defines the BigQuery schema used for the output. */
static TableSchema getSchema() {
List<TableFieldSchema> tagFields = new ArrayList<>();
tagFields.add(new TableFieldSchema().setName("count").setType("INTEGER"));
tagFields.add(new TableFieldSchema().setName("tag").setType("STRING"));
List<TableFieldSchema> fields = new ArrayList<>();
fields.add(new TableFieldSchema().setName("prefix").setType("STRING"));
fields.add(
new TableFieldSchema()
.setName("tags")
.setType("RECORD")
.setMode("REPEATED")
.setFields(tagFields));
return new TableSchema().setFields(fields);
}
}
/**
* Takes as input the top candidates per prefix, and emits an entity suitable for writing to
* Cloud Datastore.
*
* <p>Note: We use ancestor keys for strong consistency. See the Cloud Datastore documentation on
* <a href="https://cloud.google.com/datastore/docs/concepts/structuring_for_strong_consistency">
* Structuring Data for Strong Consistency</a>
*/
static class FormatForDatastore extends DoFn<KV<String, List<CompletionCandidate>>, Entity> {
private String kind;
private String ancestorKey;
public FormatForDatastore(String kind, String ancestorKey) {
this.kind = kind;
this.ancestorKey = ancestorKey;
}
@ProcessElement
public void processElement(ProcessContext c) {
Entity.Builder entityBuilder = Entity.newBuilder();
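// Nest every prefix entity under the shared <kind>:<ancestorKey> parent so all of them live
// in one entity group, which is what makes ancestor queries strongly consistent.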
Key key = makeKey(makeKey(kind, ancestorKey).build(), kind, c.element().getKey()).build();
entityBuilder.setKey(key);
List<Value> candidates = new ArrayList<>();
Map<String, Value> properties = new HashMap<>();
for (CompletionCandidate tag : c.element().getValue()) {
// Store each candidate as a nested entity carrying its own tag and count.
Entity.Builder tagEntity = Entity.newBuilder();
tagEntity.putProperties("tag", makeValue(tag.value).build());
tagEntity.putProperties("count", makeValue(tag.count).build());
candidates.add(makeValue(tagEntity).build());
}
properties.put("candidates", makeValue(candidates).build());
entityBuilder.putAllProperties(properties);
c.output(entityBuilder.build());
}
}
/**
* Options supported by this class.
*
* <p>Inherits standard Beam example configuration options.
*/
public interface Options extends ExampleOptions, ExampleBigQueryTableOptions, StreamingOptions {
@Description("Input text file")
@Validation.Required
String getInputFile();
void setInputFile(String value);
@Description("Whether to use the recursive algorithm")
@Default.Boolean(true)
Boolean getRecursive();
void setRecursive(Boolean value);
@Description("Cloud Datastore entity kind")
@Default.String("autocomplete-demo")
String getKind();
void setKind(String value);
@Description("Whether to output to BigQuery")
@Default.Boolean(true)
Boolean getOutputToBigQuery();
void setOutputToBigQuery(Boolean value);
@Description("Whether to send output to the checksum transform.")
@Default.Boolean(true)
Boolean getOutputToChecksum();
void setOutputToChecksum(Boolean value);
@Description("Expected result of the checksum transform.")
Long getExpectedChecksum();
void setExpectedChecksum(Long value);
@Description("Whether to output to Cloud Datastore")
@Default.Boolean(false)
Boolean getOutputToDatastore();
void setOutputToDatastore(Boolean value);
@Description("Cloud Datastore ancestor key")
@Default.String("root")
String getDatastoreAncestorKey();
void setDatastoreAncestorKey(String value);
@Description("Cloud Datastore output project ID, defaults to project ID")
String getOutputProject();
void setOutputProject(String value);
}
public static void runAutocompletePipeline(Options options) throws IOException {
options.setBigQuerySchema(FormatForBigquery.getSchema());
ExampleUtils exampleUtils = new ExampleUtils(options);
// We support running the same pipeline in either
// batch or windowed streaming mode.
WindowFn<Object, ?> windowFn;
if (options.isStreaming()) {
checkArgument(!options.getOutputToDatastore(), "DatastoreIO is not supported in streaming.");
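// Recompute results over the trailing 30 minutes of data, emitting a refresh every 5 seconds.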
windowFn = SlidingWindows.of(Duration.standardMinutes(30)).every(Duration.standardSeconds(5));
} else {
windowFn = new GlobalWindows();
}
// Create the pipeline.
Pipeline p = Pipeline.create(options);
PCollection<KV<String, List<CompletionCandidate>>> toWrite =
p.apply(TextIO.read().from(options.getInputFile()))
.apply(ParDo.of(new ExtractHashtags()))
.apply(Window.into(windowFn))
.apply(ComputeTopCompletions.top(10, options.getRecursive()));
if (options.getOutputToDatastore()) {
toWrite
.apply(
"FormatForDatastore",
ParDo.of(
new FormatForDatastore(options.getKind(), options.getDatastoreAncestorKey())))
.apply(
DatastoreIO.v1()
.write()
.withProjectId(
MoreObjects.firstNonNull(options.getOutputProject(), options.getProject())));
}
if (options.getOutputToBigQuery()) {
exampleUtils.setupBigQueryTable();
TableReference tableRef = new TableReference();
tableRef.setProjectId(options.getProject());
tableRef.setDatasetId(options.getBigQueryDataset());
tableRef.setTableId(options.getBigQueryTable());
toWrite
.apply(ParDo.of(new FormatForBigquery()))
.apply(
BigQueryIO.writeTableRows()
.to(tableRef)
.withSchema(FormatForBigquery.getSchema())
.withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
.withWriteDisposition(
options.isStreaming()
? BigQueryIO.Write.WriteDisposition.WRITE_APPEND
: BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
}
if (options.getOutputToChecksum()) {
PCollection<Long> checksum =
toWrite
.apply(
ParDo.of(
new DoFn<KV<String, List<CompletionCandidate>>, Long>() {
@ProcessElement
public void process(ProcessContext c) {
KV<String, List<CompletionCandidate>> elm = c.element();
Long listHash =
c.element().getValue().stream().mapToLong(cc -> cc.hashCode()).sum();
c.output(Long.valueOf(elm.getKey().hashCode()) + listHash);
}
}))
.apply(Sum.longsGlobally());
PAssert.that(checksum).containsInAnyOrder(options.getExpectedChecksum());
}
// Run the pipeline.
PipelineResult result = p.run();
// ExampleUtils will try to cancel the pipeline and the injector before the program exits.
exampleUtils.waitToFinish(result);
}
public static void main(String[] args) throws IOException {
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
runAutocompletePipeline(options);
}
}