crunch-core/src/main/java/org/apache/crunch/Pipeline.java - crunch - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.crunch;

 import org.apache.crunch.types.PTableType;
 import org.apache.crunch.types.PType;
 import org.apache.hadoop.conf.Configuration;

 import java.util.List;

 /**
  * Manages the state of a pipeline execution.
  *
  */
 public interface Pipeline {

   /**
    * Set the {@code Configuration} to use with this pipeline.
    */
   void setConfiguration(Configuration conf);

   /**
    * Returns the name of this pipeline.
    *
    * @return Name of the pipeline
    */
   String getName();

   /**
    * Returns the {@code Configuration} instance associated with this pipeline.
    */
   Configuration getConfiguration();

   /**
    * Converts the given {@code Source} into a {@code PCollection} that is
    * available to jobs run using this {@code Pipeline} instance.
    *
    * @param source
    *          The source of data
    * @return A PCollection that references the given source
    */
   <T> PCollection<T> read(Source<T> source);

   /**
    * Converts the given {@code Source} into a {@code PCollection} that is
    * available to jobs run using this {@code Pipeline} instance.
    *
    * @param source The source of data
    * @param named A name for the returned PCollection
    * @return A PCollection that references the given source
    */
   <T> PCollection<T> read(Source<T> source, String named);

   /**
    * A version of the read method for {@code TableSource} instances that map to
    * {@code PTable}s.
    *
    * @param tableSource
    *          The source of the data
    * @return A PTable that references the given source
    */
   <K, V> PTable<K, V> read(TableSource<K, V> tableSource);

  /**
    * A version of the read method for {@code TableSource} instances that map to
    * {@code PTable}s.
    *
    * @param tableSource The source of the data
    * @param named A name for the returned PTable
    * @return A PTable that references the given source
    */
   <K, V> PTable<K, V> read(TableSource<K, V> tableSource, String named);

   /**
    * Write the given collection to the given target on the next pipeline run. The
    * system will check to see if the target's location already exists using the
    * {@code WriteMode.DEFAULT} rule for the given {@code Target}.
    *
    * @param collection
    *          The collection
    * @param target
    *          The output target
    */
   void write(PCollection<?> collection, Target target);

   /**
   * Write the contents of the {@code PCollection} to the given {@code Target},
   * using the storage format specified by the target and the given
   * {@code WriteMode} for cases where the referenced {@code Target}
   * already exists.
   *
   * @param collection
   *          The collection
   * @param target
   *          The target to write to
   * @param writeMode
   *          The strategy to use for handling existing outputs
   */
  void write(PCollection<?> collection, Target target,
      Target.WriteMode writeMode);

  /**
    * Create the given PCollection and read the data it contains into the
    * returned Collection instance for client use.
    *
    * @param pcollection
    *          The PCollection to materialize
    * @return the data from the PCollection as a read-only Collection
    */
   <T> Iterable<T> materialize(PCollection<T> pcollection);

   /**
    * Caches the given PCollection so that it will be processed at most once
    * during pipeline execution.
    *
    * @param pcollection The PCollection to cache
    * @param options The options for how the cached data is stored
    */
   <T> void cache(PCollection<T> pcollection, CachingOptions options);

   /**
    * Creates an empty {@code PCollection} of the given {@code PType}.
    *
    * @param ptype The PType of the empty PCollection
    * @return A valid PCollection with no contents
    */
   <T> PCollection<T> emptyPCollection(PType<T> ptype);

   /**
    * Creates an empty {@code PTable} of the given {@code PTable Type}.
    *
    * @param ptype The PTableType of the empty PTable
    * @return A valid PTable with no contents
    */
   <K, V> PTable<K, V> emptyPTable(PTableType<K, V> ptype);

   /**
    * Creates a {@code PCollection} containing the values found in the given {@code Iterable}
    * using an implementation-specific distribution mechanism.
    *
    * @param contents The values the new PCollection will contain
    * @param ptype The PType of the PCollection
    * @return A PCollection that contains the given values
    */
   <T> PCollection<T> create(Iterable<T> contents, PType<T> ptype);

   /**
    * Creates a {@code PCollection} containing the values found in the given {@code Iterable}
    * using an implementation-specific distribution mechanism.
    *
    * @param contents The values the new PCollection will contain
    * @param ptype The PType of the PCollection
    * @param options Additional options, such as the name or desired parallelism of the PCollection
    * @return A PCollection that contains the given values
    */
   <T> PCollection<T> create(Iterable<T> contents, PType<T> ptype, CreateOptions options);

   /**
    * Creates a {@code PTable} containing the values found in the given {@code Iterable}
    * using an implementation-specific distribution mechanism.
    *
    * @param contents The values the new PTable will contain
    * @param ptype The PTableType of the PTable
    * @return A PTable that contains the given values
    */
   <K, V> PTable<K, V> create(Iterable<Pair<K, V>> contents, PTableType<K, V> ptype);

   /**
    * Creates a {@code PTable} containing the values found in the given {@code Iterable}
    * using an implementation-specific distribution mechanism.
    *
    * @param contents The values the new PTable will contain
    * @param ptype The PTableType of the PTable
    * @param options Additional options, such as the name or desired parallelism of the PTable
    * @return A PTable that contains the given values
    */
   <K, V> PTable<K, V> create(Iterable<Pair<K, V>> contents, PTableType<K, V> ptype, CreateOptions options);

   <S> PCollection<S> union(List<PCollection<S>> collections);

   <K, V> PTable<K, V> unionTables(List<PTable<K, V>> tables);

   /**
    * Executes the given {@code PipelineCallable} on the client after the {@code Targets}
    * that the PipelineCallable depends on (if any) have been created by other pipeline
    * processing steps.
    *
    * @param pipelineCallable The sequential logic to execute
    * @param <Output> The return type of the PipelineCallable
    * @return The result of executing the PipelineCallable
    */
   <Output> Output sequentialDo(PipelineCallable<Output> pipelineCallable);

   /**
    * Constructs and executes a series of MapReduce jobs in order to write data
    * to the output targets.
    */
   PipelineResult run();

   /**
    * Constructs and starts a series of MapReduce jobs in order ot write data to
    * the output targets, but returns a {@code ListenableFuture} to allow clients to control
    * job execution.
    * @return
    */
   PipelineExecution runAsync();

   /**
    * Run any remaining jobs required to generate outputs and then clean up any
    * intermediate data files that were created in this run or previous calls to
    * {@code run}.
    */
   PipelineResult done();

   /**
   * Cleans up any artifacts created as a result of {@link #run() running} the pipeline.
   * @param force forces the cleanup even if all targets of the pipeline have not been completed.
   */
   void cleanup(boolean force);

   /**
    * A convenience method for reading a text file.
    */
   PCollection<String> readTextFile(String pathName);

   /**
    * A convenience method for writing a text file.
    */
   <T> void writeTextFile(PCollection<T> collection, String pathName);

   /**
    * Turn on debug logging for jobs that are run from this pipeline.
    */
   void enableDebug();
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.crunch;

	import org.apache.crunch.types.PTableType;
	import org.apache.crunch.types.PType;
	import org.apache.hadoop.conf.Configuration;

	import java.util.List;

	/**
	* Manages the state of a pipeline execution.
	*
	*/
	public interface Pipeline {

	/**
	* Set the {@code Configuration} to use with this pipeline.
	*/
	void setConfiguration(Configuration conf);

	/**
	* Returns the name of this pipeline.
	*
	* @return Name of the pipeline
	*/
	String getName();

	/**
	* Returns the {@code Configuration} instance associated with this pipeline.
	*/
	Configuration getConfiguration();

	/**
	* Converts the given {@code Source} into a {@code PCollection} that is
	* available to jobs run using this {@code Pipeline} instance.
	*
	* @param source
	* The source of data
	* @return A PCollection that references the given source
	*/
	<T> PCollection<T> read(Source<T> source);

	/**
	* Converts the given {@code Source} into a {@code PCollection} that is
	* available to jobs run using this {@code Pipeline} instance.
	*
	* @param source The source of data
	* @param named A name for the returned PCollection
	* @return A PCollection that references the given source
	*/
	<T> PCollection<T> read(Source<T> source, String named);

	/**
	* A version of the read method for {@code TableSource} instances that map to
	* {@code PTable}s.
	*
	* @param tableSource
	* The source of the data
	* @return A PTable that references the given source
	*/
	<K, V> PTable<K, V> read(TableSource<K, V> tableSource);

	/**
	* A version of the read method for {@code TableSource} instances that map to
	* {@code PTable}s.
	*
	* @param tableSource The source of the data
	* @param named A name for the returned PTable
	* @return A PTable that references the given source
	*/
	<K, V> PTable<K, V> read(TableSource<K, V> tableSource, String named);

	/**
	* Write the given collection to the given target on the next pipeline run. The
	* system will check to see if the target's location already exists using the
	* {@code WriteMode.DEFAULT} rule for the given {@code Target}.
	*
	* @param collection
	* The collection
	* @param target
	* The output target
	*/
	void write(PCollection<?> collection, Target target);

	/**
	* Write the contents of the {@code PCollection} to the given {@code Target},
	* using the storage format specified by the target and the given
	* {@code WriteMode} for cases where the referenced {@code Target}
	* already exists.
	*
	* @param collection
	* The collection
	* @param target
	* The target to write to
	* @param writeMode
	* The strategy to use for handling existing outputs
	*/
	void write(PCollection<?> collection, Target target,
	Target.WriteMode writeMode);

	/**
	* Create the given PCollection and read the data it contains into the
	* returned Collection instance for client use.
	*
	* @param pcollection
	* The PCollection to materialize
	* @return the data from the PCollection as a read-only Collection
	*/
	<T> Iterable<T> materialize(PCollection<T> pcollection);

	/**
	* Caches the given PCollection so that it will be processed at most once
	* during pipeline execution.
	*
	* @param pcollection The PCollection to cache
	* @param options The options for how the cached data is stored
	*/
	<T> void cache(PCollection<T> pcollection, CachingOptions options);

	/**
	* Creates an empty {@code PCollection} of the given {@code PType}.
	*
	* @param ptype The PType of the empty PCollection
	* @return A valid PCollection with no contents
	*/
	<T> PCollection<T> emptyPCollection(PType<T> ptype);

	/**
	* Creates an empty {@code PTable} of the given {@code PTable Type}.
	*
	* @param ptype The PTableType of the empty PTable
	* @return A valid PTable with no contents
	*/
	<K, V> PTable<K, V> emptyPTable(PTableType<K, V> ptype);

	/**
	* Creates a {@code PCollection} containing the values found in the given {@code Iterable}
	* using an implementation-specific distribution mechanism.
	*
	* @param contents The values the new PCollection will contain
	* @param ptype The PType of the PCollection
	* @return A PCollection that contains the given values
	*/
	<T> PCollection<T> create(Iterable<T> contents, PType<T> ptype);

	/**
	* Creates a {@code PCollection} containing the values found in the given {@code Iterable}
	* using an implementation-specific distribution mechanism.
	*
	* @param contents The values the new PCollection will contain
	* @param ptype The PType of the PCollection
	* @param options Additional options, such as the name or desired parallelism of the PCollection
	* @return A PCollection that contains the given values
	*/
	<T> PCollection<T> create(Iterable<T> contents, PType<T> ptype, CreateOptions options);

	/**
	* Creates a {@code PTable} containing the values found in the given {@code Iterable}
	* using an implementation-specific distribution mechanism.
	*
	* @param contents The values the new PTable will contain
	* @param ptype The PTableType of the PTable
	* @return A PTable that contains the given values
	*/
	<K, V> PTable<K, V> create(Iterable<Pair<K, V>> contents, PTableType<K, V> ptype);

	/**
	* Creates a {@code PTable} containing the values found in the given {@code Iterable}
	* using an implementation-specific distribution mechanism.
	*
	* @param contents The values the new PTable will contain
	* @param ptype The PTableType of the PTable
	* @param options Additional options, such as the name or desired parallelism of the PTable
	* @return A PTable that contains the given values
	*/
	<K, V> PTable<K, V> create(Iterable<Pair<K, V>> contents, PTableType<K, V> ptype, CreateOptions options);

	<S> PCollection<S> union(List<PCollection<S>> collections);

	<K, V> PTable<K, V> unionTables(List<PTable<K, V>> tables);

	/**
	* Executes the given {@code PipelineCallable} on the client after the {@code Targets}
	* that the PipelineCallable depends on (if any) have been created by other pipeline
	* processing steps.
	*
	* @param pipelineCallable The sequential logic to execute
	* @param <Output> The return type of the PipelineCallable
	* @return The result of executing the PipelineCallable
	*/
	<Output> Output sequentialDo(PipelineCallable<Output> pipelineCallable);

	/**
	* Constructs and executes a series of MapReduce jobs in order to write data
	* to the output targets.
	*/
	PipelineResult run();

	/**
	* Constructs and starts a series of MapReduce jobs in order ot write data to
	* the output targets, but returns a {@code ListenableFuture} to allow clients to control
	* job execution.
	* @return
	*/
	PipelineExecution runAsync();

	/**
	* Run any remaining jobs required to generate outputs and then clean up any
	* intermediate data files that were created in this run or previous calls to
	* {@code run}.
	*/
	PipelineResult done();

	/**
	* Cleans up any artifacts created as a result of {@link #run() running} the pipeline.
	* @param force forces the cleanup even if all targets of the pipeline have not been completed.
	*/
	void cleanup(boolean force);

	/**
	* A convenience method for reading a text file.
	*/
	PCollection<String> readTextFile(String pathName);

	/**
	* A convenience method for writing a text file.
	*/
	<T> void writeTextFile(PCollection<T> collection, String pathName);

	/**
	* Turn on debug logging for jobs that are run from this pipeline.
	*/
	void enableDebug();
	}