/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.scrunch

import org.apache.hadoop.conf.Configuration

import org.apache.crunch.{Pipeline => JPipeline, _}
import org.apache.crunch.scrunch.interpreter.InterpreterRunner
import org.apache.crunch.types.{PTableType, PType}

import scala.collection.JavaConversions
import scala.collection.JavaConversions.asJavaCollection
trait PipelineLike {
  def jpipeline: JPipeline

  // Runs at construction time so the configuration is prepared before any
  // subsequent calls into the system.
  PipelineLike.setupConf(getConfiguration())

  /**
   * Gets the configuration object associated with this pipeline.
   */
  def getConfiguration(): Configuration = jpipeline.getConfiguration()

  /**
   * Sets the configuration object associated with this pipeline.
   */
  def setConfiguration(conf: Configuration): Unit = {
    jpipeline.setConfiguration(conf)
  }

  /**
   * Returns the name of this pipeline instance.
   */
  def getName(): String = jpipeline.getName()

  /**
   * Reads a source into a [[org.apache.crunch.scrunch.PCollection]].
   *
   * @param source The source to read from.
   * @tparam T The type of the values being read.
   * @return A PCollection containing data read from the specified source.
   */
  def read[T](source: Source[T]): PCollection[T] = new PCollection(jpipeline.read(source))
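
  // Illustrative usage (a hedged sketch): `From.textFile` is the text-file source
  // factory in `org.apache.crunch.io.From`; the input path is hypothetical.
  //
  //   val lines: PCollection[String] = read(From.textFile("/data/events.txt"))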

  /**
   * Reads a source into a [[org.apache.crunch.scrunch.PCollection]].
   *
   * @param source The source to read from.
   * @param named A short name to use for the returned PCollection.
   * @tparam T The type of the values being read.
   * @return A PCollection containing data read from the specified source.
   */
  def read[T](source: Source[T], named: String): PCollection[T] = new PCollection(jpipeline.read(source, named))

  /**
   * Reads a source into a [[org.apache.crunch.scrunch.PTable]].
   *
   * @param source The source to read from.
   * @tparam K The type of the keys being read.
   * @tparam V The type of the values being read.
   * @return A PTable containing data read from the specified source.
   */
  def read[K, V](source: TableSource[K, V]): PTable[K, V] = new PTable(jpipeline.read(source))

  /**
   * Reads a source into a [[org.apache.crunch.scrunch.PTable]].
   *
   * @param source The source to read from.
   * @param named A short name to use for the returned PTable.
   * @tparam K The type of the keys being read.
   * @tparam V The type of the values being read.
   * @return A PTable containing data read from the specified source.
   */
  def read[K, V](source: TableSource[K, V], named: String): PTable[K, V] = new PTable(jpipeline.read(source, named))
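
  // Illustrative usage (hedged; the TableSource is assumed to come from elsewhere,
  // e.g. one of the factories in `org.apache.crunch.io.From`):
  //
  //   val src: TableSource[String, String] = ...  // hypothetical source
  //   val table: PTable[String, String] = read(src, "input-table")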

  /**
   * Writes a parallel collection to a target.
   *
   * @param collection The collection to write.
   * @param target The destination target for this write.
   */
  def write(collection: PCollection[_], target: Target): Unit = jpipeline.write(collection.native, target)

  /**
   * Writes a parallel collection to a target using an output strategy.
   *
   * @param collection The collection to write.
   * @param target The destination target for this write.
   * @param writeMode The WriteMode to use for handling existing outputs.
   */
  def write(collection: PCollection[_], target: Target, writeMode: Target.WriteMode): Unit = {
    jpipeline.write(collection.native, target, writeMode)
  }
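
  // Illustrative usage (hedged): `To.textFile` is the text-file target factory in
  // `org.apache.crunch.io.To`, and `Target.WriteMode.OVERWRITE` replaces existing
  // output instead of failing. `lines` and the path are hypothetical.
  //
  //   write(lines, To.textFile("/data/out"), Target.WriteMode.OVERWRITE)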

  /**
   * Writes a parallel table to a target.
   *
   * @param table The table to write.
   * @param target The destination target for this write.
   */
  def write(table: PTable[_, _], target: Target): Unit = jpipeline.write(table.native, target)

  /**
   * Writes a parallel table to a target using an output strategy.
   *
   * @param table The table to write.
   * @param target The destination target for this write.
   * @param writeMode The WriteMode to use for handling existing outputs.
   */
  def write(table: PTable[_, _], target: Target, writeMode: Target.WriteMode): Unit = {
    jpipeline.write(table.native, target, writeMode)
  }

  /**
   * Creates an empty PCollection of the given PType.
   */
  def emptyPCollection[T](pt: PType[T]) = new PCollection[T](jpipeline.emptyPCollection(pt))

  /**
   * Creates an empty PTable of the given PTableType.
   */
  def emptyPTable[K, V](pt: PTableType[K, V]) = new PTable[K, V](jpipeline.emptyPTable(pt))
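
  // Illustrative usage (hedged): `Writables` in `org.apache.crunch.types.writable`
  // provides standard PType/PTableType instances.
  //
  //   val noStrings: PCollection[String] = emptyPCollection(Writables.strings())
  //   val noPairs: PTable[String, String] =
  //     emptyPTable(Writables.tableOf(Writables.strings(), Writables.strings()))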

  /**
   * Creates a new PCollection from the given elements.
   */
  def create[T](elements: Iterable[T], pt: PType[T]) = {
    new PCollection[T](jpipeline.create(asJavaCollection(elements), pt))
  }
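
  // Illustrative usage (hedged; `Writables.strings` as above):
  //
  //   val names: PCollection[String] = create(Seq("alice", "bob"), Writables.strings())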

  /**
   * Creates a new PCollection from the given elements, using the given creation options.
   */
  def create[T](elements: Iterable[T], pt: PType[T], options: CreateOptions) = {
    new PCollection[T](jpipeline.create(asJavaCollection(elements), pt, options))
  }

  /**
   * Creates a new PTable from the given elements.
   */
  def create[K, V](elements: Iterable[(K, V)], pt: PTableType[K, V]) = {
    new PTable[K, V](jpipeline.create(asJavaCollection(elements.map(t => Pair.of(t._1, t._2))), pt))
  }

  /**
   * Creates a new PTable from the given elements, using the given creation options.
   */
  def create[K, V](elements: Iterable[(K, V)], pt: PTableType[K, V], options: CreateOptions) = {
    new PTable[K, V](jpipeline.create(asJavaCollection(elements.map(t => Pair.of(t._1, t._2))), pt, options))
  }
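
  // Illustrative usage (hedged): the Scala tuples are converted to Crunch `Pair`s
  // before being handed to the underlying Java pipeline.
  //
  //   val table: PTable[String, String] =
  //     create(Seq(("k1", "v1"), ("k2", "v2")),
  //            Writables.tableOf(Writables.strings(), Writables.strings()))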

  /**
   * Creates a new PCollection as the union of the given collections.
   */
  def union[S](elements: Seq[PCollection[S]]) = {
    val natives = elements.map(pc => pc.native)
    val jpc = jpipeline.union(JavaConversions.seqAsJavaList(natives))
    new PCollection[S](jpc)
  }

  /**
   * Creates a new PTable as the union of the given tables.
   */
  def unionTables[K, V](elements: Seq[PTable[K, V]]) = {
    val natives = elements.map(pc => pc.native)
    val jpt = jpipeline.unionTables(JavaConversions.seqAsJavaList(natives))
    new PTable[K, V](jpt)
  }
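
  // Illustrative usage (hedged; `first` and `second` are hypothetical PCollections
  // of the same element type):
  //
  //   val combined: PCollection[String] = union(Seq(first, second))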

  /**
   * Adds the given [[org.apache.crunch.PipelineCallable]] to the pipeline execution
   * and returns its output.
   */
  def sequentialDo[Output](seqDoFn: PipelineCallable[Output]) = jpipeline.sequentialDo(seqDoFn)

  /**
   * Returns a handler for controlling the execution of the underlying MapReduce
   * pipeline.
   */
  def runAsync(): PipelineExecution = {
    PipelineLike.setupConf(getConfiguration())
    jpipeline.runAsync()
  }
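
  // Illustrative usage (a hedged sketch; `waitUntilDone` is declared on Crunch's
  // `PipelineExecution` interface):
  //
  //   val execution: PipelineExecution = runAsync()
  //   execution.waitUntilDone()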

  /**
   * Constructs and executes a series of MapReduce jobs in order
   * to write data to the output targets.
   */
  def run(): PipelineResult = {
    PipelineLike.setupConf(getConfiguration())
    jpipeline.run()
  }

  /**
   * Runs any remaining jobs required to generate outputs, and then
   * cleans up any intermediate data files created during this run or
   * previous calls to `run`.
   */
  def done(): PipelineResult = {
    PipelineLike.setupConf(getConfiguration())
    jpipeline.done()
  }
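
  // End-to-end sketch (hedged): `From`/`To` are the standard Crunch I/O factories in
  // `org.apache.crunch.io`, and the paths are hypothetical.
  //
  //   val lines = read(From.textFile("/data/in.txt"))
  //   write(lines, To.textFile("/data/out"))
  //   val result = done()
  //   if (!result.succeeded()) sys.error("pipeline failed")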

  /**
   * Cleans up any artifacts created as a result of running the pipeline (see [[run]]).
   *
   * @param force Forces the cleanup even if not all targets of the pipeline have been completed.
   */
  def cleanup(force: Boolean): Unit = {
    jpipeline.cleanup(force)
  }

  /**
   * Turns on debug logging for jobs that are run from this pipeline.
   */
  def debug(): Unit = jpipeline.enableDebug()
}

object PipelineLike {
  def setupConf(conf: Configuration): Unit = {
    InterpreterRunner.addReplJarsToJob(conf)
    if (conf.get("crunch.reflectdatafactory", "").isEmpty) {
      // Enable the Scala-specific ReflectDataFactory, unless the user has already configured one.
      conf.set("crunch.reflectdatafactory", classOf[ScalaReflectDataFactory].getCanonicalName)
    }
  }
}
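
// Note (a hedged sketch): because setupConf only sets "crunch.reflectdatafactory" when
// it is unset, callers can pin their own factory before the pipeline is constructed:
//
//   val conf = new Configuration()
//   conf.set("crunch.reflectdatafactory", classOf[MyReflectDataFactory].getCanonicalName)  // hypothetical class
//   PipelineLike.setupConf(conf)  // leaves the caller's setting intact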