examples/java/src/main/java/org/apache/beam/examples/cookbook/FilterExamples.java - beam - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.beam.examples.cookbook;

 import com.google.api.services.bigquery.model.TableFieldSchema;
 import com.google.api.services.bigquery.model.TableRow;
 import com.google.api.services.bigquery.model.TableSchema;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.logging.Logger;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 import org.apache.beam.sdk.options.Default;
 import org.apache.beam.sdk.options.Description;
 import org.apache.beam.sdk.options.PipelineOptions;
 import org.apache.beam.sdk.options.PipelineOptionsFactory;
 import org.apache.beam.sdk.options.Validation;
 import org.apache.beam.sdk.transforms.DoFn;
 import org.apache.beam.sdk.transforms.Mean;
 import org.apache.beam.sdk.transforms.PTransform;
 import org.apache.beam.sdk.transforms.ParDo;
 import org.apache.beam.sdk.transforms.View;
 import org.apache.beam.sdk.values.PCollection;
 import org.apache.beam.sdk.values.PCollectionView;

 /**
  * This is an example that demonstrates several approaches to filtering, and use of the Mean
  * transform. It shows how to dynamically set parameters by defining and using new pipeline options,
  * and how to use a value derived by the pipeline.
  *
  * <p>Concepts: The Mean transform; Options configuration; using pipeline-derived data as a side
  * input; approaches to filtering, selection, and projection.
  *
  * <p>The example reads public samples of weather data from BigQuery. It performs a
  * projection on the data, finds the global mean of the temperature readings, filters on readings
  * for a single given month, and then outputs only data (for that month) that has a mean temp
  * smaller than the derived global mean.
 *
  * <p>Note: Before running this example, you must create a BigQuery dataset to contain your output
  * table.
  *
  * <p>To execute this pipeline locally, specify the BigQuery table for the output:
  * <pre>{@code
  *   --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
  *   [--monthFilter=<month_number>]
  * }
  * </pre>
  * where optional parameter {@code --monthFilter} is set to a number 1-12.
  *
  * <p>To change the runner, specify:
  * <pre>{@code
  *   --runner=YOUR_SELECTED_RUNNER
  * }
  * </pre>
  * See examples/java/README.md for instructions about how to configure different runners.
  *
  * <p>The BigQuery input table defaults to {@code clouddataflow-readonly:samples.weather_stations}
  * and can be overridden with {@code --input}.
  */
 public class FilterExamples {
   // Default BigQuery input table; supplied as the default value of the --input option below.
   // Default to using a 1000 row subset of the public weather station table publicdata:samples.gsod.
   private static final String WEATHER_SAMPLES_TABLE =
       "clouddataflow-readonly:samples.weather_stations";
   // java.util.logging logger named after this class.
   static final Logger LOG = Logger.getLogger(FilterExamples.class.getName());
   // Default month number for the --monthFilter option (the option is documented as 1-12).
   static final int MONTH_TO_FILTER = 7;

   /**
    * Projects each input row down to the four cells this example uses: {@code year},
    * {@code month}, {@code day} and {@code mean_temp}.
    *
    * <p>The date parts are emitted as integers and the temperature as a double. Every cell is
    * parsed from its string form, so a cell may hold either a string or a numeric object.
    */
   static class ProjectionFn extends DoFn<TableRow, TableRow> {
     /**
      * Builds and outputs the projected row for one input element.
      *
      * @param c context giving access to the input row and the output receiver
      * @throws NumberFormatException if a date cell is null or not a valid integer, or if
      *     {@code mean_temp} is not a valid double
      * @throws NullPointerException if the {@code mean_temp} cell is null
      */
     @ProcessElement
     public void processElement(ProcessContext c){
       TableRow row = c.element();
       // Grab year, month, day, mean_temp from the row
       int year = parseIntCell(row, "year");
       int month = parseIntCell(row, "month");
       int day = parseIntCell(row, "day");
       double meanTemp = Double.parseDouble(row.get("mean_temp").toString());
       // Prepares the data for writing to BigQuery by building a TableRow object
       TableRow outRow = new TableRow()
           .set("year", year).set("month", month)
           .set("day", day).set("mean_temp", meanTemp);
       c.output(outRow);
     }

     /**
      * Parses the named cell of {@code row} as an int.
      *
      * <p>Uses {@link String#valueOf(Object)} instead of a {@code String} cast so that a cell
      * holding a numeric object is accepted too. A null cell becomes the text {@code "null"}
      * and therefore still fails with {@link NumberFormatException}.
      */
     private static int parseIntCell(TableRow row, String field) {
       return Integer.parseInt(String.valueOf(row.get(field)));
     }
   }

   /**
    * A filtering {@link DoFn} that passes through only the rows recorded in one month.
    *
    * <p>The month to keep is supplied to the constructor. Each row's {@code month} cell is read
    * as an {@link Integer} and compared with it; rows from any other month are dropped.
    */
   static class FilterSingleMonthDataFn extends DoFn<TableRow, TableRow> {
     Integer monthFilter;

     /**
      * Creates a filter for a single month.
      *
      * @param monthFilter numeric month whose rows are kept
      */
     public FilterSingleMonthDataFn(Integer monthFilter) {
       this.monthFilter = monthFilter;
     }

     /** Emits the input row unchanged when its month equals the configured month. */
     @ProcessElement
     public void processElement(ProcessContext c){
       TableRow candidate = c.element();
       Integer rowMonth = (Integer) candidate.get("month");
       boolean keep = rowMonth.equals(monthFilter);
       if (!keep) {
         return;
       }
       c.output(candidate);
     }
   }

   /**
    * Maps each weather-reading row to its temperature: the {@code mean_temp} cell, parsed
    * from its string form into a {@link Double}.
    */
   static class ExtractTempFn extends DoFn<TableRow, Double> {
     /** Outputs the parsed {@code mean_temp} value of the current row. */
     @ProcessElement
     public void processElement(ProcessContext c){
       Object rawTemp = c.element().get("mean_temp");
       c.output(Double.valueOf(rawTemp.toString()));
     }
   }


   /**
    * Finds the global mean of the {@code mean_temp} readings across all input rows, and outputs
    * only the rows for the configured month whose {@code mean_temp} is smaller than that mean.
    *
    * <p>The mean is computed over every input row, not just the selected month, and is handed
    * to the final filtering step as a singleton side input.
    */
   static class BelowGlobalMean
       extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
     Integer monthFilter;

     /**
      * Creates the transform for a single month.
      *
      * @param monthFilter numeric month whose rows are candidates for output
      */
     public BelowGlobalMean(Integer monthFilter) {
       this.monthFilter = monthFilter;
     }

     /**
      * Wires up the mean computation, the month filter and the below-mean filter.
      *
      * @param rows rows carrying at least the {@code month} and {@code mean_temp} cells
      * @return the rows of the configured month with a temperature strictly below the global mean
      */
     @Override
     public PCollection<TableRow> expand(PCollection<TableRow> rows) {

       // Extract the mean_temp from each row.
       PCollection<Double> meanTemps = rows.apply(ParDo.of(new ExtractTempFn()));

       // Find the global mean, of all the mean_temp readings in the weather data,
       // and prepare this singleton PCollectionView for use as a side input.
       final PCollectionView<Double> globalMeanTemp =
           meanTemps
               .apply(Mean.<Double>globally())
               .apply(View.<Double>asSingleton());

       // Rows filtered to remove all but a single month
       PCollection<TableRow> monthFilteredRows =
           rows.apply(ParDo.of(new FilterSingleMonthDataFn(monthFilter)));

       // Then, use the global mean as a side input, to further filter the weather data.
       // By using a side input to pass in the filtering criteria, we can use a value
       // that is computed earlier in pipeline execution.
       // We'll only output readings with temperatures below this mean.
       return monthFilteredRows
           .apply("ParseAndFilter", ParDo
               .of(new DoFn<TableRow, TableRow>() {
                 @ProcessElement
                 public void processElement(ProcessContext c) {
                   double meanTemp =
                       Double.parseDouble(c.element().get("mean_temp").toString());
                   double gTemp = c.sideInput(globalMeanTemp);
                   if (meanTemp < gTemp) {
                     c.output(c.element());
                   }
                 }
               }).withSideInputs(globalMeanTemp));
     }
   }


   /**
    * Options supported by {@link FilterExamples}.
    *
    * <p>Inherits standard configuration options.
    *
    * <p>Declared {@code public} because instances are created as a proxy by
    * {@link PipelineOptionsFactory}; a non-public interface restricts the package in which such
    * a proxy class can be defined.
    */
   public interface Options extends PipelineOptions {
     /** Returns the BigQuery table to read from; defaults to the sample weather table. */
     @Description("Table to read from, specified as "
         + "<project_id>:<dataset_id>.<table_id>")
     @Default.String(WEATHER_SAMPLES_TABLE)
     String getInput();
     void setInput(String value);

     /** Returns the BigQuery table to write to; this option is required. */
     @Description("Table to write to, specified as "
         + "<project_id>:<dataset_id>.<table_id>. "
         + "The dataset_id must already exist")
     @Validation.Required
     String getOutput();
     void setOutput(String value);

     /** Returns the month number to filter on; defaults to {@code MONTH_TO_FILTER}. */
     @Description("Numeric value of month to filter on")
     @Default.Integer(MONTH_TO_FILTER)
     Integer getMonthFilter();
     void setMonthFilter(Integer value);
   }

   /**
    * Builds the schema of the output table: INTEGER columns {@code year}, {@code month} and
    * {@code day}, followed by a FLOAT column {@code mean_temp}.
    *
    * @return a new {@link TableSchema} holding those four fields in that order
    */
   private static TableSchema buildWeatherSchemaProjection() {
     List<TableFieldSchema> columns = new ArrayList<>();
     // The three date parts share the same column type.
     for (String datePart : new String[] {"year", "month", "day"}) {
       columns.add(new TableFieldSchema().setName(datePart).setType("INTEGER"));
     }
     columns.add(new TableFieldSchema().setName("mean_temp").setType("FLOAT"));
     return new TableSchema().setFields(columns);
   }

   /**
    * Entry point: parses and validates the options, assembles the pipeline
    * (read, project, filter below the global mean, write) and runs it to completion.
    *
    * @param args command-line arguments, parsed into {@link Options}
    */
   public static void main(String[] args)
       throws Exception {

     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
     Pipeline p = Pipeline.create(options);

     TableSchema schema = buildWeatherSchemaProjection();

     // Read the input table and keep only the cells of interest.
     PCollection<TableRow> projected = p
         .apply(BigQueryIO.read().from(options.getInput()))
         .apply(ParDo.of(new ProjectionFn()));

     // Keep the rows of the selected month that are colder than the global mean.
     PCollection<TableRow> belowMean =
         projected.apply(new BelowGlobalMean(options.getMonthFilter()));

     // Write the result, creating the table if needed and replacing any existing contents.
     belowMean.apply(BigQueryIO.writeTableRows()
         .to(options.getOutput())
         .withSchema(schema)
         .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
         .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

     p.run().waitUntilFinish();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.beam.examples.cookbook;

	import com.google.api.services.bigquery.model.TableFieldSchema;
	import com.google.api.services.bigquery.model.TableRow;
	import com.google.api.services.bigquery.model.TableSchema;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.logging.Logger;
	import org.apache.beam.sdk.Pipeline;
	import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
	import org.apache.beam.sdk.options.Default;
	import org.apache.beam.sdk.options.Description;
	import org.apache.beam.sdk.options.PipelineOptions;
	import org.apache.beam.sdk.options.PipelineOptionsFactory;
	import org.apache.beam.sdk.options.Validation;
	import org.apache.beam.sdk.transforms.DoFn;
	import org.apache.beam.sdk.transforms.Mean;
	import org.apache.beam.sdk.transforms.PTransform;
	import org.apache.beam.sdk.transforms.ParDo;
	import org.apache.beam.sdk.transforms.View;
	import org.apache.beam.sdk.values.PCollection;
	import org.apache.beam.sdk.values.PCollectionView;

	/**
	* This is an example that demonstrates several approaches to filtering, and use of the Mean
	* transform. It shows how to dynamically set parameters by defining and using new pipeline options,
	* and how to use a value derived by the pipeline.
	*
	* <p>Concepts: The Mean transform; Options configuration; using pipeline-derived data as a side
	* input; approaches to filtering, selection, and projection.
	*
	* <p>The example reads public samples of weather data from BigQuery. It performs a
	* projection on the data, finds the global mean of the temperature readings, filters on readings
	* for a single given month, and then outputs only data (for that month) that has a mean temp
	* smaller than the derived global mean.
	*
	* <p>Note: Before running this example, you must create a BigQuery dataset to contain your output
	* table.
	*
	* <p>To execute this pipeline locally, specify the BigQuery table for the output:
	* <pre>{@code
	* --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
	* [--monthFilter=<month_number>]
	* }
	* </pre>
	* where optional parameter {@code --monthFilter} is set to a number 1-12.
	*
	* <p>To change the runner, specify:
	* <pre>{@code
	* --runner=YOUR_SELECTED_RUNNER
	* }
	* </pre>
	* See examples/java/README.md for instructions about how to configure different runners.
	*
	* <p>The BigQuery input table defaults to {@code clouddataflow-readonly:samples.weather_stations}
	* and can be overridden with {@code --input}.
	*/
	public class FilterExamples {
	// Default BigQuery input table; supplied as the default value of the --input option below.
	// Default to using a 1000 row subset of the public weather station table publicdata:samples.gsod.
	private static final String WEATHER_SAMPLES_TABLE =
	"clouddataflow-readonly:samples.weather_stations";
	// java.util.logging logger named after this class.
	static final Logger LOG = Logger.getLogger(FilterExamples.class.getName());
	// Default month number for the --monthFilter option (the option is documented as 1-12).
	static final int MONTH_TO_FILTER = 7;

	/**
	 * Projects each input row down to the four cells this example uses: {@code year},
	 * {@code month}, {@code day} and {@code mean_temp}.
	 *
	 * <p>The date parts are emitted as integers and the temperature as a double. Every cell is
	 * parsed from its string form, so a cell may hold either a string or a numeric object.
	 */
	static class ProjectionFn extends DoFn<TableRow, TableRow> {
	  /**
	   * Builds and outputs the projected row for one input element.
	   *
	   * @param c context giving access to the input row and the output receiver
	   * @throws NumberFormatException if a date cell is null or not a valid integer, or if
	   *     {@code mean_temp} is not a valid double
	   * @throws NullPointerException if the {@code mean_temp} cell is null
	   */
	  @ProcessElement
	  public void processElement(ProcessContext c){
	    TableRow row = c.element();
	    // Grab year, month, day, mean_temp from the row
	    int year = parseIntCell(row, "year");
	    int month = parseIntCell(row, "month");
	    int day = parseIntCell(row, "day");
	    double meanTemp = Double.parseDouble(row.get("mean_temp").toString());
	    // Prepares the data for writing to BigQuery by building a TableRow object
	    TableRow outRow = new TableRow()
	        .set("year", year).set("month", month)
	        .set("day", day).set("mean_temp", meanTemp);
	    c.output(outRow);
	  }

	  /**
	   * Parses the named cell of {@code row} as an int.
	   *
	   * <p>Uses {@link String#valueOf(Object)} instead of a {@code String} cast so that a cell
	   * holding a numeric object is accepted too. A null cell becomes the text {@code "null"}
	   * and therefore still fails with {@link NumberFormatException}.
	   */
	  private static int parseIntCell(TableRow row, String field) {
	    return Integer.parseInt(String.valueOf(row.get(field)));
	  }
	}

	/**
	 * A filtering {@link DoFn} that passes through only the rows recorded in one month.
	 *
	 * <p>The month to keep is supplied to the constructor. Each row's {@code month} cell is read
	 * as an {@link Integer} and compared with it; rows from any other month are dropped.
	 */
	static class FilterSingleMonthDataFn extends DoFn<TableRow, TableRow> {
	  Integer monthFilter;

	  /**
	   * Creates a filter for a single month.
	   *
	   * @param monthFilter numeric month whose rows are kept
	   */
	  public FilterSingleMonthDataFn(Integer monthFilter) {
	    this.monthFilter = monthFilter;
	  }

	  /** Emits the input row unchanged when its month equals the configured month. */
	  @ProcessElement
	  public void processElement(ProcessContext c){
	    TableRow candidate = c.element();
	    Integer rowMonth = (Integer) candidate.get("month");
	    boolean keep = rowMonth.equals(monthFilter);
	    if (!keep) {
	      return;
	    }
	    c.output(candidate);
	  }
	}

	/**
	 * Maps each weather-reading row to its temperature: the {@code mean_temp} cell, parsed
	 * from its string form into a {@link Double}.
	 */
	static class ExtractTempFn extends DoFn<TableRow, Double> {
	  /** Outputs the parsed {@code mean_temp} value of the current row. */
	  @ProcessElement
	  public void processElement(ProcessContext c){
	    Object rawTemp = c.element().get("mean_temp");
	    c.output(Double.valueOf(rawTemp.toString()));
	  }
	}



	/**
	 * Finds the global mean of the {@code mean_temp} readings across all input rows, and outputs
	 * only the rows for the configured month whose {@code mean_temp} is smaller than that mean.
	 *
	 * <p>The mean is computed over every input row, not just the selected month, and is handed
	 * to the final filtering step as a singleton side input.
	 */
	static class BelowGlobalMean
	    extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
	  Integer monthFilter;

	  /**
	   * Creates the transform for a single month.
	   *
	   * @param monthFilter numeric month whose rows are candidates for output
	   */
	  public BelowGlobalMean(Integer monthFilter) {
	    this.monthFilter = monthFilter;
	  }

	  /**
	   * Wires up the mean computation, the month filter and the below-mean filter.
	   *
	   * @param rows rows carrying at least the {@code month} and {@code mean_temp} cells
	   * @return the rows of the configured month with a temperature strictly below the global mean
	   */
	  @Override
	  public PCollection<TableRow> expand(PCollection<TableRow> rows) {

	    // Extract the mean_temp from each row.
	    PCollection<Double> meanTemps = rows.apply(ParDo.of(new ExtractTempFn()));

	    // Find the global mean, of all the mean_temp readings in the weather data,
	    // and prepare this singleton PCollectionView for use as a side input.
	    final PCollectionView<Double> globalMeanTemp =
	        meanTemps
	            .apply(Mean.<Double>globally())
	            .apply(View.<Double>asSingleton());

	    // Rows filtered to remove all but a single month
	    PCollection<TableRow> monthFilteredRows =
	        rows.apply(ParDo.of(new FilterSingleMonthDataFn(monthFilter)));

	    // Then, use the global mean as a side input, to further filter the weather data.
	    // By using a side input to pass in the filtering criteria, we can use a value
	    // that is computed earlier in pipeline execution.
	    // We'll only output readings with temperatures below this mean.
	    return monthFilteredRows
	        .apply("ParseAndFilter", ParDo
	            .of(new DoFn<TableRow, TableRow>() {
	              @ProcessElement
	              public void processElement(ProcessContext c) {
	                double meanTemp =
	                    Double.parseDouble(c.element().get("mean_temp").toString());
	                double gTemp = c.sideInput(globalMeanTemp);
	                if (meanTemp < gTemp) {
	                  c.output(c.element());
	                }
	              }
	            }).withSideInputs(globalMeanTemp));
	  }
	}


	/**
	 * Options supported by {@link FilterExamples}.
	 *
	 * <p>Inherits standard configuration options.
	 *
	 * <p>Declared {@code public} because instances are created as a proxy by
	 * {@link PipelineOptionsFactory}; a non-public interface restricts the package in which such
	 * a proxy class can be defined.
	 */
	public interface Options extends PipelineOptions {
	  /** Returns the BigQuery table to read from; defaults to the sample weather table. */
	  @Description("Table to read from, specified as "
	      + "<project_id>:<dataset_id>.<table_id>")
	  @Default.String(WEATHER_SAMPLES_TABLE)
	  String getInput();
	  void setInput(String value);

	  /** Returns the BigQuery table to write to; this option is required. */
	  @Description("Table to write to, specified as "
	      + "<project_id>:<dataset_id>.<table_id>. "
	      + "The dataset_id must already exist")
	  @Validation.Required
	  String getOutput();
	  void setOutput(String value);

	  /** Returns the month number to filter on; defaults to {@code MONTH_TO_FILTER}. */
	  @Description("Numeric value of month to filter on")
	  @Default.Integer(MONTH_TO_FILTER)
	  Integer getMonthFilter();
	  void setMonthFilter(Integer value);
	}

	/**
	 * Builds the schema of the output table: INTEGER columns {@code year}, {@code month} and
	 * {@code day}, followed by a FLOAT column {@code mean_temp}.
	 *
	 * @return a new {@link TableSchema} holding those four fields in that order
	 */
	private static TableSchema buildWeatherSchemaProjection() {
	  List<TableFieldSchema> columns = new ArrayList<>();
	  // The three date parts share the same column type.
	  for (String datePart : new String[] {"year", "month", "day"}) {
	    columns.add(new TableFieldSchema().setName(datePart).setType("INTEGER"));
	  }
	  columns.add(new TableFieldSchema().setName("mean_temp").setType("FLOAT"));
	  return new TableSchema().setFields(columns);
	}

	/**
	 * Entry point: parses and validates the options, assembles the pipeline
	 * (read, project, filter below the global mean, write) and runs it to completion.
	 *
	 * @param args command-line arguments, parsed into {@link Options}
	 */
	public static void main(String[] args)
	    throws Exception {

	  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	  Pipeline p = Pipeline.create(options);

	  TableSchema schema = buildWeatherSchemaProjection();

	  // Read the input table and keep only the cells of interest.
	  PCollection<TableRow> projected = p
	      .apply(BigQueryIO.read().from(options.getInput()))
	      .apply(ParDo.of(new ProjectionFn()));

	  // Keep the rows of the selected month that are colder than the global mean.
	  PCollection<TableRow> belowMean =
	      projected.apply(new BelowGlobalMean(options.getMonthFilter()));

	  // Write the result, creating the table if needed and replacing any existing contents.
	  belowMean.apply(BigQueryIO.writeTableRows()
	      .to(options.getOutput())
	      .withSchema(schema)
	      .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
	      .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

	  p.run().waitUntilFinish();
	}
	}