sdks/python/apache_beam/yaml/examples/wordcount_minimal.yaml - beam - Git at Google

 # coding=utf-8
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 # This examples reads from a public file stored on Google Cloud. This
 # requires authenticating with Google Cloud, or setting the file in
 #`ReadFromText` to a local file.
 #
 # To set up Application Default Credentials,
 # see https://cloud.google.com/docs/authentication/external/set-up-adc.
 #
 # This pipeline reads in a text file, counts distinct words found in the text,
 # then logs a row containing each word and its count.
 pipeline:
   type: chain
   transforms:

     # Read text file into a collection of rows, each with one field, "line"
     - type: ReadFromText
       config:
         path: gs://dataflow-samples/shakespeare/kinglear.txt

     # Split line field in each row into list of words
     - type: MapToFields
       config:
         language: python
         fields:
           words:
             callable: |
               import re
               def my_mapping(row):
                 return re.findall(r"[A-Za-z\']+", row.line.lower())

     # Explode each list of words into separate rows
     - type: Explode
       config:
         fields: words

     # Since each word is now distinct row, rename field to "word"
     - type: MapToFields
       config:
         fields:
           word: words

     # Group by distinct words in the collection and add field "count" that
     # contains number of instances, or count, for each word in the collection.
     - type: Combine
       config:
         language: python
         group_by: word
         combine:
           count:
             value: word
             fn: count

     # Log out results
     - type: LogForTesting

 # Expected:
 #  Row(word='king', count=311)
 #  Row(word='lear', count=253)
 #  Row(word='dramatis', count=1)
 #  Row(word='personae', count=1)
 #  Row(word='of', count=483)
 #  Row(word='britain', count=2)
 #  Row(word='france', count=32)
 #  Row(word='duke', count=26)
 #  Row(word='burgundy', count=20)
 #  Row(word='cornwall', count=75)
	# coding=utf-8
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	# This examples reads from a public file stored on Google Cloud. This
	# requires authenticating with Google Cloud, or setting the file in
	#`ReadFromText` to a local file.
	#
	# To set up Application Default Credentials,
	# see https://cloud.google.com/docs/authentication/external/set-up-adc.
	#
	# This pipeline reads in a text file, counts distinct words found in the text,
	# then logs a row containing each word and its count.
	pipeline:
	type: chain
	transforms:

	# Read text file into a collection of rows, each with one field, "line"
	- type: ReadFromText
	config:
	path: gs://dataflow-samples/shakespeare/kinglear.txt

	# Split line field in each row into list of words
	- type: MapToFields
	config:
	language: python
	fields:
	words:
	callable: \|
	import re
	def my_mapping(row):
	return re.findall(r"[A-Za-z\']+", row.line.lower())

	# Explode each list of words into separate rows
	- type: Explode
	config:
	fields: words

	# Since each word is now distinct row, rename field to "word"
	- type: MapToFields
	config:
	fields:
	word: words

	# Group by distinct words in the collection and add field "count" that
	# contains number of instances, or count, for each word in the collection.
	- type: Combine
	config:
	language: python
	group_by: word
	combine:
	count:
	value: word
	fn: count

	# Log out results
	- type: LogForTesting

	# Expected:
	# Row(word='king', count=311)
	# Row(word='lear', count=253)
	# Row(word='dramatis', count=1)
	# Row(word='personae', count=1)
	# Row(word='of', count=483)
	# Row(word='britain', count=2)
	# Row(word='france', count=32)
	# Row(word='duke', count=26)
	# Row(word='burgundy', count=20)
	# Row(word='cornwall', count=75)