Merge pull request #9821 from nahuellofeudo/logs-cleanup
Avoid polluting Stackdriver logs with noise during autoscaling events.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 1aa17d4..b9dd9d6 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -8,6 +8,8 @@
- [ ] Format the pull request title like `[BEAM-XXX] Fixes bug in ApproximateQuantiles`, where you replace `BEAM-XXX` with the appropriate JIRA issue, if applicable. This will automatically link the pull request to the issue.
- [ ] If this contribution is large, please file an Apache [Individual Contributor License Agreement](https://www.apache.org/licenses/icla.pdf).
+See the [Contributor Guide](https://beam.apache.org/contribute) for more tips on [how to make review process smoother](https://beam.apache.org/contribute/#make-reviewers-job-easier).
+
Post-Commit Tests Status (on master branch)
------------------------------------------------------------------------------------------------
diff --git a/.test-infra/dataproc/flink_cluster.sh b/.test-infra/dataproc/flink_cluster.sh
index 86d9b23..78a45c4 100755
--- a/.test-infra/dataproc/flink_cluster.sh
+++ b/.test-infra/dataproc/flink_cluster.sh
@@ -24,6 +24,7 @@
# JOB_SERVER_IMAGE: Url to job server docker image to pull on dataproc master (optional)
# ARTIFACTS_DIR: Url to bucket where artifacts will be stored for staging (optional)
# FLINK_DOWNLOAD_URL: Url to Flink .tar archive to be installed on the cluster
+# HADOOP_DOWNLOAD_URL: Url to a pre-packaged Hadoop jar
# FLINK_NUM_WORKERS: Number of Flink workers
# FLINK_TASKMANAGER_SLOTS: Number of slots per Flink task manager
# DETACHED_MODE: Detached mode: should the SSH tunnel run in detached mode?
@@ -34,7 +35,8 @@
# HARNESS_IMAGES_TO_PULL='gcr.io/<IMAGE_REPOSITORY>/python:latest gcr.io/<IMAGE_REPOSITORY>/java:latest' \
# JOB_SERVER_IMAGE=gcr.io/<IMAGE_REPOSITORY>/job-server-flink:latest \
# ARTIFACTS_DIR=gs://<bucket-for-artifacts> \
-# FLINK_DOWNLOAD_URL=http://archive.apache.org/dist/flink/flink-1.7.0/flink-1.7.0-bin-hadoop28-scala_2.12.tgz \
+# FLINK_DOWNLOAD_URL=https://archive.apache.org/dist/flink/flink-1.9.1/flink-1.9.1-bin-scala_2.11.tgz \
+# HADOOP_DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-7.0/flink-shaded-hadoop-2-uber-2.8.3-7.0.jar \
# FLINK_NUM_WORKERS=2 \
# FLINK_TASKMANAGER_SLOTS=1 \
# DETACHED_MODE=false \
@@ -118,7 +120,8 @@
function create_cluster() {
local metadata="flink-snapshot-url=${FLINK_DOWNLOAD_URL},"
metadata+="flink-start-yarn-session=true,"
- metadata+="flink-taskmanager-slots=${FLINK_TASKMANAGER_SLOTS}"
+ metadata+="flink-taskmanager-slots=${FLINK_TASKMANAGER_SLOTS},"
+ metadata+="hadoop-jar-url=${HADOOP_DOWNLOAD_URL}"
[[ -n "${HARNESS_IMAGES_TO_PULL:=}" ]] && metadata+=",beam-sdk-harness-images-to-pull=${HARNESS_IMAGES_TO_PULL}"
[[ -n "${JOB_SERVER_IMAGE:=}" ]] && metadata+=",beam-job-server-image=${JOB_SERVER_IMAGE}"
@@ -131,7 +134,7 @@
# Docker init action restarts yarn so we need to start yarn session after this restart happens.
# This is why flink init action is invoked last.
- gcloud dataproc clusters create $CLUSTER_NAME --num-workers=$num_dataproc_workers --initialization-actions $DOCKER_INIT,$BEAM_INIT,$FLINK_INIT --metadata "${metadata}", --image-version=$image_version --zone=$GCLOUD_ZONE --quiet
+ gcloud dataproc clusters create $CLUSTER_NAME --region=global --num-workers=$num_dataproc_workers --initialization-actions $DOCKER_INIT,$BEAM_INIT,$FLINK_INIT --metadata "${metadata}", --image-version=$image_version --zone=$GCLOUD_ZONE --quiet
}
# Runs init actions for Docker, Portability framework (Beam) and Flink cluster
@@ -152,7 +155,7 @@
# Deletes a Flink cluster.
function delete() {
- gcloud dataproc clusters delete $CLUSTER_NAME --quiet
+ gcloud dataproc clusters delete $CLUSTER_NAME --region=global --quiet
}
"$@"
diff --git a/.test-infra/dataproc/init-actions/flink.sh b/.test-infra/dataproc/init-actions/flink.sh
index 1959872..7e06b7e 100644
--- a/.test-infra/dataproc/init-actions/flink.sh
+++ b/.test-infra/dataproc/init-actions/flink.sh
@@ -56,6 +56,9 @@
# Set this to install flink from a snapshot URL instead of apt
readonly FLINK_SNAPSHOT_URL_METADATA_KEY='flink-snapshot-url'
+# Set this to install pre-packaged Hadoop jar
+readonly HADOOP_JAR_URL_METADATA_KEY='hadoop-jar-url'
+
# Set this to define how many task slots are there per flink task manager
readonly FLINK_TASKMANAGER_SLOTS_METADATA_KEY='flink-taskmanager-slots'
@@ -88,6 +91,7 @@
function install_flink_snapshot() {
local work_dir="$(mktemp -d)"
local flink_url="$(/usr/share/google/get_metadata_value "attributes/${FLINK_SNAPSHOT_URL_METADATA_KEY}")"
+ local hadoop_url="$(/usr/share/google/get_metadata_value "attributes/${HADOOP_JAR_URL_METADATA_KEY}")"
local flink_local="${work_dir}/flink.tgz"
local flink_toplevel_pattern="${work_dir}/flink-*"
@@ -103,6 +107,9 @@
popd # work_dir
+ if [[ ! -z "${hadoop_url}" ]]; then
+ cd "${FLINK_INSTALL_DIR}/lib"; curl -O "${hadoop_url}"
+ fi
}
function configure_flink() {
@@ -205,4 +212,4 @@
fi
}
-main
\ No newline at end of file
+main
diff --git a/.test-infra/jenkins/CommonTestProperties.groovy b/.test-infra/jenkins/CommonTestProperties.groovy
index 203e398..0d750ee 100644
--- a/.test-infra/jenkins/CommonTestProperties.groovy
+++ b/.test-infra/jenkins/CommonTestProperties.groovy
@@ -35,7 +35,7 @@
JAVA: [
DATAFLOW: ":runners:google-cloud-dataflow-java",
SPARK: ":runners:spark",
- FLINK: ":runners:flink:1.8",
+ FLINK: ":runners:flink:1.9",
DIRECT: ":runners:direct-java"
],
PYTHON: [
diff --git a/.test-infra/jenkins/Flink.groovy b/.test-infra/jenkins/Flink.groovy
index a986d64..0c0df64 100644
--- a/.test-infra/jenkins/Flink.groovy
+++ b/.test-infra/jenkins/Flink.groovy
@@ -17,7 +17,8 @@
*/
class Flink {
- private static final String flinkDownloadUrl = 'https://archive.apache.org/dist/flink/flink-1.7.0/flink-1.7.0-bin-hadoop28-scala_2.11.tgz'
+ private static final String flinkDownloadUrl = 'https://archive.apache.org/dist/flink/flink-1.9.1/flink-1.9.1-bin-scala_2.11.tgz'
+ private static final String hadoopDownloadUrl = 'https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-7.0/flink-shaded-hadoop-2-uber-2.8.3-7.0.jar'
private static final String FLINK_DIR = '"$WORKSPACE/src/.test-infra/dataproc"'
private static final String FLINK_SCRIPT = 'flink_cluster.sh'
private def job
@@ -53,6 +54,7 @@
env("CLUSTER_NAME", clusterName)
env("GCS_BUCKET", gcsBucket)
env("FLINK_DOWNLOAD_URL", flinkDownloadUrl)
+ env("HADOOP_DOWNLOAD_URL", hadoopDownloadUrl)
env("FLINK_NUM_WORKERS", workerCount)
env("FLINK_TASKMANAGER_SLOTS", slotsPerTaskmanager)
env("DETACHED_MODE", 'true')
diff --git a/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy
index 2f847e7..884cbc8 100644
--- a/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy
+++ b/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy
@@ -105,7 +105,7 @@
List<Map> testScenarios = scenarios(datasetName, pythonHarnessImageTag)
publisher.publish(':sdks:python:container:py2:docker', 'python2.7_sdk')
- publisher.publish(':runners:flink:1.7:job-server-container:docker', 'flink-job-server')
+ publisher.publish(':runners:flink:1.9:job-server-container:docker', 'flink-job-server')
def flink = new Flink(scope, 'beam_LoadTests_Python_Combine_Flink_Batch')
flink.setUp([pythonHarnessImageTag], numberOfWorkers, publisher.getFullImageName('flink-job-server'))
diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy
index 7f3bb21..d6b497f 100644
--- a/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy
+++ b/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy
@@ -172,7 +172,7 @@
List<Map> testScenarios = scenarios(datasetName, pythonHarnessImageTag)
publisher.publish(':sdks:python:container:py2:docker', 'python2.7_sdk')
- publisher.publish(':runners:flink:1.7:job-server-container:docker', 'flink-job-server')
+ publisher.publish(':runners:flink:1.9:job-server-container:docker', 'flink-job-server')
def flink = new Flink(scope, 'beam_LoadTests_Python_GBK_Flink_Batch')
flink.setUp([pythonHarnessImageTag], numberOfWorkers, publisher.getFullImageName('flink-job-server'))
diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy
index 97c37dd..5841ae3 100644
--- a/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy
+++ b/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy
@@ -129,7 +129,7 @@
List<Map> testScenarios = scenarios(datasetName, pythonHarnessImageTag)
publisher.publish(':sdks:python:container:py2:docker', 'python2.7_sdk')
- publisher.publish(':runners:flink:1.7:job-server-container:docker', 'flink-job-server')
+ publisher.publish(':runners:flink:1.9:job-server-container:docker', 'flink-job-server')
Flink flink = new Flink(scope, 'beam_LoadTests_Python_ParDo_Flink_Batch')
flink.setUp([pythonHarnessImageTag], numberOfWorkers, publisher.getFullImageName('flink-job-server'))
diff --git a/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy
index 026d197..b6549c8 100644
--- a/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy
+++ b/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy
@@ -157,7 +157,7 @@
List<Map> testScenarios = scenarios(datasetName, pythonHarnessImageTag)
publisher.publish(':sdks:python:container:py2:docker', 'python2.7_sdk')
- publisher.publish('runners:flink:1.7:job-server-container:docker', 'flink-job-server')
+ publisher.publish(':runners:flink:1.9:job-server-container:docker', 'flink-job-server')
def flink = new Flink(scope, 'beam_LoadTests_Python_CoGBK_Flink_Batch')
flink.setUp([pythonHarnessImageTag], numberOfWorkers, publisher.getFullImageName('flink-job-server'))
diff --git a/.test-infra/jenkins/job_PerformanceTests_FileBasedIO_IT.groovy b/.test-infra/jenkins/job_PerformanceTests_FileBasedIO_IT.groovy
index ddccd5d..fdb82b7 100644
--- a/.test-infra/jenkins/job_PerformanceTests_FileBasedIO_IT.groovy
+++ b/.test-infra/jenkins/job_PerformanceTests_FileBasedIO_IT.groovy
@@ -28,11 +28,12 @@
pipelineOptions : [
bigQueryDataset : 'beam_performance',
bigQueryTable : 'textioit_results',
- numberOfRecords : '1000000',
+ numberOfRecords : '25000000',
+ expectedHash : 'f8453256ccf861e8a312c125dfe0e436',
+ datasetSize : '1062290000',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
]
-
],
[
name : 'beam_PerformanceTests_Compressed_TextIOIT',
@@ -43,7 +44,9 @@
pipelineOptions : [
bigQueryDataset : 'beam_performance',
bigQueryTable : 'compressed_textioit_results',
- numberOfRecords : '1000000',
+ numberOfRecords : '450000000',
+ expectedHash : '8a3de973354abc6fba621c6797cc0f06',
+ datasetSize : '1097840000',
compressionType : 'GZIP',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
@@ -60,7 +63,9 @@
bigQueryTable : 'many_files_textioit_results',
reportGcsPerformanceMetrics: 'true',
gcsPerformanceMetrics : 'true',
- numberOfRecords : '1000000',
+ numberOfRecords : '25000000',
+ expectedHash : 'f8453256ccf861e8a312c125dfe0e436',
+ datasetSize : '1062290000',
numberOfShards : '1000',
numWorkers : '5',
autoscalingAlgorithm : 'NONE'
@@ -74,7 +79,9 @@
githubTitle : 'Java AvroIO Performance Test',
githubTriggerPhrase: 'Run Java AvroIO Performance Test',
pipelineOptions : [
- numberOfRecords : '1000000',
+ numberOfRecords : '225000000',
+ expectedHash : '2f9f5ca33ea464b25109c0297eb6aecb',
+ datasetSize : '1089730000',
bigQueryDataset : 'beam_performance',
bigQueryTable : 'avroioit_results',
numWorkers : '5',
@@ -90,7 +97,9 @@
pipelineOptions : [
bigQueryDataset : 'beam_performance',
bigQueryTable : 'tfrecordioit_results',
- numberOfRecords : '1000000',
+ numberOfRecords : '18000000',
+ expectedHash : '543104423f8b6eb097acb9f111c19fe4',
+ datasetSize : '1019380000',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
]
@@ -104,7 +113,9 @@
pipelineOptions : [
bigQueryDataset : 'beam_performance',
bigQueryTable : 'xmlioit_results',
- numberOfRecords : '100000000',
+ numberOfRecords : '12000000',
+ expectedHash : 'b3b717e7df8f4878301b20f314512fb3',
+ datasetSize : '1076590000',
charset : 'UTF-8',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
@@ -119,7 +130,9 @@
pipelineOptions : [
bigQueryDataset : 'beam_performance',
bigQueryTable : 'parquetioit_results',
- numberOfRecords : '100000000',
+ numberOfRecords : '225000000',
+ expectedHash : '2f9f5ca33ea464b25109c0297eb6aecb',
+ datasetSize : '1087370000',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
]
@@ -133,7 +146,9 @@
pipelineOptions : [
bigQueryDataset : 'beam_performance',
bigQueryTable : 'textioit_hdfs_results',
- numberOfRecords : '1000000',
+ numberOfRecords : '25000000',
+ expectedHash : 'f8453256ccf861e8a312c125dfe0e436',
+ datasetSize : '1062290000',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
]
@@ -148,7 +163,9 @@
pipelineOptions : [
bigQueryDataset : 'beam_performance',
bigQueryTable : 'compressed_textioit_hdfs_results',
- numberOfRecords : '1000000',
+ numberOfRecords : '450000000',
+ expectedHash : '8a3de973354abc6fba621c6797cc0f06',
+ datasetSize : '1097840000',
compressionType : 'GZIP',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
@@ -165,7 +182,9 @@
bigQueryTable : 'many_files_textioit_hdfs_results',
reportGcsPerformanceMetrics: 'true',
gcsPerformanceMetrics : 'true',
- numberOfRecords : '1000000',
+ numberOfRecords : '25000000',
+ expectedHash : 'f8453256ccf861e8a312c125dfe0e436',
+ datasetSize : '1062290000',
numberOfShards : '1000',
numWorkers : '5',
autoscalingAlgorithm : 'NONE'
@@ -181,7 +200,9 @@
pipelineOptions : [
bigQueryDataset : 'beam_performance',
bigQueryTable : 'avroioit_hdfs_results',
- numberOfRecords : '1000000',
+ numberOfRecords : '225000000',
+ expectedHash : '2f9f5ca33ea464b25109c0297eb6aecb',
+ datasetSize : '1089730000',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
]
@@ -193,7 +214,9 @@
githubTitle : 'Java TFRecordIO Performance Test on HDFS',
githubTriggerPhrase: 'Run Java TFRecordIO Performance Test HDFS',
pipelineOptions : [
- numberOfRecords : '1000000',
+ numberOfRecords : '18000000',
+ expectedHash : '543104423f8b6eb097acb9f111c19fe4',
+ datasetSize : '1019380000',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
]
@@ -207,7 +230,9 @@
pipelineOptions : [
bigQueryDataset : 'beam_performance',
bigQueryTable : 'xmlioit_hdfs_results',
- numberOfRecords : '100000',
+ numberOfRecords : '12000000',
+ expectedHash : 'b3b717e7df8f4878301b20f314512fb3',
+ datasetSize : '1076590000',
charset : 'UTF-8',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
@@ -222,7 +247,9 @@
pipelineOptions : [
bigQueryDataset : 'beam_performance',
bigQueryTable : 'parquetioit_hdfs_results',
- numberOfRecords : '1000000',
+ numberOfRecords : '225000000',
+ expectedHash : '2f9f5ca33ea464b25109c0297eb6aecb',
+ datasetSize : '1087370000',
numWorkers : '5',
autoscalingAlgorithm: 'NONE'
]
diff --git a/.test-infra/jenkins/job_PostCommit_CrossLanguageValidatesRunner_Flink.groovy b/.test-infra/jenkins/job_PostCommit_CrossLanguageValidatesRunner_Flink.groovy
index e4bf45e..5a9a238 100644
--- a/.test-infra/jenkins/job_PostCommit_CrossLanguageValidatesRunner_Flink.groovy
+++ b/.test-infra/jenkins/job_PostCommit_CrossLanguageValidatesRunner_Flink.groovy
@@ -36,7 +36,7 @@
steps {
gradle {
rootBuildScriptDir(commonJobProperties.checkoutDir)
- tasks(':runners:flink:1.8:job-server:validatesCrossLanguageRunner')
+ tasks(':runners:flink:1.9:job-server:validatesCrossLanguageRunner')
commonJobProperties.setGradleSwitches(delegate)
}
}
diff --git a/.test-infra/jenkins/job_PostCommit_Java11_ValidatesRunner_PortabilityApi_Dataflow.groovy b/.test-infra/jenkins/job_PostCommit_Java11_ValidatesRunner_PortabilityApi_Dataflow.groovy
index fa5053a..6ac6ff8 100644
--- a/.test-infra/jenkins/job_PostCommit_Java11_ValidatesRunner_PortabilityApi_Dataflow.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Java11_ValidatesRunner_PortabilityApi_Dataflow.groovy
@@ -25,7 +25,7 @@
description('Runs the ValidatesRunner suite on the Java 11 enabled Dataflow PortabilityApi runner.')
- commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 180)
+ commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 270)
publishers {
archiveJunit('**/build/test-results/**/*.xml')
diff --git a/.test-infra/jenkins/job_PostCommit_Java_Nexmark_Flink.groovy b/.test-infra/jenkins/job_PostCommit_Java_Nexmark_Flink.groovy
index 038d6b3..cbcd0ba 100644
--- a/.test-infra/jenkins/job_PostCommit_Java_Nexmark_Flink.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Java_Nexmark_Flink.groovy
@@ -40,7 +40,7 @@
rootBuildScriptDir(commonJobProperties.checkoutDir)
tasks(':sdks:java:testing:nexmark:run')
commonJobProperties.setGradleSwitches(delegate)
- switches('-Pnexmark.runner=":runners:flink:1.8"' +
+ switches('-Pnexmark.runner=":runners:flink:1.9"' +
' -Pnexmark.args="' +
[NexmarkBigqueryProperties.nexmarkBigQueryArgs,
'--streaming=false',
@@ -55,7 +55,7 @@
rootBuildScriptDir(commonJobProperties.checkoutDir)
tasks(':sdks:java:testing:nexmark:run')
commonJobProperties.setGradleSwitches(delegate)
- switches('-Pnexmark.runner=":runners:flink:1.8"' +
+ switches('-Pnexmark.runner=":runners:flink:1.9"' +
' -Pnexmark.args="' +
[NexmarkBigqueryProperties.nexmarkBigQueryArgs,
'--streaming=true',
@@ -70,7 +70,7 @@
rootBuildScriptDir(commonJobProperties.checkoutDir)
tasks(':sdks:java:testing:nexmark:run')
commonJobProperties.setGradleSwitches(delegate)
- switches('-Pnexmark.runner=":runners:flink:1.8"' +
+ switches('-Pnexmark.runner=":runners:flink:1.9"' +
' -Pnexmark.args="' +
[NexmarkBigqueryProperties.nexmarkBigQueryArgs,
'--queryLanguage=sql',
@@ -85,7 +85,7 @@
rootBuildScriptDir(commonJobProperties.checkoutDir)
tasks(':sdks:java:testing:nexmark:run')
commonJobProperties.setGradleSwitches(delegate)
- switches('-Pnexmark.runner=":runners:flink:1.8"' +
+ switches('-Pnexmark.runner=":runners:flink:1.9"' +
' -Pnexmark.args="' +
[NexmarkBigqueryProperties.nexmarkBigQueryArgs,
'--queryLanguage=sql',
diff --git a/.test-infra/jenkins/job_PostCommit_Java_PortableValidatesRunner_Flink_Batch.groovy b/.test-infra/jenkins/job_PostCommit_Java_PortableValidatesRunner_Flink_Batch.groovy
index 4da75f9..d618688 100644
--- a/.test-infra/jenkins/job_PostCommit_Java_PortableValidatesRunner_Flink_Batch.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Java_PortableValidatesRunner_Flink_Batch.groovy
@@ -36,7 +36,7 @@
steps {
gradle {
rootBuildScriptDir(commonJobProperties.checkoutDir)
- tasks(':runners:flink:1.8:job-server:validatesPortableRunnerBatch')
+ tasks(':runners:flink:1.9:job-server:validatesPortableRunnerBatch')
commonJobProperties.setGradleSwitches(delegate)
}
}
diff --git a/.test-infra/jenkins/job_PostCommit_Java_PortableValidatesRunner_Flink_Streaming.groovy b/.test-infra/jenkins/job_PostCommit_Java_PortableValidatesRunner_Flink_Streaming.groovy
index 612c154..bf4708a 100644
--- a/.test-infra/jenkins/job_PostCommit_Java_PortableValidatesRunner_Flink_Streaming.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Java_PortableValidatesRunner_Flink_Streaming.groovy
@@ -36,7 +36,7 @@
steps {
gradle {
rootBuildScriptDir(commonJobProperties.checkoutDir)
- tasks(':runners:flink:1.8:job-server:validatesPortableRunnerStreaming')
+ tasks(':runners:flink:1.9:job-server:validatesPortableRunnerStreaming')
commonJobProperties.setGradleSwitches(delegate)
}
}
diff --git a/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Dataflow.groovy b/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Dataflow.groovy
index 530fba6..32527bb 100644
--- a/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Dataflow.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Dataflow.groovy
@@ -27,8 +27,7 @@
description('Runs the ValidatesRunner suite on the Dataflow runner.')
- // Set common parameters. Sets a 3 hour timeout.
- commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 300)
+ commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 270)
previousNames(/beam_PostCommit_Java_ValidatesRunner_Dataflow_Gradle/)
// Publish all test results to Jenkins
diff --git a/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Dataflow_Java11.groovy b/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Dataflow_Java11.groovy
index 74e49b6..a1e7fc9 100644
--- a/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Dataflow_Java11.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Dataflow_Java11.groovy
@@ -25,7 +25,7 @@
description('Runs the ValidatesRunner suite on the Dataflow runner with Java 11 worker harness.')
- commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 180)
+ commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 270)
publishers {
archiveJunit('**/build/test-results/**/*.xml')
diff --git a/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Flink.groovy b/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Flink.groovy
index d5e6da9..499947e 100644
--- a/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Flink.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_Flink.groovy
@@ -37,7 +37,7 @@
steps {
gradle {
rootBuildScriptDir(commonJobProperties.checkoutDir)
- tasks(':runners:flink:1.8:validatesRunner')
+ tasks(':runners:flink:1.9:validatesRunner')
commonJobProperties.setGradleSwitches(delegate)
}
}
diff --git a/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_PortabilityApi_Dataflow.groovy b/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_PortabilityApi_Dataflow.groovy
index 54ad764..357b473 100644
--- a/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_PortabilityApi_Dataflow.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Java_ValidatesRunner_PortabilityApi_Dataflow.groovy
@@ -28,8 +28,7 @@
description('Runs the ValidatesRunner suite on the Dataflow PortabilityApi runner.')
previousNames(/beam_PostCommit_Java_ValidatesRunner_PortabilityApi_Dataflow_Gradle/)
- // Set common parameters. Sets a 3 hour timeout.
- commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 400)
+ commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 270)
// Publish all test results to Jenkins
publishers {
diff --git a/.test-infra/jenkins/job_PostCommit_PortableJar_Flink.groovy b/.test-infra/jenkins/job_PostCommit_PortableJar_Flink.groovy
index a2bc53e..80b2aa3 100644
--- a/.test-infra/jenkins/job_PostCommit_PortableJar_Flink.groovy
+++ b/.test-infra/jenkins/job_PostCommit_PortableJar_Flink.groovy
@@ -31,7 +31,7 @@
steps {
gradle {
rootBuildScriptDir(commonJobProperties.checkoutDir)
- tasks(':runners:flink:1.8:job-server:testPipelineJar')
+ tasks(':runners:flink:1.9:job-server:testPipelineJar')
commonJobProperties.setGradleSwitches(delegate)
}
}
diff --git a/.test-infra/jenkins/job_PostCommit_Python37.groovy b/.test-infra/jenkins/job_PostCommit_Python37.groovy
index ea511cd..e4f2e17 100644
--- a/.test-infra/jenkins/job_PostCommit_Python37.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Python37.groovy
@@ -27,7 +27,7 @@
previousNames('/beam_PostCommit_Python3_Verify/')
// Set common parameters.
- commonJobProperties.setTopLevelMainJobProperties(delegate)
+ commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 150)
publishers {
archiveJunit('**/nosetests*.xml')
diff --git a/build.gradle b/build.gradle
index 55193e7..59b1496 100644
--- a/build.gradle
+++ b/build.gradle
@@ -100,6 +100,10 @@
"**/*.json",
// Katas files
+ "learning/katas/**/course-remote-info.yaml",
+ "learning/katas/**/section-remote-info.yaml",
+ "learning/katas/**/lesson-remote-info.yaml",
+ "learning/katas/**/task-remote-info.yaml",
"learning/katas/*/IO/**/*.txt",
// Mockito extensions
diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy
index b7e9d2a..cf4fab4 100644
--- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy
+++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy
@@ -78,9 +78,6 @@
/** A class defining the set of configurable properties accepted by applyJavaNature. */
class JavaNatureConfiguration {
- /** Controls the JDK source language and target compatibility. */
- double javaVersion = 1.8
-
/** Controls whether the spotbugs plugin is enabled and configured. */
boolean enableSpotbugs = true
@@ -300,7 +297,7 @@
// Automatically use the official release version if we are performing a release
// otherwise append '-SNAPSHOT'
- project.version = '2.17.0'
+ project.version = '2.18.0'
if (!isRelease(project)) {
project.version += '-SNAPSHOT'
}
@@ -370,7 +367,7 @@
def generated_grpc_ga_version = "1.43.0"
def generated_grpc_dc_beta_version = "0.27.0-alpha"
def google_auth_version = "0.12.0"
- def google_clients_version = "1.27.0"
+ def google_clients_version = "1.28.0"
def google_cloud_bigdataoss_version = "1.9.16"
def google_cloud_core_version = "1.61.0"
def google_cloud_spanner_version = "1.6.0"
@@ -440,12 +437,12 @@
google_api_client_jackson2 : "com.google.api-client:google-api-client-jackson2:$google_clients_version",
google_api_client_java6 : "com.google.api-client:google-api-client-java6:$google_clients_version",
google_api_common : "com.google.api:api-common:1.7.0",
- google_api_services_bigquery : "com.google.apis:google-api-services-bigquery:v2-rev20181104-$google_clients_version",
- google_api_services_clouddebugger : "com.google.apis:google-api-services-clouddebugger:v2-rev20180801-$google_clients_version",
+ google_api_services_bigquery : "com.google.apis:google-api-services-bigquery:v2-rev20181221-$google_clients_version",
+ google_api_services_clouddebugger : "com.google.apis:google-api-services-clouddebugger:v2-rev20181114-$google_clients_version",
google_api_services_cloudresourcemanager : "com.google.apis:google-api-services-cloudresourcemanager:v1-rev20181015-$google_clients_version",
- google_api_services_dataflow : "com.google.apis:google-api-services-dataflow:v1b3-rev20190607-$google_clients_version",
- google_api_services_pubsub : "com.google.apis:google-api-services-pubsub:v1-rev20181105-$google_clients_version",
- google_api_services_storage : "com.google.apis:google-api-services-storage:v1-rev20181013-$google_clients_version",
+ google_api_services_dataflow : "com.google.apis:google-api-services-dataflow:v1b3-rev20190927-$google_clients_version",
+ google_api_services_pubsub : "com.google.apis:google-api-services-pubsub:v1-rev20181213-$google_clients_version",
+ google_api_services_storage : "com.google.apis:google-api-services-storage:v1-rev20181109-$google_clients_version",
google_auth_library_credentials : "com.google.auth:google-auth-library-credentials:$google_auth_version",
google_auth_library_oauth2_http : "com.google.auth:google-auth-library-oauth2-http:$google_auth_version",
google_cloud_bigquery : "com.google.cloud:google-cloud-bigquery:$google_clients_version",
@@ -596,7 +593,7 @@
// Configures a project with a default set of plugins that should apply to all Java projects.
//
// Users should invoke this method using Groovy map syntax. For example:
- // applyJavaNature(javaVersion: 1.8)
+ // applyJavaNature(enableSpotbugs: true)
//
// See JavaNatureConfiguration for the set of accepted properties.
//
@@ -656,8 +653,8 @@
// Configure the Java compiler source language and target compatibility levels. Also ensure that
// we configure the Java compiler to use UTF-8.
- project.sourceCompatibility = configuration.javaVersion
- project.targetCompatibility = configuration.javaVersion
+ project.sourceCompatibility = project.javaVersion
+ project.targetCompatibility = project.javaVersion
def defaultLintSuppressions = [
'options',
@@ -1303,7 +1300,7 @@
}
if (runner?.equalsIgnoreCase('flink')) {
- testRuntime it.project(path: ":runners:flink:1.8", configuration: 'testRuntime')
+ testRuntime it.project(path: ":runners:flink:1.9", configuration: 'testRuntime')
}
if (runner?.equalsIgnoreCase('spark')) {
@@ -1724,7 +1721,7 @@
dependsOn setupTask
// We need flink-job-server-container dependency since Python PortableRunner automatically
// brings the flink-job-server-container up when --job_endpoint is not specified.
- dependsOn ':runners:flink:1.8:job-server-container:docker'
+ dependsOn ':runners:flink:1.9:job-server-container:docker'
}
mainTask.dependsOn pythonTask
cleanupTask.mustRunAfter pythonTask
@@ -1906,7 +1903,7 @@
project.task('portableWordCount' + (isStreaming ? 'Streaming' : 'Batch')) {
dependsOn = ['installGcpTest']
mustRunAfter = [
- ':runners:flink:1.8:job-server-container:docker',
+ ':runners:flink:1.9:job-server-container:docker',
':sdks:python:container:py2:docker',
':sdks:python:container:py35:docker',
':sdks:python:container:py36:docker',
diff --git a/examples/java/build.gradle b/examples/java/build.gradle
index 3936398..912889e 100644
--- a/examples/java/build.gradle
+++ b/examples/java/build.gradle
@@ -78,7 +78,7 @@
// https://issues.apache.org/jira/browse/BEAM-3583
// apexRunnerPreCommit project(":runners:apex")
directRunnerPreCommit project(path: ":runners:direct-java", configuration: "shadow")
- flinkRunnerPreCommit project(":runners:flink:1.8")
+ flinkRunnerPreCommit project(":runners:flink:1.9")
// TODO: Make the netty version used configurable, we add netty-all 4.1.17.Final so it appears on the classpath
// before 4.1.8.Final defined by Apache Beam
sparkRunnerPreCommit "io.netty:netty-all:4.1.17.Final"
diff --git a/examples/kotlin/build.gradle b/examples/kotlin/build.gradle
index 7b55870..dcfc65d 100644
--- a/examples/kotlin/build.gradle
+++ b/examples/kotlin/build.gradle
@@ -81,7 +81,7 @@
// https://issues.apache.org/jira/browse/BEAM-3583
// apexRunnerPreCommit project(":runners:apex")
directRunnerPreCommit project(path: ":runners:direct-java", configuration: "shadow")
- flinkRunnerPreCommit project(":runners:flink:1.8")
+ flinkRunnerPreCommit project(":runners:flink:1.9")
// TODO: Make the netty version used configurable, we add netty-all 4.1.17.Final so it appears on the classpath
// before 4.1.8.Final defined by Apache Beam
sparkRunnerPreCommit "io.netty:netty-all:4.1.17.Final"
diff --git a/gradle.properties b/gradle.properties
index 412f5a1..3f608c2 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -23,5 +23,7 @@
signing.gnupg.executable=gpg
signing.gnupg.useLegacyGpg=true
-version=2.17.0-SNAPSHOT
-python_sdk_version=2.17.0.dev
+version=2.18.0-SNAPSHOT
+python_sdk_version=2.18.0.dev
+
+javaVersion=1.8
diff --git a/learning/katas/java/.idea/study_project.xml b/learning/katas/java/.idea/study_project.xml
deleted file mode 100644
index cee5d67..0000000
--- a/learning/katas/java/.idea/study_project.xml
+++ /dev/null
@@ -1,3151 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
- <component name="StudySettings">
- <StudyTaskManager>
- <option name="VERSION" value="14" />
- <option name="myUserTests">
- <map />
- </option>
- <option name="course">
- <EduCourse>
- <option name="authors">
- <list>
- <StepikUserInfo>
- <option name="firstName" value="Henry" />
- <option name="id" value="48485817" />
- <option name="lastName" value="Suryawirawan" />
- </StepikUserInfo>
- </list>
- </option>
- <option name="compatible" value="true" />
- <option name="courseMode" value="Course Creator" />
- <option name="createDate" value="1557823043901" />
- <option name="customPresentableName" />
- <option name="description" value="This course provides a series of katas to get familiar with Apache Beam. Apache Beam website – https://beam.apache.org/" />
- <option name="environment" value="" />
- <option name="fromZip" value="false" />
- <option name="id" value="54530" />
- <option name="index" value="-1" />
- <option name="instructors">
- <list>
- <option value="48485817" />
- </list>
- </option>
- <option name="language" value="JAVA 8" />
- <option name="languageCode" value="en" />
- <option name="name" value="Beam Katas - Java" />
- <option name="public" value="true" />
- <option name="sectionIds">
- <list />
- </option>
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="type" value="pycharm11 JAVA 8" />
- <option name="updateDate" value="1560936271000" />
- <option name="items">
- <list>
- <Section>
- <option name="courseId" value="54530" />
- <option name="customPresentableName" />
- <option name="id" value="85639" />
- <option name="index" value="1" />
- <option name="name" value="Introduction" />
- <option name="position" value="1" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1559325015000" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="229506" />
- <option name="index" value="1" />
- <option name="name" value="Hello Beam" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1559325015000" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Hello Beam Pipeline</h2> <p> Apache Beam is an open source, unified model for defining both batch and streaming data-parallel processing pipelines. Using one of the open source Beam SDKs, you build a program that defines the pipeline. The pipeline is then executed by one of Beam’s supported distributed processing back-ends, which include Apache Apex, Apache Flink, Apache Spark, and Google Cloud Dataflow. </p> <p> Beam is particularly useful for Embarrassingly Parallel data processing tasks, in which the problem can be decomposed into many smaller bundles of data that can be processed independently and in parallel. You can also use Beam for Extract, Transform, and Load (ETL) tasks and pure data integration. These tasks are useful for moving data between different storage media and data sources, transforming data into a more desirable format, or loading data onto a new system. </p> <p> To learn more about Apache Beam, refer to <a href="https://beam.apache.org/get-started/beam-overview/">Apache Beam Overview</a>. </p> <p> <b>Kata:</b> Your first kata is to create a simple pipeline that takes a hardcoded input element "Hello Beam". 
</p> <br> <div class="hint"> Hardcoded input can be created using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Create.html"> Create</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#creating-pcollection-in-memory"> "Creating a PCollection from in-memory data"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713723" />
- <option name="index" value="1" />
- <option name="name" value="Hello Beam" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/intro/hello/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1552" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="pipeline.apply(Create.of("Hello Beam"))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/intro/hello/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/intro/hello/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/intro/hello/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936162000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- <Section>
- <option name="courseId" value="54530" />
- <option name="customPresentableName" />
- <option name="id" value="85640" />
- <option name="index" value="2" />
- <option name="name" value="Core Transforms" />
- <option name="position" value="2" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1559325050000" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="229507" />
- <option name="index" value="1" />
- <option name="name" value="Map" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1559325026000" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>ParDo</h2> <p> ParDo is a Beam transform for generic parallel processing. The ParDo processing paradigm is similar to the “Map” phase of a Map/Shuffle/Reduce-style algorithm: a ParDo transform considers each element in the input PCollection, performs some processing function (your user code) on that element, and emits zero, one, or multiple elements to an output PCollection. </p> <p> <b>Kata:</b> Please write a simple ParDo that maps the input element by multiplying it by 10. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/ParDo.html"> ParDo</a> with <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/DoFn.html"> DoFn</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#pardo">"ParDo"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713724" />
- <option name="index" value="1" />
- <option name="name" value="ParDo" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/map/pardo/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1752" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(ParDo.of(new DoFn<Integer, Integer>() { @ProcessElement public void processElement(@Element Integer number, OutputReceiver<Integer> out) { out.output(number * 10); } }))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/map/pardo/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/map/pardo/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/map/pardo/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936166000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>ParDo OneToMany</h2> <p> <b>Kata:</b> Please write a ParDo that maps each input sentence into words tokenized by whitespace (" "). </p> <br> <div class="hint"> You can call <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/DoFn.OutputReceiver.html"> OutputReceiver</a> multiple times in a <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/ParDo.html"> ParDo</a>. </div> <div class="hint"> If you're using Beam version before v2.5.0, you can call <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/DoFn.WindowedContext.html#output-OutputT-"> DoFn.ProcessContext.output(..)</a> multiple times in a <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/ParDo.html">ParDo</a>. </div> </html>" />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713725" />
- <option name="index" value="2" />
- <option name="name" value="ParDo OneToMany" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/map/pardoonetomany/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1777" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(ParDo.of(new DoFn<String, String>() { @ProcessElement public void processElement(@Element String sentence, OutputReceiver<String> out) { String[] words = sentence.split(" "); for (String word : words) { out.output(word); } } }))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/map/pardoonetomany/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/map/pardoonetomany/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/map/pardoonetomany/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936169000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>MapElements</h2> <p> The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. </p> <p> MapElements can be used to simplify DoFn that maps an element to another element (one to one). </p> <p> <b>Kata:</b> Implement a simple map function that multiplies all input elements by 5 using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/MapElements.html"> MapElements.into(...).via(...)</a>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/MapElements.html"> MapElements.into(...).via(...)</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#lightweight-dofns"> "Lightweight DoFns and other abstractions"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713726" />
- <option name="index" value="3" />
- <option name="name" value="MapElements" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/map/mapelements/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1776" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply( MapElements.into(TypeDescriptors.integers()) .via(number -> number * 5) )" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/map/mapelements/Task.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/map/mapelements/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/map/mapelements/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936172000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>FlatMapElements</h2> <p> The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. </p> <p> FlatMapElements can be used to simplify DoFn that maps an element to multiple elements (one to many). </p> <p> <b>Kata:</b> Implement a function that maps each input sentence into words tokenized by whitespace (" ") using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/FlatMapElements.html"> FlatMapElements.into(...).via(...)</a>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/FlatMapElements.html"> FlatMapElements.into(...).via(...)</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#lightweight-dofns"> "Lightweight DoFns and other abstractions"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713727" />
- <option name="index" value="4" />
- <option name="name" value="FlatMapElements" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/map/flatmapelements/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1835" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply( FlatMapElements.into(TypeDescriptors.strings()) .via(sentence -> Arrays.asList(sentence.split(" "))) )" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/map/flatmapelements/Task.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/map/flatmapelements/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/map/flatmapelements/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560791586000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="229508" />
- <option name="index" value="2" />
- <option name="name" value="GroupByKey" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1559325029000" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>GroupByKey</h2> <p> GroupByKey is a Beam transform for processing collections of key/value pairs. It’s a parallel reduction operation, analogous to the Shuffle phase of a Map/Shuffle/Reduce-style algorithm. The input to GroupByKey is a collection of key/value pairs that represents a multimap, where the collection contains multiple pairs that have the same key, but different values. Given such a collection, you use GroupByKey to collect all of the values associated with each unique key. </p> <p> <b>Kata:</b> Implement a <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/GroupByKey.html"> GroupByKey</a> transform that groups words by its first letter. </p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/values/KV.html"> KV</a> and <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/GroupByKey.html"> GroupByKey</a> to solve this problem. 
</div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#groupbykey"> "GroupByKey"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713728" />
- <option name="index" value="1" />
- <option name="name" value="GroupByKey" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/groupbykey/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2025" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input .apply(MapElements.into(kvs(strings(), strings())) .via(word -> KV.of(word.substring(0, 1), word))) .apply(GroupByKey.create())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/groupbykey/Task.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/groupbykey/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/groupbykey/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936177000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="229509" />
- <option name="index" value="3" />
- <option name="name" value="CoGroupByKey" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1559325032000" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>CoGroupByKey</h2> <p> CoGroupByKey performs a relational join of two or more key/value PCollections that have the same key type. </p> <p> <b>Kata:</b> Implement a <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/join/CoGroupByKey.html"> CoGroupByKey</a> transform that join words by its first alphabetical letter, and then produces the toString() representation of the WordsAlphabet model. </p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/join/CoGroupByKey.html"> CoGroupByKey</a>, <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/values/TupleTag.html"> TupleTag</a>, and <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/join/CoGbkResult.html"> CoGbkResult</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#cogroupbykey"> "CoGroupByKey"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713729" />
- <option name="index" value="1" />
- <option name="name" value="CoGroupByKey" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/cogroupbykey/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2418" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="TupleTag<String> fruitsTag = new TupleTag<>(); TupleTag<String> countriesTag = new TupleTag<>(); MapElements<String, KV<String, String>> mapToAlphabetKv = MapElements.into(kvs(strings(), strings())) .via(word -> KV.of(word.substring(0, 1), word)); PCollection<KV<String, String>> fruitsPColl = fruits.apply("Fruit to KV", mapToAlphabetKv); PCollection<KV<String, String>> countriesPColl = countries .apply("Country to KV", mapToAlphabetKv); return KeyedPCollectionTuple .of(fruitsTag, fruitsPColl) .and(countriesTag, countriesPColl) .apply(CoGroupByKey.create()) .apply(ParDo.of(new DoFn<KV<String, CoGbkResult>, String>() { @ProcessElement public void processElement( @Element KV<String, CoGbkResult> element, OutputReceiver<String> out) { String alphabet = element.getKey(); CoGbkResult coGbkResult = element.getValue(); String fruit = coGbkResult.getOnly(fruitsTag); String country = coGbkResult.getOnly(countriesTag); out.output(new WordsAlphabet(alphabet, fruit, country).toString()); } }));" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/cogroupbykey/Task.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/cogroupbykey/WordsAlphabet.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/cogroupbykey/WordsAlphabet.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/cogroupbykey/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/cogroupbykey/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936180000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="229510" />
- <option name="index" value="4" />
- <option name="name" value="Combine" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1559325044000" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Combine - Simple Function</h2> <p> Combine is a Beam transform for combining collections of elements or values in your data. When you apply a Combine transform, you must provide the function that contains the logic for combining the elements or values. The combining function should be commutative and associative, as the function is not necessarily invoked exactly once on all values with a given key. Because the input data (including the value collection) may be distributed across multiple workers, the combining function might be called multiple times to perform partial combining on subsets of the value collection. </p> <p> Simple combine operations, such as sums, can usually be implemented as a simple function. </p> <p> <b>Kata:</b> Implement the summation of numbers using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/SerializableFunction.html"> Combine.globally(SerializableFunction)</a>. 
</p> <br> <div class="hint"> Implement the <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/SerializableFunction.html#apply-InputT-"> SerializableFunction.apply</a> method that performs the summation of the Iterable. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#simple-combines"> "Simple combinations using simple functions"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713730" />
- <option name="index" value="1" />
- <option name="name" value="Simple Function" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/combine/simple/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1923" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="@Override public Integer apply(Iterable<Integer> input) { int sum = 0; for (int item : input) { sum += item; } return sum; }" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/combine/simple/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/combine/simple/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/combine/simple/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936184000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Combine - CombineFn</h2> <p> Combine is a Beam transform for combining collections of elements or values in your data. When you apply a Combine transform, you must provide the function that contains the logic for combining the elements or values. The combining function should be commutative and associative, as the function is not necessarily invoked exactly once on all values with a given key. Because the input data (including the value collection) may be distributed across multiple workers, the combining function might be called multiple times to perform partial combining on subsets of the value collection. </p> <p> Complex combination operations might require you to create a subclass of CombineFn that has an accumulation type distinct from the input/output type. You should use CombineFn if the combine function requires a more sophisticated accumulator, must perform additional pre- or post-processing, might change the output type, or takes the key into account. 
</p> <p> <b>Kata:</b> Implement the average of numbers using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Combine.CombineFn.html"> Combine.CombineFn</a>. </p> <br> <div class="hint"> Extend the <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Combine.CombineFn.html"> Combine.CombineFn</a> class that counts the average of the number. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#advanced-combines"> "Advanced combinations using CombineFn"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713731" />
- <option name="index" value="2" />
- <option name="name" value="CombineFn" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/combine/combinefn/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1962" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="class Accum implements Serializable { int sum = 0; int count = 0; @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } Accum accum = (Accum) o; return sum == accum.sum && count == accum.count; } @Override public int hashCode() { return Objects.hash(sum, count); } } @Override public Accum createAccumulator() { return new Accum(); } @Override public Accum addInput(Accum accumulator, Integer input) { accumulator.sum += input; accumulator.count++; return accumulator; } @Override public Accum mergeAccumulators(Iterable<Accum> accumulators) { Accum merged = createAccumulator(); for (Accum accumulator : accumulators) { merged.sum += accumulator.sum; merged.count += accumulator.count; } return merged; } @Override public Double extractOutput(Accum accumulator) { return ((double) accumulator.sum) / accumulator.count; }" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/combine/combinefn/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/combine/combinefn/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/combine/combinefn/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936188000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Combine - BinaryCombineFn</h2> <p> Combine is a Beam transform for combining collections of elements or values in your data. When you apply a Combine transform, you must provide the function that contains the logic for combining the elements or values. The combining function should be commutative and associative, as the function is not necessarily invoked exactly once on all values with a given key. Because the input data (including the value collection) may be distributed across multiple workers, the combining function might be called multiple times to perform partial combining on subsets of the value collection. </p> <p> BinaryCombineFn is used for implementing combiners that are more easily expressed as binary operations. </p> <p> <b>Kata:</b> Implement the summation of BigInteger using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Combine.BinaryCombineFn.html"> Combine.BinaryCombineFn</a>. 
</p> <br> <div class="hint"> Extend the <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Combine.BinaryCombineFn.html"> Combine.BinaryCombineFn</a> class that counts the sum of the number. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#combine"> "Combine"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713732" />
- <option name="index" value="3" />
- <option name="name" value="BinaryCombineFn" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefn/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2125" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="@Override public BigInteger apply(BigInteger left, BigInteger right) { return left.add(right); }" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefn/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefn/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefn/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936191000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Combine - BinaryCombineFn Lambda</h2> <p> BinaryCombineFn is used for implementing combiners that are more easily expressed as binary operations. </p> <p> Since Beam v2.13.0, you can also use lambda or method reference in order to create the BinaryCombineFn. </p> <p> <b>Kata:</b> Implement the summation of BigInteger using lambda or method reference. </p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/SerializableBiFunction.html"> SerializableBiFunction</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#combine"> "Combine"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="750324" />
- <option name="index" value="4" />
- <option name="name" value="BinaryCombineFn Lambda" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefnlambda/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1922" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(Combine.globally(BigInteger::add))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefnlambda/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefnlambda/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefnlambda/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936195000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Combine - Combine PerKey</h2> <p> After creating a keyed PCollection (for example, by using a GroupByKey transform), a common pattern is to combine the collection of values associated with each key into a single, merged value. This pattern of a GroupByKey followed by merging the collection of values is equivalent to Combine PerKey transform. The combine function you supply to Combine PerKey must be an associative reduction function or a subclass of CombineFn. </p> <p> <b>Kata:</b> Implement the sum of scores per player using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/CombineFnBase.GlobalCombineFn.html"> Combine.perKey</a>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/CombineFnBase.GlobalCombineFn.html"> Combine.perKey(GlobalCombineFn)</a>. </div> <div class="hint"> Extend the <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Combine.BinaryCombineFn.html"> Combine.BinaryCombineFn</a> class that counts the sum of the number. 
</div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#combining-values-in-a-keyed-pcollection"> "Combining values in a keyed PCollection"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713733" />
- <option name="index" value="5" />
- <option name="name" value="Combine PerKey" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/combine/combineperkey/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2155" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(Combine.perKey(new SumIntBinaryCombineFn()))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2295" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="@Override public Integer apply(Integer left, Integer right) { return left + right; }" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/combine/combineperkey/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/combine/combineperkey/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/combine/combineperkey/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936199000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="229511" />
- <option name="index" value="5" />
- <option name="name" value="Flatten" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1559325047000" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Flatten</h2> <p> Flatten is a Beam transform for PCollection objects that store the same data type. Flatten merges multiple PCollection objects into a single logical PCollection. </p> <p> <b>Kata:</b> Implement a <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Flatten.html"> Flatten</a> transform that merges two PCollection of words into a single PCollection. </p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Flatten.html"> Flatten</a> to solve this problem. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#flatten"> "Flatten"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713734" />
- <option name="index" value="1" />
- <option name="name" value="Flatten" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/flatten/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2040" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="PCollectionList.of(words1).and(words2) .apply(Flatten.pCollections())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/flatten/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/flatten/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/flatten/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936202000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="229512" />
- <option name="index" value="6" />
- <option name="name" value="Partition" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1559325050000" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Partition</h2> <p> Partition is a Beam transform for PCollection objects that store the same data type. Partition splits a single PCollection into a fixed number of smaller collections. </p> <p> Partition divides the elements of a PCollection according to a partitioning function that you provide. The partitioning function contains the logic that determines how to split up the elements of the input PCollection into each resulting partition PCollection. </p> <p> <b>Kata:</b> Implement a <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Partition.html"> Partition</a> transform that splits a PCollection of numbers into two PCollections. The first PCollection contains numbers greater than 100, and the second PCollection contains the remaining numbers. </p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Partition.html"> Partition</a> to solve this problem. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#partition"> "Partition"</a> section for more information. 
</div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713735" />
- <option name="index" value="1" />
- <option name="name" value="Partition" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/partition/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1966" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input .apply(Partition.of(2, (PartitionFn<Integer>) (number, numPartitions) -> { if (number > 100) { return 0; } else { return 1; } }))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/partition/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/partition/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/partition/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936206000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237989" />
- <option name="index" value="7" />
- <option name="name" value="Side Input" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560791406453" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Side Input</h2> <p> In addition to the main input PCollection, you can provide additional inputs to a ParDo transform in the form of side inputs. A side input is an additional input that your DoFn can access each time it processes an element in the input PCollection. When you specify a side input, you create a view of some other data that can be read from within the ParDo transform’s DoFn while processing each element. </p> <p> Side inputs are useful if your ParDo needs to inject additional data when processing each element in the input PCollection, but the additional data needs to be determined at runtime (and not hard-coded). Such values might be determined by the input data, or depend on a different branch of your pipeline. </p> <p> <b>Kata:</b> Please enrich each Person with the country based on the city he/she lives in. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/View.html"> View</a> to create PCollectionView of citiesToCountries. 
</div> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/ParDo.html"> ParDo</a> with <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/DoFn.html"> DoFn</a> that accepts <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/ParDo.SingleOutput.html#withSideInputs-org.apache.beam.sdk.values.PCollectionView...-"> side input</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#side-inputs">"Side inputs"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="754085" />
- <option name="index" value="1" />
- <option name="name" value="Side Input" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/sideinput/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2716" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="citiesToCountries.apply(View.asMap())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2914" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="persons.apply(ParDo.of(new DoFn<Person, Person>() { @ProcessElement public void processElement(@Element Person person, OutputReceiver<Person> out, ProcessContext context) { Map<String, String> citiesToCountries = context.sideInput(citiesToCountriesView); String city = person.getCity(); String country = citiesToCountries.get(city); out.output(new Person(person.getName(), city, country)); } }).withSideInputs(citiesToCountriesView))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/sideinput/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/sideinput/Person.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/sideinput/Person.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/sideinput/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/sideinput/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936210000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237990" />
- <option name="index" value="8" />
- <option name="name" value="Side Output" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560791445676" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Side Output</h2> <p> While ParDo always produces a main output PCollection (as the return value from apply), you can also have your ParDo produce any number of additional output PCollections. If you choose to have multiple outputs, your ParDo returns all of the output PCollections (including the main output) bundled together. </p> <p> <b>Kata:</b> Implement additional output to your ParDo for numbers bigger than 100. 
</p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/DoFn.MultiOutputReceiver.html"> MultiOutputReceiver</a> and <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/ParDo.SingleOutput.html#withOutputTags-org.apache.beam.sdk.values.TupleTag-org.apache.beam.sdk.values.TupleTagList-"> .withOutputTags</a> to output multiple tagged-outputs in a <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/ParDo.html"> ParDo.</a> </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#additional-outputs"> "Additional outputs"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="754087" />
- <option name="index" value="1" />
- <option name="name" value="Side Output" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/sideoutput/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2253" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="numbers.apply(ParDo.of(new DoFn<Integer, Integer>() { @ProcessElement public void processElement(@Element Integer number, MultiOutputReceiver out) { if (number <= 100) { out.get(numBelow100Tag).output(number); } else { out.get(numAbove100Tag).output(number); } } }).withOutputTags(numBelow100Tag, TupleTagList.of(numAbove100Tag)))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/sideoutput/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/sideoutput/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/sideoutput/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936215000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237991" />
- <option name="index" value="9" />
- <option name="name" value="Branching" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560791458069" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Branching</h2> <p> You can use the same PCollection as input for multiple transforms without consuming the input or altering it. </p> <p> <b>Kata:</b> Branch out the numbers to two different transforms: one transform is multiplying each number by 5 and the other transform is multiplying each number by 10. </p> <br> <div class="hint"> Refer to the Beam Design Your Pipeline Guide <a href="https://beam.apache.org/documentation/pipelines/design-your-pipeline/#multiple-transforms-process-the-same-pcollection"> "Multiple transforms process the same PCollection"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="754088" />
- <option name="index" value="1" />
- <option name="name" value="Branching" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/branching/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1994" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply("Multiply by 5", MapElements.into(integers()).via(num -> num * 5))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2175" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply("Multiply by 10", MapElements.into(integers()).via(num -> num * 10))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/branching/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/branching/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/branching/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936219000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237192" />
- <option name="index" value="10" />
- <option name="name" value="Composite Transform" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560431460000" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Composite Transform</h2> <p> Transforms can have a nested structure, where a complex transform performs multiple simpler transforms (such as more than one ParDo, Combine, GroupByKey, or even other composite transforms). These transforms are called composite transforms. Nesting multiple transforms inside a single composite transform can make your code more modular and easier to understand. </p> <p> To create your own composite transform, create a subclass of the PTransform class and override the expand method to specify the actual processing logic. You can then use this transform just as you would a built-in transform from the Beam SDK. For the PTransform class type parameters, you pass the PCollection types that your transform takes as input, and produces as output. Within your PTransform subclass, you’ll need to override the expand method. The expand method is where you add the processing logic for the PTransform. Your override of expand must accept the appropriate type of input PCollection as a parameter, and specify the output PCollection as the return value. 
</p> <p> <b>Kata:</b> Please implement a composite transform "ExtractAndMultiplyNumbers" that extracts numbers from comma separated line and then multiplies each number by 10. </p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/PTransform.html"> PTransform</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#composite-transforms"> "Composite transforms"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="750323" />
- <option name="index" value="1" />
- <option name="name" value="Composite Transform" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/composite/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1929" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="@Override public PCollection<Integer> expand(PCollection<String> input) { return input .apply(ParDo.of(new DoFn<String, Integer>() { @ProcessElement public void processElement(@Element String numbers, OutputReceiver<Integer> out) { Arrays.stream(numbers.split(",")) .forEach(numStr -> out.output(Integer.parseInt(numStr))); } })) .apply(MapElements.into(integers()).via(number -> number * 10)); }" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/composite/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/composite/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/composite/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560791618000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237765" />
- <option name="index" value="11" />
- <option name="name" value="DoFn Additional Parameters" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="0" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>DoFn Additional Parameters</h2> <p> In addition to the element and the OutputReceiver, Beam will populate other parameters to your DoFn’s @ProcessElement method. Any combination of these parameters can be added to your process method in any order. </p> <div> <ul> <li> <b>Timestamp</b>: To access the timestamp of an input element, add a parameter annotated with @Timestamp of type Instant </li> <li> <b>Window</b>: To access the window an input element falls into, add a parameter of the type of the window used for the input PCollection. </li> <li> <b>PaneInfo</b>: When triggers are used, Beam provides a PaneInfo object that contains information about the current firing. Using PaneInfo you can determine whether this is an early or a late firing, and how many times this window has already fired for this key. </li> <li> <b>PipelineOptions</b>: The PipelineOptions for the current pipeline can always be accessed in a process method by adding it as a parameter. 
</li> </ul> </div> <p> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#other-dofn-parameters"> "Accessing additional parameters in your DoFn"</a> section for more information. </p> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="753154" />
- <option name="index" value="1" />
- <option name="name" value="DoFn Additional Parameters" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/coretransforms/dofnadditionalparams/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/coretransforms/dofnadditionalparams/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/coretransforms/dofnadditionalparams/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/coretransforms/dofnadditionalparams/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560699463688" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- <Section>
- <option name="courseId" value="54530" />
- <option name="customPresentableName" />
- <option name="id" value="85641" />
- <option name="index" value="3" />
- <option name="name" value="Common Transforms" />
- <option name="position" value="3" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1559325072000" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="229513" />
- <option name="index" value="1" />
- <option name="name" value="Filter" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1559325056000" />
- <option name="unitId" value="202038" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Filter using ParDo</h2> <p> <b>Kata:</b> Implement a filter function that filters out the even numbers by using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/DoFn.html"> DoFn</a>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/ParDo.html"> ParDo</a> with <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/DoFn.html"> DoFn</a> and only output the intended element. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713736" />
- <option name="index" value="1" />
- <option name="name" value="ParDo" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/commontransforms/filter/pardo/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1752" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(ParDo.of( new DoFn<Integer, Integer>() { @ProcessElement public void processElement(@Element Integer number, OutputReceiver<Integer> out) { if (number % 2 == 1) { out.output(number); } } }) )" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/commontransforms/filter/pardo/Task.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/commontransforms/filter/pardo/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/commontransforms/filter/pardo/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936224000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Filter</h2> <p> The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. </p> <p> <b>Kata:</b> Implement a filter function that filters out the odd numbers by using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Filter.html"> Filter</a>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Filter.html"> Filter.by(...)</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713737" />
- <option name="index" value="2" />
- <option name="name" value="Filter" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/commontransforms/filter/filter/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1718" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(Filter.by(number -> number % 2 == 0))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/commontransforms/filter/filter/Task.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/commontransforms/filter/filter/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/commontransforms/filter/filter/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936227000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="229514" />
- <option name="index" value="2" />
- <option name="name" value="Aggregation" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1559325072000" />
- <option name="unitId" value="202039" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Aggregation - Count</h2> <p> <b>Kata:</b> Count the number of elements from an input. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Count.html"> Count</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713738" />
- <option name="index" value="1" />
- <option name="name" value="Count" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/commontransforms/aggregation/count/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1707" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(Count.globally())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/commontransforms/aggregation/count/Task.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/commontransforms/aggregation/count/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/commontransforms/aggregation/count/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936231000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Aggregation - Sum</h2> <p> <b>Kata:</b> Compute the sum of all elements from an input. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Sum.html"> Sum</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713739" />
- <option name="index" value="2" />
- <option name="name" value="Sum" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/commontransforms/aggregation/sum/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1709" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(Sum.integersGlobally())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/commontransforms/aggregation/sum/Task.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/commontransforms/aggregation/sum/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/commontransforms/aggregation/sum/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936235000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Aggregation - Mean</h2> <p> <b>Kata:</b> Compute the mean/average of all elements from an input. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Mean.html"> Mean</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713740" />
- <option name="index" value="3" />
- <option name="name" value="Mean" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/commontransforms/aggregation/mean/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1709" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(Mean.globally())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/commontransforms/aggregation/mean/Task.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/commontransforms/aggregation/mean/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/commontransforms/aggregation/mean/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936238000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Aggregation - Min</h2> <p> <b>Kata:</b> Compute the minimum of the elements from an input. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Min.html"> Min</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713741" />
- <option name="index" value="4" />
- <option name="name" value="Min" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/commontransforms/aggregation/min/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1709" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(Min.integersGlobally())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/commontransforms/aggregation/min/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/commontransforms/aggregation/min/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/commontransforms/aggregation/min/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936242000" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Aggregation - Max</h2> <p> <b>Kata:</b> Compute the maximum of the elements from an input. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Max.html"> Max</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713742" />
- <option name="index" value="5" />
- <option name="name" value="Max" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/commontransforms/aggregation/max/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1709" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(Max.integersGlobally())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/commontransforms/aggregation/max/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/commontransforms/aggregation/max/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/commontransforms/aggregation/max/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936246000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237992" />
- <option name="index" value="3" />
- <option name="name" value="WithKeys" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="updateDate" value="1560791491864" />
- <option name="unitId" value="-1" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>WithKeys</h2> <p> <b>Kata:</b> Convert each fruit name into a KV of its first letter and itself, e.g. <code>apple => KV.of("a", "apple")</code> </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/WithKeys.html"> WithKeys</a>. </div> <div class="hint"> If using a lambda in Java 8, <code>withKeyType(TypeDescriptor)</code> must be called on the result PTransform. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="754089" />
- <option name="index" value="1" />
- <option name="name" value="WithKeys" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/commontransforms/withkeys/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1875" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input .apply(WithKeys.<String, String>of(fruit -> fruit.substring(0, 1)) .withKeyType(strings()))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/commontransforms/withkeys/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/commontransforms/withkeys/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/commontransforms/withkeys/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936249000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- <Section>
- <option name="courseId" value="54530" />
- <option name="customPresentableName" />
- <option name="id" value="88010" />
- <option name="index" value="4" />
- <option name="name" value="IO" />
- <option name="position" value="4" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560431425000" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237187" />
- <option name="index" value="1" />
- <option name="name" value="TextIO" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560431430000" />
- <option name="unitId" value="209563" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>TextIO Read</h2> <p> When you create a pipeline, you often need to read data from some external source, such as a file or a database. Likewise, you may want your pipeline to output its result data to an external storage system. Beam provides read and write transforms for a number of common data storage types. If you want your pipeline to read from or write to a data storage format that isn’t supported by the built-in transforms, you can implement your own read and write transforms. </p> <p> To read a PCollection from one or more text files, use TextIO.read() to instantiate a transform and use TextIO.Read.from(String) to specify the path of the file(s) to be read. </p> <p> <b>Kata:</b> Read the 'countries.txt' file and convert each country name into uppercase. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/TextIO.html"> TextIO</a> and its corresponding <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/TextIO.html#read--"> TextIO.read()</a> method. 
</div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#pipeline-io-reading-data"> "Reading input data"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="750317" />
- <option name="index" value="1" />
- <option name="name" value="TextIO Read" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="countries.txt">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="countries.txt" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="src/org/apache/beam/learning/katas/io/textio/read/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1615" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="TextIO.read().from(FILE_PATH)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1855" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input.apply(MapElements.into(strings()).via(String::toUpperCase))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/io/textio/read/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/io/textio/read/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/io/textio/read/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936253000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237188" />
- <option name="index" value="2" />
- <option name="name" value="Built-in IOs" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560431436000" />
- <option name="unitId" value="209564" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Built-in I/Os</h2> <p> Beam SDKs provide many out of the box I/O transforms that can be used to read from many different sources and write to many different sinks. </p> <p> See the <a href="https://beam.apache.org/documentation/io/built-in/">Beam-provided I/O Transforms</a> page for a list of the currently available I/O transforms. </p> </html>" />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="750319" />
- <option name="index" value="1" />
- <option name="name" value="Built-in IOs" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/io/builtinios/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/io/builtinios/Task.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/io/builtinios/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/io/builtinios/TaskTest.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936257000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- <Section>
- <option name="courseId" value="54530" />
- <option name="customPresentableName" />
- <option name="id" value="88156" />
- <option name="index" value="5" />
- <option name="name" value="Windowing" />
- <option name="position" value="5" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560698891352" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237760" />
- <option name="index" value="1" />
- <option name="name" value="Adding Timestamp" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="0" />
- <option name="unitId" value="210092" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Adding Timestamp - ParDo</h2> <p> Bounded sources (such as a file from TextIO) do not provide timestamps for elements. If you need timestamps, you must add them to your PCollection’s elements. </p> <p> You can assign new timestamps to the elements of a PCollection by applying a ParDo transform that outputs new elements with timestamps that you set. </p> <p> <b>Kata:</b> Please assign each element a timestamp based on the the <code>Event.getDate()</code>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/ParDo.html"> ParDo</a> with <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/DoFn.html"> DoFn</a>. </div> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/DoFn.OutputReceiver.html#outputWithTimestamp-T-org.joda.time.Instant-"> OutputReceiver.outputWithTimestamp</a> method to assign timestamp to the element. 
</div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#adding-timestamps-to-a-pcollections-elements"> "Adding timestamps to a PCollection’s elements"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="753142" />
- <option name="index" value="1" />
- <option name="name" value="ParDo" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/windowing/addingtimestamp/pardo/Event.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/windowing/addingtimestamp/pardo/Event.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="src/org/apache/beam/learning/katas/windowing/addingtimestamp/pardo/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2249" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="events.apply(ParDo.of(new DoFn<Event, Event>() { @ProcessElement public void processElement(@Element Event event, OutputReceiver<Event> out) { out.outputWithTimestamp(event, event.getDate().toInstant()); } }))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/windowing/addingtimestamp/pardo/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/windowing/addingtimestamp/pardo/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/windowing/addingtimestamp/pardo/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560698905262" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Adding Timestamp - WithTimestamps</h2> <p> Bounded sources (such as a file from TextIO) do not provide timestamps for elements. If you need timestamps, you must add them to your PCollection’s elements. </p> <p> You can assign new timestamps to the elements of a PCollection by applying a ParDo transform that outputs new elements with timestamps that you set. </p> <p> <b>Kata:</b> Please assign each element a timestamp based on the the <code>Event.getDate()</code>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/WithTimestamps.html"> WithTimestamps</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#adding-timestamps-to-a-pcollections-elements"> "Adding timestamps to a PCollection’s elements"</a> section for more information. </div> </html>" />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="753143" />
- <option name="index" value="2" />
- <option name="name" value="WithTimestamps" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/windowing/addingtimestamp/withtimestamps/Event.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/windowing/addingtimestamp/withtimestamps/Event.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="src/org/apache/beam/learning/katas/windowing/addingtimestamp/withtimestamps/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2223" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="events.apply(WithTimestamps.of(event -> event.getDate().toInstant()))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/windowing/addingtimestamp/withtimestamps/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/windowing/addingtimestamp/withtimestamps/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/windowing/addingtimestamp/withtimestamps/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560698907450" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237761" />
- <option name="index" value="2" />
- <option name="name" value="Fixed Time Window" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="0" />
- <option name="unitId" value="210093" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Fixed Time Window</h2> <p> Windowing subdivides a PCollection according to the timestamps of its individual elements. Transforms that aggregate multiple elements, such as GroupByKey and Combine, work implicitly on a per-window basis — they process each PCollection as a succession of multiple, finite windows, though the entire collection itself may be of unbounded size. </p> <p> In the Beam model, any PCollection (including unbounded PCollections) can be subdivided into logical windows. Each element in a PCollection is assigned to one or more windows according to the PCollection’s windowing function, and each individual window contains a finite number of elements. Grouping transforms then consider each PCollection’s elements on a per-window basis. GroupByKey, for example, implicitly groups the elements of a PCollection by key and window. </p> <div> Beam provides several windowing functions, including: <ul> <li>Fixed Time Windows</li> <li>Sliding Time Windows</li> <li>Per-Session Windows</li> <li>Single Global Window</li> </ul> </div> <p> The simplest form of windowing is using fixed time windows. 
A fixed time window represents a consistent duration, non overlapping time interval in the data stream. </p> <p> <b>Kata:</b> Please count the number of events that happened based on fixed window with 1-day duration. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/FixedWindows.html"> FixedWindows</a> with 1-day duration. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#fixed-time-windows"> "Fixed time windows"</a> section for more information. </div> </html>" />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="753144" />
- <option name="index" value="1" />
- <option name="name" value="Fixed Time Window" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/windowing/fixedwindow/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2906" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="events .apply(Window.into(FixedWindows.of(Duration.standardDays(1)))) .apply(Count.perElement())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/windowing/fixedwindow/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/windowing/fixedwindow/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/windowing/fixedwindow/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/windowing/fixedwindow/WindowedEvent.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/windowing/fixedwindow/WindowedEvent.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560698912954" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- <Section>
- <option name="courseId" value="54530" />
- <option name="customPresentableName" />
- <option name="id" value="88157" />
- <option name="index" value="6" />
- <option name="name" value="Triggers" />
- <option name="position" value="6" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560923505422" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237762" />
- <option name="index" value="1" />
- <option name="name" value="Event Time Triggers" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560923508379" />
- <option name="unitId" value="210094" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Event Time Triggers</h2> <p> When collecting and grouping data into windows, Beam uses triggers to determine when to emit the aggregated results of each window (referred to as a pane). If you use Beam’s default windowing configuration and default trigger, Beam outputs the aggregated result when it estimates all data has arrived, and discards all subsequent data for that window. </p> <p> You can set triggers for your PCollections to change this default behavior. Beam provides a number of pre-built triggers that you can set: </p> <div> <ul> <li>Event time triggers</li> <li>Processing time triggers</li> <li>Data-driven triggers</li> <li>Composite triggers</li> </ul> </div> <p> Event time triggers operate on the event time, as indicated by the timestamp on each data element. Beam’s default trigger is event time-based. </p> <p> The AfterWatermark trigger operates on event time. The AfterWatermark trigger emits the contents of a window after the watermark passes the end of the window, based on the timestamps attached to the data elements. 
The watermark is a global progress metric, and is Beam’s notion of input completeness within your pipeline at any given point. AfterWatermark.pastEndOfWindow() only fires when the watermark passes the end of the window. </p> <p> <b>Kata:</b> Given that events are being generated every second, please implement a trigger that emits the number of events count within a fixed window of 5-second duration. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/FixedWindows.html"> FixedWindows</a> with 5-second duration using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/AfterWatermark.html#pastEndOfWindow--"> AfterWatermark.pastEndOfWindow()</a> trigger. </div> <div class="hint"> Set the <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/Window.html#withAllowedLateness-org.joda.time.Duration-"> allowed lateness</a> to 0 with <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/Window.html#discardingFiredPanes--"> discarding accumulation mode</a>. </div> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Combine.html#globally-org.apache.beam.sdk.transforms.CombineFnBase.GlobalCombineFn-"> Combine.globally</a> and <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Count.html#combineFn--"> Count.combineFn</a> to calculate the count of events. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#event-time-triggers"> "Event time triggers"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="753145" />
- <option name="index" value="1" />
- <option name="name" value="Event Time Triggers" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/triggers/eventtimetriggers/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1905" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="events .apply( Window.<String>into(FixedWindows.of(Duration.standardSeconds(5))) .triggering(AfterWatermark.pastEndOfWindow()) .withAllowedLateness(Duration.ZERO) .discardingFiredPanes()) .apply(Combine.globally(Count.<String>combineFn()).withoutDefaults())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/triggers/eventtimetriggers/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="src/org/apache/beam/learning/katas/triggers/eventtimetriggers/GenerateEvent.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/triggers/eventtimetriggers/GenerateEvent.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/triggers/eventtimetriggers/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/triggers/eventtimetriggers/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560923517000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237763" />
- <option name="index" value="2" />
- <option name="name" value="Early Triggers" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560923523075" />
- <option name="unitId" value="210095" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Early Triggers</h2> <p> Triggers allow Beam to emit early results, before all the data in a given window has arrived. For example, emitting after a certain amount of time elapses, or after a certain number of elements arrives. </p> <p> <b>Kata:</b> Given that events are being generated every second and a fixed window of 1-day duration, please implement an early trigger that emits the number of events count immediately after new element is processed. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/AfterWatermark.AfterWatermarkEarlyAndLate.html#withEarlyFirings-org.apache.beam.sdk.transforms.windowing.Trigger.OnceTrigger-"> withEarlyFirings</a> to set early firing triggers. </div> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/FixedWindows.html"> FixedWindows</a> with 1-day duration using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/AfterWatermark.html#pastEndOfWindow--"> AfterWatermark.pastEndOfWindow()</a> trigger. 
</div> <div class="hint"> Set the <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/Window.html#withAllowedLateness-org.joda.time.Duration-"> allowed lateness</a> to 0 with <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/Window.html#discardingFiredPanes--"> discarding accumulation mode</a>. </div> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Combine.html#globally-org.apache.beam.sdk.transforms.CombineFnBase.GlobalCombineFn-"> Combine.globally</a> and <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Count.html#combineFn--"> Count.combineFn</a> to calculate the count of events. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#event-time-triggers"> "Event time triggers"</a> section for more information. </div> </html>" />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="753146" />
- <option name="index" value="1" />
- <option name="name" value="Early Triggers" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="resources/log4j2.xml">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="false" />
- <option name="name" value="resources/log4j2.xml" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="src/org/apache/beam/learning/katas/triggers/earlytriggers/GenerateEvent.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/triggers/earlytriggers/GenerateEvent.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="src/org/apache/beam/learning/katas/triggers/earlytriggers/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1970" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="events .apply( Window.<String>into(FixedWindows.of(Duration.standardDays(1))) .triggering( AfterWatermark.pastEndOfWindow() .withEarlyFirings( AfterProcessingTime.pastFirstElementInPane())) .withAllowedLateness(Duration.ZERO) .discardingFiredPanes()) .apply(Combine.globally(Count.<String>combineFn()).withoutDefaults())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/triggers/earlytriggers/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/triggers/earlytriggers/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/triggers/earlytriggers/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560923531000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="237764" />
- <option name="index" value="3" />
- <option name="name" value="Window Accumulation Mode" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560923537697" />
- <option name="unitId" value="210096" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Window Accumulation Mode</h2> <p> When you specify a trigger, you must also set the the window’s accumulation mode. When a trigger fires, it emits the current contents of the window as a pane. Since a trigger can fire multiple times, the accumulation mode determines whether the system accumulates the window panes as the trigger fires, or discards them. </p> <p> <b>Kata:</b> Given that events are being generated every second and a fixed window of 1-day duration, please implement an early trigger that emits the number of events count immediately after new element is processed in accumulating mode. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/2.13.0/org/apache/beam/sdk/transforms/windowing/Window.html#accumulatingFiredPanes--"> accumulatingFiredPanes()</a> to set a window to accumulate the panes that are produced when the trigger fires. 
</div> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/AfterWatermark.AfterWatermarkEarlyAndLate.html#withEarlyFirings-org.apache.beam.sdk.transforms.windowing.Trigger.OnceTrigger-"> withEarlyFirings</a> to set early firing triggers. </div> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/FixedWindows.html"> FixedWindows</a> with 1-day duration using <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/AfterWatermark.html#pastEndOfWindow--"> AfterWatermark.pastEndOfWindow()</a> trigger. </div> <div class="hint"> Set the <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/windowing/Window.html#withAllowedLateness-org.joda.time.Duration-"> allowed lateness</a> to 0. </div> <div class="hint"> Use <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Combine.html#globally-org.apache.beam.sdk.transforms.CombineFnBase.GlobalCombineFn-"> Combine.globally</a> and <a href="https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Count.html#combineFn--"> Count.combineFn</a> to calculate the count of events. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#event-time-triggers"> "Event time triggers"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="753147" />
- <option name="index" value="1" />
- <option name="name" value="Window Accumulation Mode" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/triggers/windowaccummode/GenerateEvent.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/triggers/windowaccummode/GenerateEvent.java" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="src/org/apache/beam/learning/katas/triggers/windowaccummode/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1972" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="events .apply( Window.<String>into(FixedWindows.of(Duration.standardDays(1))) .triggering( AfterWatermark.pastEndOfWindow() .withEarlyFirings( AfterProcessingTime.pastFirstElementInPane())) .withAllowedLateness(Duration.ZERO) .accumulatingFiredPanes()) .apply(Combine.globally(Count.<String>combineFn()).withoutDefaults())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/triggers/windowaccummode/Task.java" />
- <option name="text" value="public class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/triggers/windowaccummode/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/triggers/windowaccummode/TaskTest.java" />
- <option name="text" value="import org.junit.Assert; import org.junit.Test; public class Tests { @Test public void testSolution() { // put your test here Assert.fail("Tests not implemented for the task"); } }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560923544000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- <Section>
- <option name="courseId" value="54530" />
- <option name="customPresentableName" />
- <option name="id" value="85642" />
- <option name="index" value="7" />
- <option name="name" value="Examples" />
- <option name="position" value="7" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1557824624000" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="229515" />
- <option name="index" value="1" />
- <option name="name" value="Word Count" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1557824624000" />
- <option name="unitId" value="202040" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Word Count Pipeline</h2> <p> <b>Kata:</b> Create a pipeline that counts the number of words. </p> <p> Please output the count of each word in the following format: </p> <pre> word:count ball:5 book:3 </pre> <br> <div class="hint"> Refer to your katas above. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="713743" />
- <option name="index" value="1" />
- <option name="name" value="Word Count" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="src/org/apache/beam/learning/katas/examples/wordcount/Task.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2075" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="input .apply(FlatMapElements.into(TypeDescriptors.strings()) .via(line -> Arrays.asList(line.split(" ")))) .apply(Count.perElement()) .apply(ParDo.of(new DoFn<KV<String, Long>, String>() { @ProcessElement public void processElement( @Element KV<String, Long> element, OutputReceiver<String> out) { out.output(element.getKey() + ":" + element.getValue()); } }))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="src/org/apache/beam/learning/katas/examples/wordcount/Task.java" />
- <option name="text" value="class Task { //put your task here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="test/org/apache/beam/learning/katas/examples/wordcount/TaskTest.java">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="test/org/apache/beam/learning/katas/examples/wordcount/TaskTest.java" />
- <option name="text" value="public class Test { // put your test here }" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560936261000" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- </list>
- </option>
- </EduCourse>
- </option>
- </StudyTaskManager>
- </component>
-</project>
\ No newline at end of file
diff --git a/learning/katas/java/Common Transforms/Aggregation/Count/task-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Count/task-info.yaml
new file mode 100644
index 0000000..3240233
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/Count/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/commontransforms/aggregation/count/Task.java
+ visible: true
+ placeholders:
+ - offset: 1707
+ length: 29
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/commontransforms/aggregation/count/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Common Transforms/Aggregation/Count/task-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Count/task-remote-info.yaml
new file mode 100644
index 0000000..fa07c2c
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/Count/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713738
+update_date: Wed, 19 Jun 2019 09:23:51 UTC
diff --git a/learning/katas/java/Common Transforms/Aggregation/Max/task-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Max/task-info.yaml
new file mode 100644
index 0000000..cd10c1f
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/Max/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/commontransforms/aggregation/max/Task.java
+ visible: true
+ placeholders:
+ - offset: 1709
+ length: 35
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/commontransforms/aggregation/max/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Common Transforms/Aggregation/Max/task-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Max/task-remote-info.yaml
new file mode 100644
index 0000000..8118f55
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/Max/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713742
+update_date: Wed, 19 Jun 2019 09:24:06 UTC
diff --git a/learning/katas/java/Common Transforms/Aggregation/Mean/task-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Mean/task-info.yaml
new file mode 100644
index 0000000..5bf18f3
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/Mean/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/commontransforms/aggregation/mean/Task.java
+ visible: true
+ placeholders:
+ - offset: 1709
+ length: 28
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/commontransforms/aggregation/mean/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Common Transforms/Aggregation/Mean/task-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Mean/task-remote-info.yaml
new file mode 100644
index 0000000..4178484
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/Mean/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713740
+update_date: Wed, 19 Jun 2019 09:23:58 UTC
diff --git a/learning/katas/java/Common Transforms/Aggregation/Min/task-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Min/task-info.yaml
new file mode 100644
index 0000000..b49a33f
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/Min/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/commontransforms/aggregation/min/Task.java
+ visible: true
+ placeholders:
+ - offset: 1709
+ length: 35
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/commontransforms/aggregation/min/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Common Transforms/Aggregation/Min/task-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Min/task-remote-info.yaml
new file mode 100644
index 0000000..27c135f
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/Min/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713741
+update_date: Wed, 19 Jun 2019 09:24:02 UTC
diff --git a/learning/katas/java/Common Transforms/Aggregation/Sum/task-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Sum/task-info.yaml
new file mode 100644
index 0000000..c2427be
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/Sum/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/commontransforms/aggregation/sum/Task.java
+ visible: true
+ placeholders:
+ - offset: 1709
+ length: 35
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/commontransforms/aggregation/sum/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Common Transforms/Aggregation/Sum/task-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Sum/task-remote-info.yaml
new file mode 100644
index 0000000..8fcdebd
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/Sum/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713739
+update_date: Wed, 19 Jun 2019 09:23:55 UTC
diff --git a/learning/katas/java/Common Transforms/Aggregation/lesson-info.yaml b/learning/katas/java/Common Transforms/Aggregation/lesson-info.yaml
new file mode 100644
index 0000000..8ea5f25
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/lesson-info.yaml
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Count
+- Sum
+- Mean
+- Min
+- Max
diff --git a/learning/katas/java/Common Transforms/Aggregation/lesson-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/lesson-remote-info.yaml
new file mode 100644
index 0000000..89a3ea1
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Aggregation/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 229514
+update_date: Fri, 31 May 2019 17:51:12 UTC
+unit: 202039
diff --git a/learning/katas/java/Common Transforms/Filter/Filter/task-info.yaml b/learning/katas/java/Common Transforms/Filter/Filter/task-info.yaml
new file mode 100644
index 0000000..e8a3893
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Filter/Filter/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/commontransforms/filter/filter/Task.java
+ visible: true
+ placeholders:
+ - offset: 1718
+ length: 49
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/commontransforms/filter/filter/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Common Transforms/Filter/Filter/task-remote-info.yaml b/learning/katas/java/Common Transforms/Filter/Filter/task-remote-info.yaml
new file mode 100644
index 0000000..541af99
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Filter/Filter/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713737
+update_date: Wed, 19 Jun 2019 09:23:47 UTC
diff --git a/learning/katas/java/Common Transforms/Filter/ParDo/task-info.yaml b/learning/katas/java/Common Transforms/Filter/ParDo/task-info.yaml
new file mode 100644
index 0000000..530f7ae
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Filter/ParDo/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/commontransforms/filter/pardo/Task.java
+ visible: true
+ placeholders:
+ - offset: 1752
+ length: 292
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/commontransforms/filter/pardo/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Common Transforms/Filter/ParDo/task-remote-info.yaml b/learning/katas/java/Common Transforms/Filter/ParDo/task-remote-info.yaml
new file mode 100644
index 0000000..c8c7e67
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Filter/ParDo/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713736
+update_date: Wed, 19 Jun 2019 09:23:44 UTC
diff --git a/learning/katas/java/Common Transforms/Filter/lesson-info.yaml b/learning/katas/java/Common Transforms/Filter/lesson-info.yaml
new file mode 100644
index 0000000..93f7b5a
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Filter/lesson-info.yaml
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- ParDo
+- Filter
diff --git a/learning/katas/java/Common Transforms/Filter/lesson-remote-info.yaml b/learning/katas/java/Common Transforms/Filter/lesson-remote-info.yaml
new file mode 100644
index 0000000..2cc11c0
--- /dev/null
+++ b/learning/katas/java/Common Transforms/Filter/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 229513
+update_date: Fri, 31 May 2019 17:50:56 UTC
+unit: 202038
diff --git a/learning/katas/java/Common Transforms/WithKeys/WithKeys/task-info.yaml b/learning/katas/java/Common Transforms/WithKeys/WithKeys/task-info.yaml
new file mode 100644
index 0000000..a89b0ad
--- /dev/null
+++ b/learning/katas/java/Common Transforms/WithKeys/WithKeys/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/commontransforms/withkeys/Task.java
+ visible: true
+ placeholders:
+ - offset: 1875
+ length: 117
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/commontransforms/withkeys/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml b/learning/katas/java/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml
new file mode 100644
index 0000000..483d9c7
--- /dev/null
+++ b/learning/katas/java/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 754089
+update_date: Wed, 19 Jun 2019 09:24:09 UTC
diff --git a/learning/katas/java/Common Transforms/WithKeys/lesson-info.yaml b/learning/katas/java/Common Transforms/WithKeys/lesson-info.yaml
new file mode 100644
index 0000000..1179567
--- /dev/null
+++ b/learning/katas/java/Common Transforms/WithKeys/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- WithKeys
diff --git a/learning/katas/java/Common Transforms/WithKeys/lesson-remote-info.yaml b/learning/katas/java/Common Transforms/WithKeys/lesson-remote-info.yaml
new file mode 100644
index 0000000..f0b0043
--- /dev/null
+++ b/learning/katas/java/Common Transforms/WithKeys/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237992
+update_date: Mon, 17 Jun 2019 17:11:31 UTC
+unit: -1
diff --git a/learning/katas/java/Common Transforms/section-info.yaml b/learning/katas/java/Common Transforms/section-info.yaml
new file mode 100644
index 0000000..b32b98a
--- /dev/null
+++ b/learning/katas/java/Common Transforms/section-info.yaml
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Filter
+- Aggregation
+- WithKeys
diff --git a/learning/katas/java/Common Transforms/section-remote-info.yaml b/learning/katas/java/Common Transforms/section-remote-info.yaml
new file mode 100644
index 0000000..e0a23e3
--- /dev/null
+++ b/learning/katas/java/Common Transforms/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 85641
+update_date: Fri, 31 May 2019 17:51:12 UTC
diff --git a/learning/katas/java/Core Transforms/Branching/Branching/task-info.yaml b/learning/katas/java/Core Transforms/Branching/Branching/task-info.yaml
new file mode 100644
index 0000000..9f8b8f8
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Branching/Branching/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/branching/Task.java
+ visible: true
+ placeholders:
+ - offset: 1994
+ length: 78
+ placeholder_text: TODO()
+ - offset: 2175
+ length: 80
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/branching/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Branching/Branching/task-remote-info.yaml b/learning/katas/java/Core Transforms/Branching/Branching/task-remote-info.yaml
new file mode 100644
index 0000000..9964051
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Branching/Branching/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 754088
+update_date: Wed, 19 Jun 2019 09:23:39 UTC
diff --git a/learning/katas/java/Core Transforms/Branching/lesson-info.yaml b/learning/katas/java/Core Transforms/Branching/lesson-info.yaml
new file mode 100644
index 0000000..25ecc7c
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Branching/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Branching
diff --git a/learning/katas/java/Core Transforms/Branching/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/Branching/lesson-remote-info.yaml
new file mode 100644
index 0000000..d97bc3c
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Branching/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237991
+update_date: Mon, 17 Jun 2019 17:10:58 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task-info.yaml b/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task-info.yaml
new file mode 100644
index 0000000..84ee96d
--- /dev/null
+++ b/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task-info.yaml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/cogroupbykey/Task.java
+ visible: true
+ placeholders:
+ - offset: 2418
+ length: 1198
+ placeholder_text: TODO()
+- name: src/org/apache/beam/learning/katas/coretransforms/cogroupbykey/WordsAlphabet.java
+ visible: true
+- name: test/org/apache/beam/learning/katas/coretransforms/cogroupbykey/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml b/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml
new file mode 100644
index 0000000..b3a4f7e
--- /dev/null
+++ b/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713729
+update_date: Wed, 19 Jun 2019 09:23:00 UTC
diff --git a/learning/katas/java/Core Transforms/CoGroupByKey/lesson-info.yaml b/learning/katas/java/Core Transforms/CoGroupByKey/lesson-info.yaml
new file mode 100644
index 0000000..273c077
--- /dev/null
+++ b/learning/katas/java/Core Transforms/CoGroupByKey/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- CoGroupByKey
diff --git a/learning/katas/java/Core Transforms/CoGroupByKey/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/CoGroupByKey/lesson-remote-info.yaml
new file mode 100644
index 0000000..90bafc0
--- /dev/null
+++ b/learning/katas/java/Core Transforms/CoGroupByKey/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 229509
+update_date: Fri, 31 May 2019 17:50:32 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task-info.yaml b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task-info.yaml
new file mode 100644
index 0000000..7498a41
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefnlambda/Task.java
+ visible: true
+ placeholders:
+ - offset: 1922
+ length: 46
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefnlambda/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task-remote-info.yaml
new file mode 100644
index 0000000..e0abb18
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 750324
+update_date: Wed, 19 Jun 2019 09:23:15 UTC
diff --git a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task-info.yaml b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task-info.yaml
new file mode 100644
index 0000000..c63c45a
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefn/Task.java
+ visible: true
+ placeholders:
+ - offset: 2125
+ length: 110
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/combine/binarycombinefn/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task-remote-info.yaml
new file mode 100644
index 0000000..6eccbcb
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713732
+update_date: Wed, 19 Jun 2019 09:23:11 UTC
diff --git a/learning/katas/java/Core Transforms/Combine/Combine PerKey/task-info.yaml b/learning/katas/java/Core Transforms/Combine/Combine PerKey/task-info.yaml
new file mode 100644
index 0000000..6e55b86
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/Combine PerKey/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/combine/combineperkey/Task.java
+ visible: true
+ placeholders:
+ - offset: 2155
+ length: 56
+ placeholder_text: TODO()
+ - offset: 2295
+ length: 98
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/combine/combineperkey/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml
new file mode 100644
index 0000000..92af2aa
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713733
+update_date: Wed, 19 Jun 2019 09:23:19 UTC
diff --git a/learning/katas/java/Core Transforms/Combine/CombineFn/task-info.yaml b/learning/katas/java/Core Transforms/Combine/CombineFn/task-info.yaml
new file mode 100644
index 0000000..12d049a
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/CombineFn/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/combine/combinefn/Task.java
+ visible: true
+ placeholders:
+ - offset: 1962
+ length: 1173
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/combine/combinefn/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Combine/CombineFn/task-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/CombineFn/task-remote-info.yaml
new file mode 100644
index 0000000..c96549b
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/CombineFn/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713731
+update_date: Wed, 19 Jun 2019 09:23:08 UTC
diff --git a/learning/katas/java/Core Transforms/Combine/Simple Function/task-info.yaml b/learning/katas/java/Core Transforms/Combine/Simple Function/task-info.yaml
new file mode 100644
index 0000000..8ccafb1
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/Simple Function/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/combine/simple/Task.java
+ visible: true
+ placeholders:
+ - offset: 1923
+ length: 166
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/combine/simple/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Combine/Simple Function/task-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/Simple Function/task-remote-info.yaml
new file mode 100644
index 0000000..599514b
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/Simple Function/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713730
+update_date: Wed, 19 Jun 2019 09:23:04 UTC
diff --git a/learning/katas/java/Core Transforms/Combine/lesson-info.yaml b/learning/katas/java/Core Transforms/Combine/lesson-info.yaml
new file mode 100644
index 0000000..b275018
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/lesson-info.yaml
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Simple Function
+- CombineFn
+- BinaryCombineFn
+- BinaryCombineFn Lambda
+- Combine PerKey
diff --git a/learning/katas/java/Core Transforms/Combine/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/lesson-remote-info.yaml
new file mode 100644
index 0000000..615d21b
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Combine/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 229510
+update_date: Fri, 31 May 2019 17:50:44 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task-info.yaml b/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task-info.yaml
new file mode 100644
index 0000000..4278037
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/composite/Task.java
+ visible: true
+ placeholders:
+ - offset: 1929
+ length: 511
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/composite/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml b/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml
new file mode 100644
index 0000000..1ec9f21
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 750323
+update_date: Mon, 17 Jun 2019 17:13:38 UTC
diff --git a/learning/katas/java/Core Transforms/Composite Transform/lesson-info.yaml b/learning/katas/java/Core Transforms/Composite Transform/lesson-info.yaml
new file mode 100644
index 0000000..177eab1
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Composite Transform/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Composite Transform
diff --git a/learning/katas/java/Core Transforms/Composite Transform/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/Composite Transform/lesson-remote-info.yaml
new file mode 100644
index 0000000..405c1c0
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Composite Transform/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237192
+update_date: Thu, 13 Jun 2019 13:11:00 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task-info.yaml b/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task-info.yaml
new file mode 100644
index 0000000..c39551e
--- /dev/null
+++ b/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task-info.yaml
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/dofnadditionalparams/Task.java
+ visible: true
+- name: test/org/apache/beam/learning/katas/coretransforms/dofnadditionalparams/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task-remote-info.yaml b/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task-remote-info.yaml
new file mode 100644
index 0000000..fa52285
--- /dev/null
+++ b/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 753154
+update_date: Sun, 16 Jun 2019 15:37:43 UTC
diff --git a/learning/katas/java/Core Transforms/DoFn Additional Parameters/lesson-info.yaml b/learning/katas/java/Core Transforms/DoFn Additional Parameters/lesson-info.yaml
new file mode 100644
index 0000000..1a41bfb
--- /dev/null
+++ b/learning/katas/java/Core Transforms/DoFn Additional Parameters/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- DoFn Additional Parameters
diff --git a/learning/katas/java/Core Transforms/DoFn Additional Parameters/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/DoFn Additional Parameters/lesson-remote-info.yaml
new file mode 100644
index 0000000..acff592
--- /dev/null
+++ b/learning/katas/java/Core Transforms/DoFn Additional Parameters/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237765
+update_date: Thu, 01 Jan 1970 00:00:00 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/Flatten/Flatten/task-info.yaml b/learning/katas/java/Core Transforms/Flatten/Flatten/task-info.yaml
new file mode 100644
index 0000000..81405f5
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Flatten/Flatten/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/flatten/Task.java
+ visible: true
+ placeholders:
+ - offset: 2040
+ length: 77
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/flatten/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Flatten/Flatten/task-remote-info.yaml b/learning/katas/java/Core Transforms/Flatten/Flatten/task-remote-info.yaml
new file mode 100644
index 0000000..6666fa7
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Flatten/Flatten/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713734
+update_date: Wed, 19 Jun 2019 09:23:22 UTC
diff --git a/learning/katas/java/Core Transforms/Flatten/lesson-info.yaml b/learning/katas/java/Core Transforms/Flatten/lesson-info.yaml
new file mode 100644
index 0000000..fd01c86
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Flatten/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Flatten
diff --git a/learning/katas/java/Core Transforms/Flatten/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/Flatten/lesson-remote-info.yaml
new file mode 100644
index 0000000..04b0820
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Flatten/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 229511
+update_date: Fri, 31 May 2019 17:50:47 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task-info.yaml b/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task-info.yaml
new file mode 100644
index 0000000..e5d80a9
--- /dev/null
+++ b/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/groupbykey/Task.java
+ visible: true
+ placeholders:
+ - offset: 2025
+ length: 162
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/groupbykey/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml b/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml
new file mode 100644
index 0000000..839d84c
--- /dev/null
+++ b/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713728
+update_date: Wed, 19 Jun 2019 09:22:57 UTC
diff --git a/learning/katas/java/Core Transforms/GroupByKey/lesson-info.yaml b/learning/katas/java/Core Transforms/GroupByKey/lesson-info.yaml
new file mode 100644
index 0000000..5de9eb6
--- /dev/null
+++ b/learning/katas/java/Core Transforms/GroupByKey/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- GroupByKey
diff --git a/learning/katas/java/Core Transforms/GroupByKey/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/GroupByKey/lesson-remote-info.yaml
new file mode 100644
index 0000000..32f0bc7
--- /dev/null
+++ b/learning/katas/java/Core Transforms/GroupByKey/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 229508
+update_date: Fri, 31 May 2019 17:50:29 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/Map/FlatMapElements/task-info.yaml b/learning/katas/java/Core Transforms/Map/FlatMapElements/task-info.yaml
new file mode 100644
index 0000000..2d21f7d
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Map/FlatMapElements/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/map/flatmapelements/Task.java
+ visible: true
+ placeholders:
+ - offset: 1835
+ length: 139
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/map/flatmapelements/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Map/FlatMapElements/task-remote-info.yaml b/learning/katas/java/Core Transforms/Map/FlatMapElements/task-remote-info.yaml
new file mode 100644
index 0000000..26d32fb
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Map/FlatMapElements/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713727
+update_date: Mon, 17 Jun 2019 17:13:06 UTC
diff --git a/learning/katas/java/Core Transforms/Map/MapElements/task-info.yaml b/learning/katas/java/Core Transforms/Map/MapElements/task-info.yaml
new file mode 100644
index 0000000..6f378dc
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Map/MapElements/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/map/mapelements/Task.java
+ visible: true
+ placeholders:
+ - offset: 1776
+ length: 110
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/map/mapelements/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Map/MapElements/task-remote-info.yaml b/learning/katas/java/Core Transforms/Map/MapElements/task-remote-info.yaml
new file mode 100644
index 0000000..419e2969
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Map/MapElements/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713726
+update_date: Wed, 19 Jun 2019 09:22:52 UTC
diff --git a/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task-info.yaml b/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task-info.yaml
new file mode 100644
index 0000000..f9edc2b
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/map/pardoonetomany/Task.java
+ visible: true
+ placeholders:
+ - offset: 1777
+ length: 299
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/map/pardoonetomany/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml b/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml
new file mode 100644
index 0000000..bcbabab
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713725
+update_date: Wed, 19 Jun 2019 09:22:49 UTC
diff --git a/learning/katas/java/Core Transforms/Map/ParDo/task-info.yaml b/learning/katas/java/Core Transforms/Map/ParDo/task-info.yaml
new file mode 100644
index 0000000..9c63446
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Map/ParDo/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/map/pardo/Task.java
+ visible: true
+ placeholders:
+ - offset: 1752
+ length: 213
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/map/pardo/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Map/ParDo/task-remote-info.yaml b/learning/katas/java/Core Transforms/Map/ParDo/task-remote-info.yaml
new file mode 100644
index 0000000..a3e4393
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Map/ParDo/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713724
+update_date: Wed, 19 Jun 2019 09:22:46 UTC
diff --git a/learning/katas/java/Core Transforms/Map/lesson-info.yaml b/learning/katas/java/Core Transforms/Map/lesson-info.yaml
new file mode 100644
index 0000000..ad6558f
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Map/lesson-info.yaml
@@ -0,0 +1,24 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- ParDo
+- ParDo OneToMany
+- MapElements
+- FlatMapElements
diff --git a/learning/katas/java/Core Transforms/Map/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/Map/lesson-remote-info.yaml
new file mode 100644
index 0000000..b6dbff4
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Map/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 229507
+update_date: Fri, 31 May 2019 17:50:26 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/Partition/Partition/task-info.yaml b/learning/katas/java/Core Transforms/Partition/Partition/task-info.yaml
new file mode 100644
index 0000000..6537f92
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Partition/Partition/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/partition/Task.java
+ visible: true
+ placeholders:
+ - offset: 1966
+ length: 241
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/partition/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Partition/Partition/task-remote-info.yaml b/learning/katas/java/Core Transforms/Partition/Partition/task-remote-info.yaml
new file mode 100644
index 0000000..6548036
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Partition/Partition/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713735
+update_date: Wed, 19 Jun 2019 09:23:26 UTC
diff --git a/learning/katas/java/Core Transforms/Partition/lesson-info.yaml b/learning/katas/java/Core Transforms/Partition/lesson-info.yaml
new file mode 100644
index 0000000..c15773b2
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Partition/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Partition
diff --git a/learning/katas/java/Core Transforms/Partition/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/Partition/lesson-remote-info.yaml
new file mode 100644
index 0000000..551c068
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Partition/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 229512
+update_date: Fri, 31 May 2019 17:50:50 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/Side Input/Side Input/task-info.yaml b/learning/katas/java/Core Transforms/Side Input/Side Input/task-info.yaml
new file mode 100644
index 0000000..1568f8c
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Side Input/Side Input/task-info.yaml
@@ -0,0 +1,34 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/sideinput/Task.java
+ visible: true
+ placeholders:
+ - offset: 2716
+ length: 37
+ placeholder_text: TODO()
+ - offset: 2914
+ length: 500
+ placeholder_text: TODO()
+- name: src/org/apache/beam/learning/katas/coretransforms/sideinput/Person.java
+ visible: true
+- name: test/org/apache/beam/learning/katas/coretransforms/sideinput/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Side Input/Side Input/task-remote-info.yaml b/learning/katas/java/Core Transforms/Side Input/Side Input/task-remote-info.yaml
new file mode 100644
index 0000000..ca24e4d
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Side Input/Side Input/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 754085
+update_date: Wed, 19 Jun 2019 09:23:30 UTC
diff --git a/learning/katas/java/Core Transforms/Side Input/lesson-info.yaml b/learning/katas/java/Core Transforms/Side Input/lesson-info.yaml
new file mode 100644
index 0000000..210e3b0
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Side Input/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Side Input
diff --git a/learning/katas/java/Core Transforms/Side Input/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/Side Input/lesson-remote-info.yaml
new file mode 100644
index 0000000..1c5d7d6
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Side Input/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237989
+update_date: Mon, 17 Jun 2019 17:10:06 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/Side Output/Side Output/task-info.yaml b/learning/katas/java/Core Transforms/Side Output/Side Output/task-info.yaml
new file mode 100644
index 0000000..bd51850
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Side Output/Side Output/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/coretransforms/sideoutput/Task.java
+ visible: true
+ placeholders:
+ - offset: 2253
+ length: 398
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/coretransforms/sideoutput/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Core Transforms/Side Output/Side Output/task-remote-info.yaml b/learning/katas/java/Core Transforms/Side Output/Side Output/task-remote-info.yaml
new file mode 100644
index 0000000..0b6ad16
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Side Output/Side Output/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 754087
+update_date: Wed, 19 Jun 2019 09:23:35 UTC
diff --git a/learning/katas/java/Core Transforms/Side Output/lesson-info.yaml b/learning/katas/java/Core Transforms/Side Output/lesson-info.yaml
new file mode 100644
index 0000000..e9096c9
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Side Output/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Side Output
diff --git a/learning/katas/java/Core Transforms/Side Output/lesson-remote-info.yaml b/learning/katas/java/Core Transforms/Side Output/lesson-remote-info.yaml
new file mode 100644
index 0000000..9e69ea4
--- /dev/null
+++ b/learning/katas/java/Core Transforms/Side Output/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237990
+update_date: Mon, 17 Jun 2019 17:10:45 UTC
+unit: -1
diff --git a/learning/katas/java/Core Transforms/section-info.yaml b/learning/katas/java/Core Transforms/section-info.yaml
new file mode 100644
index 0000000..7a9eda8
--- /dev/null
+++ b/learning/katas/java/Core Transforms/section-info.yaml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Map
+- GroupByKey
+- CoGroupByKey
+- Combine
+- Flatten
+- Partition
+- Side Input
+- Side Output
+- Branching
+- Composite Transform
+- DoFn Additional Parameters
diff --git a/learning/katas/java/Core Transforms/section-remote-info.yaml b/learning/katas/java/Core Transforms/section-remote-info.yaml
new file mode 100644
index 0000000..75279de
--- /dev/null
+++ b/learning/katas/java/Core Transforms/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 85640
+update_date: Fri, 31 May 2019 17:50:50 UTC
diff --git a/learning/katas/java/Examples/Word Count/Word Count/task-info.yaml b/learning/katas/java/Examples/Word Count/Word Count/task-info.yaml
new file mode 100644
index 0000000..198ff72
--- /dev/null
+++ b/learning/katas/java/Examples/Word Count/Word Count/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/examples/wordcount/Task.java
+ visible: true
+ placeholders:
+ - offset: 2075
+ length: 466
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/examples/wordcount/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Examples/Word Count/Word Count/task-remote-info.yaml b/learning/katas/java/Examples/Word Count/Word Count/task-remote-info.yaml
new file mode 100644
index 0000000..e9435cc
--- /dev/null
+++ b/learning/katas/java/Examples/Word Count/Word Count/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713743
+update_date: Wed, 19 Jun 2019 09:24:21 UTC
diff --git a/learning/katas/java/Examples/Word Count/lesson-info.yaml b/learning/katas/java/Examples/Word Count/lesson-info.yaml
new file mode 100644
index 0000000..cbe1d6f
--- /dev/null
+++ b/learning/katas/java/Examples/Word Count/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Word Count
diff --git a/learning/katas/java/Examples/Word Count/lesson-remote-info.yaml b/learning/katas/java/Examples/Word Count/lesson-remote-info.yaml
new file mode 100644
index 0000000..c781960
--- /dev/null
+++ b/learning/katas/java/Examples/Word Count/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 229515
+update_date: Tue, 14 May 2019 09:03:44 UTC
+unit: 202040
diff --git a/learning/katas/java/Examples/section-info.yaml b/learning/katas/java/Examples/section-info.yaml
new file mode 100644
index 0000000..cbe1d6f
--- /dev/null
+++ b/learning/katas/java/Examples/section-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Word Count
diff --git a/learning/katas/java/Examples/section-remote-info.yaml b/learning/katas/java/Examples/section-remote-info.yaml
new file mode 100644
index 0000000..7eb38ff
--- /dev/null
+++ b/learning/katas/java/Examples/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 85642
+update_date: Tue, 14 May 2019 09:03:44 UTC
diff --git a/learning/katas/java/IO/Built-in IOs/Built-in IOs/task-info.yaml b/learning/katas/java/IO/Built-in IOs/Built-in IOs/task-info.yaml
new file mode 100644
index 0000000..d210f95
--- /dev/null
+++ b/learning/katas/java/IO/Built-in IOs/Built-in IOs/task-info.yaml
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/io/builtinios/Task.java
+ visible: true
+- name: test/org/apache/beam/learning/katas/io/builtinios/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml b/learning/katas/java/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml
new file mode 100644
index 0000000..882f03d
--- /dev/null
+++ b/learning/katas/java/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 750319
+update_date: Wed, 19 Jun 2019 09:24:17 UTC
diff --git a/learning/katas/java/IO/Built-in IOs/Built-in IOs/task.html b/learning/katas/java/IO/Built-in IOs/Built-in IOs/task.html
index fa59837..447dfa3 100644
--- a/learning/katas/java/IO/Built-in IOs/Built-in IOs/task.html
+++ b/learning/katas/java/IO/Built-in IOs/Built-in IOs/task.html
@@ -26,4 +26,8 @@
See the <a href="https://beam.apache.org/documentation/io/built-in/">Beam-provided I/O
Transforms</a> page for a list of the currently available I/O transforms.
</p>
+<p>
+ <b>Note:</b> There is no kata for this task. Please click the "Check" button and
+ proceed to the next task.
+</p>
</html>
\ No newline at end of file
diff --git a/learning/katas/java/IO/Built-in IOs/lesson-info.yaml b/learning/katas/java/IO/Built-in IOs/lesson-info.yaml
new file mode 100644
index 0000000..af969f1
--- /dev/null
+++ b/learning/katas/java/IO/Built-in IOs/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Built-in IOs
diff --git a/learning/katas/java/IO/Built-in IOs/lesson-remote-info.yaml b/learning/katas/java/IO/Built-in IOs/lesson-remote-info.yaml
new file mode 100644
index 0000000..394c9d7
--- /dev/null
+++ b/learning/katas/java/IO/Built-in IOs/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237188
+update_date: Thu, 13 Jun 2019 13:10:36 UTC
+unit: 209564
diff --git a/learning/katas/java/IO/TextIO/TextIO Read/task-info.yaml b/learning/katas/java/IO/TextIO/TextIO Read/task-info.yaml
new file mode 100644
index 0000000..4afb958
--- /dev/null
+++ b/learning/katas/java/IO/TextIO/TextIO Read/task-info.yaml
@@ -0,0 +1,34 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: countries.txt
+ visible: true
+- name: src/org/apache/beam/learning/katas/io/textio/read/Task.java
+ visible: true
+ placeholders:
+ - offset: 1615
+ length: 29
+ placeholder_text: TODO()
+ - offset: 1855
+ length: 65
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/io/textio/read/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/IO/TextIO/TextIO Read/task-remote-info.yaml b/learning/katas/java/IO/TextIO/TextIO Read/task-remote-info.yaml
new file mode 100644
index 0000000..1855111
--- /dev/null
+++ b/learning/katas/java/IO/TextIO/TextIO Read/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 750317
+update_date: Wed, 19 Jun 2019 09:24:13 UTC
diff --git a/learning/katas/java/IO/TextIO/lesson-info.yaml b/learning/katas/java/IO/TextIO/lesson-info.yaml
new file mode 100644
index 0000000..e671ddc
--- /dev/null
+++ b/learning/katas/java/IO/TextIO/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- TextIO Read
diff --git a/learning/katas/java/IO/TextIO/lesson-remote-info.yaml b/learning/katas/java/IO/TextIO/lesson-remote-info.yaml
new file mode 100644
index 0000000..32b85ae
--- /dev/null
+++ b/learning/katas/java/IO/TextIO/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237187
+update_date: Thu, 13 Jun 2019 13:10:30 UTC
+unit: 209563
diff --git a/learning/katas/java/IO/section-info.yaml b/learning/katas/java/IO/section-info.yaml
new file mode 100644
index 0000000..1d93752
--- /dev/null
+++ b/learning/katas/java/IO/section-info.yaml
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- TextIO
+- Built-in IOs
diff --git a/learning/katas/java/IO/section-remote-info.yaml b/learning/katas/java/IO/section-remote-info.yaml
new file mode 100644
index 0000000..75c6c4b
--- /dev/null
+++ b/learning/katas/java/IO/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 88010
+update_date: Thu, 13 Jun 2019 13:10:25 UTC
diff --git a/learning/katas/java/Introduction/Hello Beam/Hello Beam/task-info.yaml b/learning/katas/java/Introduction/Hello Beam/Hello Beam/task-info.yaml
new file mode 100644
index 0000000..743146c
--- /dev/null
+++ b/learning/katas/java/Introduction/Hello Beam/Hello Beam/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/intro/hello/Task.java
+ visible: true
+ placeholders:
+ - offset: 1552
+ length: 39
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/intro/hello/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml b/learning/katas/java/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml
new file mode 100644
index 0000000..8b745e4
--- /dev/null
+++ b/learning/katas/java/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 713723
+update_date: Wed, 19 Jun 2019 09:22:42 UTC
diff --git a/learning/katas/java/Introduction/Hello Beam/lesson-info.yaml b/learning/katas/java/Introduction/Hello Beam/lesson-info.yaml
new file mode 100644
index 0000000..040e0ac
--- /dev/null
+++ b/learning/katas/java/Introduction/Hello Beam/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Hello Beam
diff --git a/learning/katas/java/Introduction/Hello Beam/lesson-remote-info.yaml b/learning/katas/java/Introduction/Hello Beam/lesson-remote-info.yaml
new file mode 100644
index 0000000..6dbcc30
--- /dev/null
+++ b/learning/katas/java/Introduction/Hello Beam/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 229506
+update_date: Fri, 31 May 2019 17:50:15 UTC
+unit: -1
diff --git a/learning/katas/java/Introduction/section-info.yaml b/learning/katas/java/Introduction/section-info.yaml
new file mode 100644
index 0000000..040e0ac
--- /dev/null
+++ b/learning/katas/java/Introduction/section-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Hello Beam
diff --git a/learning/katas/java/Introduction/section-remote-info.yaml b/learning/katas/java/Introduction/section-remote-info.yaml
new file mode 100644
index 0000000..fb06afd
--- /dev/null
+++ b/learning/katas/java/Introduction/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 85639
+update_date: Fri, 31 May 2019 17:50:15 UTC
diff --git a/learning/katas/java/Triggers/Early Triggers/Early Triggers/task-info.yaml b/learning/katas/java/Triggers/Early Triggers/Early Triggers/task-info.yaml
new file mode 100644
index 0000000..e22c28c
--- /dev/null
+++ b/learning/katas/java/Triggers/Early Triggers/Early Triggers/task-info.yaml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/triggers/earlytriggers/GenerateEvent.java
+ visible: true
+- name: src/org/apache/beam/learning/katas/triggers/earlytriggers/Task.java
+ visible: true
+ placeholders:
+ - offset: 1970
+ length: 461
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/triggers/earlytriggers/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Triggers/Early Triggers/Early Triggers/task-remote-info.yaml b/learning/katas/java/Triggers/Early Triggers/Early Triggers/task-remote-info.yaml
new file mode 100644
index 0000000..b803384
--- /dev/null
+++ b/learning/katas/java/Triggers/Early Triggers/Early Triggers/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 753146
+update_date: Wed, 19 Jun 2019 05:52:11 UTC
diff --git a/learning/katas/java/Triggers/Early Triggers/lesson-info.yaml b/learning/katas/java/Triggers/Early Triggers/lesson-info.yaml
new file mode 100644
index 0000000..184f82e
--- /dev/null
+++ b/learning/katas/java/Triggers/Early Triggers/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Early Triggers
diff --git a/learning/katas/java/Triggers/Early Triggers/lesson-remote-info.yaml b/learning/katas/java/Triggers/Early Triggers/lesson-remote-info.yaml
new file mode 100644
index 0000000..e8483a3
--- /dev/null
+++ b/learning/katas/java/Triggers/Early Triggers/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237763
+update_date: Wed, 19 Jun 2019 05:52:03 UTC
+unit: 210095
diff --git a/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task-info.yaml b/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task-info.yaml
new file mode 100644
index 0000000..c66ccc3
--- /dev/null
+++ b/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task-info.yaml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/triggers/eventtimetriggers/Task.java
+ visible: true
+ placeholders:
+ - offset: 1905
+ length: 334
+ placeholder_text: TODO()
+- name: src/org/apache/beam/learning/katas/triggers/eventtimetriggers/GenerateEvent.java
+ visible: true
+- name: test/org/apache/beam/learning/katas/triggers/eventtimetriggers/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task-remote-info.yaml b/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task-remote-info.yaml
new file mode 100644
index 0000000..8e5e9a6
--- /dev/null
+++ b/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 753145
+update_date: Wed, 19 Jun 2019 05:51:57 UTC
diff --git a/learning/katas/java/Triggers/Event Time Triggers/lesson-info.yaml b/learning/katas/java/Triggers/Event Time Triggers/lesson-info.yaml
new file mode 100644
index 0000000..e423635
--- /dev/null
+++ b/learning/katas/java/Triggers/Event Time Triggers/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Event Time Triggers
diff --git a/learning/katas/java/Triggers/Event Time Triggers/lesson-remote-info.yaml b/learning/katas/java/Triggers/Event Time Triggers/lesson-remote-info.yaml
new file mode 100644
index 0000000..220a642
--- /dev/null
+++ b/learning/katas/java/Triggers/Event Time Triggers/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237762
+update_date: Wed, 19 Jun 2019 05:51:48 UTC
+unit: 210094
diff --git a/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task-info.yaml b/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task-info.yaml
new file mode 100644
index 0000000..73124eb
--- /dev/null
+++ b/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task-info.yaml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/triggers/windowaccummode/GenerateEvent.java
+ visible: true
+- name: src/org/apache/beam/learning/katas/triggers/windowaccummode/Task.java
+ visible: true
+ placeholders:
+ - offset: 1972
+ length: 471
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/triggers/windowaccummode/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task-remote-info.yaml b/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task-remote-info.yaml
new file mode 100644
index 0000000..e6520db
--- /dev/null
+++ b/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 753147
+update_date: Wed, 19 Jun 2019 05:52:24 UTC
diff --git a/learning/katas/java/Triggers/Window Accumulation Mode/lesson-info.yaml b/learning/katas/java/Triggers/Window Accumulation Mode/lesson-info.yaml
new file mode 100644
index 0000000..8a260af
--- /dev/null
+++ b/learning/katas/java/Triggers/Window Accumulation Mode/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Window Accumulation Mode
diff --git a/learning/katas/java/Triggers/Window Accumulation Mode/lesson-remote-info.yaml b/learning/katas/java/Triggers/Window Accumulation Mode/lesson-remote-info.yaml
new file mode 100644
index 0000000..65910cb
--- /dev/null
+++ b/learning/katas/java/Triggers/Window Accumulation Mode/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237764
+update_date: Wed, 19 Jun 2019 05:52:17 UTC
+unit: 210096
diff --git a/learning/katas/java/Triggers/section-info.yaml b/learning/katas/java/Triggers/section-info.yaml
new file mode 100644
index 0000000..f62f316
--- /dev/null
+++ b/learning/katas/java/Triggers/section-info.yaml
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Event Time Triggers
+- Early Triggers
+- Window Accumulation Mode
diff --git a/learning/katas/java/Triggers/section-remote-info.yaml b/learning/katas/java/Triggers/section-remote-info.yaml
new file mode 100644
index 0000000..c9311e9
--- /dev/null
+++ b/learning/katas/java/Triggers/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 88157
+update_date: Wed, 19 Jun 2019 05:51:45 UTC
diff --git a/learning/katas/java/Windowing/Adding Timestamp/ParDo/task-info.yaml b/learning/katas/java/Windowing/Adding Timestamp/ParDo/task-info.yaml
new file mode 100644
index 0000000..b31d737
--- /dev/null
+++ b/learning/katas/java/Windowing/Adding Timestamp/ParDo/task-info.yaml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/windowing/addingtimestamp/pardo/Event.java
+ visible: true
+- name: src/org/apache/beam/learning/katas/windowing/addingtimestamp/pardo/Task.java
+ visible: true
+ placeholders:
+ - offset: 2249
+ length: 241
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/windowing/addingtimestamp/pardo/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml b/learning/katas/java/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml
new file mode 100644
index 0000000..580180a
--- /dev/null
+++ b/learning/katas/java/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 753142
+update_date: Sun, 16 Jun 2019 15:28:25 UTC
diff --git a/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task-info.yaml b/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task-info.yaml
new file mode 100644
index 0000000..a5933ec
--- /dev/null
+++ b/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task-info.yaml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/windowing/addingtimestamp/withtimestamps/Event.java
+ visible: true
+- name: src/org/apache/beam/learning/katas/windowing/addingtimestamp/withtimestamps/Task.java
+ visible: true
+ placeholders:
+ - offset: 2223
+ length: 69
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/windowing/addingtimestamp/withtimestamps/TaskTest.java
+ visible: false
diff --git a/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task-remote-info.yaml b/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task-remote-info.yaml
new file mode 100644
index 0000000..d30cf9b
--- /dev/null
+++ b/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 753143
+update_date: Sun, 16 Jun 2019 15:28:27 UTC
diff --git a/learning/katas/java/Windowing/Adding Timestamp/lesson-info.yaml b/learning/katas/java/Windowing/Adding Timestamp/lesson-info.yaml
new file mode 100644
index 0000000..c6a234c
--- /dev/null
+++ b/learning/katas/java/Windowing/Adding Timestamp/lesson-info.yaml
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- ParDo
+- WithTimestamps
diff --git a/learning/katas/java/Windowing/Adding Timestamp/lesson-remote-info.yaml b/learning/katas/java/Windowing/Adding Timestamp/lesson-remote-info.yaml
new file mode 100644
index 0000000..b53679e
--- /dev/null
+++ b/learning/katas/java/Windowing/Adding Timestamp/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237760
+update_date: Thu, 01 Jan 1970 00:00:00 UTC
+unit: 210092
diff --git a/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task-info.yaml b/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task-info.yaml
new file mode 100644
index 0000000..546807c
--- /dev/null
+++ b/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task-info.yaml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: src/org/apache/beam/learning/katas/windowing/fixedwindow/Task.java
+ visible: true
+ placeholders:
+ - offset: 2906
+ length: 112
+ placeholder_text: TODO()
+- name: test/org/apache/beam/learning/katas/windowing/fixedwindow/TaskTest.java
+ visible: false
+- name: test/org/apache/beam/learning/katas/windowing/fixedwindow/WindowedEvent.java
+ visible: false
diff --git a/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml b/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml
new file mode 100644
index 0000000..1547b4d
--- /dev/null
+++ b/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 753144
+update_date: Sun, 16 Jun 2019 15:28:32 UTC
diff --git a/learning/katas/java/Windowing/Fixed Time Window/lesson-info.yaml b/learning/katas/java/Windowing/Fixed Time Window/lesson-info.yaml
new file mode 100644
index 0000000..9f65c8a
--- /dev/null
+++ b/learning/katas/java/Windowing/Fixed Time Window/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Fixed Time Window
diff --git a/learning/katas/java/Windowing/Fixed Time Window/lesson-remote-info.yaml b/learning/katas/java/Windowing/Fixed Time Window/lesson-remote-info.yaml
new file mode 100644
index 0000000..f2ff2fd
--- /dev/null
+++ b/learning/katas/java/Windowing/Fixed Time Window/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 237761
+update_date: Thu, 01 Jan 1970 00:00:00 UTC
+unit: 210093
diff --git a/learning/katas/java/Windowing/section-info.yaml b/learning/katas/java/Windowing/section-info.yaml
new file mode 100644
index 0000000..e5121f4
--- /dev/null
+++ b/learning/katas/java/Windowing/section-info.yaml
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Adding Timestamp
+- Fixed Time Window
diff --git a/learning/katas/java/Windowing/section-remote-info.yaml b/learning/katas/java/Windowing/section-remote-info.yaml
new file mode 100644
index 0000000..e476c4e
--- /dev/null
+++ b/learning/katas/java/Windowing/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 88156
+update_date: Sun, 16 Jun 2019 15:28:11 UTC
diff --git a/learning/katas/java/build.gradle b/learning/katas/java/build.gradle
index 4c962ec..2397072 100644
--- a/learning/katas/java/build.gradle
+++ b/learning/katas/java/build.gradle
@@ -18,14 +18,14 @@
buildscript {
ext {
- beamVersion = '2.13.0'
- guavaVersion = '27.1-jre'
- jodaTimeVersion = '2.10.3'
- slf4jVersion = '1.7.26'
- log4jSlf4jImpl = '2.11.2'
+ beamVersion = '2.16.0'
+ guavaVersion = '28.1-jre'
+ jodaTimeVersion = '2.10.4'
+ slf4jVersion = '1.7.28'
+ log4jSlf4jImpl = '2.12.1'
- assertjVersion = '3.12.2'
- hamcrestVersion = '1.3'
+ assertjVersion = '3.13.2'
+ hamcrestVersion = '2.1'
junitVersion = '4.12'
}
@@ -113,6 +113,6 @@
}
}
-task wrapper(type: Wrapper) {
- gradleVersion = '4.8'
+wrapper {
+ gradleVersion = '5.0'
}
diff --git a/learning/katas/java/course-info.yaml b/learning/katas/java/course-info.yaml
new file mode 100644
index 0000000..971fb91
--- /dev/null
+++ b/learning/katas/java/course-info.yaml
@@ -0,0 +1,33 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+title: Beam Katas - Java
+language: English
+summary: "This course provides a series of katas to get familiar with Apache Beam.\
+ \ \n\nApache Beam website – https://beam.apache.org/"
+programming_language: Java
+programming_language_version: 8
+content:
+- Introduction
+- Core Transforms
+- Common Transforms
+- IO
+- Windowing
+- Triggers
+- Examples
diff --git a/learning/katas/java/course-remote-info.yaml b/learning/katas/java/course-remote-info.yaml
new file mode 100644
index 0000000..e2b8f75
--- /dev/null
+++ b/learning/katas/java/course-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 54530
+update_date: Sun, 27 Oct 2019 16:05:43 UTC
diff --git a/learning/katas/python/.idea/study_project.xml b/learning/katas/python/.idea/study_project.xml
deleted file mode 100644
index 84e3db9..0000000
--- a/learning/katas/python/.idea/study_project.xml
+++ /dev/null
@@ -1,2317 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
- <component name="StudySettings">
- <StudyTaskManager>
- <option name="VERSION" value="14" />
- <option name="myUserTests">
- <map />
- </option>
- <option name="course">
- <EduCourse>
- <option name="authors">
- <list>
- <StepikUserInfo>
- <option name="firstName" value="Henry" />
- <option name="id" value="48485817" />
- <option name="lastName" value="Suryawirawan" />
- </StepikUserInfo>
- </list>
- </option>
- <option name="compatible" value="true" />
- <option name="courseMode" value="Course Creator" />
- <option name="createDate" value="1557824500323" />
- <option name="customPresentableName" />
- <option name="description" value="This course provides a series of katas to get familiar with Apache Beam. Apache Beam website – https://beam.apache.org/" />
- <option name="environment" value="" />
- <option name="fromZip" value="false" />
- <option name="id" value="54532" />
- <option name="index" value="-1" />
- <option name="instructors">
- <list>
- <option value="48485817" />
- </list>
- </option>
- <option name="language" value="Python 2.7" />
- <option name="languageCode" value="en" />
- <option name="name" value="Beam Katas - Python" />
- <option name="public" value="true" />
- <option name="sectionIds">
- <list />
- </option>
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="type" value="pycharm11 Python 2.7" />
- <option name="updateDate" value="1560937766000" />
- <option name="items">
- <list>
- <Section>
- <option name="courseId" value="54532" />
- <option name="customPresentableName" />
- <option name="id" value="85644" />
- <option name="index" value="1" />
- <option name="name" value="Introduction" />
- <option name="position" value="0" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1559325495000" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238426" />
- <option name="index" value="1" />
- <option name="name" value="Hello Beam" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560937886298" />
- <option name="unitId" value="210886" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Hello Beam Pipeline</h2> <p> Apache Beam is an open source, unified model for defining both batch and streaming data-parallel processing pipelines. Using one of the open source Beam SDKs, you build a program that defines the pipeline. The pipeline is then executed by one of Beam’s supported distributed processing back-ends, which include Apache Apex, Apache Flink, Apache Spark, and Google Cloud Dataflow. </p> <p> Beam is particularly useful for Embarrassingly Parallel data processing tasks, in which the problem can be decomposed into many smaller bundles of data that can be processed independently and in parallel. You can also use Beam for Extract, Transform, and Load (ETL) tasks and pure data integration. These tasks are useful for moving data between different storage media and data sources, transforming data into a more desirable format, or loading data onto a new system. </p> <p> To learn more about Apache Beam, refer to <a href="https://beam.apache.org/get-started/beam-overview/">Apache Beam Overview</a>. </p> <p> <b>Kata:</b> Your first kata is to create a simple pipeline that takes a hardcoded input element "Hello Beam". 
</p> <br> <div class="hint"> Hardcoded input can be created using <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Create"> Create</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#creating-pcollection-in-memory"> "Creating a PCollection from in-memory data"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755575" />
- <option name="index" value="1" />
- <option name="name" value="Hello Beam" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="903" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.Create(['Hello Beam'])" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560937891911" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- <Section>
- <option name="courseId" value="54532" />
- <option name="customPresentableName" />
- <option name="id" value="85645" />
- <option name="index" value="2" />
- <option name="name" value="Core Transforms" />
- <option name="position" value="0" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560432551000" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238427" />
- <option name="index" value="1" />
- <option name="name" value="Map" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560937929994" />
- <option name="unitId" value="210887" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>ParDo</h2> <p> ParDo is a Beam transform for generic parallel processing. The ParDo processing paradigm is similar to the “Map” phase of a Map/Shuffle/Reduce-style algorithm: a ParDo transform considers each element in the input PCollection, performs some processing function (your user code) on that element, and emits zero, one, or multiple elements to an output PCollection. </p> <p> <b>Kata:</b> Please write a simple ParDo that maps the input element by multiplying it by 10. </p> <br> <div class="hint"> Override <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn.process"> process</a> method. </div> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.ParDo"> ParDo</a> with <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn">DoFn</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#pardo">"ParDo"</a> section for more information. 
</div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755577" />
- <option name="index" value="1" />
- <option name="name" value="ParDo" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="919" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="def process(self, element): yield element * 10" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1036" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.ParDo(MultiplyByTenDoFn())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560937936091" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>ParDo OneToMany</h2> <p> <b>Kata:</b> Please write a ParDo that maps each input sentence into words tokenized by whitespace (" "). </p> <br> <div class="hint"> Override <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn.process"> process</a> method. You can return an Iterable for multiple elements or call "yield" for each element to return a generator. </div> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.ParDo"> ParDo</a> with <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn"> DoFn</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#pardo">"ParDo"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755578" />
- <option name="index" value="2" />
- <option name="name" value="ParDo OneToMany" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Info and Content changed" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="920" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="def process(self, element): return element.split()" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1057" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.ParDo(BreakIntoWordsDoFn())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560937938522" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>MapElements</h2> <p> The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. </p> <p> <b>Kata:</b> Implement a simple map function that multiplies all input elements by 5 using <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Map"> Map</a>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Map"> Map</a> with a lambda. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#lightweight-dofns"> "Lightweight DoFns and other abstractions"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755579" />
- <option name="index" value="3" />
- <option name="name" value="Map" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="942" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.Map(lambda num: num * 5)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560937942178" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>FlatMapElements</h2> <p> The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. </p> <p> FlatMap can be used to simplify DoFn that maps an element to multiple elements (one to many). </p> <p> <b>Kata:</b> Implement a function that maps each input sentence into words tokenized by whitespace (" ") using <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.FlatMap"> FlatMap</a>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.FlatMap"> FlatMap</a> with a lambda. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#lightweight-dofns"> "Lightweight DoFns and other abstractions"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755580" />
- <option name="index" value="4" />
- <option name="name" value="FlatMap" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="968" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.FlatMap(lambda sentence: sentence.split())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560937944601" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238428" />
- <option name="index" value="2" />
- <option name="name" value="GroupByKey" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560937980839" />
- <option name="unitId" value="210888" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>GroupByKey</h2> <p> GroupByKey is a Beam transform for processing collections of key/value pairs. It’s a parallel reduction operation, analogous to the Shuffle phase of a Map/Shuffle/Reduce-style algorithm. The input to GroupByKey is a collection of key/value pairs that represents a multimap, where the collection contains multiple pairs that have the same key, but different values. Given such a collection, you use GroupByKey to collect all of the values associated with each unique key. </p> <p> <b>Kata:</b> Implement a <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.GroupByKey"> GroupByKey</a> transform that groups words by its first letter. </p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.GroupByKey">GroupByKey</a> to solve this problem. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#groupbykey"> "GroupByKey"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755582" />
- <option name="index" value="1" />
- <option name="name" value="GroupByKey" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="8" />
- <option name="offset" value="970" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="| TODO()" />
- <option name="possibleAnswer" value="| beam.Map(lambda word: (word[0], word)) | beam.GroupByKey()" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560937986273" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238429" />
- <option name="index" value="3" />
- <option name="name" value="CoGroupByKey" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560938006360" />
- <option name="unitId" value="210889" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>CoGroupByKey</h2> <p> CoGroupByKey performs a relational join of two or more key/value PCollections that have the same key type. </p> <p> <b>Kata:</b> Implement a <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.CoGroupByKey"> CoGroupByKey</a> transform that join words by its first alphabetical letter, and then produces the string representation of the WordsAlphabet model. </p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.CoGroupByKey"> CoGroupByKey</a>to solve this problem. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#cogroupbykey"> "CoGroupByKey"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755583" />
- <option name="index" value="1" />
- <option name="name" value="CoGroupByKey" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1228" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="def map_to_alphabet_kv(word): return (word[0], word) def cogbk_result_to_wordsalphabet(cgbk_result): (alphabet, words) = cgbk_result return WordsAlphabet(alphabet, words['fruits'][0], words['countries'][0]) fruits_kv = (fruits | 'Fruit to KV' >> beam.Map(map_to_alphabet_kv)) countries_kv = (countries | 'Country to KV' >> beam.Map(map_to_alphabet_kv)) return ({'fruits': fruits_kv, 'countries': countries_kv} | beam.CoGroupByKey() | beam.Map(cogbk_result_to_wordsalphabet))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938011025" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238430" />
- <option name="index" value="4" />
- <option name="name" value="Combine" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938016807" />
- <option name="unitId" value="210890" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Combine - Simple Function</h2> <p> Combine is a Beam transform for combining collections of elements or values in your data. When you apply a Combine transform, you must provide the function that contains the logic for combining the elements or values. The combining function should be commutative and associative, as the function is not necessarily invoked exactly once on all values with a given key. Because the input data (including the value collection) may be distributed across multiple workers, the combining function might be called multiple times to perform partial combining on subsets of the value collection. </p> <p> Simple combine operations, such as sums, can usually be implemented as a simple function. </p> <p> <b>Kata:</b> Implement the summation of numbers using <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.CombineGlobally"> CombineGlobally</a>. </p> <br> <div class="hint"> Implement a simple Python function that performs the summation of the values. 
</div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#simple-combines"> "Simple combinations using simple functions"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755584" />
- <option name="index" value="1" />
- <option name="name" value="Simple Function" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="900" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="total = 0 for num in numbers: total += num return total" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1036" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.CombineGlobally(sum)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938025042" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Combine - CombineFn</h2> <p> Combine is a Beam transform for combining collections of elements or values in your data. When you apply a Combine transform, you must provide the function that contains the logic for combining the elements or values. The combining function should be commutative and associative, as the function is not necessarily invoked exactly once on all values with a given key. Because the input data (including the value collection) may be distributed across multiple workers, the combining function might be called multiple times to perform partial combining on subsets of the value collection. </p> <p> Complex combination operations might require you to create a subclass of CombineFn that has an accumulation type distinct from the input/output type. You should use CombineFn if the combine function requires a more sophisticated accumulator, must perform additional pre- or post-processing, might change the output type, or takes the key into account. 
</p> <p> <b>Kata:</b> Implement the average of numbers using <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.CombineFn"> Combine.CombineFn</a>. </p> <br> <div class="hint"> Extend the <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.CombineFn"> CombineFn</a> class that counts the average of the number. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#advanced-combines"> "Advanced combinations using CombineFn"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755585" />
- <option name="index" value="2" />
- <option name="name" value="CombineFn" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="916" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="def create_accumulator(self): return 0.0, 0 def add_input(self, accumulator, element): (sum, count) = accumulator return sum + element, count + 1 def merge_accumulators(self, accumulators): sums, counts = zip(*accumulators) return sum(sums), sum(counts) def extract_output(self, accumulator): (sum, count) = accumulator return sum / count if count else float('NaN')" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1420" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.CombineGlobally(AverageFn())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938027519" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Combine - Combine PerKey</h2> <p> After creating a keyed PCollection (for example, by using a GroupByKey transform), a common pattern is to combine the collection of values associated with each key into a single, merged value. This pattern of a GroupByKey followed by merging the collection of values is equivalent to Combine PerKey transform. The combine function you supply to Combine PerKey must be an associative reduction function or a subclass of CombineFn. </p> <p> <b>Kata:</b> Implement the sum of scores per player using <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.CombinePerKey"> CombinePerKey</a>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.CombinePerKey"> CombinePerKey(CombineFn)</a>. </div> <div class="hint"> Extend the <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.CombineFn"> CombineFn</a> class that counts the sum of the number. 
</div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#combining-values-in-a-keyed-pcollection"> "Combining values in a keyed PCollection"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755587" />
- <option name="index" value="3" />
- <option name="name" value="Combine PerKey" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1088" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.CombinePerKey(sum)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938030159" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238431" />
- <option name="index" value="5" />
- <option name="name" value="Flatten" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938036123" />
- <option name="unitId" value="210891" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Flatten</h2> <p> Flatten is a Beam transform for PCollection objects that store the same data type. Flatten merges multiple PCollection objects into a single logical PCollection. </p> <p> <b>Kata:</b> Implement a <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Flatten"> Flatten</a> transform that merges two PCollection of words into a single PCollection. </p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Flatten"> Flatten</a> to solve this problem. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#flatten"> "Flatten"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755588" />
- <option name="index" value="1" />
- <option name="name" value="Flatten" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1140" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.Flatten()" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938041998" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238432" />
- <option name="index" value="6" />
- <option name="name" value="Partition" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938052303" />
- <option name="unitId" value="210892" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Partition</h2> <p> Partition is a Beam transform for PCollection objects that store the same data type. Partition splits a single PCollection into a fixed number of smaller collections. </p> <p> Partition divides the elements of a PCollection according to a partitioning function that you provide. The partitioning function contains the logic that determines how to split up the elements of the input PCollection into each resulting partition PCollection. </p> <p> <b>Kata:</b> Implement a <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Partition"> Partition</a> transform that splits a PCollection of numbers into two PCollections. The first PCollection contains numbers greater than 100, and the second PCollection contains the remaining numbers. </p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Partition"> Partition</a> to solve this problem. 
</div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#partition"> "Partition"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755589" />
- <option name="index" value="1" />
- <option name="name" value="Partition" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="924" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="if number > 100: return 0 else: return 1" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1087" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.Partition(partition_fn, 2)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938058938" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238433" />
- <option name="index" value="7" />
- <option name="name" value="Side Input" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938065022" />
- <option name="unitId" value="210893" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Side Input</h2> <p> In addition to the main input PCollection, you can provide additional inputs to a ParDo transform in the form of side inputs. A side input is an additional input that your DoFn can access each time it processes an element in the input PCollection. When you specify a side input, you create a view of some other data that can be read from within the ParDo transform’s DoFn while processing each element. </p> <p> Side inputs are useful if your ParDo needs to inject additional data when processing each element in the input PCollection, but the additional data needs to be determined at runtime (and not hard-coded). Such values might be determined by the input data, or depend on a different branch of your pipeline. </p> <p> <b>Kata:</b> Please enrich each Person with the country based on the city he/she lives in. </p> <br> <div class="hint"> Override <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn.process"> process</a> method that also accepts side input argument. 
</div> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.ParDo"> ParDo</a> with <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn"> DoFn</a> that accepts side input. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#side-inputs">"Side inputs"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755590" />
- <option name="index" value="1" />
- <option name="name" value="Side Input" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1534" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="def process(self, element, cities_to_countries): yield Person(element.name, element.city, cities_to_countries[element.city])" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="2096" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.ParDo(EnrichCountryDoFn(), cities_to_countries)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938069904" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238434" />
- <option name="index" value="8" />
- <option name="name" value="Side Output" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938076976" />
- <option name="unitId" value="210894" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Side Output</h2> <p> While ParDo always produces a main output PCollection (as the return value from apply), you can also have your ParDo produce any number of additional output PCollections. If you choose to have multiple outputs, your ParDo returns all of the output PCollections (including the main output) bundled together. </p> <p> <b>Kata:</b> Implement additional output to your ParDo for numbers bigger than 100. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.pvalue.html#apache_beam.pvalue.TaggedOutput"> pvalue.TaggedOutput</a> and <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.ParDo.with_outputs"> .with_outputs</a> to output multiple tagged-outputs in a <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.ParDo"> ParDo.</a> </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#additional-outputs"> "Additional outputs"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755591" />
- <option name="index" value="1" />
- <option name="name" value="Side Output" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1011" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="def process(self, element): if element <= 100: yield element else: yield pvalue.TaggedOutput(num_above_100_tag, element)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1264" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.ParDo(ProcessNumbersDoFn()) .with_outputs(num_above_100_tag, main=num_below_100_tag))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938083234" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238435" />
- <option name="index" value="9" />
- <option name="name" value="Branching" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938090650" />
- <option name="unitId" value="210895" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Branching</h2> <p> You can use the same PCollection as input for multiple transforms without consuming the input or altering it. </p> <p> <b>Kata:</b> Branch out the numbers to two different transforms: one transform is multiplying each number by 5 and the other transform is multiplying each number by 10. </p> <br> <div class="hint"> Refer to the Beam Design Your Pipeline Guide <a href="https://beam.apache.org/documentation/pipelines/design-your-pipeline/#multiple-transforms-process-the-same-pcollection"> "Multiple transforms process the same PCollection"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755592" />
- <option name="index" value="1" />
- <option name="name" value="Branching" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="945" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="numbers | beam.Map(lambda num: num * 5)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1002" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="numbers | beam.Map(lambda num: num * 10)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938095634" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238436" />
- <option name="index" value="10" />
- <option name="name" value="Composite Transform" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938102699" />
- <option name="unitId" value="210896" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Composite Transform</h2> <p> Transforms can have a nested structure, where a complex transform performs multiple simpler transforms (such as more than one ParDo, Combine, GroupByKey, or even other composite transforms). These transforms are called composite transforms. Nesting multiple transforms inside a single composite transform can make your code more modular and easier to understand. </p> <p> To create your own composite transform, create a subclass of the PTransform class and override the expand method to specify the actual processing logic. You can then use this transform just as you would a built-in transform from the Beam SDK. Within your PTransform subclass, you’ll need to override the expand method. The expand method is where you add the processing logic for the PTransform. Your override of expand must accept the appropriate type of input PCollection as a parameter, and specify the output PCollection as the return value. </p> <p> <b>Kata:</b> Please implement a composite transform "ExtractAndMultiplyNumbers" that extracts numbers from comma separated line and then multiplies each number by 10. 
</p> <br> <div class="hint"> Refer to <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform"> PTransform</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#composite-transforms"> "Composite transforms"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755593" />
- <option name="index" value="1" />
- <option name="name" value="Composite Transform" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="920" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="def expand(self, pcoll): return (pcoll | beam.FlatMap(lambda line: map(int, line.split(','))) | beam.Map(lambda num: num * 10) )" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1179" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="ExtractAndMultiplyNumbers()" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938107880" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- <Section>
- <option name="courseId" value="54532" />
- <option name="customPresentableName" />
- <option name="id" value="85646" />
- <option name="index" value="3" />
- <option name="name" value="Common Transforms" />
- <option name="position" value="0" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560431009000" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238437" />
- <option name="index" value="1" />
- <option name="name" value="Filter" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938208485" />
- <option name="unitId" value="210897" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Filter using ParDo</h2> <p> <b>Kata:</b> Implement a filter function that filters out the even numbers by using <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.ParDo"> ParDo</a>. </p> <br> <div class="hint"> Override <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn.process"> process</a> method. You can use "yield" for each intended element. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755595" />
- <option name="index" value="1" />
- <option name="name" value="ParDo" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="942" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="def process(self, element): if element % 2 == 1: yield element" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938213611" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Filter</h2> <p> The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. </p> <p> <b>Kata:</b> Implement a filter function that filters out the odd numbers by using <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Filter"> Filter</a>. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Filter"> Filter</a> with a lambda. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755596" />
- <option name="index" value="2" />
- <option name="name" value="Filter" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="934" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.Filter(lambda num: num % 2 == 0)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938217127" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238438" />
- <option name="index" value="2" />
- <option name="name" value="Aggregation" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938223924" />
- <option name="unitId" value="210898" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Aggregation - Count</h2> <p> <b>Kata:</b> Count the number of elements from an input. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.combiners.html#apache_beam.transforms.combiners.Count"> Count</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755597" />
- <option name="index" value="1" />
- <option name="name" value="Count" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="934" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.combiners.Count.Globally()" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938230679" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Aggregation - Sum</h2> <p> <b>Kata:</b> Compute the sum of all elements from an input. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.CombineGlobally"> CombineGlobally</a> and Python built-in <a href="https://docs.python.org/2/library/functions.html#sum">sum</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755598" />
- <option name="index" value="2" />
- <option name="name" value="Sum" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="934" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.CombineGlobally(sum)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938232928" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Aggregation - Mean</h2> <p> <b>Kata:</b> Compute the mean/average of all elements from an input. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.combiners.html#apache_beam.transforms.combiners.Mean"> Mean</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755599" />
- <option name="index" value="3" />
- <option name="name" value="Mean" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="934" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.combiners.Mean.Globally()" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938235730" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Aggregation - Smallest</h2> <p> <b>Kata:</b> Compute the smallest of the elements from an input. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.combiners.html#apache_beam.transforms.combiners.Top.Smallest"> Top.Smallest</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755600" />
- <option name="index" value="4" />
- <option name="name" value="Smallest" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="934" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.combiners.Top.Smallest(1)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938237747" />
- </EduTask>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Aggregation - Largest</h2> <p> <b>Kata:</b> Compute the largest of the elements from an input. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.combiners.html#apache_beam.transforms.combiners.Top.Largest"> Top.Largest</a>. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755601" />
- <option name="index" value="5" />
- <option name="name" value="Largest" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="934" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.combiners.Top.Largest(1)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938239860" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- <Section>
- <option name="courseId" value="54532" />
- <option name="customPresentableName" />
- <option name="id" value="88017" />
- <option name="index" value="4" />
- <option name="name" value="IO" />
- <option name="position" value="5" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560436240000" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238439" />
- <option name="index" value="1" />
- <option name="name" value="TextIO" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938245888" />
- <option name="unitId" value="210899" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>ReadFromText</h2> <p> When you create a pipeline, you often need to read data from some external source, such as a file or a database. Likewise, you may want your pipeline to output its result data to an external storage system. Beam provides read and write transforms for a number of common data storage types. If you want your pipeline to read from or write to a data storage format that isn’t supported by the built-in transforms, you can implement your own read and write transforms. </p> <p> To read a PCollection from one or more text files, use beam.io.ReadFromText to instantiate a transform and specify the path of the file(s) to be read. </p> <p> <b>Kata:</b> Read the 'countries.txt' file and convert each country name into uppercase. </p> <br> <div class="hint"> Use <a href="https://beam.apache.org/releases/pydoc/current/apache_beam.io.textio.html#apache_beam.io.textio.ReadFromText"> beam.io.ReadFromText</a>. </div> <div class="hint"> Refer to the Beam Programming Guide <a href="https://beam.apache.org/documentation/programming-guide/#pipeline-io-reading-data"> "Reading input data"</a> section for more information. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755602" />
- <option name="index" value="1" />
- <option name="name" value="ReadFromText" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="919" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.io.ReadFromText(file_path)" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="1" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="956" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.Map(lambda country: country.upper())" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- <entry key="countries.txt">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="countries.txt" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938252130" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238440" />
- <option name="index" value="2" />
- <option name="name" value="Built-in IOs" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938258337" />
- <option name="unitId" value="210900" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Built-in I/Os</h2> <p> Beam SDKs provide many out of the box I/O transforms that can be used to read from many different sources and write to many different sinks. </p> <p> See the <a href="https://beam.apache.org/documentation/io/built-in/">Beam-provided I/O Transforms</a> page for a list of the currently available I/O transforms. </p> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755603" />
- <option name="index" value="1" />
- <option name="name" value="Built-in IOs" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="" />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938263697" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- <Section>
- <option name="courseId" value="54532" />
- <option name="customPresentableName" />
- <option name="id" value="85647" />
- <option name="index" value="5" />
- <option name="name" value="Examples" />
- <option name="position" value="0" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="updateDate" value="1560435414000" />
- <option name="items">
- <list>
- <Lesson>
- <option name="customPresentableName" />
- <option name="id" value="238441" />
- <option name="index" value="1" />
- <option name="name" value="Word Count" />
- <option name="stepikChangeStatus" value="Content changed" />
- <option name="updateDate" value="1560938269193" />
- <option name="unitId" value="210901" />
- <option name="items">
- <list>
- <EduTask>
- <option name="customPresentableName" />
- <option name="descriptionFormat" value="HTML" />
- <option name="descriptionText" value="<!-- ~ Licensed to the Apache Software Foundation (ASF) under one ~ or more contributor license agreements. See the NOTICE file ~ distributed with this work for additional information ~ regarding copyright ownership. The ASF licenses this file ~ to you under the Apache License, Version 2.0 (the ~ "License"); you may not use this file except in compliance ~ with the License. You may obtain a copy of the License at ~ ~ http://www.apache.org/licenses/LICENSE-2.0 ~ ~ Unless required by applicable law or agreed to in writing, software ~ distributed under the License is distributed on an "AS IS" BASIS, ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ~ See the License for the specific language governing permissions and ~ limitations under the License. --> <html> <h2>Word Count Pipeline</h2> <p> <b>Kata:</b> Create a pipeline that counts the number of words. </p> <p> Please output the count of each word in the following format: </p> <pre> word:count ball:5 book:3 </pre> <br> <div class="hint"> Refer to your katas above. </div> </html> " />
- <option name="feedbackLink">
- <FeedbackLink>
- <option name="link" />
- <option name="type" value="STEPIK" />
- </FeedbackLink>
- </option>
- <option name="id" value="755604" />
- <option name="index" value="1" />
- <option name="name" value="Word Count" />
- <option name="record" value="-1" />
- <option name="status" value="Unchecked" />
- <option name="stepikChangeStatus" value="Up to date" />
- <option name="files">
- <map>
- <entry key="task.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list>
- <AnswerPlaceholder>
- <option name="hints">
- <list />
- </option>
- <option name="index" value="0" />
- <option name="initialState" />
- <option name="initializedFromDependency" value="false" />
- <option name="length" value="6" />
- <option name="offset" value="1021" />
- <option name="placeholderDependency" />
- <option name="placeholderText" value="TODO()" />
- <option name="possibleAnswer" value="beam.FlatMap(lambda sentence: sentence.split()) | beam.combiners.Count.PerElement() | beam.Map(lambda (k, v): k + ":" + str(v))" />
- <option name="selected" value="false" />
- <option name="status" value="Unchecked" />
- <option name="studentAnswer" />
- <option name="useLength" value="false" />
- </AnswerPlaceholder>
- </list>
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="task.py" />
- <option name="text" value="# TODO: type solution here " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="true" />
- </TaskFile>
- </value>
- </entry>
- <entry key="tests.py">
- <value>
- <TaskFile>
- <option name="answerPlaceholders">
- <list />
- </option>
- <option name="highlightErrors" value="true" />
- <option name="name" value="tests.py" />
- <option name="text" value="from test_helper import run_common_tests, failed, passed, get_answer_placeholders def test_answer_placeholders(): placeholders = get_answer_placeholders() placeholder = placeholders[0] if placeholder == "": # TODO: your condition here passed() else: failed() if __name__ == '__main__': run_common_tests() # test_answer_placeholders() # TODO: uncomment test call " />
- <option name="trackChanges" value="true" />
- <option name="trackLengths" value="true" />
- <option name="visible" value="false" />
- </TaskFile>
- </value>
- </entry>
- </map>
- </option>
- <option name="updateDate" value="1560938273811" />
- </EduTask>
- </list>
- </option>
- </Lesson>
- </list>
- </option>
- </Section>
- </list>
- </option>
- </EduCourse>
- </option>
- </StudyTaskManager>
- </component>
-</project>
\ No newline at end of file
diff --git a/learning/katas/python/Common Transforms/Aggregation/Count/task-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Count/task-info.yaml
new file mode 100644
index 0000000..8259cde
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/Count/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 934
+ length: 31
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Common Transforms/Aggregation/Count/task-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Count/task-remote-info.yaml
new file mode 100644
index 0000000..49def6a
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/Count/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755597
+update_date: Wed, 19 Jun 2019 09:57:10 UTC
diff --git a/learning/katas/python/Common Transforms/Aggregation/Largest/task-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Largest/task-info.yaml
new file mode 100644
index 0000000..cdc5440
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/Largest/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 934
+ length: 29
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Common Transforms/Aggregation/Largest/task-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Largest/task-remote-info.yaml
new file mode 100644
index 0000000..6b85a20
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/Largest/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755601
+update_date: Wed, 19 Jun 2019 09:57:19 UTC
diff --git a/learning/katas/python/Common Transforms/Aggregation/Mean/task-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Mean/task-info.yaml
new file mode 100644
index 0000000..15c8e41
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/Mean/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 934
+ length: 30
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Common Transforms/Aggregation/Mean/task-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Mean/task-remote-info.yaml
new file mode 100644
index 0000000..8f6bbe1
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/Mean/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755599
+update_date: Wed, 19 Jun 2019 09:57:15 UTC
diff --git a/learning/katas/python/Common Transforms/Aggregation/Smallest/task-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Smallest/task-info.yaml
new file mode 100644
index 0000000..15c8e41
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/Smallest/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 934
+ length: 30
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Common Transforms/Aggregation/Smallest/task-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Smallest/task-remote-info.yaml
new file mode 100644
index 0000000..d4ff756
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/Smallest/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755600
+update_date: Wed, 19 Jun 2019 09:57:17 UTC
diff --git a/learning/katas/python/Common Transforms/Aggregation/Sum/task-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Sum/task-info.yaml
new file mode 100644
index 0000000..c9adc6d
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/Sum/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 934
+ length: 25
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Common Transforms/Aggregation/Sum/task-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Sum/task-remote-info.yaml
new file mode 100644
index 0000000..09b7fba
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/Sum/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755598
+update_date: Wed, 19 Jun 2019 09:57:12 UTC
diff --git a/learning/katas/python/Common Transforms/Aggregation/lesson-info.yaml b/learning/katas/python/Common Transforms/Aggregation/lesson-info.yaml
new file mode 100644
index 0000000..7a6744c
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/lesson-info.yaml
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Count
+- Sum
+- Mean
+- Smallest
+- Largest
diff --git a/learning/katas/python/Common Transforms/Aggregation/lesson-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/lesson-remote-info.yaml
new file mode 100644
index 0000000..d3a1750
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Aggregation/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238438
+update_date: Wed, 19 Jun 2019 09:57:03 UTC
+unit: 210898
diff --git a/learning/katas/python/Common Transforms/Filter/Filter/task-info.yaml b/learning/katas/python/Common Transforms/Filter/Filter/task-info.yaml
new file mode 100644
index 0000000..1c1c20d
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Filter/Filter/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 934
+ length: 37
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Common Transforms/Filter/Filter/task-remote-info.yaml b/learning/katas/python/Common Transforms/Filter/Filter/task-remote-info.yaml
new file mode 100644
index 0000000..76a0033
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Filter/Filter/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755596
+update_date: Wed, 19 Jun 2019 09:56:57 UTC
diff --git a/learning/katas/python/Common Transforms/Filter/ParDo/task-info.yaml b/learning/katas/python/Common Transforms/Filter/ParDo/task-info.yaml
new file mode 100644
index 0000000..5d0d5bb
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Filter/ParDo/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 942
+ length: 82
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Common Transforms/Filter/ParDo/task-remote-info.yaml b/learning/katas/python/Common Transforms/Filter/ParDo/task-remote-info.yaml
new file mode 100644
index 0000000..9d3d627
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Filter/ParDo/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755595
+update_date: Wed, 19 Jun 2019 09:56:53 UTC
diff --git a/learning/katas/python/Common Transforms/Filter/lesson-info.yaml b/learning/katas/python/Common Transforms/Filter/lesson-info.yaml
new file mode 100644
index 0000000..93f7b5a
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Filter/lesson-info.yaml
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- ParDo
+- Filter
diff --git a/learning/katas/python/Common Transforms/Filter/lesson-remote-info.yaml b/learning/katas/python/Common Transforms/Filter/lesson-remote-info.yaml
new file mode 100644
index 0000000..96fc4c3
--- /dev/null
+++ b/learning/katas/python/Common Transforms/Filter/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238437
+update_date: Wed, 19 Jun 2019 09:56:48 UTC
+unit: 210897
diff --git a/learning/katas/python/Common Transforms/section-info.yaml b/learning/katas/python/Common Transforms/section-info.yaml
new file mode 100644
index 0000000..2155c27
--- /dev/null
+++ b/learning/katas/python/Common Transforms/section-info.yaml
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Filter
+- Aggregation
diff --git a/learning/katas/python/Common Transforms/section-remote-info.yaml b/learning/katas/python/Common Transforms/section-remote-info.yaml
new file mode 100644
index 0000000..4f76ab5
--- /dev/null
+++ b/learning/katas/python/Common Transforms/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 85646
+update_date: Thu, 13 Jun 2019 13:03:29 UTC
diff --git a/learning/katas/python/Core Transforms/Branching/Branching/task-info.yaml b/learning/katas/python/Core Transforms/Branching/Branching/task-info.yaml
new file mode 100644
index 0000000..aa799df
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Branching/Branching/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 945
+ length: 39
+ placeholder_text: TODO()
+ - offset: 1002
+ length: 40
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Branching/Branching/task-remote-info.yaml b/learning/katas/python/Core Transforms/Branching/Branching/task-remote-info.yaml
new file mode 100644
index 0000000..2815154
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Branching/Branching/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755592
+update_date: Wed, 19 Jun 2019 09:54:55 UTC
diff --git a/learning/katas/python/Core Transforms/Branching/lesson-info.yaml b/learning/katas/python/Core Transforms/Branching/lesson-info.yaml
new file mode 100644
index 0000000..25ecc7c
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Branching/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Branching
diff --git a/learning/katas/python/Core Transforms/Branching/lesson-remote-info.yaml b/learning/katas/python/Core Transforms/Branching/lesson-remote-info.yaml
new file mode 100644
index 0000000..3848b9c
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Branching/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238435
+update_date: Wed, 19 Jun 2019 09:54:50 UTC
+unit: 210895
diff --git a/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task-info.yaml b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task-info.yaml
new file mode 100644
index 0000000..3e192e2
--- /dev/null
+++ b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 1228
+ length: 541
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml
new file mode 100644
index 0000000..6a0305e
--- /dev/null
+++ b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755583
+update_date: Wed, 19 Jun 2019 09:53:31 UTC
diff --git a/learning/katas/python/Core Transforms/CoGroupByKey/lesson-info.yaml b/learning/katas/python/Core Transforms/CoGroupByKey/lesson-info.yaml
new file mode 100644
index 0000000..273c077
--- /dev/null
+++ b/learning/katas/python/Core Transforms/CoGroupByKey/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- CoGroupByKey
diff --git a/learning/katas/python/Core Transforms/CoGroupByKey/lesson-remote-info.yaml b/learning/katas/python/Core Transforms/CoGroupByKey/lesson-remote-info.yaml
new file mode 100644
index 0000000..bdca1ad
--- /dev/null
+++ b/learning/katas/python/Core Transforms/CoGroupByKey/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238429
+update_date: Wed, 19 Jun 2019 09:53:26 UTC
+unit: 210889
diff --git a/learning/katas/python/Core Transforms/Combine/Combine PerKey/task-info.yaml b/learning/katas/python/Core Transforms/Combine/Combine PerKey/task-info.yaml
new file mode 100644
index 0000000..fcdb9c50
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Combine/Combine PerKey/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 1088
+ length: 23
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml b/learning/katas/python/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml
new file mode 100644
index 0000000..5d67292
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755587
+update_date: Wed, 19 Jun 2019 09:53:50 UTC
diff --git a/learning/katas/python/Core Transforms/Combine/CombineFn/task-info.yaml b/learning/katas/python/Core Transforms/Combine/CombineFn/task-info.yaml
new file mode 100644
index 0000000..1be0f5b
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Combine/CombineFn/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 916
+ length: 436
+ placeholder_text: TODO()
+ - offset: 1420
+ length: 33
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Combine/CombineFn/task-remote-info.yaml b/learning/katas/python/Core Transforms/Combine/CombineFn/task-remote-info.yaml
new file mode 100644
index 0000000..09210cf
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Combine/CombineFn/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755585
+update_date: Wed, 19 Jun 2019 09:53:47 UTC
diff --git a/learning/katas/python/Core Transforms/Combine/Simple Function/task-info.yaml b/learning/katas/python/Core Transforms/Combine/Simple Function/task-info.yaml
new file mode 100644
index 0000000..5fbd37f
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Combine/Simple Function/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 900
+ length: 73
+ placeholder_text: TODO()
+ - offset: 1036
+ length: 25
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Combine/Simple Function/task-remote-info.yaml b/learning/katas/python/Core Transforms/Combine/Simple Function/task-remote-info.yaml
new file mode 100644
index 0000000..073e5af
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Combine/Simple Function/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755584
+update_date: Wed, 19 Jun 2019 09:53:45 UTC
diff --git a/learning/katas/python/Core Transforms/Combine/lesson-info.yaml b/learning/katas/python/Core Transforms/Combine/lesson-info.yaml
new file mode 100644
index 0000000..899ab5d
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Combine/lesson-info.yaml
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Simple Function
+- CombineFn
+- Combine PerKey
diff --git a/learning/katas/python/Core Transforms/Combine/lesson-remote-info.yaml b/learning/katas/python/Core Transforms/Combine/lesson-remote-info.yaml
new file mode 100644
index 0000000..f778f59
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Combine/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238430
+update_date: Wed, 19 Jun 2019 09:53:36 UTC
+unit: 210890
diff --git a/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task-info.yaml b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task-info.yaml
new file mode 100644
index 0000000..727e22d
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 920
+ length: 184
+ placeholder_text: TODO()
+ - offset: 1179
+ length: 27
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml
new file mode 100644
index 0000000..d057902
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755593
+update_date: Wed, 19 Jun 2019 09:55:07 UTC
diff --git a/learning/katas/python/Core Transforms/Composite Transform/lesson-info.yaml b/learning/katas/python/Core Transforms/Composite Transform/lesson-info.yaml
new file mode 100644
index 0000000..177eab1
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Composite Transform/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Composite Transform
diff --git a/learning/katas/python/Core Transforms/Composite Transform/lesson-remote-info.yaml b/learning/katas/python/Core Transforms/Composite Transform/lesson-remote-info.yaml
new file mode 100644
index 0000000..d0e3a1b
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Composite Transform/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238436
+update_date: Wed, 19 Jun 2019 09:55:02 UTC
+unit: 210896
diff --git a/learning/katas/python/Core Transforms/Flatten/Flatten/task-info.yaml b/learning/katas/python/Core Transforms/Flatten/Flatten/task-info.yaml
new file mode 100644
index 0000000..4cb2da7
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Flatten/Flatten/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 1140
+ length: 14
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Flatten/Flatten/task-remote-info.yaml b/learning/katas/python/Core Transforms/Flatten/Flatten/task-remote-info.yaml
new file mode 100644
index 0000000..d441d1e
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Flatten/Flatten/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755588
+update_date: Wed, 19 Jun 2019 09:54:01 UTC
diff --git a/learning/katas/python/Core Transforms/Flatten/lesson-info.yaml b/learning/katas/python/Core Transforms/Flatten/lesson-info.yaml
new file mode 100644
index 0000000..fd01c86
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Flatten/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Flatten
diff --git a/learning/katas/python/Core Transforms/Flatten/lesson-remote-info.yaml b/learning/katas/python/Core Transforms/Flatten/lesson-remote-info.yaml
new file mode 100644
index 0000000..892a41d
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Flatten/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238431
+update_date: Wed, 19 Jun 2019 09:53:56 UTC
+unit: 210891
diff --git a/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task-info.yaml b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task-info.yaml
new file mode 100644
index 0000000..4151745
--- /dev/null
+++ b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 970
+ length: 63
+ placeholder_text: '| TODO()'
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml
new file mode 100644
index 0000000..e369f71
--- /dev/null
+++ b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755582
+update_date: Wed, 19 Jun 2019 09:53:06 UTC
diff --git a/learning/katas/python/Core Transforms/GroupByKey/lesson-info.yaml b/learning/katas/python/Core Transforms/GroupByKey/lesson-info.yaml
new file mode 100644
index 0000000..5de9eb6
--- /dev/null
+++ b/learning/katas/python/Core Transforms/GroupByKey/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- GroupByKey
diff --git a/learning/katas/python/Core Transforms/GroupByKey/lesson-remote-info.yaml b/learning/katas/python/Core Transforms/GroupByKey/lesson-remote-info.yaml
new file mode 100644
index 0000000..6401fb6
--- /dev/null
+++ b/learning/katas/python/Core Transforms/GroupByKey/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238428
+update_date: Wed, 19 Jun 2019 09:53:00 UTC
+unit: 210888
diff --git a/learning/katas/python/Core Transforms/Map/FlatMap/task-info.yaml b/learning/katas/python/Core Transforms/Map/FlatMap/task-info.yaml
new file mode 100644
index 0000000..60eb861
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Map/FlatMap/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 968
+ length: 47
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Map/FlatMap/task-remote-info.yaml b/learning/katas/python/Core Transforms/Map/FlatMap/task-remote-info.yaml
new file mode 100644
index 0000000..7b07812
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Map/FlatMap/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755580
+update_date: Wed, 19 Jun 2019 09:52:24 UTC
diff --git a/learning/katas/python/Core Transforms/Map/Map/task-info.yaml b/learning/katas/python/Core Transforms/Map/Map/task-info.yaml
new file mode 100644
index 0000000..271d8cb
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Map/Map/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 942
+ length: 29
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Map/Map/task-remote-info.yaml b/learning/katas/python/Core Transforms/Map/Map/task-remote-info.yaml
new file mode 100644
index 0000000..7a0fb73
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Map/Map/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755579
+update_date: Wed, 19 Jun 2019 09:52:22 UTC
diff --git a/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task-info.yaml b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task-info.yaml
new file mode 100644
index 0000000..9ebdc5e
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 920
+ length: 58
+ placeholder_text: TODO()
+ - offset: 1057
+ length: 32
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml
new file mode 100644
index 0000000..902905b
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755578
+update_date: Wed, 19 Jun 2019 09:52:18 UTC
diff --git a/learning/katas/python/Core Transforms/Map/ParDo/task-info.yaml b/learning/katas/python/Core Transforms/Map/ParDo/task-info.yaml
new file mode 100644
index 0000000..1d1767f
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Map/ParDo/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 919
+ length: 54
+ placeholder_text: TODO()
+ - offset: 1036
+ length: 31
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Map/ParDo/task-remote-info.yaml b/learning/katas/python/Core Transforms/Map/ParDo/task-remote-info.yaml
new file mode 100644
index 0000000..90ea335
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Map/ParDo/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755577
+update_date: Wed, 19 Jun 2019 09:52:16 UTC
diff --git a/learning/katas/python/Core Transforms/Map/lesson-info.yaml b/learning/katas/python/Core Transforms/Map/lesson-info.yaml
new file mode 100644
index 0000000..24ea3e3
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Map/lesson-info.yaml
@@ -0,0 +1,24 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- ParDo
+- ParDo OneToMany
+- Map
+- FlatMap
diff --git a/learning/katas/python/Core Transforms/Map/lesson-remote-info.yaml b/learning/katas/python/Core Transforms/Map/lesson-remote-info.yaml
new file mode 100644
index 0000000..3b52f9f
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Map/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238427
+update_date: Wed, 19 Jun 2019 09:52:09 UTC
+unit: 210887
diff --git a/learning/katas/python/Core Transforms/Partition/Partition/task-info.yaml b/learning/katas/python/Core Transforms/Partition/Partition/task-info.yaml
new file mode 100644
index 0000000..fb4e439
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Partition/Partition/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 924
+ length: 60
+ placeholder_text: TODO()
+ - offset: 1087
+ length: 31
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Partition/Partition/task-remote-info.yaml b/learning/katas/python/Core Transforms/Partition/Partition/task-remote-info.yaml
new file mode 100644
index 0000000..67f84c0
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Partition/Partition/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755589
+update_date: Wed, 19 Jun 2019 09:54:18 UTC
diff --git a/learning/katas/python/Core Transforms/Partition/lesson-info.yaml b/learning/katas/python/Core Transforms/Partition/lesson-info.yaml
new file mode 100644
index 0000000..c15773b2
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Partition/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Partition
diff --git a/learning/katas/python/Core Transforms/Partition/lesson-remote-info.yaml b/learning/katas/python/Core Transforms/Partition/lesson-remote-info.yaml
new file mode 100644
index 0000000..c46be4a
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Partition/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238432
+update_date: Wed, 19 Jun 2019 09:54:12 UTC
+unit: 210892
diff --git a/learning/katas/python/Core Transforms/Side Input/Side Input/task-info.yaml b/learning/katas/python/Core Transforms/Side Input/Side Input/task-info.yaml
new file mode 100644
index 0000000..4ab34f3
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Side Input/Side Input/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 1534
+ length: 153
+ placeholder_text: TODO()
+ - offset: 2096
+ length: 52
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Side Input/Side Input/task-remote-info.yaml b/learning/katas/python/Core Transforms/Side Input/Side Input/task-remote-info.yaml
new file mode 100644
index 0000000..ae8918e
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Side Input/Side Input/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755590
+update_date: Wed, 19 Jun 2019 09:54:29 UTC
diff --git a/learning/katas/python/Core Transforms/Side Input/lesson-info.yaml b/learning/katas/python/Core Transforms/Side Input/lesson-info.yaml
new file mode 100644
index 0000000..210e3b0
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Side Input/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Side Input
diff --git a/learning/katas/python/Core Transforms/Side Input/lesson-remote-info.yaml b/learning/katas/python/Core Transforms/Side Input/lesson-remote-info.yaml
new file mode 100644
index 0000000..a8304b3
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Side Input/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238433
+update_date: Wed, 19 Jun 2019 09:54:25 UTC
+unit: 210893
diff --git a/learning/katas/python/Core Transforms/Side Output/Side Output/task-info.yaml b/learning/katas/python/Core Transforms/Side Output/Side Output/task-info.yaml
new file mode 100644
index 0000000..5f65c7f
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Side Output/Side Output/task-info.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 1011
+ length: 160
+ placeholder_text: TODO()
+ - offset: 1264
+ length: 98
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Core Transforms/Side Output/Side Output/task-remote-info.yaml b/learning/katas/python/Core Transforms/Side Output/Side Output/task-remote-info.yaml
new file mode 100644
index 0000000..e2c5d33
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Side Output/Side Output/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755591
+update_date: Wed, 19 Jun 2019 09:54:43 UTC
diff --git a/learning/katas/python/Core Transforms/Side Output/lesson-info.yaml b/learning/katas/python/Core Transforms/Side Output/lesson-info.yaml
new file mode 100644
index 0000000..e9096c9
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Side Output/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Side Output
diff --git a/learning/katas/python/Core Transforms/Side Output/lesson-remote-info.yaml b/learning/katas/python/Core Transforms/Side Output/lesson-remote-info.yaml
new file mode 100644
index 0000000..9dc9d4d
--- /dev/null
+++ b/learning/katas/python/Core Transforms/Side Output/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238434
+update_date: Wed, 19 Jun 2019 09:54:36 UTC
+unit: 210894
diff --git a/learning/katas/python/Core Transforms/section-info.yaml b/learning/katas/python/Core Transforms/section-info.yaml
new file mode 100644
index 0000000..ce72010
--- /dev/null
+++ b/learning/katas/python/Core Transforms/section-info.yaml
@@ -0,0 +1,30 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Map
+- GroupByKey
+- CoGroupByKey
+- Combine
+- Flatten
+- Partition
+- Side Input
+- Side Output
+- Branching
+- Composite Transform
diff --git a/learning/katas/python/Core Transforms/section-remote-info.yaml b/learning/katas/python/Core Transforms/section-remote-info.yaml
new file mode 100644
index 0000000..51df567
--- /dev/null
+++ b/learning/katas/python/Core Transforms/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 85645
+update_date: Thu, 13 Jun 2019 13:29:11 UTC
diff --git a/learning/katas/python/Examples/Word Count/Word Count/task-info.yaml b/learning/katas/python/Examples/Word Count/Word Count/task-info.yaml
new file mode 100644
index 0000000..0eef4b3
--- /dev/null
+++ b/learning/katas/python/Examples/Word Count/Word Count/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 1021
+ length: 133
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Examples/Word Count/Word Count/task-remote-info.yaml b/learning/katas/python/Examples/Word Count/Word Count/task-remote-info.yaml
new file mode 100644
index 0000000..f3b4608
--- /dev/null
+++ b/learning/katas/python/Examples/Word Count/Word Count/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755604
+update_date: Wed, 19 Jun 2019 09:57:53 UTC
diff --git a/learning/katas/python/Examples/Word Count/lesson-info.yaml b/learning/katas/python/Examples/Word Count/lesson-info.yaml
new file mode 100644
index 0000000..cbe1d6f
--- /dev/null
+++ b/learning/katas/python/Examples/Word Count/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Word Count
diff --git a/learning/katas/python/Examples/Word Count/lesson-remote-info.yaml b/learning/katas/python/Examples/Word Count/lesson-remote-info.yaml
new file mode 100644
index 0000000..0fd1404
--- /dev/null
+++ b/learning/katas/python/Examples/Word Count/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238441
+update_date: Wed, 19 Jun 2019 09:57:49 UTC
+unit: 210901
diff --git a/learning/katas/python/Examples/section-info.yaml b/learning/katas/python/Examples/section-info.yaml
new file mode 100644
index 0000000..cbe1d6f
--- /dev/null
+++ b/learning/katas/python/Examples/section-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Word Count
diff --git a/learning/katas/python/Examples/section-remote-info.yaml b/learning/katas/python/Examples/section-remote-info.yaml
new file mode 100644
index 0000000..de5c439
--- /dev/null
+++ b/learning/katas/python/Examples/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 85647
+update_date: Thu, 13 Jun 2019 14:16:54 UTC
diff --git a/learning/katas/python/IO/Built-in IOs/Built-in IOs/task-info.yaml b/learning/katas/python/IO/Built-in IOs/Built-in IOs/task-info.yaml
new file mode 100644
index 0000000..45ce4ef
--- /dev/null
+++ b/learning/katas/python/IO/Built-in IOs/Built-in IOs/task-info.yaml
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml b/learning/katas/python/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml
new file mode 100644
index 0000000..c08b723
--- /dev/null
+++ b/learning/katas/python/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755603
+update_date: Wed, 19 Jun 2019 09:57:43 UTC
diff --git a/learning/katas/python/IO/Built-in IOs/Built-in IOs/task.html b/learning/katas/python/IO/Built-in IOs/Built-in IOs/task.html
index 55e369f..7d6cc8d 100644
--- a/learning/katas/python/IO/Built-in IOs/Built-in IOs/task.html
+++ b/learning/katas/python/IO/Built-in IOs/Built-in IOs/task.html
@@ -26,4 +26,8 @@
See the <a href="https://beam.apache.org/documentation/io/built-in/">Beam-provided I/O
Transforms</a> page for a list of the currently available I/O transforms.
</p>
+<p>
+ <b>Note:</b> There is no kata for this task. Please click the "Check" button and
+ proceed to the next task.
+</p>
</html>
diff --git a/learning/katas/python/IO/Built-in IOs/lesson-info.yaml b/learning/katas/python/IO/Built-in IOs/lesson-info.yaml
new file mode 100644
index 0000000..af969f1
--- /dev/null
+++ b/learning/katas/python/IO/Built-in IOs/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Built-in IOs
diff --git a/learning/katas/python/IO/Built-in IOs/lesson-remote-info.yaml b/learning/katas/python/IO/Built-in IOs/lesson-remote-info.yaml
new file mode 100644
index 0000000..c28a5ad
--- /dev/null
+++ b/learning/katas/python/IO/Built-in IOs/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238440
+update_date: Wed, 19 Jun 2019 09:57:38 UTC
+unit: 210900
diff --git a/learning/katas/python/IO/TextIO/ReadFromText/task-info.yaml b/learning/katas/python/IO/TextIO/ReadFromText/task-info.yaml
new file mode 100644
index 0000000..d42a178
--- /dev/null
+++ b/learning/katas/python/IO/TextIO/ReadFromText/task-info.yaml
@@ -0,0 +1,34 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 919
+ length: 31
+ placeholder_text: TODO()
+ - offset: 956
+ length: 41
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
+- name: countries.txt
+ visible: true
diff --git a/learning/katas/python/IO/TextIO/ReadFromText/task-remote-info.yaml b/learning/katas/python/IO/TextIO/ReadFromText/task-remote-info.yaml
new file mode 100644
index 0000000..0afe167
--- /dev/null
+++ b/learning/katas/python/IO/TextIO/ReadFromText/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755602
+update_date: Wed, 19 Jun 2019 09:57:32 UTC
diff --git a/learning/katas/python/IO/TextIO/lesson-info.yaml b/learning/katas/python/IO/TextIO/lesson-info.yaml
new file mode 100644
index 0000000..3052ae5
--- /dev/null
+++ b/learning/katas/python/IO/TextIO/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- ReadFromText
diff --git a/learning/katas/python/IO/TextIO/lesson-remote-info.yaml b/learning/katas/python/IO/TextIO/lesson-remote-info.yaml
new file mode 100644
index 0000000..28cc664
--- /dev/null
+++ b/learning/katas/python/IO/TextIO/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238439
+update_date: Wed, 19 Jun 2019 09:57:25 UTC
+unit: 210899
diff --git a/learning/katas/python/IO/section-info.yaml b/learning/katas/python/IO/section-info.yaml
new file mode 100644
index 0000000..1d93752
--- /dev/null
+++ b/learning/katas/python/IO/section-info.yaml
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- TextIO
+- Built-in IOs
diff --git a/learning/katas/python/IO/section-remote-info.yaml b/learning/katas/python/IO/section-remote-info.yaml
new file mode 100644
index 0000000..17618fe
--- /dev/null
+++ b/learning/katas/python/IO/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 88017
+update_date: Thu, 13 Jun 2019 14:30:40 UTC
diff --git a/learning/katas/python/Introduction/Hello Beam/Hello Beam/task-info.yaml b/learning/katas/python/Introduction/Hello Beam/Hello Beam/task-info.yaml
new file mode 100644
index 0000000..747b4e1
--- /dev/null
+++ b/learning/katas/python/Introduction/Hello Beam/Hello Beam/task-info.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+type: edu
+files:
+- name: task.py
+ visible: true
+ placeholders:
+ - offset: 903
+ length: 27
+ placeholder_text: TODO()
+- name: tests.py
+ visible: false
diff --git a/learning/katas/python/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml b/learning/katas/python/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml
new file mode 100644
index 0000000..ddcee19
--- /dev/null
+++ b/learning/katas/python/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 755575
+update_date: Wed, 19 Jun 2019 09:51:31 UTC
diff --git a/learning/katas/python/Introduction/Hello Beam/lesson-info.yaml b/learning/katas/python/Introduction/Hello Beam/lesson-info.yaml
new file mode 100644
index 0000000..040e0ac
--- /dev/null
+++ b/learning/katas/python/Introduction/Hello Beam/lesson-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Hello Beam
diff --git a/learning/katas/python/Introduction/Hello Beam/lesson-remote-info.yaml b/learning/katas/python/Introduction/Hello Beam/lesson-remote-info.yaml
new file mode 100644
index 0000000..50d8ca1
--- /dev/null
+++ b/learning/katas/python/Introduction/Hello Beam/lesson-remote-info.yaml
@@ -0,0 +1,3 @@
+id: 238426
+update_date: Wed, 19 Jun 2019 09:51:26 UTC
+unit: 210886
diff --git a/learning/katas/python/Introduction/section-info.yaml b/learning/katas/python/Introduction/section-info.yaml
new file mode 100644
index 0000000..040e0ac
--- /dev/null
+++ b/learning/katas/python/Introduction/section-info.yaml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+content:
+- Hello Beam
diff --git a/learning/katas/python/Introduction/section-remote-info.yaml b/learning/katas/python/Introduction/section-remote-info.yaml
new file mode 100644
index 0000000..f1d2fa3
--- /dev/null
+++ b/learning/katas/python/Introduction/section-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 85644
+update_date: Fri, 31 May 2019 17:58:15 UTC
diff --git a/learning/katas/python/course-info.yaml b/learning/katas/python/course-info.yaml
new file mode 100644
index 0000000..b14f13a
--- /dev/null
+++ b/learning/katas/python/course-info.yaml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+title: Beam Katas - Python
+language: English
+summary: "This course provides a series of katas to get familiar with Apache Beam.\
+ \ \n\nApache Beam website – https://beam.apache.org/"
+programming_language: Python
+programming_language_version: 2.7
+content:
+- Introduction
+- Core Transforms
+- Common Transforms
+- IO
+- Examples
diff --git a/learning/katas/python/course-remote-info.yaml b/learning/katas/python/course-remote-info.yaml
new file mode 100644
index 0000000..ed9c8a7
--- /dev/null
+++ b/learning/katas/python/course-remote-info.yaml
@@ -0,0 +1,2 @@
+id: 54532
+update_date: Wed, 19 Jun 2019 10:36:17 UTC
diff --git a/release/build.gradle b/release/build.gradle
index 44e9f98..d3a13cc 100644
--- a/release/build.gradle
+++ b/release/build.gradle
@@ -34,7 +34,7 @@
dependsOn ":runners:google-cloud-dataflow-java:runQuickstartJavaDataflow"
dependsOn ":runners:apex:runQuickstartJavaApex"
dependsOn ":runners:spark:runQuickstartJavaSpark"
- dependsOn ":runners:flink:1.8:runQuickstartJavaFlinkLocal"
+ dependsOn ":runners:flink:1.9:runQuickstartJavaFlinkLocal"
dependsOn ":runners:direct-java:runMobileGamingJavaDirect"
dependsOn ":runners:google-cloud-dataflow-java:runMobileGamingJavaDataflow"
}
diff --git a/release/src/main/scripts/run_rc_validation.sh b/release/src/main/scripts/run_rc_validation.sh
index 887e821..3f025d8 100755
--- a/release/src/main/scripts/run_rc_validation.sh
+++ b/release/src/main/scripts/run_rc_validation.sh
@@ -209,7 +209,7 @@
echo "*************************************************************"
echo "* Running Java Quickstart with Flink local runner"
echo "*************************************************************"
- ./gradlew :runners:flink:1.8:runQuickstartJavaFlinkLocal \
+ ./gradlew :runners:flink:1.9:runQuickstartJavaFlinkLocal \
-Prepourl=${REPO_URL} \
-Pver=${RELEASE_VER}
else
diff --git a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineOptionsTranslation.java b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineOptionsTranslation.java
index b87f0b6..baf7c36 100644
--- a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineOptionsTranslation.java
+++ b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineOptionsTranslation.java
@@ -19,6 +19,7 @@
import com.fasterxml.jackson.core.TreeNode;
import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.util.HashMap;
@@ -48,18 +49,26 @@
try {
// TODO: Officially define URNs for options and their scheme.
- TreeNode treeNode = MAPPER.valueToTree(options);
- TreeNode rootOptions = treeNode.get("options");
- Iterator<String> optionsKeys = rootOptions.fieldNames();
+ JsonNode treeNode = MAPPER.valueToTree(options);
+ JsonNode rootOptions = treeNode.get("options");
+ Iterator<Map.Entry<String, JsonNode>> optionsEntries = rootOptions.fields();
+
+ if (!optionsEntries.hasNext()) {
+ // Due to mandatory options there is no way this map can be empty.
+ // If it is, then fail fast as it is due to incompatible jackson-core in the classpath.
+ // (observed with version 2.2.3)
+ throw new RuntimeException(
+ "Unable to convert pipeline options, please check for outdated jackson-core version in the classpath.");
+ }
+
Map<String, TreeNode> optionsUsingUrns = new HashMap<>();
- while (optionsKeys.hasNext()) {
- String optionKey = optionsKeys.next();
- TreeNode optionValue = rootOptions.get(optionKey);
+ while (optionsEntries.hasNext()) {
+ Map.Entry<String, JsonNode> entry = optionsEntries.next();
optionsUsingUrns.put(
"beam:option:"
- + CaseFormat.LOWER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, optionKey)
+ + CaseFormat.LOWER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, entry.getKey())
+ ":v1",
- optionValue);
+ entry.getValue());
}
// The JSON format of a Protobuf Struct is the JSON object that is equivalent to that struct
diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/GaugeData.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/GaugeData.java
index 34fe8cb..fd64425 100644
--- a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/GaugeData.java
+++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/GaugeData.java
@@ -20,6 +20,7 @@
import com.google.auto.value.AutoValue;
import java.io.Serializable;
import org.apache.beam.sdk.metrics.GaugeResult;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.joda.time.Instant;
/**
@@ -57,7 +58,7 @@
public static class EmptyGaugeData extends GaugeData {
private static final EmptyGaugeData INSTANCE = new EmptyGaugeData();
- private static final Instant EPOCH = new Instant(0);
+ private static final Instant EPOCH = new Instant(GlobalWindow.TIMESTAMP_MIN_VALUE);
private EmptyGaugeData() {}
diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/MetricsContainerImpl.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/MetricsContainerImpl.java
index bcca019..ac471ca 100644
--- a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/MetricsContainerImpl.java
+++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/MetricsContainerImpl.java
@@ -30,6 +30,7 @@
import org.apache.beam.model.pipeline.v1.MetricsApi.ExtremaData;
import org.apache.beam.model.pipeline.v1.MetricsApi.IntDistributionData;
import org.apache.beam.model.pipeline.v1.MetricsApi.MonitoringInfo;
+import org.apache.beam.runners.core.construction.BeamUrns;
import org.apache.beam.runners.core.metrics.MetricUpdates.MetricUpdate;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.annotations.Experimental.Kind;
@@ -61,6 +62,9 @@
private static final Logger LOG = LoggerFactory.getLogger(MetricsContainerImpl.class);
+ private static final String GAUGE_URN =
+ BeamUrns.getUrn(MetricsApi.MonitoringInfoTypeUrns.Enum.LATEST_INT64_TYPE);
+
@Nullable private final String stepName;
private MetricsMap<MetricName, CounterCell> counters = new MetricsMap<>(CounterCell::new);
@@ -306,8 +310,13 @@
if (metric.hasCounterData()) {
CounterData counterData = metric.getCounterData();
if (counterData.getValueCase() == CounterData.ValueCase.INT64_VALUE) {
- Counter counter = getCounter(metricName);
- counter.inc(counterData.getInt64Value());
+ if (GAUGE_URN.equals(monitoringInfo.getType())) {
+ GaugeCell gauge = getGauge(metricName);
+ gauge.set(counterData.getInt64Value());
+ } else {
+ Counter counter = getCounter(metricName);
+ counter.inc(counterData.getInt64Value());
+ }
} else {
LOG.warn("Unsupported CounterData type: {}", counterData);
}
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/DirectTimerInternals.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/DirectTimerInternals.java
index 63b5008..8f3ab48 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/DirectTimerInternals.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/DirectTimerInternals.java
@@ -17,6 +17,7 @@
*/
package org.apache.beam.runners.direct;
+import java.util.stream.StreamSupport;
import javax.annotation.Nullable;
import org.apache.beam.runners.core.StateNamespace;
import org.apache.beam.runners.core.TimerInternals;
@@ -80,6 +81,12 @@
return timerUpdateBuilder.build();
}
+ public boolean containsUpdateForTimeBefore(Instant time) {
+ TimerUpdate update = timerUpdateBuilder.build();
+ return hasTimeBefore(update.getSetTimers(), time)
+ || hasTimeBefore(update.getDeletedTimers(), time);
+ }
+
@Override
public Instant currentProcessingTime() {
return processingTimeClock.now();
@@ -101,4 +108,9 @@
public Instant currentOutputWatermarkTime() {
return watermarks.getOutputWatermark();
}
+
+ private boolean hasTimeBefore(Iterable<? extends TimerData> timers, Instant time) {
+ return StreamSupport.stream(timers.spliterator(), false)
+ .anyMatch(td -> td.getTimestamp().isBefore(time));
+ }
}
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/EvaluationContext.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/EvaluationContext.java
index c5ebfaf..22e0a8a 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/EvaluationContext.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/EvaluationContext.java
@@ -140,6 +140,7 @@
* null} if the transform that produced the result is a root transform
* @param completedTimers the timers that were delivered to produce the {@code completedBundle},
* or an empty iterable if no timers were delivered
+ * @param pushedBackTimers timers that have been pushed back during processing
* @param result the result of evaluating the input bundle
* @return the committed bundles contained within the handled {@code result}
*/
@@ -226,7 +227,11 @@
private void fireAvailableCallbacks(AppliedPTransform<?, ?, ?> producingTransform) {
TransformWatermarks watermarks = watermarkManager.getWatermarks(producingTransform);
Instant outputWatermark = watermarks.getOutputWatermark();
- callbackExecutor.fireForWatermark(producingTransform, outputWatermark);
+ try {
+ callbackExecutor.fireForWatermark(producingTransform, outputWatermark);
+ } catch (InterruptedException ex) {
+ Thread.currentThread().interrupt();
+ }
}
/** Create a {@link UncommittedBundle} for use by a source. */
@@ -369,7 +374,7 @@
* <p>This is a destructive operation. Timers will only appear in the result of this method once
* for each time they are set.
*/
- public Collection<FiredTimers<AppliedPTransform<?, ?, ?>>> extractFiredTimers() {
+ Collection<FiredTimers<AppliedPTransform<?, ?, ?>>> extractFiredTimers() {
forceRefresh();
return watermarkManager.extractFiredTimers();
}
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/QuiescenceDriver.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/QuiescenceDriver.java
index 0802997..ca0ad61 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/QuiescenceDriver.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/QuiescenceDriver.java
@@ -249,6 +249,7 @@
* Exception)}.
*/
private class TimerIterableCompletionCallback implements CompletionCallback {
+
private final Iterable<TimerData> timers;
TimerIterableCompletionCallback(Iterable<TimerData> timers) {
@@ -258,8 +259,9 @@
@Override
public final CommittedResult handleResult(
CommittedBundle<?> inputBundle, TransformResult<?> result) {
- CommittedResult<AppliedPTransform<?, ?, ?>> committedResult =
- evaluationContext.handleResult(inputBundle, timers, result);
+
+ final CommittedResult<AppliedPTransform<?, ?, ?>> committedResult;
+ committedResult = evaluationContext.handleResult(inputBundle, timers, result);
for (CommittedBundle<?> outputBundle : committedResult.getOutputs()) {
pendingWork.offer(
WorkUpdate.fromBundle(
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactory.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactory.java
index 366ca05..e1080e5 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactory.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactory.java
@@ -20,10 +20,14 @@
import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState;
import com.google.auto.value.AutoValue;
+import java.util.ArrayList;
import java.util.Collections;
+import java.util.Comparator;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.PriorityQueue;
import org.apache.beam.runners.core.KeyedWorkItem;
import org.apache.beam.runners.core.KeyedWorkItems;
import org.apache.beam.runners.core.StateNamespace;
@@ -34,6 +38,7 @@
import org.apache.beam.runners.core.TimerInternals.TimerData;
import org.apache.beam.runners.direct.DirectExecutionContext.DirectStepContext;
import org.apache.beam.runners.direct.ParDoMultiOverrideFactory.StatefulParDo;
+import org.apache.beam.runners.direct.WatermarkManager.TimerUpdate;
import org.apache.beam.runners.local.StructuralKey;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.options.PipelineOptions;
@@ -56,6 +61,7 @@
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.CacheLoader;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LoadingCache;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists;
+import org.joda.time.Instant;
/** A {@link TransformEvaluatorFactory} for stateful {@link ParDo}. */
final class StatefulParDoEvaluatorFactory<K, InputT, OutputT> implements TransformEvaluatorFactory {
@@ -232,10 +238,13 @@
implements TransformEvaluator<KeyedWorkItem<K, KV<K, InputT>>> {
private final DoFnLifecycleManagerRemovingTransformEvaluator<KV<K, InputT>> delegateEvaluator;
+ private final List<TimerData> pushedBackTimers = new ArrayList<>();
+ private final DirectTimerInternals timerInternals;
public StatefulParDoEvaluator(
DoFnLifecycleManagerRemovingTransformEvaluator<KV<K, InputT>> delegateEvaluator) {
this.delegateEvaluator = delegateEvaluator;
+ this.timerInternals = delegateEvaluator.getParDoEvaluator().getStepContext().timerInternals();
}
@Override
@@ -245,7 +254,12 @@
delegateEvaluator.processElement(windowedValue);
}
- for (TimerData timer : gbkResult.getValue().timersIterable()) {
+ Instant currentInputWatermark = timerInternals.currentInputWatermarkTime();
+ PriorityQueue<TimerData> toBeFiredTimers =
+ new PriorityQueue<>(Comparator.comparing(TimerData::getTimestamp));
+ gbkResult.getValue().timersIterable().forEach(toBeFiredTimers::add);
+ while (!toBeFiredTimers.isEmpty()) {
+ TimerData timer = toBeFiredTimers.poll();
checkState(
timer.getNamespace() instanceof WindowNamespace,
"Expected Timer %s to be in a %s, but got %s",
@@ -255,17 +269,23 @@
WindowNamespace<?> windowNamespace = (WindowNamespace) timer.getNamespace();
BoundedWindow timerWindow = windowNamespace.getWindow();
delegateEvaluator.onTimer(timer, timerWindow);
+ if (timerInternals.containsUpdateForTimeBefore(currentInputWatermark)) {
+ break;
+ }
}
+ pushedBackTimers.addAll(toBeFiredTimers);
}
@Override
public TransformResult<KeyedWorkItem<K, KV<K, InputT>>> finishBundle() throws Exception {
TransformResult<KV<K, InputT>> delegateResult = delegateEvaluator.finishBundle();
-
+ TimerUpdate timerUpdate =
+ delegateResult.getTimerUpdate().withPushedBackTimers(pushedBackTimers);
+ pushedBackTimers.clear();
StepTransformResult.Builder<KeyedWorkItem<K, KV<K, InputT>>> regroupedResult =
StepTransformResult.<KeyedWorkItem<K, KV<K, InputT>>>withHold(
delegateResult.getTransform(), delegateResult.getWatermarkHold())
- .withTimerUpdate(delegateResult.getTimerUpdate())
+ .withTimerUpdate(timerUpdate)
.withState(delegateResult.getState())
.withMetricUpdates(delegateResult.getLogicalMetricUpdates())
.addOutput(Lists.newArrayList(delegateResult.getOutputBundles()));
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/WatermarkCallbackExecutor.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/WatermarkCallbackExecutor.java
index 7f6800e..1ca90db 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/WatermarkCallbackExecutor.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/WatermarkCallbackExecutor.java
@@ -19,9 +19,12 @@
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
import java.util.PriorityQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Executor;
import javax.annotation.Nonnull;
import org.apache.beam.sdk.runners.AppliedPTransform;
@@ -116,14 +119,30 @@
* Schedule all pending callbacks that must have produced output by the time of the provided
* watermark.
*/
- public void fireForWatermark(AppliedPTransform<?, ?, ?> step, Instant watermark) {
+ public void fireForWatermark(AppliedPTransform<?, ?, ?> step, Instant watermark)
+ throws InterruptedException {
PriorityQueue<WatermarkCallback> callbackQueue = callbacks.get(step);
if (callbackQueue == null) {
return;
}
synchronized (callbackQueue) {
+ List<Runnable> toFire = new ArrayList<>();
while (!callbackQueue.isEmpty() && callbackQueue.peek().shouldFire(watermark)) {
- executor.execute(callbackQueue.poll().getCallback());
+ toFire.add(callbackQueue.poll().getCallback());
+ }
+ if (!toFire.isEmpty()) {
+ CountDownLatch latch = new CountDownLatch(toFire.size());
+ toFire.forEach(
+ r ->
+ executor.execute(
+ () -> {
+ try {
+ r.run();
+ } finally {
+ latch.countDown();
+ }
+ }));
+ latch.await();
}
}
}
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/WatermarkManager.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/WatermarkManager.java
index 82dc0ae..d9e7ac2 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/WatermarkManager.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/WatermarkManager.java
@@ -36,11 +36,17 @@
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.function.Consumer;
import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;
@@ -63,7 +69,9 @@
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.HashBasedTable;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Ordering;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.SortedMultiset;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Table;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.TreeMultiset;
@@ -237,13 +245,18 @@
// This per-key sorted set allows quick retrieval of timers that should fire for a key
private final Map<StructuralKey<?>, NavigableSet<TimerData>> objectTimers;
- private AtomicReference<Instant> currentWatermark;
+ private final AtomicReference<Instant> currentWatermark;
+
+ private final Consumer<TimerData> timerUpdateNotification;
public AppliedPTransformInputWatermark(
- String name, Collection<? extends Watermark> inputWatermarks) {
- this.name = name;
+ String name,
+ Collection<? extends Watermark> inputWatermarks,
+ Consumer<TimerData> timerUpdateNotification) {
+ this.name = name;
this.inputWatermarks = inputWatermarks;
+
// The ordering must order elements by timestamp, and must not compare two distinct elements
// as equal. This is built on the assumption that any element added as a pending element will
// be consumed without modifications.
@@ -255,7 +268,8 @@
this.pendingTimers = TreeMultiset.create();
this.objectTimers = new HashMap<>();
this.existingTimers = new HashMap<>();
- currentWatermark = new AtomicReference<>(BoundedWindow.TIMESTAMP_MIN_VALUE);
+ this.currentWatermark = new AtomicReference<>(BoundedWindow.TIMESTAMP_MIN_VALUE);
+ this.timerUpdateNotification = timerUpdateNotification;
}
@Override
@@ -333,12 +347,15 @@
if (existingTimer == null) {
pendingTimers.add(timer);
keyTimers.add(timer);
- } else if (!existingTimer.equals(timer)) {
+ } else {
+ // reinitialize the timer even if identical,
+ // because it might be removed from objectTimers
+ // by timer push back
pendingTimers.remove(existingTimer);
keyTimers.remove(existingTimer);
pendingTimers.add(timer);
keyTimers.add(timer);
- } // else the timer is already set identically, so noop
+ }
existingTimersForKey.put(timer.getNamespace(), timer.getTimerId(), timer);
}
@@ -364,6 +381,13 @@
pendingTimers.remove(timer);
}
}
+
+ if (!update.isEmpty()) {
+ // notify of TimerData update
+ Iterables.concat(
+ update.getCompletedTimers(), update.getDeletedTimers(), update.getSetTimers())
+ .forEach(timerUpdateNotification);
+ }
}
@VisibleForTesting
@@ -487,8 +511,13 @@
private AtomicReference<Instant> earliestHold;
+ private final Consumer<TimerData> timerUpdateNotification;
+
public SynchronizedProcessingTimeInputWatermark(
- String name, Collection<? extends Watermark> inputWms) {
+ String name,
+ Collection<? extends Watermark> inputWms,
+ Consumer<TimerData> timerUpdateNotification) {
+
this.name = name;
this.inputWms = inputWms;
this.pendingBundles = new HashSet<>();
@@ -500,7 +529,8 @@
for (Watermark wm : inputWms) {
initialHold = INSTANT_ORDERING.min(initialHold, wm.get());
}
- earliestHold = new AtomicReference<>(initialHold);
+ this.earliestHold = new AtomicReference<>(initialHold);
+ this.timerUpdateNotification = timerUpdateNotification;
}
@Override
@@ -619,6 +649,11 @@
for (TimerData completedTimer : update.completedTimers) {
pendingTimers.remove(completedTimer);
}
+
+ // notify of TimerData update
+ Iterables.concat(
+ update.getCompletedTimers(), update.getDeletedTimers(), update.getSetTimers())
+ .forEach(timerUpdateNotification);
}
private synchronized Map<StructuralKey<?>, List<TimerData>> extractFiredDomainTimers(
@@ -830,6 +865,14 @@
private final Set<ExecutableT> pendingRefreshes;
/**
+ * A set of executables with currently extracted timers that are yet to be processed. Note that,
+ * to preserve consistency, we can have only a single extracted set of timers being processed by
+ * the bundle processor at a time.
+ */
+ private final Map<ExecutableT, Set<String>> transformsWithAlreadyExtractedTimers =
+ new ConcurrentHashMap<>();
+
+ /**
* Creates a new {@link WatermarkManager}. All watermarks within the newly created {@link
* WatermarkManager} start at {@link BoundedWindow#TIMESTAMP_MIN_VALUE}, the minimum watermark,
* with no watermark holds or pending elements.
@@ -881,13 +924,18 @@
if (wms == null) {
List<Watermark> inputCollectionWatermarks = getInputWatermarks(executable);
AppliedPTransformInputWatermark inputWatermark =
- new AppliedPTransformInputWatermark(name + ".in", inputCollectionWatermarks);
+ new AppliedPTransformInputWatermark(
+ name + ".in",
+ inputCollectionWatermarks,
+ timerUpdateConsumer(transformsWithAlreadyExtractedTimers, executable));
AppliedPTransformOutputWatermark outputWatermark =
new AppliedPTransformOutputWatermark(name + ".out", inputWatermark);
SynchronizedProcessingTimeInputWatermark inputProcessingWatermark =
new SynchronizedProcessingTimeInputWatermark(
- name + ".inProcessing", getInputProcessingWatermarks(executable));
+ name + ".inProcessing",
+ getInputProcessingWatermarks(executable),
+ timerUpdateConsumer(transformsWithAlreadyExtractedTimers, executable));
SynchronizedProcessingTimeOutputWatermark outputProcessingWatermark =
new SynchronizedProcessingTimeOutputWatermark(
name + ".outProcessing", inputProcessingWatermark);
@@ -904,6 +952,25 @@
return wms;
}
+ private static <ExecutableT> Consumer<TimerData> timerUpdateConsumer(
+ Map<ExecutableT, Set<String>> transformsWithAlreadyExtractedTimers, ExecutableT executable) {
+
+ return update -> {
+ String timerIdWithNs = TimerUpdate.getTimerIdWithNamespace(update);
+ transformsWithAlreadyExtractedTimers.compute(
+ executable,
+ (k, v) -> {
+ if (v != null) {
+ v.remove(timerIdWithNs);
+ if (v.isEmpty()) {
+ v = null;
+ }
+ }
+ return v;
+ });
+ };
+ }
+
private Collection<Watermark> getInputProcessingWatermarks(ExecutableT executable) {
ImmutableList.Builder<Watermark> inputWmsBuilder = ImmutableList.builder();
Collection<CollectionT> inputs = graph.getPerElementInputs(executable);
@@ -1122,7 +1189,7 @@
return newRefreshes;
}
- private Set<ExecutableT> refreshWatermarks(ExecutableT toRefresh) {
+ private Set<ExecutableT> refreshWatermarks(final ExecutableT toRefresh) {
TransformWatermarks myWatermarks = transformToWatermarks.get(toRefresh);
WatermarkUpdate updateResult = myWatermarks.refresh();
if (updateResult.isAdvanced()) {
@@ -1145,9 +1212,28 @@
try {
for (Map.Entry<ExecutableT, TransformWatermarks> watermarksEntry :
transformToWatermarks.entrySet()) {
- Collection<FiredTimers<ExecutableT>> firedTimers =
- watermarksEntry.getValue().extractFiredTimers();
- allTimers.addAll(firedTimers);
+ ExecutableT transform = watermarksEntry.getKey();
+ if (!transformsWithAlreadyExtractedTimers.containsKey(transform)) {
+ TransformWatermarks watermarks = watermarksEntry.getValue();
+ Collection<FiredTimers<ExecutableT>> firedTimers = watermarks.extractFiredTimers();
+ if (!firedTimers.isEmpty()) {
+ List<TimerData> newTimers =
+ firedTimers.stream()
+ .flatMap(f -> f.getTimers().stream())
+ .collect(Collectors.toList());
+ transformsWithAlreadyExtractedTimers.compute(
+ transform,
+ (k, v) -> {
+ if (v == null) {
+ v = new HashSet<>();
+ }
+ final Set<String> toUpdate = v;
+ newTimers.forEach(td -> toUpdate.add(TimerUpdate.getTimerIdWithNamespace(td)));
+ return v;
+ });
+ allTimers.addAll(firedTimers);
+ }
+ }
}
return allTimers;
} finally {
@@ -1264,6 +1350,8 @@
private Instant latestSynchronizedInputWm;
private Instant latestSynchronizedOutputWm;
+ private final ReadWriteLock transformWatermarkLock = new ReentrantReadWriteLock();
+
private TransformWatermarks(
ExecutableT executable,
AppliedPTransformInputWatermark inputWatermark,
@@ -1318,6 +1406,10 @@
return latestSynchronizedOutputWm;
}
+ private ReadWriteLock getWatermarkLock() {
+ return transformWatermarkLock;
+ }
+
private WatermarkUpdate refresh() {
inputWatermark.refresh();
synchronizedProcessingInputWatermark.refresh();
@@ -1397,19 +1489,24 @@
*
* <p>setTimers and deletedTimers are collections of {@link TimerData} that have been added to the
* {@link TimerInternals} of an executed step. completedTimers are timers that were delivered as
- * the input to the executed step.
+ * the input to the executed step. pushedBackTimers are timers that were in completedTimers at the
+ * input, but were pushed back due to processing constraints.
*/
public static class TimerUpdate {
private final StructuralKey<?> key;
private final Iterable<? extends TimerData> completedTimers;
-
private final Iterable<? extends TimerData> setTimers;
private final Iterable<? extends TimerData> deletedTimers;
+ private final Iterable<? extends TimerData> pushedBackTimers;
/** Returns a TimerUpdate for a null key with no timers. */
public static TimerUpdate empty() {
return new TimerUpdate(
- null, Collections.emptyList(), Collections.emptyList(), Collections.emptyList());
+ null,
+ Collections.emptyList(),
+ Collections.emptyList(),
+ Collections.emptyList(),
+ Collections.emptyList());
}
/**
@@ -1479,19 +1576,31 @@
key,
ImmutableList.copyOf(completedTimers),
ImmutableList.copyOf(setTimers),
- ImmutableList.copyOf(deletedTimers));
+ ImmutableList.copyOf(deletedTimers),
+ Collections.emptyList());
}
}
+ private static Map<String, TimerData> indexTimerData(Iterable<? extends TimerData> timerData) {
+ return StreamSupport.stream(timerData.spliterator(), false)
+ .collect(Collectors.toMap(TimerUpdate::getTimerIdWithNamespace, e -> e, (a, b) -> b));
+ }
+
+ private static String getTimerIdWithNamespace(TimerData td) {
+ return td.getNamespace() + td.getTimerId();
+ }
+
private TimerUpdate(
StructuralKey<?> key,
Iterable<? extends TimerData> completedTimers,
Iterable<? extends TimerData> setTimers,
- Iterable<? extends TimerData> deletedTimers) {
+ Iterable<? extends TimerData> deletedTimers,
+ Iterable<? extends TimerData> pushedBackTimers) {
this.key = key;
this.completedTimers = completedTimers;
this.setTimers = setTimers;
this.deletedTimers = deletedTimers;
+ this.pushedBackTimers = pushedBackTimers;
}
@VisibleForTesting
@@ -1514,11 +1623,45 @@
return deletedTimers;
}
+ Iterable<? extends TimerData> getPushedBackTimers() {
+ return pushedBackTimers;
+ }
+
+ boolean isEmpty() {
+ return Iterables.isEmpty(completedTimers)
+ && Iterables.isEmpty(setTimers)
+ && Iterables.isEmpty(deletedTimers)
+ && Iterables.isEmpty(pushedBackTimers);
+ }
+
/**
* Returns a {@link TimerUpdate} that is like this one, but with the specified completed timers.
+ * Note that if any of the completed timers is present in pushedBackTimers, it is added to the
+ * set timers instead of being completed. The pushedBackTimers are cleared afterwards.
*/
public TimerUpdate withCompletedTimers(Iterable<TimerData> completedTimers) {
- return new TimerUpdate(this.key, completedTimers, setTimers, deletedTimers);
+ List<TimerData> timersToComplete = new ArrayList<>();
+ Set<TimerData> pushedBack = Sets.newHashSet(pushedBackTimers);
+ Map<String, TimerData> newSetTimers = indexTimerData(setTimers);
+ for (TimerData td : completedTimers) {
+ String timerIdWithNs = getTimerIdWithNamespace(td);
+ if (!pushedBack.contains(td)) {
+ timersToComplete.add(td);
+ } else if (!newSetTimers.containsKey(timerIdWithNs)) {
+ newSetTimers.put(timerIdWithNs, td);
+ }
+ }
+ return new TimerUpdate(
+ key, timersToComplete, newSetTimers.values(), deletedTimers, Collections.emptyList());
+ }
+
+ /**
+ * Returns a {@link TimerUpdate} that is like this one, but with the pushedBackTimers replaced
+ * by the provided pushedBackTimers.
+ */
+ public TimerUpdate withPushedBackTimers(Iterable<TimerData> pushedBackTimers) {
+ return new TimerUpdate(
+ key, completedTimers, setTimers, deletedTimers, Lists.newArrayList(pushedBackTimers));
}
@Override
@@ -1537,6 +1680,17 @@
&& Objects.equals(this.setTimers, that.setTimers)
&& Objects.equals(this.deletedTimers, that.deletedTimers);
}
+
+ @Override
+ public String toString() {
+ return MoreObjects.toStringHelper(this)
+ .add("key", key)
+ .add("setTimers", setTimers)
+ .add("completedTimers", completedTimers)
+ .add("deletedTimers", deletedTimers)
+ .add("pushedBackTimers", pushedBackTimers)
+ .toString();
+ }
}
/**
@@ -1580,7 +1734,10 @@
@Override
public String toString() {
- return MoreObjects.toStringHelper(FiredTimers.class).add("timers", timers).toString();
+ return MoreObjects.toStringHelper(FiredTimers.class)
+ .add("key", key)
+ .add("timers", timers)
+ .toString();
}
}
diff --git a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/DirectRunnerTest.java b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/DirectRunnerTest.java
index 9cc5a87..b58cab9 100644
--- a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/DirectRunnerTest.java
+++ b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/DirectRunnerTest.java
@@ -34,14 +34,18 @@
import java.util.Arrays;
import java.util.List;
import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Optional;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
+import javax.annotation.Nullable;
import org.apache.beam.runners.direct.DirectRunner.DirectPipelineResult;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
@@ -51,12 +55,14 @@
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.ListCoder;
+import org.apache.beam.sdk.coders.SerializableCoder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.io.Read;
+import org.apache.beam.sdk.io.UnboundedSource;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
@@ -67,19 +73,30 @@
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.transforms.display.DisplayData;
+import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
+import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.util.IllegalMutationException;
import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;
+import org.apache.beam.sdk.values.PDone;
import org.apache.beam.sdk.values.TypeDescriptor;
+import org.apache.beam.sdk.values.TypeDescriptors;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap;
import org.hamcrest.Matchers;
import org.joda.time.Duration;
+import org.joda.time.Instant;
import org.junit.Rule;
import org.junit.Test;
import org.junit.internal.matchers.ThrowableMessageMatcher;
@@ -93,9 +110,13 @@
@Rule public transient ExpectedException thrown = ExpectedException.none();
private Pipeline getPipeline() {
+ return getPipeline(true);
+ }
+
+ private Pipeline getPipeline(boolean blockOnRun) {
PipelineOptions opts = PipelineOptionsFactory.create();
opts.setRunner(DirectRunner.class);
-
+ opts.as(DirectOptions.class).setBlockOnRun(blockOnRun);
return Pipeline.create(opts);
}
@@ -617,6 +638,60 @@
}
/**
+ * Test running of {@link Pipeline} which has two {@link POutput POutputs} and finishing the first
+ * one triggers data being fed into the second one.
+ */
+ @Test(timeout = 10000)
+ public void testTwoPOutputsInPipelineWithCascade() throws InterruptedException {
+
+ StaticQueue<Integer> start = StaticQueue.of("start", VarIntCoder.of());
+ StaticQueue<Integer> messages = StaticQueue.of("messages", VarIntCoder.of());
+
+ Pipeline pipeline = getPipeline(false);
+ pipeline.begin().apply("outputStartSignal", outputStartTo(start));
+ PCollection<Integer> result =
+ pipeline
+ .apply("processMessages", messages.read())
+ .apply(
+ Window.<Integer>into(new GlobalWindows())
+ .triggering(AfterWatermark.pastEndOfWindow())
+ .discardingFiredPanes()
+ .withAllowedLateness(Duration.ZERO))
+ .apply(Sum.integersGlobally());
+
+ // the result should be 6, after the data will have been written
+ PAssert.that(result).containsInAnyOrder(6);
+
+ PipelineResult run = pipeline.run();
+
+ // wait until a message has been written to the start queue
+ while (start.take() == null) {}
+
+ // and publish messages
+ messages.add(1).add(2).add(3).terminate();
+
+ run.waitUntilFinish();
+ }
+
+ private PTransform<PBegin, PDone> outputStartTo(StaticQueue<Integer> queue) {
+ return new PTransform<PBegin, PDone>() {
+ @Override
+ public PDone expand(PBegin input) {
+ input
+ .apply(Create.of(1))
+ .apply(
+ MapElements.into(TypeDescriptors.voids())
+ .via(
+ in -> {
+ queue.add(in);
+ return null;
+ }));
+ return PDone.in(input.getPipeline());
+ }
+ };
+ }
+
+ /**
* Options for testing if {@link DirectRunner} drops {@link PipelineOptions} marked with {@link
* JsonIgnore} fields.
*/
@@ -684,4 +759,157 @@
return underlying.getOutputCoder();
}
}
+
+ private static class StaticQueue<T> implements Serializable {
+
+ static class StaticQueueSource<T> extends UnboundedSource<T, StaticQueueSource.Checkpoint<T>> {
+
+ static class Checkpoint<T> implements CheckpointMark, Serializable {
+
+ final T read;
+
+ Checkpoint(T read) {
+ this.read = read;
+ }
+
+ @Override
+ public void finalizeCheckpoint() throws IOException {
+ // nop
+ }
+ }
+
+ final StaticQueue<T> queue;
+
+ StaticQueueSource(StaticQueue<T> queue) {
+ this.queue = queue;
+ }
+
+ @Override
+ public List<? extends UnboundedSource<T, Checkpoint<T>>> split(
+ int desiredNumSplits, PipelineOptions options) throws Exception {
+ return Arrays.asList(this);
+ }
+
+ @Override
+ public UnboundedReader<T> createReader(PipelineOptions po, Checkpoint<T> cmt) {
+ return new UnboundedReader<T>() {
+
+ T read = cmt == null ? null : cmt.read;
+ boolean finished = false;
+
+ @Override
+ public boolean start() throws IOException {
+ return advance();
+ }
+
+ @Override
+ public boolean advance() throws IOException {
+ try {
+ Optional<T> taken = queue.take();
+ if (taken.isPresent()) {
+ read = taken.get();
+ return true;
+ }
+ finished = true;
+ return false;
+ } catch (InterruptedException ex) {
+ throw new IOException(ex);
+ }
+ }
+
+ @Override
+ public Instant getWatermark() {
+ if (finished) {
+ return BoundedWindow.TIMESTAMP_MAX_VALUE;
+ }
+ return BoundedWindow.TIMESTAMP_MIN_VALUE;
+ }
+
+ @Override
+ public CheckpointMark getCheckpointMark() {
+ return new Checkpoint(read);
+ }
+
+ @Override
+ public UnboundedSource<T, ?> getCurrentSource() {
+ return StaticQueueSource.this;
+ }
+
+ @Override
+ public T getCurrent() throws NoSuchElementException {
+ return read;
+ }
+
+ @Override
+ public Instant getCurrentTimestamp() {
+ return getWatermark();
+ }
+
+ @Override
+ public void close() throws IOException {
+ // nop
+ }
+ };
+ }
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public Coder<Checkpoint<T>> getCheckpointMarkCoder() {
+ return (Coder) SerializableCoder.of(Checkpoint.class);
+ }
+
+ @Override
+ public Coder<T> getOutputCoder() {
+ return queue.coder;
+ }
+ }
+
+ static final Map<String, StaticQueue<?>> QUEUES = new ConcurrentHashMap<>();
+
+ static <T> StaticQueue<T> of(String name, Coder<T> coder) {
+ return new StaticQueue<>(name, coder);
+ }
+
+ private final String name;
+ private final Coder<T> coder;
+ private final transient BlockingQueue<Optional<T>> queue = new ArrayBlockingQueue<>(10);
+
+ StaticQueue(String name, Coder<T> coder) {
+ this.name = name;
+ this.coder = coder;
+ Preconditions.checkState(
+ QUEUES.put(name, this) == null, "Queue " + name + " already exists.");
+ }
+
+ StaticQueue<T> add(T elem) {
+ queue.add(Optional.of(elem));
+ return this;
+ }
+
+ @Nullable
+ Optional<T> take() throws InterruptedException {
+ return queue.take();
+ }
+
+ PTransform<PBegin, PCollection<T>> read() {
+ return new PTransform<PBegin, PCollection<T>>() {
+ @Override
+ public PCollection<T> expand(PBegin input) {
+ return input.apply("readFrom:" + name, Read.from(asSource()));
+ }
+ };
+ }
+
+ UnboundedSource<T, ?> asSource() {
+ return new StaticQueueSource<>(this);
+ }
+
+ void terminate() {
+ queue.add(Optional.empty());
+ }
+
+ private Object readResolve() {
+ return QUEUES.get(name);
+ }
+ }
}
diff --git a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactoryTest.java b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactoryTest.java
index 171d9dd..04d03e8 100644
--- a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactoryTest.java
+++ b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactoryTest.java
@@ -42,6 +42,7 @@
import org.apache.beam.runners.core.construction.TransformInputs;
import org.apache.beam.runners.direct.ParDoMultiOverrideFactory.StatefulParDo;
import org.apache.beam.runners.direct.WatermarkManager.TimerUpdate;
+import org.apache.beam.runners.direct.WatermarkManager.TransformWatermarks;
import org.apache.beam.runners.local.StructuralKey;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
@@ -96,6 +97,11 @@
private final transient PipelineOptions options = PipelineOptionsFactory.create();
private final transient StateInternals stateInternals =
CopyOnAccessInMemoryStateInternals.<Object>withUnderlying(KEY, null);
+ private final transient DirectTimerInternals timerInternals =
+ DirectTimerInternals.create(
+ MockClock.fromInstant(Instant.now()),
+ Mockito.mock(TransformWatermarks.class),
+ TimerUpdate.builder(StructuralKey.of(KEY, StringUtf8Coder.of())));
private static final BundleFactory BUNDLE_FACTORY = ImmutableListBundleFactory.create();
@@ -103,10 +109,12 @@
public transient TestPipeline pipeline =
TestPipeline.create().enableAbandonedNodeEnforcement(false);
+ @SuppressWarnings("unchecked")
@Before
public void setup() {
MockitoAnnotations.initMocks(this);
when((StateInternals) mockStepContext.stateInternals()).thenReturn(stateInternals);
+ when(mockStepContext.timerInternals()).thenReturn(timerInternals);
when(mockEvaluationContext.createSideInputReader(anyList()))
.thenReturn(
SideInputContainer.create(mockEvaluationContext, Collections.emptyList())
diff --git a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/WatermarkManagerTest.java b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/WatermarkManagerTest.java
index 54a5ff6..5e9cfc2 100644
--- a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/WatermarkManagerTest.java
+++ b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/WatermarkManagerTest.java
@@ -28,6 +28,7 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.when;
import java.io.Serializable;
@@ -1250,6 +1251,19 @@
Collections.emptyList(),
new Instant(50_000L));
manager.refreshAll();
+ assertTrue(manager.extractFiredTimers().isEmpty());
+
+ // confirm processing of the firstExtracted timers
+ manager.updateWatermarks(
+ null,
+ TimerUpdate.builder(key).withCompletedTimers(firstFired.getTimers()).build(),
+ graph.getProducer(filtered),
+ null,
+ Collections.emptyList(),
+ new Instant(1000L));
+
+ manager.refreshAll();
+
Collection<FiredTimers<AppliedPTransform<?, ?, ?>>> secondFiredTimers =
manager.extractFiredTimers();
assertThat(secondFiredTimers, not(emptyIterable()));
@@ -1314,6 +1328,18 @@
Collections.emptyList(),
new Instant(50_000L));
manager.refreshAll();
+ assertTrue(manager.extractFiredTimers().isEmpty());
+
+ manager.updateWatermarks(
+ null,
+ TimerUpdate.builder(key).withCompletedTimers(firstFired.getTimers()).build(),
+ graph.getProducer(filtered),
+ null,
+ Collections.emptyList(),
+ new Instant(1000L));
+
+ manager.refreshAll();
+
Collection<FiredTimers<AppliedPTransform<?, ?, ?>>> secondFiredTimers =
manager.extractFiredTimers();
assertThat(secondFiredTimers, not(emptyIterable()));
@@ -1381,6 +1407,16 @@
Collections.emptyList(),
new Instant(50_000L));
manager.refreshAll();
+ assertTrue(manager.extractFiredTimers().isEmpty());
+
+ manager.updateWatermarks(
+ null,
+ TimerUpdate.builder(key).withCompletedTimers(firstFired.getTimers()).build(),
+ graph.getProducer(filtered),
+ null,
+ Collections.emptyList(),
+ new Instant(1000L));
+
Collection<FiredTimers<AppliedPTransform<?, ?, ?>>> secondFiredTimers =
manager.extractFiredTimers();
assertThat(secondFiredTimers, not(emptyIterable()));
@@ -1497,7 +1533,8 @@
Watermark mockWatermark = Mockito.mock(Watermark.class);
AppliedPTransformInputWatermark underTest =
- new AppliedPTransformInputWatermark("underTest", ImmutableList.of(mockWatermark));
+ new AppliedPTransformInputWatermark(
+ "underTest", ImmutableList.of(mockWatermark), update -> {});
// Refresh
when(mockWatermark.get()).thenReturn(new Instant(0));
diff --git a/runners/flink/1.7/build.gradle b/runners/flink/1.7/build.gradle
index 4013247..13c22e5 100644
--- a/runners/flink/1.7/build.gradle
+++ b/runners/flink/1.7/build.gradle
@@ -22,11 +22,11 @@
project.ext {
// Set the version of all Flink-related dependencies here.
flink_version = '1.7.2'
- // Main source directory and Flink version specific code.
- main_source_dirs = ["$basePath/src/main/java", "./src/main/java"]
- test_source_dirs = ["$basePath/src/test/java", "./src/test/java"]
- main_resources_dirs = ["$basePath/src/main/resources"]
- test_resources_dirs = ["$basePath/src/test/resources"]
+ // Version specific code overrides.
+ main_source_overrides = ['./src/main/java']
+ test_source_overrides = ['./src/test/java']
+ main_resources_overrides = []
+ test_resources_overrides = []
archives_base_name = 'beam-runners-flink-1.7'
}
diff --git a/runners/flink/1.7/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BeamStoppableFunction.java b/runners/flink/1.7/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BeamStoppableFunction.java
new file mode 100644
index 0000000..25eafd7
--- /dev/null
+++ b/runners/flink/1.7/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BeamStoppableFunction.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.io;
+
+import org.apache.flink.api.common.functions.StoppableFunction;
+
+/**
+ * Custom StoppableFunction for backward compatibility.
+ *
+ * @see <a
+ * href="https://github.com/apache/flink/commit/e95b347dda5233f22fb03e408f2aa521ff924996">Flink
+ * interface removal commit.</a>
+ */
+public interface BeamStoppableFunction extends StoppableFunction {}
diff --git a/runners/flink/1.7/src/test/java/org/apache/beam/runners/flink/streaming/StreamSources.java b/runners/flink/1.7/src/test/java/org/apache/beam/runners/flink/streaming/StreamSources.java
new file mode 100644
index 0000000..6c49ea2
--- /dev/null
+++ b/runners/flink/1.7/src/test/java/org/apache/beam/runners/flink/streaming/StreamSources.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import org.apache.flink.streaming.api.functions.source.SourceFunction;
+import org.apache.flink.streaming.api.operators.Output;
+import org.apache.flink.streaming.api.operators.StreamSource;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.streaming.runtime.streamstatus.StreamStatusMaintainer;
+
+/** {@link StreamSource} utilities, that bridge incompatibilities between Flink releases. */
+public class StreamSources {
+
+ public static <OutT, SrcT extends SourceFunction<OutT>> void run(
+ StreamSource<OutT, SrcT> streamSource,
+ Object lockingObject,
+ StreamStatusMaintainer streamStatusMaintainer,
+ Output<StreamRecord<OutT>> collector)
+ throws Exception {
+ streamSource.run(lockingObject, streamStatusMaintainer, collector);
+ }
+}
diff --git a/runners/flink/1.8/build.gradle b/runners/flink/1.8/build.gradle
index d956493..2a05f8c 100644
--- a/runners/flink/1.8/build.gradle
+++ b/runners/flink/1.8/build.gradle
@@ -22,11 +22,11 @@
project.ext {
// Set the version of all Flink-related dependencies here.
flink_version = '1.8.2'
- // Main source directory and Flink version specific code.
- main_source_dirs = ["$basePath/src/main/java", "./src/main/java"]
- test_source_dirs = ["$basePath/src/test/java", "./src/test/java"]
- main_resources_dirs = ["$basePath/src/main/resources"]
- test_resources_dirs = ["$basePath/src/test/resources"]
+ // Version specific code overrides.
+ main_source_overrides = ["${basePath}/1.7/src/main/java", './src/main/java']
+ test_source_overrides = ["${basePath}/1.7/src/test/java", './src/test/java']
+ main_resources_overrides = []
+ test_resources_overrides = []
archives_base_name = 'beam-runners-flink-1.8'
}
diff --git a/runners/flink/1.9/build.gradle b/runners/flink/1.9/build.gradle
new file mode 100644
index 0000000..3396f0b
--- /dev/null
+++ b/runners/flink/1.9/build.gradle
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+def basePath = '..'
+/* All properties required for loading the Flink build script */
+project.ext {
+ // Set the version of all Flink-related dependencies here.
+ flink_version = '1.9.1'
+ // Version specific code overrides.
+ main_source_overrides = ["${basePath}/1.7/src/main/java", "${basePath}/1.8/src/main/java", './src/main/java']
+ test_source_overrides = ["${basePath}/1.7/src/test/java", "${basePath}/1.8/src/test/java", './src/test/java']
+ main_resources_overrides = []
+ test_resources_overrides = []
+ archives_base_name = 'beam-runners-flink-1.9'
+}
+
+// Load the main build script which contains all build logic.
+apply from: "$basePath/flink_runner.gradle"
diff --git a/runners/flink/1.9/job-server-container/build.gradle b/runners/flink/1.9/job-server-container/build.gradle
new file mode 100644
index 0000000..afdb68a
--- /dev/null
+++ b/runners/flink/1.9/job-server-container/build.gradle
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+def basePath = '../../job-server-container'
+
+project.ext {
+ resource_path = basePath
+}
+
+// Load the main build script which contains all build logic.
+apply from: "$basePath/flink_job_server_container.gradle"
diff --git a/runners/flink/1.9/job-server/build.gradle b/runners/flink/1.9/job-server/build.gradle
new file mode 100644
index 0000000..b094dda
--- /dev/null
+++ b/runners/flink/1.9/job-server/build.gradle
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+def basePath = '../../job-server'
+
+project.ext {
+ // Look for the source code in the parent module
+ main_source_dirs = ["$basePath/src/main/java"]
+ test_source_dirs = ["$basePath/src/test/java"]
+ main_resources_dirs = ["$basePath/src/main/resources"]
+ test_resources_dirs = ["$basePath/src/test/resources"]
+ archives_base_name = 'beam-runners-flink-1.9-job-server'
+}
+
+// Load the main build script which contains all build logic.
+apply from: "$basePath/flink_job_server.gradle"
diff --git a/runners/flink/1.9/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BeamStoppableFunction.java b/runners/flink/1.9/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BeamStoppableFunction.java
new file mode 100644
index 0000000..4a29036
--- /dev/null
+++ b/runners/flink/1.9/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BeamStoppableFunction.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.io;
+
+/**
+ * Custom StoppableFunction for backward compatibility.
+ *
+ * @see <a
+ * href="https://github.com/apache/flink/commit/e95b347dda5233f22fb03e408f2aa521ff924996">Flink
+ * interface removal commit.</a>
+ */
+public interface BeamStoppableFunction {
+
+ /** Unused method for backward compatibility. */
+ void stop();
+}
diff --git a/runners/flink/1.9/src/test/java/org/apache/beam/runners/flink/streaming/StreamSources.java b/runners/flink/1.9/src/test/java/org/apache/beam/runners/flink/streaming/StreamSources.java
new file mode 100644
index 0000000..24674eb
--- /dev/null
+++ b/runners/flink/1.9/src/test/java/org/apache/beam/runners/flink/streaming/StreamSources.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder;
+import org.apache.flink.streaming.api.functions.source.SourceFunction;
+import org.apache.flink.streaming.api.operators.AbstractStreamOperator;
+import org.apache.flink.streaming.api.operators.Output;
+import org.apache.flink.streaming.api.operators.StreamSource;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.streaming.runtime.streamstatus.StreamStatusMaintainer;
+import org.apache.flink.streaming.runtime.tasks.OperatorChain;
+import org.apache.flink.streaming.runtime.tasks.StreamTask;
+
+/** {@link StreamSource} utilities, that bridge incompatibilities between Flink releases. */
+public class StreamSources {
+
+ public static <OutT, SrcT extends SourceFunction<OutT>> void run(
+ StreamSource<OutT, SrcT> streamSource,
+ Object lockingObject,
+ StreamStatusMaintainer streamStatusMaintainer,
+ Output<StreamRecord<OutT>> collector)
+ throws Exception {
+ streamSource.run(
+ lockingObject, streamStatusMaintainer, collector, createOperatorChain(streamSource));
+ }
+
+ private static OperatorChain<?, ?> createOperatorChain(AbstractStreamOperator<?> operator) {
+ return new OperatorChain<>(
+ operator.getContainingTask(),
+ StreamTask.createRecordWriters(
+ operator.getOperatorConfig(), new MockEnvironmentBuilder().build()));
+ }
+}
diff --git a/runners/flink/flink_runner.gradle b/runners/flink/flink_runner.gradle
index 893f153..1ffcaf6 100644
--- a/runners/flink/flink_runner.gradle
+++ b/runners/flink/flink_runner.gradle
@@ -42,24 +42,58 @@
evaluationDependsOn(":runners:core-java")
/*
+ * Copy & merge source overrides into build directory.
+ */
+def sourceOverridesBase = "${project.buildDir}/source-overrides/src"
+
+def copySourceOverrides = tasks.register('copySourceOverrides', Copy) {
+ it.from main_source_overrides
+ it.into "${sourceOverridesBase}/main/java"
+ it.duplicatesStrategy DuplicatesStrategy.INCLUDE
+}
+compileJava.dependsOn copySourceOverrides
+
+def copyResourcesOverrides = tasks.register('copyResourcesOverrides', Copy) {
+ it.from main_resources_overrides
+ it.into "${sourceOverridesBase}/main/resources"
+ it.duplicatesStrategy DuplicatesStrategy.INCLUDE
+}
+compileJava.dependsOn copyResourcesOverrides
+
+def copyTestSourceOverrides = tasks.register('copyTestSourceOverrides', Copy) {
+ it.from test_source_overrides
+ it.into "${sourceOverridesBase}/test/java"
+ it.duplicatesStrategy DuplicatesStrategy.INCLUDE
+}
+compileTestJava.dependsOn copyTestSourceOverrides
+
+def copyTestResourcesOverrides = tasks.register('copyTestResourcesOverrides', Copy) {
+ it.from test_resources_overrides
+ it.into "${sourceOverridesBase}/test/resources"
+ it.duplicatesStrategy DuplicatesStrategy.INCLUDE
+}
+compileJava.dependsOn copyTestResourcesOverrides
+
+/*
* We have to explicitly set all directories here to make sure each
* version of Flink has the correct overrides set.
*/
+def sourceBase = "${project.projectDir}/../src"
sourceSets {
main {
java {
- srcDirs = main_source_dirs
+ srcDirs = ["${sourceBase}/main/java", "${sourceOverridesBase}/main/java"]
}
resources {
- srcDirs = main_resources_dirs
+ srcDirs = ["${sourceBase}/main/resources", "${sourceOverridesBase}/main/resources"]
}
}
test {
java {
- srcDirs = test_source_dirs
+ srcDirs = ["${sourceBase}/test/java", "${sourceOverridesBase}/test/java"]
}
resources {
- srcDirs = test_resources_dirs
+ srcDirs = ["${sourceBase}/test/resources", "${sourceOverridesBase}/test/resources"]
}
}
}
@@ -71,7 +105,7 @@
*/
spotless {
java {
- target project.sourceSets.main.allJava + project.sourceSets.test.allJava
+ target target + project.fileTree(project.projectDir.parentFile) { include 'src/*/java/**/*.java' }
}
}
@@ -86,6 +120,9 @@
// Run them serially for now, to avoid "Exit code 137", i.e. Jenkins host killing the Gradle test process
if (project.path == ":runners:flink:1.8") {
mustRunAfter(":runners:flink:1.7:test")
+ } else if (project.path == ":runners:flink:1.9") {
+ mustRunAfter(":runners:flink:1.7:test")
+ mustRunAfter(":runners:flink:1.8:test")
}
}
@@ -142,7 +179,7 @@
def createValidatesRunnerTask(Map m) {
def config = m as ValidatesRunnerConfig
- tasks.create(name: config.name, type: Test) {
+ tasks.register(config.name, Test) {
group = "Verification"
def runnerType = config.streaming ? "streaming" : "batch"
description = "Validates the ${runnerType} runner"
@@ -178,12 +215,12 @@
createValidatesRunnerTask(name: "validatesRunnerBatch", streaming: false)
createValidatesRunnerTask(name: "validatesRunnerStreaming", streaming: true)
-task validatesRunner {
- group = "Verification"
+tasks.register('validatesRunner') {
+ group = 'Verification'
description "Validates Flink runner"
dependsOn validatesRunnerBatch
dependsOn validatesRunnerStreaming
}
-// Generates :runners:flink:1.8:runQuickstartJavaFlinkLocal
+// Generates :runners:flink:1.9:runQuickstartJavaFlinkLocal
createJavaExamplesArchetypeValidationTask(type: 'Quickstart', runner: 'FlinkLocal')
diff --git a/runners/flink/job-server/flink_job_server.gradle b/runners/flink/job-server/flink_job_server.gradle
index ee856e7..03ddced 100644
--- a/runners/flink/job-server/flink_job_server.gradle
+++ b/runners/flink/job-server/flink_job_server.gradle
@@ -89,6 +89,7 @@
// For resolving external transform requests
runtime project(":sdks:java:io:kafka")
runtime library.java.kafka_clients
+ runtime project(":sdks:java:io:google-cloud-platform")
}
// NOTE: runShadow must be used in order to run the job server. The standard run
@@ -147,6 +148,7 @@
excludeCategories 'org.apache.beam.sdk.testing.UsesParDoLifecycle'
excludeCategories 'org.apache.beam.sdk.testing.UsesMapState'
excludeCategories 'org.apache.beam.sdk.testing.UsesSetState'
+ excludeCategories 'org.apache.beam.sdk.testing.UsesStrictTimerOrdering'
if (streaming) {
excludeCategories 'org.apache.beam.sdk.testing.UsesTestStreamWithProcessingTime'
} else {
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkExecutionEnvironments.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkExecutionEnvironments.java
index e2a8900..4a13f91 100644
--- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkExecutionEnvironments.java
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkExecutionEnvironments.java
@@ -70,20 +70,22 @@
LOG.info("Creating a Batch Execution Environment.");
- String masterUrl = options.getFlinkMaster();
+ // Although Flink uses Rest, it expects the address not to contain a http scheme
+ String flinkMasterHostPort = stripHttpSchema(options.getFlinkMaster());
Configuration flinkConfiguration = getFlinkConfiguration(confDir);
ExecutionEnvironment flinkBatchEnv;
// depending on the master, create the right environment.
- if ("[local]".equals(masterUrl)) {
+ if ("[local]".equals(flinkMasterHostPort)) {
flinkBatchEnv = ExecutionEnvironment.createLocalEnvironment(flinkConfiguration);
- } else if ("[collection]".equals(masterUrl)) {
+ } else if ("[collection]".equals(flinkMasterHostPort)) {
flinkBatchEnv = new CollectionEnvironment();
- } else if ("[auto]".equals(masterUrl)) {
+ } else if ("[auto]".equals(flinkMasterHostPort)) {
flinkBatchEnv = ExecutionEnvironment.getExecutionEnvironment();
} else {
int defaultPort = flinkConfiguration.getInteger(RestOptions.PORT);
- HostAndPort hostAndPort = HostAndPort.fromString(masterUrl).withDefaultPort(defaultPort);
+ HostAndPort hostAndPort =
+ HostAndPort.fromString(flinkMasterHostPort).withDefaultPort(defaultPort);
flinkConfiguration.setInteger(RestOptions.PORT, hostAndPort.getPort());
flinkBatchEnv =
ExecutionEnvironment.createRemoteEnvironment(
@@ -145,7 +147,8 @@
LOG.info("Creating a Streaming Environment.");
- String masterUrl = options.getFlinkMaster();
+ // Although Flink uses Rest, it expects the address not to contain a http scheme
+ String masterUrl = stripHttpSchema(options.getFlinkMaster());
Configuration flinkConfiguration = getFlinkConfiguration(confDir);
final StreamExecutionEnvironment flinkStreamEnv;
@@ -264,6 +267,15 @@
return flinkStreamEnv;
}
+ /**
+ * Removes the http:// or https:// schema from a url string. This is commonly used with the
+ * flink_master address which is expected to be of form host:port but users may specify a URL;
+ * Python code also assumes a URL which may be passed here.
+ */
+ private static String stripHttpSchema(String url) {
+ return url.trim().replaceFirst("^http[s]?://", "");
+ }
+
private static int determineParallelism(
final int pipelineOptionsParallelism,
final int envParallelism,
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkJobInvoker.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkJobInvoker.java
index b6e040c..a123653 100644
--- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkJobInvoker.java
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkJobInvoker.java
@@ -45,7 +45,7 @@
private final FlinkJobServerDriver.FlinkServerConfiguration serverConfig;
- private FlinkJobInvoker(FlinkJobServerDriver.FlinkServerConfiguration serverConfig) {
+ protected FlinkJobInvoker(FlinkJobServerDriver.FlinkServerConfiguration serverConfig) {
super("flink-runner-job-invoker");
this.serverConfig = serverConfig;
}
@@ -66,7 +66,7 @@
String.format("%s_%s", flinkOptions.getJobName(), UUID.randomUUID().toString());
if (FlinkPipelineOptions.AUTO.equals(flinkOptions.getFlinkMaster())) {
- flinkOptions.setFlinkMaster(serverConfig.getFlinkMasterUrl());
+ flinkOptions.setFlinkMaster(serverConfig.getFlinkMaster());
}
PortablePipelineOptions portableOptions = flinkOptions.as(PortablePipelineOptions.class);
@@ -90,7 +90,7 @@
invocationId, retrievalToken, executorService, pipeline, flinkOptions, pipelineRunner);
}
- static JobInvocation createJobInvocation(
+ protected JobInvocation createJobInvocation(
String invocationId,
String retrievalToken,
ListeningExecutorService executorService,
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkJobServerDriver.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkJobServerDriver.java
index 0c283d8..c5ad768 100644
--- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkJobServerDriver.java
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkJobServerDriver.java
@@ -19,7 +19,6 @@
import javax.annotation.Nullable;
import org.apache.beam.runners.fnexecution.ServerFactory;
-import org.apache.beam.runners.fnexecution.jobsubmission.JobInvoker;
import org.apache.beam.runners.fnexecution.jobsubmission.JobServerDriver;
import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
import org.apache.beam.sdk.io.FileSystems;
@@ -38,11 +37,16 @@
/** Flink runner-specific Configuration for the jobServer. */
public static class FlinkServerConfiguration extends ServerConfiguration {
- @Option(name = "--flink-master-url", usage = "Flink master url to submit job.")
- private String flinkMasterUrl = "[auto]";
+ @Option(
+ name = "--flink-master",
+ usage =
+ "Flink master address (host:port) to submit the job against. Use \"[local]\" to start a local "
+ + "cluster for the execution. Use \"[auto]\" if you plan to either execute locally or submit through "
+ + "Flink\'s CLI.")
+ private String flinkMaster = FlinkPipelineOptions.AUTO;
- String getFlinkMasterUrl() {
- return this.flinkMasterUrl;
+ String getFlinkMaster() {
+ return this.flinkMaster;
}
@Option(
@@ -76,7 +80,7 @@
System.err.println();
}
- public static FlinkJobServerDriver fromParams(String[] args) {
+ public static FlinkServerConfiguration parseArgs(String[] args) {
FlinkServerConfiguration configuration = new FlinkServerConfiguration();
CmdLineParser parser = new CmdLineParser(configuration);
try {
@@ -86,33 +90,45 @@
printUsage(parser);
throw new IllegalArgumentException("Unable to parse command line arguments.", e);
}
+ return configuration;
+ }
- return fromConfig(configuration);
+ // this method is used via reflection in TestPortableRunner
+ public static FlinkJobServerDriver fromParams(String[] args) {
+ return fromConfig(parseArgs(args));
}
public static FlinkJobServerDriver fromConfig(FlinkServerConfiguration configuration) {
return create(
configuration,
createJobServerFactory(configuration),
- createArtifactServerFactory(configuration));
+ createArtifactServerFactory(configuration),
+ () -> FlinkJobInvoker.create(configuration));
}
- public static FlinkJobServerDriver create(
+ public static FlinkJobServerDriver fromConfig(
+ FlinkServerConfiguration configuration, JobInvokerFactory jobInvokerFactory) {
+ return create(
+ configuration,
+ createJobServerFactory(configuration),
+ createArtifactServerFactory(configuration),
+ jobInvokerFactory);
+ }
+
+ private static FlinkJobServerDriver create(
FlinkServerConfiguration configuration,
ServerFactory jobServerFactory,
- ServerFactory artifactServerFactory) {
- return new FlinkJobServerDriver(configuration, jobServerFactory, artifactServerFactory);
+ ServerFactory artifactServerFactory,
+ JobInvokerFactory jobInvokerFactory) {
+ return new FlinkJobServerDriver(
+ configuration, jobServerFactory, artifactServerFactory, jobInvokerFactory);
}
private FlinkJobServerDriver(
FlinkServerConfiguration configuration,
ServerFactory jobServerFactory,
- ServerFactory artifactServerFactory) {
- super(configuration, jobServerFactory, artifactServerFactory);
- }
-
- @Override
- protected JobInvoker createJobInvoker() {
- return FlinkJobInvoker.create((FlinkServerConfiguration) configuration);
+ ServerFactory artifactServerFactory,
+ JobInvokerFactory jobInvokerFactory) {
+ super(configuration, jobServerFactory, artifactServerFactory, jobInvokerFactory);
}
}
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPortableClientEntryPoint.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPortableClientEntryPoint.java
new file mode 100644
index 0000000..04be5fe
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPortableClientEntryPoint.java
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import java.io.File;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import org.apache.beam.model.pipeline.v1.RunnerApi;
+import org.apache.beam.runners.fnexecution.environment.ProcessManager;
+import org.apache.beam.runners.fnexecution.jobsubmission.JobInvocation;
+import org.apache.beam.runners.fnexecution.jobsubmission.JobInvoker;
+import org.apache.beam.runners.fnexecution.jobsubmission.PortablePipelineResult;
+import org.apache.beam.runners.fnexecution.jobsubmission.PortablePipelineRunner;
+import org.apache.beam.runners.fnexecution.provisioning.JobInfo;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.ListeningExecutorService;
+import org.apache.flink.api.common.time.Deadline;
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Flink job entry point to launch a Beam pipeline by executing an external SDK driver program.
+ *
+ * <p>Designed for non-interactive Flink REST client and container with Beam job server jar and SDK
+ * client (for example when using the FlinkK8sOperator). In the future it would be possible to
+ * support driver program execution in a separate (sidecar) container by introducing a client
+ * environment abstraction similar to how it exists for SDK workers.
+ *
+ * <p>Using this entry point eliminates the need to build jar files with materialized pipeline
+ * protos offline. Allows the driver program to access actual execution environment and services, on
+ * par with code executed by SDK workers.
+ *
+ * <p>The entry point starts the job server and provides the endpoint to the driver program.
+ *
+ * <p>The external driver program constructs the Beam pipeline and submits it to the job service.
+ *
+ * <p>The job service defers execution of the pipeline to the plan environment and returns the
+ * "detached" status to the driver program.
+ *
+ * <p>Upon arrival of the job invocation, the entry point executes the runner, which prepares
+ * ("executes") the Flink job through the plan environment.
+ *
+ * <p>Finally Flink launches the job.
+ */
+public class FlinkPortableClientEntryPoint {
+ private static final Logger LOG = LoggerFactory.getLogger(FlinkPortableClientEntryPoint.class);
+ private static final String JOB_ENDPOINT_FLAG = "--job_endpoint";
+ private static final Duration JOB_INVOCATION_TIMEOUT = Duration.ofSeconds(30);
+ private static final Duration JOB_SERVICE_STARTUP_TIMEOUT = Duration.ofSeconds(30);
+
+ private final String driverCmd;
+ private FlinkJobServerDriver jobServer;
+ private Thread jobServerThread;
+ private DetachedJobInvokerFactory jobInvokerFactory;
+ private int jobPort = 0; // pick any free port
+
+ public FlinkPortableClientEntryPoint(String driverCmd) {
+ Preconditions.checkState(
+ !driverCmd.contains(JOB_ENDPOINT_FLAG),
+ "Driver command must not contain " + JOB_ENDPOINT_FLAG);
+ this.driverCmd = driverCmd;
+ }
+
+ /** Main method to be called standalone or by Flink (CLI or REST API). */
+ public static void main(String[] args) throws Exception {
+ LOG.info("entry points args: {}", Arrays.asList(args));
+ EntryPointConfiguration configuration = parseArgs(args);
+ FlinkPortableClientEntryPoint runner =
+ new FlinkPortableClientEntryPoint(configuration.driverCmd);
+ try {
+ runner.startJobService();
+ runner.runDriverProgram();
+ } catch (Exception e) {
+ throw new RuntimeException(String.format("Job %s failed.", configuration.driverCmd), e);
+ } finally {
+ LOG.info("Stopping job service");
+ runner.stopJobService();
+ }
+ LOG.info("Job submitted successfully.");
+ }
+
+ private static class EntryPointConfiguration {
+ @Option(
+ name = "--driver-cmd",
+ required = true,
+ usage =
+ "Command that launches the Python driver program. "
+ + "(The job service endpoint will be appended as --job_endpoint=localhost:<port>.)")
+ private String driverCmd;
+ }
+
+ private static EntryPointConfiguration parseArgs(String[] args) {
+ EntryPointConfiguration configuration = new EntryPointConfiguration();
+ CmdLineParser parser = new CmdLineParser(configuration);
+ try {
+ parser.parseArgument(args);
+ } catch (CmdLineException e) {
+ LOG.error("Unable to parse command line arguments.", e);
+ parser.printUsage(System.err);
+ throw new IllegalArgumentException("Unable to parse command line arguments.", e);
+ }
+ return configuration;
+ }
+
+ private void startJobService() throws Exception {
+ jobInvokerFactory = new DetachedJobInvokerFactory();
+ jobServer =
+ FlinkJobServerDriver.fromConfig(
+ FlinkJobServerDriver.parseArgs(
+ new String[] {"--job-port=" + jobPort, "--artifact-port=0", "--expansion-port=0"}),
+ jobInvokerFactory);
+ jobServerThread = new Thread(jobServer);
+ jobServerThread.start();
+
+ Deadline deadline = Deadline.fromNow(JOB_SERVICE_STARTUP_TIMEOUT);
+ while (jobServer.getJobServerUrl() == null && deadline.hasTimeLeft()) {
+ try {
+ Thread.sleep(500);
+ } catch (InterruptedException interruptEx) {
+ Thread.currentThread().interrupt();
+ throw new RuntimeException(interruptEx);
+ }
+ }
+
+ if (!jobServerThread.isAlive()) {
+ throw new IllegalStateException("Job service thread is not alive");
+ }
+
+ if (jobServer.getJobServerUrl() == null) {
+ String msg = String.format("Timeout of %s waiting for job service to start.", deadline);
+ throw new TimeoutException(msg);
+ }
+ }
+
+ private void runDriverProgram() throws Exception {
+ ProcessManager processManager = ProcessManager.create();
+ String executable = "bash";
+ List<String> args =
+ ImmutableList.of(
+ "-c",
+ String.format("%s %s=%s", driverCmd, JOB_ENDPOINT_FLAG, jobServer.getJobServerUrl()));
+ String processId = "client1";
+ File outputFile = File.createTempFile("beam-driver-program", ".log");
+
+ try {
+ final ProcessManager.RunningProcess driverProcess =
+ processManager.startProcess(processId, executable, args, System.getenv(), outputFile);
+ driverProcess.isAliveOrThrow();
+ LOG.info("Started driver program");
+
+ // await effect of the driver program submitting the job
+ jobInvokerFactory.executeDetachedJob();
+ } catch (Exception e) {
+ try {
+ processManager.stopProcess(processId);
+ } catch (Exception processKillException) {
+ e.addSuppressed(processKillException);
+ }
+ byte[] output = Files.readAllBytes(outputFile.toPath());
+ String msg =
+ String.format(
+ "Failed to start job with driver program: %s %s output: %s",
+ executable, args, new String(output, Charset.defaultCharset()));
+ throw new RuntimeException(msg, e);
+ }
+ }
+
+ private void stopJobService() throws InterruptedException {
+ if (jobServer != null) {
+ jobServer.stop();
+ }
+ if (jobServerThread != null) {
+ jobServerThread.interrupt();
+ jobServerThread.join();
+ }
+ }
+
+ private class DetachedJobInvokerFactory implements FlinkJobServerDriver.JobInvokerFactory {
+
+ private CountDownLatch latch = new CountDownLatch(1);
+ private volatile PortablePipelineRunner actualPipelineRunner;
+ private volatile RunnerApi.Pipeline pipeline;
+ private volatile JobInfo jobInfo;
+
+ private PortablePipelineRunner handoverPipelineRunner =
+ new PortablePipelineRunner() {
+ @Override
+ public PortablePipelineResult run(RunnerApi.Pipeline pipeline, JobInfo jobInfo) {
+ DetachedJobInvokerFactory.this.pipeline = pipeline;
+ DetachedJobInvokerFactory.this.jobInfo = jobInfo;
+ LOG.info("Pipeline execution handover for {}", jobInfo.jobId());
+ latch.countDown();
+ return new FlinkPortableRunnerResult.Detached();
+ }
+ };
+
+ @Override
+ public JobInvoker create() {
+ return new FlinkJobInvoker(
+ (FlinkJobServerDriver.FlinkServerConfiguration) jobServer.configuration) {
+ @Override
+ protected JobInvocation createJobInvocation(
+ String invocationId,
+ String retrievalToken,
+ ListeningExecutorService executorService,
+ RunnerApi.Pipeline pipeline,
+ FlinkPipelineOptions flinkOptions,
+ PortablePipelineRunner pipelineRunner) {
+ // replace pipeline runner to handover execution
+ actualPipelineRunner = pipelineRunner;
+ return super.createJobInvocation(
+ invocationId,
+ retrievalToken,
+ executorService,
+ pipeline,
+ flinkOptions,
+ handoverPipelineRunner);
+ }
+ };
+ }
+
+ private void executeDetachedJob() throws Exception {
+ long timeoutSeconds = JOB_INVOCATION_TIMEOUT.getSeconds();
+ if (latch.await(timeoutSeconds, TimeUnit.SECONDS)) {
+ actualPipelineRunner.run(pipeline, jobInfo);
+ } else {
+ throw new TimeoutException(
+ String.format("Timeout of %s seconds waiting for job submission.", timeoutSeconds));
+ }
+ }
+ }
+}
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java
index 785bf2b..cdb3060 100644
--- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java
@@ -49,6 +49,7 @@
import org.apache.beam.runners.flink.translation.wrappers.streaming.SplittableDoFnOperator;
import org.apache.beam.runners.flink.translation.wrappers.streaming.WindowDoFnOperator;
import org.apache.beam.runners.flink.translation.wrappers.streaming.WorkItemKeySelector;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.io.BeamStoppableFunction;
import org.apache.beam.runners.flink.translation.wrappers.streaming.io.DedupingOperator;
import org.apache.beam.runners.flink.translation.wrappers.streaming.io.TestStreamSource;
import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSourceWrapper;
@@ -95,7 +96,6 @@
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
-import org.apache.flink.api.common.functions.StoppableFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
@@ -1412,7 +1412,7 @@
OutputT, CheckpointMarkT extends UnboundedSource.CheckpointMark>
extends RichParallelSourceFunction<WindowedValue<OutputT>>
implements ProcessingTimeCallback,
- StoppableFunction,
+ BeamStoppableFunction,
CheckpointListener,
CheckpointedFunction {
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainer.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainer.java
index f8f26eb..2db34a1 100644
--- a/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainer.java
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainer.java
@@ -74,6 +74,8 @@
metricsAccumulator = new MetricsAccumulator();
try {
runtimeContext.addAccumulator(ACCUMULATOR_NAME, metricsAccumulator);
+ } catch (UnsupportedOperationException e) {
+ // Not supported in all environments, e.g. tests
} catch (Exception e) {
LOG.error("Failed to create metrics accumulator.", e);
}
@@ -119,8 +121,8 @@
Counter counter =
flinkCounterCache.computeIfAbsent(
flinkMetricName, n -> runtimeContext.getMetricGroup().counter(n));
- counter.dec(counter.getCount());
- counter.inc(update);
+ // Beam counters are already pre-aggregated, just update with the current value here
+ counter.inc(update - counter.getCount());
}
}
@@ -190,7 +192,7 @@
}
/** Flink {@link Gauge} for {@link GaugeResult}. */
- public static class FlinkGauge implements Gauge<GaugeResult> {
+ public static class FlinkGauge implements Gauge<Long> {
GaugeResult data;
@@ -203,8 +205,8 @@
}
@Override
- public GaugeResult getValue() {
- return data;
+ public Long getValue() {
+ return data.getValue();
}
}
}
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
index 4f48287..b4ce64f 100644
--- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
@@ -630,7 +630,6 @@
@Override
public void processWatermark1(Watermark mark) throws Exception {
- checkInvokeStartBundle();
// We do the check here because we are guaranteed to at least get the +Inf watermark on the
// main input when the job finishes.
if (currentSideInputWatermark >= BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis()) {
@@ -677,7 +676,6 @@
@Override
public void processWatermark2(Watermark mark) throws Exception {
- checkInvokeStartBundle();
setCurrentSideInputWatermark(mark.getTimestamp());
if (mark.getTimestamp() >= BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis()) {
@@ -698,6 +696,7 @@
Iterator<WindowedValue<InputT>> it = pushedBackElementsHandler.getElements().iterator();
while (it.hasNext()) {
+ checkInvokeStartBundle();
WindowedValue<InputT> element = it.next();
// we need to set the correct key in case the operator is
// a (keyed) window operator
@@ -790,8 +789,7 @@
@Override
public void onEventTime(InternalTimer<ByteBuffer, TimerData> timer) throws Exception {
- // We don't have to cal checkInvokeStartBundle() because it's already called in
- // processWatermark*().
+ checkInvokeStartBundle();
fireTimer(timer);
}
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java
index 28cc507..56744f6 100644
--- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java
@@ -43,7 +43,6 @@
import org.apache.beam.sdk.values.ValueWithRecordId;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting;
import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.functions.StoppableFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.state.OperatorStateStore;
@@ -64,7 +63,10 @@
/** Wrapper for executing {@link UnboundedSource UnboundedSources} as a Flink Source. */
public class UnboundedSourceWrapper<OutputT, CheckpointMarkT extends UnboundedSource.CheckpointMark>
extends RichParallelSourceFunction<WindowedValue<ValueWithRecordId<OutputT>>>
- implements ProcessingTimeCallback, StoppableFunction, CheckpointListener, CheckpointedFunction {
+ implements ProcessingTimeCallback,
+ BeamStoppableFunction,
+ CheckpointListener,
+ CheckpointedFunction {
private static final Logger LOG = LoggerFactory.getLogger(UnboundedSourceWrapper.class);
@@ -422,6 +424,7 @@
}
OperatorStateStore stateStore = context.getOperatorStateStore();
+ @SuppressWarnings("unchecked")
CoderTypeInformation<KV<? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT>>
typeInformation = (CoderTypeInformation) new CoderTypeInformation<>(checkpointCoder);
stateForCheckpoint =
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkExecutionEnvironmentsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkExecutionEnvironmentsTest.java
index af8c4ba..d74bbb9 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkExecutionEnvironmentsTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkExecutionEnvironmentsTest.java
@@ -17,10 +17,10 @@
*/
package org.apache.beam.runners.flink;
+import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertThat;
import java.io.File;
import java.io.IOException;
@@ -382,6 +382,42 @@
assertThat(Whitebox.getInternalState(sev, "port"), is(RestOptions.PORT.defaultValue()));
}
+ @Test
+ public void shouldRemoveHttpProtocolFromHostBatch() {
+ FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
+ options.setRunner(FlinkRunner.class);
+
+ for (String flinkMaster :
+ new String[] {
+ "http://host:1234", " http://host:1234", "https://host:1234", " https://host:1234"
+ }) {
+ options.setFlinkMaster(flinkMaster);
+ ExecutionEnvironment sev =
+ FlinkExecutionEnvironments.createBatchExecutionEnvironment(
+ options, Collections.emptyList());
+ assertThat(Whitebox.getInternalState(sev, "host"), is("host"));
+ assertThat(Whitebox.getInternalState(sev, "port"), is(1234));
+ }
+ }
+
+ @Test
+ public void shouldRemoveHttpProtocolFromHostStreaming() {
+ FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
+ options.setRunner(FlinkRunner.class);
+
+ for (String flinkMaster :
+ new String[] {
+ "http://host:1234", " http://host:1234", "https://host:1234", " https://host:1234"
+ }) {
+ options.setFlinkMaster(flinkMaster);
+ StreamExecutionEnvironment sev =
+ FlinkExecutionEnvironments.createStreamExecutionEnvironment(
+ options, Collections.emptyList());
+ assertThat(Whitebox.getInternalState(sev, "host"), is("host"));
+ assertThat(Whitebox.getInternalState(sev, "port"), is(1234));
+ }
+ }
+
private String extractFlinkConfig() throws IOException {
InputStream inputStream = getClass().getResourceAsStream("/flink-conf.yaml");
File root = temporaryFolder.getRoot();
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java
index 1e345d0..a75f5f0 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java
@@ -17,6 +17,7 @@
*/
package org.apache.beam.runners.flink;
+import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.CoreMatchers.not;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.hamcrest.MatcherAssert.assertThat;
@@ -42,7 +43,7 @@
assertThat(config.getPort(), is(8099));
assertThat(config.getArtifactPort(), is(8098));
assertThat(config.getExpansionPort(), is(8097));
- assertThat(config.getFlinkMasterUrl(), is("[auto]"));
+ assertThat(config.getFlinkMaster(), is("[auto]"));
assertThat(config.isCleanArtifactsPerJob(), is(true));
FlinkJobServerDriver flinkJobServerDriver = FlinkJobServerDriver.fromConfig(config);
assertThat(flinkJobServerDriver, is(not(nullValue())));
@@ -50,8 +51,8 @@
@Test
public void testConfigurationFromArgs() {
- FlinkJobServerDriver driver =
- FlinkJobServerDriver.fromParams(
+ FlinkJobServerDriver.FlinkServerConfiguration config =
+ FlinkJobServerDriver.parseArgs(
new String[] {
"--job-host=test",
"--job-port",
@@ -63,13 +64,11 @@
"--flink-master-url=jobmanager",
"--clean-artifacts-per-job=false",
});
- FlinkJobServerDriver.FlinkServerConfiguration config =
- (FlinkJobServerDriver.FlinkServerConfiguration) driver.configuration;
assertThat(config.getHost(), is("test"));
assertThat(config.getPort(), is(42));
assertThat(config.getArtifactPort(), is(43));
assertThat(config.getExpansionPort(), is(44));
- assertThat(config.getFlinkMasterUrl(), is("jobmanager"));
+ assertThat(config.getFlinkMaster(), is("jobmanager"));
assertThat(config.isCleanArtifactsPerJob(), is(false));
}
@@ -107,6 +106,8 @@
Thread.sleep(100);
}
}
+ assertThat(driver.getJobServerUrl(), is(not(nullValue())));
+ assertThat(baos.toString(Charsets.UTF_8.name()), containsString(driver.getJobServerUrl()));
assertThat(driverThread.isAlive(), is(true));
} catch (Throwable t) {
// restore to print exception
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSavepointTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSavepointTest.java
index 4315e62..2ea9160 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSavepointTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSavepointTest.java
@@ -209,13 +209,14 @@
FlinkPipelineOptions pipelineOptions = pipeline.getOptions().as(FlinkPipelineOptions.class);
try {
JobInvocation jobInvocation =
- FlinkJobInvoker.createJobInvocation(
- "id",
- "none",
- executorService,
- pipelineProto,
- pipelineOptions,
- new FlinkPipelineRunner(pipelineOptions, null, Collections.emptyList()));
+ FlinkJobInvoker.create(null)
+ .createJobInvocation(
+ "id",
+ "none",
+ executorService,
+ pipelineProto,
+ pipelineOptions,
+ new FlinkPipelineRunner(pipelineOptions, null, Collections.emptyList()));
jobInvocation.start();
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslatorsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslatorsTest.java
index 8c9eb11..dc43330 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslatorsTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslatorsTest.java
@@ -48,7 +48,6 @@
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.transformations.OneInputTransformation;
import org.apache.flink.streaming.api.transformations.SourceTransformation;
-import org.apache.flink.streaming.api.transformations.StreamTransformation;
import org.junit.Test;
/** Tests for Flink streaming transform translators. */
@@ -65,12 +64,12 @@
env.setParallelism(parallelism);
env.setMaxParallelism(maxParallelism);
- StreamTransformation<?> sourceTransform =
- applyReadSourceTransform(transform, PCollection.IsBounded.BOUNDED, env);
+ SourceTransformation<?> sourceTransform =
+ (SourceTransformation)
+ applyReadSourceTransform(transform, PCollection.IsBounded.BOUNDED, env);
UnboundedSourceWrapperNoValueWithRecordId source =
- (UnboundedSourceWrapperNoValueWithRecordId)
- ((SourceTransformation<?>) sourceTransform).getOperator().getUserFunction();
+ (UnboundedSourceWrapperNoValueWithRecordId) sourceTransform.getOperator().getUserFunction();
assertEquals(maxParallelism, source.getUnderlyingSource().getSplitSources().size());
}
@@ -84,12 +83,12 @@
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(parallelism);
- StreamTransformation<?> sourceTransform =
- applyReadSourceTransform(transform, PCollection.IsBounded.BOUNDED, env);
+ SourceTransformation<?> sourceTransform =
+ (SourceTransformation)
+ applyReadSourceTransform(transform, PCollection.IsBounded.BOUNDED, env);
UnboundedSourceWrapperNoValueWithRecordId source =
- (UnboundedSourceWrapperNoValueWithRecordId)
- ((SourceTransformation<?>) sourceTransform).getOperator().getUserFunction();
+ (UnboundedSourceWrapperNoValueWithRecordId) sourceTransform.getOperator().getUserFunction();
assertEquals(parallelism, source.getUnderlyingSource().getSplitSources().size());
}
@@ -105,14 +104,13 @@
env.setParallelism(parallelism);
env.setMaxParallelism(maxParallelism);
- StreamTransformation<?> sourceTransform =
- applyReadSourceTransform(transform, PCollection.IsBounded.UNBOUNDED, env);
+ OneInputTransformation<?, ?> sourceTransform =
+ (OneInputTransformation)
+ applyReadSourceTransform(transform, PCollection.IsBounded.UNBOUNDED, env);
UnboundedSourceWrapper source =
(UnboundedSourceWrapper)
- ((SourceTransformation) ((OneInputTransformation) sourceTransform).getInput())
- .getOperator()
- .getUserFunction();
+ ((SourceTransformation) sourceTransform.getInput()).getOperator().getUserFunction();
assertEquals(maxParallelism, source.getSplitSources().size());
}
@@ -126,19 +124,18 @@
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(parallelism);
- StreamTransformation<?> sourceTransform =
- applyReadSourceTransform(transform, PCollection.IsBounded.UNBOUNDED, env);
+ OneInputTransformation<?, ?> sourceTransform =
+ (OneInputTransformation)
+ applyReadSourceTransform(transform, PCollection.IsBounded.UNBOUNDED, env);
UnboundedSourceWrapper source =
(UnboundedSourceWrapper)
- ((SourceTransformation) ((OneInputTransformation) sourceTransform).getInput())
- .getOperator()
- .getUserFunction();
+ ((SourceTransformation) sourceTransform.getInput()).getOperator().getUserFunction();
assertEquals(parallelism, source.getSplitSources().size());
}
- private StreamTransformation<?> applyReadSourceTransform(
+ private Object applyReadSourceTransform(
PTransform<?, ?> transform, PCollection.IsBounded isBounded, StreamExecutionEnvironment env) {
FlinkStreamingPipelineTranslator.StreamTransformTranslator<PTransform<?, ?>> translator =
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableExecutionTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableExecutionTest.java
index 10db32c..18bf64e 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableExecutionTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableExecutionTest.java
@@ -142,14 +142,15 @@
// execute the pipeline
JobInvocation jobInvocation =
- FlinkJobInvoker.createJobInvocation(
- "fakeId",
- "fakeRetrievalToken",
- flinkJobExecutor,
- pipelineProto,
- options.as(FlinkPipelineOptions.class),
- new FlinkPipelineRunner(
- options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
+ FlinkJobInvoker.create(null)
+ .createJobInvocation(
+ "fakeId",
+ "fakeRetrievalToken",
+ flinkJobExecutor,
+ pipelineProto,
+ options.as(FlinkPipelineOptions.class),
+ new FlinkPipelineRunner(
+ options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
jobInvocation.start();
while (jobInvocation.getState() != Enum.DONE) {
Thread.sleep(1000);
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableStateExecutionTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableStateExecutionTest.java
index 91f243b..9ba0721 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableStateExecutionTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableStateExecutionTest.java
@@ -196,14 +196,15 @@
RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p);
JobInvocation jobInvocation =
- FlinkJobInvoker.createJobInvocation(
- "id",
- "none",
- flinkJobExecutor,
- pipelineProto,
- options.as(FlinkPipelineOptions.class),
- new FlinkPipelineRunner(
- options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
+ FlinkJobInvoker.create(null)
+ .createJobInvocation(
+ "id",
+ "none",
+ flinkJobExecutor,
+ pipelineProto,
+ options.as(FlinkPipelineOptions.class),
+ new FlinkPipelineRunner(
+ options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
jobInvocation.start();
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableTimersExecutionTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableTimersExecutionTest.java
index 9cddcd6..669cc51 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableTimersExecutionTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/PortableTimersExecutionTest.java
@@ -183,14 +183,15 @@
RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline);
JobInvocation jobInvocation =
- FlinkJobInvoker.createJobInvocation(
- "id",
- "none",
- flinkJobExecutor,
- pipelineProto,
- options.as(FlinkPipelineOptions.class),
- new FlinkPipelineRunner(
- options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
+ FlinkJobInvoker.create(null)
+ .createJobInvocation(
+ "id",
+ "none",
+ flinkJobExecutor,
+ pipelineProto,
+ options.as(FlinkPipelineOptions.class),
+ new FlinkPipelineRunner(
+ options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
jobInvocation.start();
while (jobInvocation.getState() != Enum.DONE) {
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourcePortableTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourcePortableTest.java
index 40d621f..88c2a8d 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourcePortableTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourcePortableTest.java
@@ -104,14 +104,15 @@
// execute the pipeline
JobInvocation jobInvocation =
- FlinkJobInvoker.createJobInvocation(
- "fakeId",
- "fakeRetrievalToken",
- flinkJobExecutor,
- pipelineProto,
- options.as(FlinkPipelineOptions.class),
- new FlinkPipelineRunner(
- options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
+ FlinkJobInvoker.create(null)
+ .createJobInvocation(
+ "fakeId",
+ "fakeRetrievalToken",
+ flinkJobExecutor,
+ pipelineProto,
+ options.as(FlinkPipelineOptions.class),
+ new FlinkPipelineRunner(
+ options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
jobInvocation.start();
while (jobInvocation.getState() != Enum.DONE) {
Thread.sleep(100);
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerTest.java
index 8a5f027..b4bad56 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerTest.java
@@ -111,13 +111,13 @@
MetricName metricName = MetricName.named("namespace", "name");
Gauge gauge = step.getGauge(metricName);
- assertThat(flinkGauge.getValue(), is(GaugeResult.empty()));
+ assertThat(flinkGauge.getValue(), is(-1L));
// first set will install the mocked gauge
container.updateMetrics("step");
gauge.set(1);
gauge.set(42);
container.updateMetrics("step");
- assertThat(flinkGauge.getValue().getValue(), is(42L));
+ assertThat(flinkGauge.getValue(), is(42L));
}
@Test
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/BoundedSourceRestoreTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/BoundedSourceRestoreTest.java
index 5c553b2..e6a95a1 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/BoundedSourceRestoreTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/BoundedSourceRestoreTest.java
@@ -102,7 +102,8 @@
boolean readFirstBatchOfElements = false;
try {
testHarness.open();
- sourceOperator.run(
+ StreamSources.run(
+ sourceOperator,
checkpointLock,
new TestStreamStatusMaintainer(),
new PartialCollector<>(emittedElements, firstBatchSize));
@@ -147,7 +148,8 @@
boolean readSecondBatchOfElements = false;
try {
restoredTestHarness.open();
- restoredSourceOperator.run(
+ StreamSources.run(
+ restoredSourceOperator,
checkpointLock,
new TestStreamStatusMaintainer(),
new PartialCollector<>(emittedElements, secondBatchSize));
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperatorTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperatorTest.java
index 57f7694..6e96b3b 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperatorTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperatorTest.java
@@ -19,6 +19,7 @@
import static org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue;
import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.emptyIterable;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.instanceOf;
@@ -1038,7 +1039,7 @@
assertThat(
stripStreamRecordFromWindowedValue(testHarness.getOutput()),
- contains(helloElement, worldElement));
+ containsInAnyOrder(helloElement, worldElement));
testHarness.close();
}
@@ -1274,7 +1275,8 @@
WindowedValue.valueInGlobalWindow("d"),
WindowedValue.valueInGlobalWindow("finishBundle")));
- // A final bundle will be created when sending the MAX watermark
+ // No bundle will be created when sending the MAX watermark
+ // (unless pushed back items are emitted)
newHarness.close();
assertThat(
@@ -1282,7 +1284,6 @@
contains(
WindowedValue.valueInGlobalWindow("finishBundle"),
WindowedValue.valueInGlobalWindow("d"),
- WindowedValue.valueInGlobalWindow("finishBundle"),
WindowedValue.valueInGlobalWindow("finishBundle")));
// close() will also call dispose(), but call again to verify no new bundle
@@ -1294,7 +1295,6 @@
contains(
WindowedValue.valueInGlobalWindow("finishBundle"),
WindowedValue.valueInGlobalWindow("d"),
- WindowedValue.valueInGlobalWindow("finishBundle"),
WindowedValue.valueInGlobalWindow("finishBundle")));
}
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java
index 8134b24..0d7c99f 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java
@@ -403,10 +403,6 @@
verify(stageBundleFactory).getProcessBundleDescriptor();
verify(stageBundleFactory).close();
verify(stageContext).close();
- // DoFnOperator generates a final watermark, which triggers a new bundle..
- verify(stageBundleFactory).getBundle(any(), any(), any());
- verify(bundle).getInputReceivers();
- verify(bundle).close();
verifyNoMoreInteractions(stageBundleFactory);
// close() will also call dispose(), but call again to verify no new bundle
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapperTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapperTest.java
index eb868ed..7b0f9b8 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapperTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapperTest.java
@@ -36,6 +36,7 @@
import java.util.stream.LongStream;
import org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource;
import org.apache.beam.runners.flink.FlinkPipelineOptions;
+import org.apache.beam.runners.flink.streaming.StreamSources;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.options.PipelineOptions;
@@ -183,7 +184,8 @@
try {
testHarness.open();
- sourceOperator.run(
+ StreamSources.run(
+ sourceOperator,
testHarness.getCheckpointLock(),
new TestStreamStatusMaintainer(),
new Output<StreamRecord<WindowedValue<ValueWithRecordId<KV<Integer, Integer>>>>>() {
@@ -285,7 +287,8 @@
new Thread(
() -> {
try {
- sourceOperator.run(
+ StreamSources.run(
+ sourceOperator,
testHarness.getCheckpointLock(),
new TestStreamStatusMaintainer(),
new Output<
@@ -397,7 +400,8 @@
try {
testHarness.open();
- sourceOperator.run(
+ StreamSources.run(
+ sourceOperator,
checkpointLock,
new TestStreamStatusMaintainer(),
new Output<StreamRecord<WindowedValue<ValueWithRecordId<KV<Integer, Integer>>>>>() {
@@ -477,7 +481,8 @@
// run again and verify that we see the other elements
try {
restoredTestHarness.open();
- restoredSourceOperator.run(
+ StreamSources.run(
+ restoredSourceOperator,
checkpointLock,
new TestStreamStatusMaintainer(),
new Output<StreamRecord<WindowedValue<ValueWithRecordId<KV<Integer, Integer>>>>>() {
diff --git a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/environment/ProcessManager.java b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/environment/ProcessManager.java
index e5864f9..65fcdf2 100644
--- a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/environment/ProcessManager.java
+++ b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/environment/ProcessManager.java
@@ -38,6 +38,9 @@
public class ProcessManager {
private static final Logger LOG = LoggerFactory.getLogger(ProcessManager.class);
+ /** A symbolic file to indicate that we want to inherit I/O of parent process. */
+ public static final File INHERIT_IO_FILE = new File("_inherit_io_unused_filename_");
+
/** For debugging purposes, we inherit I/O of processes. */
private static final boolean INHERIT_IO = LOG.isDebugEnabled();
@@ -63,7 +66,7 @@
this.processes = Collections.synchronizedMap(new HashMap<>());
}
- static class RunningProcess {
+ public static class RunningProcess {
private Process process;
RunningProcess(Process process) {
@@ -71,7 +74,7 @@
}
/** Checks if the underlying process is still running. */
- void isAliveOrThrow() throws IllegalStateException {
+ public void isAliveOrThrow() throws IllegalStateException {
if (!process.isAlive()) {
throw new IllegalStateException("Process died with exit code " + process.exitValue());
}
@@ -106,27 +109,41 @@
*/
public RunningProcess startProcess(
String id, String command, List<String> args, Map<String, String> env) throws IOException {
+ final File outputFile;
+ if (INHERIT_IO) {
+ LOG.debug(
+ "==> DEBUG enabled: Inheriting stdout/stderr of process (adjustable in ProcessManager)");
+ outputFile = INHERIT_IO_FILE;
+ } else {
+ // Pipe stdout and stderr to /dev/null to avoid blocking the process due to filled PIPE
+ // buffer
+ if (System.getProperty("os.name", "").startsWith("Windows")) {
+ outputFile = new File("nul");
+ } else {
+ outputFile = new File("/dev/null");
+ }
+ }
+ return startProcess(id, command, args, env, outputFile);
+ }
+
+ public RunningProcess startProcess(
+ String id, String command, List<String> args, Map<String, String> env, File outputFile)
+ throws IOException {
checkNotNull(id, "Process id must not be null");
checkNotNull(command, "Command must not be null");
checkNotNull(args, "Process args must not be null");
checkNotNull(env, "Environment map must not be null");
+ checkNotNull(outputFile, "Output redirect file must not be null");
ProcessBuilder pb =
new ProcessBuilder(ImmutableList.<String>builder().add(command).addAll(args).build());
pb.environment().putAll(env);
- if (INHERIT_IO) {
- LOG.debug(
- "==> DEBUG enabled: Inheriting stdout/stderr of process (adjustable in ProcessManager)");
+ if (INHERIT_IO_FILE.equals(outputFile)) {
pb.inheritIO();
} else {
pb.redirectErrorStream(true);
- // Pipe stdout and stderr to /dev/null to avoid blocking the process due to filled PIPE buffer
- if (System.getProperty("os.name", "").startsWith("Windows")) {
- pb.redirectOutput(new File("nul"));
- } else {
- pb.redirectOutput(new File("/dev/null"));
- }
+ pb.redirectOutput(outputFile);
}
LOG.debug("Attempting to start process with command: {}", pb.command());
diff --git a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/jobsubmission/JobServerDriver.java b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/jobsubmission/JobServerDriver.java
index 0c5bf94..d0061d2 100644
--- a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/jobsubmission/JobServerDriver.java
+++ b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/jobsubmission/JobServerDriver.java
@@ -40,18 +40,21 @@
private final ServerFactory jobServerFactory;
private final ServerFactory artifactServerFactory;
+ private final JobInvokerFactory jobInvokerFactory;
private volatile GrpcFnServer<InMemoryJobService> jobServer;
private volatile GrpcFnServer<BeamFileSystemArtifactStagingService> artifactStagingServer;
private volatile ExpansionServer expansionServer;
- protected abstract JobInvoker createJobInvoker();
+ public interface JobInvokerFactory {
+ JobInvoker create();
+ }
protected InMemoryJobService createJobService() throws IOException {
artifactStagingServer = createArtifactStagingService();
expansionServer = createExpansionService();
- JobInvoker invoker = createJobInvoker();
+ JobInvoker invoker = jobInvokerFactory.create();
return InMemoryJobService.create(
artifactStagingServer.getApiServiceDescriptor(),
this::createSessionToken,
@@ -130,10 +133,17 @@
protected JobServerDriver(
ServerConfiguration configuration,
ServerFactory jobServerFactory,
- ServerFactory artifactServerFactory) {
+ ServerFactory artifactServerFactory,
+ JobInvokerFactory jobInvokerFactory) {
this.configuration = configuration;
this.jobServerFactory = jobServerFactory;
this.artifactServerFactory = artifactServerFactory;
+ this.jobInvokerFactory = jobInvokerFactory;
+ }
+
+ // Can be used to discover the address of the job server, and whether it is ready
+ public String getJobServerUrl() {
+ return (jobServer != null) ? jobServer.getApiServiceDescriptor().getUrl() : null;
}
// This method is executed by TestPortableRunner via Reflection
diff --git a/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/environment/ProcessManagerTest.java b/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/environment/ProcessManagerTest.java
index 39efeef..d0c02c6 100644
--- a/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/environment/ProcessManagerTest.java
+++ b/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/environment/ProcessManagerTest.java
@@ -19,10 +19,17 @@
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.fail;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
import java.util.Arrays;
import java.util.Collections;
import org.junit.Test;
@@ -99,4 +106,55 @@
assertThat(process.getUnderlyingProcess().exitValue(), is(1));
processManager.stopProcess("1");
}
+
+ @Test
+ public void testRedirectOutput() throws IOException, InterruptedException {
+ File outputFile = File.createTempFile("beam-redirect-output-", "");
+ outputFile.deleteOnExit();
+ ProcessManager processManager = ProcessManager.create();
+ ProcessManager.RunningProcess process =
+ processManager.startProcess(
+ "1",
+ "bash",
+ Arrays.asList("-c", "echo 'testing123'"),
+ Collections.emptyMap(),
+ outputFile);
+ for (int i = 0; i < 10 && process.getUnderlyingProcess().isAlive(); i++) {
+ Thread.sleep(100);
+ }
+ processManager.stopProcess("1");
+ byte[] output = Files.readAllBytes(outputFile.toPath());
+ assertNotNull(output);
+ String outputStr = new String(output, Charset.defaultCharset());
+ assertThat(outputStr, containsString("testing123"));
+ }
+
+ @Test
+ public void testInheritIO() throws IOException, InterruptedException {
+ final PrintStream oldOut = System.out;
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ PrintStream newOut = new PrintStream(baos);
+ try {
+ System.setOut(newOut);
+ ProcessManager processManager = ProcessManager.create();
+ ProcessManager.RunningProcess process =
+ processManager.startProcess(
+ "1",
+ "bash",
+ Arrays.asList("-c", "echo 'testing123' 1>&2;"),
+ Collections.emptyMap(),
+ ProcessManager.INHERIT_IO_FILE);
+ for (int i = 0; i < 10 && process.getUnderlyingProcess().isAlive(); i++) {
+ Thread.sleep(100);
+ }
+ processManager.stopProcess("1");
+ } finally {
+ System.setOut(oldOut);
+ }
+ // TODO: this doesn't work as inherit IO bypasses System.out/err
+ // the output instead appears in the console
+ // String outputStr = new String(baos.toByteArray(), Charset.defaultCharset());
+ // assertThat(outputStr, containsString("testing123"));
+ assertFalse(ProcessManager.INHERIT_IO_FILE.exists());
+ }
}
diff --git a/runners/reference/java/src/main/java/org/apache/beam/runners/reference/testing/TestPortablePipelineOptions.java b/runners/reference/java/src/main/java/org/apache/beam/runners/reference/testing/TestPortablePipelineOptions.java
index 33ba8b1..3713b8d 100644
--- a/runners/reference/java/src/main/java/org/apache/beam/runners/reference/testing/TestPortablePipelineOptions.java
+++ b/runners/reference/java/src/main/java/org/apache/beam/runners/reference/testing/TestPortablePipelineOptions.java
@@ -18,6 +18,7 @@
package org.apache.beam.runners.reference.testing;
import com.google.auto.service.AutoService;
+import org.apache.beam.runners.fnexecution.jobsubmission.JobServerDriver;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.DefaultValueFactory;
import org.apache.beam.sdk.options.Description;
@@ -32,11 +33,10 @@
public interface TestPortablePipelineOptions extends TestPipelineOptions, PortablePipelineOptions {
@Required
- @Description(
- "Fully qualified class name of TestJobServiceDriver capable of managing the JobService.")
- Class getJobServerDriver();
+ @Description("Fully qualified class name of a JobServerDriver subclass.")
+ Class<JobServerDriver> getJobServerDriver();
- void setJobServerDriver(Class jobServerDriver);
+ void setJobServerDriver(Class<JobServerDriver> jobServerDriver);
@Description("String containing comma separated arguments for the JobServer.")
@Default.InstanceFactory(DefaultJobServerConfigFactory.class)
diff --git a/runners/reference/java/src/main/java/org/apache/beam/runners/reference/testing/TestPortableRunner.java b/runners/reference/java/src/main/java/org/apache/beam/runners/reference/testing/TestPortableRunner.java
index d7295f2..3492419 100644
--- a/runners/reference/java/src/main/java/org/apache/beam/runners/reference/testing/TestPortableRunner.java
+++ b/runners/reference/java/src/main/java/org/apache/beam/runners/reference/testing/TestPortableRunner.java
@@ -19,7 +19,8 @@
import static org.hamcrest.MatcherAssert.assertThat;
-import java.lang.reflect.InvocationTargetException;
+import java.io.IOException;
+import org.apache.beam.runners.fnexecution.jobsubmission.JobServerDriver;
import org.apache.beam.runners.reference.PortableRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
@@ -37,13 +38,8 @@
* {@link TestPortableRunner} is a pipeline runner that wraps a {@link PortableRunner} when running
* tests against the {@link TestPipeline}.
*
- * <p>This runner requires a JobServerDriver with following methods.
- *
- * <ul>
- * <li>public static Object fromParams(String... params)
- * <li>public String start() // Start JobServer and returns the JobServer host and port.
- * <li>public void stop() // Stop the JobServer and free all resources.
- * </ul>
+ * <p>This runner requires a {@link JobServerDriver} subclass with the following factory method:
+ * <code>public static JobServerDriver fromParams(String[] args)</code>
*
* @see TestPipeline
*/
@@ -64,8 +60,8 @@
TestPortablePipelineOptions testPortablePipelineOptions =
options.as(TestPortablePipelineOptions.class);
String jobServerHostPort;
- Object jobServerDriver;
- Class<?> jobServerDriverClass = testPortablePipelineOptions.getJobServerDriver();
+ JobServerDriver jobServerDriver;
+ Class<JobServerDriver> jobServerDriverClass = testPortablePipelineOptions.getJobServerDriver();
String[] parameters = testPortablePipelineOptions.getJobServerConfig();
try {
jobServerDriver =
@@ -73,9 +69,9 @@
.fromFactoryMethod("fromParams")
.withArg(String[].class, parameters)
.build();
- jobServerHostPort = (String) jobServerDriverClass.getMethod("start").invoke(jobServerDriver);
- } catch (IllegalAccessException | NoSuchMethodException | InvocationTargetException e) {
- throw new IllegalArgumentException(e);
+ jobServerHostPort = jobServerDriver.start();
+ } catch (IOException e) {
+ throw new RuntimeException("Failed to start job server", e);
}
try {
@@ -87,14 +83,7 @@
assertThat("Pipeline did not succeed.", result.waitUntilFinish(), Matchers.is(State.DONE));
return result;
} finally {
- try {
- jobServerDriverClass.getMethod("stop").invoke(jobServerDriver);
- } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) {
- LOG.error(
- String.format(
- "Provided JobServiceDriver %s does not implement stop().", jobServerDriverClass),
- e);
- }
+ jobServerDriver.stop();
}
}
}
diff --git a/runners/samza/build.gradle b/runners/samza/build.gradle
index 209db64..ae6f48f 100644
--- a/runners/samza/build.gradle
+++ b/runners/samza/build.gradle
@@ -87,6 +87,7 @@
excludeCategories 'org.apache.beam.sdk.testing.UsesTestStream'
excludeCategories 'org.apache.beam.sdk.testing.UsesMetricsPusher'
excludeCategories 'org.apache.beam.sdk.testing.UsesParDoLifecycle'
+ excludeCategories 'org.apache.beam.sdk.testing.UsesStrictTimerOrdering'
}
}
diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaPipelineOptions.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaPipelineOptions.java
index ed4437f..3ff64e3 100644
--- a/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaPipelineOptions.java
+++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaPipelineOptions.java
@@ -105,4 +105,16 @@
List<MetricsReporter> getMetricsReporters();
void setMetricsReporters(List<MetricsReporter> reporters);
+
+ @Description("The maximum number of elements in a bundle.")
+ @Default.Long(1)
+ long getMaxBundleSize();
+
+ void setMaxBundleSize(long maxBundleSize);
+
+ @Description("The maximum time to wait before finalising a bundle (in milliseconds).")
+ @Default.Long(1000)
+ long getMaxBundleTimeMs();
+
+ void setMaxBundleTimeMs(long maxBundleTimeMs);
}
diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaPipelineOptionsValidator.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaPipelineOptionsValidator.java
index 24ed330..f965e5a 100644
--- a/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaPipelineOptionsValidator.java
+++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaPipelineOptionsValidator.java
@@ -18,17 +18,44 @@
package org.apache.beam.runners.samza;
import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument;
+import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsValidator;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.samza.config.TaskConfig;
/** Validates that the {@link SamzaPipelineOptions} conforms to all the criteria. */
public class SamzaPipelineOptionsValidator {
- public static SamzaPipelineOptions validate(PipelineOptions opts) {
- final SamzaPipelineOptions samzaOptions =
- PipelineOptionsValidator.validate(SamzaPipelineOptions.class, opts);
+ public static void validate(SamzaPipelineOptions opts) {
+ checkArgument(opts.getMaxSourceParallelism() >= 1);
+ validateBundlingRelatedOptions(opts);
+ }
- checkArgument(samzaOptions.getMaxSourceParallelism() >= 1);
- return samzaOptions;
+ /*
+ * Perform some bundling-related validation for the pipeline options.
+ */
+ private static void validateBundlingRelatedOptions(SamzaPipelineOptions pipelineOptions) {
+ if (pipelineOptions.getMaxBundleSize() > 1) {
+ // TODO: remove this check and implement bundling for side input, timer, etc in DoFnOp.java
+ checkState(
+ isPortable(pipelineOptions),
+ "Bundling is not supported in non portable mode. Please disable by setting maxBundleSize to 1.");
+
+ String taskConcurrencyConfig = TaskConfig.MAX_CONCURRENCY();
+ Map<String, String> configs =
+ pipelineOptions.getConfigOverride() == null
+ ? new HashMap<>()
+ : pipelineOptions.getConfigOverride();
+ long taskConcurrency = Long.parseLong(configs.getOrDefault(taskConcurrencyConfig, "1"));
+ checkState(
+ taskConcurrency == 1,
+ "Bundling is not supported if "
+ + taskConcurrencyConfig
+ + " is greater than 1. Please disable bundling by setting maxBundleSize to 1. Or disable task concurrency.");
+ }
+ }
+
+ private static boolean isPortable(SamzaPipelineOptions options) {
+ return options instanceof SamzaPortablePipelineOptions;
}
}
diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaRunner.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaRunner.java
index 0eb50c4..4a94626 100644
--- a/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaRunner.java
+++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/SamzaRunner.java
@@ -36,6 +36,7 @@
import org.apache.beam.sdk.PipelineRunner;
import org.apache.beam.sdk.metrics.MetricsEnvironment;
import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptionsValidator;
import org.apache.beam.sdk.values.PValue;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterators;
import org.apache.samza.application.StreamApplication;
@@ -56,7 +57,8 @@
private static final Logger LOG = LoggerFactory.getLogger(SamzaRunner.class);
public static SamzaRunner fromOptions(PipelineOptions opts) {
- final SamzaPipelineOptions samzaOptions = SamzaPipelineOptionsValidator.validate(opts);
+ final SamzaPipelineOptions samzaOptions =
+ PipelineOptionsValidator.validate(SamzaPipelineOptions.class, opts);
return new SamzaRunner(samzaOptions);
}
@@ -133,6 +135,9 @@
pipeline, new TranslationContext(appDescriptor, idMap, options));
};
+ // perform a final round of validation for the pipeline options now that all configs are
+ // generated
+ SamzaPipelineOptionsValidator.validate(options);
ApplicationRunner runner = runSamzaApp(app, config);
return new SamzaPipelineResult(app, runner, executionContext, listener, config);
}
diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/DoFnOp.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/DoFnOp.java
index 86e2ee6..795b8c0 100644
--- a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/DoFnOp.java
+++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/DoFnOp.java
@@ -24,6 +24,8 @@
import java.util.List;
import java.util.Map;
import java.util.ServiceLoader;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
import org.apache.beam.model.pipeline.v1.RunnerApi;
import org.apache.beam.runners.core.DoFnRunner;
import org.apache.beam.runners.core.DoFnRunners;
@@ -32,14 +34,15 @@
import org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner;
import org.apache.beam.runners.core.StateNamespace;
import org.apache.beam.runners.core.StateNamespaces;
+import org.apache.beam.runners.core.StateTags;
import org.apache.beam.runners.core.TimerInternals;
-import org.apache.beam.runners.core.construction.SerializablePipelineOptions;
import org.apache.beam.runners.core.construction.graph.ExecutableStage;
-import org.apache.beam.runners.core.serialization.Base64Serializer;
import org.apache.beam.runners.fnexecution.control.StageBundleFactory;
import org.apache.beam.runners.samza.SamzaExecutionContext;
import org.apache.beam.runners.samza.SamzaPipelineOptions;
import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.state.BagState;
+import org.apache.beam.sdk.state.TimeDomain;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.DoFnSchemaInformation;
import org.apache.beam.sdk.transforms.join.RawUnionValue;
@@ -57,6 +60,7 @@
import org.apache.samza.config.Config;
import org.apache.samza.context.Context;
import org.apache.samza.operators.Scheduler;
+import org.joda.time.Duration;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -64,6 +68,7 @@
/** Samza operator for {@link DoFn}. */
public class DoFnOp<InT, FnOutT, OutT> implements Op<InT, OutT, Void> {
private static final Logger LOG = LoggerFactory.getLogger(DoFnOp.class);
+ private static final long MIN_BUNDLE_CHECK_TIME_MS = 10L;
private final TupleTag<FnOutT> mainOutputTag;
private final DoFn<InT, FnOutT> doFn;
@@ -77,8 +82,11 @@
private final String transformFullName;
private final String transformId;
private final Coder<InT> inputCoder;
+ private final Coder<WindowedValue<InT>> windowedValueCoder;
private final HashMap<TupleTag<?>, Coder<?>> outputCoders;
private final PCollection.IsBounded isBounded;
+ private final String bundleCheckTimerId;
+ private final String bundleStateId;
// portable api related
private final boolean isPortable;
@@ -90,6 +98,7 @@
private transient PushbackSideInputDoFnRunner<InT, FnOutT> pushbackFnRunner;
private transient SideInputHandler sideInputHandler;
private transient DoFnInvoker<InT, FnOutT> doFnInvoker;
+ private transient SamzaPipelineOptions samzaPipelineOptions;
// This is derivable from pushbackValues which is persisted to a store.
// TODO: eagerly initialize the hold in init
@@ -100,9 +109,16 @@
// TODO: add this to checkpointable state
private transient Instant inputWatermark;
+ private transient Instant bundleWatermarkHold;
private transient Instant sideInputWatermark;
private transient List<WindowedValue<InT>> pushbackValues;
private transient StageBundleFactory stageBundleFactory;
+ private transient long maxBundleSize;
+ private transient long maxBundleTimeMs;
+ private transient AtomicLong currentBundleElementCount;
+ private transient AtomicLong bundleStartTime;
+ private transient AtomicBoolean isBundleStarted;
+ private transient Scheduler<KeyedTimerData<Void>> bundleTimerScheduler;
private DoFnSchemaInformation doFnSchemaInformation;
private Map<String, PCollectionView<?>> sideInputMapping;
@@ -111,6 +127,7 @@
DoFn<InT, FnOutT> doFn,
Coder<?> keyCoder,
Coder<InT> inputCoder,
+ Coder<WindowedValue<InT>> windowedValueCoder,
Map<TupleTag<?>, Coder<?>> outputCoders,
Collection<PCollectionView<?>> sideInputs,
List<TupleTag<?>> sideOutputTags,
@@ -130,6 +147,7 @@
this.sideInputs = sideInputs;
this.sideOutputTags = sideOutputTags;
this.inputCoder = inputCoder;
+ this.windowedValueCoder = windowedValueCoder;
this.outputCoders = new HashMap<>(outputCoders);
this.windowingStrategy = windowingStrategy;
this.idToViewMap = new HashMap<>(idToViewMap);
@@ -141,6 +159,8 @@
this.isPortable = isPortable;
this.stagePayload = stagePayload;
this.idToTupleTagMap = new HashMap<>(idToTupleTagMap);
+ this.bundleCheckTimerId = "_samza_bundle_check_" + transformId;
+ this.bundleStateId = "_samza_bundle_" + transformId;
this.doFnSchemaInformation = doFnSchemaInformation;
this.sideInputMapping = sideInputMapping;
}
@@ -154,17 +174,26 @@
this.inputWatermark = BoundedWindow.TIMESTAMP_MIN_VALUE;
this.sideInputWatermark = BoundedWindow.TIMESTAMP_MIN_VALUE;
this.pushbackWatermarkHold = BoundedWindow.TIMESTAMP_MAX_VALUE;
+ this.currentBundleElementCount = new AtomicLong(0L);
+ this.bundleStartTime = new AtomicLong(Long.MAX_VALUE);
+ this.isBundleStarted = new AtomicBoolean(false);
+ this.bundleWatermarkHold = null;
final DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
- final SamzaPipelineOptions pipelineOptions =
- Base64Serializer.deserializeUnchecked(
- config.get("beamPipelineOptions"), SerializablePipelineOptions.class)
- .get()
- .as(SamzaPipelineOptions.class);
+ final SamzaExecutionContext samzaExecutionContext =
+ (SamzaExecutionContext) context.getApplicationContainerContext();
+ this.samzaPipelineOptions = samzaExecutionContext.getPipelineOptions();
+ this.maxBundleSize = samzaPipelineOptions.getMaxBundleSize();
+ this.maxBundleTimeMs = samzaPipelineOptions.getMaxBundleTimeMs();
+ this.bundleTimerScheduler = timerRegistry;
+
+ if (this.maxBundleSize > 1) {
+ scheduleNextBundleCheck();
+ }
final SamzaStoreStateInternals.Factory<?> nonKeyedStateInternalsFactory =
SamzaStoreStateInternals.createStateInternalFactory(
- transformId, null, context.getTaskContext(), pipelineOptions, signature);
+ transformId, null, context.getTaskContext(), samzaPipelineOptions, signature);
this.timerInternalsFactory =
SamzaTimerInternalsFactory.createTimerInternalFactory(
@@ -174,18 +203,23 @@
nonKeyedStateInternalsFactory,
windowingStrategy,
isBounded,
- pipelineOptions);
+ samzaPipelineOptions);
this.sideInputHandler =
new SideInputHandler(sideInputs, nonKeyedStateInternalsFactory.stateInternalsForKey(null));
if (isPortable) {
- SamzaExecutionContext samzaExecutionContext =
- (SamzaExecutionContext) context.getApplicationContainerContext();
- ExecutableStage executableStage = ExecutableStage.fromPayload(stagePayload);
+ // storing events within a bundle in states
+ final BagState<WindowedValue<InT>> bundledEventsBagState =
+ nonKeyedStateInternalsFactory
+ .stateInternalsForKey(null)
+ .state(StateNamespaces.global(), StateTags.bag(bundleStateId, windowedValueCoder));
+ final ExecutableStage executableStage = ExecutableStage.fromPayload(stagePayload);
stageBundleFactory = samzaExecutionContext.getJobBundleFactory().forStage(executableStage);
this.fnRunner =
SamzaDoFnRunners.createPortable(
+ samzaPipelineOptions,
+ bundledEventsBagState,
outputManagerFactory.create(emitter),
stageBundleFactory,
mainOutputTag,
@@ -195,7 +229,7 @@
} else {
this.fnRunner =
SamzaDoFnRunners.create(
- pipelineOptions,
+ samzaPipelineOptions,
doFn,
windowingStrategy,
transformFullName,
@@ -229,6 +263,25 @@
doFnInvoker.invokeSetup();
}
+ /*
+ * Schedule in processing time to check whether the current bundle should be closed. Note that
+ * we only approximately achieve max bundle time by checking as frequently as every half of the max
+ * bundle time set by users. This would violate the max bundle time by up to half of it but should be
+ * acceptable in most cases (and cheaper than scheduling a timer at the beginning of every bundle).
+ */
+ private void scheduleNextBundleCheck() {
+ final Instant nextBundleCheckTime =
+ Instant.now().plus(Duration.millis(maxBundleTimeMs / 2 + MIN_BUNDLE_CHECK_TIME_MS));
+ final TimerInternals.TimerData timerData =
+ TimerInternals.TimerData.of(
+ bundleCheckTimerId,
+ StateNamespaces.global(),
+ nextBundleCheckTime,
+ TimeDomain.PROCESSING_TIME);
+ bundleTimerScheduler.schedule(
+ new KeyedTimerData<>(new byte[0], null, timerData), nextBundleCheckTime.getMillis());
+ }
+
private String getTimerStateId(DoFnSignature signature) {
final StringBuilder builder = new StringBuilder("timer");
if (signature.usesTimers()) {
@@ -237,9 +290,39 @@
return builder.toString();
}
+ private void attemptStartBundle() {
+ if (isBundleStarted.compareAndSet(false, true)) {
+ currentBundleElementCount.set(0L);
+ bundleStartTime.set(System.currentTimeMillis());
+ pushbackFnRunner.startBundle();
+ }
+ }
+
+ private void finishBundle(OpEmitter<OutT> emitter) {
+ if (isBundleStarted.compareAndSet(true, false)) {
+ currentBundleElementCount.set(0L);
+ bundleStartTime.set(Long.MAX_VALUE);
+ pushbackFnRunner.finishBundle();
+ if (bundleWatermarkHold != null) {
+ doProcessWatermark(bundleWatermarkHold, emitter);
+ }
+ bundleWatermarkHold = null;
+ }
+ }
+
+ private void attemptFinishBundle(OpEmitter<OutT> emitter) {
+ if (!isBundleStarted.get()) {
+ return;
+ }
+ if (currentBundleElementCount.get() >= maxBundleSize
+ || System.currentTimeMillis() - bundleStartTime.get() > maxBundleTimeMs) {
+ finishBundle(emitter);
+ }
+ }
+
@Override
public void processElement(WindowedValue<InT> inputElement, OpEmitter<OutT> emitter) {
- pushbackFnRunner.startBundle();
+ attemptStartBundle();
final Iterable<WindowedValue<InT>> rejectedValues =
pushbackFnRunner.processElementInReadyWindows(inputElement);
@@ -250,11 +333,11 @@
pushbackValues.add(rejectedValue);
}
- pushbackFnRunner.finishBundle();
+ currentBundleElementCount.incrementAndGet();
+ attemptFinishBundle(emitter);
}
- @Override
- public void processWatermark(Instant watermark, OpEmitter<OutT> emitter) {
+ private void doProcessWatermark(Instant watermark, OpEmitter<OutT> emitter) {
this.inputWatermark = watermark;
if (sideInputWatermark.isEqual(BoundedWindow.TIMESTAMP_MAX_VALUE)) {
@@ -281,6 +364,20 @@
}
@Override
+ public void processWatermark(Instant watermark, OpEmitter<OutT> emitter) {
+ if (!isBundleStarted.get()) {
+ doProcessWatermark(watermark, emitter);
+ } else {
+ // if there is a bundle in progress, hold back the watermark until end of the bundle
+ this.bundleWatermarkHold = watermark;
+ if (watermark.isEqual(BoundedWindow.TIMESTAMP_MAX_VALUE)) {
+ // for batch mode, the max watermark should force the bundle to close
+ finishBundle(emitter);
+ }
+ }
+ }
+
+ @Override
public void processSideInput(
String id, WindowedValue<? extends Iterable<?>> elements, OpEmitter<OutT> emitter) {
@SuppressWarnings("unchecked")
@@ -317,7 +414,14 @@
}
@Override
- public void processTimer(KeyedTimerData<Void> keyedTimerData) {
+ public void processTimer(KeyedTimerData<Void> keyedTimerData, OpEmitter<OutT> emitter) {
+ // this is internal timer in processing time to check whether a bundle should be closed
+ if (bundleCheckTimerId.equals(keyedTimerData.getTimerData().getTimerId())) {
+ attemptFinishBundle(emitter);
+ scheduleNextBundleCheck();
+ return;
+ }
+
pushbackFnRunner.startBundle();
fireTimer(keyedTimerData);
pushbackFnRunner.finishBundle();
@@ -327,6 +431,7 @@
@Override
public void close() {
+ bundleWatermarkHold = null;
doFnInvoker.invokeTeardown();
try (AutoCloseable closer = stageBundleFactory) {
// do nothing
diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/GroupByKeyOp.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/GroupByKeyOp.java
index 387ca9a..1b0ab61 100644
--- a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/GroupByKeyOp.java
+++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/GroupByKeyOp.java
@@ -195,7 +195,7 @@
}
@Override
- public void processWatermark(Instant watermark, OpEmitter<KV<K, OutputT>> ctx) {
+ public void processWatermark(Instant watermark, OpEmitter<KV<K, OutputT>> emitter) {
timerInternalsFactory.setInputWatermark(watermark);
fnRunner.startBundle();
@@ -207,12 +207,12 @@
if (timerInternalsFactory.getOutputWatermark() == null
|| timerInternalsFactory.getOutputWatermark().isBefore(watermark)) {
timerInternalsFactory.setOutputWatermark(watermark);
- ctx.emitWatermark(timerInternalsFactory.getOutputWatermark());
+ emitter.emitWatermark(timerInternalsFactory.getOutputWatermark());
}
}
@Override
- public void processTimer(KeyedTimerData<K> keyedTimerData) {
+ public void processTimer(KeyedTimerData<K> keyedTimerData, OpEmitter<KV<K, OutputT>> emitter) {
fnRunner.startBundle();
fireTimer(keyedTimerData.getKey(), keyedTimerData.getTimerData());
fnRunner.finishBundle();
diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/Op.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/Op.java
index cbf5c46d..93e6a9c 100644
--- a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/Op.java
+++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/Op.java
@@ -58,7 +58,7 @@
default void processSideInputWatermark(Instant watermark, OpEmitter<OutT> emitter) {}
- default void processTimer(KeyedTimerData<K> keyedTimerData) {};
+ default void processTimer(KeyedTimerData<K> keyedTimerData, OpEmitter<OutT> emitter) {}
default void close() {}
}
diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/OpAdapter.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/OpAdapter.java
index 8b958db..e663a04 100644
--- a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/OpAdapter.java
+++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/OpAdapter.java
@@ -128,7 +128,7 @@
assert outputList.isEmpty();
try {
- op.processTimer(keyedTimerData);
+ op.processTimer(keyedTimerData, emitter);
} catch (Exception e) {
LOG.error("Op {} threw an exception during processing timer", this.getClass().getName(), e);
throw UserCodeException.wrap(e);
diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/SamzaDoFnRunners.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/SamzaDoFnRunners.java
index 49b4a28..3b1b938 100644
--- a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/SamzaDoFnRunners.java
+++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/SamzaDoFnRunners.java
@@ -37,6 +37,7 @@
import org.apache.beam.runners.samza.metrics.DoFnRunnerWithMetrics;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.fn.data.FnDataReceiver;
+import org.apache.beam.sdk.state.BagState;
import org.apache.beam.sdk.state.TimeDomain;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.DoFnSchemaInformation;
@@ -163,6 +164,8 @@
/** Create DoFnRunner for portable runner. */
public static <InT, FnOutT> DoFnRunner<InT, FnOutT> createPortable(
+ SamzaPipelineOptions pipelineOptions,
+ BagState<WindowedValue<InT>> bundledEventsBag,
DoFnRunners.OutputManager outputManager,
StageBundleFactory stageBundleFactory,
TupleTag<FnOutT> mainOutputTag,
@@ -173,7 +176,7 @@
(SamzaExecutionContext) context.getApplicationContainerContext();
final DoFnRunner<InT, FnOutT> sdkHarnessDoFnRunner =
new SdkHarnessDoFnRunner<>(
- outputManager, stageBundleFactory, mainOutputTag, idToTupleTagMap);
+ outputManager, stageBundleFactory, mainOutputTag, idToTupleTagMap, bundledEventsBag);
return DoFnRunnerWithMetrics.wrap(
sdkHarnessDoFnRunner, executionContext.getMetricsContainer(), transformFullName);
}
@@ -184,23 +187,25 @@
private final TupleTag<FnOutT> mainOutputTag;
private final Map<String, TupleTag<?>> idToTupleTagMap;
private final LinkedBlockingQueue<KV<String, FnOutT>> outputQueue = new LinkedBlockingQueue<>();
+ private final BagState<WindowedValue<InT>> bundledEventsBag;
+ private RemoteBundle remoteBundle;
+ private FnDataReceiver<WindowedValue<?>> inputReceiver;
private SdkHarnessDoFnRunner(
DoFnRunners.OutputManager outputManager,
StageBundleFactory stageBundleFactory,
TupleTag<FnOutT> mainOutputTag,
- Map<String, TupleTag<?>> idToTupleTagMap) {
+ Map<String, TupleTag<?>> idToTupleTagMap,
+ BagState<WindowedValue<InT>> bundledEventsBag) {
this.outputManager = outputManager;
this.stageBundleFactory = stageBundleFactory;
this.mainOutputTag = mainOutputTag;
this.idToTupleTagMap = idToTupleTagMap;
+ this.bundledEventsBag = bundledEventsBag;
}
@Override
- public void startBundle() {}
-
- @Override
- public void processElement(WindowedValue<InT> elem) {
+ public void startBundle() {
try {
OutputReceiverFactory receiverFactory =
new OutputReceiverFactory() {
@@ -213,31 +218,66 @@
}
};
- try (RemoteBundle bundle =
+ remoteBundle =
stageBundleFactory.getBundle(
receiverFactory,
StateRequestHandler.unsupported(),
- BundleProgressHandler.ignored())) {
- Iterables.getOnlyElement(bundle.getInputReceivers().values()).accept(elem);
- }
+ BundleProgressHandler.ignored());
- // RemoteBundle close blocks until all results are received
- KV<String, FnOutT> result;
- while ((result = outputQueue.poll()) != null) {
- outputManager.output(
- idToTupleTagMap.get(result.getKey()), (WindowedValue) result.getValue());
- }
+ // TODO: side input support needs to be implemented to handle this properly
+ inputReceiver = Iterables.getOnlyElement(remoteBundle.getInputReceivers().values());
+ bundledEventsBag
+ .read()
+ .forEach(
+ elem -> {
+ try {
+ inputReceiver.accept(elem);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ });
} catch (Exception e) {
throw new RuntimeException(e);
}
}
@Override
+ public void processElement(WindowedValue<InT> elem) {
+ try {
+ bundledEventsBag.add(elem);
+ inputReceiver.accept(elem);
+ emitResults();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private void emitResults() {
+ KV<String, FnOutT> result;
+ while ((result = outputQueue.poll()) != null) {
+ outputManager.output(
+ idToTupleTagMap.get(result.getKey()), (WindowedValue) result.getValue());
+ }
+ }
+
+ @Override
public void onTimer(
String timerId, BoundedWindow window, Instant timestamp, TimeDomain timeDomain) {}
@Override
- public void finishBundle() {}
+ public void finishBundle() {
+ try {
+ // RemoteBundle close blocks until all results are received
+ remoteBundle.close();
+ emitResults();
+ bundledEventsBag.clear();
+ } catch (Exception e) {
+ throw new RuntimeException("Failed to finish remote bundle", e);
+ } finally {
+ remoteBundle = null;
+ inputReceiver = null;
+ }
+ }
@Override
public DoFn<InT, FnOutT> getFn() {
diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/translation/ParDoBoundMultiTranslator.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/translation/ParDoBoundMultiTranslator.java
index f00c34b..6550ebf 100644
--- a/runners/samza/src/main/java/org/apache/beam/runners/samza/translation/ParDoBoundMultiTranslator.java
+++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/translation/ParDoBoundMultiTranslator.java
@@ -144,6 +144,7 @@
transform.getFn(),
keyCoder,
(Coder<InT>) input.getCoder(),
+ null,
outputCoders,
transform.getSideInputs().values(),
transform.getAdditionalOutputTags().getAll(),
@@ -254,6 +255,7 @@
new NoOpDoFn<>(),
null, // key coder not in use
windowedInputCoder.getValueCoder(), // input coder not in use
+ windowedInputCoder,
Collections.emptyMap(), // output coders not in use
Collections.emptyList(), // sideInputs not in use until side input support
new ArrayList<>(idToTupleTagMap.values()), // used by java runner only
diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/translation/SamzaPipelineTranslator.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/translation/SamzaPipelineTranslator.java
index c30b181..bfa2e10 100644
--- a/runners/samza/src/main/java/org/apache/beam/runners/samza/translation/SamzaPipelineTranslator.java
+++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/translation/SamzaPipelineTranslator.java
@@ -18,6 +18,7 @@
package org.apache.beam.runners.samza.translation;
import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument;
+import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState;
import com.google.auto.service.AutoService;
import java.util.HashMap;
@@ -53,6 +54,9 @@
private SamzaPipelineTranslator() {}
public static void translate(Pipeline pipeline, TranslationContext ctx) {
+ checkState(
+ ctx.getPipelineOptions().getMaxBundleSize() <= 1,
+ "bundling is not supported for non portable mode. Please disable bundling (by setting max bundle size to 1).");
final TransformVisitorFn translateFn =
new TransformVisitorFn() {
diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkJobServerDriver.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkJobServerDriver.java
index f0302f1..301cf48 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkJobServerDriver.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkJobServerDriver.java
@@ -18,7 +18,6 @@
package org.apache.beam.runners.spark;
import org.apache.beam.runners.fnexecution.ServerFactory;
-import org.apache.beam.runners.fnexecution.jobsubmission.JobInvoker;
import org.apache.beam.runners.fnexecution.jobsubmission.JobServerDriver;
import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
import org.apache.beam.sdk.io.FileSystems;
@@ -33,11 +32,6 @@
/** Driver program that starts a job server for the Spark runner. */
public class SparkJobServerDriver extends JobServerDriver {
- @Override
- protected JobInvoker createJobInvoker() {
- return SparkJobInvoker.create((SparkServerConfiguration) configuration);
- }
-
private static final Logger LOG = LoggerFactory.getLogger(SparkJobServerDriver.class);
/** Spark runner-specific Configuration for the jobServer. */
@@ -100,6 +94,10 @@
SparkServerConfiguration configuration,
ServerFactory jobServerFactory,
ServerFactory artifactServerFactory) {
- super(configuration, jobServerFactory, artifactServerFactory);
+ super(
+ configuration,
+ jobServerFactory,
+ artifactServerFactory,
+ () -> SparkJobInvoker.create(configuration));
}
}
diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkGroupAlsoByWindowViaWindowSet.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkGroupAlsoByWindowViaWindowSet.java
index 5cb0bec..f8ff5e6 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkGroupAlsoByWindowViaWindowSet.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkGroupAlsoByWindowViaWindowSet.java
@@ -42,6 +42,7 @@
import org.apache.beam.runners.spark.translation.TranslationUtils;
import org.apache.beam.runners.spark.util.ByteArray;
import org.apache.beam.runners.spark.util.GlobalWatermarkHolder;
+import org.apache.beam.runners.spark.util.TimerUtils;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.IterableCoder;
import org.apache.beam.sdk.coders.KvCoder;
@@ -338,6 +339,9 @@
outputHolder.getWindowedValues();
if (!outputs.isEmpty() || !stateInternals.getState().isEmpty()) {
+
+ TimerUtils.dropExpiredTimers(timerInternals, windowingStrategy);
+
// empty outputs are filtered later using DStream filtering
final StateAndTimers updated =
new StateAndTimers(
diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkTimerInternals.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkTimerInternals.java
index 02305a5..6cdcef4 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkTimerInternals.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkTimerInternals.java
@@ -100,7 +100,7 @@
: forStreamFromSources(Lists.newArrayList(watermarks.keySet()), watermarks);
}
- Collection<TimerData> getTimers() {
+ public Collection<TimerData> getTimers() {
return timers;
}
diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/util/TimerUtils.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/util/TimerUtils.java
new file mode 100644
index 0000000..e383d8c
--- /dev/null
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/util/TimerUtils.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.spark.util;
+
+import java.util.Collection;
+import java.util.stream.Collectors;
+import org.apache.beam.runners.core.TimerInternals;
+import org.apache.beam.runners.spark.stateful.SparkTimerInternals;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.values.WindowingStrategy;
+
+public class TimerUtils {
+
+ public static <W extends BoundedWindow> void dropExpiredTimers(
+ SparkTimerInternals sparkTimerInternals, WindowingStrategy<?, W> windowingStrategy) {
+ Collection<TimerInternals.TimerData> expiredTimers =
+ sparkTimerInternals.getTimers().stream()
+ .filter(
+ timer ->
+ timer
+ .getTimestamp()
+ .plus(windowingStrategy.getAllowedLateness())
+ .isBefore(sparkTimerInternals.currentInputWatermarkTime()))
+ .collect(Collectors.toList());
+
+ // Remove the expired timer from the timerInternals structure
+ expiredTimers.forEach(sparkTimerInternals::deleteTimer);
+ }
+}
diff --git a/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/CreateStreamTest.java b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/CreateStreamTest.java
index 1ea8ce8..6e795a5 100644
--- a/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/CreateStreamTest.java
+++ b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/CreateStreamTest.java
@@ -22,7 +22,9 @@
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
+import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertThat;
+import static org.junit.Assert.fail;
import java.io.IOException;
import java.io.Serializable;
@@ -31,8 +33,10 @@
import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.StreamingTest;
import org.apache.beam.runners.spark.io.CreateStream;
+import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
+import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Combine;
@@ -41,6 +45,7 @@
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.transforms.WithKeys;
@@ -54,6 +59,7 @@
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.transforms.windowing.Never;
import org.apache.beam.sdk.transforms.windowing.Window;
+import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;
import org.apache.beam.sdk.values.PCollectionTuple;
@@ -451,6 +457,47 @@
source.advanceWatermarkForNextBatch(BoundedWindow.TIMESTAMP_MAX_VALUE);
}
+ @Test
+ public void testInStreamingModeCountByKey() throws Exception {
+ Instant instant = new Instant(0);
+
+ CreateStream<KV<Integer, Long>> kvSource =
+ CreateStream.of(KvCoder.of(VarIntCoder.of(), VarLongCoder.of()), batchDuration())
+ .emptyBatch()
+ .advanceWatermarkForNextBatch(instant)
+ .nextBatch(
+ TimestampedValue.of(KV.of(1, 100L), instant.plus(Duration.standardSeconds(3L))),
+ TimestampedValue.of(KV.of(1, 300L), instant.plus(Duration.standardSeconds(4L))))
+ .advanceWatermarkForNextBatch(instant.plus(Duration.standardSeconds(7L)))
+ .nextBatch(
+ TimestampedValue.of(KV.of(1, 400L), instant.plus(Duration.standardSeconds(8L))))
+ .advanceNextBatchWatermarkToInfinity();
+
+ PCollection<KV<Integer, Long>> output =
+ p.apply("create kv Source", kvSource)
+ .apply(
+ "window input",
+ Window.<KV<Integer, Long>>into(FixedWindows.of(Duration.standardSeconds(3L)))
+ .withAllowedLateness(Duration.ZERO))
+ .apply(Count.perKey());
+
+ PAssert.that("Wrong count value ", output)
+ .satisfies(
+ (SerializableFunction<Iterable<KV<Integer, Long>>, Void>)
+ input -> {
+ for (KV<Integer, Long> element : input) {
+ if (element.getKey() == 1) {
+ Long countValue = element.getValue();
+ assertNotEquals("Count Value is 0 !!!", 0L, countValue.longValue());
+ } else {
+ fail("Unknown key in the output PCollection");
+ }
+ }
+ return null;
+ });
+ p.run();
+ }
+
private Duration batchDuration() {
return Duration.millis(
(p.getOptions().as(SparkPipelineOptions.class)).getBatchIntervalMillis());
diff --git a/sdks/go/pkg/beam/coder.go b/sdks/go/pkg/beam/coder.go
index 99d4c30..dbca436 100644
--- a/sdks/go/pkg/beam/coder.go
+++ b/sdks/go/pkg/beam/coder.go
@@ -151,21 +151,30 @@
return nil, err
}
return &coder.Coder{Kind: coder.Custom, T: t, Custom: c}, nil
- case reflectx.Float32, reflectx.Float64:
+
+ case reflectx.Float32:
c, err := coderx.NewFloat(t.Type())
if err != nil {
return nil, err
}
return &coder.Coder{Kind: coder.Custom, T: t, Custom: c}, nil
+ case reflectx.Float64:
+ return &coder.Coder{Kind: coder.Double, T: t}, nil
+
case reflectx.String:
c, err := coderx.NewString()
if err != nil {
return nil, err
}
return &coder.Coder{Kind: coder.Custom, T: t, Custom: c}, nil
+
case reflectx.ByteSlice:
return &coder.Coder{Kind: coder.Bytes, T: t}, nil
+
+ case reflectx.Bool:
+ return &coder.Coder{Kind: coder.Bool, T: t}, nil
+
default:
et := t.Type()
if c := coder.LookupCustomCoder(et); c != nil {
diff --git a/sdks/go/pkg/beam/core/graph/coder/coder.go b/sdks/go/pkg/beam/core/graph/coder/coder.go
index 4de630a..61ebdc6 100644
--- a/sdks/go/pkg/beam/core/graph/coder/coder.go
+++ b/sdks/go/pkg/beam/core/graph/coder/coder.go
@@ -159,7 +159,9 @@
const (
Custom Kind = "Custom" // Implicitly length-prefixed
Bytes Kind = "bytes" // Implicitly length-prefixed as part of the encoding
+ Bool Kind = "bool"
VarInt Kind = "varint"
+ Double Kind = "double"
WindowedValue Kind = "W"
KV Kind = "KV"
@@ -245,11 +247,21 @@
return &Coder{Kind: Bytes, T: typex.New(reflectx.ByteSlice)}
}
+// NewBool returns a new bool coder using the built-in scheme.
+func NewBool() *Coder {
+ return &Coder{Kind: Bool, T: typex.New(reflectx.Bool)}
+}
+
// NewVarInt returns a new int64 coder using the built-in scheme.
func NewVarInt() *Coder {
return &Coder{Kind: VarInt, T: typex.New(reflectx.Int64)}
}
+// NewDouble returns a new double coder using the built-in scheme.
+func NewDouble() *Coder {
+ return &Coder{Kind: Double, T: typex.New(reflectx.Float64)}
+}
+
// IsW returns true iff the coder is for a WindowedValue.
func IsW(c *Coder) bool {
return c.Kind == WindowedValue
diff --git a/sdks/go/pkg/beam/core/graph/coder/double.go b/sdks/go/pkg/beam/core/graph/coder/double.go
new file mode 100644
index 0000000..bb47afe
--- /dev/null
+++ b/sdks/go/pkg/beam/core/graph/coder/double.go
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package coder
+
+import (
+ "encoding/binary"
+ "io"
+ "math"
+
+ "github.com/apache/beam/sdks/go/pkg/beam/core/util/ioutilx"
+)
+
+// EncodeDouble encodes a float64 in big endian format.
+func EncodeDouble(value float64, w io.Writer) error {
+ var data [8]byte
+ binary.BigEndian.PutUint64(data[:], math.Float64bits(value))
+ _, err := ioutilx.WriteUnsafe(w, data[:])
+ return err
+}
+
+// DecodeDouble decodes a float64 in big endian format.
+func DecodeDouble(r io.Reader) (float64, error) {
+ var data [8]byte
+ if err := ioutilx.ReadNBufUnsafe(r, data[:]); err != nil {
+ return 0, err
+ }
+ return math.Float64frombits(binary.BigEndian.Uint64(data[:])), nil
+}
diff --git a/sdks/go/pkg/beam/core/runtime/exec/coder.go b/sdks/go/pkg/beam/core/runtime/exec/coder.go
index a55d7f7..0ef7260 100644
--- a/sdks/go/pkg/beam/core/runtime/exec/coder.go
+++ b/sdks/go/pkg/beam/core/runtime/exec/coder.go
@@ -66,9 +66,15 @@
case coder.Bytes:
return &bytesEncoder{}
+ case coder.Bool:
+ return &boolEncoder{}
+
case coder.VarInt:
return &varIntEncoder{}
+ case coder.Double:
+ return &doubleEncoder{}
+
case coder.Custom:
return &customEncoder{
t: c.Custom.Type,
@@ -93,9 +99,15 @@
case coder.Bytes:
return &bytesDecoder{}
+ case coder.Bool:
+ return &boolDecoder{}
+
case coder.VarInt:
return &varIntDecoder{}
+ case coder.Double:
+ return &doubleDecoder{}
+
case coder.Custom:
return &customDecoder{
t: c.Custom.Type,
@@ -147,6 +159,39 @@
return &FullValue{Elm: data}, nil
}
+type boolEncoder struct{}
+
+func (*boolEncoder) Encode(val *FullValue, w io.Writer) error {
+ // Encoding: false = 0, true = 1
+ var err error
+ if val.Elm.(bool) {
+ _, err = ioutilx.WriteUnsafe(w, []byte{1})
+ } else {
+ _, err = ioutilx.WriteUnsafe(w, []byte{0})
+ }
+ if err != nil {
+ return fmt.Errorf("error encoding bool: %v", err)
+ }
+ return nil
+}
+
+type boolDecoder struct{}
+
+func (*boolDecoder) Decode(r io.Reader) (*FullValue, error) {
+ // Encoding: false = 0, true = 1
+ b := make([]byte, 1, 1)
+ if err := ioutilx.ReadNBufUnsafe(r, b); err != nil {
+ return nil, fmt.Errorf("error decoding bool: %v", err)
+ }
+ switch b[0] {
+ case 0:
+ return &FullValue{Elm: false}, nil
+ case 1:
+ return &FullValue{Elm: true}, nil
+ }
+ return nil, fmt.Errorf("error decoding bool: received invalid value %v", b)
+}
+
type varIntEncoder struct{}
func (*varIntEncoder) Encode(val *FullValue, w io.Writer) error {
@@ -165,6 +210,24 @@
return &FullValue{Elm: n}, nil
}
+type doubleEncoder struct{}
+
+func (*doubleEncoder) Encode(val *FullValue, w io.Writer) error {
+ // Encoding: beam double (big-endian 64-bit IEEE 754 double)
+ return coder.EncodeDouble(val.Elm.(float64), w)
+}
+
+type doubleDecoder struct{}
+
+func (*doubleDecoder) Decode(r io.Reader) (*FullValue, error) {
+ // Encoding: beam double (big-endian 64-bit IEEE 754 double)
+ f, err := coder.DecodeDouble(r)
+ if err != nil {
+ return nil, err
+ }
+ return &FullValue{Elm: f}, nil
+}
+
type customEncoder struct {
t reflect.Type
enc Encoder
diff --git a/sdks/go/pkg/beam/core/runtime/exec/coder_test.go b/sdks/go/pkg/beam/core/runtime/exec/coder_test.go
new file mode 100644
index 0000000..4f663fa
--- /dev/null
+++ b/sdks/go/pkg/beam/core/runtime/exec/coder_test.go
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package exec
+
+import (
+ "bytes"
+ "fmt"
+ "testing"
+
+ "github.com/apache/beam/sdks/go/pkg/beam/core/typex"
+ "github.com/apache/beam/sdks/go/pkg/beam/core/util/reflectx"
+
+ "github.com/apache/beam/sdks/go/pkg/beam/core/graph/coder"
+ "github.com/apache/beam/sdks/go/pkg/beam/core/runtime/coderx"
+)
+
+func TestCoders(t *testing.T) {
+ for _, test := range []struct {
+ coder *coder.Coder
+ val *FullValue
+ }{
+ {
+ coder: coder.NewBool(),
+ val: &FullValue{Elm: true},
+ }, {
+ coder: coder.NewBytes(),
+ val: &FullValue{Elm: []byte("myBytes")},
+ }, {
+ coder: coder.NewVarInt(),
+ val: &FullValue{Elm: int64(65)},
+ }, {
+ coder: coder.NewDouble(),
+ val: &FullValue{Elm: float64(12.9)},
+ }, {
+ coder: func() *coder.Coder {
+ c, _ := coderx.NewString()
+ return &coder.Coder{Kind: coder.Custom, Custom: c, T: typex.New(reflectx.String)}
+ }(),
+ val: &FullValue{Elm: "myString"},
+ }, {
+ coder: coder.NewKV([]*coder.Coder{coder.NewVarInt(), coder.NewBool()}),
+ val: &FullValue{Elm: int64(72), Elm2: false},
+ },
+ } {
+ t.Run(fmt.Sprintf("%v", test.coder), func(t *testing.T) {
+ var buf bytes.Buffer
+ enc := MakeElementEncoder(test.coder)
+ if err := enc.Encode(test.val, &buf); err != nil {
+ t.Fatalf("Couldn't encode value: %v", err)
+ }
+
+ dec := MakeElementDecoder(test.coder)
+ result, err := dec.Decode(&buf)
+ if err != nil {
+ t.Fatalf("Couldn't decode value: %v", err)
+ }
+ // []bytes are incomparable, convert to strings first.
+ if b, ok := test.val.Elm.([]byte); ok {
+ test.val.Elm = string(b)
+ result.Elm = string(result.Elm.([]byte))
+ }
+ if got, want := result.Elm, test.val.Elm; got != want {
+ t.Errorf("got %v, want %v", got, want)
+ }
+ if got, want := result.Elm2, test.val.Elm2; got != want {
+ t.Errorf("got %v, want %v", got, want)
+ }
+
+ })
+ }
+}
diff --git a/sdks/go/pkg/beam/core/runtime/exec/translate.go b/sdks/go/pkg/beam/core/runtime/exec/translate.go
index e6caafc..139db34 100644
--- a/sdks/go/pkg/beam/core/runtime/exec/translate.go
+++ b/sdks/go/pkg/beam/core/runtime/exec/translate.go
@@ -17,7 +17,6 @@
import (
"fmt"
- "path"
"strconv"
"strings"
@@ -382,9 +381,7 @@
if err != nil {
return nil, err
}
- // transform.UniqueName may be per-bundle, which isn't useful for metrics.
- // Use the short name for the DoFn instead.
- n.PID = path.Base(n.Fn.Name())
+ n.PID = transform.GetUniqueName()
input := unmarshalKeyedValues(transform.GetInputs())
for i := 1; i < len(input); i++ {
@@ -414,9 +411,7 @@
}
cn.UsesKey = typex.IsKV(in[0].Type)
- // transform.UniqueName may be per-bundle, which isn't useful for metrics.
- // Use the short name for the DoFn instead.
- cn.PID = path.Base(cn.Fn.Name())
+ cn.PID = transform.GetUniqueName()
switch urn {
case urnPerKeyCombinePre:
diff --git a/sdks/go/pkg/beam/core/runtime/graphx/coder.go b/sdks/go/pkg/beam/core/runtime/graphx/coder.go
index 5ac1f59..8c1cec4 100644
--- a/sdks/go/pkg/beam/core/runtime/graphx/coder.go
+++ b/sdks/go/pkg/beam/core/runtime/graphx/coder.go
@@ -32,7 +32,9 @@
// Model constants
urnBytesCoder = "beam:coder:bytes:v1"
+ urnBoolCoder = "beam:coder:bool:v1"
urnVarIntCoder = "beam:coder:varint:v1"
+ urnDoubleCoder = "beam:coder:double:v1"
urnLengthPrefixCoder = "beam:coder:length_prefix:v1"
urnKVCoder = "beam:coder:kv:v1"
urnIterableCoder = "beam:coder:iterable:v1"
@@ -155,9 +157,15 @@
case urnBytesCoder:
return coder.NewBytes(), nil
+ case urnBoolCoder:
+ return coder.NewBool(), nil
+
case urnVarIntCoder:
return coder.NewVarInt(), nil
+ case urnDoubleCoder:
+ return coder.NewDouble(), nil
+
case urnKVCoder:
if len(components) != 2 {
return nil, errors.Errorf("could not unmarshal KV coder from %v, want exactly 2 components but have %d", c, len(components))
@@ -367,9 +375,15 @@
// TODO(herohde) 6/27/2017: add length-prefix and not assume nested by context?
return b.internBuiltInCoder(urnBytesCoder)
+ case coder.Bool:
+ return b.internBuiltInCoder(urnBoolCoder)
+
case coder.VarInt:
return b.internBuiltInCoder(urnVarIntCoder)
+ case coder.Double:
+ return b.internBuiltInCoder(urnDoubleCoder)
+
default:
panic(fmt.Sprintf("Failed to marshal custom coder %v, unexpected coder kind: %v", c, c.Kind))
}
diff --git a/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go b/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go
index 6f001c1..2a99df6 100644
--- a/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go
+++ b/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go
@@ -46,10 +46,18 @@
coder.NewBytes(),
},
{
+ "bool",
+ coder.NewBool(),
+ },
+ {
"varint",
coder.NewVarInt(),
},
{
+ "double",
+ coder.NewDouble(),
+ },
+ {
"foo",
foo,
},
diff --git a/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go b/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go
index fd29f40..da6d9b5 100644
--- a/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go
+++ b/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go
@@ -17,7 +17,7 @@
import (
"github.com/apache/beam/sdks/go/pkg/beam/core/graph/coder"
- "github.com/apache/beam/sdks/go/pkg/beam/core/runtime/graphx/v1"
+ v1 "github.com/apache/beam/sdks/go/pkg/beam/core/runtime/graphx/v1"
"github.com/apache/beam/sdks/go/pkg/beam/core/typex"
"github.com/apache/beam/sdks/go/pkg/beam/core/util/protox"
"github.com/apache/beam/sdks/go/pkg/beam/internal/errors"
@@ -40,7 +40,9 @@
const (
windowedValueType = "kind:windowed_value"
bytesType = "kind:bytes"
+ boolType = "kind:bool"
varIntType = "kind:varint"
+ doubleType = "kind:double"
streamType = "kind:stream"
pairType = "kind:pair"
lengthPrefixType = "kind:length_prefix"
@@ -147,9 +149,15 @@
// TODO(herohde) 6/27/2017: add length-prefix and not assume nested by context?
return &CoderRef{Type: bytesType}, nil
+ case coder.Bool:
+ return &CoderRef{Type: boolType}, nil
+
case coder.VarInt:
return &CoderRef{Type: varIntType}, nil
+ case coder.Double:
+ return &CoderRef{Type: doubleType}, nil
+
default:
return nil, errors.Errorf("bad coder kind: %v", c.Kind)
}
@@ -174,9 +182,15 @@
case bytesType:
return coder.NewBytes(), nil
+ case boolType:
+ return coder.NewBool(), nil
+
case varIntType:
return coder.NewVarInt(), nil
+ case doubleType:
+ return coder.NewDouble(), nil
+
case pairType:
if len(c.Components) != 2 {
return nil, errors.Errorf("bad pair: %+v", c)
diff --git a/sdks/go/pkg/beam/core/runtime/harness/datamgr.go b/sdks/go/pkg/beam/core/runtime/harness/datamgr.go
index 453cf9f..cf74505 100644
--- a/sdks/go/pkg/beam/core/runtime/harness/datamgr.go
+++ b/sdks/go/pkg/beam/core/runtime/harness/datamgr.go
@@ -36,7 +36,7 @@
// The indirection makes it easier to control access.
type ScopedDataManager struct {
mgr *DataChannelManager
- instID string
+ instID instructionID
// TODO(herohde) 7/20/2018: capture and force close open reads/writes. However,
// we would need the underlying Close to be idempotent or a separate method.
@@ -45,10 +45,11 @@
}
// NewScopedDataManager returns a ScopedDataManager for the given instruction.
-func NewScopedDataManager(mgr *DataChannelManager, instID string) *ScopedDataManager {
+func NewScopedDataManager(mgr *DataChannelManager, instID instructionID) *ScopedDataManager {
return &ScopedDataManager{mgr: mgr, instID: instID}
}
+// OpenRead opens an io.ReadCloser on the given stream.
func (s *ScopedDataManager) OpenRead(ctx context.Context, id exec.StreamID) (io.ReadCloser, error) {
ch, err := s.open(ctx, id.Port)
if err != nil {
@@ -57,6 +58,7 @@
return ch.OpenRead(ctx, id.PtransformID, s.instID), nil
}
+// OpenWrite opens an io.WriteCloser on the given stream.
func (s *ScopedDataManager) OpenWrite(ctx context.Context, id exec.StreamID) (io.WriteCloser, error) {
ch, err := s.open(ctx, id.Port)
if err != nil {
@@ -77,6 +79,7 @@
return local.Open(ctx, port) // don't hold lock over potentially slow operation
}
+// Close prevents new IO for this instruction.
func (s *ScopedDataManager) Close() error {
s.mu.Lock()
s.closed = true
@@ -119,7 +122,7 @@
// clientID identifies a client of a connected channel.
type clientID struct {
ptransformID string
- instID string
+ instID instructionID
}
// This is a reduced version of the full gRPC interface to help with testing.
@@ -141,6 +144,9 @@
readers map[clientID]*dataReader
// TODO: early/late closed, bad instructions, finer locks, reconnect?
+ // readErr indicates a client.Recv error and is used to prevent new readers.
+ readErr error
+
mu sync.Mutex // guards both the readers and writers maps.
}
@@ -169,11 +175,18 @@
return ret
}
-func (c *DataChannel) OpenRead(ctx context.Context, ptransformID string, instID string) io.ReadCloser {
- return c.makeReader(ctx, clientID{ptransformID: ptransformID, instID: instID})
+// OpenRead returns an io.ReadCloser of the data elements for the given instruction and ptransform.
+func (c *DataChannel) OpenRead(ctx context.Context, ptransformID string, instID instructionID) io.ReadCloser {
+ cid := clientID{ptransformID: ptransformID, instID: instID}
+ if c.readErr != nil {
+ log.Errorf(ctx, "opening a reader %v on a closed channel", cid)
+ return &errReader{c.readErr}
+ }
+ return c.makeReader(ctx, cid)
}
-func (c *DataChannel) OpenWrite(ctx context.Context, ptransformID string, instID string) io.WriteCloser {
+// OpenWrite returns an io.WriteCloser of the data elements for the given instruction and ptransform.
+func (c *DataChannel) OpenWrite(ctx context.Context, ptransformID string, instID instructionID) io.WriteCloser {
return c.makeWriter(ctx, clientID{ptransformID: ptransformID, instID: instID})
}
@@ -182,12 +195,25 @@
for {
msg, err := c.client.Recv()
if err != nil {
+ // This connection is bad, so we should close and delete all extant streams.
+ c.mu.Lock()
+ c.readErr = err // prevent not yet opened readers from hanging.
+ for _, r := range c.readers {
+ log.Errorf(ctx, "DataChannel.read %v reader %v closing due to error on channel", c.id, r.id)
+ if !r.completed {
+ r.completed = true
+ r.err = err
+ close(r.buf)
+ }
+ delete(cache, r.id)
+ }
+ c.mu.Unlock()
+
if err == io.EOF {
- // TODO(herohde) 10/12/2017: can this happen before shutdown? Reconnect?
log.Warnf(ctx, "DataChannel.read %v closed", c.id)
return
}
- log.Errorf(ctx, "DataChannel.read %v bad", c.id)
+ log.Errorf(ctx, "DataChannel.read %v bad: %v", c.id, err)
return
}
@@ -198,9 +224,7 @@
// to reduce lock contention.
for _, elm := range msg.GetData() {
- id := clientID{ptransformID: elm.TransformId, instID: elm.GetInstructionId()}
-
- // log.Printf("Chan read (%v): %v\n", sid, elm.GetData())
+ id := clientID{ptransformID: elm.TransformId, instID: instructionID(elm.GetInstructionId())}
var r *dataReader
if local, ok := cache[id]; ok {
@@ -219,6 +243,7 @@
}
if len(elm.GetData()) == 0 {
// Sentinel EOF segment for stream. Close buffer to signal EOF.
+ r.completed = true
close(r.buf)
// Clean up local bookkeeping. We'll never see another message
@@ -237,11 +262,24 @@
case r.buf <- elm.GetData():
case <-r.done:
r.completed = true
+ close(r.buf)
}
}
}
}
+type errReader struct {
+ err error
+}
+
+func (r *errReader) Read(_ []byte) (int, error) {
+ return 0, r.err
+}
+
+func (r *errReader) Close() error {
+ return r.err
+}
+
func (c *DataChannel) makeReader(ctx context.Context, id clientID) *dataReader {
c.mu.Lock()
defer c.mu.Unlock()
@@ -281,6 +319,7 @@
cur []byte
channel *DataChannel
completed bool
+ err error
}
func (r *dataReader) Close() error {
@@ -293,7 +332,10 @@
if r.cur == nil {
b, ok := <-r.buf
if !ok {
- return 0, io.EOF
+ if r.err == nil {
+ return 0, io.EOF
+ }
+ return 0, r.err
}
r.cur = b
}
@@ -333,7 +375,7 @@
msg := &pb.Elements{
Data: []*pb.Elements_Data{
{
- InstructionId: w.id.instID,
+ InstructionId: string(w.id.instID),
TransformId: w.id.ptransformID,
// Empty data == sentinel
},
@@ -357,7 +399,7 @@
msg := &pb.Elements{
Data: []*pb.Elements_Data{
{
- InstructionId: w.id.instID,
+ InstructionId: string(w.id.instID),
TransformId: w.id.ptransformID,
Data: w.buf,
},
@@ -373,7 +415,7 @@
l := len(w.buf)
// We can't fit this message into the buffer. We need to flush the buffer
if err := w.Flush(); err != nil {
- return 0, errors.Wrapf(err, "datamgr.go: error flushing buffer of length %d", l)
+ return 0, errors.Wrapf(err, "datamgr.go [%v]: error flushing buffer of length %d", w.id, l)
}
}
diff --git a/sdks/go/pkg/beam/core/runtime/harness/datamgr_test.go b/sdks/go/pkg/beam/core/runtime/harness/datamgr_test.go
index 1bbf22e..b82785e 100644
--- a/sdks/go/pkg/beam/core/runtime/harness/datamgr_test.go
+++ b/sdks/go/pkg/beam/core/runtime/harness/datamgr_test.go
@@ -17,6 +17,7 @@
import (
"context"
+ "fmt"
"io"
"io/ioutil"
"log"
@@ -25,10 +26,13 @@
pb "github.com/apache/beam/sdks/go/pkg/beam/model/fnexecution_v1"
)
+const extraData = 2
+
type fakeClient struct {
t *testing.T
done chan bool
calls int
+ err error
}
func (f *fakeClient) Recv() (*pb.Elements, error) {
@@ -42,7 +46,8 @@
msg := pb.Elements{}
- for i := 0; i < bufElements+1; i++ {
+ // Send extraData more than the number of elements buffered in the channel.
+ for i := 0; i < bufElements+extraData; i++ {
msg.Data = append(msg.Data, &elemData)
}
@@ -51,16 +56,16 @@
// Subsequent calls return no data.
switch f.calls {
case 1:
- return &msg, nil
+ return &msg, f.err
case 2:
- return &msg, nil
+ return &msg, f.err
case 3:
elemData.Data = []byte{}
msg.Data = []*pb.Elements_Data{&elemData}
// Broadcasting done here means that this code providing messages
// has not been blocked by the bug blocking the dataReader
// from getting more messages.
- return &msg, nil
+ return &msg, f.err
default:
f.done <- true
return nil, io.EOF
@@ -71,27 +76,76 @@
return nil
}
-func TestDataChannelTerminateOnClose(t *testing.T) {
+func TestDataChannelTerminate(t *testing.T) {
// The logging of channels closed is quite noisy for this test
log.SetOutput(ioutil.Discard)
- done := make(chan bool, 1)
- client := &fakeClient{t: t, done: done}
- c := makeDataChannel(context.Background(), "id", client)
- r := c.OpenRead(context.Background(), "ptr", "inst_ref")
- var read = make([]byte, 4)
+ expectedError := fmt.Errorf("EXPECTED ERROR")
- // We don't read up all the buffered data, but immediately close the reader.
- // Previously, since nothing was consuming the incoming gRPC data, the whole
- // data channel would get stuck, and the client.Recv() call was eventually
- // no longer called.
- _, err := r.Read(read)
- if err != nil {
- t.Errorf("Unexpected error from read: %v", err)
+ tests := []struct {
+ name string
+ expectedError error
+ caseFn func(t *testing.T, r io.ReadCloser, client *fakeClient, c *DataChannel)
+ }{
+ {
+ name: "onClose",
+ expectedError: io.EOF,
+ caseFn: func(t *testing.T, r io.ReadCloser, client *fakeClient, c *DataChannel) {
+ // We don't read up all the buffered data, but immediately close the reader.
+ // Previously, since nothing was consuming the incoming gRPC data, the whole
+ // data channel would get stuck, and the client.Recv() call was eventually
+ // no longer called.
+ r.Close()
+
+ // If done is signaled, that means client.Recv() has been called to flush the
+ // channel, meaning consumer code isn't stuck.
+ <-client.done
+ },
+ }, {
+ name: "onSentinel",
+ expectedError: io.EOF,
+ caseFn: func(t *testing.T, r io.ReadCloser, client *fakeClient, c *DataChannel) {
+ // fakeClient eventually returns a sentinel element.
+ },
+ }, {
+ name: "onRecvError",
+ expectedError: expectedError,
+ caseFn: func(t *testing.T, r io.ReadCloser, client *fakeClient, c *DataChannel) {
+ // The SDK starts reading in a goroutine immediately after open.
+ // Set the 2nd Recv call to have an error.
+ client.err = expectedError
+ },
+ },
}
- r.Close()
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ done := make(chan bool, 1)
+ client := &fakeClient{t: t, done: done}
+ c := makeDataChannel(context.Background(), "id", client)
- // If done is signaled, that means client.Recv() has been called to flush the
- // channel, meaning consumer code isn't stuck.
- <-done
+ r := c.OpenRead(context.Background(), "ptr", "inst_ref")
+
+ n, err := r.Read(make([]byte, 4))
+ if err != nil {
+ t.Errorf("Unexpected error from read: %v, read %d bytes.", err, n)
+ }
+ test.caseFn(t, r, client, c)
+ // Drain the reader.
+ i := 1 // For the earlier Read.
+ for err == nil {
+ read := make([]byte, 4)
+ _, err = r.Read(read)
+ i++
+ }
+
+ if got, want := err, test.expectedError; got != want {
+ t.Errorf("Unexpected error from read %d: got %v, want %v", i, got, want)
+ }
+ // Verify that new readers see the same error on their reads after client.Recv is done.
+ if n, err := c.OpenRead(context.Background(), "ptr", "inst_ref").Read(make([]byte, 4)); err != test.expectedError {
+ t.Errorf("Unexpected error from read: got %v, want, %v read %d bytes.", err, test.expectedError, n)
+ }
+ })
+ }
+
}
diff --git a/sdks/go/pkg/beam/core/runtime/harness/harness.go b/sdks/go/pkg/beam/core/runtime/harness/harness.go
index dcc7922..28bdf4c 100644
--- a/sdks/go/pkg/beam/core/runtime/harness/harness.go
+++ b/sdks/go/pkg/beam/core/runtime/harness/harness.go
@@ -82,8 +82,9 @@
}()
ctrl := &control{
- plans: make(map[string]*exec.Plan),
- active: make(map[string]*exec.Plan),
+ plans: make(map[bundleDescriptorID]*exec.Plan),
+ active: make(map[instructionID]*exec.Plan),
+ failed: make(map[instructionID]error),
data: &DataChannelManager{},
state: &StateChannelManager{},
}
@@ -132,12 +133,17 @@
}
}
+type bundleDescriptorID string
+type instructionID string
+
type control struct {
// plans that are candidates for execution.
- plans map[string]*exec.Plan // protected by mu
+ plans map[bundleDescriptorID]*exec.Plan // protected by mu
// plans that are actively being executed.
// a plan can only be in one of these maps at any time.
- active map[string]*exec.Plan // protected by mu
+ active map[instructionID]*exec.Plan // protected by mu
+ // plans that have failed during execution
+ failed map[instructionID]error // protected by mu
mu sync.Mutex
data *DataChannelManager
@@ -145,8 +151,8 @@
}
func (c *control) handleInstruction(ctx context.Context, req *fnpb.InstructionRequest) *fnpb.InstructionResponse {
- id := req.GetInstructionId()
- ctx = setInstID(ctx, id)
+ instID := instructionID(req.GetInstructionId())
+ ctx = setInstID(ctx, instID)
switch {
case req.GetRegister() != nil:
@@ -155,19 +161,19 @@
for _, desc := range msg.GetProcessBundleDescriptor() {
p, err := exec.UnmarshalPlan(desc)
if err != nil {
- return fail(id, "Invalid bundle desc: %v", err)
+ return fail(ctx, instID, "Invalid bundle desc: %v", err)
}
- pid := desc.GetId()
- log.Debugf(ctx, "Plan %v: %v", pid, p)
+ bdID := bundleDescriptorID(desc.GetId())
+ log.Debugf(ctx, "Plan %v: %v", bdID, p)
c.mu.Lock()
- c.plans[pid] = p
+ c.plans[bdID] = p
c.mu.Unlock()
}
return &fnpb.InstructionResponse{
- InstructionId: id,
+ InstructionId: string(instID),
Response: &fnpb.InstructionResponse_Register{
Register: &fnpb.RegisterResponse{},
},
@@ -178,40 +184,43 @@
// NOTE: the harness sends a 0-length process bundle request to sources (changed?)
- log.Debugf(ctx, "PB: %v", msg)
-
- ref := msg.GetProcessBundleDescriptorId()
+ bdID := bundleDescriptorID(msg.GetProcessBundleDescriptorId())
+ log.Debugf(ctx, "PB [%v]: %v", instID, msg)
c.mu.Lock()
- plan, ok := c.plans[ref]
+ plan, ok := c.plans[bdID]
// Make the plan active, and remove it from candidates
// since a plan can't be run concurrently.
- c.active[id] = plan
- delete(c.plans, ref)
+ c.active[instID] = plan
+ delete(c.plans, bdID)
c.mu.Unlock()
if !ok {
- return fail(id, "execution plan for %v not found", ref)
+ return fail(ctx, instID, "execution plan for %v not found", bdID)
}
- data := NewScopedDataManager(c.data, id)
- state := NewScopedStateReader(c.state, id)
- err := plan.Execute(ctx, id, exec.DataContext{Data: data, State: state})
+ data := NewScopedDataManager(c.data, instID)
+ state := NewScopedStateReader(c.state, instID)
+ err := plan.Execute(ctx, string(instID), exec.DataContext{Data: data, State: state})
data.Close()
state.Close()
m := plan.Metrics()
// Move the plan back to the candidate state
c.mu.Lock()
- c.plans[plan.ID()] = plan
- delete(c.active, id)
+ // Mark the instruction as failed.
+ if err != nil {
+ c.failed[instID] = err
+ }
+ c.plans[bdID] = plan
+ delete(c.active, instID)
c.mu.Unlock()
if err != nil {
- return fail(id, "execute failed: %v", err)
+ return fail(ctx, instID, "process bundle failed for instruction %v using plan %v : %v", instID, bdID, err)
}
return &fnpb.InstructionResponse{
- InstructionId: id,
+ InstructionId: string(instID),
Response: &fnpb.InstructionResponse_ProcessBundle{
ProcessBundle: &fnpb.ProcessBundleResponse{
Metrics: m,
@@ -222,20 +231,22 @@
case req.GetProcessBundleProgress() != nil:
msg := req.GetProcessBundleProgress()
- // log.Debugf(ctx, "PB Progress: %v", msg)
-
- ref := msg.GetInstructionId()
+ ref := instructionID(msg.GetInstructionId())
c.mu.Lock()
plan, ok := c.active[ref]
+ err := c.failed[ref]
c.mu.Unlock()
+ if err != nil {
+ return fail(ctx, instID, "failed to return progress: instruction %v failed: %v", ref, err)
+ }
if !ok {
- return fail(id, "execution plan for %v not found", ref)
+ return fail(ctx, instID, "failed to return progress: instruction %v not active", ref)
}
m := plan.Metrics()
return &fnpb.InstructionResponse{
- InstructionId: id,
+ InstructionId: string(instID),
Response: &fnpb.InstructionResponse_ProcessBundleProgress{
ProcessBundleProgress: &fnpb.ProcessBundleProgressResponse{
Metrics: m,
@@ -247,27 +258,31 @@
msg := req.GetProcessBundleSplit()
log.Debugf(ctx, "PB Split: %v", msg)
- ref := msg.GetInstructionId()
+ ref := instructionID(msg.GetInstructionId())
c.mu.Lock()
plan, ok := c.active[ref]
+ err := c.failed[ref]
c.mu.Unlock()
+ if err != nil {
+ return fail(ctx, instID, "failed to split: instruction %v failed: %v", ref, err)
+ }
if !ok {
- return fail(id, "execution plan for %v not found", ref)
+ return fail(ctx, instID, "failed to split: execution plan for %v not active", ref)
}
// Get the desired splits for the root FnAPI read operation.
ds := msg.GetDesiredSplits()[plan.SourcePTransformID()]
if ds == nil {
- return fail(id, "failed to split: desired splits for root was empty.")
+ return fail(ctx, instID, "failed to split: desired splits for root of %v was empty.", ref)
}
- split, err := plan.Split(exec.SplitPoints{ds.GetAllowedSplitPoints(), ds.GetFractionOfRemainder()})
+ split, err := plan.Split(exec.SplitPoints{Splits: ds.GetAllowedSplitPoints(), Frac: ds.GetFractionOfRemainder()})
if err != nil {
- return fail(id, "unable to split: %v", err)
+ return fail(ctx, instID, "unable to split %v: %v", ref, err)
}
return &fnpb.InstructionResponse{
- InstructionId: id,
+ InstructionId: string(instID),
Response: &fnpb.InstructionResponse_ProcessBundleSplit{
ProcessBundleSplit: &fnpb.ProcessBundleSplitResponse{
ChannelSplits: []*fnpb.ProcessBundleSplitResponse_ChannelSplit{
@@ -281,15 +296,16 @@
}
default:
- return fail(id, "Unexpected request: %v", req)
+ return fail(ctx, instID, "Unexpected request: %v", req)
}
}
-func fail(id, format string, args ...interface{}) *fnpb.InstructionResponse {
+func fail(ctx context.Context, id instructionID, format string, args ...interface{}) *fnpb.InstructionResponse {
+ log.Output(ctx, log.SevError, 1, fmt.Sprintf(format, args...))
dummy := &fnpb.InstructionResponse_Register{Register: &fnpb.RegisterResponse{}}
return &fnpb.InstructionResponse{
- InstructionId: id,
+ InstructionId: string(id),
Error: fmt.Sprintf(format, args...),
Response: dummy,
}
diff --git a/sdks/go/pkg/beam/core/runtime/harness/logging.go b/sdks/go/pkg/beam/core/runtime/harness/logging.go
index 63afe7c..2a1d0fa 100644
--- a/sdks/go/pkg/beam/core/runtime/harness/logging.go
+++ b/sdks/go/pkg/beam/core/runtime/harness/logging.go
@@ -37,7 +37,7 @@
const instKey contextKey = "beam:inst"
-func setInstID(ctx context.Context, id string) context.Context {
+func setInstID(ctx context.Context, id instructionID) context.Context {
return context.WithValue(ctx, instKey, id)
}
@@ -46,7 +46,7 @@
if id == nil {
return "", false
}
- return id.(string), true
+ return string(id.(instructionID)), true
}
type logger struct {
@@ -61,7 +61,7 @@
Severity: convertSeverity(sev),
Message: msg,
}
- if _, file, line, ok := runtime.Caller(calldepth); ok {
+ if _, file, line, ok := runtime.Caller(calldepth + 1); ok {
entry.LogLocation = fmt.Sprintf("%v:%v", file, line)
}
if id, ok := tryGetInstID(ctx); ok {
diff --git a/sdks/go/pkg/beam/core/runtime/harness/logging_test.go b/sdks/go/pkg/beam/core/runtime/harness/logging_test.go
new file mode 100644
index 0000000..606b3c7
--- /dev/null
+++ b/sdks/go/pkg/beam/core/runtime/harness/logging_test.go
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package harness
+
+import (
+ "context"
+ "strings"
+ "testing"
+
+ "github.com/apache/beam/sdks/go/pkg/beam/log"
+ pb "github.com/apache/beam/sdks/go/pkg/beam/model/fnexecution_v1"
+)
+
+func TestLogger(t *testing.T) {
+ ch := make(chan *pb.LogEntry, 1)
+ l := logger{out: ch}
+
+ instID := "INST"
+ ctx := setInstID(context.Background(), instructionID(instID))
+ msg := "expectedMessage"
+ l.Log(ctx, log.SevInfo, 0, msg)
+
+ e := <-ch
+
+ if got, want := e.GetInstructionId(), instID; got != want {
+ t.Errorf("incorrect InstructionID: got %v, want %v", got, want)
+ }
+ if got, want := e.GetMessage(), msg; got != want {
+ t.Errorf("incorrect Message: got %v, want %v", got, want)
+ }
+ // This check will fail if the imports change.
+ if got, want := e.GetLogLocation(), "logging_test.go:34"; !strings.HasSuffix(got, want) {
+ t.Errorf("incorrect LogLocation: got %v, want suffix %v", got, want)
+ }
+ if got, want := e.GetSeverity(), pb.LogEntry_Severity_INFO; got != want {
+ t.Errorf("incorrect Severity: got %v, want %v", got, want)
+ }
+}
diff --git a/sdks/go/pkg/beam/core/runtime/harness/statemgr.go b/sdks/go/pkg/beam/core/runtime/harness/statemgr.go
index ff20ff9..9669888 100644
--- a/sdks/go/pkg/beam/core/runtime/harness/statemgr.go
+++ b/sdks/go/pkg/beam/core/runtime/harness/statemgr.go
@@ -34,7 +34,7 @@
// for side input use. The indirection makes it easier to control access.
type ScopedStateReader struct {
mgr *StateChannelManager
- instID string
+ instID instructionID
opened []io.Closer // track open readers to force close all
closed bool
@@ -42,7 +42,7 @@
}
// NewScopedStateReader returns a ScopedStateReader for the given instruction.
-func NewScopedStateReader(mgr *StateChannelManager, instID string) *ScopedStateReader {
+func NewScopedStateReader(mgr *StateChannelManager, instID instructionID) *ScopedStateReader {
return &ScopedStateReader{mgr: mgr, instID: instID}
}
@@ -103,7 +103,7 @@
}
type stateKeyReader struct {
- instID string
+ instID instructionID
key *pb.StateKey
token []byte
@@ -115,7 +115,7 @@
mu sync.Mutex
}
-func newSideInputReader(ch *StateChannel, id exec.StreamID, sideInputID string, instID string, k, w []byte) *stateKeyReader {
+func newSideInputReader(ch *StateChannel, id exec.StreamID, sideInputID string, instID instructionID, k, w []byte) *stateKeyReader {
key := &pb.StateKey{
Type: &pb.StateKey_MultimapSideInput_{
MultimapSideInput: &pb.StateKey_MultimapSideInput{
@@ -133,7 +133,7 @@
}
}
-func newRunnerReader(ch *StateChannel, instID string, k []byte) *stateKeyReader {
+func newRunnerReader(ch *StateChannel, instID instructionID, k []byte) *stateKeyReader {
key := &pb.StateKey{
Type: &pb.StateKey_Runner_{
Runner: &pb.StateKey_Runner{
@@ -166,7 +166,7 @@
req := &pb.StateRequest{
// Id: set by channel
- InstructionId: r.instID,
+ InstructionId: string(r.instID),
StateKey: r.key,
Request: &pb.StateRequest_Get{
Get: &pb.StateGetRequest{
diff --git a/sdks/go/pkg/beam/create_test.go b/sdks/go/pkg/beam/create_test.go
index 9b296cf..a9da524 100644
--- a/sdks/go/pkg/beam/create_test.go
+++ b/sdks/go/pkg/beam/create_test.go
@@ -35,6 +35,10 @@
}{
{[]interface{}{1, 2, 3}},
{[]interface{}{"1", "2", "3"}},
+ {[]interface{}{float32(0.1), float32(0.2), float32(0.3)}},
+ {[]interface{}{float64(0.1), float64(0.2), float64(0.3)}},
+ {[]interface{}{uint(1), uint(2), uint(3)}},
+ {[]interface{}{false, true, true, false, true}},
{[]interface{}{wc{"a", 23}, wc{"b", 42}, wc{"c", 5}}},
{[]interface{}{&testProto{}, &testProto{stringValue("test")}}}, // Test for BEAM-4401
}
diff --git a/sdks/go/pkg/beam/log/log.go b/sdks/go/pkg/beam/log/log.go
index 70cd199..0bf0740 100644
--- a/sdks/go/pkg/beam/log/log.go
+++ b/sdks/go/pkg/beam/log/log.go
@@ -68,80 +68,80 @@
// Debug writes the fmt.Sprint-formatted arguments to the global logger with
// debug severity.
func Debug(ctx context.Context, v ...interface{}) {
- Output(ctx, SevDebug, 2, fmt.Sprint(v...))
+ Output(ctx, SevDebug, 1, fmt.Sprint(v...))
}
// Debugf writes the fmt.Sprintf-formatted arguments to the global logger with
// debug severity.
func Debugf(ctx context.Context, format string, v ...interface{}) {
- Output(ctx, SevDebug, 2, fmt.Sprintf(format, v...))
+ Output(ctx, SevDebug, 1, fmt.Sprintf(format, v...))
}
// Debugln writes the fmt.Sprintln-formatted arguments to the global logger with
// debug severity.
func Debugln(ctx context.Context, v ...interface{}) {
- Output(ctx, SevDebug, 2, fmt.Sprintln(v...))
+ Output(ctx, SevDebug, 1, fmt.Sprintln(v...))
}
// Info writes the fmt.Sprint-formatted arguments to the global logger with
// info severity.
func Info(ctx context.Context, v ...interface{}) {
- Output(ctx, SevInfo, 2, fmt.Sprint(v...))
+ Output(ctx, SevInfo, 1, fmt.Sprint(v...))
}
// Infof writes the fmt.Sprintf-formatted arguments to the global logger with
// info severity.
func Infof(ctx context.Context, format string, v ...interface{}) {
- Output(ctx, SevInfo, 2, fmt.Sprintf(format, v...))
+ Output(ctx, SevInfo, 1, fmt.Sprintf(format, v...))
}
// Infoln writes the fmt.Sprintln-formatted arguments to the global logger with
// info severity.
func Infoln(ctx context.Context, v ...interface{}) {
- Output(ctx, SevInfo, 2, fmt.Sprintln(v...))
+ Output(ctx, SevInfo, 1, fmt.Sprintln(v...))
}
// Warn writes the fmt.Sprint-formatted arguments to the global logger with
// warn severity.
func Warn(ctx context.Context, v ...interface{}) {
- Output(ctx, SevWarn, 2, fmt.Sprint(v...))
+ Output(ctx, SevWarn, 1, fmt.Sprint(v...))
}
// Warnf writes the fmt.Sprintf-formatted arguments to the global logger with
// warn severity.
func Warnf(ctx context.Context, format string, v ...interface{}) {
- Output(ctx, SevWarn, 2, fmt.Sprintf(format, v...))
+ Output(ctx, SevWarn, 1, fmt.Sprintf(format, v...))
}
// Warnln writes the fmt.Sprintln-formatted arguments to the global logger with
// warn severity.
func Warnln(ctx context.Context, v ...interface{}) {
- Output(ctx, SevWarn, 2, fmt.Sprintln(v...))
+ Output(ctx, SevWarn, 1, fmt.Sprintln(v...))
}
// Error writes the fmt.Sprint-formatted arguments to the global logger with
// error severity.
func Error(ctx context.Context, v ...interface{}) {
- Output(ctx, SevError, 2, fmt.Sprint(v...))
+ Output(ctx, SevError, 1, fmt.Sprint(v...))
}
// Errorf writes the fmt.Sprintf-formatted arguments to the global logger with
// error severity.
func Errorf(ctx context.Context, format string, v ...interface{}) {
- Output(ctx, SevError, 2, fmt.Sprintf(format, v...))
+ Output(ctx, SevError, 1, fmt.Sprintf(format, v...))
}
// Errorln writes the fmt.Sprintln-formatted arguments to the global logger with
// error severity.
func Errorln(ctx context.Context, v ...interface{}) {
- Output(ctx, SevError, 2, fmt.Sprintln(v...))
+ Output(ctx, SevError, 1, fmt.Sprintln(v...))
}
// Fatal writes the fmt.Sprint-formatted arguments to the global logger with
// fatal severity. It then panics.
func Fatal(ctx context.Context, v ...interface{}) {
msg := fmt.Sprint(v...)
- Output(ctx, SevFatal, 2, msg)
+ Output(ctx, SevFatal, 1, msg)
panic(msg)
}
@@ -149,7 +149,7 @@
// fatal severity. It then panics.
func Fatalf(ctx context.Context, format string, v ...interface{}) {
msg := fmt.Sprintf(format, v...)
- Output(ctx, SevFatal, 2, msg)
+ Output(ctx, SevFatal, 1, msg)
panic(msg)
}
@@ -157,27 +157,27 @@
// fatal severity. It then panics.
func Fatalln(ctx context.Context, v ...interface{}) {
msg := fmt.Sprintln(v...)
- Output(ctx, SevFatal, 2, msg)
+ Output(ctx, SevFatal, 1, msg)
panic(msg)
}
// Exit writes the fmt.Sprint-formatted arguments to the global logger with
// fatal severity. It then exits.
func Exit(ctx context.Context, v ...interface{}) {
- Output(ctx, SevFatal, 2, fmt.Sprint(v...))
+ Output(ctx, SevFatal, 1, fmt.Sprint(v...))
os.Exit(1)
}
// Exitf writes the fmt.Sprintf-formatted arguments to the global logger with
// fatal severity. It then exits.
func Exitf(ctx context.Context, format string, v ...interface{}) {
- Output(ctx, SevFatal, 2, fmt.Sprintf(format, v...))
+ Output(ctx, SevFatal, 1, fmt.Sprintf(format, v...))
os.Exit(1)
}
// Exitln writes the fmt.Sprintln-formatted arguments to the global logger with
// fatal severity. It then exits.
func Exitln(ctx context.Context, v ...interface{}) {
- Output(ctx, SevFatal, 2, fmt.Sprintln(v...))
+ Output(ctx, SevFatal, 1, fmt.Sprintln(v...))
os.Exit(1)
}
diff --git a/sdks/go/pkg/beam/runners/dataflow/dataflow.go b/sdks/go/pkg/beam/runners/dataflow/dataflow.go
index 7cdaa09..0da7590 100644
--- a/sdks/go/pkg/beam/runners/dataflow/dataflow.go
+++ b/sdks/go/pkg/beam/runners/dataflow/dataflow.go
@@ -57,6 +57,7 @@
region = flag.String("region", "", "GCP Region (optional but encouraged)")
network = flag.String("network", "", "GCP network (optional)")
subnetwork = flag.String("subnetwork", "", "GCP subnetwork (optional)")
+ noUsePublicIPs = flag.Bool("no_use_public_ips", false, "Workers must not use public IP addresses (optional)")
tempLocation = flag.String("temp_location", "", "Temp location (optional)")
machineType = flag.String("worker_machine_type", "", "GCE machine type (optional)")
minCPUPlatform = flag.String("min_cpu_platform", "", "GCE minimum cpu platform (optional)")
@@ -143,6 +144,7 @@
Zone: *zone,
Network: *network,
Subnetwork: *subnetwork,
+ NoUsePublicIPs: *noUsePublicIPs,
NumWorkers: *numWorkers,
MaxNumWorkers: *maxNumWorkers,
Algorithm: *autoscalingAlgorithm,
diff --git a/sdks/go/pkg/beam/runners/dataflow/dataflowlib/job.go b/sdks/go/pkg/beam/runners/dataflow/dataflowlib/job.go
index ef24348..6da3db1 100644
--- a/sdks/go/pkg/beam/runners/dataflow/dataflowlib/job.go
+++ b/sdks/go/pkg/beam/runners/dataflow/dataflowlib/job.go
@@ -46,6 +46,7 @@
Zone string
Network string
Subnetwork string
+ NoUsePublicIPs bool
NumWorkers int64
MachineType string
Labels map[string]string
@@ -105,6 +106,11 @@
experiments = append(experiments, "use_staged_dataflow_worker_jar")
}
+ ipConfiguration := "WORKER_IP_UNSPECIFIED"
+ if opts.NoUsePublicIPs {
+ ipConfiguration = "WORKER_IP_PRIVATE"
+ }
+
job := &df.Job{
ProjectId: opts.Project,
Name: opts.Name,
@@ -132,6 +138,7 @@
AutoscalingSettings: &df.AutoscalingSettings{
MaxNumWorkers: opts.MaxNumWorkers,
},
+ IpConfiguration: ipConfiguration,
Kind: "harness",
Packages: packages,
WorkerHarnessContainerImage: images[0],
diff --git a/sdks/go/test/build.gradle b/sdks/go/test/build.gradle
index 77fd3be..c453ccc 100644
--- a/sdks/go/test/build.gradle
+++ b/sdks/go/test/build.gradle
@@ -49,12 +49,12 @@
task flinkValidatesRunner {
dependsOn ":sdks:go:test:goBuild"
- dependsOn ":runners:flink:1.8:job-server:shadowJar"
+ dependsOn ":runners:flink:1.9:job-server:shadowJar"
doLast {
def options = [
"--runner flink",
"--parallel 1", // prevent memory overuse
- "--flink_job_server_jar ${project(":runners:flink:1.8:job-server").shadowJar.archivePath}",
+ "--flink_job_server_jar ${project(":runners:flink:1.9:job-server").shadowJar.archivePath}",
]
exec {
executable "sh"
diff --git a/sdks/java/build-tools/src/main/resources/beam/suppressions.xml b/sdks/java/build-tools/src/main/resources/beam/suppressions.xml
index 203d92b..41905ab 100644
--- a/sdks/java/build-tools/src/main/resources/beam/suppressions.xml
+++ b/sdks/java/build-tools/src/main/resources/beam/suppressions.xml
@@ -92,5 +92,6 @@
<!-- Checkstyle does not correctly detect package files across multiple source directories. -->
<suppress checks="JavadocPackage" files=".*runners.flink.*CoderTypeSerializer\.java"/>
<suppress checks="JavadocPackage" files=".*runners.flink.*EncodedTypeSerializer\.java"/>
+ <suppress checks="JavadocPackage" files=".*runners.flink.*BeamStoppableFunction\.java"/>
</suppressions>
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java
index 5df351a..f226003 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java
@@ -114,7 +114,7 @@
public static MetricsContainer getCurrentContainer() {
MetricsContainer container = CONTAINER_FOR_THREAD.get();
if (container == null && REPORTED_MISSING_CONTAINER.compareAndSet(false, true)) {
- if (METRICS_SUPPORTED.get()) {
+ if (isMetricsSupported()) {
LOG.error(
"Unable to update metrics on the current thread. "
+ "Most likely caused by using metrics outside the managed work-execution thread.");
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/options/ValueProvider.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/options/ValueProvider.java
index 92f0644..903b4a8 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/options/ValueProvider.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/options/ValueProvider.java
@@ -37,6 +37,7 @@
import java.lang.reflect.InvocationHandler;
import java.lang.reflect.Method;
import java.lang.reflect.Proxy;
+import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import javax.annotation.Nullable;
import org.apache.beam.sdk.annotations.Internal;
@@ -100,6 +101,17 @@
public String toString() {
return String.valueOf(value);
}
+
+ @Override
+ public boolean equals(Object other) {
+ return other instanceof StaticValueProvider
+ && Objects.equals(value, ((StaticValueProvider) other).value);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hashCode(value);
+ }
}
/**
@@ -159,6 +171,18 @@
.add("translator", translator.getClass().getSimpleName())
.toString();
}
+
+ @Override
+ public boolean equals(Object other) {
+ return other instanceof NestedValueProvider
+ && Objects.equals(value, ((NestedValueProvider) other).value)
+ && Objects.equals(translator, ((NestedValueProvider) other).translator);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(value, translator);
+ }
}
/**
@@ -265,6 +289,21 @@
.add("default", defaultValue)
.toString();
}
+
+ @Override
+ public boolean equals(Object other) {
+ return other instanceof RuntimeValueProvider
+ && Objects.equals(klass, ((RuntimeValueProvider) other).klass)
+ && Objects.equals(methodName, ((RuntimeValueProvider) other).methodName)
+ && Objects.equals(propertyName, ((RuntimeValueProvider) other).propertyName)
+ && Objects.equals(defaultValue, ((RuntimeValueProvider) other).defaultValue)
+ && Objects.equals(optionsId, ((RuntimeValueProvider) other).optionsId);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(klass, methodName, propertyName, defaultValue, optionsId);
+ }
}
/** <b>For internal use only; no backwards compatibility guarantees.</b> */
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/CoderProperties.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/CoderProperties.java
index f10e95b..e89cacd 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/CoderProperties.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/CoderProperties.java
@@ -105,6 +105,17 @@
}
/**
+ * Verifies that for the given {@code Coder<T>}, {@code Coder.Context}, and value of type {@code
+ * T}, encoding followed by decoding yields a value of type {@code T} and tests that the matcher
+ * succeeds on the values.
+ */
+ public static <T> void coderDecodeEncodeInContext(
+ Coder<T> coder, Coder.Context context, T value, org.hamcrest.Matcher<T> matcher)
+ throws Exception {
+ assertThat(decodeEncode(coder, context, value), matcher);
+ }
+
+ /**
* Verifies that for the given {@code Coder<Collection<T>>}, and value of type {@code
* Collection<T>}, encoding followed by decoding yields an equal value of type {@code
* Collection<T>}, in any {@code Coder.Context}.
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/UsesStrictTimerOrdering.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/UsesStrictTimerOrdering.java
new file mode 100644
index 0000000..ad9fda1
--- /dev/null
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/UsesStrictTimerOrdering.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.testing;
+
+/**
+ * Category for tests that enforce strict event-time ordering of fired timers, even in situations
+ * where multiple timers mutually set one another and the watermark hops arbitrarily far into the future.
+ */
+public @interface UsesStrictTimerOrdering {}
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/JsonToRow.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/JsonToRow.java
index 33fb2a6..ab0b740 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/JsonToRow.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/JsonToRow.java
@@ -25,7 +25,7 @@
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.Schema.TypeName;
-import org.apache.beam.sdk.util.RowJsonDeserializer;
+import org.apache.beam.sdk.util.RowJson.RowJsonDeserializer;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/ParDo.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/ParDo.java
index fb8524a..ac266d1 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/ParDo.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/ParDo.java
@@ -323,10 +323,10 @@
* {@link DoFn}. This is good if the state needs to be computed by the pipeline, or if the
* state is very large and so is best read from file(s) rather than sent as part of the {@link
* DoFn DoFn's} serialized state.
- * <li>Initialize the state in each {@link DoFn} instance, in a {@link DoFn.StartBundle} method.
- * This is good if the initialization doesn't depend on any information known only by the main
- * program or computed by earlier pipeline operations, but is the same for all instances of
- * this {@link DoFn} for all program executions, say setting up empty caches or initializing
+ * <li>Initialize the state in each {@link DoFn} instance, in a {@link DoFn.Setup} method. This is
+ * good if the initialization doesn't depend on any information known only by the main program
+ * or computed by earlier pipeline operations, but is the same for all instances of this
+ * {@link DoFn} for all program executions, say setting up empty caches or initializing
* constant data.
* </ul>
*
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/ToJson.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/ToJson.java
index 28d6c46..edeea36 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/ToJson.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/ToJson.java
@@ -24,7 +24,7 @@
import javax.annotation.Nullable;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.schemas.Schema;
-import org.apache.beam.sdk.util.RowJsonSerializer;
+import org.apache.beam.sdk.util.RowJson.RowJsonSerializer;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;
@@ -41,8 +41,10 @@
public class ToJson<T> extends PTransform<PCollection<T>, PCollection<String>> {
private transient volatile @Nullable ObjectMapper objectMapper;
- static <T> ToJson<T> of() {
- return new ToJson<T>();
+ private ToJson() {}
+
+ public static <T> ToJson<T> of() {
+ return new ToJson<>();
}
@Override
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJson.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJson.java
new file mode 100644
index 0000000..49ded07
--- /dev/null
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJson.java
@@ -0,0 +1,359 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.util;
+
+import static java.util.stream.Collectors.toList;
+import static org.apache.beam.sdk.schemas.Schema.TypeName.BOOLEAN;
+import static org.apache.beam.sdk.schemas.Schema.TypeName.BYTE;
+import static org.apache.beam.sdk.schemas.Schema.TypeName.DECIMAL;
+import static org.apache.beam.sdk.schemas.Schema.TypeName.DOUBLE;
+import static org.apache.beam.sdk.schemas.Schema.TypeName.FLOAT;
+import static org.apache.beam.sdk.schemas.Schema.TypeName.INT16;
+import static org.apache.beam.sdk.schemas.Schema.TypeName.INT32;
+import static org.apache.beam.sdk.schemas.Schema.TypeName.INT64;
+import static org.apache.beam.sdk.schemas.Schema.TypeName.STRING;
+import static org.apache.beam.sdk.util.RowJsonValueExtractors.booleanValueExtractor;
+import static org.apache.beam.sdk.util.RowJsonValueExtractors.byteValueExtractor;
+import static org.apache.beam.sdk.util.RowJsonValueExtractors.decimalValueExtractor;
+import static org.apache.beam.sdk.util.RowJsonValueExtractors.doubleValueExtractor;
+import static org.apache.beam.sdk.util.RowJsonValueExtractors.floatValueExtractor;
+import static org.apache.beam.sdk.util.RowJsonValueExtractors.intValueExtractor;
+import static org.apache.beam.sdk.util.RowJsonValueExtractors.longValueExtractor;
+import static org.apache.beam.sdk.util.RowJsonValueExtractors.shortValueExtractor;
+import static org.apache.beam.sdk.util.RowJsonValueExtractors.stringValueExtractor;
+import static org.apache.beam.sdk.values.Row.toRow;
+
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
+import com.fasterxml.jackson.databind.node.JsonNodeType;
+import com.fasterxml.jackson.databind.ser.std.StdSerializer;
+import com.google.auto.value.AutoValue;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.util.List;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.schemas.Schema.Field;
+import org.apache.beam.sdk.schemas.Schema.FieldType;
+import org.apache.beam.sdk.schemas.Schema.TypeName;
+import org.apache.beam.sdk.util.RowJsonValueExtractors.ValueExtractor;
+import org.apache.beam.sdk.values.Row;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap;
+
+/**
+ * Jackson serializer and deserializer for {@link Row Rows}.
+ *
+ * <p>Supports converting between JSON primitive types and:
+ *
+ * <ul>
+ * <li>{@link Schema.TypeName#BYTE}
+ * <li>{@link Schema.TypeName#INT16}
+ * <li>{@link Schema.TypeName#INT32}
+ * <li>{@link Schema.TypeName#INT64}
+ * <li>{@link Schema.TypeName#FLOAT}
+ * <li>{@link Schema.TypeName#DOUBLE}
+ * <li>{@link Schema.TypeName#BOOLEAN}
+ * <li>{@link Schema.TypeName#STRING}
+ * </ul>
+ */
+public class RowJson {
+ /** Jackson deserializer for parsing JSON into {@link Row Rows}. */
+ public static class RowJsonDeserializer extends StdDeserializer<Row> {
+
+ private static final boolean SEQUENTIAL = false;
+
+ private static final ImmutableMap<TypeName, ValueExtractor<?>> JSON_VALUE_GETTERS =
+ ImmutableMap.<TypeName, ValueExtractor<?>>builder()
+ .put(BYTE, byteValueExtractor())
+ .put(INT16, shortValueExtractor())
+ .put(INT32, intValueExtractor())
+ .put(INT64, longValueExtractor())
+ .put(FLOAT, floatValueExtractor())
+ .put(DOUBLE, doubleValueExtractor())
+ .put(BOOLEAN, booleanValueExtractor())
+ .put(STRING, stringValueExtractor())
+ .put(DECIMAL, decimalValueExtractor())
+ .build();
+
+ private final Schema schema;
+
+ /** Creates a deserializer for a {@link Row} {@link Schema}. */
+ public static RowJsonDeserializer forSchema(Schema schema) {
+ schema.getFields().forEach(RowJsonValidation::verifyFieldTypeSupported);
+ return new RowJsonDeserializer(schema);
+ }
+
+ private RowJsonDeserializer(Schema schema) {
+ super(Row.class);
+ this.schema = schema;
+ }
+
+ @Override
+ public Row deserialize(JsonParser jsonParser, DeserializationContext deserializationContext)
+ throws IOException {
+
+ // Parse and convert the root object to Row as if it's a nested field with name 'root'
+ return (Row)
+ extractJsonNodeValue(
+ FieldValue.of("root", FieldType.row(schema), jsonParser.readValueAsTree()));
+ }
+
+ private static Object extractJsonNodeValue(FieldValue fieldValue) {
+ if (!fieldValue.isJsonValuePresent()) {
+ throw new UnsupportedRowJsonException(
+ "Field '" + fieldValue.name() + "' is not present in the JSON object");
+ }
+
+ if (fieldValue.isJsonNull()) {
+ return null;
+ }
+
+ if (fieldValue.isRowType()) {
+ return jsonObjectToRow(fieldValue);
+ }
+
+ if (fieldValue.isArrayType()) {
+ return jsonArrayToList(fieldValue);
+ }
+
+ return extractJsonPrimitiveValue(fieldValue);
+ }
+
+ private static Row jsonObjectToRow(FieldValue rowFieldValue) {
+ if (!rowFieldValue.isJsonObject()) {
+ throw new UnsupportedRowJsonException(
+ "Expected JSON object for field '"
+ + rowFieldValue.name()
+ + "'. Unable to convert '"
+ + rowFieldValue.jsonValue().asText()
+ + "' to Beam Row, it is not a JSON object. Currently only JSON objects can be parsed to Beam Rows");
+ }
+
+ return rowFieldValue.rowSchema().getFields().stream()
+ .map(
+ schemaField ->
+ extractJsonNodeValue(
+ FieldValue.of(
+ schemaField.getName(),
+ schemaField.getType(),
+ rowFieldValue.jsonFieldValue(schemaField.getName()))))
+ .collect(toRow(rowFieldValue.rowSchema()));
+ }
+
+ private static Object jsonArrayToList(FieldValue arrayFieldValue) {
+ if (!arrayFieldValue.isJsonArray()) {
+ throw new UnsupportedRowJsonException(
+ "Expected JSON array for field '"
+ + arrayFieldValue.name()
+ + "'. Instead got "
+ + arrayFieldValue.jsonNodeType().name());
+ }
+
+ return arrayFieldValue
+ .jsonArrayElements()
+ .map(
+ jsonArrayElement ->
+ extractJsonNodeValue(
+ FieldValue.of(
+ arrayFieldValue.name() + "[]",
+ arrayFieldValue.arrayElementType(),
+ jsonArrayElement)))
+ .collect(toList());
+ }
+
+ private static Object extractJsonPrimitiveValue(FieldValue fieldValue) {
+ try {
+ return JSON_VALUE_GETTERS.get(fieldValue.typeName()).extractValue(fieldValue.jsonValue());
+ } catch (RuntimeException e) {
+ throw new UnsupportedRowJsonException(
+ "Unable to get value from field '"
+ + fieldValue.name()
+ + "'. Schema type '"
+ + fieldValue.typeName()
+ + "'. JSON node type "
+ + fieldValue.jsonNodeType().name(),
+ e);
+ }
+ }
+
+ /**
+ * Helper class to keep track of schema field type, name, and actual json value for the field.
+ */
+ @AutoValue
+ abstract static class FieldValue {
+ abstract String name();
+
+ abstract FieldType type();
+
+ abstract @Nullable JsonNode jsonValue();
+
+ TypeName typeName() {
+ return type().getTypeName();
+ }
+
+ boolean isJsonValuePresent() {
+ return jsonValue() != null;
+ }
+
+ boolean isJsonNull() {
+ return jsonValue().isNull();
+ }
+
+ JsonNodeType jsonNodeType() {
+ return jsonValue().getNodeType();
+ }
+
+ boolean isJsonArray() {
+ return jsonValue().isArray();
+ }
+
+ Stream<JsonNode> jsonArrayElements() {
+ return StreamSupport.stream(jsonValue().spliterator(), SEQUENTIAL);
+ }
+
+ boolean isArrayType() {
+ return TypeName.ARRAY.equals(type().getTypeName());
+ }
+
+ FieldType arrayElementType() {
+ return type().getCollectionElementType();
+ }
+
+ boolean isJsonObject() {
+ return jsonValue().isObject();
+ }
+
+ JsonNode jsonFieldValue(String fieldName) {
+ return jsonValue().get(fieldName);
+ }
+
+ boolean isRowType() {
+ return TypeName.ROW.equals(type().getTypeName());
+ }
+
+ Schema rowSchema() {
+ return type().getRowSchema();
+ }
+
+ static FieldValue of(String name, FieldType type, JsonNode jsonValue) {
+ return new AutoValue_RowJson_RowJsonDeserializer_FieldValue(name, type, jsonValue);
+ }
+ }
+
+ /** Gets thrown when Row parsing fails for any reason. */
+ public static class UnsupportedRowJsonException extends RuntimeException {
+
+ UnsupportedRowJsonException(String message, Throwable reason) {
+ super(message, reason);
+ }
+
+ UnsupportedRowJsonException(String message) {
+ super(message);
+ }
+ }
+ }
+
+ /** Jackson serializer for converting {@link Row Rows} to JSON. */
+ public static class RowJsonSerializer extends StdSerializer<Row> {
+
+ private final Schema schema;
+
+ /** Creates a serializer for a {@link Row} {@link Schema}. */
+ public static RowJsonSerializer forSchema(Schema schema) {
+ schema.getFields().forEach(RowJsonValidation::verifyFieldTypeSupported);
+ return new RowJsonSerializer(schema);
+ }
+
+ private RowJsonSerializer(Schema schema) {
+ super(Row.class);
+ this.schema = schema;
+ }
+
+ @Override
+ public void serialize(Row value, JsonGenerator gen, SerializerProvider provider)
+ throws IOException {
+ writeRow(value, this.schema, gen);
+ }
+
+ // TODO: ByteBuddy generate based on schema?
+ private void writeRow(Row row, Schema schema, JsonGenerator gen) throws IOException {
+ gen.writeStartObject();
+ for (int i = 0; i < schema.getFieldCount(); ++i) {
+ Field field = schema.getField(i);
+ Object value = row.getValue(i);
+ gen.writeFieldName(field.getName());
+ if (field.getType().getNullable() && value == null) {
+ gen.writeNull();
+ continue;
+ }
+ writeValue(gen, field.getType(), value);
+ }
+ gen.writeEndObject();
+ }
+
+ private void writeValue(JsonGenerator gen, FieldType type, Object value) throws IOException {
+ switch (type.getTypeName()) {
+ case BOOLEAN:
+ gen.writeBoolean((boolean) value);
+ break;
+ case STRING:
+ gen.writeString((String) value);
+ break;
+ case BYTE:
+ gen.writeNumber((byte) value);
+ break;
+ case DOUBLE:
+ gen.writeNumber((double) value);
+ break;
+ case FLOAT:
+ gen.writeNumber((float) value);
+ break;
+ case INT16:
+ gen.writeNumber((short) value);
+ break;
+ case INT32:
+ gen.writeNumber((int) value);
+ break;
+ case INT64:
+ gen.writeNumber((long) value);
+ break;
+ case DECIMAL:
+ gen.writeNumber((BigDecimal) value);
+ break;
+ case ARRAY:
+ gen.writeStartArray();
+ for (Object element : (List<Object>) value) {
+ writeValue(gen, type.getCollectionElementType(), element);
+ }
+ gen.writeEndArray();
+ break;
+ case ROW:
+ writeRow((Row) value, type.getRowSchema(), gen);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported field type: " + type);
+ }
+ }
+ }
+}
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonDeserializer.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonDeserializer.java
deleted file mode 100644
index 1929726..0000000
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonDeserializer.java
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.sdk.util;
-
-import static java.util.stream.Collectors.toList;
-import static org.apache.beam.sdk.schemas.Schema.TypeName.BOOLEAN;
-import static org.apache.beam.sdk.schemas.Schema.TypeName.BYTE;
-import static org.apache.beam.sdk.schemas.Schema.TypeName.DECIMAL;
-import static org.apache.beam.sdk.schemas.Schema.TypeName.DOUBLE;
-import static org.apache.beam.sdk.schemas.Schema.TypeName.FLOAT;
-import static org.apache.beam.sdk.schemas.Schema.TypeName.INT16;
-import static org.apache.beam.sdk.schemas.Schema.TypeName.INT32;
-import static org.apache.beam.sdk.schemas.Schema.TypeName.INT64;
-import static org.apache.beam.sdk.schemas.Schema.TypeName.STRING;
-import static org.apache.beam.sdk.util.RowJsonValueExtractors.booleanValueExtractor;
-import static org.apache.beam.sdk.util.RowJsonValueExtractors.byteValueExtractor;
-import static org.apache.beam.sdk.util.RowJsonValueExtractors.decimalValueExtractor;
-import static org.apache.beam.sdk.util.RowJsonValueExtractors.doubleValueExtractor;
-import static org.apache.beam.sdk.util.RowJsonValueExtractors.floatValueExtractor;
-import static org.apache.beam.sdk.util.RowJsonValueExtractors.intValueExtractor;
-import static org.apache.beam.sdk.util.RowJsonValueExtractors.longValueExtractor;
-import static org.apache.beam.sdk.util.RowJsonValueExtractors.shortValueExtractor;
-import static org.apache.beam.sdk.util.RowJsonValueExtractors.stringValueExtractor;
-import static org.apache.beam.sdk.values.Row.toRow;
-
-import com.fasterxml.jackson.core.JsonParser;
-import com.fasterxml.jackson.databind.DeserializationContext;
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
-import com.fasterxml.jackson.databind.node.JsonNodeType;
-import com.google.auto.value.AutoValue;
-import java.io.IOException;
-import java.util.stream.Stream;
-import java.util.stream.StreamSupport;
-import javax.annotation.Nullable;
-import org.apache.beam.sdk.schemas.Schema;
-import org.apache.beam.sdk.schemas.Schema.FieldType;
-import org.apache.beam.sdk.schemas.Schema.TypeName;
-import org.apache.beam.sdk.util.RowJsonValueExtractors.ValueExtractor;
-import org.apache.beam.sdk.values.Row;
-import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap;
-
-/**
- * Jackson deserializer for {@link Row Rows}.
- *
- * <p>Supports converting JSON primitive types to:
- *
- * <ul>
- * <li>{@link Schema.TypeName#BYTE}
- * <li>{@link Schema.TypeName#INT16}
- * <li>{@link Schema.TypeName#INT32}
- * <li>{@link Schema.TypeName#INT64}
- * <li>{@link Schema.TypeName#FLOAT}
- * <li>{@link Schema.TypeName#DOUBLE}
- * <li>{@link Schema.TypeName#BOOLEAN}
- * <li>{@link Schema.TypeName#STRING}
- * </ul>
- */
-public class RowJsonDeserializer extends StdDeserializer<Row> {
-
- private static final boolean SEQUENTIAL = false;
-
- private static final ImmutableMap<TypeName, ValueExtractor<?>> JSON_VALUE_GETTERS =
- ImmutableMap.<TypeName, ValueExtractor<?>>builder()
- .put(BYTE, byteValueExtractor())
- .put(INT16, shortValueExtractor())
- .put(INT32, intValueExtractor())
- .put(INT64, longValueExtractor())
- .put(FLOAT, floatValueExtractor())
- .put(DOUBLE, doubleValueExtractor())
- .put(BOOLEAN, booleanValueExtractor())
- .put(STRING, stringValueExtractor())
- .put(DECIMAL, decimalValueExtractor())
- .build();
-
- private final Schema schema;
-
- /** Creates a deserializer for a {@link Row} {@link Schema}. */
- public static RowJsonDeserializer forSchema(Schema schema) {
- schema.getFields().forEach(RowJsonValidation::verifyFieldTypeSupported);
- return new RowJsonDeserializer(schema);
- }
-
- private RowJsonDeserializer(Schema schema) {
- super(Row.class);
- this.schema = schema;
- }
-
- @Override
- public Row deserialize(JsonParser jsonParser, DeserializationContext deserializationContext)
- throws IOException {
-
- // Parse and convert the root object to Row as if it's a nested field with name 'root'
- return (Row)
- extractJsonNodeValue(
- FieldValue.of("root", FieldType.row(schema), jsonParser.readValueAsTree()));
- }
-
- private static Object extractJsonNodeValue(FieldValue fieldValue) {
- if (!fieldValue.isJsonValuePresent()) {
- throw new UnsupportedRowJsonException(
- "Field '" + fieldValue.name() + "' is not present in the JSON object");
- }
-
- if (fieldValue.isJsonNull()) {
- return null;
- }
-
- if (fieldValue.isRowType()) {
- return jsonObjectToRow(fieldValue);
- }
-
- if (fieldValue.isArrayType()) {
- return jsonArrayToList(fieldValue);
- }
-
- return extractJsonPrimitiveValue(fieldValue);
- }
-
- private static Row jsonObjectToRow(FieldValue rowFieldValue) {
- if (!rowFieldValue.isJsonObject()) {
- throw new UnsupportedRowJsonException(
- "Expected JSON object for field '"
- + rowFieldValue.name()
- + "'. "
- + "Unable to convert '"
- + rowFieldValue.jsonValue().asText()
- + "'"
- + " to Beam Row, it is not a JSON object. Currently only JSON objects "
- + "can be parsed to Beam Rows");
- }
-
- return rowFieldValue.rowSchema().getFields().stream()
- .map(
- schemaField ->
- extractJsonNodeValue(
- FieldValue.of(
- schemaField.getName(),
- schemaField.getType(),
- rowFieldValue.jsonFieldValue(schemaField.getName()))))
- .collect(toRow(rowFieldValue.rowSchema()));
- }
-
- private static Object jsonArrayToList(FieldValue arrayFieldValue) {
- if (!arrayFieldValue.isJsonArray()) {
- throw new UnsupportedRowJsonException(
- "Expected JSON array for field '"
- + arrayFieldValue.name()
- + "'. "
- + "Instead got "
- + arrayFieldValue.jsonNodeType().name());
- }
-
- return arrayFieldValue
- .jsonArrayElements()
- .map(
- jsonArrayElement ->
- extractJsonNodeValue(
- FieldValue.of(
- arrayFieldValue.name() + "[]",
- arrayFieldValue.arrayElementType(),
- jsonArrayElement)))
- .collect(toList());
- }
-
- private static Object extractJsonPrimitiveValue(FieldValue fieldValue) {
- try {
- return JSON_VALUE_GETTERS.get(fieldValue.typeName()).extractValue(fieldValue.jsonValue());
- } catch (RuntimeException e) {
- throw new UnsupportedRowJsonException(
- "Unable to get value from field '"
- + fieldValue.name()
- + "'. "
- + "Schema type '"
- + fieldValue.typeName()
- + "'. "
- + "JSON node type "
- + fieldValue.jsonNodeType().name(),
- e);
- }
- }
-
- /** Helper class to keep track of schema field type, name, and actual json value for the field. */
- @AutoValue
- abstract static class FieldValue {
- abstract String name();
-
- abstract FieldType type();
-
- abstract @Nullable JsonNode jsonValue();
-
- TypeName typeName() {
- return type().getTypeName();
- }
-
- boolean isJsonValuePresent() {
- return jsonValue() != null;
- }
-
- boolean isJsonNull() {
- return jsonValue().isNull();
- }
-
- JsonNodeType jsonNodeType() {
- return jsonValue().getNodeType();
- }
-
- boolean isJsonArray() {
- return jsonValue().isArray();
- }
-
- Stream<JsonNode> jsonArrayElements() {
- return StreamSupport.stream(jsonValue().spliterator(), SEQUENTIAL);
- }
-
- boolean isArrayType() {
- return TypeName.ARRAY.equals(type().getTypeName());
- }
-
- FieldType arrayElementType() {
- return type().getCollectionElementType();
- }
-
- boolean isJsonObject() {
- return jsonValue().isObject();
- }
-
- JsonNode jsonFieldValue(String fieldName) {
- return jsonValue().get(fieldName);
- }
-
- boolean isRowType() {
- return TypeName.ROW.equals(type().getTypeName());
- }
-
- Schema rowSchema() {
- return type().getRowSchema();
- }
-
- static FieldValue of(String name, FieldType type, JsonNode jsonValue) {
- return new AutoValue_RowJsonDeserializer_FieldValue(name, type, jsonValue);
- }
- }
-
- /** Gets thrown when Row parsing fails for any reason. */
- public static class UnsupportedRowJsonException extends RuntimeException {
-
- UnsupportedRowJsonException(String message, Throwable reason) {
- super(message, reason);
- }
-
- UnsupportedRowJsonException(String message) {
- super(message);
- }
- }
-}
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonSerializer.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonSerializer.java
deleted file mode 100644
index 0cb1672..0000000
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonSerializer.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.sdk.util;
-
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.fasterxml.jackson.databind.SerializerProvider;
-import com.fasterxml.jackson.databind.ser.std.StdSerializer;
-import java.io.IOException;
-import java.math.BigDecimal;
-import java.util.List;
-import org.apache.beam.sdk.schemas.Schema;
-import org.apache.beam.sdk.schemas.Schema.Field;
-import org.apache.beam.sdk.schemas.Schema.FieldType;
-import org.apache.beam.sdk.values.Row;
-
-public class RowJsonSerializer extends StdSerializer<Row> {
-
- private final Schema schema;
-
- /** Creates a serializer for a {@link Row} {@link Schema}. */
- public static RowJsonSerializer forSchema(Schema schema) {
- schema.getFields().forEach(RowJsonValidation::verifyFieldTypeSupported);
- return new RowJsonSerializer(schema);
- }
-
- private RowJsonSerializer(Schema schema) {
- super(Row.class);
- this.schema = schema;
- }
-
- @Override
- public void serialize(Row value, JsonGenerator gen, SerializerProvider provider)
- throws IOException {
- writeRow(value, this.schema, gen);
- }
-
- // TODO: ByteBuddy generate based on schema?
- private void writeRow(Row row, Schema schema, JsonGenerator gen) throws IOException {
- gen.writeStartObject();
- for (int i = 0; i < schema.getFieldCount(); ++i) {
- Field field = schema.getField(i);
- Object value = row.getValue(i);
- gen.writeFieldName(field.getName());
- if (field.getType().getNullable() && value == null) {
- gen.writeNull();
- continue;
- }
- writeValue(gen, field.getType(), value);
- }
- gen.writeEndObject();
- }
-
- private void writeValue(JsonGenerator gen, FieldType type, Object value) throws IOException {
- switch (type.getTypeName()) {
- case BOOLEAN:
- gen.writeBoolean((boolean) value);
- break;
- case STRING:
- gen.writeString((String) value);
- break;
- case BYTE:
- gen.writeNumber((byte) value);
- break;
- case DOUBLE:
- gen.writeNumber((double) value);
- break;
- case FLOAT:
- gen.writeNumber((float) value);
- break;
- case INT16:
- gen.writeNumber((short) value);
- break;
- case INT32:
- gen.writeNumber((int) value);
- break;
- case INT64:
- gen.writeNumber((long) value);
- break;
- case DECIMAL:
- gen.writeNumber((BigDecimal) value);
- break;
- case ARRAY:
- gen.writeStartArray();
- for (Object element : (List<Object>) value) {
- writeValue(gen, type.getCollectionElementType(), element);
- }
- gen.writeEndArray();
- break;
- case ROW:
- writeRow((Row) value, type.getRowSchema(), gen);
- break;
- default:
- throw new IllegalArgumentException("Unsupported field type: " + type);
- }
- }
-}
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonUtils.java
index 598dc74..a882625 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonUtils.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonUtils.java
@@ -24,14 +24,17 @@
import com.fasterxml.jackson.databind.module.SimpleModule;
import java.io.IOException;
import org.apache.beam.sdk.annotations.Internal;
-import org.apache.beam.sdk.util.RowJsonDeserializer.UnsupportedRowJsonException;
+import org.apache.beam.sdk.util.RowJson.RowJsonDeserializer.UnsupportedRowJsonException;
import org.apache.beam.sdk.values.Row;
-/** Utilities for working with {@link RowJsonSerializer} and {@link RowJsonDeserializer}. */
+/**
+ * Utilities for working with {@link RowJson.RowJsonSerializer} and {@link
+ * RowJson.RowJsonDeserializer}.
+ */
@Internal
public class RowJsonUtils {
- public static ObjectMapper newObjectMapperWith(RowJsonDeserializer deserializer) {
+ public static ObjectMapper newObjectMapperWith(RowJson.RowJsonDeserializer deserializer) {
SimpleModule module = new SimpleModule("rowDeserializationModule");
module.addDeserializer(Row.class, deserializer);
@@ -41,7 +44,7 @@
return objectMapper;
}
- public static ObjectMapper newObjectMapperWith(RowJsonSerializer serializer) {
+ public static ObjectMapper newObjectMapperWith(RowJson.RowJsonSerializer serializer) {
SimpleModule module = new SimpleModule("rowSerializationModule");
module.addSerializer(Row.class, serializer);
@@ -65,7 +68,7 @@
try {
return objectMapper.writeValueAsString(row);
} catch (JsonProcessingException e) {
- throw new IllegalArgumentException("Unable to serilize row: " + row);
+ throw new IllegalArgumentException("Unable to serialize row: " + row, e);
}
}
}
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonValidation.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonValidation.java
index 2ab7aec..69cd1d3 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonValidation.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonValidation.java
@@ -60,7 +60,7 @@
}
if (!SUPPORTED_TYPES.contains(fieldTypeName)) {
- throw new RowJsonDeserializer.UnsupportedRowJsonException(
+ throw new RowJson.RowJsonDeserializer.UnsupportedRowJsonException(
fieldTypeName.name()
+ " is not supported when converting JSON objects to Rows. "
+ "Supported types are: "
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonValueExtractors.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonValueExtractors.java
index 4db0823..13bf854e 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonValueExtractors.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowJsonValueExtractors.java
@@ -22,7 +22,7 @@
import java.math.BigDecimal;
import java.util.function.Function;
import java.util.function.Predicate;
-import org.apache.beam.sdk.util.RowJsonDeserializer.UnsupportedRowJsonException;
+import org.apache.beam.sdk.util.RowJson.RowJsonDeserializer.UnsupportedRowJsonException;
/**
* Contains utilities for extracting primitive values from JSON nodes.
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/options/ValueProviderTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/options/ValueProviderTest.java
index 6d410f7..6c85ffd 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/options/ValueProviderTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/options/ValueProviderTest.java
@@ -93,6 +93,7 @@
assertEquals("foo", provider.get());
assertTrue(provider.isAccessible());
assertEquals("foo", provider.toString());
+ assertEquals(provider, StaticValueProvider.of("foo"));
}
@Test
@@ -120,6 +121,7 @@
TestOptions options = PipelineOptionsFactory.as(TestOptions.class);
ValueProvider<String> provider = options.getBar();
assertFalse(provider.isAccessible());
+ assertEquals(provider, options.getBar());
}
@Test
@@ -232,11 +234,13 @@
@Test
public void testNestedValueProviderStatic() throws Exception {
+ SerializableFunction<String, String> function = from -> from + "bar";
ValueProvider<String> svp = StaticValueProvider.of("foo");
- ValueProvider<String> nvp = NestedValueProvider.of(svp, from -> from + "bar");
+ ValueProvider<String> nvp = NestedValueProvider.of(svp, function);
assertTrue(nvp.isAccessible());
assertEquals("foobar", nvp.get());
assertEquals("foobar", nvp.toString());
+ assertEquals(nvp, NestedValueProvider.of(svp, function));
}
@Test
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java
index ee7c784..db57335 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java
@@ -46,11 +46,17 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
+import java.util.function.IntFunction;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.LongStream;
+import java.util.stream.StreamSupport;
import org.apache.beam.sdk.coders.AtomicCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
@@ -58,6 +64,7 @@
import org.apache.beam.sdk.coders.SetCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
+import org.apache.beam.sdk.coders.VoidCoder;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.PipelineOptions;
@@ -84,6 +91,7 @@
import org.apache.beam.sdk.testing.UsesSideInputs;
import org.apache.beam.sdk.testing.UsesSideInputsWithDifferentCoders;
import org.apache.beam.sdk.testing.UsesStatefulParDo;
+import org.apache.beam.sdk.testing.UsesStrictTimerOrdering;
import org.apache.beam.sdk.testing.UsesTestStream;
import org.apache.beam.sdk.testing.UsesTestStreamWithProcessingTime;
import org.apache.beam.sdk.testing.UsesTimersInParDo;
@@ -103,13 +111,16 @@
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.util.common.ElementByteSizeObserver;
import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.PDone;
import org.apache.beam.sdk.values.TimestampedValue;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;
import org.apache.beam.sdk.values.TypeDescriptor;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Joiner;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
@@ -3488,6 +3499,158 @@
pipeline.run();
}
+ /** A test that makes sure that event time timers are correctly ordered. */
+ @Test
+ @Category({
+ ValidatesRunner.class,
+ UsesTimersInParDo.class,
+ UsesTestStream.class,
+ UsesStatefulParDo.class,
+ UsesStrictTimerOrdering.class
+ })
+ public void testEventTimeTimerOrdering() throws Exception {
+ final int numTestElements = 100;
+ final Instant now = new Instant(1500000000000L);
+ TestStream.Builder<KV<String, String>> builder =
+ TestStream.create(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))
+ .advanceWatermarkTo(new Instant(0));
+
+ for (int i = 0; i < numTestElements; i++) {
+ builder = builder.addElements(TimestampedValue.of(KV.of("dummy", "" + i), now.plus(i)));
+ builder = builder.advanceWatermarkTo(now.plus(i / 10 * 10));
+ }
+
+ testEventTimeTimerOrderingWithInputPTransform(
+ now, numTestElements, builder.advanceWatermarkToInfinity());
+ }
+
+ /** A test that makes sure that event time timers are correctly ordered using the Create transform. */
+ @Test
+ @Category({
+ ValidatesRunner.class,
+ UsesTimersInParDo.class,
+ UsesStatefulParDo.class,
+ UsesStrictTimerOrdering.class
+ })
+ public void testEventTimeTimerOrderingWithCreate() throws Exception {
+ final int numTestElements = 100;
+ final Instant now = new Instant(1500000000000L);
+
+ List<TimestampedValue<KV<String, String>>> elements = new ArrayList<>();
+ for (int i = 0; i < numTestElements; i++) {
+ elements.add(TimestampedValue.of(KV.of("dummy", "" + i), now.plus(i)));
+ }
+
+ testEventTimeTimerOrderingWithInputPTransform(
+ now, numTestElements, Create.timestamped(elements));
+ }
+
+ private void testEventTimeTimerOrderingWithInputPTransform(
+ Instant now,
+ int numTestElements,
+ PTransform<PBegin, PCollection<KV<String, String>>> transform)
+ throws Exception {
+
+ final String timerIdBagAppend = "append";
+ final String timerIdGc = "gc";
+ final String bag = "bag";
+ final String minTimestamp = "minTs";
+ final Instant gcTimerStamp = now.plus(numTestElements + 1);
+
+ DoFn<KV<String, String>, String> fn =
+ new DoFn<KV<String, String>, String>() {
+
+ @TimerId(timerIdBagAppend)
+ private final TimerSpec appendSpec = TimerSpecs.timer(TimeDomain.EVENT_TIME);
+
+ @TimerId(timerIdGc)
+ private final TimerSpec gcSpec = TimerSpecs.timer(TimeDomain.EVENT_TIME);
+
+ @StateId(bag)
+ private final StateSpec<BagState<TimestampedValue<String>>> bagStateSpec =
+ StateSpecs.bag();
+
+ @StateId(minTimestamp)
+ private final StateSpec<ValueState<Instant>> minTimestampSpec = StateSpecs.value();
+
+ @ProcessElement
+ public void processElement(
+ ProcessContext context,
+ @TimerId(timerIdBagAppend) Timer bagTimer,
+ @TimerId(timerIdGc) Timer gcTimer,
+ @StateId(bag) BagState<TimestampedValue<String>> bagState,
+ @StateId(minTimestamp) ValueState<Instant> minStampState) {
+
+ Instant currentMinStamp =
+ MoreObjects.firstNonNull(minStampState.read(), BoundedWindow.TIMESTAMP_MAX_VALUE);
+ if (currentMinStamp.equals(BoundedWindow.TIMESTAMP_MAX_VALUE)) {
+ gcTimer.set(gcTimerStamp);
+ }
+ if (currentMinStamp.isAfter(context.timestamp())) {
+ minStampState.write(context.timestamp());
+ bagTimer.set(context.timestamp());
+ }
+ bagState.add(TimestampedValue.of(context.element().getValue(), context.timestamp()));
+ }
+
+ @OnTimer(timerIdBagAppend)
+ public void onTimer(
+ OnTimerContext context,
+ @TimerId(timerIdBagAppend) Timer timer,
+ @StateId(bag) BagState<TimestampedValue<String>> bagState) {
+
+ List<TimestampedValue<String>> flush = new ArrayList<>();
+ Instant flushTime = context.timestamp();
+ for (TimestampedValue<String> val : bagState.read()) {
+ if (!val.getTimestamp().isAfter(flushTime)) {
+ flush.add(val);
+ }
+ }
+ flush.sort(Comparator.comparing(TimestampedValue::getTimestamp));
+ context.output(
+ Joiner.on(":").join(flush.stream().map(TimestampedValue::getValue).iterator()));
+ Instant newMinStamp = flushTime.plus(1);
+ if (flush.size() < numTestElements) {
+ timer.set(newMinStamp);
+ }
+ }
+
+ @OnTimer(timerIdGc)
+ public void onTimer(
+ OnTimerContext context, @StateId(bag) BagState<TimestampedValue<String>> bagState) {
+
+ String output =
+ Joiner.on(":")
+ .join(
+ StreamSupport.stream(bagState.read().spliterator(), false)
+ .sorted(Comparator.comparing(TimestampedValue::getTimestamp))
+ .map(TimestampedValue::getValue)
+ .iterator())
+ + ":cleanup";
+ context.output(output);
+ bagState.clear();
+ }
+ };
+
+ PCollection<String> output = pipeline.apply(transform).apply(ParDo.of(fn));
+ List<String> expected =
+ IntStream.rangeClosed(0, numTestElements)
+ .mapToObj(expandFn(numTestElements))
+ .collect(Collectors.toList());
+ PAssert.that(output).containsInAnyOrder(expected);
+ pipeline.run();
+ }
+
+ private IntFunction<String> expandFn(int numTestElements) {
+ return i ->
+ Joiner.on(":")
+ .join(
+ IntStream.rangeClosed(0, Math.min(numTestElements - 1, i))
+ .mapToObj(String::valueOf)
+ .iterator())
+ + (i == numTestElements ? ":cleanup" : "");
+ }
+
@Test
@Category({
ValidatesRunner.class,
@@ -3538,6 +3701,134 @@
pipeline.run().waitUntilFinish();
}
+
+ @Test
+ @Category({
+ ValidatesRunner.class,
+ UsesTimersInParDo.class,
+ UsesTestStream.class,
+ UsesStrictTimerOrdering.class
+ })
+ public void testTwoTimersSettingEachOther() {
+ Instant now = new Instant(1500000000000L);
+ Instant end = now.plus(100);
+ TestStream<KV<Void, Void>> input =
+ TestStream.create(KvCoder.of(VoidCoder.of(), VoidCoder.of()))
+ .addElements(KV.of(null, null))
+ .advanceWatermarkToInfinity();
+ pipeline.apply(TwoTimerTest.of(now, end, input));
+ pipeline.run();
+ }
+
+ @Test
+ @Category({ValidatesRunner.class, UsesTimersInParDo.class, UsesStrictTimerOrdering.class})
+ public void testTwoTimersSettingEachOtherWithCreateAsInput() {
+ Instant now = new Instant(1500000000000L);
+ Instant end = now.plus(100);
+ pipeline.apply(TwoTimerTest.of(now, end, Create.of(KV.of(null, null))));
+ pipeline.run();
+ }
+
+ private static class TwoTimerTest extends PTransform<PBegin, PDone> {
+
+ private static PTransform<PBegin, PDone> of(
+ Instant start, Instant end, PTransform<PBegin, PCollection<KV<Void, Void>>> input) {
+ return new TwoTimerTest(start, end, input);
+ }
+
+ private final Instant start;
+ private final Instant end;
+ private final transient PTransform<PBegin, PCollection<KV<Void, Void>>> inputPTransform;
+
+ public TwoTimerTest(
+ Instant start, Instant end, PTransform<PBegin, PCollection<KV<Void, Void>>> input) {
+ this.start = start;
+ this.end = end;
+ this.inputPTransform = input;
+ }
+
+ @Override
+ public PDone expand(PBegin input) {
+
+ final String timerName1 = "t1";
+ final String timerName2 = "t2";
+ final String countStateName = "count";
+ PCollection<String> result =
+ input
+ .apply(inputPTransform)
+ .apply(
+ ParDo.of(
+ new DoFn<KV<Void, Void>, String>() {
+
+ @TimerId(timerName1)
+ final TimerSpec timerSpec1 = TimerSpecs.timer(TimeDomain.EVENT_TIME);
+
+ @TimerId(timerName2)
+ final TimerSpec timerSpec2 = TimerSpecs.timer(TimeDomain.EVENT_TIME);
+
+ @StateId(countStateName)
+ final StateSpec<ValueState<Integer>> countStateSpec = StateSpecs.value();
+
+ @ProcessElement
+ public void processElement(
+ ProcessContext context,
+ @TimerId(timerName1) Timer t1,
+ @TimerId(timerName2) Timer t2,
+ @StateId(countStateName) ValueState<Integer> state) {
+
+ state.write(0);
+ t1.set(start);
+ // set the t2 timer after end, so that we test that
+ // timers are correctly ordered in this case
+ t2.set(end.plus(1));
+ }
+
+ @OnTimer(timerName1)
+ public void onTimer1(
+ OnTimerContext context,
+ @TimerId(timerName2) Timer t2,
+ @StateId(countStateName) ValueState<Integer> state) {
+
+ Integer current = state.read();
+ t2.set(context.timestamp());
+
+ context.output(
+ "t1:"
+ + current
+ + ":"
+ + context.timestamp().minus(start.getMillis()).getMillis());
+ }
+
+ @OnTimer(timerName2)
+ public void onTimer2(
+ OnTimerContext context,
+ @TimerId(timerName1) Timer t1,
+ @StateId(countStateName) ValueState<Integer> state) {
+ Integer current = state.read();
+ if (context.timestamp().isBefore(end)) {
+ state.write(current + 1);
+ t1.set(context.timestamp().plus(1));
+ } else {
+ state.write(-1);
+ }
+ context.output(
+ "t2:"
+ + current
+ + ":"
+ + context.timestamp().minus(start.getMillis()).getMillis());
+ }
+ }));
+
+ List<String> expected =
+ LongStream.rangeClosed(0, 100)
+ .mapToObj(e -> (Long) e)
+ .flatMap(e -> Arrays.asList("t1:" + e + ":" + e, "t2:" + e + ":" + e).stream())
+ .collect(Collectors.toList());
+ PAssert.that(result).containsInAnyOrder(expected);
+
+ return PDone.in(input.getPipeline());
+ }
+ }
}
/** Tests validating Timer coder inference behaviors. */
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/RowJsonTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/RowJsonTest.java
index f1f3fe9..3277bb0 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/RowJsonTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/RowJsonTest.java
@@ -30,7 +30,7 @@
import java.util.Collection;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.Schema.FieldType;
-import org.apache.beam.sdk.util.RowJsonDeserializer.UnsupportedRowJsonException;
+import org.apache.beam.sdk.util.RowJson.RowJsonDeserializer.UnsupportedRowJsonException;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
import org.hamcrest.Matcher;
@@ -45,7 +45,7 @@
import org.junit.runners.Parameterized.Parameter;
import org.junit.runners.Parameterized.Parameters;
-/** Unit tests for {@link RowJsonDeserializer} and {@link RowJsonSerializer}. */
+/** Unit tests for {@link RowJson.RowJsonDeserializer} and {@link RowJson.RowJsonSerializer}. */
@RunWith(Enclosed.class)
public class RowJsonTest {
@RunWith(Parameterized.class)
@@ -313,7 +313,7 @@
thrown.expect(UnsupportedRowJsonException.class);
thrown.expectMessage("DATETIME is not supported");
- RowJsonDeserializer.forSchema(schema);
+ RowJson.RowJsonDeserializer.forSchema(schema);
}
@Test
@@ -323,7 +323,7 @@
thrown.expect(UnsupportedRowJsonException.class);
thrown.expectMessage("DATETIME is not supported");
- RowJsonDeserializer.forSchema(schema);
+ RowJson.RowJsonDeserializer.forSchema(schema);
}
@Test
@@ -336,7 +336,7 @@
thrown.expect(UnsupportedRowJsonException.class);
thrown.expectMessage("DATETIME is not supported");
- RowJsonDeserializer.forSchema(schema);
+ RowJson.RowJsonDeserializer.forSchema(schema);
}
@Test
@@ -511,8 +511,8 @@
private static ObjectMapper newObjectMapperFor(Schema schema) {
SimpleModule simpleModule = new SimpleModule("rowSerializationTesModule");
- simpleModule.addSerializer(Row.class, RowJsonSerializer.forSchema(schema));
- simpleModule.addDeserializer(Row.class, RowJsonDeserializer.forSchema(schema));
+ simpleModule.addSerializer(Row.class, RowJson.RowJsonSerializer.forSchema(schema));
+ simpleModule.addDeserializer(Row.class, RowJson.RowJsonDeserializer.forSchema(schema));
ObjectMapper objectMapper = new ObjectMapper();
objectMapper.registerModule(simpleModule);
return objectMapper;
diff --git a/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/DynamicProtoCoder.java b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/DynamicProtoCoder.java
new file mode 100644
index 0000000..96ca0fa
--- /dev/null
+++ b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/DynamicProtoCoder.java
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.protobuf;
+
+import com.google.protobuf.Descriptors;
+import com.google.protobuf.DynamicMessage;
+import com.google.protobuf.Message;
+import com.google.protobuf.Parser;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import org.apache.beam.sdk.coders.CannotProvideCoderException;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.CoderProvider;
+import org.apache.beam.sdk.coders.CoderRegistry;
+import org.apache.beam.sdk.coders.DefaultCoder;
+import org.apache.beam.sdk.values.TypeDescriptor;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets;
+
+/**
+ * A {@link Coder} using Google Protocol Buffers binary format. {@link DynamicProtoCoder} supports
+ * both Protocol Buffers syntax versions 2 and 3.
+ *
+ * <p>To learn more about Protocol Buffers, visit: <a
+ * href="https://developers.google.com/protocol-buffers">https://developers.google.com/protocol-buffers</a>
+ *
+ * <p>{@link DynamicProtoCoder} is not registered in the global {@link CoderRegistry} as the
+ * descriptor is required to create the coder.
+ */
+public class DynamicProtoCoder extends ProtoCoder<DynamicMessage> {
+
+ public static final long serialVersionUID = 1L;
+
+ /**
+ * Returns a {@link DynamicProtoCoder} for the Protocol Buffers {@link DynamicMessage} for the
+ * given {@link Descriptors.Descriptor}.
+ */
+ public static DynamicProtoCoder of(Descriptors.Descriptor protoMessageDescriptor) {
+ return new DynamicProtoCoder(
+ ProtoDomain.buildFrom(protoMessageDescriptor),
+ protoMessageDescriptor.getFullName(),
+ ImmutableSet.of());
+ }
+
+ /**
+ * Returns a {@link DynamicProtoCoder} for the Protocol Buffers {@link DynamicMessage} for the
+ * given {@link Descriptors.Descriptor}. The message descriptor should be part of the provided
+ * {@link ProtoDomain}; this ensures object equality within messages from the same domain.
+ */
+ public static DynamicProtoCoder of(
+ ProtoDomain domain, Descriptors.Descriptor protoMessageDescriptor) {
+ return new DynamicProtoCoder(domain, protoMessageDescriptor.getFullName(), ImmutableSet.of());
+ }
+
+ /**
+ * Returns a {@link DynamicProtoCoder} for the Protocol Buffers {@link DynamicMessage} for the
+ * given message name in a {@link ProtoDomain}. The message descriptor should be part of the
+ * provided {@link ProtoDomain}; this ensures object equality within messages from the same
+ * domain.
+ */
+ public static DynamicProtoCoder of(ProtoDomain domain, String messageName) {
+ return new DynamicProtoCoder(domain, messageName, ImmutableSet.of());
+ }
+
+ /**
+ * Returns a {@link DynamicProtoCoder} like this one, but with the extensions from the given
+ * classes registered.
+ *
+ * <p>Each of the extension host classes must be a class automatically generated by the Protocol
+ * Buffers compiler, {@code protoc}, that contains messages.
+ *
+ * <p>Does not modify this object.
+ */
+ @Override
+ public DynamicProtoCoder withExtensionsFrom(Iterable<Class<?>> moreExtensionHosts) {
+ validateExtensions(moreExtensionHosts);
+ return new DynamicProtoCoder(
+ this.domain,
+ this.messageName,
+ new ImmutableSet.Builder<Class<?>>()
+ .addAll(extensionHostClasses)
+ .addAll(moreExtensionHosts)
+ .build());
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (this == other) {
+ return true;
+ }
+ if (other == null || getClass() != other.getClass()) {
+ return false;
+ }
+ DynamicProtoCoder otherCoder = (DynamicProtoCoder) other;
+ return protoMessageClass.equals(otherCoder.protoMessageClass)
+ && Sets.newHashSet(extensionHostClasses)
+ .equals(Sets.newHashSet(otherCoder.extensionHostClasses))
+ && domain.equals(otherCoder.domain)
+ && messageName.equals(otherCoder.messageName);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(protoMessageClass, extensionHostClasses, domain, messageName);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////
+ // Private implementation details below.
+
+ // Constants used to serialize and deserialize
+ private static final String PROTO_MESSAGE_CLASS = "dynamic_proto_message_class";
+ private static final String PROTO_EXTENSION_HOSTS = "dynamic_proto_extension_hosts";
+
+ // Descriptor used by DynamicMessage.
+ private transient ProtoDomain domain;
+ private transient String messageName;
+
+ private DynamicProtoCoder(
+ ProtoDomain domain, String messageName, Set<Class<?>> extensionHostClasses) {
+ super(DynamicMessage.class, extensionHostClasses);
+ this.domain = domain;
+ this.messageName = messageName;
+ }
+
+ private void writeObject(ObjectOutputStream oos) throws IOException {
+ oos.defaultWriteObject();
+ oos.writeObject(domain);
+ oos.writeObject(messageName);
+ }
+
+ private void readObject(ObjectInputStream ois) throws ClassNotFoundException, IOException {
+ ois.defaultReadObject();
+ this.domain = (ProtoDomain) ois.readObject();
+ this.messageName = (String) ois.readObject();
+ }
+
+ /** Get the memoized {@link Parser}, possibly initializing it lazily. */
+ @Override
+ protected Parser<DynamicMessage> getParser() {
+ if (memoizedParser == null) {
+ DynamicMessage protoMessageInstance =
+ DynamicMessage.newBuilder(domain.getDescriptor(messageName)).build();
+ memoizedParser = protoMessageInstance.getParserForType();
+ }
+ return memoizedParser;
+ }
+
+ /**
+ * Returns a {@link CoderProvider} which uses the {@link DynamicProtoCoder} for {@link Message
+ * proto messages}.
+ *
+ * <p>This method is invoked reflectively from {@link DefaultCoder}.
+ */
+ public static CoderProvider getCoderProvider() {
+ return new ProtoCoderProvider();
+ }
+
+ static final TypeDescriptor<Message> MESSAGE_TYPE = new TypeDescriptor<Message>() {};
+
+ /** A {@link CoderProvider} for {@link Message proto messages}. */
+ private static class ProtoCoderProvider extends CoderProvider {
+
+ @Override
+ public <T> Coder<T> coderFor(
+ TypeDescriptor<T> typeDescriptor, List<? extends Coder<?>> componentCoders)
+ throws CannotProvideCoderException {
+ if (!typeDescriptor.isSubtypeOf(MESSAGE_TYPE)) {
+ throw new CannotProvideCoderException(
+ String.format(
+ "Cannot provide %s because %s is not a subclass of %s",
+ DynamicProtoCoder.class.getSimpleName(), typeDescriptor, Message.class.getName()));
+ }
+
+ @SuppressWarnings("unchecked")
+ TypeDescriptor<? extends Message> messageType =
+ (TypeDescriptor<? extends Message>) typeDescriptor;
+ try {
+ @SuppressWarnings("unchecked")
+ Coder<T> coder = (Coder<T>) DynamicProtoCoder.of(messageType);
+ return coder;
+ } catch (IllegalArgumentException e) {
+ throw new CannotProvideCoderException(e);
+ }
+ }
+ }
+}
diff --git a/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoCoder.java b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoCoder.java
index e2a919a..0b2d717 100644
--- a/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoCoder.java
+++ b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoCoder.java
@@ -19,6 +19,7 @@
import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument;
+import com.google.protobuf.DynamicMessage;
import com.google.protobuf.ExtensionRegistry;
import com.google.protobuf.Message;
import com.google.protobuf.Parser;
@@ -32,8 +33,6 @@
import java.util.List;
import java.util.Objects;
import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
import org.apache.beam.sdk.coders.CannotProvideCoderException;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
@@ -107,6 +106,8 @@
*/
public class ProtoCoder<T extends Message> extends CustomCoder<T> {
+ public static final long serialVersionUID = -5043999806040629525L;
+
/** Returns a {@link ProtoCoder} for the given Protocol Buffers {@link Message}. */
public static <T extends Message> ProtoCoder<T> of(Class<T> protoMessageClass) {
return new ProtoCoder<>(protoMessageClass, ImmutableSet.of());
@@ -123,15 +124,11 @@
}
/**
- * Returns a {@link ProtoCoder} like this one, but with the extensions from the given classes
- * registered.
* Validates that all extension hosts are able to be registered.
*
- * <p>Each of the extension host classes must be an class automatically generated by the Protocol
- * Buffers compiler, {@code protoc}, that contains messages.
- *
- * <p>Does not modify this object.
* @param moreExtensionHosts the extension host classes to validate
*/
- public ProtoCoder<T> withExtensionsFrom(Iterable<Class<?>> moreExtensionHosts) {
+ void validateExtensions(Iterable<Class<?>> moreExtensionHosts) {
for (Class<?> extensionHost : moreExtensionHosts) {
// Attempt to access the required method, to make sure it's present.
try {
@@ -146,7 +143,19 @@
e);
}
}
+ }
+ /**
+ * Returns a {@link ProtoCoder} like this one, but with the extensions from the given classes
+ * registered.
+ *
+ * <p>Each of the extension host classes must be a class automatically generated by the Protocol
+ * Buffers compiler, {@code protoc}, that contains messages.
+ *
+ * <p>Does not modify this object.
+ */
+ public ProtoCoder<T> withExtensionsFrom(Iterable<Class<?>> moreExtensionHosts) {
+ validateExtensions(moreExtensionHosts);
return new ProtoCoder<>(
protoMessageClass,
new ImmutableSet.Builder<Class<?>>()
@@ -200,7 +209,7 @@
if (this == other) {
return true;
}
- if (!(other instanceof ProtoCoder)) {
+ if (other == null || getClass() != other.getClass()) {
return false;
}
ProtoCoder<?> otherCoder = (ProtoCoder<?>) other;
@@ -253,13 +262,13 @@
// Private implementation details below.
/** The {@link Message} type to be coded. */
- private final Class<T> protoMessageClass;
+ final Class<T> protoMessageClass;
/**
* All extension host classes included in this {@link ProtoCoder}. The extensions from these
* classes will be included in the {@link ExtensionRegistry} used during encoding and decoding.
*/
- private final Set<Class<?>> extensionHostClasses;
+ final Set<Class<?>> extensionHostClasses;
// Constants used to serialize and deserialize
private static final String PROTO_MESSAGE_CLASS = "proto_message_class";
@@ -267,23 +276,29 @@
// Transient fields that are lazy initialized and then memoized.
private transient ExtensionRegistry memoizedExtensionRegistry;
- private transient Parser<T> memoizedParser;
+ transient Parser<T> memoizedParser;
/** Private constructor. */
- private ProtoCoder(Class<T> protoMessageClass, Set<Class<?>> extensionHostClasses) {
+ protected ProtoCoder(Class<T> protoMessageClass, Set<Class<?>> extensionHostClasses) {
this.protoMessageClass = protoMessageClass;
this.extensionHostClasses = extensionHostClasses;
}
/** Get the memoized {@link Parser}, possibly initializing it lazily. */
- private Parser<T> getParser() {
+ protected Parser<T> getParser() {
if (memoizedParser == null) {
try {
- @SuppressWarnings("unchecked")
- T protoMessageInstance = (T) protoMessageClass.getMethod("getDefaultInstance").invoke(null);
- @SuppressWarnings("unchecked")
- Parser<T> tParser = (Parser<T>) protoMessageInstance.getParserForType();
- memoizedParser = tParser;
+ if (DynamicMessage.class.equals(protoMessageClass)) {
+ throw new IllegalArgumentException(
+ "DynamicMessage is not supported by the ProtoCoder, use the DynamicProtoCoder.");
+ } else {
+ @SuppressWarnings("unchecked")
+ T protoMessageInstance =
+ (T) protoMessageClass.getMethod("getDefaultInstance").invoke(null);
+ @SuppressWarnings("unchecked")
+ Parser<T> tParser = (Parser<T>) protoMessageInstance.getParserForType();
+ memoizedParser = tParser;
+ }
} catch (IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
throw new IllegalArgumentException(e);
}
@@ -329,12 +344,4 @@
}
}
}
-
- private SortedSet<String> getSortedExtensionClasses() {
- SortedSet<String> ret = new TreeSet<>();
- for (Class<?> clazz : extensionHostClasses) {
- ret.add(clazz.getName());
- }
- return ret;
- }
}
diff --git a/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoDomain.java b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoDomain.java
new file mode 100644
index 0000000..e9a5d48
--- /dev/null
+++ b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoDomain.java
@@ -0,0 +1,248 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.protobuf;
+
+import com.google.protobuf.DescriptorProtos;
+import com.google.protobuf.Descriptors;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import javax.annotation.Nullable;
+
+/**
+ * ProtoDomain is a container class for Protobuf descriptors. By using a domain for all descriptors
+ * that are related to each other the FileDescriptorSet needs to be serialized only once in the
+ * graph.
+ *
+ * <p>Using a domain also guarantees that all Descriptors have object equality, just like the
+ * Descriptors of statically compiled Proto classes. A lot of Java code isn't used to the new
+ * DynamicMessages and assumes Object equality. Because of this the domain class is immutable.
+ *
+ * <p>ProtoDomains aren't assumed to be used with normal Message descriptors, only with
+ * DynamicMessage descriptors.
+ */
+public final class ProtoDomain implements Serializable {
+ public static final long serialVersionUID = 1L;
+ private transient DescriptorProtos.FileDescriptorSet fileDescriptorSet;
+ private transient int hashCode;
+
+ private transient Map<String, Descriptors.FileDescriptor> fileDescriptorMap;
+ private transient Map<String, Descriptors.Descriptor> descriptorMap;
+
+ private transient Map<Integer, Descriptors.FieldDescriptor> fileOptionMap;
+ private transient Map<Integer, Descriptors.FieldDescriptor> messageOptionMap;
+ private transient Map<Integer, Descriptors.FieldDescriptor> fieldOptionMap;
+
+ ProtoDomain() {
+ this(DescriptorProtos.FileDescriptorSet.newBuilder().build());
+ }
+
+ private ProtoDomain(DescriptorProtos.FileDescriptorSet fileDescriptorSet) {
+ this.fileDescriptorSet = fileDescriptorSet;
+ hashCode = java.util.Arrays.hashCode(this.fileDescriptorSet.toByteArray());
+ crosswire();
+ }
+
+ private static Map<String, DescriptorProtos.FileDescriptorProto> extractProtoMap(
+ DescriptorProtos.FileDescriptorSet fileDescriptorSet) {
+ HashMap<String, DescriptorProtos.FileDescriptorProto> map = new HashMap<>();
+ fileDescriptorSet.getFileList().forEach(fdp -> map.put(fdp.getName(), fdp));
+ return map;
+ }
+
+ @Nullable
+ private static Descriptors.FileDescriptor convertToFileDescriptorMap(
+ String name,
+ Map<String, DescriptorProtos.FileDescriptorProto> inMap,
+ Map<String, Descriptors.FileDescriptor> outMap) {
+ if (outMap.containsKey(name)) {
+ return outMap.get(name);
+ }
+ DescriptorProtos.FileDescriptorProto fileDescriptorProto = inMap.get(name);
+ if (fileDescriptorProto == null) {
+ if ("google/protobuf/descriptor.proto".equals(name)) {
+ outMap.put(
+ "google/protobuf/descriptor.proto",
+ DescriptorProtos.FieldOptions.getDescriptor().getFile());
+ return DescriptorProtos.FieldOptions.getDescriptor().getFile();
+ }
+ return null;
+ } else {
+ List<Descriptors.FileDescriptor> dependencies = new ArrayList<>();
+ if (fileDescriptorProto.getDependencyCount() > 0) {
+ fileDescriptorProto
+ .getDependencyList()
+ .forEach(
+ dependencyName -> {
+ Descriptors.FileDescriptor fileDescriptor =
+ convertToFileDescriptorMap(dependencyName, inMap, outMap);
+ if (fileDescriptor != null) {
+ dependencies.add(fileDescriptor);
+ }
+ });
+ }
+ try {
+ Descriptors.FileDescriptor fileDescriptor =
+ Descriptors.FileDescriptor.buildFrom(
+ fileDescriptorProto, dependencies.toArray(new Descriptors.FileDescriptor[0]));
+ outMap.put(name, fileDescriptor);
+ return fileDescriptor;
+ } catch (Descriptors.DescriptorValidationException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ private static void visitFileDescriptorTree(Map map, Descriptors.FileDescriptor fileDescriptor) {
+ if (!map.containsKey(fileDescriptor.getName())) {
+ map.put(fileDescriptor.getName(), fileDescriptor);
+ List<Descriptors.FileDescriptor> dependencies = fileDescriptor.getDependencies();
+ dependencies.forEach(fd -> visitFileDescriptorTree(map, fd));
+ }
+ }
+
+ public static ProtoDomain buildFrom(Descriptors.Descriptor descriptor) {
+ return buildFrom(descriptor.getFile());
+ }
+
+ public static ProtoDomain buildFrom(DescriptorProtos.FileDescriptorSet fileDescriptorSet) {
+ return new ProtoDomain(fileDescriptorSet);
+ }
+
+ public static ProtoDomain buildFrom(Descriptors.FileDescriptor fileDescriptor) {
+ HashMap<String, Descriptors.FileDescriptor> fileDescriptorMap = new HashMap<>();
+ visitFileDescriptorTree(fileDescriptorMap, fileDescriptor);
+ DescriptorProtos.FileDescriptorSet.Builder builder =
+ DescriptorProtos.FileDescriptorSet.newBuilder();
+ fileDescriptorMap.values().forEach(fd -> builder.addFile(fd.toProto()));
+ return new ProtoDomain(builder.build());
+ }
+
+ public static ProtoDomain buildFrom(InputStream inputStream) throws IOException {
+ return buildFrom(DescriptorProtos.FileDescriptorSet.parseFrom(inputStream));
+ }
+
+ private void crosswire() {
+ HashMap<String, DescriptorProtos.FileDescriptorProto> map = new HashMap<>();
+ fileDescriptorSet.getFileList().forEach(fdp -> map.put(fdp.getName(), fdp));
+
+ Map<String, Descriptors.FileDescriptor> outMap = new HashMap<>();
+ map.forEach((fileName, proto) -> convertToFileDescriptorMap(fileName, map, outMap));
+ fileDescriptorMap = outMap;
+
+ indexOptionsByNumber(fileDescriptorMap.values());
+ indexDescriptorByName();
+ }
+
+ private void indexDescriptorByName() {
+ descriptorMap = new HashMap<>();
+ fileDescriptorMap
+ .values()
+ .forEach(
+ fileDescriptor -> {
+ fileDescriptor
+ .getMessageTypes()
+ .forEach(
+ descriptor -> {
+ descriptorMap.put(descriptor.getFullName(), descriptor);
+ });
+ });
+ }
+
+ private void indexOptionsByNumber(Collection<Descriptors.FileDescriptor> fileDescriptors) {
+ fieldOptionMap = new HashMap<>();
+ fileOptionMap = new HashMap<>();
+ messageOptionMap = new HashMap<>();
+ fileDescriptors.forEach(
+ (fileDescriptor) -> {
+ fileDescriptor
+ .getExtensions()
+ .forEach(
+ extension -> {
+ switch (extension.toProto().getExtendee()) {
+ case ".google.protobuf.FileOptions":
+ fileOptionMap.put(extension.getNumber(), extension);
+ break;
+ case ".google.protobuf.MessageOptions":
+ messageOptionMap.put(extension.getNumber(), extension);
+ break;
+ case ".google.protobuf.FieldOptions":
+ fieldOptionMap.put(extension.getNumber(), extension);
+ break;
+ default:
+ break;
+ }
+ });
+ });
+ }
+
+ private void writeObject(ObjectOutputStream oos) throws IOException {
+ byte[] buffer = fileDescriptorSet.toByteArray();
+ oos.writeInt(buffer.length);
+ oos.write(buffer);
+ }
+
+ private void readObject(ObjectInputStream ois) throws IOException {
+ byte[] buffer = new byte[ois.readInt()];
+ ois.readFully(buffer);
+ fileDescriptorSet = DescriptorProtos.FileDescriptorSet.parseFrom(buffer);
+ hashCode = java.util.Arrays.hashCode(buffer);
+ crosswire();
+ }
+
+ public Descriptors.FileDescriptor getFileDescriptor(String name) {
+ return fileDescriptorMap.get(name);
+ }
+
+ public Descriptors.Descriptor getDescriptor(String fullName) {
+ return descriptorMap.get(fullName);
+ }
+
+ public Descriptors.FieldDescriptor getFieldOptionById(int id) {
+ return fieldOptionMap.get(id);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ ProtoDomain that = (ProtoDomain) o;
+ return hashCode == that.hashCode;
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(hashCode);
+ }
+
+ public boolean contains(Descriptors.Descriptor descriptor) {
+ return getDescriptor(descriptor.getFullName()) != null;
+ }
+}
diff --git a/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/DynamicProtoCoderTest.java b/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/DynamicProtoCoderTest.java
new file mode 100644
index 0000000..1039583
--- /dev/null
+++ b/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/DynamicProtoCoderTest.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.protobuf;
+
+import static org.apache.beam.sdk.testing.CoderProperties.ALL_CONTEXTS;
+import static org.junit.Assert.assertEquals;
+
+import com.google.protobuf.DynamicMessage;
+import java.io.ObjectStreamClass;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.extensions.protobuf.Proto2CoderTestMessages.MessageA;
+import org.apache.beam.sdk.extensions.protobuf.Proto2CoderTestMessages.MessageB;
+import org.apache.beam.sdk.testing.CoderProperties;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Tests for {@link DynamicProtoCoder}. */
+@RunWith(JUnit4.class)
+public class DynamicProtoCoderTest {
+
+ @Rule public ExpectedException thrown = ExpectedException.none();
+
+ @Test
+ public void testDynamicMessage() throws Exception {
+ DynamicMessage message =
+ DynamicMessage.newBuilder(MessageA.getDescriptor())
+ .setField(
+ MessageA.getDescriptor().findFieldByNumber(MessageA.FIELD1_FIELD_NUMBER), "foo")
+ .build();
+ Coder<DynamicMessage> coder = DynamicProtoCoder.of(message.getDescriptorForType());
+
+ // Special code to check the DynamicMessage equality (@see IsDynamicMessageEqual)
+ for (Coder.Context context : ALL_CONTEXTS) {
+ CoderProperties.coderDecodeEncodeInContext(
+ coder, context, message, IsDynamicMessageEqual.equalTo(message));
+ }
+ }
+
+ @Test
+ public void testDynamicNestedRepeatedMessage() throws Exception {
+ DynamicMessage message =
+ DynamicMessage.newBuilder(MessageA.getDescriptor())
+ .setField(
+ MessageA.getDescriptor().findFieldByNumber(MessageA.FIELD1_FIELD_NUMBER), "foo")
+ .addRepeatedField(
+ MessageA.getDescriptor().findFieldByNumber(MessageA.FIELD2_FIELD_NUMBER),
+ DynamicMessage.newBuilder(MessageB.getDescriptor())
+ .setField(
+ MessageB.getDescriptor().findFieldByNumber(MessageB.FIELD1_FIELD_NUMBER),
+ true)
+ .build())
+ .addRepeatedField(
+ MessageA.getDescriptor().findFieldByNumber(MessageA.FIELD2_FIELD_NUMBER),
+ DynamicMessage.newBuilder(MessageB.getDescriptor())
+ .setField(
+ MessageB.getDescriptor().findFieldByNumber(MessageB.FIELD1_FIELD_NUMBER),
+ false)
+ .build())
+ .build();
+ Coder<DynamicMessage> coder = DynamicProtoCoder.of(message.getDescriptorForType());
+
+ // Special code to check the DynamicMessage equality (@see IsDynamicMessageEqual)
+ for (Coder.Context context : ALL_CONTEXTS) {
+ CoderProperties.coderDecodeEncodeInContext(
+ coder, context, message, IsDynamicMessageEqual.equalTo(message));
+ }
+ }
+
+ @Test
+ public void testSerialVersionID() {
+ long serialVersionID = ObjectStreamClass.lookup(DynamicProtoCoder.class).getSerialVersionUID();
+ assertEquals(1L, serialVersionID);
+ }
+}
diff --git a/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/IsDynamicMessageEqual.java b/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/IsDynamicMessageEqual.java
new file mode 100644
index 0000000..7b07963
--- /dev/null
+++ b/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/IsDynamicMessageEqual.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.protobuf;
+
+import com.google.protobuf.DynamicMessage;
+import com.google.protobuf.Message;
+import org.hamcrest.BaseMatcher;
+import org.hamcrest.Description;
+
+/**
+ * Matches a DynamicMessage against another message. This special matcher exists because
+ * DynamicMessage in protobuf does an object equality in its equals operator.
+ *
+ * <p>Follow https://github.com/protocolbuffers/protobuf/issues/6100 for tracking the issue. If this
+ * is resolved we can remove this code.
+ */
+public class IsDynamicMessageEqual extends BaseMatcher<DynamicMessage> {
+ private final DynamicMessage expectedValue;
+
+ public IsDynamicMessageEqual(DynamicMessage equalArg) {
+ expectedValue = equalArg;
+ }
+
+ public static IsDynamicMessageEqual equalTo(DynamicMessage operand) {
+ return new IsDynamicMessageEqual(operand);
+ }
+
+ @Override
+ public boolean matches(Object actualValue) {
+
+ if (actualValue == null) {
+ return expectedValue == null;
+ }
+
+ if (!(actualValue instanceof Message)) {
+ return false;
+ }
+ final Message actualMessage = (Message) actualValue;
+
+ if (!actualMessage.toByteString().equals(expectedValue.toByteString())) {
+ return false;
+ }
+
+ return actualMessage
+ .getDescriptorForType()
+ .getFullName()
+ .equals(expectedValue.getDescriptorForType().getFullName());
+ }
+
+ @Override
+ public void describeTo(Description description) {
+ description.appendValue(expectedValue);
+ }
+}
diff --git a/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/ProtoCoderTest.java b/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/ProtoCoderTest.java
index 04ed9a6..38aa92b 100644
--- a/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/ProtoCoderTest.java
+++ b/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/ProtoCoderTest.java
@@ -20,6 +20,7 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
+import java.io.ObjectStreamClass;
import java.util.Collections;
import org.apache.beam.sdk.coders.CannotProvideCoderException;
import org.apache.beam.sdk.coders.Coder;
@@ -167,4 +168,10 @@
Coder<MessageWithMap> coder = ProtoCoder.of(MessageWithMap.class);
assertNotEquals(CoderUtils.encodeToBase64(coder, msg2), CoderUtils.encodeToBase64(coder, msg1));
}
+
+ @Test
+ public void testSerialVersionID() {
+ long serialVersionID = ObjectStreamClass.lookup(ProtoCoder.class).getSerialVersionUID();
+ assertEquals(-5043999806040629525L, serialVersionID);
+ }
}
diff --git a/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/ProtoDomainTest.java b/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/ProtoDomainTest.java
new file mode 100644
index 0000000..5ff909b
--- /dev/null
+++ b/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/ProtoDomainTest.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.protobuf;
+
+import com.google.protobuf.Int32Value;
+import com.google.protobuf.Int64Value;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Tests for {@link ProtoDomain}. */
+@RunWith(JUnit4.class)
+public class ProtoDomainTest {
+
+ @Test
+ public void testNamespaceEqual() {
+ ProtoDomain domainFromInt32 = ProtoDomain.buildFrom(Int32Value.getDescriptor());
+ ProtoDomain domainFromInt64 = ProtoDomain.buildFrom(Int64Value.getDescriptor());
+ Assert.assertTrue(domainFromInt64.equals(domainFromInt32));
+ }
+
+ @Test
+ public void testContainsDescriptor() {
+ ProtoDomain domainFromInt32 = ProtoDomain.buildFrom(Int32Value.getDescriptor());
+ Assert.assertTrue(domainFromInt32.contains(Int32Value.getDescriptor()));
+ }
+
+ @Test
+ public void testContainsOtherDescriptorSameFile() {
+ ProtoDomain domain = ProtoDomain.buildFrom(Int32Value.getDescriptor());
+ Assert.assertTrue(domain.contains(Int64Value.getDescriptor()));
+ }
+
+ @Test
+ public void testBuildForFile() {
+ ProtoDomain domain = ProtoDomain.buildFrom(Int32Value.getDescriptor().getFile());
+ Assert.assertNotNull(domain.getFileDescriptor("google/protobuf/wrappers.proto"));
+ }
+}
diff --git a/sdks/java/extensions/sql/datacatalog/build.gradle b/sdks/java/extensions/sql/datacatalog/build.gradle
index 20a91ce..714056b 100644
--- a/sdks/java/extensions/sql/datacatalog/build.gradle
+++ b/sdks/java/extensions/sql/datacatalog/build.gradle
@@ -36,6 +36,8 @@
// Dependencies for the example
provided project(":sdks:java:io:google-cloud-platform")
provided library.java.slf4j_api
+
+ testCompile project(":sdks:java:extensions:sql:zetasql")
testRuntimeOnly library.java.slf4j_simple
}
diff --git a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/BigQueryUtils.java b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/BigQueryTableFactory.java
similarity index 65%
rename from sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/BigQueryUtils.java
rename to sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/BigQueryTableFactory.java
index c199ed0..2e87aac 100644
--- a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/BigQueryUtils.java
+++ b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/BigQueryTableFactory.java
@@ -20,23 +20,39 @@
import com.alibaba.fastjson.JSONObject;
import com.google.cloud.datacatalog.Entry;
import java.net.URI;
+import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.beam.sdk.extensions.sql.meta.Table;
+import org.apache.beam.sdk.extensions.sql.meta.Table.Builder;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap;
-/** Utils to extract BQ-specific entry information. */
-class BigQueryUtils {
+/** {@link TableFactory} that understands Data Catalog BigQuery entries. */
+class BigQueryTableFactory implements TableFactory {
+ private static final String BIGQUERY_API = "bigquery.googleapis.com";
private static final Pattern BQ_PATH_PATTERN =
Pattern.compile(
"/projects/(?<PROJECT>[^/]+)/datasets/(?<DATASET>[^/]+)/tables/(?<TABLE>[^/]+)");
- static Table.Builder tableBuilder(Entry entry) {
- return Table.builder()
- .location(getLocation(entry))
- .properties(new JSONObject())
- .type("bigquery")
- .comment("");
+ private final boolean truncateTimestamps;
+
+ public BigQueryTableFactory(boolean truncateTimestamps) {
+ this.truncateTimestamps = truncateTimestamps;
+ }
+
+ @Override
+ public Optional<Builder> tableBuilder(Entry entry) {
+ if (!URI.create(entry.getLinkedResource()).getAuthority().toLowerCase().equals(BIGQUERY_API)) {
+ return Optional.empty();
+ }
+
+ return Optional.of(
+ Table.builder()
+ .location(getLocation(entry))
+ .properties(new JSONObject(ImmutableMap.of("truncateTimestamps", truncateTimestamps)))
+ .type("bigquery")
+ .comment(""));
}
private static String getLocation(Entry entry) {
diff --git a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/ChainedTableFactory.java b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/ChainedTableFactory.java
new file mode 100644
index 0000000..0aaf994
--- /dev/null
+++ b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/ChainedTableFactory.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.sql.meta.provider.datacatalog;
+
+import com.google.cloud.datacatalog.Entry;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;
+import org.apache.beam.sdk.extensions.sql.meta.Table;
+
+/** {@link TableFactory} that uses the first applicable sub-{@link TableFactory}. */
+class ChainedTableFactory implements TableFactory {
+
+ private final List<TableFactory> subTableFactories;
+
+ public static ChainedTableFactory of(TableFactory... subTableFactories) {
+ return new ChainedTableFactory(Arrays.asList(subTableFactories));
+ }
+
+ private ChainedTableFactory(List<TableFactory> subTableFactories) {
+ this.subTableFactories = subTableFactories;
+ }
+
+  /** Delegates to each sub-factory in order, returning the first builder that applies. */
+ @Override
+ public Optional<Table.Builder> tableBuilder(Entry entry) {
+ for (TableFactory tableFactory : subTableFactories) {
+ Optional<Table.Builder> tableBuilder = tableFactory.tableBuilder(entry);
+ if (tableBuilder.isPresent()) {
+ return tableBuilder;
+ }
+ }
+ return Optional.empty();
+ }
+}
diff --git a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogPipelineOptions.java b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogPipelineOptions.java
index 47fffcf..0b9d3b7 100644
--- a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogPipelineOptions.java
+++ b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogPipelineOptions.java
@@ -32,4 +32,12 @@
String getDataCatalogEndpoint();
void setDataCatalogEndpoint(String dataCatalogEndpoint);
+
+ /** Whether to truncate timestamps in tables described by Data Catalog. */
+ @Description("Truncate sub-millisecond precision timestamps in tables described by Data Catalog")
+ @Validation.Required
+ @Default.Boolean(false)
+ boolean getTruncateTimestamps();
+
+ void setTruncateTimestamps(boolean newValue);
}
diff --git a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogTableProvider.java b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogTableProvider.java
index 0e8d6d9..359b3c8 100644
--- a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogTableProvider.java
+++ b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogTableProvider.java
@@ -21,6 +21,7 @@
import com.google.cloud.datacatalog.DataCatalogGrpc;
import com.google.cloud.datacatalog.DataCatalogGrpc.DataCatalogBlockingStub;
+import com.google.cloud.datacatalog.Entry;
import com.google.cloud.datacatalog.LookupEntryRequest;
import io.grpc.ManagedChannelBuilder;
import io.grpc.Status;
@@ -28,6 +29,7 @@
import io.grpc.auth.MoreCallCredentials;
import java.util.HashMap;
import java.util.Map;
+import java.util.Optional;
import java.util.stream.Stream;
import javax.annotation.Nullable;
import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
@@ -39,40 +41,37 @@
import org.apache.beam.sdk.extensions.sql.meta.provider.bigquery.BigQueryTableProvider;
import org.apache.beam.sdk.extensions.sql.meta.provider.pubsub.PubsubJsonTableProvider;
import org.apache.beam.sdk.extensions.sql.meta.provider.text.TextTableProvider;
+import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableList;
-import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableMap;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects;
/** Uses DataCatalog to get the source type and schema for a table. */
public class DataCatalogTableProvider extends FullNameTableProvider {
- private Map<String, TableProvider> delegateProviders;
- private Map<String, Table> tableCache;
- private DataCatalogBlockingStub dataCatalog;
+ private static final TableFactory PUBSUB_TABLE_FACTORY = new PubsubTableFactory();
+ private static final TableFactory GCS_TABLE_FACTORY = new GcsTableFactory();
+
+ private static final Map<String, TableProvider> DELEGATE_PROVIDERS =
+ Stream.of(new PubsubJsonTableProvider(), new BigQueryTableProvider(), new TextTableProvider())
+ .collect(toMap(TableProvider::getTableType, p -> p));
+
+ private final DataCatalogBlockingStub dataCatalog;
+ private final Map<String, Table> tableCache;
+ private final TableFactory tableFactory;
private DataCatalogTableProvider(
- Map<String, TableProvider> delegateProviders, DataCatalogBlockingStub dataCatalog) {
+ DataCatalogBlockingStub dataCatalog, boolean truncateTimestamps) {
this.tableCache = new HashMap<>();
- this.delegateProviders = ImmutableMap.copyOf(delegateProviders);
this.dataCatalog = dataCatalog;
+ this.tableFactory =
+ ChainedTableFactory.of(
+ PUBSUB_TABLE_FACTORY, GCS_TABLE_FACTORY, new BigQueryTableFactory(truncateTimestamps));
}
public static DataCatalogTableProvider create(DataCatalogPipelineOptions options) {
- return new DataCatalogTableProvider(getSupportedProviders(), createDataCatalogClient(options));
- }
-
- private static DataCatalogBlockingStub createDataCatalogClient(
- DataCatalogPipelineOptions options) {
- return DataCatalogGrpc.newBlockingStub(
- ManagedChannelBuilder.forTarget(options.getDataCatalogEndpoint()).build())
- .withCallCredentials(
- MoreCallCredentials.from(options.as(GcpOptions.class).getGcpCredential()));
- }
-
- private static Map<String, TableProvider> getSupportedProviders() {
- return Stream.of(
- new PubsubJsonTableProvider(), new BigQueryTableProvider(), new TextTableProvider())
- .collect(toMap(TableProvider::getTableType, p -> p));
+ return new DataCatalogTableProvider(
+ createDataCatalogClient(options), options.getTruncateTimestamps());
}
@Override
@@ -98,9 +97,8 @@
}
@Override
- public @Nullable Table getTable(String tableNamePart) {
- throw new UnsupportedOperationException(
- "Loading a table by partial name '" + tableNamePart + "' is unsupported");
+ public @Nullable Table getTable(String tableName) {
+ return loadTable(tableName);
}
@Override
@@ -117,6 +115,11 @@
return loadTable(fullEscapedTableName);
}
+ @Override
+ public BeamSqlTable buildBeamSqlTable(Table table) {
+ return DELEGATE_PROVIDERS.get(table.getType()).buildBeamSqlTable(table);
+ }
+
private @Nullable Table loadTable(String tableName) {
if (!tableCache.containsKey(tableName)) {
tableCache.put(tableName, loadTableFromDC(tableName));
@@ -127,7 +130,7 @@
private Table loadTableFromDC(String tableName) {
try {
- return TableUtils.toBeamTable(
+ return toCalciteTable(
tableName,
dataCatalog.lookupEntry(
LookupEntryRequest.newBuilder().setSqlResource(tableName).build()));
@@ -139,8 +142,35 @@
}
}
- @Override
- public BeamSqlTable buildBeamSqlTable(Table table) {
- return delegateProviders.get(table.getType()).buildBeamSqlTable(table);
+ private static DataCatalogBlockingStub createDataCatalogClient(
+ DataCatalogPipelineOptions options) {
+ return DataCatalogGrpc.newBlockingStub(
+ ManagedChannelBuilder.forTarget(options.getDataCatalogEndpoint()).build())
+ .withCallCredentials(
+ MoreCallCredentials.from(options.as(GcpOptions.class).getGcpCredential()));
+ }
+
+ private Table toCalciteTable(String tableName, Entry entry) {
+ if (entry.getSchema().getColumnsCount() == 0) {
+ throw new UnsupportedOperationException(
+ "Entry doesn't have a schema. Please attach a schema to '"
+ + tableName
+ + "' in Data Catalog: "
+ + entry.toString());
+ }
+ Schema schema = SchemaUtils.fromDataCatalog(entry.getSchema());
+
+ Optional<Table.Builder> tableBuilder = tableFactory.tableBuilder(entry);
+ if (!tableBuilder.isPresent()) {
+ throw new UnsupportedOperationException(
+ String.format(
+ "Unsupported Data Catalog entry: %s",
+ MoreObjects.toStringHelper(entry)
+ .add("linkedResource", entry.getLinkedResource())
+ .add("hasGcsFilesetSpec", entry.hasGcsFilesetSpec())
+ .toString()));
+ }
+
+ return tableBuilder.get().schema(schema).name(tableName).build();
}
}
diff --git a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/GcsUtils.java b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/GcsTableFactory.java
similarity index 77%
rename from sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/GcsUtils.java
rename to sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/GcsTableFactory.java
index d354e9d..02a4a30 100644
--- a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/GcsUtils.java
+++ b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/GcsTableFactory.java
@@ -21,18 +21,20 @@
import com.google.cloud.datacatalog.Entry;
import com.google.cloud.datacatalog.GcsFilesetSpec;
import java.util.List;
+import java.util.Optional;
import org.apache.beam.sdk.extensions.sql.meta.Table;
+import org.apache.beam.sdk.extensions.sql.meta.Table.Builder;
-/** Utils to handle GCS entries from Cloud Data Catalog. */
-class GcsUtils {
-
- /** Check if the entry represents a GCS fileset in Data Catalog. */
- static boolean isGcs(Entry entry) {
- return entry.hasGcsFilesetSpec();
- }
+/** {@link TableFactory} that understands Data Catalog GCS entries. */
+class GcsTableFactory implements TableFactory {
/** Creates a Beam SQL table description from a GCS fileset entry. */
- static Table.Builder tableBuilder(Entry entry) {
+ @Override
+ public Optional<Builder> tableBuilder(Entry entry) {
+ if (!entry.hasGcsFilesetSpec()) {
+ return Optional.empty();
+ }
+
GcsFilesetSpec gcsFilesetSpec = entry.getGcsFilesetSpec();
List<String> filePatterns = gcsFilesetSpec.getFilePatternsList();
@@ -50,10 +52,11 @@
+ "Only file patterns with 'gs://' schema are supported at the moment.");
}
- return Table.builder()
- .type("text")
- .location(filePattern)
- .properties(new JSONObject())
- .comment("");
+ return Optional.of(
+ Table.builder()
+ .type("text")
+ .location(filePattern)
+ .properties(new JSONObject())
+ .comment(""));
}
}
diff --git a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/PubsubUtils.java b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/PubsubTableFactory.java
similarity index 70%
rename from sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/PubsubUtils.java
rename to sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/PubsubTableFactory.java
index 856eec9..5a8f6e5 100644
--- a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/PubsubUtils.java
+++ b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/PubsubTableFactory.java
@@ -20,22 +20,32 @@
import com.alibaba.fastjson.JSONObject;
import com.google.cloud.datacatalog.Entry;
import java.net.URI;
+import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.beam.sdk.extensions.sql.meta.Table;
+import org.apache.beam.sdk.extensions.sql.meta.Table.Builder;
-/** Utils to extract Pubsub-specific entry information. */
-class PubsubUtils {
+/** {@link TableFactory} that understands Data Catalog Pubsub entries. */
+class PubsubTableFactory implements TableFactory {
+
+ private static final String PUBSUB_API = "pubsub.googleapis.com";
private static final Pattern PS_PATH_PATTERN =
Pattern.compile("/projects/(?<PROJECT>[^/]+)/topics/(?<TOPIC>[^/]+)");
- static Table.Builder tableBuilder(Entry entry) {
- return Table.builder()
- .location(getLocation(entry))
- .properties(new JSONObject())
- .type("pubsub")
- .comment("");
+ @Override
+ public Optional<Builder> tableBuilder(Entry entry) {
+ if (!URI.create(entry.getLinkedResource()).getAuthority().toLowerCase().equals(PUBSUB_API)) {
+ return Optional.empty();
+ }
+
+ return Optional.of(
+ Table.builder()
+ .location(getLocation(entry))
+ .properties(new JSONObject())
+ .type("pubsub")
+ .comment(""));
}
private static String getLocation(Entry entry) {
diff --git a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/TableFactory.java b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/TableFactory.java
new file mode 100644
index 0000000..a2a230a
--- /dev/null
+++ b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/TableFactory.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.sql.meta.provider.datacatalog;
+
+import com.google.cloud.datacatalog.Entry;
+import java.util.Optional;
+import org.apache.beam.sdk.extensions.sql.meta.Table;
+
+/**
+ * A {@link TableFactory} <i>may</i> be able to interpret a given Data Catalog {@link Entry} into a
+ * Beam SQL {@link Table}.
+ */
+interface TableFactory {
+
+ /**
+ * If this {@link TableFactory} instance can interpret the given {@link Entry}, then a Beam SQL
+ * {@link Table} is constructed, else returns {@link Optional#empty}.
+ *
+ * <p>The {@link Table} is returned as a builder for further customization by the caller.
+ */
+ Optional<Table.Builder> tableBuilder(Entry entry);
+}
diff --git a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/TableUtils.java b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/TableUtils.java
deleted file mode 100644
index 6c0b62e..0000000
--- a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/TableUtils.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.sdk.extensions.sql.meta.provider.datacatalog;
-
-import com.google.cloud.datacatalog.Entry;
-import java.net.URI;
-import java.util.Map;
-import org.apache.beam.sdk.extensions.sql.meta.Table;
-import org.apache.beam.sdk.schemas.Schema;
-import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableMap;
-
-/** Common utilities to create Beam SQL tables from Data Catalog schemas. */
-class TableUtils {
-
- interface TableFactory {
- Table.Builder tableBuilder(Entry entry);
- }
-
- private static final Map<String, TableFactory> TABLE_FACTORIES =
- ImmutableMap.<String, TableFactory>builder()
- .put("bigquery.googleapis.com", BigQueryUtils::tableBuilder)
- .put("pubsub.googleapis.com", PubsubUtils::tableBuilder)
- .build();
-
- static Table toBeamTable(String tableName, Entry entry) {
- if (entry.getSchema().getColumnsCount() == 0) {
- throw new UnsupportedOperationException(
- "Entry doesn't have a schema. Please attach a schema to '"
- + tableName
- + "' in Data Catalog: "
- + entry.toString());
- }
- Schema schema = SchemaUtils.fromDataCatalog(entry.getSchema());
-
- String service = URI.create(entry.getLinkedResource()).getAuthority().toLowerCase();
-
- Table.Builder table = null;
- if (TABLE_FACTORIES.containsKey(service)) {
- table = TABLE_FACTORIES.get(service).tableBuilder(entry);
- }
-
- if (GcsUtils.isGcs(entry)) {
- table = GcsUtils.tableBuilder(entry);
- }
-
- if (table != null) {
- return table.schema(schema).name(tableName).build();
- }
-
- throw new UnsupportedOperationException(
- "Unsupported SQL source kind: " + entry.getLinkedResource());
- }
-}
diff --git a/sdks/java/extensions/sql/datacatalog/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogBigQueryIT.java b/sdks/java/extensions/sql/datacatalog/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogBigQueryIT.java
index 53599e9..3e7d16b 100644
--- a/sdks/java/extensions/sql/datacatalog/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogBigQueryIT.java
+++ b/sdks/java/extensions/sql/datacatalog/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/DataCatalogBigQueryIT.java
@@ -24,77 +24,127 @@
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
+import java.util.Arrays;
+import java.util.List;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
+import org.apache.beam.sdk.extensions.sql.impl.BeamSqlPipelineOptions;
+import org.apache.beam.sdk.extensions.sql.impl.CalciteQueryPlanner;
+import org.apache.beam.sdk.extensions.sql.impl.QueryPlanner;
+import org.apache.beam.sdk.extensions.sql.zetasql.ZetaSQLQueryPlanner;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
import org.apache.beam.sdk.io.gcp.bigquery.TestBigQuery;
+import org.apache.beam.sdk.io.gcp.bigquery.WriteResult;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PInput;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
import org.joda.time.Duration;
import org.junit.Rule;
import org.junit.Test;
+import org.junit.experimental.runners.Enclosed;
import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
+import org.junit.runners.Parameterized;
/** Integration tests for DataCatalog+BigQuery. */
-@RunWith(JUnit4.class)
+@RunWith(Enclosed.class)
public class DataCatalogBigQueryIT {
- private static final Schema ID_NAME_SCHEMA =
- Schema.builder().addNullableField("id", INT64).addNullableField("name", STRING).build();
+ @RunWith(Parameterized.class)
+ public static class DialectSensitiveTests {
+ private static final Schema ID_NAME_SCHEMA =
+ Schema.builder().addNullableField("id", INT64).addNullableField("name", STRING).build();
+ @Rule public transient TestPipeline writeToBQPipeline = TestPipeline.create();
+ @Rule public transient TestPipeline readPipeline = TestPipeline.create();
+ @Rule public transient TestBigQuery bigQuery = TestBigQuery.create(ID_NAME_SCHEMA);
- @Rule public transient TestPipeline writeToBQPipeline = TestPipeline.create();
- @Rule public transient TestPipeline readPipeline = TestPipeline.create();
- @Rule public transient TestBigQuery bigQuery = TestBigQuery.create(ID_NAME_SCHEMA);
+ /** Parameterized by which SQL dialect, since the syntax here is the same. */
+ @Parameterized.Parameters(name = "{0}")
+ public static Iterable<Object[]> dialects() {
+ return Arrays.asList(
+ new Object[][] {
+ {"ZetaSQL", ZetaSQLQueryPlanner.class},
+ {"CalciteSQL", CalciteQueryPlanner.class}
+ });
+ }
- @Test
- public void testReadWrite() throws Exception {
- createBQTableWith(
- new TableRow().set("id", 1).set("name", "name1"),
- new TableRow().set("id", 2).set("name", "name2"),
- new TableRow().set("id", 3).set("name", "name3"));
+ @Parameterized.Parameter(0)
+ public String dialectName;
- TableReference bqTable = bigQuery.tableReference();
- String tableId =
- String.format(
- "bigquery.`table`.`%s`.`%s`.`%s`",
- bqTable.getProjectId(), bqTable.getDatasetId(), bqTable.getTableId());
+ @Parameterized.Parameter(1)
+ public Class<? extends QueryPlanner> queryPlanner;
- PCollection<Row> result =
- readPipeline.apply(
- "query",
- SqlTransform.query("SELECT id, name FROM " + tableId)
- .withDefaultTableProvider(
- "datacatalog",
- DataCatalogTableProvider.create(
- readPipeline.getOptions().as(DataCatalogPipelineOptions.class))));
+ @Test
+ public void testReadWrite() throws Exception {
+ writeToBQPipeline.apply(
+ createBqTable(
+ new TableRow().set("id", 1).set("name", "name1"),
+ new TableRow().set("id", 2).set("name", "name2"),
+ new TableRow().set("id", 3).set("name", "name3")));
+ writeToBQPipeline.run().waitUntilFinish(Duration.standardMinutes(2));
- PAssert.that(result).containsInAnyOrder(row(1, "name1"), row(2, "name2"), row(3, "name3"));
- readPipeline.run().waitUntilFinish(Duration.standardMinutes(2));
- }
+ TableReference bqTable = bigQuery.tableReference();
+ String tableId =
+ String.format(
+ "bigquery.`table`.`%s`.`%s`.`%s`",
+ bqTable.getProjectId(), bqTable.getDatasetId(), bqTable.getTableId());
- private Row row(long id, String name) {
- return Row.withSchema(ID_NAME_SCHEMA).addValues(id, name).build();
- }
+ readPipeline
+ .getOptions()
+ .as(BeamSqlPipelineOptions.class)
+ .setPlannerName(queryPlanner.getCanonicalName());
- private void createBQTableWith(TableRow r1, TableRow r2, TableRow r3) {
- writeToBQPipeline
- .apply(Create.of(r1, r2, r3).withCoder(TableRowJsonCoder.of()))
- .apply(
- BigQueryIO.writeTableRows()
- .to(bigQuery.tableSpec())
- .withSchema(
- new TableSchema()
- .setFields(
- ImmutableList.of(
- new TableFieldSchema().setName("id").setType("INTEGER"),
- new TableFieldSchema().setName("name").setType("STRING"))))
- .withoutValidation());
- writeToBQPipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ PCollection<Row> result =
+ readPipeline.apply(
+ "query",
+ SqlTransform.query("SELECT id, name FROM " + tableId)
+ .withDefaultTableProvider(
+ "datacatalog",
+ DataCatalogTableProvider.create(
+ readPipeline.getOptions().as(DataCatalogPipelineOptions.class))));
+
+ PAssert.that(result).containsInAnyOrder(row(1, "name1"), row(2, "name2"), row(3, "name3"));
+ readPipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ private static Row row(long id, String name) {
+ return Row.withSchema(ID_NAME_SCHEMA).addValues(id, name).build();
+ }
+
+ public CreateBqTable createBqTable(TableRow... rows) {
+ return new CreateBqTable(Arrays.asList(rows));
+ }
+
+ private class CreateBqTable extends PTransform<PInput, WriteResult> {
+
+ private final List<TableRow> rows;
+
+ private CreateBqTable(List<TableRow> rows) {
+ this.rows = rows;
+ }
+
+ @Override
+ public WriteResult expand(PInput input) {
+ return input
+ .getPipeline()
+ .begin()
+ .apply(Create.<TableRow>of(rows).withCoder(TableRowJsonCoder.of()))
+ .apply(
+ BigQueryIO.writeTableRows()
+ .to(bigQuery.tableSpec())
+ .withSchema(
+ new TableSchema()
+ .setFields(
+ ImmutableList.of(
+ new TableFieldSchema().setName("id").setType("INTEGER"),
+ new TableFieldSchema().setName("name").setType("STRING"))))
+ .withoutValidation());
+ }
+ }
}
}
diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/BeamCalciteTable.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/BeamCalciteTable.java
index bb2f212..94db06d 100644
--- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/BeamCalciteTable.java
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/BeamCalciteTable.java
@@ -23,9 +23,11 @@
import org.apache.beam.sdk.extensions.sql.impl.rel.BeamEnumerableConverter;
import org.apache.beam.sdk.extensions.sql.impl.rel.BeamIOSinkRel;
import org.apache.beam.sdk.extensions.sql.impl.rel.BeamIOSourceRel;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamLogicalConvention;
import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils;
import org.apache.beam.sdk.extensions.sql.meta.BeamSqlTable;
import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableList;
import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableMap;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.adapter.java.AbstractQueryableTable;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.linq4j.QueryProvider;
@@ -97,7 +99,14 @@
@Override
public RelNode toRel(RelOptTable.ToRelContext context, RelOptTable relOptTable) {
return new BeamIOSourceRel(
- context.getCluster(), relOptTable, beamTable, pipelineOptionsMap, this);
+ context.getCluster(),
+ context.getCluster().traitSetOf(BeamLogicalConvention.INSTANCE),
+ relOptTable,
+ beamTable,
+ ImmutableList.of(),
+ beamTable.constructFilter(ImmutableList.of()),
+ pipelineOptionsMap,
+ this);
}
@Override
diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/CalciteQueryPlanner.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/CalciteQueryPlanner.java
index 93c26fa..c367197 100644
--- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/CalciteQueryPlanner.java
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/CalciteQueryPlanner.java
@@ -67,7 +67,7 @@
* The core component to handle through a SQL statement, from explain execution plan, to generate a
* Beam pipeline.
*/
-class CalciteQueryPlanner implements QueryPlanner {
+public class CalciteQueryPlanner implements QueryPlanner {
private static final Logger LOG = LoggerFactory.getLogger(CalciteQueryPlanner.class);
private final Planner planner;
diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/planner/BeamRuleSets.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/planner/BeamRuleSets.java
index 33d69dd..f30f9f3 100644
--- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/planner/BeamRuleSets.java
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/planner/BeamRuleSets.java
@@ -25,6 +25,7 @@
import org.apache.beam.sdk.extensions.sql.impl.rule.BeamCalcRule;
import org.apache.beam.sdk.extensions.sql.impl.rule.BeamCoGBKJoinRule;
import org.apache.beam.sdk.extensions.sql.impl.rule.BeamEnumerableConverterRule;
+import org.apache.beam.sdk.extensions.sql.impl.rule.BeamIOPushDownRule;
import org.apache.beam.sdk.extensions.sql.impl.rule.BeamIntersectRule;
import org.apache.beam.sdk.extensions.sql.impl.rule.BeamJoinAssociateRule;
import org.apache.beam.sdk.extensions.sql.impl.rule.BeamJoinPushThroughJoinRule;
@@ -78,6 +79,7 @@
ProjectCalcMergeRule.INSTANCE,
FilterToCalcRule.INSTANCE,
ProjectToCalcRule.INSTANCE,
+ BeamIOPushDownRule.INSTANCE,
// disabled due to https://issues.apache.org/jira/browse/BEAM-6810
// CalcRemoveRule.INSTANCE,
CalcMergeRule.INSTANCE,
diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamIOSourceRel.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamIOSourceRel.java
index 480ccab..b1d3f02 100644
--- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamIOSourceRel.java
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamIOSourceRel.java
@@ -19,13 +19,19 @@
import static org.apache.beam.vendor.calcite.v1_20_0.com.google.common.base.Preconditions.checkArgument;
+import java.util.List;
import java.util.Map;
import org.apache.beam.sdk.extensions.sql.impl.BeamCalciteTable;
import org.apache.beam.sdk.extensions.sql.impl.BeamTableStatistics;
import org.apache.beam.sdk.extensions.sql.impl.planner.BeamCostModel;
import org.apache.beam.sdk.extensions.sql.impl.planner.NodeStats;
+import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils;
import org.apache.beam.sdk.extensions.sql.meta.BeamSqlTable;
+import org.apache.beam.sdk.extensions.sql.meta.BeamSqlTableFilter;
+import org.apache.beam.sdk.extensions.sql.meta.DefaultTableFilter;
+import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;
import org.apache.beam.sdk.values.Row;
@@ -33,8 +39,12 @@
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.plan.RelOptCost;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.plan.RelOptPlanner;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.plan.RelOptTable;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.plan.RelTraitSet;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.prepare.RelOptTableImpl;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.RelWriter;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.core.TableScan;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.type.RelDataType;
/** BeamRelNode to replace a {@code TableScan} node. */
public class BeamIOSourceRel extends TableScan implements BeamRelNode {
@@ -42,19 +52,43 @@
private final BeamSqlTable beamTable;
private final BeamCalciteTable calciteTable;
private final Map<String, String> pipelineOptions;
+ private final List<String> usedFields;
+ private final BeamSqlTableFilter tableFilters;
public BeamIOSourceRel(
RelOptCluster cluster,
+ RelTraitSet traitSet,
RelOptTable table,
BeamSqlTable beamTable,
+ List<String> usedFields,
+ BeamSqlTableFilter tableFilters,
Map<String, String> pipelineOptions,
BeamCalciteTable calciteTable) {
- super(cluster, cluster.traitSetOf(BeamLogicalConvention.INSTANCE), table);
+ super(cluster, traitSet, table);
this.beamTable = beamTable;
+ this.usedFields = usedFields;
+ this.tableFilters = tableFilters;
this.calciteTable = calciteTable;
this.pipelineOptions = pipelineOptions;
}
+ public BeamIOSourceRel copy(
+ RelDataType newType, List<String> usedFields, BeamSqlTableFilter tableFilters) {
+ RelOptTable relOptTable =
+ newType == null ? table : ((RelOptTableImpl) getTable()).copy(newType);
+ tableFilters = tableFilters == null ? this.tableFilters : tableFilters;
+
+ return new BeamIOSourceRel(
+ getCluster(),
+ traitSet,
+ relOptTable,
+ beamTable,
+ usedFields,
+ tableFilters,
+ pipelineOptions,
+ calciteTable);
+ }
+
@Override
public double estimateRowCount(RelMetadataQuery mq) {
BeamTableStatistics rowCountStatistics = calciteTable.getStatistic();
@@ -85,6 +119,22 @@
return new Transform();
}
+ @Override
+ public RelWriter explainTerms(RelWriter pw) {
+ super.explainTerms(pw);
+
+ // This is done to tell Calcite planner that BeamIOSourceRel cannot be simply substituted by
+ // another BeamIOSourceRel, except for when they carry the same content.
+ if (!usedFields.isEmpty()) {
+ pw.item("usedFields", usedFields.toString());
+ }
+ if (!(tableFilters instanceof DefaultTableFilter)) {
+ pw.item(tableFilters.getClass().getSimpleName(), tableFilters.toString());
+ }
+
+ return pw;
+ }
+
private class Transform extends PTransform<PCollectionList<Row>, PCollection<Row>> {
@Override
@@ -94,7 +144,15 @@
"Should not have received input for %s: %s",
BeamIOSourceRel.class.getSimpleName(),
input);
- return beamTable.buildIOReader(input.getPipeline().begin());
+
+ final PBegin begin = input.getPipeline().begin();
+
+ if (usedFields.isEmpty() && tableFilters instanceof DefaultTableFilter) {
+ return beamTable.buildIOReader(begin);
+ }
+
+ final Schema newBeamSchema = CalciteUtils.toSchema(getRowType());
+ return beamTable.buildIOReader(begin, tableFilters, usedFields).setRowSchema(newBeamSchema);
}
}
@@ -109,10 +167,12 @@
@Override
public BeamCostModel beamComputeSelfCost(RelOptPlanner planner, RelMetadataQuery mq) {
NodeStats estimates = BeamSqlRelUtils.getNodeStats(this, mq);
- return BeamCostModel.FACTORY.makeCost(estimates.getRowCount(), estimates.getRate());
+ return BeamCostModel.FACTORY
+ .makeCost(estimates.getRowCount(), estimates.getRate())
+ .multiplyBy(getRowType().getFieldCount());
}
- protected BeamSqlTable getBeamSqlTable() {
+ public BeamSqlTable getBeamSqlTable() {
return beamTable;
}
diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rule/BeamIOPushDownRule.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rule/BeamIOPushDownRule.java
new file mode 100644
index 0000000..ca654c8
--- /dev/null
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rule/BeamIOPushDownRule.java
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.sql.impl.rule;
+
+import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument;
+
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.stream.Collectors;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamIOSourceRel;
+import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils;
+import org.apache.beam.sdk.extensions.sql.meta.BeamSqlTable;
+import org.apache.beam.sdk.extensions.sql.meta.BeamSqlTableFilter;
+import org.apache.beam.sdk.extensions.sql.meta.DefaultTableFilter;
+import org.apache.beam.sdk.schemas.FieldAccessDescriptor;
+import org.apache.beam.sdk.schemas.FieldAccessDescriptor.FieldDescriptor;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.schemas.utils.SelectHelpers;
+import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.plan.RelOptRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.plan.RelOptRuleCall;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.RelNode;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.core.Calc;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.core.RelFactories;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.type.RelDataType;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.type.RelDataTypeField;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.type.RelRecordType;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexCall;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexInputRef;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexLiteral;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexLocalRef;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexNode;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexProgram;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.tools.RelBuilder;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.tools.RelBuilderFactory;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.util.Pair;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting;
+
+public class BeamIOPushDownRule extends RelOptRule {
+ // ~ Static fields/initializers ---------------------------------------------
+
+ public static final BeamIOPushDownRule INSTANCE =
+ new BeamIOPushDownRule(RelFactories.LOGICAL_BUILDER);
+
+ // ~ Constructors -----------------------------------------------------------
+
+ public BeamIOPushDownRule(RelBuilderFactory relBuilderFactory) {
+ super(operand(Calc.class, operand(BeamIOSourceRel.class, any())), relBuilderFactory, null);
+ }
+
+ // ~ Methods ----------------------------------------------------------------
+
+ @Override
+ public void onMatch(RelOptRuleCall call) {
+ final BeamIOSourceRel ioSourceRel = call.rel(1);
+ final BeamSqlTable beamSqlTable = ioSourceRel.getBeamSqlTable();
+
+ // Nested rows are not supported at the moment
+ for (RelDataTypeField field : ioSourceRel.getRowType().getFieldList()) {
+ if (field.getType() instanceof RelRecordType) {
+ return;
+ }
+ }
+
+ final Calc calc = call.rel(0);
+ final RexProgram program = calc.getProgram();
+ final Pair<ImmutableList<RexNode>, ImmutableList<RexNode>> projectFilter = program.split();
+ final RelDataType calcInputRowType = program.getInputRowType();
+
+ // When predicate push-down is not supported - all filters are unsupported.
+ final BeamSqlTableFilter tableFilter = beamSqlTable.constructFilter(projectFilter.right);
+ if (!beamSqlTable.supportsProjects() && tableFilter instanceof DefaultTableFilter) {
+ // Either project or filter push-down must be supported by the IO.
+ return;
+ }
+
+ if (!(tableFilter instanceof DefaultTableFilter) && !beamSqlTable.supportsProjects()) {
+ // TODO(BEAM-8508): add support for standalone filter push-down.
+ // Filter push-down without project push-down is not supported for now.
+ return;
+ }
+
+ // Find all input refs used by projects
+ Set<String> usedFields = new LinkedHashSet<>();
+ for (RexNode project : projectFilter.left) {
+ findUtilizedInputRefs(calcInputRowType, project, usedFields);
+ }
+
+ // Find all input refs used by filters
+ for (RexNode filter : tableFilter.getNotSupported()) {
+ findUtilizedInputRefs(calcInputRowType, filter, usedFields);
+ }
+
+ FieldAccessDescriptor resolved =
+ FieldAccessDescriptor.withFieldNames(usedFields)
+ .withOrderByFieldInsertionOrder()
+ .resolve(beamSqlTable.getSchema());
+ Schema newSchema =
+ SelectHelpers.getOutputSchema(ioSourceRel.getBeamSqlTable().getSchema(), resolved);
+ RelDataType calcInputType =
+ CalciteUtils.toCalciteRowType(newSchema, ioSourceRel.getCluster().getTypeFactory());
+
+ // Check if the calc can be dropped:
+ // 1. Calc only does projects and renames.
+ // And
+ // 2. Predicate can be completely pushed-down to IO level.
+ if (isProjectRenameOnlyProgram(program) && tableFilter.getNotSupported().isEmpty()) {
+ // Tell the optimizer to not use old IO, since the new one is better.
+ call.getPlanner().setImportance(ioSourceRel, 0.0);
+ call.transformTo(ioSourceRel.copy(calc.getRowType(), newSchema.getFieldNames(), tableFilter));
+ return;
+ }
+
+ // Already most optimal case:
+ // Calc contains all unsupported filters.
+ // IO only projects fields utilised by a calc.
+ if (tableFilter.getNotSupported().equals(projectFilter.right)
+ && usedFields.size() == ioSourceRel.getRowType().getFieldCount()) {
+ return;
+ }
+
+ BeamIOSourceRel newIoSourceRel =
+ ioSourceRel.copy(calcInputType, newSchema.getFieldNames(), tableFilter);
+ RelBuilder relBuilder = call.builder();
+ relBuilder.push(newIoSourceRel);
+
+ List<RexNode> newProjects = new ArrayList<>();
+ List<RexNode> newFilter = new ArrayList<>();
+ // Ex: let's say the original fields are (number before each element is the index):
+ // {0:unused1, 1:id, 2:name, 3:unused2},
+ // where only 'id' and 'name' are being used. Then the new calcInputType should be as follows:
+ // {0:id, 1:name}.
+ // A mapping list will contain 2 entries: {0:1, 1:2},
+ // showing how used field names map to the original fields.
+ List<Integer> mapping =
+ resolved.getFieldsAccessed().stream()
+ .map(FieldDescriptor::getFieldId)
+ .collect(Collectors.toList());
+
+ // Map filters to new RexInputRef.
+ for (RexNode filter : tableFilter.getNotSupported()) {
+ newFilter.add(reMapRexNodeToNewInputs(filter, mapping));
+ }
+ // Map projects to new RexInputRef.
+ for (RexNode project : projectFilter.left) {
+ newProjects.add(reMapRexNodeToNewInputs(project, mapping));
+ }
+
+ relBuilder.filter(newFilter);
+ relBuilder.project(
+ newProjects, calc.getRowType().getFieldNames(), true); // Always preserve named projects.
+
+ RelNode result = relBuilder.build();
+
+ if (newFilter.size() < projectFilter.right.size()) {
+ // Smaller Calc programs are indisputably better.
+ // Tell the optimizer not to use old Calc and IO.
+ call.getPlanner().setImportance(calc, 0.0);
+ call.getPlanner().setImportance(ioSourceRel, 0.0);
+ call.transformTo(result);
+ } else if (newFilter.size() == projectFilter.right.size()) {
+ // But we can consider something with the same number of filters.
+ call.transformTo(result);
+ }
+ }
+
+ /**
+ * Given a {@code RexNode}, find all {@code RexInputRef}s a node or it's children nodes use.
+ *
+ * @param inputRowType {@code RelDataType} used for looking up names of {@code RexInputRef}.
+ * @param startNode A node to start at.
+ * @param usedFields Names of {@code RexInputRef}s are added to this list.
+ */
+ @VisibleForTesting
+ void findUtilizedInputRefs(RelDataType inputRowType, RexNode startNode, Set<String> usedFields) {
+ Queue<RexNode> prerequisites = new ArrayDeque<>();
+ prerequisites.add(startNode);
+
+ // Assuming there are no cyclic nodes, traverse dependency tree until all RexInputRefs are found
+ while (!prerequisites.isEmpty()) {
+ RexNode node = prerequisites.poll();
+
+ if (node instanceof RexCall) { // Composite expression, example: "=($t11, $t12)"
+ RexCall compositeNode = (RexCall) node;
+
+ // Expression from example above contains 2 operands: $t11, $t12
+ prerequisites.addAll(compositeNode.getOperands());
+ } else if (node instanceof RexInputRef) { // Input reference
+ // Find a field in an inputRowType for the input reference
+ int inputFieldIndex = ((RexInputRef) node).getIndex();
+ RelDataTypeField field = inputRowType.getFieldList().get(inputFieldIndex);
+
+ // If we have not seen it before - add it to the list (hash set)
+ usedFields.add(field.getName());
+ } else if (node instanceof RexLiteral) {
+ // Does not contain information about columns utilized by a Calc
+ } else {
+ throw new RuntimeException(
+ "Unexpected RexNode encountered: " + node.getClass().getSimpleName());
+ }
+ }
+ }
+
+ /**
+ * Recursively reconstruct a {@code RexNode}, mapping old RexInputRefs to new.
+ *
+ * @param node {@code RexNode} to reconstruct.
+ * @param inputRefMapping Mapping from old {@code RexInputRefNode} indexes to new, where list
+ * index is the new {@code RexInputRefNode} and the value is old {@code RexInputRefNode}.
+ * @return reconstructed {@code RexNode} with {@code RexInputRefNode} remapped to new values.
+ */
+ @VisibleForTesting
+ RexNode reMapRexNodeToNewInputs(RexNode node, List<Integer> inputRefMapping) {
+ if (node instanceof RexInputRef) {
+ int oldInputIndex = ((RexInputRef) node).getIndex();
+ int newInputIndex = inputRefMapping.indexOf(oldInputIndex);
+
+ // Create a new input reference pointing to a new input field
+ return new RexInputRef(newInputIndex, node.getType());
+ } else if (node instanceof RexCall) { // Composite expression, example: "=($t11, $t12)"
+ RexCall compositeNode = (RexCall) node;
+ List<RexNode> newOperands = new ArrayList<>();
+
+ for (RexNode operand : compositeNode.getOperands()) {
+ newOperands.add(reMapRexNodeToNewInputs(operand, inputRefMapping));
+ }
+
+ return compositeNode.clone(compositeNode.getType(), newOperands);
+ }
+
+ // If node is anything else - return it as is (ex: Literal)
+ checkArgument(
+ node instanceof RexLiteral,
+ "RexLiteral node expected, but was: " + node.getClass().getSimpleName());
+ return node;
+ }
+
+ /**
+ * Determine whether a program only performs renames and/or projects. RexProgram#isTrivial is not
+ * sufficient in this case, because number of projects does not need to be the same as inputs.
+ *
+ * @param program A program to check.
+ * @return True when program performs only projects (w/o any modifications), false otherwise.
+ */
+ @VisibleForTesting
+ boolean isProjectRenameOnlyProgram(RexProgram program) {
+ int fieldCount = program.getInputRowType().getFieldCount();
+ for (RexLocalRef ref : program.getProjectList()) {
+ if (ref.getIndex() >= fieldCount) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+}
diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/bigquery/BigQueryTable.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/bigquery/BigQueryTable.java
index be42b86..711f1bf 100644
--- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/bigquery/BigQueryTable.java
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/bigquery/BigQueryTable.java
@@ -25,16 +25,23 @@
import java.util.stream.Collectors;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.extensions.sql.impl.BeamTableStatistics;
+import org.apache.beam.sdk.extensions.sql.meta.BeamSqlTableFilter;
+import org.apache.beam.sdk.extensions.sql.meta.DefaultTableFilter;
import org.apache.beam.sdk.extensions.sql.meta.SchemaBaseBeamTable;
import org.apache.beam.sdk.extensions.sql.meta.Table;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead.Method;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils.ConversionOptions;
import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.schemas.FieldAccessDescriptor;
+import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.SchemaCoder;
+import org.apache.beam.sdk.schemas.transforms.Select;
+import org.apache.beam.sdk.schemas.utils.SelectHelpers;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.POutput;
@@ -106,16 +113,37 @@
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
+ return begin.apply("Read Input BQ Rows", getBigQueryReadBuilder(getSchema()));
+ }
+
+ @Override
+ public PCollection<Row> buildIOReader(
+ PBegin begin, BeamSqlTableFilter filters, List<String> fieldNames) {
+ if (!method.equals(Method.DIRECT_READ)) {
+ LOGGER.info("Predicate/project push-down only available for `DIRECT_READ` method, skipping.");
+ return buildIOReader(begin);
+ }
+
+ final FieldAccessDescriptor resolved =
+ FieldAccessDescriptor.withFieldNames(fieldNames).resolve(getSchema());
+ final Schema newSchema = SelectHelpers.getOutputSchema(getSchema(), resolved);
+
+ TypedRead<Row> builder = getBigQueryReadBuilder(newSchema);
+
+ if (!(filters instanceof DefaultTableFilter)) {
+ throw new RuntimeException("Unimplemented at the moment.");
+ }
+
+ if (!fieldNames.isEmpty()) {
+ builder.withSelectedFields(fieldNames);
+ }
+
return begin
+ .apply("Read Input BQ Rows with push-down", builder)
.apply(
- "Read Input BQ Rows",
- BigQueryIO.read(
- record ->
- BigQueryUtils.toBeamRow(record.getRecord(), getSchema(), conversionOptions))
- .withMethod(method)
- .from(bqLocation)
- .withCoder(SchemaCoder.of(getSchema())))
- .setRowSchema(getSchema());
+ "ReorderRowFields",
+ Select.fieldAccess(
+ FieldAccessDescriptor.withFieldNames(fieldNames).withOrderByFieldInsertionOrder()));
}
@Override
@@ -127,6 +155,19 @@
.to(bqLocation));
}
+ @Override
+ public boolean supportsProjects() {
+ return method.equals(Method.DIRECT_READ);
+ }
+
+ private TypedRead<Row> getBigQueryReadBuilder(Schema schema) {
+ return BigQueryIO.read(
+ record -> BigQueryUtils.toBeamRow(record.getRecord(), schema, conversionOptions))
+ .withMethod(method)
+ .from(bqLocation)
+ .withCoder(SchemaCoder.of(schema));
+ }
+
private static BeamTableStatistics getRowCountFromBQ(PipelineOptions o, String bqLocation) {
try {
BigInteger rowCount =
diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsub/PubsubMessageToRow.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsub/PubsubMessageToRow.java
index 654e722..3d9a712 100644
--- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsub/PubsubMessageToRow.java
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsub/PubsubMessageToRow.java
@@ -31,8 +31,8 @@
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.Schema.TypeName;
import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.util.RowJsonDeserializer;
-import org.apache.beam.sdk.util.RowJsonDeserializer.UnsupportedRowJsonException;
+import org.apache.beam.sdk.util.RowJson.RowJsonDeserializer;
+import org.apache.beam.sdk.util.RowJson.RowJsonDeserializer.UnsupportedRowJsonException;
import org.apache.beam.sdk.util.RowJsonUtils;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TupleTag;
diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableFilter.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableFilter.java
new file mode 100644
index 0000000..10adbea
--- /dev/null
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableFilter.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.sql.meta.provider.test;
+
+import static org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.SqlKind.COMPARISON;
+import static org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.SqlKind.IN;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+import org.apache.beam.sdk.extensions.sql.meta.BeamSqlTableFilter;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexCall;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexInputRef;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexLiteral;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexNode;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.type.SqlTypeName;
+
+public class TestTableFilter implements BeamSqlTableFilter {
+ private List<RexNode> supported;
+ private List<RexNode> unsupported;
+
+ public TestTableFilter(List<RexNode> predicateCNF) {
+ supported = new ArrayList<>();
+ unsupported = new ArrayList<>();
+
+ for (RexNode node : predicateCNF) {
+ if (isSupported(node)) {
+ supported.add(node);
+ } else {
+ unsupported.add(node);
+ }
+ }
+ }
+
+ @Override
+ public List<RexNode> getNotSupported() {
+ return unsupported;
+ }
+
+ public List<RexNode> getSupported() {
+ return supported;
+ }
+
+ @Override
+ public String toString() {
+ String supStr =
+ "supported{"
+ + supported.stream().map(RexNode::toString).collect(Collectors.joining())
+ + "}";
+ String unsupStr =
+ "unsupported{"
+ + unsupported.stream().map(RexNode::toString).collect(Collectors.joining())
+ + "}";
+
+ return "[" + supStr + ", " + unsupStr + "]";
+ }
+
+ /**
+   * Check whether a {@code RexNode} is supported. For testing purposes, only simple nodes are
+ * supported. Ex: comparison between 2 input fields, input field to a literal, literal to a
+ * literal.
+ *
+ * @param node A node to check for predicate push-down support.
+ * @return True when a node is supported, false otherwise.
+ */
+ private boolean isSupported(RexNode node) {
+ if (node.getType().getSqlTypeName().equals(SqlTypeName.BOOLEAN)) {
+ if (node instanceof RexCall) {
+ RexCall compositeNode = (RexCall) node;
+
+ // Only support comparisons in a predicate
+ if (!node.getKind().belongsTo(COMPARISON)) {
+ return false;
+ }
+
+      // The IN operator is not supported for now
+ if (node.getKind().equals(IN)) {
+ return false;
+ }
+
+ for (RexNode operand : compositeNode.getOperands()) {
+ if (!(operand instanceof RexLiteral) && !(operand instanceof RexInputRef)) {
+ return false;
+ }
+ }
+ } else if (node instanceof RexInputRef) {
+ // When field is a boolean
+ return true;
+ } else {
+ throw new RuntimeException(
+ "Encountered an unexpected node type: " + node.getClass().getSimpleName());
+ }
+ } else {
+ throw new RuntimeException(
+ "Predicate node '"
+ + node.getClass().getSimpleName()
+ + "' should be a boolean expression, but was: "
+ + node.getType().getSqlTypeName());
+ }
+
+ return true;
+ }
+}
diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProvider.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProvider.java
index e65d903..5dae333 100644
--- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProvider.java
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProvider.java
@@ -21,6 +21,7 @@
import com.google.auto.service.AutoService;
import java.io.Serializable;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
@@ -38,17 +39,28 @@
import org.apache.beam.sdk.extensions.sql.meta.provider.InMemoryMetaTableProvider;
import org.apache.beam.sdk.extensions.sql.meta.provider.TableProvider;
import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.schemas.FieldAccessDescriptor;
+import org.apache.beam.sdk.schemas.FieldTypeDescriptors;
import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.schemas.Schema.FieldType;
import org.apache.beam.sdk.schemas.SchemaCoder;
+import org.apache.beam.sdk.schemas.transforms.Filter;
import org.apache.beam.sdk.schemas.transforms.Select;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PDone;
import org.apache.beam.sdk.values.POutput;
import org.apache.beam.sdk.values.Row;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexCall;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexInputRef;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexLiteral;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexNode;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.type.SqlTypeName;
/**
* Test in-memory table provider for use in tests.
@@ -58,6 +70,7 @@
@AutoService(TableProvider.class)
public class TestTableProvider extends InMemoryMetaTableProvider {
static final Map<Long, Map<String, TableWithRows>> GLOBAL_TABLES = new ConcurrentHashMap<>();
+ public static final String PUSH_DOWN_OPTION = "push_down";
private static final AtomicLong INSTANCES = new AtomicLong(0);
private final long instanceId = INSTANCES.getAndIncrement();
@@ -124,6 +137,7 @@
private static class InMemoryTable extends BaseBeamTable {
private TableWithRows tableWithRows;
+ private PushDownOptions options;
@Override
public PCollection.IsBounded isBounded() {
@@ -132,6 +146,16 @@
public InMemoryTable(TableWithRows tableWithRows) {
this.tableWithRows = tableWithRows;
+
+ // The reason for introducing a property here is to simplify writing unit tests, testing
+      // project and predicate push-down behavior when run separately and together.
+ if (tableWithRows.table.getProperties().containsKey(PUSH_DOWN_OPTION)) {
+ options =
+ PushDownOptions.valueOf(
+ tableWithRows.table.getProperties().getString(PUSH_DOWN_OPTION).toUpperCase());
+ } else {
+ options = PushDownOptions.NONE;
+ }
}
public Coder<Row> rowCoder() {
@@ -156,18 +180,46 @@
@Override
public PCollection<Row> buildIOReader(
PBegin begin, BeamSqlTableFilter filters, List<String> fieldNames) {
+ if (!(filters instanceof DefaultTableFilter)
+ && (options == PushDownOptions.NONE || options == PushDownOptions.PROJECT)) {
+ throw new RuntimeException(
+ "Filter push-down is not supported, yet non-default filter was passed.");
+ }
+ if ((!fieldNames.isEmpty() && fieldNames.size() < getSchema().getFieldCount())
+ && (options == PushDownOptions.NONE || options == PushDownOptions.FILTER)) {
+ throw new RuntimeException(
+ "Project push-down is not supported, yet a list of fieldNames was passed.");
+ }
+
PCollection<Row> withAllFields = buildIOReader(begin);
- if (fieldNames.isEmpty() && filters instanceof DefaultTableFilter) {
+ if (options == PushDownOptions.NONE) { // needed for testing purposes
return withAllFields;
}
PCollection<Row> result = withAllFields;
- if (!(filters instanceof DefaultTableFilter)) {
- throw new RuntimeException("Unimplemented at the moment.");
+ // When filter push-down is supported.
+ if (options == PushDownOptions.FILTER || options == PushDownOptions.BOTH) {
+ if (filters instanceof TestTableFilter) {
+ // Create a filter for each supported node.
+ for (RexNode node : ((TestTableFilter) filters).getSupported()) {
+ result = result.apply("IOPushDownFilter_" + node.toString(), filterFromNode(node));
+ }
+ } else {
+ throw new RuntimeException(
+ "Was expecting a filter of type TestTableFilter, but received: "
+ + filters.getClass().getSimpleName());
+ }
}
- if (!fieldNames.isEmpty()) {
- result = result.apply(Select.fieldNames(fieldNames.toArray(new String[0])));
+ // When project push-down is supported.
+ if ((options == PushDownOptions.PROJECT || options == PushDownOptions.BOTH)
+ && !fieldNames.isEmpty()) {
+ result =
+ result.apply(
+ "IOPushDownProject",
+ Select.fieldAccess(
+ FieldAccessDescriptor.withFieldNames(fieldNames)
+ .withOrderByFieldInsertionOrder()));
}
return result;
@@ -180,14 +232,143 @@
}
@Override
+ public BeamSqlTableFilter constructFilter(List<RexNode> filter) {
+ if (options == PushDownOptions.FILTER || options == PushDownOptions.BOTH) {
+ return new TestTableFilter(filter);
+ }
+ return super.constructFilter(filter);
+ }
+
+ @Override
public boolean supportsProjects() {
- return true;
+ return options == PushDownOptions.BOTH || options == PushDownOptions.PROJECT;
}
@Override
public Schema getSchema() {
return tableWithRows.table.getSchema();
}
+
+ /**
+ * A helper method to create a {@code Filter} from {@code RexNode}.
+ *
+ * @param node {@code RexNode} to create a filter from.
+ * @return {@code Filter} PTransform.
+ */
+ private PTransform<PCollection<Row>, PCollection<Row>> filterFromNode(RexNode node) {
+ List<RexNode> operands = new ArrayList<>();
+ List<Integer> fieldIds = new ArrayList<>();
+ List<RexLiteral> literals = new ArrayList<>();
+ List<RexInputRef> inputRefs = new ArrayList<>();
+
+ if (node instanceof RexCall) {
+ operands.addAll(((RexCall) node).getOperands());
+ } else if (node instanceof RexInputRef) {
+ operands.add(node);
+ operands.add(RexLiteral.fromJdbcString(node.getType(), SqlTypeName.BOOLEAN, "true"));
+ } else {
+ throw new RuntimeException(
+ "Was expecting a RexCall or a boolean RexInputRef, but received: "
+ + node.getClass().getSimpleName());
+ }
+
+ for (RexNode operand : operands) {
+ if (operand instanceof RexInputRef) {
+ RexInputRef inputRef = (RexInputRef) operand;
+ fieldIds.add(inputRef.getIndex());
+ inputRefs.add(inputRef);
+ } else if (operand instanceof RexLiteral) {
+ RexLiteral literal = (RexLiteral) operand;
+ literals.add(literal);
+ } else {
+ throw new RuntimeException(
+ "Encountered an unexpected operand: " + operand.getClass().getSimpleName());
+ }
+ }
+
+ SerializableFunction<Integer, Boolean> comparison;
+ // TODO: add support for expressions like:
+ // =(CAST($3):INTEGER NOT NULL, 200)
+ switch (node.getKind()) {
+ case LESS_THAN:
+ comparison = i -> i < 0;
+ break;
+ case GREATER_THAN:
+ comparison = i -> i > 0;
+ break;
+ case LESS_THAN_OR_EQUAL:
+ comparison = i -> i <= 0;
+ break;
+ case GREATER_THAN_OR_EQUAL:
+ comparison = i -> i >= 0;
+ break;
+ case EQUALS:
+ case INPUT_REF:
+ comparison = i -> i == 0;
+ break;
+ case NOT_EQUALS:
+ comparison = i -> i != 0;
+ break;
+ default:
+ throw new RuntimeException("Unsupported node kind: " + node.getKind().toString());
+ }
+
+ return Filter.<Row>create()
+ .whereFieldIds(
+ fieldIds, createFilter(operands, fieldIds, inputRefs, literals, comparison));
+ }
+
+    /**
+     * A helper method to create a serializable function comparing row fields.
+     *
+     * @param operands A list of operands used in a comparison.
+     * @param fieldIds A list of operand ids.
+     * @param inputRefs A list of operands that are instances of {@code RexInputRef}.
+     * @param literals A list of operands that are instances of {@code RexLiteral}.
+     * @param comparison A comparison to perform between operands.
+     * @return A filter comparing row fields to literals/other fields.
+     */
+    private SerializableFunction<Row, Boolean> createFilter(
+        List<RexNode> operands,
+        List<Integer> fieldIds,
+        List<RexInputRef> inputRefs,
+        List<RexLiteral> literals,
+        SerializableFunction<Integer, Boolean> comparison) {
+      // Filter push-down only supports comparisons between 2 operands (for now).
+      assert operands.size() == 2;
+      // At most two columns may be involved (2 input refs).
+      assert inputRefs.size() <= 2;
+      // The case of comparing 2 literals should never appear here; it is optimized away.
+      assert literals.size() < 2;
+
+      if (inputRefs.size() == 2) { // Comparing 2 columns: compare both values, not value-to-index.
+        final int op0 = fieldIds.indexOf(inputRefs.get(0).getIndex());
+        final int op1 = fieldIds.indexOf(inputRefs.get(1).getIndex());
+        return row -> comparison.apply(row.<Comparable>getValue(op0).compareTo(row.getValue(op1)));
+      }
+      // Comparing a column to a literal.
+      int fieldSchemaIndex = inputRefs.get(0).getIndex();
+      FieldType beamFieldType = getSchema().getField(fieldSchemaIndex).getType();
+      final int op0 = fieldIds.indexOf(fieldSchemaIndex);
+
+      // Convert the literal to the Java type of the column it is compared against.
+      final Comparable op1 =
+          literals
+              .get(0)
+              .<Comparable>getValueAs(
+                  FieldTypeDescriptors.javaTypeForFieldType(beamFieldType).getRawType());
+      if (operands.get(0) instanceof RexLiteral) { // First operand is a literal
+        return row -> comparison.apply(op1.compareTo(row.getValue(op0)));
+      } else if (operands.get(0) instanceof RexInputRef) { // First operand is a column value
+        return row -> comparison.apply(row.<Comparable>getValue(op0).compareTo(op1));
+      } else {
+        throw new RuntimeException(
+            "Was expecting a RexLiteral and a RexInputRef, but received: "
+                + operands.stream()
+                    .map(o -> o.getClass().getSimpleName())
+                    .collect(Collectors.joining(", ")));
+      }
+    }
}
private static final class CollectorFn extends DoFn<Row, Row> {
@@ -205,4 +386,11 @@
context.output(context.element());
}
}
+
+ public enum PushDownOptions {
+ NONE,
+ PROJECT,
+ FILTER,
+ BOTH
+ }
}
diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rule/IOPushDownRuleTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rule/IOPushDownRuleTest.java
new file mode 100644
index 0000000..907389f
--- /dev/null
+++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rule/IOPushDownRuleTest.java
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.sql.impl.rule;
+
+import static org.apache.beam.sdk.extensions.sql.meta.provider.test.TestTableProvider.PUSH_DOWN_OPTION;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.collection.IsIterableContainingInAnyOrder.containsInAnyOrder;
+import static org.hamcrest.core.IsEqual.equalTo;
+import static org.hamcrest.core.IsInstanceOf.instanceOf;
+
+import com.alibaba.fastjson.JSON;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamRelNode;
+import org.apache.beam.sdk.extensions.sql.meta.Table;
+import org.apache.beam.sdk.extensions.sql.meta.provider.test.TestTableProvider;
+import org.apache.beam.sdk.extensions.sql.meta.provider.test.TestTableProvider.PushDownOptions;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.values.Row;
+import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.plan.RelOptRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.core.Calc;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.CalcMergeRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.FilterCalcMergeRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.FilterToCalcRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.ProjectCalcMergeRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.ProjectToCalcRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexNode;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.tools.RuleSet;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.tools.RuleSets;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.util.Pair;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class IOPushDownRuleTest {
+ private static final Schema BASIC_SCHEMA =
+ Schema.builder()
+ .addInt32Field("unused1")
+ .addInt32Field("id")
+ .addStringField("name")
+ .addInt32Field("unused2")
+ .build();
+ private static final List<RelOptRule> defaultRules =
+ ImmutableList.of(
+ BeamCalcRule.INSTANCE,
+ FilterCalcMergeRule.INSTANCE,
+ ProjectCalcMergeRule.INSTANCE,
+ FilterToCalcRule.INSTANCE,
+ ProjectToCalcRule.INSTANCE,
+ CalcMergeRule.INSTANCE);
+ private BeamSqlEnv sqlEnv;
+
+ @Rule public TestPipeline pipeline = TestPipeline.create();
+
+ @Before
+ public void buildUp() {
+ TestTableProvider tableProvider = new TestTableProvider();
+ Table table = getTable("TEST", PushDownOptions.PROJECT);
+ tableProvider.createTable(table);
+ tableProvider.addRows(
+ table.getName(),
+ row(BASIC_SCHEMA, 100, 1, "one", 100),
+ row(BASIC_SCHEMA, 200, 2, "two", 200));
+
+ sqlEnv =
+ BeamSqlEnv.builder(tableProvider)
+ .setPipelineOptions(PipelineOptionsFactory.create())
+ .setRuleSets(new RuleSet[] {RuleSets.ofList(defaultRules)})
+ .build();
+ }
+
+ @Test
+ public void testFindUtilisedInputRefs() {
+ String sqlQuery = "select id+10 from TEST where name='one'";
+ BeamRelNode basicRel = sqlEnv.parseQuery(sqlQuery);
+ assertThat(basicRel, instanceOf(Calc.class));
+
+ Calc calc = (Calc) basicRel;
+ final Pair<ImmutableList<RexNode>, ImmutableList<RexNode>> projectFilter =
+ calc.getProgram().split();
+ final ImmutableList<RexNode> projects = projectFilter.left;
+ final ImmutableList<RexNode> filters = projectFilter.right;
+
+ Set<String> usedFields = new HashSet<>();
+ BeamIOPushDownRule.INSTANCE.findUtilizedInputRefs(
+ calc.getProgram().getInputRowType(), projects.get(0), usedFields);
+ assertThat(usedFields, containsInAnyOrder("id"));
+
+ BeamIOPushDownRule.INSTANCE.findUtilizedInputRefs(
+ calc.getProgram().getInputRowType(), filters.get(0), usedFields);
+ assertThat(usedFields, containsInAnyOrder("id", "name"));
+ }
+
+ @Test
+ public void testReMapRexNodeToNewInputs() {
+ String sqlQuery = "select id+10 from TEST where name='one'";
+ BeamRelNode basicRel = sqlEnv.parseQuery(sqlQuery);
+ assertThat(basicRel, instanceOf(Calc.class));
+
+ Calc calc = (Calc) basicRel;
+ final Pair<ImmutableList<RexNode>, ImmutableList<RexNode>> projectFilter =
+ calc.getProgram().split();
+ final ImmutableList<RexNode> projects = projectFilter.left;
+ final ImmutableList<RexNode> filters = projectFilter.right;
+
+ List<Integer> mapping = ImmutableList.of(1, 2);
+
+ RexNode newProject =
+ BeamIOPushDownRule.INSTANCE.reMapRexNodeToNewInputs(projects.get(0), mapping);
+ assertThat(newProject.toString(), equalTo("+($0, 10)"));
+
+ RexNode newFilter =
+ BeamIOPushDownRule.INSTANCE.reMapRexNodeToNewInputs(filters.get(0), mapping);
+ assertThat(newFilter.toString(), equalTo("=($1, 'one')"));
+ }
+
+ @Test
+ public void testIsProjectRenameOnlyProgram() {
+ List<Pair<String, Boolean>> tests =
+ ImmutableList.of(
+ Pair.of("select id from TEST", true),
+ Pair.of("select * from TEST", true),
+ Pair.of("select id, name from TEST", true),
+ Pair.of("select id+10 from TEST", false),
+ // Note that we only care about projects.
+ Pair.of("select id from TEST where name='one'", true));
+
+ for (Pair<String, Boolean> test : tests) {
+ String sqlQuery = test.left;
+ boolean expectedAnswer = test.right;
+ BeamRelNode basicRel = sqlEnv.parseQuery(sqlQuery);
+ assertThat(basicRel, instanceOf(Calc.class));
+
+ Calc calc = (Calc) basicRel;
+ assertThat(
+ BeamIOPushDownRule.INSTANCE.isProjectRenameOnlyProgram(calc.getProgram()),
+ equalTo(expectedAnswer));
+ }
+ }
+
+ private static Row row(Schema schema, Object... objects) {
+ return Row.withSchema(schema).addValues(objects).build();
+ }
+
+ private static Table getTable(String name, PushDownOptions options) {
+ return Table.builder()
+ .name(name)
+ .comment(name + " table")
+ .schema(BASIC_SCHEMA)
+ .properties(
+ JSON.parseObject("{ " + PUSH_DOWN_OPTION + ": " + "\"" + options.toString() + "\" }"))
+ .type("test")
+ .build();
+ }
+}
diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/bigquery/BigQueryReadWriteIT.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/bigquery/BigQueryReadWriteIT.java
index 2c00edb..c2fb264 100644
--- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/bigquery/BigQueryReadWriteIT.java
+++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/bigquery/BigQueryReadWriteIT.java
@@ -27,9 +27,10 @@
import static org.apache.beam.sdk.schemas.Schema.FieldType.INT32;
import static org.apache.beam.sdk.schemas.Schema.FieldType.INT64;
import static org.apache.beam.sdk.schemas.Schema.FieldType.STRING;
+import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.containsInAnyOrder;
+import static org.hamcrest.Matchers.instanceOf;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertThat;
import java.io.Serializable;
import java.util.Arrays;
@@ -38,6 +39,8 @@
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.PipelineResult.State;
import org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamIOSourceRel;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamRelNode;
import org.apache.beam.sdk.extensions.sql.impl.rel.BeamSqlRelUtils;
import org.apache.beam.sdk.extensions.sql.impl.schema.BeamPCollectionTable;
import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils;
@@ -300,6 +303,74 @@
}
@Test
+ public void testSQLRead_withDirectRead_withProjectPushDown() {
+ BeamSqlEnv sqlEnv = BeamSqlEnv.inMemory(new BigQueryTableProvider());
+
+ String createTableStatement =
+ "CREATE EXTERNAL TABLE TEST( \n"
+ + " c_bigint BIGINT, \n"
+ + " c_tinyint TINYINT, \n"
+ + " c_smallint SMALLINT, \n"
+ + " c_integer INTEGER, \n"
+ + " c_float FLOAT, \n"
+ + " c_double DOUBLE, \n"
+ + " c_boolean BOOLEAN, \n"
+ + " c_timestamp TIMESTAMP, \n"
+ + " c_varchar VARCHAR, \n "
+ + " c_char CHAR, \n"
+ + " c_arr ARRAY<VARCHAR> \n"
+ + ") \n"
+ + "TYPE 'bigquery' \n"
+ + "LOCATION '"
+ + bigQueryTestingTypes.tableSpec()
+ + "' \n"
+ + "TBLPROPERTIES "
+ + "'{ "
+ + METHOD_PROPERTY
+ + ": \""
+ + Method.DIRECT_READ.toString()
+ + "\" }'";
+ sqlEnv.executeDdl(createTableStatement);
+
+ String insertStatement =
+ "INSERT INTO TEST VALUES ("
+ + "9223372036854775807, "
+ + "127, "
+ + "32767, "
+ + "2147483647, "
+ + "1.0, "
+ + "1.0, "
+ + "TRUE, "
+ + "TIMESTAMP '2018-05-28 20:17:40.123', "
+ + "'varchar', "
+ + "'char', "
+ + "ARRAY['123', '456']"
+ + ")";
+
+ sqlEnv.parseQuery(insertStatement);
+ BeamSqlRelUtils.toPCollection(pipeline, sqlEnv.parseQuery(insertStatement));
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(5));
+
+ String selectTableStatement = "SELECT c_integer, c_varchar, c_tinyint FROM TEST";
+ BeamRelNode relNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> output = BeamSqlRelUtils.toPCollection(readPipeline, relNode);
+
+ assertThat(relNode, instanceOf(BeamIOSourceRel.class));
+ assertEquals(
+ output.getSchema(),
+ Schema.builder()
+ .addNullableField("c_integer", INT32)
+ .addNullableField("c_varchar", STRING)
+ .addNullableField("c_tinyint", BYTE)
+ .build());
+
+ PAssert.that(output)
+ .containsInAnyOrder(row(output.getSchema(), 2147483647, "varchar", (byte) 127));
+ PipelineResult.State state = readPipeline.run().waitUntilFinish(Duration.standardMinutes(5));
+ assertEquals(state, State.DONE);
+ }
+
+ @Test
public void testSQLTypes() {
BeamSqlEnv sqlEnv = BeamSqlEnv.inMemory(new BigQueryTableProvider());
diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProviderTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProviderTest.java
index c65d593..c15101f 100644
--- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProviderTest.java
+++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProviderTest.java
@@ -17,8 +17,12 @@
*/
package org.apache.beam.sdk.extensions.sql.meta.provider.test;
+import static org.apache.beam.sdk.extensions.sql.meta.provider.test.TestTableProvider.PUSH_DOWN_OPTION;
+
+import com.alibaba.fastjson.JSON;
import org.apache.beam.sdk.extensions.sql.meta.BeamSqlTable;
import org.apache.beam.sdk.extensions.sql.meta.Table;
+import org.apache.beam.sdk.extensions.sql.meta.provider.test.TestTableProvider.PushDownOptions;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
@@ -83,11 +87,11 @@
beamSqlTable.buildIOReader(
pipeline.begin(),
beamSqlTable.constructFilter(ImmutableList.of()),
- ImmutableList.of("name", "id")); // Note that order is ignored
+ ImmutableList.of("name", "id"));
// Selected columns are outputted in the same order they are listed in the schema.
PAssert.that(result)
- .containsInAnyOrder(row(result.getSchema(), 1, "one"), row(result.getSchema(), 2, "two"));
+ .containsInAnyOrder(row(result.getSchema(), "one", 1), row(result.getSchema(), "two", 2));
pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
}
@@ -115,6 +119,9 @@
.name(name)
.comment(name + " table")
.schema(BASIC_SCHEMA)
+ .properties(
+ JSON.parseObject(
+ "{ " + PUSH_DOWN_OPTION + ": " + "\"" + PushDownOptions.BOTH.toString() + "\" }"))
.type("test")
.build();
}
diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProviderWithFilterPushDown.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProviderWithFilterPushDown.java
new file mode 100644
index 0000000..0b6ead6
--- /dev/null
+++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProviderWithFilterPushDown.java
@@ -0,0 +1,362 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.sql.meta.provider.test;
+
+import static org.apache.beam.sdk.extensions.sql.meta.provider.test.TestTableProvider.PUSH_DOWN_OPTION;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.collection.IsIterableContainingInAnyOrder.containsInAnyOrder;
+import static org.hamcrest.core.IsInstanceOf.instanceOf;
+import static org.junit.Assert.assertEquals;
+
+import com.alibaba.fastjson.JSON;
+import java.util.List;
+import org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamCalcRel;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamIOSourceRel;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamRelNode;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamSqlRelUtils;
+import org.apache.beam.sdk.extensions.sql.impl.rule.BeamCalcRule;
+import org.apache.beam.sdk.extensions.sql.impl.rule.BeamIOPushDownRule;
+import org.apache.beam.sdk.extensions.sql.meta.Table;
+import org.apache.beam.sdk.extensions.sql.meta.provider.test.TestTableProvider.PushDownOptions;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.Row;
+import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.plan.RelOptRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.CalcMergeRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.FilterCalcMergeRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.FilterToCalcRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.ProjectCalcMergeRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.ProjectToCalcRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.tools.RuleSet;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.tools.RuleSets;
+import org.joda.time.Duration;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class TestTableProviderWithFilterPushDown {
+ private static final Schema BASIC_SCHEMA =
+ Schema.builder()
+ .addInt32Field("unused1")
+ .addInt32Field("id")
+ .addStringField("name")
+ .addInt16Field("unused2")
+ .addBooleanField("b")
+ .build();
+ private static final List<RelOptRule> rulesWithPushDown =
+ ImmutableList.of(
+ BeamCalcRule.INSTANCE,
+ FilterCalcMergeRule.INSTANCE,
+ ProjectCalcMergeRule.INSTANCE,
+ BeamIOPushDownRule.INSTANCE,
+ FilterToCalcRule.INSTANCE,
+ ProjectToCalcRule.INSTANCE,
+ CalcMergeRule.INSTANCE);
+ private BeamSqlEnv sqlEnv;
+
+ @Rule public TestPipeline pipeline = TestPipeline.create();
+
+ @Before
+ public void buildUp() {
+ TestTableProvider tableProvider = new TestTableProvider();
+ Table table = getTable("TEST", PushDownOptions.BOTH);
+ tableProvider.createTable(table);
+ tableProvider.addRows(
+ table.getName(),
+ row(BASIC_SCHEMA, 100, 1, "one", (short) 100, true),
+ row(BASIC_SCHEMA, 200, 2, "two", (short) 200, false));
+
+ sqlEnv =
+ BeamSqlEnv.builder(tableProvider)
+ .setPipelineOptions(PipelineOptionsFactory.create())
+ .setRuleSets(new RuleSet[] {RuleSets.ofList(rulesWithPushDown)})
+ .build();
+ }
+
+ @Test
+ public void testIOSourceRel_predicateSimple() {
+ String selectTableStatement = "SELECT name FROM TEST where id=2";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamIOSourceRel.class));
+ assertEquals(Schema.builder().addStringField("name").build(), result.getSchema());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "two"));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_predicateSimple_Boolean() {
+ String selectTableStatement = "SELECT name FROM TEST where b";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamIOSourceRel.class));
+ assertEquals(Schema.builder().addStringField("name").build(), result.getSchema());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "one"));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_predicateWithAnd() {
+ String selectTableStatement = "SELECT name FROM TEST where id>=2 and unused1<=200";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamIOSourceRel.class));
+ assertEquals(Schema.builder().addStringField("name").build(), result.getSchema());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "two"));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_withComplexProjects_withSupportedFilter() {
+ String selectTableStatement =
+ "SELECT name as new_name, unused1+10-id as new_id FROM TEST where 1<id";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamCalcRel.class));
+ assertThat(beamRelNode.getInput(0), instanceOf(BeamIOSourceRel.class));
+ // Make sure project push-down was done
+ List<String> a = beamRelNode.getInput(0).getRowType().getFieldNames();
+ assertThat(a, containsInAnyOrder("name", "unused1", "id"));
+ assertEquals(
+ Schema.builder().addStringField("new_name").addInt32Field("new_id").build(),
+ result.getSchema());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "two", 208));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_selectFieldsInRandomOrder_withRename_withSupportedFilter() {
+ String selectTableStatement =
+ "SELECT name as new_name, id as new_id, unused1 as new_unused1 FROM TEST where 1<id";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamIOSourceRel.class));
+ // Make sure project push-down was done
+ List<String> a = beamRelNode.getRowType().getFieldNames();
+ assertThat(a, containsInAnyOrder("new_name", "new_id", "new_unused1"));
+ assertEquals(
+ Schema.builder()
+ .addStringField("new_name")
+ .addInt32Field("new_id")
+ .addInt32Field("new_unused1")
+ .build(),
+ result.getSchema());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "two", 2, 200));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_selectFieldsInRandomOrder_withRename_withUnsupportedFilter() {
+ String selectTableStatement =
+ "SELECT name as new_name, id as new_id, unused1 as new_unused1 FROM TEST where id+unused1=202";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamCalcRel.class));
+ assertThat(beamRelNode.getInput(0), instanceOf(BeamIOSourceRel.class));
+ // Make sure project push-down was done
+ List<String> a = beamRelNode.getInput(0).getRowType().getFieldNames();
+ assertThat(a, containsInAnyOrder("name", "id", "unused1"));
+ assertEquals(
+ Schema.builder()
+ .addStringField("new_name")
+ .addInt32Field("new_id")
+ .addInt32Field("new_unused1")
+ .build(),
+ result.getSchema());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "two", 2, 200));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void
+ testIOSourceRel_selectFieldsInRandomOrder_withRename_withSupportedAndUnsupportedFilters() {
+ String selectTableStatement =
+ "SELECT name as new_name, id as new_id, unused1 as new_unused1 FROM TEST where 1<id and id+unused1=202";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamCalcRel.class));
+ assertThat(beamRelNode.getInput(0), instanceOf(BeamIOSourceRel.class));
+ // Make sure project push-down was done
+ List<String> a = beamRelNode.getInput(0).getRowType().getFieldNames();
+ assertThat(a, containsInAnyOrder("name", "id", "unused1"));
+ assertEquals(
+ "BeamIOSourceRel.BEAM_LOGICAL(table=[beam, TEST],usedFields=[name, id, unused1],TestTableFilter=[supported{<(1, $1)}, unsupported{=(+($1, $0), 202)}])",
+ beamRelNode.getInput(0).getDigest());
+ assertEquals(
+ Schema.builder()
+ .addStringField("new_name")
+ .addInt32Field("new_id")
+ .addInt32Field("new_unused1")
+ .build(),
+ result.getSchema());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "two", 2, 200));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_selectAllField() {
+ String selectTableStatement = "SELECT * FROM TEST where id<>2";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamIOSourceRel.class));
+ assertEquals(
+ "BeamIOSourceRel.BEAM_LOGICAL(table=[beam, TEST],usedFields=[unused1, id, name, unused2, b],TestTableFilter=[supported{<>($1, 2)}, unsupported{}])",
+ beamRelNode.getDigest());
+ assertEquals(BASIC_SCHEMA, result.getSchema());
+ PAssert.that(result)
+ .containsInAnyOrder(row(result.getSchema(), 100, 1, "one", (short) 100, true));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ private static Row row(Schema schema, Object... objects) {
+ return Row.withSchema(schema).addValues(objects).build();
+ }
+
+ @Test
+ public void testIOSourceRel_withUnsupportedPredicate() {
+ String selectTableStatement = "SELECT name FROM TEST where id+unused1=101";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamCalcRel.class));
+ assertThat(beamRelNode.getInput(0), instanceOf(BeamIOSourceRel.class));
+ assertEquals(
+ "BeamIOSourceRel.BEAM_LOGICAL(table=[beam, TEST],usedFields=[name, id, unused1],TestTableFilter=[supported{}, unsupported{=(+($1, $0), 101)}])",
+ beamRelNode.getInput(0).getDigest());
+ // Make sure project push-down was done
+ List<String> a = beamRelNode.getInput(0).getRowType().getFieldNames();
+ assertThat(a, containsInAnyOrder("name", "id", "unused1"));
+
+ assertEquals(Schema.builder().addStringField("name").build(), result.getSchema());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "one"));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_selectAll_withUnsupportedPredicate() {
+ String selectTableStatement = "SELECT * FROM TEST where id+unused1=101";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamCalcRel.class));
+ assertThat(beamRelNode.getInput(0), instanceOf(BeamIOSourceRel.class));
+ assertEquals(
+ "BeamIOSourceRel.BEAM_LOGICAL(table=[beam, TEST],TestTableFilter=[supported{}, unsupported{}])",
+ beamRelNode.getInput(0).getDigest());
+ // Make sure project push-down was done (all fields since 'select *')
+ List<String> a = beamRelNode.getInput(0).getRowType().getFieldNames();
+ assertThat(a, containsInAnyOrder("name", "id", "unused1", "unused2", "b"));
+
+ assertEquals(BASIC_SCHEMA, result.getSchema());
+ PAssert.that(result)
+ .containsInAnyOrder(row(result.getSchema(), 100, 1, "one", (short) 100, true));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_withSupportedAndUnsupportedPredicate() {
+ String selectTableStatement = "SELECT name FROM TEST where id+unused1=101 and id=1";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamCalcRel.class));
+ assertThat(beamRelNode.getInput(0), instanceOf(BeamIOSourceRel.class));
+ assertEquals(
+ "BeamIOSourceRel.BEAM_LOGICAL(table=[beam, TEST],usedFields=[name, id, unused1],TestTableFilter=[supported{=($1, 1)}, unsupported{=(+($1, $0), 101)}])",
+ beamRelNode.getInput(0).getDigest());
+ // Make sure project push-down was done
+ List<String> a = beamRelNode.getInput(0).getRowType().getFieldNames();
+ assertThat(a, containsInAnyOrder("name", "id", "unused1"));
+
+ assertEquals(Schema.builder().addStringField("name").build(), result.getSchema());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "one"));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_selectAll_withSupportedAndUnsupportedPredicate() {
+ String selectTableStatement = "SELECT * FROM TEST where id+unused1=101 and id=1";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertThat(beamRelNode, instanceOf(BeamCalcRel.class));
+ assertThat(beamRelNode.getInput(0), instanceOf(BeamIOSourceRel.class));
+ assertEquals(
+ "BeamIOSourceRel.BEAM_LOGICAL(table=[beam, TEST],usedFields=[unused1, id, name, unused2, b],TestTableFilter=[supported{=($1, 1)}, unsupported{=(+($1, $0), 101)}])",
+ beamRelNode.getInput(0).getDigest());
+ // Make sure project push-down was done (all fields since 'select *')
+ List<String> a = beamRelNode.getInput(0).getRowType().getFieldNames();
+ assertThat(a, containsInAnyOrder("unused1", "name", "id", "unused2", "b"));
+
+ assertEquals(BASIC_SCHEMA, result.getSchema());
+ PAssert.that(result)
+ .containsInAnyOrder(row(result.getSchema(), 100, 1, "one", (short) 100, true));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ private static Table getTable(String name, PushDownOptions options) {
+ return Table.builder()
+ .name(name)
+ .comment(name + " table")
+ .schema(BASIC_SCHEMA)
+ .properties(
+ JSON.parseObject("{ " + PUSH_DOWN_OPTION + ": " + "\"" + options.toString() + "\" }"))
+ .type("test")
+ .build();
+ }
+}
diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProviderWithProjectPushDown.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProviderWithProjectPushDown.java
new file mode 100644
index 0000000..d8b6141
--- /dev/null
+++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/test/TestTableProviderWithProjectPushDown.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.sql.meta.provider.test;
+
+import static org.apache.beam.sdk.extensions.sql.meta.provider.test.TestTableProvider.PUSH_DOWN_OPTION;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.collection.IsIterableContainingInAnyOrder.containsInAnyOrder;
+import static org.hamcrest.core.IsInstanceOf.instanceOf;
+import static org.junit.Assert.assertEquals;
+
+import com.alibaba.fastjson.JSON;
+import java.util.List;
+import org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamIOSourceRel;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamRelNode;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamSqlRelUtils;
+import org.apache.beam.sdk.extensions.sql.impl.rule.BeamCalcRule;
+import org.apache.beam.sdk.extensions.sql.impl.rule.BeamIOPushDownRule;
+import org.apache.beam.sdk.extensions.sql.meta.Table;
+import org.apache.beam.sdk.extensions.sql.meta.provider.test.TestTableProvider.PushDownOptions;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.Row;
+import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.plan.RelOptRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.CalcMergeRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.FilterCalcMergeRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.FilterToCalcRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.ProjectCalcMergeRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.rules.ProjectToCalcRule;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.tools.RuleSet;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.tools.RuleSets;
+import org.joda.time.Duration;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class TestTableProviderWithProjectPushDown {
+ private static final Schema BASIC_SCHEMA =
+ Schema.builder()
+ .addInt32Field("unused1")
+ .addInt32Field("id")
+ .addStringField("name")
+ .addInt32Field("unused2")
+ .build();
+ private static final List<RelOptRule> rulesWithPushDown =
+ ImmutableList.of(
+ BeamCalcRule.INSTANCE,
+ FilterCalcMergeRule.INSTANCE,
+ ProjectCalcMergeRule.INSTANCE,
+ BeamIOPushDownRule.INSTANCE,
+ FilterToCalcRule.INSTANCE,
+ ProjectToCalcRule.INSTANCE,
+ CalcMergeRule.INSTANCE);
+ private BeamSqlEnv sqlEnv;
+
+ @Rule public TestPipeline pipeline = TestPipeline.create();
+
+ @Before
+ public void buildUp() {
+ TestTableProvider tableProvider = new TestTableProvider();
+ Table table = getTable("TEST", PushDownOptions.PROJECT);
+ tableProvider.createTable(table);
+ tableProvider.addRows(
+ table.getName(),
+ row(BASIC_SCHEMA, 100, 1, "one", 100),
+ row(BASIC_SCHEMA, 200, 2, "two", 200));
+
+ sqlEnv =
+ BeamSqlEnv.builder(tableProvider)
+ .setPipelineOptions(PipelineOptionsFactory.create())
+ .setRuleSets(new RuleSet[] {RuleSets.ofList(rulesWithPushDown)})
+ .build();
+ }
+
+ @Test
+ public void testIOSourceRel_withNoPredicate() {
+ String selectTableStatement = "SELECT id, name FROM TEST";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertEquals(
+ result.getSchema(), Schema.builder().addInt32Field("id").addStringField("name").build());
+ PAssert.that(result)
+ .containsInAnyOrder(row(result.getSchema(), 1, "one"), row(result.getSchema(), 2, "two"));
+ assertThat(beamRelNode, instanceOf(BeamIOSourceRel.class));
+ // If project push-down succeeds new BeamIOSourceRel should not output unused fields
+ assertThat(beamRelNode.getRowType().getFieldNames(), containsInAnyOrder("id", "name"));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_withNoPredicate_withRename() {
+ String selectTableStatement = "SELECT id as new_id, name as new_name FROM TEST";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertEquals(
+ result.getSchema(),
+ Schema.builder().addInt32Field("new_id").addStringField("new_name").build());
+ PAssert.that(result)
+ .containsInAnyOrder(row(result.getSchema(), 1, "one"), row(result.getSchema(), 2, "two"));
+ assertThat(beamRelNode, instanceOf(BeamIOSourceRel.class));
+ // If project push-down succeeds new BeamIOSourceRel should not output unused fields
+ assertThat(beamRelNode.getRowType().getFieldNames(), containsInAnyOrder("new_id", "new_name"));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_withPredicate() {
+ String selectTableStatement = "SELECT name FROM TEST where id=2";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertEquals(result.getSchema(), Schema.builder().addStringField("name").build());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "two"));
+ assertThat(beamRelNode.getInput(0), instanceOf(BeamIOSourceRel.class));
+ // When doing only project push-down, predicate should be preserved in a Calc and IO should
+ // project fields queried + fields used by the predicate
+ assertThat(
+ beamRelNode.getInput(0).getRowType().getFieldNames(), containsInAnyOrder("id", "name"));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_withPredicate_withRename() {
+ String selectTableStatement = "SELECT name as new_name FROM TEST where id=2";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertEquals(result.getSchema(), Schema.builder().addStringField("new_name").build());
+ PAssert.that(result).containsInAnyOrder(row(result.getSchema(), "two"));
+ assertThat(beamRelNode.getInput(0), instanceOf(BeamIOSourceRel.class));
+ // When doing only project push-down, predicate (and rename) should be preserved in a Calc
+ assertThat(beamRelNode.getRowType().getFieldNames(), containsInAnyOrder("new_name"));
+ // IO should project fields queried + fields used by the predicate
+ assertThat(
+ beamRelNode.getInput(0).getRowType().getFieldNames(), containsInAnyOrder("id", "name"));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ @Test
+ public void testIOSourceRel_AllFields() {
+ String selectTableStatement = "SELECT * FROM TEST";
+
+ BeamRelNode beamRelNode = sqlEnv.parseQuery(selectTableStatement);
+ PCollection<Row> result = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode);
+
+ assertEquals(result.getSchema(), BASIC_SCHEMA);
+ PAssert.that(result)
+ .containsInAnyOrder(
+ row(result.getSchema(), 100, 1, "one", 100),
+ row(result.getSchema(), 200, 2, "two", 200));
+ assertThat(beamRelNode, instanceOf(BeamIOSourceRel.class));
+ // If project push-down succeeds new BeamIOSourceRel should not output unused fields
+ assertThat(
+ beamRelNode.getRowType().getFieldNames(),
+ containsInAnyOrder("unused1", "id", "name", "unused2"));
+
+ pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
+ }
+
+ private static Row row(Schema schema, Object... objects) {
+ return Row.withSchema(schema).addValues(objects).build();
+ }
+
+ private static Table getTable(String name, PushDownOptions options) {
+ return Table.builder()
+ .name(name)
+ .comment(name + " table")
+ .schema(BASIC_SCHEMA)
+ .properties(
+ JSON.parseObject("{ " + PUSH_DOWN_OPTION + ": " + "\"" + options.toString() + "\" }"))
+ .type("test")
+ .build();
+ }
+}
diff --git a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/DateTimeUtils.java b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/DateTimeUtils.java
index 5f90efb..d561c84 100644
--- a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/DateTimeUtils.java
+++ b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/DateTimeUtils.java
@@ -23,6 +23,7 @@
import com.google.zetasql.Value;
import io.grpc.Status;
import java.util.List;
+import javax.annotation.Nullable;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.avatica.util.TimeUnit;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.util.DateString;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.util.TimeString;
@@ -222,7 +223,7 @@
* @return Unchanged timestamp sent for validation.
*/
@SuppressWarnings("GoodTime")
- public static Long validateTimestamp(Long ts) {
+ public static @Nullable Long validateTimestamp(@Nullable Long ts) {
if (ts == null) {
return null;
}
@@ -251,7 +252,7 @@
* @return Argument for the interval.
*/
@SuppressWarnings("GoodTime")
- public static Long validateTimeInterval(Long arg, TimeUnit unit) {
+ public static @Nullable Long validateTimeInterval(@Nullable Long arg, TimeUnit unit) {
if (arg == null) {
return null;
}
diff --git a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLCastFunctionImpl.java b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLCastFunctionImpl.java
index b0e8e56..afd2bf2 100644
--- a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLCastFunctionImpl.java
+++ b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLCastFunctionImpl.java
@@ -19,6 +19,7 @@
import static org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.adapter.enumerable.RexImpTable.createImplementor;
+import java.util.Collections;
import java.util.List;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.adapter.enumerable.CallImplementor;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.adapter.enumerable.NotNullImplementor;
@@ -28,7 +29,6 @@
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.linq4j.tree.Expression;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.linq4j.tree.Expressions;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexCall;
-import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.schema.Function;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.schema.FunctionParameter;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.schema.ImplementableFunction;
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.SqlIdentifier;
@@ -37,7 +37,7 @@
import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlUserDefinedFunction;
/** ZetaSQLCastFunctionImpl. */
-public class ZetaSQLCastFunctionImpl implements Function, ImplementableFunction {
+public class ZetaSQLCastFunctionImpl implements ImplementableFunction {
public static final SqlUserDefinedFunction ZETASQL_CAST_OP =
new SqlUserDefinedFunction(
new SqlIdentifier("CAST", SqlParserPos.ZERO),
@@ -54,7 +54,7 @@
@Override
public List<FunctionParameter> getParameters() {
- return null;
+ return Collections.emptyList();
}
private static class ZetaSQLCastCallNotNullImplementor implements NotNullImplementor {
diff --git a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLPlannerImpl.java b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLPlannerImpl.java
index 5afdcd4..83826e8 100644
--- a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLPlannerImpl.java
+++ b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLPlannerImpl.java
@@ -98,27 +98,35 @@
@Override
public SqlNode parse(String s) throws SqlParseException {
- return null;
+ throw new UnsupportedOperationException(
+ String.format("%s.parse(String) is not implemented", this.getClass().getCanonicalName()));
}
@Override
public SqlNode parse(Reader reader) throws SqlParseException {
- return null;
+ throw new UnsupportedOperationException(
+ String.format("%s.parse(Reader) is not implemented", this.getClass().getCanonicalName()));
}
@Override
public SqlNode validate(SqlNode sqlNode) throws ValidationException {
- return null;
+ throw new UnsupportedOperationException(
+ String.format(
+ "%s.validate(SqlNode) is not implemented", this.getClass().getCanonicalName()));
}
@Override
public Pair<SqlNode, RelDataType> validateAndGetType(SqlNode sqlNode) throws ValidationException {
- throw new RuntimeException("validateAndGetType(SqlNode) is not implemented.");
+ throw new UnsupportedOperationException(
+ String.format(
+ "%s.validateAndGetType(SqlNode) is not implemented",
+ this.getClass().getCanonicalName()));
}
@Override
public RelRoot rel(SqlNode sqlNode) throws RelConversionException {
- return null;
+ throw new UnsupportedOperationException(
+ String.format("%s.rel(SqlNode) is not implemented", this.getClass().getCanonicalName()));
}
public RelRoot rel(String sql, Map<String, Value> params) {
@@ -149,12 +157,14 @@
@Override
public RelNode convert(SqlNode sqlNode) {
- throw new RuntimeException("convert(SqlNode) is not implemented.");
+ throw new UnsupportedOperationException(
+ String.format("%s.convert(SqlNode) is not implemented.", getClass().getCanonicalName()));
}
@Override
public RelDataTypeFactory getTypeFactory() {
- throw new RuntimeException("getTypeFactory() is not implemented.");
+ throw new UnsupportedOperationException(
+ String.format("%s.getTypeFactor() is not implemented.", getClass().getCanonicalName()));
}
@Override
@@ -171,7 +181,8 @@
@Override
public void reset() {
- throw new RuntimeException("reset() is not implemented.");
+ throw new UnsupportedOperationException(
+ String.format("%s.reset() is not implemented", this.getClass().getCanonicalName()));
}
@Override
@@ -181,7 +192,9 @@
@Override
public RelTraitSet getEmptyTraitSet() {
- throw new RuntimeException("getEmptyTraitSet() is not implemented.");
+ throw new UnsupportedOperationException(
+ String.format(
+ "%s.getEmptyTraitSet() is not implemented", this.getClass().getCanonicalName()));
}
public static LanguageOptions getLanguageOptions() {
diff --git a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLQueryPlanner.java b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLQueryPlanner.java
index 3730857..d3874fc 100644
--- a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLQueryPlanner.java
+++ b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLQueryPlanner.java
@@ -71,7 +71,10 @@
@Override
public SqlNode parse(String sqlStatement) throws ParseException {
- return null;
+ throw new UnsupportedOperationException(
+ String.format(
+ "%s.parse(String) is not implemented and should never be called",
+ this.getClass().getCanonicalName()));
}
public BeamRelNode convertToBeamRel(String sqlStatement, Map<String, Value> queryParams)
@@ -129,11 +132,16 @@
final SqlOperatorTable opTab0 =
connection.config().fun(SqlOperatorTable.class, SqlStdOperatorTable.instance());
+ Object[] contexts =
+ org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableList.of(
+ connection.config(), TableResolutionContext.joinCompoundIds("datacatalog"))
+ .toArray();
+
return Frameworks.newConfigBuilder()
.parserConfig(parserConfig.build())
.defaultSchema(defaultSchema)
.traitDefs(traitDefs)
- .context(Contexts.of(connection.config()))
+ .context(Contexts.of(contexts))
.ruleSets(ruleSets)
.costFactory(null)
.typeSystem(connection.getTypeFactory().getTypeSystem())
diff --git a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/translation/ExpressionConverter.java b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/translation/ExpressionConverter.java
index 652aabd..8b5c81c 100644
--- a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/translation/ExpressionConverter.java
+++ b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/translation/ExpressionConverter.java
@@ -58,6 +58,7 @@
import java.util.Arrays;
import java.util.List;
import java.util.Map;
+import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.beam.sdk.annotations.Internal;
import org.apache.beam.sdk.extensions.sql.zetasql.SqlOperatorRewriter;
@@ -370,18 +371,28 @@
case RESOLVED_COLUMN_REF:
ResolvedColumnRef columnRef = (ResolvedColumnRef) expr;
// first look for column ref on the left side
- ret =
+ Optional<RexNode> colRexNode =
convertRexNodeFromResolvedColumnRefWithRefScan(
columnRef, refScanLeftColumnList, originalLeftColumnList, leftFieldList);
- // if not found there look on the right
- if (ret == null) {
- ret =
- convertRexNodeFromResolvedColumnRefWithRefScan(
- columnRef, refScanRightColumnList, originalRightColumnList, rightFieldList);
+ if (colRexNode.isPresent()) {
+ ret = colRexNode.get();
+ break;
}
- break;
+ // if not found there look on the right
+ colRexNode =
+ convertRexNodeFromResolvedColumnRefWithRefScan(
+ columnRef, refScanRightColumnList, originalRightColumnList, rightFieldList);
+ if (colRexNode.isPresent()) {
+ ret = colRexNode.get();
+ break;
+ }
+
+ throw new IllegalArgumentException(
+ String.format(
+ "Could not find column reference %s in %s or %s",
+ columnRef, refScanLeftColumnList, refScanRightColumnList));
case RESOLVED_FUNCTION_CALL:
// JOIN only support equal join.
ResolvedFunctionCall resolvedFunctionCall = (ResolvedFunctionCall) expr;
@@ -965,7 +976,7 @@
|| (fromType.equals(TYPE_TIMESTAMP) && toType.equals(TYPE_STRING));
}
- private RexNode convertRexNodeFromResolvedColumnRefWithRefScan(
+ private Optional<RexNode> convertRexNodeFromResolvedColumnRefWithRefScan(
ResolvedColumnRef columnRef,
List<ResolvedColumn> refScanColumnList,
List<ResolvedColumn> originalColumnList,
@@ -975,15 +986,16 @@
if (refScanColumnList.get(i).getId() == columnRef.getColumn().getId()) {
boolean nullable = fieldList.get(i).getType().isNullable();
int off = (int) originalColumnList.get(i).getId() - 1;
- return rexBuilder()
- .makeInputRef(
- TypeUtils.toSimpleRelDataType(
- columnRef.getType().getKind(), rexBuilder(), nullable),
- off);
+ return Optional.of(
+ rexBuilder()
+ .makeInputRef(
+ TypeUtils.toSimpleRelDataType(
+ columnRef.getType().getKind(), rexBuilder(), nullable),
+ off));
}
}
- return null;
+ return Optional.empty();
}
private RexNode convertResolvedParameter(ResolvedParameter parameter) {
diff --git a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/avro/AvroIOIT.java b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/avro/AvroIOIT.java
index 925a7c3..0b78688 100644
--- a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/avro/AvroIOIT.java
+++ b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/avro/AvroIOIT.java
@@ -19,7 +19,6 @@
import static org.apache.beam.sdk.io.FileIO.ReadMatches.DirectoryTreatment;
import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.appendTimestampSuffix;
-import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.getExpectedHashForLineCount;
import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.readFileBasedIOITPipelineOptions;
import com.google.cloud.Timestamp;
@@ -66,6 +65,8 @@
* ./gradlew integrationTest -p sdks/java/io/file-based-io-tests
* -DintegrationTestPipelineOptions='[
* "--numberOfRecords=100000",
+ * "--datasetSize=12345",
+ * "--expectedHash=99f23ab",
* "--filenamePrefix=output_file_path"
* ]'
* --tests org.apache.beam.sdk.io.avro.AvroIOIT
@@ -91,10 +92,12 @@
+ "}");
private static String filenamePrefix;
- private static Integer numberOfTextLines;
private static String bigQueryDataset;
private static String bigQueryTable;
private static final String AVRO_NAMESPACE = AvroIOIT.class.getName();
+ private static Integer numberOfTextLines;
+ private static Integer datasetSize;
+ private static String expectedHash;
@Rule public TestPipeline pipeline = TestPipeline.create();
@@ -102,10 +105,12 @@
public static void setup() {
FileBasedIOTestPipelineOptions options = readFileBasedIOITPipelineOptions();
- numberOfTextLines = options.getNumberOfRecords();
filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
bigQueryDataset = options.getBigQueryDataset();
bigQueryTable = options.getBigQueryTable();
+ datasetSize = options.getDatasetSize();
+ expectedHash = options.getExpectedHash();
+ numberOfTextLines = options.getNumberOfRecords();
}
@Test
@@ -141,7 +146,6 @@
.apply("Collect end time", ParDo.of(new TimeMonitor<>(AVRO_NAMESPACE, "endPoint")))
.apply("Parse Avro records to Strings", ParDo.of(new ParseAvroRecordsFn()))
.apply("Calculate hashcode", Combine.globally(new HashingFn()));
- String expectedHash = getExpectedHashForLineCount(numberOfTextLines);
PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);
testFilenames.apply(
@@ -191,7 +195,10 @@
double runTime = (readEnd - writeStart) / 1e3;
return NamedTestResult.create(uuid, timestamp, "run_time", runTime);
});
-
+ if (datasetSize != null) {
+ suppliers.add(
+ (reader) -> NamedTestResult.create(uuid, timestamp, "dataset_size", datasetSize));
+ }
return suppliers;
}
diff --git a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/common/FileBasedIOITHelper.java b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/common/FileBasedIOITHelper.java
index 788292f..bb5a47c 100644
--- a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/common/FileBasedIOITHelper.java
+++ b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/common/FileBasedIOITHelper.java
@@ -17,19 +17,15 @@
*/
package org.apache.beam.sdk.io.common;
-import static org.apache.beam.sdk.io.common.IOITHelper.getHashForRecordCount;
-
import java.io.IOException;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
-import java.util.Map;
import java.util.Set;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables;
/** Contains helper methods for file based IO Integration tests. */
@@ -45,17 +41,6 @@
return String.format("%s_%s", text, new Date().getTime());
}
- public static String getExpectedHashForLineCount(int lineCount) {
- Map<Integer, String> expectedHashes =
- ImmutableMap.of(
- 1000, "8604c70b43405ef9803cb49b77235ea2",
- 100_000, "4c8bb3b99dcc59459b20fefba400d446",
- 1_000_000, "9796db06e7a7960f974d5a91164afff1",
- 100_000_000, "6ce05f456e2fdc846ded2abd0ec1de95");
-
- return getHashForRecordCount(lineCount, expectedHashes);
- }
-
/** Constructs text lines in files used for testing. */
public static class DeterministicallyConstructTestTextLineFn extends DoFn<Long, String> {
diff --git a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/common/FileBasedIOTestPipelineOptions.java b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/common/FileBasedIOTestPipelineOptions.java
index 610a673..eca31d4 100644
--- a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/common/FileBasedIOTestPipelineOptions.java
+++ b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/common/FileBasedIOTestPipelineOptions.java
@@ -48,4 +48,16 @@
boolean getReportGcsPerformanceMetrics();
void setReportGcsPerformanceMetrics(boolean performanceMetrics);
+
+ @Validation.Required
+ @Description(
+ "Precomputed hashcode to assert IO test pipeline content identity after writing and reading back the dataset")
+ String getExpectedHash();
+
+ void setExpectedHash(String hash);
+
+ @Description("Size of data saved on the target filesystem (bytes)")
+ Integer getDatasetSize();
+
+ void setDatasetSize(Integer size);
}
diff --git a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/parquet/ParquetIOIT.java b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/parquet/ParquetIOIT.java
index 3ee675d..90af13c 100644
--- a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/parquet/ParquetIOIT.java
+++ b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/parquet/ParquetIOIT.java
@@ -18,7 +18,6 @@
package org.apache.beam.sdk.io.parquet;
import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.appendTimestampSuffix;
-import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.getExpectedHashForLineCount;
import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.readFileBasedIOITPipelineOptions;
import static org.apache.beam.sdk.values.TypeDescriptors.strings;
@@ -66,6 +65,8 @@
* ./gradlew integrationTest -p sdks/java/io/file-based-io-tests
* -DintegrationTestPipelineOptions='[
* "--numberOfRecords=100000",
+ * "--datasetSize=12345",
+ * "--expectedHash=99f23ab",
* "--filenamePrefix=output_file_path",
* ]'
* --tests org.apache.beam.sdk.io.parquet.ParquetIOIT
@@ -91,9 +92,11 @@
+ "}");
private static String filenamePrefix;
- private static Integer numberOfRecords;
private static String bigQueryDataset;
private static String bigQueryTable;
+ private static Integer numberOfTextLines;
+ private static Integer datasetSize;
+ private static String expectedHash;
@Rule public TestPipeline pipeline = TestPipeline.create();
private static final String PARQUET_NAMESPACE = ParquetIOIT.class.getName();
@@ -101,8 +104,9 @@
@BeforeClass
public static void setup() {
FileBasedIOTestPipelineOptions options = readFileBasedIOITPipelineOptions();
-
- numberOfRecords = options.getNumberOfRecords();
+ numberOfTextLines = options.getNumberOfRecords();
+ datasetSize = options.getDatasetSize();
+ expectedHash = options.getExpectedHash();
filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
bigQueryDataset = options.getBigQueryDataset();
bigQueryTable = options.getBigQueryTable();
@@ -112,7 +116,7 @@
public void writeThenReadAll() {
PCollection<String> testFiles =
pipeline
- .apply("Generate sequence", GenerateSequence.from(0).to(numberOfRecords))
+ .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
.apply(
"Produce text lines",
ParDo.of(new FileBasedIOITHelper.DeterministicallyConstructTestTextLineFn()))
@@ -148,7 +152,6 @@
record -> String.valueOf(record.get("row"))))
.apply("Calculate hashcode", Combine.globally(new HashingFn()));
- String expectedHash = getExpectedHashForLineCount(numberOfRecords);
PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);
testFiles.apply(
@@ -196,7 +199,10 @@
double runTime = (readEnd - writeStart) / 1e3;
return NamedTestResult.create(uuid, timestamp, "run_time", runTime);
});
-
+ if (datasetSize != null) {
+ metricSuppliers.add(
+ (ignored) -> NamedTestResult.create(uuid, timestamp, "dataset_size", datasetSize));
+ }
return metricSuppliers;
}
diff --git a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java
index 33c13d9..5625a28 100644
--- a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java
+++ b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java
@@ -19,7 +19,6 @@
import static org.apache.beam.sdk.io.FileIO.ReadMatches.DirectoryTreatment;
import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.appendTimestampSuffix;
-import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.getExpectedHashForLineCount;
import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.readFileBasedIOITPipelineOptions;
import com.google.cloud.Timestamp;
@@ -64,6 +63,8 @@
* ./gradlew integrationTest -p sdks/java/io/file-based-io-tests
* -DintegrationTestPipelineOptions='[
* "--numberOfRecords=100000",
+ * "--datasetSize=12345",
+ * "--expectedHash=99f23ab",
* "--filenamePrefix=output_file_path",
* "--compressionType=GZIP"
* ]'
@@ -80,6 +81,8 @@
private static String filenamePrefix;
private static Integer numberOfTextLines;
+ private static Integer datasetSize;
+ private static String expectedHash;
private static Compression compressionType;
private static Integer numShards;
private static String bigQueryDataset;
@@ -92,10 +95,11 @@
@BeforeClass
public static void setup() {
FileBasedIOTestPipelineOptions options = readFileBasedIOITPipelineOptions();
-
+ datasetSize = options.getDatasetSize();
+ expectedHash = options.getExpectedHash();
numberOfTextLines = options.getNumberOfRecords();
- filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
compressionType = Compression.valueOf(options.getCompressionType());
+ filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
numShards = options.getNumberOfShards();
bigQueryDataset = options.getBigQueryDataset();
bigQueryTable = options.getBigQueryTable();
@@ -137,7 +141,6 @@
"Collect read end time", ParDo.of(new TimeMonitor<>(FILEIOIT_NAMESPACE, "endTime")))
.apply("Calculate hashcode", Combine.globally(new HashingFn()));
- String expectedHash = getExpectedHashForLineCount(numberOfTextLines);
PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);
testFilenames.apply(
@@ -189,7 +192,10 @@
double runTime = (readEndTime - writeStartTime) / 1e3;
return NamedTestResult.create(uuid, timestamp, "run_time", runTime);
});
-
+ if (datasetSize != null) {
+ metricSuppliers.add(
+ (ignored) -> NamedTestResult.create(uuid, timestamp, "dataset_size", datasetSize));
+ }
if (gatherGcsPerformanceMetrics) {
metricSuppliers.add(
reader -> {
diff --git a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/tfrecord/TFRecordIOIT.java b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/tfrecord/TFRecordIOIT.java
index fbd1af4..59ac591 100644
--- a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/tfrecord/TFRecordIOIT.java
+++ b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/tfrecord/TFRecordIOIT.java
@@ -19,7 +19,6 @@
import static org.apache.beam.sdk.io.Compression.AUTO;
import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.appendTimestampSuffix;
-import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.getExpectedHashForLineCount;
import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.readFileBasedIOITPipelineOptions;
import com.google.cloud.Timestamp;
@@ -65,6 +64,8 @@
* ./gradlew integrationTest -p sdks/java/io/file-based-io-tests
* -DintegrationTestPipelineOptions='[
* "--numberOfRecords=100000",
+ * "--datasetSize=12345",
+ * "--expectedHash=99f23ab",
* "--filenamePrefix=output_file_path",
* "--compressionType=GZIP"
* ]'
@@ -80,10 +81,12 @@
private static final String TFRECORD_NAMESPACE = TFRecordIOIT.class.getName();
private static String filenamePrefix;
- private static Integer numberOfTextLines;
- private static Compression compressionType;
private static String bigQueryDataset;
private static String bigQueryTable;
+ private static Integer numberOfTextLines;
+ private static Integer datasetSize;
+ private static String expectedHash;
+ private static Compression compressionType;
@Rule public TestPipeline writePipeline = TestPipeline.create();
@@ -92,10 +95,11 @@
@BeforeClass
public static void setup() {
FileBasedIOTestPipelineOptions options = readFileBasedIOITPipelineOptions();
-
+ datasetSize = options.getDatasetSize();
+ expectedHash = options.getExpectedHash();
numberOfTextLines = options.getNumberOfRecords();
- filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
compressionType = Compression.valueOf(options.getCompressionType());
+ filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
bigQueryDataset = options.getBigQueryDataset();
bigQueryTable = options.getBigQueryTable();
}
@@ -137,7 +141,6 @@
.apply("Calculate hashcode", Combine.globally(new HashingFn()))
.apply(Reshuffle.viaRandomKey());
- String expectedHash = getExpectedHashForLineCount(numberOfTextLines);
PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);
readPipeline
@@ -187,7 +190,10 @@
double runTime = (readEnd - writeStart) / 1e3;
return NamedTestResult.create(uuid, timestamp, "run_time", runTime);
});
-
+ if (datasetSize != null) {
+ suppliers.add(
+ (ignored) -> NamedTestResult.create(uuid, timestamp, "dataset_size", datasetSize));
+ }
return suppliers;
}
diff --git a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/xml/XmlIOIT.java b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/xml/XmlIOIT.java
index 3ce31fa..5d8163e 100644
--- a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/xml/XmlIOIT.java
+++ b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/xml/XmlIOIT.java
@@ -18,7 +18,6 @@
package org.apache.beam.sdk.io.xml;
import static org.apache.beam.sdk.io.common.FileBasedIOITHelper.appendTimestampSuffix;
-import static org.apache.beam.sdk.io.common.IOITHelper.getHashForRecordCount;
import static org.apache.beam.sdk.io.common.IOITHelper.readIOTestPipelineOptions;
import com.google.cloud.Timestamp;
@@ -52,7 +51,6 @@
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
@@ -68,6 +66,8 @@
* ./gradlew integrationTest -p sdks/java/io/file-based-io-tests
* -DintegrationTestPipelineOptions='[
* "--numberOfRecords=100000",
+ * "--datasetSize=12345",
+ * "--expectedHash=99f23ab",
* "--filenamePrefix=output_file_path",
* "--charset=UTF-8",
* ]'
@@ -90,17 +90,12 @@
void setCharset(String charset);
}
- private static final ImmutableMap<Integer, String> EXPECTED_HASHES =
- ImmutableMap.of(
- 1000, "7f51adaf701441ee83459a3f705c1b86",
- 100_000, "af7775de90d0b0c8bbc36273fbca26fe",
- 100_000_000, "bfee52b33aa1552b9c1bfa8bcc41ae80");
-
- private static Integer numberOfRecords;
-
private static String filenamePrefix;
private static String bigQueryDataset;
private static String bigQueryTable;
+ private static Integer numberOfTextLines;
+ private static Integer datasetSize;
+ private static String expectedHash;
private static final String XMLIOIT_NAMESPACE = XmlIOIT.class.getName();
@@ -111,19 +106,20 @@
@BeforeClass
public static void setup() {
XmlIOITPipelineOptions options = readIOTestPipelineOptions(XmlIOITPipelineOptions.class);
-
filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
- numberOfRecords = options.getNumberOfRecords();
charset = Charset.forName(options.getCharset());
bigQueryDataset = options.getBigQueryDataset();
bigQueryTable = options.getBigQueryTable();
+ datasetSize = options.getDatasetSize();
+ expectedHash = options.getExpectedHash();
+ numberOfTextLines = options.getNumberOfRecords();
}
@Test
public void writeThenReadAll() {
PCollection<String> testFileNames =
pipeline
- .apply("Generate sequence", GenerateSequence.from(0).to(numberOfRecords))
+ .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
.apply("Create xml records", MapElements.via(new LongToBird()))
.apply(
"Gather write start time",
@@ -162,7 +158,6 @@
.apply("Map xml records to strings", MapElements.via(new BirdToString()))
.apply("Calculate hashcode", Combine.globally(new HashingFn()));
- String expectedHash = getHashForRecordCount(numberOfRecords, EXPECTED_HASHES);
PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);
testFileNames.apply(
@@ -211,6 +206,10 @@
double runTime = (readEnd - writeStart) / 1e3;
return NamedTestResult.create(uuid, timestamp, "run_time", runTime);
});
+ if (datasetSize != null) {
+ suppliers.add(
+ (ignored) -> NamedTestResult.create(uuid, timestamp, "dataset_size", datasetSize));
+ }
return suppliers;
}
diff --git a/sdks/java/io/google-cloud-platform/build.gradle b/sdks/java/io/google-cloud-platform/build.gradle
index 0a9b8a9..0c1befd 100644
--- a/sdks/java/io/google-cloud-platform/build.gradle
+++ b/sdks/java/io/google-cloud-platform/build.gradle
@@ -69,6 +69,10 @@
testCompile project(path: ":sdks:java:core", configuration: "shadowTest")
testCompile project(path: ":sdks:java:extensions:google-cloud-platform-core", configuration: "testRuntime")
testCompile project(path: ":runners:direct-java", configuration: "shadow")
+ testCompile project(path: ":sdks:java:io:common", configuration: "testRuntime")
+ testCompile project(path: ":sdks:java:testing:test-utils", configuration: "testRuntime")
+ // For testing Cross-language transforms
+ testCompile project(":runners:core-construction-java")
testCompile library.java.hamcrest_core
testCompile library.java.hamcrest_library
testCompile library.java.junit
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/DynamicDestinationsHelpers.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/DynamicDestinationsHelpers.java
index b58d18d..a484a42 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/DynamicDestinationsHelpers.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/DynamicDestinationsHelpers.java
@@ -385,7 +385,8 @@
return new TableDestination(
wrappedDestination.getTableSpec(),
existingTable.getDescription(),
- existingTable.getTimePartitioning());
+ existingTable.getTimePartitioning(),
+ existingTable.getClustering());
}
}
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TestBigQuery.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TestBigQuery.java
index 4b4a97e..044ba5f 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TestBigQuery.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TestBigQuery.java
@@ -171,7 +171,7 @@
}
if (description.getMethodName() != null) {
- topicName.append(description.getMethodName()).append("_");
+ topicName.append(description.getMethodName().replaceAll("[\\[\\]\\.]", "_")).append("_");
}
DATETIME_FORMAT.printTo(topicName, Instant.now());
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java
index 4f745ea..3212b3b 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java
@@ -20,7 +20,10 @@
import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState;
import com.google.api.client.util.Clock;
+import com.google.auto.service.AutoService;
import com.google.auto.value.AutoValue;
+import com.google.protobuf.ByteString;
+import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.Message;
import java.io.IOException;
import java.io.Serializable;
@@ -39,9 +42,11 @@
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.annotations.Experimental.Kind;
import org.apache.beam.sdk.coders.AvroCoder;
+import org.apache.beam.sdk.coders.ByteArrayCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.expansion.ExternalTransformRegistrar;
import org.apache.beam.sdk.extensions.protobuf.ProtoCoder;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.OutgoingMessage;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.ProjectPath;
@@ -53,6 +58,7 @@
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.utils.AvroUtils;
import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.ExternalTransformBuilder;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
@@ -705,7 +711,8 @@
abstract Builder<T> toBuilder();
@AutoValue.Builder
- abstract static class Builder<T> {
+ abstract static class Builder<T>
+ implements ExternalTransformBuilder<External.Configuration, PBegin, PCollection<T>> {
abstract Builder<T> setTopicProvider(ValueProvider<PubsubTopic> topic);
abstract Builder<T> setPubsubClientFactory(PubsubClient.PubsubClientFactory clientFactory);
@@ -733,6 +740,85 @@
abstract Builder<T> setClock(@Nullable Clock clock);
abstract Read<T> build();
+
+ @Override
+ public PTransform<PBegin, PCollection<T>> buildExternal(External.Configuration config) {
+ if (config.topic != null) {
+ StaticValueProvider<String> topic = StaticValueProvider.of(config.topic);
+ setTopicProvider(NestedValueProvider.of(topic, new TopicTranslator()));
+ }
+ if (config.subscription != null) {
+ StaticValueProvider<String> subscription = StaticValueProvider.of(config.subscription);
+ setSubscriptionProvider(
+ NestedValueProvider.of(subscription, new SubscriptionTranslator()));
+ }
+ if (config.idAttribute != null) {
+ setIdAttribute(config.idAttribute);
+ }
+ if (config.timestampAttribute != null) {
+ setTimestampAttribute(config.timestampAttribute);
+ }
+ setPubsubClientFactory(FACTORY);
+ setNeedsAttributes(config.needsAttributes);
+ Coder coder = ByteArrayCoder.of();
+ if (config.needsAttributes) {
+ SimpleFunction<PubsubMessage, T> parseFn =
+ (SimpleFunction<PubsubMessage, T>) new ParsePayloadAsPubsubMessageProto();
+ setParseFn(parseFn);
+ setCoder(coder);
+ } else {
+ setParseFn(new ParsePayloadUsingCoder<>(coder));
+ setCoder(coder);
+ }
+ setNeedsMessageId(false);
+ return build();
+ }
+ }
+
+ /** Exposes {@link PubsubIO.Read} as an external transform for cross-language usage. */
+ @Experimental
+ @AutoService(ExternalTransformRegistrar.class)
+ public static class External implements ExternalTransformRegistrar {
+
+ public static final String URN = "beam:external:java:pubsub:read:v1";
+
+ @Override
+ public Map<String, Class<? extends ExternalTransformBuilder>> knownBuilders() {
+ return ImmutableMap.of(URN, AutoValue_PubsubIO_Read.Builder.class);
+ }
+
+ /** Parameters class to expose the transform to an external SDK. */
+ public static class Configuration {
+
+ // All byte arrays are UTF-8 encoded strings
+ @Nullable private String topic;
+ @Nullable private String subscription;
+ @Nullable private String idAttribute;
+ @Nullable private String timestampAttribute;
+ private boolean needsAttributes;
+
+ public void setTopic(@Nullable String topic) {
+ this.topic = topic;
+ }
+
+ public void setSubscription(@Nullable String subscription) {
+ this.subscription = subscription;
+ }
+
+ public void setIdLabel(@Nullable String idAttribute) {
+ this.idAttribute = idAttribute;
+ }
+
+ public void setTimestampAttribute(@Nullable String timestampAttribute) {
+ this.timestampAttribute = timestampAttribute;
+ }
+
+ public void setWithAttributes(Boolean needsAttributes) {
+ // we must use Boolean instead of boolean because the external payload system
+ // inspects the native type of each coder urn, and BooleanCoder wants Boolean.
+ this.needsAttributes = needsAttributes;
+ }
+ }
}
/**
@@ -955,7 +1041,8 @@
abstract Builder<T> toBuilder();
@AutoValue.Builder
- abstract static class Builder<T> {
+ abstract static class Builder<T>
+ implements ExternalTransformBuilder<External.Configuration, PCollection<T>, PDone> {
abstract Builder<T> setTopicProvider(ValueProvider<PubsubTopic> topicProvider);
abstract Builder<T> setPubsubClientFactory(PubsubClient.PubsubClientFactory factory);
@@ -971,6 +1058,58 @@
abstract Builder<T> setFormatFn(SimpleFunction<T, PubsubMessage> formatFn);
abstract Write<T> build();
+
+ @Override
+ public PTransform<PCollection<T>, PDone> buildExternal(External.Configuration config) {
+ if (config.topic != null) {
+ StaticValueProvider<String> topic = StaticValueProvider.of(config.topic);
+ setTopicProvider(NestedValueProvider.of(topic, new TopicTranslator()));
+ }
+ if (config.idAttribute != null) {
+ setIdAttribute(config.idAttribute);
+ }
+ if (config.timestampAttribute != null) {
+ setTimestampAttribute(config.timestampAttribute);
+ }
+ SimpleFunction<T, PubsubMessage> parseFn =
+ (SimpleFunction<T, PubsubMessage>) new FormatPayloadFromPubsubMessageProto();
+ setFormatFn(parseFn);
+ return build();
+ }
+ }
+
+ /** Exposes {@link PubsubIO.Write} as an external transform for cross-language usage. */
+ @Experimental
+ @AutoService(ExternalTransformRegistrar.class)
+ public static class External implements ExternalTransformRegistrar {
+
+ public static final String URN = "beam:external:java:pubsub:write:v1";
+
+ @Override
+ public Map<String, Class<? extends ExternalTransformBuilder>> knownBuilders() {
+ return ImmutableMap.of(URN, AutoValue_PubsubIO_Write.Builder.class);
+ }
+
+ /** Parameters class to expose the transform to an external SDK. */
+ public static class Configuration {
+
+ // All byte arrays are UTF-8 encoded strings
+ private String topic;
+ @Nullable private String idAttribute;
+ @Nullable private String timestampAttribute;
+
+ public void setTopic(String topic) {
+ this.topic = topic;
+ }
+
+ public void setIdLabel(@Nullable String idAttribute) {
+ this.idAttribute = idAttribute;
+ }
+
+ public void setTimestampAttribute(@Nullable String timestampAttribute) {
+ this.timestampAttribute = timestampAttribute;
+ }
+ }
}
/**
@@ -1213,6 +1352,22 @@
}
}
+ private static class ParsePayloadAsPubsubMessageProto
+ extends SimpleFunction<PubsubMessage, byte[]> {
+ @Override
+ public byte[] apply(PubsubMessage input) {
+ Map<String, String> attributes = input.getAttributeMap();
+ com.google.pubsub.v1.PubsubMessage.Builder message =
+ com.google.pubsub.v1.PubsubMessage.newBuilder()
+ .setData(ByteString.copyFrom(input.getPayload()));
+ // TODO(BEAM-8085) this should not be null
+ if (attributes != null) {
+ message.putAllAttributes(attributes);
+ }
+ return message.build().toByteArray();
+ }
+ }
+
private static class FormatPayloadAsUtf8 extends SimpleFunction<String, PubsubMessage> {
@Override
public PubsubMessage apply(String input) {
@@ -1237,6 +1392,20 @@
}
}
+ private static class FormatPayloadFromPubsubMessageProto
+ extends SimpleFunction<byte[], PubsubMessage> {
+ @Override
+ public PubsubMessage apply(byte[] input) {
+ try {
+ com.google.pubsub.v1.PubsubMessage message =
+ com.google.pubsub.v1.PubsubMessage.parseFrom(input);
+ return new PubsubMessage(message.getData().toByteArray(), message.getAttributesMap());
+ } catch (InvalidProtocolBufferException e) {
+ throw new RuntimeException("Could not decode Pubsub message", e);
+ }
+ }
+ }
+
private static class IdentityMessageFn extends SimpleFunction<PubsubMessage, PubsubMessage> {
@Override
public PubsubMessage apply(PubsubMessage input) {
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java
index 11cb0d6..136b1d2 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java
@@ -169,7 +169,7 @@
@Nullable Map<String, String> attributes = pubsubMessage.getAttributes();
// Payload.
- byte[] elementBytes = pubsubMessage.decodeData();
+ byte[] elementBytes = pubsubMessage.getData() == null ? null : pubsubMessage.decodeData();
if (elementBytes == null) {
elementBytes = new byte[0];
}
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOExternalTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOExternalTest.java
new file mode 100644
index 0000000..50f7528
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOExternalTest.java
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.gcp.pubsub;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import javax.annotation.Nullable;
+import org.apache.beam.model.expansion.v1.ExpansionApi;
+import org.apache.beam.model.pipeline.v1.ExternalTransforms;
+import org.apache.beam.model.pipeline.v1.RunnerApi;
+import org.apache.beam.runners.core.construction.ParDoTranslation;
+import org.apache.beam.runners.core.construction.PipelineTranslation;
+import org.apache.beam.runners.core.construction.ReadTranslation;
+import org.apache.beam.runners.core.construction.expansion.ExpansionService;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.coders.BooleanCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.options.ValueProvider;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.vendor.grpc.v1p21p0.com.google.protobuf.ByteString;
+import org.apache.beam.vendor.grpc.v1p21p0.io.grpc.stub.StreamObserver;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables;
+import org.hamcrest.Matchers;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.powermock.reflect.Whitebox;
+
+/** Tests for building {@link PubsubIO} externally via the ExpansionService. */
+@RunWith(JUnit4.class)
+public class PubsubIOExternalTest {
+ @Test
+ public void testConstructPubsubRead() throws Exception {
+ String topic = "projects/project-1234/topics/topic_name";
+ String idAttribute = "id_foo";
+ Boolean needsAttributes = true;
+
+ ExternalTransforms.ExternalConfigurationPayload payload =
+ ExternalTransforms.ExternalConfigurationPayload.newBuilder()
+ .putConfiguration(
+ "topic",
+ ExternalTransforms.ConfigValue.newBuilder()
+ .addCoderUrn("beam:coder:string_utf8:v1")
+ .setPayload(ByteString.copyFrom(encodeString(topic)))
+ .build())
+ .putConfiguration(
+ "id_label",
+ ExternalTransforms.ConfigValue.newBuilder()
+ .addCoderUrn("beam:coder:string_utf8:v1")
+ .setPayload(ByteString.copyFrom(encodeString(idAttribute)))
+ .build())
+ .putConfiguration(
+ "with_attributes",
+ ExternalTransforms.ConfigValue.newBuilder()
+ .addCoderUrn("beam:coder:bool:v1")
+ .setPayload(ByteString.copyFrom(encodeBoolean(needsAttributes)))
+ .build())
+ .build();
+
+ RunnerApi.Components defaultInstance = RunnerApi.Components.getDefaultInstance();
+ ExpansionApi.ExpansionRequest request =
+ ExpansionApi.ExpansionRequest.newBuilder()
+ .setComponents(defaultInstance)
+ .setTransform(
+ RunnerApi.PTransform.newBuilder()
+ .setUniqueName("test")
+ .setSpec(
+ RunnerApi.FunctionSpec.newBuilder()
+ .setUrn("beam:external:java:pubsub:read:v1")
+ .setPayload(payload.toByteString())))
+ .setNamespace("test_namespace")
+ .build();
+
+ ExpansionService expansionService = new ExpansionService();
+ TestStreamObserver<ExpansionApi.ExpansionResponse> observer = new TestStreamObserver<>();
+ expansionService.expand(request, observer);
+
+ ExpansionApi.ExpansionResponse result = observer.result;
+ RunnerApi.PTransform transform = result.getTransform();
+ assertThat(
+ transform.getSubtransformsList(),
+ Matchers.contains(
+ "test_namespacetest/PubsubUnboundedSource", "test_namespacetest/MapElements"));
+ assertThat(transform.getInputsCount(), Matchers.is(0));
+ assertThat(transform.getOutputsCount(), Matchers.is(1));
+
+ RunnerApi.PTransform pubsubComposite =
+ result.getComponents().getTransformsOrThrow(transform.getSubtransforms(0));
+ RunnerApi.PTransform pubsubRead =
+ result.getComponents().getTransformsOrThrow(pubsubComposite.getSubtransforms(0));
+ RunnerApi.ReadPayload readPayload =
+ RunnerApi.ReadPayload.parseFrom(pubsubRead.getSpec().getPayload());
+ PubsubUnboundedSource.PubsubSource source =
+ (PubsubUnboundedSource.PubsubSource) ReadTranslation.unboundedSourceFromProto(readPayload);
+ PubsubUnboundedSource spec = source.outer;
+
+ assertThat(
+ spec.getTopicProvider() == null ? null : String.valueOf(spec.getTopicProvider()),
+ Matchers.is(topic));
+ assertThat(spec.getIdAttribute(), Matchers.is(idAttribute));
+ assertThat(spec.getNeedsAttributes(), Matchers.is(true));
+ }
+
+ @Test
+ public void testConstructPubsubWrite() throws Exception {
+ String topic = "projects/project-1234/topics/topic_name";
+ String idAttribute = "id_foo";
+
+ ExternalTransforms.ExternalConfigurationPayload payload =
+ ExternalTransforms.ExternalConfigurationPayload.newBuilder()
+ .putConfiguration(
+ "topic",
+ ExternalTransforms.ConfigValue.newBuilder()
+ .addCoderUrn("beam:coder:string_utf8:v1")
+ .setPayload(ByteString.copyFrom(encodeString(topic)))
+ .build())
+ .putConfiguration(
+ "id_label",
+ ExternalTransforms.ConfigValue.newBuilder()
+ .addCoderUrn("beam:coder:string_utf8:v1")
+ .setPayload(ByteString.copyFrom(encodeString(idAttribute)))
+ .build())
+ .build();
+
+ Pipeline p = Pipeline.create();
+ p.apply("unbounded", Create.of(1, 2, 3)).setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
+
+ RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p);
+ String inputPCollection =
+ Iterables.getOnlyElement(
+ Iterables.getLast(pipelineProto.getComponents().getTransformsMap().values())
+ .getOutputsMap()
+ .values());
+
+ ExpansionApi.ExpansionRequest request =
+ ExpansionApi.ExpansionRequest.newBuilder()
+ .setComponents(pipelineProto.getComponents())
+ .setTransform(
+ RunnerApi.PTransform.newBuilder()
+ .setUniqueName("test")
+ .putInputs("input", inputPCollection)
+ .setSpec(
+ RunnerApi.FunctionSpec.newBuilder()
+ .setUrn("beam:external:java:pubsub:write:v1")
+ .setPayload(payload.toByteString())))
+ .setNamespace("test_namespace")
+ .build();
+
+ ExpansionService expansionService = new ExpansionService();
+ TestStreamObserver<ExpansionApi.ExpansionResponse> observer = new TestStreamObserver<>();
+ expansionService.expand(request, observer);
+
+ ExpansionApi.ExpansionResponse result = observer.result;
+
+ RunnerApi.PTransform transform = result.getTransform();
+ assertThat(
+ transform.getSubtransformsList(),
+ Matchers.contains(
+ "test_namespacetest/MapElements", "test_namespacetest/PubsubUnboundedSink"));
+ assertThat(transform.getInputsCount(), Matchers.is(1));
+ assertThat(transform.getOutputsCount(), Matchers.is(0));
+
+ // test_namespacetest/PubsubUnboundedSink
+ RunnerApi.PTransform writeComposite =
+ result.getComponents().getTransformsOrThrow(transform.getSubtransforms(1));
+
+ // test_namespacetest/PubsubUnboundedSink/PubsubUnboundedSink.Writer
+ RunnerApi.PTransform writeComposite2 =
+ result.getComponents().getTransformsOrThrow(writeComposite.getSubtransforms(3));
+
+ // test_namespacetest/PubsubUnboundedSink/PubsubUnboundedSink.Writer/ParMultiDo(Writer)
+ RunnerApi.PTransform writeParDo =
+ result.getComponents().getTransformsOrThrow(writeComposite2.getSubtransforms(0));
+
+ RunnerApi.ParDoPayload parDoPayload =
+ RunnerApi.ParDoPayload.parseFrom(writeParDo.getSpec().getPayload());
+ DoFn pubsubWriter = ParDoTranslation.getDoFn(parDoPayload);
+
+ String idAttributeActual = (String) Whitebox.getInternalState(pubsubWriter, "idAttribute");
+
+ ValueProvider<PubsubClient.TopicPath> topicActual =
+ (ValueProvider<PubsubClient.TopicPath>) Whitebox.getInternalState(pubsubWriter, "topic");
+
+ assertThat(topicActual == null ? null : String.valueOf(topicActual), Matchers.is(topic));
+ assertThat(idAttributeActual, Matchers.is(idAttribute));
+ }
+
+ private static byte[] encodeString(String str) throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ StringUtf8Coder.of().encode(str, baos);
+ return baos.toByteArray();
+ }
+
+ private static byte[] encodeBoolean(Boolean value) throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ BooleanCoder.of().encode(value, baos);
+ return baos.toByteArray();
+ }
+
+ private static @Nullable String getTopic(@Nullable ValueProvider<PubsubIO.PubsubTopic> value) {
+ if (value == null) {
+ return null;
+ }
+ return String.valueOf(value);
+ }
+
+ private static class TestStreamObserver<T> implements StreamObserver<T> {
+
+ private T result;
+
+ @Override
+ public void onNext(T t) {
+ result = t;
+ }
+
+ @Override
+ public void onError(Throwable throwable) {
+ throw new RuntimeException("Should not happen", throwable);
+ }
+
+ @Override
+ public void onCompleted() {}
+ }
+}
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIOReadTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIOReadTest.java
index 59b60ea..5977c2e 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIOReadTest.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIOReadTest.java
@@ -17,11 +17,8 @@
*/
package org.apache.beam.sdk.io.gcp.spanner;
-import static org.junit.Assert.assertThat;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.eq;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import com.google.cloud.Timestamp;
@@ -45,12 +42,8 @@
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
-import org.apache.beam.sdk.transforms.DoFnTester;
-import org.apache.beam.sdk.transforms.View;
-import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
-import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
@@ -94,108 +87,130 @@
@Test
public void runQuery() throws Exception {
- SpannerIO.Read read =
- SpannerIO.read()
+ Timestamp timestamp = Timestamp.ofTimeMicroseconds(12345);
+ TimestampBound timestampBound = TimestampBound.ofReadTimestamp(timestamp);
+
+ SpannerConfig spannerConfig =
+ SpannerConfig.create()
.withProjectId("test")
.withInstanceId("123")
.withDatabaseId("aaa")
- .withQuery("SELECT * FROM users")
.withServiceFactory(serviceFactory);
- List<Partition> fakePartitions =
- Arrays.asList(mock(Partition.class), mock(Partition.class), mock(Partition.class));
+ PCollection<Struct> one =
+ pipeline.apply(
+ "read q",
+ SpannerIO.read()
+ .withSpannerConfig(spannerConfig)
+ .withQuery("SELECT * FROM users")
+ .withTimestampBound(timestampBound));
- BatchTransactionId id = mock(BatchTransactionId.class);
- Transaction tx = Transaction.create(id);
- PCollectionView<Transaction> txView =
- pipeline.apply(Create.of(tx)).apply(View.<Transaction>asSingleton());
+ FakeBatchTransactionId id = new FakeBatchTransactionId("runQueryTest");
+ when(mockBatchTx.getBatchTransactionId()).thenReturn(id);
- BatchSpannerRead.GeneratePartitionsFn fn =
- new BatchSpannerRead.GeneratePartitionsFn(read.getSpannerConfig(), txView);
- DoFnTester<ReadOperation, Partition> fnTester = DoFnTester.of(fn);
- fnTester.setSideInput(txView, GlobalWindow.INSTANCE, tx);
+ when(serviceFactory.mockBatchClient().batchReadOnlyTransaction(timestampBound))
+ .thenReturn(mockBatchTx);
+ when(serviceFactory.mockBatchClient().batchReadOnlyTransaction(any(BatchTransactionId.class)))
+ .thenReturn(mockBatchTx);
- when(serviceFactory.mockBatchClient().batchReadOnlyTransaction(id)).thenReturn(mockBatchTx);
- when(mockBatchTx.partitionQuery(any(PartitionOptions.class), any(Statement.class)))
- .thenReturn(fakePartitions);
+ Partition fakePartition =
+ FakePartitionFactory.createFakeQueryPartition(ByteString.copyFromUtf8("one"));
- List<Partition> result = fnTester.processBundle(read.getReadOperation());
- assertThat(result, Matchers.containsInAnyOrder(fakePartitions.toArray()));
+ when(mockBatchTx.partitionQuery(
+ any(PartitionOptions.class), eq(Statement.of("SELECT * FROM users"))))
+ .thenReturn(Arrays.asList(fakePartition, fakePartition));
+ when(mockBatchTx.execute(any(Partition.class)))
+ .thenReturn(
+ ResultSets.forRows(FAKE_TYPE, FAKE_ROWS.subList(0, 2)),
+ ResultSets.forRows(FAKE_TYPE, FAKE_ROWS.subList(2, 6)));
- verify(serviceFactory.mockBatchClient()).batchReadOnlyTransaction(id);
- verify(mockBatchTx)
- .partitionQuery(any(PartitionOptions.class), eq(Statement.of("SELECT * " + "FROM users")));
+ PAssert.that(one).containsInAnyOrder(FAKE_ROWS);
+
+ pipeline.run();
}
@Test
public void runRead() throws Exception {
- SpannerIO.Read read =
- SpannerIO.read()
+ Timestamp timestamp = Timestamp.ofTimeMicroseconds(12345);
+ TimestampBound timestampBound = TimestampBound.ofReadTimestamp(timestamp);
+
+ SpannerConfig spannerConfig =
+ SpannerConfig.create()
.withProjectId("test")
.withInstanceId("123")
.withDatabaseId("aaa")
- .withTable("users")
- .withColumns("id", "name")
.withServiceFactory(serviceFactory);
- List<Partition> fakePartitions =
- Arrays.asList(mock(Partition.class), mock(Partition.class), mock(Partition.class));
+ PCollection<Struct> one =
+ pipeline.apply(
+ "read q",
+ SpannerIO.read()
+ .withSpannerConfig(spannerConfig)
+ .withTable("users")
+ .withColumns("id", "name")
+ .withTimestampBound(timestampBound));
- BatchTransactionId id = mock(BatchTransactionId.class);
- Transaction tx = Transaction.create(id);
- PCollectionView<Transaction> txView =
- pipeline.apply(Create.of(tx)).apply(View.<Transaction>asSingleton());
+ FakeBatchTransactionId id = new FakeBatchTransactionId("runReadTest");
+ when(mockBatchTx.getBatchTransactionId()).thenReturn(id);
- BatchSpannerRead.GeneratePartitionsFn fn =
- new BatchSpannerRead.GeneratePartitionsFn(read.getSpannerConfig(), txView);
- DoFnTester<ReadOperation, Partition> fnTester = DoFnTester.of(fn);
- fnTester.setSideInput(txView, GlobalWindow.INSTANCE, tx);
+ when(serviceFactory.mockBatchClient().batchReadOnlyTransaction(timestampBound))
+ .thenReturn(mockBatchTx);
+ when(serviceFactory.mockBatchClient().batchReadOnlyTransaction(any(BatchTransactionId.class)))
+ .thenReturn(mockBatchTx);
- when(serviceFactory.mockBatchClient().batchReadOnlyTransaction(id)).thenReturn(mockBatchTx);
+ Partition fakePartition =
+ FakePartitionFactory.createFakeReadPartition(ByteString.copyFromUtf8("one"));
+
when(mockBatchTx.partitionRead(
any(PartitionOptions.class),
eq("users"),
eq(KeySet.all()),
eq(Arrays.asList("id", "name"))))
- .thenReturn(fakePartitions);
+ .thenReturn(Arrays.asList(fakePartition, fakePartition, fakePartition));
+ when(mockBatchTx.execute(any(Partition.class)))
+ .thenReturn(
+ ResultSets.forRows(FAKE_TYPE, FAKE_ROWS.subList(0, 2)),
+ ResultSets.forRows(FAKE_TYPE, FAKE_ROWS.subList(2, 4)),
+ ResultSets.forRows(FAKE_TYPE, FAKE_ROWS.subList(4, 6)));
- List<Partition> result = fnTester.processBundle(read.getReadOperation());
- assertThat(result, Matchers.containsInAnyOrder(fakePartitions.toArray()));
+ PAssert.that(one).containsInAnyOrder(FAKE_ROWS);
- verify(serviceFactory.mockBatchClient()).batchReadOnlyTransaction(id);
- verify(mockBatchTx)
- .partitionRead(
- any(PartitionOptions.class),
- eq("users"),
- eq(KeySet.all()),
- eq(Arrays.asList("id", "name")));
+ pipeline.run();
}
@Test
public void runReadUsingIndex() throws Exception {
- SpannerIO.Read read =
- SpannerIO.read()
+ Timestamp timestamp = Timestamp.ofTimeMicroseconds(12345);
+ TimestampBound timestampBound = TimestampBound.ofReadTimestamp(timestamp);
+
+ SpannerConfig spannerConfig =
+ SpannerConfig.create()
.withProjectId("test")
.withInstanceId("123")
.withDatabaseId("aaa")
- .withTimestamp(Timestamp.now())
- .withTable("users")
- .withColumns("id", "name")
- .withIndex("theindex")
.withServiceFactory(serviceFactory);
- List<Partition> fakePartitions =
- Arrays.asList(mock(Partition.class), mock(Partition.class), mock(Partition.class));
+ PCollection<Struct> one =
+ pipeline.apply(
+ "read q",
+ SpannerIO.read()
+ .withTimestamp(Timestamp.now())
+ .withSpannerConfig(spannerConfig)
+ .withTable("users")
+ .withColumns("id", "name")
+ .withIndex("theindex")
+ .withTimestampBound(timestampBound));
- FakeBatchTransactionId id = new FakeBatchTransactionId("one");
- Transaction tx = Transaction.create(id);
- PCollectionView<Transaction> txView =
- pipeline.apply(Create.of(tx)).apply(View.<Transaction>asSingleton());
+ FakeBatchTransactionId id = new FakeBatchTransactionId("runReadUsingIndexTest");
+ when(mockBatchTx.getBatchTransactionId()).thenReturn(id);
- BatchSpannerRead.GeneratePartitionsFn fn =
- new BatchSpannerRead.GeneratePartitionsFn(read.getSpannerConfig(), txView);
- DoFnTester<ReadOperation, Partition> fnTester = DoFnTester.of(fn);
- fnTester.setSideInput(txView, GlobalWindow.INSTANCE, tx);
+ when(serviceFactory.mockBatchClient().batchReadOnlyTransaction(timestampBound))
+ .thenReturn(mockBatchTx);
+ when(serviceFactory.mockBatchClient().batchReadOnlyTransaction(any(BatchTransactionId.class)))
+ .thenReturn(mockBatchTx);
+
+ Partition fakePartition =
+ FakePartitionFactory.createFakeReadPartition(ByteString.copyFromUtf8("one"));
when(serviceFactory.mockBatchClient().batchReadOnlyTransaction(id)).thenReturn(mockBatchTx);
when(mockBatchTx.partitionReadUsingIndex(
@@ -204,19 +219,17 @@
eq("theindex"),
eq(KeySet.all()),
eq(Arrays.asList("id", "name"))))
- .thenReturn(fakePartitions);
+ .thenReturn(Arrays.asList(fakePartition, fakePartition, fakePartition));
- List<Partition> result = fnTester.processBundle(read.getReadOperation());
- assertThat(result, Matchers.containsInAnyOrder(fakePartitions.toArray()));
+ when(mockBatchTx.execute(any(Partition.class)))
+ .thenReturn(
+ ResultSets.forRows(FAKE_TYPE, FAKE_ROWS.subList(0, 2)),
+ ResultSets.forRows(FAKE_TYPE, FAKE_ROWS.subList(2, 4)),
+ ResultSets.forRows(FAKE_TYPE, FAKE_ROWS.subList(4, 6)));
- verify(serviceFactory.mockBatchClient()).batchReadOnlyTransaction(id);
- verify(mockBatchTx)
- .partitionReadUsingIndex(
- any(PartitionOptions.class),
- eq("users"),
- eq("theindex"),
- eq(KeySet.all()),
- eq(Arrays.asList("id", "name")));
+ PAssert.that(one).containsInAnyOrder(FAKE_ROWS);
+
+ pipeline.run();
}
@Test
diff --git a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcIO.java b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcIO.java
index 0210cee..9162448 100644
--- a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcIO.java
+++ b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcIO.java
@@ -28,9 +28,11 @@
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@@ -287,6 +289,9 @@
abstract ValueProvider<String> getConnectionProperties();
@Nullable
+ abstract ValueProvider<Collection<String>> getConnectionInitSqls();
+
+ @Nullable
abstract DataSource getDataSource();
abstract Builder builder();
@@ -303,6 +308,8 @@
abstract Builder setConnectionProperties(ValueProvider<String> connectionProperties);
+ abstract Builder setConnectionInitSqls(ValueProvider<Collection<String>> connectionInitSqls);
+
abstract Builder setDataSource(DataSource dataSource);
abstract DataSourceConfiguration build();
@@ -369,6 +376,25 @@
return builder().setConnectionProperties(connectionProperties).build();
}
+ /**
+ * Sets the connection init sql statements to driver.connect(...).
+ *
+ * <p>NOTE - This property is not applicable across databases. Only MySQL and MariaDB support
+ * this. A Sql exception is thrown if your database does not support it.
+ */
+ public DataSourceConfiguration withConnectionInitSqls(Collection<String> connectionInitSqls) {
+ checkArgument(connectionInitSqls != null, "connectionInitSqls can not be null");
+ return withConnectionInitSqls(ValueProvider.StaticValueProvider.of(connectionInitSqls));
+ }
+
+ /** Same as {@link #withConnectionInitSqls(Collection)} but accepting a ValueProvider. */
+ public DataSourceConfiguration withConnectionInitSqls(
+ ValueProvider<Collection<String>> connectionInitSqls) {
+ checkArgument(connectionInitSqls != null, "connectionInitSqls can not be null");
+ checkArgument(!connectionInitSqls.get().isEmpty(), "connectionInitSqls can not be empty");
+ return builder().setConnectionInitSqls(connectionInitSqls).build();
+ }
+
void populateDisplayData(DisplayData.Builder builder) {
if (getDataSource() != null) {
builder.addIfNotNull(DisplayData.item("dataSource", getDataSource().getClass().getName()));
@@ -397,6 +423,12 @@
if (getConnectionProperties() != null && getConnectionProperties().get() != null) {
basicDataSource.setConnectionProperties(getConnectionProperties().get());
}
+ if (getConnectionInitSqls() != null
+ && getConnectionInitSqls().get() != null
+ && !getConnectionInitSqls().get().isEmpty()) {
+ basicDataSource.setConnectionInitSqls(getConnectionInitSqls().get());
+ }
+
return basicDataSource;
}
return getDataSource();
@@ -904,10 +936,7 @@
/** See {@link WriteVoid#withDataSourceConfiguration(DataSourceConfiguration)}. */
public Write<T> withDataSourceConfiguration(DataSourceConfiguration config) {
- return new Write(
- inner
- .withDataSourceConfiguration(config)
- .withDataSourceProviderFn(new DataSourceProviderFromDataSourceConfiguration(config)));
+ return new Write(inner.withDataSourceConfiguration(config));
}
/** See {@link WriteVoid#withDataSourceProviderFn(SerializableFunction)}. */
@@ -1338,79 +1367,79 @@
}
}
- /** Wraps a {@link DataSourceConfiguration} to provide a {@link PoolingDataSource}. */
+ /**
+ * Wraps a {@link DataSourceConfiguration} to provide a {@link PoolingDataSource}.
+ *
+ * <p>At most a single {@link DataSource} instance will be constructed during pipeline execution
+ * for each unique {@link DataSourceConfiguration} within the pipeline.
+ */
public static class PoolableDataSourceProvider
implements SerializableFunction<Void, DataSource>, HasDisplayData {
- private static PoolableDataSourceProvider instance;
- private static transient DataSource source;
- private static SerializableFunction<Void, DataSource> dataSourceProviderFn;
+ private static final ConcurrentHashMap<DataSourceConfiguration, DataSource> instances =
+ new ConcurrentHashMap<>();
+ private final DataSourceProviderFromDataSourceConfiguration config;
private PoolableDataSourceProvider(DataSourceConfiguration config) {
- dataSourceProviderFn = DataSourceProviderFromDataSourceConfiguration.of(config);
+ this.config = new DataSourceProviderFromDataSourceConfiguration(config);
}
- public static synchronized SerializableFunction<Void, DataSource> of(
- DataSourceConfiguration config) {
- if (instance == null) {
- instance = new PoolableDataSourceProvider(config);
- }
- return instance;
+ public static SerializableFunction<Void, DataSource> of(DataSourceConfiguration config) {
+ return new PoolableDataSourceProvider(config);
}
@Override
public DataSource apply(Void input) {
- return buildDataSource(input);
- }
-
- static synchronized DataSource buildDataSource(Void input) {
- if (source == null) {
- DataSource basicSource = dataSourceProviderFn.apply(input);
- DataSourceConnectionFactory connectionFactory =
- new DataSourceConnectionFactory(basicSource);
- PoolableConnectionFactory poolableConnectionFactory =
- new PoolableConnectionFactory(connectionFactory, null);
- GenericObjectPoolConfig poolConfig = new GenericObjectPoolConfig();
- poolConfig.setMaxTotal(1);
- poolConfig.setMinIdle(0);
- poolConfig.setMinEvictableIdleTimeMillis(10000);
- poolConfig.setSoftMinEvictableIdleTimeMillis(30000);
- GenericObjectPool connectionPool =
- new GenericObjectPool(poolableConnectionFactory, poolConfig);
- poolableConnectionFactory.setPool(connectionPool);
- poolableConnectionFactory.setDefaultAutoCommit(false);
- poolableConnectionFactory.setDefaultReadOnly(false);
- source = new PoolingDataSource(connectionPool);
- }
- return source;
+ return instances.computeIfAbsent(
+ config.config,
+ ignored -> {
+ DataSource basicSource = config.apply(input);
+ DataSourceConnectionFactory connectionFactory =
+ new DataSourceConnectionFactory(basicSource);
+ PoolableConnectionFactory poolableConnectionFactory =
+ new PoolableConnectionFactory(connectionFactory, null);
+ GenericObjectPoolConfig poolConfig = new GenericObjectPoolConfig();
+ poolConfig.setMaxTotal(1);
+ poolConfig.setMinIdle(0);
+ poolConfig.setMinEvictableIdleTimeMillis(10000);
+ poolConfig.setSoftMinEvictableIdleTimeMillis(30000);
+ GenericObjectPool connectionPool =
+ new GenericObjectPool(poolableConnectionFactory, poolConfig);
+ poolableConnectionFactory.setPool(connectionPool);
+ poolableConnectionFactory.setDefaultAutoCommit(false);
+ poolableConnectionFactory.setDefaultReadOnly(false);
+ return new PoolingDataSource(connectionPool);
+ });
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
- if (dataSourceProviderFn instanceof HasDisplayData) {
- ((HasDisplayData) dataSourceProviderFn).populateDisplayData(builder);
- }
+ config.populateDisplayData(builder);
}
}
- private static class DataSourceProviderFromDataSourceConfiguration
+ /**
+ * Wraps a {@link DataSourceConfiguration} to provide a {@link DataSource}.
+ *
+ * <p>At most a single {@link DataSource} instance will be constructed during pipeline execution
+ * for each unique {@link DataSourceConfiguration} within the pipeline.
+ */
+ public static class DataSourceProviderFromDataSourceConfiguration
implements SerializableFunction<Void, DataSource>, HasDisplayData {
+ private static final ConcurrentHashMap<DataSourceConfiguration, DataSource> instances =
+ new ConcurrentHashMap<>();
private final DataSourceConfiguration config;
- private static DataSourceProviderFromDataSourceConfiguration instance;
private DataSourceProviderFromDataSourceConfiguration(DataSourceConfiguration config) {
this.config = config;
}
public static SerializableFunction<Void, DataSource> of(DataSourceConfiguration config) {
- if (instance == null) {
- instance = new DataSourceProviderFromDataSourceConfiguration(config);
- }
- return instance;
+ return new DataSourceProviderFromDataSourceConfiguration(config);
}
@Override
public DataSource apply(Void input) {
- return config.buildDatasource();
+ return instances.computeIfAbsent(config, (config) -> config.buildDatasource());
}
@Override
diff --git a/sdks/java/io/jdbc/src/test/java/org/apache/beam/sdk/io/jdbc/JdbcIOTest.java b/sdks/java/io/jdbc/src/test/java/org/apache/beam/sdk/io/jdbc/JdbcIOTest.java
index c9367a8..046c061 100644
--- a/sdks/java/io/jdbc/src/test/java/org/apache/beam/sdk/io/jdbc/JdbcIOTest.java
+++ b/sdks/java/io/jdbc/src/test/java/org/apache/beam/sdk/io/jdbc/JdbcIOTest.java
@@ -18,6 +18,8 @@
package org.apache.beam.sdk.io.jdbc;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyString;
@@ -56,6 +58,7 @@
import org.apache.beam.sdk.io.common.DatabaseTestHelper;
import org.apache.beam.sdk.io.common.NetworkTestHelper;
import org.apache.beam.sdk.io.common.TestRow;
+import org.apache.beam.sdk.io.jdbc.JdbcIO.PoolableDataSourceProvider;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.transforms.Select;
import org.apache.beam.sdk.testing.ExpectedLogs;
@@ -65,6 +68,7 @@
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.Wait;
+import org.apache.beam.sdk.util.SerializableUtils;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;
@@ -244,6 +248,24 @@
}
}
+ @Test
+ public void testSetConnectoinInitSqlFailWithDerbyDB() {
+ String username = "sa";
+ String password = "sa";
+ JdbcIO.DataSourceConfiguration config =
+ JdbcIO.DataSourceConfiguration.create(
+ "org.apache.derby.jdbc.ClientDriver",
+ "jdbc:derby://localhost:" + port + "/target/beam")
+ .withUsername(username)
+ .withPassword(password)
+ .withConnectionInitSqls(ImmutableList.of("SET innodb_lock_wait_timeout = 5"));
+
+ assertThrows(
+ "innodb_lock_wait_timeout",
+ SQLException.class,
+ () -> config.buildDatasource().getConnection());
+ }
+
/** Create test data that is consistent with that generated by TestRow. */
private static void addInitialData(DataSource dataSource, String tableName) throws SQLException {
try (Connection connection = dataSource.getConnection()) {
@@ -773,12 +795,13 @@
long epochMilli = 1558719710000L;
DateTime dateTime = new DateTime(epochMilli, ISOChronology.getInstanceUTC());
+ DateTime time =
+ new DateTime(
+ 34567000L /* value must be less than num millis in one day */,
+ ISOChronology.getInstanceUTC());
Row row =
- Row.withSchema(schema)
- .addValues(
- dateTime.withTimeAtStartOfDay(), dateTime.withDate(new LocalDate(0L)), dateTime)
- .build();
+ Row.withSchema(schema).addValues(dateTime.withTimeAtStartOfDay(), time, dateTime).build();
PreparedStatement psMocked = mock(PreparedStatement.class);
@@ -897,4 +920,20 @@
pipeline.run();
}
+
+ @Test
+ public void testSerializationAndCachingOfPoolingDataSourceProvider() {
+ SerializableFunction<Void, DataSource> provider =
+ PoolableDataSourceProvider.of(
+ JdbcIO.DataSourceConfiguration.create(
+ "org.apache.derby.jdbc.ClientDriver",
+ "jdbc:derby://localhost:" + port + "/target/beam"));
+ SerializableFunction<Void, DataSource> deserializedProvider =
+ SerializableUtils.ensureSerializable(provider);
+
+ // Assert that the same instance is being returned even when there are multiple provider
+ // instances with the same configuration. Also check that the deserialized provider was
+ // able to produce an instance.
+ assertSame(provider.apply(null), deserializedProvider.apply(null));
+ }
}
diff --git a/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java b/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java
index fa50715..726acf6 100644
--- a/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java
+++ b/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java
@@ -105,7 +105,8 @@
* .<GenericRecord>write()
* .via(ParquetIO.sink(SCHEMA)
* .withCompressionCodec(CompressionCodecName.SNAPPY))
- * .to("destination/path"))
+ * .to("destination/path")
+ * .withSuffix(".parquet"));
* }</pre>
*
* <p>This IO API is considered experimental and may break or receive backwards-incompatible changes
diff --git a/sdks/java/maven-archetypes/examples/build.gradle b/sdks/java/maven-archetypes/examples/build.gradle
index beec649..dd95fdf 100644
--- a/sdks/java/maven-archetypes/examples/build.gradle
+++ b/sdks/java/maven-archetypes/examples/build.gradle
@@ -43,7 +43,7 @@
'maven-jar-plugin.version': dependencies.create(project.library.maven.maven_jar_plugin).getVersion(),
'maven-shade-plugin.version': dependencies.create(project.library.maven.maven_shade_plugin).getVersion(),
'maven-surefire-plugin.version': dependencies.create(project.library.maven.maven_surefire_plugin).getVersion(),
- 'flink.artifact.name': 'beam-runners-flink-'.concat(project(":runners:flink:1.8").getName()),
+ 'flink.artifact.name': 'beam-runners-flink-'.concat(project(":runners:flink:1.9").getName()),
]
}
diff --git a/sdks/java/testing/nexmark/build.gradle b/sdks/java/testing/nexmark/build.gradle
index 1fdfbed..9a62c3a 100644
--- a/sdks/java/testing/nexmark/build.gradle
+++ b/sdks/java/testing/nexmark/build.gradle
@@ -102,7 +102,7 @@
//
// Parameters:
// -Pnexmark.runner
-// Specify a runner subproject, such as ":runners:spark" or ":runners:flink:1.8"
+// Specify a runner subproject, such as ":runners:spark" or ":runners:flink:1.9"
// Defaults to ":runners:direct-java"
//
// -Pnexmark.args
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/__init__.py b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/__init__.py
new file mode 100644
index 0000000..6569e3f
--- /dev/null
+++ b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/__init__.py
@@ -0,0 +1,18 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/cogroupbykey.py b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/cogroupbykey.py
new file mode 100644
index 0000000..c507e03
--- /dev/null
+++ b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/cogroupbykey.py
@@ -0,0 +1,49 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+
+def cogroupbykey(test=None):
+ # [START cogroupbykey]
+ import apache_beam as beam
+
+ with beam.Pipeline() as pipeline:
+ icon_pairs = pipeline | 'Create icons' >> beam.Create([
+ ('Apple', '🍎'),
+ ('Apple', '🍏'),
+ ('Eggplant', '🍆'),
+ ('Tomato', '🍅'),
+ ])
+
+ duration_pairs = pipeline | 'Create durations' >> beam.Create([
+ ('Apple', 'perennial'),
+ ('Carrot', 'biennial'),
+ ('Tomato', 'perennial'),
+ ('Tomato', 'annual'),
+ ])
+
+ plants = (
+ ({'icons': icon_pairs, 'durations': duration_pairs})
+ | 'Merge' >> beam.CoGroupByKey()
+ | beam.Map(print)
+ )
+ # [END cogroupbykey]
+ if test:
+ test(plants)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/cogroupbykey_test.py b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/cogroupbykey_test.py
new file mode 100644
index 0000000..ff86628
--- /dev/null
+++ b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/cogroupbykey_test.py
@@ -0,0 +1,59 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import unittest
+
+import mock
+
+from apache_beam.examples.snippets.util import assert_matches_stdout
+from apache_beam.testing.test_pipeline import TestPipeline
+
+from . import cogroupbykey
+
+
+def check_plants(actual):
+ expected = '''[START plants]
+('Apple', {'icons': ['🍎', '🍏'], 'durations': ['perennial']})
+('Carrot', {'icons': [], 'durations': ['biennial']})
+('Tomato', {'icons': ['🍅'], 'durations': ['perennial', 'annual']})
+('Eggplant', {'icons': ['🍆'], 'durations': []})
+[END plants]'''.splitlines()[1:-1]
+
+ # Make it deterministic by sorting all sublists in each element.
+ def normalize_element(elem):
+ name, details = elem
+ details['icons'] = sorted(details['icons'])
+ details['durations'] = sorted(details['durations'])
+ return name, details
+ assert_matches_stdout(actual, expected, normalize_element)
+
+
+@mock.patch('apache_beam.Pipeline', TestPipeline)
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.aggregation.cogroupbykey.print',
+ str)
+class CoGroupByKeyTest(unittest.TestCase):
+ def test_cogroupbykey(self):
+ cogroupbykey.cogroupbykey(check_plants)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/filter_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/filter_test.py
index d989e43..724b1b9 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/filter_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/filter_test.py
@@ -23,40 +23,34 @@
import mock
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.testing.util import assert_that
-from apache_beam.testing.util import equal_to
from . import filter
def check_perennials(actual):
- # [START perennials]
- perennials = [
- {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
- {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
- {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'},
- ]
- # [END perennials]
- assert_that(actual, equal_to(perennials))
+ expected = '''[START perennials]
+{'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'}
+{'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'}
+{'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'}
+[END perennials]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_valid_plants(actual):
- # [START valid_plants]
- valid_plants = [
- {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
- {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
- {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
- {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
- ]
- # [END valid_plants]
- assert_that(actual, equal_to(valid_plants))
+ expected = '''[START valid_plants]
+{'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'}
+{'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'}
+{'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'}
+{'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'}
+[END valid_plants]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.filter.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.filter.print', str)
class FilterTest(unittest.TestCase):
def test_filter_function(self):
filter.filter_function(check_perennials)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/flatmap_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/flatmap_test.py
index 718dcee..5c326e9 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/flatmap_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/flatmap_test.py
@@ -23,42 +23,36 @@
import mock
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.testing.util import assert_that
-from apache_beam.testing.util import equal_to
from . import flatmap
def check_plants(actual):
- # [START plants]
- plants = [
- '🍓Strawberry',
- '🥕Carrot',
- '🍆Eggplant',
- '🍅Tomato',
- '🥔Potato',
- ]
- # [END plants]
- assert_that(actual, equal_to(plants))
+ expected = '''[START plants]
+🍓Strawberry
+🥕Carrot
+🍆Eggplant
+🍅Tomato
+🥔Potato
+[END plants]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_valid_plants(actual):
- # [START valid_plants]
- valid_plants = [
- {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
- {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
- {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
- {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
- ]
- # [END valid_plants]
- assert_that(actual, equal_to(valid_plants))
+ expected = '''[START valid_plants]
+{'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'}
+{'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'}
+{'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'}
+{'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'}
+[END valid_plants]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.flatmap.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.flatmap.print', str)
class FlatMapTest(unittest.TestCase):
def test_flatmap_simple(self):
flatmap.flatmap_simple(check_plants)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/keys_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/keys_test.py
index 780c5e4..e4a843b 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/keys_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/keys_test.py
@@ -23,30 +23,26 @@
import mock
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.testing.util import assert_that
-from apache_beam.testing.util import equal_to
from . import keys
def check_icons(actual):
- # [START icons]
- icons = [
- '🍓',
- '🥕',
- '🍆',
- '🍅',
- '🥔',
- ]
- # [END icons]
- assert_that(actual, equal_to(icons))
+ expected = '''[START icons]
+🍓
+🥕
+🍆
+🍅
+🥔
+[END icons]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.keys.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.keys.print', str)
class KeysTest(unittest.TestCase):
def test_keys(self):
keys.keys(check_icons)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/kvswap_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/kvswap_test.py
index ea7698b..83f211d 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/kvswap_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/kvswap_test.py
@@ -23,30 +23,26 @@
import mock
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.testing.util import assert_that
-from apache_beam.testing.util import equal_to
from . import kvswap
def check_plants(actual):
- # [START plants]
- plants = [
- ('Strawberry', '🍓'),
- ('Carrot', '🥕'),
- ('Eggplant', '🍆'),
- ('Tomato', '🍅'),
- ('Potato', '🥔'),
- ]
- # [END plants]
- assert_that(actual, equal_to(plants))
+ expected = '''[START plants]
+('Strawberry', '🍓')
+('Carrot', '🥕')
+('Eggplant', '🍆')
+('Tomato', '🍅')
+('Potato', '🥔')
+[END plants]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.kvswap.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.kvswap.print', str)
class KvSwapTest(unittest.TestCase):
def test_kvswap(self):
kvswap.kvswap(check_plants)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/map_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/map_test.py
index 4186176..eb77675 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/map_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/map_test.py
@@ -23,43 +23,37 @@
import mock
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.testing.util import assert_that
-from apache_beam.testing.util import equal_to
from . import map
def check_plants(actual):
- # [START plants]
- plants = [
- '🍓Strawberry',
- '🥕Carrot',
- '🍆Eggplant',
- '🍅Tomato',
- '🥔Potato',
- ]
- # [END plants]
- assert_that(actual, equal_to(plants))
+ expected = '''[START plants]
+🍓Strawberry
+🥕Carrot
+🍆Eggplant
+🍅Tomato
+🥔Potato
+[END plants]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_plant_details(actual):
- # [START plant_details]
- plant_details = [
- {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
- {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
- {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
- {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
- {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'},
- ]
- # [END plant_details]
- assert_that(actual, equal_to(plant_details))
+ expected = '''[START plant_details]
+{'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'}
+{'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'}
+{'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'}
+{'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'}
+{'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'}
+[END plant_details]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.map.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.map.print', str)
class MapTest(unittest.TestCase):
def test_map_simple(self):
map.map_simple(check_plants)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/pardo.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/pardo.py
index 971e9f0..4ecd74d 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/pardo.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/pardo.py
@@ -18,7 +18,6 @@
from __future__ import absolute_import
from __future__ import print_function
-from __future__ import unicode_literals
def pardo_dofn(test=None):
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/pardo_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/pardo_test.py
index 8507e01..cbf4903 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/pardo_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/pardo_test.py
@@ -18,38 +18,42 @@
from __future__ import absolute_import
from __future__ import print_function
-from __future__ import unicode_literals
-import io
import platform
import sys
import unittest
import mock
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from . import pardo
+# TODO: Remove this after Python 2 deprecation.
+# https://issues.apache.org/jira/browse/BEAM-8124
+if sys.version_info[0] == 2:
+ from io import BytesIO as StringIO
+else:
+ from io import StringIO
+
def check_plants(actual):
- # [START plants]
- plants = [
- '🍓Strawberry',
- '🥕Carrot',
- '🍆Eggplant',
- '🍅Tomato',
- '🥔Potato',
- ]
- # [END plants]
- assert_that(actual, equal_to(plants))
+ expected = '''[START plants]
+🍓Strawberry
+🥕Carrot
+🍆Eggplant
+🍅Tomato
+🥔Potato
+[END plants]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_dofn_params(actual):
# pylint: disable=line-too-long
- dofn_params = '\n'.join('''[START dofn_params]
+ expected = '\n'.join('''[START dofn_params]
# timestamp
type(timestamp) -> <class 'apache_beam.utils.timestamp.Timestamp'>
timestamp.micros -> 1584675660000000
@@ -63,7 +67,7 @@
window.max_timestamp() -> Timestamp(1584675689.999999) (2020-03-20 03:41:29.999999)
[END dofn_params]'''.splitlines()[1:-1])
# pylint: enable=line-too-long
- assert_that(actual, equal_to([dofn_params]))
+ assert_that(actual, equal_to([expected]))
def check_dofn_methods(actual):
@@ -83,23 +87,22 @@
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.pardo.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.pardo.print', str)
class ParDoTest(unittest.TestCase):
def test_pardo_dofn(self):
pardo.pardo_dofn(check_plants)
# TODO: Remove this after Python 2 deprecation.
# https://issues.apache.org/jira/browse/BEAM-8124
- @unittest.skipIf(sys.version_info[0] < 3 and platform.system() == 'Windows',
+ @unittest.skipIf(sys.version_info[0] == 2 and platform.system() == 'Windows',
'Python 2 on Windows uses `long` rather than `int`')
def test_pardo_dofn_params(self):
pardo.pardo_dofn_params(check_dofn_params)
@mock.patch('apache_beam.Pipeline', TestPipeline)
-@mock.patch('sys.stdout', new_callable=io.StringIO)
+@mock.patch('sys.stdout', new_callable=StringIO)
class ParDoStdoutTest(unittest.TestCase):
def test_pardo_dofn_methods(self, mock_stdout):
expected = pardo.pardo_dofn_methods(check_dofn_methods)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/partition.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/partition.py
index 6f839d4..5633607 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/partition.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/partition.py
@@ -21,6 +21,7 @@
def partition_function(test=None):
+ # pylint: disable=line-too-long, expression-not-assigned
# [START partition_function]
import apache_beam as beam
@@ -41,24 +42,18 @@
])
| 'Partition' >> beam.Partition(by_duration, len(durations))
)
- _ = (
- annuals
- | 'Annuals' >> beam.Map(lambda x: print('annual: ' + str(x)))
- )
- _ = (
- biennials
- | 'Biennials' >> beam.Map(lambda x: print('biennial: ' + str(x)))
- )
- _ = (
- perennials
- | 'Perennials' >> beam.Map(lambda x: print('perennial: ' + str(x)))
- )
+
+ annuals | 'Annuals' >> beam.Map(lambda x: print('annual: {}'.format(x)))
+ biennials | 'Biennials' >> beam.Map(lambda x: print('biennial: {}'.format(x)))
+ perennials | 'Perennials' >> beam.Map(lambda x: print('perennial: {}'.format(x)))
# [END partition_function]
+ # pylint: enable=line-too-long, expression-not-assigned
if test:
test(annuals, biennials, perennials)
def partition_lambda(test=None):
+ # pylint: disable=line-too-long, expression-not-assigned
# [START partition_lambda]
import apache_beam as beam
@@ -79,24 +74,18 @@
len(durations),
)
)
- _ = (
- annuals
- | 'Annuals' >> beam.Map(lambda x: print('annual: ' + str(x)))
- )
- _ = (
- biennials
- | 'Biennials' >> beam.Map(lambda x: print('biennial: ' + str(x)))
- )
- _ = (
- perennials
- | 'Perennials' >> beam.Map(lambda x: print('perennial: ' + str(x)))
- )
+
+ annuals | 'Annuals' >> beam.Map(lambda x: print('annual: {}'.format(x)))
+ biennials | 'Biennials' >> beam.Map(lambda x: print('biennial: {}'.format(x)))
+ perennials | 'Perennials' >> beam.Map(lambda x: print('perennial: {}'.format(x)))
# [END partition_lambda]
+ # pylint: enable=line-too-long, expression-not-assigned
if test:
test(annuals, biennials, perennials)
def partition_multiple_arguments(test=None):
+ # pylint: disable=expression-not-assigned
# [START partition_multiple_arguments]
import apache_beam as beam
import json
@@ -123,14 +112,10 @@
])
| 'Partition' >> beam.Partition(split_dataset, 2, ratio=[8, 2])
)
- _ = (
- train_dataset
- | 'Train' >> beam.Map(lambda x: print('train: ' + str(x)))
- )
- _ = (
- test_dataset
- | 'Test' >> beam.Map(lambda x: print('test: ' + str(x)))
- )
+
+ train_dataset | 'Train' >> beam.Map(lambda x: print('train: {}'.format(x)))
+ test_dataset | 'Test' >> beam.Map(lambda x: print('test: {}'.format(x)))
# [END partition_multiple_arguments]
+ # pylint: enable=expression-not-assigned
if test:
test(train_dataset, test_dataset)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/partition_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/partition_test.py
index 0b8ae3d..4f98ab1 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/partition_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/partition_test.py
@@ -23,52 +23,70 @@
import mock
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.testing.util import assert_that
-from apache_beam.testing.util import equal_to
from . import partition
def check_partitions(actual1, actual2, actual3):
- # [START partitions]
+ expected = '''[START partitions]
+perennial: {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'}
+biennial: {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'}
+perennial: {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'}
+annual: {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'}
+perennial: {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'}
+[END partitions]'''.splitlines()[1:-1]
+
annuals = [
- {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
+ line.split(':', 1)[1].strip()
+ for line in expected
+ if line.split(':', 1)[0] == 'annual'
]
biennials = [
- {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
+ line.split(':', 1)[1].strip()
+ for line in expected
+ if line.split(':', 1)[0] == 'biennial'
]
perennials = [
- {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
- {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
- {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'},
+ line.split(':', 1)[1].strip()
+ for line in expected
+ if line.split(':', 1)[0] == 'perennial'
]
- # [END partitions]
- assert_that(actual1, equal_to(annuals), label='assert annuals')
- assert_that(actual2, equal_to(biennials), label='assert biennials')
- assert_that(actual3, equal_to(perennials), label='assert perennials')
+
+ assert_matches_stdout(actual1, annuals, label='annuals')
+ assert_matches_stdout(actual2, biennials, label='biennials')
+ assert_matches_stdout(actual3, perennials, label='perennials')
def check_split_datasets(actual1, actual2):
- # [START train_test]
+ expected = '''[START train_test]
+train: {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'}
+train: {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'}
+test: {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'}
+test: {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'}
+train: {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'}
+[END train_test]'''.splitlines()[1:-1]
+
train_dataset = [
- {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
- {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
- {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'},
+ line.split(':', 1)[1].strip()
+ for line in expected
+ if line.split(':', 1)[0] == 'train'
]
test_dataset = [
- {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
- {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
+ line.split(':', 1)[1].strip()
+ for line in expected
+ if line.split(':', 1)[0] == 'test'
]
- # [END train_test]
- assert_that(actual1, equal_to(train_dataset), label='assert train')
- assert_that(actual2, equal_to(test_dataset), label='assert test')
+
+ assert_matches_stdout(actual1, train_dataset, label='train_dataset')
+ assert_matches_stdout(actual2, test_dataset, label='test_dataset')
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.partition.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.partition.print',
+ lambda elem: elem)
class PartitionTest(unittest.TestCase):
def test_partition_function(self):
partition.partition_function(check_partitions)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/regex_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/regex_test.py
index 8312312..9df9f62 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/regex_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/regex_test.py
@@ -23,123 +23,105 @@
import mock
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.testing.util import assert_that
-from apache_beam.testing.util import equal_to
from . import regex
def check_matches(actual):
- # [START plants_matches]
- plants_matches = [
- '🍓, Strawberry, perennial',
- '🥕, Carrot, biennial',
- '🍆, Eggplant, perennial',
- '🍅, Tomato, annual',
- '🥔, Potato, perennial',
- ]
- # [END plants_matches]
- assert_that(actual, equal_to(plants_matches))
+ expected = '''[START plants_matches]
+🍓, Strawberry, perennial
+🥕, Carrot, biennial
+🍆, Eggplant, perennial
+🍅, Tomato, annual
+🥔, Potato, perennial
+[END plants_matches]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_all_matches(actual):
- # [START plants_all_matches]
- plants_all_matches = [
- ['🍓, Strawberry, perennial', '🍓', 'Strawberry', 'perennial'],
- ['🥕, Carrot, biennial', '🥕', 'Carrot', 'biennial'],
- ['🍆, Eggplant, perennial', '🍆', 'Eggplant', 'perennial'],
- ['🍅, Tomato, annual', '🍅', 'Tomato', 'annual'],
- ['🥔, Potato, perennial', '🥔', 'Potato', 'perennial'],
- ]
- # [END plants_all_matches]
- assert_that(actual, equal_to(plants_all_matches))
+ expected = '''[START plants_all_matches]
+['🍓, Strawberry, perennial', '🍓', 'Strawberry', 'perennial']
+['🥕, Carrot, biennial', '🥕', 'Carrot', 'biennial']
+['🍆, Eggplant, perennial', '🍆', 'Eggplant', 'perennial']
+['🍅, Tomato, annual', '🍅', 'Tomato', 'annual']
+['🥔, Potato, perennial', '🥔', 'Potato', 'perennial']
+[END plants_all_matches]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_matches_kv(actual):
- # [START plants_matches_kv]
- plants_matches_kv = [
- ('🍓', '🍓, Strawberry, perennial'),
- ('🥕', '🥕, Carrot, biennial'),
- ('🍆', '🍆, Eggplant, perennial'),
- ('🍅', '🍅, Tomato, annual'),
- ('🥔', '🥔, Potato, perennial'),
- ]
- # [END plants_matches_kv]
- assert_that(actual, equal_to(plants_matches_kv))
+ expected = '''[START plants_matches_kv]
+('🍓', '🍓, Strawberry, perennial')
+('🥕', '🥕, Carrot, biennial')
+('🍆', '🍆, Eggplant, perennial')
+('🍅', '🍅, Tomato, annual')
+('🥔', '🥔, Potato, perennial')
+[END plants_matches_kv]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_find_all(actual):
- # [START plants_find_all]
- plants_find_all = [
- ['🍓, Strawberry, perennial'],
- ['🥕, Carrot, biennial'],
- ['🍆, Eggplant, perennial', '🍌, Banana, perennial'],
- ['🍅, Tomato, annual', '🍉, Watermelon, annual'],
- ['🥔, Potato, perennial'],
- ]
- # [END plants_find_all]
- assert_that(actual, equal_to(plants_find_all))
+ expected = '''[START plants_find_all]
+['🍓, Strawberry, perennial']
+['🥕, Carrot, biennial']
+['🍆, Eggplant, perennial', '🍌, Banana, perennial']
+['🍅, Tomato, annual', '🍉, Watermelon, annual']
+['🥔, Potato, perennial']
+[END plants_find_all]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_find_kv(actual):
- # [START plants_find_kv]
- plants_find_all = [
- ('🍓', '🍓, Strawberry, perennial'),
- ('🥕', '🥕, Carrot, biennial'),
- ('🍆', '🍆, Eggplant, perennial'),
- ('🍌', '🍌, Banana, perennial'),
- ('🍅', '🍅, Tomato, annual'),
- ('🍉', '🍉, Watermelon, annual'),
- ('🥔', '🥔, Potato, perennial'),
- ]
- # [END plants_find_kv]
- assert_that(actual, equal_to(plants_find_all))
+ expected = '''[START plants_find_kv]
+('🍓', '🍓, Strawberry, perennial')
+('🥕', '🥕, Carrot, biennial')
+('🍆', '🍆, Eggplant, perennial')
+('🍌', '🍌, Banana, perennial')
+('🍅', '🍅, Tomato, annual')
+('🍉', '🍉, Watermelon, annual')
+('🥔', '🥔, Potato, perennial')
+[END plants_find_kv]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_replace_all(actual):
- # [START plants_replace_all]
- plants_replace_all = [
- '🍓,Strawberry,perennial',
- '🥕,Carrot,biennial',
- '🍆,Eggplant,perennial',
- '🍅,Tomato,annual',
- '🥔,Potato,perennial',
- ]
- # [END plants_replace_all]
- assert_that(actual, equal_to(plants_replace_all))
+ expected = '''[START plants_replace_all]
+🍓,Strawberry,perennial
+🥕,Carrot,biennial
+🍆,Eggplant,perennial
+🍅,Tomato,annual
+🥔,Potato,perennial
+[END plants_replace_all]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_replace_first(actual):
- # [START plants_replace_first]
- plants_replace_first = [
- '🍓: Strawberry, perennial',
- '🥕: Carrot, biennial',
- '🍆: Eggplant, perennial',
- '🍅: Tomato, annual',
- '🥔: Potato, perennial',
- ]
- # [END plants_replace_first]
- assert_that(actual, equal_to(plants_replace_first))
+ expected = '''[START plants_replace_first]
+🍓: Strawberry, perennial
+🥕: Carrot, biennial
+🍆: Eggplant, perennial
+🍅: Tomato, annual
+🥔: Potato, perennial
+[END plants_replace_first]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_split(actual):
- # [START plants_split]
- plants_split = [
- ['🍓', 'Strawberry', 'perennial'],
- ['🥕', 'Carrot', 'biennial'],
- ['🍆', 'Eggplant', 'perennial'],
- ['🍅', 'Tomato', 'annual'],
- ['🥔', 'Potato', 'perennial'],
- ]
- # [END plants_split]
- assert_that(actual, equal_to(plants_split))
+ expected = '''[START plants_split]
+['🍓', 'Strawberry', 'perennial']
+['🥕', 'Carrot', 'biennial']
+['🍆', 'Eggplant', 'perennial']
+['🍅', 'Tomato', 'annual']
+['🥔', 'Potato', 'perennial']
+[END plants_split]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.regex.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.regex.print', str)
class RegexTest(unittest.TestCase):
def test_matches(self):
regex.regex_matches(check_matches)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/tostring_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/tostring_test.py
index b253ea1..04939a7 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/tostring_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/tostring_test.py
@@ -19,77 +19,52 @@
from __future__ import absolute_import
from __future__ import print_function
-import sys
import unittest
import mock
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.testing.util import assert_that
-from apache_beam.testing.util import equal_to
from . import tostring
def check_plants(actual):
- # [START plants]
- plants = [
- '🍓,Strawberry',
- '🥕,Carrot',
- '🍆,Eggplant',
- '🍅,Tomato',
- '🥔,Potato',
- ]
- # [END plants]
- assert_that(actual, equal_to(plants))
+ expected = '''[START plants]
+🍓,Strawberry
+🥕,Carrot
+🍆,Eggplant
+🍅,Tomato
+🥔,Potato
+[END plants]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_plant_lists(actual):
- # [START plant_lists]
- plant_lists = [
- "['🍓', 'Strawberry', 'perennial']",
- "['🥕', 'Carrot', 'biennial']",
- "['🍆', 'Eggplant', 'perennial']",
- "['🍅', 'Tomato', 'annual']",
- "['🥔', 'Potato', 'perennial']",
- ]
- # [END plant_lists]
-
- # Some unicode characters become escaped with double backslashes.
- import apache_beam as beam
-
- def normalize_escaping(elem):
- # In Python 2 all utf-8 characters are escaped with double backslashes.
- # TODO: Remove this after Python 2 deprecation.
- # https://issues.apache.org/jira/browse/BEAM-8124
- if sys.version_info.major == 2:
- return elem.decode('string-escape')
-
- # In Python 3.5 some utf-8 characters are escaped with double backslashes.
- if '\\' in elem:
- return bytes(elem, 'utf-8').decode('unicode-escape')
- return elem
- actual = actual | beam.Map(normalize_escaping)
- assert_that(actual, equal_to(plant_lists))
+ expected = '''[START plant_lists]
+['🍓', 'Strawberry', 'perennial']
+['🥕', 'Carrot', 'biennial']
+['🍆', 'Eggplant', 'perennial']
+['🍅', 'Tomato', 'annual']
+['🥔', 'Potato', 'perennial']
+[END plant_lists]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_plants_csv(actual):
- # [START plants_csv]
- plants_csv = [
- '🍓,Strawberry,perennial',
- '🥕,Carrot,biennial',
- '🍆,Eggplant,perennial',
- '🍅,Tomato,annual',
- '🥔,Potato,perennial',
- ]
- # [END plants_csv]
- assert_that(actual, equal_to(plants_csv))
+ expected = '''[START plants_csv]
+🍓,Strawberry,perennial
+🥕,Carrot,biennial
+🍆,Eggplant,perennial
+🍅,Tomato,annual
+🥔,Potato,perennial
+[END plants_csv]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.tostring.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.tostring.print', str)
class ToStringTest(unittest.TestCase):
def test_tostring_kvs(self):
tostring.tostring_kvs(check_plants)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/values_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/values_test.py
index 06abef6..7a3b8f3 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/values_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/values_test.py
@@ -23,30 +23,26 @@
import mock
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.testing.util import assert_that
-from apache_beam.testing.util import equal_to
from . import values
def check_plants(actual):
- # [START plants]
- plants = [
- 'Strawberry',
- 'Carrot',
- 'Eggplant',
- 'Tomato',
- 'Potato',
- ]
- # [END plants]
- assert_that(actual, equal_to(plants))
+ expected = '''[START plants]
+Strawberry
+Carrot
+Eggplant
+Tomato
+Potato
+[END plants]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.values.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.values.print', str)
class ValuesTest(unittest.TestCase):
def test_values(self):
values.values(check_plants)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/withtimestamps_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/withtimestamps_test.py
index 53fa7e2..ad8c31b 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/withtimestamps_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/withtimestamps_test.py
@@ -23,63 +23,55 @@
import mock
+import apache_beam as beam
+from apache_beam.examples.snippets.util import assert_matches_stdout
from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.testing.util import assert_that
-from apache_beam.testing.util import equal_to
from . import withtimestamps
def check_plant_timestamps(actual):
- # [START plant_timestamps]
- plant_timestamps = [
- '2020-04-01 00:00:00 - Strawberry',
- '2020-06-01 00:00:00 - Carrot',
- '2020-03-01 00:00:00 - Artichoke',
- '2020-05-01 00:00:00 - Tomato',
- '2020-09-01 00:00:00 - Potato',
- ]
- # [END plant_timestamps]
- assert_that(actual, equal_to(plant_timestamps))
+ expected = '''[START plant_timestamps]
+2020-04-01 00:00:00 - Strawberry
+2020-06-01 00:00:00 - Carrot
+2020-03-01 00:00:00 - Artichoke
+2020-05-01 00:00:00 - Tomato
+2020-09-01 00:00:00 - Potato
+[END plant_timestamps]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_plant_events(actual):
- # [START plant_events]
- plant_events = [
- '1 - Strawberry',
- '4 - Carrot',
- '2 - Artichoke',
- '3 - Tomato',
- '5 - Potato',
- ]
- # [END plant_events]
- assert_that(actual, equal_to(plant_events))
+ expected = '''[START plant_events]
+1 - Strawberry
+4 - Carrot
+2 - Artichoke
+3 - Tomato
+5 - Potato
+[END plant_events]'''.splitlines()[1:-1]
+ assert_matches_stdout(actual, expected)
def check_plant_processing_times(actual):
- import apache_beam as beam
-
- # [START plant_processing_times]
- plant_processing_times = [
- '2020-03-20 20:12:42.145594 - Strawberry',
- '2020-03-20 20:12:42.145827 - Carrot',
- '2020-03-20 20:12:42.145962 - Artichoke',
- '2020-03-20 20:12:42.146093 - Tomato',
- '2020-03-20 20:12:42.146216 - Potato',
- ]
- # [END plant_processing_times]
+ expected = '''[START plant_processing_times]
+2020-03-20 20:12:42.145594 - Strawberry
+2020-03-20 20:12:42.145827 - Carrot
+2020-03-20 20:12:42.145962 - Artichoke
+2020-03-20 20:12:42.146093 - Tomato
+2020-03-20 20:12:42.146216 - Potato
+[END plant_processing_times]'''.splitlines()[1:-1]
# Since `time.time()` will always give something different, we'll
# simply strip the timestamp information before testing the results.
actual = actual | beam.Map(lambda row: row.split('-')[-1].strip())
- expected = [row.split('-')[-1].strip() for row in plant_processing_times]
- assert_that(actual, equal_to(expected))
+ expected = [row.split('-')[-1].strip() for row in expected]
+ assert_matches_stdout(actual, expected)
@mock.patch('apache_beam.Pipeline', TestPipeline)
-# pylint: disable=line-too-long
-@mock.patch('apache_beam.examples.snippets.transforms.elementwise.withtimestamps.print', lambda elem: elem)
-# pylint: enable=line-too-long
+@mock.patch(
+ 'apache_beam.examples.snippets.transforms.elementwise.withtimestamps.print',
+ str)
class WithTimestampsTest(unittest.TestCase):
def test_event_time(self):
withtimestamps.withtimestamps_event_time(check_plant_timestamps)
diff --git a/sdks/python/apache_beam/examples/snippets/util.py b/sdks/python/apache_beam/examples/snippets/util.py
index 6e6e9e0..60c2c7e 100644
--- a/sdks/python/apache_beam/examples/snippets/util.py
+++ b/sdks/python/apache_beam/examples/snippets/util.py
@@ -17,28 +17,36 @@
from __future__ import absolute_import
-import argparse
+import ast
import shlex
import subprocess as sp
+import apache_beam as beam
+from apache_beam.testing.util import assert_that
+from apache_beam.testing.util import equal_to
-def parse_example(argv=None):
- """Parse the command line arguments and return it as a string function call.
- Examples:
- python path/to/snippets.py function_name
- python path/to/snippets.py function_name arg1
- python path/to/snippets.py function_name arg1 arg2 ... argN
+def assert_matches_stdout(
+ actual, expected_stdout, normalize_fn=lambda elem: elem, label=''):
+ """Asserts a PCollection of strings matches the expected stdout elements.
+
+ Args:
+ actual (beam.PCollection): A PCollection.
+ expected (List[str]): A list of stdout elements, one line per element.
+ normalize_fn (Function[any]): A function to normalize elements before
+ comparing them. Can be used to sort lists before comparing.
+ label (str): [optional] Label to make transform names unique.
"""
- parser = argparse.ArgumentParser()
- parser.add_argument('example', help='Name of the example to run.')
- parser.add_argument('args', nargs=argparse.REMAINDER,
- help='Arguments for example.')
- args = parser.parse_args(argv)
+ def stdout_to_python_object(elem_str):
+ try:
+ elem = ast.literal_eval(elem_str)
+ except (SyntaxError, ValueError):
+ elem = elem_str
+ return normalize_fn(elem)
- # Return the example as a string representing the Python function call.
- example_args = ', '.join([repr(arg) for arg in args.args])
- return '{}({})'.format(args.example, example_args)
+ actual = actual | label >> beam.Map(stdout_to_python_object)
+ expected = list(map(stdout_to_python_object, expected_stdout))
+ assert_that(actual, equal_to(expected), 'assert ' + label)
def run_shell_commands(commands, **kwargs):
diff --git a/sdks/python/apache_beam/examples/snippets/util_test.py b/sdks/python/apache_beam/examples/snippets/util_test.py
index a23e916..fcf3955 100644
--- a/sdks/python/apache_beam/examples/snippets/util_test.py
+++ b/sdks/python/apache_beam/examples/snippets/util_test.py
@@ -1,3 +1,4 @@
+# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
@@ -21,30 +22,54 @@
from mock import patch
-from apache_beam.examples.snippets.util import *
+import apache_beam as beam
+from apache_beam.examples.snippets import util
+from apache_beam.testing.test_pipeline import TestPipeline
class UtilTest(unittest.TestCase):
- def test_parse_example_empty(self):
- # python path/to/snippets.py
- argv = []
- with self.assertRaises(SystemExit):
- self.assertEqual(parse_example(argv), 'example()')
+ def test_assert_matches_stdout_object(self):
+ expected = [
+ "{'a': '🍓', 'b': True}",
+ "{'a': '🥕', 'b': 42}",
+ "{'a': '🍆', 'b': '\"hello\"'}",
+ "{'a': '🍅', 'b': [1, 2, 3]}",
+ "{'b': 'B', 'a': '🥔'}",
+ ]
+ with TestPipeline() as pipeline:
+ actual = (
+ pipeline
+ | beam.Create([
+ {'a': '🍓', 'b': True},
+ {'a': '🥕', 'b': 42},
+ {'a': '🍆', 'b': '"hello"'},
+ {'a': '🍅', 'b': [1, 2, 3]},
+ {'a': '🥔', 'b': 'B'},
+ ])
+ | beam.Map(str)
+ )
+ util.assert_matches_stdout(actual, expected)
- def test_parse_example_no_arguments(self):
- # python path/to/snippets.py example
- argv = ['example']
- self.assertEqual(parse_example(argv), 'example()')
+ def test_assert_matches_stdout_string(self):
+ expected = ['🍓', '🥕', '🍆', '🍅', '🥔']
+ with TestPipeline() as pipeline:
+ actual = (
+ pipeline
+ | beam.Create(['🍓', '🥕', '🍆', '🍅', '🥔'])
+ | beam.Map(str)
+ )
+ util.assert_matches_stdout(actual, expected)
- def test_parse_example_one_argument(self):
- # python path/to/snippets.py example A
- argv = ['example', 'A']
- self.assertEqual(parse_example(argv), "example('A')")
-
- def test_parse_example_multiple_arguments(self):
- # python path/to/snippets.py example A B "C's"
- argv = ['example', 'A', 'B', "C's"]
- self.assertEqual(parse_example(argv), "example('A', 'B', \"C's\")")
+ def test_assert_matches_stdout_sorted_keys(self):
+ expected = [{'list': [1, 2]}, {'list': [3, 4]}]
+ with TestPipeline() as pipeline:
+ actual = (
+ pipeline
+ | beam.Create([{'list': [2, 1]}, {'list': [4, 3]}])
+ | beam.Map(str)
+ )
+ util.assert_matches_stdout(
+ actual, expected, lambda elem: {'sorted': sorted(elem['list'])})
@patch('subprocess.call', lambda cmd: None)
def test_run_shell_commands(self):
@@ -54,7 +79,7 @@
' !echo {variable} ',
' echo "quoted arguments work" # trailing comment ',
]
- actual = list(run_shell_commands(commands, variable='hello world'))
+ actual = list(util.run_shell_commands(commands, variable='hello world'))
expected = [
['echo', 'this', 'is', 'a', 'shell', 'command'],
['echo', 'hello', 'world'],
diff --git a/sdks/python/apache_beam/io/external/gcp/__init__.py b/sdks/python/apache_beam/io/external/gcp/__init__.py
new file mode 100644
index 0000000..6569e3f
--- /dev/null
+++ b/sdks/python/apache_beam/io/external/gcp/__init__.py
@@ -0,0 +1,18 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
diff --git a/sdks/python/apache_beam/io/external/gcp/pubsub.py b/sdks/python/apache_beam/io/external/gcp/pubsub.py
new file mode 100644
index 0000000..f0988ed
--- /dev/null
+++ b/sdks/python/apache_beam/io/external/gcp/pubsub.py
@@ -0,0 +1,168 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+
+import typing
+
+from past.builtins import unicode
+
+import apache_beam as beam
+from apache_beam.io.gcp import pubsub
+from apache_beam.transforms import Map
+from apache_beam.transforms.external import ExternalTransform
+from apache_beam.transforms.external import NamedTupleBasedPayloadBuilder
+
+ReadFromPubsubSchema = typing.NamedTuple(
+ 'ReadFromPubsubSchema',
+ [
+ ('topic', typing.Optional[unicode]),
+ ('subscription', typing.Optional[unicode]),
+ ('id_label', typing.Optional[unicode]),
+ ('with_attributes', bool),
+ ('timestamp_attribute', typing.Optional[unicode]),
+ ]
+)
+
+
+class ReadFromPubSub(beam.PTransform):
+ """An external ``PTransform`` for reading from Cloud Pub/Sub.
+
+ Experimental; no backwards compatibility guarantees. It requires special
+ preparation of the Java SDK. See BEAM-7870.
+ """
+
+ URN = 'beam:external:java:pubsub:read:v1'
+
+ def __init__(self, topic=None, subscription=None, id_label=None,
+ with_attributes=False, timestamp_attribute=None,
+ expansion_service=None):
+ """Initializes ``ReadFromPubSub``.
+
+ Args:
+ topic: Cloud Pub/Sub topic in the form
+ "projects/<project>/topics/<topic>". If provided, subscription must be
+ None.
+ subscription: Existing Cloud Pub/Sub subscription to use in the
+ form "projects/<project>/subscriptions/<subscription>". If not
+ specified, a temporary subscription will be created from the specified
+ topic. If provided, topic must be None.
+ id_label: The attribute on incoming Pub/Sub messages to use as a unique
+ record identifier. When specified, the value of this attribute (which
+ can be any string that uniquely identifies the record) will be used for
+ deduplication of messages. If not provided, we cannot guarantee
+ that no duplicate data will be delivered on the Pub/Sub stream. In this
+ case, deduplication of the stream will be strictly best effort.
+ with_attributes:
+ True - output elements will be
+ :class:`~apache_beam.io.gcp.pubsub.PubsubMessage` objects.
+ False - output elements will be of type ``bytes`` (message
+ data only).
+ timestamp_attribute: Message value to use as element timestamp. If None,
+ uses message publishing time as the timestamp.
+
+ Timestamp values should be in one of two formats:
+
+ - A numerical value representing the number of milliseconds since the
+ Unix epoch.
+ - A string in RFC 3339 format, UTC timezone. Example:
+ ``2015-10-29T23:41:41.123Z``. The sub-second component of the
+ timestamp is optional, and digits beyond the first three (i.e., time
+ units smaller than milliseconds) may be ignored.
+ """
+ self.params = ReadFromPubsubSchema(
+ topic=topic,
+ subscription=subscription,
+ id_label=id_label,
+ with_attributes=with_attributes,
+ timestamp_attribute=timestamp_attribute)
+ self.expansion_service = expansion_service
+
+ def expand(self, pbegin):
+ pcoll = pbegin.apply(
+ ExternalTransform(
+ self.URN, NamedTupleBasedPayloadBuilder(self.params),
+ self.expansion_service))
+
+ if self.params.with_attributes:
+ pcoll = pcoll | 'FromProto' >> Map(pubsub.PubsubMessage._from_proto_str)
+ pcoll.element_type = pubsub.PubsubMessage
+ else:
+ pcoll.element_type = bytes
+ return pcoll
+
+
+WriteToPubsubSchema = typing.NamedTuple(
+ 'WriteToPubsubSchema',
+ [
+ ('topic', unicode),
+ ('id_label', typing.Optional[unicode]),
+ # this is not implemented yet on the Java side:
+ # ('with_attributes', bool),
+ ('timestamp_attribute', typing.Optional[unicode]),
+ ]
+)
+
+
+class WriteToPubSub(beam.PTransform):
+ """An external ``PTransform`` for writing messages to Cloud Pub/Sub.
+
+ Experimental; no backwards compatibility guarantees. It requires special
+ preparation of the Java SDK. See BEAM-7870.
+ """
+
+ URN = 'beam:external:java:pubsub:write:v1'
+
+ def __init__(self, topic, with_attributes=False, id_label=None,
+ timestamp_attribute=None, expansion_service=None):
+ """Initializes ``WriteToPubSub``.
+
+ Args:
+ topic: Cloud Pub/Sub topic in the form "/topics/<project>/<topic>".
+ with_attributes:
+ True - input elements will be
+ :class:`~apache_beam.io.gcp.pubsub.PubsubMessage` objects.
+ False - input elements will be of type ``bytes`` (message
+ data only).
+ id_label: If set, will set an attribute for each Cloud Pub/Sub message
+ with the given name and a unique value. This attribute can then be used
+ in a ReadFromPubSub PTransform to deduplicate messages.
+ timestamp_attribute: If set, will set an attribute for each Cloud Pub/Sub
+ message with the given name and the message's publish time as the value.
+ """
+ self.params = WriteToPubsubSchema(
+ topic=topic,
+ id_label=id_label,
+ # with_attributes=with_attributes,
+ timestamp_attribute=timestamp_attribute)
+ self.expansion_service = expansion_service
+ self.with_attributes = with_attributes
+
+ def expand(self, pvalue):
+ if self.with_attributes:
+ pcoll = pvalue | 'ToProto' >> Map(pubsub.WriteToPubSub.to_proto_str)
+ else:
+ pcoll = pvalue | 'ToProto' >> Map(
+ lambda x: pubsub.PubsubMessage(x, {})._to_proto_str())
+ pcoll.element_type = bytes
+
+ return pcoll.apply(
+ ExternalTransform(
+ self.URN,
+ NamedTupleBasedPayloadBuilder(self.params),
+ self.expansion_service)
+ )
diff --git a/sdks/python/apache_beam/io/external/kafka.py b/sdks/python/apache_beam/io/external/kafka.py
index f824515..04d91a7 100644
--- a/sdks/python/apache_beam/io/external/kafka.py
+++ b/sdks/python/apache_beam/io/external/kafka.py
@@ -64,7 +64,8 @@
Note: Runners need to support translating Read operations in order to use
this source. At the moment only the Flink Runner supports this.
- Experimental; no backwards compatibility guarantees.
+ Experimental; no backwards compatibility guarantees. It requires special
+ preparation of the Java SDK. See BEAM-7870.
"""
# Returns the key/value data as raw byte arrays
@@ -128,7 +129,8 @@
If no Kafka Serializer for key/value is provided, then key/value are
assumed to be byte arrays.
- Experimental; no backwards compatibility guarantees.
+ Experimental; no backwards compatibility guarantees. It requires special
+ preparation of the Java SDK. See BEAM-7870.
"""
# Default serializer which passes raw bytes to Kafka
diff --git a/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py b/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py
index e74a39b..cb285ea 100644
--- a/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py
+++ b/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py
@@ -527,6 +527,8 @@
return WaitForBQJobs.FAILED
elif job.status.state == 'DONE':
continue
+ else:
+ return WaitForBQJobs.WAITING
return WaitForBQJobs.ALL_DONE
diff --git a/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py b/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py
index 3a1c1eb..035be18 100644
--- a/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py
+++ b/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py
@@ -23,6 +23,7 @@
import logging
import os
import random
+import sys
import time
import unittest
@@ -424,6 +425,90 @@
assert_that(jobs,
equal_to([job_reference]), label='CheckJobs')
+ @unittest.skipIf(sys.version_info[0] == 2,
+ 'Mock pickling problems in Py 2')
+ @mock.patch('time.sleep')
+ def test_wait_for_job_completion(self, sleep_mock):
+ job_references = [bigquery_api.JobReference(),
+ bigquery_api.JobReference()]
+ job_references[0].projectId = 'project1'
+ job_references[0].jobId = 'jobId1'
+ job_references[1].projectId = 'project1'
+ job_references[1].jobId = 'jobId2'
+
+ job_1_waiting = mock.Mock()
+ job_1_waiting.status.state = 'RUNNING'
+ job_2_done = mock.Mock()
+ job_2_done.status.state = 'DONE'
+ job_2_done.status.errorResult = None
+
+ job_1_done = mock.Mock()
+ job_1_done.status.state = 'DONE'
+ job_1_done.status.errorResult = None
+
+ bq_client = mock.Mock()
+ bq_client.jobs.Get.side_effect = [
+ job_1_waiting,
+ job_2_done,
+ job_1_done,
+ job_2_done]
+
+ waiting_dofn = bqfl.WaitForBQJobs(bq_client)
+
+ dest_list = [(i, job) for i, job in enumerate(job_references)]
+
+ with TestPipeline('DirectRunner') as p:
+ references = beam.pvalue.AsList(p | 'job_ref' >> beam.Create(dest_list))
+ outputs = (p
+ | beam.Create([''])
+ | beam.ParDo(waiting_dofn, references))
+
+ assert_that(outputs,
+ equal_to(dest_list))
+
+ sleep_mock.assert_called_once()
+
+ @unittest.skipIf(sys.version_info[0] == 2,
+ 'Mock pickling problems in Py 2')
+ @mock.patch('time.sleep')
+ def test_one_job_failed_after_waiting(self, sleep_mock):
+ job_references = [bigquery_api.JobReference(),
+ bigquery_api.JobReference()]
+ job_references[0].projectId = 'project1'
+ job_references[0].jobId = 'jobId1'
+ job_references[1].projectId = 'project1'
+ job_references[1].jobId = 'jobId2'
+
+ job_1_waiting = mock.Mock()
+ job_1_waiting.status.state = 'RUNNING'
+ job_2_done = mock.Mock()
+ job_2_done.status.state = 'DONE'
+ job_2_done.status.errorResult = None
+
+ job_1_error = mock.Mock()
+ job_1_error.status.state = 'DONE'
+ job_1_error.status.errorResult = 'Some problems happened'
+
+ bq_client = mock.Mock()
+ bq_client.jobs.Get.side_effect = [
+ job_1_waiting,
+ job_2_done,
+ job_1_error,
+ job_2_done]
+
+ waiting_dofn = bqfl.WaitForBQJobs(bq_client)
+
+ dest_list = [(i, job) for i, job in enumerate(job_references)]
+
+ with self.assertRaises(Exception):
+ with TestPipeline('DirectRunner') as p:
+ references = beam.pvalue.AsList(p | 'job_ref' >> beam.Create(dest_list))
+ _ = (p
+ | beam.Create([''])
+ | beam.ParDo(waiting_dofn, references))
+
+ sleep_mock.assert_called_once()
+
def test_multiple_partition_files(self):
destination = 'project1:dataset1.table1'
diff --git a/sdks/python/apache_beam/io/gcp/bigquery_test.py b/sdks/python/apache_beam/io/gcp/bigquery_test.py
index 153635c..b8c8c1c 100644
--- a/sdks/python/apache_beam/io/gcp/bigquery_test.py
+++ b/sdks/python/apache_beam/io/gcp/bigquery_test.py
@@ -36,6 +36,7 @@
import apache_beam as beam
from apache_beam.internal.gcp.json_value import to_json_value
+from apache_beam.io.filebasedsink_test import _TestCaseWithTempDirCleanUp
from apache_beam.io.gcp import bigquery_tools
from apache_beam.io.gcp.bigquery import TableRowJsonCoder
from apache_beam.io.gcp.bigquery import WriteToBigQuery
@@ -503,7 +504,14 @@
# InsertRows not called in finish bundle as no records
self.assertFalse(client.tabledata.InsertAll.called)
+
+@unittest.skipIf(HttpError is None, 'GCP dependencies are not installed')
+class PipelineBasedStreamingInsertTest(_TestCaseWithTempDirCleanUp):
+
def test_failure_has_same_insert_ids(self):
+ tempdir = '%s%s' % (self._new_tempdir(), os.sep)
+ file_name_1 = os.path.join(tempdir, 'file1')
+ file_name_2 = os.path.join(tempdir, 'file2')
def store_callback(arg):
insert_ids = [r.insertId for r in arg.tableDataInsertAllRequest.rows]
@@ -513,11 +521,13 @@
'colA_values': colA_values}
# The first time we try to insert, we save those insertions in
# file insert_calls1.
- if not os.path.exists('insert_calls1'):
- json.dump(json_output, open('insert_calls1', 'w'))
- raise Exception()
+ if not os.path.exists(file_name_1):
+ with open(file_name_1, 'w') as f:
+ json.dump(json_output, f)
+ raise RuntimeError()
else:
- json.dump(json_output, open('insert_calls2', 'w'))
+ with open(file_name_2, 'w') as f:
+ json.dump(json_output, f)
res = mock.Mock()
res.insertErrors = []
@@ -526,6 +536,8 @@
client = mock.Mock()
client.tabledata.InsertAll = mock.Mock(side_effect=store_callback)
+ # Using the bundle based direct runner to avoid pickling problems
+ # with mocks.
with beam.Pipeline(runner='BundleBasedDirectRunner') as p:
_ = (p
| beam.Create([{'columnA':'value1', 'columnB':'value2'},
@@ -540,9 +552,10 @@
None, None,
[], test_client=client))
- self.assertEqual(
- json.load(open('insert_calls1')),
- json.load(open('insert_calls2')))
+ with open(file_name_1) as f1, open(file_name_2) as f2:
+ self.assertEqual(
+ json.load(f1),
+ json.load(f2))
class BigQueryStreamingInsertTransformIntegrationTests(unittest.TestCase):
diff --git a/sdks/python/apache_beam/io/gcp/datastore/v1new/types.py b/sdks/python/apache_beam/io/gcp/datastore/v1new/types.py
index 7370d97..a664ec7 100644
--- a/sdks/python/apache_beam/io/gcp/datastore/v1new/types.py
+++ b/sdks/python/apache_beam/io/gcp/datastore/v1new/types.py
@@ -221,6 +221,8 @@
for name, value in client_entity.items():
if isinstance(value, key.Key):
value = Key.from_client_key(value)
+ if isinstance(value, entity.Entity):
+ value = Entity.from_client_entity(value)
res.properties[name] = value
return res
@@ -236,6 +238,10 @@
if not value.project:
value.project = self.key.project
value = value.to_client_key()
+ if isinstance(value, Entity):
+ if not value.key.project:
+ value.key.project = self.key.project
+ value = value.to_client_entity()
res[name] = value
return res
diff --git a/sdks/python/apache_beam/io/gcp/datastore/v1new/types_test.py b/sdks/python/apache_beam/io/gcp/datastore/v1new/types_test.py
index 0f6f936..21633d9 100644
--- a/sdks/python/apache_beam/io/gcp/datastore/v1new/types_test.py
+++ b/sdks/python/apache_beam/io/gcp/datastore/v1new/types_test.py
@@ -19,6 +19,7 @@
from __future__ import absolute_import
+import datetime
import logging
import unittest
@@ -29,6 +30,7 @@
# Protect against environments where datastore library is not available.
try:
from google.cloud.datastore import client
+ from google.cloud.datastore.helpers import GeoPoint
from apache_beam.io.gcp.datastore.v1new.types import Entity
from apache_beam.io.gcp.datastore.v1new.types import Key
from apache_beam.io.gcp.datastore.v1new.types import Query
@@ -49,30 +51,52 @@
# Don't do any network requests.
_http=mock.MagicMock())
+ def _assert_keys_equal(self, beam_type, client_type, expected_project):
+ self.assertEqual(beam_type.path_elements[0], client_type.kind)
+ self.assertEqual(beam_type.path_elements[1], client_type.id)
+ self.assertEqual(expected_project, client_type.project)
+
def testEntityToClientEntity(self):
+ # Test conversion from Beam type to client type.
k = Key(['kind', 1234], project=self._PROJECT)
kc = k.to_client_key()
- exclude_from_indexes = ('efi1', 'efi2')
+ exclude_from_indexes = ('datetime', 'key')
e = Entity(k, exclude_from_indexes=exclude_from_indexes)
- ref = Key(['kind2', 1235])
- e.set_properties({'efi1': 'value', 'property': 'value', 'ref': ref})
+ properties = {
+ 'datetime': datetime.datetime.utcnow(),
+ 'key_ref': Key(['kind2', 1235]),
+ 'bool': True,
+ 'float': 1.21,
+ 'int': 1337,
+ 'unicode': 'text',
+ 'bytes': b'bytes',
+ 'geopoint': GeoPoint(0.123, 0.456),
+ 'none': None,
+ 'list': [1, 2, 3],
+ 'entity': Entity(Key(['kind', 111])),
+ 'dict': {'property': 5},
+ }
+ e.set_properties(properties)
ec = e.to_client_entity()
self.assertEqual(kc, ec.key)
self.assertSetEqual(set(exclude_from_indexes), ec.exclude_from_indexes)
self.assertEqual('kind', ec.kind)
self.assertEqual(1234, ec.id)
- self.assertEqual('kind2', ec['ref'].kind)
- self.assertEqual(1235, ec['ref'].id)
- self.assertEqual(self._PROJECT, ec['ref'].project)
+ for name, unconverted in properties.items():
+ converted = ec[name]
+ if name == 'key_ref':
+ self.assertNotIsInstance(converted, Key)
+ self._assert_keys_equal(unconverted, converted, self._PROJECT)
+ elif name == 'entity':
+ self.assertNotIsInstance(converted, Entity)
+ self.assertNotIsInstance(converted.key, Key)
+ self._assert_keys_equal(unconverted.key, converted.key, self._PROJECT)
+ else:
+ self.assertEqual(unconverted, converted)
- def testEntityFromClientEntity(self):
- k = Key(['kind', 1234], project=self._PROJECT)
- exclude_from_indexes = ('efi1', 'efi2')
- e = Entity(k, exclude_from_indexes=exclude_from_indexes)
- ref = Key(['kind2', 1235])
- e.set_properties({'efi1': 'value', 'property': 'value', 'ref': ref})
- efc = Entity.from_client_entity(e.to_client_entity())
- self.assertEqual(e, efc)
+ # Test reverse conversion.
+ entity_from_client_entity = Entity.from_client_entity(ec)
+ self.assertEqual(e, entity_from_client_entity)
def testKeyToClientKey(self):
k = Key(['kind1', 'parent'],
diff --git a/sdks/python/apache_beam/io/iobase.py b/sdks/python/apache_beam/io/iobase.py
index 605c1bf..5b66730 100644
--- a/sdks/python/apache_beam/io/iobase.py
+++ b/sdks/python/apache_beam/io/iobase.py
@@ -1440,6 +1440,9 @@
def restriction_size(self, element, restriction):
return restriction.weight
+ def restriction_coder(self):
+ return coders.DillCoder()
+
def __init__(self, source):
if not isinstance(source, BoundedSource):
raise RuntimeError('SDFBoundedSourceWrapper can only wrap BoundedSource')
diff --git a/sdks/python/apache_beam/io/iobase_test.py b/sdks/python/apache_beam/io/iobase_test.py
index c7d1656..7adb764 100644
--- a/sdks/python/apache_beam/io/iobase_test.py
+++ b/sdks/python/apache_beam/io/iobase_test.py
@@ -21,10 +21,16 @@
import unittest
+import mock
+
+import apache_beam as beam
from apache_beam.io.concat_source import ConcatSource
from apache_beam.io.concat_source_test import RangeSource
from apache_beam.io import iobase
from apache_beam.io.iobase import SourceBundle
+from apache_beam.options.pipeline_options import DebugOptions
+from apache_beam.testing.util import assert_that
+from apache_beam.testing.util import equal_to
class SDFBoundedSourceRestrictionProviderTest(unittest.TestCase):
@@ -154,5 +160,36 @@
self.sdf_restriction_tracker._weight)
+class UseSdfBoundedSourcesTests(unittest.TestCase):
+
+ def _run_sdf_wrapper_pipeline(self, source, expected_values):
+ with beam.Pipeline() as p:
+ experiments = (p._options.view_as(DebugOptions).experiments or [])
+
+ # Setup experiment option to enable using SDFBoundedSourceWrapper
+ if 'use_sdf_bounded_source' not in experiments:
+ experiments.append('use_sdf_bounded_source')
+ if 'beam_fn_api' not in experiments:
+ # Required so mocking below doesn't mock Create used in assert_that.
+ experiments.append('beam_fn_api')
+
+ p._options.view_as(DebugOptions).experiments = experiments
+
+ actual = p | beam.io.Read(source)
+ assert_that(actual, equal_to(expected_values))
+
+ @mock.patch('apache_beam.io.iobase._SDFBoundedSourceWrapper.expand')
+ def test_sdf_wrapper_overrides_read(self, sdf_wrapper_mock_expand):
+ def _fake_wrapper_expand(pbegin):
+ return (pbegin
+ | beam.Create(['fake']))
+
+ sdf_wrapper_mock_expand.side_effect = _fake_wrapper_expand
+ self._run_sdf_wrapper_pipeline(RangeSource(0, 4), ['fake'])
+
+ def test_sdf_wrap_range_source(self):
+ self._run_sdf_wrapper_pipeline(RangeSource(0, 4), [0, 1, 2, 3])
+
+
if __name__ == '__main__':
unittest.main()
diff --git a/sdks/python/apache_beam/metrics/monitoring_infos.py b/sdks/python/apache_beam/metrics/monitoring_infos.py
index 0e73461..b75ef3a 100644
--- a/sdks/python/apache_beam/metrics/monitoring_infos.py
+++ b/sdks/python/apache_beam/metrics/monitoring_infos.py
@@ -217,6 +217,25 @@
labels)
+def int64_gauge(urn, metric, ptransform=None, tag=None):
+ """Return the gauge monitoring info for the URN, metric and labels.
+
+ Args:
+ urn: The URN of the monitoring info/metric.
+ metric: The metric proto field to use in the monitoring info.
+ ptransform: The ptransform/step name used as a label.
+ tag: The output tag name, used as a label.
+ """
+ labels = create_labels(ptransform=ptransform, tag=tag)
+ if isinstance(metric, int):
+ metric = metrics_pb2.Metric(
+ counter_data=metrics_pb2.CounterData(
+ int64_value=metric
+ )
+ )
+ return create_monitoring_info(urn, LATEST_INT64_TYPE, metric, labels)
+
+
def create_monitoring_info(urn, type_urn, metric_proto, labels=None):
"""Return the gauge monitoring info for the URN, type, metric and labels.
@@ -300,6 +319,13 @@
return split[0], split[1]
+def get_step_name(monitoring_info_proto):
+ """Returns a step name for the given monitoring info or None if step name
+ cannot be specified."""
+ # Right now only metrics that have a PTRANSFORM are taken into account
+ return monitoring_info_proto.labels.get(PTRANSFORM_LABEL)
+
+
def to_key(monitoring_info_proto):
"""Returns a key based on the URN and labels.
diff --git a/sdks/python/apache_beam/options/pipeline_options.py b/sdks/python/apache_beam/options/pipeline_options.py
index 5842f7e..22d1586 100644
--- a/sdks/python/apache_beam/options/pipeline_options.py
+++ b/sdks/python/apache_beam/options/pipeline_options.py
@@ -814,7 +814,8 @@
class PortableOptions(PipelineOptions):
"""Portable options are common options expected to be understood by most of
- the portable runners.
+ the portable runners. Should generally be kept in sync with
+ PortablePipelineOptions.java.
"""
@classmethod
def _add_argparse_args(cls, parser):
@@ -845,9 +846,9 @@
'"<ENV_VAL>"} }. All fields in the json are optional except '
'command.'))
parser.add_argument(
- '--sdk_worker_parallelism', default=0,
+ '--sdk_worker_parallelism', default=1,
help=('Sets the number of sdk worker processes that will run on each '
- 'worker node. Default is 0. If 0, a value will be chosen by the '
+ 'worker node. Default is 1. If 0, a value will be chosen by the '
'runner.'))
parser.add_argument(
'--environment_cache_millis', default=0,
diff --git a/sdks/python/apache_beam/pipeline.py b/sdks/python/apache_beam/pipeline.py
index 5ce95d0..7dd1299 100644
--- a/sdks/python/apache_beam/pipeline.py
+++ b/sdks/python/apache_beam/pipeline.py
@@ -154,10 +154,9 @@
raise ValueError(
'Pipeline has validations errors: \n' + '\n'.join(errors))
- # set default experiments for portable runner
+ # set default experiments for portable runners
# (needs to occur prior to pipeline construction)
- portable_runners = ['PortableRunner', 'FlinkRunner']
- if self._options.view_as(StandardOptions).runner in portable_runners:
+ if runner.is_fnapi_compatible():
experiments = (self._options.view_as(DebugOptions).experiments or [])
if not 'beam_fn_api' in experiments:
experiments.append('beam_fn_api')
@@ -486,8 +485,7 @@
label or transform.label]).lstrip('/')
if full_label in self.applied_labels:
raise RuntimeError(
- 'Transform "%s" does not have a stable unique label. '
- 'This will prevent updating of pipelines. '
+ 'A transform with label "%s" already exists in the pipeline. '
'To apply a transform with a specified label write '
'pvalue | "label" >> transform'
% full_label)
@@ -545,13 +543,15 @@
return pvalueish_result
def _infer_result_type(self, transform, inputs, result_pcollection):
- # TODO(robertwb): Multi-input, multi-output inference.
+ # TODO(robertwb): Multi-input inference.
type_options = self._options.view_as(TypeOptions)
- if (type_options is not None and type_options.pipeline_type_check
- and isinstance(result_pcollection, pvalue.PCollection)
+ if type_options is None or not type_options.pipeline_type_check:
+ return
+ if (isinstance(result_pcollection, pvalue.PCollection)
and (not result_pcollection.element_type
# TODO(robertwb): Ideally we'd do intersection here.
or result_pcollection.element_type == typehints.Any)):
+ # Single-input, single-output inference.
input_element_type = (
inputs[0].element_type
if len(inputs) == 1
@@ -571,6 +571,13 @@
else:
result_pcollection.element_type = transform.infer_output_type(
input_element_type)
+ elif isinstance(result_pcollection, pvalue.DoOutputsTuple):
+ # Single-input, multi-output inference.
+ # TODO(BEAM-4132): Add support for tagged type hints.
+ # https://github.com/apache/beam/pull/9810#discussion_r338765251
+ for pcoll in result_pcollection:
+ if pcoll.element_type is None:
+ pcoll.element_type = typehints.Any
def __reduce__(self):
# Some transforms contain a reference to their enclosing pipeline,
diff --git a/sdks/python/apache_beam/pipeline_test.py b/sdks/python/apache_beam/pipeline_test.py
index e01e100..a3128db 100644
--- a/sdks/python/apache_beam/pipeline_test.py
+++ b/sdks/python/apache_beam/pipeline_test.py
@@ -290,9 +290,8 @@
pipeline.apply(transform, pcoll2)
self.assertEqual(
cm.exception.args[0],
- 'Transform "CustomTransform" does not have a stable unique label. '
- 'This will prevent updating of pipelines. '
- 'To apply a transform with a specified label write '
+ 'A transform with label "CustomTransform" already exists in the '
+ 'pipeline. To apply a transform with a specified label write '
'pvalue | "label" >> transform')
def test_reuse_cloned_custom_transform_instance(self):
diff --git a/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py b/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py
index 9b2cf68..4928550 100644
--- a/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py
+++ b/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py
@@ -23,6 +23,7 @@
from __future__ import absolute_import
from __future__ import division
+import base64
import json
import logging
import sys
@@ -111,6 +112,9 @@
self._cache = cache if cache is not None else PValueCache()
self._unique_step_id = 0
+ def is_fnapi_compatible(self):
+ return False
+
def _get_unique_step_name(self):
self._unique_step_id += 1
return 's%s' % self._unique_step_id
@@ -629,8 +633,16 @@
coders.BytesCoder(),
coders.coders.GlobalWindowCoder()).get_impl().encode_nested(
window.GlobalWindows.windowed_value(b''))
+
+ from apache_beam.runners.dataflow.internal import apiclient
+ if apiclient._use_fnapi(options):
+ encoded_impulse_as_str = self.byte_array_to_json_string(
+ encoded_impulse_element)
+ else:
+ encoded_impulse_as_str = base64.b64encode(
+ encoded_impulse_element).decode('ascii')
step.add_property(PropertyNames.IMPULSE_ELEMENT,
- self.byte_array_to_json_string(encoded_impulse_element))
+ encoded_impulse_as_str)
step.encoding = self._get_encoded_output_coder(transform_node)
step.add_property(
diff --git a/sdks/python/apache_beam/runners/dataflow/internal/names.py b/sdks/python/apache_beam/runners/dataflow/internal/names.py
index c0445a2..7e4f825 100644
--- a/sdks/python/apache_beam/runners/dataflow/internal/names.py
+++ b/sdks/python/apache_beam/runners/dataflow/internal/names.py
@@ -38,10 +38,10 @@
# Update this version to the next version whenever there is a change that will
# require changes to legacy Dataflow worker execution environment.
-BEAM_CONTAINER_VERSION = 'beam-master-20191010'
+BEAM_CONTAINER_VERSION = 'beam-master-20191029'
# Update this version to the next version whenever there is a change that
# requires changes to SDK harness container or SDK harness launcher.
-BEAM_FNAPI_CONTAINER_VERSION = 'beam-master-20191010'
+BEAM_FNAPI_CONTAINER_VERSION = 'beam-master-20191029'
# TODO(BEAM-5939): Remove these shared names once Dataflow worker is updated.
PICKLED_MAIN_SESSION_FILE = 'pickled_main_session'
diff --git a/sdks/python/apache_beam/runners/direct/direct_runner.py b/sdks/python/apache_beam/runners/direct/direct_runner.py
index e3643d7..d85fc97 100644
--- a/sdks/python/apache_beam/runners/direct/direct_runner.py
+++ b/sdks/python/apache_beam/runners/direct/direct_runner.py
@@ -69,6 +69,9 @@
implemented in the FnApiRunner.
"""
+ def is_fnapi_compatible(self):
+ return BundleBasedDirectRunner.is_fnapi_compatible()
+
def run_pipeline(self, pipeline, options):
from apache_beam.pipeline import PipelineVisitor
@@ -336,6 +339,10 @@
class BundleBasedDirectRunner(PipelineRunner):
"""Executes a single pipeline on the local machine."""
+ @staticmethod
+ def is_fnapi_compatible():
+ return False
+
def run_pipeline(self, pipeline, options):
"""Execute the entire pipeline and returns an DirectPipelineResult."""
diff --git a/sdks/python/apache_beam/runners/interactive/README.md b/sdks/python/apache_beam/runners/interactive/README.md
index 6f187de..bdcb85d 100644
--- a/sdks/python/apache_beam/runners/interactive/README.md
+++ b/sdks/python/apache_beam/runners/interactive/README.md
@@ -225,7 +225,7 @@
```bash
$ ./gradlew -p sdks/python/container/py35 docker # Optionally replace py35 with the Python version of your choice
- $ ./gradlew :runners:flink:1.8:job-server:runShadow # Blocking
+ $ ./gradlew :runners:flink:1.9:job-server:runShadow # Blocking
```
* Run `$ jupyter notebook` in another terminal.
diff --git a/sdks/python/apache_beam/runners/interactive/interactive_runner.py b/sdks/python/apache_beam/runners/interactive/interactive_runner.py
index 4bf125e..94c0de7 100644
--- a/sdks/python/apache_beam/runners/interactive/interactive_runner.py
+++ b/sdks/python/apache_beam/runners/interactive/interactive_runner.py
@@ -65,6 +65,10 @@
self._renderer = pipeline_graph_renderer.get_renderer(render_option)
self._in_session = False
+ def is_fnapi_compatible(self):
+ # TODO(BEAM-8436): return self._underlying_runner.is_fnapi_compatible()
+ return False
+
def set_render_option(self, render_option):
"""Sets the rendering option.
diff --git a/sdks/python/apache_beam/runners/portability/artifact_service.py b/sdks/python/apache_beam/runners/portability/artifact_service.py
index 94aea5b..100eca5 100644
--- a/sdks/python/apache_beam/runners/portability/artifact_service.py
+++ b/sdks/python/apache_beam/runners/portability/artifact_service.py
@@ -57,7 +57,7 @@
return path + '.tmp'
def _open(self, path, mode):
- return self._zipfile.open(path, mode, force_zip64=True)
+ raise NotImplementedError(type(self))
def _rename(self, src, dest):
raise NotImplementedError(type(self))
@@ -186,6 +186,18 @@
return super(
ZipFileArtifactService, self).CommitManifest(request, context)
+ def GetManifest(self, request, context=None):
+ # ZipFile appears to not be threadsafe on some platforms.
+ with self._lock:
+ return super(ZipFileArtifactService, self).GetManifest(request, context)
+
+ def GetArtifact(self, request, context=None):
+ # ZipFile appears to not be threadsafe on some platforms.
+ with self._lock:
+ for chunk in super(ZipFileArtifactService, self).GetArtifact(
+ request, context):
+ yield chunk
+
def close(self):
self._zipfile.close()
diff --git a/sdks/python/apache_beam/runners/portability/flink_runner.py b/sdks/python/apache_beam/runners/portability/flink_runner.py
index 6d50753..ac21e32 100644
--- a/sdks/python/apache_beam/runners/portability/flink_runner.py
+++ b/sdks/python/apache_beam/runners/portability/flink_runner.py
@@ -20,6 +20,8 @@
from __future__ import absolute_import
from __future__ import print_function
+import logging
+import re
import sys
from apache_beam.options import pipeline_options
@@ -27,23 +29,46 @@
from apache_beam.runners.portability import job_server
from apache_beam.runners.portability import portable_runner
-PUBLISHED_FLINK_VERSIONS = ['1.7', '1.8']
+PUBLISHED_FLINK_VERSIONS = ['1.7', '1.8', '1.9']
+MAGIC_HOST_NAMES = ['[local]', '[auto]']
class FlinkRunner(portable_runner.PortableRunner):
def default_job_server(self, options):
- flink_master_url = options.view_as(FlinkRunnerOptions).flink_master_url
- if flink_master_url == '[local]' or sys.version_info < (3, 6):
- # TOOD(BEAM-8396): Also default to LOOPBACK for [local].
+ flink_master = self.add_http_scheme(
+ options.view_as(FlinkRunnerOptions).flink_master)
+ options.view_as(FlinkRunnerOptions).flink_master = flink_master
+ if flink_master in MAGIC_HOST_NAMES or sys.version_info < (3, 6):
return job_server.StopOnExitJobServer(FlinkJarJobServer(options))
else:
- return flink_uber_jar_job_server.FlinkUberJarJobServer(flink_master_url)
+ # This has to be changed [auto], otherwise we will attempt to submit a
+ # the pipeline remotely on the Flink JobMaster which will _fail_.
+ # DO NOT CHANGE the following line, unless you have tested this.
+ options.view_as(FlinkRunnerOptions).flink_master = '[auto]'
+ return flink_uber_jar_job_server.FlinkUberJarJobServer(flink_master)
+
+ @staticmethod
+ def add_http_scheme(flink_master):
+ """Adds a http protocol scheme if none provided."""
+ flink_master = flink_master.strip()
+ if not flink_master in MAGIC_HOST_NAMES and \
+ not re.search('^http[s]?://', flink_master):
+ logging.info('Adding HTTP protocol scheme to flink_master parameter: '
+ 'http://%s', flink_master)
+ flink_master = 'http://' + flink_master
+ return flink_master
class FlinkRunnerOptions(pipeline_options.PipelineOptions):
@classmethod
def _add_argparse_args(cls, parser):
- parser.add_argument('--flink_master_url', default='[local]')
+ parser.add_argument('--flink_master',
+ default='[auto]',
+ help='Flink master address (http://host:port)'
+ ' Use "[local]" to start a local cluster'
+ ' for the execution. Use "[auto]" if you'
+ ' plan to either execute locally or let the'
+ ' Flink job server infer the cluster address.')
parser.add_argument('--flink_version',
default=PUBLISHED_FLINK_VERSIONS[-1],
choices=PUBLISHED_FLINK_VERSIONS,
@@ -58,7 +83,7 @@
super(FlinkJarJobServer, self).__init__()
options = options.view_as(FlinkRunnerOptions)
self._jar = options.flink_job_server_jar
- self._master_url = options.flink_master_url
+ self._master_url = options.flink_master
self._flink_version = options.flink_version
self._artifacts_dir = options.artifacts_dir
diff --git a/sdks/python/apache_beam/runners/portability/flink_runner_test.py b/sdks/python/apache_beam/runners/portability/flink_runner_test.py
index 397297b..0328dff 100644
--- a/sdks/python/apache_beam/runners/portability/flink_runner_test.py
+++ b/sdks/python/apache_beam/runners/portability/flink_runner_test.py
@@ -31,6 +31,7 @@
from apache_beam import Impulse
from apache_beam import Map
from apache_beam import Pipeline
+from apache_beam.coders import VarIntCoder
from apache_beam.io.external.generate_sequence import GenerateSequence
from apache_beam.io.external.kafka import ReadFromKafka
from apache_beam.io.external.kafka import WriteToKafka
@@ -42,6 +43,7 @@
from apache_beam.runners.portability import portable_runner_test
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
+from apache_beam.transforms import userstate
if __name__ == '__main__':
# Run as
@@ -107,7 +109,8 @@
f.write(linesep.join([
'metrics.reporters: file',
'metrics.reporter.file.class: %s' % file_reporter,
- 'metrics.reporter.file.path: %s' % cls.test_metrics_path
+ 'metrics.reporter.file.path: %s' % cls.test_metrics_path,
+ 'metrics.scope.operator: <operator_name>',
]))
@classmethod
@@ -219,42 +222,102 @@
with_transcoding=False)
def test_metrics(self):
- """Run a simple DoFn that increments a counter, and verify that its
- expected value is written to a temporary file by the FileReporter"""
+ """Run a simple DoFn that increments a counter and verifies state
+ caching metrics. Verifies that its expected value is written to a
+ temporary file by the FileReporter"""
counter_name = 'elem_counter'
+ state_spec = userstate.BagStateSpec('state', VarIntCoder())
class DoFn(beam.DoFn):
def __init__(self):
self.counter = Metrics.counter(self.__class__, counter_name)
logging.info('counter: %s' % self.counter.metric_name)
- def process(self, v):
+ def process(self, kv, state=beam.DoFn.StateParam(state_spec)):
+ # Trigger materialization
+ list(state.read())
+ state.add(1)
self.counter.inc()
options = self.create_options()
# Test only supports parallelism of 1
options._all_options['parallelism'] = 1
- n = 100
+ # Create multiple bundles to test cache metrics
+ options._all_options['max_bundle_size'] = 10
+ options._all_options['max_bundle_time_millis'] = 95130590130
+ experiments = options.view_as(DebugOptions).experiments or []
+ experiments.append('state_cache_size=123')
+ options.view_as(DebugOptions).experiments = experiments
with Pipeline(self.get_runner(), options) as p:
# pylint: disable=expression-not-assigned
(p
- | beam.Create(list(range(n)))
- | beam.ParDo(DoFn()))
+ | "create" >> beam.Create(list(range(0, 110)))
+ | "mapper" >> beam.Map(lambda x: (x % 10, 'val'))
+ | "stateful" >> beam.ParDo(DoFn()))
+ lines_expected = {'counter: 110'}
+ if streaming:
+ lines_expected.update([
+ # Gauges for the last finished bundle
+ 'stateful.beam.metric:statecache:capacity: 123',
+ # These are off by 10 because the first bundle contains all the keys
+ # once. Caching is only initialized after the first bundle. Caching
+ # depends on the cache token which is lazily initialized by the
+ # Runner's StateRequestHandlers.
+ 'stateful.beam.metric:statecache:size: 10',
+ 'stateful.beam.metric:statecache:get: 10',
+ 'stateful.beam.metric:statecache:miss: 0',
+ 'stateful.beam.metric:statecache:hit: 10',
+ 'stateful.beam.metric:statecache:put: 0',
+ 'stateful.beam.metric:statecache:extend: 10',
+ 'stateful.beam.metric:statecache:evict: 0',
+ # Counters
+ # (total of get/hit will be off by 10 due to the caching
+ # only getting initialized after the first bundle.
+ # Caching depends on the cache token which is lazily
+ # initialized by the Runner's StateRequestHandlers).
+ 'stateful.beam.metric:statecache:get_total: 100',
+ 'stateful.beam.metric:statecache:miss_total: 10',
+ 'stateful.beam.metric:statecache:hit_total: 90',
+ 'stateful.beam.metric:statecache:put_total: 10',
+ 'stateful.beam.metric:statecache:extend_total: 100',
+ 'stateful.beam.metric:statecache:evict_total: 0',
+ ])
+ else:
+ # Batch has a different processing model. All values for
+ # a key are processed at once.
+ lines_expected.update([
+ # Gauges
+ 'stateful).beam.metric:statecache:capacity: 123',
+ # For the first key, the cache token will not be set yet.
+ # It's lazily initialized after first access in StateRequestHandlers
+ 'stateful).beam.metric:statecache:size: 9',
+ # We have 11 here because there are 110 / 10 elements per key
+ 'stateful).beam.metric:statecache:get: 11',
+ 'stateful).beam.metric:statecache:miss: 1',
+ 'stateful).beam.metric:statecache:hit: 10',
+ # State is flushed back once per key
+ 'stateful).beam.metric:statecache:put: 1',
+ 'stateful).beam.metric:statecache:extend: 1',
+ 'stateful).beam.metric:statecache:evict: 0',
+ # Counters
+ 'stateful).beam.metric:statecache:get_total: 99',
+ 'stateful).beam.metric:statecache:miss_total: 9',
+ 'stateful).beam.metric:statecache:hit_total: 90',
+ 'stateful).beam.metric:statecache:put_total: 9',
+ 'stateful).beam.metric:statecache:extend_total: 9',
+ 'stateful).beam.metric:statecache:evict_total: 0',
+ ])
+ lines_actual = set()
with open(self.test_metrics_path, 'r') as f:
- lines = [line for line in f.readlines() if counter_name in line]
- self.assertEqual(
- len(lines), 1,
- msg='Expected 1 line matching "{}":\n{}'.format(
- counter_name, '\n'.join(lines))
- )
- line = lines[0]
- self.assertTrue(
- '{}: {}'.format(counter_name in line, n),
- msg='Failed to find expected counter {} in line {}'.format(
- counter_name, line)
- )
+ line = f.readline()
+ while line:
+ for metric_str in lines_expected:
+ if metric_str in line:
+ lines_actual.add(metric_str)
+ line = f.readline()
+ self.assertSetEqual(lines_actual, lines_expected)
def test_sdf_with_sdf_initiated_checkpointing(self):
raise unittest.SkipTest("BEAM-2939")
diff --git a/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py b/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py
index ebbd98e..b69da66 100644
--- a/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py
+++ b/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py
@@ -42,6 +42,11 @@
class FlinkUberJarJobServer(abstract_job_service.AbstractJobServiceServicer):
+ """A Job server which submits a self-contained Jar to a Flink cluster.
+
+ The jar contains the Beam pipeline definition, dependencies, and
+ the pipeline artifacts.
+ """
def __init__(self, master_url, executable_jar=None):
super(FlinkUberJarJobServer, self).__init__()
@@ -77,6 +82,8 @@
class FlinkBeamJob(abstract_job_service.AbstractBeamJob):
+ """Runs a single Beam job on Flink by staging all contents into a Jar
+ and uploading it via the Flink Rest API."""
# These must agree with those defined in PortablePipelineJarUtils.java.
PIPELINE_FOLDER = 'BEAM-PIPELINE'
@@ -208,7 +215,7 @@
'RECONCILING': beam_job_api_pb2.JobState.RUNNING,
'IN_PROGRESS': beam_job_api_pb2.JobState.RUNNING,
'COMPLETED': beam_job_api_pb2.JobState.DONE,
- }.get(flink_status, beam_job_api_pb2.JobState.DONE)
+ }.get(flink_status, beam_job_api_pb2.JobState.UNSPECIFIED)
if beam_state in abstract_job_service.TERMINAL_STATES:
self.delete_jar()
return beam_state
diff --git a/sdks/python/apache_beam/runners/portability/fn_api_runner.py b/sdks/python/apache_beam/runners/portability/fn_api_runner.py
index 94b6d4c..d7e0f33 100644
--- a/sdks/python/apache_beam/runners/portability/fn_api_runner.py
+++ b/sdks/python/apache_beam/runners/portability/fn_api_runner.py
@@ -39,13 +39,12 @@
import apache_beam as beam # pylint: disable=ungrouped-imports
from apache_beam import coders
-from apache_beam import metrics
from apache_beam.coders.coder_impl import create_InputStream
from apache_beam.coders.coder_impl import create_OutputStream
+from apache_beam.metrics import metric
from apache_beam.metrics import monitoring_infos
-from apache_beam.metrics.execution import MetricKey
+from apache_beam.metrics.execution import MetricResult
from apache_beam.metrics.execution import MetricsEnvironment
-from apache_beam.metrics.metricbase import MetricName
from apache_beam.options import pipeline_options
from apache_beam.options.value_provider import RuntimeValueProvider
from apache_beam.portability import common_urns
@@ -62,6 +61,7 @@
from apache_beam.runners import runner
from apache_beam.runners.portability import artifact_service
from apache_beam.runners.portability import fn_api_runner_transforms
+from apache_beam.runners.portability import portable_metrics
from apache_beam.runners.portability.fn_api_runner_transforms import create_buffer_id
from apache_beam.runners.portability.fn_api_runner_transforms import only_element
from apache_beam.runners.portability.fn_api_runner_transforms import split_buffer_id
@@ -1133,14 +1133,14 @@
self, data_plane.InMemoryDataChannel(), state, provision_info)
self.control_conn = self
self.data_conn = self.data_plane_handler
+ state_cache = StateCache(STATE_CACHE_SIZE)
self.worker = sdk_worker.SdkWorker(
sdk_worker.BundleProcessorCache(
FnApiRunner.SingletonStateHandlerFactory(
- sdk_worker.CachingMaterializingStateHandler(
- StateCache(STATE_CACHE_SIZE), state)),
+ sdk_worker.CachingStateHandler(state_cache, state)),
data_plane.InMemoryDataChannelFactory(
self.data_plane_handler.inverse()),
- {}))
+ {}), state_cache_metrics_fn=state_cache.get_monitoring_infos)
self._uid_counter = 0
def push(self, request):
@@ -1334,9 +1334,9 @@
super(GrpcWorkerHandler, self).close()
def port_from_worker(self, port):
- return '%s:%s' % (self.localhost_from_worker(), port)
+ return '%s:%s' % (self.host_from_worker(), port)
- def localhost_from_worker(self):
+ def host_from_worker(self):
return 'localhost'
@@ -1365,6 +1365,10 @@
def stop_worker(self):
pass
+ def host_from_worker(self):
+ import socket
+ return socket.getfqdn()
+
@WorkerHandler.register_environment(python_urns.EMBEDDED_PYTHON_GRPC, bytes)
class EmbeddedGrpcWorkerHandler(GrpcWorkerHandler):
@@ -1425,12 +1429,12 @@
self._container_image = payload.container_image
self._container_id = None
- def localhost_from_worker(self):
+ def host_from_worker(self):
if sys.platform == "darwin":
# See https://docs.docker.com/docker-for-mac/networking/
return 'host.docker.internal'
else:
- return super(DockerSdkWorkerHandler, self).localhost_from_worker()
+ return super(DockerSdkWorkerHandler, self).host_from_worker()
def start_worker(self):
with SUBPROCESS_LOCK:
@@ -1879,53 +1883,34 @@
return self._response
-class FnApiMetrics(metrics.metric.MetricResults):
+class FnApiMetrics(metric.MetricResults):
def __init__(self, step_monitoring_infos, user_metrics_only=True):
"""Used for querying metrics from the PipelineResult object.
step_monitoring_infos: Per step metrics specified as MonitoringInfos.
- use_monitoring_infos: If true, return the metrics based on the
- step_monitoring_infos.
+ user_metrics_only: If true, includes user metrics only.
"""
self._counters = {}
self._distributions = {}
self._gauges = {}
self._user_metrics_only = user_metrics_only
- self._init_metrics_from_monitoring_infos(step_monitoring_infos)
self._monitoring_infos = step_monitoring_infos
- def _init_metrics_from_monitoring_infos(self, step_monitoring_infos):
for smi in step_monitoring_infos.values():
- # Only include user metrics.
- for mi in smi:
- if (self._user_metrics_only and
- not monitoring_infos.is_user_monitoring_info(mi)):
- continue
- key = self._to_metric_key(mi)
- if monitoring_infos.is_counter(mi):
- self._counters[key] = (
- monitoring_infos.extract_metric_result_map_value(mi))
- elif monitoring_infos.is_distribution(mi):
- self._distributions[key] = (
- monitoring_infos.extract_metric_result_map_value(mi))
- elif monitoring_infos.is_gauge(mi):
- self._gauges[key] = (
- monitoring_infos.extract_metric_result_map_value(mi))
-
- def _to_metric_key(self, monitoring_info):
- # Right now this assumes that all metrics have a PTRANSFORM
- transform_id = monitoring_info.labels['PTRANSFORM']
- namespace, name = monitoring_infos.parse_namespace_and_name(monitoring_info)
- return MetricKey(transform_id, MetricName(namespace, name))
+ counters, distributions, gauges = \
+ portable_metrics.from_monitoring_infos(smi, user_metrics_only)
+ self._counters.update(counters)
+ self._distributions.update(distributions)
+ self._gauges.update(gauges)
def query(self, filter=None):
- counters = [metrics.execution.MetricResult(k, v, v)
+ counters = [MetricResult(k, v, v)
for k, v in self._counters.items()
if self.matches(filter, k)]
- distributions = [metrics.execution.MetricResult(k, v, v)
+ distributions = [MetricResult(k, v, v)
for k, v in self._distributions.items()
if self.matches(filter, k)]
- gauges = [metrics.execution.MetricResult(k, v, v)
+ gauges = [MetricResult(k, v, v)
for k, v in self._gauges.items()
if self.matches(filter, k)]
diff --git a/sdks/python/apache_beam/runners/portability/fn_api_runner_test.py b/sdks/python/apache_beam/runners/portability/fn_api_runner_test.py
index e645b16..1f368c0 100644
--- a/sdks/python/apache_beam/runners/portability/fn_api_runner_test.py
+++ b/sdks/python/apache_beam/runners/portability/fn_api_runner_test.py
@@ -35,7 +35,6 @@
# patches unittest.TestCase to be python3 compatible
import future.tests.base # pylint: disable=unused-import
import hamcrest # pylint: disable=ungrouped-imports
-import mock
from hamcrest.core.matcher import Matcher
from hamcrest.core.string_description import StringDescription
from tenacity import retry
@@ -43,11 +42,11 @@
import apache_beam as beam
from apache_beam.io import restriction_trackers
-from apache_beam.io.concat_source_test import RangeSource
from apache_beam.metrics import monitoring_infos
from apache_beam.metrics.execution import MetricKey
-from apache_beam.metrics.execution import MetricsEnvironment
from apache_beam.metrics.metricbase import MetricName
+from apache_beam.options.pipeline_options import DebugOptions
+from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.portability import python_urns
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners.portability import fn_api_runner
@@ -109,48 +108,6 @@
| beam.Map(lambda e: e + 'x'))
assert_that(res, equal_to(['aax', 'bcbcx']))
- def test_pardo_metrics(self):
-
- class MyDoFn(beam.DoFn):
-
- def start_bundle(self):
- self.count = beam.metrics.Metrics.counter('ns1', 'elements')
-
- def process(self, element):
- self.count.inc(element)
- return [element]
-
- class MyOtherDoFn(beam.DoFn):
-
- def start_bundle(self):
- self.count = beam.metrics.Metrics.counter('ns2', 'elementsplusone')
-
- def process(self, element):
- self.count.inc(element + 1)
- return [element]
-
- with self.create_pipeline() as p:
- res = (p | beam.Create([1, 2, 3])
- | 'mydofn' >> beam.ParDo(MyDoFn())
- | 'myotherdofn' >> beam.ParDo(MyOtherDoFn()))
- p.run()
- if not MetricsEnvironment.METRICS_SUPPORTED:
- self.skipTest('Metrics are not supported.')
-
- counter_updates = [{'key': key, 'value': val}
- for container in p.runner.metrics_containers()
- for key, val in
- container.get_updates().counters.items()]
- counter_values = [update['value'] for update in counter_updates]
- counter_keys = [update['key'] for update in counter_updates]
- assert_that(res, equal_to([1, 2, 3]))
- self.assertEqual(counter_values, [6, 9])
- self.assertEqual(counter_keys, [
- MetricKey('mydofn',
- MetricName('ns1', 'elements')),
- MetricKey('myotherdofn',
- MetricName('ns2', 'elementsplusone'))])
-
def test_pardo_side_outputs(self):
def tee(elem, *tags):
for tag in tags:
@@ -527,34 +484,6 @@
self.assertEqual(1, len(counters))
self.assertEqual(counters[0].committed, len(''.join(data)))
- def _run_sdf_wrapper_pipeline(self, source, expected_value):
- with self.create_pipeline() as p:
- from apache_beam.options.pipeline_options import DebugOptions
- experiments = (p._options.view_as(DebugOptions).experiments or [])
-
- # Setup experiment option to enable using SDFBoundedSourceWrapper
- if not 'use_sdf_bounded_source' in experiments:
- experiments.append('use_sdf_bounded_source')
-
- p._options.view_as(DebugOptions).experiments = experiments
-
- actual = (
- p | beam.io.Read(source)
- )
- assert_that(actual, equal_to(expected_value))
-
- @mock.patch('apache_beam.io.iobase._SDFBoundedSourceWrapper.expand')
- def test_sdf_wrapper_overrides_read(self, sdf_wrapper_mock_expand):
- def _fake_wrapper_expand(pbegin):
- return (pbegin
- | beam.Create(['1']))
-
- sdf_wrapper_mock_expand.side_effect = _fake_wrapper_expand
- self._run_sdf_wrapper_pipeline(RangeSource(0, 4), ['1'])
-
- def test_sdf_wrap_range_source(self):
- self._run_sdf_wrapper_pipeline(RangeSource(0, 4), [0, 1, 2, 3])
-
def test_group_by_key(self):
with self.create_pipeline() as p:
res = (p
@@ -682,10 +611,6 @@
def test_metrics(self):
p = self.create_pipeline()
- if not isinstance(p.runner, fn_api_runner.FnApiRunner):
- # This test is inherited by others that may not support the same
- # internal way of accessing progress metrics.
- self.skipTest('Metrics not supported.')
counter = beam.metrics.Metrics.counter('ns', 'counter')
distribution = beam.metrics.Metrics.distribution('ns', 'distribution')
@@ -696,7 +621,7 @@
pcoll | 'count1' >> beam.FlatMap(lambda x: counter.inc())
pcoll | 'count2' >> beam.FlatMap(lambda x: counter.inc(len(x)))
pcoll | 'dist' >> beam.FlatMap(lambda x: distribution.update(len(x)))
- pcoll | 'gauge' >> beam.FlatMap(lambda x: gauge.set(len(x)))
+ pcoll | 'gauge' >> beam.FlatMap(lambda x: gauge.set(3))
res = p.run()
res.wait_until_finish()
@@ -856,7 +781,10 @@
(found, (urn, labels, str(description)),))
def create_pipeline(self):
- return beam.Pipeline(runner=fn_api_runner.FnApiRunner())
+ p = beam.Pipeline(runner=fn_api_runner.FnApiRunner())
+ # TODO(BEAM-8448): Fix these tests.
+ p.options.view_as(DebugOptions).experiments.remove('beam_fn_api')
+ return p
def test_element_count_metrics(self):
class GenerateTwoOutputs(beam.DoFn):
@@ -871,10 +799,6 @@
yield element
p = self.create_pipeline()
- if not isinstance(p.runner, fn_api_runner.FnApiRunner):
- # This test is inherited by others that may not support the same
- # internal way of accessing progress metrics.
- self.skipTest('Metrics not supported.')
# Produce enough elements to make sure byte sampling occurs.
num_source_elems = 100
@@ -1009,10 +933,6 @@
def test_non_user_metrics(self):
p = self.create_pipeline()
- if not isinstance(p.runner, fn_api_runner.FnApiRunner):
- # This test is inherited by others that may not support the same
- # internal way of accessing progress metrics.
- self.skipTest('Metrics not supported.')
pcoll = p | beam.Create(['a', 'zzz'])
# pylint: disable=expression-not-assigned
@@ -1053,11 +973,6 @@
@retry(reraise=True, stop=stop_after_attempt(3))
def test_progress_metrics(self):
p = self.create_pipeline()
- if not isinstance(p.runner, fn_api_runner.FnApiRunner):
- # This test is inherited by others that may not support the same
- # internal way of accessing progress metrics.
- self.skipTest('Progress metrics not supported.')
- return
_ = (p
| beam.Create([0, 0, 0, 5e-3 * DEFAULT_SAMPLING_PERIOD_MS])
@@ -1198,11 +1113,12 @@
class FnApiRunnerTestWithMultiWorkers(FnApiRunnerTest):
def create_pipeline(self):
- from apache_beam.options.pipeline_options import PipelineOptions
- pipeline_options = PipelineOptions(['--direct_num_workers', '2'])
+ pipeline_options = PipelineOptions(direct_num_workers=2)
p = beam.Pipeline(
runner=fn_api_runner.FnApiRunner(),
options=pipeline_options)
+    # TODO(BEAM-8444): Fix these tests.
+ p.options.view_as(DebugOptions).experiments.remove('beam_fn_api')
return p
def test_metrics(self):
@@ -1215,13 +1131,14 @@
class FnApiRunnerTestWithGrpcAndMultiWorkers(FnApiRunnerTest):
def create_pipeline(self):
- from apache_beam.options.pipeline_options import PipelineOptions
- pipeline_options = PipelineOptions(['--direct_num_workers', '2'])
+ pipeline_options = PipelineOptions(direct_num_workers=2)
p = beam.Pipeline(
runner=fn_api_runner.FnApiRunner(
default_environment=beam_runner_api_pb2.Environment(
urn=python_urns.EMBEDDED_PYTHON_GRPC)),
options=pipeline_options)
+    # TODO(BEAM-8444): Fix these tests.
+ p.options.view_as(DebugOptions).experiments.remove('beam_fn_api')
return p
def test_metrics(self):
@@ -1244,11 +1161,12 @@
class FnApiRunnerTestWithBundleRepeatAndMultiWorkers(FnApiRunnerTest):
def create_pipeline(self):
- from apache_beam.options.pipeline_options import PipelineOptions
- pipeline_options = PipelineOptions(['--direct_num_workers', '2'])
- return beam.Pipeline(
+ pipeline_options = PipelineOptions(direct_num_workers=2)
+ p = beam.Pipeline(
runner=fn_api_runner.FnApiRunner(bundle_repeat=3),
options=pipeline_options)
+ p.options.view_as(DebugOptions).experiments.remove('beam_fn_api')
+ return p
def test_register_finalizations(self):
raise unittest.SkipTest("TODO: Avoid bundle finalizations on repeat.")
@@ -1569,13 +1487,14 @@
class FnApiRunnerSplitTestWithMultiWorkers(FnApiRunnerSplitTest):
def create_pipeline(self):
- from apache_beam.options.pipeline_options import PipelineOptions
- pipeline_options = PipelineOptions(['--direct_num_workers', '2'])
+ pipeline_options = PipelineOptions(direct_num_workers=2)
p = beam.Pipeline(
runner=fn_api_runner.FnApiRunner(
default_environment=beam_runner_api_pb2.Environment(
urn=python_urns.EMBEDDED_PYTHON_GRPC)),
options=pipeline_options)
+    # TODO(BEAM-8444): Fix these tests.
+ p.options.view_as(DebugOptions).experiments.remove('beam_fn_api')
return p
def test_checkpoint(self):
diff --git a/sdks/python/apache_beam/runners/portability/local_job_service.py b/sdks/python/apache_beam/runners/portability/local_job_service.py
index d638d9b..4305810 100644
--- a/sdks/python/apache_beam/runners/portability/local_job_service.py
+++ b/sdks/python/apache_beam/runners/portability/local_job_service.py
@@ -31,6 +31,7 @@
import grpc
from google.protobuf import text_format
+from apache_beam.metrics import monitoring_infos
from apache_beam.portability.api import beam_artifact_api_pb2
from apache_beam.portability.api import beam_artifact_api_pb2_grpc
from apache_beam.portability.api import beam_fn_api_pb2_grpc
@@ -107,6 +108,26 @@
if os.path.exists(self._staging_dir) and self._cleanup_staging_dir:
shutil.rmtree(self._staging_dir, ignore_errors=True)
+ def GetJobMetrics(self, request, context=None):
+ if request.job_id not in self._jobs:
+ raise LookupError("Job {} does not exist".format(request.job_id))
+
+ result = self._jobs[request.job_id].result
+ monitoring_info_list = []
+ for mi in result._monitoring_infos_by_stage.values():
+ monitoring_info_list.extend(mi)
+
+ # Filter out system metrics
+ user_monitoring_info_list = [
+ x for x in monitoring_info_list
+ if monitoring_infos._is_user_monitoring_info(x) or
+ monitoring_infos._is_user_distribution_monitoring_info(x)
+ ]
+
+ return beam_job_api_pb2.GetJobMetricsResponse(
+ metrics=beam_job_api_pb2.MetricResults(
+ committed=user_monitoring_info_list))
+
class SubprocessSdkWorker(object):
"""Manages a SDK worker implemented as a subprocess communicating over grpc.
@@ -176,6 +197,7 @@
self._log_queues = []
self.state = beam_job_api_pb2.JobState.STARTING
self.daemon = True
+ self.result = None
@property
def state(self):
@@ -204,11 +226,12 @@
def _run_job(self):
with JobLogHandler(self._log_queues):
try:
- fn_api_runner.FnApiRunner(
+ result = fn_api_runner.FnApiRunner(
provision_info=self._provision_info).run_via_runner_api(
self._pipeline_proto)
logging.info('Successfully completed job.')
self.state = beam_job_api_pb2.JobState.DONE
+ self.result = result
except: # pylint: disable=bare-except
logging.exception('Error running pipeline.')
logging.exception(traceback)
diff --git a/sdks/python/apache_beam/runners/portability/local_job_service_main.py b/sdks/python/apache_beam/runners/portability/local_job_service_main.py
index 4563769..70a33ff 100644
--- a/sdks/python/apache_beam/runners/portability/local_job_service_main.py
+++ b/sdks/python/apache_beam/runners/portability/local_job_service_main.py
@@ -45,5 +45,4 @@
if __name__ == '__main__':
- logging.getLogger().setLevel(logging.INFO)
run(sys.argv)
diff --git a/sdks/python/apache_beam/runners/portability/portable_metrics.py b/sdks/python/apache_beam/runners/portability/portable_metrics.py
new file mode 100644
index 0000000..e7306af
--- /dev/null
+++ b/sdks/python/apache_beam/runners/portability/portable_metrics.py
@@ -0,0 +1,69 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+
+import logging
+
+from apache_beam.metrics import monitoring_infos
+from apache_beam.metrics.execution import MetricKey
+from apache_beam.metrics.metric import MetricName
+
+
+def from_monitoring_infos(monitoring_info_list, user_metrics_only=False):
+ """Groups MonitoringInfo objects into counters, distributions and gauges.
+
+ Args:
+ monitoring_info_list: An iterable of MonitoringInfo objects.
+ user_metrics_only: If true, includes user metrics only.
+ Returns:
+ A tuple containing three dictionaries: counters, distributions and gauges,
+ respectively. Each dictionary contains (MetricKey, metric result) pairs.
+ """
+ counters = {}
+ distributions = {}
+ gauges = {}
+
+ for mi in monitoring_info_list:
+ if (user_metrics_only and
+ not monitoring_infos.is_user_monitoring_info(mi)):
+ continue
+
+ try:
+ key = _create_metric_key(mi)
+ except ValueError as e:
+ logging.debug(str(e))
+ continue
+ metric_result = (monitoring_infos.extract_metric_result_map_value(mi))
+
+ if monitoring_infos.is_counter(mi):
+ counters[key] = metric_result
+ elif monitoring_infos.is_distribution(mi):
+ distributions[key] = metric_result
+ elif monitoring_infos.is_gauge(mi):
+ gauges[key] = metric_result
+
+ return counters, distributions, gauges
+
+
+def _create_metric_key(monitoring_info):
+ step_name = monitoring_infos.get_step_name(monitoring_info)
+ if not step_name:
+ raise ValueError('Failed to deduce step_name from MonitoringInfo: {}'
+ .format(monitoring_info))
+ namespace, name = monitoring_infos.parse_namespace_and_name(monitoring_info)
+ return MetricKey(step_name, MetricName(namespace, name))
diff --git a/sdks/python/apache_beam/runners/portability/portable_runner.py b/sdks/python/apache_beam/runners/portability/portable_runner.py
index 16c6eba..c7fb76c 100644
--- a/sdks/python/apache_beam/runners/portability/portable_runner.py
+++ b/sdks/python/apache_beam/runners/portability/portable_runner.py
@@ -28,7 +28,8 @@
import grpc
from apache_beam import version as beam_version
-from apache_beam import metrics
+from apache_beam.metrics import metric
+from apache_beam.metrics.execution import MetricResult
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.options.pipeline_options import PortableOptions
from apache_beam.options.pipeline_options import SetupOptions
@@ -41,6 +42,7 @@
from apache_beam.runners.job import utils as job_utils
from apache_beam.runners.portability import fn_api_runner_transforms
from apache_beam.runners.portability import job_server
+from apache_beam.runners.portability import portable_metrics
from apache_beam.runners.portability import portable_stager
from apache_beam.runners.worker import sdk_worker_main
from apache_beam.runners.worker import worker_pool_main
@@ -95,6 +97,11 @@
@staticmethod
def _create_environment(options):
portable_options = options.view_as(PortableOptions)
+ # Do not set a Runner. Otherwise this can cause problems in Java's
+ # PipelineOptions, i.e. ClassNotFoundException, if the corresponding Runner
+ # does not exist in the Java SDK. In portability, the entry point is clearly
+ # defined via the JobService.
+ portable_options.view_as(StandardOptions).runner = None
environment_urn = common_urns.environments.DOCKER.urn
if portable_options.environment_type == 'DOCKER':
environment_urn = common_urns.environments.DOCKER.urn
@@ -361,16 +368,30 @@
state_stream, cleanup_callbacks)
-class PortableMetrics(metrics.metric.MetricResults):
+class PortableMetrics(metric.MetricResults):
def __init__(self, job_metrics_response):
- # TODO(lgajowy): Convert portable metrics to MetricResults
- # and allow querying them (BEAM-4775)
- pass
+ metrics = job_metrics_response.metrics
+ self.attempted = portable_metrics.from_monitoring_infos(metrics.attempted)
+ self.committed = portable_metrics.from_monitoring_infos(metrics.committed)
+
+ @staticmethod
+ def _combine(committed, attempted, filter):
+ all_keys = set(committed.keys()) | set(attempted.keys())
+ return [
+ MetricResult(key, committed.get(key), attempted.get(key))
+ for key in all_keys
+ if metric.MetricResults.matches(filter, key)
+ ]
def query(self, filter=None):
- return {'counters': [],
- 'distributions': [],
- 'gauges': []}
+ counters, distributions, gauges = [
+ self._combine(x, y, filter)
+ for x, y in zip(self.committed, self.attempted)
+ ]
+
+ return {self.COUNTERS: counters,
+ self.DISTRIBUTIONS: distributions,
+ self.GAUGES: gauges}
class PipelineResult(runner.PipelineResult):
diff --git a/sdks/python/apache_beam/runners/portability/portable_runner_test.py b/sdks/python/apache_beam/runners/portability/portable_runner_test.py
index 80afea7..3658c21 100644
--- a/sdks/python/apache_beam/runners/portability/portable_runner_test.py
+++ b/sdks/python/apache_beam/runners/portability/portable_runner_test.py
@@ -187,7 +187,10 @@
def create_pipeline(self):
return beam.Pipeline(self.get_runner(), self.create_options())
- # Inherits all tests from fn_api_runner_test.FnApiRunnerTest
+ def test_metrics(self):
+ self.skipTest('Metrics not supported.')
+
+ # Inherits all other tests from fn_api_runner_test.FnApiRunnerTest
@unittest.skip("BEAM-7248")
diff --git a/sdks/python/apache_beam/runners/runner.py b/sdks/python/apache_beam/runners/runner.py
index f5a431b..b7a2d30 100644
--- a/sdks/python/apache_beam/runners/runner.py
+++ b/sdks/python/apache_beam/runners/runner.py
@@ -198,6 +198,10 @@
'Execution of [%s] not implemented in runner %s.' % (
transform_node.transform, self))
+ def is_fnapi_compatible(self):
+ """Whether to enable the beam_fn_api experiment by default."""
+ return True
+
class PValueCache(object):
"""For internal use only; no backwards-compatibility guarantees.
diff --git a/sdks/python/apache_beam/runners/worker/bundle_processor.py b/sdks/python/apache_beam/runners/worker/bundle_processor.py
index 30c6f3d..8439c8f 100644
--- a/sdks/python/apache_beam/runners/worker/bundle_processor.py
+++ b/sdks/python/apache_beam/runners/worker/bundle_processor.py
@@ -427,14 +427,18 @@
self._added_elements = set()
def _commit(self):
+ to_await = None
if self._cleared:
- self._state_handler.clear(self._state_key, is_cached=True).get()
+ to_await = self._state_handler.clear(self._state_key, is_cached=True)
if self._added_elements:
- self._state_handler.extend(
+ to_await = self._state_handler.extend(
self._state_key,
self._value_coder.get_impl(),
self._added_elements,
- is_cached=True).get()
+ is_cached=True)
+ if to_await:
+ # To commit, we need to wait on the last state request future to complete.
+ to_await.get()
class OutputTimer(object):
@@ -563,7 +567,7 @@
Args:
process_bundle_descriptor (``beam_fn_api_pb2.ProcessBundleDescriptor``):
a description of the stage that this ``BundleProcessor``is to execute.
- state_handler (beam_fn_api_pb2_grpc.BeamFnStateServicer).
+ state_handler (CachingStateHandler).
data_channel_factory (``data_plane.DataChannelFactory``).
"""
self.process_bundle_descriptor = process_bundle_descriptor
@@ -764,7 +768,7 @@
def monitoring_infos(self):
"""Returns the list of MonitoringInfos collected processing this bundle."""
- # Construct a new dict first to remove duplciates.
+ # Construct a new dict first to remove duplicates.
all_monitoring_infos_dict = {}
for transform_id, op in self.ops.items():
for mi in op.monitoring_infos(transform_id).values():
diff --git a/sdks/python/apache_beam/runners/worker/sdk_worker.py b/sdks/python/apache_beam/runners/worker/sdk_worker.py
index 0dd2e8a..74a3e99 100644
--- a/sdks/python/apache_beam/runners/worker/sdk_worker.py
+++ b/sdks/python/apache_beam/runners/worker/sdk_worker.py
@@ -67,6 +67,7 @@
self._worker_count = worker_count
self._worker_index = 0
self._worker_id = worker_id
+ self._state_cache = StateCache(state_cache_size)
if credentials is None:
logging.info('Creating insecure control channel for %s.', control_address)
self._control_channel = GRPCChannelFactory.insecure_channel(
@@ -82,7 +83,7 @@
self._control_channel, WorkerIdInterceptor(self._worker_id))
self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
credentials, self._worker_id)
- self._state_handler_factory = GrpcStateHandlerFactory(state_cache_size,
+ self._state_handler_factory = GrpcStateHandlerFactory(self._state_cache,
credentials)
self._profiler_factory = profiler_factory
self._fns = {}
@@ -126,6 +127,8 @@
# centralized function list shared among all the workers.
self.workers.put(
SdkWorker(self._bundle_processor_cache,
+ state_cache_metrics_fn=
+ self._state_cache.get_monitoring_infos,
profiler_factory=self._profiler_factory))
def get_responses():
@@ -179,17 +182,7 @@
self._responses.put(response)
def _request_register(self, request):
-
- def task():
- for process_bundle_descriptor in getattr(
- request, request.WhichOneof('request')).process_bundle_descriptor:
- self._fns[process_bundle_descriptor.id] = process_bundle_descriptor
-
- return beam_fn_api_pb2.InstructionResponse(
- instruction_id=request.instruction_id,
- register=beam_fn_api_pb2.RegisterResponse())
-
- self._execute(task, request)
+ self._request_execute(request)
def _request_process_bundle(self, request):
@@ -238,6 +231,9 @@
self._progress_thread_pool.submit(task)
def _request_finalize_bundle(self, request):
+ self._request_execute(request)
+
+ def _request_execute(self, request):
def task():
# Get one available worker.
@@ -345,9 +341,11 @@
def __init__(self,
bundle_processor_cache,
+ state_cache_metrics_fn=list,
profiler_factory=None,
log_lull_timeout_ns=None):
self.bundle_processor_cache = bundle_processor_cache
+ self.state_cache_metrics_fn = state_cache_metrics_fn
self.profiler_factory = profiler_factory
self.log_lull_timeout_ns = (log_lull_timeout_ns
or DEFAULT_LOG_LULL_TIMEOUT_NS)
@@ -384,12 +382,14 @@
with self.maybe_profile(instruction_id):
delayed_applications, requests_finalization = (
bundle_processor.process_bundle(instruction_id))
+ monitoring_infos = bundle_processor.monitoring_infos()
+ monitoring_infos.extend(self.state_cache_metrics_fn())
response = beam_fn_api_pb2.InstructionResponse(
instruction_id=instruction_id,
process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
residual_roots=delayed_applications,
metrics=bundle_processor.metrics(),
- monitoring_infos=bundle_processor.monitoring_infos(),
+ monitoring_infos=monitoring_infos,
requires_finalization=requests_finalization))
# Don't release here if finalize is needed.
if not requests_finalization:
@@ -501,12 +501,12 @@
Caches the created channels by ``state descriptor url``.
"""
- def __init__(self, state_cache_size, credentials=None):
+ def __init__(self, state_cache, credentials=None):
self._state_handler_cache = {}
self._lock = threading.Lock()
self._throwing_state_handler = ThrowingStateHandler()
self._credentials = credentials
- self._state_cache = StateCache(state_cache_size)
+ self._state_cache = state_cache
def create_state_handler(self, api_service_descriptor):
if not api_service_descriptor:
@@ -532,7 +532,7 @@
# Add workerId to the grpc channel
grpc_channel = grpc.intercept_channel(grpc_channel,
WorkerIdInterceptor())
- self._state_handler_cache[url] = CachingMaterializingStateHandler(
+ self._state_handler_cache[url] = CachingStateHandler(
self._state_cache,
GrpcStateHandler(
beam_fn_api_pb2_grpc.BeamFnStateStub(grpc_channel)))
@@ -676,7 +676,7 @@
return str(request_id)
-class CachingMaterializingStateHandler(object):
+class CachingStateHandler(object):
""" A State handler which retrieves and caches state. """
def __init__(self, global_state_cache, underlying_state):
@@ -698,6 +698,7 @@
assert not user_state_cache_token
user_state_cache_token = cache_token_struct.token
try:
+ self._state_cache.initialize_metrics()
self._context.cache_token = user_state_cache_token
with self._underlying.process_instruction_id(bundle_id):
yield
diff --git a/sdks/python/apache_beam/runners/worker/statecache.py b/sdks/python/apache_beam/runners/worker/statecache.py
index a4902c6..cd3e057 100644
--- a/sdks/python/apache_beam/runners/worker/statecache.py
+++ b/sdks/python/apache_beam/runners/worker/statecache.py
@@ -20,19 +20,110 @@
import collections
import logging
-from threading import Lock
+import threading
+
+from apache_beam.metrics import monitoring_infos
+
+
+class Metrics(object):
+ """Metrics container for state cache metrics."""
+
+ # A set of all registered metrics
+ ALL_METRICS = set()
+ PREFIX = "beam:metric:statecache:"
+
+ def __init__(self):
+ self._context = threading.local()
+
+ def initialize(self):
+ """Needs to be called once per thread to initialize the local metrics cache.
+ """
+ if hasattr(self._context, 'metrics'):
+ return # Already initialized
+ self._context.metrics = collections.defaultdict(int)
+
+ def count(self, name):
+ self._context.metrics[name] += 1
+
+ def hit_miss(self, total_name, hit_miss_name):
+ self._context.metrics[total_name] += 1
+ self._context.metrics[hit_miss_name] += 1
+
+ def get_monitoring_infos(self, cache_size, cache_capacity):
+ """Returns the metrics scoped to the current bundle."""
+ metrics = self._context.metrics
+ if len(metrics) == 0:
+ # No metrics collected, do not report
+ return []
+ # Add all missing metrics which were not reported
+ for key in Metrics.ALL_METRICS:
+ if key not in metrics:
+ metrics[key] = 0
+ # Gauges which reflect the state since last queried
+ gauges = [monitoring_infos.int64_gauge(self.PREFIX + name, val)
+ for name, val in metrics.items()]
+ gauges.append(monitoring_infos.int64_gauge(self.PREFIX + 'size',
+ cache_size))
+ gauges.append(monitoring_infos.int64_gauge(self.PREFIX + 'capacity',
+ cache_capacity))
+ # Counters for the summary across all metrics
+ counters = [monitoring_infos.int64_counter(self.PREFIX + name + '_total',
+ val)
+ for name, val in metrics.items()]
+ # Reinitialize metrics for this thread/bundle
+ metrics.clear()
+ return gauges + counters
+
+ @staticmethod
+ def counter_hit_miss(total_name, hit_name, miss_name):
+ """Decorator for counting function calls and whether
+ the return value equals None (=miss) or not (=hit)."""
+ Metrics.ALL_METRICS.update([total_name, hit_name, miss_name])
+
+ def decorator(function):
+
+ def reporter(self, *args, **kwargs):
+ value = function(self, *args, **kwargs)
+ if value is None:
+ self._metrics.hit_miss(total_name, miss_name)
+ else:
+ self._metrics.hit_miss(total_name, hit_name)
+ return value
+
+ return reporter
+
+ return decorator
+
+ @staticmethod
+ def counter(metric_name):
+ """Decorator for counting function calls."""
+ Metrics.ALL_METRICS.add(metric_name)
+
+ def decorator(function):
+
+ def reporter(self, *args, **kwargs):
+ self._metrics.count(metric_name)
+ return function(self, *args, **kwargs)
+
+ return reporter
+
+ return decorator
class StateCache(object):
""" Cache for Beam state access, scoped by state key and cache_token.
+ Assumes a bag state implementation.
For a given state_key, caches a (cache_token, value) tuple and allows to
a) read from the cache (get),
if the currently stored cache_token matches the provided
a) write to the cache (put),
storing the new value alongside with a cache token
- c) append to the cache (extend),
+      c) append to the currently cached item (extend),
if the currently stored cache_token matches the provided
+      d) empty a cached element (clear),
+      if the currently stored cache_token matches the provided
+      e) evict a cached element (evict)
The operations on the cache are thread-safe for use by multiple workers.
@@ -43,19 +134,23 @@
def __init__(self, max_entries):
logging.info('Creating state cache with size %s', max_entries)
self._cache = self.LRUCache(max_entries, (None, None))
- self._lock = Lock()
+ self._lock = threading.RLock()
+ self._metrics = Metrics()
+ @Metrics.counter_hit_miss("get", "hit", "miss")
def get(self, state_key, cache_token):
assert cache_token and self.is_cache_enabled()
with self._lock:
token, value = self._cache.get(state_key)
return value if token == cache_token else None
+ @Metrics.counter("put")
def put(self, state_key, cache_token, value):
assert cache_token and self.is_cache_enabled()
with self._lock:
return self._cache.put(state_key, (cache_token, value))
+ @Metrics.counter("extend")
def extend(self, state_key, cache_token, elements):
assert cache_token and self.is_cache_enabled()
with self._lock:
@@ -67,8 +162,9 @@
self._cache.put(state_key, (cache_token, value))
else:
# Discard cached state if tokens do not match
- self._cache.evict(state_key)
+ self.evict(state_key)
+ @Metrics.counter("clear")
def clear(self, state_key, cache_token):
assert cache_token and self.is_cache_enabled()
with self._lock:
@@ -77,8 +173,9 @@
self._cache.put(state_key, (cache_token, []))
else:
# Discard cached state if tokens do not match
- self._cache.evict(state_key)
+ self.evict(state_key)
+ @Metrics.counter("evict")
def evict(self, state_key):
assert self.is_cache_enabled()
with self._lock:
@@ -88,12 +185,22 @@
with self._lock:
self._cache.evict_all()
+ def initialize_metrics(self):
+ self._metrics.initialize()
+
def is_cache_enabled(self):
return self._cache._max_entries > 0
- def __len__(self):
+ def size(self):
return len(self._cache)
+ def get_monitoring_infos(self):
+ """Retrieves the monitoring infos and resets the counters."""
+ with self._lock:
+ size = len(self._cache)
+ capacity = self._cache._max_entries
+ return self._metrics.get_monitoring_infos(size, capacity)
+
class LRUCache(object):
def __init__(self, max_entries, default_entry):
diff --git a/sdks/python/apache_beam/runners/worker/statecache_test.py b/sdks/python/apache_beam/runners/worker/statecache_test.py
index 8fedeaf..00ae852 100644
--- a/sdks/python/apache_beam/runners/worker/statecache_test.py
+++ b/sdks/python/apache_beam/runners/worker/statecache_test.py
@@ -21,90 +21,121 @@
import logging
import unittest
+from apache_beam.metrics.monitoring_infos import LATEST_INT64_TYPE
+from apache_beam.metrics.monitoring_infos import SUM_INT64_TYPE
from apache_beam.runners.worker.statecache import StateCache
class StateCacheTest(unittest.TestCase):
def test_empty_cache_get(self):
- cache = StateCache(5)
+ cache = self.get_cache(5)
self.assertEqual(cache.get("key", 'cache_token'), None)
with self.assertRaises(Exception):
+ # Invalid cache token provided
self.assertEqual(cache.get("key", None), None)
+ self.verify_metrics(cache, {'get': 1, 'put': 0, 'extend': 0,
+ 'miss': 1, 'hit': 0, 'clear': 0,
+ 'evict': 0,
+ 'size': 0, 'capacity': 5})
def test_put_get(self):
- cache = StateCache(5)
+ cache = self.get_cache(5)
cache.put("key", "cache_token", "value")
- self.assertEqual(len(cache), 1)
+ self.assertEqual(cache.size(), 1)
self.assertEqual(cache.get("key", "cache_token"), "value")
self.assertEqual(cache.get("key", "cache_token2"), None)
with self.assertRaises(Exception):
self.assertEqual(cache.get("key", None), None)
+ self.verify_metrics(cache, {'get': 2, 'put': 1, 'extend': 0,
+ 'miss': 1, 'hit': 1, 'clear': 0,
+ 'evict': 0,
+ 'size': 1, 'capacity': 5})
def test_overwrite(self):
- cache = StateCache(2)
+ cache = self.get_cache(2)
cache.put("key", "cache_token", "value")
cache.put("key", "cache_token2", "value2")
- self.assertEqual(len(cache), 1)
+ self.assertEqual(cache.size(), 1)
self.assertEqual(cache.get("key", "cache_token"), None)
self.assertEqual(cache.get("key", "cache_token2"), "value2")
+ self.verify_metrics(cache, {'get': 2, 'put': 2, 'extend': 0,
+ 'miss': 1, 'hit': 1, 'clear': 0,
+ 'evict': 0,
+ 'size': 1, 'capacity': 2})
def test_extend(self):
- cache = StateCache(3)
+ cache = self.get_cache(3)
cache.put("key", "cache_token", ['val'])
# test extend for existing key
cache.extend("key", "cache_token", ['yet', 'another', 'val'])
- self.assertEqual(len(cache), 1)
+ self.assertEqual(cache.size(), 1)
self.assertEqual(cache.get("key", "cache_token"),
['val', 'yet', 'another', 'val'])
# test extend without existing key
cache.extend("key2", "cache_token", ['another', 'val'])
- self.assertEqual(len(cache), 2)
+ self.assertEqual(cache.size(), 2)
self.assertEqual(cache.get("key2", "cache_token"), ['another', 'val'])
# test eviction in case the cache token changes
cache.extend("key2", "new_token", ['new_value'])
self.assertEqual(cache.get("key2", "new_token"), None)
- self.assertEqual(len(cache), 1)
+ self.assertEqual(cache.size(), 1)
+ self.verify_metrics(cache, {'get': 3, 'put': 1, 'extend': 3,
+ 'miss': 1, 'hit': 2, 'clear': 0,
+ 'evict': 1,
+ 'size': 1, 'capacity': 3})
def test_clear(self):
- cache = StateCache(5)
+ cache = self.get_cache(5)
cache.clear("new-key", "cache_token")
cache.put("key", "cache_token", ["value"])
- self.assertEqual(len(cache), 2)
+ self.assertEqual(cache.size(), 2)
self.assertEqual(cache.get("new-key", "new_token"), None)
self.assertEqual(cache.get("key", "cache_token"), ['value'])
# test clear without existing key/token
cache.clear("non-existing", "token")
- self.assertEqual(len(cache), 3)
+ self.assertEqual(cache.size(), 3)
self.assertEqual(cache.get("non-existing", "token"), [])
# test eviction in case the cache token changes
cache.clear("new-key", "wrong_token")
- self.assertEqual(len(cache), 2)
+ self.assertEqual(cache.size(), 2)
self.assertEqual(cache.get("new-key", "cache_token"), None)
self.assertEqual(cache.get("new-key", "wrong_token"), None)
+ self.verify_metrics(cache, {'get': 5, 'put': 1, 'extend': 0,
+ 'miss': 3, 'hit': 2, 'clear': 3,
+ 'evict': 1,
+ 'size': 2, 'capacity': 5})
def test_max_size(self):
- cache = StateCache(2)
+ cache = self.get_cache(2)
cache.put("key", "cache_token", "value")
cache.put("key2", "cache_token", "value")
- self.assertEqual(len(cache), 2)
+ self.assertEqual(cache.size(), 2)
cache.put("key2", "cache_token", "value")
- self.assertEqual(len(cache), 2)
+ self.assertEqual(cache.size(), 2)
cache.put("key", "cache_token", "value")
- self.assertEqual(len(cache), 2)
+ self.assertEqual(cache.size(), 2)
+ self.verify_metrics(cache, {'get': 0, 'put': 4, 'extend': 0,
+ 'miss': 0, 'hit': 0, 'clear': 0,
+ 'evict': 0,
+ 'size': 2, 'capacity': 2})
def test_evict_all(self):
- cache = StateCache(5)
+ cache = self.get_cache(5)
cache.put("key", "cache_token", "value")
cache.put("key2", "cache_token", "value2")
- self.assertEqual(len(cache), 2)
+ self.assertEqual(cache.size(), 2)
cache.evict_all()
- self.assertEqual(len(cache), 0)
+ self.assertEqual(cache.size(), 0)
self.assertEqual(cache.get("key", "cache_token"), None)
self.assertEqual(cache.get("key2", "cache_token"), None)
+ self.verify_metrics(cache, {'get': 2, 'put': 2, 'extend': 0,
+ 'miss': 2, 'hit': 0, 'clear': 0,
+ 'evict': 0,
+ 'size': 0, 'capacity': 5})
def test_lru(self):
- cache = StateCache(5)
+ cache = self.get_cache(5)
cache.put("key", "cache_token", "value")
cache.put("key2", "cache_token2", "value2")
cache.put("key3", "cache_token", "value0")
@@ -112,7 +143,7 @@
cache.put("key4", "cache_token4", "value4")
cache.put("key5", "cache_token", "value0")
cache.put("key5", "cache_token", ["value5"])
- self.assertEqual(len(cache), 5)
+ self.assertEqual(cache.size(), 5)
self.assertEqual(cache.get("key", "cache_token"), "value")
self.assertEqual(cache.get("key2", "cache_token2"), "value2")
self.assertEqual(cache.get("key3", "cache_token"), "value3")
@@ -120,34 +151,70 @@
self.assertEqual(cache.get("key5", "cache_token"), ["value5"])
# insert another key to trigger cache eviction
cache.put("key6", "cache_token2", "value7")
- self.assertEqual(len(cache), 5)
+ self.assertEqual(cache.size(), 5)
# least recently used key should be gone ("key")
self.assertEqual(cache.get("key", "cache_token"), None)
# trigger a read on "key2"
cache.get("key2", "cache_token")
# insert another key to trigger cache eviction
cache.put("key7", "cache_token", "value7")
- self.assertEqual(len(cache), 5)
+ self.assertEqual(cache.size(), 5)
# least recently used key should be gone ("key3")
self.assertEqual(cache.get("key3", "cache_token"), None)
# trigger a put on "key2"
cache.put("key2", "cache_token", "put")
- self.assertEqual(len(cache), 5)
+ self.assertEqual(cache.size(), 5)
# insert another key to trigger cache eviction
cache.put("key8", "cache_token", "value8")
- self.assertEqual(len(cache), 5)
+ self.assertEqual(cache.size(), 5)
# least recently used key should be gone ("key4")
self.assertEqual(cache.get("key4", "cache_token"), None)
# make "key5" used by appending to it
cache.extend("key5", "cache_token", ["another"])
# least recently used key should be gone ("key6")
self.assertEqual(cache.get("key6", "cache_token"), None)
+ self.verify_metrics(cache, {'get': 10, 'put': 11, 'extend': 1,
+ 'miss': 5, 'hit': 5, 'clear': 0,
+ 'evict': 0,
+ 'size': 5, 'capacity': 5})
def test_is_cached_enabled(self):
- cache = StateCache(1)
+ cache = self.get_cache(1)
self.assertEqual(cache.is_cache_enabled(), True)
- cache = StateCache(0)
+ self.verify_metrics(cache, {})
+ cache = self.get_cache(0)
self.assertEqual(cache.is_cache_enabled(), False)
+ self.verify_metrics(cache, {})
+
+ def verify_metrics(self, cache, expected_metrics):
+ infos = cache.get_monitoring_infos()
+ # Reconstruct metrics dictionary from monitoring infos
+ metrics = {
+ info.urn.rsplit(':', 1)[1]: info.metric.counter_data.int64_value
+ for info in infos
+ if "_total" not in info.urn and info.type == LATEST_INT64_TYPE
+ }
+ self.assertDictEqual(metrics, expected_metrics)
+ # Metrics and total metrics should be identical for a single bundle.
+ # The following two gauges are not part of the total metrics:
+ try:
+ del metrics['capacity']
+ del metrics['size']
+ except KeyError:
+ pass
+ total_metrics = {
+ info.urn.rsplit(':', 1)[1].rsplit("_total")[0]:
+ info.metric.counter_data.int64_value
+ for info in infos
+ if "_total" in info.urn and info.type == SUM_INT64_TYPE
+ }
+ self.assertDictEqual(metrics, total_metrics)
+
+ @staticmethod
+ def get_cache(size):
+ cache = StateCache(size)
+ cache.initialize_metrics()
+ return cache
if __name__ == '__main__':
diff --git a/sdks/python/apache_beam/testing/load_tests/load_test.py b/sdks/python/apache_beam/testing/load_tests/load_test.py
index cbe4d86..61db9c60 100644
--- a/sdks/python/apache_beam/testing/load_tests/load_test.py
+++ b/sdks/python/apache_beam/testing/load_tests/load_test.py
@@ -52,32 +52,24 @@
self.input_options = json.loads(self.pipeline.get_option('input_options'))
self.project_id = self.pipeline.get_option('project')
- self.publish_to_big_query = self.pipeline.get_option('publish_to_big_query')
self.metrics_dataset = self.pipeline.get_option('metrics_dataset')
self.metrics_namespace = self.pipeline.get_option('metrics_table')
- if not self.are_metrics_collected():
- logging.info('Metrics will not be collected')
- self.metrics_monitor = None
- else:
- self.metrics_monitor = MetricsReader(
- project_name=self.project_id,
- bq_table=self.metrics_namespace,
- bq_dataset=self.metrics_dataset,
- )
+ self.metrics_monitor = MetricsReader(
+ publish_to_bq=self.pipeline.get_option('publish_to_big_query') ==
+ 'true',
+ project_name=self.project_id,
+ bq_table=self.metrics_namespace,
+ bq_dataset=self.metrics_dataset,
+ # Apply filter to prevent system metrics from being published
+ filters=MetricsFilter().with_namespace(self.metrics_namespace)
+ )
def tearDown(self):
result = self.pipeline.run()
result.wait_until_finish()
- if self.metrics_monitor:
- self.metrics_monitor.publish_metrics(result)
-
- def apply_filter(self, allowed):
- """Prevents metrics from namespaces other than specified in the argument
- from being published."""
- if allowed:
- self.metrics_monitor.filters = MetricsFilter().with_namespaces(allowed)
+ self.metrics_monitor.publish_metrics(result)
def get_option_or_default(self, opt_name, default=0):
"""Returns a pipeline option or a default value if it was not provided.
@@ -92,10 +84,6 @@
except ValueError as exc:
self.fail(str(exc))
- def are_metrics_collected(self):
- return self.publish_to_big_query == 'true' and None not in (
- self.project_id, self.metrics_dataset, self.metrics_namespace)
-
if __name__ == '__main__':
logging.getLogger().setLevel(logging.DEBUG)
diff --git a/sdks/python/apache_beam/testing/load_tests/load_test_metrics_utils.py b/sdks/python/apache_beam/testing/load_tests/load_test_metrics_utils.py
index ca3b3af..c5c5259 100644
--- a/sdks/python/apache_beam/testing/load_tests/load_test_metrics_utils.py
+++ b/sdks/python/apache_beam/testing/load_tests/load_test_metrics_utils.py
@@ -171,7 +171,7 @@
publishers = []
def __init__(self, project_name=None, bq_table=None, bq_dataset=None,
- filters=None):
+ publish_to_bq=False, filters=None):
"""Initializes :class:`MetricsReader` .
Args:
@@ -182,7 +182,8 @@
"""
self._namespace = bq_table
self.publishers.append(ConsoleMetricsPublisher())
- check = project_name and bq_table and bq_dataset
+
+ check = project_name and bq_table and bq_dataset and publish_to_bq
if check:
bq_publisher = BigQueryMetricsPublisher(
project_name, bq_table, bq_dataset)
@@ -311,8 +312,8 @@
min_values = []
max_values = []
for dist in distributions:
- min_values.append(dist.committed.min)
- max_values.append(dist.committed.max)
+ min_values.append(dist.result.min)
+ max_values.append(dist.result.max)
# finding real start
min_value = min(min_values)
# finding real end
diff --git a/sdks/python/apache_beam/testing/load_tests/pardo_test.py b/sdks/python/apache_beam/testing/load_tests/pardo_test.py
index bc92a7b..7c05422 100644
--- a/sdks/python/apache_beam/testing/load_tests/pardo_test.py
+++ b/sdks/python/apache_beam/testing/load_tests/pardo_test.py
@@ -138,8 +138,6 @@
class ParDoTest(LoadTest):
def setUp(self):
super(ParDoTest, self).setUp()
- if self.are_metrics_collected():
- self.apply_filter([self.metrics_namespace])
self.iterations = self.get_option_or_default('iterations')
self.number_of_counters = self.get_option_or_default('number_of_counters')
self.number_of_operations = self.get_option_or_default(
diff --git a/sdks/python/apache_beam/testing/util.py b/sdks/python/apache_beam/testing/util.py
index 6d77ee7..b52e61b 100644
--- a/sdks/python/apache_beam/testing/util.py
+++ b/sdks/python/apache_beam/testing/util.py
@@ -209,7 +209,10 @@
Returns:
Ignored.
"""
- assert isinstance(actual, pvalue.PCollection)
+ assert isinstance(
+ actual,
+ pvalue.PCollection), ('%s is not a supported type for Beam assert'
+ % type(actual))
class ReifyTimestampWindow(DoFn):
def process(self, element, timestamp=DoFn.TimestampParam,
diff --git a/sdks/python/apache_beam/transforms/core.py b/sdks/python/apache_beam/transforms/core.py
index 306de6a..148caae 100644
--- a/sdks/python/apache_beam/transforms/core.py
+++ b/sdks/python/apache_beam/transforms/core.py
@@ -583,10 +583,11 @@
fn_type_hints = typehints.decorators.IOTypeHints.from_callable(self.process)
if fn_type_hints is not None:
try:
- fn_type_hints.strip_iterable()
+ fn_type_hints = fn_type_hints.strip_iterable()
except ValueError as e:
raise ValueError('Return value not iterable: %s: %s' % (self, e))
- return fn_type_hints
+ # Prefer class decorator type hints for backwards compatibility.
+ return get_type_hints(self.__class__).with_defaults(fn_type_hints)
# TODO(sourabhbajaj): Do we want to remove the responsibility of these from
# the DoFn or maybe the runner
@@ -676,22 +677,14 @@
def default_type_hints(self):
fn_type_hints = typehints.decorators.IOTypeHints.from_callable(self._fn)
- if fn_type_hints is not None:
- try:
- fn_type_hints.strip_iterable()
- except ValueError as e:
- raise ValueError('Return value not iterable: %s: %s' % (self._fn, e))
type_hints = get_type_hints(self._fn).with_defaults(fn_type_hints)
- # If the fn was a DoFn annotated with a type-hint that hinted a return
- # type compatible with Iterable[Any], then we strip off the outer
- # container type due to the 'flatten' portion of FlatMap.
- # TODO(robertwb): Should we require an iterable specification for FlatMap?
- if type_hints.output_types:
- args, kwargs = type_hints.output_types
- if len(args) == 1 and is_consistent_with(
- args[0], typehints.Iterable[typehints.Any]):
- type_hints = type_hints.copy()
- type_hints.set_output_types(element_type(args[0]), **kwargs)
+ # The fn's output type should be iterable. Strip off the outer
+ # container type due to the 'flatten' portion of FlatMap/ParDo.
+ try:
+ type_hints = type_hints.strip_iterable()
+ except ValueError as e:
+ # TODO(BEAM-8466): Raise exception here if using stricter type checking.
+ logging.warning('%s: %s', self.display_data()['fn'].value, e)
return type_hints
def infer_output_type(self, input_type):
@@ -1099,8 +1092,8 @@
Args:
pcoll (~apache_beam.pvalue.PCollection):
a :class:`~apache_beam.pvalue.PCollection` to be processed.
- fn (DoFn): a :class:`DoFn` object to be applied to each element
- of **pcoll** argument.
+ fn (`typing.Union[DoFn, typing.Callable]`): a :class:`DoFn` object to be
+ applied to each element of **pcoll** argument, or a Callable.
*args: positional arguments passed to the :class:`DoFn` object.
**kwargs: keyword arguments passed to the :class:`DoFn` object.
diff --git a/sdks/python/apache_beam/transforms/external_test.py b/sdks/python/apache_beam/transforms/external_test.py
index ba315f9..fe26977 100644
--- a/sdks/python/apache_beam/transforms/external_test.py
+++ b/sdks/python/apache_beam/transforms/external_test.py
@@ -34,6 +34,7 @@
import apache_beam as beam
from apache_beam import Pipeline
+from apache_beam.coders import BooleanCoder
from apache_beam.coders import FloatCoder
from apache_beam.coders import IterableCoder
from apache_beam.coders import StrUtf8Coder
@@ -66,6 +67,7 @@
class PayloadBase(object):
values = {
'integer_example': 1,
+ 'boolean': True,
'string_example': u'thing',
'list_of_strings': [u'foo', u'bar'],
'optional_kv': (u'key', 1.1),
@@ -74,6 +76,7 @@
bytes_values = {
'integer_example': 1,
+ 'boolean': True,
'string_example': 'thing',
'list_of_strings': ['foo', 'bar'],
'optional_kv': ('key', 1.1),
@@ -85,6 +88,10 @@
coder_urn=['beam:coder:varint:v1'],
payload=VarIntCoder()
.get_impl().encode_nested(values['integer_example'])),
+ 'boolean': ConfigValue(
+ coder_urn=['beam:coder:bool:v1'],
+ payload=BooleanCoder()
+ .get_impl().encode_nested(values['boolean'])),
'string_example': ConfigValue(
coder_urn=['beam:coder:string_utf8:v1'],
payload=StrUtf8Coder()
@@ -151,6 +158,7 @@
'TestSchema',
[
('integer_example', int),
+ ('boolean', bool),
('string_example', unicode),
('list_of_strings', typing.List[unicode]),
('optional_kv', typing.Optional[typing.Tuple[unicode, float]]),
@@ -188,6 +196,10 @@
coder_urn=['beam:coder:varint:v1'],
payload=VarIntCoder()
.get_impl().encode_nested(values['integer_example'])),
+ 'boolean': ConfigValue(
+ coder_urn=['beam:coder:bool:v1'],
+ payload=BooleanCoder()
+ .get_impl().encode_nested(values['boolean'])),
'string_example': ConfigValue(
coder_urn=['beam:coder:bytes:v1'],
payload=StrUtf8Coder()
diff --git a/sdks/python/apache_beam/transforms/external_test_py3.py b/sdks/python/apache_beam/transforms/external_test_py3.py
index 575fb50..c2e7f87 100644
--- a/sdks/python/apache_beam/transforms/external_test_py3.py
+++ b/sdks/python/apache_beam/transforms/external_test_py3.py
@@ -43,6 +43,7 @@
def __init__(self,
integer_example: int,
+ boolean: bool,
string_example: str,
list_of_strings: typing.List[str],
optional_kv: typing.Optional[
@@ -54,6 +55,7 @@
AnnotationBasedPayloadBuilder(
self,
integer_example=integer_example,
+ boolean=boolean,
string_example=string_example,
list_of_strings=list_of_strings,
optional_kv=optional_kv,
@@ -70,6 +72,7 @@
def __init__(self,
integer_example: int,
+ boolean: bool,
string_example: str,
list_of_strings: typehints.List[str],
optional_kv: typehints.Optional[
@@ -81,6 +84,7 @@
AnnotationBasedPayloadBuilder(
self,
integer_example=integer_example,
+ boolean=boolean,
string_example=string_example,
list_of_strings=list_of_strings,
optional_kv=optional_kv,
diff --git a/sdks/python/apache_beam/transforms/external_test_py37.py b/sdks/python/apache_beam/transforms/external_test_py37.py
index 2b3481f..e01f532 100644
--- a/sdks/python/apache_beam/transforms/external_test_py37.py
+++ b/sdks/python/apache_beam/transforms/external_test_py37.py
@@ -44,6 +44,7 @@
URN = 'beam:external:fakeurn:v1'
integer_example: int
+ boolean: bool
string_example: str
list_of_strings: typing.List[str]
optional_kv: typing.Optional[typing.Tuple[str, float]] = None
@@ -59,6 +60,7 @@
URN = 'beam:external:fakeurn:v1'
integer_example: int
+ boolean: bool
string_example: str
list_of_strings: typehints.List[str]
optional_kv: typehints.Optional[typehints.KV[str, float]] = None
diff --git a/sdks/python/apache_beam/transforms/ptransform_test.py b/sdks/python/apache_beam/transforms/ptransform_test.py
index cf640b8..ad201d1 100644
--- a/sdks/python/apache_beam/transforms/ptransform_test.py
+++ b/sdks/python/apache_beam/transforms/ptransform_test.py
@@ -194,6 +194,13 @@
assert_that(r2.m, equal_to([3, 4, 5]), label='r2')
pipeline.run()
+ @attr('ValidatesRunner')
+ def test_impulse(self):
+ pipeline = TestPipeline()
+ result = pipeline | beam.Impulse() | beam.Map(lambda _: 0)
+ assert_that(result, equal_to([0]))
+ pipeline.run()
+
# TODO(BEAM-3544): Disable this test in streaming temporarily.
# Remove sickbay-streaming tag after it's resolved.
@attr('ValidatesRunner', 'sickbay-streaming')
diff --git a/sdks/python/apache_beam/typehints/decorators.py b/sdks/python/apache_beam/typehints/decorators.py
index 218e3b1..aab45d9 100644
--- a/sdks/python/apache_beam/typehints/decorators.py
+++ b/sdks/python/apache_beam/typehints/decorators.py
@@ -294,16 +294,22 @@
"""Removes outer Iterable (or equivalent) from output type.
Only affects instances with simple output types, otherwise is a no-op.
+ Does not modify self.
Example: Generator[Tuple(int, int)] becomes Tuple(int, int)
+ Returns:
+ A possible copy of this instance with a possibly different output type.
+
Raises:
ValueError if output type is simple and not iterable.
"""
if not self.has_simple_output_type():
- return
+ return self
yielded_type = typehints.get_yielded_type(self.output_types[0][0])
- self.output_types = ((yielded_type,), {})
+ res = self.copy()
+ res.output_types = ((yielded_type,), {})
+ return res
def copy(self):
return IOTypeHints(self.input_types, self.output_types)
diff --git a/sdks/python/apache_beam/typehints/typed_pipeline_test.py b/sdks/python/apache_beam/typehints/typed_pipeline_test.py
index 354b867..c27bede 100644
--- a/sdks/python/apache_beam/typehints/typed_pipeline_test.py
+++ b/sdks/python/apache_beam/typehints/typed_pipeline_test.py
@@ -134,6 +134,43 @@
self.assertEqual([1, 3], [1, 2, 3] | beam.Filter(filter_fn))
+ def test_partition(self):
+ p = TestPipeline()
+ even, odd = (p
+ | beam.Create([1, 2, 3])
+ | 'even_odd' >> beam.Partition(lambda e, _: e % 2, 2))
+ self.assertIsNotNone(even.element_type)
+ self.assertIsNotNone(odd.element_type)
+ res_even = (even
+ | 'id_even' >> beam.ParDo(lambda e: [e]).with_input_types(int))
+ res_odd = (odd
+ | 'id_odd' >> beam.ParDo(lambda e: [e]).with_input_types(int))
+ assert_that(res_even, equal_to([2]), label='even_check')
+ assert_that(res_odd, equal_to([1, 3]), label='odd_check')
+ p.run()
+
+ def test_typed_dofn_multi_output(self):
+ class MyDoFn(beam.DoFn):
+ def process(self, element):
+ if element % 2:
+ yield beam.pvalue.TaggedOutput('odd', element)
+ else:
+ yield beam.pvalue.TaggedOutput('even', element)
+
+ p = TestPipeline()
+ res = (p
+ | beam.Create([1, 2, 3])
+ | beam.ParDo(MyDoFn()).with_outputs('odd', 'even'))
+ self.assertIsNotNone(res['even'].element_type)
+ self.assertIsNotNone(res['odd'].element_type)
+ res_even = (res['even']
+ | 'id_even' >> beam.ParDo(lambda e: [e]).with_input_types(int))
+ res_odd = (res['odd']
+ | 'id_odd' >> beam.ParDo(lambda e: [e]).with_input_types(int))
+ assert_that(res_even, equal_to([2]), label='even_check')
+ assert_that(res_odd, equal_to([1, 3]), label='odd_check')
+ p.run()
+
class NativeTypesTest(unittest.TestCase):
diff --git a/sdks/python/apache_beam/typehints/typed_pipeline_test_py3.py b/sdks/python/apache_beam/typehints/typed_pipeline_test_py3.py
index 640ca2b..988f0c2 100644
--- a/sdks/python/apache_beam/typehints/typed_pipeline_test_py3.py
+++ b/sdks/python/apache_beam/typehints/typed_pipeline_test_py3.py
@@ -32,9 +32,6 @@
class MainInputTest(unittest.TestCase):
def test_typed_dofn_method(self):
- # process annotations are recognized and take precedence over decorators.
- @typehints.with_input_types(typehints.Tuple[int, int])
- @typehints.with_output_types(int)
class MyDoFn(beam.DoFn):
def process(self, element: int) -> typehints.Tuple[str]:
return tuple(str(element))
@@ -50,6 +47,25 @@
r'requires.*int.*got.*str'):
_ = [1, 2, 3] | (beam.ParDo(MyDoFn()) | 'again' >> beam.ParDo(MyDoFn()))
+ def test_typed_dofn_method_with_class_decorators(self):
+ # Class decorators take precedence over PEP 484 hints.
+ @typehints.with_input_types(typehints.Tuple[int, int])
+ @typehints.with_output_types(int)
+ class MyDoFn(beam.DoFn):
+ def process(self, element: int) -> typehints.Tuple[str]:
+ yield element[0]
+
+ result = [(1, 2)] | beam.ParDo(MyDoFn())
+ self.assertEqual([1], sorted(result))
+
+ with self.assertRaisesRegex(typehints.TypeCheckError,
+ r'requires.*Tuple\[int, int\].*got.*str'):
+ _ = ['a', 'b', 'c'] | beam.ParDo(MyDoFn())
+
+ with self.assertRaisesRegex(typehints.TypeCheckError,
+ r'requires.*Tuple\[int, int\].*got.*int'):
+ _ = [1, 2, 3] | (beam.ParDo(MyDoFn()) | 'again' >> beam.ParDo(MyDoFn()))
+
def test_typed_dofn_instance(self):
# Type hints applied to DoFn instance take precedence over decorators and
# process annotations.
@@ -76,8 +92,8 @@
# Type hints applied to ParDo instance take precedence over callable
# decorators and annotations.
@typehints.with_input_types(typehints.Tuple[int, int])
- @typehints.with_output_types(int)
- def do_fn(element: typehints.Tuple[int, int]) -> typehints.Generator[int]:
+ @typehints.with_output_types(typehints.Generator[int])
+ def do_fn(element: typehints.Tuple[int, int]) -> typehints.Generator[str]:
yield str(element)
pardo = beam.ParDo(do_fn).with_input_types(int).with_output_types(str)
@@ -92,10 +108,8 @@
r'requires.*int.*got.*str'):
_ = [1, 2, 3] | (pardo | 'again' >> pardo)
- @unittest.skip('BEAM-7981: Iterable in output type should not be removed.')
def test_typed_callable_iterable_output(self):
- # TODO(BEAM-7981): Both Iterables get stripped in
- # CallableWrapperDoFn.default_type_hints, but only one should.
+ # Only the outer Iterable should be stripped.
def do_fn(element: int) -> typehints.Iterable[typehints.Iterable[str]]:
return [[str(element)] * 2]
@@ -111,10 +125,11 @@
_ = [1, 2, 3] | beam.ParDo(MyDoFn())
def test_typed_callable_not_iterable(self):
- def do_fn(element: typehints.Tuple[int, int]) -> int:
- return element[0]
- with self.assertRaisesRegex(ValueError, r'int.*is not iterable'):
+ def do_fn(element: int) -> int:
+ return [element] # Return a list to not fail the pipeline.
+ with self.assertLogs() as cm:
_ = [1, 2, 3] | beam.ParDo(do_fn)
+ self.assertRegex(''.join(cm.output), r'int.*is not iterable')
def test_typed_dofn_kwonly(self):
class MyDoFn(beam.DoFn):
@@ -163,7 +178,7 @@
def process(self, element: int) -> str:
return str(element)
- with self.assertRaisesRegex(ValueError, r'Return value not iterable'):
+ with self.assertRaisesRegex(ValueError, r'str.*is not iterable'):
_ = beam.ParDo(MyDoFn()).get_type_hints()
def test_pardo_wrapper(self):
@@ -174,12 +189,23 @@
self.assertEqual(th.input_types, ((int,), {}))
self.assertEqual(th.output_types, ((str,), {}))
+ def test_pardo_wrapper_tuple(self):
+ # Test case for callables that return key-value pairs for GBK. The outer
+ # Iterable should be stripped but the inner Tuple left intact.
+ def do_fn(element: int) -> typehints.Iterable[typehints.Tuple[str, int]]:
+ return [(str(element), element)]
+
+ th = beam.ParDo(do_fn).get_type_hints()
+ self.assertEqual(th.input_types, ((int,), {}))
+ self.assertEqual(th.output_types, ((typehints.Tuple[str, int],), {}))
+
def test_pardo_wrapper_not_iterable(self):
def do_fn(element: int) -> str:
return str(element)
- with self.assertRaisesRegex(ValueError, r'Return value not iterable'):
+ with self.assertLogs() as cm:
_ = beam.ParDo(do_fn).get_type_hints()
+ self.assertRegex(''.join(cm.output), r'do_fn.* not iterable')
def test_flat_map_wrapper(self):
def map_fn(element: int) -> typehints.Iterable[int]:
diff --git a/sdks/python/apache_beam/typehints/typehints.py b/sdks/python/apache_beam/typehints/typehints.py
index 4a9c739..6062e6f 100644
--- a/sdks/python/apache_beam/typehints/typehints.py
+++ b/sdks/python/apache_beam/typehints/typehints.py
@@ -1171,7 +1171,8 @@
def get_yielded_type(type_hint):
"""Obtains the type of elements yielded by an iterable.
- Note that "iterable" here means: can be iterated over in a for loop.
+ Note that "iterable" here means: can be iterated over in a for loop, excluding
+ strings.
Args:
type_hint: (TypeConstraint) The iterable in question. Must be normalize()-d.
diff --git a/sdks/python/apache_beam/version.py b/sdks/python/apache_beam/version.py
index 1365114..f32561a 100644
--- a/sdks/python/apache_beam/version.py
+++ b/sdks/python/apache_beam/version.py
@@ -18,4 +18,4 @@
"""Apache Beam SDK version information and utilities."""
-__version__ = '2.17.0.dev'
+__version__ = '2.18.0.dev'
diff --git a/sdks/python/test-suites/portable/common.gradle b/sdks/python/test-suites/portable/common.gradle
index 690c09d..1ea51ca 100644
--- a/sdks/python/test-suites/portable/common.gradle
+++ b/sdks/python/test-suites/portable/common.gradle
@@ -45,7 +45,7 @@
extra_experiments.add('pre_optimize=all')
tasks.create(name: name) {
dependsOn 'setupVirtualenv'
- dependsOn ':runners:flink:1.8:job-server:shadowJar'
+ dependsOn ':runners:flink:1.9:job-server:shadowJar'
if (workerType.toLowerCase() == 'docker')
dependsOn pythonContainerTask
else if (workerType.toLowerCase() == 'process')
@@ -53,7 +53,7 @@
doLast {
exec {
executable 'sh'
- args '-c', ". ${envdir}/bin/activate && cd ${pythonRootDir} && pip install -e .[test] && python -m apache_beam.runners.portability.flink_runner_test --flink_job_server_jar=${project(":runners:flink:1.8:job-server:").shadowJar.archivePath} --environment_type=${workerType} ${environment_config} ${streaming ? '--streaming' : ''} ${extra_experiments ? '--extra_experiments=' + extra_experiments.join(',') : ''}"
+ args '-c', ". ${envdir}/bin/activate && cd ${pythonRootDir} && pip install -e .[test] && python -m apache_beam.runners.portability.flink_runner_test --flink_job_server_jar=${project(":runners:flink:1.9:job-server:").shadowJar.archivePath} --environment_type=${workerType} ${environment_config} ${streaming ? '--streaming' : ''} ${extra_experiments ? '--extra_experiments=' + extra_experiments.join(',') : ''}"
}
}
}
diff --git a/sdks/python/test-suites/portable/py2/build.gradle b/sdks/python/test-suites/portable/py2/build.gradle
index 5ceac52..2b95296 100644
--- a/sdks/python/test-suites/portable/py2/build.gradle
+++ b/sdks/python/test-suites/portable/py2/build.gradle
@@ -28,7 +28,7 @@
addPortableWordCountTasks()
task preCommitPy2() {
- dependsOn ':runners:flink:1.8:job-server-container:docker'
+ dependsOn ':runners:flink:1.9:job-server-container:docker'
dependsOn ':sdks:python:container:py2:docker'
dependsOn portableWordCountBatch
dependsOn portableWordCountStreaming
@@ -45,12 +45,12 @@
// 2. Either a) or b)
// a) If you want the Job Server to run in a Docker container:
//
-// ./gradlew :runners:flink:1.8:job-server-container:docker
+// ./gradlew :runners:flink:1.9:job-server-container:docker
//
// b) Otherwise, start a local JobService, for example, the Portable Flink runner
// (in a separate shell since it continues to run):
//
-// ./gradlew :runners:flink:1.8:job-server:runShadow
+// ./gradlew :runners:flink:1.9:job-server:runShadow
//
// Then you can run this example:
//
@@ -87,7 +87,7 @@
task crossLanguagePythonJavaFlink {
dependsOn 'setupVirtualenv'
- dependsOn ':runners:flink:1.8:job-server-container:docker'
+ dependsOn ':runners:flink:1.9:job-server-container:docker'
dependsOn ':sdks:python:container:py2:docker'
dependsOn ':sdks:java:container:docker'
dependsOn ':sdks:java:testing:expansion-service:buildTestExpansionServiceJar'
@@ -112,7 +112,7 @@
task crossLanguagePortableWordCount {
dependsOn 'setupVirtualenv'
- dependsOn ':runners:flink:1.8:job-server-container:docker'
+ dependsOn ':runners:flink:1.9:job-server-container:docker'
dependsOn ':sdks:python:container:py2:docker'
dependsOn ':sdks:java:container:docker'
dependsOn ':sdks:java:testing:expansion-service:buildTestExpansionServiceJar'
@@ -128,6 +128,8 @@
"--shutdown_sources_on_final_watermark",
"--environment_cache_millis=10000",
"--expansion_service_jar=${testServiceExpansionJar}",
+ // Writes to local filesystem might fail for multiple SDK workers.
+ "--sdk_worker_parallelism=1"
]
exec {
executable 'sh'
diff --git a/sdks/python/test-suites/portable/py35/build.gradle b/sdks/python/test-suites/portable/py35/build.gradle
index b0d670c..42667c7 100644
--- a/sdks/python/test-suites/portable/py35/build.gradle
+++ b/sdks/python/test-suites/portable/py35/build.gradle
@@ -25,7 +25,7 @@
addPortableWordCountTasks()
task preCommitPy35() {
- dependsOn ':runners:flink:1.8:job-server-container:docker'
+ dependsOn ':runners:flink:1.9:job-server-container:docker'
dependsOn ':sdks:python:container:py35:docker'
dependsOn portableWordCountBatch
dependsOn portableWordCountStreaming
diff --git a/sdks/python/test-suites/portable/py36/build.gradle b/sdks/python/test-suites/portable/py36/build.gradle
index 70fbdce..d536d14 100644
--- a/sdks/python/test-suites/portable/py36/build.gradle
+++ b/sdks/python/test-suites/portable/py36/build.gradle
@@ -25,7 +25,7 @@
addPortableWordCountTasks()
task preCommitPy36() {
- dependsOn ':runners:flink:1.8:job-server-container:docker'
+ dependsOn ':runners:flink:1.9:job-server-container:docker'
dependsOn ':sdks:python:container:py36:docker'
dependsOn portableWordCountBatch
dependsOn portableWordCountStreaming
diff --git a/sdks/python/test-suites/portable/py37/build.gradle b/sdks/python/test-suites/portable/py37/build.gradle
index fa2ead2..da57c93 100644
--- a/sdks/python/test-suites/portable/py37/build.gradle
+++ b/sdks/python/test-suites/portable/py37/build.gradle
@@ -25,7 +25,7 @@
addPortableWordCountTasks()
task preCommitPy37() {
- dependsOn ':runners:flink:1.8:job-server-container:docker'
+ dependsOn ':runners:flink:1.9:job-server-container:docker'
dependsOn ':sdks:python:container:py37:docker'
dependsOn portableWordCountBatch
dependsOn portableWordCountStreaming
diff --git a/settings.gradle b/settings.gradle
index 7a03955..f0dff1e 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -39,6 +39,10 @@
include ":runners:flink:1.8"
include ":runners:flink:1.8:job-server"
include ":runners:flink:1.8:job-server-container"
+// Flink 1.9
+include ":runners:flink:1.9"
+include ":runners:flink:1.9:job-server"
+include ":runners:flink:1.9:job-server-container"
/* End Flink Runner related settings */
include ":runners:gearpump"
include ":runners:google-cloud-dataflow-java"
diff --git a/website/src/_includes/footer.html b/website/src/_includes/footer.html
index 8f4d983..2052fa9 100644
--- a/website/src/_includes/footer.html
+++ b/website/src/_includes/footer.html
@@ -42,11 +42,12 @@
width="14" height="14"
alt="External link."></a></div>
<div class="footer__cols__col__link"><a href="{{'/contribute/presentation-materials/'|prepend:site.baseurl}}">Media</a></div>
+ <div class="footer__cols__col__link"><a href="{{'/community/in-person/'|prepend:site.baseurl}}">Events/Meetups</a></div>
</div>
<div class="footer__cols__col footer__cols__col--md">
<div class="footer__cols__col__title">Resources</div>
<div class="footer__cols__col__link"><a href="{{'/blog/'|prepend:site.baseurl}}">Blog</a></div>
- <div class="footer__cols__col__link"><a href="{{'/get-started/support/'|prepend:site.baseurl}}">Support</a></div>
+ <div class="footer__cols__col__link"><a href="{{'/community/contact-us/'|prepend:site.baseurl}}">Contact Us</a></div>
<div class="footer__cols__col__link"><a href="https://github.com/apache/beam">GitHub</a></div>
</div>
</div>
diff --git a/website/src/_includes/head.html b/website/src/_includes/head.html
index 2fd0083..aba87c4 100644
--- a/website/src/_includes/head.html
+++ b/website/src/_includes/head.html
@@ -19,6 +19,9 @@
<link href="https://fonts.googleapis.com/css?family=Roboto:100,300,400" rel="stylesheet">
<link rel="stylesheet" href="{{ "/css/site.css" | prepend: site.baseurl }}">
<script src="https://code.jquery.com/jquery-2.2.4.min.js"></script>
+ <style>
+ .body__contained img { max-width: 100% }
+ </style>
<script src="{{ "/js/bootstrap.min.js" | prepend: site.baseurl }}"></script>
<script src="{{ "/js/language-switch.js" | prepend: site.baseurl }}"></script>
<script src="{{ "/js/fix-menu.js" | prepend: site.baseurl }}"></script>
diff --git a/website/src/_posts/2019-07-31-beam-2.14.0.md b/website/src/_posts/2019-07-31-beam-2.14.0.md
index 310e55e..0f00e71 100644
--- a/website/src/_posts/2019-07-31-beam-2.14.0.md
+++ b/website/src/_posts/2019-07-31-beam-2.14.0.md
@@ -79,6 +79,7 @@
### Known Issues
* Do **NOT** use Python MongoDB source in this release. Python MongoDB source [added](https://issues.apache.org/jira/browse/BEAM-5148) in this release has a known issue that can result in data loss. See ([BEAM-7866](https://issues.apache.org/jira/browse/BEAM-7866)) for details.
+* Can't install the Python SDK on macOS 10.15. See ([BEAM-8368](https://issues.apache.org/jira/browse/BEAM-8368)) for details.
## List of Contributors
diff --git a/website/src/_posts/2019-08-22-beam-2.15.0.md b/website/src/_posts/2019-08-22-beam-2.15.0.md
index 4346dcf..474cf13 100644
--- a/website/src/_posts/2019-08-22-beam-2.15.0.md
+++ b/website/src/_posts/2019-08-22-beam-2.15.0.md
@@ -58,6 +58,7 @@
* [BEAM-7616](https://issues.apache.org/jira/browse/BEAM-7616) urlopen calls may get stuck. (Regression from 2.14.0)
* [BEAM-8111](https://issues.apache.org/jira/browse/BEAM-8111) SchemaCoder fails on Dataflow, preventing the use of SqlTransform and schema-aware transforms. (Regression from 2.14.0)
+* ([BEAM-8368](https://issues.apache.org/jira/browse/BEAM-8368)) Can't install the Python SDK on macOS 10.15.
### Breaking Changes
diff --git a/website/src/_posts/2019-10-07-beam-2.16.0.md b/website/src/_posts/2019-10-07-beam-2.16.0.md
index 41d36c4..9356044 100644
--- a/website/src/_posts/2019-10-07-beam-2.16.0.md
+++ b/website/src/_posts/2019-10-07-beam-2.16.0.md
@@ -76,6 +76,7 @@
* Given that Python 2 will reach EOL on Jan 1 2020, Python 2 users of Beam will now receive a warning that new releases of Apache Beam will soon support Python 3 only.
* Filesystems not properly registered using FileIO.write in FlinkRunner. ([BEAM-8303](https://issues.apache.org/jira/browse/BEAM-8303))
* Performance regression in Java DirectRunner in streaming mode. ([BEAM-8363](https://issues.apache.org/jira/browse/BEAM-8363))
+* Can't install the Python SDK on macOS 10.15. ([BEAM-8368](https://issues.apache.org/jira/browse/BEAM-8368))
## List of Contributors
diff --git a/website/src/community/contact-us.md b/website/src/community/contact-us.md
index de8a884..9b79ad2 100644
--- a/website/src/community/contact-us.md
+++ b/website/src/community/contact-us.md
@@ -25,7 +25,7 @@
# Contact Us
There are many ways to reach the Beam user and developer communities - use
-whichever one seems best!
+whichever one seems best.
| How to contact us | When to use it |
| ----------------- | ---------------|
@@ -38,6 +38,8 @@
| [Slack](https://s.apache.org/beam-slack-channel) | Chat with users and developers in the ASF Slack. Note: Please [join the #beam channel](https://s.apache.org/beam-slack-channel) after you [created an account](https://s.apache.org/slack-invite). Please do not ask Beam questions in #general. |
{:.table}
+If you have questions about how to use Apache Beam, we recommend you try out the [user@](https://lists.apache.org/list.html?user@beam.apache.org) mailing list, and [StackOverflow](http://stackoverflow.com/questions/tagged/apache-beam).
+
[^1]: To subscribe or unsubscribe, a blank email is fine.
If you wish to report a security vulnerability, please contact [security@apache.org](mailto:security@apache.org). Apache Beam follows the typical [Apache vulnerability handling process](https://apache.org/security/committers.html#vulnerability-handling).
diff --git a/website/src/contribute/index.md b/website/src/contribute/index.md
index fa8a18f..9d84167 100644
--- a/website/src/contribute/index.md
+++ b/website/src/contribute/index.md
@@ -149,13 +149,17 @@
1. Make your code change. Every source file needs to include the Apache license header. Every new dependency needs to
have an open source license [compatible](https://www.apache.org/legal/resolved.html#criteria) with Apache.
-1. Add unit tests for your change
+
+1. Add unit tests for your change.
+
+1. Use descriptive commit messages that make it easy to identify changes and provide a clear history.
+
1. When your change is ready to be reviewed and merged, create a pull request.
- Format commit messages and the pull request title like `[BEAM-XXX] Fixes bug in ApproximateQuantiles`,
+
+1. Format commit messages and the pull request title like `[BEAM-XXX] Fixes bug in ApproximateQuantiles`,
where you replace BEAM-XXX with the appropriate JIRA issue.
This will automatically link the pull request to the issue.
- Use descriptive commit messages that make it easy to identify changes and provide a clear history.
- To support efficient and quality review, avoid tiny or out-of-context changes and huge mega-changes.
+
1. The pull request and any changes pushed to it will trigger [pre-commit
jobs](https://cwiki.apache.org/confluence/display/BEAM/Contribution+Testing+Guide#ContributionTestingGuide-Pre-commit). If a test fails and appears unrelated to your
change, you can cause tests to be re-run by adding a single line comment on your
@@ -163,9 +167,9 @@
retest this please
- There are other trigger phrases for post-commit tests found in
- .testinfra/jenkins, but use these sparingly because post-commit
- tests consume shared development resources.
+ Pull request template has a link to a [catalog of trigger phrases](https://github.com/apache/beam/blob/master/.test-infra/jenkins/README.md)
+ that start various post-commit tests suites. Use these sparingly because post-commit tests consume shared development resources.
+
1. Pull requests can only be merged by a
[Beam committer]({{ site.baseurl }}/contribute/team/).
To find a committer for your area, either:
@@ -174,11 +178,27 @@
- ask on [dev@beam.apache.org]({{ site.baseurl }}/community/contact-us/)
Use `R: @username` in the pull request to notify a reviewer.
+
1. If you don't get any response in 3 business days, email the [dev@ mailing list]({{ site.baseurl }}/community/contact-us) to ask for someone to look at your pull
request.
-1. Review feedback typically leads to follow-up changes. You can add these changes as additional "fixup" commits to the
- existing PR/branch. This will allow reviewer(s) to track the incremental progress. After review is complete and the
- PR accepted, multiple commits should be squashed (see [Git workflow tips](https://cwiki.apache.org/confluence/display/BEAM/Git+Tips)).
+
+### Make reviewer's job easier
+
+1. Provide context for your changes in the associated JIRA issue and/or PR description.
+
+1. Avoid huge mega-changes.
+
+1. Review feedback typically leads to follow-up changes. It is easier to review follow-up changes when they are added as additional "fixup" commits to the
+ existing PR/branch. This allows reviewer(s) to track the incremental progress and focus on new changes,
+ and keeps comment threads attached to the code.
+ Please refrain from squashing new commits into reviewed commits before review is completed.
+ Because squashing reviewed and unreviewed commits often makes it harder to
+ see the the difference between the review iterations, reviewers may ask you to unsquash new changes.
+
+1. After review is complete and the PR is accepted, fixup commits should be squashed (see [Git workflow tips](https://cwiki.apache.org/confluence/display/BEAM/Git+Tips)).
+ Beam committers [can squash](https://beam.apache.org/contribute/committer-guide/#merging-it)
+ all commits in the PR during merge, however if a PR has a mixture of independent changes that should not be squashed, and fixup commits,
+ then the PR author should help squashing fixup commits to maintain a clean commmit history.
## When will my change show up in an Apache Beam release?
diff --git a/website/src/contribute/release-guide.md b/website/src/contribute/release-guide.md
index 03fb6cc..1987986 100644
--- a/website/src/contribute/release-guide.md
+++ b/website/src/contribute/release-guide.md
@@ -993,7 +993,7 @@
```
Flink Local Runner
```
- ./gradlew :runners:flink:1.8:runQuickstartJavaFlinkLocal \
+ ./gradlew :runners:flink:1.9:runQuickstartJavaFlinkLocal \
-Prepourl=https://repository.apache.org/content/repositories/orgapachebeam-${KEY} \
-Pver=${RELEASE_VERSION}
```
diff --git a/website/src/documentation/dsls/sql/shell.md b/website/src/documentation/dsls/sql/shell.md
index 1317575..025b031 100644
--- a/website/src/documentation/dsls/sql/shell.md
+++ b/website/src/documentation/dsls/sql/shell.md
@@ -31,7 +31,7 @@
To use Beam SQL shell, you must first clone the [Beam SDK repository](https://github.com/apache/beam). Then, from the root of the repository clone, execute the following commands to run the shell:
```
-./gradlew -p sdks/java/extensions/sql/shell -Pbeam.sql.shell.bundled=':runners:flink:1.8,:sdks:java:io:kafka' installDist
+./gradlew -p sdks/java/extensions/sql/shell -Pbeam.sql.shell.bundled=':runners:flink:1.9,:sdks:java:io:kafka' installDist
./sdks/java/extensions/sql/shell/build/install/shell/bin/shell
```
@@ -119,7 +119,7 @@
1. Make sure the SQL shell includes the desired runner. Add the corresponding project id to the `-Pbeam.sql.shell.bundled` parameter of the Gradle invocation ([source code](https://github.com/apache/beam/blob/master/sdks/java/extensions/sql/shell/build.gradle), [project ids](https://github.com/apache/beam/blob/master/settings.gradle)). For example, use the following command to include Flink runner and KafkaIO:
```
- ./gradlew -p sdks/java/extensions/sql/shell -Pbeam.sql.shell.bundled=':runners:flink:1.8,:sdks:java:io:kafka' installDist
+ ./gradlew -p sdks/java/extensions/sql/shell -Pbeam.sql.shell.bundled=':runners:flink:1.9,:sdks:java:io:kafka' installDist
```
_Note: You can bundle multiple runners (using a comma-separated list) or other additional components in the same manner. For example, you can add support for more I/Os._
@@ -145,7 +145,7 @@
You can also build your own standalone package for SQL shell using `distZip` or `distTar` tasks. For example:
```
-./gradlew -p sdks/java/extensions/sql/shell -Pbeam.sql.shell.bundled=':runners:flink:1.8,:sdks:java:io:kafka' distZip
+./gradlew -p sdks/java/extensions/sql/shell -Pbeam.sql.shell.bundled=':runners:flink:1.9,:sdks:java:io:kafka' distZip
ls ./sdks/java/extensions/sql/shell/build/distributions/
beam-sdks-java-extensions-sql-shell-2.6.0-SNAPSHOT.tar beam-sdks-java-extensions-sql-shell-2.6.0-SNAPSHOT.zip
diff --git a/website/src/documentation/io/developing-io-overview.md b/website/src/documentation/io/developing-io-overview.md
index eabb8d9..b17a710 100644
--- a/website/src/documentation/io/developing-io-overview.md
+++ b/website/src/documentation/io/developing-io-overview.md
@@ -101,6 +101,16 @@
records per file, or if you'd like to read from a key-value store that supports
read operations in sorted key order.
+### Source lifecycle {#source}
+Here is a sequence diagram that shows the lifecycle of the Source during
+ the execution of the Read transform of an IO. The comments give useful
+ information to IO developers such as the constraints that
+ apply to the objects or particular cases such as streaming mode.
+
+ <!-- The source for the sequence diagram can be found in the the SVG resource. -->
+
+
### Using ParDo and GroupByKey
For data stores or file types where the data can be read in parallel, you can
diff --git a/website/src/documentation/programming-guide.md b/website/src/documentation/programming-guide.md
index 0c8a2c1..d78b609 100644
--- a/website/src/documentation/programming-guide.md
+++ b/website/src/documentation/programming-guide.md
@@ -802,6 +802,17 @@
> **Note:** You can use Java 8 lambda functions with several other Beam
> transforms, including `Filter`, `FlatMapElements`, and `Partition`.
+##### 4.2.1.4. DoFn lifecycle {#dofn}
+Here is a sequence diagram that shows the lifecycle of the DoFn during
+ the execution of the ParDo transform. The comments give useful
+ information to pipeline developers such as the constraints that
+ apply to the objects or particular cases such as failover or
+ instance reuse. They also give instanciation use cases.
+
+<!-- The source for the sequence diagram can be found in the the SVG resource. -->
+
+
#### 4.2.2. GroupByKey {#groupbykey}
`GroupByKey` is a Beam transform for processing collections of key/value pairs.
@@ -3098,4 +3109,4 @@
context.output(context.element());
}
}
-```
+```
\ No newline at end of file
diff --git a/website/src/documentation/runners/flink.md b/website/src/documentation/runners/flink.md
index 515d8e2..016400c 100644
--- a/website/src/documentation/runners/flink.md
+++ b/website/src/documentation/runners/flink.md
@@ -103,7 +103,11 @@
<th>Artifact Id</th>
</tr>
<tr>
- <td rowspan="2">2.17.0</td>
+ <td rowspan="3">2.17.0</td>
+ <td>1.9.x</td>
+ <td>beam-runners-flink-1.9</td>
+</tr>
+<tr>
<td>1.8.x</td>
<td>beam-runners-flink-1.8</td>
</tr>
@@ -266,12 +270,18 @@
<span class="language-py">
As of now you will need a copy of Apache Beam's source code. You can
-download it on the [Downloads page]({{ site.baseurl
-}}/get-started/downloads/). In the future there will be pre-built Docker images
-available. To run a pipeline on an embedded Flink cluster:
+download it on the [Downloads page]({{ site.baseurl }}/get-started/downloads/).
+
+Pre-built Docker images are available at Docker-Hub:
+[Python 2.7](https://hub.docker.com/r/apachebeam/python2.7_sdk),
+[Python 3.5](https://hub.docker.com/r/apachebeam/python3.5_sdk),
+[Python 3.6](https://hub.docker.com/r/apachebeam/python3.6_sdk),
+[Python 3.7](https://hub.docker.com/r/apachebeam/python3.7_sdk).
+
+To run a pipeline on an embedded Flink cluster:
</span>
-<span class="language-py">1. Start the JobService endpoint: `./gradlew :runners:flink:1.8:job-server:runShadow`
+<span class="language-py">1. Start the JobService endpoint: `./gradlew :runners:flink:1.9:job-server:runShadow`
</span>
<span class="language-py">
@@ -298,13 +308,13 @@
```
<span class="language-py">
-To run on a separate [Flink cluster](https://ci.apache.org/projects/flink/flink-docs-release-1.5/quickstart/setup_quickstart.html):
+To run on a separate [Flink cluster](https://ci.apache.org/projects/flink/flink-docs-release-1.8/tutorials/local_setup.html):
</span>
<span class="language-py">1. Start a Flink cluster which exposes the Rest interface on `localhost:8081` by default.
</span>
-<span class="language-py">2. Start JobService with Flink Rest endpoint: `./gradlew :runners:flink:1.8:job-server:runShadow -PflinkMasterUrl=localhost:8081`.
+<span class="language-py">2. Start JobService with Flink Rest endpoint: `./gradlew :runners:flink:1.9:job-server:runShadow -PflinkMasterUrl=localhost:8081`.
</span>
<span class="language-py">3. Submit the pipeline as above.
diff --git a/website/src/documentation/sdks/nexmark.md b/website/src/documentation/sdks/nexmark.md
index d5230da..b73023b 100644
--- a/website/src/documentation/sdks/nexmark.md
+++ b/website/src/documentation/sdks/nexmark.md
@@ -149,7 +149,7 @@
-P nexmark.runner
The Gradle project name of the runner, such as ":runners:direct-java" or
- ":runners:flink:1.8. The project names can be found in the root
+ ":runners:flink:1.9. The project names can be found in the root
`settings.gradle`.
Test data is deterministically synthesized on demand. The test
@@ -557,7 +557,7 @@
Batch Mode:
./gradlew :sdks:java:testing:nexmark:run \
- -Pnexmark.runner=":runners:flink:1.8" \
+ -Pnexmark.runner=":runners:flink:1.9" \
-Pnexmark.args="
--runner=FlinkRunner
--suite=SMOKE
@@ -570,7 +570,7 @@
Streaming Mode:
./gradlew :sdks:java:testing:nexmark:run \
- -Pnexmark.runner=":runners:flink:1.8" \
+ -Pnexmark.runner=":runners:flink:1.9" \
-Pnexmark.args="
--runner=FlinkRunner
--suite=SMOKE
diff --git a/website/src/documentation/sdks/python.md b/website/src/documentation/sdks/python.md
index 8b80c7d..33a5964 100644
--- a/website/src/documentation/sdks/python.md
+++ b/website/src/documentation/sdks/python.md
@@ -47,4 +47,3 @@
new I/O connectors. See the [Developing I/O connectors overview]({{ site.baseurl }}/documentation/io/developing-io-overview)
for information about developing new I/O connectors and links to
language-specific implementation guidance.
-
diff --git a/website/src/images/dofn-sequence-diagram.svg b/website/src/images/dofn-sequence-diagram.svg
new file mode 100644
index 0000000..898b1ae
--- /dev/null
+++ b/website/src/images/dofn-sequence-diagram.svg
@@ -0,0 +1,94 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" contentScriptType="application/ecmascript" contentStyleType="text/css" height="863px" preserveAspectRatio="none" style="width:740px;height:863px;" version="1.1" viewBox="0 0 740 863" width="740px" zoomAndPan="magnify"><defs/><g><rect fill="#6B9FE6" height="781.5625" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="49" y="60.8125"/><rect fill="#6B9FE6" height="425.2813" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="273" y="417.0938"/><rect fill="#6B9FE6" height="425.2813" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="547" y="417.0938"/><line style="stroke: #6B9FE6; stroke-width: 1.0; stroke-dasharray: 5.0,5.0;" x1="54" x2="54" y1="50.8125" y2="851.375"/><line style="stroke: #6B9FE6; stroke-width: 1.0; stroke-dasharray: 5.0,5.0;" x1="278" x2="278" y1="50.8125" y2="851.375"/><line style="stroke: #6B9FE6; stroke-width: 1.0; stroke-dasharray: 5.0,5.0;" x1="551.5" x2="551.5" y1="50.8125" y2="851.375"/><rect fill="#8AC483" height="30.4063" style="stroke: #8AC483; stroke-width: 1.5;" width="92" x="8" y="19.4063"/><text fill="#000000" font-family="Roboto" font-size="14" lengthAdjust="spacingAndGlyphs" textLength="78" x="15" y="39.3945">User pipeline</text><rect fill="#8AC483" height="46.8125" style="stroke: #8AC483; stroke-width: 1.5;" width="94" x="231" y="3"/><text fill="#000000" font-family="Roboto" font-size="14" font-style="italic" lengthAdjust="spacingAndGlyphs" textLength="80" x="238" y="24">«Serializable»</text><text fill="#000000" font-family="Roboto" font-size="14" lengthAdjust="spacingAndGlyphs" textLength="33" x="261.5" y="40.4063">DoFn</text><rect fill="#8AC483" height="30.4063" style="stroke: #8AC483; stroke-width: 1.5;" width="59" x="522.5" y="19.4063"/><text fill="#000000" font-family="Roboto" font-size="14" lengthAdjust="spacingAndGlyphs" textLength="45" x="529.5" y="39.3945">Runner</text><rect fill="#6B9FE6" height="781.5625" style="stroke: 
#6B9FE6; stroke-width: 1.0;" width="10" x="49" y="60.8125"/><rect fill="#6B9FE6" height="425.2813" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="273" y="417.0938"/><rect fill="#6B9FE6" height="425.2813" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="547" y="417.0938"/><path d="M283,65.8125 L283,105.8125 L515,105.8125 L515,75.8125 L505,65.8125 L283,65.8125 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M505,65.8125 L505,75.8125 L515,75.8125 L505,65.8125 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="181" x="289" y="82.873">can have non-transient instance</text><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="211" x="289" y="98.1074">variable state that will be deserialized</text><path d="M283,116.2813 L283,156.2813 L643,156.2813 L643,126.2813 L633,116.2813 L283,116.2813 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M633,116.2813 L633,126.2813 L643,126.2813 L633,116.2813 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="332" x="289" y="133.3418">do not include enclosing class serializable state; use static</text><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="339" x="289" y="148.5762">nested DoFn or define as anonymous class in static method</text><path d="M283,166.75 L283,206.75 L725,206.75 L725,176.75 L715,166.75 L283,166.75 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M715,166.75 L715,176.75 L725,176.75 L715,166.75 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="421" x="289" y="183.8105">no shared (global) static variable 
access (no sync mechanism) but a beam</text><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="409" x="289" y="199.0449">state (based on engine mechanisms) can be injected to processElement</text><path d="M283,217.2188 L283,257.2188 L648,257.2188 L648,227.2188 L638,217.2188 L283,217.2188 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M638,217.2188 L638,227.2188 L648,227.2188 L638,217.2188 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="344" x="289" y="234.2793">keep as pure function as possible or idempotent side effects</text><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="269" x="289" y="249.5137">because DoFns can be retried on failed bundles</text><polygon fill="#67666A" points="266,279.9219,276,283.9219,266,287.9219,270,283.9219" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="59" x2="272" y1="283.9219" y2="283.9219"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="69" x="66" y="278.748">create DoFn</text><polygon fill="#67666A" points="540,309.1563,550,313.1563,540,317.1563,544,313.1563" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="278" x2="546" y1="313.1563" y2="313.1563"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="250" x="285" y="307.9824">passed instance or deserialized on workers</text><path d="M64,326.1563 L64,366.1563 L405,366.1563 L405,336.1563 L395,326.1563 L64,326.1563 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M395,326.1563 L395,336.1563 L405,336.1563 L395,326.1563 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text 
fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="320" x="70" y="343.2168">If state variables are known at pipeline construction step</text><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="216" x="70" y="358.4512">initialize state variables by constructor</text><path d="M201,378.625 L331,378.625 L331,385.625 L321,395.625 L201,395.625 L201,378.625 " fill="#8AC483" style="stroke: #8AC483; stroke-width: 1.0;"/><rect fill="none" height="455.75" style="stroke: #8AC483; stroke-width: 2.0;" width="528" x="201" y="378.625"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="85" x="216" y="391.6855">DoFn Lifecycle</text><polygon fill="#67666A" points="294,413.0938,284,417.0938,294,421.0938,290,417.0938" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="288" x2="546" y1="417.0938" y2="417.0938"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="55" x="300" y="411.9199">call setup</text><path d="M288,430.0938 L288,455.0938 L658,455.0938 L658,440.0938 L648,430.0938 L288,430.0938 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M648,430.0938 L648,440.0938 L658,440.0938 L648,430.0938 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="349" x="294" y="447.1543">reused instance to process other bundles on the same worker</text><path d="M288,465.3281 L288,505.3281 L719,505.3281 L719,475.3281 L709,465.3281 L288,465.3281 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M709,465.3281 L709,475.3281 L719,475.3281 L709,465.3281 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" 
lengthAdjust="spacingAndGlyphs" textLength="410" x="294" y="482.3887">If state variables do not depend on the main pipeline program and are the</text><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="288" x="294" y="497.623">same for all DoFn instances initialize them in setup</text><path d="M211,517.7969 L347,517.7969 L347,524.7969 L337,534.7969 L211,534.7969 L211,517.7969 " fill="#8AC483" style="stroke: #8AC483; stroke-width: 1.0;"/><rect fill="none" height="245.1094" style="stroke: #8AC483; stroke-width: 2.0;" width="390.5" x="211" y="517.7969"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="91" x="226" y="530.8574">For each bundle</text><polygon fill="#67666A" points="294,552.2656,284,556.2656,294,560.2656,290,556.2656" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="288" x2="546" y1="556.2656" y2="556.2656"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="89" x="300" y="551.0918">call startBundle</text><path d="M221,571.2656 L365,571.2656 L365,578.2656 L355,588.2656 L221,588.2656 L221,571.2656 " fill="#8AC483" style="stroke: #8AC483; stroke-width: 1.0;"/><rect fill="none" height="126.1719" style="stroke: #8AC483; stroke-width: 2.0;" width="370.5" x="221" y="571.2656"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="99" x="236" y="584.3262">For each element</text><polygon fill="#67666A" points="294,605.7344,284,609.7344,294,613.7344,290,609.7344" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="288" x2="546" y1="609.7344" y2="609.7344"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="116" x="300" y="604.5605">call 
processElement</text><path d="M288,622.7344 L288,662.7344 L569,662.7344 L569,632.7344 L559,622.7344 L288,622.7344 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M559,622.7344 L559,632.7344 L569,632.7344 L559,622.7344 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="260" x="294" y="639.7949">If state variables are computed by the pipeline</text><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="240" x="294" y="655.0293">pass it in a PcollectionView as a side input</text><polygon fill="#67666A" points="535,685.4375,545,689.4375,535,693.4375,539,689.4375" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0; stroke-dasharray: 2.0,2.0;" x1="283" x2="541" y1="689.4375" y2="689.4375"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="36" x="290" y="684.2637">output</text><polygon fill="#67666A" points="294,721.6719,284,725.6719,294,729.6719,290,725.6719" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="288" x2="546" y1="725.6719" y2="725.6719"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="70" x="300" y="720.498">call onTimer</text><polygon fill="#67666A" points="294,750.9063,284,754.9063,294,758.9063,290,754.9063" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="288" x2="546" y1="754.9063" y2="754.9063"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="94" x="300" y="749.7324">call finishBundle</text><polygon fill="#67666A" points="535,787.1406,545,791.1406,535,795.1406,539,791.1406" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 
2.0;" x1="283" x2="541" y1="791.1406" y2="791.1406"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="234" x="290" y="785.9668">If DoFn is no more needed: call tearDown</text><path d="M288,804.1406 L288,829.1406 L633,829.1406 L633,814.1406 L623,804.1406 L288,804.1406 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M623,804.1406 L623,814.1406 L633,814.1406 L623,804.1406 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="324" x="294" y="821.2012">Call of teardown is best effort; do not use for side effects</text><!--MD5=[4b9cd25bbc466f533d08153696c40e3e]
+@startuml
+
+hide footbox
+skinparam backgroundColor transparent
+skinparam shadowing false
+skinparam defaultFontName "Roboto"
+
+skinparam sequenceArrowThickness 2
+
+skinparam note {
+ BackgroundColor #cee2f2
+ BorderColor #cee2f2
+}
+
+skinparam sequence {
+ ArrowColor #67666a
+
+ LifeLineBorderColor #6b9fe6
+ LifeLineBackgroundColor #6b9fe6
+
+ GroupBackgroundColor #8ac483
+ GroupBorderColor #8ac483
+
+ ParticipantBackgroundColor #8ac483
+ ParticipantBorderColor #8ac483
+}
+
+participant "User pipeline" as Pipeline
+participant DoFn << Serializable >>
+note right of DoFn: can have non-transient instance\nvariable state that will be deserialized
+note right of DoFn: do not include enclosing class serializable state; use static\nnested DoFn or define as anonymous class in static method
+note right of DoFn: no shared (global) static variable access (no sync mechanism) but a beam\nstate (based on engine mechanisms) can be injected to processElement
+note right of DoFn: keep as pure function as possible or idempotent side effects\nbecause DoFns can be retried on failed bundles
+
+participant Runner
+
+activate Pipeline
+Pipeline -> DoFn: **create DoFn **
+DoFn -> Runner: **passed instance or deserialized on workers**
+
+note right Pipeline: If state variables are known at pipeline construction step\ninitialize state variables by constructor
+
+group DoFn Lifecycle
+ Runner -> DoFn: **call setup**
+ activate Runner
+ activate DoFn
+ note right DoFn: reused instance to process other bundles on the same worker
+ note right DoFn: If state variables do not depend on the main pipeline program and are the\nsame for all DoFn instances initialize them in setup
+ group For each bundle
+ Runner -> DoFn: **call startBundle**
+ group For each element
+ Runner -> DoFn: **call processElement**
+ note right DoFn: If state variables are computed by the pipeline\npass it in a PcollectionView as a side input
+ DoFn - -> Runner: output
+ end
+ DoFn <- Runner: call onTimer
+ DoFn <- Runner: **call finishBundle**
+ end
+ DoFn -> Runner: **If DoFn is no more needed: call tearDown**
+ note right DoFn: Call of teardown is best effort; do not use for side effects
+end
+
+@enduml
+
+PlantUML version 1.2019.11(Sun Sep 22 12:02:15 CEST 2019)
+(GPL source distribution)
+Java Runtime: OpenJDK Runtime Environment
+JVM: OpenJDK 64-Bit Server VM
+Java Version: 1.8.0_222-b10
+Operating System: Linux
+Default Encoding: UTF-8
+Language: en
+Country: CA
+--></g></svg>
diff --git a/website/src/images/source-sequence-diagram.svg b/website/src/images/source-sequence-diagram.svg
new file mode 100644
index 0000000..02facd6
--- /dev/null
+++ b/website/src/images/source-sequence-diagram.svg
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" contentScriptType="application/ecmascript" contentStyleType="text/css" height="538px" preserveAspectRatio="none" style="width:574px;height:538px;" version="1.1" viewBox="0 0 574 538" width="574px" zoomAndPan="magnify"><defs/><g><rect fill="#6B9FE6" height="435.2813" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="250.5" y="82.0469"/><rect fill="#6B9FE6" height="152.1719" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="367.5" y="356.1563"/><rect fill="#6B9FE6" height="338.5781" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="468.5" y="140.5156"/><line style="stroke: #6B9FE6; stroke-width: 1.0; stroke-dasharray: 5.0,5.0;" x1="37" x2="37" y1="50.8125" y2="526.3281"/><line style="stroke: #6B9FE6; stroke-width: 1.0; stroke-dasharray: 5.0,5.0;" x1="255.5" x2="255.5" y1="50.8125" y2="526.3281"/><line style="stroke: #6B9FE6; stroke-width: 1.0; stroke-dasharray: 5.0,5.0;" x1="372.5" x2="372.5" y1="50.8125" y2="526.3281"/><line style="stroke: #6B9FE6; stroke-width: 1.0; stroke-dasharray: 5.0,5.0;" x1="473.5" x2="473.5" y1="50.8125" y2="526.3281"/><rect fill="#8AC483" height="30.4063" style="stroke: #8AC483; stroke-width: 1.5;" width="59" x="8" y="19.4063"/><text fill="#000000" font-family="Roboto" font-size="14" lengthAdjust="spacingAndGlyphs" textLength="45" x="15" y="39.3945">Runner</text><rect fill="#8AC483" height="46.8125" style="stroke: #8AC483; stroke-width: 1.5;" width="94" x="208.5" y="3"/><text fill="#000000" font-family="Roboto" font-size="14" font-style="italic" lengthAdjust="spacingAndGlyphs" textLength="80" x="215.5" y="24">«Serializable»</text><text fill="#000000" font-family="Roboto" font-size="14" lengthAdjust="spacingAndGlyphs" textLength="43" x="234" y="40.4063">Source</text><rect fill="#8AC483" height="30.4063" style="stroke: #8AC483; stroke-width: 1.5;" width="58" x="343.5" y="19.4063"/><text fill="#000000" font-family="Roboto" font-size="14" 
lengthAdjust="spacingAndGlyphs" textLength="44" x="350.5" y="39.3945">Reader</text><rect fill="#8AC483" height="30.4063" style="stroke: #8AC483; stroke-width: 1.5;" width="86" x="430.5" y="19.4063"/><text fill="#000000" font-family="Roboto" font-size="14" lengthAdjust="spacingAndGlyphs" textLength="72" x="437.5" y="39.3945">Middleware</text><rect fill="#6B9FE6" height="435.2813" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="250.5" y="82.0469"/><rect fill="#6B9FE6" height="152.1719" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="367.5" y="356.1563"/><rect fill="#6B9FE6" height="338.5781" style="stroke: #6B9FE6; stroke-width: 1.0;" width="10" x="468.5" y="140.5156"/><polygon fill="#67666A" points="238.5,78.0469,248.5,82.0469,238.5,86.0469,242.5,82.0469" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="37.5" x2="244.5" y1="82.0469" y2="82.0469"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="80" x="44.5" y="76.873">create source</text><polygon fill="#67666A" points="238.5,107.2813,248.5,111.2813,238.5,115.2813,242.5,111.2813" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="37.5" x2="244.5" y1="111.2813" y2="111.2813"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="104" x="44.5" y="106.1074">get estimated size</text><polygon fill="#67666A" points="456.5,136.5156,466.5,140.5156,456.5,144.5156,460.5,140.5156" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="260.5" x2="462.5" y1="140.5156" y2="140.5156"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="77" x="267.5" y="135.3418">estimate size</text><polygon fill="#67666A" points="48.5,165.75,38.5,169.75,48.5,173.75,44.5,169.75" style="stroke: #67666A; 
stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0; stroke-dasharray: 2.0,2.0;" x1="42.5" x2="249.5" y1="169.75" y2="169.75"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="66" x="54.5" y="164.5762">size of data</text><line style="stroke: #67666A; stroke-width: 2.0;" x1="37.5" x2="79.5" y1="214.2188" y2="214.2188"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="79.5" x2="79.5" y1="214.2188" y2="227.2188"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="38.5" x2="79.5" y1="227.2188" y2="227.2188"/><polygon fill="#67666A" points="48.5,223.2188,38.5,227.2188,48.5,231.2188,44.5,227.2188" style="stroke: #67666A; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="204" x="44.5" y="193.8105">compute size / number of executors</text><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="119" x="47.5" y="209.0449">= desired bundle size</text><polygon fill="#67666A" points="238.5,255.4531,248.5,259.4531,238.5,263.4531,242.5,259.4531" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="37.5" x2="244.5" y1="259.4531" y2="259.4531"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="187" x="44.5" y="254.2793">split source (desired bundle size)</text><path d="M265,240.2188 L265,265.2188 L548,265.2188 L548,250.2188 L538,240.2188 L265,240.2188 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M538,240.2188 L538,250.2188 L548,250.2188 L538,240.2188 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="262" x="271" y="257.2793">streaming: split based on number of executors</text><polygon fill="#67666A" 
points="48.5,290.6875,38.5,294.6875,48.5,298.6875,44.5,294.6875" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0; stroke-dasharray: 2.0,2.0;" x1="42.5" x2="249.5" y1="294.6875" y2="294.6875"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="70" x="54.5" y="289.5137">list<source></text><path d="M265,275.4531 L265,300.4531 L562,300.4531 L562,285.4531 L552,275.4531 L265,275.4531 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M552,275.4531 L552,285.4531 L562,285.4531 L552,275.4531 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="276" x="271" y="292.5137">streaming: runner asks the source for watermark</text><polygon fill="#67666A" points="238.5,322.9219,248.5,326.9219,238.5,330.9219,242.5,326.9219" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="37.5" x2="244.5" y1="326.9219" y2="326.9219"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="182" x="44.5" y="321.748">for each source create a reader</text><polygon fill="#67666A" points="355.5,352.1563,365.5,356.1563,355.5,360.1563,359.5,356.1563" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="260.5" x2="361.5" y1="356.1563" y2="356.1563"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="88" x="267.5" y="350.9824">create a reader</text><polygon fill="#67666A" points="48.5,384.3906,38.5,388.3906,48.5,392.3906,44.5,388.3906" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0; stroke-dasharray: 2.0,2.0;" x1="42.5" x2="366.5" y1="388.3906" y2="388.3906"/><text fill="#000000" font-family="Roboto" font-size="13" 
font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="45" x="54.5" y="383.2168">readers</text><path d="M382,369.1563 L382,394.1563 L539,394.1563 L539,379.1563 L529,369.1563 L382,369.1563 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><path d="M529,369.1563 L529,379.1563 L539,379.1563 L529,369.1563 " fill="#CEE2F2" style="stroke: #CEE2F2; stroke-width: 1.0;"/><text fill="#000000" font-family="Roboto" font-size="13" lengthAdjust="spacingAndGlyphs" textLength="136" x="388" y="386.2168">streaming: + checkpoint</text><polygon fill="#67666A" points="355.5,416.625,365.5,420.625,355.5,424.625,359.5,420.625" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="37.5" x2="361.5" y1="420.625" y2="420.625"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="167" x="44.5" y="415.4512">for each reader : start reader</text><polygon fill="#67666A" points="355.5,445.8594,365.5,449.8594,355.5,453.8594,359.5,449.8594" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="37.5" x2="361.5" y1="449.8594" y2="449.8594"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="183" x="44.5" y="444.6855">read elements until none to read</text><polygon fill="#67666A" points="461.5,475.0938,471.5,479.0938,461.5,483.0938,465.5,479.0938" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="377.5" x2="467.5" y1="479.0938" y2="479.0938"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="67" x="384.5" y="473.9199">get element</text><polygon fill="#67666A" points="360.5,504.3281,370.5,508.3281,360.5,512.3281,364.5,508.3281" style="stroke: #67666A; stroke-width: 1.0;"/><line style="stroke: #67666A; stroke-width: 2.0;" x1="37.5" x2="366.5" 
y1="508.3281" y2="508.3281"/><text fill="#000000" font-family="Roboto" font-size="13" font-weight="bold" lengthAdjust="spacingAndGlyphs" textLength="72" x="44.5" y="503.1543">close reader</text><!--MD5=[6e6ef42902efdf0e23898c2d72194b90]
+@startuml
+
+hide footbox
+skinparam backgroundColor transparent
+skinparam shadowing false
+skinparam defaultFontName "Roboto"
+
+skinparam sequenceArrowThickness 2
+
+skinparam note {
+ BackgroundColor #cee2f2
+ BorderColor #cee2f2
+}
+
+skinparam sequence {
+ ArrowColor #67666a
+
+ LifeLineBorderColor #6b9fe6
+ LifeLineBackgroundColor #6b9fe6
+
+ GroupBackgroundColor #8ac483
+ GroupBorderColor #8ac483
+
+ ParticipantBackgroundColor #8ac483
+ ParticipantBorderColor #8ac483
+}
+
+participant Runner
+participant "Source" << Serializable >>
+participant "Reader"
+participant Middleware
+
+Runner -> Source: **create source**
+activate Source
+
+Runner -> Source: get estimated size
+
+Source -> Middleware: **estimate size**
+activate Middleware
+
+Source - -> Runner: size of data
+
+Runner -> Runner: compute size / number of executors\n = desired bundle size
+
+Runner -> Source: split source (desired bundle size)
+note right
+ streaming: split based on number of executors
+end note
+
+Source - -> Runner: list<source>
+note right
+ streaming: runner asks the source for watermark
+end note
+
+Runner -> Source: **for each source create a reader**
+
+Source -> Reader: **create a reader**
+activate Reader
+
+Reader - -> Runner: **readers**
+note right
+ streaming: + checkpoint
+end note
+
+Runner -> Reader: **for each reader : start reader**
+
+Runner -> Reader: **read elements until none to read**
+
+Reader -> Middleware: **get element**
+
+deactivate Middleware
+
+Runner -> Reader: **close reader**
+deactivate Reader
+@enduml
+
+PlantUML version 1.2019.11(Sun Sep 22 12:02:15 CEST 2019)
+(GPL source distribution)
+Java Runtime: OpenJDK Runtime Environment
+JVM: OpenJDK 64-Bit Server VM
+Java Version: 1.8.0_222-b10
+Operating System: Linux
+Default Encoding: UTF-8
+Language: en
+Country: CA
+--></g></svg>