blob: 333deba8f3fd8b263d21b3f0f6c931cce150ed4c [file] [view]
---
title: "Try Apache Beam"
---
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Try Apache Beam
You can try an Apache Beam pipeline using our interactive notebooks.
{{< language-switcher java py go >}}
## Interactive WordCount in Colab
This interactive notebook shows you what a simple, minimal version of WordCount looks like.
{{< highlight java >}}
package samples.quickstart;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.transforms.FlatMapElements;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.TypeDescriptors;
import java.util.Arrays;
/**
 * Minimal WordCount pipeline.
 *
 * <p>Reads every text file matching {@code data/*}, splits each line into words,
 * counts the occurrences of each distinct word, and writes one
 * {@code word: count} line per word to files prefixed with {@code outputs/part}.
 */
public class WordCount {
  /**
   * Builds and runs the pipeline.
   *
   * @param args command-line arguments, parsed into {@link PipelineOptions}
   */
  public static void main(String[] args) {
    String inputsPattern = "data/*";
    String outputsPrefix = "outputs/part";

    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);
    pipeline
        .apply("Read lines", TextIO.read().from(inputsPattern))
        // Split on runs of characters that are not Unicode letters.
        .apply("Find words", FlatMapElements.into(TypeDescriptors.strings())
            .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
        // split() yields an empty first token when a line starts with a non-letter.
        .apply("Filter empty words", Filter.by((String word) -> !word.isEmpty()))
        .apply("Count words", Count.perElement())
        // Label each step by what it does: format first, then write.
        .apply("Format results", MapElements.into(TypeDescriptors.strings())
            .via((KV<String, Long> wordCount) ->
                wordCount.getKey() + ": " + wordCount.getValue()))
        .apply("Write results", TextIO.write().to(outputsPrefix));

    // Wait for the job so main() does not return while the pipeline is still running.
    pipeline.run().waitUntilFinish();
  }
}
{{< /highlight >}}
{{< paragraph class="language-java" >}}
<a class="button button--primary" target="_blank"
href="https://colab.sandbox.google.com/github/{{< param branch_repo >}}/examples/notebooks/get-started/try-apache-beam-java.ipynb">
Run in Colab
</a>
<a class="button button--primary" target="_blank"
href="https://github.com/{{< param branch_repo >}}/examples/notebooks/get-started/try-apache-beam-java.ipynb">
View on GitHub
</a>
{{< /paragraph >}}
{{< paragraph class="language-java" >}}
To learn how to install and run the Apache Beam Java SDK on your own computer, follow the instructions in the <a href="/get-started/quickstart-java">Java Quickstart</a>.
{{< /paragraph >}}
{{< highlight py >}}
import apache_beam as beam
import re
# Glob of the input text files, and the path prefix of the output files.
inputs_pattern = 'data/*'
outputs_prefix = 'outputs/part'

with beam.Pipeline() as pipeline:
  # Each stage is bound to a name so the data flow reads top to bottom.
  lines = pipeline | 'Read lines' >> beam.io.ReadFromText(inputs_pattern)

  # A word is any run of ASCII letters and apostrophes.
  words = lines | 'Find words' >> beam.FlatMap(
      lambda text: re.findall(r"[a-zA-Z']+", text))

  # Count by pairing every word with 1 and summing the ones per word.
  ones = words | 'Pair words with 1' >> beam.Map(lambda token: (token, 1))
  totals = ones | 'Group and sum' >> beam.CombinePerKey(sum)

  # Each (word, count) tuple is written using its str() form.
  formatted = totals | 'Format results' >> beam.Map(lambda pair: str(pair))
  formatted | 'Write results' >> beam.io.WriteToText(outputs_prefix)
{{< /highlight >}}
{{< paragraph class="language-py" >}}
<a class="button button--primary" target="_blank"
href="https://colab.sandbox.google.com/github/{{< param branch_repo >}}/examples/notebooks/get-started/try-apache-beam-py.ipynb">
Run in Colab
</a>
<a class="button button--primary" target="_blank"
href="https://github.com/{{< param branch_repo >}}/examples/notebooks/get-started/try-apache-beam-py.ipynb">
View on GitHub
</a>
{{< /paragraph >}}
{{< paragraph class="language-py" >}}
To learn how to install and run the Apache Beam Python SDK on your own computer, follow the instructions in the <a href="/get-started/quickstart-py">Python Quickstart</a>.
{{< /paragraph >}}
{{< highlight go >}}
package main
import (
"context"
"flag"
"fmt"
"regexp"
"github.com/apache/beam/sdks/v2/go/pkg/beam"
"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
"github.com/apache/beam/sdks/v2/go/pkg/beam/runners/direct"
"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"
_ "github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem/local"
)
var (
	// input is the -input flag: the file(s) to read; defaults to "data/*".
	input = flag.String("input", "data/*", "File(s) to read.")

	// output is the -output flag: the file the formatted counts are written to.
	output = flag.String("output", "outputs/wordcounts.txt", "Output filename.")

	// wordRE matches a word: a run of ASCII letters, optionally followed by an
	// apostrophe and a single lowercase letter (as in "don't" or "it's").
	wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`)
)
// main builds the WordCount pipeline and runs it on the direct runner.
//
// The pipeline reads lines from the files named by the -input flag, splits
// each line into words with wordRE, counts the occurrences of every distinct
// word, and writes one "word: count" line per word to the -output file.
func main() {
	flag.Parse()
	beam.Init()

	pipeline := beam.NewPipeline()
	root := pipeline.Root()

	lines := textio.Read(root, *input)
	words := beam.ParDo(root, func(line string, emit func(string)) {
		for _, word := range wordRE.FindAllString(line, -1) {
			emit(word)
		}
	}, lines)
	counted := stats.Count(root, words)
	formatted := beam.ParDo(root, func(word string, count int) string {
		return fmt.Sprintf("%s: %v", word, count)
	}, counted)
	textio.Write(root, *output, formatted)

	// Surface pipeline failures instead of discarding the returned error and
	// exiting as if the run had succeeded.
	if _, err := direct.Execute(context.Background(), pipeline); err != nil {
		panic(fmt.Errorf("failed to execute job: %w", err))
	}
}
{{< /highlight >}}
{{< paragraph class="language-go" >}}
<a class="button button--primary" target="_blank"
href="https://colab.sandbox.google.com/github/{{< param branch_repo >}}/examples/notebooks/get-started/try-apache-beam-go.ipynb">
Run in Colab
</a>
<a class="button button--primary" target="_blank"
href="https://github.com/{{< param branch_repo >}}/examples/notebooks/get-started/try-apache-beam-go.ipynb">
View on GitHub
</a>
{{< /paragraph >}}
{{< paragraph class="language-go" >}}
To learn how to install and run the Apache Beam Go SDK on your own computer, follow the instructions in the <a href="/get-started/quickstart-go">Go Quickstart</a>.
{{< /paragraph >}}
For a more detailed explanation of how WordCount works, see the [WordCount Example Walkthrough](/get-started/wordcount-example).
## Next Steps
* Walk through additional WordCount examples in the [WordCount Example Walkthrough](/get-started/wordcount-example).
* Take a self-paced tour through our [Learning Resources](/documentation/resources/learning-resources).
* Dive into some of our favorite [Videos and Podcasts](/get-started/resources/videos-and-podcasts).
* Join the Beam [users@](/community/contact-us) mailing list.
* If you're interested in contributing to the Apache Beam codebase, see the [Contribution Guide](/contribute).
Please don't hesitate to [reach out](/community/contact-us) if you encounter any issues!