/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.nemo.examples.spark;

import org.apache.nemo.compiler.frontend.spark.core.rdd.SparkJavaPairRDD;
import org.apache.nemo.compiler.frontend.spark.core.rdd.SparkJavaRDD;
import org.apache.nemo.compiler.frontend.spark.sql.SparkSession;

import scala.Tuple2;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

/**
 * A Java Spark example that computes both a word count and a line count over the same input.
 */
public final class JavaWordAndLineCount {
  private static final Pattern SPACE = Pattern.compile(" ");

  /**
   * Private constructor.
   */
  private JavaWordAndLineCount() {
  }

  /**
   * Main method.
   *
   * @param args arguments: the path of the input file, optionally followed by the path of an output file.
   * @throws Exception exceptions.
   */
  public static void main(final String[] args) throws Exception {
    if (args.length < 1) {
      System.err.println("Usage: JavaWordAndLineCount <input_file> [<output_file>]");
      System.exit(1);
    }
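
    // Set up the SparkSession. Note that this is Nemo's SparkSession (see the import
    // above), which stands in for Spark's usual SQL entry point.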
    SparkSession spark = SparkSession
      .builder()
      .appName("JavaWordAndLineCount")
      .getOrCreate();
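
    // Read the input file into an RDD of lines.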
    SparkJavaRDD<String> lines = spark.read().textFile(args[0]).javaRDD();
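
    // Line count: map every line to the same ("line count", 1) pair, so that
    // reduceByKey sums all the ones into a single total.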
    SparkJavaPairRDD<String, Integer> lineOnes = lines.mapToPair(s -> new Tuple2<>("line count", 1));
    SparkJavaPairRDD<String, Integer> lineCounts = lineOnes.reduceByKey((i1, i2) -> i1 + i2);
    List<Tuple2<String, Integer>> lineOutput = lineCounts.collect();
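
    // Word count: split each line on single spaces, pair each word with a one,
    // and sum the ones per distinct word.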
    SparkJavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());
    SparkJavaPairRDD<String, Integer> wordOnes = words.mapToPair(s -> new Tuple2<>(s, 1));
    SparkJavaPairRDD<String, Integer> wordCounts = wordOnes.reduceByKey((i1, i2) -> i1 + i2);
    List<Tuple2<String, Integer>> wordOutput = wordCounts.collect();
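
    // Write to a file when an output path was given; otherwise print to the console.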
    final boolean writeMode = args.length > 1;
    if (writeMode) { // write to the output file
      try (BufferedWriter bw = new BufferedWriter(new FileWriter(args[1]))) {
        for (Tuple2<?, ?> lineTuple : lineOutput) {
          bw.write(lineTuple._1 + ": " + lineTuple._2 + "\n\n");
        }
        for (Tuple2<?, ?> wordTuple : wordOutput) {
          bw.write(wordTuple._1 + ": " + wordTuple._2 + "\n");
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    } else { // print to console.
      for (Tuple2<?, ?> lineTuple : lineOutput) {
        System.out.println(lineTuple._1 + ": " + lineTuple._2 + "\n");
      }
      for (Tuple2<?, ?> wordTuple : wordOutput) {
        System.out.println(wordTuple._1 + ": " + wordTuple._2);
      }
    }
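
    // Stop the session now that both results have been emitted.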
    spark.stop();
  }
}