/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch;

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.List;

import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob;
import org.apache.crunch.impl.mr.MRJob;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.To;
import org.apache.crunch.test.TemporaryPath;
import org.apache.crunch.test.TemporaryPaths;
import org.apache.crunch.types.avro.Avros;
import org.junit.Rule;
import org.junit.Test;

import com.google.common.io.Files;

/**
 * Integration test for a text-cleaning pipeline that writes one output from
 * the map side of a job and verifies that job prepare/completion hooks run.
 */
public class CleanTextIT {
  private static final int LINES_IN_SHAKES = 3667;

@Rule
public TemporaryPath tmpDir = TemporaryPaths.create();
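
  // Lower-cases each line of input text.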
static DoFn<String, String> CLEANER = new DoFn<String, String>() {
@Override
public void process(String input, Emitter<String> emitter) {
emitter.emit(input.toLowerCase());
}
};
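
  // Splits a line into whitespace-delimited words.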
  static DoFn<String, String> SPLIT = new DoFn<String, String>() {
    @Override
    public void process(String input, Emitter<String> emitter) {
      // Split on runs of whitespace; skip the empty leading token that
      // String.split produces when a line starts with whitespace.
      for (String word : input.split("\\s+")) {
        if (!word.isEmpty()) {
          emitter.emit(word);
        }
      }
    }
  };

@Test
public void testMapSideOutputs() throws Exception {
MRPipeline pipeline = new MRPipeline(CleanTextIT.class, tmpDir.getDefaultConfiguration());
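    // Hooks that record how many times the job control framework invokes them.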
JobHook prepareOne = new JobHook();
JobHook prepareTwo = new JobHook();
JobHook completed = new JobHook();
pipeline.addPrepareHook(prepareOne).addPrepareHook(prepareTwo).addCompletionHook(completed);
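    // Copy the Shakespeare fixture into the temp dir and read it as lines.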
String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
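    // Stage 1: clean the text and write it directly from the map phase.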
PCollection<String> cleanShakes = shakespeare.parallelDo(CLEANER, Avros.strings());
File cso = tmpDir.getFile("cleanShakes");
cleanShakes.write(To.textFile(cso.getAbsolutePath()));
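    // Stage 2: split the cleaned text into words and write the per-word counts.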
File wc = tmpDir.getFile("wordCounts");
cleanShakes.parallelDo(SPLIT, Avros.strings()).count().write(To.textFile(wc.getAbsolutePath()));
pipeline.done();
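    // The cleaning stage ran map-side, so its output is a single map part file
    // with one line per input line.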
File cleanFile = new File(cso, "part-m-00000");
List<String> lines = Files.readLines(cleanFile, Charset.defaultCharset());
assertEquals(LINES_IN_SHAKES, lines.size());
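    // The planner fuses the whole pipeline into one job, so each hook fires once.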
assertEquals(1, prepareOne.called);
assertEquals(1, prepareTwo.called);
assertEquals(1, completed.called);
}
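
  /** Hook implementation that simply counts its invocations. */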
static class JobHook implements CrunchControlledJob.Hook {
int called = 0;
@Override
public void run(MRJob job) throws IOException {
called++;
}
}
}