| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.hadoop.examples; |
| |
| import static org.junit.Assert.assertEquals; |
| |
| import java.io.BufferedReader; |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.InputStreamReader; |
| import java.util.StringTokenizer; |
| import java.util.TreeMap; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FileStatus; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.util.ToolRunner; |
| import org.junit.Before; |
| import org.junit.Test; |
| |
| public class TestWordStats { |
| |
| private final static String INPUT = "src/test/java/org/apache/hadoop/examples/pi/math"; |
| private final static String MEAN_OUTPUT = "build/data/mean_output"; |
| private final static String MEDIAN_OUTPUT = "build/data/median_output"; |
| private final static String STDDEV_OUTPUT = "build/data/stddev_output"; |
| |
| /** |
| * Modified internal test class that is designed to read all the files in the |
| * input directory, and find the standard deviation between all of the word |
| * lengths. |
| */ |
| public static class WordStdDevReader { |
| private long wordsRead = 0; |
| private long wordLengthsRead = 0; |
| private long wordLengthsReadSquared = 0; |
| |
| public WordStdDevReader() { |
| } |
| |
| public double read(String path) throws IOException { |
| FileSystem fs = FileSystem.get(new Configuration()); |
| FileStatus[] files = fs.listStatus(new Path(path)); |
| |
| for (FileStatus fileStat : files) { |
| if (!fileStat.isFile()) |
| continue; |
| |
| BufferedReader br = null; |
| |
| try { |
| br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); |
| |
| String line; |
| while ((line = br.readLine()) != null) { |
| StringTokenizer st = new StringTokenizer(line); |
| String word; |
| while (st.hasMoreTokens()) { |
| word = st.nextToken(); |
| this.wordsRead++; |
| this.wordLengthsRead += word.length(); |
| this.wordLengthsReadSquared += (long) Math.pow(word.length(), 2.0); |
| } |
| } |
| |
| } catch (IOException e) { |
| System.out.println("Output could not be read!"); |
| throw e; |
| } finally { |
| br.close(); |
| } |
| } |
| |
| double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead)); |
| mean = Math.pow(mean, 2.0); |
| double term = (((double) this.wordLengthsReadSquared / ((double) this.wordsRead))); |
| double stddev = Math.sqrt((term - mean)); |
| return stddev; |
| } |
| |
| } |
| |
| /** |
| * Modified internal test class that is designed to read all the files in the |
| * input directory, and find the median length of all the words. |
| */ |
| public static class WordMedianReader { |
| private long wordsRead = 0; |
| private TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>(); |
| |
| public WordMedianReader() { |
| } |
| |
| public double read(String path) throws IOException { |
| FileSystem fs = FileSystem.get(new Configuration()); |
| FileStatus[] files = fs.listStatus(new Path(path)); |
| |
| int num = 0; |
| |
| for (FileStatus fileStat : files) { |
| if (!fileStat.isFile()) |
| continue; |
| |
| BufferedReader br = null; |
| |
| try { |
| br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); |
| |
| String line; |
| while ((line = br.readLine()) != null) { |
| StringTokenizer st = new StringTokenizer(line); |
| String word; |
| while (st.hasMoreTokens()) { |
| word = st.nextToken(); |
| this.wordsRead++; |
| if (this.map.get(word.length()) == null) { |
| this.map.put(word.length(), 1); |
| } else { |
| int count = this.map.get(word.length()); |
| this.map.put(word.length(), count + 1); |
| } |
| } |
| } |
| } catch (IOException e) { |
| System.out.println("Output could not be read!"); |
| throw e; |
| } finally { |
| br.close(); |
| } |
| } |
| |
| int medianIndex1 = (int) Math.ceil((this.wordsRead / 2.0)); |
| int medianIndex2 = (int) Math.floor((this.wordsRead / 2.0)); |
| |
| for (Integer key : this.map.navigableKeySet()) { |
| int prevNum = num; |
| num += this.map.get(key); |
| |
| if (medianIndex2 >= prevNum && medianIndex1 <= num) { |
| return key; |
| } else if (medianIndex2 >= prevNum && medianIndex1 < num) { |
| Integer nextCurrLen = this.map.navigableKeySet().iterator().next(); |
| double median = (key + nextCurrLen) / 2.0; |
| return median; |
| } |
| } |
| return -1; |
| } |
| |
| } |
| |
| /** |
| * Modified internal test class that is designed to read all the files in the |
| * input directory, and find the mean length of all the words. |
| */ |
| public static class WordMeanReader { |
| private long wordsRead = 0; |
| private long wordLengthsRead = 0; |
| |
| public WordMeanReader() { |
| } |
| |
| public double read(String path) throws IOException { |
| FileSystem fs = FileSystem.get(new Configuration()); |
| FileStatus[] files = fs.listStatus(new Path(path)); |
| |
| for (FileStatus fileStat : files) { |
| if (!fileStat.isFile()) |
| continue; |
| |
| BufferedReader br = null; |
| |
| try { |
| br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); |
| |
| String line; |
| while ((line = br.readLine()) != null) { |
| StringTokenizer st = new StringTokenizer(line); |
| String word; |
| while (st.hasMoreTokens()) { |
| word = st.nextToken(); |
| this.wordsRead++; |
| this.wordLengthsRead += word.length(); |
| } |
| } |
| } catch (IOException e) { |
| System.out.println("Output could not be read!"); |
| throw e; |
| } finally { |
| br.close(); |
| } |
| } |
| |
| double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead)); |
| return mean; |
| } |
| |
| } |
| |
| /** |
| * Internal class designed to delete the output directory. Meant solely for |
| * use before and after the test is run; this is so next iterations of the |
| * test do not encounter a "file already exists" error. |
| * |
| * @param dir |
| * The directory to delete. |
| * @return Returns whether the deletion was successful or not. |
| */ |
| public static boolean deleteDir(File dir) { |
| if (dir.isDirectory()) { |
| String[] children = dir.list(); |
| for (int i = 0; i < children.length; i++) { |
| boolean success = deleteDir(new File(dir, children[i])); |
| if (!success) { |
| System.out.println("Could not delete directory after test!"); |
| return false; |
| } |
| } |
| } |
| |
| // The directory is now empty so delete it |
| return dir.delete(); |
| } |
| |
| @Before public void setup() throws Exception { |
| deleteDir(new File(MEAN_OUTPUT)); |
| deleteDir(new File(MEDIAN_OUTPUT)); |
| deleteDir(new File(STDDEV_OUTPUT)); |
| } |
| |
| @Test public void testGetTheMean() throws Exception { |
| String args[] = new String[2]; |
| args[0] = INPUT; |
| args[1] = MEAN_OUTPUT; |
| |
| WordMean wm = new WordMean(); |
| ToolRunner.run(new Configuration(), wm, args); |
| double mean = wm.getMean(); |
| |
| // outputs MUST match |
| WordMeanReader wr = new WordMeanReader(); |
| assertEquals(mean, wr.read(INPUT), 0.0); |
| } |
| |
| @Test public void testGetTheMedian() throws Exception { |
| String args[] = new String[2]; |
| args[0] = INPUT; |
| args[1] = MEDIAN_OUTPUT; |
| |
| WordMedian wm = new WordMedian(); |
| ToolRunner.run(new Configuration(), wm, args); |
| double median = wm.getMedian(); |
| |
| // outputs MUST match |
| WordMedianReader wr = new WordMedianReader(); |
| assertEquals(median, wr.read(INPUT), 0.0); |
| } |
| |
| @Test public void testGetTheStandardDeviation() throws Exception { |
| String args[] = new String[2]; |
| args[0] = INPUT; |
| args[1] = STDDEV_OUTPUT; |
| |
| WordStandardDeviation wsd = new WordStandardDeviation(); |
| ToolRunner.run(new Configuration(), wsd, args); |
| double stddev = wsd.getStandardDeviation(); |
| |
| // outputs MUST match |
| WordStdDevReader wr = new WordStdDevReader(); |
| assertEquals(stddev, wr.read(INPUT), 0.0); |
| } |
| |
| } |