package org.apache.hadoop.examples; | |
import static org.junit.Assert.assertEquals; | |
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.util.StringTokenizer; | |
import java.util.TreeMap; | |
import org.apache.hadoop.conf.Configuration; | |
import org.apache.hadoop.fs.FileStatus; | |
import org.apache.hadoop.fs.FileSystem; | |
import org.apache.hadoop.fs.Path; | |
import org.apache.hadoop.util.ToolRunner; | |
import org.junit.Before; | |
import org.junit.Test; | |
public class TestWordStats { | |
private final static String INPUT = "src/test/java/org/apache/hadoop/examples/pi/math"; | |
private final static String MEAN_OUTPUT = "build/data/mean_output"; | |
private final static String MEDIAN_OUTPUT = "build/data/median_output"; | |
private final static String STDDEV_OUTPUT = "build/data/stddev_output"; | |
/** | |
* Modified internal test class that is designed to read all the files in the | |
* input directory, and find the standard deviation between all of the word | |
* lengths. | |
*/ | |
public static class WordStdDevReader { | |
private long wordsRead = 0; | |
private long wordLengthsRead = 0; | |
private long wordLengthsReadSquared = 0; | |
public WordStdDevReader() { | |
} | |
public double read(String path) throws IOException { | |
FileSystem fs = FileSystem.get(new Configuration()); | |
FileStatus[] files = fs.listStatus(new Path(path)); | |
for (FileStatus fileStat : files) { | |
if (!fileStat.isFile()) | |
continue; | |
BufferedReader br = null; | |
try { | |
br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); | |
String line; | |
while ((line = br.readLine()) != null) { | |
StringTokenizer st = new StringTokenizer(line); | |
String word; | |
while (st.hasMoreTokens()) { | |
word = st.nextToken(); | |
this.wordsRead++; | |
this.wordLengthsRead += word.length(); | |
this.wordLengthsReadSquared += (long) Math.pow(word.length(), 2.0); | |
} | |
} | |
} catch (IOException e) { | |
System.out.println("Output could not be read!"); | |
throw e; | |
} finally { | |
br.close(); | |
} | |
} | |
double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead)); | |
mean = Math.pow(mean, 2.0); | |
double term = (((double) this.wordLengthsReadSquared / ((double) this.wordsRead))); | |
double stddev = Math.sqrt((term - mean)); | |
return stddev; | |
} | |
} | |
/** | |
* Modified internal test class that is designed to read all the files in the | |
* input directory, and find the median length of all the words. | |
*/ | |
public static class WordMedianReader { | |
private long wordsRead = 0; | |
private TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>(); | |
public WordMedianReader() { | |
} | |
public double read(String path) throws IOException { | |
FileSystem fs = FileSystem.get(new Configuration()); | |
FileStatus[] files = fs.listStatus(new Path(path)); | |
int num = 0; | |
for (FileStatus fileStat : files) { | |
if (!fileStat.isFile()) | |
continue; | |
BufferedReader br = null; | |
try { | |
br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); | |
String line; | |
while ((line = br.readLine()) != null) { | |
StringTokenizer st = new StringTokenizer(line); | |
String word; | |
while (st.hasMoreTokens()) { | |
word = st.nextToken(); | |
this.wordsRead++; | |
if (this.map.get(word.length()) == null) { | |
this.map.put(word.length(), 1); | |
} else { | |
int count = this.map.get(word.length()); | |
this.map.put(word.length(), count + 1); | |
} | |
} | |
} | |
} catch (IOException e) { | |
System.out.println("Output could not be read!"); | |
throw e; | |
} finally { | |
br.close(); | |
} | |
} | |
int medianIndex1 = (int) Math.ceil((this.wordsRead / 2.0)); | |
int medianIndex2 = (int) Math.floor((this.wordsRead / 2.0)); | |
for (Integer key : this.map.navigableKeySet()) { | |
int prevNum = num; | |
num += this.map.get(key); | |
if (medianIndex2 >= prevNum && medianIndex1 <= num) { | |
return key; | |
} else if (medianIndex2 >= prevNum && medianIndex1 < num) { | |
Integer nextCurrLen = this.map.navigableKeySet().iterator().next(); | |
double median = (key + nextCurrLen) / 2.0; | |
return median; | |
} | |
} | |
return -1; | |
} | |
} | |
/** | |
* Modified internal test class that is designed to read all the files in the | |
* input directory, and find the mean length of all the words. | |
*/ | |
public static class WordMeanReader { | |
private long wordsRead = 0; | |
private long wordLengthsRead = 0; | |
public WordMeanReader() { | |
} | |
public double read(String path) throws IOException { | |
FileSystem fs = FileSystem.get(new Configuration()); | |
FileStatus[] files = fs.listStatus(new Path(path)); | |
for (FileStatus fileStat : files) { | |
if (!fileStat.isFile()) | |
continue; | |
BufferedReader br = null; | |
try { | |
br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); | |
String line; | |
while ((line = br.readLine()) != null) { | |
StringTokenizer st = new StringTokenizer(line); | |
String word; | |
while (st.hasMoreTokens()) { | |
word = st.nextToken(); | |
this.wordsRead++; | |
this.wordLengthsRead += word.length(); | |
} | |
} | |
} catch (IOException e) { | |
System.out.println("Output could not be read!"); | |
throw e; | |
} finally { | |
br.close(); | |
} | |
} | |
double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead)); | |
return mean; | |
} | |
} | |
/** | |
* Internal class designed to delete the output directory. Meant solely for | |
* use before and after the test is run; this is so next iterations of the | |
* test do not encounter a "file already exists" error. | |
* | |
* @param dir | |
* The directory to delete. | |
* @return Returns whether the deletion was successful or not. | |
*/ | |
public static boolean deleteDir(File dir) { | |
if (dir.isDirectory()) { | |
String[] children = dir.list(); | |
for (int i = 0; i < children.length; i++) { | |
boolean success = deleteDir(new File(dir, children[i])); | |
if (!success) { | |
System.out.println("Could not delete directory after test!"); | |
return false; | |
} | |
} | |
} | |
// The directory is now empty so delete it | |
return dir.delete(); | |
} | |
@Before public void setup() throws Exception { | |
deleteDir(new File(MEAN_OUTPUT)); | |
deleteDir(new File(MEDIAN_OUTPUT)); | |
deleteDir(new File(STDDEV_OUTPUT)); | |
} | |
@Test public void testGetTheMean() throws Exception { | |
String args[] = new String[2]; | |
args[0] = INPUT; | |
args[1] = MEAN_OUTPUT; | |
WordMean wm = new WordMean(); | |
ToolRunner.run(new Configuration(), wm, args); | |
double mean = wm.getMean(); | |
// outputs MUST match | |
WordMeanReader wr = new WordMeanReader(); | |
assertEquals(mean, wr.read(INPUT), 0.0); | |
} | |
@Test public void testGetTheMedian() throws Exception { | |
String args[] = new String[2]; | |
args[0] = INPUT; | |
args[1] = MEDIAN_OUTPUT; | |
WordMedian wm = new WordMedian(); | |
ToolRunner.run(new Configuration(), wm, args); | |
double median = wm.getMedian(); | |
// outputs MUST match | |
WordMedianReader wr = new WordMedianReader(); | |
assertEquals(median, wr.read(INPUT), 0.0); | |
} | |
@Test public void testGetTheStandardDeviation() throws Exception { | |
String args[] = new String[2]; | |
args[0] = INPUT; | |
args[1] = STDDEV_OUTPUT; | |
WordStandardDeviation wsd = new WordStandardDeviation(); | |
ToolRunner.run(new Configuration(), wsd, args); | |
double stddev = wsd.getStandardDeviation(); | |
// outputs MUST match | |
WordStdDevReader wr = new WordStdDevReader(); | |
assertEquals(stddev, wr.read(INPUT), 0.0); | |
} | |
} |