src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.parsefilter.naivebayes;

 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.util.HashMap;
 import java.util.HashSet;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;

 /**
  * Builds the model file for the two-class (0/irrelevant, 1/relevant) naive
  * Bayes parse filter from a tab-separated training file.
  */
 public class Train {

   /**
    * Removes the first occurrence of {@code tomatch} from {@code line}.
    *
    * @param tomatch
    *          the substring to remove
    * @param line
    *          the text to search
    * @return {@code line} without the first occurrence of {@code tomatch}, or
    *         {@code line} unchanged when it does not contain {@code tomatch}
    */
   public static String replacefirstoccuranceof(String tomatch, String line) {
     int index = line.indexOf(tomatch);
     if (index == -1) {
       return line;
     }
     return line.substring(0, index)
         + line.substring(index + tomatch.length());
   }

   /**
    * Increments the counter stored for {@code key}, starting at 1 for a key
    * that is not present yet. The empty string is never counted.
    *
    * @param dict
    *          the word-to-count map to update in place
    * @param key
    *          the word to count
    */
   public static void updateHashMap(HashMap<String, Integer> dict, String key) {
     if (key.isEmpty()) {
       return;
     }
     Integer count = dict.get(key);
     dict.put(key, count == null ? 1 : count + 1);
   }

   /**
    * Serializes a map as comma separated {@code key:value} pairs, in the
    * iteration order of the map.
    *
    * @param dict
    *          the word-to-count map to serialize
    * @return the joined pairs without a trailing comma; the empty string for an
    *         empty map
    */
   public static String flattenHashMap(HashMap<String, Integer> dict) {
     StringBuilder result = new StringBuilder();

     for (java.util.Map.Entry<String, Integer> entry : dict.entrySet()) {
       // Separator goes in front of every pair but the first, so an empty map
       // yields "" instead of failing while stripping a trailing comma.
       if (result.length() > 0) {
         result.append(',');
       }
       result.append(entry.getKey()).append(':').append(entry.getValue());
     }

     return result.toString();
   }

   /**
    * Adds every non-empty word to the vocabulary and to the given frequency
    * map.
    *
    * @return the number of non-empty words that were counted
    */
   private static int countWords(String[] words, HashSet<String> vocabulary,
       HashMap<String, Integer> frequencies) {
     int counted = 0;
     for (String word : words) {
       // split(" ") produces empty strings for consecutive or leading blanks.
       if (word.isEmpty()) {
         continue;
       }
       vocabulary.add(word);
       updateHashMap(frequencies, word);
       counted++;
     }
     return counted;
   }

   /**
    * Trains the model from the configuration resource {@code filepath} and
    * writes it to the path {@code naivebayes-model} on the configured file
    * system, overwriting an existing file.
    * <p>
    * Every input line is {@code <label>\t<text>}. Label {@code 0} marks an
    * irrelevant example; any other label is counted as relevant. Blank lines
    * are ignored. The text is reduced to the letters a-z and blanks, lower
    * cased and split on blanks.
    * <p>
    * The model file holds, one value per line: the vocabulary size, then for
    * class 0 and for class 1 the class label, the number of examples, the
    * number of words and the {@code word:count} list.
    *
    * @param filepath
    *          name of the training data resource, resolved through
    *          {@link Configuration#getConfResourceAsReader(String)}
    * @throws IOException
    *           if the resource cannot be opened or read, or the model cannot
    *           be written
    */
   public static void start(String filepath) throws IOException {

     // two classes 0/irrelevant and 1/relevant

     // calculate the total number of instances/examples per class, word count in
     // each class and for each class a word:frequency map

     int numof_ir = 0;
     int numof_r = 0;
     int numwords_ir = 0;
     int numwords_r = 0;
     HashSet<String> uniquewords = new HashSet<String>();
     HashMap<String, Integer> wordfreq_ir = new HashMap<String, Integer>();
     HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>();

     Configuration configuration = new Configuration();
     FileSystem fs = FileSystem.get(configuration);

     java.io.Reader resource = configuration.getConfResourceAsReader(filepath);
     if (resource == null) {
       // Report the missing resource instead of letting BufferedReader fail
       // with a NullPointerException.
       throw new IOException("Cannot open training data resource: " + filepath);
     }

     BufferedReader bufferedReader = new BufferedReader(resource);
     try {
       String line;
       while ((line = bufferedReader.readLine()) != null) {

         // Blank or tab-only lines carry neither a label nor text.
         if (line.trim().isEmpty()) {
           continue;
         }

         // The label is everything in front of the first tab. A line without
         // a tab is used as label and as text alike.
         int tab = line.indexOf('\t');
         String target = tab < 0 ? line : line.substring(0, tab);
         String text = tab < 0 ? line : line.substring(tab + 1);

         // Locale.ROOT keeps the lower-cased words ASCII under every default
         // locale.
         String[] linearray = text.replaceAll("[^a-zA-Z ]", "")
             .toLowerCase(java.util.Locale.ROOT).split(" ");

         // update the data structures
         if (target.equals("0")) {
           numof_ir += 1;
           numwords_ir += countWords(linearray, uniquewords, wordfreq_ir);
         } else {
           numof_r += 1;
           numwords_r += countWords(linearray, uniquewords, wordfreq_r);
         }
       }
     } finally {
       bufferedReader.close();
     }

     // write the model file

     Path path = new Path("naivebayes-model");

     // Explicit charset so the written bytes do not depend on the platform
     // default encoding.
     Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path,
         true), "UTF-8"));
     try {
       writer.write(String.valueOf(uniquewords.size()) + "\n");
       writer.write("0\n");
       writer.write(String.valueOf(numof_ir) + "\n");
       writer.write(String.valueOf(numwords_ir) + "\n");
       writer.write(flattenHashMap(wordfreq_ir) + "\n");
       writer.write("1\n");
       writer.write(String.valueOf(numof_r) + "\n");
       writer.write(String.valueOf(numwords_r) + "\n");
       writer.write(flattenHashMap(wordfreq_r) + "\n");
     } finally {
       writer.close();
     }

   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.parsefilter.naivebayes;

	import java.io.BufferedReader;
	import java.io.BufferedWriter;
	import java.io.IOException;
	import java.io.OutputStreamWriter;
	import java.io.Writer;
	import java.util.HashMap;
	import java.util.HashSet;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.fs.Path;

	/**
	 * Builds the model file for the two-class (0/irrelevant, 1/relevant) naive
	 * Bayes parse filter from a tab-separated training file.
	 */
	public class Train {

	  /**
	   * Removes the first occurrence of {@code tomatch} from {@code line}.
	   *
	   * @param tomatch
	   *          the substring to remove
	   * @param line
	   *          the text to search
	   * @return {@code line} without the first occurrence of {@code tomatch}, or
	   *         {@code line} unchanged when it does not contain {@code tomatch}
	   */
	  public static String replacefirstoccuranceof(String tomatch, String line) {
	    int index = line.indexOf(tomatch);
	    if (index == -1) {
	      return line;
	    }
	    return line.substring(0, index)
	        + line.substring(index + tomatch.length());
	  }

	  /**
	   * Increments the counter stored for {@code key}, starting at 1 for a key
	   * that is not present yet. The empty string is never counted.
	   *
	   * @param dict
	   *          the word-to-count map to update in place
	   * @param key
	   *          the word to count
	   */
	  public static void updateHashMap(HashMap<String, Integer> dict, String key) {
	    if (key.isEmpty()) {
	      return;
	    }
	    Integer count = dict.get(key);
	    dict.put(key, count == null ? 1 : count + 1);
	  }

	  /**
	   * Serializes a map as comma separated {@code key:value} pairs, in the
	   * iteration order of the map.
	   *
	   * @param dict
	   *          the word-to-count map to serialize
	   * @return the joined pairs without a trailing comma; the empty string for an
	   *         empty map
	   */
	  public static String flattenHashMap(HashMap<String, Integer> dict) {
	    StringBuilder result = new StringBuilder();

	    for (java.util.Map.Entry<String, Integer> entry : dict.entrySet()) {
	      // Separator goes in front of every pair but the first, so an empty map
	      // yields "" instead of failing while stripping a trailing comma.
	      if (result.length() > 0) {
	        result.append(',');
	      }
	      result.append(entry.getKey()).append(':').append(entry.getValue());
	    }

	    return result.toString();
	  }

	  /**
	   * Adds every non-empty word to the vocabulary and to the given frequency
	   * map.
	   *
	   * @return the number of non-empty words that were counted
	   */
	  private static int countWords(String[] words, HashSet<String> vocabulary,
	      HashMap<String, Integer> frequencies) {
	    int counted = 0;
	    for (String word : words) {
	      // split(" ") produces empty strings for consecutive or leading blanks.
	      if (word.isEmpty()) {
	        continue;
	      }
	      vocabulary.add(word);
	      updateHashMap(frequencies, word);
	      counted++;
	    }
	    return counted;
	  }

	  /**
	   * Trains the model from the configuration resource {@code filepath} and
	   * writes it to the path {@code naivebayes-model} on the configured file
	   * system, overwriting an existing file.
	   * <p>
	   * Every input line is {@code <label>\t<text>}. Label {@code 0} marks an
	   * irrelevant example; any other label is counted as relevant. Blank lines
	   * are ignored. The text is reduced to the letters a-z and blanks, lower
	   * cased and split on blanks.
	   * <p>
	   * The model file holds, one value per line: the vocabulary size, then for
	   * class 0 and for class 1 the class label, the number of examples, the
	   * number of words and the {@code word:count} list.
	   *
	   * @param filepath
	   *          name of the training data resource, resolved through
	   *          {@link Configuration#getConfResourceAsReader(String)}
	   * @throws IOException
	   *           if the resource cannot be opened or read, or the model cannot
	   *           be written
	   */
	  public static void start(String filepath) throws IOException {

	    // two classes 0/irrelevant and 1/relevant

	    // calculate the total number of instances/examples per class, word count in
	    // each class and for each class a word:frequency map

	    int numof_ir = 0;
	    int numof_r = 0;
	    int numwords_ir = 0;
	    int numwords_r = 0;
	    HashSet<String> uniquewords = new HashSet<String>();
	    HashMap<String, Integer> wordfreq_ir = new HashMap<String, Integer>();
	    HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>();

	    Configuration configuration = new Configuration();
	    FileSystem fs = FileSystem.get(configuration);

	    java.io.Reader resource = configuration.getConfResourceAsReader(filepath);
	    if (resource == null) {
	      // Report the missing resource instead of letting BufferedReader fail
	      // with a NullPointerException.
	      throw new IOException("Cannot open training data resource: " + filepath);
	    }

	    BufferedReader bufferedReader = new BufferedReader(resource);
	    try {
	      String line;
	      while ((line = bufferedReader.readLine()) != null) {

	        // Blank or tab-only lines carry neither a label nor text.
	        if (line.trim().isEmpty()) {
	          continue;
	        }

	        // The label is everything in front of the first tab. A line without
	        // a tab is used as label and as text alike.
	        int tab = line.indexOf('\t');
	        String target = tab < 0 ? line : line.substring(0, tab);
	        String text = tab < 0 ? line : line.substring(tab + 1);

	        // Locale.ROOT keeps the lower-cased words ASCII under every default
	        // locale.
	        String[] linearray = text.replaceAll("[^a-zA-Z ]", "")
	            .toLowerCase(java.util.Locale.ROOT).split(" ");

	        // update the data structures
	        if (target.equals("0")) {
	          numof_ir += 1;
	          numwords_ir += countWords(linearray, uniquewords, wordfreq_ir);
	        } else {
	          numof_r += 1;
	          numwords_r += countWords(linearray, uniquewords, wordfreq_r);
	        }
	      }
	    } finally {
	      bufferedReader.close();
	    }

	    // write the model file

	    Path path = new Path("naivebayes-model");

	    // Explicit charset so the written bytes do not depend on the platform
	    // default encoding.
	    Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path,
	        true), "UTF-8"));
	    try {
	      writer.write(String.valueOf(uniquewords.size()) + "\n");
	      writer.write("0\n");
	      writer.write(String.valueOf(numof_ir) + "\n");
	      writer.write(String.valueOf(numwords_ir) + "\n");
	      writer.write(flattenHashMap(wordfreq_ir) + "\n");
	      writer.write("1\n");
	      writer.write(String.valueOf(numof_r) + "\n");
	      writer.write(String.valueOf(numwords_r) + "\n");
	      writer.write(flattenHashMap(wordfreq_r) + "\n");
	    } finally {
	      writer.close();
	    }

	  }

	}