Update with MLlib hashing and tf-idf implementations.
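
Replace the custom OpenNLP n-gram / tf-idf pipeline in PreparedData with
Spark MLlib's HashingTF and IDF. A minimal, self-contained sketch of the
new flow follows (Spark 1.3.1 APIs; the object name and sample texts are
illustrative only):

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.mllib.feature.{HashingTF, IDF, IDFModel}
    import org.apache.spark.mllib.linalg.Vector

    object TfIdfSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(
          new SparkConf().setAppName("tfidf-sketch").setMaster("local[2]"))
        val texts = sc.parallelize(Seq("win a free prize now", "see you at lunch"))

        // Split on spaces and slide a window of size 2 to form bigrams,
        // then hash each bigram into a term-frequency vector.
        val hasher = new HashingTF()
        def hashTF(text: String): Vector =
          hasher.transform(text.split(" ").sliding(2).map(_.mkString(" ")).toSeq)

        // Fit idf weights over the whole corpus, then re-weight each tf vector.
        val tf = texts.map(hashTF).cache()
        val idf: IDFModel = new IDF().fit(tf)
        idf.transform(tf).collect().foreach(println)

        sc.stop()
      }
    }

With the model fit once on the training corpus, a single new document is
transformed at predict time via idf.transform(hashTF(text)), as in
PreparedData.transform below.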
diff --git a/README.md b/README.md
index 24e7241..5b27591 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,10 @@
# Release Information
+## Version 2.2
+
+Modified PreparedData to use the MLlib hashing and tf-idf implementations.
+
## Version 2.1
Fixed dot product implementation in the predict methods to work with the batch predict method for evaluation.
diff --git a/build.sbt b/build.sbt
index 149041b..7276847 100644
--- a/build.sbt
+++ b/build.sbt
@@ -7,7 +7,5 @@
"io.prediction" %% "core" % pioVersion.value % "provided",
"org.apache.spark" %% "spark-core" % "1.3.1" % "provided",
"org.apache.spark" %% "spark-mllib" % "1.3.1" % "provided",
- "org.xerial.snappy" % "snappy-java" % "1.1.1.7",
- "org.apache.opennlp" % "opennlp-tools" % "1.5.3",
- "com.github.fommil.netlib" % "core" % "1.1.2"
+ "org.xerial.snappy" % "snappy-java" % "1.1.1.7"
)
diff --git a/data/semanticanalysis.json b/data/sentimentanalysis.json
similarity index 100%
rename from data/semanticanalysis.json
rename to data/sentimentanalysis.json
diff --git a/engine.json b/engine.json
index 30df882..da6bd9c 100644
--- a/engine.json
+++ b/engine.json
@@ -4,16 +4,12 @@
"engineFactory": "org.template.textclassification.TextClassificationEngine",
"datasource": {
"params": {
- "appName": "MyTextApp",
- "evalK": 3
+ "appName": "MyTextApp"
}
},
"preparator": {
"params": {
- "nMin": 1,
- "nMax": 3,
- "inverseIdfMin" : 0,
- "inverseIdfMax" : 0.85
+ "nGram": 2
}
},
"algorithms": [
@@ -22,12 +18,6 @@
"params": {
"lambda": 0.25
}
- },
- {
- "name": "lr",
- "params": {
- "lambda": 1.25
- }
}
]
-}
\ No newline at end of file
+}
diff --git a/src/main/scala/org/template/textclassification/DataSource.scala b/src/main/scala/org/template/textclassification/DataSource.scala
index 766c93c..c470be2 100644
--- a/src/main/scala/org/template/textclassification/DataSource.scala
+++ b/src/main/scala/org/template/textclassification/DataSource.scala
@@ -39,17 +39,17 @@
// Get the RDD of events.
PEventStore.find(
appName = dsp.appName,
- entityType = Some("source"), // specify data entity type
- eventNames = Some(List("phrases")) // specify data event name
+ entityType = Some("content"), // specify data entity type
+ eventNames = Some(List("e-mail")) // specify data event name
// Convert the collected RDD of events to an RDD of Observation
// objects.
)(sc).map(e => {
- val label = e.properties.get[Double]("sentiment")
+ val label : String = e.properties.get[String]("label")
Observation(
- label,
- e.properties.get[String]("phrase"),
- label.toString
+ if (label == "spam") 1.0 else 0.0,
+ e.properties.get[String]("text"),
+ label
)
}).cache
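+    // Each imported event is expected to have this shape (illustrative):
+    //   {"event": "e-mail", "entityType": "content", "entityId": "1",
+    //    "properties": {"label": "spam", "text": "..."}}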
}
diff --git a/src/main/scala/org/template/textclassification/Evaluation.scala b/src/main/scala/org/template/textclassification/Evaluation.scala
index 1365eca..2ff093d 100644
--- a/src/main/scala/org/template/textclassification/Evaluation.scala
+++ b/src/main/scala/org/template/textclassification/Evaluation.scala
@@ -37,21 +37,14 @@
// Set data source and preparator parameters.
private[this] val baseEP = EngineParams(
dataSourceParams = DataSourceParams(appName = "MyTextApp", evalK = Some(3)),
- preparatorParams = PreparatorParams(
- nMin = 1,
- nMax = 2,
- inverseIdfMin = 0,
- inverseIdfMax = 0.85
- )
+ preparatorParams = PreparatorParams(nGram = 2)
)
// Set the algorithm params for which we will assess an accuracy score.
engineParamsList = Seq(
baseEP.copy(algorithmParamsList = Seq(("nb", NBAlgorithmParams(0.25)))),
- baseEP.copy(algorithmParamsList = Seq(("nb", NBAlgorithmParams(0.5)))),
baseEP.copy(algorithmParamsList = Seq(("nb", NBAlgorithmParams(1.0)))),
baseEP.copy(algorithmParamsList = Seq(("lr", LRAlgorithmParams(0.5)))),
- baseEP.copy(algorithmParamsList = Seq(("lr", LRAlgorithmParams(1.0)))),
- baseEP.copy(algorithmParamsList = Seq(("lr", LRAlgorithmParams(1.5))))
+ baseEP.copy(algorithmParamsList = Seq(("lr", LRAlgorithmParams(1.25))))
)
}
\ No newline at end of file
diff --git a/src/main/scala/org/template/textclassification/LRAlgorithm.scala b/src/main/scala/org/template/textclassification/LRAlgorithm.scala
index bf71824..22b7a86 100644
--- a/src/main/scala/org/template/textclassification/LRAlgorithm.scala
+++ b/src/main/scala/org/template/textclassification/LRAlgorithm.scala
@@ -93,8 +93,7 @@
// 5. Define prediction rule.
def predict(text : String): PredictedResult = {
- try {
- val x : Array[Double] = pd.transform(text).toArray
+ val x : Array[Double] = pd.transform(text).toArray
// Logistic Regression binary formula for positive probability.
// According to MLLib documentation, class labeled 0 is used as pivot.
@@ -103,17 +102,14 @@
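// With z = coefficients . x + intercept, the odds of the positive class
// satisfy p1 / (1 - p1) = exp(z), which rearranges as: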
// p1 = exp(z) * (1 - p1)
// p1 * (1 + exp(z)) = exp(z)
// p1 = exp(z)/(1 + exp(z))
- val pred = lrModels.map(
- e => {
- val z = exp(innerProduct(e._2.coefficients, x) + e._2.intercept)
- (e._1, z / (1 + z))
- }
- ).maxBy(_._2)
+ val pred = lrModels.map(
+ e => {
+ val z = exp(innerProduct(e._2.coefficients, x) + e._2.intercept)
+ (e._1, z / (1 + z))
+ }
+ ).maxBy(_._2)
- PredictedResult(pd.categoryMap(pred._1), pred._2)
- } catch {
- case e : IllegalArgumentException => PredictedResult(pd.majorityCategory, 0)
- }
+ PredictedResult(pd.categoryMap(pred._1), pred._2)
}
}
diff --git a/src/main/scala/org/template/textclassification/NBAlgorithm.scala b/src/main/scala/org/template/textclassification/NBAlgorithm.scala
index c9cf029..512c2e9 100644
--- a/src/main/scala/org/template/textclassification/NBAlgorithm.scala
+++ b/src/main/scala/org/template/textclassification/NBAlgorithm.scala
@@ -87,12 +87,8 @@
// the prediction rule given in the tutorial.
def predict(doc : String) : PredictedResult = {
- try {
- val x: Array[Double] = getScores(doc)
- val y: (Double, Double) = (nb.labels zip x).maxBy(_._2)
- new PredictedResult(pd.categoryMap.getOrElse(y._1, ""), y._2)
- } catch {
- case e : IllegalArgumentException => PredictedResult(pd.majorityCategory, 0)
- }
+ val x: Array[Double] = getScores(doc)
+ val y: (Double, Double) = (nb.labels zip x).maxBy(_._2)
+ new PredictedResult(pd.categoryMap.getOrElse(y._1, ""), y._2)
}
}
\ No newline at end of file
diff --git a/src/main/scala/org/template/textclassification/Preparator.scala b/src/main/scala/org/template/textclassification/Preparator.scala
index ca8370a..871b7e4 100644
--- a/src/main/scala/org/template/textclassification/Preparator.scala
+++ b/src/main/scala/org/template/textclassification/Preparator.scala
@@ -3,10 +3,8 @@
import io.prediction.controller.PPreparator
import io.prediction.controller.Params
-import opennlp.tools.ngram.NGramModel
-import opennlp.tools.tokenize.SimpleTokenizer
-import opennlp.tools.util.StringList
import org.apache.spark.SparkContext
+import org.apache.spark.mllib.feature.{IDF, IDFModel, HashingTF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
@@ -22,10 +20,7 @@
// components.
case class PreparatorParams(
- nMin: Int,
- nMax: Int,
- inverseIdfMin : Double,
- inverseIdfMax : Double
+ nGram : Int
) extends Params
@@ -36,7 +31,7 @@
// Prepare your training data.
def prepare(sc : SparkContext, td: TrainingData): PreparedData = {
- new PreparedData(td, pp.nMin, pp.nMax, pp.inverseIdfMin, pp. inverseIdfMax)
+ new PreparedData(td, pp.nGram)
}
}
@@ -44,123 +39,48 @@
class PreparedData (
val td : TrainingData,
-val nMin: Int,
-val nMax: Int,
-val inverseIdfMin : Double,
-val inverseIdfMax : Double
+val nGram : Int
) extends Serializable {
- // 1. Tokenizer: document => token list.
- // Takes an individual document and converts it to
- // a list of allowable tokens.
- private def tokenize (doc : String): Array[String] = {
- SimpleTokenizer.INSTANCE
- .tokenize(doc.toLowerCase)
- .filter(e => ! td.stopWords.contains(e))
+ // 1. Hashing function: Text -> term frequency vector.
+
+ private val hasher = new HashingTF()
+
+ private def hashTF (text : String) : Vector = {
+ val newList : Array[String] = text.split(" ")
+ .sliding(nGram)
+ .map(_.mkString(" ")) // join with a space so distinct n-grams hash to distinct terms
+ .toArray
+
+ hasher.transform(newList)
}
+ // 2. Term frequency vector -> tf-idf vector.
- // 2. Hasher: Array[tokens] => Map(n-gram -> n-gram document tf).
-
- private def hash (tokenList : Array[String]): HashMap[String, Double] = {
- // Initialize an NGramModel from OpenNLP tools library,
- // and add the list of allowable tokens to the n-gram model.
- val model : NGramModel = new NGramModel()
- model.add(new StringList(tokenList: _*), nMin, nMax)
-
- val map : HashMap[String, Double] = HashMap(
- model.iterator.map(
- x => (x.toString, model.getCount(x).toDouble)
- ).toSeq : _*
- )
-
- val mapSum = map.values.sum
-
- // Divide by the total number of n-grams in the document
- // to obtain n-gram frequency.
- map.map(e => (e._1, e._2 / mapSum))
-
- }
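+ // Per the MLlib docs, IDF.fit weights each hashed term t by
+ // log((m + 1) / (docFreq(t) + 1)), where m is the corpus size.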
+ val idf : IDFModel = new IDF().fit(td.data.map(e => hashTF(e.text)))
- // 3. Bigram universe extractor: RDD[bigram hashmap] => RDD[(n-gram, n-gram idf)]
+ // 3. Document Transformer: text => tf-idf vector.
- private def createUniverse(u: RDD[HashMap[String, Double]]): RDD[(String, Double)] = {
- // Total number of documents (should be 11314).
- val numDocs: Double = td.data.count.toDouble
- u.flatMap(e => e.map(f => (f._1, 1.0)))
- .reduceByKey(_ + _)
- .filter(e => {
- val docFreq = e._2 / numDocs
-
- // Cut out n-grams with inverse i.d.f. greater/less than or equal to min/max
- // cutoff.
- docFreq >= inverseIdfMin && docFreq <= inverseIdfMax
- })
- .map(e => (e._1, log(numDocs / e._2)))
- }
-
-
- // 4. Set private class variables for use in data transformations.
-
- // Create ngram to idf hashmap for every n-gram in universe:
- // Map(n-gram -> n-gram idf)
- private val idf : HashMap[String, Double] = HashMap(
- createUniverse(
- td.data
- .map(e => hash(tokenize(e.text)))
- ).collect: _*
- )
-
-
-
-
- // Get total number n-grams used.
- val numTokens : Int = idf.size
-
-
- // Create n-gram to global index hashmap:
- // Map(n-gram -> global index)
- private val globalIndex : HashMap[String, Int] = HashMap(
- idf.keys.zipWithIndex.toSeq
- : _*)
-
- // 5. Document Transformer: document => sparse tf-idf vector.
- // This takes a single document, tokenizes it, hashes it,
- // and finally returns a sparse vector containing the
- // tf-idf entries of the document n-grams (0 for all n-grams
- // not contained in the document).
-
- def transform(doc: String): Vector = {
+ def transform(text : String): Vector = {
- // Map(n-gram -> document tf)
- val hashedDoc = hash(tokenize(doc)).filter(e => idf.contains(e._1))
- Vectors.sparse(
- numTokens,
- hashedDoc.map {
- case (ngram, tf) => (globalIndex(ngram), idf(ngram) * tf)
- }.toArray
- )
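+ // Hash the text into a term-frequency vector, then apply the fitted idf weights.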
+ idf.transform(hashTF(text))
}
- // 6. Data Transformer: RDD[documents] => RDD[LabeledPoints]
+ // 4. Data Transformer: RDD[documents] => RDD[LabeledPoints]
val transformedData: RDD[(LabeledPoint)] = {
td.data.map(e => LabeledPoint(e.label, transform(e.text)))
}
- // 7. Finally extract category map, associating label to category.
+ // 5. Finally, extract the category map associating each label with its category.
val categoryMap = td.data.map(e => (e.label, e.category)).collectAsMap
- // 8. Finally consider the case where new document has no matching-ngrams.
- val majorityCategory = categoryMap.getOrElse(
- td.data.map(e => e.label).countByValue.maxBy(_._2)._1,
- ""
- )
}