Update with MLlib hashing and tf-idf implementations.
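
Replace the custom OpenNLP n-gram / tf-idf pipeline in PreparedData with
Spark MLlib's HashingTF and IDF. A minimal, self-contained sketch of the
new flow follows (Spark 1.3.1 APIs; the object name and sample texts are
illustrative only):

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.mllib.feature.{HashingTF, IDF, IDFModel}
    import org.apache.spark.mllib.linalg.Vector

    object TfIdfSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(
          new SparkConf().setAppName("tfidf-sketch").setMaster("local[2]"))
        val texts = sc.parallelize(Seq("win a free prize now", "see you at lunch"))

        // Split on spaces and slide a window of size 2 to form bigrams,
        // then hash each bigram into a term-frequency vector.
        val hasher = new HashingTF()
        def hashTF(text: String): Vector =
          hasher.transform(text.split(" ").sliding(2).map(_.mkString(" ")).toSeq)

        // Fit idf weights over the whole corpus, then re-weight each tf vector.
        val tf = texts.map(hashTF).cache()
        val idf: IDFModel = new IDF().fit(tf)
        idf.transform(tf).collect().foreach(println)

        sc.stop()
      }
    }

With the model fit once on the training corpus, a single new document is
transformed at predict time via idf.transform(hashTF(text)), as in
PreparedData.transform below.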
diff --git a/README.md b/README.md
index 24e7241..5b27591 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,10 @@
# Release Information
+## Version 2.2
+
+Modified PreparedData to use the MLlib hashing and tf-idf implementations.
+
## Version 2.1
Fixed dot product implementation in the predict methods to work with the batch predict method for evaluation.
diff --git a/build.sbt b/build.sbt
index 149041b..7276847 100644
--- a/build.sbt
+++ b/build.sbt
@@ -7,7 +7,5 @@
"io.prediction" %% "core" % pioVersion.value % "provided",
"org.apache.spark" %% "spark-core" % "1.3.1" % "provided",
"org.apache.spark" %% "spark-mllib" % "1.3.1" % "provided",
- "org.xerial.snappy" % "snappy-java" % "1.1.1.7",
- "org.apache.opennlp" % "opennlp-tools" % "1.5.3",
- "com.github.fommil.netlib" % "core" % "1.1.2"
+ "org.xerial.snappy" % "snappy-java" % "1.1.1.7"
)
diff --git a/data/semanticanalysis.json b/data/sentimentanalysis.json
similarity index 100%
rename from data/semanticanalysis.json
rename to data/sentimentanalysis.json
diff --git a/engine.json b/engine.json
index 30df882..da6bd9c 100644
--- a/engine.json
+++ b/engine.json
@@ -4,16 +4,12 @@
"engineFactory": "org.template.textclassification.TextClassificationEngine",
"datasource": {
"params": {
- "appName": "MyTextApp",
- "evalK": 3
+ "appName": "MyTextApp"
}
},
"preparator": {
"params": {
- "nMin": 1,
- "nMax": 3,
- "inverseIdfMin" : 0,
- "inverseIdfMax" : 0.85
+ "nGram": 2
}
},
"algorithms": [
@@ -22,12 +18,6 @@
"params": {
"lambda": 0.25
}
- },
- {
- "name": "lr",
- "params": {
- "lambda": 1.25
- }
}
]
-}
\ No newline at end of file
+}
diff --git a/src/main/scala/org/template/textclassification/DataSource.scala b/src/main/scala/org/template/textclassification/DataSource.scala
index 766c93c..c470be2 100644
--- a/src/main/scala/org/template/textclassification/DataSource.scala
+++ b/src/main/scala/org/template/textclassification/DataSource.scala
@@ -39,17 +39,17 @@
// Get the RDD of events.
PEventStore.find(
appName = dsp.appName,
- entityType = Some("source"), // specify data entity type
- eventNames = Some(List("phrases")) // specify data event name
+ entityType = Some("content"), // specify data entity type
+ eventNames = Some(List("e-mail")) // specify data event name
// Convert the collected RDD of events to an RDD of Observation
// objects.
)(sc).map(e => {
- val label = e.properties.get[Double]("sentiment")
+ val label : String = e.properties.get[String]("label")
Observation(
- label,
- e.properties.get[String]("phrase"),
- label.toString
+ if (label == "spam") 1.0 else 0.0,
+ e.properties.get[String]("text"),
+ label
)
}).cache
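+    // Each imported event is expected to have this shape (illustrative):
+    //   {"event": "e-mail", "entityType": "content", "entityId": "1",
+    //    "properties": {"label": "spam", "text": "..."}}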
}
diff --git a/src/main/scala/org/template/textclassification/Evaluation.scala b/src/main/scala/org/template/textclassification/Evaluation.scala
index 1365eca..2ff093d 100644
--- a/src/main/scala/org/template/textclassification/Evaluation.scala
+++ b/src/main/scala/org/template/textclassification/Evaluation.scala
@@ -37,21 +37,14 @@
// Set data source and preparator parameters.
private[this] val baseEP = EngineParams(
dataSourceParams = DataSourceParams(appName = "MyTextApp", evalK = Some(3)),
- preparatorParams = PreparatorParams(
- nMin = 1,
- nMax = 2,
- inverseIdfMin = 0,
- inverseIdfMax = 0.85
- )
+ preparatorParams = PreparatorParams(nGram = 2)
)
// Set the algorithm params for which we will assess an accuracy score.
engineParamsList = Seq(
baseEP.copy(algorithmParamsList = Seq(("nb", NBAlgorithmParams(0.25)))),
- baseEP.copy(algorithmParamsList = Seq(("nb", NBAlgorithmParams(0.5)))),
baseEP.copy(algorithmParamsList = Seq(("nb", NBAlgorithmParams(1.0)))),
baseEP.copy(algorithmParamsList = Seq(("lr", LRAlgorithmParams(0.5)))),
- baseEP.copy(algorithmParamsList = Seq(("lr", LRAlgorithmParams(1.0)))),
- baseEP.copy(algorithmParamsList = Seq(("lr", LRAlgorithmParams(1.5))))
+ baseEP.copy(algorithmParamsList = Seq(("lr", LRAlgorithmParams(1.25))))
)
}
\ No newline at end of file
diff --git a/src/main/scala/org/template/textclassification/LRAlgorithm.scala b/src/main/scala/org/template/textclassification/LRAlgorithm.scala
index bf71824..22b7a86 100644
--- a/src/main/scala/org/template/textclassification/LRAlgorithm.scala
+++ b/src/main/scala/org/template/textclassification/LRAlgorithm.scala
@@ -93,8 +93,7 @@
// 5. Define prediction rule.
def predict(text : String): PredictedResult = {
- try {
- val x : Array[Double] = pd.transform(text).toArray
+ val x : Array[Double] = pd.transform(text).toArray
// Logistic Regression binary formula for positive probability.
// According to MLLib documentation, class labeled 0 is used as pivot.
@@ -103,17 +102,14 @@
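// With z = coefficients . x + intercept, the odds of the positive class
// satisfy p1 / (1 - p1) = exp(z), which rearranges as: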
// p1 = exp(z) * (1 - p1)
// p1 * (1 + exp(z)) = exp(z)
// p1 = exp(z)/(1 + exp(z))
- val pred = lrModels.map(
- e => {
- val z = exp(innerProduct(e._2.coefficients, x) + e._2.intercept)
- (e._1, z / (1 + z))
- }
- ).maxBy(_._2)
+ val pred = lrModels.map(
+ e => {
+ val z = exp(innerProduct(e._2.coefficients, x) + e._2.intercept)
+ (e._1, z / (1 + z))
+ }
+ ).maxBy(_._2)
- PredictedResult(pd.categoryMap(pred._1), pred._2)
- } catch {
- case e : IllegalArgumentException => PredictedResult(pd.majorityCategory, 0)
- }
+ PredictedResult(pd.categoryMap(pred._1), pred._2)
}
}
diff --git a/src/main/scala/org/template/textclassification/NBAlgorithm.scala b/src/main/scala/org/template/textclassification/NBAlgorithm.scala
index c9cf029..512c2e9 100644
--- a/src/main/scala/org/template/textclassification/NBAlgorithm.scala
+++ b/src/main/scala/org/template/textclassification/NBAlgorithm.scala
@@ -87,12 +87,8 @@
// the prediction rule given in the tutorial.
def predict(doc : String) : PredictedResult = {
- try {
- val x: Array[Double] = getScores(doc)
- val y: (Double, Double) = (nb.labels zip x).maxBy(_._2)
- new PredictedResult(pd.categoryMap.getOrElse(y._1, ""), y._2)
- } catch {
- case e : IllegalArgumentException => PredictedResult(pd.majorityCategory, 0)
- }
+ val x: Array[Double] = getScores(doc)
+ val y: (Double, Double) = (nb.labels zip x).maxBy(_._2)
+ new PredictedResult(pd.categoryMap.getOrElse(y._1, ""), y._2)
}
}
\ No newline at end of file
diff --git a/src/main/scala/org/template/textclassification/Preparator.scala b/src/main/scala/org/template/textclassification/Preparator.scala
index ca8370a..871b7e4 100644
--- a/src/main/scala/org/template/textclassification/Preparator.scala
+++ b/src/main/scala/org/template/textclassification/Preparator.scala
@@ -3,10 +3,8 @@
import io.prediction.controller.PPreparator
import io.prediction.controller.Params
-import opennlp.tools.ngram.NGramModel
-import opennlp.tools.tokenize.SimpleTokenizer
-import opennlp.tools.util.StringList
import org.apache.spark.SparkContext
+import org.apache.spark.mllib.feature.{IDF, IDFModel, HashingTF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
@@ -22,10 +20,7 @@
// components.
case class PreparatorParams(
- nMin: Int,
- nMax: Int,
- inverseIdfMin : Double,
- inverseIdfMax : Double
+ nGram : Int
) extends Params
@@ -36,7 +31,7 @@
// Prepare your training data.
def prepare(sc : SparkContext, td: TrainingData): PreparedData = {
- new PreparedData(td, pp.nMin, pp.nMax, pp.inverseIdfMin, pp. inverseIdfMax)
+ new PreparedData(td, pp.nGram)
}
}
@@ -44,123 +39,48 @@
class PreparedData (
val td : TrainingData,
-val nMin: Int,
-val nMax: Int,
-val inverseIdfMin : Double,
-val inverseIdfMax : Double
+val nGram : Int
) extends Serializable {
- // 1. Tokenizer: document => token list.
- // Takes an individual document and converts it to
- // a list of allowable tokens.
- private def tokenize (doc : String): Array[String] = {
- SimpleTokenizer.INSTANCE
- .tokenize(doc.toLowerCase)
- .filter(e => ! td.stopWords.contains(e))
+ // 1. Hashing function: Text -> term frequency vector.
+
+ private val hasher = new HashingTF()
+
+ private def hashTF (text : String) : Vector = {
+ val newList : Array[String] = text.split(" ")
+ .sliding(nGram)
+ .map(_.mkString(" ")) // join with a space so distinct n-grams hash to distinct terms
+ .toArray
+
+ hasher.transform(newList)
}
+ // 2. Term frequency vector -> tf-idf vector.
- // 2. Hasher: Array[tokens] => Map(n-gram -> n-gram document tf).
-
- private def hash (tokenList : Array[String]): HashMap[String, Double] = {
- // Initialize an NGramModel from OpenNLP tools library,
- // and add the list of allowable tokens to the n-gram model.
- val model : NGramModel = new NGramModel()
- model.add(new StringList(tokenList: _*), nMin, nMax)
-
- val map : HashMap[String, Double] = HashMap(
- model.iterator.map(
- x => (x.toString, model.getCount(x).toDouble)
- ).toSeq : _*
- )
-
- val mapSum = map.values.sum
-
- // Divide by the total number of n-grams in the document
- // to obtain n-gram frequency.
- map.map(e => (e._1, e._2 / mapSum))
-
- }
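+ // Per the MLlib docs, IDF.fit weights each hashed term t by
+ // log((m + 1) / (docFreq(t) + 1)), where m is the corpus size.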
+ val idf : IDFModel = new IDF().fit(td.data.map(e => hashTF(e.text)))
- // 3. Bigram universe extractor: RDD[bigram hashmap] => RDD[(n-gram, n-gram idf)]
+ // 3. Document Transformer: text => tf-idf vector.
- private def createUniverse(u: RDD[HashMap[String, Double]]): RDD[(String, Double)] = {
- // Total number of documents (should be 11314).
- val numDocs: Double = td.data.count.toDouble
- u.flatMap(e => e.map(f => (f._1, 1.0)))
- .reduceByKey(_ + _)
- .filter(e => {
- val docFreq = e._2 / numDocs
-
- // Cut out n-grams with inverse i.d.f. greater/less than or equal to min/max
- // cutoff.
- docFreq >= inverseIdfMin && docFreq <= inverseIdfMax
- })
- .map(e => (e._1, log(numDocs / e._2)))
- }
-
-
- // 4. Set private class variables for use in data transformations.
-
- // Create ngram to idf hashmap for every n-gram in universe:
- // Map(n-gram -> n-gram idf)
- private val idf : HashMap[String, Double] = HashMap(
- createUniverse(
- td.data
- .map(e => hash(tokenize(e.text)))
- ).collect: _*
- )
-
-
-
-
- // Get total number n-grams used.
- val numTokens : Int = idf.size
-
-
- // Create n-gram to global index hashmap:
- // Map(n-gram -> global index)
- private val globalIndex : HashMap[String, Int] = HashMap(
- idf.keys.zipWithIndex.toSeq
- : _*)
-
- // 5. Document Transformer: document => sparse tf-idf vector.
- // This takes a single document, tokenizes it, hashes it,
- // and finally returns a sparse vector containing the
- // tf-idf entries of the document n-grams (0 for all n-grams
- // not contained in the document).
-
- def transform(doc: String): Vector = {
+ def transform(text : String): Vector = {
- // Map(n-gram -> document tf)
- val hashedDoc = hash(tokenize(doc)).filter(e => idf.contains(e._1))
- Vectors.sparse(
- numTokens,
- hashedDoc.map {
- case (ngram, tf) => (globalIndex(ngram), idf(ngram) * tf)
- }.toArray
- )
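+ // Hash the text into a term-frequency vector, then apply the fitted idf weights.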
+ idf.transform(hashTF(text))
}
- // 6. Data Transformer: RDD[documents] => RDD[LabeledPoints]
+ // 4. Data Transformer: RDD[documents] => RDD[LabeledPoints]
val transformedData: RDD[(LabeledPoint)] = {
td.data.map(e => LabeledPoint(e.label, transform(e.text)))
}
- // 7. Finally extract category map, associating label to category.
+ // 5. Finally, extract the category map associating each label with its category.
val categoryMap = td.data.map(e => (e.label, e.category)).collectAsMap
- // 8. Finally consider the case where new document has no matching-ngrams.
- val majorityCategory = categoryMap.getOrElse(
- td.data.map(e => e.label).countByValue.maxBy(_._2)._1,
- ""
- )
}