https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#E2006-tfidf
$ wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 $ wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.test.bz2
scala> :paste spark.read.format("libsvm").load("E2006.train.bz2") .select($"label", to_hivemall_features($"features").as("features")) .createOrReplaceTempView("rawTrainTable") val (max, min) = sql("SELECT MAX(label), MIN(label) FROM rawTrainTable").collect.map { case Row(max: Double, min: Double) => (max, min) }.head // `label` must be [0.0, 1.0] sql(s""" CREATE OR REPLACE TEMPORARY VIEW trainTable AS SELECT rescale(label, $min, $max) AS label, features FROM rawTrainTable """) scala> trainDf.printSchema root |-- label: float (nullable = true) |-- features: vector (nullable = true) scala> :paste spark.read.format("libsvm").load("E2006.test.bz2") .select($"label", to_hivemall_features($"features").as("features")) .createOrReplaceTempView("rawTestTable") sql(s""" CREATE OR REPLACE TEMPORARY VIEW testTable AS SELECT rowid() AS rowid, rescale(label, $min, $max) AS target, features FROM rawTestTable """) // Caches data to fix row IDs sql("CACHE TABLE testTable") sql(""" CREATE OR REPLACE TEMPORARY VIEW testTable_exploded AS SELECT rowid, target, extract_feature(ft) AS feature, extract_weight(ft) AS value FROM ( SELECT rowid, target, explode(features) AS ft FROM testTable """) scala> df.printSchema root |-- rowid: string (nullable = true) |-- target: float (nullable = true) |-- feature: string (nullable = true) |-- value: double (nullable = true)
#Training
scala> :paste sql(""" CREATE OR REPLACE TEMPORARY VIEW modelTable AS SELECT feature, AVG(weight) AS weight FROM ( SELECT train_arowe2_regr(add_bias(features), label) AS (feature, weight) FROM trainTable ) GROUP BY feature """)
#Test
scala> :paste sql(""" CREATE OR REPLACE TEMPORARY VIEW predicted AS SELECT rowid, sum(weight * value) AS predicted FROM testTable_exploded t LEFT OUTER JOIN modelTable m ON t.feature = m.feature GROUP BY rowid """)
#Evaluation
scala> :paste sql(s""" SELECT AVG(target), AVG(predicted) FROM predicted p INNER JOIN testTable t ON p.rowid = t.rowid """) +------------------+------------------+ | avg(target)| avg(predicted)| +------------------+------------------+ |0.5489154884487879|0.6030108853227014| +------------------+------------------+