{
"paragraphs": [
{
"text": "// Trigger Spark Startup\nsc",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444073218763_655887574",
"id": "20151005-122658_592219673",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "res8: org.apache.spark.SparkContext \u003d org.apache.spark.SparkContext@6ce70bf3\n"
},
"dateCreated": "Oct 5, 2015 12:26:58 PM",
"dateStarted": "Oct 12, 2015 10:47:23 AM",
"dateFinished": "Oct 12, 2015 10:47:23 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "// Generate data\nimport org.apache.spark.mllib.util.LinearDataGenerator\n\nval numRows \u003d 10000\nval numCols \u003d 1000\nval rawData \u003d LinearDataGenerator.generateLinearRDD(sc, numRows, numCols, 1).toDF()\n\n// Repartition into a more parallelism-friendly number of partitions\nval data \u003d rawData.repartition(64).cache()",
"dateUpdated": "Oct 12, 2015 10:49:12 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444067726904_-135213052",
"id": "20151005-105526_1974722763",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.spark.mllib.util.LinearDataGenerator\nnumRows: Int \u003d 10000\nnumCols: Int \u003d 1000\nrawData: org.apache.spark.sql.DataFrame \u003d [label: double, features: vector]\ndata: org.apache.spark.sql.DataFrame \u003d [label: double, features: vector]\n"
},
"dateCreated": "Oct 5, 2015 10:55:26 AM",
"dateStarted": "Oct 12, 2015 10:49:12 AM",
"dateFinished": "Oct 12, 2015 10:49:13 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
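{
"text": "// Added sketch (not part of the original benchmark run): .cache() above is\n// lazy, so force one materialization and confirm the partition count before\n// any of the timed runs below.\nval numPartitions \u003d data.rdd.partitions.length // expect 64 after the repartition above\nval cachedRows \u003d data.count() // first action on data; materializes the cache\nprintln(s\"Cached ${cachedRows} rows in ${numPartitions} partitions\")",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444672500001_000000001",
"id": "20151012-110500_000000001",
"status": "READY",
"progressUpdateIntervalMs": 500
},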
{
"text": "// Spark ML\nimport org.apache.spark.ml.regression.LinearRegression\n\n// Model Settings\nval maxIters \u003d 100\nval reg \u003d 0\nval elasticNetParam \u003d 0 // L2 reg\n\n// Fit the model\nval lr \u003d new LinearRegression()\n .setMaxIter(maxIters)\n .setRegParam(reg)\n .setElasticNetParam(elasticNetParam)\nval start \u003d System.currentTimeMillis()\nval model \u003d lr.fit(data)\nval trainingTime \u003d (System.currentTimeMillis() - start).toDouble / 1000.0\n\n// Summarize the model over the training set and gather some metrics\nval trainingSummary \u003d model.summary\nval r2 \u003d trainingSummary.r2\nval iters \u003d trainingSummary.totalIterations\nval trainingTimePerIter \u003d trainingTime / iters",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala",
"tableHide": false
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444072136657_1600671053",
"id": "20151005-120856_674927719",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.spark.ml.regression.LinearRegression\nmaxIters: Int \u003d 100\nreg: Int \u003d 0\nelasticNetParam: Int \u003d 0\nlr: org.apache.spark.ml.regression.LinearRegression \u003d linReg_a7f51d676562\nstart: Long \u003d 1444672044647\nmodel: org.apache.spark.ml.regression.LinearRegressionModel \u003d linReg_a7f51d676562\ntrainingTime: Double \u003d 12.985\ntrainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary \u003d org.apache.spark.ml.regression.LinearRegressionTrainingSummary@227ba28b\nr2: Double \u003d 0.9677118209276552\niters: Int \u003d 17\ntrainingTimePerIter: Double \u003d 0.7638235294117647\n"
},
"dateCreated": "Oct 5, 2015 12:08:56 PM",
"dateStarted": "Oct 12, 2015 10:47:24 AM",
"dateFinished": "Oct 12, 2015 10:47:38 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
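{
"text": "// Added sketch: score with the fitted Spark ML model. transform() appends a\n// \"prediction\" column; shown on the training data purely for illustration\n// (a real evaluation would score held-out data instead).\nval predictions \u003d model.transform(data)\npredictions.select(\"label\", \"prediction\").show(5)",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444672500002_000000002",
"id": "20151012-110501_000000002",
"status": "READY",
"progressUpdateIntervalMs": 500
},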
{
"text": "// Print statistics\nprintln(s\"R2: ${r2}\")\nprintln(s\"Iterations: ${iters}\")\nprintln(s\"Training time per iter: ${trainingTimePerIter} seconds\")",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444440910099_1714338510",
"id": "20151009-183510_1200043993",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "R2: 0.9677118209276552\nIterations: 17\nTraining time per iter: 0.7638235294117647 seconds\n"
},
"dateCreated": "Oct 9, 2015 6:35:10 PM",
"dateStarted": "Oct 12, 2015 10:47:24 AM",
"dateFinished": "Oct 12, 2015 10:47:38 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "// SystemML kernels\nval linearReg \u003d\n\"\"\"\n#\n# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM\n#\n# INPUT PARAMETERS:\n# --------------------------------------------------------------------------------------------\n# NAME TYPE DEFAULT MEANING\n# --------------------------------------------------------------------------------------------\n# X String --- Matrix X of feature vectors\n# Y String --- 1-column Matrix Y of response values\n# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:\n# 0 \u003d no intercept, no shifting, no rescaling;\n# 1 \u003d add intercept, but neither shift nor rescale X;\n# 2 \u003d add intercept, shift \u0026 rescale X columns to mean \u003d 0, variance \u003d 1\n# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero\n# for highly dependend/sparse/numerous features\n# tol Double 0.000001 Tolerance (epsilon); conjugate graduent procedure terminates early if\n# L2 norm of the beta-residual is less than tolerance * its initial norm\n# maxi Int 0 Maximum number of conjugate gradient iterations, 0 \u003d no maximum\n# --------------------------------------------------------------------------------------------\n#\n# OUTPUT:\n# B Estimated regression parameters (the betas) to store\n#\n# Note: Matrix of regression parameters (the betas) and its size depend on icpt input value:\n# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:\n# icpt\u003d0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B\n# icpt\u003d1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]\n# icpt\u003d2: ncol(X)+1 x 2 Col.1: betas for X \u0026 intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]\n# Col.2: betas for shifted/rescaled X and intercept\n#\n\nfileX \u003d \"\";\nfileY \u003d \"\";\nfileB \u003d \"\";\n\nintercept_status \u003d ifdef ($icpt, 0); # $icpt\u003d0;\ntolerance \u003d ifdef ($tol, 0.000001); # $tol\u003d0.000001;\nmax_iteration \u003d ifdef ($maxi, 0); # $maxi\u003d0;\nregularization \u003d ifdef ($reg, 0.000001); # $reg\u003d0.000001;\n\nX \u003d read (fileX);\ny \u003d read (fileY);\n\nn \u003d nrow (X);\nm \u003d ncol (X);\nones_n \u003d matrix (1, rows \u003d n, cols \u003d 1);\nzero_cell \u003d matrix (0, rows \u003d 1, cols \u003d 1);\n\n# Introduce the intercept, shift and rescale the columns of X if needed\n\nm_ext \u003d m;\nif (intercept_status \u003d\u003d 1 | intercept_status \u003d\u003d 2) # add the intercept column\n{\n X \u003d append (X, ones_n);\n m_ext \u003d ncol (X);\n}\n\nscale_lambda \u003d matrix (1, rows \u003d m_ext, cols \u003d 1);\nif (intercept_status \u003d\u003d 1 | intercept_status \u003d\u003d 2)\n{\n scale_lambda [m_ext, 1] \u003d 0;\n}\n\nif (intercept_status \u003d\u003d 2) # scale-\u0026-shift X columns to mean 0, variance 1\n{ # Important assumption: X [, m_ext] \u003d ones_n\n avg_X_cols \u003d t(colSums(X)) / n;\n var_X_cols \u003d (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);\n is_unsafe \u003d ppred (var_X_cols, 0.0, \"\u003c\u003d\");\n scale_X \u003d 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);\n scale_X [m_ext, 1] \u003d 1;\n shift_X \u003d - avg_X_cols * scale_X;\n shift_X [m_ext, 1] \u003d 0;\n} else {\n scale_X \u003d matrix (1, rows \u003d m_ext, cols \u003d 1);\n shift_X \u003d matrix (0, rows \u003d m_ext, cols \u003d 1);\n}\n\n# Henceforth, if intercept_status \u003d\u003d 2, we use \"X %*% (SHIFT/SCALE TRANSFORM)\"\n# instead of \"X\". 
However, in order to preserve the sparsity of X,\n# we apply the transform associatively to some other part of the expression\n# in which it occurs. To avoid materializing a large matrix, we rewrite it:\n#\n# ssX_A \u003d (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:\n# ssX_A \u003d diag (scale_X) %*% A;\n# ssX_A [m_ext, ] \u003d ssX_A [m_ext, ] + t(shift_X) %*% A;\n#\n# tssX_A \u003d t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:\n# tssX_A \u003d diag (scale_X) %*% A + shift_X %*% A [m_ext, ];\n\nlambda \u003d scale_lambda * regularization;\nbeta_unscaled \u003d matrix (0, rows \u003d m_ext, cols \u003d 1);\n\nif (max_iteration \u003d\u003d 0) {\n max_iteration \u003d m_ext;\n}\ni \u003d 0;\n\n# BEGIN THE CONJUGATE GRADIENT ALGORITHM\nr \u003d - t(X) %*% y;\n\nif (intercept_status \u003d\u003d 2) {\n r \u003d scale_X * r + shift_X %*% r [m_ext, ];\n}\n\np \u003d - r;\nnorm_r2 \u003d sum (r ^ 2);\nnorm_r2_initial \u003d norm_r2;\nnorm_r2_target \u003d norm_r2_initial * tolerance ^ 2;\n\nwhile (i \u003c max_iteration \u0026 norm_r2 \u003e norm_r2_target)\n{\n if (intercept_status \u003d\u003d 2) {\n ssX_p \u003d scale_X * p;\n ssX_p [m_ext, ] \u003d ssX_p [m_ext, ] + t(shift_X) %*% p;\n } else {\n ssX_p \u003d p;\n }\n\n q \u003d t(X) %*% (X %*% ssX_p);\n\n if (intercept_status \u003d\u003d 2) {\n q \u003d scale_X * q + shift_X %*% q [m_ext, ];\n }\n\n q \u003d q + lambda * p;\n a \u003d norm_r2 / sum (p * q);\n beta_unscaled \u003d beta_unscaled + a * p;\n r \u003d r + a * q;\n old_norm_r2 \u003d norm_r2;\n norm_r2 \u003d sum (r ^ 2);\n p \u003d -r + (norm_r2 / old_norm_r2) * p;\n i \u003d i + 1;\n}\n# END THE CONJUGATE GRADIENT ALGORITHM\n\nif (intercept_status \u003d\u003d 2) {\n beta \u003d scale_X * beta_unscaled;\n beta [m_ext, ] \u003d beta [m_ext, ] + t(shift_X) %*% beta_unscaled;\n} else {\n beta \u003d beta_unscaled;\n}\n\n# Output statistics\navg_tot \u003d sum (y) / n;\nss_tot \u003d sum (y ^ 2);\nss_avg_tot \u003d ss_tot - n * avg_tot ^ 2;\nvar_tot \u003d ss_avg_tot / (n - 1);\ny_residual \u003d y - X %*% beta;\navg_res \u003d sum (y_residual) / n;\nss_res \u003d sum (y_residual ^ 2);\nss_avg_res \u003d ss_res - n * avg_res ^ 2;\n\nR2_temp \u003d 1 - ss_res / ss_avg_tot\nR2 \u003d matrix(R2_temp, rows\u003d1, cols\u003d1)\nwrite(R2, \"\")\n\ntotalIters \u003d matrix(i, rows\u003d1, cols\u003d1)\nwrite(totalIters, \"\")\n\n# Prepare the output matrix\nif (intercept_status \u003d\u003d 2) {\n beta_out \u003d append (beta, beta_unscaled);\n} else {\n beta_out \u003d beta;\n}\n\nwrite (beta_out, fileB);\n\"\"\"",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala",
"tableHide": true,
"editorHide": false
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444067747401_589515959",
"id": "20151005-105547_1888511498",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "linearReg: String \u003d \n\"\n#\n# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM\n#\n# INPUT PARAMETERS:\n# --------------------------------------------------------------------------------------------\n# NAME TYPE DEFAULT MEANING\n# --------------------------------------------------------------------------------------------\n# X String --- Matrix X of feature vectors\n# Y String --- 1-column Matrix Y of response values\n# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:\n# 0 \u003d no intercept, no shifting, no rescaling;\n# 1 \u003d add intercept, but neither shift nor rescale X;\n# 2 \u003d add intercept, shift \u0026 rescale X columns to mean \u003d 0, variance \u003d 1\n# reg Doub..."
},
"dateCreated": "Oct 5, 2015 10:55:47 AM",
"dateStarted": "Oct 12, 2015 10:47:38 AM",
"dateFinished": "Oct 12, 2015 10:47:38 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
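{
"text": "// Added sketch, purely illustrative: the conjugate-gradient recurrence from\n// the DML kernel above, restated in plain Scala on a toy dense system. It\n// solves t(X) %*% X %*% beta \u003d t(X) %*% y starting from beta \u003d 0; the toy\n// data satisfies y \u003d 1 + x exactly, so beta should converge to (1.0, 1.0).\nval toyX \u003d Array(Array(1.0, 0.0), Array(1.0, 1.0), Array(1.0, 2.0))\nval toyY \u003d Array(1.0, 2.0, 3.0)\ndef dot(a: Array[Double], b: Array[Double]): Double \u003d (a zip b).map(t \u003d\u003e t._1 * t._2).sum\ndef applyA(v: Array[Double]): Array[Double] \u003d { // computes t(X) %*% (X %*% v)\n  val Xv \u003d toyX.map(row \u003d\u003e dot(row, v))\n  Array.tabulate(v.length)(j \u003d\u003e dot(toyX.map(_(j)), Xv))\n}\nvar beta \u003d Array(0.0, 0.0)\nvar r \u003d toyX.transpose.map(col \u003d\u003e -dot(col, toyY)) // r \u003d -t(X) %*% y, as in the kernel\nvar p \u003d r.map(x \u003d\u003e -x)\nvar norm_r2 \u003d dot(r, r)\nvar iter \u003d 0\nwhile (iter \u003c 2 \u0026\u0026 norm_r2 \u003e 1e-12) {\n  val q \u003d applyA(p)\n  val a \u003d norm_r2 / dot(p, q)\n  beta \u003d (beta zip p).map(t \u003d\u003e t._1 + a * t._2)\n  r \u003d (r zip q).map(t \u003d\u003e t._1 + a * t._2)\n  val old_norm_r2 \u003d norm_r2\n  norm_r2 \u003d dot(r, r)\n  p \u003d (r zip p).map(t \u003d\u003e -t._1 + (norm_r2 / old_norm_r2) * t._2)\n  iter \u003d iter + 1\n}\nprintln(s\"CG beta \u003d [${beta.mkString(\", \")}] after ${iter} iterations\")",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444672500003_000000003",
"id": "20151012-110502_000000003",
"status": "READY",
"progressUpdateIntervalMs": 500
},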
{
"text": "// SystemML Kernels\nval linearRegFull \u003d\n\"\"\"\n#-------------------------------------------------------------\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n#-------------------------------------------------------------\n\n#\n# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM\n#\n# INPUT PARAMETERS:\n# --------------------------------------------------------------------------------------------\n# NAME TYPE DEFAULT MEANING\n# --------------------------------------------------------------------------------------------\n# X String --- Location (on HDFS) to read the matrix X of feature vectors\n# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values\n# B String --- Location to store estimated regression parameters (the betas)\n# O String \" \" Location to write the printed statistics; by default is standard output\n# Log String \" \" Location to write per-iteration variables for log/debugging purposes\n# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:\n# 0 \u003d no intercept, no shifting, no rescaling;\n# 1 \u003d add intercept, but neither shift nor rescale X;\n# 2 \u003d add intercept, shift \u0026 rescale X columns to mean \u003d 0, variance \u003d 1\n# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero\n# for highly dependend/sparse/numerous features\n# tol Double 0.000001 Tolerance (epsilon); conjugate graduent procedure terminates early if\n# L2 norm of the beta-residual is less than tolerance * its initial norm\n# maxi Int 0 Maximum number of conjugate gradient iterations, 0 \u003d no maximum\n# fmt String \"text\" Matrix output format for B (the betas) only, usually \"text\" or \"csv\"\n# --------------------------------------------------------------------------------------------\n# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:\n# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:\n# icpt\u003d0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B\n# icpt\u003d1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]\n# icpt\u003d2: ncol(X)+1 x 2 Col.1: betas for X \u0026 intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]\n# Col.2: betas for shifted/rescaled X and intercept\n#\n# In addition, some regression statistics are provided in CSV format, one comma-separated\n# name-value pair per each line, as follows:\n#\n# NAME MEANING\n# -------------------------------------------------------------------------------------\n# AVG_TOT_Y Average of the response value Y\n# STDEV_TOT_Y Standard Deviation of the response value Y\n# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias\n# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)\n# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.\n# PLAIN_R2 Plain R^2 of residual with bias included vs. 
total average\n# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average\n# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average\n# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average\n# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant\n# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant\n# -------------------------------------------------------------------------------------\n# * The last two statistics are only printed if there is no intercept (icpt\u003d0)\n#\n# The Log file, when requested, contains the following per-iteration variables in CSV\n# format, each line containing triple (NAME, ITERATION, VALUE) with ITERATION \u003d 0 for\n# initial values:\n#\n# NAME MEANING\n# -------------------------------------------------------------------------------------\n# CG_RESIDUAL_NORM L2-norm of Conj.Grad.residual, which is A %*% beta - t(X) %*% y\n# where A \u003d t(X) %*% X + diag (lambda), or a similar quantity\n# CG_RESIDUAL_RATIO Ratio of current L2-norm of Conj.Grad.residual over the initial\n# -------------------------------------------------------------------------------------\n#\n# HOW TO INVOKE THIS SCRIPT - EXAMPLE:\n# hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X\u003dINPUT_DIR/X Y\u003dINPUT_DIR/Y B\u003dOUTPUT_DIR/B\n# O\u003dOUTPUT_DIR/Out icpt\u003d2 reg\u003d1.0 tol\u003d0.001 maxi\u003d100 fmt\u003dcsv Log\u003dOUTPUT_DIR/log\n\nfileX \u003d \"\";\nfileY \u003d \"\";\nfileB \u003d \"\";\nfileO \u003d ifdef ($O, \" \");\nfileLog \u003d ifdef ($Log, \" \");\nfmtB \u003d ifdef ($fmt, \"text\");\n\nintercept_status \u003d ifdef ($icpt, 0); # $icpt\u003d0;\ntolerance \u003d ifdef ($tol, 0.000001); # $tol\u003d0.000001;\nmax_iteration \u003d ifdef ($maxi, 0); # $maxi\u003d0;\nregularization \u003d ifdef ($reg, 0.000001); # $reg\u003d0.000001;\n\nprint (\"BEGIN LINEAR REGRESSION SCRIPT\");\nprint (\"Reading X and Y...\");\nX \u003d read (fileX);\ny \u003d read (fileY);\n\nn \u003d nrow (X);\nm \u003d ncol (X);\nones_n \u003d matrix (1, rows \u003d n, cols \u003d 1);\nzero_cell \u003d matrix (0, rows \u003d 1, cols \u003d 1);\n\n# Introduce the intercept, shift and rescale the columns of X if needed\n\nm_ext \u003d m;\nif (intercept_status \u003d\u003d 1 | intercept_status \u003d\u003d 2) # add the intercept column\n{\n X \u003d append (X, ones_n);\n m_ext \u003d ncol (X);\n}\n\nscale_lambda \u003d matrix (1, rows \u003d m_ext, cols \u003d 1);\nif (intercept_status \u003d\u003d 1 | intercept_status \u003d\u003d 2)\n{\n scale_lambda [m_ext, 1] \u003d 0;\n}\n\nif (intercept_status \u003d\u003d 2) # scale-\u0026-shift X columns to mean 0, variance 1\n{ # Important assumption: X [, m_ext] \u003d ones_n\n avg_X_cols \u003d t(colSums(X)) / n;\n var_X_cols \u003d (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);\n is_unsafe \u003d ppred (var_X_cols, 0.0, \"\u003c\u003d\");\n scale_X \u003d 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);\n scale_X [m_ext, 1] \u003d 1;\n shift_X \u003d - avg_X_cols * scale_X;\n shift_X [m_ext, 1] \u003d 0;\n} else {\n scale_X \u003d matrix (1, rows \u003d m_ext, cols \u003d 1);\n shift_X \u003d matrix (0, rows \u003d m_ext, cols \u003d 1);\n}\n\n# Henceforth, if intercept_status \u003d\u003d 2, we use \"X %*% (SHIFT/SCALE TRANSFORM)\"\n# instead of \"X\". However, in order to preserve the sparsity of X,\n# we apply the transform associatively to some other part of the expression\n# in which it occurs. 
To avoid materializing a large matrix, we rewrite it:\n#\n# ssX_A \u003d (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:\n# ssX_A \u003d diag (scale_X) %*% A;\n# ssX_A [m_ext, ] \u003d ssX_A [m_ext, ] + t(shift_X) %*% A;\n#\n# tssX_A \u003d t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:\n# tssX_A \u003d diag (scale_X) %*% A + shift_X %*% A [m_ext, ];\n\nlambda \u003d scale_lambda * regularization;\nbeta_unscaled \u003d matrix (0, rows \u003d m_ext, cols \u003d 1);\n\nif (max_iteration \u003d\u003d 0) {\n max_iteration \u003d m_ext;\n}\ni \u003d 0;\n\n# BEGIN THE CONJUGATE GRADIENT ALGORITHM\nprint (\"Running the CG algorithm...\");\n\nr \u003d - t(X) %*% y;\n\nif (intercept_status \u003d\u003d 2) {\n r \u003d scale_X * r + shift_X %*% r [m_ext, ];\n}\n\np \u003d - r;\nnorm_r2 \u003d sum (r ^ 2);\nnorm_r2_initial \u003d norm_r2;\nnorm_r2_target \u003d norm_r2_initial * tolerance ^ 2;\nprint (\"||r|| initial value \u003d \" + sqrt (norm_r2_initial) + \", target value \u003d \" + sqrt (norm_r2_target));\nlog_str \u003d \"CG_RESIDUAL_NORM,0,\" + sqrt (norm_r2_initial);\nlog_str \u003d append (log_str, \"CG_RESIDUAL_RATIO,0,1.0\");\n\nwhile (i \u003c max_iteration \u0026 norm_r2 \u003e norm_r2_target)\n{\n if (intercept_status \u003d\u003d 2) {\n ssX_p \u003d scale_X * p;\n ssX_p [m_ext, ] \u003d ssX_p [m_ext, ] + t(shift_X) %*% p;\n } else {\n ssX_p \u003d p;\n }\n \n q \u003d t(X) %*% (X %*% ssX_p);\n\n if (intercept_status \u003d\u003d 2) {\n q \u003d scale_X * q + shift_X %*% q [m_ext, ];\n }\n\n\tq \u003d q + lambda * p;\n\ta \u003d norm_r2 / sum (p * q);\n\tbeta_unscaled \u003d beta_unscaled + a * p;\n\tr \u003d r + a * q;\n\told_norm_r2 \u003d norm_r2;\n\tnorm_r2 \u003d sum (r ^ 2);\n\tp \u003d -r + (norm_r2 / old_norm_r2) * p;\n\ti \u003d i + 1;\n\tprint (\"Iteration \" + i + \": ||r|| / ||r init|| \u003d \" + sqrt (norm_r2 / norm_r2_initial));\n\tlog_str \u003d append (log_str, \"CG_RESIDUAL_NORM,\" + i + \",\" + sqrt (norm_r2));\n log_str \u003d append (log_str, \"CG_RESIDUAL_RATIO,\" + i + \",\" + sqrt (norm_r2 / norm_r2_initial));\n}\n\nif (i \u003e\u003d max_iteration) {\n print (\"Warning: the maximum number of iterations has been reached.\");\n}\nprint (\"The CG algorithm is done.\");\n# END THE CONJUGATE GRADIENT ALGORITHM\n\nif (intercept_status \u003d\u003d 2) {\n beta \u003d scale_X * beta_unscaled;\n beta [m_ext, ] \u003d beta [m_ext, ] + t(shift_X) %*% beta_unscaled;\n} else {\n beta \u003d beta_unscaled;\n}\n\nprint (\"Computing the statistics...\");\n\navg_tot \u003d sum (y) / n;\nss_tot \u003d sum (y ^ 2);\nss_avg_tot \u003d ss_tot - n * avg_tot ^ 2;\nvar_tot \u003d ss_avg_tot / (n - 1);\ny_residual \u003d y - X %*% beta;\navg_res \u003d sum (y_residual) / n;\nss_res \u003d sum (y_residual ^ 2);\nss_avg_res \u003d ss_res - n * avg_res ^ 2;\n\nplain_R2 \u003d 1 - ss_res / ss_avg_tot;\nif (n \u003e m_ext) {\n dispersion \u003d ss_res / (n - m_ext);\n adjusted_R2 \u003d 1 - dispersion / (ss_avg_tot / (n - 1));\n} else {\n dispersion \u003d 0.0 / 0.0;\n adjusted_R2 \u003d 0.0 / 0.0;\n}\n\nplain_R2_nobias \u003d 1 - ss_avg_res / ss_avg_tot;\ndeg_freedom \u003d n - m - 1;\nif (deg_freedom \u003e 0) {\n var_res \u003d ss_avg_res / deg_freedom;\n adjusted_R2_nobias \u003d 1 - var_res / (ss_avg_tot / (n - 1));\n} else {\n var_res \u003d 0.0 / 0.0;\n adjusted_R2_nobias \u003d 0.0 / 0.0;\n print (\"Warning: zero or negative number of degrees of freedom.\");\n}\n\nplain_R2_vs_0 \u003d 1 - ss_res / ss_tot;\nif (n \u003e m) {\n adjusted_R2_vs_0 \u003d 1 - (ss_res / (n - m)) / 
(ss_tot / n);\n} else {\n adjusted_R2_vs_0 \u003d 0.0 / 0.0;\n}\n\nstr \u003d \"AVG_TOT_Y,\" + avg_tot; # Average of the response value Y\nstr \u003d append (str, \"STDEV_TOT_Y,\" + sqrt (var_tot)); # Standard Deviation of the response value Y\nstr \u003d append (str, \"AVG_RES_Y,\" + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias\nstr \u003d append (str, \"STDEV_RES_Y,\" + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)\nstr \u003d append (str, \"DISPERSION,\" + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.\nstr \u003d append (str, \"PLAIN_R2,\" + plain_R2); # Plain R^2 of residual with bias included vs. total average\nstr \u003d append (str, \"ADJUSTED_R2,\" + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average\nstr \u003d append (str, \"PLAIN_R2_NOBIAS,\" + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average\nstr \u003d append (str, \"ADJUSTED_R2_NOBIAS,\" + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average\nif (intercept_status \u003d\u003d 0) {\n str \u003d append (str, \"PLAIN_R2_VS_0,\" + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant\n str \u003d append (str, \"ADJUSTED_R2_VS_0,\" + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant\n}\n\nif (fileO !\u003d \" \") {\n write (str, fileO);\n} else {\n print (str);\n}\n\n# Prepare the output matrix\nprint (\"Writing the output matrix...\");\n\nif (intercept_status \u003d\u003d 2) {\n beta_out \u003d append (beta, beta_unscaled);\n} else {\n beta_out \u003d beta;\n}\nwrite (beta_out, fileB, format\u003dfmtB);\n\nif (fileLog !\u003d \" \") {\n write (log_str, fileLog);\n}\nprint (\"END LINEAR REGRESSION SCRIPT\");\n\"\"\"",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala",
"editorHide": true,
"tableHide": true
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444365784575_-1898057033",
"id": "20151008-214304_2123048601",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "linearRegFull: String \u003d \n\"\n#-------------------------------------------------------------\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n#-------------------------------------------------------------\n\n#\n# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJU..."
},
"dateCreated": "Oct 8, 2015 9:43:04 PM",
"dateStarted": "Oct 12, 2015 10:47:38 AM",
"dateFinished": "Oct 12, 2015 10:47:38 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "// Helper functions\nimport org.apache.sysml.api.MLOutput\n\ndef getScalar(outputs: MLOutput, symbol: String): Any \u003d\n outputs.getDF(sqlContext, symbol).first()(1)\n \ndef getScalarDouble(outputs: MLOutput, symbol: String): Double \u003d \n getScalar(outputs, symbol).asInstanceOf[Double]\n \ndef getScalarInt(outputs: MLOutput, symbol: String): Int \u003d\n getScalarDouble(outputs, symbol).toInt",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444428901112_1314919146",
"id": "20151009-151501_511642642",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.sysml.api.MLOutput\ngetScalar: (outputs: org.apache.sysml.api.MLOutput, symbol: String)Any\ngetScalarDouble: (outputs: org.apache.sysml.api.MLOutput, symbol: String)Double\ngetScalarInt: (outputs: org.apache.sysml.api.MLOutput, symbol: String)Int\n"
},
"dateCreated": "Oct 9, 2015 3:15:01 PM",
"dateStarted": "Oct 12, 2015 10:47:38 AM",
"dateFinished": "Oct 12, 2015 10:47:39 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "// Imports\nimport org.apache.sysml.api.MLContext\nimport org.apache.sysml.runtime.instructions.spark.utils.{RDDConverterUtilsExt \u003d\u003e RDDConverterUtils}\nimport org.apache.sysml.runtime.matrix.MatrixCharacteristics;\n\n// Create SystemML context\nval ml \u003d new MLContext(sc)\n\n// Convert data to proper format\nval mcX \u003d new MatrixCharacteristics(numRows, numCols, 1000, 1000)\nval mcY \u003d new MatrixCharacteristics(numRows, 1, 1000, 1000)\nval X \u003d RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, data, mcX, false, \"features\")\nval y \u003d RDDConverterUtils.dataFrameToBinaryBlock(sc, data.select(\"label\"), mcY, false)\n// val y \u003d data.select(\"label\")\n\n// Cache\nval X2 \u003d X.cache()\nval y2 \u003d y.cache()\nval cnt1 \u003d X2.count()\nval cnt2 \u003d y2.count() ",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444068596053_-555488546",
"id": "20151005-110956_169115151",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.sysml.api.MLContext\nimport org.apache.sysml.runtime.instructions.spark.utils.{RDDConverterUtilsExt\u003d\u003eRDDConverterUtils}\nimport org.apache.sysml.runtime.matrix.MatrixCharacteristics\nml: org.apache.sysml.api.MLContext \u003d org.apache.sysml.api.MLContext@38d59245\nmcX: org.apache.sysml.runtime.matrix.MatrixCharacteristics \u003d [10000 x 1000, nnz\u003d-1, blocks (1000 x 1000)]\nmcY: org.apache.sysml.runtime.matrix.MatrixCharacteristics \u003d [10000 x 1, nnz\u003d-1, blocks (1000 x 1000)]\nX: org.apache.spark.api.java.JavaPairRDD[org.apache.sysml.runtime.matrix.data.MatrixIndexes,org.apache.sysml.runtime.matrix.data.MatrixBlock] \u003d org.apache.spark.api.java.JavaPairRDD@b5a86e3\ny: org.apache.spark.api.java.JavaPairRDD[org.apache.sysml.runtime.matrix.data.MatrixIndexes,org.apache.sysml.runtime.matrix.data.MatrixBlock] \u003d org.apache.spark.api.java.JavaPairRDD@56377665\nX2: org.apache.spark.api.java.JavaPairRDD[org.apache.sysml.runtime.matrix.data.MatrixIndexes,org.apache.sysml.runtime.matrix.data.MatrixBlock] \u003d org.apache.spark.api.java.JavaPairRDD@650f29d2\ny2: org.apache.spark.api.java.JavaPairRDD[org.apache.sysml.runtime.matrix.data.MatrixIndexes,org.apache.sysml.runtime.matrix.data.MatrixBlock] \u003d org.apache.spark.api.java.JavaPairRDD@334857a8\ncnt1: Long \u003d 10\ncnt2: Long \u003d 10\n"
},
"dateCreated": "Oct 5, 2015 11:09:56 AM",
"dateStarted": "Oct 12, 2015 10:47:39 AM",
"dateFinished": "Oct 12, 2015 10:47:43 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
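{
"text": "// Added note: the binary-block format stores X as 1000 x 1000 blocks, so the\n// 10000 x 1000 matrix becomes ceil(10000/1000) * ceil(1000/1000) \u003d 10 blocks,\n// which is why cnt1 and cnt2 above are 10 rather than 10000.\nval expectedBlocksX \u003d math.ceil(numRows / 1000.0).toLong * math.ceil(numCols / 1000.0).toLong\nprintln(s\"Expected X blocks: ${expectedBlocksX}, actual: ${cnt1}\")",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444672500004_000000004",
"id": "20151012-110503_000000004",
"status": "READY",
"progressUpdateIntervalMs": 500
},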
{
"text": "// Register inputs \u0026 outputs\nml.reset() \nml.registerInput(\"X\", X, numRows, numCols)\nml.registerInput(\"y\", y, numRows, 1)\n// ml.registerInput(\"y\", y)\nml.registerOutput(\"beta_out\")\nml.registerOutput(\"R2\")\nml.registerOutput(\"totalIters\")\n\n// Run the script\nval start \u003d System.currentTimeMillis()\nval outputs \u003d ml.executeScript(linearReg)\nval trainingTime \u003d (System.currentTimeMillis() - start).toDouble / 1000.0\n\n// Get outputs\nval B \u003d outputs.getDF(sqlContext, \"beta_out\").sort(\"ID\").drop(\"ID\")\nval r2 \u003d getScalarDouble(outputs, \"R2\")\nval iters \u003d getScalarInt(outputs, \"totalIters\")\nval trainingTimePerIter \u003d trainingTime / iters",
"dateUpdated": "Oct 12, 2015 10:48:10 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444671982719_328618024",
"id": "20151012-104622_1349641375",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "start: Long \u003d 1444672090620\noutputs: org.apache.sysml.api.MLOutput \u003d org.apache.sysml.api.MLOutput@5d2c22d0\ntrainingTime: Double \u003d 1.176\nB: org.apache.spark.sql.DataFrame \u003d [C1: double]\nr2: Double \u003d 0.9677079547216473\niters: Int \u003d 12\ntrainingTimePerIter: Double \u003d 0.09799999999999999\n"
},
"dateCreated": "Oct 12, 2015 10:46:22 AM",
"dateStarted": "Oct 12, 2015 10:48:10 AM",
"dateFinished": "Oct 12, 2015 10:48:12 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
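{
"text": "// Added sketch: a rough agreement check between the SystemML betas in B and\n// the Spark ML weights fit earlier (Spark 1.5-era API; later releases rename\n// weights to coefficients). Both solve the same least-squares problem, so the\n// values should be close, though not bit-identical, since the solvers and\n// stopping criteria differ.\nval sysmlBetas \u003d B.collect().map(_.getDouble(0))\nval sparkBetas \u003d model.weights.toArray\nval maxAbsDiff \u003d (sysmlBetas zip sparkBetas).map(t \u003d\u003e math.abs(t._1 - t._2)).max\nprintln(s\"max |beta_SystemML - beta_SparkML| \u003d ${maxAbsDiff}\")",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444672500005_000000005",
"id": "20151012-110504_000000005",
"status": "READY",
"progressUpdateIntervalMs": 500
},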
{
"text": "// Print statistics\nprintln(s\"R2: ${r2}\")\nprintln(s\"Iterations: ${iters}\")\nprintln(s\"Training time per iter: ${trainingTimePerIter} seconds\")\nB.describe().show()",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"tableHide": false,
"editorHide": false,
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444081221119_-327699254",
"id": "20151005-144021_55411373",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "R2: 0.9677079547216473\nIterations: 12\nTraining time per iter: 0.2334166666666667 seconds\n+-------+-------------------+\n|summary| C1|\n+-------+-------------------+\n| count| 1000|\n| mean| 0.0184500840658385|\n| stddev| 0.2764750319432085|\n| min|-0.5426068958986378|\n| max| 0.5225309861616542|\n+-------+-------------------+\n\n"
},
"dateCreated": "Oct 5, 2015 2:40:21 PM",
"dateStarted": "Oct 12, 2015 10:47:43 AM",
"dateFinished": "Oct 12, 2015 10:47:50 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444422131984_536286492",
"id": "20151009-132211_1399012872",
"result": {
"code": "SUCCESS",
"type": "TEXT"
},
"dateCreated": "Oct 9, 2015 1:22:11 PM",
"dateStarted": "Oct 12, 2015 10:47:47 AM",
"dateFinished": "Oct 12, 2015 10:47:50 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
}
],
"name": "SystemML - Linear Regression",
"id": "2AZ2AQ12B",
"angularObjects": {
"2AZ9BN82Z": [],
"2AZWVF2GZ": [],
"2AZ78WFZ8": [],
"2B16GQAY6": [],
"2AXNVS2AP": [],
"2AXQ86QRG": [],
"2AZR88MK4": [],
"2AY2CE5DY": [],
"2AZRRUZZU": [],
"2AY16128C": [],
"2AX3MFKQ2": [],
"2AWQWADKQ": [],
"2AYCYP1MW": [],
"2AZYXB1NC": []
},
"config": {},
"info": {}
}