{
"paragraphs": [
{
"text": "// Trigger Spark Startup\nsc",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444073218763_655887574",
"id": "20151005-122658_592219673",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "res8: org.apache.spark.SparkContext \u003d org.apache.spark.SparkContext@6ce70bf3\n"
},
"dateCreated": "Oct 5, 2015 12:26:58 PM",
"dateStarted": "Oct 12, 2015 10:47:23 AM",
"dateFinished": "Oct 12, 2015 10:47:23 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "// Generate data\nimport org.apache.spark.mllib.util.LinearDataGenerator\n\nval numRows \u003d 10000\nval numCols \u003d 1000\nval rawData \u003d LinearDataGenerator.generateLinearRDD(sc, numRows, numCols, 1).toDF()\n\n// Repartition into a more parallelism-friendly number of partitions\nval data \u003d rawData.repartition(64).cache()",
"dateUpdated": "Oct 12, 2015 10:49:12 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444067726904_-135213052",
"id": "20151005-105526_1974722763",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.spark.mllib.util.LinearDataGenerator\nnumRows: Int \u003d 10000\nnumCols: Int \u003d 1000\nrawData: org.apache.spark.sql.DataFrame \u003d [label: double, features: vector]\ndata: org.apache.spark.sql.DataFrame \u003d [label: double, features: vector]\n"
},
"dateCreated": "Oct 5, 2015 10:55:26 AM",
"dateStarted": "Oct 12, 2015 10:49:12 AM",
"dateFinished": "Oct 12, 2015 10:49:13 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
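{
"text": "// Added sketch (not part of the original benchmark run): .cache() above is\n// lazy, so force one materialization and confirm the partition count before\n// any of the timed runs below.\nval numPartitions \u003d data.rdd.partitions.length // expect 64 after the repartition above\nval cachedRows \u003d data.count() // first action on data; materializes the cache\nprintln(s\"Cached ${cachedRows} rows in ${numPartitions} partitions\")",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444672500001_000000001",
"id": "20151012-110500_000000001",
"status": "READY",
"progressUpdateIntervalMs": 500
},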
{
"text": "// Spark ML\nimport org.apache.spark.ml.regression.LinearRegression\n\n// Model Settings\nval maxIters \u003d 100\nval reg \u003d 0\nval elasticNetParam \u003d 0 // L2 reg\n\n// Fit the model\nval lr \u003d new LinearRegression()\n .setMaxIter(maxIters)\n .setRegParam(reg)\n .setElasticNetParam(elasticNetParam)\nval start \u003d System.currentTimeMillis()\nval model \u003d lr.fit(data)\nval trainingTime \u003d (System.currentTimeMillis() - start).toDouble / 1000.0\n\n// Summarize the model over the training set and gather some metrics\nval trainingSummary \u003d model.summary\nval r2 \u003d trainingSummary.r2\nval iters \u003d trainingSummary.totalIterations\nval trainingTimePerIter \u003d trainingTime / iters",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala",
"tableHide": false
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444072136657_1600671053",
"id": "20151005-120856_674927719",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.spark.ml.regression.LinearRegression\nmaxIters: Int \u003d 100\nreg: Int \u003d 0\nelasticNetParam: Int \u003d 0\nlr: org.apache.spark.ml.regression.LinearRegression \u003d linReg_a7f51d676562\nstart: Long \u003d 1444672044647\nmodel: org.apache.spark.ml.regression.LinearRegressionModel \u003d linReg_a7f51d676562\ntrainingTime: Double \u003d 12.985\ntrainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary \u003d org.apache.spark.ml.regression.LinearRegressionTrainingSummary@227ba28b\nr2: Double \u003d 0.9677118209276552\niters: Int \u003d 17\ntrainingTimePerIter: Double \u003d 0.7638235294117647\n"
},
"dateCreated": "Oct 5, 2015 12:08:56 PM",
"dateStarted": "Oct 12, 2015 10:47:24 AM",
"dateFinished": "Oct 12, 2015 10:47:38 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
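{
"text": "// Added sketch: score with the fitted Spark ML model. transform() appends a\n// \"prediction\" column; shown on the training data purely for illustration\n// (a real evaluation would score held-out data instead).\nval predictions \u003d model.transform(data)\npredictions.select(\"label\", \"prediction\").show(5)",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444672500002_000000002",
"id": "20151012-110501_000000002",
"status": "READY",
"progressUpdateIntervalMs": 500
},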
{
"text": "// Print statistics\nprintln(s\"R2: ${r2}\")\nprintln(s\"Iterations: ${iters}\")\nprintln(s\"Training time per iter: ${trainingTimePerIter} seconds\")",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444440910099_1714338510",
"id": "20151009-183510_1200043993",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "R2: 0.9677118209276552\nIterations: 17\nTraining time per iter: 0.7638235294117647 seconds\n"
},
"dateCreated": "Oct 9, 2015 6:35:10 PM",
"dateStarted": "Oct 12, 2015 10:47:24 AM",
"dateFinished": "Oct 12, 2015 10:47:38 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "// SystemML kernels\nval linearReg \u003d\n\"\"\"\n#\n# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM\n#\n# INPUT PARAMETERS:\n# --------------------------------------------------------------------------------------------\n# NAME TYPE DEFAULT MEANING\n# --------------------------------------------------------------------------------------------\n# X String --- Matrix X of feature vectors\n# Y String --- 1-column Matrix Y of response values\n# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:\n# 0 \u003d no intercept, no shifting, no rescaling;\n# 1 \u003d add intercept, but neither shift nor rescale X;\n# 2 \u003d add intercept, shift \u0026 rescale X columns to mean \u003d 0, variance \u003d 1\n# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero\n# for highly dependend/sparse/numerous features\n# tol Double 0.000001 Tolerance (epsilon); conjugate graduent procedure terminates early if\n# L2 norm of the beta-residual is less than tolerance * its initial norm\n# maxi Int 0 Maximum number of conjugate gradient iterations, 0 \u003d no maximum\n# --------------------------------------------------------------------------------------------\n#\n# OUTPUT:\n# B Estimated regression parameters (the betas) to store\n#\n# Note: Matrix of regression parameters (the betas) and its size depend on icpt input value:\n# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:\n# icpt\u003d0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B\n# icpt\u003d1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]\n# icpt\u003d2: ncol(X)+1 x 2 Col.1: betas for X \u0026 intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]\n# Col.2: betas for shifted/rescaled X and intercept\n#\n\nfileX \u003d \"\";\nfileY \u003d \"\";\nfileB \u003d \"\";\n\nintercept_status \u003d ifdef ($icpt, 0); # $icpt\u003d0;\ntolerance \u003d ifdef ($tol, 0.000001); # $tol\u003d0.000001;\nmax_iteration \u003d ifdef ($maxi, 0); # $maxi\u003d0;\nregularization \u003d ifdef ($reg, 0.000001); # $reg\u003d0.000001;\n\nX \u003d read (fileX);\ny \u003d read (fileY);\n\nn \u003d nrow (X);\nm \u003d ncol (X);\nones_n \u003d matrix (1, rows \u003d n, cols \u003d 1);\nzero_cell \u003d matrix (0, rows \u003d 1, cols \u003d 1);\n\n# Introduce the intercept, shift and rescale the columns of X if needed\n\nm_ext \u003d m;\nif (intercept_status \u003d\u003d 1 | intercept_status \u003d\u003d 2) # add the intercept column\n{\n X \u003d append (X, ones_n);\n m_ext \u003d ncol (X);\n}\n\nscale_lambda \u003d matrix (1, rows \u003d m_ext, cols \u003d 1);\nif (intercept_status \u003d\u003d 1 | intercept_status \u003d\u003d 2)\n{\n scale_lambda [m_ext, 1] \u003d 0;\n}\n\nif (intercept_status \u003d\u003d 2) # scale-\u0026-shift X columns to mean 0, variance 1\n{ # Important assumption: X [, m_ext] \u003d ones_n\n avg_X_cols \u003d t(colSums(X)) / n;\n var_X_cols \u003d (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);\n is_unsafe \u003d ppred (var_X_cols, 0.0, \"\u003c\u003d\");\n scale_X \u003d 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);\n scale_X [m_ext, 1] \u003d 1;\n shift_X \u003d - avg_X_cols * scale_X;\n shift_X [m_ext, 1] \u003d 0;\n} else {\n scale_X \u003d matrix (1, rows \u003d m_ext, cols \u003d 1);\n shift_X \u003d matrix (0, rows \u003d m_ext, cols \u003d 1);\n}\n\n# Henceforth, if intercept_status \u003d\u003d 2, we use \"X %*% (SHIFT/SCALE TRANSFORM)\"\n# instead of \"X\". 
However, in order to preserve the sparsity of X,\n# we apply the transform associatively to some other part of the expression\n# in which it occurs. To avoid materializing a large matrix, we rewrite it:\n#\n# ssX_A \u003d (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:\n# ssX_A \u003d diag (scale_X) %*% A;\n# ssX_A [m_ext, ] \u003d ssX_A [m_ext, ] + t(shift_X) %*% A;\n#\n# tssX_A \u003d t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:\n# tssX_A \u003d diag (scale_X) %*% A + shift_X %*% A [m_ext, ];\n\nlambda \u003d scale_lambda * regularization;\nbeta_unscaled \u003d matrix (0, rows \u003d m_ext, cols \u003d 1);\n\nif (max_iteration \u003d\u003d 0) {\n max_iteration \u003d m_ext;\n}\ni \u003d 0;\n\n# BEGIN THE CONJUGATE GRADIENT ALGORITHM\nr \u003d - t(X) %*% y;\n\nif (intercept_status \u003d\u003d 2) {\n r \u003d scale_X * r + shift_X %*% r [m_ext, ];\n}\n\np \u003d - r;\nnorm_r2 \u003d sum (r ^ 2);\nnorm_r2_initial \u003d norm_r2;\nnorm_r2_target \u003d norm_r2_initial * tolerance ^ 2;\n\nwhile (i \u003c max_iteration \u0026 norm_r2 \u003e norm_r2_target)\n{\n if (intercept_status \u003d\u003d 2) {\n ssX_p \u003d scale_X * p;\n ssX_p [m_ext, ] \u003d ssX_p [m_ext, ] + t(shift_X) %*% p;\n } else {\n ssX_p \u003d p;\n }\n\n q \u003d t(X) %*% (X %*% ssX_p);\n\n if (intercept_status \u003d\u003d 2) {\n q \u003d scale_X * q + shift_X %*% q [m_ext, ];\n }\n\n q \u003d q + lambda * p;\n a \u003d norm_r2 / sum (p * q);\n beta_unscaled \u003d beta_unscaled + a * p;\n r \u003d r + a * q;\n old_norm_r2 \u003d norm_r2;\n norm_r2 \u003d sum (r ^ 2);\n p \u003d -r + (norm_r2 / old_norm_r2) * p;\n i \u003d i + 1;\n}\n# END THE CONJUGATE GRADIENT ALGORITHM\n\nif (intercept_status \u003d\u003d 2) {\n beta \u003d scale_X * beta_unscaled;\n beta [m_ext, ] \u003d beta [m_ext, ] + t(shift_X) %*% beta_unscaled;\n} else {\n beta \u003d beta_unscaled;\n}\n\n# Output statistics\navg_tot \u003d sum (y) / n;\nss_tot \u003d sum (y ^ 2);\nss_avg_tot \u003d ss_tot - n * avg_tot ^ 2;\nvar_tot \u003d ss_avg_tot / (n - 1);\ny_residual \u003d y - X %*% beta;\navg_res \u003d sum (y_residual) / n;\nss_res \u003d sum (y_residual ^ 2);\nss_avg_res \u003d ss_res - n * avg_res ^ 2;\n\nR2_temp \u003d 1 - ss_res / ss_avg_tot\nR2 \u003d matrix(R2_temp, rows\u003d1, cols\u003d1)\nwrite(R2, \"\")\n\ntotalIters \u003d matrix(i, rows\u003d1, cols\u003d1)\nwrite(totalIters, \"\")\n\n# Prepare the output matrix\nif (intercept_status \u003d\u003d 2) {\n beta_out \u003d append (beta, beta_unscaled);\n} else {\n beta_out \u003d beta;\n}\n\nwrite (beta_out, fileB);\n\"\"\"",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala",
"tableHide": true,
"editorHide": false
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444067747401_589515959",
"id": "20151005-105547_1888511498",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "linearReg: String \u003d \n\"\n#\n# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM\n#\n# INPUT PARAMETERS:\n# --------------------------------------------------------------------------------------------\n# NAME TYPE DEFAULT MEANING\n# --------------------------------------------------------------------------------------------\n# X String --- Matrix X of feature vectors\n# Y String --- 1-column Matrix Y of response values\n# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:\n# 0 \u003d no intercept, no shifting, no rescaling;\n# 1 \u003d add intercept, but neither shift nor rescale X;\n# 2 \u003d add intercept, shift \u0026 rescale X columns to mean \u003d 0, variance \u003d 1\n# reg Doub..."
},
"dateCreated": "Oct 5, 2015 10:55:47 AM",
"dateStarted": "Oct 12, 2015 10:47:38 AM",
"dateFinished": "Oct 12, 2015 10:47:38 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
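{
"text": "// Added sketch, purely illustrative: the conjugate-gradient recurrence from\n// the DML kernel above, restated in plain Scala on a toy dense system. It\n// solves t(X) %*% X %*% beta \u003d t(X) %*% y starting from beta \u003d 0; the toy\n// data satisfies y \u003d 1 + x exactly, so beta should converge to (1.0, 1.0).\nval toyX \u003d Array(Array(1.0, 0.0), Array(1.0, 1.0), Array(1.0, 2.0))\nval toyY \u003d Array(1.0, 2.0, 3.0)\ndef dot(a: Array[Double], b: Array[Double]): Double \u003d (a zip b).map(t \u003d\u003e t._1 * t._2).sum\ndef applyA(v: Array[Double]): Array[Double] \u003d { // computes t(X) %*% (X %*% v)\n  val Xv \u003d toyX.map(row \u003d\u003e dot(row, v))\n  Array.tabulate(v.length)(j \u003d\u003e dot(toyX.map(_(j)), Xv))\n}\nvar beta \u003d Array(0.0, 0.0)\nvar r \u003d toyX.transpose.map(col \u003d\u003e -dot(col, toyY)) // r \u003d -t(X) %*% y, as in the kernel\nvar p \u003d r.map(x \u003d\u003e -x)\nvar norm_r2 \u003d dot(r, r)\nvar iter \u003d 0\nwhile (iter \u003c 2 \u0026\u0026 norm_r2 \u003e 1e-12) {\n  val q \u003d applyA(p)\n  val a \u003d norm_r2 / dot(p, q)\n  beta \u003d (beta zip p).map(t \u003d\u003e t._1 + a * t._2)\n  r \u003d (r zip q).map(t \u003d\u003e t._1 + a * t._2)\n  val old_norm_r2 \u003d norm_r2\n  norm_r2 \u003d dot(r, r)\n  p \u003d (r zip p).map(t \u003d\u003e -t._1 + (norm_r2 / old_norm_r2) * t._2)\n  iter \u003d iter + 1\n}\nprintln(s\"CG beta \u003d [${beta.mkString(\", \")}] after ${iter} iterations\")",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444672500003_000000003",
"id": "20151012-110502_000000003",
"status": "READY",
"progressUpdateIntervalMs": 500
},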
{
"text": "// SystemML Kernels\nval linearRegFull \u003d\n\"\"\"\n#-------------------------------------------------------------\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n#-------------------------------------------------------------\n\n#\n# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM\n#\n# INPUT PARAMETERS:\n# --------------------------------------------------------------------------------------------\n# NAME TYPE DEFAULT MEANING\n# --------------------------------------------------------------------------------------------\n# X String --- Location (on HDFS) to read the matrix X of feature vectors\n# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values\n# B String --- Location to store estimated regression parameters (the betas)\n# O String \" \" Location to write the printed statistics; by default is standard output\n# Log String \" \" Location to write per-iteration variables for log/debugging purposes\n# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:\n# 0 \u003d no intercept, no shifting, no rescaling;\n# 1 \u003d add intercept, but neither shift nor rescale X;\n# 2 \u003d add intercept, shift \u0026 rescale X columns to mean \u003d 0, variance \u003d 1\n# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero\n# for highly dependend/sparse/numerous features\n# tol Double 0.000001 Tolerance (epsilon); conjugate graduent procedure terminates early if\n# L2 norm of the beta-residual is less than tolerance * its initial norm\n# maxi Int 0 Maximum number of conjugate gradient iterations, 0 \u003d no maximum\n# fmt String \"text\" Matrix output format for B (the betas) only, usually \"text\" or \"csv\"\n# --------------------------------------------------------------------------------------------\n# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:\n# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:\n# icpt\u003d0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B\n# icpt\u003d1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]\n# icpt\u003d2: ncol(X)+1 x 2 Col.1: betas for X \u0026 intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]\n# Col.2: betas for shifted/rescaled X and intercept\n#\n# In addition, some regression statistics are provided in CSV format, one comma-separated\n# name-value pair per each line, as follows:\n#\n# NAME MEANING\n# -------------------------------------------------------------------------------------\n# AVG_TOT_Y Average of the response value Y\n# STDEV_TOT_Y Standard Deviation of the response value Y\n# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias\n# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)\n# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.\n# PLAIN_R2 Plain R^2 of residual with bias included vs. 
total average\n# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average\n# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average\n# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average\n# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant\n# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant\n# -------------------------------------------------------------------------------------\n# * The last two statistics are only printed if there is no intercept (icpt\u003d0)\n#\n# The Log file, when requested, contains the following per-iteration variables in CSV\n# format, each line containing triple (NAME, ITERATION, VALUE) with ITERATION \u003d 0 for\n# initial values:\n#\n# NAME MEANING\n# -------------------------------------------------------------------------------------\n# CG_RESIDUAL_NORM L2-norm of Conj.Grad.residual, which is A %*% beta - t(X) %*% y\n# where A \u003d t(X) %*% X + diag (lambda), or a similar quantity\n# CG_RESIDUAL_RATIO Ratio of current L2-norm of Conj.Grad.residual over the initial\n# -------------------------------------------------------------------------------------\n#\n# HOW TO INVOKE THIS SCRIPT - EXAMPLE:\n# hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X\u003dINPUT_DIR/X Y\u003dINPUT_DIR/Y B\u003dOUTPUT_DIR/B\n# O\u003dOUTPUT_DIR/Out icpt\u003d2 reg\u003d1.0 tol\u003d0.001 maxi\u003d100 fmt\u003dcsv Log\u003dOUTPUT_DIR/log\n\nfileX \u003d \"\";\nfileY \u003d \"\";\nfileB \u003d \"\";\nfileO \u003d ifdef ($O, \" \");\nfileLog \u003d ifdef ($Log, \" \");\nfmtB \u003d ifdef ($fmt, \"text\");\n\nintercept_status \u003d ifdef ($icpt, 0); # $icpt\u003d0;\ntolerance \u003d ifdef ($tol, 0.000001); # $tol\u003d0.000001;\nmax_iteration \u003d ifdef ($maxi, 0); # $maxi\u003d0;\nregularization \u003d ifdef ($reg, 0.000001); # $reg\u003d0.000001;\n\nprint (\"BEGIN LINEAR REGRESSION SCRIPT\");\nprint (\"Reading X and Y...\");\nX \u003d read (fileX);\ny \u003d read (fileY);\n\nn \u003d nrow (X);\nm \u003d ncol (X);\nones_n \u003d matrix (1, rows \u003d n, cols \u003d 1);\nzero_cell \u003d matrix (0, rows \u003d 1, cols \u003d 1);\n\n# Introduce the intercept, shift and rescale the columns of X if needed\n\nm_ext \u003d m;\nif (intercept_status \u003d\u003d 1 | intercept_status \u003d\u003d 2) # add the intercept column\n{\n X \u003d append (X, ones_n);\n m_ext \u003d ncol (X);\n}\n\nscale_lambda \u003d matrix (1, rows \u003d m_ext, cols \u003d 1);\nif (intercept_status \u003d\u003d 1 | intercept_status \u003d\u003d 2)\n{\n scale_lambda [m_ext, 1] \u003d 0;\n}\n\nif (intercept_status \u003d\u003d 2) # scale-\u0026-shift X columns to mean 0, variance 1\n{ # Important assumption: X [, m_ext] \u003d ones_n\n avg_X_cols \u003d t(colSums(X)) / n;\n var_X_cols \u003d (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);\n is_unsafe \u003d ppred (var_X_cols, 0.0, \"\u003c\u003d\");\n scale_X \u003d 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);\n scale_X [m_ext, 1] \u003d 1;\n shift_X \u003d - avg_X_cols * scale_X;\n shift_X [m_ext, 1] \u003d 0;\n} else {\n scale_X \u003d matrix (1, rows \u003d m_ext, cols \u003d 1);\n shift_X \u003d matrix (0, rows \u003d m_ext, cols \u003d 1);\n}\n\n# Henceforth, if intercept_status \u003d\u003d 2, we use \"X %*% (SHIFT/SCALE TRANSFORM)\"\n# instead of \"X\". However, in order to preserve the sparsity of X,\n# we apply the transform associatively to some other part of the expression\n# in which it occurs. 
To avoid materializing a large matrix, we rewrite it:\n#\n# ssX_A \u003d (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:\n# ssX_A \u003d diag (scale_X) %*% A;\n# ssX_A [m_ext, ] \u003d ssX_A [m_ext, ] + t(shift_X) %*% A;\n#\n# tssX_A \u003d t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:\n# tssX_A \u003d diag (scale_X) %*% A + shift_X %*% A [m_ext, ];\n\nlambda \u003d scale_lambda * regularization;\nbeta_unscaled \u003d matrix (0, rows \u003d m_ext, cols \u003d 1);\n\nif (max_iteration \u003d\u003d 0) {\n max_iteration \u003d m_ext;\n}\ni \u003d 0;\n\n# BEGIN THE CONJUGATE GRADIENT ALGORITHM\nprint (\"Running the CG algorithm...\");\n\nr \u003d - t(X) %*% y;\n\nif (intercept_status \u003d\u003d 2) {\n r \u003d scale_X * r + shift_X %*% r [m_ext, ];\n}\n\np \u003d - r;\nnorm_r2 \u003d sum (r ^ 2);\nnorm_r2_initial \u003d norm_r2;\nnorm_r2_target \u003d norm_r2_initial * tolerance ^ 2;\nprint (\"||r|| initial value \u003d \" + sqrt (norm_r2_initial) + \", target value \u003d \" + sqrt (norm_r2_target));\nlog_str \u003d \"CG_RESIDUAL_NORM,0,\" + sqrt (norm_r2_initial);\nlog_str \u003d append (log_str, \"CG_RESIDUAL_RATIO,0,1.0\");\n\nwhile (i \u003c max_iteration \u0026 norm_r2 \u003e norm_r2_target)\n{\n if (intercept_status \u003d\u003d 2) {\n ssX_p \u003d scale_X * p;\n ssX_p [m_ext, ] \u003d ssX_p [m_ext, ] + t(shift_X) %*% p;\n } else {\n ssX_p \u003d p;\n }\n \n q \u003d t(X) %*% (X %*% ssX_p);\n\n if (intercept_status \u003d\u003d 2) {\n q \u003d scale_X * q + shift_X %*% q [m_ext, ];\n }\n\n\tq \u003d q + lambda * p;\n\ta \u003d norm_r2 / sum (p * q);\n\tbeta_unscaled \u003d beta_unscaled + a * p;\n\tr \u003d r + a * q;\n\told_norm_r2 \u003d norm_r2;\n\tnorm_r2 \u003d sum (r ^ 2);\n\tp \u003d -r + (norm_r2 / old_norm_r2) * p;\n\ti \u003d i + 1;\n\tprint (\"Iteration \" + i + \": ||r|| / ||r init|| \u003d \" + sqrt (norm_r2 / norm_r2_initial));\n\tlog_str \u003d append (log_str, \"CG_RESIDUAL_NORM,\" + i + \",\" + sqrt (norm_r2));\n log_str \u003d append (log_str, \"CG_RESIDUAL_RATIO,\" + i + \",\" + sqrt (norm_r2 / norm_r2_initial));\n}\n\nif (i \u003e\u003d max_iteration) {\n print (\"Warning: the maximum number of iterations has been reached.\");\n}\nprint (\"The CG algorithm is done.\");\n# END THE CONJUGATE GRADIENT ALGORITHM\n\nif (intercept_status \u003d\u003d 2) {\n beta \u003d scale_X * beta_unscaled;\n beta [m_ext, ] \u003d beta [m_ext, ] + t(shift_X) %*% beta_unscaled;\n} else {\n beta \u003d beta_unscaled;\n}\n\nprint (\"Computing the statistics...\");\n\navg_tot \u003d sum (y) / n;\nss_tot \u003d sum (y ^ 2);\nss_avg_tot \u003d ss_tot - n * avg_tot ^ 2;\nvar_tot \u003d ss_avg_tot / (n - 1);\ny_residual \u003d y - X %*% beta;\navg_res \u003d sum (y_residual) / n;\nss_res \u003d sum (y_residual ^ 2);\nss_avg_res \u003d ss_res - n * avg_res ^ 2;\n\nplain_R2 \u003d 1 - ss_res / ss_avg_tot;\nif (n \u003e m_ext) {\n dispersion \u003d ss_res / (n - m_ext);\n adjusted_R2 \u003d 1 - dispersion / (ss_avg_tot / (n - 1));\n} else {\n dispersion \u003d 0.0 / 0.0;\n adjusted_R2 \u003d 0.0 / 0.0;\n}\n\nplain_R2_nobias \u003d 1 - ss_avg_res / ss_avg_tot;\ndeg_freedom \u003d n - m - 1;\nif (deg_freedom \u003e 0) {\n var_res \u003d ss_avg_res / deg_freedom;\n adjusted_R2_nobias \u003d 1 - var_res / (ss_avg_tot / (n - 1));\n} else {\n var_res \u003d 0.0 / 0.0;\n adjusted_R2_nobias \u003d 0.0 / 0.0;\n print (\"Warning: zero or negative number of degrees of freedom.\");\n}\n\nplain_R2_vs_0 \u003d 1 - ss_res / ss_tot;\nif (n \u003e m) {\n adjusted_R2_vs_0 \u003d 1 - (ss_res / (n - m)) / 
(ss_tot / n);\n} else {\n adjusted_R2_vs_0 \u003d 0.0 / 0.0;\n}\n\nstr \u003d \"AVG_TOT_Y,\" + avg_tot; # Average of the response value Y\nstr \u003d append (str, \"STDEV_TOT_Y,\" + sqrt (var_tot)); # Standard Deviation of the response value Y\nstr \u003d append (str, \"AVG_RES_Y,\" + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias\nstr \u003d append (str, \"STDEV_RES_Y,\" + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)\nstr \u003d append (str, \"DISPERSION,\" + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.\nstr \u003d append (str, \"PLAIN_R2,\" + plain_R2); # Plain R^2 of residual with bias included vs. total average\nstr \u003d append (str, \"ADJUSTED_R2,\" + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average\nstr \u003d append (str, \"PLAIN_R2_NOBIAS,\" + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average\nstr \u003d append (str, \"ADJUSTED_R2_NOBIAS,\" + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average\nif (intercept_status \u003d\u003d 0) {\n str \u003d append (str, \"PLAIN_R2_VS_0,\" + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant\n str \u003d append (str, \"ADJUSTED_R2_VS_0,\" + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant\n}\n\nif (fileO !\u003d \" \") {\n write (str, fileO);\n} else {\n print (str);\n}\n\n# Prepare the output matrix\nprint (\"Writing the output matrix...\");\n\nif (intercept_status \u003d\u003d 2) {\n beta_out \u003d append (beta, beta_unscaled);\n} else {\n beta_out \u003d beta;\n}\nwrite (beta_out, fileB, format\u003dfmtB);\n\nif (fileLog !\u003d \" \") {\n write (log_str, fileLog);\n}\nprint (\"END LINEAR REGRESSION SCRIPT\");\n\"\"\"",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala",
"editorHide": true,
"tableHide": true
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444365784575_-1898057033",
"id": "20151008-214304_2123048601",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "linearRegFull: String \u003d \n\"\n#-------------------------------------------------------------\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n#-------------------------------------------------------------\n\n#\n# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJU..."
},
"dateCreated": "Oct 8, 2015 9:43:04 PM",
"dateStarted": "Oct 12, 2015 10:47:38 AM",
"dateFinished": "Oct 12, 2015 10:47:38 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "// Helper functions\nimport org.apache.sysml.api.MLOutput\n\ndef getScalar(outputs: MLOutput, symbol: String): Any \u003d\n outputs.getDF(sqlContext, symbol).first()(1)\n \ndef getScalarDouble(outputs: MLOutput, symbol: String): Double \u003d \n getScalar(outputs, symbol).asInstanceOf[Double]\n \ndef getScalarInt(outputs: MLOutput, symbol: String): Int \u003d\n getScalarDouble(outputs, symbol).toInt",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444428901112_1314919146",
"id": "20151009-151501_511642642",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.sysml.api.MLOutput\ngetScalar: (outputs: org.apache.sysml.api.MLOutput, symbol: String)Any\ngetScalarDouble: (outputs: org.apache.sysml.api.MLOutput, symbol: String)Double\ngetScalarInt: (outputs: org.apache.sysml.api.MLOutput, symbol: String)Int\n"
},
"dateCreated": "Oct 9, 2015 3:15:01 PM",
"dateStarted": "Oct 12, 2015 10:47:38 AM",
"dateFinished": "Oct 12, 2015 10:47:39 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "// Imports\nimport org.apache.sysml.api.MLContext\nimport org.apache.sysml.runtime.instructions.spark.utils.{RDDConverterUtilsExt \u003d\u003e RDDConverterUtils}\nimport org.apache.sysml.runtime.matrix.MatrixCharacteristics;\n\n// Create SystemML context\nval ml \u003d new MLContext(sc)\n\n// Convert data to proper format\nval mcX \u003d new MatrixCharacteristics(numRows, numCols, 1000, 1000)\nval mcY \u003d new MatrixCharacteristics(numRows, 1, 1000, 1000)\nval X \u003d RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, data, mcX, false, \"features\")\nval y \u003d RDDConverterUtils.dataFrameToBinaryBlock(sc, data.select(\"label\"), mcY, false)\n// val y \u003d data.select(\"label\")\n\n// Cache\nval X2 \u003d X.cache()\nval y2 \u003d y.cache()\nval cnt1 \u003d X2.count()\nval cnt2 \u003d y2.count() ",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444068596053_-555488546",
"id": "20151005-110956_169115151",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.sysml.api.MLContext\nimport org.apache.sysml.runtime.instructions.spark.utils.{RDDConverterUtilsExt\u003d\u003eRDDConverterUtils}\nimport org.apache.sysml.runtime.matrix.MatrixCharacteristics\nml: org.apache.sysml.api.MLContext \u003d org.apache.sysml.api.MLContext@38d59245\nmcX: org.apache.sysml.runtime.matrix.MatrixCharacteristics \u003d [10000 x 1000, nnz\u003d-1, blocks (1000 x 1000)]\nmcY: org.apache.sysml.runtime.matrix.MatrixCharacteristics \u003d [10000 x 1, nnz\u003d-1, blocks (1000 x 1000)]\nX: org.apache.spark.api.java.JavaPairRDD[org.apache.sysml.runtime.matrix.data.MatrixIndexes,org.apache.sysml.runtime.matrix.data.MatrixBlock] \u003d org.apache.spark.api.java.JavaPairRDD@b5a86e3\ny: org.apache.spark.api.java.JavaPairRDD[org.apache.sysml.runtime.matrix.data.MatrixIndexes,org.apache.sysml.runtime.matrix.data.MatrixBlock] \u003d org.apache.spark.api.java.JavaPairRDD@56377665\nX2: org.apache.spark.api.java.JavaPairRDD[org.apache.sysml.runtime.matrix.data.MatrixIndexes,org.apache.sysml.runtime.matrix.data.MatrixBlock] \u003d org.apache.spark.api.java.JavaPairRDD@650f29d2\ny2: org.apache.spark.api.java.JavaPairRDD[org.apache.sysml.runtime.matrix.data.MatrixIndexes,org.apache.sysml.runtime.matrix.data.MatrixBlock] \u003d org.apache.spark.api.java.JavaPairRDD@334857a8\ncnt1: Long \u003d 10\ncnt2: Long \u003d 10\n"
},
"dateCreated": "Oct 5, 2015 11:09:56 AM",
"dateStarted": "Oct 12, 2015 10:47:39 AM",
"dateFinished": "Oct 12, 2015 10:47:43 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
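{
"text": "// Added note: the binary-block format stores X as 1000 x 1000 blocks, so the\n// 10000 x 1000 matrix becomes ceil(10000/1000) * ceil(1000/1000) \u003d 10 blocks,\n// which is why cnt1 and cnt2 above are 10 rather than 10000.\nval expectedBlocksX \u003d math.ceil(numRows / 1000.0).toLong * math.ceil(numCols / 1000.0).toLong\nprintln(s\"Expected X blocks: ${expectedBlocksX}, actual: ${cnt1}\")",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444672500004_000000004",
"id": "20151012-110503_000000004",
"status": "READY",
"progressUpdateIntervalMs": 500
},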
{
"text": "// Register inputs \u0026 outputs\nml.reset() \nml.registerInput(\"X\", X, numRows, numCols)\nml.registerInput(\"y\", y, numRows, 1)\n// ml.registerInput(\"y\", y)\nml.registerOutput(\"beta_out\")\nml.registerOutput(\"R2\")\nml.registerOutput(\"totalIters\")\n\n// Run the script\nval start \u003d System.currentTimeMillis()\nval outputs \u003d ml.executeScript(linearReg)\nval trainingTime \u003d (System.currentTimeMillis() - start).toDouble / 1000.0\n\n// Get outputs\nval B \u003d outputs.getDF(sqlContext, \"beta_out\").sort(\"ID\").drop(\"ID\")\nval r2 \u003d getScalarDouble(outputs, \"R2\")\nval iters \u003d getScalarInt(outputs, \"totalIters\")\nval trainingTimePerIter \u003d trainingTime / iters",
"dateUpdated": "Oct 12, 2015 10:48:10 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444671982719_328618024",
"id": "20151012-104622_1349641375",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "start: Long \u003d 1444672090620\noutputs: org.apache.sysml.api.MLOutput \u003d org.apache.sysml.api.MLOutput@5d2c22d0\ntrainingTime: Double \u003d 1.176\nB: org.apache.spark.sql.DataFrame \u003d [C1: double]\nr2: Double \u003d 0.9677079547216473\niters: Int \u003d 12\ntrainingTimePerIter: Double \u003d 0.09799999999999999\n"
},
"dateCreated": "Oct 12, 2015 10:46:22 AM",
"dateStarted": "Oct 12, 2015 10:48:10 AM",
"dateFinished": "Oct 12, 2015 10:48:12 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
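{
"text": "// Added sketch: a rough agreement check between the SystemML betas in B and\n// the Spark ML weights fit earlier (Spark 1.5-era API; later releases rename\n// weights to coefficients). Both solve the same least-squares problem, so the\n// values should be close, though not bit-identical, since the solvers and\n// stopping criteria differ.\nval sysmlBetas \u003d B.collect().map(_.getDouble(0))\nval sparkBetas \u003d model.weights.toArray\nval maxAbsDiff \u003d (sysmlBetas zip sparkBetas).map(t \u003d\u003e math.abs(t._1 - t._2)).max\nprintln(s\"max |beta_SystemML - beta_SparkML| \u003d ${maxAbsDiff}\")",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444672500005_000000005",
"id": "20151012-110504_000000005",
"status": "READY",
"progressUpdateIntervalMs": 500
},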
{
"text": "// Print statistics\nprintln(s\"R2: ${r2}\")\nprintln(s\"Iterations: ${iters}\")\nprintln(s\"Training time per iter: ${trainingTimePerIter} seconds\")\nB.describe().show()",
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"tableHide": false,
"editorHide": false,
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444081221119_-327699254",
"id": "20151005-144021_55411373",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "R2: 0.9677079547216473\nIterations: 12\nTraining time per iter: 0.2334166666666667 seconds\n+-------+-------------------+\n|summary| C1|\n+-------+-------------------+\n| count| 1000|\n| mean| 0.0184500840658385|\n| stddev| 0.2764750319432085|\n| min|-0.5426068958986378|\n| max| 0.5225309861616542|\n+-------+-------------------+\n\n"
},
"dateCreated": "Oct 5, 2015 2:40:21 PM",
"dateStarted": "Oct 12, 2015 10:47:43 AM",
"dateFinished": "Oct 12, 2015 10:47:50 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"dateUpdated": "Oct 12, 2015 10:47:23 AM",
"config": {
"colWidth": 12.0,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1444422131984_536286492",
"id": "20151009-132211_1399012872",
"result": {
"code": "SUCCESS",
"type": "TEXT"
},
"dateCreated": "Oct 9, 2015 1:22:11 PM",
"dateStarted": "Oct 12, 2015 10:47:47 AM",
"dateFinished": "Oct 12, 2015 10:47:50 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
}
],
"name": "SystemML - Linear Regression",
"id": "2AZ2AQ12B",
"angularObjects": {
"2AZ9BN82Z": [],
"2AZWVF2GZ": [],
"2AZ78WFZ8": [],
"2B16GQAY6": [],
"2AXNVS2AP": [],
"2AXQ86QRG": [],
"2AZR88MK4": [],
"2AY2CE5DY": [],
"2AZRRUZZU": [],
"2AY16128C": [],
"2AX3MFKQ2": [],
"2AWQWADKQ": [],
"2AYCYP1MW": [],
"2AZYXB1NC": []
},
"config": {},
"info": {}
}