blob: a29dbb9d596775a35173547d529a57f2f4018167 [file] [log] [blame]
{
"paragraphs": [
{
"text": "%md\n\n# Introduction\n\nThis note is to demonstrate how to do machine learning in flink. Here we use [Alink](https://github.com/alibaba/Alink/). \nWe use logics regression to do classification task. We use the same data as other tutorials [bank](https://archive.ics.uci.edu/ml/datasets/bank+marketing).\n",
"user": "anonymous",
"dateUpdated": "2021-01-25 14:30:35.800",
"progress": 0,
"config": {
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "markdown",
"editOnDblClick": true,
"completionKey": "TAB",
"completionSupport": false
},
"editorMode": "ace/mode/markdown",
"editorHide": true,
"tableHide": false
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"runtimeInfos": {},
"progressUpdateIntervalMs": 500,
"jobName": "paragraph_1588147625869_1181490991",
"id": "paragraph_1588147625869_1181490991",
"dateCreated": "2020-04-29 16:07:05.869",
"dateStarted": "2021-01-25 14:30:35.800",
"dateFinished": "2021-01-25 14:30:35.807",
"status": "FINISHED"
},
{
"text": "%flink.pyflink\n\nimport pyflink\nfrom pyflink.dataset import ExecutionEnvironment\nfrom pyflink.datastream import StreamExecutionEnvironment\nfrom pyalink.alink.env import useCustomEnv\nmlenv \u003d useCustomEnv(gateway,\n b_env,bt_env_2, s_env, st_env_2)\nfrom pyalink.alink import *\n\nt \u003d bt_env_2.from_elements([(1, 2), (2, 5), (3, 1)], [\u0027a\u0027, \u0027b\u0027])\nsource \u003d TableSourceBatchOp(t)\nsource.print()",
"user": "anonymous",
"dateUpdated": "2020-04-27 13:48:06.523",
"progress": 0,
"config": {
"editorMode": "ace/mode/python",
"editorHide": false,
"colWidth": 6.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "python",
"editOnDblClick": false,
"completionSupport": true,
"completionKey": "TAB"
}
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"runtimeInfos": {},
"progressUpdateIntervalMs": 500,
"jobName": "paragraph_1583768872979_-705704388",
"id": "20200309-234752_541772059",
"dateCreated": "2020-03-09 23:47:52.979",
"dateStarted": "2020-04-27 13:48:06.528",
"dateFinished": "2020-04-27 13:48:29.623",
"status": "FINISHED"
},
{
"text": "%flink.pyflink\n\n\ntest_data_path \u003d \"/tmp/bank.csv\"\nfull_data_path \u003d \"/tmp/bank-full.csv\"\nschema_str \u003d \"age int, job string, marital string, education string, default string, balance string, housing string, loan string, contact string, day string, month string, duration int, campaign int, pdays int, previous int, poutcome string, y string\"\n\ntest_data \u003d CsvSourceBatchOp() \\\n .setFilePath(test_data_path) \\\n .setSchemaStr(schema_str) \\\n .setIgnoreFirstLine(True) \\\n .setFieldDelimiter(\";\")\n \nfull_data \u003d CsvSourceBatchOp() \\\n .setFilePath(full_data_path) \\\n .setSchemaStr(schema_str) \\\n .setIgnoreFirstLine(True) \\\n .setFieldDelimiter(\";\")\n \ntrain_set \u003d UnionAllBatchOp().linkFrom(\n full_data.filter(\"y\u003d\u0027yes\u0027\"),\n full_data.filter(\"y\u003d\u0027no\u0027\").sample(0.25)\n \n)",
"user": "anonymous",
"dateUpdated": "2020-04-29 16:10:03.433",
"progress": 0,
"config": {
"editorMode": "ace/mode/python",
"editorHide": false,
"colWidth": 6.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "python",
"editOnDblClick": false,
"completionSupport": true,
"completionKey": "TAB"
}
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"runtimeInfos": {},
"progressUpdateIntervalMs": 500,
"jobName": "paragraph_1583768872982_-1596320538",
"id": "20200309-234752_30368548",
"dateCreated": "2020-03-09 23:47:52.982",
"dateStarted": "2020-04-27 13:48:33.820",
"dateFinished": "2020-04-27 13:48:34.969",
"status": "FINISHED"
},
{
"text": "%flink.pyflink\n\n\ncategoricalColNames \u003d [\"job\", \"marital\", \"education\", \"default\",\n \"balance\", \"housing\", \"loan\", \"contact\", \"poutcome\" ]\nnumerialColNames \u003d [\"age\", \"duration\", \"campaign\", \"pdays\",\n \"previous\"]\nlabelColName \u003d \"y\"\n\nonehot \u003d OneHotEncoder().setSelectedCols(categoricalColNames) \\\n .setOutputCols([\"output\"])\nassembler \u003d VectorAssembler().setSelectedCols([\"output\"] + numerialColNames) \\\n .setOutputCol(\"vec\")\npipeline \u003d Pipeline().add(onehot).add(assembler)",
"user": "anonymous",
"dateUpdated": "2020-04-27 13:48:37.221",
"progress": 0,
"config": {
"editorMode": "ace/mode/python",
"editorHide": false,
"colWidth": 6.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "python",
"editOnDblClick": false,
"completionSupport": true,
"completionKey": "TAB"
}
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"runtimeInfos": {},
"progressUpdateIntervalMs": 500,
"jobName": "paragraph_1583768872983_-260771927",
"id": "20200309-234752_1624274051",
"dateCreated": "2020-03-09 23:47:52.983",
"dateStarted": "2020-04-27 13:48:37.242",
"dateFinished": "2020-04-27 13:48:37.911",
"status": "FINISHED"
},
{
"text": "%flink.pyflink\n\n\nlogistic \u003d LogisticRegression().setVectorCol(\"vec\").setLabelCol(labelColName) \\\n .setPredictionCol(\"pred\").setPredictionDetailCol(\"detail\")\nmodel \u003d pipeline.add(logistic).fit(train_set)\n\npredict \u003d model.transform(test_data)\n\nmetrics \u003d EvalBinaryClassBatchOp().setLabelCol(labelColName) \\\n .setPredictionDetailCol(\"detail\").linkFrom(predict).collectMetrics()\n \n \n ",
"user": "anonymous",
"dateUpdated": "2020-04-27 13:48:39.232",
"progress": 0,
"config": {
"editorMode": "ace/mode/python",
"editorHide": false,
"colWidth": 6.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "python",
"editOnDblClick": false,
"completionSupport": true,
"completionKey": "TAB"
}
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"runtimeInfos": {},
"progressUpdateIntervalMs": 500,
"jobName": "paragraph_1583768872983_-2044786839",
"id": "20200309-234752_452787710",
"dateCreated": "2020-03-09 23:47:52.983",
"dateStarted": "2020-04-27 13:48:39.249",
"dateFinished": "2020-04-27 13:48:50.221",
"status": "FINISHED"
},
{
"text": "%flink.pyflink\n\nprint(\"AUC:\", metrics.getAuc())\nprint(\"KS:\", metrics.getKs())\nprint(\"PRC:\", metrics.getPrc())\nprint(\"Precision:\", metrics.getPrecision())\nprint(\"Recall:\", metrics.getRecall())\nprint(\"F1:\", metrics.getF1())\nprint(\"ConfusionMatrix:\", metrics.getConfusionMatrix())\nprint(\"LabelArray:\", metrics.getLabelArray())\nprint(\"LogLoss:\", metrics.getLogLoss())\nprint(\"TotalSamples:\", metrics.getTotalSamples())\nprint(\"ActualLabelProportion:\", metrics.getActualLabelProportion())\nprint(\"ActualLabelFrequency:\", metrics.getActualLabelFrequency())\nprint(\"Accuracy:\", metrics.getAccuracy())\nprint(\"Kappa:\", metrics.getKappa())",
"user": "anonymous",
"dateUpdated": "2020-04-27 13:48:51.441",
"progress": 0,
"config": {
"editorMode": "ace/mode/python",
"editorHide": false,
"colWidth": 6.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "python",
"editOnDblClick": false,
"completionSupport": true,
"completionKey": "TAB"
}
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"runtimeInfos": {},
"progressUpdateIntervalMs": 500,
"jobName": "paragraph_1583768872984_389357281",
"id": "20200309-234752_912989550",
"dateCreated": "2020-03-09 23:47:52.984",
"dateStarted": "2020-04-27 13:48:51.451",
"dateFinished": "2020-04-27 13:48:52.073",
"status": "FINISHED"
},
{
"text": "%flink.pyflink\n\ndf \u003d predict.filter(\"y\u003c\u003epred\").firstN(300).collectToDataframe()\n\nz.show(df)",
"user": "anonymous",
"dateUpdated": "2020-04-27 13:48:54.873",
"progress": 0,
"config": {
"editorMode": "ace/mode/python",
"editorHide": false,
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {
"0": {
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"setting": {
"table": {
"tableGridState": {},
"tableColumnTypeState": {
"names": {
"age": "string",
"job": "string",
"marital": "string",
"education": "string",
"default": "string",
"balance": "string",
"housing": "string",
"loan": "string",
"contact": "string",
"day": "string",
"month": "string",
"duration": "string",
"campaign": "string",
"pdays": "string",
"previous": "string",
"poutcome": "string",
"y": "string",
"output": "string",
"vec": "string",
"pred": "string",
"detail": "string"
},
"updated": false
},
"tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]",
"tableOptionValue": {
"useFilter": false,
"showPagination": false,
"showAggregationFooter": false
},
"updated": false,
"initialized": false
}
},
"commonSetting": {}
}
}
},
"editorSetting": {
"language": "python",
"editOnDblClick": false,
"completionSupport": true,
"completionKey": "TAB"
}
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"runtimeInfos": {},
"progressUpdateIntervalMs": 500,
"jobName": "paragraph_1583768872984_-1459808170",
"id": "20200309-234752_1664292334",
"dateCreated": "2020-03-09 23:47:52.984",
"dateStarted": "2020-04-27 13:48:54.934",
"dateFinished": "2020-04-27 13:49:01.921",
"status": "FINISHED"
},
{
"text": "%flink.pyflink\n",
"user": "anonymous",
"dateUpdated": "2020-03-10 11:04:48.771",
"progress": 0,
"config": {
"editorMode": "ace/mode/python",
"editorHide": false,
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "python",
"editOnDblClick": false,
"completionSupport": true,
"completionKey": "TAB"
}
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"runtimeInfos": {},
"progressUpdateIntervalMs": 500,
"jobName": "paragraph_1583768872984_-575920971",
"id": "20200309-234752_517801851",
"dateCreated": "2020-03-09 23:47:52.984",
"dateStarted": "2020-03-10 00:25:39.912",
"dateFinished": "2020-03-10 00:25:39.981",
"status": "FINISHED"
}
],
"name": "8. Logistic Regression (Alink)",
"id": "2F4HJNWVN",
"defaultInterpreterGroup": "flink",
"version": "0.9.0-SNAPSHOT",
"noteParams": {},
"noteForms": {},
"angularObjects": {},
"config": {
"isZeppelinNotebookCronEnable": false
},
"info": {}
}