Merge branch 'spark2-doc' into spark2-merge
diff --git a/griffin-doc/deploy/deploy-guide.md b/griffin-doc/deploy/deploy-guide.md
index 0693c25..d23ee15 100644
--- a/griffin-doc/deploy/deploy-guide.md
+++ b/griffin-doc/deploy/deploy-guide.md
@@ -23,11 +23,11 @@
### Prerequisites
You need to install the following items:
- JDK (1.8 or later).
-- mysql or Postgresql.
+- PostgreSQL or MySQL.
- npm (version 6.0.0+).
- [Hadoop](http://apache.claz.org/hadoop/common/hadoop-2.6.0/hadoop-2.6.0.tar.gz) (2.6.0 or later); you can get some help [here](https://hadoop.apache.org/docs/r2.7.2/hadoop-project-dist/hadoop-common/SingleCluster.html).
-- [Spark](http://spark.apache.org/downloads.html) (version 1.6.x, griffin does not support 2.0.x at current), if you want to install Pseudo Distributed/Single Node Cluster, you can get some help [here](http://why-not-learn-something.blogspot.com/2015/06/spark-installation-pseudo.html).
-- [Hive](http://apache.claz.org/hive/hive-1.2.1/apache-hive-1.2.1-bin.tar.gz) (version 1.2.1 or later), you can get some help [here](https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-RunningHive).
+- [Spark](http://spark.apache.org/downloads.html) (version 2.2.1); if you want to install a Pseudo Distributed/Single Node Cluster, you can get some help [here](http://why-not-learn-something.blogspot.com/2015/06/spark-installation-pseudo.html).
+- [Hive](http://apache.claz.org/hive/hive-2.2.0/apache-hive-2.2.0-bin.tar.gz) (version 2.2.0); you can get some help [here](https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-RunningHive).
You need to make sure that your spark cluster can access your HiveContext.
- [Livy](http://archive.cloudera.com/beta/livy/livy-server-0.3.0.zip); you can get some help [here](http://livy.io/quickstart.html).
Griffin needs to schedule spark jobs by the server; we use livy to submit our jobs.
@@ -37,18 +37,31 @@
datanucleus-core-3.2.10.jar
datanucleus-rdbms-3.2.9.jar
```
-- ElasticSearch.
+- ElasticSearch (5.0 or later).
ElasticSearch works as a metrics collector: Griffin produces metrics to it, and our default UI gets metrics from it; you can use your own way as well.
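+Before going further, you can check that your ElasticSearch instance is reachable (a simple sanity check, assuming the default HTTP port 9200):
+```
+curl http://<your ES IP>:9200/_cluster/health
+```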
### Configuration
+#### PostgreSQL
+
+Create the database 'quartz' in PostgreSQL:
+```
+createdb -O <username> quartz
+```
+Initialize the quartz tables in PostgreSQL with [init_quartz.sql](../../service/src/main/resources/Init_quartz_postgres.sql):
+```
+psql -h <host address> -p <port> -U <username> -f init_quartz.sql quartz
+```
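+To verify the initialization, you can list the tables of the quartz database (the quartz tables are typically prefixed with `QRTZ_`):
+```
+psql -h <host address> -U <username> -d quartz -c "\dt"
+```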
+
+#### MySQL
+
Create the database 'quartz' in MySQL:
```
-mysql -u username -e "create database quartz" -p
+mysql -u <username> -e "create database quartz" -p
```
-Init quartz tables in mysql by service/src/main/resources/Init_quartz.sql
+Initialize the quartz tables in MySQL with [init_quartz.sql](../../service/src/main/resources/Init_quartz_mysql.sql):
```
-mysql -u username -p quartz < service/src/main/resources/Init_quartz.sql
+mysql -u <username> -p quartz < init_quartz.sql
```
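+Similarly, you can verify the MySQL side by listing the tables of the quartz database:
+```
+mysql -u <username> -p -e "show tables" quartz
+```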
@@ -112,20 +125,17 @@
sparkJob.file = hdfs://<griffin measure path>/griffin-measure.jar
sparkJob.args_1 = hdfs://<griffin env path>/env.json
- sparkJob.jars = hdfs://<datanucleus path>/spark-avro_2.11-2.0.1.jar\
- hdfs://<datanucleus path>/datanucleus-api-jdo-3.2.6.jar\
- hdfs://<datanucleus path>/datanucleus-core-3.2.10.jar\
- hdfs://<datanucleus path>/datanucleus-rdbms-3.2.9.jar
+ # other dependent jars
+ sparkJob.jars =
- spark.yarn.dist.files = hdfs:///<spark conf path>/hive-site.xml
+ # hive-site.xml location; if not set here, configure it in spark conf
+ spark.yarn.dist.files =
livy.uri = http://<your IP>:8998/batches
spark.uri = http://<your IP>:8088
```
- \<griffin measure path> is the location where you should put the jar file of the measure module.
- \<griffin env path> is the location where you should put the env.json file.
- - \<datanucleus path> is the location you should put the 3 jar files of livy, and the spark avro jar file if you need to support avro data.
- - \<spark conf path> is the location of spark conf directory.
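+Once Livy is running, you can check that the `livy.uri` endpoint configured above is reachable; `GET /batches` lists the submitted batch sessions:
+```
+curl http://<your IP>:8998/batches
+```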
### Build and Run
@@ -154,7 +164,7 @@
http://<your IP>:8080
```
-You can use UI following the steps [here](../ui/user-guide.md).
+You can use the UI following the steps [here](../ui/user-guide.md).
-**Note**: The front-end UI is still under development, you can only access some basic features currently.
+**Note**: The UI doesn't support all the features yet; for the advanced features you can try the service API.
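+For example, you can list the created measures through the service API (a sketch, assuming the default `/api/v1` API root):
+```
+curl http://<your IP>:8080/api/v1/measures
+```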
diff --git a/griffin-doc/dev/dev-env-build.md b/griffin-doc/dev/dev-env-build.md
index fc981cb..4e075f8 100644
--- a/griffin-doc/dev/dev-env-build.md
+++ b/griffin-doc/dev/dev-env-build.md
@@ -122,15 +122,15 @@
```
git clone https://github.com/bhlx3lyx7/griffin-docker.git
```
-2. Copy your measure and service JAR into svc_msr_new directory.
+2. Copy your measure and service JARs into the griffin_spark2 directory.
```
-cp service-<version>.jar <path to>/griffin-docker/svc_msr_new/prep/service/service.jar
-cp measure-<version>.jar <path to>/griffin-docker/svc_msr_new/prep/measure/griffin-measure.jar
+cp service-<version>.jar <path to>/griffin-docker/griffin_spark2/prep/service/service.jar
+cp measure-<version>.jar <path to>/griffin-docker/griffin_spark2/prep/measure/griffin-measure.jar
```
3. Build your new griffin docker image.
-In svc_msr_new directory.
+In the griffin_spark2 directory:
```
-cd <path to>/griffin-docker/svc_msr_new
+cd <path to>/griffin-docker/griffin_spark2
docker build -t <image name>[:<image version>] .
```
4. If you are using another image name (or version), you also need to modify the docker-compose file you're using.
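+For example, a hypothetical retag to `my-griffin:0.0.1` could be swapped into the batch compose file like this (a sketch; adjust it to the compose file you actually use):
+```
+sed -i 's|bhlx3lyx7/griffin_spark2:0.2.0|my-griffin:0.0.1|' docker-compose-batch.yml
+```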
diff --git a/griffin-doc/docker/svc_msr/docker-compose-batch.yml b/griffin-doc/docker/compose/docker-compose-batch.yml
similarity index 96%
rename from griffin-doc/docker/svc_msr/docker-compose-batch.yml
rename to griffin-doc/docker/compose/docker-compose-batch.yml
index 5375b5c..9b247f2 100644
--- a/griffin-doc/docker/svc_msr/docker-compose-batch.yml
+++ b/griffin-doc/docker/compose/docker-compose-batch.yml
@@ -16,7 +16,7 @@
#under the License.
griffin:
- image: bhlx3lyx7/svc_msr:0.2.0
+ image: bhlx3lyx7/griffin_spark2:0.2.0
hostname: griffin
links:
- es
diff --git a/griffin-doc/docker/svc_msr/docker-compose-streaming.yml b/griffin-doc/docker/compose/docker-compose-streaming.yml
similarity index 97%
rename from griffin-doc/docker/svc_msr/docker-compose-streaming.yml
rename to griffin-doc/docker/compose/docker-compose-streaming.yml
index bb17f70..44759e0 100644
--- a/griffin-doc/docker/svc_msr/docker-compose-streaming.yml
+++ b/griffin-doc/docker/compose/docker-compose-streaming.yml
@@ -16,7 +16,7 @@
#under the License.
griffin:
- image: bhlx3lyx7/svc_msr:0.2.0
+ image: bhlx3lyx7/griffin_spark2:0.2.0
hostname: griffin
links:
- es
diff --git a/griffin-doc/docker/griffin-docker-guide.md b/griffin-doc/docker/griffin-docker-guide.md
index bc36759..9308e7b 100644
--- a/griffin-doc/docker/griffin-docker-guide.md
+++ b/griffin-doc/docker/griffin-docker-guide.md
@@ -30,32 +30,32 @@
```
3. Pull griffin pre-built docker images.
```
- docker pull bhlx3lyx7/svc_msr:0.2.0
+ docker pull bhlx3lyx7/griffin_spark2:0.2.0
docker pull bhlx3lyx7/elasticsearch
docker pull bhlx3lyx7/kafka
docker pull zookeeper:3.5
```
Or you can pull the images faster through mirror acceleration if you are in China.
```
- docker pull registry.docker-cn.com/bhlx3lyx7/svc_msr:0.2.0
+ docker pull registry.docker-cn.com/bhlx3lyx7/griffin_spark2:0.2.0
docker pull registry.docker-cn.com/bhlx3lyx7/elasticsearch
docker pull registry.docker-cn.com/bhlx3lyx7/kafka
docker pull registry.docker-cn.com/zookeeper:3.5
```
The docker images are the griffin environment images.
- - `bhlx3lyx7/svc_msr`: This image contains mysql, hadoop, hive, spark, livy, griffin service, griffin measure, and some prepared demo data, it works as a single node spark cluster, providing spark engine and griffin service.
+ - `bhlx3lyx7/griffin_spark2`: This image contains mysql, hadoop, hive, spark, livy, griffin service, griffin measure, and some prepared demo data; it works as a single-node spark cluster, providing the spark engine and griffin service.
- `bhlx3lyx7/elasticsearch`: This image is based on the official elasticsearch image, adding some configurations to enable cors requests, to provide elasticsearch service for metrics persistence.
- `bhlx3lyx7/kafka`: This image contains kafka 0.8, and some demo streaming data, to provide streaming data source in streaming mode.
- `zookeeper:3.5`: This image is official zookeeper, to provide zookeeper service in streaming mode.
### How to use griffin docker images in batch mode
-1. Copy [docker-compose-batch.yml](https://github.com/apache/incubator-griffin/blob/master/griffin-doc/docker/svc_msr/docker-compose-batch.yml) to your work path.
+1. Copy [docker-compose-batch.yml](compose/docker-compose-batch.yml) to your work path.
2. In your work path, start docker containers by using docker compose, and wait for about one minute; then griffin service is ready.
```
docker-compose -f docker-compose-batch.yml up -d
```
-3. Now you can try griffin APIs by using postman after importing the [json files](https://github.com/apache/incubator-griffin/tree/master/griffin-doc/service/postman).
- In which you need to modify the environment `BASE_PATH` value into `<your local IP address>:38080`.
+3. Now you can try griffin APIs by using postman after importing the [json files](../service/postman).
+ In the imported environment, you need to modify the `BASE_PATH` value to `<your local IP address>:38080`.
4. You can try the api `Basic -> Get griffin version` to make sure the griffin service has started up (see the command-line alternative after this list).
5. Add an accuracy measure through api `Measures -> Add measure`, to create a measure in griffin.
6. Add a job through the api `jobs -> Add job`, to schedule a job executing the measure. In the example, the schedule interval is 5 minutes.
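+If you prefer the command line to postman, the version check of step 4 can be done like this (assuming the `/api/v1` root used by the postman collection):
+```
+curl http://<your local IP address>:38080/api/v1/version
+```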
@@ -65,7 +65,7 @@
```
### How to use griffin docker images in streaming mode
-1. Copy [docker-compose-streaming.yml](https://github.com/apache/incubator-griffin/blob/master/griffin-doc/docker/svc_msr/docker-compose-streaming.yml) to your work path.
+1. Copy [docker-compose-streaming.yml](compose/docker-compose-streaming.yml) to your work path.
2. In your work path, start docker containers by using docker compose, and wait for about one minute; then griffin service is ready.
```
docker-compose -f docker-compose-streaming.yml up -d
diff --git a/griffin-doc/measure/dsl-guide.md b/griffin-doc/measure/dsl-guide.md
index 0fc8059..7370274 100644
--- a/griffin-doc/measure/dsl-guide.md
+++ b/griffin-doc/measure/dsl-guide.md
@@ -125,6 +125,10 @@
Uniqueness rule expression in Griffin DSL is a list of selection expressions separated by commas, indicating the columns to check for uniqueness.
e.g. `name, age`, `name, (age + 1) as next_age`
+### Distinctness Rule
+Distinctness rule expression in Griffin DSL is a list of selection expressions separated by commas, indicating the columns to check for distinctness.
+e.g. `name, age`, `name, (age + 1) as next_age`
+
### Timeliness Rule
Timeliness rule expression in Griffin DSL is a list of selection expressions separated by commas, indicating the input time and output time (calculation time as default if not set).
e.g. `ts`, `ts, end_ts`
@@ -149,12 +153,12 @@
For example, the dsl rule is `source.cntry, source.id.count(), source.age.max() group by source.cntry`, which represents the profiling request. After the translation, the sql rule is as below:
- **profiling sql rule**: `SELECT source.cntry, count(source.id), max(source.age) FROM source GROUP BY source.cntry`, save as table `profiling`.
-After the translation, the metrics will be persisted in table `profiling`.
+After the translation, the metrics will be persisted in table `profiling`.
### Uniqueness
-For uniqueness, or called duplicate, is to find out the duplicate items of data, and rollup the items count group by duplicate times.
-For example, the dsl rule is `name, age`, which represents the duplicate requests, in this case, source and target are the same data set. After the translation, the sql rule is as below:
-- **get distinct items from source**: `SELECT name, age FROM source`, save as table `src`.
+Uniqueness, also called duplication, finds the duplicate items of data, and rolls up the item counts grouped by duplication times.
+For example, the dsl rule is `name, age`, which represents the duplication request; in this case, source and target are the same data set. After the translation, the sql rule is as below:
+- **get distinct items from source**: `SELECT name, age FROM source`, save as table `src`.
- **get all items from target**: `SELECT name, age FROM target`, save as table `tgt`.
- **join two tables**: `SELECT src.name, src.age FROM tgt RIGHT JOIN src ON coalesce(src.name, '') = coalesce(tgt.name, '') AND coalesce(src.age, '') = coalesce(tgt.age, '')`, save as table `joined`.
- **get items duplication**: `SELECT name, age, (count(*) - 1) AS dup FROM joined GROUP BY name, age`, save as table `grouped`.
@@ -166,6 +170,20 @@
After the translation, the metrics will be persisted in table `dup_metric`.
+### Distinctness
+Distinctness finds the duplicate items of data, the same as uniqueness in batch mode, but with some differences in streaming mode.
+In most cases, you need distinctness rather than uniqueness.
+For example, the dsl rule is `name, age`, which represents the distinctness request; in this case, source and target are the same data set. After the translation, the sql rule is as below:
+- **total count of source**: `SELECT COUNT(*) AS total FROM source`, save as table `total_count`.
+- **group by fields**: `SELECT name, age, (COUNT(*) - 1) AS dup, TRUE AS dist FROM source GROUP BY name, age`, save as table `dup_count`.
+- **distinct metric**: `SELECT COUNT(*) AS dist_count FROM dup_count WHERE dist`, save as table `distinct_metric`.
+- **source join distinct metric**: `SELECT source.*, dup_count.dup AS dup, dup_count.dist AS dist FROM source LEFT JOIN dup_count ON source.name = dup_count.name AND source.age = dup_count.age`, save as table `dist_joined`.
+- **add row number**: `SELECT *, ROW_NUMBER() OVER (DISTRIBUTE BY name, age SORT BY dist) row_num FROM dist_joined`, save as table `row_numbered`.
+- **duplicate records**: `SELECT name, age, dup FROM row_numbered WHERE NOT dist OR row_num > 1`, save as table `dup_records`.
+- **duplicate metric**: `SELECT name, age, dup, COUNT(*) AS num FROM dup_records GROUP BY name, age, dup`, save as table `dup_metric`.
+
+After the translation, the metrics will be persisted in tables `distinct_metric` and `dup_metric`.
+
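+A quick way to sanity-check one of the generated statements is to run it in spark-sql against a toy `source` table (illustrative only; in practice the measure module runs the translated statements itself):
+```
+spark-sql -e "SELECT name, age, (COUNT(*) - 1) AS dup, TRUE AS dist FROM source GROUP BY name, age"
+```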
### Timeliness
Timeliness measures the latency of each item, and gets the statistics of the latencies.
For example, the dsl rule is `ts, out_ts`: the first column means the input time of the item, the second column means the output time of the item; if not set, `__tmst` will be the default output time column. After the translation, the sql rule is as below:
diff --git a/griffin-doc/roadmap.md b/griffin-doc/roadmap.md
index b916311..01807e8 100644
--- a/griffin-doc/roadmap.md
+++ b/griffin-doc/roadmap.md
@@ -21,41 +21,32 @@
## Current feature list
In the current version, we've implemented the main DQ features below.
-- **Data Asset Management**
-
- User can register, delete, edit data assets, currently only Hadoop data-sets are supported
+- **Data Asset Detection**
+ After configuration in the service module, Griffin can detect Hive table metadata through the Hive metastore service.
- **Measure Management**
+ Through the UI, users can create and delete measures of 3 types: accuracy, profiling and publish metrics.
+ Through the service API, users can create, delete and update measures of 6 types: accuracy, profiling, timeliness, uniqueness, completeness and publish metrics.
- User can create, delete, edit measures for 4 types: Accuracy, Profiling, Anomaly Detection, Publish Metrics
+- **Job Management**
+ Users can create and delete jobs to schedule batch calculation of measures, with the data range of each calculation, and an extra trigger condition like a "done file" on hdfs.
-- **Job Scheduler**
-
- After the measures are created, the Job Scheduler component can create the jobs and schedule them to calculate the metrics values
-
-- **Measure Execution on Spark**
-
- The Job Scheduler will trigger the measure execution on Spark to generate the metrics values
+- **Measure Calculation on Spark**
+ The service module triggers and submits calculation jobs to the Spark cluster through livy; the measure module calculates and persists the metric values, to elasticsearch by default.
- **Metrics Visualization**
-
- We have a web portal to display all metrics
-
-- **My Dashboard**
-
- Only the interested metrics will be displayed on "My Dashboard"
+ Through the service API, users can get the metric values of each job from elasticsearch.
+ On the UI, accuracy metrics are rendered as a chart, and profiling metrics are displayed as a table.
## Short-term Roadmap
-- **Support more data-set types**
-
- Current we only support Hadoop datasets, we should also support RDBMS and real-time streaming data from Kafka, Storm, etc.
+- **Support more data source types**
+ Currently, Griffin only supports Hive tables and avro files on hdfs as data sources in batch mode, and Kafka as the data source in streaming mode.
+ We plan to support more data source types, like RDBMS and elasticsearch.
- **Support more data quality dimensions**
+ Griffin needs to support more data quality dimensions, like consistency and validity.
- Besides accuracy, there are some other data quality dimensions(Completeness, Uniqueness, Timeliness, Validity, Consistency), we should support more dimensions
-
-- **More ML algorithms for Anomaly Detection measure**
-
- Currently only [MAD(Median absolute deviation)](https://en.wikipedia.org/wiki/Median_absolute_deviation) and [Bollinger Bands](https://en.wikipedia.org/wiki/Bollinger_Bands) are supported, we are considering to support more Machine Learning algorithms
+- **Anomaly Detection**
+ Griffin plans to support anomaly detection by analyzing the calculated metrics from elasticsearch.
\ No newline at end of file
diff --git a/griffin-doc/service/hibernate_eclipselink_switch.md b/griffin-doc/service/hibernate_eclipselink_switch.md
index 3c1ad06..6bfae3b 100644
--- a/griffin-doc/service/hibernate_eclipselink_switch.md
+++ b/griffin-doc/service/hibernate_eclipselink_switch.md
@@ -1,3 +1,22 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
# Hibernate and eclipselink switch
## Overview
In this document, we list two main parts.
diff --git a/griffin-doc/service/mysql_postgresql_switch.md b/griffin-doc/service/mysql_postgresql_switch.md
index d3925a1..081307c 100644
--- a/griffin-doc/service/mysql_postgresql_switch.md
+++ b/griffin-doc/service/mysql_postgresql_switch.md
@@ -1,3 +1,22 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
# Mysql and postgresql switch
## Overview
diff --git a/measure/src/main/scala/org/apache/griffin/measure/rule/dsl/expr/SelectExpr.scala b/measure/src/main/scala/org/apache/griffin/measure/rule/dsl/expr/SelectExpr.scala
index d6e350b..0c3c451 100644
--- a/measure/src/main/scala/org/apache/griffin/measure/rule/dsl/expr/SelectExpr.scala
+++ b/measure/src/main/scala/org/apache/griffin/measure/rule/dsl/expr/SelectExpr.scala
@@ -115,7 +115,7 @@
selectors.lastOption match {
case None => desc
case Some(sel: FunctionSelectExpr) => desc
- case _ => s"coalesce(${desc}, 'null')"
+ case _ => s"coalesce(${desc}, '')"
}
}
def alias: Option[String] = {
diff --git a/service/src/main/resources/sparkJob.properties b/service/src/main/resources/sparkJob.properties
index ce2587a..1334747 100644
--- a/service/src/main/resources/sparkJob.properties
+++ b/service/src/main/resources/sparkJob.properties
@@ -32,7 +32,7 @@
sparkJob.driverMemory=1g
sparkJob.executorMemory=1g
-# shouldn't config in server, but in
+# other dependent jars
sparkJob.jars =
# hive-site.xml location; if not set here, configure it in spark conf