TAJO-2027: Writing Hive UDF integration document.

Signed-off-by: Hyunsik Choi <hyunsik@apache.org>
diff --git a/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java b/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java
index 6e3eaea..6d24ee3 100644
--- a/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java
+++ b/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java
@@ -304,7 +304,7 @@
     PYTHON_CONTROLLER_LOG_DIR("tajo.function.python.controller.log-dir", ""),
 
     // HIVE UDF
-    HIVE_UDF_DIR("tajo.function.hive.code-dir", "./lib/hiveudf"),
+    HIVE_UDF_JAR_DIR("tajo.function.hive.jar-dir", "./lib/hiveudf"),
 
     // Partition
     PARTITION_DYNAMIC_BULK_INSERT_BATCH_SIZE("tajo.partition.dynamic.bulk-insert.batch-size", 1000),
diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
index 945e3d1..4a0283b 100644
--- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
+++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
@@ -57,11 +57,9 @@
 import org.apache.tajo.storage.text.CSVLineSerDe;
 import org.apache.tajo.storage.text.TextLineDeserializer;
 import org.apache.tajo.schema.IdentifierUtil;
-import org.apache.tajo.storage.LazyTuple;
 import org.apache.tajo.storage.TablespaceManager;
 import org.apache.tajo.storage.Tuple;
 import org.apache.tajo.storage.VTuple;
-import org.apache.tajo.util.BytesUtils;
 import org.apache.tajo.util.CommonTestingUtil;
 import org.apache.tajo.util.datetime.DateTimeUtil;
 import org.junit.AfterClass;
@@ -111,7 +109,7 @@
     // load Hive UDFs
     URL hiveUDFURL = ClassLoader.getSystemResource("hiveudf");
     Preconditions.checkNotNull(hiveUDFURL, "hive udf directory is absent.");
-    conf.set(TajoConf.ConfVars.HIVE_UDF_DIR.varname, hiveUDFURL.toString().substring("file:".length()));
+    conf.set(TajoConf.ConfVars.HIVE_UDF_JAR_DIR.varname, hiveUDFURL.toString().substring("file:".length()));
     list.addAll(HiveFunctionLoader.loadHiveUDFs(conf).orElse(new ArrayList<>()));
 
     for (FunctionDesc funcDesc : list) {
diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/function/hiveudf/HiveFunctionLoader.java b/tajo-core/src/main/java/org/apache/tajo/engine/function/hiveudf/HiveFunctionLoader.java
index 98ae4cd..e7d8844 100644
--- a/tajo-core/src/main/java/org/apache/tajo/engine/function/hiveudf/HiveFunctionLoader.java
+++ b/tajo-core/src/main/java/org/apache/tajo/engine/function/hiveudf/HiveFunctionLoader.java
@@ -50,7 +50,7 @@
 
   public static Optional<List<FunctionDesc>> loadHiveUDFs(TajoConf conf) {
     ArrayList<FunctionDesc> funcList = new ArrayList<>();
-    String udfdir = conf.getVar(TajoConf.ConfVars.HIVE_UDF_DIR);
+    String udfdir = conf.getVar(TajoConf.ConfVars.HIVE_UDF_JAR_DIR);
 
     try {
       // Currently Hive udf jar must be on local filesystem
diff --git a/tajo-docs/src/main/sphinx/functions.rst b/tajo-docs/src/main/sphinx/functions.rst
index ff753a3..41c7955 100644
--- a/tajo-docs/src/main/sphinx/functions.rst
+++ b/tajo-docs/src/main/sphinx/functions.rst
@@ -1,12 +1,12 @@
-******************
+*********
 Functions
-******************
+*********
 
 Tajo provides extensive supports for functions. It includes a lot of built-in functions and user-defined functions which is implemented in Python.
 
-==========================
+=========================
 Built-in Scalar Functions
-==========================
+=========================
 
 .. toctree::
     :maxdepth: 1
@@ -18,29 +18,30 @@
     functions/network_func_and_operators
     functions/json_func
 
-================================
+==============================
 Built-in Aggregation Functions
-================================
+==============================
 
 .. toctree::
     :maxdepth: 1
 
     functions/agg_func
 
-================================
+=========================
 Built-in Window Functions
-================================
+=========================
 
 .. toctree::
     :maxdepth: 1
 
     functions/window_func
 
-==============================
+======================
 User-defined Functions
-==============================
+======================
 
 .. toctree::
     :maxdepth: 1
 
-    functions/python
\ No newline at end of file
+    functions/python
+    functions/hivefunc
diff --git a/tajo-docs/src/main/sphinx/functions/hivefunc.rst b/tajo-docs/src/main/sphinx/functions/hivefunc.rst
new file mode 100644
index 0000000..dccafb7
--- /dev/null
+++ b/tajo-docs/src/main/sphinx/functions/hivefunc.rst
@@ -0,0 +1,80 @@
+##############
+Hive Functions
+##############
+
+Tajo provides a feature to use Hive functions directly without re-compilation or additional code.
+
+*************
+Configuration
+*************
+
+The only thing to do is to register the path to a directory containing the jar files for your Hive functions.
+You can do this by setting ``tajo.function.hive.jar-dir`` in ``tajo-site.xml`` as follows.
+
+.. code-block:: xml
+
+  <property>
+    <name>tajo.function.hive.jar-dir</name>
+    <value>/path/to/hive/function/jar</value>
+  </property>
+
+.. note::
+  The path should be on the local filesystem. HDFS directories are not supported because of a Java URI compatibility problem.
+
+.. warning::
+
+  The path must point to a directory, not a file, and multiple directory entries are not allowed.
+  However, it is possible to load multiple jar files from that directory.
+
+***************
+Using in detail
+***************
+
+=============
+Function Name
+=============
+
+Tajo loads Hive functions that extend the ``org.apache.hadoop.hive.ql.exec.UDF`` class. The function name is taken from the
+``@Description`` annotation if it exists. Otherwise, Tajo uses the fully qualified class name as the function name. For example,
+a call can look like ``select com_example_hive_udf_myupper('abcd')``, so it is recommended to use the ``@Description`` annotation.
+
+Additionally, if duplicate function signatures occur, an ``AmbiguousFunctionException`` may be thrown.
+
+============================
+Parameter type / Return type
+============================
+
+Hive uses Hadoop's *Writable* types in functions, but Tajo uses its internal *Datum* types.
+So only some Writable types are currently supported via internal conversion.
+They are listed below.
+
+==================== =========
+Writable             Tajo Type
+==================== =========
+ByteWritable         INT1
+ShortWritable        INT2
+IntWritable          INT4
+LongWritable         INT8
+FloatWritable        FLOAT4
+DoubleWritable       FLOAT8
+Text                 TEXT
+BytesWritable        VARBINARY
+DateWritable(*)      DATE
+TimestampWritable(*) TIMESTAMP
+HiveCharWritable(*)  CHAR
+==================== =========
+
+.. note::
+
+  (*) They are in org.apache.hadoop.hive.serde2.io package, others are in org.apache.hadoop.io package.
+
+==========
+Limitation
+==========
+
+1. Currently, Hive UDAF is not supported. The old UDAF interface is deprecated in Hive,
+and the new GenericUDAF interface cannot be applied because of function design differences between Tajo and Hive.
+For the same reason, the new GenericUDF functions are not supported in Tajo.
+
+2. Because HDFS directories are not supported, Hive UDF jar files should be copied to a local directory on each worker,
+and that path should be specified in tajo-site.xml.