A few documentation imrovements JIRA: MADLIB-569 Small updates in Feature Extraction, Compatibility, Linrar Regression, and Utilities

commit: 16820bbbbdfcdd54a21ca5d4bf915f2f61729042 [log] [tgz]
author: Florian Schoppmann <Florian.Schoppmann@emc.com> Mon Jun 18 01:08:58 2012 -0700
committer: Florian Schoppmann <Florian.Schoppmann@emc.com> Mon Jun 18 01:10:50 2012 -0700
tree: bfdf31c6e1b90ec48a3f4552c98a7b59009105a3
parent: 357c7b0e617103e8f6bf823466b801bb4aab793c [diff]
diff --git a/methods/textfex_viterbi/src/pg_gp/textfex.sql_in b/methods/textfex_viterbi/src/pg_gp/textfex.sql_in
index 3f847b8..8d1220a 100644
--- a/methods/textfex_viterbi/src/pg_gp/textfex.sql_in
+++ b/methods/textfex_viterbi/src/pg_gp/textfex.sql_in

@@ -4,7 +4,7 @@
  *
  * @brief SQL function for text feature extraction
  * @date February 2012
- *  
+ *
  * @sa For an introduction to text feature extraction, see the module
  *     description \ref grp_textfex_viterbi
  *//* ----------------------------------------------------------------------- */
@@ -15,11 +15,11 @@
 @addtogroup grp_textfex_viterbi
 
 @about
-This module provides a functionality of the feature extraction for basic text
-analysis tasks such as part-of-speech(POS) tagging, named entity resolution.
-In addition to the feature extraction, it also has a Viterbi implementation
+The Feature Extraction module provides functionality for basic text-analysis
+tasks such as part-of-speech (POS) tagging and named-entity resolution.
+In addition to feature extraction, it also has a Viterbi implementation
 to get the best label sequence and the conditional probability
-\f$ p(top1_label_sequence|sentence) \f$.
+\f$ \Pr( \text{best label sequence} \mid \text{Sentence}) \f$.
 
 At present, six feature types are implemented.
     - Edge Feature: transition feature that encodes the transition feature
@@ -37,7 +37,7 @@
 
 Instead of scanning every token in a sentence and extracting features for
 each token on the fly, we extract features for each distinct token and
-materialize it in the table.  When we call viterbi function to get the best
+materialize it in the table.  When we call the Viterbi function to get the best
 label sequence, we only need a single lookup to get the feature weight.
 
 @usage
@@ -46,7 +46,7 @@
     to convert the data in the model files to the data format required by in
     this module.
 
-  - Load model from local drive to database 
+  - Load model from local drive to database
     <pre>SELECT madlib.load_crf_model(
          '<em>/path/to/data</em>');</pre>
 
@@ -110,7 +110,7 @@
  * ...
  * edgeFeature  44  a  a  a  a  a  a...a
  * endFeature   45  a  a  a  a  a  a...a</pre>
- * 
+ *
  * - viterbi_r table
  * is related to specific tokens.  It encodes the single state features,
  * e.g., wordFeature, RegexFeature for all tokens.  The r table is represented
@@ -180,46 +180,46 @@
         rv = plpy.execute("SELECT COUNT(*) AS total_label FROM " + labeltbl + ";")
         nlabel = rv[0]['total_label']
 
-        plpy.execute("""INSERT INTO segment_hashtbl(seg_text) 
+        plpy.execute("""INSERT INTO segment_hashtbl(seg_text)
                         SELECT DISTINCT seg_text
                         FROM   """ + segmenttbl + """;""")
 
-        plpy.execute("""INSERT INTO unknown_segment_hashtbl(seg_text) 
-                        ((SELECT DISTINCT seg_text 
-                          FROM   segment_hashtbl) 
+        plpy.execute("""INSERT INTO unknown_segment_hashtbl(seg_text)
+                        ((SELECT DISTINCT seg_text
+                          FROM   segment_hashtbl)
                          EXCEPT
-                         (SELECT DISTINCT token 
-                          FROM   """ + dictionary + """ 
+                         (SELECT DISTINCT token
+                          FROM   """ + dictionary + """
                           WHERE  total>1));""")
 
         plpy.execute("""INSERT INTO prev_labeltbl
                         SELECT id
                         FROM   """ + labeltbl + """;
-                        INSERT INTO prev_labeltbl VALUES(-1); 
+                        INSERT INTO prev_labeltbl VALUES(-1);
                         INSERT INTO prev_labeltbl VALUES( """ + str(nlabel) + """);""")
 
         # Generate sparse M factor table
-        plpy.execute("""INSERT INTO mtbl(prev_label, label, value) 
-                        SELECT prev_label.id, label.id, 0 
-                        FROM   """ + labeltbl + """ AS label, 
+        plpy.execute("""INSERT INTO mtbl(prev_label, label, value)
+                        SELECT prev_label.id, label.id, 0
+                        FROM   """ + labeltbl + """ AS label,
                                prev_labeltbl as prev_label;""")
 
-        # EdgeFeature and startFeature, startFeature can be considered as a special edgeFeature  
-        plpy.execute("""INSERT INTO mtbl(prev_label, label, value) 
+        # EdgeFeature and startFeature, startFeature can be considered as a special edgeFeature
+        plpy.execute("""INSERT INTO mtbl(prev_label, label, value)
                         SELECT prev_label_id,label_id,weight
-                        FROM   """ + featuretbl + """ AS features 
+                        FROM   """ + featuretbl + """ AS features
                         WHERE  features.prev_label_id<>-1 OR features.name = 'S.';""")
 
         # EndFeature, endFeature can be considered as a special edgeFeature
-        plpy.execute("""INSERT INTO mtbl(prev_label, label, value) 
+        plpy.execute("""INSERT INTO mtbl(prev_label, label, value)
                         SELECT """ + str(nlabel) + """, label_id, weight
-                        FROM   """ + featuretbl + """ AS features 
+                        FROM   """ + featuretbl + """ AS features
                         WHERE  features.name = 'End.';""")
 
 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `
         plpy.execute("""INSERT INTO {viterbi_mtbl}
-                        SELECT array_agg(weight ORDER BY prev_label,label) 
-                        FROM   (SELECT prev_label, label, (SUM(value)*1000)::integer AS weight 
+                        SELECT array_agg(weight ORDER BY prev_label,label)
+                        FROM   (SELECT prev_label, label, (SUM(value)*1000)::integer AS weight
                                 FROM   mtbl
                                 GROUP BY prev_label,label
                                 ORDER BY prev_label,label) as TEMP_MTBL;""".format(
@@ -241,37 +241,37 @@
                         ))
 ')
 
-        plpy.execute("""INSERT INTO rtbl(seg_text, label, value) 
-                        SELECT segment_hashtbl.seg_text, labels.id, 0 
-                        FROM   segment_hashtbl segment_hashtbl, 
+        plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
+                        SELECT segment_hashtbl.seg_text, labels.id, 0
+                        FROM   segment_hashtbl segment_hashtbl,
                          """ + labeltbl + """ AS labels;""")
 
         # RegExFeature
-        plpy.execute("""INSERT INTO rtbl(seg_text, label, value) 
-                        SELECT segment_hashtbl.seg_text, features.label_id, features.weight 
-                        FROM   segment_hashtbl AS segment_hashtbl, 
+        plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
+                        SELECT segment_hashtbl.seg_text, features.label_id, features.weight
+                        FROM   segment_hashtbl AS segment_hashtbl,
                          """ + featuretbl + """ AS features,
                          """ + regextbl + """ AS regex
-                        WHERE  segment_hashtbl.seg_text ~ regex.pattern 
+                        WHERE  segment_hashtbl.seg_text ~ regex.pattern
                                AND features.name||'%' ='R_' || regex.name;""")
 
         # UnknownFeature
-        plpy.execute("""INSERT INTO rtbl(seg_text, label, value) 
-                        SELECT segment_hashtbl.seg_text, features.label_id, features.weight 
-                        FROM   unknown_segment_hashtbl AS segment_hashtbl, 
-                         """ + featuretbl + """ AS features 
+        plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
+                        SELECT segment_hashtbl.seg_text, features.label_id, features.weight
+                        FROM   unknown_segment_hashtbl AS segment_hashtbl,
+                         """ + featuretbl + """ AS features
                         WHERE  features.name = 'U';""")
 
         # Wordfeature
-        plpy.execute("""INSERT INTO rtbl(seg_text, label, value) 
-                        SELECT seg_text, label_id, weight 
-                        FROM   segment_hashtbl, 
-                        """  + featuretbl + """ 
+        plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
+                        SELECT seg_text, label_id, weight
+                        FROM   segment_hashtbl,
+                        """  + featuretbl + """
                         WHERE  name = 'W_' || seg_text;""")
 
         # Factor table
-        plpy.execute("""INSERT INTO """ + viterbi_rtbl + """(seg_text, label, score) 
-                        SELECT seg_text,label,(SUM(value)*1000)::integer AS score 
+        plpy.execute("""INSERT INTO """ + viterbi_rtbl + """(seg_text, label, score)
+                        SELECT seg_text,label,(SUM(value)*1000)::integer AS score
                         FROM   rtbl
                         GROUP BY seg_text,label;""")
 

diff --git a/src/ports/greenplum/modules/compatibility/compatibility.sql_in b/src/ports/greenplum/modules/compatibility/compatibility.sql_in
index b670790..c297f21 100644
--- a/src/ports/greenplum/modules/compatibility/compatibility.sql_in
+++ b/src/ports/greenplum/modules/compatibility/compatibility.sql_in

@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------------- *//** 
+/* ----------------------------------------------------------------------- *//**
  *
  * @file compatibility.sql_in
  *
@@ -22,21 +22,24 @@
 
 - <tt>CREATE TABLE <em>table_name</em> AS <em>query</em></tt> statements where
   <em>query</em> contains certain MADlib functions fails with the error
-  “function cannot execute on segment because it issues a non-SELECT statement”.
+  “function cannot execute on segment because it issues a non-SELECT statement”
+  (on Greenplum before version 4.2).
   The workaround is:
   <pre>SELECT \ref create_table_as('<em>table_name</em>', $$
     <em>query</em>
 $$, 'BY (<em>column</em>, [...]) | RANDOMLY');</pre>
 - <tt>INSERT INTO <em>table_name</em> <em>query</em></tt> where <em>query</em>
   contains certain MADlib functions fails with the error “function cannot
-  execute on segment because it issues a non-SELECT statement”. The workaround
-  is:
+  execute on segment because it issues a non-SELECT statement” (on Greenplum
+  before version 4.2). The workaround is:
   <pre>SELECT \ref insert_into('<em>table_name</em>', $$
     <em>query</em>
 $$);</pre>
 
 @note
-These functions are not installed on other DBMSs (and not needed there).
+These functions are not installed on other DBMSs (and not needed there). On
+Greenplum 4.2 and later, they are installed only for backward compatibility,
+but not otherwise needed.
 Workarounds should be used only when necessary. For portability and best
 performance, standard SQL should be prefered whenever possible.
 
@@ -77,7 +80,7 @@
  * <pre>SELECT insert_into('<em>table_name</em>', $$
  *    <em>query</em>
  *$$);</pre>
- * 
+ *
  * @examp
  * <pre>SELECT insert_into('public.test', $$
  *    SELECT * FROM generate_series(1,10) AS id
@@ -109,7 +112,7 @@
     EXECUTE 'SET client_min_messages TO warning';
 
     PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
-        
+
     EXECUTE
         'DROP FUNCTION IF EXISTS pg_temp._madlib_temp_function();
         CREATE FUNCTION pg_temp._madlib_temp_function()
@@ -158,16 +161,16 @@
     ELSE
         whatToCreate := 'TABLE';
     END IF;
-    
+
     -- We separate the following EXECUTE statement because it is prone
     -- to generate an exception -- e.g., if the table already exists
     -- In that case we want to keep the context in the error message short
     EXECUTE
-        'CREATE ' || whatToCreate || ' ' || "inTableName" || ' AS 
+        'CREATE ' || whatToCreate || ' ' || "inTableName" || ' AS
         SELECT * FROM (' || "inSQL" || ') AS _madlib_ignore
         WHERE FALSE
         DISTRIBUTED ' || "inDistributed";
-    
+
     PERFORM MADLIB_SCHEMA.insert_into("inTableName", "inSQL");
 END;
 $$;
@@ -189,7 +192,7 @@
  * <pre>SELECT create_table_as('<em>table_name</em>', $$
  *    <em>query</em>
  *$$, 'BY (<em>column</em>, [...]) | RANDOMLY');</pre>
- * 
+ *
  * @examp
  * <pre>SELECT create_table_as('public.test', $$
  *    SELECT * FROM generate_series(1,10) AS id
@@ -204,7 +207,7 @@
  *     Known caveats of this workaround:
  *     - For queries returning a large number of rows, this function will be
  *       significantly slower than the <tt>CREATE TABLE AS</tt> statement.
- */ 
+ */
 CREATE FUNCTION MADLIB_SCHEMA.create_table_as(
     "inTableName" VARCHAR,
     "inSQL" VARCHAR,

diff --git a/src/ports/postgres/modules/regress/linear.sql_in b/src/ports/postgres/modules/regress/linear.sql_in
index d24181e..a23ac56 100644
--- a/src/ports/postgres/modules/regress/linear.sql_in
+++ b/src/ports/postgres/modules/regress/linear.sql_in

@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------------- *//** 
+/* ----------------------------------------------------------------------- *//**
  *
  * @file linear.sql_in
  *
@@ -110,7 +110,7 @@
 \f[
     \frac{\max_{\| z \|_2 = 1} \| X z \|_2}{\min_{\| z \|_2 = 1} \| X z \|_2} .
 \f]
-The condition number of a problem is a worst-case measure of sensitive the
+The condition number of a problem is a worst-case measure of how sensitive the
 result is to small perturbations of the input. A large condition number (say,
 more than 1000) indicates the presence of significant multicollinearity.
 
@@ -175,31 +175,31 @@
 -#  You can call the linregr() function for an individual metric:
 \verbatim
 sql> SELECT (linregr(price, array[1, bedroom, bath, size])).coef FROM houses;
-                                  coef                                  
+                                  coef
 ------------------------------------------------------------------------
  {27923.4334170641,-35524.7753390234,2269.34393735323,130.793920208133}
 (1 row)
 
 sql> SELECT (linregr(price, array[1, bedroom, bath, size])).r2 FROM houses;
-        r2         
+        r2
 -------------------
  0.745374010140315
 (1 row)
 
 sql> SELECT (linregr(price, array[1, bedroom, bath, size])).std_err FROM houses;
-                               std_err                                
+                               std_err
 ----------------------------------------------------------------------
  {56306.4821787474,25036.6537279169,22208.6687270562,36.208642285651}
 (1 row)
 
 sql> SELECT (linregr(price, array[1, bedroom, bath, size])).t_stats FROM houses;
-                                t_stats                                 
+                                t_stats
 ------------------------------------------------------------------------
  {0.495918628487924,-1.41891067892239,0.10218279921428,3.6122293450358}
 (1 row)
 
 sql> SELECT (linregr(price, array[1, bedroom, bath, size])).p_values FROM houses;
-                                  p_values                                   
+                                  p_values
 -----------------------------------------------------------------------------
  {0.629711069315512,0.183633155781461,0.920450514073051,0.00408159079312354}
 (1 row)
@@ -305,7 +305,7 @@
 CREATE AGGREGATE MADLIB_SCHEMA.linregr(
     /*+ "dependentVariable" */ DOUBLE PRECISION,
     /*+ "independentVariables" */ DOUBLE PRECISION[]) (
-    
+
     SFUNC=MADLIB_SCHEMA.linregr_transition,
     STYPE=float8[],
     FINALFUNC=MADLIB_SCHEMA.linregr_final,

diff --git a/src/ports/postgres/modules/utilities/utilities.sql_in b/src/ports/postgres/modules/utilities/utilities.sql_in
index 5db5e92..fb10476 100644
--- a/src/ports/postgres/modules/utilities/utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/utilities.sql_in

@@ -110,11 +110,12 @@
 /**
  * @brief Check if a floating-point number is NaN (not a number)
  *
- * This function exists for portability. Some DBMSs like PostgreSQL make
- * floating-point numbers a fully ordered set -- contrary to IEEE 754.
- * http://www.postgresql.org/docs/current/static/datatype-numeric.html#DATATYPE-FLOAT
- * For portability, MADlib code should not make use of such "features" directly,
- * but instead only use isnan() instead.
+ * This function exists for portability. Some DBMSs like PostgreSQL treat
+ * floating-point numbers as fully ordered -- contrary to IEEE 754. (See, e.g.,
+ * the <a href=
+ * "http://www.postgresql.org/docs/current/static/datatype-numeric.html#DATATYPE-FLOAT"
+ * >PostgreSQL documentation</a>. For portability, MADlib code should not make
+ * use of such "features" directly, but only use isnan() instead.
  *
  * @param number
  * @returns \c TRUE if \c number is \c NaN, \c FALSE otherwise
commit	16820bbbbdfcdd54a21ca5d4bf915f2f61729042	[log] [tgz]
author	Florian Schoppmann <Florian.Schoppmann@emc.com>	Mon Jun 18 01:08:58 2012 -0700
committer	Florian Schoppmann <Florian.Schoppmann@emc.com>	Mon Jun 18 01:10:50 2012 -0700
tree	bfdf31c6e1b90ec48a3f4552c98a7b59009105a3
parent	357c7b0e617103e8f6bf823466b801bb4aab793c [diff]