A few documentation imrovements
JIRA: MADLIB-569
Small updates in Feature Extraction, Compatibility, Linrar Regression, and Utilities
diff --git a/methods/textfex_viterbi/src/pg_gp/textfex.sql_in b/methods/textfex_viterbi/src/pg_gp/textfex.sql_in
index 3f847b8..8d1220a 100644
--- a/methods/textfex_viterbi/src/pg_gp/textfex.sql_in
+++ b/methods/textfex_viterbi/src/pg_gp/textfex.sql_in
@@ -4,7 +4,7 @@
*
* @brief SQL function for text feature extraction
* @date February 2012
- *
+ *
* @sa For an introduction to text feature extraction, see the module
* description \ref grp_textfex_viterbi
*//* ----------------------------------------------------------------------- */
@@ -15,11 +15,11 @@
@addtogroup grp_textfex_viterbi
@about
-This module provides a functionality of the feature extraction for basic text
-analysis tasks such as part-of-speech(POS) tagging, named entity resolution.
-In addition to the feature extraction, it also has a Viterbi implementation
+The Feature Extraction module provides functionality for basic text-analysis
+tasks such as part-of-speech (POS) tagging and named-entity resolution.
+In addition to feature extraction, it also has a Viterbi implementation
to get the best label sequence and the conditional probability
-\f$ p(top1_label_sequence|sentence) \f$.
+\f$ \Pr( \text{best label sequence} \mid \text{Sentence}) \f$.
At present, six feature types are implemented.
- Edge Feature: transition feature that encodes the transition feature
@@ -37,7 +37,7 @@
Instead of scanning every token in a sentence and extracting features for
each token on the fly, we extract features for each distinct token and
-materialize it in the table. When we call viterbi function to get the best
+materialize it in the table. When we call the Viterbi function to get the best
label sequence, we only need a single lookup to get the feature weight.
@usage
@@ -46,7 +46,7 @@
to convert the data in the model files to the data format required by in
this module.
- - Load model from local drive to database
+ - Load model from local drive to database
<pre>SELECT madlib.load_crf_model(
'<em>/path/to/data</em>');</pre>
@@ -110,7 +110,7 @@
* ...
* edgeFeature 44 a a a a a a...a
* endFeature 45 a a a a a a...a</pre>
- *
+ *
* - viterbi_r table
* is related to specific tokens. It encodes the single state features,
* e.g., wordFeature, RegexFeature for all tokens. The r table is represented
@@ -180,46 +180,46 @@
rv = plpy.execute("SELECT COUNT(*) AS total_label FROM " + labeltbl + ";")
nlabel = rv[0]['total_label']
- plpy.execute("""INSERT INTO segment_hashtbl(seg_text)
+ plpy.execute("""INSERT INTO segment_hashtbl(seg_text)
SELECT DISTINCT seg_text
FROM """ + segmenttbl + """;""")
- plpy.execute("""INSERT INTO unknown_segment_hashtbl(seg_text)
- ((SELECT DISTINCT seg_text
- FROM segment_hashtbl)
+ plpy.execute("""INSERT INTO unknown_segment_hashtbl(seg_text)
+ ((SELECT DISTINCT seg_text
+ FROM segment_hashtbl)
EXCEPT
- (SELECT DISTINCT token
- FROM """ + dictionary + """
+ (SELECT DISTINCT token
+ FROM """ + dictionary + """
WHERE total>1));""")
plpy.execute("""INSERT INTO prev_labeltbl
SELECT id
FROM """ + labeltbl + """;
- INSERT INTO prev_labeltbl VALUES(-1);
+ INSERT INTO prev_labeltbl VALUES(-1);
INSERT INTO prev_labeltbl VALUES( """ + str(nlabel) + """);""")
# Generate sparse M factor table
- plpy.execute("""INSERT INTO mtbl(prev_label, label, value)
- SELECT prev_label.id, label.id, 0
- FROM """ + labeltbl + """ AS label,
+ plpy.execute("""INSERT INTO mtbl(prev_label, label, value)
+ SELECT prev_label.id, label.id, 0
+ FROM """ + labeltbl + """ AS label,
prev_labeltbl as prev_label;""")
- # EdgeFeature and startFeature, startFeature can be considered as a special edgeFeature
- plpy.execute("""INSERT INTO mtbl(prev_label, label, value)
+ # EdgeFeature and startFeature, startFeature can be considered as a special edgeFeature
+ plpy.execute("""INSERT INTO mtbl(prev_label, label, value)
SELECT prev_label_id,label_id,weight
- FROM """ + featuretbl + """ AS features
+ FROM """ + featuretbl + """ AS features
WHERE features.prev_label_id<>-1 OR features.name = 'S.';""")
# EndFeature, endFeature can be considered as a special edgeFeature
- plpy.execute("""INSERT INTO mtbl(prev_label, label, value)
+ plpy.execute("""INSERT INTO mtbl(prev_label, label, value)
SELECT """ + str(nlabel) + """, label_id, weight
- FROM """ + featuretbl + """ AS features
+ FROM """ + featuretbl + """ AS features
WHERE features.name = 'End.';""")
m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `
plpy.execute("""INSERT INTO {viterbi_mtbl}
- SELECT array_agg(weight ORDER BY prev_label,label)
- FROM (SELECT prev_label, label, (SUM(value)*1000)::integer AS weight
+ SELECT array_agg(weight ORDER BY prev_label,label)
+ FROM (SELECT prev_label, label, (SUM(value)*1000)::integer AS weight
FROM mtbl
GROUP BY prev_label,label
ORDER BY prev_label,label) as TEMP_MTBL;""".format(
@@ -241,37 +241,37 @@
))
')
- plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
- SELECT segment_hashtbl.seg_text, labels.id, 0
- FROM segment_hashtbl segment_hashtbl,
+ plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
+ SELECT segment_hashtbl.seg_text, labels.id, 0
+ FROM segment_hashtbl segment_hashtbl,
""" + labeltbl + """ AS labels;""")
# RegExFeature
- plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
- SELECT segment_hashtbl.seg_text, features.label_id, features.weight
- FROM segment_hashtbl AS segment_hashtbl,
+ plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
+ SELECT segment_hashtbl.seg_text, features.label_id, features.weight
+ FROM segment_hashtbl AS segment_hashtbl,
""" + featuretbl + """ AS features,
""" + regextbl + """ AS regex
- WHERE segment_hashtbl.seg_text ~ regex.pattern
+ WHERE segment_hashtbl.seg_text ~ regex.pattern
AND features.name||'%' ='R_' || regex.name;""")
# UnknownFeature
- plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
- SELECT segment_hashtbl.seg_text, features.label_id, features.weight
- FROM unknown_segment_hashtbl AS segment_hashtbl,
- """ + featuretbl + """ AS features
+ plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
+ SELECT segment_hashtbl.seg_text, features.label_id, features.weight
+ FROM unknown_segment_hashtbl AS segment_hashtbl,
+ """ + featuretbl + """ AS features
WHERE features.name = 'U';""")
# Wordfeature
- plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
- SELECT seg_text, label_id, weight
- FROM segment_hashtbl,
- """ + featuretbl + """
+ plpy.execute("""INSERT INTO rtbl(seg_text, label, value)
+ SELECT seg_text, label_id, weight
+ FROM segment_hashtbl,
+ """ + featuretbl + """
WHERE name = 'W_' || seg_text;""")
# Factor table
- plpy.execute("""INSERT INTO """ + viterbi_rtbl + """(seg_text, label, score)
- SELECT seg_text,label,(SUM(value)*1000)::integer AS score
+ plpy.execute("""INSERT INTO """ + viterbi_rtbl + """(seg_text, label, score)
+ SELECT seg_text,label,(SUM(value)*1000)::integer AS score
FROM rtbl
GROUP BY seg_text,label;""")
diff --git a/src/ports/greenplum/modules/compatibility/compatibility.sql_in b/src/ports/greenplum/modules/compatibility/compatibility.sql_in
index b670790..c297f21 100644
--- a/src/ports/greenplum/modules/compatibility/compatibility.sql_in
+++ b/src/ports/greenplum/modules/compatibility/compatibility.sql_in
@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------------- *//**
+/* ----------------------------------------------------------------------- *//**
*
* @file compatibility.sql_in
*
@@ -22,21 +22,24 @@
- <tt>CREATE TABLE <em>table_name</em> AS <em>query</em></tt> statements where
<em>query</em> contains certain MADlib functions fails with the error
- “function cannot execute on segment because it issues a non-SELECT statement”.
+ “function cannot execute on segment because it issues a non-SELECT statement”
+ (on Greenplum before version 4.2).
The workaround is:
<pre>SELECT \ref create_table_as('<em>table_name</em>', $$
<em>query</em>
$$, 'BY (<em>column</em>, [...]) | RANDOMLY');</pre>
- <tt>INSERT INTO <em>table_name</em> <em>query</em></tt> where <em>query</em>
contains certain MADlib functions fails with the error “function cannot
- execute on segment because it issues a non-SELECT statement”. The workaround
- is:
+ execute on segment because it issues a non-SELECT statement” (on Greenplum
+ before version 4.2). The workaround is:
<pre>SELECT \ref insert_into('<em>table_name</em>', $$
<em>query</em>
$$);</pre>
@note
-These functions are not installed on other DBMSs (and not needed there).
+These functions are not installed on other DBMSs (and not needed there). On
+Greenplum 4.2 and later, they are installed only for backward compatibility,
+but not otherwise needed.
Workarounds should be used only when necessary. For portability and best
performance, standard SQL should be prefered whenever possible.
@@ -77,7 +80,7 @@
* <pre>SELECT insert_into('<em>table_name</em>', $$
* <em>query</em>
*$$);</pre>
- *
+ *
* @examp
* <pre>SELECT insert_into('public.test', $$
* SELECT * FROM generate_series(1,10) AS id
@@ -109,7 +112,7 @@
EXECUTE 'SET client_min_messages TO warning';
PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
-
+
EXECUTE
'DROP FUNCTION IF EXISTS pg_temp._madlib_temp_function();
CREATE FUNCTION pg_temp._madlib_temp_function()
@@ -158,16 +161,16 @@
ELSE
whatToCreate := 'TABLE';
END IF;
-
+
-- We separate the following EXECUTE statement because it is prone
-- to generate an exception -- e.g., if the table already exists
-- In that case we want to keep the context in the error message short
EXECUTE
- 'CREATE ' || whatToCreate || ' ' || "inTableName" || ' AS
+ 'CREATE ' || whatToCreate || ' ' || "inTableName" || ' AS
SELECT * FROM (' || "inSQL" || ') AS _madlib_ignore
WHERE FALSE
DISTRIBUTED ' || "inDistributed";
-
+
PERFORM MADLIB_SCHEMA.insert_into("inTableName", "inSQL");
END;
$$;
@@ -189,7 +192,7 @@
* <pre>SELECT create_table_as('<em>table_name</em>', $$
* <em>query</em>
*$$, 'BY (<em>column</em>, [...]) | RANDOMLY');</pre>
- *
+ *
* @examp
* <pre>SELECT create_table_as('public.test', $$
* SELECT * FROM generate_series(1,10) AS id
@@ -204,7 +207,7 @@
* Known caveats of this workaround:
* - For queries returning a large number of rows, this function will be
* significantly slower than the <tt>CREATE TABLE AS</tt> statement.
- */
+ */
CREATE FUNCTION MADLIB_SCHEMA.create_table_as(
"inTableName" VARCHAR,
"inSQL" VARCHAR,
diff --git a/src/ports/postgres/modules/regress/linear.sql_in b/src/ports/postgres/modules/regress/linear.sql_in
index d24181e..a23ac56 100644
--- a/src/ports/postgres/modules/regress/linear.sql_in
+++ b/src/ports/postgres/modules/regress/linear.sql_in
@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------------- *//**
+/* ----------------------------------------------------------------------- *//**
*
* @file linear.sql_in
*
@@ -110,7 +110,7 @@
\f[
\frac{\max_{\| z \|_2 = 1} \| X z \|_2}{\min_{\| z \|_2 = 1} \| X z \|_2} .
\f]
-The condition number of a problem is a worst-case measure of sensitive the
+The condition number of a problem is a worst-case measure of how sensitive the
result is to small perturbations of the input. A large condition number (say,
more than 1000) indicates the presence of significant multicollinearity.
@@ -175,31 +175,31 @@
-# You can call the linregr() function for an individual metric:
\verbatim
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).coef FROM houses;
- coef
+ coef
------------------------------------------------------------------------
{27923.4334170641,-35524.7753390234,2269.34393735323,130.793920208133}
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).r2 FROM houses;
- r2
+ r2
-------------------
0.745374010140315
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).std_err FROM houses;
- std_err
+ std_err
----------------------------------------------------------------------
{56306.4821787474,25036.6537279169,22208.6687270562,36.208642285651}
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).t_stats FROM houses;
- t_stats
+ t_stats
------------------------------------------------------------------------
{0.495918628487924,-1.41891067892239,0.10218279921428,3.6122293450358}
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).p_values FROM houses;
- p_values
+ p_values
-----------------------------------------------------------------------------
{0.629711069315512,0.183633155781461,0.920450514073051,0.00408159079312354}
(1 row)
@@ -305,7 +305,7 @@
CREATE AGGREGATE MADLIB_SCHEMA.linregr(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[]) (
-
+
SFUNC=MADLIB_SCHEMA.linregr_transition,
STYPE=float8[],
FINALFUNC=MADLIB_SCHEMA.linregr_final,
diff --git a/src/ports/postgres/modules/utilities/utilities.sql_in b/src/ports/postgres/modules/utilities/utilities.sql_in
index 5db5e92..fb10476 100644
--- a/src/ports/postgres/modules/utilities/utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/utilities.sql_in
@@ -110,11 +110,12 @@
/**
* @brief Check if a floating-point number is NaN (not a number)
*
- * This function exists for portability. Some DBMSs like PostgreSQL make
- * floating-point numbers a fully ordered set -- contrary to IEEE 754.
- * http://www.postgresql.org/docs/current/static/datatype-numeric.html#DATATYPE-FLOAT
- * For portability, MADlib code should not make use of such "features" directly,
- * but instead only use isnan() instead.
+ * This function exists for portability. Some DBMSs like PostgreSQL treat
+ * floating-point numbers as fully ordered -- contrary to IEEE 754. (See, e.g.,
+ * the <a href=
+ * "http://www.postgresql.org/docs/current/static/datatype-numeric.html#DATATYPE-FLOAT"
+ * >PostgreSQL documentation</a>. For portability, MADlib code should not make
+ * use of such "features" directly, but only use isnan() instead.
*
* @param number
* @returns \c TRUE if \c number is \c NaN, \c FALSE otherwise