| /* ------------------------------------------------------------------------ |
| * @file table_to_pmml.sql_in |
| * |
| * @brief Convert model table from MADlib to PMML |
| * @date June 2014 |
| * |
| * @sa PMML Export module description \ref grp_pmml. |
| * |
| *//* -------------------------------------------------------------------*/ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_pmml |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#function">PMML Export Function</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#background">Background</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| @brief Implements the PMML XML standard to describe and exchange models |
| produced by data mining and machine learning algorithms. |
| |
| |
| @anchor function |
| @par PMML Export Function |
| The PMML export function in MADlib has the following syntax: |
| <pre class="syntax"> |
| pmml ( model_table, |
| name_spec |
| ) |
| </pre> |
| \b Arguments |
| <DL class="arglist"> |
| <DT>model_table</DT> |
| <DD>VARCHAR. The name of the table containing the model.</DD> |
| |
| <DT>name_spec (optional)</DT> |
| <DD>VARCHAR or VARCHAR[]. Names to be used in the Data Dictionary of the PMML. |
| See pmml() for detailed explanation.</DD> |
| </DL> |
| |
| \b Output |
| <DD>XML. The output of this function is a standard PMML document, some |
| examples of which are covered in the next section.</DD> |
| </DL> |
| |
| <DD>@note In PostgreSQL, users may be required to install their database |
| with XML support in order to use this function.</DD> |
| |
| Usually the user wants to export the resulting PMML contents into a PMML file |
| so that external softwares can use it. The following method can be used |
| (Note: the user needs to use unaligned table output mode for psql with '-A' |
| flag. And inside psql client, both '\\t' and '\\o' should be used): |
| |
| <pre class="example"> |
| \> \# under bash |
| \> psql -A my_database |
| \# -- in psql now |
| \# \\t |
| \# \\o test.pmml -- export to a file |
| \# select madlib.pmml('tree_out'); |
| \# \\o |
| \# \\t |
| </pre> |
| |
| @anchor examples |
| @examp |
| -# Create the training data table. |
| <pre class="example"> |
| CREATE TABLE patients( id integer NOT NULL, |
| second_attack integer, |
| treatment integer, |
| trait_anxiety integer); |
| INSERT INTO patients(id, second_attack, treatment, trait_anxiety) VALUES |
| ( 1, 1, 1, 70), |
| ( 3, 1, 1, 50), |
| ( 5, 1, 0, 40), |
| ( 7, 1, 0, 75), |
| ( 9, 1, 0, 70), |
| (11, 0, 1, 65), |
| (13, 0, 1, 45), |
| (15, 0, 1, 40), |
| (17, 0, 0, 55), |
| (19, 0, 0, 50), |
| ( 2, 1, 1, 80), |
| ( 4, 1, 0, 60), |
| ( 6, 1, 0, 65), |
| ( 8, 1, 0, 80), |
| (10, 1, 0, 60), |
| (12, 0, 1, 50), |
| (14, 0, 1, 35), |
| (16, 0, 1, 50), |
| (18, 0, 0, 45), |
| (20, 0, 0, 60); |
| </pre> |
| -# Train a regression model using logregr_train(). |
| <pre class="example"> |
| SELECT madlib.logregr_train( |
| 'patients', |
| 'patients_logregr', |
| 'second_attack', |
| 'ARRAY[1, treatment, trait_anxiety]'); |
| </pre> |
| -# View the PMML export for this model. |
| <pre class="example"> |
| SELECT madlib.pmml('patients_logregr'); |
| </pre> |
| Result: |
| <pre class="result"> |
| <?xml version="1.0" standalone="yes"?> |
| <PMML version="4.1" xmlns="http://www.dmg.org/pmml-v4-1.html"> |
| <Header copyright="redacted for this example"> |
| <Extension extender="MADlib" name="user" value="gpadmin"/> |
| <Application name="MADlib" version="1.7"/> |
| <Timestamp> |
| 2014-06-13 17:30:14.527899 PDT |
| </Timestamp> |
| </Header> |
| <DataDictionary numberOfFields="4"> |
| <DataField dataType="boolean" name="second_attack_pmml_prediction" optype="categorical"/> |
| <DataField dataType="double" name="1" optype="continuous"/> |
| <DataField dataType="double" name="treatment" optype="continuous"/> |
| <DataField dataType="double" name="trait_anxiety" optype="continuous"/> |
| </DataDictionary> |
| <RegressionModel functionName="classification" normalizationMethod="softmax"> |
| <MiningSchema> |
| <MiningField name="second_attack_pmml_prediction" usageType="predicted"/> |
| <MiningField name="1"/> |
| <MiningField name="treatment"/> |
| <MiningField name="trait_anxiety"/> |
| </MiningSchema> |
| <RegressionTable intercept="0.0" targetCategory="True"> |
| <NumericPredictor coefficient="-6.36346994178" name="1"/> |
| <NumericPredictor coefficient="-1.02410605239" name="treatment"/> |
| <NumericPredictor coefficient="0.119044916669" name="trait_anxiety"/> |
| </RegressionTable> |
| <RegressionTable intercept="0.0" targetCategory="False"/> |
| </RegressionModel> |
| </PMML> |
| </pre> |
| |
| Alternatively, the above can also be invoked as below if custom names are needed |
| for fields in the Data Dictionary: |
| <pre class="example"> |
| SELECT madlib.pmml('patients_logregr', |
| 'out_attack~1+in_trait_anxiety+in_treatment'); |
| </pre> |
| |
| \b Note: If the second argument of 'pmml' function is not specified, a default suffix "_pmml_prediction" will be automatically append to the column name to be predicted. This can help avoid name conflicts. |
| |
| The following example demonstrates grouping columns in the model table for the same dataset as the previous example. |
| |
| -# Train a different regression model with 'treatment' as the grouping column. |
| <pre class="example"> |
| SELECT madlib.logregr_train( |
| 'patients', |
| 'patients_logregr_grouping', |
| 'second_attack', |
| 'ARRAY[1, trait_anxiety]', |
| 'treatment'); |
| </pre> |
| -# View the PMML export for this model. |
| <pre class="example"> |
| SELECT madlib.pmml('patients_logregr_grouping', |
| ARRAY['second_attack','1','in_trait_anxiety']); |
| </pre> |
| Result: |
| <pre class="result"> |
| <?xml version="1.0" standalone="yes"?> |
| <PMML version="4.1" xmlns="http://www.dmg.org/pmml-v4-1.html"> |
| <Header copyright="redacted for this example"> |
| <Extension extender="MADlib" name="user" value="gpadmin"/> |
| <Application name="MADlib" version="1.7"/> |
| <Timestamp> |
| 2014-06-13 17:37:55.786307 PDT |
| </Timestamp> |
| </Header> |
| <DataDictionary numberOfFields="4"> |
| <DataField dataType="boolean" name="second_attack" optype="categorical"/> |
| <DataField dataType="double" name="1" optype="continuous"/> |
| <DataField dataType="double" name="in_trait_anxiety" optype="continuous"/> |
| <DataField dataType="string" name="treatment" optype="categorical"/> |
| </DataDictionary> |
| <MiningModel functionName="classification"> |
| <MiningSchema> |
| <MiningField name="second_attack" usageType="predicted"/> |
| <MiningField name="1"/> |
| <MiningField name="in_trait_anxiety"/> |
| <MiningField name="treatment"/> |
| </MiningSchema> |
| <Segmentation multipleModelMethod="selectFirst"> |
| <Segment> |
| <SimplePredicate field="treatment" operator="equal" value="1"/> |
| <RegressionModel functionName="classification" normalizationMethod="softmax"> |
| <MiningSchema> |
| <MiningField name="second_attack" usageType="predicted"/> |
| <MiningField name="1"/> |
| <MiningField name="in_trait_anxiety"/> |
| </MiningSchema> |
| <RegressionTable intercept="0.0" targetCategory="True"> |
| <NumericPredictor coefficient="-8.02068430057" name="1"/> |
| <NumericPredictor coefficient="0.130090428526" name="in_trait_anxiety"/> |
| </RegressionTable> |
| <RegressionTable intercept="0.0" targetCategory="False"/> |
| </RegressionModel> |
| </Segment> |
| <Segment> |
| <SimplePredicate field="treatment" operator="equal" value="0"/> |
| <RegressionModel functionName="classification" normalizationMethod="softmax"> |
| <MiningSchema> |
| <MiningField name="second_attack" usageType="predicted"/> |
| <MiningField name="1"/> |
| <MiningField name="in_trait_anxiety"/> |
| </MiningSchema> |
| <RegressionTable intercept="0.0" targetCategory="True"> |
| <NumericPredictor coefficient="-5.75043192191" name="1"/> |
| <NumericPredictor coefficient="0.108282446319" name="in_trait_anxiety"/> |
| </RegressionTable> |
| <RegressionTable intercept="0.0" targetCategory="False"/> |
| </RegressionModel> |
| </Segment> |
| </Segmentation> |
| </MiningModel> |
| </PMML> |
| </pre> |
| |
| \b Note: MADlib currently supports PMML export for Linear Regression, |
| Logistic Regression, Generalized Linear Regression Model, Multinomial Logistic |
| Regression, Ordinal Linear Regression, Decision Tree and Random Forests. |
| |
| In Ordinal Regression, the signs of feature coefficients will be different in |
| PMML export and in the default output model table from ordinal(). This is due |
| to the difference of model settings. |
| |
| @anchor background |
| @par Background |
| The Predictive Model Markup Language (PMML) is an XML-based file format that |
| provides a way for applications to describe and exchange models produced by |
| data mining and machine learning algorithms. A PMML file comprises the |
| following components: |
| - Header: Contains general information of the model, such as copyright |
| information and model description. |
| - Data Dictionary: Contains definitions of fields used in the model. |
| - Data Transformations: Contains transformations for mapping user data into |
| a form that can be used by the mining model. |
| - Model: Contains definitions of the data mining model, which includes |
| attributes such as the model name, function name, and algorithm name. |
| - Mining Schema: Contains specific information for the fields used in |
| the model, which includes the name and usage type. |
| - Targets: Allows for post-processing of the predicted value. |
| - Output: Allows for naming of output fields expected from the model. |
| |
| MADlib follows the PMML v4.1 standard. For more details about PMML, see |
| http://www.dmg.org/v4-1/GeneralStructure.html. |
| |
| |
| @anchor related |
| @par Related Topics |
| |
| File table_to_pmml.sql_in documenting the PMML export functions. |
| |
| \ref grp_linreg |
| |
| \ref grp_logreg |
| |
| \ref grp_glm |
| |
| \ref grp_ordinal |
| |
| \ref grp_multinom |
| |
| \ref grp_decision_tree |
| |
| \ref grp_random_forest |
| |
| */ |
| |
| |
| /** |
| * @brief Given the model constructed from a data mining algorithm, |
| * this function converts the model into PMML format. |
| * |
| * @param model_table Model table that contains the output of a training algorithm. |
| * |
| * @return An XML document corresponding to the PMML of the input model. |
| * |
| * The names in the Data Dictionary are generated based on the names |
| * used in the model table. |
| * If the fields are represented as an array of expressions without array |
| * concatenation, the names in the Data Dictionary will match the source table, |
| * except that a suffix '_pmml_prediction' is added to the predicted variable. |
| * If the field is an array of FLOAT8 values, the names in the Data |
| * Dictionary will be indexed using the array name, like 'ind[1]', 'ind[2]', etc. |
| * In all other cases, the names will simply be 'x[1]', 'x[2]', and so on. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pmml( |
| model_table varchar |
| ) RETURNS xml AS $$ |
| PythonFunction(pmml, table_to_pmml, table_to_pmml) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| /** |
| * @brief Given the model constructed from a data mining algorithm, |
| * this function converts the model into PMML format. |
| * |
| * @param model_table Model table that contains the output of a training algorithm. |
| * @param name_spec Names to be used in the Data Dictionary, given as |
| * a string containing an expression such as 'y ~ x1 + x2 + x3'. |
| * |
| * @return An XML document corresponding to the PMML of the input model. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pmml( |
| model_table varchar |
| , name_spec varchar |
| ) RETURNS xml AS $$ |
| PythonFunction(pmml, table_to_pmml, table_to_pmml) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| /** |
| * @brief Given the model constructed from a data mining algorithm, |
| * this function converts the model into PMML format. |
| * |
| * @param model_table Model table that contains the output of a training algorithm. |
| * @param name_spec Names to be used in the Data Dictionary given as |
| * an array of strings, such as ARRAY['y', 'x1', 'x2', ...]. |
| * |
| * @return An XML document corresponding to the PMML of the input model. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pmml( |
| model_table varchar |
| , name_spec varchar[] |
| ) RETURNS xml AS $$ |
| PythonFunction(pmml, table_to_pmml, table_to_pmml) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| -- Help messages ------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pmml() |
| RETURNS TEXT AS $$ |
| PythonFunction(pmml, table_to_pmml, pmml_help_msg) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |