| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * @file porter_stemmer.sql_in |
| * |
| * @brief implementation of porter stemmer operations in SQL |
| * @date September 2015 |
| * |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_stemmer |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#notes">Implementation Notes</a></li> |
| <li><a href="#list">List of Stemmer Operations</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| @brief Provides porter stemmer operations supporting other MADlib modules. |
| |
| This module provides a basic stemming operation for text input. |
| It is a support module for several machine learning algorithms that |
| require a stemmer. Currently, it only supports English words. |
| |
| This function is a SQL interface to the implementation of the |
| <a href="http://tartarus.org/~martin/PorterStemmer/">Porter Stemming Algorithm</a>. |
| The original stemming algorithm is written and maintained by Martin Porter. |
| |
| @anchor notes |
| @par Implementation Notes |
| |
| All functions described in this module work with text OR text array. |
| |
| Several of the functions require TEXT values and return NULL for a NULL input. |
| See details in description of individual functions. |
| |
| @anchor list |
| @par Stemmer Operations |
| <table class="output"> |
| <tr><th>stem_token()</th><td> Returns the stem of the token. Returns NULL if input is NULL.</td></tr> |
| |
| <tr><th>stem_token_arr()</th><td> Returns an array holding the stem of each token in the input array. |
| The stem is NULL for each corresponding NULL token.</td></tr> |
| </table> |
| |
| @anchor examples |
| @examp |
| |
| -# Create a table with some words to be stemmed. |
| <pre class="example"> |
| CREATE TABLE token_tbl ( id integer, |
| word text |
| ); |
| INSERT INTO token_tbl VALUES |
| (1, 'kneel'), |
| (2, 'kneeled'), |
| (3, 'kneeling'), |
| (4, 'kneels'), |
| (5, 'knees'), |
| (6, 'knell'), |
| (7, 'knelt'), |
| (8, 'knew'), |
| (9, 'knick'), |
| (10, 'knif'), |
| (11, 'knife'), |
| (12, 'knight'), |
| (13, 'knightly'), |
| (14, 'knights'), |
| (15, 'knit'), |
| (16, 'knits'), |
| (17, 'knitted'), |
| (18, 'knitting'), |
| (19, 'knives'), |
| (20, 'knob'), |
| (21, 'knobs'), |
| (22, 'knock'), |
| (23, 'knocked'), |
| (24, 'knocker'), |
| (25, 'knockers'), |
| (26, 'knocking'), |
| (27, 'knocks'), |
| (28, 'knopp'), |
| (29, 'knot'), |
| (30, 'knots'); |
| </pre> |
| |
| -# Return the stem words |
| <pre class="example"> |
| SELECT madlib.stem_token(word) FROM token_tbl ORDER BY id; |
| </pre> |
| <pre class="result"> |
| stem_token |
| ------------ |
| kneel |
| kneel |
| kneel |
| kneel |
| knee |
| knell |
| knelt |
| knew |
| knick |
| knif |
| knife |
| knight |
| knight |
| knight |
| knit |
| knit |
| knit |
| knit |
| knive |
| knob |
| knob |
| knock |
| knock |
| knocker |
| knocker |
| knock |
| knock |
| knopp |
| knot |
| knot |
| (30 rows) |
| </pre> |
| |
| -# The input can be processed as an array |
| <pre class="example"> |
| SELECT madlib.stem_token_arr(array_agg(word order by word)) FROM token_tbl; |
| </pre> |
| <pre class="result"> |
| stem_token_arr |
| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| {kneel,kneel,kneel,kneel,knee,knell,knelt,knew,knick,knif,knife,knight,knight,knight,knit,knit,knit,knit,knive,knob,knob,knock,knock,knocker,knocker,knock,knock,knopp,knot,knot} |
| (1 row) |
| </pre> |
| |
| @anchor related |
| @par Related Topics |
| |
| File porter_stemmer.sql_in for list of functions and usage. |
| */ |
| |
| /** |
| * @brief Stems a single token. |
| * |
| * Declared as an IMMUTABLE C-language function bound to the C symbol |
| * stem_token in the module library. |
| * |
| * @param token Token to stem, given as a text value. |
| * @returns Stem of the token; NULL when the token is NULL. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stem_token( |
|     token TEXT |
| ) |
| RETURNS TEXT |
| AS 'MODULE_PATHNAME', 'stem_token' |
| LANGUAGE C |
| IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| /** |
| * @brief Stems every token of a text array. |
| * |
| * Declared as an IMMUTABLE C-language function bound to the C symbol |
| * stem_token_arr in the module library. |
| * |
| * @param token_arr Array of tokens to stem, given as text[]. |
| * @returns Array of stems for the input tokens; an element is NULL where |
| *          the corresponding input token is NULL. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stem_token_arr( |
|     token_arr TEXT[] |
| ) |
| RETURNS TEXT[] |
| AS 'MODULE_PATHNAME', 'stem_token_arr' |
| LANGUAGE C |
| IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /** |
| * @brief Online help for the stemmer functions. |
| * |
| * Takes no arguments and builds its result from a fixed template, so it is |
| * declared IMMUTABLE. |
| * |
| * @returns Usage message covering stem_token() and stem_token_arr(). |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stem_token() |
| RETURNS TEXT |
| AS $$ |
| # Help-text template; the schema_madlib placeholder is filled in on return. |
| help_text = """ |
| ------------------------------------------------------------------ |
| SUMMARY |
| ------------------------------------------------------------------ |
| Stemming: Reducing a word to its stem or root form. |
| This function is a SQL interface to the implementation of the |
| Porter Stemming Algorithm (http://tartarus.org/~martin/PorterStemmer/). |
| |
| ------------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------------ |
| We provide two forms of the stem function: |
| |
| 1. Text input: |
| SELECT {schema_madlib}.stem_token(input_token) from input_table; |
| |
| 2. Array input: |
| SELECT {schema_madlib}.stem_token_arr(input_token_array) from input_table; |
| |
| Here the 'input_token' is of type TEXT and input_token_array is of type TEXT[]. |
| |
| ------------------------------------------------------------------ |
| EXAMPLE |
| ------------------------------------------------------------------ |
| CREATE TABLE token_tbl ( id integer, |
| word text |
| ); |
| |
| INSERT INTO token_tbl VALUES |
| (1, 'kneel'), (2, 'kneeled'), (3, 'kneeling'), (4, 'kneels'), (5, 'knees'), |
| (6, 'knell'), (7, 'knelt'), (8, 'knew'), (9, 'knick'), (10, 'knif'), |
| (11, 'knife'), (12, 'knight'), (13, 'knightly'), (14, 'knights'), (15, 'knit'), |
| (16, 'knits'), (17, 'knitted'), (18, 'knitting'), (19, 'knives'), (20, 'knob'), |
| (21, 'knobs'), (22, 'knock'), (23, 'knocked'), (24, 'knocker'), (25, 'knockers'), |
| (26, 'knocking'), (27, 'knocks'), (28, 'knopp'), (29, 'knot'), (30, 'knots'); |
| |
| -- text input |
| SELECT id, {schema_madlib}.stem_token(word) FROM token_tbl; |
| |
| -- array input |
| SELECT {schema_madlib}.stem_token_arr(array_agg(word order by id)) FROM token_tbl; |
| """ |
| return help_text.format(schema_madlib='MADLIB_SCHEMA') |
| $$ |
| LANGUAGE PLPYTHONU |
| IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| /** |
| * @brief Online help for stem_token_arr(). |
| * |
| * Delegates to the no-argument stem_token() so that both entry points |
| * return the same usage message. Declared IMMUTABLE to agree with that |
| * function: the result is a constant string that reads no table data, so |
| * the default VOLATILE classification would be needlessly pessimistic. |
| * |
| * @returns Usage message shared with stem_token(). |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stem_token_arr() |
| RETURNS TEXT AS $$ |
| SELECT MADLIB_SCHEMA.stem_token(); |
| $$ LANGUAGE SQL IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |