/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* @file porter_stemmer.sql_in
*
* @brief implementation of porter stemmer operations in SQL
* @date September 2015
*
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_stemmer
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#notes">Implementation Notes</a></li>
<li><a href="#list">List of Stemmer Operations</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Provides porter stemmer operations supporting other MADlib modules.
This module provides a basic stemming operation for text input.
It is a support module for several machine learning algorithms that
require a stemmer. Currently, it only supports English words.
This function is a SQL interface to the implementation of the
<a href="http://tartarus.org/~martin/PorterStemmer/">Porter Stemming Algorithm</a>.
The original stemming algorithm is written and maintained by Martin Porter.
@anchor notes
@par Implementation Notes
All functions described in this module work with text or text array input.
Several of the functions require TEXT values and return NULL for a NULL input.
See details in description of individual functions.
@anchor list
@par Stemmer Operations
<table class="output">
<tr><th>stem_token()</th><td> Returns the stem of the token. Returns NULL if input is NULL.</td></tr>
<tr><th>stem_token_arr()</th><td> Returns an array holding the stem of each token in the input token array.
The stem is NULL for each corresponding NULL token.</td></tr>
</table>
@anchor examples
@examp
-# Create a table with some words to be stemmed.
<pre class="example">
CREATE TABLE token_tbl ( id integer,
word text
);
INSERT INTO token_tbl VALUES
(1, 'kneel'),
(2, 'kneeled'),
(3, 'kneeling'),
(4, 'kneels'),
(5, 'knees'),
(6, 'knell'),
(7, 'knelt'),
(8, 'knew'),
(9, 'knick'),
(10, 'knif'),
(11, 'knife'),
(12, 'knight'),
(13, 'knightly'),
(14, 'knights'),
(15, 'knit'),
(16, 'knits'),
(17, 'knitted'),
(18, 'knitting'),
(19, 'knives'),
(20, 'knob'),
(21, 'knobs'),
(22, 'knock'),
(23, 'knocked'),
(24, 'knocker'),
(25, 'knockers'),
(26, 'knocking'),
(27, 'knocks'),
(28, 'knopp'),
(29, 'knot'),
(30, 'knots');
</pre>
-# Return the stem words
<pre class="example">
SELECT madlib.stem_token(word) FROM token_tbl;
</pre>
<pre class="result">
stem_token
&nbsp;------------
kneel
kneel
kneel
kneel
knee
knell
knelt
knew
knick
knif
knife
knight
knight
knight
knit
knit
knit
knit
knive
knob
knob
knock
knock
knocker
knocker
knock
knock
knopp
knot
knot
(30 rows)
</pre>
-# The input can be processed as an array
<pre class="example">
SELECT madlib.stem_token_arr(array_agg(word order by word)) FROM token_tbl;
</pre>
<pre class="result">
stem_token_arr
&nbsp;-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
{kneel,kneel,kneel,kneel,knee,knell,knelt,knew,knick,knif,knife,knight,knight,knight,knit,knit,knit,knit,knive,knob,knob,knock,knock,knocker,knocker,knock,knock,knopp,knot,knot}
(1 row)
</pre>
@anchor related
@par Related Topics
File porter_stemmer.sql_in for list of functions and usage.
*/
/**
 * @brief Stems a single token using the Porter stemming algorithm.
 *
 * The work is done by the C symbol stem_token exported from the module
 * shared library; this statement only registers the SQL entry point.
 *
 * @param token Text token to be stemmed.
 * @returns Stem of the token. Returns NULL if the input token is NULL.
 *
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stem_token(
    token TEXT
)
RETURNS TEXT
AS 'MODULE_PATHNAME', 'stem_token'
LANGUAGE C
IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/**
 * @brief Stems every token of a text array using the Porter stemming algorithm.
 *
 * The work is done by the C symbol stem_token_arr exported from the module
 * shared library; this statement only registers the SQL entry point.
 *
 * @param token_arr Array of text tokens to be stemmed.
 * @returns Array of stems, one per input token. A NULL input token yields
 *          a NULL element at the corresponding position.
 *
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stem_token_arr(
    token_arr TEXT[]
)
RETURNS TEXT[]
AS 'MODULE_PATHNAME', 'stem_token_arr'
LANGUAGE C
IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-- Online help: the zero-argument overload of stem_token returns a usage
-- message (summary, usage and a worked example) instead of stemming anything.
-- The schema placeholder in the message is filled in with the install schema.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stem_token()
RETURNS TEXT
AS $$
help_template = """
------------------------------------------------------------------
SUMMARY
------------------------------------------------------------------
Stemming: Reducing a word to its stem or root form.
This function is a SQL interface to the implementation of the
Porter Stemming Algorithm (http://tartarus.org/~martin/PorterStemmer/).
------------------------------------------------------------------
USAGE
------------------------------------------------------------------
We provide two forms of the stem function:
1. Text input:
SELECT {schema_madlib}.stem_token(input_token) from input_table;
2. Array input:
SELECT {schema_madlib}.stem_token_arr(input_token_array) from input_table;
Here the 'input_token' is of type TEXT and input_token_array is of type TEXT[].
------------------------------------------------------------------
EXAMPLE
------------------------------------------------------------------
CREATE TABLE token_tbl ( id integer,
word text
);
INSERT INTO token_tbl VALUES
(1, 'kneel'), (2, 'kneeled'), (3, 'kneeling'), (4, 'kneels'), (5, 'knees'),
(6, 'knell'), (7, 'knelt'), (8, 'knew'), (9, 'knick'), (10, 'knif'),
(11, 'knife'), (12, 'knight'), (13, 'knightly'), (14, 'knights'), (15, 'knit'),
(16, 'knits'), (17, 'knitted'), (18, 'knitting'), (19, 'knives'), (20, 'knob'),
(21, 'knobs'), (22, 'knock'), (23, 'knocked'), (24, 'knocker'), (25, 'knockers'),
(26, 'knocking'), (27, 'knocks'), (28, 'knopp'), (29, 'knot'), (30, 'knots');
-- text input
SELECT id, {schema_madlib}.stem_token(word) FROM token_tbl;
-- array input
SELECT {schema_madlib}.stem_token_arr(array_agg(word order by id)) FROM token_tbl;
"""
return help_template.format(schema_madlib='MADLIB_SCHEMA')
$$
LANGUAGE PLPYTHONU
IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
-- Online help: the zero-argument overload of stem_token_arr returns the same
-- usage message as the zero-argument stem_token, which it simply delegates to.
-- Declared IMMUTABLE to match that function: the result is constant text, and
-- without the marker the function would default to VOLATILE.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stem_token_arr()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.stem_token();
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');