/* ----------------------------------------------------------------------- *//**
*
* @file text_utilities.sql_in
*
* @brief SQL functions for carrying out routine text operations
*
* @sa For a brief overview of utility functions, see the
* module description \ref grp_utilities.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_text_utilities
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#function_syntax">Function Syntax</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Provides a collection of functions for performing common
tasks related to text analytics.
Term frequency computes the number of times that a word
or term occurs in a document. Term frequency is often
used as part of a larger text processing pipeline, which may
include operations such as stemming, stop word removal
and topic modelling.
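The value reported for each word is the raw count: for a word \f$w\f$ and a
document \f$d\f$ given as a word array \f$(w_1, \ldots, w_n)\f$,
\f[
    \mathrm{tf}(w, d) = \left|\{\, i : w_i = w \,\}\right|,
\f]
i.e., no normalization by document length is applied. Normalized or weighted
variants, if needed, can be computed from the output table with ordinary SQL.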
@anchor function_syntax
@par Function Syntax
<pre class="syntax">
term_frequency(input_table,
               doc_id_col,
               word_col,
               output_table,
               compute_vocab)
</pre>
\b Arguments:
<dl class="arglist">
<dt>input_table</dt>
<dd>TEXT.
The name of the table containing the documents, with one
document per row.
Each row is in the form &lt;doc_id, word_vector&gt; where \c doc_id is an id
unique to each document, and \c word_vector is a text array containing the
words in the document. The \c word_vector should contain multiple entries of
a word if the document contains multiple occurrences of that word.
</dd>
<dt>doc_id_col</dt>
<dd>TEXT.
The name of the column containing the document id. </dd>
<dt>word_col</dt>
<dd>TEXT.
The name of the column containing the vector of words/terms in the
document. This column should be of a type that can be cast to TEXT[].</dd>
<dt>output_table</dt>
<dd>TEXT.
The name of the table to store the term frequency output.
The output table contains the following columns:
- \c doc_id_col: This is the document id column
(the name will be the same as the one provided as input).
- \c word: Word/term present in a document. Depending on the value
of \c compute_vocab below, this is either the original
word as it appears in \c word_col, or an id representing the word
(in which case the column is named \c wordid, as in the examples below).
Note that word ids start at 0, not 1.
- \c count: The number of times this word is found in the document.
</dd>
<dt>compute_vocab</dt>
<dd>BOOLEAN. (Optional, Default=FALSE)
Flag to indicate if a vocabulary table is to be created. If TRUE, an additional
output table is created containing the vocabulary of all words, with an id
assigned to each word in alphabetical order.
The table is called <em>output_table</em>_vocabulary
(i.e., the suffix <em>_vocabulary</em> is added to the <em>output_table</em> name) and contains the
following columns:
- \c wordid: An id for each word in alphabetical order.
- \c word: The word/term corresponding to the id.
</dd>
</dl>
@anchor examples
@par Examples
-# First we create a document table with one document per row:
<pre class="example">
DROP TABLE IF EXISTS documents;
CREATE TABLE documents(docid INT4, contents TEXT);
INSERT INTO documents VALUES
(0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),
(1, 'Chinchillas and kittens are cute.'),
(2, 'My sister adopted two kittens yesterday.'),
(3, 'Look at this cute hamster munching on a piece of broccoli.');
</pre>
You can apply stemming, stop word removal and tokenization at this point
in order to prepare the documents for text processing.
Depending on your database version, various tools are
available. Databases based on more recent versions of
PostgreSQL (9.6 or later, which provide the tsvector_to_array() function)
can do something like:
<pre class="example">
SELECT tsvector_to_array(to_tsvector('english',contents)) from documents;
</pre>
<pre class="result">
tsvector_to_array
+----------------------------------------------------------
{ate,banana,breakfast,broccoli,eat,like,smoothi,spinach}
{chinchilla,cute,kitten}
{adopt,kitten,sister,two,yesterday}
{broccoli,cute,hamster,look,munch,piec}
(4 rows)
</pre>
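Note that a tsvector keeps each distinct lexeme only once, so repeated
occurrences of a word within a document are collapsed in the arrays above.
If that is acceptable for your application, the stemmed, stop-word-filtered
tokens could be stored directly as the word array, as a rough alternative to
the regexp-based tokenization in the next step (a sketch assuming
tsvector_to_array() is available, i.e. PostgreSQL 9.6 or later):
<pre class="example">
ALTER TABLE documents ADD COLUMN words TEXT[];
UPDATE documents SET words =
    tsvector_to_array(to_tsvector('english', contents));
</pre>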
In this example, we assume a database based on an older
version of PostgreSQL and just perform basic punctuation
removal and tokenization. The array of words is added as
a new column to the documents table:
<pre class="example">
ALTER TABLE documents ADD COLUMN words TEXT[];
UPDATE documents SET words =
    regexp_split_to_array(lower(
        regexp_replace(contents, E'[,.;\\']','', 'g')
    ), E'[\\\\s+]');
\\x on
SELECT * FROM documents ORDER BY docid;
</pre>
<pre class="result">
-[ RECORD 1 ]------------------------------------------------------------------------------------
docid | 0
contents | I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.
words | {i,like,to,eat,broccoli,and,bananas,i,ate,a,banana,and,spinach,smoothie,for,breakfast}
-[ RECORD 2 ]------------------------------------------------------------------------------------
docid | 1
contents | Chinchillas and kittens are cute.
words | {chinchillas,and,kittens,are,cute}
-[ RECORD 3 ]------------------------------------------------------------------------------------
docid | 2
contents | My sister adopted two kittens yesterday.
words | {my,sister,adopted,two,kittens,yesterday}
-[ RECORD 4 ]------------------------------------------------------------------------------------
docid | 3
contents | Look at this cute hamster munching on a piece of broccoli.
words | {look,at,this,cute,hamster,munching,on,a,piece,of,broccoli}
</pre>
-# Compute the frequency of each word in each document:
<pre class="example">
DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
SELECT madlib.term_frequency('documents',    -- input table
                             'docid',        -- document id column
                             'words',        -- vector of words in document
                             'documents_tf'  -- output table
                             );
\\x off
SELECT * FROM documents_tf ORDER BY docid;
</pre>
<pre class="result">
docid | word | count
-------+-------------+-------
0 | a | 1
0 | breakfast | 1
0 | banana | 1
0 | and | 2
0 | eat | 1
0 | smoothie | 1
0 | to | 1
0 | like | 1
0 | broccoli | 1
0 | bananas | 1
0 | spinach | 1
0 | i | 2
0 | ate | 1
0 | for | 1
1 | are | 1
1 | cute | 1
1 | kittens | 1
1 | chinchillas | 1
1 | and | 1
2 | two | 1
2 | yesterday | 1
2 | kittens | 1
2 | sister | 1
2 | my | 1
2 | adopted | 1
3 | this | 1
3 | at | 1
3 | a | 1
3 | broccoli | 1
3 | of | 1
3 | look | 1
3 | hamster | 1
3 | on | 1
3 | piece | 1
3 | cute | 1
3 | munching | 1
(36 rows)
</pre>
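The output is an ordinary table, so it can be queried with standard SQL.
For example, a simple follow-on query (not part of the term_frequency
function itself) listing the most frequent terms in document 0:
<pre class="example">
SELECT word, count
FROM documents_tf
WHERE docid = 0
ORDER BY count DESC, word
LIMIT 5;
</pre>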
-# Next we create a vocabulary of the words
and store a wordid in the output table instead of the
actual word:
<pre class="example">
DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
SELECT madlib.term_frequency('documents',    -- input table
                             'docid',        -- document id column
                             'words',        -- vector of words in document
                             'documents_tf', -- output table
                             TRUE            -- TRUE to also create the vocabulary table
                             );
SELECT * FROM documents_tf ORDER BY docid;
</pre>
\nbsp
<pre class="result">
docid | wordid | count
-------+--------+-------
0 | 17 | 1
0 | 9 | 1
0 | 25 | 1
0 | 12 | 1
0 | 13 | 1
0 | 15 | 2
0 | 0 | 1
0 | 2 | 2
0 | 28 | 1
0 | 5 | 1
0 | 6 | 1
0 | 7 | 1
0 | 8 | 1
0 | 26 | 1
1 | 16 | 1
1 | 11 | 1
1 | 10 | 1
1 | 2 | 1
1 | 3 | 1
2 | 30 | 1
2 | 1 | 1
2 | 16 | 1
2 | 20 | 1
2 | 24 | 1
2 | 29 | 1
3 | 4 | 1
3 | 21 | 1
3 | 22 | 1
3 | 23 | 1
3 | 0 | 1
3 | 11 | 1
3 | 9 | 1
3 | 27 | 1
3 | 14 | 1
3 | 18 | 1
3 | 19 | 1
(36 rows)
</pre>
\nbsp
Note above that the values of \c wordid start
at 0, not 1. The vocabulary table maps each \c wordid to the actual word:
<pre class="example">
SELECT * FROM documents_tf_vocabulary ORDER BY wordid;
</pre>
<pre class="result">
wordid | word
--------+-------------
0 | a
1 | adopted
2 | and
3 | are
4 | at
5 | ate
6 | banana
7 | bananas
8 | breakfast
9 | broccoli
10 | chinchillas
11 | cute
12 | eat
13 | for
14 | hamster
15 | i
16 | kittens
17 | like
18 | look
19 | munching
20 | my
21 | of
22 | on
23 | piece
24 | sister
25 | smoothie
26 | spinach
27 | this
28 | to
29 | two
30 | yesterday
(31 rows)
</pre>
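Since the same ids appear in both tables, the frequency table can be joined
back to the vocabulary to recover the original words. For example (a plain
SQL join, shown here for illustration):
<pre class="example">
SELECT docid, word, count
FROM documents_tf
JOIN documents_tf_vocabulary
USING (wordid)
ORDER BY docid, word;
</pre>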
@anchor related
@par Related Topics
See text_utilities.sql_in for the term frequency SQL function definition
and porter_stemmer.sql_in for the stemmer function.
*/
/**
* @brief Compute the term frequency for each term in a collection of documents
*
* @returns Names of tables created
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.term_frequency(
input_table TEXT,
doc_id_col TEXT,
word_vec_col TEXT,
output_table TEXT,
compute_vocab BOOLEAN)
RETURNS TEXT
AS $$
    PythonFunctionBodyOnly(`utilities', `text_utilities')
    from utilities.control import MinWarning
    with AOControl(False):
        # suppress messages below ERROR level while the output table(s) are built
        with MinWarning("error"):
            return text_utilities.term_frequency(input_table, doc_id_col, word_vec_col,
                                                 output_table, compute_vocab=compute_vocab)
$$
LANGUAGE plpythonu
VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.term_frequency(
input_table TEXT,
doc_id_col TEXT,
word_vec_col TEXT,
output_table TEXT)
RETURNS TEXT
AS $$
SELECT MADLIB_SCHEMA.term_frequency($1, $2, $3, $4, FALSE);
$$
LANGUAGE SQL
VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');