| { |
| "cells": [ |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Term Frequency\n", |
| "Term frequency computes the number of times that a word or term occurs in a document. Term frequency is often used as part of a larger text processing pipeline, which may include operations such as stemming, stop word removal and topic modelling." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 36, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "The sql extension is already loaded. To reload it, use:\n", |
| " %reload_ext sql\n" |
| ] |
| } |
| ], |
| "source": [ |
| "%load_ext sql" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 37, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "data": { |
| "text/plain": [ |
| "u'Connected: fmcquillan@madlib'" |
| ] |
| }, |
| "execution_count": 37, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "# Greenplum 4.3.10.0\n", |
| "# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", |
| " \n", |
| "# PostgreSQL local\n", |
| "%sql postgresql://fmcquillan@localhost:5432/madlib" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 38, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>version</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>MADlib version: 1.13, git revision: unknown, cmake configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'MADlib version: 1.13, git revision: unknown, cmake configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang',)]" |
| ] |
| }, |
| "execution_count": 38, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%sql select madlib.version();\n", |
| "#%sql select version();" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# 1. Prepare documents\n", |
| "First we create a document table with one document per row:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 58, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "Done.\n", |
| "4 rows affected.\n", |
| "4 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>docid</th>\n", |
| " <th>contents</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>Chinchillas and kittens are cute.</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>My sister adopted two kittens yesterday.</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>Look at this cute hamster munching on a piece of broccoli.</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(0, u'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),\n", |
| " (1, u'Chinchillas and kittens are cute.'),\n", |
| " (2, u'My sister adopted two kittens yesterday.'),\n", |
| " (3, u'Look at this cute hamster munching on a piece of broccoli.')]" |
| ] |
| }, |
| "execution_count": 58, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS documents;\n", |
| "CREATE TABLE documents(docid INT4, contents TEXT);\n", |
| "\n", |
| "INSERT INTO documents VALUES\n", |
| "(0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),\n", |
| "(1, 'Chinchillas and kittens are cute.'),\n", |
| "(2, 'My sister adopted two kittens yesterday.'),\n", |
| "(3, 'Look at this cute hamster munching on a piece of broccoli.');\n", |
| "\n", |
| "SELECT * from documents ORDER BY docid;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "You can apply stemming, stop word removal and tokenization at this point in order to prepare the documents for text processing. Depending upon your database version, various tools are available here. Databases based on more recent versions of PostgreSQL may do something like:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 53, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "4 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>tsvector_to_array</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[u'ate', u'banana', u'breakfast', u'broccoli', u'eat', u'like', u'smoothi', u'spinach']</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[u'chinchilla', u'cute', u'kitten']</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[u'adopt', u'kitten', u'sister', u'two', u'yesterday']</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[u'broccoli', u'cute', u'hamster', u'look', u'munch', u'piec']</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[([u'ate', u'banana', u'breakfast', u'broccoli', u'eat', u'like', u'smoothi', u'spinach'],),\n", |
| " ([u'chinchilla', u'cute', u'kitten'],),\n", |
| " ([u'adopt', u'kitten', u'sister', u'two', u'yesterday'],),\n", |
| " ([u'broccoli', u'cute', u'hamster', u'look', u'munch', u'piec'],)]" |
| ] |
| }, |
| "execution_count": 53, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT tsvector_to_array(to_tsvector('english',contents)) from documents;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "In this example, we assume a database based on an older version of PostgreSQL and just perform basic punctuation removal and tokenization. The array of words is added as a new column to the documents table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 59, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "4 rows affected.\n", |
| "4 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>docid</th>\n", |
| " <th>contents</th>\n", |
| " <th>words</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.</td>\n", |
| " <td>[u'i', u'like', u'to', u'eat', u'broccoli', u'and', u'bananas', u'i', u'ate', u'a', u'banana', u'and', u'spinach', u'smoothie', u'for', u'breakfast']</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>Chinchillas and kittens are cute.</td>\n", |
| " <td>[u'chinchillas', u'and', u'kittens', u'are', u'cute']</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>My sister adopted two kittens yesterday.</td>\n", |
| " <td>[u'my', u'sister', u'adopted', u'two', u'kittens', u'yesterday']</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>Look at this cute hamster munching on a piece of broccoli.</td>\n", |
| " <td>[u'look', u'at', u'this', u'cute', u'hamster', u'munching', u'on', u'a', u'piece', u'of', u'broccoli']</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(0, u'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.', [u'i', u'like', u'to', u'eat', u'broccoli', u'and', u'bananas', u'i', u'ate', u'a', u'banana', u'and', u'spinach', u'smoothie', u'for', u'breakfast']),\n", |
| " (1, u'Chinchillas and kittens are cute.', [u'chinchillas', u'and', u'kittens', u'are', u'cute']),\n", |
| " (2, u'My sister adopted two kittens yesterday.', [u'my', u'sister', u'adopted', u'two', u'kittens', u'yesterday']),\n", |
| " (3, u'Look at this cute hamster munching on a piece of broccoli.', [u'look', u'at', u'this', u'cute', u'hamster', u'munching', u'on', u'a', u'piece', u'of', u'broccoli'])]" |
| ] |
| }, |
| "execution_count": 59, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "ALTER TABLE documents ADD COLUMN words TEXT[];\n", |
| "\n", |
| "UPDATE documents SET words = \n", |
| " regexp_split_to_array(lower(\n", |
| " regexp_replace(contents, E'[,.;\\']','', 'g')\n", |
| " ), E'[\\\\s+]');\n", |
| " \n", |
| "SELECT * FROM documents ORDER BY docid;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# 2. Term frequency\n", |
| "Build a histogram for each document:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 60, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "36 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>docid</th>\n", |
| " <th>word</th>\n", |
| " <th>count</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>a</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>breakfast</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>banana</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>and</td>\n", |
| " <td>2</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>eat</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>smoothie</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>to</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>like</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>broccoli</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>bananas</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>spinach</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>i</td>\n", |
| " <td>2</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>ate</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>for</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>are</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>cute</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>kittens</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>chinchillas</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>and</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>two</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>yesterday</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>kittens</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>sister</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>my</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>adopted</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>this</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>at</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>a</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>broccoli</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>of</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>look</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>hamster</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>on</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>piece</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>cute</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>munching</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(0, u'a', 1),\n", |
| " (0, u'breakfast', 1),\n", |
| " (0, u'banana', 1),\n", |
| " (0, u'and', 2),\n", |
| " (0, u'eat', 1),\n", |
| " (0, u'smoothie', 1),\n", |
| " (0, u'to', 1),\n", |
| " (0, u'like', 1),\n", |
| " (0, u'broccoli', 1),\n", |
| " (0, u'bananas', 1),\n", |
| " (0, u'spinach', 1),\n", |
| " (0, u'i', 2),\n", |
| " (0, u'ate', 1),\n", |
| " (0, u'for', 1),\n", |
| " (1, u'are', 1),\n", |
| " (1, u'cute', 1),\n", |
| " (1, u'kittens', 1),\n", |
| " (1, u'chinchillas', 1),\n", |
| " (1, u'and', 1),\n", |
| " (2, u'two', 1),\n", |
| " (2, u'yesterday', 1),\n", |
| " (2, u'kittens', 1),\n", |
| " (2, u'sister', 1),\n", |
| " (2, u'my', 1),\n", |
| " (2, u'adopted', 1),\n", |
| " (3, u'this', 1),\n", |
| " (3, u'at', 1),\n", |
| " (3, u'a', 1),\n", |
| " (3, u'broccoli', 1),\n", |
| " (3, u'of', 1),\n", |
| " (3, u'look', 1),\n", |
| " (3, u'hamster', 1),\n", |
| " (3, u'on', 1),\n", |
| " (3, u'piece', 1),\n", |
| " (3, u'cute', 1),\n", |
| " (3, u'munching', 1)]" |
| ] |
| }, |
| "execution_count": 60, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;\n", |
| "\n", |
| "SELECT madlib.term_frequency('documents', -- input table\n", |
| " 'docid', -- document id\n", |
| " 'words', -- vector of words in document\n", |
| " 'documents_tf' -- output table\n", |
| " );\n", |
| "\n", |
| "SELECT * FROM documents_tf ORDER BY docid;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# 3. Term frequency with vocabulary\n", |
| "\n", |
| "In this example we create a vocabulary of the words and store a wordid in the output table instead of the actual word." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 61, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "36 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>docid</th>\n", |
| " <th>wordid</th>\n", |
| " <th>count</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>17</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>9</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>25</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>12</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>13</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>15</td>\n", |
| " <td>2</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>0</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>2</td>\n", |
| " <td>2</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>28</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>5</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>6</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>7</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>8</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>26</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>16</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>11</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>10</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>2</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>3</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>30</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>1</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>16</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>20</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>24</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>29</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>4</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>21</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>22</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>23</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>0</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>11</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>9</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>27</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>14</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>18</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>19</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(0, 17, 1),\n", |
| " (0, 9, 1),\n", |
| " (0, 25, 1),\n", |
| " (0, 12, 1),\n", |
| " (0, 13, 1),\n", |
| " (0, 15, 2),\n", |
| " (0, 0, 1),\n", |
| " (0, 2, 2),\n", |
| " (0, 28, 1),\n", |
| " (0, 5, 1),\n", |
| " (0, 6, 1),\n", |
| " (0, 7, 1),\n", |
| " (0, 8, 1),\n", |
| " (0, 26, 1),\n", |
| " (1, 16, 1),\n", |
| " (1, 11, 1),\n", |
| " (1, 10, 1),\n", |
| " (1, 2, 1),\n", |
| " (1, 3, 1),\n", |
| " (2, 30, 1),\n", |
| " (2, 1, 1),\n", |
| " (2, 16, 1),\n", |
| " (2, 20, 1),\n", |
| " (2, 24, 1),\n", |
| " (2, 29, 1),\n", |
| " (3, 4, 1),\n", |
| " (3, 21, 1),\n", |
| " (3, 22, 1),\n", |
| " (3, 23, 1),\n", |
| " (3, 0, 1),\n", |
| " (3, 11, 1),\n", |
| " (3, 9, 1),\n", |
| " (3, 27, 1),\n", |
| " (3, 14, 1),\n", |
| " (3, 18, 1),\n", |
| " (3, 19, 1)]" |
| ] |
| }, |
| "execution_count": 61, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;\n", |
| "\n", |
| "SELECT madlib.term_frequency('documents', -- input table\n", |
| " 'docid', -- document id\n", |
| " 'words', -- vector of words in document\n", |
| " 'documents_tf',-- output table\n", |
| " TRUE\n", |
| " );\n", |
| "\n", |
| "SELECT * FROM documents_tf ORDER BY docid;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Note above that a wordid have been generated. The vocabulary table maps wordid to the actual word text:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 62, |
| "metadata": { |
| "scrolled": true |
| }, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "31 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>wordid</th>\n", |
| " <th>word</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>a</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>adopted</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>and</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>are</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>4</td>\n", |
| " <td>at</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>5</td>\n", |
| " <td>ate</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>6</td>\n", |
| " <td>banana</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>7</td>\n", |
| " <td>bananas</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>8</td>\n", |
| " <td>breakfast</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>9</td>\n", |
| " <td>broccoli</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>10</td>\n", |
| " <td>chinchillas</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>11</td>\n", |
| " <td>cute</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>12</td>\n", |
| " <td>eat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>13</td>\n", |
| " <td>for</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>14</td>\n", |
| " <td>hamster</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>15</td>\n", |
| " <td>i</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>16</td>\n", |
| " <td>kittens</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>17</td>\n", |
| " <td>like</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>18</td>\n", |
| " <td>look</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>19</td>\n", |
| " <td>munching</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>20</td>\n", |
| " <td>my</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>21</td>\n", |
| " <td>of</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>22</td>\n", |
| " <td>on</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>23</td>\n", |
| " <td>piece</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>24</td>\n", |
| " <td>sister</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>25</td>\n", |
| " <td>smoothie</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>26</td>\n", |
| " <td>spinach</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>27</td>\n", |
| " <td>this</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>28</td>\n", |
| " <td>to</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>29</td>\n", |
| " <td>two</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>30</td>\n", |
| " <td>yesterday</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(0, u'a'),\n", |
| " (1, u'adopted'),\n", |
| " (2, u'and'),\n", |
| " (3, u'are'),\n", |
| " (4, u'at'),\n", |
| " (5, u'ate'),\n", |
| " (6, u'banana'),\n", |
| " (7, u'bananas'),\n", |
| " (8, u'breakfast'),\n", |
| " (9, u'broccoli'),\n", |
| " (10, u'chinchillas'),\n", |
| " (11, u'cute'),\n", |
| " (12, u'eat'),\n", |
| " (13, u'for'),\n", |
| " (14, u'hamster'),\n", |
| " (15, u'i'),\n", |
| " (16, u'kittens'),\n", |
| " (17, u'like'),\n", |
| " (18, u'look'),\n", |
| " (19, u'munching'),\n", |
| " (20, u'my'),\n", |
| " (21, u'of'),\n", |
| " (22, u'on'),\n", |
| " (23, u'piece'),\n", |
| " (24, u'sister'),\n", |
| " (25, u'smoothie'),\n", |
| " (26, u'spinach'),\n", |
| " (27, u'this'),\n", |
| " (28, u'to'),\n", |
| " (29, u'two'),\n", |
| " (30, u'yesterday')]" |
| ] |
| }, |
| "execution_count": 62, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM documents_tf_vocabulary ORDER BY wordid;" |
| ] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 2", |
| "language": "python", |
| "name": "python2" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 2 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython2", |
| "version": "2.7.12" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 1 |
| } |