| |
| """ |
| @file generate_svec.py_in |
| |
| @namespace generate_svec |
| |
| """ |
| import plpy |
| from utilities.validate_args import table_exists |
| from utilities.validate_args import columns_exist_in_table |
| from utilities.validate_args import table_is_empty |
| from utilities.validate_args import get_cols_and_types |
| from utilities.utilities import _assert |
| |
def generate_doc_svecs_help(schema_madlib, **kwargs):
    """
    Build the usage message shown for generate_doc_svecs.

    @param schema_madlib Schema name substituted for every {schema_madlib}
                         placeholder in the help text
    @param kwargs Accepted so callers may pass extra keywords; not used here

    @return The help text with the schema name filled in
    """
    help_template = """
-----------------------------------------------------
Usage
-----------------------------------------------------
SELECT * FROM {schema_madlib}.generate_doc_svecs (
'output_tbl' -- Output table to be created containing
sparse vectors for the documents.
'dictionary_tbl' -- Dictionary table
'dict_id_col' -- Name of the id column in the dictionary table
Supported Types: INTEGER or BIGINT
NOTE: Values must range from 0 to total number
of elements in the dictionary - 1
'dict_term_col' -- Name of the terms column in the dictionary table
'documents_tbl' -- Documents table
'doc_id_col' -- Name of the id column in documents table
'doc_term_col' -- Name of the terms column in documents table
'doc_term_info_col' -- Name of the term info column in documents table
Supported Types:
INTEGER, BIGINT :- Values directly used to populate the vectors
DOUBLE PRECISION
ARRAY :- Length of the array is used as values
);

-----------------------------------------------------------
Output
-----------------------------------------------------------
The output table (output_tbl in the above) has the following columns:
doc_id __TYPE_DOC__ -- Document id column
sparse_vector {schema_madlib}.svec -- Column containing the sparse vector
representation for the document
** __TYPE_DOC__: Type Depends on Type of Column doc_id_col
in documents_tbl.
"""
    # Only the schema name is templated; everything else is fixed text.
    return help_template.format(schema_madlib=schema_madlib)
| |
def generate_doc_svecs(schema_madlib, output_tbl, dictionary_tbl,
                       dict_id_col, dict_term_col, documents_tbl,
                       doc_id_col, doc_term_col, doc_term_info_col, **kwargs):
    """
    Create a table holding one sparse vector per document.

    The documents table is joined to the dictionary table on the term
    columns, rows are grouped by document id, and the aggregated dictionary
    ids and term values are handed to {schema_madlib}.__gen_svec together
    with the row count of the dictionary table.

    @param schema_madlib Schema that provides the __gen_svec function
    @param output_tbl Name of the output table to be created containing sparse
                      vector representations of the documents
    @param dictionary_tbl Name of the dictionary table
    @param dict_id_col Name of the id column in dictionary table
    @param dict_term_col Name of the term column in dictionary table
    @param documents_tbl Name of the documents table
    @param doc_id_col Name of the id column in documents table
    @param doc_term_col Name of the term column in documents table
    @param doc_term_info_col Name of the term info column in documents table.
    @param kwargs Accepted for call compatibility; not used here

    @return A status message naming the created table
    """

    # Argument checks run first; the call also reports the (lower-cased)
    # type of the term info column, which decides how values are derived.
    info_col_type = _validate_args(schema_madlib, output_tbl, dictionary_tbl,
                                   dict_id_col, dict_term_col, documents_tbl,
                                   doc_id_col, doc_term_col, doc_term_info_col)

    # Array-typed term info contributes its upper bound in dimension 1;
    # any other accepted type is used directly as the value.
    value_template = (""" array_upper(doc_table.{doc_term_info_col}, 1) """
                      if info_col_type == 'array'
                      else " doc_table.{doc_term_info_col} ")
    term_count_expr = value_template.format(
        doc_term_info_col=doc_term_info_col)

    # The dictionary row count is passed as the third argument of __gen_svec.
    count_sql = """
        SELECT count(*) FROM {dict_table}
        """.format(dict_table=dictionary_tbl)
    dict_count = plpy.execute(count_sql)[0]['count']

    # The last line of the statement is a build-time macro and is kept
    # verbatim.
    create_sql = """
        CREATE TABLE {output_tbl} AS
        SELECT
            doc_table.{doc_id_col} doc_id,
            {schema_madlib}.__gen_svec(
                array_agg(dict_table.{dict_id_col}),
                array_agg( {term_count_expr}),
                {dict_count}) sparse_vector
        FROM
            {doc_table} doc_table,
            {dict_table} dict_table
        WHERE
            doc_table.{doc_term_col} = dict_table.{dict_term_col}
        GROUP BY
            doc_table.{doc_id_col}
        m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')
        """

    substitutions = {
        'output_tbl': output_tbl,
        'doc_table': documents_tbl,
        'dict_table': dictionary_tbl,
        'doc_id_col': doc_id_col,
        'doc_term_col': doc_term_col,
        'term_count_expr': term_count_expr,
        'dict_id_col': dict_id_col,
        'dict_term_col': dict_term_col,
        'dict_count': dict_count,
        'schema_madlib': schema_madlib,
    }
    plpy.execute(create_sql.format(**substitutions))

    return "Created table %s (doc_id, sparse_vector) containing sparse vectors" % output_tbl
| |
def verify_type(type_dict, col_name, isIdColumn):
    """
    @brief Tell whether a column has a type accepted for its role

    Two roles are checked:
        id column:        integer or bigint
        term info column: integer, bigint, double precision or array

    @param type_dict Mapping from column name to its type name; the type
                     name is compared case-insensitively
    @param col_name Name of the column to look up in type_dict
    @param isIdColumn Truthy to apply the id column rule, falsy to apply
                      the term info column rule

    @return True when the column type is accepted, False otherwise
    """
    id_types = ('integer', 'bigint')
    term_info_types = ('integer', 'bigint', 'double precision', 'array')

    accepted = id_types if isIdColumn else term_info_types
    return type_dict[col_name].lower() in accepted
| |
def _validate_args(schema_madlib, output_tbl, dictionary_tbl, dict_id_col,
                   dict_term_col, documents_tbl, doc_id_col,
                   doc_term_col, doc_term_info_col):
    """
    @brief Validate the arguments of generate_doc_svecs

    Checks run in a fixed order and each failure is reported through _assert:
    table names are usable, the input tables exist and are non-empty, the
    named columns are present, the dictionary id column and the documents
    term info column have accepted types, and finally the output table does
    not exist yet.

    @param schema_madlib Not used by the checks below
    @param output_tbl Name of the table that is going to be created
    @param dictionary_tbl Name of the dictionary table
    @param dict_id_col Name of the id column in dictionary table
    @param dict_term_col Name of the term column in dictionary table
    @param documents_tbl Name of the documents table
    @param doc_id_col Name of the id column in documents table
    @param doc_term_col Name of the term column in documents table
    @param doc_term_info_col Name of the term info column in documents table

    @return Lower-cased type name of the term info column in documents table
    """

    def _has_usable_name(tbl_name):
        # Rejects a missing name, a blank string and the literal text null.
        if tbl_name is None:
            return False
        return tbl_name.strip().lower() not in ('null', '')

    # Input tables: name sanity first, then existence, then emptiness.
    _assert(_has_usable_name(dictionary_tbl),
            "Svec error: Invalid dictionary table name")
    _assert(_has_usable_name(documents_tbl),
            "Svec error: Invalid documents table name")
    _assert(table_exists(dictionary_tbl),
            "Svec error: Dictionary table does not exist!")
    _assert(table_exists(documents_tbl),
            "Svec error: Documents table does not exist!")
    _assert(not table_is_empty(dictionary_tbl),
            "Svec error: Dictionary table is empty!")
    _assert(not table_is_empty(documents_tbl),
            "Svec error: Documents table is empty!")

    _assert(_has_usable_name(output_tbl),
            "Svec error: Invalid output table name!")

    # Every column named by the caller has to be present in its table.
    required_dict_cols = [dict_id_col, dict_term_col]
    _assert(columns_exist_in_table(dictionary_tbl, required_dict_cols),
            "Svec error: Missing specified column names from dictionary table")

    required_doc_cols = [doc_id_col, doc_term_col, doc_term_info_col]
    _assert(columns_exist_in_table(documents_tbl, required_doc_cols),
            "Svec error: Missing specified columns names from documents table")

    # Type checks cover only the dictionary id column and the documents
    # term info column; the remaining columns are not type-checked here.
    dict_types = dict(get_cols_and_types(dictionary_tbl))
    _assert(verify_type(dict_types, dict_id_col, True),
            "Svec error: Unexpected type for column '%s' in dictionary table."
            " Should be int or bigint type" % dict_id_col)

    doc_types = dict(get_cols_and_types(documents_tbl))
    _assert(verify_type(doc_types, doc_term_info_col, False),
            "Svec error: Unexpected type for column '%s' in documents table."
            " Should be int, bigint, double precision or Array type" % doc_term_info_col)

    # The output table is checked last so that the request to drop it is
    # only raised once every other argument has been accepted.
    output_already_there = table_exists(output_tbl)
    _assert(not output_already_there,
            "Svec error: Output table name already exists. Drop the table before calling the function.")

    return doc_types[doc_term_info_col].lower()