#include "postgres.h"
#include <stdio.h>
#include <string.h>
#include <search.h>
#include <stdlib.h>
#include <math.h>
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "catalog/pg_type.h"
#include "access/tupmacs.h"
#include "sparse_vector.h"
static char **get_text_array_contents(ArrayType *array, int *numitems);
SvecType * classify_document(char **features, int num_features,
char **document, int num_words);
Datum gp_extract_feature_histogram(PG_FUNCTION_ARGS);
void gp_extract_feature_histogram_errout(char *msg);
/**
* gp_extract_feature_histogram
* Approach:
* Definitions:
* Feature Vector:
* A feature vector is a list of words, generally all of the possible
* choices of words. In other words, a feature vector is a dictionary
* and might have cardinality of 20,000 or so.
*
* Document:
 * A document, here identified by its list of words. Generally a
* document will consist of a set of words contained in the feature
* vector, but sometimes a document will contain words that are not
* in the feature vector.
*
* Sparse Feature Vector (SFV):
* An SFV is an array of attributes defined for each feature found
 * in a document. For example, you might define an SFV where each
 * attribute is a count of the number of times the corresponding
 * feature is found in the document, with one entry per feature in
 * the feature vector.
*
* Example:
* Say we have a document defined as follows:
* document1 = {"this","is","an","example","sentence","with","some",
* "some","repeat","repeat"}
* And say we have a feature vector as follows:
* features = {"foo","bar","this","is","an","baz","example","sentence",
* "with","some","repeat","word1","word2","word3"}
*
* Now we'd like to create the SFV for document1. We can number each
* feature starting at 1, so that feature(1) = foo, feature(2) = bar
* and so on. The SFV of document1 would then be:
* sfv(document1,features) = {0,0,1,1,1,0,1,1,1,2,2,0,0,0}
 * Note that each position in the SFV array corresponds to a feature
 * number, and the value at that position is the count of occurrences
 * of that feature in the document.
*
* We would like to store the SFV in a terse representation that fits
* in a small amount of memory. We also want to be able to compare the
* number of instances where the SFV of one document intersects another.
* This routine uses the Sparse Vector datatype to store the SFV.
*
 * Function signature is:
 * gp_extract_feature_histogram(features text[], document text[]) returns svec
 *
 * Where:
 * features: a text array of features (words); it must be sorted in
 * ascending order and contain no duplicates (this is verified at runtime)
 * document: the document, tokenized into words
 *
 * Returns:
 * The SFV of the document, with the count of each feature, stored in a
 * Sparse Vector (svec) datatype
*
* TODO:
* Use the built-in hash table structure instead of hsearch()
* The problem with hsearch is that it's not safe to use more than
* one per process. That means we currently can't do more than one document
* classification per query slice or we'll get the wrong results.
* [DONE] Implement a better scheme for detecting whether we're in a new query since
* we created the hash table.
* Right now we write a key into palloc'ed memory and check to see
* if it's the same value on reentry to the classification routine.
* This is a hack and may fail in certain circumstances.
* A better approach uses the gp_session_id and gp_command_count
* to determine if we're in the same query as the last time we were
* called.
*/
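/*
 * Example usage from SQL (a sketch; this assumes the function has been
 * registered in SQL as gp_extract_feature_histogram(text[], text[])
 * returning svec). Note that the dictionary below is the example feature
 * list from above, sorted, since this implementation requires a sorted,
 * duplicate-free dictionary:
 *
 *   SELECT gp_extract_feature_histogram(
 *       '{an,bar,baz,example,foo,is,repeat,sentence,some,this,with,word1,word2,word3}'::text[],
 *       '{this,is,an,example,sentence,with,some,some,repeat,repeat}'::text[]);
 *
 * The result is an svec equivalent to the dense array
 *   {1,0,0,1,0,1,2,1,2,1,1,0,0,0}
 */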
/**
* Notes from Brian Dolan on how this feature vector is commonly used:
*
 * The actual count is hardly ever used. Instead, it's turned into a weight. The most
* common weight is called tf/idf for "Term Frequency / Inverse Document Frequency".
* The calculation for a given term in a given document is:
* {#Times in this document} * log {#Documents / #Documents the term appears in}
 * For instance, in a corpus of four documents A through D where the term
 * "document" appears in three of them, the term "document" in document A
 * (which contains it once) would have weight 1 * log (4/3), and in
 * document D (which contains it twice) weight 2 * log (4/3).
* Terms that appear in every document would have tf/idf weight 0, since:
* log (4/4) = log(1) = 0. (Our example has no term like that.)
* That usually sends a lot of values to 0.
*
* In this function we're just calculating the term:
* log {#Documents / #Documents the term appears in}
* as an Svec.
*
* We'll need to have the following arguments:
* Svec *count_corpus //count of documents in which each feature is found
* float8 *document_count //count of all documents in corpus
*/
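/*
 * A minimal sketch of the tf/idf weight described above; it is not called
 * anywhere in this file, and the function name and argument names are
 * hypothetical. It assumes the per-document term count and the corpus-wide
 * document counts have already been computed.
 */
static inline float8
tfidf_weight(float8 term_count_in_doc,
			 float8 num_documents,
			 float8 num_documents_with_term)
{
	/* {#Times in this document} * log {#Documents / #Documents the term appears in} */
	return term_count_in_doc * log(num_documents / num_documents_with_term);
}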
PG_FUNCTION_INFO_V1( gp_extract_feature_histogram );
Datum gp_extract_feature_histogram(PG_FUNCTION_ARGS)
{
	SvecType *returnval;
	char **features, **document;
	int num_features, num_words, result;

	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
		PG_RETURN_NULL();

	/* Error checking */
	if (PG_NARGS() != 2)
		gp_extract_feature_histogram_errout(
			"gp_extract_feature_histogram called with wrong number of arguments");

	/* Retrieve the C text array equivalents from the PG text[] inputs */
	features = get_text_array_contents(PG_GETARG_ARRAYTYPE_P(0), &num_features);
	document = get_text_array_contents(PG_GETARG_ARRAYTYPE_P(1), &num_words);

	/* Verify that the dictionary is sorted in ascending order with no duplicates */
	for (int i = 0; i < num_features - 1; i++) {
		result = strcmp(features[i], features[i+1]);
		if (result > 0) {
			elog(ERROR, "Dictionary is unsorted: '%s' is out of order.", features[i+1]);
		} else if (result == 0) {
			elog(ERROR, "Dictionary has duplicated word: '%s'", features[i+1]);
		}
	}

	/*
	elog(NOTICE, "Number of items in the feature array is: %d", num_features);
	elog(NOTICE, "Number of items in the document array is: %d", num_words);
	*/

	returnval = classify_document(features, num_features, document, num_words);

	pfree(features);
	pfree(document);

	PG_RETURN_POINTER(returnval);
}
void gp_extract_feature_histogram_errout(char *msg) {
	ereport(ERROR,
		(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
		 errmsg("%s\ngp_extract_feature_histogram internal error.", msg)));
}
int str_bsearch(char *word, char **features, int num_features)
{
	int low = 0;
	int high = num_features - 1;
	int mid;
	int cmp_result;

	while (true) {
		if (low > high)
			return -5;		/* word is not in the dictionary */

		mid = low + (high - low)/2;
		Assert(mid >= 0 && mid < num_features);

		cmp_result = strcmp(word, features[mid]);
		if (cmp_result == 0)
			return mid;

		if (cmp_result > 0)
			low = mid + 1;
		else
			high = mid - 1;
	}

	/* Reference linear-search implementation, kept for comparison:
	int i;
	for (i=0; i!=num_features; i++)
		if (strcmp(word, features[i]) == 0)
			return i;
	return -5;
	*/
}
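/**
 * Build the SFV of a document: allocate a dense float8 histogram with one
 * slot per dictionary feature, count each document word that appears in
 * the dictionary, and compress the histogram into an svec.
 */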
SvecType *classify_document(char **features, int num_features,
			char **document, int num_words)
{
	float8 *histogram = (float8 *) palloc0(sizeof(float8) * num_features);
	SvecType *output_sfv;
	int i, idx;

	for (i = 0; i != num_words; i++) {
		if (document[i] == NULL)
			continue;		/* skip NULL words in the document */
		idx = str_bsearch(document[i], features, num_features);
		if (idx >= 0)
			histogram[idx]++;	/* count only words present in the dictionary */
	}
	output_sfv = svec_from_float8arr(histogram, num_features);
	return output_sfv;
}
/**
* Deconstruct a text[] into C-strings (note any NULL elements will be
* returned as NULL pointers)
*/
static char **get_text_array_contents(ArrayType *array, int *numitems)
{
	int ndim = ARR_NDIM(array);
	int *dims = ARR_DIMS(array);
	int nitems;
	int16 typlen;
	bool typbyval;
	char typalign;
	char **values;
	char *ptr;
	bits8 *bitmap;
	int bitmask;
	int i;

	Assert(ARR_ELEMTYPE(array) == TEXTOID);
	if (ARR_ELEMTYPE(array) != TEXTOID) {
		*numitems = 0;
		elog(WARNING, "attempt to use a non-text[] variable with a function that takes text[] arguments.");
		return NULL;
	}

	nitems = ArrayGetNItems(ndim, dims);
	*numitems = nitems;

	get_typlenbyvalalign(ARR_ELEMTYPE(array), &typlen, &typbyval, &typalign);

	values = (char **) palloc(nitems * sizeof(char *));
	ptr = ARR_DATA_PTR(array);
	bitmap = ARR_NULLBITMAP(array);
	bitmask = 1;
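
	/* Walk the elements, consulting one null-bitmap bit per element */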
	for (i = 0; i < nitems; i++) {
		if (bitmap && (*bitmap & bitmask) == 0) {
			values[i] = NULL;
		} else {
			values[i] = DatumGetCString(DirectFunctionCall1(textout,
							PointerGetDatum(ptr)));
			ptr = att_addlength_pointer(ptr, typlen, ptr);
			ptr = (char *) att_align_nominal(ptr, typalign);
		}

		/* advance bitmap pointer if any */
		if (bitmap) {
			bitmask <<= 1;
			if (bitmask == 0x100) {
				bitmap++;
				bitmask = 1;
			}
		}
	}
	return values;
}