blob: 7b2aeb68701f5cd6233c3713757d89d036c0fc14 [file] [log] [blame]
#include <postgres.h>
#include <stdio.h>
#include <string.h>
#include <search.h>
#include <stdlib.h>
#include <math.h>
#include "access/tupmacs.h"
#include "catalog/pg_type.h"
#if PG_VERSION_NUM >= 90100
#include "catalog/pg_collation.h"
#endif
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "../../../svec/src/pg_gp/sparse_vector.h"
Datum gp_extract_feature_histogram(PG_FUNCTION_ARGS);
static void gp_extract_feature_histogram_errout(char *msg);
static SvecType * classify_document(Datum *features, int num_features,
Datum *document, int num_words, bool *null_words);
/*
 * TextDatumCmp(x, y) -- three-way comparison of two text Datums via bttextcmp.
 *
 * From PostgreSQL 9.1 on, bttextcmp must be invoked with a collation; use the
 * database default collation.  (pg_collation.h is already included above under
 * the same version guard; the duplicate include that used to live here has
 * been removed.)
 */
#if PG_VERSION_NUM >= 90100
#define TextDatumCmp(x, y) (DatumGetInt32( \
	DirectFunctionCall2Coll(bttextcmp, DEFAULT_COLLATION_OID, (x), (y))))
#else
#define TextDatumCmp(x, y) (DatumGetInt32( \
	DirectFunctionCall2(bttextcmp, (x), (y))))
#endif
/**
* gp_extract_feature_histogram
* Approach:
* Definitions:
* Feature Vector:
* A feature vector is a list of words, generally all of the possible
* choices of words. In other words, a feature vector is a dictionary
* and might have cardinality of 20,000 or so.
*
* Document:
* A document, here identified using a list of words. Generally a
* document will consist of a set of words contained in the feature
* vector, but sometimes a document will contain words that are not
* in the feature vector.
*
* Sparse Feature Vector (SFV):
* An SFV is an array of attributes defined for each feature found
* in a document. For example, you might define an SFV where each
* attribute is a count of the number of instances of a feature is
* found in the document, with one entry per feature found in the
* document.
*
* Example:
* Say we have a document defined as follows:
* document1 = {"this","is","an","example","sentence","with","some",
* "some","repeat","repeat"}
* And say we have a feature vector as follows:
* features = {"foo","bar","this","is","an","baz","example","sentence",
* "with","some","repeat","word1","word2","word3"}
*
* Now we'd like to create the SFV for document1. We can number each
* feature starting at 1, so that feature(1) = foo, feature(2) = bar
* and so on. The SFV of document1 would then be:
* sfv(document1,features) = {0,0,1,1,1,0,1,1,1,2,2,0,0,0}
* Note that the position in the SFV array is the number of the feature
* vector and the attribute is the count of the number of features
* found in each position.
*
* We would like to store the SFV in a terse representation that fits
* in a small amount of memory. We also want to be able to compare the
* number of instances where the SFV of one document intersects another.
* This routine uses the Sparse Vector datatype to store the SFV.
*
* Function Signature is:
*
* Where:
* features: a text array of features (words)
* document: the document, tokenized into words
*
* Returns:
* SFV of the document with counts of each feature, stored in a Sparse Vector (svec) datatype
*/
/**
* Notes from Brian Dolan on how this feature vector is commonly used:
*
* The actual count is hardly ever used. Instead, it's turned into a weight. The most
* common weight is called tf/idf for "Term Frequency / Inverse Document Frequency".
* The calculation for a given term in a given document is:
* {#Times in this document} * log {#Documents / #Documents the term appears in}
* For instance, the term "document" in document A would have weight 1 * log (4/3). In
* document D it would have weight 2 * log (4/3).
* Terms that appear in every document would have tf/idf weight 0, since:
* log (4/4) = log(1) = 0. (Our example has no term like that.)
* That usually sends a lot of values to 0.
*
* In this function we're just calculating the term:
* log {#Documents / #Documents the term appears in}
* as an Svec.
*
* We'll need to have the following arguments:
* Svec *count_corpus //count of documents in which each feature is found
* float8 *document_count //count of all documents in corpus
*/
PG_FUNCTION_INFO_V1( gp_extract_feature_histogram );
/**
 * gp_extract_feature_histogram(features text[], document text[]) -> svec
 *
 * Builds the sparse feature vector (SFV) of a document: an svec whose i-th
 * entry counts how many times the i-th dictionary word occurs in the document.
 *
 * Validation performed here:
 *   - NULL in either argument -> SQL NULL result (the function is not
 *     guaranteed to be declared STRICT, so both arguments must be checked
 *     before PG_GETARG_ARRAYTYPE_P is called);
 *   - both arrays must be text[];
 *   - the dictionary must be non-empty, NULL-free, sorted, and duplicate-free
 *     (required for the bsearch in classify_document).
 */
Datum gp_extract_feature_histogram(PG_FUNCTION_ARGS)
{
	SvecType   *returnval;
	ArrayType  *arr0,
			   *arr1;
	Datum	   *features,
			   *document;
	int			num_features,
				num_words;
	bool	   *null_words;
	int16		elmlen;
	bool		elmbyval;
	char		elmalign;
	int			i;

	/* NULL dictionary or NULL document yields a NULL result. */
	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
		PG_RETURN_NULL();

	/* Error checking */
	if (PG_NARGS() != 2)
		gp_extract_feature_histogram_errout(
			"gp_extract_feature_histogram called with wrong number of arguments");

	arr0 = PG_GETARG_ARRAYTYPE_P(0);
	arr1 = PG_GETARG_ARRAYTYPE_P(1);

	/* Error if dictionary is empty or contains a null */
	if (ARR_HASNULL(arr0))
		gp_extract_feature_histogram_errout(
			"dictionary argument contains a null entry");
	if (ARR_NDIM(arr0) == 0)
		gp_extract_feature_histogram_errout(
			"dictionary argument is empty");
	if (ARR_ELEMTYPE(arr0) != TEXTOID || ARR_ELEMTYPE(arr1) != TEXTOID)
		gp_extract_feature_histogram_errout("the input types must be text[]");

	get_typlenbyvalalign(TEXTOID, &elmlen, &elmbyval, &elmalign);
	deconstruct_array(arr0, TEXTOID, elmlen, elmbyval, elmalign,
					  &features, NULL, &num_features);
	deconstruct_array(arr1, TEXTOID, elmlen, elmbyval, elmalign,
					  &document, &null_words, &num_words);

	/*
	 * classify_document relies on bsearch over the dictionary, so verify that
	 * adjacent entries are in strictly increasing order (sorted, no dups).
	 */
	for (i = 0; i < num_features - 1; i++)
	{
		int			cmp;

		cmp = TextDatumCmp(features[i], features[i + 1]);
		if (cmp > 0)
			elog(ERROR, "Dictionary is unsorted: '%s' is out of order.\n",
				 TextDatumGetCString(features[i + 1]));
		else if (cmp == 0)
			elog(ERROR, "Dictionary has duplicated word: '%s'\n",
				 TextDatumGetCString(features[i + 1]));
	}

	returnval = classify_document(features, num_features,
								  document, num_words, null_words);

	pfree(features);
	pfree(document);
	/* deconstruct_array palloc'd the null-flags array as well; release it. */
	pfree(null_words);

	PG_RETURN_POINTER(returnval);
}
/*
 * Report a fatal problem in gp_extract_feature_histogram.
 *
 * Wraps the caller-supplied text in a standard external-routine error report
 * and aborts the current query via ereport(ERROR); this never returns.
 */
static void
gp_extract_feature_histogram_errout(char *message)
{
	ereport(ERROR,
			(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
			 errmsg(
				"%s\ngp_extract_feature_histogram internal error.", message)));
}
/*
* The comparator for bsearch.
*/
static int
textdatum_cmp(const void *a, const void *b)
{
return TextDatumCmp(*(Datum *) a, *(Datum *) b);
}
/*
 * Count, for each dictionary entry, how many times it occurs in the document.
 *
 * features   : sorted, duplicate-free array of text Datums (the dictionary)
 * num_features: number of dictionary entries
 * document   : text Datums of the document's words
 * num_words  : number of words in the document
 * null_words : per-word NULL flags; NULL words are ignored
 *
 * Each word is located in the dictionary with bsearch (hence the sortedness
 * requirement enforced by the caller).  Words absent from the dictionary are
 * simply dropped.  Returns the counts packed into a newly built svec.
 */
static SvecType *
classify_document(Datum *features, int num_features,
				  Datum *document, int num_words, bool *null_words)
{
	float8	   *counts = (float8 *) palloc0(sizeof(float8) * num_features);
	SvecType   *result;
	int			word;

	for (word = 0; word < num_words; word++)
	{
		Datum	   *match;

		/* A NULL word cannot match any dictionary entry. */
		if (null_words[word])
			continue;

		match = bsearch(&document[word], features, num_features,
						sizeof(Datum), textdatum_cmp);
		if (match != NULL)
		{
			/* Pointer subtraction gives the dictionary index directly. */
			counts[match - features] += 1;
		}
	}

	result = svec_from_float8arr(counts, num_features);
	pfree(counts);
	return result;
}