blob: 6648abb819a3dacd14e4c8cfe507da080a090a64 [file] [log] [blame]
CREATE EXTENSION gp_sparse_vector;
SET search_path TO sparse_vector;
DROP TABLE IF EXISTS features;
DROP TABLE IF EXISTS corpus;
DROP TABLE IF EXISTS documents;
DROP TABLE IF EXISTS dictionary;
-- Test simple document classIFication routines
CREATE TABLE features (dictionary text[][]) DISTRIBUTED RANDOMLY;
INSERT INTO features values ('{am,before,being,bothered,corpus,document,i,in,is,me,never,now,one,really,second,the,third,this,until}');
CREATE TABLE documents (docnum int, document text[]) DISTRIBUTED RANDOMLY;
INSERT INTO documents values (1,'{this,is,one,document,in,the,corpus}');
INSERT INTO documents values (2,'{i,am,the,second,document,in,the,corpus}');
INSERT INTO documents values (3,'{being,third,never,really,bothered,me,until,now}');
INSERT INTO documents values (4,'{the,document,before,me,is,the,third,document}');
CREATE TABLE corpus (docnum int, feature_vector svec) DISTRIBUTED RANDOMLY;
INSERT INTO corpus (SELECT docnum,gp_extract_feature_histogram((SELECT dictionary FROM features LIMIT 1),document) FROM documents);
-- Show the feature dictionary
SELECT dictionary FROM features;
-- Show each document
SELECT docnum AS Document_Number, document FROM documents ORDER BY 1;
-- The extracted feature vector for each document
SELECT docnum AS Document_Number, feature_vector::float8[] FROM corpus ORDER BY 1;
-- Count the number of times each feature occurs at least once in all documents
SELECT (vec_count_nonzero(feature_vector))::float8[] AS count_in_document FROM corpus;
-- Count all occurrences of each term in all documents
SELECT (sum(feature_vector))::float8[] AS sum_in_document FROM corpus;
-- Calculate Term Frequency / Inverse Document Frequency
SELECT docnum, (feature_vector*logidf)::float8[] AS tf_idf FROM (SELECT log(count(feature_vector)/vec_count_nonzero(feature_vector)) AS logidf FROM corpus) AS foo, corpus ORDER BY docnum;
-- Show the same calculation in compressed vector format
SELECT docnum, (feature_vector*logidf) AS tf_idf FROM (SELECT log(count(feature_vector)/vec_count_nonzero(feature_vector)) AS logidf FROM corpus) foo, corpus ORDER BY docnum;
-- Create a table with TF / IDF weighted vectors in it
DROP TABLE IF EXISTS WEIGHTS;
CREATE TABLE weights AS (SELECT docnum, (feature_vector*logidf) tf_idf FROM (SELECT log(count(feature_vector)/vec_count_nonzero(feature_vector)) AS logidf FROM corpus) foo, corpus ORDER BY docnum) DISTRIBUTED RANDOMLY;
-- Calculate the angular distance between the first document to each other document
SELECT docnum,trunc((180.*(ACOS(dmin(1.,(tf_idf%*%testdoc)/(l2norm(tf_idf)*l2norm(testdoc))))/(4.*ATAN(1.))))::numeric,2) AS angular_distance FROM weights,(SELECT tf_idf testdoc FROM weights WHERE docnum = 1 LIMIT 1) foo ORDER BY 1;
DROP TABLE features;
DROP TABLE corpus;
DROP TABLE documents;
DROP TABLE WEIGHTS;
DROP EXTENSION gp_sparse_vector;