blob: a4e4477862afe314aded2ccf17f696b337f24fb7 [file] [log] [blame]
DROP TABLE IF EXISTS features;
DROP TABLE IF EXISTS corpus;
DROP TABLE IF EXISTS documents;
DROP TABLE IF EXISTS dictionary;
-- Test simple document classIFication routines
CREATE TABLE features (a text[][]) DISTRIBUTED RANDOMLY;
INSERT INTO features values ('{am,before,being,bothered,corpus,document,i,in,is,me,never,now,one,really,second,the,third,this,until}');
DROP TABLE IF EXISTS documents;
CREATE TABLE documents (docnum int, a text[]) DISTRIBUTED RANDOMLY;
INSERT INTO documents values (1,'{this,is,one,document,in,the,corpus}');
INSERT INTO documents values (2,'{i,am,the,second,document,in,the,corpus}');
INSERT INTO documents values (3,'{being,third,never,really,bothered,me,until,now}');
INSERT INTO documents values (4,'{the,document,before,me,is,the,third,document}');
CREATE TABLE corpus (docnum int, a svec) DISTRIBUTED RANDOMLY;
INSERT INTO corpus (SELECT docnum,gp_extract_feature_histogram((SELECT a FROM features LIMIT 1),a) FROM documents);
\qecho Show the feature dictionary
SELECT a dictionary FROM features;
\qecho Show each document
SELECT docnum Document_Number, a document FROM documents ORDER BY 1;
\qecho The extracted feature vector for each document
SELECT docnum Document_Number, a::float8[] feature_vector FROM corpus ORDER BY 1;
\qecho Count the number of times each feature occurs at least once in all documents
SELECT (vec_count_nonzero(a))::float8[] count_in_document FROM corpus;
\qecho Count all occurrences of each term in all documents
SELECT (sum(a))::float8[] sum_in_document FROM corpus;
\qecho Calculate Term Frequency / Inverse Document Frequency
SELECT docnum, (a*logidf)::float8[] tf_idf FROM (SELECT log(count(a)/vec_count_nonzero(a)) logidf FROM corpus) foo, corpus ORDER BY docnum;
\qecho Show the same calculation in compressed vector format
SELECT docnum, (a*logidf) tf_idf FROM (SELECT log(count(a)/vec_count_nonzero(a)) logidf FROM corpus) foo, corpus ORDER BY docnum;
\qecho Create a table with TF / IDF weighted vectors in it
DROP TABLE IF EXISTS WEIGHTS;
CREATE TABLE weights AS (SELECT docnum, (a*logidf) tf_idf FROM (SELECT log(count(a)/vec_count_nonzero(a)) logidf FROM corpus) foo, corpus ORDER BY docnum) DISTRIBUTED RANDOMLY;
\qecho Calculate the angular distance between the first document to each other document
SELECT docnum,trunc((180.*(ACOS(dmin(1.,(tf_idf%*%testdoc)/(l2norm(tf_idf)*l2norm(testdoc))))/(4.*ATAN(1.))))::numeric,2) angular_distance FROM weights,(SELECT tf_idf testdoc FROM weights WHERE docnum = 1 LIMIT 1) foo ORDER BY 1;