#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script implements the KNN (k-nearest-neighbor) algorithm.
#
# INPUT:
# ---------------------------------------------------------------------------
# Train              The input matrix of training feature vectors
# Test               The input matrix of query records for the nearest
#                    neighbor search
# CL                 The input matrix of target values for the training records
# CL_T               The target type of matrix CL: whether the columns in CL
#                    are continuous ( =1 ), categorical ( =2 ),
#                    or not specified ( =0 )
# trans_continuous   Option flag for rescaling continuous features to [-1,1]:
#                    0 = do not transform continuous variables;
#                    1 = transform continuous variables
# k_value            k value for KNN; ignored if select_k is enabled
# select_k           Use the k selection algorithm to estimate k ( 1 means yes )
# k_min              Minimum k value ( used if select_k = 1 )
# k_max              Maximum k value ( used if select_k = 1 )
# select_feature     Use the feature selection algorithm ( 1 means yes )
# feature_max        Maximum number of features to select
# interval           Block size for k selection ( used if select_k = 1 )
# feature_importance Estimate an importance value for each feature ( 1 means yes )
# predict_con_tg     Prediction function for continuous targets:
#                    mean ( =0 ) or median ( =1 )
# START_SELECTED     Initial feature selection ( 1 row, one 0/1 flag per feature )
# ---------------------------------------------------------------------------
#
# OUTPUT:
# ---------------------------------------------------------------------------------------------
# NNR_matrix           Indices of the k nearest training records for each test record
# CL_matrix            Predicted target value for each test record
# m_feature_importance Per-feature importance values ( if feature_importance = 1 )
# ---------------------------------------------------------------------------------------------
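#
# Example ( a minimal usage sketch; the file names and csv format are
# illustrative assumptions, not part of this script ):
#
#   Train = read( "train.csv",  format="csv" );
#   Test  = read( "test.csv",   format="csv" );
#   CL    = read( "labels.csv", format="csv" );  # one target column
#   [NNR, PRED, IMP] = m_knn( Train=Train, Test=Test, CL=CL, CL_T=2,
#     k_value=5, feature_importance=1 );
#   write( PRED, "predictions.csv", format="csv" );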
m_knn = function(
Matrix[Double] Train,
Matrix[Double] Test,
Matrix[Double] CL,
Integer CL_T = 0,
Integer trans_continuous = 0,
Integer k_value = 5,
Integer select_k = 0,
Integer k_min = 1,
Integer k_max = 100,
Integer select_feature = 0,
Integer feature_max = 10,
Integer interval = 1000,
Integer feature_importance = 0,
Integer predict_con_tg = 0,
Matrix[Double] START_SELECTED = matrix(0, 0, 0)
)return(
Matrix[Double] NNR_matrix,
Matrix[Double] CL_matrix,
Matrix[Double] m_feature_importance
){
m_feature_importance = matrix(0, 0, 0);
#data preparation: optionally rescale continuous features to [-1,1]
if( trans_continuous == 1 ){
Train = prepareKNNData( Train);
Test = prepareKNNData( Test);
}
n_records = nrow( Train );
n_features = ncol( Train );
s_selected_k = 5;
m_selected_feature = matrix(1,rows=1,cols=n_features); # one 0/1 flag per feature
if( select_k == 1 | select_feature==1 ){
#parameter check
#parameter re-define
if( select_k==1 ){
if( k_max >= n_records ){
k_max = n_records - 1;
print( "k_max must be less than the number of records; setting k_max to " +
"( number of records - 1 ) : " + k_max );
}
if( k_max >= interval ){
interval = k_max + 1;
# interval must exceed k_max because the query record itself is dropped
# when searching for its nearest neighbors.
print( "interval must be greater than k_max; setting interval to : " +
interval );
}
if( k_max <= 1 )
stop( "invalid k_max value" );
if( k_min >= k_max )
stop( "k_min >= k_max" );
}
if( select_feature == 1 ){
if( k_value >= n_records ){
k_value = n_records - 1;
print( "k_value must be less than the number of records; setting k_value to " +
"( number of records - 1 ) : " + k_value );
}
#initialize the user-specified feature pre-selection
if( nrow(START_SELECTED) == 0 & ncol(START_SELECTED) == 0 )
m_start_selected_feature = matrix( 0,1,n_features );
else
m_start_selected_feature = START_SELECTED;
}
if( select_k == 1 & select_feature == 1){
#Combined k and feature selection
print("Start combined k and feature selection ...");
[m_selected_feature,s_selected_k] =
getSelectedFeatureAndK( Train,CL,CL_T,m_start_selected_feature,
feature_max,k_min,k_max,interval );
}
else if( select_k == 1 ){
#Select k only
print("Start k select ...");
s_selected_k = getSelectedKBase( Train,CL,CL_T,k_min,k_max,interval );
}
else if( select_feature == 1 ){
#Select feature only
print("Start feature selection ... ");
[m_selected_feature,d_err] =
getSelectedFeature( Train,CL,CL_T,m_start_selected_feature,
feature_max,k_value,interval );
}
}
if( feature_importance == 1){
if( k_value >= n_records ){
k_value = n_records - 1;
print( "k_value must be less than the number of records; setting k_value to " +
"( number of records - 1 ) : " + k_value );
}
[m_feature_importance,m_norm_feature_importance] =
getFeatureImportance(Train,CL,CL_T,k_value);
}
#use the selected k for the final neighbor search if k selection was requested
if( select_k == 1 )
k_value = s_selected_k;
NNR_matrix = naiveKNNsearch(P=Train,Q=Test,K=k_value);
CL_matrix = matrix( 0,nrow( Test ),1 );
for(i in 1 : nrow(NNR_matrix))
{
NNR_tmp_matrix = matrix( 0,k_value,1 );
for( j in 1:k_value )
NNR_tmp_matrix[j,1] = CL[as.scalar( NNR_matrix[i,j] ),1];
if(CL_T == 2) {
# majority vote: count the occurrences of each ( positive integer ) label
# among the k neighbors and pick the most frequent one
votes = table( NNR_tmp_matrix, matrix( 1,k_value,1 ) );
t_cl_value = as.scalar( rowIndexMax( t(votes) ) );
}
else {
if ( predict_con_tg == 0)
t_cl_value = mean( NNR_tmp_matrix );
else if(predict_con_tg == 1)
t_cl_value = median( NNR_tmp_matrix );
}
CL_matrix[i,1] = t_cl_value;
}
}
#naive KNN search: for each query row in Q, return the indices of its K nearest rows in P
naiveKNNsearch = function(
Matrix[Double] P,
Matrix[Double] Q,
Integer K
)return(
Matrix[Double] O
){
num_records = nrow (P);
num_features = ncol (P);
num_queries = nrow (Q);
Qt = t(Q);
PQt = P %*% Qt;
P2 = rowSums (P ^ 2);
D = -2 * PQt + P2;
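# D[i,j] equals ||P[i,]-Q[j,]||^2 minus the term ||Q[j,]||^2, which is
# constant within each column j and thus does not change the neighbor ranking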
if (K == 1) {
Dt = t(D);
O = rowIndexMin (Dt);
} else {
O = matrix (0, rows = num_queries, cols = K);
parfor (i in 1:num_queries) {
D_sorted=order(target=D[,i], by=1, decreasing=FALSE, index.return=TRUE);
O[i,] = t(D_sorted[1:K,1]);
}
}
}
#naive KNN search that directly returns the neighbors' target values
#TODO eliminate redundancy
naiveKNNsearchForPredict = function(
matrix[double] P,
matrix[double] Q,
matrix[double] L,
integer K
)return(
matrix[double] OL
){
num_records = nrow (P);
num_features = ncol (P);
num_queries = nrow (Q);
Qt = t(Q);
PQt = P %*% Qt;
P2 = rowSums (P ^ 2);
D = -2 * PQt + P2;
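# same distance shortcut as in naiveKNNsearch: the per-query ||Q[j,]||^2
# term is omitted since it does not affect the ranking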
if (K == 1) {
Dt = t(D);
O = rowIndexMin (Dt);
OL = matrix (0, rows = num_queries, cols = 1)
parfor( i in 1:num_queries){
OL[i,] = L[as.scalar(O[i,1]),1]
}
} else {
OL = matrix (0, rows = num_queries, cols = K);
parfor (i in 1:num_queries) {
D_sorted=order(target=cbind(D[,i],L), by=1, decreasing=FALSE, index.return=FALSE);
OL[i,] = t(D_sorted[1:K,2]);
}
}
}
getErr_k = function ( matrix[double] in_m_neighbor_value,
matrix[double] in_m_cl,
integer in_i_cl_type ,
integer in_i_k_min )
return ( matrix[double] out_m_err )
{
i_col = ncol( in_m_neighbor_value );
i_row = nrow( in_m_neighbor_value );
out_m_err = matrix( 0,i_row,i_col - in_i_k_min + 1 );
if( in_i_cl_type == 2 ) #categorical
m_correct = in_m_neighbor_value != in_m_cl[1:i_row,];
else #continuous
m_correct = (in_m_neighbor_value - in_m_cl[1:i_row,])^2;
parfor( i in 1:(i_col-in_i_k_min+1) ,check = 0 ){
out_m_err[,i] =
( rowSums( m_correct[,1:(in_i_k_min + i - 1)] ) / ( in_i_k_min + i - 1 ) );
}
#out_m_err holds the mean error per record for each k in [k_min,k_max]:
#column c corresponds to k = in_i_k_min + c - 1
}
eliminateModel = function ( double s_err_mean, double s_err_vars, integer i_row )
return( boolean out_b_inactived ){
#BRACE elimination thresholds: gamma is the indifference margin on the mean
#error difference, delta the allowed probability of a wrong elimination
d_gamma = 0.001;
d_delta = 0.001;
#drop model i if the estimated probability that it beats model j by more
#than gamma is below delta, using a t distribution with i_row-1 degrees of
#freedom; s_err_vars is the standard deviation of the mean difference
tmp_d_delta = cdf(target = (-d_gamma - s_err_mean)/s_err_vars, dist="t",df=i_row-1);
out_b_inactived = (tmp_d_delta < d_delta)
}
getErr = function ( matrix[double] in_m_neighbor_value,
matrix[double] in_m_cl,
integer in_i_cl_type )
return ( matrix[double] out_m_err )
{
i_col = ncol( in_m_neighbor_value );
i_row = nrow( in_m_neighbor_value );
if( in_i_cl_type == 2 ) #categorical
m_correct = in_m_neighbor_value != in_m_cl[1:i_row,];
else #continuous
m_correct = (in_m_neighbor_value - in_m_cl[1:i_row,])^2;
out_m_err = ( rowSums( m_correct[,1:i_col] )/( i_col ) );
}
# getSelectedFeatureAndK:
# Combine k and feature selection algorithm.
# Refer to ADD part "8.Combined k and feature selection"
# Argument:
#   in_m_data            input matrix as features
#   in_m_data_target     input matrix as target value
#   in_i_is_categorical  target type: 2 = categorical, otherwise continuous
#   in_m_init_selected   S.user, the user-specified initial feature selection
#   in_i_max_select      J, maximum number of features to select
#   k_min                minimum k
#   k_max                maximum k
#   interval             block size for the BRACE algorithm
#
# Return:
# out_m_selected_feature output matrix for feature selection
# out_i_selected_k output k value for k selection
getSelectedFeatureAndK = function (
matrix[double] in_m_data,
matrix[double] in_m_data_target,
integer in_i_is_categorical, # 2 = categorical, otherwise continuous
matrix[double] in_m_init_selected,
integer in_i_max_select,
integer k_min,
integer k_max,
integer interval )
return(
matrix[double] out_m_selected_feature,
integer out_i_selected_k
)
{
m_err = matrix( 0,1,k_max-k_min+1 );
m_feature = matrix( 0,k_max-k_min+1,ncol( in_m_data ) );
#Step 1. For each k in [k_min,k_max] ( k_min has default value 1, k_max has default value 100 )
#in parallel select relevant features using FS+BRACE or schemata search described in Section 7.
parfor( i in k_min:k_max,check=0 ){
[m_selected_feature,d_err] =
getSelectedFeature( in_m_data,in_m_data_target,in_i_is_categorical,
in_m_init_selected,in_i_max_select,i,interval );
# shift by k_min so that the results for k = k_min land at index 1
m_err[1,i-k_min+1] = d_err;
m_feature[i-k_min+1,] = m_selected_feature;
}
#Step 2. Output the combination of features and k with the smallest LOOCV error.
i_min_err_index = as.integer( as.scalar( rowIndexMin( m_err ) ) );
out_m_selected_feature = m_feature[i_min_err_index,];
out_i_selected_k = i_min_err_index + k_min - 1;
}
getFeatureImportance = function (
matrix[double] in_m_data,
matrix[double] in_m_data_target,
integer in_i_is_categorical, # 2 = categorical, otherwise continuous
integer k_value)
return(
matrix[double] out_m_feature_importance,
matrix[double] out_m_norm_feature_importance
)
{
n_feature = ncol(in_m_data)
n_record = nrow(in_m_data)
if(n_feature <= 1)
stop("can't estimate feature importance when ncol = 1")
m_err = matrix( 0,n_record,n_feature);
for(i_feature in 1:n_feature){
m_feature_select = matrix(1,1,n_feature)
m_feature_select[1,i_feature] = 0;
m_in_tmp_data = removeEmpty(target=in_m_data,margin="cols", select= m_feature_select)
m_neighbor_value = getKNeighbor( m_in_tmp_data,m_in_tmp_data,in_m_data_target,k_value);
m_tmp_err = getErr( m_neighbor_value,in_m_data_target ,in_i_is_categorical );
m_err[,i_feature] = m_tmp_err
}
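# importance of feature i = accumulated LOOCV error with feature i removed;
# a larger error means the model relied more heavily on that feature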
out_m_feature_importance = colSums( m_err );
out_m_norm_feature_importance =
out_m_feature_importance / as.scalar(rowSums(out_m_feature_importance))
}
# prepareKNNData:
#   Rescale each feature column to the range [-1,1].
# Argument:
#   * in_m_data   input matrix as features
prepareKNNData = function(matrix[double] in_m_data)
return(matrix[double] out_m_data)
{
m_colmax = colMaxs(in_m_data);
m_colmin = colMins(in_m_data);
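# note: a constant column ( colmax == colmin ) would cause a division by
# zero; features are assumed to be non-constant here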
out_m_data = 2 * (in_m_data - m_colmin ) / ( m_colmax - m_colmin ) - 1;
}
getKNeighbor = function(matrix[double] in_m_data,
matrix[double] in_m_test_data,
matrix[double] in_m_cl,
integer in_i_k_max)
return (matrix[double] out_m_neighbor_value)
{
# delegate to the naive search; request k+1 neighbors and drop the first
# column, which is the query record itself when searching the training data
m_search_result = naiveKNNsearchForPredict(in_m_data, in_m_test_data, in_m_cl, in_i_k_max + 1)
out_m_neighbor_value = m_search_result[ , 2 : (in_i_k_max + 1)]
}
# getSelectedKBase:
# k selection algorithm with simple KNN algorithm.
# Argument:
# * in_m_data input matrix as features
# * in_m_data_target input matrix as target value
# * in_i_is_categorical  target type: 2 = categorical, otherwise continuous
# * k_min minimum k
# * k_max maximum k
# * interval block size
#
# Return:
# * k output k value for k selection
getSelectedKBase = function(matrix[double] in_m_data,
matrix[double] in_m_data_target,
integer in_i_is_categorical, # 2 = categorical, otherwise continuous
integer k_min,
integer k_max,
integer interval)
return (integer k)
{
b_continue_loop = TRUE;
i_iter = 1;
i_record = nrow(in_m_data);
i_active_model_number = k_max - k_min + 1;
m_active_flag = matrix(0, 1, i_active_model_number);
m_iter_err_sum = matrix(0, 1, k_max - k_min + 1);
m_iter_err_sum_squared = matrix(0, 1, k_max - k_min + 1);
while(b_continue_loop)
{
# 1. TODO: accelerate the search, e.g., with a k-d tree or hashing
# 2. search the current block to get the k_max nearest neighbors
i_process_item = i_iter * interval;
if(i_process_item >= i_record) {
i_process_item = i_record;
b_continue_loop = FALSE;
}
i_process_begin_item = ((i_iter - 1) * interval) + 1;
i_process_end_item = i_process_item;
m_neighbor_value = getKNeighbor(in_m_data, in_m_data[i_process_begin_item : i_process_end_item, ], in_m_data_target, k_max);
# 3.get matrix of err from k_min to k_max
m_err = getErr_k(m_neighbor_value, in_m_data_target[i_process_begin_item : i_process_end_item, ], in_i_is_categorical, k_min);
# 4. use the error matrix to eliminate k candidates that cannot win
m_active_flag_tmp = matrix(0, 1, ncol(m_err));
s_rows_number = i_process_item;
m_iter_err_sum = colSums(m_err) + m_iter_err_sum;
m_iter_err_sum_squared = colSums(m_err ^ 2) + m_iter_err_sum_squared;
m_err_mean = - outer(t(m_iter_err_sum), m_iter_err_sum , "-") / s_rows_number;
m_err_vars = ( m_err_mean ^2 * s_rows_number -
2 * m_err_mean * m_iter_err_sum + m_iter_err_sum_squared) / (s_rows_number-1);
m_err_vars = sqrt(m_err_vars);
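# m_err_mean[i,j] is the mean of ( err_i - err_j ) over all rows processed so
# far; m_err_vars is the dispersion estimate used by the t test, derived from
# the running sums of err and err^2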
parfor(i in 1 : ncol(m_err), check = 0) {
parfor(j in 1 : ncol(m_err), check = 0) {
b_execute_block = !(j == i
| as.scalar(m_active_flag_tmp[1, i]) == 1 # i has dropped, ignore this case
| as.scalar(m_active_flag_tmp[1, j]) == 1) # j has dropped, ignore this case
if(b_execute_block) {
b_flag = eliminateModel(as.scalar(m_err_mean[i, j]), as.scalar(m_err_vars[i, j]), s_rows_number);
if(b_flag == TRUE)
m_active_flag_tmp[1, i] = 1;
}
}
}
m_active_flag = ((m_active_flag + m_active_flag_tmp) >= 1);
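# number of models still in the race ( entries that are still 0 )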
i_active_model_number = -sum(m_active_flag - 1);
# 5.break while check
if(i_active_model_number <= 1)
b_continue_loop = FALSE;
i_iter = i_iter + 1;
print("i_iter" + i_iter)
}
k = 0;
if(i_active_model_number == 0) {
print("All k candidates were eliminated; falling back to k_min = " + k_min);
k = k_min;
}
else if(i_active_model_number == 1) {
k = k_min + as.integer(as.scalar(rowIndexMin(m_active_flag))) - 1;
print( "Selected k = " + k );
}
else {
m_err_for_order =
cbind(t(m_iter_err_sum), matrix(seq(k_min, k_max, 1), k_max - k_min + 1, 1));
m_err_for_order = removeEmpty(
target = m_err_for_order * t(m_active_flag == 0), margin = "rows");
for(i in 1 : nrow(m_err_for_order)) {
print("k:" + as.scalar(m_err_for_order[i, 2]) +
", err:" + as.scalar(m_err_for_order[i, 1]));
}
m_err_order = order(target = m_err_for_order, by = 1, decreasing = FALSE, index.return = FALSE);
k = as.integer(as.scalar(m_err_order[1, 2]));
print("Get minimum LOOCV error, which value is " + k);
}
}
# getSelectedFeature:
# feature selection algorithm.
# Refer to ADD part "7.1 FS+BRACE"
# Argument:
#   in_m_data            input matrix as features
#   in_m_data_target     input matrix as target value
#   in_i_is_categorical  target type: 2 = categorical, otherwise continuous
#   in_m_init_selected   S.user, the user-specified initial feature selection
#   in_i_max_select      J, maximum number of features to select
#   k_value              k
#   interval             block size for the BRACE algorithm
#
# Return:
#   out_m_selected_feature  output matrix for feature selection
#   out_d_min_LOOCV         output LOOCV error of the selected model
getSelectedFeature = function (
matrix[double] in_m_data,
matrix[double] in_m_data_target,
integer in_i_is_categorical, # 2 = categorical, otherwise continuous
matrix[double] in_m_init_selected,
integer in_i_max_select,
integer k_value,
integer interval )
return(
matrix[double] out_m_selected_feature,
double out_d_min_LOOCV
)
{
i_n_record = nrow( in_m_data );
i_n_column = ncol( in_m_data );
m_main_selected_flag = in_m_init_selected;
b_no_feature_selected = TRUE;
if( sum( in_m_init_selected ) >= 1 )
b_no_feature_selected = FALSE;
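# sentinel error assigned to models that are not evaluated in a block;
# large enough that such models can never win the selection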
d_max_err_value = ( max( in_m_data_target ) - min( in_m_data_target ) ) * 100;
b_continue_main_loop = TRUE; #level 1 while loop flag
d_min_LOOCV = Inf;
while( b_continue_main_loop ){
m_feature_selected_flag = m_main_selected_flag;
m_this_model_selected_flag = TRUE;
i_index_min_LOOCV = -1; # index of the winning model in the BRACE algorithm
b_selected_morethan_one = FALSE;
b_continue_loop = TRUE; #level 2 while loop flag
i_iter = 1;
m_iter_err_sum = matrix( 0,1,i_n_column+1 );
m_iter_err_sum_squared = matrix( 0,1,i_n_column+1 );
while( b_continue_loop ){
i_process_item = i_iter*interval;
if( i_process_item >= i_n_record ){
i_process_item = i_n_record;
b_continue_loop = FALSE;
}
i_process_begin_item = (i_iter - 1)*interval + 1
i_process_end_item = i_process_item
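# evaluate all candidate models on the current block of records only;
# the errors are accumulated across blocks via the running sums below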
m_err = matrix( d_max_err_value,i_process_end_item - i_process_begin_item + 1,i_n_column+1 );
if( b_no_feature_selected == TRUE ){
parfor( i in 1:i_n_column ,check=0){
if( as.scalar( m_feature_selected_flag[1,i] ) != 1 ){
m_tmp_process_data = in_m_data[,i];
m_neighbor_value = getKNeighbor(m_tmp_process_data,
m_tmp_process_data[i_process_begin_item:i_process_end_item,], in_m_data_target,k_value );
m_tmp_err = getErr(m_neighbor_value,
in_m_data_target[i_process_begin_item:i_process_end_item,], in_i_is_categorical );
m_err[,i] = m_tmp_err;
}
}
}else{
#Use m_main_selected_flag rather than m_feature_selected_flag:
# m_main_selected_flag: which features have been selected so far
# m_feature_selected_flag: which features are dropped or already selected
m_tmp_data = removeEmpty( target=in_m_data ,margin="cols", select = m_main_selected_flag);
if( m_this_model_selected_flag == TRUE ){
m_neighbor_value = getKNeighbor(
m_tmp_data,m_tmp_data[i_process_begin_item:i_process_end_item,],in_m_data_target, k_value );
m_tmp_err = getErr( m_neighbor_value,in_m_data_target[i_process_begin_item:i_process_end_item,],in_i_is_categorical );
m_err[,i_n_column+1] = m_tmp_err;
}
parfor( i in 1:i_n_column ,check=0){
if( as.scalar( m_feature_selected_flag[1,i] ) != 1 ){
m_tmp_process_data = cbind( m_tmp_data,in_m_data[,i] );
m_neighbor_value = getKNeighbor(
m_tmp_process_data,m_tmp_process_data[i_process_begin_item:i_process_end_item,],in_m_data_target,k_value );
m_tmp_err = getErr(
m_neighbor_value,in_m_data_target[i_process_begin_item:i_process_end_item,],in_i_is_categorical );
m_err[,i] = m_tmp_err;
}
}
}
if( m_this_model_selected_flag == TRUE )
m_active_flag_tmp = cbind( m_feature_selected_flag,matrix( 0,1,1 ) );
else
m_active_flag_tmp = cbind( m_feature_selected_flag,matrix( 1,1,1 ) );
s_rows_number = i_process_item
m_iter_err_sum = colSums(m_err) + m_iter_err_sum
m_iter_err_sum_squared = colSums(m_err ^ 2) + m_iter_err_sum_squared
m_err_mean = - outer(t(m_iter_err_sum), m_iter_err_sum , "-") / s_rows_number
m_err_vars = ( m_err_mean ^2 * s_rows_number -
2 * m_err_mean * m_iter_err_sum + m_iter_err_sum_squared) / (s_rows_number-1)
m_err_vars = sqrt(m_err_vars)
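# pairwise BRACE elimination over all candidate models, as in getSelectedKBase;
# column i_n_column+1 represents the current model without an added feature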
parfor( i in 1:ncol( m_err ) ){
parfor( j in 1:ncol( m_err ) ,check=0){
b_execute_block = TRUE;
if( j==i ) b_execute_block = FALSE;
if( as.scalar( m_active_flag_tmp[1,i] ) == 1 ) b_execute_block = FALSE;
#i has dropped, ignore this case
if( as.scalar( m_active_flag_tmp[1,j] ) == 1 ) b_execute_block = FALSE;
#j has dropped, ignore this case
if( b_execute_block ){
b_flag = eliminateModel( as.scalar(m_err_mean[i,j]),as.scalar(m_err_vars[i,j]),s_rows_number);
if( b_flag == TRUE )
m_active_flag_tmp[1,i] = 1;
}
}
}
#Bits are 1 for features selected before the current loop and also for
#features dropped during the current loop
if( sum( m_active_flag_tmp != 1 ) > 1 )
b_selected_morethan_one = TRUE;
m_col_sums_err = m_iter_err_sum;
i_index_min_LOOCV = as.scalar( rowIndexMin( m_col_sums_err ) );
d_min_LOOCV = as.scalar( m_col_sums_err[1,i_index_min_LOOCV] );
# modulo maps index i_n_column+1 ( the current model ) to 0
i_index_min_LOOCV = i_index_min_LOOCV %% ( i_n_column+1 );
if( sum( m_active_flag_tmp != 1 ) <= 1 )
b_continue_loop = FALSE;
if( as.scalar( m_active_flag_tmp[1,i_n_column+1] ) == 1 )
m_this_model_selected_flag = FALSE;
m_feature_selected_flag = m_active_flag_tmp[,1:i_n_column];
i_iter = i_iter + 1;
}
#the current model won: select it and stop the main loop
if( i_index_min_LOOCV == 0 ){
b_continue_main_loop = FALSE;
print( "Selected current model" );
}else{
print( "Selected feature " + i_index_min_LOOCV + ", setting its bit to 1" );
m_main_selected_flag[1,i_index_min_LOOCV] = 1;
b_no_feature_selected = FALSE;
}
if( sum( m_main_selected_flag - in_m_init_selected ) >= in_i_max_select ){
#reached the in_i_max_select limit on newly selected features
b_continue_main_loop = FALSE;
}
if( sum( m_main_selected_flag ) == i_n_column ){
#all selected
b_continue_main_loop = FALSE;
}
}
out_m_selected_feature = m_main_selected_flag;
out_d_min_LOOCV = d_min_LOOCV;
}