#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT IMPLEMENTS THE KNN (K-NEAREST NEIGHBOR) ALGORITHM
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME                      TYPE    DEFAULT  OPTIONAL  MEANING
# ---------------------------------------------------------------------------------------------
# X                         String  ---      N         Location to read the input matrix of features
# T                         String  ---      N         Location to read the input matrix for the
#                                                      nearest neighbor search
# Y                         String  ---      Y         Location to read the input matrix of targets
# Y_T                       String  ---      Y         Location to read the target type of matrix Y:
#                                                      whether columns in Y are continuous ( =1 )
#                                                      or categorical ( =2 )
# K                         String  ---      Y         Location to write the selected k value
# PR                        String  ---      Y         Location to write the predicted values;
#                                                      only meaningful when Y is specified
# NNR                       String  ---      N         Location to write the nearest neighbor
#                                                      indices for T
# trans_continuous          int     0        Y         Option flag for transforming continuous
#                                                      features to [-1,1]:
#                                                      0 = do not transform continuous variables;
#                                                      1 = transform continuous variables
# select_k                  int     0        Y         Use the k selection algorithm to estimate k
#                                                      ( 1 means yes )
# k_min                     int     1        Y         Minimum k value ( used if select_k = 1 )
# k_max                     int     100      Y         Maximum k value ( used if select_k = 1 )
# select_feature            int     0        Y         Use the feature selection algorithm to
#                                                      select features ( 1 means yes )
# START_SELECTED            String  ---      Y         Location to read the initial feature
#                                                      selection vector
# FEATURE_SELECTED          String  ---      Y         Location to write the feature selection result
# feature_max               int     10       Y         Maximum number of features to select
# interval                  int     ---      Y         Block size for k selection; defaults to
#                                                      ceil( sqrt( number of records ) )
#                                                      ( used if select_k = 1 )
# feature_importance        int     0        Y         Use the feature importance algorithm to
#                                                      score each feature ( 1 means yes )
# FEATURE_IMPORTANCE_VALUE  String  ---      Y         Location to write the feature importance result
# k_value                   int     5        Y         k value for KNN; ignored if select_k is enabled
# predict_con_tg            int     0        Y         Prediction function for continuous targets:
#                                                      mean ( =0 ) or median ( =1 )
# fmt                       String  "text"   Y         Matrix output format, usually "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT: Matrix PR, Matrix NNR, Matrix FEATURE_IMPORTANCE_VALUE
#
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f knn.dml -nvargs X=%datapath%/train.csv
# T=%datapath%/test.csv NNR=%tmpdir%/NNR.csv fmt=csv
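#
# A further example with prediction and k selection enabled (hypothetical
# file names; the Y/Y_T/PR locations and the k range are illustrative only):
# hadoop jar SystemML.jar -f knn.dml -nvargs X=%datapath%/train.csv
# T=%datapath%/test.csv Y=%datapath%/labels.csv Y_T=%datapath%/types.csv
# NNR=%tmpdir%/NNR.csv PR=%tmpdir%/PR.csv select_k=1 k_min=1 k_max=50 fmt=csv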
print( "start knn algorithm" );
#input data
TrainPath = $X;
TestPath = $T;
CLPath = ifdef( $Y," " );
CLTypePath = ifdef( $Y_T," " );
#output data
NNRPath = $NNR;
PRPath = ifdef( $PR," " );
#parameter
epsilon = ifdef( $epsilon,0 );
distance_type = ifdef( $distance_type,1 );
trans_continuous = ifdef( $trans_continuous,0 );
predict_con_tg = ifdef($predict_con_tg,0);
k_value = ifdef( $k_value,5 );
FMT = ifdef( $fmt,"text" );
#k select parameter
select_k = ifdef( $select_k,0 );
k_min = ifdef( $k_min,1 );
k_max = ifdef( $k_max,100 );
#feature select parameter
select_feature = ifdef( $select_feature,0 );
feature_max = ifdef( $feature_max,10 );
#feature importance parameter
feature_importance = ifdef ($feature_importance,0);
#Read train data
Train = read ( TrainPath );
Test = read ( TestPath );
#data prepare
if( trans_continuous == 1 ){
print( "Do data prepare to transfer continuous variable to [-1,1] ..." );
Train = prepareKNNData( Train);
Test = prepareKNNData( Test);
}
n_records = nrow( Train );
n_features = ncol( Train );
s_selected_k = 5;
m_selected_feature = matrix( 1,rows=1,cols=n_features ); # default: keep all features
if( select_k == 1 | select_feature==1 ){
default_interval = as.integer( ceil( sqrt( n_records ) ) );
interval = ifdef( $interval,default_interval );
print( "interval is " + interval );
#parameter check
if( CLPath == " " | CLTypePath == " "){
stop( "Y or Y_T must be set for k selection algorithm" );
}
#parameter re-define
if( select_k==1 ){
if( k_max >= n_records ){
k_max = n_records - 1;
print( "k_max should no greater than number of record, change k_max equal " +
"( number of record - 1 ) : " + k_max );
}
if( k_max >= interval ){
interval = k_max + 1;
# interval must be at least k_max + 1, because the query point itself is dropped when searching nearest neighbors.
print( "interval should be greater than k_max; setting interval to ( k_max + 1 ) : " +
interval );
}
if( k_max <= 1 ){
stop( "uncorrect k_max value" );
}
if( k_min >= k_max ){
stop( "k_min >= k_max" );
}
}
if( select_feature == 1 ){
if( k_value >= n_records ){
k_value = n_records - 1;
print( "k_value should be no greater than number of record, change k_value equal " +
"( number of record - 1 ) : " + k_value );
}
#read/write locations for feature selection
StartSelectedPath = ifdef( $START_SELECTED," " );
SelectedPath = ifdef( $FEATURE_SELECTED," " );
if( StartSelectedPath == " " ){
m_start_selected_feature = matrix( 0,1,n_features );
}else{
m_start_selected_feature = read( StartSelectedPath );
}
}
CL = read( CLPath );
CLType = read( CLTypePath );
type = as.integer( as.scalar( CLType[1,1] ) );
if( select_k == 1 & select_feature == 1){
#Combined k and feature selection
print("Start combined k and feature selection ...");
[m_selected_feature,s_selected_k] =
getSelectedFeatureAndK( Train,CL,type,m_start_selected_feature,
feature_max,k_min,k_max,interval );
}else if( select_k == 1 ){
#Select k only
print("Start k select ...");
s_selected_k = getSelectedKBase( Train,CL,type,k_min,k_max,interval );
}else if( select_feature == 1 ){
#Select feature only
print("Start feature selection ... ");
[m_selected_feature,d_err] =
getSelectedFeature( Train,CL,type,m_start_selected_feature,
feature_max,k_value,interval );
}
}
if( feature_importance == 1){
if( k_value >= n_records ){
k_value = n_records - 1;
print( "k_value should be no greater than number of record, make k_value equal " +
"( number of record - 1 ) : " + k_value );
}
CL = read( CLPath );
CLType = read( CLTypePath );
type = as.integer( as.scalar( CLType[1,1] ) );
FeatureImportanceExportPath = ifdef( $FEATURE_IMPORTANCE_VALUE," " );
if(FeatureImportanceExportPath != " "){
[m_feature_importance,m_norm_feature_importance] =
getFeatureImportance(Train,CL,type,k_value);
#print( "feature importance is " );
#tmp = matrixDisplay( m_feature_importance );
write ( m_feature_importance,FeatureImportanceExportPath,format=FMT );
}
}
# knn search
print("Start knn search ...")
# apply k selection result
if(select_k == 1){
k_value = s_selected_k
}
if( k_value >= n_records ){
k_value = n_records - 1;
print( "k_value should be no greater than number of record, make k_value equal " +
"( number of record - 1 )" );
}
print( "k value is " + k_value );
# apply feature selection result
if( select_feature == 1 ){
if( n_features > sum( m_selected_feature ) ){
Train = removeEmpty( target = Train,margin="cols",select = m_selected_feature);
Test = removeEmpty( target = Test,margin="cols",select = m_selected_feature );
}
}
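# NNR_matrix[i,j] holds the row index in Train of the j-th nearest neighbor of Test row i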
NNR_matrix = naiveKNNsearch(Train,Test,k_value)
write( NNR_matrix,NNRPath,format=FMT );
# main part for knn predicted
if( CLPath != " " ){
CL = read ( CLPath );
if( CLTypePath == " " ){
stop( "Set Y but Miss Y_T for label type" );
}
CLType = read( CLTypePath );
i_cl_type = as.integer( as.scalar( CLType[1,1] ) );
}
CL_matrix = matrix( 0,nrow( Test ),1 );
if( CLPath != " " ){
parfor( i in 1:nrow( NNR_matrix ) ,check=0){
NNR_tmp_matrix = matrix( 0,k_value,1 );
for( j in 1:k_value ){
NNR_tmp_matrix[j,1] = CL[as.scalar( NNR_matrix[i,j] ),1] ;
}
t_cl_value = 0.0 #test data label value
if( i_cl_type == 2 ){
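#majority vote: table() counts the occurrences of each label value and
#rowIndexMax picks the most frequent one (labels are assumed to be positive integers)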
t_cl_value = as.scalar( rowIndexMax( t( table( NNR_tmp_matrix,1 ) ) ) )
}else{
if ( predict_con_tg == 0){
t_cl_value = mean( NNR_tmp_matrix );
} else if(predict_con_tg == 1){
t_cl_value = median( NNR_tmp_matrix );
}
}
CL_matrix[i,1] = t_cl_value;
}
}
if( PRPath != " " ){
write( CL_matrix,PRPath,format=FMT );
}
#naive KNN search implementation
naiveKNNsearch = function(
matrix[double] P,
matrix[double] Q,
integer K
)return(
matrix[double] O
){
num_records = nrow (P);
num_features = ncol (P);
num_queries = nrow (Q);
Qt = t(Q);
PQt = P %*% Qt;
P2 = rowSums (P ^ 2);
D = -2 * PQt + P2;
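# squared Euclidean distance decomposes as ||p||^2 - 2*p.q + ||q||^2; the
# ||q||^2 term is constant within each column of D (one column per query),
# so it can be dropped without changing the neighbor ranking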
if (K == 1) {
Dt = t(D);
O = rowIndexMin (Dt);
} else {
O = matrix (0, rows = num_queries, cols = K);
parfor (i in 1:num_queries) {
D_sorted=order(target=D[,i], by=1, decreasing=FALSE, index.return=TRUE);
O[i,] = t(D_sorted[1:K,1]);
}
}
}
#naive KNN search that returns the neighbors' predicted label values only
naiveKNNsearchForPredict = function(
matrix[double] P,
matrix[double] Q,
matrix[double] L,
integer K
)return(
matrix[double] OL
){
num_records = nrow (P);
num_features = ncol (P);
num_queries = nrow (Q);
Qt = t(Q);
PQt = P %*% Qt;
P2 = rowSums (P ^ 2);
D = -2 * PQt + P2;
if (K == 1) {
Dt = t(D);
O = rowIndexMin (Dt);
OL = matrix (0, rows = num_queries, cols = 1)
parfor( i in 1:num_queries){
OL[i,] = L[as.scalar(O[i,1]),1]
}
} else {
OL = matrix (0, rows = num_queries, cols = K);
parfor (i in 1:num_queries) {
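# append the label column to the distances, sort rows by distance, and read
# the labels of the K nearest records from the second column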
D_sorted=order(target=append(D[,i],L), by=1, decreasing=FALSE, index.return=FALSE);
OL[i,] = t(D_sorted[1:K,2]);
}
}
}
# prepareKNNData:
# Data preparation - transform continuous variables to [-1,1]
# Argument:
# in_m_data input matrix as features
prepareKNNData = function (
matrix[double] in_m_data
)
return(
matrix[double] out_m_data
)
{
m_colmax = colMaxs(in_m_data);
m_colmin = colMins(in_m_data);
out_m_data = 2*( in_m_data - m_colmin )/( m_colmax - m_colmin ) - 1;
}
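# Example: for a column with min 2 and max 10, the value 6 maps to
# 2*( 6 - 2 )/( 10 - 2 ) - 1 = 0, and the column endpoints map to -1 and 1.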
getKNeighbor = function ( matrix[double] in_m_data,
matrix[double] in_m_test_data,
matrix[double] in_m_cl,
integer in_i_k_max )
return ( matrix[double] out_m_neighbor_value ) {
#search k_max + 1 neighbors via the naive search: the query points come from the
#training data itself, so the nearest neighbor of each point is the point itself;
#drop that first column
m_search_result = naiveKNNsearchForPredict(in_m_data,in_m_test_data,in_m_cl,in_i_k_max + 1)
out_m_neighbor_value = m_search_result[ ,2:(in_i_k_max + 1)]
}
getErr_k = function ( matrix[double] in_m_neighbor_value,
matrix[double] in_m_cl,
integer in_i_cl_type ,
integer in_i_k_min )
return ( matrix[double] out_m_err ){
i_col = ncol( in_m_neighbor_value );
i_row = nrow( in_m_neighbor_value );
out_m_err = matrix( 0,i_row,i_col - in_i_k_min + 1 );
if( in_i_cl_type == 2 ){
#category
m_correct = in_m_neighbor_value != in_m_cl[1:i_row,];
}else{
#continuous: squared error
m_correct = ( in_m_neighbor_value - in_m_cl[1:i_row,] )^2;
}
parfor( i in 1:(i_col - in_i_k_min + 1), check = 0 ){
out_m_err[,i] =
( rowSums( m_correct[,1:(in_i_k_min + i - 1)] ) / ( in_i_k_min + i - 1 ) );
}
#returns the err for each record and each k in [in_i_k_min, ncol(in_m_neighbor_value)]
}
/*
eliminateModel = function ( matrix[double] in_j_err, matrix[double] in_j1_err )
return( boolean out_b_inactived ){
out_b_inactived = FALSE;
i_row = nrow( in_j_err );
m_err_mean = as.scalar( colSums( in_j_err - in_j1_err ) / i_row );
m_err_variance = as.scalar(
sqrt( colSums( ( in_j_err - in_j1_err - m_err_mean ) ^ 2 ) / ( i_row-1 )))
#alpha, beta, gamma, delta
d_gamma = 0.001;
d_delta = 0.001;
tmp_d_delta = cdf(
target = ( -d_gamma - m_err_mean ) / m_err_variance, dist="t",df=i_row-1 );
#print( "m_err_mean=" + m_err_mean + ",m_err_variance=" +
# m_err_variance + ",tmp_d_delta=" + tmp_d_delta );
if( tmp_d_delta < d_delta ){
out_b_inactived = TRUE;
}
}*/
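# eliminateModel:
# BRACE elimination test: given the mean and standard deviation of the error
# difference between two models over i_row records, the model is flagged
# inactive when the one-sided Student-t probability
# P( T <= ( -d_gamma - s_err_mean ) / s_err_vars ), df = i_row - 1,
# falls below d_delta.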
eliminateModel = function ( double s_err_mean, double s_err_vars, integer i_row )
return( boolean out_b_inactived ){
out_b_inactived = FALSE;
#alpha, beta, gamma, delta
d_gamma = 0.001;
d_delta = 0.001;
tmp_d_delta = cdf(
target = ( -d_gamma - s_err_mean ) / s_err_vars, dist="t",df=i_row-1 );
#print( "m_err_mean=" + m_err_mean + ",m_err_variance=" +
# m_err_variance + ",tmp_d_delta=" + tmp_d_delta );
if( tmp_d_delta < d_delta ){
out_b_inactived = TRUE;
}
}
# getSelectedKBase:
# k selection algorithm with simple KNN algorithm.
# Refer to ADD part "6.3 Optimized LOOCV + BRACE"
# Argument:
# in_m_data input matrix as features
# in_m_data_target input matrix as target value
# in_i_is_categorical target type: 1 = continuous , 2 = categorical
# k_min minimum k
# k_max maximum k
# interval block size for BRACE algorithm
#
# Return:
# k output k value for k selection
getSelectedKBase = function (
matrix[double] in_m_data,
matrix[double] in_m_data_target,
integer in_i_is_categorical, # 1 = continuous , 2 = categorical
integer k_min,
integer k_max,
integer interval )
return(
integer k
)
{
b_continue_loop = TRUE;
i_iter = 1;
i_record = nrow( in_m_data );
m_active_flag = matrix( 0,1,k_max-k_min+1 );
i_active_model_number = k_max - k_min + 1;
m_iter_err_sum = matrix( 0,1,k_max-k_min+1);
m_iter_err_sum_squared = matrix( 0,1,k_max-k_min+1);
while( b_continue_loop ){
# 1.build k-d tree? , or use hash method
# 2.search data to get k_max nearest neighbor
i_process_item = i_iter*interval;
if( i_process_item >= i_record ){
i_process_item = i_record;
b_continue_loop = FALSE;
}
i_process_begin_item = (i_iter - 1)*interval + 1
#i_process_begin_item = 1
i_process_end_item = i_process_item
m_neighbor_value = getKNeighbor( in_m_data,in_m_data[i_process_begin_item:i_process_end_item,],in_m_data_target,k_max );
# 3.get matrix of err from k_min to k_max
m_err = getErr_k( m_neighbor_value,in_m_data_target[i_process_begin_item:i_process_end_item,],in_i_is_categorical,k_min );
# 4.check this matrix to drop eliminated models
m_active_flag_tmp = matrix( 0,1,ncol( m_err ) );
s_rows_number = i_process_item; # total records processed so far
m_iter_err_sum = colSums(m_err) + m_iter_err_sum
m_iter_err_sum_squared = colSums(m_err ^ 2) + m_iter_err_sum_squared
m_err_mean = - outer(t(m_iter_err_sum), m_iter_err_sum , "-") / s_rows_number
m_err_vars = ( m_err_mean ^2 * s_rows_number -
2 * m_err_mean * m_iter_err_sum + m_iter_err_sum_squared) / (s_rows_number-1)
m_err_vars = sqrt(m_err_vars)
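# m_err_mean[i,j] and m_err_vars[i,j] are pairwise mean/deviation statistics
# assembled from the running sums over all blocks processed so far; they feed
# the t-test in eliminateModel below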
parfor( i in 1:ncol( m_err ) ){
#m_err_i = m_err - m_err[,i];
#m_err_i_mean = colMeans(m_err_i);
#m_err_i_vars = colVars(m_err_i);
parfor( j in 1:ncol( m_err ),check=0 ){
b_execute_block = TRUE;
if( j==i ) b_execute_block = FALSE;
if( as.scalar( m_active_flag_tmp[1,i] ) == 1 ) b_execute_block = FALSE;
#i has dropped, ignore this case
if( as.scalar( m_active_flag_tmp[1,j] ) == 1 ) b_execute_block = FALSE;
#j has dropped, ignore this case
if( b_execute_block ){
b_flag = eliminateModel( as.scalar(m_err_mean[i,j]),as.scalar(m_err_vars[i,j]),s_rows_number);
if( b_flag == TRUE ){
m_active_flag_tmp[1,i] = 1;
}
}
}
}
m_active_flag = ( m_active_flag + m_active_flag_tmp ) >= 1;
i_active_model_number = sum( m_active_flag == 0 ); # number of models still active
# 5.break while check
if( i_active_model_number <= 1 ){
b_continue_loop = FALSE;
}
i_iter = i_iter + 1;
print("i_iter" + i_iter)
}
k = 0;
if( i_active_model_number == 0 ){
print( "All k kick out, use min of range " + k_min );
k = k_min;
}else if( i_active_model_number == 1 ){
k = k_min + as.integer( as.scalar( rowIndexMin( m_active_flag ) ) ) - 1;
print( "Get k, which value is " + k );
}else{
m_err_for_order =
append( t( m_iter_err_sum ), matrix( seq( k_min,k_max,1 ),k_max-k_min+1,1 ) );
m_err_for_order = removeEmpty(
target=m_err_for_order * t( m_active_flag == 0 ),margin="rows" );
for( i in 1:nrow( m_err_for_order ) ){
print( "k:" + as.scalar( m_err_for_order[i,2] ) +
",err:" + as.scalar( m_err_for_order[i,1] ) );
}
m_err_order = order( target=m_err_for_order,by=1,decreasing=FALSE,index.return=FALSE );
k = as.integer( as.scalar( m_err_order[1,2] ) );
print( "Get minimum LOOCV error, which value is " + k );
}
}
getErr = function ( matrix[double] in_m_neighbor_value,
matrix[double] in_m_cl,
integer in_i_cl_type )
return ( matrix[double] out_m_err ){
i_col = ncol( in_m_neighbor_value );
i_row = nrow( in_m_neighbor_value );
if( in_i_cl_type == 2 ){
#category
m_correct = in_m_neighbor_value != in_m_cl[1:i_row,];
}else{
#continuous: squared error
m_correct = ( in_m_neighbor_value - in_m_cl[1:i_row,] )^2;
}
out_m_err = ( rowSums( m_correct[,1:i_col] )/( i_col ) );
}
# getSelectedFeature:
# feature selection algorithm.
# Refer to ADD part "7.1 FS+BRACE"
# Argument:
# in_m_data input matrix as features
# in_m_data_target input matrix as target value
# in_i_is_categorical target type: 1 = continuous , 2 = categorical
# in_m_init_selected S.user, initial selected features specified by the user
# in_i_max_select J, max number of features selected
# k_value k
# interval block size for BRACE algorithm
#
# Return:
# out_m_selected_feature output matrix for feature selection
# out_d_min_LOOCV output err
getSelectedFeature = function (
matrix[double] in_m_data,
matrix[double] in_m_data_target,
integer in_i_is_categorical, # 1 = continuous , 2 = categorical
matrix[double] in_m_init_selected,
integer in_i_max_select,
integer k_value,
integer interval )
return(
matrix[double] out_m_selected_feature,
double out_d_min_LOOCV
)
{
i_n_record = nrow( in_m_data );
i_n_column = ncol( in_m_data );
m_main_selected_flag = in_m_init_selected;
b_no_feature_selected = TRUE;
if( sum( in_m_init_selected ) >= 1 ){
b_no_feature_selected = FALSE;
}
d_max_err_value = ( max( in_m_data_target ) - min( in_m_data_target ) ) * 100;
b_continue_main_loop = TRUE; #level 1 while loop flag
d_min_LOOCV = 1.0/0.0; # initialize to +Infinity
while( b_continue_main_loop ){
m_feature_selected_flag = m_main_selected_flag;
m_this_model_selected_flag = TRUE;
i_index_min_LOOCV = -1; # flag for which model wins in the BRACE algorithm
b_selected_morethan_one = FALSE;
b_continue_loop = TRUE; #level 2 while loop flag
i_iter = 1;
m_iter_err_sum = matrix( 0,1,i_n_column+1 );
m_iter_err_sum_squared = matrix( 0,1,i_n_column+1 );
while( b_continue_loop ){
#print( "*************" + i_iter + "*************" );
i_process_item = i_iter*interval;
if( i_process_item >= i_n_record ){
i_process_item = i_n_record;
b_continue_loop = FALSE;
}
i_process_begin_item = (i_iter - 1)*interval + 1
#i_process_begin_item = 1
i_process_end_item = i_process_item
m_err = matrix( d_max_err_value,i_process_end_item - i_process_begin_item + 1,i_n_column+1 );
if( b_no_feature_selected == TRUE ){
parfor( i in 1:i_n_column ,check=0){
if( as.scalar( m_feature_selected_flag[1,i] ) != 1 ){
m_tmp_process_data = in_m_data[,i];
m_neighbor_value = getKNeighbor(
m_tmp_process_data,m_tmp_process_data[i_process_begin_item:i_process_end_item,],
in_m_data_target,k_value );
m_tmp_err = getErr(
m_neighbor_value,in_m_data_target[i_process_begin_item:i_process_end_item,],in_i_is_categorical );
m_err[,i] = m_tmp_err;
}
}
}else{
#Use m_main_selected_flag rather than m_feature_selected_flag:
# m_main_selected_flag: features selected so far
# m_feature_selected_flag: features dropped or already selected
m_tmp_data = removeEmpty( target=in_m_data ,margin="cols", select = m_main_selected_flag);
if( m_this_model_selected_flag == TRUE ){
m_neighbor_value = getKNeighbor(
m_tmp_data,m_tmp_data[i_process_begin_item:i_process_end_item,],in_m_data_target, k_value );
m_tmp_err = getErr( m_neighbor_value,in_m_data_target[i_process_begin_item:i_process_end_item,],in_i_is_categorical );
m_err[,i_n_column+1] = m_tmp_err;
}
parfor( i in 1:i_n_column ,check=0){
if( as.scalar( m_feature_selected_flag[1,i] ) != 1 ){
m_tmp_process_data = append( m_tmp_data,in_m_data[,i] );
m_neighbor_value = getKNeighbor(
m_tmp_process_data,m_tmp_process_data[i_process_begin_item:i_process_end_item,],in_m_data_target,k_value );
m_tmp_err = getErr(
m_neighbor_value,in_m_data_target[i_process_begin_item:i_process_end_item,],in_i_is_categorical );
m_err[,i] = m_tmp_err;
}
}
}
if( m_this_model_selected_flag == TRUE ){
m_active_flag_tmp = append( m_feature_selected_flag,matrix( 0,1,1 ) );
}else{
m_active_flag_tmp = append( m_feature_selected_flag,matrix( 1,1,1 ) );
}
s_rows_number = i_process_item; # total records processed so far
m_iter_err_sum = colSums(m_err) + m_iter_err_sum
m_iter_err_sum_squared = colSums(m_err ^ 2) + m_iter_err_sum_squared
m_err_mean = - outer(t(m_iter_err_sum), m_iter_err_sum , "-") / s_rows_number
m_err_vars = ( m_err_mean ^2 * s_rows_number -
2 * m_err_mean * m_iter_err_sum + m_iter_err_sum_squared) / (s_rows_number-1)
m_err_vars = sqrt(m_err_vars)
parfor( i in 1:ncol( m_err ) ){
#m_err_i_mean = (m_iter_err_sum - m_iter_err_sum[,i])/s_rows_number
#m_err_i_vars = (m_iter_err_sum_squared + m_err_i_mean ^2 * s_rows_number -
# 2 * m_iter_err_sum * m_err_i_mean) / (s_rows_number-1)
#m_err_i_vars = sqrt(m_err_i_vars)
#m_err_i_mean = m_err_mean[i,]
#m_err_i_vars = m_err_vars[i,]
parfor( j in 1:ncol( m_err ) ,check=0){
b_execute_block = TRUE;
if( j==i ) b_execute_block = FALSE;
if( as.scalar( m_active_flag_tmp[1,i] ) == 1 ) b_execute_block = FALSE;
#i has dropped, ignore this case
if( as.scalar( m_active_flag_tmp[1,j] ) == 1 ) b_execute_block = FALSE;
#j has dropped, ignore this case
if( b_execute_block ){
#print( "i=" + i + ",j=" + j );
b_flag = eliminateModel( as.scalar(m_err_mean[i,j]),as.scalar(m_err_vars[i,j]),s_rows_number);
if( b_flag == TRUE ){
m_active_flag_tmp[1,i] = 1;
}
}
}
}
#Bits equal to 1 mark features selected before the current loop,
#as well as features dropped during the current loop
if( sum( m_active_flag_tmp != 1 ) > 1 ){
b_selected_morethan_one = TRUE;
}
m_col_sums_err = m_iter_err_sum; # accumulated per-model error sums
i_index_min_LOOCV = as.scalar( rowIndexMin( m_col_sums_err ) );
d_min_LOOCV = as.scalar( m_col_sums_err[1,i_index_min_LOOCV] );
i_index_min_LOOCV = i_index_min_LOOCV %% ( i_n_column+1 ) # the "current model" column maps to 0
if( sum( m_active_flag_tmp != 1 ) <= 1 ){
b_continue_loop = FALSE;
}
if( as.scalar( m_active_flag_tmp[1,i_n_column+1] ) == 1 ){
m_this_model_selected_flag = FALSE;
}
m_feature_selected_flag = m_active_flag_tmp[,1:i_n_column];
i_iter = i_iter + 1;
}
#select current model, jump out.
if( i_index_min_LOOCV == 0 ){
b_continue_main_loop = FALSE;
print( "Select Current model" );
}else{
print( "select feature " + i_index_min_LOOCV + ", change bit value to 1" );
m_main_selected_flag[1,i_index_min_LOOCV] = 1;
b_no_feature_selected = FALSE;
}
if( sum( m_main_selected_flag - in_m_init_selected ) >= in_i_max_select ){
#reached the maximum number of features to select ( in_i_max_select )
b_continue_main_loop = FALSE;
}
if( sum( m_main_selected_flag ) == i_n_column ){
#all selected
b_continue_main_loop = FALSE;
}
}
out_m_selected_feature = m_main_selected_flag;
out_d_min_LOOCV = d_min_LOOCV;
}
# getSelectedFeatureAndK:
# Combine k and feature selection algorithm.
# Refer to ADD part "8.Combined k and feature selection"
# Argument:
# in_m_data input matrix as features
# in_m_data_target input matrix as target value
# in_i_is_categorical target type: 1 = continuous , 2 = categorical
# in_m_init_selected S.user, initial selected features specified by the user
# in_i_max_select J, max number of features selected
# k_min minimum k
# k_max maximum k
# interval block size for BRACE algorithm
#
# Return:
# out_m_selected_feature output matrix for feature selection
# out_i_selected_k output k value for k selection
getSelectedFeatureAndK = function (
matrix[double] in_m_data,
matrix[double] in_m_data_target,
integer in_i_is_categorical, # 1 = continuous , 2 = categorical
matrix[double] in_m_init_selected,
integer in_i_max_select,
integer k_min,
integer k_max,
integer interval )
return(
matrix[double] out_m_selected_feature,
integer out_i_selected_k
)
{
m_err = matrix( 0,1,k_max-k_min+1 );
m_feature = matrix( 0,k_max-k_min+1,ncol( in_m_data ) );
#Step 1. For each k in [k_min,k_max] ( k_min has default value 1, k_max has default value 100 )
#in parallel select relevant features using FS+BRACE or schemata search described in Section 7.
parfor( i in k_min:k_max,check=0 ){
#print( " ************** " + i + " ****************" );
[m_selected_feature,d_err] =
getSelectedFeature( in_m_data,in_m_data_target,in_i_is_categorical
,in_m_init_selected,in_i_max_select,i,interval );
m_err[1,i - k_min + 1] = d_err;
m_feature[i - k_min + 1,] = m_selected_feature;
#print( "err:" + d_err );
}
#Step 2. Output the combination of features and k with the smallest LOOCV error.
i_min_err_index = as.integer( as.scalar( rowIndexMin( m_err ) ) );
out_m_selected_feature = m_feature[i_min_err_index,];
out_i_selected_k = i_min_err_index + k_min - 1;
#print( "min err k is " + out_i_selected_k );
}
getFeatureImportance = function (
matrix[double] in_m_data,
matrix[double] in_m_data_target,
integer in_i_is_categorical, # 1 = continuous , 2 = categorical
integer k_value)
return(
matrix[double] out_m_feature_importance,
matrix[double] out_m_norm_feature_importance
)
{
n_feature = ncol(in_m_data)
n_record = nrow(in_m_data)
if(n_feature <= 1){
stop("can't estimate feature importance when ncol = 1")
}
m_err = matrix( 0,n_record,n_feature);
parfor(i_feature in 1:n_feature, check=0){
m_feature_select = matrix(1,1,n_feature)
m_feature_select[1,i_feature] = 0;
m_in_tmp_data = removeEmpty(target=in_m_data,margin="cols", select= m_feature_select)
m_neighbor_value = getKNeighbor( m_in_tmp_data,m_in_tmp_data,in_m_data_target,k_value);
m_tmp_err = getErr( m_neighbor_value,in_m_data_target ,in_i_is_categorical );
m_err[,i_feature] = m_tmp_err
}
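# a feature's importance is the total LOOCV error measured with that feature
# left out: the larger the error without it, the more the feature matters;
# the normalized variant below sums to 1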
out_m_feature_importance = colSums( m_err );
out_m_norm_feature_importance =
out_m_feature_importance / as.scalar(rowSums(out_m_feature_importance))
}