| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| # |
| # THIS SCRIPT IMPLEMENTS KNN( K Nearest Neighbor ) ALGORITHM |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT OPTIONAL MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X String --- N Location to read the input matrix as features |
| # T String --- N Location to read the input matrix for nearest neighbor |
| # search |
| # Y String --- Y Location to read the input matrix as target |
| # Y_T String --- Y Location to read the target type of matrix Y whether |
| # columns in Y are continuous ( =1 ) or |
| # categorical ( =2 ) |
| # K String --- Y Location to write the result of selected k value |
| # PR String --- Y Location to write the predicted value result, |
| # it's optional when Y is NOT specified. |
| # NNR String --- N Location to write the nearest neighbor index result for T. |
| # trans_continuous int 0 Y Option flag for continuous feature transformed to [-1,1]: |
| # 0 = do not transform continuous variable; |
| # 1 = transform continuous variable; |
| # select_k int 0 Y Use k selection algorithm to estimate k |
| # ( 1 means yes ) |
| # k_min int 1 Y Min k value( available if select_k = 1 ) |
| # k_max int 100 Y Max k value( available if select_k = 1 ) |
| # select_feature int 0 Y Use feature selection algorithm to select feature |
| # ( 1 means yes ) |
| # START_SELECTED String --- Y Location to read feature selection initinal value |
| # FEATURE_SELECTED String --- Y Location to write feature selection result |
| # feature_max int 10 Y Max feature selection |
| # interval int 1000 Y Interval value for K selecting ( available if select_k = 1 ) |
| # feature_importance int 0 Y Use feature importance algorithm to estimate each feature |
| # ( 1 means yes ) |
| # FEATURE_IMPORTANCE_VALUE String --- Y Location to write feature importance result |
| # k_value int 5 Y k value for KNN, ignore if select_k enable |
| # predict_con_tg int 0 Y Continuous target predict function: mean(=0) or |
| # median(=1) |
| # fmt String "text" Y Matrix output format for MB, ML, usually "text" or "csv" |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: Matrix PR, Matrix NNR, Matrix FEATURE_IMPORTANCE_VALUE |
| # |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar SystemML.jar -f knn.dml -nvargs X=%datapath%/train.csv |
| # T=%datapath%/test.csv NNR=%tmpdir%/NNR.csv fmt=csv |
| |
| |
| print( "start knn algorithm" ); |
| #input data |
| TrainPath = $X; |
| TestPath = $T; |
| CLPath = ifdef( $Y," " ); |
| CLTypePath = ifdef( $Y_T," " ); |
| #output data |
| NNRPath = $NNR; |
| PRPath = ifdef( $PR," " ); |
| #parameter |
| epsilon = ifdef( $epsilon,0 ); |
| distance_type = ifdef( $distance_type,1 ); |
| trans_continuous = ifdef( $trans_continuous,0 ); |
| predict_con_tg = ifdef($predict_con_tg,0); |
| k_value = ifdef( $k_value,5 ); |
| FMT = ifdef( $fmt,"text" ); |
| #k select parameter |
| select_k = ifdef( $select_k,0 ); |
| k_min = ifdef( $k_min,1 ); |
| k_max = ifdef( $k_max,100 ); |
| #feature select parameter |
| select_feature = ifdef( $select_feature,0 ); |
| feature_max = ifdef( $feature_max,10 ); |
| #feature importance parameter |
| feature_importance = ifdef ($feature_importance,0); |
| |
| |
| #Read train data |
| Train = read ( TrainPath ); |
| Test = read ( TestPath ); |
| |
| #data prepare |
| if( trans_continuous == 1 ){ |
| print( "Do data prepare to transfer continuous variable to [-1,1] ..." ); |
| Train = prepareKNNData( Train); |
| Test = prepareKNNData( Test); |
| } |
| |
| n_records = nrow( Train ); |
| n_features = ncol( Train ); |
| s_selected_k = 5; |
| m_selected_feature = matrix(1,rows=1,cols=n_records); |
| if( select_k == 1 | select_feature==1 ){ |
| default_interval = as.integer( ceil( sqrt( n_records ) ) ); |
| interval = ifdef( $interval,default_interval ); |
| print( "interval is " + interval ); |
| #parameter check |
| if( CLPath == " " | CLTypePath == " "){ |
| stop( "Y or Y_T must be set for k selection algorithm" ); |
| } |
| #parameter re-define |
| if( select_k==1 ){ |
| if( k_max >= n_records ){ |
| k_max = n_records - 1; |
| print( "k_max should no greater than number of record, change k_max equal " + |
| "( number of record - 1 ) : " + k_max ); |
| } |
| if( k_max >= interval ){ |
| interval = k_max + 1; |
| # k_max should equal interval -1, because we drop self when search nn. |
| print( "interval should be no less than k_max, change interval equal : " + |
| interval ); |
| } |
| if( k_max <= 1 ){ |
| stop( "uncorrect k_max value" ); |
| } |
| if( k_min >= k_max ){ |
| stop( "k_min >= k_max" ); |
| } |
| } |
| if( select_feature == 1 ){ |
| if( k_value >= n_records ){ |
| k_value = n_records - 1; |
| print( "k_value should be no greater than number of record, change k_value equal " + |
| "( number of record - 1 ) : " + k_value ); |
| } |
| #Select feature only |
| StartSelectedPath = ifdef( $START_SELECTED," " ); |
| SelectedPath = ifdef( $FEATURE_SELECTED," " ); |
| if( StartSelectedPath == " " ){ |
| m_start_selected_feature = matrix( 0,1,n_features ); |
| }else{ |
| m_start_selected_feature = read( StartSelectedPath ); |
| } |
| } |
| CL = read( CLPath ); |
| CLType = read( CLTypePath ); |
| type = as.integer( as.scalar( CLType[1,1] ) ); |
| if( select_k == 1 & select_feature == 1){ |
| #Combined k and feature selection |
| print("Start combined k and feature selection ..."); |
| [m_selected_feature,s_selected_k] = |
| getSelectedFeatureAndK( Train,CL,type,m_start_selected_feature, |
| feature_max,k_min,k_max,interval ); |
| }else if( select_k == 1 ){ |
| #Select k only |
| print("Start k select ..."); |
| s_selected_k = getSelectedKBase( Train,CL,type,k_min,k_max,interval ); |
| }else if( select_feature == 1 ){ |
| #Select feature only |
| print("Start feature selection ... "); |
| [m_selected_feature,d_err] = |
| getSelectedFeature( Train,CL,type,m_start_selected_feature, |
| feature_max,k_value,interval ); |
| } |
| } |
| if( feature_importance == 1){ |
| if( k_value >= n_records ){ |
| k_value = n_records - 1; |
| print( "k_value should be no greater than number of record, make k_value equal " + |
| "( number of record - 1 ) : " + k_value ); |
| } |
| CL = read( CLPath ); |
| CLType = read( CLTypePath ); |
| type = as.integer( as.scalar( CLType[1,1] ) ); |
| FeatureImportanceExportPath = ifdef( $FEATURE_IMPORTANCE_VALUE," " ); |
| if(FeatureImportanceExportPath != " "){ |
| [m_feature_importance,m_norm_feature_importance] = |
| getFeatureImportance(Train,CL,type,k_value); |
| #print( "feature importance is " ); |
| #tmp = matrixDisplay( m_feature_importance ); |
| write ( m_feature_importance,FeatureImportanceExportPath,format=FMT ); |
| } |
| } |
| |
| # knn search |
| print("Start knn search ...") |
| # apply k selection result |
| if(select_k == 1){ |
| k_value = s_selected_k |
| } |
| if( k_value >= n_records ){ |
| k_value = n_records - 1; |
| print( "k_value should be no greater than number of record, make k_value equal " + |
| "( number of record - 1 )" ); |
| } |
| print( "k value is " + k_value ); |
| # apply feature selection result |
| if( select_feature == 1 ){ |
| if( n_features > sum( m_selected_feature ) ){ |
| Train = removeEmpty( target = Train,margin="cols",select = m_selected_feature); |
| Test = removeEmpty( target = Test,margin="cols",select = m_selected_feature ); |
| } |
| } |
| |
| NNR_matrix = naiveKNNsearch(Train,Test,k_value) |
| write( NNR_matrix,NNRPath,format=FMT ); |
| |
| # main part for knn predicted |
| if( CLPath != " " ){ |
| CL = read ( CLPath ); |
| if( CLTypePath == " " ){ |
| stop( "Set Y but Miss Y_T for label type" ); |
| } |
| CLType = read( CLTypePath ); |
| i_cl_type = as.integer( as.scalar( CLType[1,1] ) ); |
| } |
| CL_matrix = matrix( 0,nrow( Test ),1 ); |
| if( CLPath != " " ){ |
| parfor( i in 1:nrow( NNR_matrix ) ,check=0){ |
| NNR_tmp_matrix = matrix( 0,k_value,1 ); |
| for( j in 1:k_value ){ |
| NNR_tmp_matrix[j,1] = CL[as.scalar( NNR_matrix[i,j] ),1] ; |
| } |
| t_cl_value = 0.0 #test data label value |
| if( i_cl_type == 2 ){ |
| t_cl_value = as.scalar( rowIndexMax( t( table( NNR_tmp_matrix,1 ) ) ) ) |
| }else{ |
| if ( predict_con_tg == 0){ |
| t_cl_value = mean( NNR_tmp_matrix ); |
| } else if(predict_con_tg == 1){ |
| t_cl_value = median( NNR_tmp_matrix ); |
| } |
| } |
| CL_matrix[i,1] = t_cl_value; |
| } |
| } |
| if( PRPath != " " ){ |
| write( CL_matrix,PRPath,format=FMT ); |
| } |
| |
| #naive knn search implement |
| naiveKNNsearch = function( |
| matrix[double] P, |
| matrix[double] Q, |
| integer K |
| )return( |
| matrix[double] O |
| ){ |
| num_records = nrow (P); |
| num_features = ncol (P); |
| num_queries = nrow (Q); |
| Qt = t(Q); |
| PQt = P %*% Qt; |
| P2 = rowSums (P ^ 2); |
| D = -2 * PQt + P2; |
| if (K == 1) { |
| Dt = t(D); |
| O = rowIndexMin (Dt); |
| } else { |
| O = matrix (0, rows = num_queries, cols = K); |
| parfor (i in 1:num_queries) { |
| D_sorted=order(target=D[,i], by=1, decreasing=FALSE, index.return=TRUE); |
| O[i,] = t(D_sorted[1:K,1]); |
| } |
| } |
| } |
| #naive knn search for predict value only implement |
| naiveKNNsearchForPredict = function( |
| matrix[double] P, |
| matrix[double] Q, |
| matrix[double] L, |
| integer K |
| )return( |
| matrix[double] OL |
| ){ |
| num_records = nrow (P); |
| num_features = ncol (P); |
| num_queries = nrow (Q); |
| Qt = t(Q); |
| PQt = P %*% Qt; |
| P2 = rowSums (P ^ 2); |
| D = -2 * PQt + P2; |
| if (K == 1) { |
| Dt = t(D); |
| O = rowIndexMin (Dt); |
| OL = matrix (0, rows = num_queries, cols = 1) |
| parfor( i in 1:num_queries){ |
| OL[i,] = L[as.scalar(O[i,1]),1] |
| } |
| } else { |
| OL = matrix (0, rows = num_queries, cols = K); |
| parfor (i in 1:num_queries) { |
| D_sorted=order(target=append(D[,i],L), by=1, decreasing=FALSE, index.return=FALSE); |
| OL[i,] = t(D_sorted[1:K,2]); |
| } |
| } |
| } |
| # prepareKNNData: |
| # Do data prepare - [-1,1] transform for continues variable |
| # Argument: |
| # in_m_data input matrix as features |
| prepareKNNData = function ( |
| matrix[double] in_m_data |
| ) |
| return( |
| matrix[double] out_m_data |
| ) |
| { |
| m_colmax = colMaxs(in_m_data); |
| m_colmin = colMins(in_m_data); |
| out_m_data = 2*( in_m_data - m_colmin )/( m_colmax - m_colmin ) - 1; |
| } |
| |
| getKNeighbor = function ( matrix[double] in_m_data, |
| matrix[double] in_m_test_data, |
| matrix[double] in_m_cl, |
| integer in_i_k_max ) |
| return ( matrix[double] out_m_neighbor_value ) { |
| #to naive |
| m_search_result = naiveKNNsearchForPredict(in_m_data,in_m_test_data,in_m_cl,in_i_k_max + 1) |
| out_m_neighbor_value = m_search_result[ ,2:in_i_k_max + 1] |
| } |
| |
| getErr_k = function ( matrix[double] in_m_neighbor_value, |
| matrix[double] in_m_cl, |
| integer in_i_cl_type , |
| integer in_i_k_min ) |
| return ( matrix[double] out_m_err ){ |
| i_col = ncol( in_m_neighbor_value ); |
| i_row = nrow( in_m_neighbor_value ); |
| |
| out_m_err = matrix( 0,i_row,i_col - in_i_k_min + 1 ); |
| if( in_i_cl_type == 2 ){ |
| #category |
| m_correct = ppred( in_m_neighbor_value,in_m_cl[1:i_row,],"!=" ); |
| }else{ |
| #continues |
| m_correct = (in_m_neighbor_value - in_m_cl[1:i_row,])^2;#ppred( in_m_neighbor_value,in_m_cl,"-" ); |
| } |
| parfor( i in 1:i_col-in_i_k_min+1 ,check = 0 ){ |
| out_m_err[,i] = |
| ( rowSums( m_correct[,1:in_i_k_min + i - 1] ) / ( in_i_k_min + i - 1 ) ); |
| } |
| #return err for each record and each k ( belong to range 1~max ); |
| } |
| /* |
| eliminateModel = function ( matrix[double] in_j_err, matrix[double] in_j1_err ) |
| return( boolean out_b_inactived ){ |
| out_b_inactived = FALSE; |
| i_row = nrow( in_j_err ); |
| m_err_mean = as.scalar( colSums( in_j_err - in_j1_err ) / i_row ); |
| m_err_variance = as.scalar( |
| sqrt( colSums( ( in_j_err - in_j1_err - m_err_mean ) ^ 2 ) / ( i_row-1 ))) |
| |
| #alpha, beta, gamma, delta |
| d_gamma = 0.001; |
| d_delta = 0.001; |
| |
| tmp_d_delta = cdf( |
| target = ( -d_gamma - m_err_mean ) / m_err_variance, dist="t",df=i_row-1 ); |
| #print( "m_err_mean=" + m_err_mean + ",m_err_variance=" + |
| # m_err_variance + ",tmp_d_delta=" + tmp_d_delta ); |
| if( tmp_d_delta < d_delta ){ |
| out_b_inactived = TRUE; |
| } |
| |
| }*/ |
| eliminateModel = function ( double s_err_mean, double s_err_vars, integer i_row ) |
| return( boolean out_b_inactived ){ |
| out_b_inactived = FALSE; |
| #alpha, beta, gamma, delta |
| d_gamma = 0.001; |
| d_delta = 0.001; |
| |
| tmp_d_delta = cdf( |
| target = ( -d_gamma - s_err_mean ) / s_err_vars, dist="t",df=i_row-1 ); |
| #print( "m_err_mean=" + m_err_mean + ",m_err_variance=" + |
| # m_err_variance + ",tmp_d_delta=" + tmp_d_delta ); |
| if( tmp_d_delta < d_delta ){ |
| out_b_inactived = TRUE; |
| } |
| |
| } |
| # getSelectedKBase: |
| # k selection algorithm with simple KNN algorithm. |
| # Refer to ADD part "6.3 Optimized LOOCV + BRACE" |
| # Argument: |
| # in_m_data input matrix as features |
| # in_m_data_target input matrix as target value |
| # in_i_is_categorical 1 = category , 0 = continuous |
| # k_min minimun k |
| # k_max maximun k |
| # interval block size for BRACE algorithm |
| # |
| # Reture: |
| # k output k value for k selection |
| getSelectedKBase = function ( |
| matrix[double] in_m_data, |
| matrix[double] in_m_data_target, |
| integer in_i_is_categorical, # 1 = category , 0 = continuous |
| integer k_min, |
| integer k_max, |
| integer interval ) |
| return( |
| integer k |
| ) |
| { |
| b_continue_loop = TRUE; |
| i_iter = 1; |
| i_record = nrow( in_m_data ); |
| |
| m_active_flag = matrix( 0,1,k_max-k_min+1 ); |
| i_active_model_number = k_max - k_min + 1; |
| |
| m_iter_err_sum = matrix( 0,1,k_max-k_min+1); |
| m_iter_err_sum_squared = matrix( 0,1,k_max-k_min+1); |
| while( b_continue_loop ){ |
| # 1.build k-d tree? , or use hash method |
| # 2.search data to get k_max nearest neighbor |
| i_process_item = i_iter*interval; |
| if( i_process_item >= i_record ){ |
| i_process_item = i_record; |
| b_continue_loop = FALSE; |
| } |
| i_process_begin_item = (i_iter - 1)*interval + 1 |
| #i_process_begin_item = 1 |
| i_process_end_item = i_process_item |
| |
| m_neighbor_value = getKNeighbor( in_m_data,in_m_data[i_process_begin_item:i_process_end_item,],in_m_data_target,k_max ); |
| # 3.get matrix of err from k_min to k_max |
| m_err = getErr_k( m_neighbor_value,in_m_data_target[i_process_begin_item:i_process_end_item,],in_i_is_categorical,k_min ); |
| |
| # 4.check this matrix to drop unnessary record |
| m_active_flag_tmp = matrix( 0,1,ncol( m_err ) ); |
| |
| s_rows_number = i_process_item#nrow(m_err); |
| |
| m_iter_err_sum = colSums(m_err) + m_iter_err_sum |
| m_iter_err_sum_squared = colSums(m_err ^ 2) + m_iter_err_sum_squared |
| |
| m_err_mean = - outer(t(m_iter_err_sum), m_iter_err_sum , "-") / s_rows_number |
| m_err_vars = ( m_err_mean ^2 * s_rows_number - |
| 2 * m_err_mean * m_iter_err_sum + m_iter_err_sum_squared) / (s_rows_number-1) |
| m_err_vars = sqrt(m_err_vars) |
| |
| parfor( i in 1:ncol( m_err ) ){ |
| #m_err_i = m_err - m_err[,i]; |
| #m_err_i_mean = colMeans(m_err_i); |
| #m_err_i_vars = colVars(m_err_i); |
| parfor( j in 1:ncol( m_err ),check=0 ){ |
| b_execute_block = TRUE; |
| if( j==i ) b_execute_block = FALSE; |
| if( as.scalar( m_active_flag_tmp[1,i] ) == 1 ) b_execute_block = FALSE; |
| #i has dropped, ignore this case |
| if( as.scalar( m_active_flag_tmp[1,j] ) == 1 ) b_execute_block = FALSE; |
| #j has dropped, ignore this case |
| if( b_execute_block ){ |
| b_flag = eliminateModel( as.scalar(m_err_mean[i,j]),as.scalar(m_err_vars[i,j]),s_rows_number); |
| if( b_flag == TRUE ){ |
| m_active_flag_tmp[1,i] = 1; |
| } |
| } |
| } |
| } |
| |
| #m_tmp = ppred( colSums( m_active_flag_tmp ),1,">=" ); |
| m_active_flag = ppred( m_active_flag + m_active_flag_tmp,1,">=" ); |
| i_active_model_number = -sum( m_active_flag - 1 ); |
| |
| # 5.break while check |
| if( i_active_model_number <= 1 ){ |
| b_continue_loop = FALSE; |
| } |
| i_iter = i_iter + 1; |
| print("i_iter" + i_iter) |
| } |
| |
| k = 0; |
| if( i_active_model_number == 0 ){ |
| print( "All k kick out, use min of range " + k_min ); |
| k = k_min; |
| }else if( i_active_model_number == 1 ){ |
| k = k_min + as.integer( as.scalar( rowIndexMin( m_active_flag ) ) ) - 1; |
| print( "Get k, which value is " + k ); |
| }else{ |
| m_err_for_order = |
| append( t( m_iter_err_sum ), matrix( seq( k_min,k_max,1 ),k_max-k_min+1,1 ) ); |
| m_err_for_order = removeEmpty( |
| target=m_err_for_order * t( ppred( m_active_flag,0,"==" ) ),margin="rows" ); |
| for( i in 1:nrow( m_err_for_order ) ){ |
| print( "k:" + as.scalar( m_err_for_order[i,2] ) + |
| ",err:" + as.scalar( m_err_for_order[i,1] ) ); |
| } |
| m_err_order = order( target=m_err_for_order,by=1,decreasing=FALSE,index.return=FALSE ); |
| k = as.integer( as.scalar( m_err_order[1,2] ) ); |
| print( "Get minimum LOOCV error, which value is " + k ); |
| } |
| } |
| |
| getErr = function ( matrix[double] in_m_neighbor_value, |
| matrix[double] in_m_cl, |
| integer in_i_cl_type ) |
| return ( matrix[double] out_m_err ){ |
| i_col = ncol( in_m_neighbor_value ); |
| i_row = nrow( in_m_neighbor_value ); |
| |
| if( in_i_cl_type == 2 ){ |
| #category |
| m_correct = ppred( in_m_neighbor_value,in_m_cl[1:i_row,],"!=" ); |
| }else{ |
| #continues |
| m_correct = (in_m_neighbor_value - in_m_cl[1:i_row,])^2;#ppred( in_m_neighbor_value,in_m_cl,"-" ); |
| } |
| out_m_err = ( rowSums( m_correct[,1:i_col] )/( i_col ) ); |
| } |
| |
| # getSelectedFeature: |
| # feature selection algorithm. |
| # Refer to ADD part "7.1 FS+BRACE" |
| # Argument: |
| # in_m_data input matrix as features |
| # in_m_data_target input matrix as target value |
| # in_i_is_categorical 1 = category , 0 = continuous |
| # in_m_init_selected S.user, initial selected feature which use specified |
| # in_i_max_select J, max feature selected |
| # k_value k |
| # interval block size for BRACE algorithm |
| # |
| # Reture: |
| # out_m_selected_feature output matrix for feature selection |
| # out_d_min_LOOCV output err |
| |
| getSelectedFeature = function ( |
| matrix[double] in_m_data, |
| matrix[double] in_m_data_target, |
| integer in_i_is_categorical, # 1 = category , 0 = continuous |
| matrix[double] in_m_init_selected, |
| integer in_i_max_select, |
| integer k_value, |
| integer interval ) |
| return( |
| matrix[double] out_m_selected_feature, |
| double out_d_min_LOOCV |
| ) |
| { |
| i_n_record = nrow( in_m_data ); |
| i_n_column = ncol( in_m_data ); |
| m_main_selected_flag = in_m_init_selected; |
| b_no_feature_selected = TRUE; |
| if( sum( in_m_init_selected ) >= 1 ){ |
| b_no_feature_selected = FALSE; |
| } |
| d_max_err_value = ( max( in_m_data_target ) - min( in_m_data_target ) ) * 100; |
| b_continue_main_loop = TRUE; #level 1 while loop flag |
| d_min_LOOCV = 1.0/0.0; |
| while( b_continue_main_loop ){ |
| m_feature_selected_flag = m_main_selected_flag; |
| m_this_model_selected_flag = TRUE; |
| i_index_min_LOOCV = -1; # flag for which model win in BRACE algorith |
| b_selected_morethan_one = FALSE; |
| b_continue_loop = TRUE; #level 2 while loop flag |
| i_iter = 1; |
| m_iter_err_sum = matrix( 0,1,i_n_column+1 ); |
| m_iter_err_sum_squared = matrix( 0,1,i_n_column+1 ); |
| while( b_continue_loop ){ |
| #print( "*************" + i_iter + "*************" ); |
| i_process_item = i_iter*interval; |
| if( i_process_item >= i_n_record ){ |
| i_process_item = i_n_record; |
| b_continue_loop = FALSE; |
| } |
| i_process_begin_item = (i_iter - 1)*interval + 1 |
| #i_process_begin_item = 1 |
| i_process_end_item = i_process_item |
| m_err = matrix( d_max_err_value,i_process_end_item - i_process_begin_item + 1,i_n_column+1 ); |
| if( b_no_feature_selected == TRUE ){ |
| parfor( i in 1:i_n_column ,check=0){ |
| if( as.scalar( m_feature_selected_flag[1,i] ) != 1 ){ |
| m_tmp_process_data = in_m_data[,i]; |
| m_neighbor_value = getKNeighbor( |
| m_tmp_process_data,m_tmp_process_data[i_process_begin_item:i_process_end_item,], |
| in_m_data_target,k_value ); |
| m_tmp_err = getErr( |
| m_neighbor_value,in_m_data_target[i_process_begin_item:i_process_end_item,],in_i_is_categorical ); |
| m_err[,i] = m_tmp_err; |
| } |
| } |
| }else{ |
| #Use m_main_selected_flag but not m_feature_selected_flag, |
| # m_main_selected_flag: which feature are init selected |
| # m_feature_selected_flag: which feature are dropped & init selected |
| m_tmp_data = removeEmpty( target=in_m_data ,margin="cols", select = m_main_selected_flag); |
| if( m_this_model_selected_flag == TRUE ){ |
| m_neighbor_value = getKNeighbor( |
| m_tmp_data,m_tmp_data[i_process_begin_item:i_process_end_item,],in_m_data_target, k_value ); |
| |
| m_tmp_err = getErr( m_neighbor_value,in_m_data_target[i_process_begin_item:i_process_end_item,],in_i_is_categorical ); |
| m_err[,i_n_column+1] = m_tmp_err; |
| } |
| parfor( i in 1:i_n_column ,check=0){ |
| if( as.scalar( m_feature_selected_flag[1,i] ) != 1 ){ |
| m_tmp_process_data = append( m_tmp_data,in_m_data[,i] ); |
| m_neighbor_value = getKNeighbor( |
| m_tmp_process_data,m_tmp_process_data[i_process_begin_item:i_process_end_item,],in_m_data_target,k_value ); |
| m_tmp_err = getErr( |
| m_neighbor_value,in_m_data_target[i_process_begin_item:i_process_end_item,],in_i_is_categorical ); |
| m_err[,i] = m_tmp_err; |
| } |
| } |
| } |
| if( m_this_model_selected_flag == TRUE ){ |
| m_active_flag_tmp = append( m_feature_selected_flag,matrix( 0,1,1 ) ); |
| }else{ |
| m_active_flag_tmp = append( m_feature_selected_flag,matrix( 1,1,1 ) ); |
| } |
| |
| s_rows_number = i_process_item#nrow(m_err); |
| |
| m_iter_err_sum = colSums(m_err) + m_iter_err_sum |
| m_iter_err_sum_squared = colSums(m_err ^ 2) + m_iter_err_sum_squared |
| |
| m_err_mean = - outer(t(m_iter_err_sum), m_iter_err_sum , "-") / s_rows_number |
| m_err_vars = ( m_err_mean ^2 * s_rows_number - |
| 2 * m_err_mean * m_iter_err_sum + m_iter_err_sum_squared) / (s_rows_number-1) |
| m_err_vars = sqrt(m_err_vars) |
| |
| parfor( i in 1:ncol( m_err ) ){ |
| #m_err_i_mean = (m_iter_err_sum - m_iter_err_sum[,i])/s_rows_number |
| #m_err_i_vars = (m_iter_err_sum_squared + m_err_i_mean ^2 * s_rows_number - |
| # 2 * m_iter_err_sum * m_err_i_mean) / (s_rows_number-1) |
| #m_err_i_vars = sqrt(m_err_i_vars) |
| #m_err_i_mean = m_err_mean[i,] |
| #m_err_i_vars = m_err_vars[i,] |
| parfor( j in 1:ncol( m_err ) ,check=0){ |
| b_execute_block = TRUE; |
| if( j==i ) b_execute_block = FALSE; |
| if( as.scalar( m_active_flag_tmp[1,i] ) == 1 ) b_execute_block = FALSE; |
| #i has dropped, ignore this case |
| if( as.scalar( m_active_flag_tmp[1,j] ) == 1 ) b_execute_block = FALSE; |
| #j has dropped, ignore this case |
| if( b_execute_block ){ |
| #print( "i=" + i + ",j=" + j ); |
| b_flag = eliminateModel( as.scalar(m_err_mean[i,j]),as.scalar(m_err_vars[i,j]),s_rows_number); |
| if( b_flag == TRUE ){ |
| m_active_flag_tmp[1,i] = 1; |
| } |
| } |
| } |
| } |
| #We mark bit to 1 for selected feature before current loop, |
| #and mark bit to 1 also for dropped feature in current loop |
| |
| if( sum( ppred( m_active_flag_tmp,1,"!=" ) ) > 1 ){ |
| b_selected_morethan_one = TRUE; |
| } |
| m_col_sums_err = m_iter_err_sum #colSums( m_err ); |
| i_index_min_LOOCV = as.scalar( rowIndexMin( m_col_sums_err ) ); |
| d_min_LOOCV = as.scalar( m_col_sums_err[1,i_index_min_LOOCV] ); |
| i_index_min_LOOCV = i_index_min_LOOCV%% ( i_n_column+1 ) |
| |
| if( sum( ppred( m_active_flag_tmp,1,"!=" ) ) <= 1 ){ |
| b_continue_loop = FALSE; |
| } |
| if( as.scalar( m_active_flag_tmp[1,i_n_column+1] ) == 1 ){ |
| m_this_model_selected_flag = FALSE; |
| } |
| m_feature_selected_flag = m_active_flag_tmp[,1:i_n_column]; |
| i_iter = i_iter + 1; |
| } |
| #select current model, jump out. |
| if( i_index_min_LOOCV == 0 ){ |
| b_continue_main_loop = FALSE; |
| print( "Select Current model" ); |
| }else{ |
| print( "select feature " + i_index_min_LOOCV + ", change bit value to 1" ); |
| m_main_selected_flag[1,i_index_min_LOOCV] = 1; |
| b_no_feature_selected = FALSE; |
| } |
| if( sum( m_main_selected_flag - in_m_init_selected ) >= in_i_max_select ){ |
| #select more than 10 |
| b_continue_main_loop = FALSE; |
| } |
| if( sum( m_main_selected_flag ) == i_n_column ){ |
| #all selected |
| b_continue_main_loop = FALSE; |
| } |
| } |
| out_m_selected_feature = m_main_selected_flag; |
| out_d_min_LOOCV = d_min_LOOCV; |
| } |
| |
| # getSelectedFeatureAndK: |
| # Combine k and feature selection algorithm. |
| # Refer to ADD part "8.Combined k and feature selection" |
| # Argument: |
| # in_m_data input matrix as features |
| # in_m_data_target input matrix as target value |
| # in_i_is_categorical 1 = category , 0 = continuous |
| # in_m_init_selected S.user, initial selected feature which use specified |
| # in_i_max_select J, max feature selected |
| # k_min minimun k |
| # k_max maximun k |
| # interval block size for BRACE algorithm |
| # |
| # Reture: |
| # out_m_selected_feature output matrix for feature selection |
| # out_i_selected_k output k value for k selection |
| getSelectedFeatureAndK = function ( |
| matrix[double] in_m_data, |
| matrix[double] in_m_data_target, |
| integer in_i_is_categorical, # 1 = category , 0 = continuous |
| matrix[double] in_m_init_selected, |
| integer in_i_max_select, |
| integer k_min, |
| integer k_max, |
| integer interval ) |
| return( |
| matrix[double] out_m_selected_feature, |
| integer out_i_selected_k |
| ) |
| { |
| m_err = matrix( 0,1,k_max-k_min+1 ); |
| m_feature = matrix( 0,k_max-k_min+1,ncol( in_m_data ) ); |
| |
| #Step 1. For each k in [k_min,k_max] ( k_min has default value 1, k_max has default value 100 ) |
| #in parallel select relevant features using FS+BRACE or schemata search described in Section 7. |
| parfor( i in k_min:k_max,check=0 ){ |
| #print( " ************** " + i + " ****************" ); |
| [m_selected_feature,d_err] = |
| getSelectedFeature( in_m_data,in_m_data_target,in_i_is_categorical |
| ,in_m_init_selected,in_i_max_select,i,interval ); |
| |
| m_err[1,i] = d_err; |
| m_feature[i,] = m_selected_feature; |
| #print( "err:" + d_err ); |
| } |
| |
| #Step 2. Output the combination of features and k with the smallest LOOCV error. |
| i_min_err_index = as.integer( as.scalar( rowIndexMin( m_err ) ) ); |
| out_m_selected_feature = m_feature[i_min_err_index,]; |
| out_i_selected_k = i_min_err_index + k_min - 1; |
| #print( "min err k is " + out_i_selected_k ); |
| } |
| |
| getFeatureImportance = function ( |
| matrix[double] in_m_data, |
| matrix[double] in_m_data_target, |
| integer in_i_is_categorical, # 1 = category , 0 = continuous |
| integer k_value) |
| return( |
| matrix[double] out_m_feature_importance, |
| matrix[double] out_m_norm_feature_importance |
| ) |
| { |
| n_feature = ncol(in_m_data) |
| n_record = nrow(in_m_data) |
| if(n_feature <= 1){ |
| stop("can't estimate feature importance when ncol = 1") |
| } |
| m_err = matrix( 0,n_record,n_feature); |
| parfor(i_feature in 1:n_feature, check=0){ |
| m_feature_select = matrix(1,1,n_feature) |
| m_feature_select[1,i_feature] = 0; |
| m_in_tmp_data = removeEmpty(target=in_m_data,margin="cols", select= m_feature_select) |
| m_neighbor_value = getKNeighbor( m_in_tmp_data,m_in_tmp_data,in_m_data_target,k_value); |
| m_tmp_err = getErr( m_neighbor_value,in_m_data_target ,in_i_is_categorical ); |
| m_err[,i_feature] = m_tmp_err |
| } |
| out_m_feature_importance = colSums( m_err ); |
| out_m_norm_feature_importance = |
| out_m_feature_importance / as.scalar(rowSums(out_m_feature_importance)) |
| } |