| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # STRATIFIED BIVARIATE STATISTICS, VERSION 4 |
| # |
| # INPUT PARAMETERS: |
| # ----------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # ----------------------------------------------------------------------------- |
| # X String --- Location to read matrix X that has all 1-st covariates |
| # Y String " " Location to read matrix Y that has all 2-nd covariates |
| # the default value " " means "use X in place of Y" |
| # S String " " Location to read matrix S that has the stratum column |
| # the default value " " means "use X in place of S" |
| # Xcid String " " Location to read the 1-st covariate X-column indices |
| # the default value " " means "use columns 1 : ncol(X)" |
| # Ycid String " " Location to read the 2-nd covariate Y-column indices |
| # the default value " " means "use columns 1 : ncol(Y)" |
| # Scid Int 1 Column index of the stratum column in S |
| # O String --- Location to store the output matrix (see below) |
| # fmt String "text" Matrix output format, usually "text" or "csv" |
| # ----------------------------------------------------------------------------- |
| # Note: the stratum column must contain small positive integers; all fractional |
| # values are rounded; strata with ID <= 0 or NaN are treated as missing. |
| # |
| # OUTPUT MATRIX: |
| # One row per each distinct pair (1st covariate, 2nd covariate) |
| # 40 columns containing the following information: |
| # Col 01: 1st covariate X-column number |
| # Col 02: 1st covariate global presence count |
| # Col 03: 1st covariate global mean |
| # Col 04: 1st covariate global standard deviation |
| # Col 05: 1st covariate stratified standard deviation |
| # Col 06: R-squared, 1st covariate vs. strata |
| # Col 07: adjusted R-squared, 1st covariate vs. strata |
| # Col 08: P-value, 1st covariate vs. strata |
| # Col 09-10: Reserved |
| # Col 11: 2nd covariate Y-column number |
| # Col 12: 2nd covariate global presence count |
| # Col 13: 2nd covariate global mean |
| # Col 14: 2nd covariate global standard deviation |
| # Col 15: 2nd covariate stratified standard deviation |
| # Col 16: R-squared, 2nd covariate vs. strata |
| # Col 17: adjusted R-squared, 2nd covariate vs. strata |
| # Col 18: P-value, 2nd covariate vs. strata |
| # Col 19-20: Reserved |
| # Col 21: Global 1st & 2nd covariate presence count |
| # Col 22: Global regression slope (2nd vs. 1st covariate) |
| # Col 23: Global regression slope standard deviation |
| # Col 24: Global correlation = +/- sqrt(R-squared) |
| # Col 25: Global residual standard deviation |
| # Col 26: Global R-squared |
| # Col 27: Global adjusted R-squared |
| # Col 28: Global P-value for hypothesis "slope = 0" |
| # Col 29-30: Reserved |
| # Col 31: Stratified 1st & 2nd covariate presence count |
| # Col 32: Stratified regression slope (2nd vs. 1st covariate) |
| # Col 33: Stratified regression slope standard deviation |
| # Col 34: Stratified correlation = +/- sqrt(R-squared) |
| # Col 35: Stratified residual standard deviation |
| # Col 36: Stratified R-squared |
| # Col 37: Stratified adjusted R-squared |
| # Col 38: Stratified P-value for hypothesis "slope = 0" |
| # Col 39: Number of strata with at least two counted points |
| # Col 40: Reserved |
| # |
| # EXAMPLES: |
| # |
| # hadoop jar SystemDS.jar -f stratstats.dml -nvargs X=INPUT_DIR/X.mtx Xcid=INPUT_DIR/Xcid.mtx |
| # Y=INPUT_DIR/Y.mtx Ycid=INPUT_DIR/Ycid.mtx S=INPUT_DIR/S.mtx Scid=1 O=OUTPUT_DIR/Out.mtx fmt=csv |
| # |
| # hadoop jar SystemDS.jar -f stratstats.dml -nvargs X=INPUT_DIR/Data.mtx Xcid=INPUT_DIR/Xcid.mtx |
| # Ycid=INPUT_DIR/Ycid.mtx Scid=1 O=OUTPUT_DIR/Out.mtx |
| |
| fileX = $X; |
| fileY = ifdef ($Y, " "); |
| fileS = ifdef ($S, " "); |
| fileO = $O; |
| fmtO = ifdef ($fmt, "text"); |
| |
| fileXcid = ifdef ($Xcid, " "); |
| fileYcid = ifdef ($Ycid, " "); |
| stratum_column_id = ifdef ($Scid, 1); |
| |
| print ("BEGIN STRATIFIED STATISTICS SCRIPT"); |
| |
| print ("Reading the input matrices..."); |
| |
| XwithNaNs = read (fileX); |
| if (fileY != " ") { |
| YwithNaNs = read (fileY); |
| } else { |
| YwithNaNs = XwithNaNs; |
| } |
| if (fileS != " ") { |
| SwithNaNsFull = read (fileS); |
| SwithNaNs = SwithNaNsFull [, stratum_column_id]; |
| } else { |
| SwithNaNs = XwithNaNs [, stratum_column_id]; |
| } |
| if (fileXcid != " ") { |
| Xcols = read (fileXcid); |
| } else { |
| Xcols = t(seq (1, ncol (XwithNaNs), 1)); |
| } |
| if (fileYcid != " ") { |
| Ycols = read (fileYcid); |
| } else { |
| Ycols = t(seq (1, ncol (YwithNaNs), 1)); |
| } |
| tXcols = t(Xcols); |
| tYcols = t(Ycols); |
| |
| num_records = nrow (XwithNaNs); |
| num_attrs = ncol (XwithNaNs); |
| num_attrs_X = ncol (Xcols); |
| num_attrs_Y = ncol (Ycols); |
| num_attrs_XY = num_attrs_X * num_attrs_Y; |
| |
| print ("Preparing the covariates..."); |
| |
| XnoNaNs = replace (target = XwithNaNs, pattern = NaN, replacement = 0); |
| YnoNaNs = replace (target = YwithNaNs, pattern = NaN, replacement = 0); |
| XNaNmask = (XwithNaNs == XwithNaNs); |
| YNaNmask = (YwithNaNs == YwithNaNs); |
| one_to_num_attrs_X = seq (1, num_attrs_X, 1); |
| one_to_num_attrs_Y = seq (1, num_attrs_Y, 1); |
| ProjX = matrix (0, rows = num_attrs, cols = num_attrs_X); |
| ProjY = matrix (0, rows = num_attrs, cols = num_attrs_Y); |
| |
| ProjX_ctable = table (tXcols, one_to_num_attrs_X); |
| ProjX [1 : nrow (ProjX_ctable), ] = ProjX_ctable; |
| |
| ProjY_ctable = table (tYcols, one_to_num_attrs_Y); |
| ProjY [1 : nrow (ProjY_ctable), ] = ProjY_ctable; |
| |
| X = XnoNaNs %*% ProjX; |
| Y = YnoNaNs %*% ProjY; |
| X_mask = XNaNmask %*% ProjX; |
| Y_mask = YNaNmask %*% ProjY; |
| |
| print ("Preparing the strata..."); |
| |
| SnoNaNs = replace (target = SwithNaNs, pattern = NaN, replacement = 0); |
| S = round (SnoNaNs) * (SnoNaNs > 0); |
| Proj_good_stratumID = diag (S > 0); |
| Proj_good_stratumID = removeEmpty (target = Proj_good_stratumID, margin = "rows"); |
| vector_of_good_stratumIDs = Proj_good_stratumID %*% S; |
| vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min (vector_of_good_stratumIDs)); |
| num_records_with_good_stratumID = nrow (Proj_good_stratumID); |
| one_to_num_records_with_good_stratumID = seq (1, num_records_with_good_stratumID, 1); |
| |
| # Create a group-by summation matrix for records over stratum IDs |
| # "with_empty" means with stratum IDs that never occur in records |
| |
| num_strata_with_empty = max (vector_of_good_stratumIDs); |
| StrataSummator_with_empty = table (vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID); |
| StrataSummator = removeEmpty (target = StrataSummator_with_empty, margin = "rows"); |
| StrataSummator = StrataSummator %*% Proj_good_stratumID; |
| num_strata = nrow (StrataSummator); |
| num_empty_strata = num_strata_with_empty - num_strata; |
| print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but positive-ID strata."); |
| |
| print ("Computing the global single-variate statistics..."); |
| |
| cnt_X_global = colSums (X_mask); |
| cnt_Y_global = colSums (Y_mask); |
| avg_X_global = colSums (X) / cnt_X_global; |
| avg_Y_global = colSums (Y) / cnt_Y_global; |
| var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global); |
| var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global); |
| sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1); |
| stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1); |
| sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1); |
| stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2); |
| |
| print ("Computing the stratified single-variate statistics..."); |
| |
| # Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata |
| |
| Cnt_X_per_stratum = StrataSummator %*% X_mask; |
| Cnt_Y_per_stratum = StrataSummator %*% Y_mask; |
| Is_none_X_per_stratum = (Cnt_X_per_stratum == 0); |
| Is_none_Y_per_stratum = (Cnt_Y_per_stratum == 0); |
| One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum); |
| One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum); |
| num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum); |
| num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum); |
| |
| Sum_X_per_stratum = StrataSummator %*% X; |
| Sum_Y_per_stratum = StrataSummator %*% Y; |
| |
| # Recompute some global statistics to exclude bad stratum-ID records |
| |
| cnt_X_with_good_stratumID = colSums (Cnt_X_per_stratum); |
| cnt_Y_with_good_stratumID = colSums (Cnt_Y_per_stratum); |
| sum_X_with_good_stratumID = colSums (Sum_X_per_stratum); |
| sum_Y_with_good_stratumID = colSums (Sum_Y_per_stratum); |
| var_sumX_with_good_stratumID = colSums (StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID; |
| var_sumY_with_good_stratumID = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID; |
| |
| # Compute the stratified statistics |
| |
| var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum); |
| var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum); |
| sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata); |
| stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3); |
| sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata); |
| stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4); |
| r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_with_good_stratumID; |
| r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_with_good_stratumID; |
| adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1)); |
| adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1)); |
| fStat_X_vs_strata = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)); |
| fStat_Y_vs_strata = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)); |
| p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata); |
| p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata); |
| |
| print ("Computing the global bivariate statistics..."); |
| |
| # Compute the aggregate X vs. Y statistics and map them into proper positions |
| |
| cnt_XY_rectangle = t(X_mask) %*% Y_mask; |
| sum_X_forXY_rectangle = t(X) %*% Y_mask; |
| sum_XX_forXY_rectangle = t(X * X) %*% Y_mask; |
| sum_Y_forXY_rectangle = t(X_mask) %*% Y; |
| sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y); |
| sum_XY_rectangle = t(X) %*% Y; |
| cnt_XY_global = matrix (cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); |
| sum_X_forXY_global = matrix (sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); |
| sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); |
| sum_Y_forXY_global = matrix (sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); |
| sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); |
| sum_XY_global = matrix (sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); |
| ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY); |
| |
| # Compute the global bivariate statistics for output |
| |
| cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global; |
| var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global; |
| var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global; |
| slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global; |
| sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global; |
| sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5); |
| corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5; |
| r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global); |
| adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2); |
| sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2) |
| stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6); |
| sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2) |
| stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7); |
| fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global); |
| p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2); |
| |
| print ("Computing the stratified bivariate statistics..."); |
| |
| # Create projections to "intermingle" X and Y into attribute pairs |
| |
| Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY); |
| Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY); |
| ones_Y_col = matrix (1.0, rows = num_attrs_Y, cols = 1); |
| for (i in 1:num_attrs_X) { |
| start_cid = (i - 1) * num_attrs_Y + 1; |
| end_cid = i * num_attrs_Y; |
| Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col); |
| Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_col); |
| } |
| |
| # Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata |
| |
| Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); |
| Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); |
| Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); |
| Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY)); |
| Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY)); |
| Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY)); |
| |
| Is_none_XY_per_stratum = (Cnt_XY_per_stratum == 0); |
| One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum); |
| num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum); |
| |
| # Recompute some global aggregate X vs. Y statistics to exclude bad stratum-ID records |
| |
| cnt_XY_with_good_stratumID = colSums (Cnt_XY_per_stratum); |
| sum_XX_forXY_with_good_stratumID = colSums (Sum_XX_forXY_per_stratum); |
| sum_YY_forXY_with_good_stratumID = colSums (Sum_YY_forXY_per_stratum); |
| sum_XY_with_good_stratumID = colSums (Sum_XY_per_stratum); |
| |
| # Compute the stratified bivariate statistics |
| |
| var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum); |
| var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum); |
| cov_sumX_sumY_stratified = sum_XY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum); |
| |
| slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified; |
| sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified; |
| sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8); |
| corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8; |
| r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified); |
| temp_X_vs_Y_stratified = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) |
| adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata); |
| sqrt_failsafe_input_9 = temp_X_vs_Y_stratified * var_sumY_forXY_stratified; |
| stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_9); |
| sqrt_failsafe_input_10 = sqrt_failsafe_input_9 / var_sumX_forXY_stratified; |
| stdev_slope_XY_stratified = sqrt_failsafe (sqrt_failsafe_input_10); |
| fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified); |
| p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1); |
| |
| print ("Preparing the output matrix..."); |
| OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY); |
| |
| OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st covariate column number |
| OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st covariate global presence count |
| OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st covariate global mean |
| OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st covariate global standard deviation |
| OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY; # 1st covariate stratified standard deviation |
| OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st covariate vs. strata |
| OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY; # adjusted R-squared, 1st covariate vs. strata |
| OutMtx [ 8, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st covariate vs. strata |
| OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd covariate column number |
| OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd covariate global presence count |
| OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd covariate global mean |
| OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd covariate global standard deviation |
| OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd covariate stratified standard deviation |
| OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd covariate vs. strata |
| OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # adjusted R-squared, 2nd covariate vs. strata |
| OutMtx [18, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd covariate vs. strata |
| |
| OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd covariate presence count |
| OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd vs. 1st covariate) |
| OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation |
| OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared) |
| OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation |
| OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared |
| OutMtx [27, ] = adj_r_sqr_X_vs_Y_global; # Global adjusted R-squared |
| OutMtx [28, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0" |
| OutMtx [31, ] = cnt_XY_with_good_stratumID; # Stratified 1st & 2nd covariate presence count |
| OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd vs. 1st covariate) |
| OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation |
| OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared) |
| OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation |
| OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared |
| OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified; # Stratified adjusted R-squared |
| OutMtx [38, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0" |
| OutMtx [39, ] = colSums (Cnt_XY_per_stratum >= 2); # Number of strata with at least two counted points |
| |
| OutMtx = t(OutMtx); |
| |
| print ("Writing the output matrix..."); |
| write (OutMtx, fileO, format=fmtO); |
| print ("END STRATIFIED STATISTICS SCRIPT"); |
| |
| |
| fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob) |
| { # TEMPORARY IMPLEMENTATION |
| tailprob = fStat; |
| for (i in 1:nrow(fStat)) { |
| for (j in 1:ncol(fStat)) { |
| q = as.scalar (fStat [i, j]); |
| d1 = as.scalar (df_1 [i, j]); |
| d2 = as.scalar (df_2 [i, j]); |
| if (d1 >= 1 & d2 >= 1 & q >= 0) { |
| tailprob [i, j] = pf(target = q, df1 = d1, df2 = d2, lower.tail=FALSE); |
| } else { |
| tailprob [i, j] = 0/0; |
| } |
| } } |
| } |
| |
| sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A) |
| { |
| mask_A = (input_A >= 0); |
| prep_A = input_A * mask_A; |
| mask_A = mask_A * (prep_A == prep_A); |
| prep_A = replace (target = prep_A, pattern = NaN, replacement = 0); |
| output_A = sqrt (prep_A) / mask_A; |
| } |