scripts/algorithms/stratstats.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # STRATIFIED BIVARIATE STATISTICS, VERSION 4
 #
 # INPUT PARAMETERS:
 # -----------------------------------------------------------------------------
 # NAME  TYPE   DEFAULT  MEANING
 # -----------------------------------------------------------------------------
 # X     String  ---     Location to read matrix X that has all 1-st covariates
 # Y     String  " "     Location to read matrix Y that has all 2-nd covariates
 #                       the default value " " means "use X in place of Y"
 # S     String  " "     Location to read matrix S that has the stratum column
 #                       the default value " " means "use X in place of S"
 # Xcid  String  " "     Location to read the 1-st covariate X-column indices
 #                       the default value " " means "use columns 1 : ncol(X)"
 # Ycid  String  " "     Location to read the 2-nd covariate Y-column indices
 #                       the default value " " means "use columns 1 : ncol(Y)"
 # Scid  Int      1      Column index of the stratum column in S
 # O     String  ---     Location to store the output matrix (see below)
 # fmt   String "text"   Matrix output format, usually "text" or "csv"
 # -----------------------------------------------------------------------------
 # Note: the stratum column must contain small positive integers; all fractional
 # values are rounded; strata with ID <= 0 or NaN are treated as missing.
 #
 # OUTPUT MATRIX:
 # One row per each distinct pair (1st covariate, 2nd covariate)
 # 40 columns containing the following information:
 #     Col 01: 1st covariate X-column number
 #     Col 02: 1st covariate global presence count
 #     Col 03: 1st covariate global mean
 #     Col 04: 1st covariate global standard deviation
 #     Col 05: 1st covariate stratified standard deviation
 #     Col 06: R-squared, 1st covariate vs. strata
 #     Col 07: adjusted R-squared, 1st covariate vs. strata
 #     Col 08: P-value, 1st covariate vs. strata
 #     Col 09-10: Reserved
 #     Col 11: 2nd covariate Y-column number
 #     Col 12: 2nd covariate global presence count
 #     Col 13: 2nd covariate global mean
 #     Col 14: 2nd covariate global standard deviation
 #     Col 15: 2nd covariate stratified standard deviation
 #     Col 16: R-squared, 2nd covariate vs. strata
 #     Col 17: adjusted R-squared, 2nd covariate vs. strata
 #     Col 18: P-value, 2nd covariate vs. strata
 #     Col 19-20: Reserved
 #     Col 21: Global 1st & 2nd covariate presence count
 #     Col 22: Global regression slope (2nd vs. 1st covariate)
 #     Col 23: Global regression slope standard deviation
 #     Col 24: Global correlation = +/- sqrt(R-squared)
 #     Col 25: Global residual standard deviation
 #     Col 26: Global R-squared
 #     Col 27: Global adjusted R-squared
 #     Col 28: Global P-value for hypothesis "slope = 0"
 #     Col 29-30: Reserved
 #     Col 31: Stratified 1st & 2nd covariate presence count
 #     Col 32: Stratified regression slope (2nd vs. 1st covariate)
 #     Col 33: Stratified regression slope standard deviation
 #     Col 34: Stratified correlation = +/- sqrt(R-squared)
 #     Col 35: Stratified residual standard deviation
 #     Col 36: Stratified R-squared
 #     Col 37: Stratified adjusted R-squared
 #     Col 38: Stratified P-value for hypothesis "slope = 0"
 #     Col 39: Number of strata with at least two counted points
 #     Col 40: Reserved
 #
 # EXAMPLES:
 #
 # hadoop jar SystemDS.jar -f stratstats.dml -nvargs X=INPUT_DIR/X.mtx Xcid=INPUT_DIR/Xcid.mtx
 #     Y=INPUT_DIR/Y.mtx Ycid=INPUT_DIR/Ycid.mtx S=INPUT_DIR/S.mtx Scid=1 O=OUTPUT_DIR/Out.mtx fmt=csv
 #
 # hadoop jar SystemDS.jar -f stratstats.dml -nvargs X=INPUT_DIR/Data.mtx Xcid=INPUT_DIR/Xcid.mtx
 #     Ycid=INPUT_DIR/Ycid.mtx Scid=1 O=OUTPUT_DIR/Out.mtx

 fileX = $X;
 fileY = ifdef ($Y, " ");
 fileS = ifdef ($S, " ");
 fileO = $O;
 fmtO  = ifdef ($fmt, "text");

 fileXcid = ifdef ($Xcid, " ");
 fileYcid = ifdef ($Ycid, " ");
 stratum_column_id = ifdef ($Scid, 1);

 print ("BEGIN STRATIFIED STATISTICS SCRIPT");

 print ("Reading the input matrices...");

 XwithNaNs = read (fileX);
 if (fileY != " ") {
     YwithNaNs = read (fileY);
 } else {
     YwithNaNs = XwithNaNs;
 }
 if (fileS != " ") {
     SwithNaNsFull = read (fileS);
     SwithNaNs = SwithNaNsFull [, stratum_column_id];
 } else {
     SwithNaNs = XwithNaNs [, stratum_column_id];
 }
 if (fileXcid != " ") {
     Xcols = read (fileXcid);
 } else {
     Xcols = t(seq (1, ncol (XwithNaNs), 1));
 }
 if (fileYcid != " ") {
     Ycols = read (fileYcid);
 } else {
     Ycols = t(seq (1, ncol (YwithNaNs), 1));
 }
 tXcols = t(Xcols);
 tYcols = t(Ycols);

 num_records  = nrow (XwithNaNs);
 num_attrs    = ncol (XwithNaNs);
 num_attrs_X  = ncol (Xcols);
 num_attrs_Y  = ncol (Ycols);
 num_attrs_XY = num_attrs_X * num_attrs_Y;

 print ("Preparing the covariates...");

 XnoNaNs = replace (target = XwithNaNs, pattern = NaN, replacement = 0);
 YnoNaNs = replace (target = YwithNaNs, pattern = NaN, replacement = 0);
 XNaNmask = (XwithNaNs == XwithNaNs);
 YNaNmask = (YwithNaNs == YwithNaNs);
 one_to_num_attrs_X = seq (1, num_attrs_X, 1);
 one_to_num_attrs_Y = seq (1, num_attrs_Y, 1);
 ProjX = matrix (0, rows = num_attrs, cols = num_attrs_X);
 ProjY = matrix (0, rows = num_attrs, cols = num_attrs_Y);

 ProjX_ctable = table (tXcols, one_to_num_attrs_X);
 ProjX [1 : nrow (ProjX_ctable), ] = ProjX_ctable;

 ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
 ProjY [1 : nrow (ProjY_ctable), ] = ProjY_ctable;

 X = XnoNaNs %*% ProjX;
 Y = YnoNaNs %*% ProjY;
 X_mask = XNaNmask %*% ProjX;
 Y_mask = YNaNmask %*% ProjY;

 print ("Preparing the strata...");

 SnoNaNs = replace (target = SwithNaNs, pattern = NaN, replacement = 0);
 S = round (SnoNaNs) * (SnoNaNs > 0);
 Proj_good_stratumID = diag (S > 0);
 Proj_good_stratumID = removeEmpty (target = Proj_good_stratumID, margin = "rows");
 vector_of_good_stratumIDs = Proj_good_stratumID %*% S;
 vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min (vector_of_good_stratumIDs));
 num_records_with_good_stratumID = nrow (Proj_good_stratumID);
 one_to_num_records_with_good_stratumID = seq (1, num_records_with_good_stratumID, 1);

 # Create a group-by summation matrix for records over stratum IDs
 # "with_empty" means with stratum IDs that never occur in records

 num_strata_with_empty = max (vector_of_good_stratumIDs);
 StrataSummator_with_empty = table (vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID);
 StrataSummator = removeEmpty (target = StrataSummator_with_empty, margin = "rows");
 StrataSummator = StrataSummator %*% Proj_good_stratumID;
 num_strata = nrow (StrataSummator);
 num_empty_strata = num_strata_with_empty - num_strata;
 print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but positive-ID strata.");

 print ("Computing the global single-variate statistics...");

 cnt_X_global = colSums (X_mask);
 cnt_Y_global = colSums (Y_mask);
 avg_X_global = colSums (X) / cnt_X_global;
 avg_Y_global = colSums (Y) / cnt_Y_global;
 var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
 var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
                  sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
 stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
                  sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1);
 stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);

 print ("Computing the stratified single-variate statistics...");

 # Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata

 Cnt_X_per_stratum = StrataSummator %*% X_mask;
 Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
 Is_none_X_per_stratum = (Cnt_X_per_stratum == 0);
 Is_none_Y_per_stratum = (Cnt_Y_per_stratum == 0);
 One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
 One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
 num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
 num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);

 Sum_X_per_stratum  = StrataSummator %*% X;
 Sum_Y_per_stratum  = StrataSummator %*% Y;

 # Recompute some global statistics to exclude bad stratum-ID records

 cnt_X_with_good_stratumID = colSums (Cnt_X_per_stratum);
 cnt_Y_with_good_stratumID = colSums (Cnt_Y_per_stratum);
 sum_X_with_good_stratumID = colSums (Sum_X_per_stratum);
 sum_Y_with_good_stratumID = colSums (Sum_Y_per_stratum);
 var_sumX_with_good_stratumID = colSums (StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID;
 var_sumY_with_good_stratumID = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID;

 # Compute the stratified statistics

 var_sumX_stratified   = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
 var_sumY_stratified   = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
                         sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata);
 stdev_X_stratified    = sqrt_failsafe (sqrt_failsafe_input_3);
                         sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
 stdev_Y_stratified    = sqrt_failsafe (sqrt_failsafe_input_4);
 r_sqr_X_vs_strata     = 1 - var_sumX_stratified / var_sumX_with_good_stratumID;
 r_sqr_Y_vs_strata     = 1 - var_sumY_stratified / var_sumY_with_good_stratumID;
 adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1));
 adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1));
 fStat_X_vs_strata     = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata));
 fStat_Y_vs_strata     = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata));
 p_val_X_vs_strata     = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata);
 p_val_Y_vs_strata     = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata);

 print ("Computing the global bivariate statistics...");

 # Compute the aggregate X vs. Y statistics and map them into proper positions

 cnt_XY_rectangle       = t(X_mask) %*% Y_mask;
 sum_X_forXY_rectangle  = t(X)      %*% Y_mask;
 sum_XX_forXY_rectangle = t(X * X)  %*% Y_mask;
 sum_Y_forXY_rectangle  = t(X_mask) %*% Y;
 sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
 sum_XY_rectangle       = t(X)      %*% Y;
 cnt_XY_global       = matrix (cnt_XY_rectangle,       rows = 1, cols = num_attrs_XY, byrow = TRUE);
 sum_X_forXY_global  = matrix (sum_X_forXY_rectangle,  rows = 1, cols = num_attrs_XY, byrow = TRUE);
 sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
 sum_Y_forXY_global  = matrix (sum_Y_forXY_rectangle,  rows = 1, cols = num_attrs_XY, byrow = TRUE);
 sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
 sum_XY_global       = matrix (sum_XY_rectangle,       rows = 1, cols = num_attrs_XY, byrow = TRUE);
 ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY);

 # Compute the global bivariate statistics for output

 cov_sumX_sumY_global    = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
 var_sumX_forXY_global   = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
 var_sumY_forXY_global   = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
 slope_XY_global         = cov_sumX_sumY_global / var_sumX_forXY_global;
                                                  sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
                                                  sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
 corr_XY_global          = cov_sumX_sumY_global / sqrt_failsafe_output_5;
 r_sqr_X_vs_Y_global     = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
 adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2);
                           sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
 stdev_slope_XY_global   = sqrt_failsafe (sqrt_failsafe_input_6);
                           sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
 stdev_errY_vs_X_global  = sqrt_failsafe (sqrt_failsafe_input_7);
 fStat_Y_vs_X_global     = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
 p_val_Y_vs_X_global     = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);

 print ("Computing the stratified bivariate statistics...");

 # Create projections to "intermingle" X and Y into attribute pairs

 Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
 Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
 ones_Y_col   = matrix (1.0, rows = num_attrs_Y, cols = 1);
 for (i in 1:num_attrs_X) {
     start_cid = (i - 1) * num_attrs_Y + 1;
     end_cid = i * num_attrs_Y;
     Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col);
     Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_col);
 }

 # Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata

 Cnt_XY_per_stratum       = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
 Sum_X_forXY_per_stratum  = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
 Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
 Sum_Y_forXY_per_stratum  = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY));
 Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
 Sum_XY_per_stratum       = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY));

 Is_none_XY_per_stratum = (Cnt_XY_per_stratum == 0);
 One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
 num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);

 # Recompute some global aggregate X vs. Y statistics to exclude bad stratum-ID records

 cnt_XY_with_good_stratumID = colSums (Cnt_XY_per_stratum);
 sum_XX_forXY_with_good_stratumID = colSums (Sum_XX_forXY_per_stratum);
 sum_YY_forXY_with_good_stratumID = colSums (Sum_YY_forXY_per_stratum);
 sum_XY_with_good_stratumID = colSums (Sum_XY_per_stratum);

 # Compute the stratified bivariate statistics

 var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
 var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
 cov_sumX_sumY_stratified  = sum_XY_with_good_stratumID       - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);

 slope_XY_stratified     = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
                                                      sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
                                                      sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8);
 corr_XY_stratified      = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
 r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified);
 temp_X_vs_Y_stratified  = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1)
 adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata);
                               sqrt_failsafe_input_9  = temp_X_vs_Y_stratified * var_sumY_forXY_stratified;
 stdev_errY_vs_X_stratified  = sqrt_failsafe (sqrt_failsafe_input_9);
                               sqrt_failsafe_input_10 = sqrt_failsafe_input_9  / var_sumX_forXY_stratified;
 stdev_slope_XY_stratified   = sqrt_failsafe (sqrt_failsafe_input_10);
 fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
 p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1);

 print ("Preparing the output matrix...");
 OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);

 OutMtx [ 1, ] = Xcols                 %*% Proj_X_to_XY;  # 1st covariate column number
 OutMtx [ 2, ] = cnt_X_global          %*% Proj_X_to_XY;  # 1st covariate global presence count
 OutMtx [ 3, ] = avg_X_global          %*% Proj_X_to_XY;  # 1st covariate global mean
 OutMtx [ 4, ] = stdev_X_global        %*% Proj_X_to_XY;  # 1st covariate global standard deviation
 OutMtx [ 5, ] = stdev_X_stratified    %*% Proj_X_to_XY;  # 1st covariate stratified standard deviation
 OutMtx [ 6, ] = r_sqr_X_vs_strata     %*% Proj_X_to_XY;  # R-squared, 1st covariate vs. strata
 OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY;  # adjusted R-squared, 1st covariate vs. strata
 OutMtx [ 8, ] = p_val_X_vs_strata     %*% Proj_X_to_XY;  # P-value, 1st covariate vs. strata
 OutMtx [11, ] = Ycols                 %*% Proj_Y_to_XY;  # 2nd covariate column number
 OutMtx [12, ] = cnt_Y_global          %*% Proj_Y_to_XY;  # 2nd covariate global presence count
 OutMtx [13, ] = avg_Y_global          %*% Proj_Y_to_XY;  # 2nd covariate global mean
 OutMtx [14, ] = stdev_Y_global        %*% Proj_Y_to_XY;  # 2nd covariate global standard deviation
 OutMtx [15, ] = stdev_Y_stratified    %*% Proj_Y_to_XY;  # 2nd covariate stratified standard deviation
 OutMtx [16, ] = r_sqr_Y_vs_strata     %*% Proj_Y_to_XY;  # R-squared, 2nd covariate vs. strata
 OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY;  # adjusted R-squared, 2nd covariate vs. strata
 OutMtx [18, ] = p_val_Y_vs_strata     %*% Proj_Y_to_XY;  # P-value, 2nd covariate vs. strata

 OutMtx [21, ] = cnt_XY_global;               # Global 1st & 2nd covariate presence count
 OutMtx [22, ] = slope_XY_global;             # Global regression slope (2nd vs. 1st covariate)
 OutMtx [23, ] = stdev_slope_XY_global;       # Global regression slope standard deviation
 OutMtx [24, ] = corr_XY_global;              # Global correlation = +/- sqrt(R-squared)
 OutMtx [25, ] = stdev_errY_vs_X_global;      # Global residual standard deviation
 OutMtx [26, ] = r_sqr_X_vs_Y_global;         # Global R-squared
 OutMtx [27, ] = adj_r_sqr_X_vs_Y_global;     # Global adjusted R-squared
 OutMtx [28, ] = p_val_Y_vs_X_global;         # Global P-value for hypothesis "slope = 0"
 OutMtx [31, ] = cnt_XY_with_good_stratumID;  # Stratified 1st & 2nd covariate presence count
 OutMtx [32, ] = slope_XY_stratified;         # Stratified regression slope (2nd vs. 1st covariate)
 OutMtx [33, ] = stdev_slope_XY_stratified;   # Stratified regression slope standard deviation
 OutMtx [34, ] = corr_XY_stratified;          # Stratified correlation = +/- sqrt(R-squared)
 OutMtx [35, ] = stdev_errY_vs_X_stratified;  # Stratified residual standard deviation
 OutMtx [36, ] = r_sqr_X_vs_Y_stratified;     # Stratified R-squared
 OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified; # Stratified adjusted R-squared
 OutMtx [38, ] = p_val_Y_vs_X_stratified;     # Stratified P-value for hypothesis "slope = 0"
 OutMtx [39, ] = colSums (Cnt_XY_per_stratum >= 2);  # Number of strata with at least two counted points

 OutMtx = t(OutMtx);

 print ("Writing the output matrix...");
 write (OutMtx, fileO, format=fmtO);
 print ("END STRATIFIED STATISTICS SCRIPT");


 fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
 { # TEMPORARY IMPLEMENTATION
     tailprob = fStat;
     for (i in 1:nrow(fStat)) {
       for (j in 1:ncol(fStat)) {
         q = as.scalar (fStat [i, j]);
         d1 = as.scalar (df_1 [i, j]);
         d2 = as.scalar (df_2 [i, j]);
         if (d1 >= 1 & d2 >= 1 & q >= 0) {
             tailprob  [i, j] = pf(target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
         } else {
             tailprob  [i, j] = 0/0;
         }
     } }
 }

 sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
 {
     mask_A = (input_A >= 0);
     prep_A = input_A * mask_A;
     mask_A = mask_A * (prep_A == prep_A);
     prep_A = replace (target = prep_A, pattern = NaN, replacement = 0);
     output_A = sqrt (prep_A) / mask_A;
 }