blob: 3745fabf62fdc9c26da249e031d23f3807ec18bf [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# STRATIFIED BIVARIATE STATISTICS, VERSION 4
#
# INPUT PARAMETERS:
# -----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# -----------------------------------------------------------------------------
# X String --- Location to read matrix X that has all 1-st covariates
# Y String " " Location to read matrix Y that has all 2-nd covariates
# the default value " " means "use X in place of Y"
# S String " " Location to read matrix S that has the stratum column
# the default value " " means "use X in place of S"
# Xcid String " " Location to read the 1-st covariate X-column indices
# the default value " " means "use columns 1 : ncol(X)"
# Ycid String " " Location to read the 2-nd covariate Y-column indices
# the default value " " means "use columns 1 : ncol(Y)"
# Scid Int 1 Column index of the stratum column in S
# O String --- Location to store the output matrix (see below)
# fmt String "text" Matrix output format, usually "text" or "csv"
# -----------------------------------------------------------------------------
# Note: the stratum column must contain small positive integers; all fractional
# values are rounded; strata with ID <= 0 or NaN are treated as missing.
#
# OUTPUT MATRIX:
# One row per each distinct pair (1st covariate, 2nd covariate)
# 40 columns containing the following information:
# Col 01: 1st covariate X-column number
# Col 02: 1st covariate global presence count
# Col 03: 1st covariate global mean
# Col 04: 1st covariate global standard deviation
# Col 05: 1st covariate stratified standard deviation
# Col 06: R-squared, 1st covariate vs. strata
# Col 07: adjusted R-squared, 1st covariate vs. strata
# Col 08: P-value, 1st covariate vs. strata
# Col 09-10: Reserved
# Col 11: 2nd covariate Y-column number
# Col 12: 2nd covariate global presence count
# Col 13: 2nd covariate global mean
# Col 14: 2nd covariate global standard deviation
# Col 15: 2nd covariate stratified standard deviation
# Col 16: R-squared, 2nd covariate vs. strata
# Col 17: adjusted R-squared, 2nd covariate vs. strata
# Col 18: P-value, 2nd covariate vs. strata
# Col 19-20: Reserved
# Col 21: Global 1st & 2nd covariate presence count
# Col 22: Global regression slope (2nd vs. 1st covariate)
# Col 23: Global regression slope standard deviation
# Col 24: Global correlation = +/- sqrt(R-squared)
# Col 25: Global residual standard deviation
# Col 26: Global R-squared
# Col 27: Global adjusted R-squared
# Col 28: Global P-value for hypothesis "slope = 0"
# Col 29-30: Reserved
# Col 31: Stratified 1st & 2nd covariate presence count
# Col 32: Stratified regression slope (2nd vs. 1st covariate)
# Col 33: Stratified regression slope standard deviation
# Col 34: Stratified correlation = +/- sqrt(R-squared)
# Col 35: Stratified residual standard deviation
# Col 36: Stratified R-squared
# Col 37: Stratified adjusted R-squared
# Col 38: Stratified P-value for hypothesis "slope = 0"
# Col 39: Number of strata with at least two counted points
# Col 40: Reserved
#
# EXAMPLES:
#
# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/X.mtx Xcid=INPUT_DIR/Xcid.mtx
# Y=INPUT_DIR/Y.mtx Ycid=INPUT_DIR/Ycid.mtx S=INPUT_DIR/S.mtx Scid=1 O=OUTPUT_DIR/Out.mtx fmt=csv
#
# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/Data.mtx Xcid=INPUT_DIR/Xcid.mtx
# Ycid=INPUT_DIR/Ycid.mtx Scid=1 O=OUTPUT_DIR/Out.mtx
fileX = $X;
fileY = ifdef ($Y, " ");
fileS = ifdef ($S, " ");
fileO = $O;
fmtO = ifdef ($fmt, "text");
fileXcid = ifdef ($Xcid, " ");
fileYcid = ifdef ($Ycid, " ");
stratum_column_id = ifdef ($Scid, 1);
print ("BEGIN STRATIFIED STATISTICS SCRIPT");
print ("Reading the input matrices...");
XwithNaNs = read (fileX);
if (fileY != " ") {
YwithNaNs = read (fileY);
} else {
YwithNaNs = XwithNaNs;
}
if (fileS != " ") {
SwithNaNsFull = read (fileS);
SwithNaNs = SwithNaNsFull [, stratum_column_id];
} else {
SwithNaNs = XwithNaNs [, stratum_column_id];
}
if (fileXcid != " ") {
Xcols = read (fileXcid);
} else {
Xcols = t(seq (1, ncol (XwithNaNs), 1));
}
if (fileYcid != " ") {
Ycols = read (fileYcid);
} else {
Ycols = t(seq (1, ncol (YwithNaNs), 1));
}
tXcols = t(Xcols);
tYcols = t(Ycols);
num_records = nrow (XwithNaNs);
num_attrs = ncol (XwithNaNs);
num_attrs_X = ncol (Xcols);
num_attrs_Y = ncol (Ycols);
num_attrs_XY = num_attrs_X * num_attrs_Y;
print ("Preparing the covariates...");
XnoNaNs = replace (target = XwithNaNs, pattern = 0.0/0.0, replacement = 0);
YnoNaNs = replace (target = YwithNaNs, pattern = 0.0/0.0, replacement = 0);
XNaNmask = (XwithNaNs == XwithNaNs);
YNaNmask = (YwithNaNs == YwithNaNs);
one_to_num_attrs_X = seq (1, num_attrs_X, 1);
one_to_num_attrs_Y = seq (1, num_attrs_Y, 1);
ProjX = matrix (0, rows = num_attrs, cols = num_attrs_X);
ProjY = matrix (0, rows = num_attrs, cols = num_attrs_Y);
ProjX_ctable = table (tXcols, one_to_num_attrs_X);
ProjX [1 : nrow (ProjX_ctable), ] = ProjX_ctable;
ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
ProjY [1 : nrow (ProjY_ctable), ] = ProjY_ctable;
X = XnoNaNs %*% ProjX;
Y = YnoNaNs %*% ProjY;
X_mask = XNaNmask %*% ProjX;
Y_mask = YNaNmask %*% ProjY;
print ("Preparing the strata...");
SnoNaNs = replace (target = SwithNaNs, pattern = 0.0/0.0, replacement = 0);
S = round (SnoNaNs) * (SnoNaNs > 0);
Proj_good_stratumID = diag (S > 0);
Proj_good_stratumID = removeEmpty (target = Proj_good_stratumID, margin = "rows");
vector_of_good_stratumIDs = Proj_good_stratumID %*% S;
vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min (vector_of_good_stratumIDs));
num_records_with_good_stratumID = nrow (Proj_good_stratumID);
one_to_num_records_with_good_stratumID = seq (1, num_records_with_good_stratumID, 1);
# Create a group-by summation matrix for records over stratum IDs
# "with_empty" means with stratum IDs that never occur in records
num_strata_with_empty = max (vector_of_good_stratumIDs);
StrataSummator_with_empty = table (vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID);
StrataSummator = removeEmpty (target = StrataSummator_with_empty, margin = "rows");
StrataSummator = StrataSummator %*% Proj_good_stratumID;
num_strata = nrow (StrataSummator);
num_empty_strata = num_strata_with_empty - num_strata;
print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but positive-ID strata.");
print ("Computing the global single-variate statistics...");
cnt_X_global = colSums (X_mask);
cnt_Y_global = colSums (Y_mask);
avg_X_global = colSums (X) / cnt_X_global;
avg_Y_global = colSums (Y) / cnt_Y_global;
var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1);
stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);
print ("Computing the stratified single-variate statistics...");
# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata
Cnt_X_per_stratum = StrataSummator %*% X_mask;
Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
Is_none_X_per_stratum = (Cnt_X_per_stratum == 0);
Is_none_Y_per_stratum = (Cnt_Y_per_stratum == 0);
One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);
Sum_X_per_stratum = StrataSummator %*% X;
Sum_Y_per_stratum = StrataSummator %*% Y;
# Recompute some global statistics to exclude bad stratum-ID records
cnt_X_with_good_stratumID = colSums (Cnt_X_per_stratum);
cnt_Y_with_good_stratumID = colSums (Cnt_Y_per_stratum);
sum_X_with_good_stratumID = colSums (Sum_X_per_stratum);
sum_Y_with_good_stratumID = colSums (Sum_Y_per_stratum);
var_sumX_with_good_stratumID = colSums (StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID;
var_sumY_with_good_stratumID = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID;
# Compute the stratified statistics
var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata);
stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3);
sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4);
r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_with_good_stratumID;
r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_with_good_stratumID;
adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1));
adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1));
fStat_X_vs_strata = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata));
fStat_Y_vs_strata = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata));
p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata);
p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
print ("Computing the global bivariate statistics...");
# Compute the aggregate X vs. Y statistics and map them into proper positions
cnt_XY_rectangle = t(X_mask) %*% Y_mask;
sum_X_forXY_rectangle = t(X) %*% Y_mask;
sum_XX_forXY_rectangle = t(X * X) %*% Y_mask;
sum_Y_forXY_rectangle = t(X_mask) %*% Y;
sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
sum_XY_rectangle = t(X) %*% Y;
cnt_XY_global = matrix (cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_X_forXY_global = matrix (sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_Y_forXY_global = matrix (sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_XY_global = matrix (sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY);
# Compute the global bivariate statistics for output
cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global;
sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5;
r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2);
sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6);
sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7);
fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);
print ("Computing the stratified bivariate statistics...");
# Create projections to "intermingle" X and Y into attribute pairs
Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
ones_Y_col = matrix (1.0, rows = num_attrs_Y, cols = 1);
for (i in 1:num_attrs_X) {
start_cid = (i - 1) * num_attrs_Y + 1;
end_cid = i * num_attrs_Y;
Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col);
Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_col);
}
# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata
Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
Is_none_XY_per_stratum = (Cnt_XY_per_stratum == 0);
One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);
# Recompute some global aggregate X vs. Y statistics to exclude bad stratum-ID records
cnt_XY_with_good_stratumID = colSums (Cnt_XY_per_stratum);
sum_XX_forXY_with_good_stratumID = colSums (Sum_XX_forXY_per_stratum);
sum_YY_forXY_with_good_stratumID = colSums (Sum_YY_forXY_per_stratum);
sum_XY_with_good_stratumID = colSums (Sum_XY_per_stratum);
# Compute the stratified bivariate statistics
var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
cov_sumX_sumY_stratified = sum_XY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8);
corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified);
temp_X_vs_Y_stratified = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1)
adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata);
sqrt_failsafe_input_9 = temp_X_vs_Y_stratified * var_sumY_forXY_stratified;
stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_9);
sqrt_failsafe_input_10 = sqrt_failsafe_input_9 / var_sumX_forXY_stratified;
stdev_slope_XY_stratified = sqrt_failsafe (sqrt_failsafe_input_10);
fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1);
print ("Preparing the output matrix...");
OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);
OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st covariate column number
OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st covariate global presence count
OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st covariate global mean
OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st covariate global standard deviation
OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY; # 1st covariate stratified standard deviation
OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st covariate vs. strata
OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY; # adjusted R-squared, 1st covariate vs. strata
OutMtx [ 8, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st covariate vs. strata
OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd covariate column number
OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd covariate global presence count
OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd covariate global mean
OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd covariate global standard deviation
OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd covariate stratified standard deviation
OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd covariate vs. strata
OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # adjusted R-squared, 2nd covariate vs. strata
OutMtx [18, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd covariate vs. strata
OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd covariate presence count
OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd vs. 1st covariate)
OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation
OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared)
OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation
OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared
OutMtx [27, ] = adj_r_sqr_X_vs_Y_global; # Global adjusted R-squared
OutMtx [28, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0"
OutMtx [31, ] = cnt_XY_with_good_stratumID; # Stratified 1st & 2nd covariate presence count
OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd vs. 1st covariate)
OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation
OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared)
OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation
OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared
OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified; # Stratified adjusted R-squared
OutMtx [38, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0"
OutMtx [39, ] = colSums (Cnt_XY_per_stratum >= 2); # Number of strata with at least two counted points
OutMtx = t(OutMtx);
print ("Writing the output matrix...");
write (OutMtx, fileO, format=fmtO);
print ("END STRATIFIED STATISTICS SCRIPT");
fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
{ # TEMPORARY IMPLEMENTATION
tailprob = fStat;
for (i in 1:nrow(fStat)) {
for (j in 1:ncol(fStat)) {
q = as.scalar (fStat [i, j]);
d1 = as.scalar (df_1 [i, j]);
d2 = as.scalar (df_2 [i, j]);
if (d1 >= 1 & d2 >= 1 & q >= 0) {
tailprob [i, j] = pf(target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
} else {
tailprob [i, j] = 0/0;
}
} }
}
sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
{
mask_A = (input_A >= 0);
prep_A = input_A * mask_A;
mask_A = mask_A * (prep_A == prep_A);
prep_A = replace (target = prep_A, pattern = 0.0/0.0, replacement = 0);
output_A = sqrt (prep_A) / mask_A;
}