blob: d8589664e9d46a8903f22ffb73a1e23db38c968a [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Print summary stats about the distribution of missing values in a univariate time series.
# ------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ------------------------------------------------------------------------------
# X Matrix --- Numeric Vector ('vector') object containing NAs
# bins Integer 4 Split number for bin stats. Number of bins the time series gets
# divided into. For each bin information about amount/percentage of
# missing values is printed.
# verbose Boolean TRUE Print detailed information.
# For verbose = TRUE, the missing value stats are printed with
# more information ("Stats for Bins" and "Overview NA Series").
# ------------------------------------------------------------------------------
# stats Matrix Double Column vector where each row corresponds to the following:
# 1. Length of time series (including NAs)
# 2. Number of Missing Values (NAs)
# 3. Percentage of Missing Values (#2/#1)
# 4. Number of Gaps (consisting of one or more consecutive NAs)
# 5. Average Gap Size - Average size of consecutive NAs for the NA gaps
# 6. Longest NA gap - Longest series of consecutive missing values
# 7. Most frequent gap size - Most frequently occurring gap size
# 8. Gap size accounting for most NAs
# ------------------------------------------------------------------------------
# Computes eight summary statistics about the missing values (NaNs) of a
# single-column series X and, if verbose is TRUE, prints them together with
# per-bin NaN counts and an overview of how often each gap size occurs.
# Stops if X does not have exactly one column, is empty, or contains no NaN.
# The returned 8x1 vector holds, in order: series length, number of NaNs,
# fraction of NaNs (#2/#1), number of gaps, average gap size, longest gap,
# most frequent gap size, gap size accounting for most NaNs.
m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = TRUE)
  return( Matrix[Double] stats)
{
  stats = matrix(0, rows=8, cols=1)

  if(ncol(X) != 1) {
    stop("statsNA: expect a matrix with only one column");
  }

  # Count total entries and store the length
  length_series = length(X);
  stats[1, 1] = length_series

  if (length_series == 0) {
    stop("EMPTY MATRIX")
  }

  # Validate the bin count. A non-positive bin count must be caught first,
  # otherwise the bin length below would be computed by dividing by zero.
  if (bins < 1) {
    print("Warning: bin value can not be zero, bin value was changed to 1");
    bins = 1;
  } else if (length_series < bins) {
    print("Warning: data is less than no. of bins, bins value was changed to 4");
    bins = 4;
  }

  # Count NaNs
  p_position_nans = is.na(X)
  number_nans = sum(p_position_nans);

  # stop if no null value found in data
  if(number_nans == 0) {
    stop("No missing value found in the data.")
  }
  stats[2, 1] = number_nans

  # Fraction of NaNs in the series (value in [0, 1])
  stats[3, 1] = number_nans / length_series;

  # Create Vector with length of gaps
  # input:  0 0 1 1 1 0 0 0 1 1 1 1 0 1
  # csgaps: 0 0 1 2 3 0 0 0 1 2 3 4 0 1
  # output: 0 0 0 0 3 0 0 0 0 0 0 4 0 1
  csgaps = cumsumprod(cbind(p_position_nans,p_position_nans));
  # csmask is csgaps shifted up by one row (last row stays 0), so that
  # csgaps > csmask is true exactly at the last position of every gap.
  csmask = matrix(0, length_series, 1);
  if (length_series > 1) {
    # a single-element series has no successor row to shift in
    csmask[1:(length_series-1), 1] = csgaps[2:length_series, 1]
  }
  gap_lengths = csgaps * (csgaps > csmask)
  gap_lengths = removeEmpty(target=gap_lengths, margin="rows")

  # p_gaps_vector[i] = number of gaps that consist of exactly i NaNs
  p_gaps_vector = table(gap_lengths, 1);

  # Count number of gaps
  number_nan_gaps = sum(p_gaps_vector);
  stats[4, 1] = number_nan_gaps

  # Calculate average gap size
  stats[5, 1] = number_nans / number_nan_gaps

  # Find longest gap
  # NOTE(review): relies on rowIndexMax resolving ties towards the larger
  # index, since every occurring gap size maps to the same value 1 - confirm.
  stats[6, 1] = as.scalar(rowIndexMax(t(p_gaps_vector>0)))

  # Find most frequent gap size
  stats[7, 1] = as.scalar(rowIndexMax(t(p_gaps_vector)));

  # Gap size that has most NaNs (gap size weighted by its frequency)
  p_gaps_vector_with_weight = seq(1,nrow(p_gaps_vector)) * p_gaps_vector;
  stats[8, 1] = as.scalar(rowIndexMax(t(p_gaps_vector_with_weight)));

  # Calculate bins
  #---
  bin_length = ceiling(length_series / bins)
  padded_length = bins * bin_length

  # Calculate where a bin starts and ends; the end is clipped to the series
  # length because the last bin(s) may be only partially filled.
  bins_start = seq(1, padded_length, bin_length);
  bins_end = min(seq(bin_length, padded_length, bin_length), length_series)
  bins_size = max(bins_end - bins_start + 1, 0)

  # The reshape below needs exactly bins*bin_length cells. When the series
  # length is not a multiple of the bin count, pad the NaN indicator with
  # zeros (zeros do not contribute to the per-bin NaN counts).
  padded_nans = p_position_nans
  if (padded_length > length_series) {
    padding = matrix(0, padded_length - length_series, 1)
    padded_nans = rbind(p_position_nans, padding)
  }
  bins_nans = rowSums(matrix(padded_nans, bins, bin_length))
  # Divide by the real number of values in each bin (at least 1 to avoid 0/0)
  bins_percentage = bins_nans / max(bins_size, 1);

  # Print results
  #---
  if (verbose) {
    print("-------------------------")
    print("Length of time series:");
    print(as.scalar(stats[1, 1]));
    print("-------------------------");
    print("Number of Missing Values:");
    print(as.scalar(stats[2, 1]));
    print("-------------------------");
    print("Percentage of Missing Values:");
    # stats[3] is a fraction; scale to an actual percentage for display
    print("%3.2f %%", as.scalar(stats[3, 1]) * 100);
    print("-------------------------");
    print("Number of Gaps:");
    print(as.scalar(stats[4, 1]));
    print("-------------------------");
    print("Average Gap Size:");
    # a gap size is a count, not a percentage
    print("%3.2f", as.scalar(stats[5, 1]));
    print("-------------------------");
    print("Longest NA gap (series of consecutive NAs)");
    print(as.scalar(stats[6, 1]));
    print("-------------------------");
    print("Most frequent gap size (series of consecutive NA series)");
    print(as.scalar(stats[7, 1]));
    print("-------------------------");
    print("Gap size accounting for most NAs");
    print(as.scalar(stats[8, 1]));
    print("-------------------------");
    if(bins > 0) {
      print("Stats for Bins")
      for (i in 1:bins) {
        l = as.scalar(bins_size[i,1]);
        s = as.scalar(bins_start[i,1]);
        e = as.scalar(bins_end[i,1]);
        n = as.scalar(bins_nans[i,1]);
        p = as.scalar(bins_percentage[i,1]) * 100;
        print(" Bin %d (%2.0f values from %2.0f to %2.0f):%5.0f NAs (%3.2f %%)", i,l,s,e,n,p);
      }
      print("-------------------------")
    }
    print("Overview NA Series")
    for (i in 1:nrow(p_gaps_vector)) {
      v = as.scalar(p_gaps_vector[i,1]);
      if(v > 0) {
        # i is the gap size, v is how many gaps of that size exist
        print(" %d NA in a row: %.0f times", i, v);
      }
    }
    print("-------------------------")
  }
}