scripts/builtin/statsNA.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Print summary stats about the distribution of missing values in a univariate time series.
 # ------------------------------------------------------------------------------
 # NAME    TYPE        DEFAULT     MEANING
 # ------------------------------------------------------------------------------
 # X       Matrix      ---         Numeric Vector ('vector') object containing NAs
 # bins    Integer     4           Split number for bin stats. Number of bins the time series gets
 #                                 divided into. For each bin information about amount/percentage of
 #                                 missing values is printed.
 # verbose Boolean     TRUE        Print detailed information.
 #                                 For verbose = TRUE, the missing value stats are printed with
 #                                 more information ("Stats for Bins" and "Overview NA Series").
 # ------------------------------------------------------------------------------
 # stats   Matrix      Double      Column vector where each row corresponds to the following:
 #                                 1. Length of time series (including NAs)
 #                                 2. Number of Missing Values (NAs)
 #                                 3. Percentage of Missing Values (#2/#1)
 #                                 4. Number of Gaps (consisting of one or more consecutive NAs)
 #                                 5. Average Gap Size - Average size of consecutive NAs for the NA gaps
 #                                 6. Longest NA gap - Longest series of consecutive missing values
 #                                 7. Most frequent gap size - Most frequently occurring gap size
 #                                 8. Gap size accounting for most NAs
 # ------------------------------------------------------------------------------

 # Computes (and optionally prints) summary statistics about the missing values
 # (NaNs) of a univariate time series.
 #
 # X        single-column matrix holding the series; must contain at least one NaN
 # bins     number of bins used for the per-bin report (only used when verbose)
 # verbose  if TRUE, print the statistics, the per-bin report and the gap overview
 #
 # Returns an 8x1 matrix: [1] series length, [2] number of NaNs, [3] fraction of
 # NaNs (#2/#1), [4] number of gaps, [5] average gap size, [6] longest gap,
 # [7] most frequent gap size, [8] gap size accounting for most NaNs.
 # Stops if X has more than one column, is empty, or contains no NaN.
 m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = TRUE)
   return( Matrix[Double] stats)
 {
   stats = matrix(0, rows=8, cols=1)

   if(ncol(X) != 1) {
     stop("statsNA: expect a matrix with only one column");
   }

   # Count total entries and store the length
   length_series = length(X);
   stats[1, 1] = length_series

   if (length_series == 0) {
     stop("EMPTY MATRIX")
   }

   if (length_series < bins) {
     print("Warning: data is less than no. of bins, bins value was changed to 4");
     bins = 4;
   } else if (bins < 1) {
     # guard against zero/negative bin counts, which would otherwise lead to a
     # division by zero when computing the bin length
     print("Warning: bin value can not be zero, bin value was changed to 1");
     bins = 1;
   }

   # Count NaNs
   p_position_nans = is.na(X)
   number_nans = sum(p_position_nans);
   # stop if no null value found in data
   if(number_nans == 0)
     stop("No missing value found in the data.")
   stats[2, 1] = number_nans

   # Fraction of NaNs
   stats[3, 1] = number_nans / length_series;

   # Create Vector with length of gaps
   #  input:  0 0 1 1 1 0 0 0 1 1 1 1 0 1
   #  csgaps: 0 0 1 2 3 0 0 0 1 2 3 4 0 1
   #  output: 0 0 0 0 3 0 0 0 0 0 0 4 0 1
   csgaps = cumsumprod(cbind(p_position_nans,p_position_nans));
   # csmask is csgaps shifted up by one row (last row stays 0), so that
   # csgaps > csmask marks exactly the last position of every gap
   csmask = matrix(0, rows=length_series, cols=1);
   if (length_series > 1) {
     # a one-element series has nothing to shift; the ranges would be invalid
     csmask[1:(length_series-1), 1] = csgaps[2:length_series, 1]
   }
   gap_lengths = csgaps * (csgaps > csmask)
   gap_lengths = removeEmpty(target=gap_lengths, margin="rows")
   # row i of p_gaps_vector holds the number of gaps of size i
   p_gaps_vector = table(gap_lengths, 1);

   # Count number of gaps
   number_nan_gaps = sum(p_gaps_vector);
   stats[4, 1] = number_nan_gaps

   # Calculate average gap size
   stats[5, 1] = number_nans / number_nan_gaps

   # Find longest gap (taken directly from the gap lengths so that the result
   # does not depend on how rowIndexMax resolves ties)
   stats[6, 1] = max(gap_lengths)

   # Find most frequent gap size
   stats[7, 1] = as.scalar(rowIndexMax(t(p_gaps_vector)));

   # Gap size that has most NaNs
   p_gaps_vector_with_weight = seq(1,nrow(p_gaps_vector)) * p_gaps_vector;
   stats[8, 1] = as.scalar(rowIndexMax(t(p_gaps_vector_with_weight)));

   # Print results
   #---
   if (verbose) {
     print("-------------------------")
     print("Length of time series:");
     print(as.scalar(stats[1, 1]));
     print("-------------------------");
     print("Number of Missing Values:");
     print(as.scalar(stats[2, 1]));
     print("-------------------------");
     print("Percentage of Missing Values:");
     # stats[3] is a fraction; scale for display only
     print("%3.2f %%", as.scalar(stats[3, 1]) * 100);
     print("-------------------------");
     print("Number of Gaps:");
     print(as.scalar(stats[4, 1]));
     print("-------------------------");
     print("Average Gap Size:");
     print("%3.2f", as.scalar(stats[5, 1]));
     print("-------------------------");
     print("Longest NA gap (series of consecutive NAs)");
     print(as.scalar(stats[6, 1]));
     print("-------------------------");
     print("Most frequent gap size (series of consecutive NA series)");
     print(as.scalar(stats[7, 1]));
     print("-------------------------");
     print("Gap size accounting for most NAs");
     print(as.scalar(stats[8, 1]));
     print("-------------------------");

     # Calculate bins (only needed for the report)
     #---
     bin_length = ceiling(length_series / bins)
     padded_length = bins * bin_length

     # Calculate where a bin starts and ends
     bins_start = seq(1, padded_length, bin_length);
     bins_end = seq(bin_length, padded_length, bin_length)

     # Pad the NaN indicator with zeros so it can be reshaped into
     # bins x bin_length even if the series length is not a multiple of bins
     nan_flags = p_position_nans
     if (padded_length > length_series) {
       pad_rows = as.integer(padded_length - length_series)
       nan_flags = rbind(p_position_nans, matrix(0, rows=pad_rows, cols=1))
     }
     bins_nans = rowSums(matrix(nan_flags, rows=bins, cols=bin_length))
     bins_percentage = bins_nans / bin_length * 100;

     print("Stats for Bins")
     for (i in 1:bins) {
       l = bin_length
       s = as.scalar(bins_start[i,1]);
       e = as.scalar(bins_end[i,1]);
       n = as.scalar(bins_nans[i,1]);
       p = as.scalar(bins_percentage[i,1]);
       print("  Bin %d (%2.0f values from %2.0f to %2.0f):%5.0f NAs (%3.2f %%)", i,l,s,e,n,p);
     }
     print("-------------------------")

     print("Overview NA Series")
     for (i in 1:nrow(p_gaps_vector)) {
       # i is the gap size, v the number of gaps of that size
       v = as.scalar(p_gaps_vector[i,1]);
       if(v > 0)
         print(" %d NA in a row: %.0f times", i, v);
     }
     print("-------------------------")
   }
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# Print summary stats about the distribution of missing values in a univariate time series.
	# ------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ------------------------------------------------------------------------------
	# X Matrix --- Numeric Vector ('vector') object containing NAs
	# bins Integer 4 Split number for bin stats. Number of bins the time series gets
	# divided into. For each bin information about amount/percentage of
	# missing values is printed.
	# verbose Boolean TRUE Print detailed information.
	# For verbose = TRUE, the missing value stats are printed with
	# more information ("Stats for Bins" and "Overview NA Series").
	# ------------------------------------------------------------------------------
	# stats Matrix Double Column vector where each row corresponds to the following:
	# 1. Length of time series (including NAs)
	# 2. Number of Missing Values (NAs)
	# 3. Percentage of Missing Values (#2/#1)
	# 4. Number of Gaps (consisting of one or more consecutive NAs)
	# 5. Average Gap Size - Average size of consecutive NAs for the NA gaps
	# 6. Longest NA gap - Longest series of consecutive missing values
	# 7. Most frequent gap size - Most frequently occurring gap size
	# 8. Gap size accounting for most NAs
	# ------------------------------------------------------------------------------

	# Computes (and optionally prints) summary statistics about the missing values
	# (NaNs) of a univariate time series.
	#
	# X        single-column matrix holding the series; must contain at least one NaN
	# bins     number of bins used for the per-bin report (only used when verbose)
	# verbose  if TRUE, print the statistics, the per-bin report and the gap overview
	#
	# Returns an 8x1 matrix: [1] series length, [2] number of NaNs, [3] fraction of
	# NaNs (#2/#1), [4] number of gaps, [5] average gap size, [6] longest gap,
	# [7] most frequent gap size, [8] gap size accounting for most NaNs.
	# Stops if X has more than one column, is empty, or contains no NaN.
	m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = TRUE)
	  return( Matrix[Double] stats)
	{
	  stats = matrix(0, rows=8, cols=1)

	  if(ncol(X) != 1) {
	    stop("statsNA: expect a matrix with only one column");
	  }

	  # Count total entries and store the length
	  length_series = length(X);
	  stats[1, 1] = length_series

	  if (length_series == 0) {
	    stop("EMPTY MATRIX")
	  }

	  if (length_series < bins) {
	    print("Warning: data is less than no. of bins, bins value was changed to 4");
	    bins = 4;
	  } else if (bins < 1) {
	    # guard against zero/negative bin counts, which would otherwise lead to a
	    # division by zero when computing the bin length
	    print("Warning: bin value can not be zero, bin value was changed to 1");
	    bins = 1;
	  }

	  # Count NaNs
	  p_position_nans = is.na(X)
	  number_nans = sum(p_position_nans);
	  # stop if no null value found in data
	  if(number_nans == 0)
	    stop("No missing value found in the data.")
	  stats[2, 1] = number_nans

	  # Fraction of NaNs
	  stats[3, 1] = number_nans / length_series;

	  # Create Vector with length of gaps
	  # input: 0 0 1 1 1 0 0 0 1 1 1 1 0 1
	  # csgaps: 0 0 1 2 3 0 0 0 1 2 3 4 0 1
	  # output: 0 0 0 0 3 0 0 0 0 0 0 4 0 1
	  csgaps = cumsumprod(cbind(p_position_nans,p_position_nans));
	  # csmask is csgaps shifted up by one row (last row stays 0), so that
	  # csgaps > csmask marks exactly the last position of every gap
	  csmask = matrix(0, rows=length_series, cols=1);
	  if (length_series > 1) {
	    # a one-element series has nothing to shift; the ranges would be invalid
	    csmask[1:(length_series-1), 1] = csgaps[2:length_series, 1]
	  }
	  gap_lengths = csgaps * (csgaps > csmask)
	  gap_lengths = removeEmpty(target=gap_lengths, margin="rows")
	  # row i of p_gaps_vector holds the number of gaps of size i
	  p_gaps_vector = table(gap_lengths, 1);

	  # Count number of gaps
	  number_nan_gaps = sum(p_gaps_vector);
	  stats[4, 1] = number_nan_gaps

	  # Calculate average gap size
	  stats[5, 1] = number_nans / number_nan_gaps

	  # Find longest gap (taken directly from the gap lengths so that the result
	  # does not depend on how rowIndexMax resolves ties)
	  stats[6, 1] = max(gap_lengths)

	  # Find most frequent gap size
	  stats[7, 1] = as.scalar(rowIndexMax(t(p_gaps_vector)));

	  # Gap size that has most NaNs
	  p_gaps_vector_with_weight = seq(1,nrow(p_gaps_vector)) * p_gaps_vector;
	  stats[8, 1] = as.scalar(rowIndexMax(t(p_gaps_vector_with_weight)));

	  # Print results
	  #---
	  if (verbose) {
	    print("-------------------------")
	    print("Length of time series:");
	    print(as.scalar(stats[1, 1]));
	    print("-------------------------");
	    print("Number of Missing Values:");
	    print(as.scalar(stats[2, 1]));
	    print("-------------------------");
	    print("Percentage of Missing Values:");
	    # stats[3] is a fraction; scale for display only
	    print("%3.2f %%", as.scalar(stats[3, 1]) * 100);
	    print("-------------------------");
	    print("Number of Gaps:");
	    print(as.scalar(stats[4, 1]));
	    print("-------------------------");
	    print("Average Gap Size:");
	    print("%3.2f", as.scalar(stats[5, 1]));
	    print("-------------------------");
	    print("Longest NA gap (series of consecutive NAs)");
	    print(as.scalar(stats[6, 1]));
	    print("-------------------------");
	    print("Most frequent gap size (series of consecutive NA series)");
	    print(as.scalar(stats[7, 1]));
	    print("-------------------------");
	    print("Gap size accounting for most NAs");
	    print(as.scalar(stats[8, 1]));
	    print("-------------------------");

	    # Calculate bins (only needed for the report)
	    #---
	    bin_length = ceiling(length_series / bins)
	    padded_length = bins * bin_length

	    # Calculate where a bin starts and ends
	    bins_start = seq(1, padded_length, bin_length);
	    bins_end = seq(bin_length, padded_length, bin_length)

	    # Pad the NaN indicator with zeros so it can be reshaped into
	    # bins x bin_length even if the series length is not a multiple of bins
	    nan_flags = p_position_nans
	    if (padded_length > length_series) {
	      pad_rows = as.integer(padded_length - length_series)
	      nan_flags = rbind(p_position_nans, matrix(0, rows=pad_rows, cols=1))
	    }
	    bins_nans = rowSums(matrix(nan_flags, rows=bins, cols=bin_length))
	    bins_percentage = bins_nans / bin_length * 100;

	    print("Stats for Bins")
	    for (i in 1:bins) {
	      l = bin_length
	      s = as.scalar(bins_start[i,1]);
	      e = as.scalar(bins_end[i,1]);
	      n = as.scalar(bins_nans[i,1]);
	      p = as.scalar(bins_percentage[i,1]);
	      print(" Bin %d (%2.0f values from %2.0f to %2.0f):%5.0f NAs (%3.2f %%)", i,l,s,e,n,p);
	    }
	    print("-------------------------")

	    print("Overview NA Series")
	    for (i in 1:nrow(p_gaps_vector)) {
	      # i is the gap size, v the number of gaps of that size
	      v = as.scalar(p_gaps_vector[i,1]);
	      if(v > 0)
	        print(" %d NA in a row: %.0f times", i, v);
	    }
	    print("-------------------------")
	  }
	}