| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # Print summary stats about the distribution of missing values in a univariate time series. |
| # ------------------------------------------------------------------------------ |
| # NAME TYPE DEFAULT MEANING |
| # ------------------------------------------------------------------------------ |
| # X Matrix --- Numeric Vector ('vector') object containing NAs |
| # bins Integer 4 Split number for bin stats. Number of bins the time series gets |
| # divided into. For each bin information about amount/percentage of |
| # missing values is printed. |
| # verbose Boolean TRUE Print detailed information. |
| # For verbose = TRUE, the missing value stats are printed with |
| # more information ("Stats for Bins" and "Overview NA Series"). |
| # ------------------------------------------------------------------------------ |
| # stats Matrix Double Column vector where each row corresponds to the following: |
| # 1. Length of time series (including NAs) |
| # 2. Number of Missing Values (NAs) |
| # 3. Percentage of Missing Values (#2/#1) |
| # 4. Number of Gaps (consisting of one or more consecutive NAs) |
| # 5. Average Gap Size - Average size of consecutive NAs for the NA gaps |
| # 6. Longest NA gap - Longest series of consecutive missing values |
| # 7. Most frequent gap size - Most frequently occurring gap size |
| # 8. Gap size accounting for most NAs |
| # ------------------------------------------------------------------------------ |
| |
| m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = TRUE) |
|   return( Matrix[Double] stats) |
| { |
|   # Computes summary statistics about the missing values (NaNs) of a single-column |
|   # series X and returns them as an 8x1 column vector: |
|   #   1. series length, 2. number of NaNs, 3. fraction of NaNs (#2/#1), |
|   #   4. number of gaps, 5. average gap size, 6. longest gap, |
|   #   7. most frequent gap size, 8. gap size accounting for most NaNs. |
|   # If verbose is TRUE the statistics, per-bin NaN counts and an overview of the |
|   # gap sizes are printed as well. Stops if X has more than one column, is empty, |
|   # or contains no missing value. |
|   stats = matrix(0, rows=8, cols=1) |
| |
|   if(ncol(X) != 1) { |
|     stop("statsNA: expect a matrix with only one column"); |
|   } |
| |
|   # Count total entries |
|   length_series = length(X); |
|   # store length |
|   stats[1, 1] = length_series |
| |
|   if (length_series == 0) { |
|     stop("EMPTY MATRIX") |
|   } |
| |
|   # Validate the number of bins. The non-positive case is checked first because a |
|   # bin count of zero would make the bin length below a division by zero. |
|   if (bins < 1) { |
|     print("Warning: bin value can not be zero, bin value was changed to 1"); |
|     bins = 1; |
|   } else if (length_series < bins) { |
|     print("Warning: data is less than no. of bins, bins value was changed to 4"); |
|     bins = 4; |
|   } |
| |
|   # Count NaNs (p_position_nans is a 0/1 indicator of the missing positions) |
|   p_position_nans = is.na(X) |
|   number_nans = sum(p_position_nans); |
|   # stop if no null value found in data |
|   if(number_nans == 0) |
|     stop("No missing value found in the data.") |
|   stats[2, 1] = number_nans |
| |
|   # Fraction of NaNs in the series (scaled to percent only when printed) |
|   stats[3, 1] = number_nans / length_series; |
| |
|   # Create Vector with length of gaps |
|   # input:  0 0 1 1 1 0 0 0 1 1 1 1 0 1 |
|   # csgaps: 0 0 1 2 3 0 0 0 1 2 3 4 0 1 |
|   # output: 0 0 0 0 3 0 0 0 0 0 0 4 0 1 |
|   csgaps = cumsumprod(cbind(p_position_nans,p_position_nans)); |
|   # csmask is csgaps shifted up by one row (last row stays 0), so that a gap ends |
|   # exactly where csgaps > csmask. A one-element series has nothing to shift. |
|   csmask = matrix(0, length_series, 1); |
|   if (length_series > 1) { |
|     csmask[1:(length_series-1), 1] = csgaps[2:length_series, 1] |
|   } |
|   gap_lengths = csgaps * (csgaps > csmask) |
|   gap_lengths = removeEmpty(target=gap_lengths, margin="rows") |
|   # Row i of p_gaps_vector holds the number of gaps of size i |
|   p_gaps_vector = table(gap_lengths, 1); |
| |
|   # Count number of gaps |
|   number_nan_gaps = sum(p_gaps_vector); |
|   stats[4, 1] = number_nan_gaps |
| |
|   # Calculate average gap size |
|   stats[5, 1] = number_nans / number_nan_gaps |
| |
|   # Find longest gap. Taken directly from the gap lengths so the result does not |
|   # depend on how rowIndexMax resolves ties in a 0/1 indicator vector. |
|   stats[6, 1] = max(gap_lengths) |
| |
|   # Find most frequent gap size |
|   stats[7, 1] = as.scalar(rowIndexMax(t(p_gaps_vector))); |
| |
|   # Gap size that has most NaNs (gap size times number of occurrences) |
|   p_gaps_vector_with_weight = seq(1,nrow(p_gaps_vector)) * p_gaps_vector; |
|   stats[8, 1] = as.scalar(rowIndexMax(t(p_gaps_vector_with_weight))); |
| |
|   # Calculate bins |
|   #--- |
|   bin_length = ceiling(length_series / bins) |
|   padded_length = bins * bin_length |
| |
|   # Calculate where a bin starts and ends; ends are clamped to the series length |
|   # and trailing bins that start beyond the series get size 0. |
|   bins_start = seq(1, padded_length, bin_length); |
|   bins_end = min(seq(bin_length, padded_length, bin_length), length_series) |
|   bins_size = max(bins_end - bins_start + 1, 0) |
| |
|   # The reshape below needs exactly bins*bin_length cells, so pad the indicator |
|   # with zeros when the series length is not a multiple of the bin length. |
|   nan_flags = p_position_nans |
|   if (padded_length > length_series) { |
|     nan_flags = rbind(p_position_nans, matrix(0, padded_length - length_series, 1)) |
|   } |
|   bins_nans = rowSums(matrix(nan_flags, bins, bin_length)) |
|   # Divide by the actual bin size; max(.,1) only protects empty bins from 0/0. |
|   bins_percentage = bins_nans / max(bins_size, 1); |
| |
|   # Print results |
|   #--- |
|   if (verbose) { |
|     print("-------------------------") |
|     print("Length of time series:"); |
|     print(as.scalar(stats[1, 1])); |
|     print("-------------------------"); |
|     print("Number of Missing Values:"); |
|     print(as.scalar(stats[2, 1])); |
|     print("-------------------------"); |
|     print("Percentage of Missing Values:"); |
|     # stats[3] is a fraction, so scale it to match the percent sign |
|     print("%3.2f %%", 100 * as.scalar(stats[3, 1])); |
|     print("-------------------------"); |
|     print("Number of Gaps:"); |
|     print(as.scalar(stats[4, 1])); |
|     print("-------------------------"); |
|     print("Average Gap Size:"); |
|     # a gap size is a count, not a percentage |
|     print("%3.2f", as.scalar(stats[5, 1])); |
|     print("-------------------------"); |
|     print("Longest NA gap (series of consecutive NAs)"); |
|     print(as.scalar(stats[6, 1])); |
|     print("-------------------------"); |
|     print("Most frequent gap size (series of consecutive NA series)"); |
|     print(as.scalar(stats[7, 1])); |
|     print("-------------------------"); |
|     print("Gap size accounting for most NAs"); |
|     print(as.scalar(stats[8, 1])); |
|     print("-------------------------"); |
|     if(bins > 0) { |
|       print("Stats for Bins") |
|       for (i in 1:bins) { |
|         l = as.scalar(bins_size[i,1]); |
|         # skip trailing bins that lie completely beyond the end of the series |
|         if (l > 0) { |
|           s = as.scalar(bins_start[i,1]); |
|           e = as.scalar(bins_end[i,1]); |
|           n = as.scalar(bins_nans[i,1]); |
|           p = 100 * as.scalar(bins_percentage[i,1]); |
|           print(" Bin %d (%2.0f values from %2.0f to %2.0f):%5.0f NAs (%3.2f %%)", i,l,s,e,n,p); |
|         } |
|       } |
|       print("-------------------------") |
|     } |
|     print("Overview NA Series") |
|     for (i in 1:nrow(p_gaps_vector)) { |
|       # i is the gap size, v the number of gaps of that size |
|       v = as.scalar(p_gaps_vector[i,1]); |
|       if(v > 0) |
|         print(" %d NA in a row: %.0f times", i, v); |
|     } |
|     print("-------------------------") |
|   } |
| } |