| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # Fix invalid lengths |
| # |
| # INPUT: |
| # ------------------- |
| # F1 --- |
| # mask --- |
| # ql --- |
| # qu --- |
| # ------------------- |
| # |
| # OUTPUT: |
| # ------------------- |
| # out --- |
| # M --- |
| # ------------------- |
| |
| s_fixInvalidLengths = function(Frame[Unknown] F1, Matrix[Double] mask, Double ql = 0.05, Double qu = 0.99) |
| return (Frame[Unknown] X, Matrix[Double] mask, Matrix[Double] qLow, Matrix[Double] qUp) |
| { |
| length = map(F1, "x -> x.length()") |
| length = as.matrix(length) |
| length = replace(target = (length * mask), pattern = NaN, replacement = 0) |
| [M, qLow, qUp] = getInvalidsMask(length, ql, qu) |
| # # # check if mask vector has 1 in more than one column |
| # # # this indicates that two values are being swapped and can be fixed |
| rowCountSwap = rowSums(M) >= 2 |
| rowCountDangling = rowSums(M) > 0 & rowSums(M) < 2 |
| |
| if(sum(rowCountSwap) > 0) |
| { |
| countTotalSwaps = sum(rowCountSwap) |
| # # get the row index for swapping |
| rowIds = rowCountSwap * seq(1, nrow(rowCountSwap)) |
| rowIds = removeEmpty(target=rowIds, margin="rows") |
| colIds = M * t(seq(1, ncol(M))) |
| for(i in 1:countTotalSwaps) |
| { |
| rowIdx = as.scalar(rowIds[i, 1]) |
| colIdx = removeEmpty(target = colIds[rowIdx], margin="cols") |
| id1 = as.scalar(colIdx[1, 1]) |
| id2 = as.scalar(colIdx[1, 2]) |
| tmp = F1[rowIdx, id1] |
| F1[rowIdx, id1] = F1[rowIdx, id2] |
| F1[rowIdx, id2] = tmp |
| # # remove the mask for fixed entries |
| M[rowIdx, id1] = 0 |
| M[rowIdx, id2] = 0 |
| } |
| } |
| if(sum(rowCountDangling) > 0) # no swaps just invalid lengths |
| { |
| countTotalInvalids = sum(rowCountDangling) |
| # # get the row index for swapping |
| rowIds = rowCountDangling * seq(1, nrow(rowCountDangling)) |
| rowIds = removeEmpty(target=rowIds, margin="rows") |
| colIds = M * t(seq(1, ncol(M))) |
| for(i in 1:countTotalInvalids) |
| { |
| rowIdx = as.scalar(rowIds[i, 1]) |
| colIdx = removeEmpty(target = colIds[rowIdx], margin="cols") |
| id1 = as.scalar(colIdx[1, 1]) |
| F1[rowIdx, id1] = "" |
| # # remove the mask for fixed entries |
| M[rowIdx, id1] = 0 |
| } |
| } |
| |
| M = replace(target = M, pattern = 1, replacement = NaN) |
| X = F1 |
| } |
| |
| getInvalidsMask = function(Matrix[Double] X, Double ql = 0.05, Double qu = 0.99) |
| return (Matrix[Double] Y, Matrix[Double] qLow, Matrix[Double] qUp) { |
| |
| Y = matrix(0, nrow(X), ncol(X)) |
| qLow = matrix(0, rows=1, cols=ncol(X)) |
| qUp = matrix(0, rows=1, cols=ncol(X)) |
| for(i in 1:ncol(X), check=0) { |
| q1 = quantile(X[,i], ql) |
| q2 = quantile(X[,i], qu) |
| qLow[1, i] = q1 |
| qUp[1, i] = q2 |
| Y[, i] = ( X[, i] < q1 | X[, i] > q2) |
| } |
| } |