blob: 95ce6692664fb93f86b69dbaa53600103892c12c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.util;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.matrix.data.FrameBlock;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
public class DMVUtils {
public static final char DIGIT = 'd';
public static final char LOWER = 'l';
public static final char UPPER = 'u';
public static final char ALPHA = 'a';
public static final char SPACE = 's';
public static final char DOT = 't';
public static final char OTHER = 'y';
public static final char ARBITRARY_LEN = '+';
public static final char MINUS = '-';
public enum LEVEL_ENUM { LEVEL1, LEVEL2, LEVEL3, LEVEL4, LEVEL5, LEVEL6}
public static FrameBlock syntacticalPatternDiscovery(FrameBlock frame, double threshold, String disguised_value) {
// Preparation
String disguisedVal = disguised_value;
int numCols = frame.getNumColumns();
int numRows = frame.getNumRows();
ArrayList<Map<String, Integer>> table_Hist = new ArrayList<>(numCols); // list of every column with values and their frequency
for (int idx = 0; idx < numCols; idx++) {
Object c = frame.getColumnData(idx);
String[] column = (String[]) c;
String key = "";
for (String attr : column) {
key = (attr.isEmpty()) ? "NULL": attr;
addDistinctValueOrIncrementCounter(table_Hist, key, idx);
}
}
// Syntactic Pattern Discovery
int idx = -1;
for (Map<String, Integer> col_Hist : table_Hist) {
idx++;
Map<String, Double> dominant_patterns_ratio = new HashMap<>();
Map<String, Integer> prev_pattern_hist = col_Hist;
for(LEVEL_ENUM level : LEVEL_ENUM.values()) {
dominant_patterns_ratio.clear();
Map<String, Integer> current_pattern_hist = LevelsExecutor(prev_pattern_hist, level);
dominant_patterns_ratio = calculatePatternsRatio(current_pattern_hist, numRows);
String dominant_pattern = findDominantPattern(dominant_patterns_ratio, threshold);
if(dominant_pattern != null) { //found pattern
detectDisguisedValues(dominant_pattern, frame.getColumnData(idx), idx, frame, level, disguisedVal);
break;
}
prev_pattern_hist = current_pattern_hist;
}
}
return frame;
}
public static Map<String, Double> calculatePatternsRatio(Map<String, Integer> patterns_hist, int nr_entries) {
Map<String, Double> patterns_ratio_map = new HashMap<>();
for(Entry<String, Integer> e : patterns_hist.entrySet()) {
String pattern = e.getKey();
double nr_occurences = e.getValue();
double current_ratio = nr_occurences / nr_entries; // percentage of current pattern in column
patterns_ratio_map.put(pattern, current_ratio);
}
return patterns_ratio_map;
}
public static String findDominantPattern(Map<String, Double> dominant_patterns, double threshold) {
for(Entry<String, Double> e : dominant_patterns.entrySet()) {
String pattern = e.getKey();
Double pattern_ratio = e.getValue();
if(pattern_ratio > threshold)
return pattern;
}
return null;
}
private static void addDistinctValueOrIncrementCounter(ArrayList<Map<String, Integer>> maps, String key, Integer idx) {
if (maps.size() == idx) {
HashMap<String, Integer> m = new HashMap<>();
m.put(key, 1);
maps.add(m);
return;
}
if (!(maps.get(idx).containsKey(key)))
maps.get(idx).put(key, 1);
else
maps.get(idx).compute(key, (k, v) -> v + 1);
}
private static void addDistinctValueOrIncrementCounter(Map<String, Integer> map, String encoded_value, Integer nr_occurrences) {
if (!(map.containsKey(encoded_value)))
map.put(encoded_value, nr_occurrences);
else
map.compute(encoded_value, (k, v) -> v + nr_occurrences);
}
public static Map<String, Integer> LevelsExecutor(Map<String, Integer> old_pattern_hist, LEVEL_ENUM level) {
Map<String, Integer> new_pattern_hist = new HashMap<>();
for(Entry<String, Integer> e : old_pattern_hist.entrySet()) {
String pattern = e.getKey();
Integer nr_of_occurrences = e.getValue();
String new_pattern;
switch(level) {
case LEVEL1: // default encoding
new_pattern = encodeRawString(pattern);
break;
case LEVEL2: // ignores the number of occurrences. It replaces all numbers with '+'
new_pattern = removeNumbers(pattern);
break;
case LEVEL3: // ignores upper and lowercase characters. It replaces all 'u' and 'l' with 'a' = Alphabet
new_pattern = removeUpperLowerCase(pattern);
break;
case LEVEL4: // changes floats to digits
new_pattern = removeInnerCharacterInPattern(pattern, DIGIT, DOT);
break;
case LEVEL5: // removes spaces between strings
new_pattern = removeInnerCharacterInPattern(pattern, ALPHA, SPACE);
break;
case LEVEL6: // changes negative numbers to digits
new_pattern = acceptNegativeNumbersAsDigits(pattern);
break;
default:
new_pattern = "";
break;
}
addDistinctValueOrIncrementCounter(new_pattern_hist, new_pattern, nr_of_occurrences);
}
return new_pattern_hist;
}
public static String acceptNegativeNumbersAsDigits(String pattern) {
char[] chars = pattern.toCharArray();
StringBuilder tmp = new StringBuilder();
boolean currently_minus_digit = false;
for (char ch : chars) {
if(ch == MINUS && !currently_minus_digit) {
currently_minus_digit = true;
}
else if(ch == DIGIT && currently_minus_digit) {
tmp.append(ch);
currently_minus_digit = false;
}
else if(currently_minus_digit) {
tmp.append(MINUS);
tmp.append(ch);
currently_minus_digit = false;
}
else {
tmp.append(ch);
}
}
return tmp.toString();
}
public static String removeInnerCharacterInPattern(String pattern, char outter_char, char inner_char) {
char[] chars = pattern.toCharArray();
StringBuilder tmp = new StringBuilder();
boolean currently_digit = false;
for (char ch : chars) {
if(ch == outter_char && !currently_digit) {
currently_digit = true;
tmp.append(ch);
}
else if(currently_digit && (ch == outter_char || ch == inner_char))
continue;
else if(ch != inner_char && ch != ARBITRARY_LEN) {
currently_digit = false;
tmp.append(ch);
}
else {
if(tmp.length() > 0 && tmp.charAt(tmp.length() - 1) != ARBITRARY_LEN)
tmp.append(ch);
}
}
return tmp.toString();
}
public static String removeUpperLowerCase(String pattern) {
char[] chars = pattern.toCharArray();
StringBuilder tmp = new StringBuilder();
boolean currently_alphabetic = false;
for (char ch : chars) {
if(ch == UPPER || ch == LOWER) {
if(!currently_alphabetic) {
currently_alphabetic = true;
tmp.append(ALPHA);
}
}
else if(ch == ARBITRARY_LEN) {
if(tmp.charAt(tmp.length() - 1) != ARBITRARY_LEN)
tmp.append(ch);
}
else {
tmp.append(ch);
currently_alphabetic = false;
}
}
return tmp.toString();
}
private static String removeNumbers(String pattern) {
char[] chars = pattern.toCharArray();
StringBuilder tmp = new StringBuilder();
for (char ch : chars) {
if(Character.isDigit(ch))
tmp.append(ARBITRARY_LEN);
else
tmp.append(ch);
}
return tmp.toString();
}
public static String encodeRawString(String input) {
char[] chars = input.toCharArray();
StringBuilder tmp = new StringBuilder();
for (char ch : chars) {
tmp.append(getCharClass(ch));
}
return getFrequencyOfEachConsecutiveChar(tmp.toString());
}
private static char getCharClass(char c) {
if (Character.isDigit(c)) return DIGIT;
if (Character.isLowerCase(c)) return LOWER;
if (Character.isUpperCase(c)) return UPPER;
if (Character.isSpaceChar(c)) return SPACE;
if (c == '.') return DOT;
if(c == '-') return MINUS;
return OTHER;
}
public static String getFrequencyOfEachConsecutiveChar(String s) {
StringBuilder retval = new StringBuilder();
for (int i = 0; i < s.length(); i++) {
int count = 1;
while (i + 1 < s.length() && s.charAt(i) == s.charAt(i + 1)) {
i++;
count++;
}
retval.append(s.charAt(i));
retval.append(count);
}
return retval.toString();
}
private static void detectDisguisedValues(String dom_pattern, Object col, int col_idx,
FrameBlock frameBlock, LEVEL_ENUM level, String disguisedVal)
{
int row_idx = -1;
String pattern = "";
String[] column = (String[]) col;
for (String attr : column) {
switch (level){
case LEVEL1:
pattern = encodeRawString(attr);
break;
case LEVEL2:
pattern = encodeRawString(attr);
pattern = removeNumbers(pattern);
break;
case LEVEL3:
pattern = encodeRawString(attr);
pattern = removeNumbers(pattern);
pattern = removeUpperLowerCase(pattern);
break;
case LEVEL4:
pattern = encodeRawString(attr);
pattern = removeNumbers(pattern);
pattern = removeUpperLowerCase(pattern);
pattern = removeInnerCharacterInPattern(pattern, DIGIT, DOT);
break;
case LEVEL5:
pattern = encodeRawString(attr);
pattern = removeNumbers(pattern);
pattern = removeUpperLowerCase(pattern);
pattern = removeInnerCharacterInPattern(pattern, DIGIT, DOT);
pattern = removeInnerCharacterInPattern(pattern, ALPHA, SPACE);
break;
case LEVEL6:
pattern = encodeRawString(attr);
pattern = removeNumbers(pattern);
pattern = removeUpperLowerCase(pattern);
pattern = removeInnerCharacterInPattern(pattern, DIGIT, DOT);
pattern = removeInnerCharacterInPattern(pattern, ALPHA, SPACE);
pattern = acceptNegativeNumbersAsDigits(pattern);
break;
default:
throw new DMLRuntimeException("Could not find suitable level");
}
row_idx++;
if(pattern.equals(dom_pattern))
continue;
frameBlock.set(row_idx, col_idx, disguisedVal);
}
}
}