blob: 782ea5370255ce25e6b0c59d8cce3ad942293cc3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.udfs;
import io.netty.buffer.DrillBuf;
import org.apache.drill.exec.expr.DrillSimpleFunc;
import org.apache.drill.exec.expr.annotations.FunctionTemplate;
import org.apache.drill.exec.expr.annotations.Output;
import org.apache.drill.exec.expr.annotations.Param;
import org.apache.drill.exec.expr.holders.Float8Holder;
import org.apache.drill.exec.expr.holders.VarCharHolder;
import javax.inject.Inject;
public class ThreatHuntingFunctions {
/**
 * Punctuation pattern is useful for comparing log entries. It removes every ASCII letter and digit
 * (the characters matched by {@code [a-zA-Z0-9]}) from the input and returns what is left, with each
 * space character replaced by an underscore. Characters outside that ASCII range are kept as-is.
 * <p>
 * Usage: SELECT punctuation_pattern( string ) FROM...
 * <p>
 * The function is declared with {@code NULL_IF_NULL} null handling. The result is written to the
 * injected buffer as UTF-8.
 */
@FunctionTemplate(names = {"punctuation_pattern", "punctuationPattern"},
  scope = FunctionTemplate.FunctionScope.SIMPLE,
  nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
public static class PunctuationPatternFunction implements DrillSimpleFunc {

  /** The text to extract the punctuation pattern from. */
  @Param
  VarCharHolder rawInput;

  /** Receives the punctuation pattern; start/end delimit the UTF-8 bytes inside {@code out.buffer}. */
  @Output
  VarCharHolder out;

  /** Working buffer that backs the output value. */
  @Inject
  DrillBuf buffer;

  @Override
  public void setup() {
  }

  @Override
  public void eval() {
    String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);

    // Drop ASCII alphanumerics, then make the remaining spaces visible as underscores.
    String punctuationPattern = input.replaceAll("[a-zA-Z0-9]", "");
    punctuationPattern = punctuationPattern.replaceAll(" ", "_");

    // Encode exactly once, always as UTF-8, so that the bytes written and the length reported
    // in out.end can never disagree (the platform default charset is not guaranteed to be UTF-8).
    byte[] outputBytes = punctuationPattern.getBytes(java.nio.charset.StandardCharsets.UTF_8);

    // Grow the working buffer when the result does not fit. reallocIfNeeded may hand back a
    // different buffer, so the field is reassigned before the buffer is published through out.
    buffer = buffer.reallocIfNeeded(outputBytes.length);
    buffer.setBytes(0, outputBytes);

    out.buffer = buffer;
    out.start = 0;
    out.end = outputBytes.length;
  }
}
/**
 * This function calculates the Shannon Entropy of a given string of text, in bits (base-2 logarithm).
 * See: https://en.wikipedia.org/wiki/Entropy_(information_theory) for full definition.
 * <p>
 * The symbols being counted are the Java {@code char} values of the decoded string. An empty string
 * yields 0.0.
 * <p>
 * Usage:
 * SELECT entropy(<varchar>) AS entropy FROM...
 *
 * Returns a double
 */
@FunctionTemplate(name = "entropy",
  scope = FunctionTemplate.FunctionScope.SIMPLE,
  nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
public static class StringEntropyFunction implements DrillSimpleFunc {

  /** The text whose entropy is computed. */
  @Param
  VarCharHolder rawInput1;

  /** Receives the entropy value. */
  @Output
  Float8Holder out;

  @Override
  public void setup() {}

  @Override
  public void eval() {
    String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
    int length = input.length();

    // Tally every distinct character in a single pass over the input instead of rescanning the
    // whole string once per distinct character.
    // NOTE(review): the raw constructor and the explicit cast mirror the style of the original
    // code; presumably they keep the body acceptable to Drill's runtime UDF compilation - confirm
    // before switching to the diamond operator.
    java.util.Map<Character, Integer> occurrences = new java.util.HashMap();
    for (char ch : input.toCharArray()) {
      Integer seen = (Integer) occurrences.get(ch);
      occurrences.put(ch, seen == null ? 1 : seen + 1);
    }

    // Sum p * log2(p) over all distinct characters. Every term is <= 0, so the absolute value
    // of the sum is the entropy. For an empty input the loop does not run and the result is 0.0.
    double logOfTwo = java.lang.Math.log(2.0);
    double entropy = 0.0;
    for (Integer count : occurrences.values()) {
      double probability = count.doubleValue() / length;
      entropy += (probability * java.lang.Math.log(probability) / logOfTwo);
    }

    out.value = Math.abs(entropy);
  }
}
/**
 * This function calculates the Shannon Entropy of a given string of text, normed for the string length.
 * See: https://en.wikipedia.org/wiki/Entropy_(information_theory) for full definition.
 * <p>
 * The entropy is computed in bits (base-2 logarithm) over the Java {@code char} values of the decoded
 * string and then divided by the string's length. Note that the divisor is the Java {@code String}
 * length (UTF-16 code units), which equals the byte length only for ASCII input. An empty string
 * yields 0.0.
 * <p>
 * Usage:
 * SELECT entropy_per_byte(<varchar>) AS entropy FROM...
 *
 * Returns a double
 */
@FunctionTemplate(names = {"entropy_per_byte", "entropyPerByte"},
  scope = FunctionTemplate.FunctionScope.SIMPLE,
  nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
public static class NormedStringEntropyFunction implements DrillSimpleFunc {

  /** The text whose normed entropy is computed. */
  @Param
  VarCharHolder rawInput;

  /** Receives the entropy divided by the input length. */
  @Output
  Float8Holder out;

  @Override
  public void setup() {}

  @Override
  public void eval() {
    String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(rawInput);
    int length = input.length();

    // Tally every distinct character in a single pass over the input instead of rescanning the
    // whole string once per distinct character.
    // NOTE(review): the raw constructor and the explicit cast mirror the style of the original
    // code; presumably they keep the body acceptable to Drill's runtime UDF compilation - confirm
    // before switching to the diamond operator.
    java.util.Map<Character, Integer> occurrences = new java.util.HashMap();
    for (char ch : input.toCharArray()) {
      Integer seen = (Integer) occurrences.get(ch);
      occurrences.put(ch, seen == null ? 1 : seen + 1);
    }

    // Sum p * log2(p) over all distinct characters; every term is <= 0, so the absolute value
    // of the sum is the entropy.
    double logOfTwo = java.lang.Math.log(2.0);
    double entropy = 0.0;
    for (Integer count : occurrences.values()) {
      double probability = count.doubleValue() / length;
      entropy += (probability * java.lang.Math.log(probability) / logOfTwo);
    }

    // Guard the normalisation: an empty input has no length to divide by.
    if (length == 0) {
      out.value = 0.0;
    } else {
      out.value = (Math.abs(entropy) / length);
    }
  }
}
}