blob: fbdda6657e053616db2f9d58162b68364ad76703 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "redactor.h"
#include "redactor.detail.h"
#include <cerrno>
#include <cstring> // strcmp, strcasestr
#include <ostream>
#include <sstream>
#include <sys/stat.h>
#include <vector>
#include <gutil/strings/substitute.h>
#include <rapidjson/document.h>
#include <rapidjson/filereadstream.h>
#include <rapidjson/error/en.h>
#include <re2/re2.h>
#include <re2/stringpiece.h>
#include "common/logging.h"
#include "common/names.h"
namespace impala {
// Shorthand for the rapidjson types used throughout this file.
using rapidjson::Document;
using rapidjson::Value;
// Alias for the RE2 regular expression class.
typedef re2::RE2 Regex;
// The actual rules in effect, if any. Starts out NULL; SetRedactionRulesFromFile()
// allocates it on first use, and Redact() is a no-op while it is NULL or empty.
Rules* g_rules = NULL;
// Returns a human-readable name for the JSON type of 'value', intended for use in
// error messages. Every numeric value that is stored as an integer, including ones
// that do not fit in a 32-bit signed int, is reported as "Integer"; numbers stored as
// a double are reported as "Float". "Unknown" is only returned for a value whose type
// is not recognized, which also trips a DCHECK in debug builds.
string NameOfTypeOfJsonValue(const Value& value) {
  switch (value.GetType()) {
    case rapidjson::kNullType:
      return "Null";
    case rapidjson::kFalseType:
    case rapidjson::kTrueType:
      return "Bool";
    case rapidjson::kObjectType:
      return "Object";
    case rapidjson::kArrayType:
      return "Array";
    case rapidjson::kStringType:
      return "String";
    case rapidjson::kNumberType:
      // IsInt() alone is only true for values that fit in a 32-bit signed int, so the
      // wider integer representations must be checked too. Otherwise a number such as
      // 3000000000 would be treated as an unknown type.
      if (value.IsInt() || value.IsUint() || value.IsInt64() || value.IsUint64()) {
        return "Integer";
      }
      if (value.IsDouble()) return "Float";
      break;
    default:
      break;
  }
  // Only reachable for a type this function does not know about.
  DCHECK(false);
  return "Unknown";
}
// This class will parse a version 1 rule file and populate g_rules.
class RulesParser {
public:
// Perform the parsing and populate g_rules. Any errors encountered will be written
// to error_message.
string Parse(const Document& rules_doc) {
error_message_.str("");
bool found_rules = false;
for (Value::ConstMemberIterator member = rules_doc.MemberBegin();
member != rules_doc.MemberEnd(); ++member) {
if (strcmp("rules", member->name.GetString()) == 0) {
found_rules = true;
ParseRules(member->value);
} else if (strcmp("version", member->name.GetString()) == 0) {
// Ignore
} else {
AddDocParseError()
<< "unexpected property '" << member->name.GetString() << "' must be removed";
}
}
if (!found_rules) {
AddDocParseError() << "an array of rules is required";
}
return error_message_.str();
}
private:
ostringstream error_message_;
// The current index of the rule being parsed. 'SizeType' avoids an ambiguity between
// json_value[int] (array) and json_value[const char*] (object), otherwise 0 could be
// a null pointer.
rapidjson::SizeType rule_idx_;
// Parse an array of rules.
void ParseRules(const Value& rules) {
if (!rules.IsArray()) {
AddDocParseError() << "'rules' must be of type Array but is a "
<< NameOfTypeOfJsonValue(rules);
return;
}
for (rule_idx_ = 0; rule_idx_ < rules.Size(); ++rule_idx_) {
const Value& rule = rules[rule_idx_];
if (!rule.IsObject()) {
AddRuleParseError() << "rule should be a JSON Object but is a "
<< NameOfTypeOfJsonValue(rule);
continue;
}
ParseRule(rule);
}
}
// Parse a rule and populate g_rules.
void ParseRule(const Value& json_rule) {
bool found_replace = false;
bool case_sensitive = true;
string search_text, replace, trigger;
for (Value::ConstMemberIterator member = json_rule.MemberBegin();
member != json_rule.MemberEnd(); ++member) {
if (strcmp("search", member->name.GetString()) == 0) {
if (!ReadRuleProperty("search", json_rule, &search_text)) return;
if (search_text.empty()) {
AddRuleParseError() << "search property must be a non-empty regex";
return;
}
} else if (strcmp("replace", member->name.GetString()) == 0) {
found_replace = true;
if (!ReadRuleProperty("replace", json_rule, &replace)) return;
} else if (strcmp("trigger", member->name.GetString()) == 0) {
if (!ReadRuleProperty("trigger", json_rule, &trigger, /*required*/ false)) return;
} else if (strcmp("caseSensitive", member->name.GetString()) == 0) {
if (!ReadRuleProperty("caseSensitive", json_rule, &case_sensitive, false)) return;
} else if (strcmp("description", member->name.GetString()) == 0) {
// Ignore, this property is for user documentation.
} else {
// Future properties may change the meaning of current properties so ignoring
// unknown properties is not safe.
AddRuleParseError() << "unexpected property '" << member->name.GetString()
<< "' must be removed";
return;
}
}
if (search_text.empty()) { // Only empty if not found
AddRuleParseError() << "a 'search' property is required";
return;
} else if (!found_replace) {
AddRuleParseError() << "a 'replace' property is required";
return;
}
const Rule& rule = Rule::Create(trigger, search_text, replace, case_sensitive);
if (!rule.search_pattern.ok()) {
AddRuleParseError() << "search regex is invalid; " << rule.search_pattern.error();
return;
}
(*g_rules).push_back(rule);
}
// Reads a rule property of the given name and assigns the property value to the out
// parameter. A true return value indicates success.
template<typename T>
bool ReadRuleProperty(const string& name, const Value& rule, T* value,
bool required = true) {
const Value& json_value = rule[name.c_str()];
if (json_value.IsNull()) {
if (required) {
AddRuleParseError() << name << " property is required and cannot be null";
return false;
}
return true;
}
return ValidateTypeAndExtractValue(name, json_value, value);
}
// Extract a value stored in a rapidjson::Value and assign it to the out parameter.
// The type will be validated before extraction. A true return value indicates success.
// The name parameter is only used to generate an error message upon failure.
#define EXTRACT_VALUE(json_type, cpp_type) \
bool ValidateTypeAndExtractValue(const string& name, const Value& json_value, \
cpp_type* value) { \
if (!json_value.Is ## json_type()) { \
AddRuleParseError() << name << " property must be of type " #json_type \
<< " but is a " << NameOfTypeOfJsonValue(json_value); \
return false; \
} \
*value = json_value.Get ## json_type(); \
return true; \
}
EXTRACT_VALUE(String, string)
EXTRACT_VALUE(Bool, bool)
ostream& AddDocParseError() {
if (error_message_.tellp()) error_message_ << endl;
error_message_ << "Error parsing redaction rules; ";
return error_message_;
}
ostream& AddRuleParseError() {
if (error_message_.tellp()) error_message_ << endl;
error_message_ << "Error parsing redaction rule #" << (rule_idx_ + 1) << "; ";
return error_message_;
}
};
// Loads the redaction rules from the JSON file at 'rules_file_path' into g_rules.
// Any rules previously in effect are discarded first, even if loading then fails.
// Returns an empty string on success, otherwise a description of what went wrong.
// A zero-length file is not an error and simply results in no rules.
string SetRedactionRulesFromFile(const string& rules_file_path) {
  if (g_rules == NULL) g_rules = new Rules();
  g_rules->clear();

  // Read the file.
  FILE* rules_file = fopen(rules_file_path.c_str(), "r");
  if (rules_file == NULL) {
    // Capture errno right away so nothing that runs while the message is being built
    // can overwrite it.
    const int open_errno = errno;
    return Substitute("Could not open redaction rules file '$0'; $1",
        rules_file_path, strerror(open_errno));
  }

  // Check for an empty file and ignore it. This is done to play nice with automated
  // cluster configuration tools that will generate empty files when no rules are in
  // effect. Without this the JSON parser would produce an error.
  struct stat rules_file_stats;
  if (fstat(fileno(rules_file), &rules_file_stats)) {
    // fclose() is allowed to modify errno, so remember why fstat() failed before
    // closing the file. Otherwise the wrong reason could be reported.
    const int fstat_errno = errno;
    fclose(rules_file);
    return Substitute("Error reading redaction rules file; $0", strerror(fstat_errno));
  }
  if (rules_file_stats.st_size == 0) {
    fclose(rules_file);
    return "";
  }

  // Parse the whole file as JSON. The file can be closed as soon as parsing is done
  // because the parsed document does not read from it afterwards.
  char read_buffer[65536];
  rapidjson::FileReadStream stream(rules_file, read_buffer, sizeof(read_buffer));
  Document rules_doc;
  rules_doc.ParseStream(stream);
  fclose(rules_file);
  if (rules_doc.HasParseError()) {
    return Substitute("Error parsing redaction rules; $0",
        GetParseError_En(rules_doc.GetParseError()));
  }
  if (!rules_doc.IsObject()) {
    return "Error parsing redaction rules; root element must be a JSON Object.";
  }

  // Only version 1 documents are understood; validate the version before handing the
  // document to the version 1 parser.
  if (!rules_doc.HasMember("version")) {
    return "Error parsing redaction rules; a document version is required.";
  }
  const Value& version = rules_doc["version"];
  if (version.IsNull()) {
    return "Error parsing redaction rules; a document version is required.";
  }
  if (!version.IsInt()) {
    return Substitute("Error parsing redaction rules; version must be an Integer but "
        "is a $0", NameOfTypeOfJsonValue(version));
  }
  if (version.GetInt() != 1) {
    return "Error parsing redaction rules; only version 1 is supported.";
  }

  RulesParser rules_parser;
  return rules_parser.Parse(rules_doc);
}
// Applies every rule in g_rules to '*value' in place. A rule is only run when its
// trigger text occurs in the value; the comparison ignores case for rules that are not
// case sensitive. If 'changed' is non-NULL and not already true, it is set to whether
// the rule that just ran made at least one replacement. Does nothing when no rules are
// loaded.
void Redact(string* value, bool* changed) {
  DCHECK(value != NULL);
  if (g_rules == NULL || g_rules->empty()) return;
  for (Rules::const_iterator it = g_rules->begin(); it != g_rules->end(); ++it) {
    // Cheap substring test first, so the regex only runs on values containing the
    // trigger.
    bool triggered;
    if (it->case_sensitive()) {
      triggered = value->find(it->trigger) != string::npos;
    } else {
      triggered = strcasestr(value->c_str(), it->trigger.c_str()) != NULL;
    }
    if (!triggered) continue;

    const int num_replaced =
        re2::RE2::GlobalReplace(value, it->search_pattern, it->replacement);
    // Once '*changed' is true it stays true, so later rules cannot reset it.
    if (changed == NULL || *changed) continue;
    *changed = num_replaced != 0;
  }
}
}