blob: 238860d6cbc402c5ded4a368510480303289e5fe [file] [log] [blame]
/**
* @file ExtractText.cpp
* ExtractText class implementation
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ExtractText.h"
#include <algorithm>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include "core/FlowFile.h"
#include "core/ProcessContext.h"
#include "core/ProcessSession.h"
#include "core/Resource.h"
#include "utils/ConfigurationUtils.h"
#include "utils/RegexUtils.h"
#include "utils/gsl.h"
#include "utils/ProcessorConfigUtils.h"
namespace org::apache::nifi::minifi::processors {
void ExtractText::initialize() {
setSupportedProperties(Properties);
setSupportedRelationships(Relationships);
}
void ExtractText::onTrigger(core::ProcessContext& context, core::ProcessSession& session) {
std::shared_ptr<core::FlowFile> flowFile = session.get();
if (!flowFile) {
return;
}
session.read(flowFile, ReadCallback{flowFile, context, logger_});
session.transfer(flowFile, Success);
}
int64_t ExtractText::ReadCallback::operator()(const std::shared_ptr<io::InputStream>& stream) const {
const auto flow_file_size = gsl::narrow<size_t>(flowFile_->getSize());
const auto default_buffer_size = utils::configuration::getBufferSize(*ctx_->getConfiguration());
std::vector<std::byte> buffer((std::min)(flow_file_size, default_buffer_size));
std::string attrKey = utils::parseOptionalProperty(*ctx_, Attribute).value_or("");
bool regex_mode = utils::parseBoolProperty(*ctx_, RegexMode);
auto size_limit = gsl::narrow<size_t>(utils::parseU64Property(*ctx_, SizeLimit));
if (size_limit == 0) {
size_limit = flow_file_size;
}
std::ostringstream contentStream;
size_t read_size = 0;
while (read_size < size_limit) {
// Don't read more than config limit or the size of the buffer
const auto length = (std::min)(size_limit - read_size, buffer.size());
const auto ret = stream->read(std::span(buffer).subspan(0, length));
if (io::isError(ret)) {
return -1; // Stream error
} else if (ret == 0) {
break; // End of stream, no more data
}
contentStream.write(reinterpret_cast<const char*>(buffer.data()), gsl::narrow<std::streamsize>(ret));
read_size += ret;
if (contentStream.fail()) {
return -1;
}
}
if (regex_mode) {
const bool insensitive = utils::parseBoolProperty(*ctx_, InsensitiveMatch);
const bool include_capture_group_zero = utils::parseBoolProperty(*ctx_, IncludeCaptureGroupZero);
const bool repeating_capture = utils::parseBoolProperty(*ctx_, EnableRepeatingCaptureGroup);
const auto max_capture_size = gsl::narrow<size_t>(utils::parseU64Property(*ctx_, MaxCaptureGroupLen));
std::vector<utils::Regex::Mode> regex_flags;
if (insensitive) {
regex_flags.push_back(utils::Regex::Mode::ICASE);
}
std::string contentStr = contentStream.str();
std::map<std::string, std::string> regexAttributes;
for (const auto& [dynamic_property_key, dynamic_property_value] : ctx_->getDynamicProperties()) {
std::string workStr = contentStr;
int matchcount = 0;
try {
utils::Regex rgx(dynamic_property_value, regex_flags);
utils::SMatch matches;
while (utils::regexSearch(workStr, matches, rgx)) {
for (std::size_t i = (include_capture_group_zero ? 0 : 1); i < matches.size(); ++i, ++matchcount) {
std::string attributeValue = matches[i];
if (attributeValue.length() > max_capture_size) {
attributeValue = attributeValue.substr(0, max_capture_size);
}
if (matchcount == 0) {
regexAttributes[dynamic_property_key] = attributeValue;
}
regexAttributes[dynamic_property_key + '.' + std::to_string(matchcount)] = attributeValue;
}
if (!repeating_capture) {
break;
}
workStr = matches.suffix();
}
} catch (const Exception &e) {
logger_->log_error("{} error encountered when trying to construct regular expression from property (key: {}) value: {}",
e.what(), dynamic_property_key, dynamic_property_value);
continue;
}
}
for (const auto& kv : regexAttributes) {
flowFile_->setAttribute(kv.first, kv.second);
}
} else {
flowFile_->setAttribute(attrKey, contentStream.str());
}
return gsl::narrow<int64_t>(read_size);
}
ExtractText::ReadCallback::ReadCallback(std::shared_ptr<core::FlowFile> flowFile, core::ProcessContext& ctx, std::shared_ptr<core::logging::Logger> lgr)
: flowFile_(std::move(flowFile)),
ctx_(gsl::make_not_null(&ctx)),
logger_(std::move(lgr)) {
}
REGISTER_RESOURCE(ExtractText, Processor);
} // namespace org::apache::nifi::minifi::processors