blob: 8c0703bedb28452b33efa63410ef1cd4cec57df4 [file] [log] [blame]
/*
* Copyright 2013 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.addons.modelbuilder.impls;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import opennlp.addons.modelbuilder.ModelGenerationValidator;
/**
 * Validates NER results before they are included in the model.
 *
 * <p>Every sentence is accepted. A named entity is rejected when it appears,
 * ignoring case, in the blacklist file returned by
 * {@code BaseModelBuilderParams.getKnownEntityBlacklist()}. The file is read as
 * UTF-8 with one entry per line, lazily on first use.
 */
public class FileModelValidatorImpl implements ModelGenerationValidator {

  private static final Logger LOG = Logger.getLogger(FileModelValidatorImpl.class.getName());

  /** Blacklisted entities, stored trimmed and lower-cased so lookups are case-insensitive. */
  private final Set<String> badentities = new HashSet<>();

  /**
   * Set once a read of the blacklist file has been attempted for the current parameters,
   * so an empty or unreadable file is not re-read on every validation call.
   */
  private boolean blacklistLoadAttempted;

  BaseModelBuilderParams params;

  /**
   * Stores the parameters that name the blacklist file.
   *
   * @param params the model builder parameters to use for subsequent calls
   */
  @Override
  public void setParameters(BaseModelBuilderParams params) {
    this.params = params;
    // Allow another load attempt with the new parameters if nothing has been loaded yet.
    this.blacklistLoadAttempted = false;
  }

  /**
   * Accepts every sentence.
   *
   * @param sentence the sentence to validate (not inspected)
   * @return always {@code true}
   */
  @Override
  public Boolean validSentence(String sentence) {
    // Returning true by default, because the sentence provider will return
    // only "valid" sentences in this case.
    return true;
  }

  /**
   * Checks a named entity against the blacklist.
   *
   * @param namedEntity the entity text to validate
   * @return {@code false} if the lower-cased entity is in the blacklist, {@code true} otherwise
   */
  @Override
  public Boolean validNamedEntity(String namedEntity) {
    // Makes sure the blacklist has been loaded; a no-op after the first attempt.
    getBlackList();
    // Locale.ROOT keeps the case folding independent of the JVM's default locale.
    return !badentities.contains(namedEntity.toLowerCase(Locale.ROOT));
  }

  /**
   * Returns the blacklist, loading it from the configured file on first use.
   *
   * <p>Each line of the file is trimmed and lower-cased before being stored; blank lines
   * are skipped. If no parameters or no blacklist file have been set, or the file cannot
   * be read, the entries loaded so far (possibly none) are returned. A read failure is
   * logged rather than thrown.
   *
   * @return the live internal set of blacklisted entities; never {@code null}
   */
  @Override
  public Collection<String> getBlackList() {
    if (params == null || params.getKnownEntityBlacklist() == null) {
      return badentities;
    }
    // Load only while nothing has been loaded and no attempt has been made yet.
    // (The original guard was inverted, so the file was never read.)
    if (blacklistLoadAttempted || !badentities.isEmpty()) {
      return badentities;
    }
    blacklistLoadAttempted = true;
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
        new FileInputStream(params.getKnownEntityBlacklist()), StandardCharsets.UTF_8))) {
      String line;
      while ((line = br.readLine()) != null) {
        // Normalize the same way validNamedEntity normalizes its argument.
        String entry = line.trim().toLowerCase(Locale.ROOT);
        if (!entry.isEmpty()) {
          badentities.add(entry);
        }
      }
    } catch (IOException ex) {
      LOG.log(Level.SEVERE,
          "Unable to read known entity blacklist: " + params.getKnownEntityBlacklist(), ex);
    }
    return badentities;
  }
}