tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.tika.parser.ner.regex;

 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.commons.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import org.apache.tika.parser.ner.NERecogniser;

 /**
  * This class offers an implementation of {@link NERecogniser} based on
  * Regular Expressions.
  * <p>
  * The default configuration file {@value NER_REGEX_FILE} is used when the
  * no-argument constructor is used to instantiate this class. The regex file is
  * loaded via {@link Class#getResourceAsStream(String)}, so the file should be
  * placed in the same package path as this class.
  * </p>
  * The format of the regex configuration is as follows:
  * <pre>
  * ENTITY_TYPE1=REGEX1
  * ENTITY_TYPE2=REGEX2
  * </pre>
  *
  * <i>For example, to extract week day from text:</i>
  * <pre>WEEK_DAY=(?i)((sun)|(mon)|(tues)|(wednes)|(thurs)|(fri)|((sat)(ur)?))(day)?
  * </pre>
  *
  * @since Nov. 7, 2015
  */
 public class RegexNERecogniser implements NERecogniser {

     /** Name of the default configuration resource, resolved relative to this class. */
     public static final String NER_REGEX_FILE = "ner-regex.txt";
     private static final Logger LOG = LoggerFactory.getLogger(RegexNERecogniser.class);
     /** Lazily created shared instance, see {@link #getInstance()}. */
     private static RegexNERecogniser INSTANCE;
     /** Entity type names for which a pattern was loaded. */
     public Set<String> entityTypes = new HashSet<>();
     /** Compiled pattern for each entity type name. */
     public Map<String, Pattern> patterns = new HashMap<>();
     /** True once at least one entity type has been loaded. */
     private boolean available = false;

     /**
      * Creates a recogniser from the default configuration resource
      * {@value #NER_REGEX_FILE}.
      */
     public RegexNERecogniser() {
         this(RegexNERecogniser.class.getResourceAsStream(NER_REGEX_FILE));
     }

     /**
      * Creates a recogniser from the given configuration stream. The stream is
      * read as UTF-8, one {@code ENTITY_TYPE=REGEX} entry per line, and is always
      * closed before this constructor returns. Blank lines and lines starting
      * with {@code #} are ignored; malformed lines and invalid regular
      * expressions are logged and skipped. This constructor never throws: if
      * nothing could be loaded, {@link #isAvailable()} returns {@code false}.
      *
      * @param stream configuration stream; may be {@code null}, in which case no
      *               patterns are loaded
      */
     public RegexNERecogniser(InputStream stream) {
         if (stream == null) {
             LOG.error("Regex NER configuration stream is null; no patterns loaded");
             return; // 'available' keeps its default value of false
         }
         // try-with-resources closes the stream even when reading fails
         try (InputStream in = stream) {
             List<String> lines = IOUtils.readLines(in, StandardCharsets.UTF_8);
             for (String line : lines) {
                 addPattern(line.trim());
             }
         } catch (Exception e) {
             // deliberate catch-all: a broken configuration must not fail construction
             LOG.error(e.getMessage(), e);
         }
         available = !entityTypes.isEmpty();
     }

     /**
      * Parses one trimmed configuration line and registers its pattern.
      *
      * @param line trimmed line from the configuration
      */
     private void addPattern(String line) {
         if (line.isEmpty() || line.startsWith("#")) { //empty or comment
             return;
         }

         int delim = line.indexOf('=');
         if (delim < 0) { //delim not found
             LOG.error("Skipped : Invalid config : {} ", line);
             return;
         }
         String type = line.substring(0, delim).trim();
         String patternStr = line.substring(delim + 1).trim();
         try {
             patterns.put(type, Pattern.compile(patternStr));
             entityTypes.add(type);
         } catch (java.util.regex.PatternSyntaxException e) {
             // skip only the offending entry so the remaining lines still load
             LOG.error("Skipped : Invalid regex for {} : {} ", type, patternStr, e);
         }
     }

     /**
      * Returns the shared instance built from the default configuration,
      * creating it on first use. The method is synchronized, so the instance
      * is created at most once.
      *
      * @return the shared recogniser
      */
     public static synchronized RegexNERecogniser getInstance() {
         if (INSTANCE == null) {
             INSTANCE = new RegexNERecogniser();
         }
         return INSTANCE;
     }

     /**
      * @return {@code true} if at least one entity type was loaded
      */
     @Override
     public boolean isAvailable() {
         return available;
     }

     /**
      * @return the entity type names loaded from the configuration
      */
     @Override
     public Set<String> getEntityTypes() {
         return entityTypes;
     }

     /**
      * Finds every substring of the text that matches the pattern.
      *
      * @param text    text containing interesting sub strings
      * @param pattern pattern to find sub strings
      * @return set of the distinct matched sub strings, or {@code null} if
      *         none found
      */
     public Set<String> findMatches(String text, Pattern pattern) {
         Set<String> results = null;
         Matcher matcher = pattern.matcher(text);
         while (matcher.find()) {
             if (results == null) { // allocate only once a match exists
                 results = new HashSet<>();
             }
             results.add(matcher.group());
         }
         return results;
     }

     /**
      * Applies every configured pattern to the text.
      *
      * @param text text to search
      * @return map from entity type to its matched sub strings; entity types
      *         without a match are omitted
      */
     @Override
     public Map<String, Set<String>> recognise(String text) {
         Map<String, Set<String>> result = new HashMap<>();
         for (Map.Entry<String, Pattern> entry : patterns.entrySet()) {
             Set<String> names = findMatches(text, entry.getValue());
             if (names != null) {
                 result.put(entry.getKey(), names);
             }
         }
         return result;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.tika.parser.ner.regex;

	import java.io.InputStream;
	import java.nio.charset.StandardCharsets;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Map;
	import java.util.Set;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import org.apache.commons.io.IOUtils;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import org.apache.tika.parser.ner.NERecogniser;

	/**
	* This class offers an implementation of {@link NERecogniser} based on
	* Regular Expressions.
	* <p>
	* The default configuration file {@value NER_REGEX_FILE} is used when the
	* no-argument constructor is used to instantiate this class. The regex file is
	* loaded via {@link Class#getResourceAsStream(String)}, so the file should be
	* placed in the same package path as this class.
	* </p>
	* The format of the regex configuration is as follows:
	* <pre>
	* ENTITY_TYPE1=REGEX1
	* ENTITY_TYPE2=REGEX2
	* </pre>
	*
	* <i>For example, to extract week day from text:</i>
	* <pre>WEEK_DAY=(?i)((sun)|(mon)|(tues)|(wednes)|(thurs)|(fri)|((sat)(ur)?))(day)?
	* </pre>
	*
	* @since Nov. 7, 2015
	*/
	public class RegexNERecogniser implements NERecogniser {

	public static final String NER_REGEX_FILE = "ner-regex.txt";
	private static Logger LOG = LoggerFactory.getLogger(RegexNERecogniser.class);
	private static RegexNERecogniser INSTANCE;
	public Set<String> entityTypes = new HashSet<>();
	public Map<String, Pattern> patterns;
	private boolean available = false;

	public RegexNERecogniser() {
	this(RegexNERecogniser.class.getResourceAsStream(NER_REGEX_FILE));
	}

	public RegexNERecogniser(InputStream stream) {
	try {
	patterns = new HashMap<>();
	List<String> lines = IOUtils.readLines(stream, StandardCharsets.UTF_8);
	IOUtils.closeQuietly(stream);
	for (String line : lines) {
	line = line.trim();
	if (line.isEmpty() \|\| line.startsWith("#")) { //empty or comment
	continue; //skip
	}

	int delim = line.indexOf('=');
	if (delim < 0) { //delim not found
	//skip
	LOG.error("Skipped : Invalid config : {} ", line);
	continue;
	}
	String type = line.substring(0, delim).trim();
	String patternStr = line.substring(delim + 1).trim();
	patterns.put(type, Pattern.compile(patternStr));
	entityTypes.add(type);
	}
	} catch (Exception e) {
	LOG.error(e.getMessage(), e);
	}
	available = !entityTypes.isEmpty();
	}

	public synchronized static RegexNERecogniser getInstance() {
	if (INSTANCE == null) {
	INSTANCE = new RegexNERecogniser();
	}
	return INSTANCE;
	}

	@Override
	public boolean isAvailable() {
	return available;
	}

	@Override
	public Set<String> getEntityTypes() {
	return entityTypes;
	}

	/**
	* finds matching sub groups in text
	*
	* @param text text containing interesting sub strings
	* @param pattern pattern to find sub strings
	* @return set of sub strings if any found, or null if none found
	*/
	public Set<String> findMatches(String text, Pattern pattern) {
	Set<String> results = null;
	Matcher matcher = pattern.matcher(text);
	if (matcher.find()) {
	results = new HashSet<>();
	results.add(matcher.group(0));
	while (matcher.find()) {
	results.add(matcher.group(0));
	}
	}
	return results;
	}

	@Override
	public Map<String, Set<String>> recognise(String text) {
	Map<String, Set<String>> result = new HashMap<>();
	for (Map.Entry<String, Pattern> entry : patterns.entrySet()) {
	Set<String> names = findMatches(text, entry.getValue());
	if (names != null) {
	result.put(entry.getKey(), names);
	}
	}
	return result;
	}
	}