opennlp-tools/src/main/java/opennlp/tools/namefind/NameSample.java - opennlp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.namefind;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.Span;

 /**
  * Holds the tokens of a single unit of text together with the name spans
  * annotated on those tokens.
  *
  * <p>The tokens and names handed to the constructor are copied, and
  * {@link #getSentence()} and {@link #getNames()} return a fresh array on every
  * call. {@link #getAdditionalContext()} returns the internally held array.
  */
 public class NameSample {

   /** The default type value used when there is no type in the training data. */
   public static final String DEFAULT_TYPE = "default";

   /** Matches {@code <START>} and {@code <START:type>}; group 2 is the type, if present. */
   private static final Pattern START_TAG_PATTERN = Pattern.compile("<START(:([^:>\\s]*))?>");

   private final String id;
   private final List<String> sentence;
   private final List<Span> names;
   private final String[][] additionalContext;
   private final boolean isClearAdaptiveData;

   /**
    * Initializes the current instance.
    *
    * @param id identifier of the sample, may be {@code null}
    * @param sentence the tokens of the sample, must not be {@code null}
    * @param names the name spans over {@code sentence}; {@code null} is treated
    *     as "no names"
    * @param additionalContext optional additional context, may be {@code null};
    *     when present every row is copied
    * @param clearAdaptiveData if true the adaptive data of the
    *     feature generators is cleared
    *
    * @throws IllegalArgumentException if {@code sentence} is {@code null}
    */
   public NameSample(String id, String[] sentence, Span[] names,
       String[][] additionalContext, boolean clearAdaptiveData) {

     if (sentence == null) {
       throw new IllegalArgumentException("sentence must not be null!");
     }

     if (names == null) {
       names = new Span[0];
     }

     this.id = id;
     this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays.asList(sentence)));
     this.names = Collections.unmodifiableList(new ArrayList<Span>(Arrays.asList(names)));

     if (additionalContext != null) {
       // Copy row by row so later changes to the caller's arrays are not visible here.
       String[][] contextCopy = new String[additionalContext.length][];

       for (int i = 0; i < additionalContext.length; i++) {
         contextCopy[i] = additionalContext[i].clone();
       }

       this.additionalContext = contextCopy;
     }
     else {
       this.additionalContext = null;
     }

     this.isClearAdaptiveData = clearAdaptiveData;

     // TODO: Check that name spans are not overlapping, otherwise throw exception
   }

   /**
    * Initializes the current instance without an id.
    *
    * @param sentence training sentence, must not be {@code null}
    * @param names the name spans over {@code sentence}; {@code null} is treated
    *     as "no names"
    * @param additionalContext optional additional context, may be {@code null}
    * @param clearAdaptiveData if true the adaptive data of the
    *     feature generators is cleared
    */
   public NameSample(String[] sentence, Span[] names,
       String[][] additionalContext, boolean clearAdaptiveData) {
     this(null, sentence, names, additionalContext, clearAdaptiveData);
   }

   /**
    * Initializes the current instance without an id and without additional context.
    *
    * @param sentence training sentence, must not be {@code null}
    * @param names the name spans over {@code sentence}; {@code null} is treated
    *     as "no names"
    * @param clearAdaptiveData if true the adaptive data of the
    *     feature generators is cleared
    */
   public NameSample(String[] sentence, Span[] names, boolean clearAdaptiveData) {
     this(sentence, names, null, clearAdaptiveData);
   }

   /**
    * @return the id passed to the constructor, or {@code null} if none was given
    */
   public String getId() {
     return id;
   }

   /**
    * @return a new array holding the tokens of this sample
    */
   public String[] getSentence() {
     return sentence.toArray(new String[sentence.size()]);
   }

   /**
    * @return a new array holding the name spans of this sample
    */
   public Span[] getNames() {
     return names.toArray(new Span[names.size()]);
   }

   /**
    * @return the additional context held by this sample (the internal array,
    *     not a copy), or {@code null} if none was provided
    */
   public String[][] getAdditionalContext() {
     return additionalContext;
   }

   /**
    * @return true if the adaptive data should be cleared for this sample
    */
   public boolean isClearAdaptiveDataSet() {
     return isClearAdaptiveData;
   }

   /**
    * Two samples are equal when their tokens, names, additional context and
    * clear-adaptive-data flag are equal. The id is not part of the comparison.
    */
   @Override
   public boolean equals(Object obj) {

     if (this == obj) {
       return true;
     }

     if (!(obj instanceof NameSample)) {
       return false;
     }

     NameSample other = (NameSample) obj;

     // additionalContext is a two-dimensional array: Arrays.equals would compare
     // the rows by reference, so the content has to be compared with deepEquals.
     return sentence.equals(other.sentence)
         && names.equals(other.names)
         && Arrays.deepEquals(additionalContext, other.additionalContext)
         && isClearAdaptiveData == other.isClearAdaptiveData;
   }

   /**
    * Hash code derived from the same components {@link #equals(Object)} compares.
    */
   @Override
   public int hashCode() {
     // NOTE(review): relies on Span.hashCode() being consistent with Span.equals() - confirm.
     int result = sentence.hashCode();
     result = 31 * result + names.hashCode();
     result = 31 * result + Arrays.deepHashCode(additionalContext);
     result = 31 * result + (isClearAdaptiveData ? 1 : 0);
     return result;
   }

   /**
    * Renders the sample as a line of whitespace separated tokens in which each
    * name is enclosed by a start tag and an end tag. A leading empty line is
    * emitted when the adaptive data is to be cleared.
    */
   @Override
   public String toString() {
     StringBuilder result = new StringBuilder();

     // If adaptive data must be cleared insert an empty line
     // before the sample sentence line
     if (isClearAdaptiveDataSet()) {
       result.append("\n");
     }

     for (int tokenIndex = 0; tokenIndex < sentence.size(); tokenIndex++) {

       // Emit the tags which belong in front of the current token.
       for (Span name : names) {
         if (name.getStart() == tokenIndex) {
           // A name without a type gets the plain start tag.
           if (name.getType() == null) {
             result.append(NameSampleDataStream.START_TAG).append(' ');
           }
           else {
             result.append(NameSampleDataStream.START_TAG_PREFIX).append(name.getType()).append("> ");
           }
         }

         if (name.getEnd() == tokenIndex) {
           result.append(NameSampleDataStream.END_TAG).append(' ');
         }
       }

       result.append(sentence.get(tokenIndex)).append(' ');
     }

     // Every token is followed by a separator; drop the one after the last token.
     // This also has to happen for a single token sentence, otherwise the result
     // carries a trailing (or doubled) space.
     if (!sentence.isEmpty()) {
       result.setLength(result.length() - 1);
     }

     // Names which end after the last token are closed behind it.
     for (Span name : names) {
       if (name.getEnd() == sentence.size()) {
         result.append(' ').append(NameSampleDataStream.END_TAG);
       }
     }

     return result.toString();
   }

   /**
    * Builds an error snippet which shows the token at {@code index} wrapped in
    * {@code ###} together with up to two tokens before and after it.
    */
   private static String errorTokenWithContext(String[] sentence, int index) {

     StringBuilder errorString = new StringBuilder();

     // two token before
     if (index > 1) {
       errorString.append(sentence[index - 2]).append(" ");
     }

     if (index > 0) {
       errorString.append(sentence[index - 1]).append(" ");
     }

     // token itself
     errorString.append("###");
     errorString.append(sentence[index]);
     errorString.append("###").append(" ");

     // two token after
     if (index + 1 < sentence.length) {
       errorString.append(sentence[index + 1]).append(" ");
     }

     if (index + 2 < sentence.length) {
       errorString.append(sentence[index + 2]);
     }

     return errorString.toString();
   }

   /**
    * Parses a tagged token string using {@link #DEFAULT_TYPE} as the default name type.
    *
    * @param taggedTokens whitespace separated tokens with name annotations
    * @param isClearAdaptiveData flag stored in the created sample
    *
    * @return the parsed sample
    *
    * @throws IOException if the annotations are malformed
    */
   public static NameSample parse(String taggedTokens,
       boolean isClearAdaptiveData) throws IOException {
     return parse(taggedTokens, DEFAULT_TYPE, isClearAdaptiveData);
   }

   /**
    * Parses a tagged token string into a sample.
    *
    * <p>The input is split with the {@link WhitespaceTokenizer}. A part matching
    * {@code <START>} or {@code <START:type>} opens a name, a part equal to
    * {@link NameSampleDataStream#END_TAG} closes it, every other part is a token.
    * A type given on a start tag stays in effect for following start tags
    * without a type. A name which is opened but never closed does not produce
    * a span.
    *
    * @param taggedTokens whitespace separated tokens with name annotations
    * @param defaultType type assigned to names until a start tag specifies one
    * @param isClearAdaptiveData flag stored in the created sample
    *
    * @return the parsed sample
    *
    * @throws IOException if a start tag occurs inside an open name, a start tag
    *     has an empty type, or an end tag occurs without an open name
    */
   public static NameSample parse(String taggedTokens, String defaultType,
       boolean isClearAdaptiveData)
     // TODO: Should throw another exception, and then convert it into an IOException in the stream
     throws IOException {
     String[] parts = WhitespaceTokenizer.INSTANCE.tokenize(taggedTokens);

     List<String> tokenList = new ArrayList<String>(parts.length);
     List<Span> nameList = new ArrayList<Span>();

     String nameType = defaultType;
     int startIndex = -1;
     int wordIndex = 0;

     // true while a start tag was seen and the matching end tag is still missing
     boolean catchingName = false;

     for (int pi = 0; pi < parts.length; pi++) {
       Matcher startMatcher = START_TAG_PATTERN.matcher(parts[pi]);
       if (startMatcher.matches()) {
         if (catchingName) {
           throw new IOException("Found unexpected annotation" +
               " while handling a name sequence: " + errorTokenWithContext(parts, pi));
         }
         catchingName = true;
         startIndex = wordIndex;
         String nameTypeFromSample = startMatcher.group(2);
         if (nameTypeFromSample != null) {
           if (nameTypeFromSample.length() == 0) {
             throw new IOException("Missing a name type: " + errorTokenWithContext(parts, pi));
           }
           nameType = nameTypeFromSample;
         }
       }
       else if (parts[pi].equals(NameSampleDataStream.END_TAG)) {
         if (!catchingName) {
           throw new IOException("Found unexpected annotation: " + errorTokenWithContext(parts, pi));
         }
         catchingName = false;
         // create name
         nameList.add(new Span(startIndex, wordIndex, nameType));
       }
       else {
         tokenList.add(parts[pi]);
         wordIndex++;
       }
     }

     String[] sentence = tokenList.toArray(new String[tokenList.size()]);
     Span[] names = nameList.toArray(new Span[nameList.size()]);

     return new NameSample(sentence, names, isClearAdaptiveData);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.namefind;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.List;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import opennlp.tools.tokenize.WhitespaceTokenizer;
	import opennlp.tools.util.Span;

	/**
	 * Holds the tokens of a single unit of text together with the name spans
	 * annotated on those tokens.
	 *
	 * <p>The tokens and names handed to the constructor are copied, and
	 * {@link #getSentence()} and {@link #getNames()} return a fresh array on every
	 * call. {@link #getAdditionalContext()} returns the internally held array.
	 */
	public class NameSample {

	  /** The default type value used when there is no type in the training data. */
	  public static final String DEFAULT_TYPE = "default";

	  /** Matches {@code <START>} and {@code <START:type>}; group 2 is the type, if present. */
	  private static final Pattern START_TAG_PATTERN = Pattern.compile("<START(:([^:>\\s]*))?>");

	  private final String id;
	  private final List<String> sentence;
	  private final List<Span> names;
	  private final String[][] additionalContext;
	  private final boolean isClearAdaptiveData;

	  /**
	   * Initializes the current instance.
	   *
	   * @param id identifier of the sample, may be {@code null}
	   * @param sentence the tokens of the sample, must not be {@code null}
	   * @param names the name spans over {@code sentence}; {@code null} is treated
	   *     as "no names"
	   * @param additionalContext optional additional context, may be {@code null};
	   *     when present every row is copied
	   * @param clearAdaptiveData if true the adaptive data of the
	   *     feature generators is cleared
	   *
	   * @throws IllegalArgumentException if {@code sentence} is {@code null}
	   */
	  public NameSample(String id, String[] sentence, Span[] names,
	      String[][] additionalContext, boolean clearAdaptiveData) {

	    if (sentence == null) {
	      throw new IllegalArgumentException("sentence must not be null!");
	    }

	    if (names == null) {
	      names = new Span[0];
	    }

	    this.id = id;
	    this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays.asList(sentence)));
	    this.names = Collections.unmodifiableList(new ArrayList<Span>(Arrays.asList(names)));

	    if (additionalContext != null) {
	      // Copy row by row so later changes to the caller's arrays are not visible here.
	      String[][] contextCopy = new String[additionalContext.length][];

	      for (int i = 0; i < additionalContext.length; i++) {
	        contextCopy[i] = additionalContext[i].clone();
	      }

	      this.additionalContext = contextCopy;
	    }
	    else {
	      this.additionalContext = null;
	    }

	    this.isClearAdaptiveData = clearAdaptiveData;

	    // TODO: Check that name spans are not overlapping, otherwise throw exception
	  }

	  /**
	   * Initializes the current instance without an id.
	   *
	   * @param sentence training sentence, must not be {@code null}
	   * @param names the name spans over {@code sentence}; {@code null} is treated
	   *     as "no names"
	   * @param additionalContext optional additional context, may be {@code null}
	   * @param clearAdaptiveData if true the adaptive data of the
	   *     feature generators is cleared
	   */
	  public NameSample(String[] sentence, Span[] names,
	      String[][] additionalContext, boolean clearAdaptiveData) {
	    this(null, sentence, names, additionalContext, clearAdaptiveData);
	  }

	  /**
	   * Initializes the current instance without an id and without additional context.
	   *
	   * @param sentence training sentence, must not be {@code null}
	   * @param names the name spans over {@code sentence}; {@code null} is treated
	   *     as "no names"
	   * @param clearAdaptiveData if true the adaptive data of the
	   *     feature generators is cleared
	   */
	  public NameSample(String[] sentence, Span[] names, boolean clearAdaptiveData) {
	    this(sentence, names, null, clearAdaptiveData);
	  }

	  /**
	   * @return the id passed to the constructor, or {@code null} if none was given
	   */
	  public String getId() {
	    return id;
	  }

	  /**
	   * @return a new array holding the tokens of this sample
	   */
	  public String[] getSentence() {
	    return sentence.toArray(new String[sentence.size()]);
	  }

	  /**
	   * @return a new array holding the name spans of this sample
	   */
	  public Span[] getNames() {
	    return names.toArray(new Span[names.size()]);
	  }

	  /**
	   * @return the additional context held by this sample (the internal array,
	   *     not a copy), or {@code null} if none was provided
	   */
	  public String[][] getAdditionalContext() {
	    return additionalContext;
	  }

	  /**
	   * @return true if the adaptive data should be cleared for this sample
	   */
	  public boolean isClearAdaptiveDataSet() {
	    return isClearAdaptiveData;
	  }

	  /**
	   * Two samples are equal when their tokens, names, additional context and
	   * clear-adaptive-data flag are equal. The id is not part of the comparison.
	   */
	  @Override
	  public boolean equals(Object obj) {

	    if (this == obj) {
	      return true;
	    }

	    if (!(obj instanceof NameSample)) {
	      return false;
	    }

	    NameSample other = (NameSample) obj;

	    // additionalContext is a two-dimensional array: Arrays.equals would compare
	    // the rows by reference, so the content has to be compared with deepEquals.
	    return sentence.equals(other.sentence)
	        && names.equals(other.names)
	        && Arrays.deepEquals(additionalContext, other.additionalContext)
	        && isClearAdaptiveData == other.isClearAdaptiveData;
	  }

	  /**
	   * Hash code derived from the same components {@link #equals(Object)} compares.
	   */
	  @Override
	  public int hashCode() {
	    // NOTE(review): relies on Span.hashCode() being consistent with Span.equals() - confirm.
	    int result = sentence.hashCode();
	    result = 31 * result + names.hashCode();
	    result = 31 * result + Arrays.deepHashCode(additionalContext);
	    result = 31 * result + (isClearAdaptiveData ? 1 : 0);
	    return result;
	  }

	  /**
	   * Renders the sample as a line of whitespace separated tokens in which each
	   * name is enclosed by a start tag and an end tag. A leading empty line is
	   * emitted when the adaptive data is to be cleared.
	   */
	  @Override
	  public String toString() {
	    StringBuilder result = new StringBuilder();

	    // If adaptive data must be cleared insert an empty line
	    // before the sample sentence line
	    if (isClearAdaptiveDataSet()) {
	      result.append("\n");
	    }

	    for (int tokenIndex = 0; tokenIndex < sentence.size(); tokenIndex++) {

	      // Emit the tags which belong in front of the current token.
	      for (Span name : names) {
	        if (name.getStart() == tokenIndex) {
	          // A name without a type gets the plain start tag.
	          if (name.getType() == null) {
	            result.append(NameSampleDataStream.START_TAG).append(' ');
	          }
	          else {
	            result.append(NameSampleDataStream.START_TAG_PREFIX).append(name.getType()).append("> ");
	          }
	        }

	        if (name.getEnd() == tokenIndex) {
	          result.append(NameSampleDataStream.END_TAG).append(' ');
	        }
	      }

	      result.append(sentence.get(tokenIndex)).append(' ');
	    }

	    // Every token is followed by a separator; drop the one after the last token.
	    // This also has to happen for a single token sentence, otherwise the result
	    // carries a trailing (or doubled) space.
	    if (!sentence.isEmpty()) {
	      result.setLength(result.length() - 1);
	    }

	    // Names which end after the last token are closed behind it.
	    for (Span name : names) {
	      if (name.getEnd() == sentence.size()) {
	        result.append(' ').append(NameSampleDataStream.END_TAG);
	      }
	    }

	    return result.toString();
	  }

	  /**
	   * Builds an error snippet which shows the token at {@code index} wrapped in
	   * {@code ###} together with up to two tokens before and after it.
	   */
	  private static String errorTokenWithContext(String[] sentence, int index) {

	    StringBuilder errorString = new StringBuilder();

	    // two token before
	    if (index > 1) {
	      errorString.append(sentence[index - 2]).append(" ");
	    }

	    if (index > 0) {
	      errorString.append(sentence[index - 1]).append(" ");
	    }

	    // token itself
	    errorString.append("###");
	    errorString.append(sentence[index]);
	    errorString.append("###").append(" ");

	    // two token after
	    if (index + 1 < sentence.length) {
	      errorString.append(sentence[index + 1]).append(" ");
	    }

	    if (index + 2 < sentence.length) {
	      errorString.append(sentence[index + 2]);
	    }

	    return errorString.toString();
	  }

	  /**
	   * Parses a tagged token string using {@link #DEFAULT_TYPE} as the default name type.
	   *
	   * @param taggedTokens whitespace separated tokens with name annotations
	   * @param isClearAdaptiveData flag stored in the created sample
	   *
	   * @return the parsed sample
	   *
	   * @throws IOException if the annotations are malformed
	   */
	  public static NameSample parse(String taggedTokens,
	      boolean isClearAdaptiveData) throws IOException {
	    return parse(taggedTokens, DEFAULT_TYPE, isClearAdaptiveData);
	  }

	  /**
	   * Parses a tagged token string into a sample.
	   *
	   * <p>The input is split with the {@link WhitespaceTokenizer}. A part matching
	   * {@code <START>} or {@code <START:type>} opens a name, a part equal to
	   * {@link NameSampleDataStream#END_TAG} closes it, every other part is a token.
	   * A type given on a start tag stays in effect for following start tags
	   * without a type. A name which is opened but never closed does not produce
	   * a span.
	   *
	   * @param taggedTokens whitespace separated tokens with name annotations
	   * @param defaultType type assigned to names until a start tag specifies one
	   * @param isClearAdaptiveData flag stored in the created sample
	   *
	   * @return the parsed sample
	   *
	   * @throws IOException if a start tag occurs inside an open name, a start tag
	   *     has an empty type, or an end tag occurs without an open name
	   */
	  public static NameSample parse(String taggedTokens, String defaultType,
	      boolean isClearAdaptiveData)
	    // TODO: Should throw another exception, and then convert it into an IOException in the stream
	    throws IOException {
	    String[] parts = WhitespaceTokenizer.INSTANCE.tokenize(taggedTokens);

	    List<String> tokenList = new ArrayList<String>(parts.length);
	    List<Span> nameList = new ArrayList<Span>();

	    String nameType = defaultType;
	    int startIndex = -1;
	    int wordIndex = 0;

	    // true while a start tag was seen and the matching end tag is still missing
	    boolean catchingName = false;

	    for (int pi = 0; pi < parts.length; pi++) {
	      Matcher startMatcher = START_TAG_PATTERN.matcher(parts[pi]);
	      if (startMatcher.matches()) {
	        if (catchingName) {
	          throw new IOException("Found unexpected annotation" +
	              " while handling a name sequence: " + errorTokenWithContext(parts, pi));
	        }
	        catchingName = true;
	        startIndex = wordIndex;
	        String nameTypeFromSample = startMatcher.group(2);
	        if (nameTypeFromSample != null) {
	          if (nameTypeFromSample.length() == 0) {
	            throw new IOException("Missing a name type: " + errorTokenWithContext(parts, pi));
	          }
	          nameType = nameTypeFromSample;
	        }
	      }
	      else if (parts[pi].equals(NameSampleDataStream.END_TAG)) {
	        if (!catchingName) {
	          throw new IOException("Found unexpected annotation: " + errorTokenWithContext(parts, pi));
	        }
	        catchingName = false;
	        // create name
	        nameList.add(new Span(startIndex, wordIndex, nameType));
	      }
	      else {
	        tokenList.add(parts[pi]);
	        wordIndex++;
	      }
	    }

	    String[] sentence = tokenList.toArray(new String[tokenList.size()]);
	    Span[] names = nameList.toArray(new Span[nameList.size()]);

	    return new NameSample(sentence, names, isClearAdaptiveData);
	  }
	}