opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSample.java - opennlp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.tokenize;

 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Objects;

 import opennlp.tools.tokenize.Detokenizer.DetokenizationOperation;
 import opennlp.tools.util.Span;

 /**
  * A {@link TokenSample} is text with token spans.
  */
 public class TokenSample implements Serializable {

   /**
    * Default marker placed between two tokens which directly follow each
    * other in the text, i.e. without any character in between.
    */
   public static final String DEFAULT_SEPARATOR_CHARS = "<SPLIT>";

   // Separator emitted by toString(); always the default marker.
   private static final String separatorChars = DEFAULT_SEPARATOR_CHARS;

   private final String text;

   // Unmodifiable list owned by this instance; never shared with callers.
   private final List<Span> tokenSpans;

   /**
    * Initializes the current instance.
    *
    * @param text the text which contains the tokens.
    * @param tokenSpans the spans which mark the begin and end of the tokens.
    *
    * @throws NullPointerException if {@code text}, {@code tokenSpans} or one
    *     of the spans is {@code null}.
    * @throws IllegalArgumentException if a span starts or ends outside of
    *     {@code text}.
    */
   public TokenSample(String text, Span[] tokenSpans) {
     Objects.requireNonNull(tokenSpans, "tokenSpans must not be null");

     this.text = Objects.requireNonNull(text, "text must not be null");
     // Defensive copy: later changes to the caller's array must not leak in.
     this.tokenSpans = Collections.unmodifiableList(new ArrayList<>(Arrays.asList(tokenSpans)));

     for (Span tokenSpan : this.tokenSpans) {
       Objects.requireNonNull(tokenSpan, "tokenSpans must not contain null elements");

       if (tokenSpan.getStart() < 0 || tokenSpan.getStart() > text.length() ||
           tokenSpan.getEnd() > text.length() || tokenSpan.getEnd() < 0) {
         throw new IllegalArgumentException("Span " + tokenSpan +
             " is out of bounds, text length: " + text.length() + "!");
       }
     }
   }

   /**
    * Initializes the current instance by joining the tokens as instructed by
    * the operations the {@link Detokenizer} returns for them.
    * <p>
    * A single space is inserted between two neighbouring tokens unless the
    * left one is merged to the right or the right one is merged to the left.
    * One span is recorded per operation.
    *
    * @param detokenizer decides how each token is attached to its neighbours.
    *     The loop reads {@code tokens[i]} for every returned operation, so it
    *     is expected to return one operation per token.
    * @param tokens the tokens the text is built from.
    *
    * @throws NullPointerException if {@code detokenizer} is {@code null}.
    */
   public TokenSample(Detokenizer detokenizer, String[] tokens) {
     Objects.requireNonNull(detokenizer, "detokenizer must not be null");

     StringBuilder sentence = new StringBuilder();

     DetokenizationOperation[] operations = detokenizer.detokenize(tokens);

     List<Span> mergedTokenSpans = new ArrayList<>();

     for (int i = 0; i < operations.length; i++) {

       boolean isSeparateFromPreviousToken = i > 0 &&
           !isMergeToRight(operations[i - 1]) &&
           !isMergeToLeft(operations[i]);

       if (isSeparateFromPreviousToken) {
         sentence.append(' ');
       }

       int beginIndex = sentence.length();
       sentence.append(tokens[i]);
       mergedTokenSpans.add(new Span(beginIndex, sentence.length()));
     }

     text = sentence.toString();
     tokenSpans = Collections.unmodifiableList(mergedTokenSpans);
   }

   // Static on purpose: called from a constructor and needs no instance state.
   private static boolean isMergeToRight(DetokenizationOperation operation) {
     return DetokenizationOperation.MERGE_TO_RIGHT.equals(operation)
         || DetokenizationOperation.MERGE_BOTH.equals(operation);
   }

   // Static on purpose: called from a constructor and needs no instance state.
   private static boolean isMergeToLeft(DetokenizationOperation operation) {
     return DetokenizationOperation.MERGE_TO_LEFT.equals(operation)
         || DetokenizationOperation.MERGE_BOTH.equals(operation);
   }

   /**
    * Retrieves the text.
    *
    * @return the text the token spans refer to, never {@code null}.
    */
   public String getText() {
     return text;
   }

   /**
    * Retrieves the token spans.
    *
    * @return a newly allocated array holding the spans in their stored order.
    */
   public Span[] getTokenSpans() {
     return tokenSpans.toArray(new Span[0]);
   }

   /**
    * Renders the tokens on one line. Two consecutive tokens are joined with
    * {@link #DEFAULT_SEPARATOR_CHARS} when the first one ends exactly where
    * the second one starts, and with a single space otherwise.
    */
   @Override
   public String toString() {

     StringBuilder sentence = new StringBuilder();

     // -1 means "no token written yet", so nothing is put before the first one.
     int lastEndIndex = -1;
     for (Span token : tokenSpans) {

       if (lastEndIndex != -1) {
         // No chars between the last token and this one: insert the
         // separator chars, otherwise insert a space.
         sentence.append(lastEndIndex == token.getStart() ? separatorChars : " ");
       }

       sentence.append(token.getCoveredText(text));

       lastEndIndex = token.getEnd();
     }

     return sentence.toString();
   }

   /**
    * Appends {@code token} to {@code sample} and records the span it occupies.
    * A single space is appended after the token unless the next token is
    * merged with it.
    */
   private static void addToken(StringBuilder sample, List<Span> tokenSpans,
       String token, boolean isNextMerged) {

     int tokenSpanStart = sample.length();
     sample.append(token);
     int tokenSpanEnd = sample.length();

     tokenSpans.add(new Span(tokenSpanStart, tokenSpanEnd));

     if (!isNextMerged) {
       sample.append(" ");
     }
   }

   /**
    * Parses a sample string in which tokens are divided either by whitespace
    * or by {@code separatorChars}.
    * <p>
    * The sample string is first split with the {@link WhitespaceTokenizer};
    * every resulting chunk is then split at each occurrence of
    * {@code separatorChars}. The text of the returned sample is rebuilt with
    * the separators removed and one space appended after every
    * whitespace-delimited chunk, including the last one.
    *
    * @param sampleString the string to parse.
    * @param separatorChars the marker between tokens that are not divided by
    *     whitespace.
    * @return the parsed sample.
    *
    * @throws NullPointerException if one of the arguments is {@code null}.
    * @throws IllegalArgumentException if {@code separatorChars} is empty.
    */
   public static TokenSample parse(String sampleString, String separatorChars) {
     Objects.requireNonNull(sampleString, "sampleString must not be null");
     Objects.requireNonNull(separatorChars, "separatorChars must not be null");

     // An empty separator is found at every index and never advances the
     // scan position below, so the split loop would run forever.
     if (separatorChars.isEmpty()) {
       throw new IllegalArgumentException("separatorChars must not be empty");
     }

     Span[] whitespaceTokenSpans = WhitespaceTokenizer.INSTANCE.tokenizePos(sampleString);

     // Pre-allocate 20% for newly created tokens
     List<Span> realTokenSpans = new ArrayList<>((int) (whitespaceTokenSpans.length * 1.2d));

     StringBuilder untaggedSampleString = new StringBuilder();

     for (Span whitespaceTokenSpan : whitespaceTokenSpans) {
       String whitespaceToken = whitespaceTokenSpan.getCoveredText(sampleString).toString();

       int tokStart = 0;
       int tokEnd;
       while ((tokEnd = whitespaceToken.indexOf(separatorChars, tokStart)) > -1) {
         // Everything up to the separator is a token merged with the next one.
         addToken(untaggedSampleString, realTokenSpans,
             whitespaceToken.substring(tokStart, tokEnd), true);

         tokStart = tokEnd + separatorChars.length();
       }

       // The remainder after the last separator is the final token of this
       // chunk; it is the whole chunk when no separator was found.
       addToken(untaggedSampleString, realTokenSpans,
           whitespaceToken.substring(tokStart), false);
     }

     return new TokenSample(untaggedSampleString.toString(),
         realTokenSpans.toArray(new Span[0]));
   }

   /**
    * Computes the hash code from the text and the token spans.
    */
   @Override
   public int hashCode() {
     // List.hashCode() uses the same formula as Arrays.hashCode(Object[]),
     // so hashing the list directly gives the same value without copying.
     return Objects.hash(text, tokenSpans);
   }

   /**
    * Two samples are equal when they have equal text and equal token spans
    * in the same order.
    */
   @Override
   public boolean equals(Object obj) {
     if (this == obj) {
       return true;
     }

     if (obj instanceof TokenSample) {
       TokenSample other = (TokenSample) obj;

       return text.equals(other.text)
           && tokenSpans.equals(other.tokenSpans);
     }

     return false;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.tokenize;

	import java.io.Serializable;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.List;
	import java.util.Objects;

	import opennlp.tools.tokenize.Detokenizer.DetokenizationOperation;
	import opennlp.tools.util.Span;

	/**
	* A {@link TokenSample} is text with token spans.
	*/
	public class TokenSample implements Serializable {

	public static final String DEFAULT_SEPARATOR_CHARS = "<SPLIT>";

	private static final String separatorChars = DEFAULT_SEPARATOR_CHARS;

	private final String text;

	private final List<Span> tokenSpans;

	/**
	* Initializes the current instance.
	*
	* @param text the text which contains the tokens.
	* @param tokenSpans the spans which mark the begin and end of the tokens.
	*/
	public TokenSample(String text, Span[] tokenSpans) {
	Objects.requireNonNull(tokenSpans, "tokenSpans must not be null");

	this.text = Objects.requireNonNull(text, "text must not be null");
	this.tokenSpans = Collections.unmodifiableList(new ArrayList<>(Arrays.asList(tokenSpans)));

	for (Span tokenSpan : tokenSpans) {
	if (tokenSpan.getStart() < 0 \|\| tokenSpan.getStart() > text.length() \|\|
	tokenSpan.getEnd() > text.length() \|\| tokenSpan.getEnd() < 0) {
	throw new IllegalArgumentException("Span " + tokenSpan +
	" is out of bounds, text length: " + text.length() + "!");
	}
	}
	}

	public TokenSample(Detokenizer detokenizer, String[] tokens) {

	StringBuilder sentence = new StringBuilder();

	DetokenizationOperation[] operations = detokenizer.detokenize(tokens);

	List<Span> mergedTokenSpans = new ArrayList<>();

	for (int i = 0; i < operations.length; i++) {

	boolean isSeparateFromPreviousToken = i > 0 &&
	!isMergeToRight(operations[i - 1]) &&
	!isMergeToLeft(operations[i]);

	if (isSeparateFromPreviousToken) {
	sentence.append(' ');
	}

	int beginIndex = sentence.length();
	sentence.append(tokens[i]);
	mergedTokenSpans.add(new Span(beginIndex, sentence.length()));
	}

	text = sentence.toString();
	tokenSpans = Collections.unmodifiableList(mergedTokenSpans);
	}

	private boolean isMergeToRight(DetokenizationOperation operation) {
	return DetokenizationOperation.MERGE_TO_RIGHT.equals(operation)
	\|\| DetokenizationOperation.MERGE_BOTH.equals(operation);
	}

	private boolean isMergeToLeft(DetokenizationOperation operation) {
	return DetokenizationOperation.MERGE_TO_LEFT.equals(operation)
	\|\| DetokenizationOperation.MERGE_BOTH.equals(operation);
	}

	/**
	* Retrieves the text.
	*/
	public String getText() {
	return text;
	}

	/**
	* Retrieves the token spans.
	*/
	public Span[] getTokenSpans() {
	return tokenSpans.toArray(new Span[tokenSpans.size()]);
	}

	@Override
	public String toString() {

	StringBuilder sentence = new StringBuilder();

	int lastEndIndex = -1;
	for (Span token : tokenSpans) {

	if (lastEndIndex != -1) {

	// If there are no chars between last token
	// and this token insert the separator chars
	// otherwise insert a space

	String separator;
	if (lastEndIndex == token.getStart())
	separator = separatorChars;
	else
	separator = " ";

	sentence.append(separator);
	}

	sentence.append(token.getCoveredText(text));

	lastEndIndex = token.getEnd();
	}

	return sentence.toString();
	}

	private static void addToken(StringBuilder sample, List<Span> tokenSpans,
	String token, boolean isNextMerged) {

	int tokenSpanStart = sample.length();
	sample.append(token);
	int tokenSpanEnd = sample.length();

	tokenSpans.add(new Span(tokenSpanStart, tokenSpanEnd));

	if (!isNextMerged)
	sample.append(" ");
	}

	public static TokenSample parse(String sampleString, String separatorChars) {
	Objects.requireNonNull(sampleString, "sampleString must not be null");
	Objects.requireNonNull(separatorChars, "separatorChars must not be null");

	Span[] whitespaceTokenSpans = WhitespaceTokenizer.INSTANCE.tokenizePos(sampleString);

	// Pre-allocate 20% for newly created tokens
	List<Span> realTokenSpans = new ArrayList<>((int) (whitespaceTokenSpans.length * 1.2d));

	StringBuilder untaggedSampleString = new StringBuilder();

	for (Span whiteSpaceTokenSpan : whitespaceTokenSpans) {
	String whitespaceToken = whiteSpaceTokenSpan.getCoveredText(sampleString).toString();

	boolean wasTokenReplaced = false;

	int tokStart = 0;
	int tokEnd;
	while ((tokEnd = whitespaceToken.indexOf(separatorChars, tokStart)) > -1) {

	String token = whitespaceToken.substring(tokStart, tokEnd);

	addToken(untaggedSampleString, realTokenSpans, token, true);

	tokStart = tokEnd + separatorChars.length();
	wasTokenReplaced = true;
	}

	if (wasTokenReplaced) {
	// If the token contains the split chars at least once
	// a span for the last token must still be added
	String token = whitespaceToken.substring(tokStart);

	addToken(untaggedSampleString, realTokenSpans, token, false);
	}
	else {
	// If it does not contain the split chars at lest once
	// just copy the original token span

	addToken(untaggedSampleString, realTokenSpans, whitespaceToken, false);
	}
	}

	return new TokenSample(untaggedSampleString.toString(), realTokenSpans.toArray(
	new Span[realTokenSpans.size()]));
	}

	@Override
	public int hashCode() {
	return Objects.hash(getText(), Arrays.hashCode(getTokenSpans()));
	}

	@Override
	public boolean equals(Object obj) {
	if (this == obj) {
	return true;
	}

	if (obj instanceof TokenSample) {
	TokenSample a = (TokenSample) obj;

	return getText().equals(a.getText())
	&& Arrays.equals(getTokenSpans(), a.getTokenSpans());
	}

	return false;
	}
	}