blob: 0bc9f410e9202aa9a5ca7a395163554cf42a2b43 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.postag;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.InvalidFormatException;
/**
 * Represents a POS-tagged sentence: an ordered sequence of tokens with exactly
 * one tag per token, plus optional additional context.
 * <p>
 * The constructors take snapshots of the tokens, tags and additional context
 * they are given, and the accessors hand out copies, so a sample cannot be
 * changed through references held by callers.
 * <p>
 * {@link #equals(Object)} and {@link #hashCode()} consider the tokens and tags
 * only; the additional context does not take part in either.
 */
public class POSSample implements Serializable {

  // NOTE: this class declares no serialVersionUID, so the default one is
  // computed from (among other things) the field modifiers and the non-private
  // member signatures. Both are intentionally left untouched to stay compatible
  // with samples that have already been serialized.
  private List<String> sentence;
  private List<String> tags;
  private final String[][] additionalContext;

  /**
   * Creates a sample without additional context.
   *
   * @param sentence the tokens of the sentence; must not be or contain {@code null}
   * @param tags the tags, one per token; must not be or contain {@code null}
   * @throws IllegalArgumentException if the number of tokens and tags differ,
   *         or if either array contains a {@code null} element
   */
  public POSSample(String[] sentence, String[] tags) {
    this(sentence, tags, null);
  }

  /**
   * Creates a sample without additional context.
   *
   * @param sentence the tokens of the sentence; must not be or contain {@code null}
   * @param tags the tags, one per token; must not be or contain {@code null}
   * @throws IllegalArgumentException if the number of tokens and tags differ,
   *         or if either list contains a {@code null} element
   */
  public POSSample(List<String> sentence, List<String> tags) {
    this(sentence, tags, null);
  }

  /**
   * Creates a sample with optional additional context.
   *
   * @param sentence the tokens of the sentence; must not be or contain {@code null}
   * @param tags the tags, one per token; must not be or contain {@code null}
   * @param additionalContext the additional context, or {@code null} if there is none;
   *        it is deep-copied
   * @throws NullPointerException if {@code sentence} or {@code tags} is {@code null}
   * @throws IllegalArgumentException if the number of tokens and tags differ,
   *         or if either list contains a {@code null} element
   */
  public POSSample(List<String> sentence, List<String> tags,
      String[][] additionalContext) {
    Objects.requireNonNull(sentence, "sentence must not be null");
    Objects.requireNonNull(tags, "tags must not be null");

    // Snapshot the inputs: wrapping the caller's list directly would let later
    // changes to that list (or to the array behind Arrays.asList) alter this
    // sample and bypass the validation below.
    this.sentence = snapshot(sentence);
    this.tags = snapshot(tags);

    checkArguments();

    this.additionalContext = copyContext(additionalContext);
  }

  /**
   * Creates a sample with optional additional context.
   *
   * @param sentence the tokens of the sentence; must not be or contain {@code null}
   * @param tags the tags, one per token; must not be or contain {@code null}
   * @param additionalContext the additional context, or {@code null} if there is none;
   *        it is deep-copied
   * @throws IllegalArgumentException if the number of tokens and tags differ,
   *         or if either array contains a {@code null} element
   */
  public POSSample(String[] sentence, String[] tags,
      String[][] additionalContext) {
    this(Arrays.asList(sentence), Arrays.asList(tags), additionalContext);
  }

  /** Returns an unmodifiable copy of {@code source} that is independent of it. */
  private static List<String> snapshot(List<String> source) {
    return Collections.unmodifiableList(Arrays.asList(source.toArray(new String[0])));
  }

  /** Returns a deep copy of {@code context}, or {@code null} if it is {@code null}. */
  private static String[][] copyContext(String[][] context) {
    if (context == null) {
      return null;
    }
    String[][] copy = new String[context.length][];
    for (int i = 0; i < context.length; i++) {
      copy[i] = context[i].clone();
    }
    return copy;
  }

  /**
   * Validates the already assigned tokens and tags.
   *
   * @throws IllegalArgumentException if the number of tokens and tags differ,
   *         or if either list contains a {@code null} element
   */
  private void checkArguments() {
    if (sentence.size() != tags.size()) {
      throw new IllegalArgumentException(
          "There must be exactly one tag for each token. tokens: " + sentence.size() +
              ", tags: " + tags.size());
    }

    if (sentence.contains(null)) {
      throw new IllegalArgumentException("null elements are not allowed in sentence tokens!");
    }

    if (tags.contains(null)) {
      throw new IllegalArgumentException("null elements are not allowed in tags!");
    }
  }

  /**
   * @return a new array holding the tokens of the sentence
   */
  public String[] getSentence() {
    return sentence.toArray(new String[sentence.size()]);
  }

  /**
   * @return a new array holding the tags, in token order
   */
  public String[] getTags() {
    return tags.toArray(new String[tags.size()]);
  }

  /**
   * Returns the additional context. The misspelled method name is part of the
   * existing public interface and is kept as is.
   *
   * @return a deep copy of the additional context, or {@code null} if none was given
   */
  public String[][] getAddictionalContext() {
    // Hand out a copy, mirroring the defensive copy made by the constructor.
    return copyContext(this.additionalContext);
  }

  /**
   * Formats the sample as {@code token_tag} pairs separated by single spaces.
   *
   * @return the formatted sample; the empty string if there are no tokens
   */
  @Override
  public String toString() {
    StringBuilder result = new StringBuilder();

    // Read the lists directly; getSentence()/getTags() would allocate a fresh
    // array on every call.
    for (int i = 0; i < sentence.size(); i++) {
      if (i > 0) {
        result.append(' ');
      }
      result.append(sentence.get(i));
      result.append('_');
      result.append(tags.get(i));
    }

    return result.toString();
  }

  /**
   * Parses a sample from a string of {@code token_tag} entries. The string is
   * split with {@link WhitespaceTokenizer}, and each entry is divided at its
   * last underscore, so a token may itself contain underscores.
   *
   * @param sentenceString the string to parse
   * @return the parsed sample
   * @throws InvalidFormatException if an entry contains no underscore
   */
  public static POSSample parse(String sentenceString) throws InvalidFormatException {
    String[] tokenTags = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);

    String[] sentence = new String[tokenTags.length];
    String[] tags = new String[tokenTags.length];

    for (int i = 0; i < tokenTags.length; i++) {
      int split = tokenTags[i].lastIndexOf('_');

      if (split == -1) {
        throw new InvalidFormatException("Cannot find \"_\" inside token '" + tokenTags[i] + "'!");
      }

      sentence[i] = tokenTags[i].substring(0, split);
      tags[i] = tokenTags[i].substring(split + 1);
    }

    return new POSSample(sentence, tags);
  }

  /**
   * @return a hash code derived from the tokens and tags
   */
  @Override
  public int hashCode() {
    // List.hashCode() uses the same formula as Arrays.hashCode(Object[]), so
    // this yields the same value as hashing array copies of both lists.
    return Objects.hash(sentence.hashCode(), tags.hashCode());
  }

  /**
   * Compares this sample with another object.
   *
   * @param obj the object to compare with
   * @return {@code true} if {@code obj} is a {@code POSSample} with equal tokens
   *         and equal tags, in the same order
   */
  @Override
  public boolean equals(Object obj) {
    if (obj == this) {
      return true;
    }

    if (obj instanceof POSSample) {
      POSSample a = (POSSample) obj;
      return sentence.equals(a.sentence) && tags.equals(a.tags);
    }

    return false;
  }
}