blob: 88dacc6d5011cd5188f8fd27c5b13eb7c281c616 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.bratannotator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.ws.rs.Consumes;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.MediaType;
import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;
@Path("/ner")
public class BratNameFinderResource {
public static class NameAnn {
public int[][] offsets;
public String[] texts;
public String type;
}
private SentenceDetector sentDetect = BratAnnService.sentenceDetector;
private Tokenizer tokenizer = BratAnnService.tokenizer;
private TokenNameFinder nameFinders[] = BratAnnService.nameFinders;
private static int findNextNonWhitespaceChar(CharSequence s, int beginOffset,
int endOffset) {
for (int i = beginOffset; i < endOffset; i++) {
if (!Character.isSpaceChar(s.charAt(i))) {
return i;
}
}
return -1;
}
@POST
@Consumes(MediaType.TEXT_PLAIN)
@Produces(MediaType.APPLICATION_JSON)
public Map<String, NameAnn> findNames(@QueryParam("model") String modelName,
String text) {
Span sentenceSpans[] = sentDetect.sentPosDetect(text);
Map<String, NameAnn> map = new HashMap<String, NameAnn>();
int indexCounter = 0;
for (int i = 0; i < sentenceSpans.length; i++) {
String sentenceText = sentenceSpans[i].getCoveredText(text).toString();
// offset of sentence gets lost here!
Span tokenSpans[] = tokenizer
.tokenizePos(sentenceText);
String tokens[] = Span.spansToStrings(tokenSpans, sentenceText);
for (TokenNameFinder nameFinder : nameFinders) {
Span names[] = nameFinder.find(tokens);
for (Span name : names) {
int beginOffset = tokenSpans[name.getStart()].getStart()
+ sentenceSpans[i].getStart();
int endOffset = tokenSpans[name.getEnd() - 1].getEnd()
+ sentenceSpans[i].getStart();
// create a list of new line indexes
List<Integer> newLineIndexes = new ArrayList<Integer>();
// TODO: Code needs to handle case that there are multiple new lines
// in a row
boolean inNewLineSequence = false;
for (int ci = beginOffset; ci < endOffset; ci++) {
if (text.charAt(ci) == '\n' || text.charAt(ci) == '\r') {
if (!inNewLineSequence) {
newLineIndexes.add(ci);
}
inNewLineSequence = true;
} else {
inNewLineSequence = false;
}
}
List<String> textSegments = new ArrayList<String>();
List<int[]> spanSegments = new ArrayList<int[]>();
int segmentBegin = beginOffset;
for (int newLineOffset : newLineIndexes) {
// create segment from begin to offset
textSegments.add(text.substring(segmentBegin, newLineOffset));
spanSegments.add(new int[] { segmentBegin, newLineOffset });
segmentBegin = findNextNonWhitespaceChar(text, newLineOffset + 1,
endOffset);
if (segmentBegin == -1) {
break;
}
}
// create left over segment
if (segmentBegin != -1) {
textSegments.add(text.substring(segmentBegin, endOffset));
spanSegments.add(new int[] { segmentBegin, endOffset });
}
NameAnn ann = new NameAnn();
ann.texts = textSegments.toArray(new String[textSegments.size()]);
ann.offsets = spanSegments.toArray(new int[spanSegments.size()][]);
ann.type = name.getType();
map.put(Integer.toString(indexCounter++), ann);
}
}
}
return map;
}
}