| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.formats.letsmt; |
| |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.List; |
| |
| import javax.xml.parsers.SAXParser; |
| |
| import org.xml.sax.InputSource; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.XMLReader; |
| import org.xml.sax.helpers.DefaultHandler; |
| |
| import opennlp.tools.util.XmlUtil; |
| |
| /** |
| * A structure to hold the letsmt document. The documents contains sentences and depending on the |
| * source it either contains tokenized text (words) or an un-tokenized sentence string. |
| * <p> |
| * The format specification can be found |
| * <a href="http://project.letsmt.eu/uploads/Deliverables/D2.1%20%20Specification%20of%20data%20formats%20v1%20final.pdf">here</a>. |
| */ |
| public class LetsmtDocument { |
| |
| public static class LetsmtSentence { |
| private String nonTokenizedText; |
| private String[] tokens; |
| |
| public String getNonTokenizedText() { |
| return nonTokenizedText; |
| } |
| |
| public String[] getTokens() { |
| if (tokens != null) { |
| return Arrays.copyOf(tokens, tokens.length); |
| } |
| |
| return null; |
| } |
| } |
| |
| // define a content handler to receive the sax events ... |
| public static class LetsmtDocumentHandler extends DefaultHandler { |
| |
| private List<LetsmtSentence> sentences = new ArrayList<>(); |
| |
| private StringBuilder chars = new StringBuilder(); |
| private List<String> tokens = new ArrayList<>(); |
| |
| @Override |
| public void characters(char[] ch, int start, int length) throws SAXException { |
| chars.append(ch, start, length); |
| } |
| |
| @Override |
| public void endElement(String uri, String localName, String qName) throws SAXException { |
| super.endElement(uri, localName, qName); |
| |
| // Note: |
| // words are optional in sentences, if there are no words just the chars have to be captured |
| |
| switch (qName) { |
| case "w": |
| tokens.add(chars.toString().trim()); |
| chars.setLength(0); |
| break; |
| |
| // TODO: The sentence should contain the id, so it can be tracked back to the |
| // place it came from |
| case "s": |
| LetsmtSentence sentence = new LetsmtSentence(); |
| |
| if (tokens.size() > 0) { |
| sentence.tokens = tokens.toArray(new String[tokens.size()]); |
| tokens = new ArrayList<>(); |
| } |
| else { |
| sentence.nonTokenizedText = chars.toString().trim(); |
| } |
| |
| sentences.add(sentence); |
| |
| chars.setLength(0); |
| } |
| } |
| } |
| |
| private List<LetsmtSentence> sentences = new ArrayList<>(); |
| |
| private LetsmtDocument(List<LetsmtSentence> sentences) { |
| this.sentences = sentences; |
| } |
| |
| public List<LetsmtSentence> getSentences() { |
| return Collections.unmodifiableList(sentences); |
| } |
| |
| static LetsmtDocument parse(InputStream letsmtXmlIn) throws IOException { |
| SAXParser saxParser = XmlUtil.createSaxParser(); |
| |
| try { |
| XMLReader xmlReader = saxParser.getXMLReader(); |
| LetsmtDocumentHandler docHandler = new LetsmtDocumentHandler(); |
| xmlReader.setContentHandler(docHandler); |
| xmlReader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); |
| xmlReader.parse(new InputSource(letsmtXmlIn)); |
| return new LetsmtDocument(docHandler.sentences); |
| } catch (SAXException e) { |
| throw new IOException("Failed to parse letsmt xml!", e); |
| } |
| } |
| |
| static LetsmtDocument parse(File file) throws IOException { |
| try (InputStream in = new FileInputStream(file)) { |
| return parse(in); |
| } |
| } |
| } |