| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.formats.ad; |
| |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.UnsupportedEncodingException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import opennlp.tools.formats.ad.ADSentenceStream.Sentence; |
| import opennlp.tools.sentdetect.SentenceSample; |
| import opennlp.tools.sentdetect.lang.Factory; |
| import opennlp.tools.util.InputStreamFactory; |
| import opennlp.tools.util.ObjectStream; |
| import opennlp.tools.util.PlainTextByLineStream; |
| import opennlp.tools.util.Span; |
| |
| /** |
| * <b>Note:</b> Do not use this class, internal use only! |
| */ |
| public class ADSentenceSampleStream implements ObjectStream<SentenceSample> { |
| |
| private final ObjectStream<ADSentenceStream.Sentence> adSentenceStream; |
| |
| private int text = -1; |
| private int para = -1; |
| private boolean isSameText; |
| private boolean isSamePara; |
| private Sentence sent; |
| private boolean isIncludeTitles = true; |
| private boolean isTitle; |
| |
| private final char[] ptEosCharacters; |
| |
| /** |
| * Creates a new {@link SentenceSample} stream from a line stream, i.e. |
| * {@link ObjectStream}<{@link String}>, that could be a |
| * {@link PlainTextByLineStream} object. |
| * |
| * @param lineStream |
| * a stream of lines as {@link String} |
| * @param includeHeadlines |
| * if true will output the sentences marked as news headlines |
| */ |
| public ADSentenceSampleStream(ObjectStream<String> lineStream, boolean includeHeadlines) { |
| this.adSentenceStream = new ADSentenceStream(lineStream); |
| ptEosCharacters = Factory.ptEosCharacters; |
| Arrays.sort(ptEosCharacters); |
| this.isIncludeTitles = includeHeadlines; |
| } |
| |
| /** |
| * Creates a new {@link SentenceSample} stream from a {@link FileInputStream} |
| * |
| * @param in |
| * input stream from the corpus |
| * @param charsetName |
| * the charset to use while reading the corpus |
| * @param includeHeadlines |
| * if true will output the sentences marked as news headlines |
| */ |
| public ADSentenceSampleStream(InputStreamFactory in, String charsetName, |
| boolean includeHeadlines) throws IOException { |
| try { |
| this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream( |
| in, charsetName)); |
| } catch (UnsupportedEncodingException e) { |
| // UTF-8 is available on all JVMs, will never happen |
| throw new IllegalStateException(e); |
| } |
| ptEosCharacters = Factory.ptEosCharacters; |
| Arrays.sort(ptEosCharacters); |
| this.isIncludeTitles = includeHeadlines; |
| } |
| |
| // The Arvores Deitadas Corpus has information about texts and paragraphs. |
| public SentenceSample read() throws IOException { |
| |
| if (sent == null) { |
| sent = this.adSentenceStream.read(); |
| updateMeta(); |
| if (sent == null) { |
| return null; |
| } |
| } |
| |
| StringBuilder document = new StringBuilder(); |
| List<Span> sentences = new ArrayList<>(); |
| do { |
| do { |
| if (!isTitle || (isTitle && isIncludeTitles)) { |
| if (hasPunctuation(sent.getText())) { |
| int start = document.length(); |
| document.append(sent.getText()); |
| sentences.add(new Span(start, document.length())); |
| document.append(" "); |
| } |
| |
| } |
| sent = this.adSentenceStream.read(); |
| updateMeta(); |
| } |
| while (isSamePara); |
| // break; // got one paragraph! |
| } |
| while (isSameText); |
| |
| String doc; |
| if (document.length() > 0) { |
| doc = document.substring(0, document.length() - 1); |
| } else { |
| doc = document.toString(); |
| } |
| |
| return new SentenceSample(doc, |
| sentences.toArray(new Span[sentences.size()])); |
| } |
| |
| private boolean hasPunctuation(String text) { |
| text = text.trim(); |
| if (text.length() > 0) { |
| char lastChar = text.charAt(text.length() - 1); |
| if (Arrays.binarySearch(ptEosCharacters, lastChar) >= 0) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // there are some different types of metadata depending on the corpus. |
| // todo: merge this patterns |
| private Pattern meta1 = Pattern |
| .compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*"); |
| |
| private void updateMeta() { |
| if (this.sent != null) { |
| String meta = this.sent.getMetadata(); |
| Matcher m = meta1.matcher(meta); |
| int currentText; |
| int currentPara; |
| if (m.matches()) { |
| currentText = Integer.parseInt(m.group(1)); |
| currentPara = Integer.parseInt(m.group(2)); |
| } else { |
| throw new RuntimeException("Invalid metadata: " + meta); |
| } |
| isSamePara = isSameText = false; |
| if (currentText == text) |
| isSameText = true; |
| |
| if (isSameText && currentPara == para) |
| isSamePara = true; |
| |
| isTitle = meta.contains("title"); |
| |
| text = currentText; |
| para = currentPara; |
| |
| } else { |
| this.isSamePara = this.isSameText = false; |
| } |
| } |
| |
| public void reset() throws IOException, UnsupportedOperationException { |
| adSentenceStream.reset(); |
| } |
| |
| public void close() throws IOException { |
| adSentenceStream.close(); |
| } |
| } |