blob: 9c03e7491d5960debc9905981e27d2644fdf78fc [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.opennlp.wikinews_importer;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import info.bliki.wiki.dump.IArticleFilter;
import info.bliki.wiki.dump.Siteinfo;
import info.bliki.wiki.dump.WikiArticle;
import info.bliki.wiki.dump.WikiXMLParser;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.metadata.TypeSystemDescription;

import org.xml.sax.SAXException;
/**
* Demo application which reads an uncompressed Wikinews XML dump
* file and writes each published main-namespace article as an XMI file.
*/
public class WikinewsConverter {
static class CASArticleFilter implements IArticleFilter {
/** Type system description used to create the empty CAS for each article. */
private final TypeSystemDescription tsDesc;

/** Folder into which one XMI file per converted article is written. */
private final File outputFolder;

/**
 * Wiki markup strings that mark the end of the article body; the page text
 * is cut at the earliest occurrence of any of them.
 */
private final List<String> endOfArticleMarkers = new ArrayList<>(List.of(
    "{{haveyoursay}}",
    "== Sources ==",
    "==Sources==",
    "== Source ==",
    "==Source==",
    "==References==",
    "== References ==",
    "=== References==="));

/**
 * Creates an article filter.
 *
 * @param tsDesc the type system description used when creating each CAS
 * @param outputFolder the folder the XMI files are written to
 */
CASArticleFilter(TypeSystemDescription tsDesc, File outputFolder) {
  this.outputFolder = outputFolder;
  this.tsDesc = tsDesc;
}
/**
 * Converts an article title into a name that is safe to use in a URI or as
 * a file name: every space is replaced by an underscore and the result is
 * encoded with {@link URLEncoder} using UTF-8.
 *
 * @param title the article title
 * @return the encoded title
 */
public static String titleToUri(String title) {
  String underscored = title.replace(' ', '_');
  return URLEncoder.encode(underscored, StandardCharsets.UTF_8);
}
/**
 * Converts a single wiki page into a CAS and writes it as an XMI file.
 *
 * <p>Only pages in namespace 0 that are main pages and whose text contains
 * the marker <code>{publish}</code> (case-insensitively) are converted; all
 * other pages are ignored. The page text is cut at the first end-of-article
 * marker, rendered to plain text, and stored together with the title as the
 * document text. Headline, Paragraph, SubHeadline and WikiLink annotations
 * are added, and the page is additionally stored in a second view named
 * "WikiMarkup".
 *
 * @param page the page read from the dump
 * @param siteinfo site information supplied by the parser (not used here)
 * @throws SAXException declared by the {@link IArticleFilter} interface
 */
@Override
public void process(WikiArticle page, Siteinfo siteinfo)
    throws SAXException {
  if (page.getIntegerNamespace() != 0 || !page.isMain()) {
    return;
  }

  String rawText = page.getText();
  // A page without text cannot carry the publish marker; skipping it avoids
  // a NullPointerException.
  // Locale.ROOT keeps the marker check independent of the default locale
  // (the Turkish locale, for example, lower-cases 'I' to a dotless i).
  if (rawText == null || !rawText.toLowerCase(Locale.ROOT).contains("{publish}")) {
    return;
  }

  String pageText = cutAtFirstEndMarker(rawText);

  WikinewsWikiModel wikiModel = new WikinewsWikiModel(
      "https://en.wikinews.org/wiki/${image}",
      "https://en.wikinews.org/wiki/${title}");
  AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
  String plainStr = wikiModel.render(converter, pageText);

  CAS articleCAS = UimaUtil.createEmptyCAS(tsDesc);

  // TODO: find a way to nicely add title ..
  // Document text layout: title, one empty line, rendered body.
  StringBuilder articleText = new StringBuilder();
  articleText.append(page.getTitle());
  int endOffsetTitle = articleText.length();
  articleText.append("\n\n");
  // The annotation offsets reported by the converter are relative to the
  // rendered body, so they are shifted by this amount below.
  int bodyOffset = articleText.length();
  articleText.append(plainStr);

  articleCAS.setDocumentLanguage("en");
  articleCAS.setDocumentText(articleText.toString());

  // Look up each annotation type once instead of once per annotation.
  Type headlineType = articleCAS.getTypeSystem()
      .getType("org.apache.opennlp.annotations.Headline");
  Type paragraphType = articleCAS.getTypeSystem()
      .getType("org.apache.opennlp.annotations.Paragraph");
  Type subHeadlineType = articleCAS.getTypeSystem()
      .getType("org.apache.opennlp.annotations.SubHeadline");
  Type wikiLinkType = articleCAS.getTypeSystem()
      .getType("org.apache.opennlp.annotations.WikiLink");
  Feature linkFeature = wikiLinkType.getFeatureByBaseName("link");

  AnnotationFS headlineAnnotation =
      articleCAS.createAnnotation(headlineType, 0, endOffsetTitle);
  articleCAS.addFsToIndexes(headlineAnnotation);

  for (Annotation paraAnn : converter.getParagraphAnnotations()) {
    AnnotationFS paraAnnFS = articleCAS.createAnnotation(paragraphType,
        bodyOffset + paraAnn.begin, bodyOffset + paraAnn.end);
    articleCAS.addFsToIndexes(paraAnnFS);
  }

  for (Annotation subHeadAnn : converter.getHeaderAnnotations()) {
    AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(subHeadlineType,
        bodyOffset + subHeadAnn.begin, bodyOffset + subHeadAnn.end);
    articleCAS.addFsToIndexes(subHeadAnnFS);
  }

  for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) {
    AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(wikiLinkType,
        bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end);
    wikiLinkAnnFS.setStringValue(linkFeature, wikiLinkAnn.value);
    articleCAS.addFsToIndexes(wikiLinkAnnFS);
  }

  CAS markupCas = articleCAS.createView("WikiMarkup");
  markupCas.setDocumentText(page.toString());

  // now serialize CAS
  File xmiFile = new File(outputFolder, titleToUri(page.getTitle()) + ".xmi");
  try (OutputStream casOut = new FileOutputStream(xmiFile)) {
    UimaUtil.serializeCASToXmi(articleCAS, casOut);
  } catch (IOException e) {
    // Best effort: a single article that cannot be written must not abort
    // the conversion of the remaining dump, so report it and continue.
    System.err.println("Failed to write article \"" + page.getTitle()
        + "\" to " + xmiFile + ":");
    e.printStackTrace();
  }
}

/**
 * Returns the given text up to (excluding) the earliest occurrence of any
 * end-of-article marker, or the whole text if no marker occurs.
 */
private String cutAtFirstEndMarker(String text) {
  int cutIndex = text.length();
  for (String endMarker : endOfArticleMarkers) {
    int endMarkerIndex = text.indexOf(endMarker);
    if (endMarkerIndex != -1 && endMarkerIndex < cutIndex) {
      cutIndex = endMarkerIndex;
    }
  }
  return text.substring(0, cutIndex);
}
}
/**
 * Command-line entry point: parses the given XML dump and writes the
 * converted articles into the given output folder.
 *
 * <p>The type system description is read from the fixed relative path
 * {@code samples/TypeSystem.xml}. Errors raised while parsing the dump are
 * printed and do not propagate.
 *
 * @param args {@code args[0]} is the XML dump file, {@code args[1]} is the
 *     output folder (created if it does not exist)
 * @throws Exception if loading the type system description fails
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: Parser <XML-File> <Output-Folder>");
    System.exit(-1);
  }

  // TODO: Should be configurable!
  TypeSystemDescription tsDesc;
  // Close the description file once it has been read instead of leaking
  // the stream for the lifetime of the process.
  try (FileInputStream typeSystemIn = new FileInputStream("samples/TypeSystem.xml")) {
    tsDesc = UimaUtil.createTypeSystemDescription(typeSystemIn);
  }

  File outputFolder = new File(args[1]);
  outputFolder.mkdirs();
  // mkdirs() also returns false when the folder already exists, so test the
  // outcome directly. Without a usable folder every article write would fail.
  if (!outputFolder.isDirectory()) {
    System.err.println("Cannot create output folder: " + outputFolder.getAbsolutePath());
    System.exit(-1);
  }

  String dumpFilename = args[0];
  try {
    IArticleFilter handler = new CASArticleFilter(tsDesc, outputFolder);
    WikiXMLParser wxp = new WikiXMLParser(dumpFilename, handler);
    wxp.parse();
  } catch (Exception e) {
    System.out.println("Parsing the corpus failed:");
    System.out.println();
    e.printStackTrace();
  }
}
}