wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.opennlp.wikinews_importer;

 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;

 import info.bliki.wiki.dump.IArticleFilter;
 import info.bliki.wiki.dump.Siteinfo;
 import info.bliki.wiki.dump.WikiArticle;
 import info.bliki.wiki.dump.WikiXMLParser;

 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.Feature;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.resource.metadata.TypeSystemDescription;
 import org.xml.sax.SAXException;

 /**
  * Command-line demo which reads a Wikinews XML dump file and writes each
  * selected article as a UIMA XMI file into an output folder.
  *
  * <p>Usage: {@code WikinewsConverter <XML-File> <Output-Folder>}. The UIMA type
  * system is read from {@code samples/TypeSystem.xml}, resolved relative to the
  * current working directory.
  */
 public class WikinewsConverter {

   /**
    * Article callback handed to the dump parser. For every page that is in
    * namespace 0, is reported as a main page by the parser, and whose text
    * contains <code>{publish}</code> (case-insensitively), it builds a CAS with
    * the title and the rendered plain text, adds Headline, Paragraph,
    * SubHeadline and WikiLink annotations, and serializes the CAS to
    * {@code <outputFolder>/<encoded-title>.xmi}.
    */
   static class CASArticleFilter implements IArticleFilter {

     private final TypeSystemDescription tsDesc;
     private final File outputFolder;

     /** Markup fragments at which the page text is cut off before rendering. */
     private final List<String> endOfArticleMarkers = new ArrayList<>();

     /**
      * @param tsDesc type system used to create the CAS of every article
      * @param outputFolder folder the XMI files are written to
      */
     CASArticleFilter(TypeSystemDescription tsDesc, File outputFolder) {

       this.tsDesc = tsDesc;
       this.outputFolder = outputFolder;

       endOfArticleMarkers.add("{{haveyoursay}}");
       endOfArticleMarkers.add("== Sources ==");
       endOfArticleMarkers.add("==Sources==");
       endOfArticleMarkers.add("== Source ==");
       endOfArticleMarkers.add("==Source==");
       endOfArticleMarkers.add("==References==");
       endOfArticleMarkers.add("== References ==");
       endOfArticleMarkers.add("=== References===");
     }

     /**
      * Converts an article title into a string usable as a file name or URI
      * segment: blanks become underscores and the result is URL-encoded as
      * UTF-8.
      *
      * @param title the article title
      * @return the encoded title
      */
     public static String titleToUri(String title) {
       // Plain character replacement; no regular expression is needed here.
       return URLEncoder.encode(title.replace(' ', '_'), StandardCharsets.UTF_8);
     }

     /**
      * Converts one page of the dump into an XMI file, or ignores it when it
      * does not match the selection criteria described on this class.
      *
      * <p>A failure to write the XMI file is reported on standard error and
      * does not stop the processing of further pages.
      *
      * @param page the page delivered by the dump parser
      * @param siteinfo site information of the dump (not used)
      */
     @Override
     public void process(WikiArticle page, Siteinfo siteinfo)
         throws SAXException {

       if (page.getIntegerNamespace() != 0 || !page.isMain()) {
         return;
       }

       // Defensive: skip a page without text instead of aborting the whole run
       // with a NullPointerException (whether the parser ever delivers a null
       // text is not visible from here).
       String rawText = page.getText();
       if (rawText == null) {
         return;
       }

       // Locale.ROOT keeps the check independent of the default locale; with a
       // Turkish default locale "{PUBLISH}" would otherwise not match.
       if (!rawText.toLowerCase(java.util.Locale.ROOT).contains("{publish}")) {
         return;
       }

       String title = page.getTitle();
       String pageText = cutAtFirstEndMarker(rawText);

       WikinewsWikiModel wikiModel = new WikinewsWikiModel(
           "https://en.wikinews.org/wiki/${image}",
           "https://en.wikinews.org/wiki/${title}");

       AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
       String plainStr = wikiModel.render(converter, pageText);

       CAS articleCAS = UimaUtil.createEmptyCAS(tsDesc);

       // TODO: find a way to nicely add title ..
       // Document layout: <title> \n \n <rendered body>
       StringBuilder articleText = new StringBuilder();
       articleText.append(title);

       int endOffsetTitle = articleText.length();

       articleText.append("\n");
       articleText.append("\n");

       // The converter's annotation offsets are relative to plainStr, so they
       // are shifted by bodyOffset when copied into the CAS below.
       int bodyOffset = articleText.length();

       articleText.append(plainStr);

       articleCAS.setDocumentLanguage("en");
       articleCAS.setDocumentText(articleText.toString());

       // Resolve the annotation types once instead of once per annotation.
       Type headlineType = articleCAS.getTypeSystem()
           .getType("org.apache.opennlp.annotations.Headline");
       Type paragraphType = articleCAS.getTypeSystem()
           .getType("org.apache.opennlp.annotations.Paragraph");
       Type subHeadlineType = articleCAS.getTypeSystem()
           .getType("org.apache.opennlp.annotations.SubHeadline");
       Type wikiLinkType = articleCAS.getTypeSystem()
           .getType("org.apache.opennlp.annotations.WikiLink");
       Feature linkFeature = wikiLinkType.getFeatureByBaseName("link");

       AnnotationFS headlineAnnotation =
           articleCAS.createAnnotation(headlineType, 0, endOffsetTitle);
       articleCAS.addFsToIndexes(headlineAnnotation);

       for (Annotation paraAnn : converter.getParagraphAnnotations()) {
         AnnotationFS paraAnnFS = articleCAS.createAnnotation(paragraphType,
             bodyOffset + paraAnn.begin, bodyOffset + paraAnn.end);

         articleCAS.addFsToIndexes(paraAnnFS);
       }

       for (Annotation subHeadAnn : converter.getHeaderAnnotations()) {
         AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(subHeadlineType,
             bodyOffset + subHeadAnn.begin, bodyOffset + subHeadAnn.end);

         articleCAS.addFsToIndexes(subHeadAnnFS);
       }

       for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) {
         AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(wikiLinkType,
             bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end);

         wikiLinkAnnFS.setStringValue(linkFeature, wikiLinkAnn.value);

         articleCAS.addFsToIndexes(wikiLinkAnnFS);
       }

       // Second view that carries the string form of the page object.
       CAS markupCas = articleCAS.createView("WikiMarkup");
       markupCas.setDocumentText(page.toString());

       // now serialize CAS
       File xmiFile = new File(outputFolder, titleToUri(title) + ".xmi");
       try (OutputStream casOut = new FileOutputStream(xmiFile)) {
         UimaUtil.serializeCASToXmi(articleCAS, casOut);
       }
       catch (IOException e) {
         // Best effort: report which article failed and continue with the next.
         System.err.println("Failed to write article '" + title + "' to "
             + xmiFile.getAbsolutePath() + ":");
         e.printStackTrace();
       }
     }

     /**
      * Returns the given text truncated at the earliest occurrence of any of
      * the end-of-article markers, or the text unchanged if none occurs.
      */
     private String cutAtFirstEndMarker(String text) {
       int cutIndex = text.length();

       for (String endMarker : endOfArticleMarkers) {
         int endMarkerIndex = text.indexOf(endMarker);
         if (endMarkerIndex != -1 && endMarkerIndex < cutIndex) {
           cutIndex = endMarkerIndex;
         }
       }

       return cutIndex < text.length() ? text.substring(0, cutIndex) : text;
     }
   }

   /**
    * Entry point. Expects exactly two arguments: the XML dump file and the
    * output folder, which is created if it does not exist yet.
    *
    * @param args {@code <XML-File> <Output-Folder>}
    * @throws Exception if the type system description cannot be loaded
    */
   public static void main(String[] args) throws Exception {
     if (args.length != 2) {
       System.err.println("Usage: WikinewsConverter <XML-File> <Output-Folder>");
       System.exit(-1);
     }

     // TODO: Should be configurable!
     TypeSystemDescription tsDesc;
     try (FileInputStream typeSystemIn = new FileInputStream("samples/TypeSystem.xml")) {
       tsDesc = UimaUtil.createTypeSystemDescription(typeSystemIn);
     }

     File outputFolder = new File(args[1]);
     // mkdirs() also returns false when the folder already exists, hence the
     // isDirectory() check first.
     if (!outputFolder.isDirectory() && !outputFolder.mkdirs()) {
       System.err.println("Cannot create output folder: "
           + outputFolder.getAbsolutePath());
       System.exit(-1);
     }

     String dumpFilename = args[0];
     try {
       IArticleFilter handler = new CASArticleFilter(tsDesc, outputFolder);
       WikiXMLParser wxp = new WikiXMLParser(dumpFilename, handler);
       wxp.parse();
     } catch (Exception e) {
       System.err.println("Parsing the corpus failed:");
       System.err.println();
       e.printStackTrace();
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.opennlp.wikinews_importer;

	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileOutputStream;
	import java.io.IOException;
	import java.io.OutputStream;
	import java.net.URLEncoder;
	import java.nio.charset.StandardCharsets;
	import java.util.ArrayList;
	import java.util.List;

	import info.bliki.wiki.dump.IArticleFilter;
	import info.bliki.wiki.dump.Siteinfo;
	import info.bliki.wiki.dump.WikiArticle;
	import info.bliki.wiki.dump.WikiXMLParser;

	import org.apache.uima.cas.CAS;
	import org.apache.uima.cas.Feature;
	import org.apache.uima.cas.Type;
	import org.apache.uima.cas.text.AnnotationFS;
	import org.apache.uima.resource.metadata.TypeSystemDescription;
	import org.xml.sax.SAXException;

	/**
	 * Command-line demo which reads a Wikinews XML dump file and writes each
	 * selected article as a UIMA XMI file into an output folder.
	 *
	 * <p>Usage: {@code WikinewsConverter <XML-File> <Output-Folder>}. The UIMA type
	 * system is read from {@code samples/TypeSystem.xml}, resolved relative to the
	 * current working directory.
	 */
	public class WikinewsConverter {

	  /**
	   * Article callback handed to the dump parser. For every page that is in
	   * namespace 0, is reported as a main page by the parser, and whose text
	   * contains <code>{publish}</code> (case-insensitively), it builds a CAS with
	   * the title and the rendered plain text, adds Headline, Paragraph,
	   * SubHeadline and WikiLink annotations, and serializes the CAS to
	   * {@code <outputFolder>/<encoded-title>.xmi}.
	   */
	  static class CASArticleFilter implements IArticleFilter {

	    private final TypeSystemDescription tsDesc;
	    private final File outputFolder;

	    /** Markup fragments at which the page text is cut off before rendering. */
	    private final List<String> endOfArticleMarkers = new ArrayList<>();

	    /**
	     * @param tsDesc type system used to create the CAS of every article
	     * @param outputFolder folder the XMI files are written to
	     */
	    CASArticleFilter(TypeSystemDescription tsDesc, File outputFolder) {

	      this.tsDesc = tsDesc;
	      this.outputFolder = outputFolder;

	      endOfArticleMarkers.add("{{haveyoursay}}");
	      endOfArticleMarkers.add("== Sources ==");
	      endOfArticleMarkers.add("==Sources==");
	      endOfArticleMarkers.add("== Source ==");
	      endOfArticleMarkers.add("==Source==");
	      endOfArticleMarkers.add("==References==");
	      endOfArticleMarkers.add("== References ==");
	      endOfArticleMarkers.add("=== References===");
	    }

	    /**
	     * Converts an article title into a string usable as a file name or URI
	     * segment: blanks become underscores and the result is URL-encoded as
	     * UTF-8.
	     *
	     * @param title the article title
	     * @return the encoded title
	     */
	    public static String titleToUri(String title) {
	      // Plain character replacement; no regular expression is needed here.
	      return URLEncoder.encode(title.replace(' ', '_'), StandardCharsets.UTF_8);
	    }

	    /**
	     * Converts one page of the dump into an XMI file, or ignores it when it
	     * does not match the selection criteria described on this class.
	     *
	     * <p>A failure to write the XMI file is reported on standard error and
	     * does not stop the processing of further pages.
	     *
	     * @param page the page delivered by the dump parser
	     * @param siteinfo site information of the dump (not used)
	     */
	    @Override
	    public void process(WikiArticle page, Siteinfo siteinfo)
	        throws SAXException {

	      if (page.getIntegerNamespace() != 0 || !page.isMain()) {
	        return;
	      }

	      // Defensive: skip a page without text instead of aborting the whole run
	      // with a NullPointerException (whether the parser ever delivers a null
	      // text is not visible from here).
	      String rawText = page.getText();
	      if (rawText == null) {
	        return;
	      }

	      // Locale.ROOT keeps the check independent of the default locale; with a
	      // Turkish default locale "{PUBLISH}" would otherwise not match.
	      if (!rawText.toLowerCase(java.util.Locale.ROOT).contains("{publish}")) {
	        return;
	      }

	      String title = page.getTitle();
	      String pageText = cutAtFirstEndMarker(rawText);

	      WikinewsWikiModel wikiModel = new WikinewsWikiModel(
	          "https://en.wikinews.org/wiki/${image}",
	          "https://en.wikinews.org/wiki/${title}");

	      AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
	      String plainStr = wikiModel.render(converter, pageText);

	      CAS articleCAS = UimaUtil.createEmptyCAS(tsDesc);

	      // TODO: find a way to nicely add title ..
	      // Document layout: <title> \n \n <rendered body>
	      StringBuilder articleText = new StringBuilder();
	      articleText.append(title);

	      int endOffsetTitle = articleText.length();

	      articleText.append("\n");
	      articleText.append("\n");

	      // The converter's annotation offsets are relative to plainStr, so they
	      // are shifted by bodyOffset when copied into the CAS below.
	      int bodyOffset = articleText.length();

	      articleText.append(plainStr);

	      articleCAS.setDocumentLanguage("en");
	      articleCAS.setDocumentText(articleText.toString());

	      // Resolve the annotation types once instead of once per annotation.
	      Type headlineType = articleCAS.getTypeSystem()
	          .getType("org.apache.opennlp.annotations.Headline");
	      Type paragraphType = articleCAS.getTypeSystem()
	          .getType("org.apache.opennlp.annotations.Paragraph");
	      Type subHeadlineType = articleCAS.getTypeSystem()
	          .getType("org.apache.opennlp.annotations.SubHeadline");
	      Type wikiLinkType = articleCAS.getTypeSystem()
	          .getType("org.apache.opennlp.annotations.WikiLink");
	      Feature linkFeature = wikiLinkType.getFeatureByBaseName("link");

	      AnnotationFS headlineAnnotation =
	          articleCAS.createAnnotation(headlineType, 0, endOffsetTitle);
	      articleCAS.addFsToIndexes(headlineAnnotation);

	      for (Annotation paraAnn : converter.getParagraphAnnotations()) {
	        AnnotationFS paraAnnFS = articleCAS.createAnnotation(paragraphType,
	            bodyOffset + paraAnn.begin, bodyOffset + paraAnn.end);

	        articleCAS.addFsToIndexes(paraAnnFS);
	      }

	      for (Annotation subHeadAnn : converter.getHeaderAnnotations()) {
	        AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(subHeadlineType,
	            bodyOffset + subHeadAnn.begin, bodyOffset + subHeadAnn.end);

	        articleCAS.addFsToIndexes(subHeadAnnFS);
	      }

	      for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) {
	        AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(wikiLinkType,
	            bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end);

	        wikiLinkAnnFS.setStringValue(linkFeature, wikiLinkAnn.value);

	        articleCAS.addFsToIndexes(wikiLinkAnnFS);
	      }

	      // Second view that carries the string form of the page object.
	      CAS markupCas = articleCAS.createView("WikiMarkup");
	      markupCas.setDocumentText(page.toString());

	      // now serialize CAS
	      File xmiFile = new File(outputFolder, titleToUri(title) + ".xmi");
	      try (OutputStream casOut = new FileOutputStream(xmiFile)) {
	        UimaUtil.serializeCASToXmi(articleCAS, casOut);
	      }
	      catch (IOException e) {
	        // Best effort: report which article failed and continue with the next.
	        System.err.println("Failed to write article '" + title + "' to "
	            + xmiFile.getAbsolutePath() + ":");
	        e.printStackTrace();
	      }
	    }

	    /**
	     * Returns the given text truncated at the earliest occurrence of any of
	     * the end-of-article markers, or the text unchanged if none occurs.
	     */
	    private String cutAtFirstEndMarker(String text) {
	      int cutIndex = text.length();

	      for (String endMarker : endOfArticleMarkers) {
	        int endMarkerIndex = text.indexOf(endMarker);
	        if (endMarkerIndex != -1 && endMarkerIndex < cutIndex) {
	          cutIndex = endMarkerIndex;
	        }
	      }

	      return cutIndex < text.length() ? text.substring(0, cutIndex) : text;
	    }
	  }

	  /**
	   * Entry point. Expects exactly two arguments: the XML dump file and the
	   * output folder, which is created if it does not exist yet.
	   *
	   * @param args {@code <XML-File> <Output-Folder>}
	   * @throws Exception if the type system description cannot be loaded
	   */
	  public static void main(String[] args) throws Exception {
	    if (args.length != 2) {
	      System.err.println("Usage: WikinewsConverter <XML-File> <Output-Folder>");
	      System.exit(-1);
	    }

	    // TODO: Should be configurable!
	    TypeSystemDescription tsDesc;
	    try (FileInputStream typeSystemIn = new FileInputStream("samples/TypeSystem.xml")) {
	      tsDesc = UimaUtil.createTypeSystemDescription(typeSystemIn);
	    }

	    File outputFolder = new File(args[1]);
	    // mkdirs() also returns false when the folder already exists, hence the
	    // isDirectory() check first.
	    if (!outputFolder.isDirectory() && !outputFolder.mkdirs()) {
	      System.err.println("Cannot create output folder: "
	          + outputFolder.getAbsolutePath());
	      System.exit(-1);
	    }

	    String dumpFilename = args[0];
	    try {
	      IArticleFilter handler = new CASArticleFilter(tsDesc, outputFolder);
	      WikiXMLParser wxp = new WikiXMLParser(dumpFilename, handler);
	      wxp.parse();
	    } catch (Exception e) {
	      System.err.println("Parsing the corpus failed:");
	      System.err.println();
	      e.printStackTrace();
	    }
	  }
	}