opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocumentStream.java - opennlp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.formats.masc;

 import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileFilter;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Stack;
 import javax.xml.parsers.SAXParser;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;

 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.XmlUtil;

 public class MascDocumentStream implements ObjectStream<MascDocument> {

   private static final Logger logger = LoggerFactory.getLogger(MascDocumentStream.class);
   /**
    * A helper class to parse the header (.hdr) files.
    */
   private static class HeaderHandler extends DefaultHandler {
     private HashMap<String, String> annotationFiles = null;
     private String file = null;
     private String fType = null;

     protected HashMap<String, String> getPathList() {
       return annotationFiles;
     }

     @Override
     public void startElement(String uri, String localName, String qName, Attributes attributes)
         throws SAXException {

       // create a new annotation file and put it in map
       // initialize File object and set path attribute
       if (qName.equalsIgnoreCase("annotation") ||
           qName.equalsIgnoreCase("primaryData")) {
         file = attributes.getValue("loc");
         fType = attributes.getValue("f.id");

         // initialize list
         if (annotationFiles == null) {
           annotationFiles = new HashMap<>();
         }
       }

     }

     @Override
     public void endElement(String uri, String localName, String qName) throws SAXException {

       // add annotation object to list
       if (qName.equalsIgnoreCase("annotation") ||
           qName.equalsIgnoreCase("primaryData")) {
         annotationFiles.put(fType, file);
       }

     }

   }
   private final List<MascDocument> documents = new LinkedList<>();
   private Iterator<MascDocument> documentIterator;
   private final SAXParser saxParser;

   public MascDocumentStream(File mascCorpusDirectory) throws IOException {
     this(mascCorpusDirectory, true, pathname -> pathname.getName().contains(""));
   }

   /**
    * Creates a MascDocumentStream to read the documents from a given directory.
    * Works iff all annotation files mentioned in the headers are present.
    *
    * @param mascCorpusDirectory the directory containing all the MASC files
    * @param searchRecursive     whether the search should go through subdirectories
    * @param fileFilter          a custom file filter to filter out some files or
    *                            null to accept anything
    * @throws IOException if any stage of the stream creation fails
    */
   public MascDocumentStream(File mascCorpusDirectory,
                             boolean searchRecursive, FileFilter fileFilter) throws IOException {

     saxParser = XmlUtil.createSaxParser();

     if (!mascCorpusDirectory.isDirectory()) {
       throw new IOException("Input corpus directory must be a directory " +
           "according to File.isDirectory()!");
     }

     int failedLoads = 0;
     Stack<File> directoryStack = new Stack<>();
     directoryStack.add(mascCorpusDirectory);

     while (!directoryStack.isEmpty()) {
       for (File file : directoryStack.pop().listFiles(fileFilter)) {
         if (file.isFile()) {
           String hdrFilePath = file.getAbsolutePath();

           // look for the header files
           if (hdrFilePath.endsWith(".hdr")) {

             HashMap<String, File> fileGroup = checkAnnotations(hdrFilePath);
             InputStream f_primary = new BufferedInputStream(
                 new FileInputStream(fileGroup.get("f.text")));
             InputStream f_seg = (fileGroup.containsKey("f.seg")) ?
                 new BufferedInputStream(new FileInputStream(fileGroup.get("f.seg"))) : null;
             InputStream f_penn = (fileGroup.containsKey("f.penn")) ?
                 new BufferedInputStream(new FileInputStream(fileGroup.get("f.penn"))) : null;
             InputStream f_s = (fileGroup.containsKey("f.s")) ?
                 new BufferedInputStream(new FileInputStream(fileGroup.get("f.s"))) : null;
             InputStream f_ne = (fileGroup.containsKey("f.ne")) ?
                 new BufferedInputStream(new FileInputStream(fileGroup.get("f.ne"))) : null;

             try {
               documents.add(MascDocument.parseDocument(hdrFilePath, f_primary, f_seg,
                   f_penn, f_s, f_ne));
             } catch (IOException e) {
               logger.error("Failed to parse the file: {}", hdrFilePath, e);
               failedLoads++;
             }
           }

         } else if (searchRecursive && file.isDirectory()) {
           directoryStack.push(file);
         }
       }
     }

     logger.info("Documents loaded: {}", documents.size());
     if (failedLoads > 0) {
       logger.info("Failed loading {} documents.", failedLoads);
     }
     reset();

   }

   /**
    * Check that all annotation files mentioned in the header are present
    *
    * @param path The path to header
    * @throws IOException If corpus integrity is violated
    */
   private HashMap<String, File> checkAnnotations(String path) throws IOException {
     HeaderHandler handler = new HeaderHandler();
     HashMap<String, File> fileGroup = new HashMap<>();
     File hdrFile = new File(path);
     try {
       saxParser.parse(hdrFile, handler);
     } catch (SAXException e) {
       throw new IOException("Invalid corpus format. " +
           "Could not parse the header: " + path);
     }
     HashMap<String, String> annotationFiles = handler.getPathList();

     String pathToFolder = hdrFile.getParentFile().getAbsolutePath();
     for (Map.Entry<String, String> annotation : annotationFiles.entrySet()) {
       File file = new File(pathToFolder, annotation.getValue());
       if (!(file.isFile() && file.exists())) {
         throw new IOException("Corpus integrity violated. " +
             "Annotation file " + file.getAbsolutePath() + " is missing.");
       }

       fileGroup.put(annotation.getKey(), file);

     }

     return fileGroup;

   }

   /**
    * Reset the reading of all documents to the first sentence.
    * Reset the corpus to the first document.
    */
   @Override
   public void reset() {
     for (MascDocument doc : documents) {
       doc.reset();
     }
     documentIterator = documents.iterator();
   }

   /**
    * Return the next document. Client needs to check if this document has the necessary annotations.
    *
    * @return A corpus document with all its annotations.
    * @throws IOException if anything goes wrong.
    */
   @Override
   public MascDocument read() throws IOException {

     MascDocument doc = null;

     if (documentIterator.hasNext()) {
       doc = documentIterator.next();
     }

     return doc;
   }

   /**
    * Remove the corpus from the memory.
    */
   @Override
   public void close() {
     documentIterator = null;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.formats.masc;

	import java.io.BufferedInputStream;
	import java.io.File;
	import java.io.FileFilter;
	import java.io.FileInputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.util.HashMap;
	import java.util.Iterator;
	import java.util.LinkedList;
	import java.util.List;
	import java.util.Map;
	import java.util.Stack;
	import javax.xml.parsers.SAXParser;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.xml.sax.Attributes;
	import org.xml.sax.SAXException;
	import org.xml.sax.helpers.DefaultHandler;

	import opennlp.tools.util.ObjectStream;
	import opennlp.tools.util.XmlUtil;

	public class MascDocumentStream implements ObjectStream<MascDocument> {

	private static final Logger logger = LoggerFactory.getLogger(MascDocumentStream.class);
	/**
	* A helper class to parse the header (.hdr) files.
	*/
	private static class HeaderHandler extends DefaultHandler {
	private HashMap<String, String> annotationFiles = null;
	private String file = null;
	private String fType = null;

	protected HashMap<String, String> getPathList() {
	return annotationFiles;
	}

	@Override
	public void startElement(String uri, String localName, String qName, Attributes attributes)
	throws SAXException {

	// create a new annotation file and put it in map
	// initialize File object and set path attribute
	if (qName.equalsIgnoreCase("annotation") \|\|
	qName.equalsIgnoreCase("primaryData")) {
	file = attributes.getValue("loc");
	fType = attributes.getValue("f.id");

	// initialize list
	if (annotationFiles == null) {
	annotationFiles = new HashMap<>();
	}
	}

	}

	@Override
	public void endElement(String uri, String localName, String qName) throws SAXException {

	// add annotation object to list
	if (qName.equalsIgnoreCase("annotation") \|\|
	qName.equalsIgnoreCase("primaryData")) {
	annotationFiles.put(fType, file);
	}

	}

	}
	private final List<MascDocument> documents = new LinkedList<>();
	private Iterator<MascDocument> documentIterator;
	private final SAXParser saxParser;

	public MascDocumentStream(File mascCorpusDirectory) throws IOException {
	this(mascCorpusDirectory, true, pathname -> pathname.getName().contains(""));
	}

	/**
	* Creates a MascDocumentStream to read the documents from a given directory.
	* Works iff all annotation files mentioned in the headers are present.
	*
	* @param mascCorpusDirectory the directory containing all the MASC files
	* @param searchRecursive whether the search should go through subdirectories
	* @param fileFilter a custom file filter to filter out some files or
	* null to accept anything
	* @throws IOException if any stage of the stream creation fails
	*/
	public MascDocumentStream(File mascCorpusDirectory,
	boolean searchRecursive, FileFilter fileFilter) throws IOException {

	saxParser = XmlUtil.createSaxParser();

	if (!mascCorpusDirectory.isDirectory()) {
	throw new IOException("Input corpus directory must be a directory " +
	"according to File.isDirectory()!");
	}

	int failedLoads = 0;
	Stack<File> directoryStack = new Stack<>();
	directoryStack.add(mascCorpusDirectory);

	while (!directoryStack.isEmpty()) {
	for (File file : directoryStack.pop().listFiles(fileFilter)) {
	if (file.isFile()) {
	String hdrFilePath = file.getAbsolutePath();

	// look for the header files
	if (hdrFilePath.endsWith(".hdr")) {

	HashMap<String, File> fileGroup = checkAnnotations(hdrFilePath);
	InputStream f_primary = new BufferedInputStream(
	new FileInputStream(fileGroup.get("f.text")));
	InputStream f_seg = (fileGroup.containsKey("f.seg")) ?
	new BufferedInputStream(new FileInputStream(fileGroup.get("f.seg"))) : null;
	InputStream f_penn = (fileGroup.containsKey("f.penn")) ?
	new BufferedInputStream(new FileInputStream(fileGroup.get("f.penn"))) : null;
	InputStream f_s = (fileGroup.containsKey("f.s")) ?
	new BufferedInputStream(new FileInputStream(fileGroup.get("f.s"))) : null;
	InputStream f_ne = (fileGroup.containsKey("f.ne")) ?
	new BufferedInputStream(new FileInputStream(fileGroup.get("f.ne"))) : null;

	try {
	documents.add(MascDocument.parseDocument(hdrFilePath, f_primary, f_seg,
	f_penn, f_s, f_ne));
	} catch (IOException e) {
	logger.error("Failed to parse the file: {}", hdrFilePath, e);
	failedLoads++;
	}
	}

	} else if (searchRecursive && file.isDirectory()) {
	directoryStack.push(file);
	}
	}
	}

	logger.info("Documents loaded: {}", documents.size());
	if (failedLoads > 0) {
	logger.info("Failed loading {} documents.", failedLoads);
	}
	reset();

	}

	/**
	* Check that all annotation files mentioned in the header are present
	*
	* @param path The path to header
	* @throws IOException If corpus integrity is violated
	*/
	private HashMap<String, File> checkAnnotations(String path) throws IOException {
	HeaderHandler handler = new HeaderHandler();
	HashMap<String, File> fileGroup = new HashMap<>();
	File hdrFile = new File(path);
	try {
	saxParser.parse(hdrFile, handler);
	} catch (SAXException e) {
	throw new IOException("Invalid corpus format. " +
	"Could not parse the header: " + path);
	}
	HashMap<String, String> annotationFiles = handler.getPathList();

	String pathToFolder = hdrFile.getParentFile().getAbsolutePath();
	for (Map.Entry<String, String> annotation : annotationFiles.entrySet()) {
	File file = new File(pathToFolder, annotation.getValue());
	if (!(file.isFile() && file.exists())) {
	throw new IOException("Corpus integrity violated. " +
	"Annotation file " + file.getAbsolutePath() + " is missing.");
	}

	fileGroup.put(annotation.getKey(), file);

	}

	return fileGroup;

	}

	/**
	* Reset the reading of all documents to the first sentence.
	* Reset the corpus to the first document.
	*/
	@Override
	public void reset() {
	for (MascDocument doc : documents) {
	doc.reset();
	}
	documentIterator = documents.iterator();
	}

	/**
	* Return the next document. Client needs to check if this document has the necessary annotations.
	*
	* @return A corpus document with all its annotations.
	* @throws IOException if anything goes wrong.
	*/
	@Override
	public MascDocument read() throws IOException {

	MascDocument doc = null;

	if (documentIterator.hasNext()) {
	doc = documentIterator.next();
	}

	return doc;
	}

	/**
	* Remove the corpus from the memory.
	*/
	@Override
	public void close() {
	documentIterator = null;
	}

	}