opennlp-tools/src/main/java/opennlp/tools/formats/muc/SgmlParser.java - opennlp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.formats.muc;

 import java.io.IOException;
 import java.io.Reader;
 import java.util.HashMap;
 import java.util.Map;

 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.StringUtil;

 /**
  * SAX style SGML parser.
  * <p>
  * Note:<br>
  * The implementation is very limited, but good enough to
  * parse the MUC corpora. Its must very likely be extended/improved/fixed to parse
  * a different SGML corpora.
  */
 public class SgmlParser {

   public static abstract class ContentHandler {

     public void startElement(String name, Map<String, String> attributes) throws InvalidFormatException {
     }

     public void characters(CharSequence chars) throws InvalidFormatException{
     }

     public void endElement(String name) throws InvalidFormatException {
     }
   }

   private static String extractTagName(CharSequence tagChars) throws InvalidFormatException {

     int fromOffset = 1;

     if (tagChars.length() > 1 && tagChars.charAt(1) == '/') {
       fromOffset = 2;
     }

     for (int ci = 1; ci < tagChars.length(); ci++) {

       if (tagChars.charAt(ci) == '>' || StringUtil.isWhitespace(tagChars.charAt(ci))) {
         return tagChars.subSequence(fromOffset, ci).toString();
       }
     }

     throw new InvalidFormatException("Failed to extract tag name!");
   }

   private static Map<String, String> getAttributes(CharSequence tagChars) {

     // format:
     // space
     // key
     // =
     // " <- begin
     // value chars
     // " <- end

     Map<String, String> attributes = new HashMap<>();

     StringBuilder key = new StringBuilder();
     StringBuilder value = new StringBuilder();

     boolean extractKey = false;
     boolean extractValue = false;

     for (int i = 0; i < tagChars.length(); i++) {

       // White space indicates begin of new key name
       if (StringUtil.isWhitespace(tagChars.charAt(i)) && !extractValue) {
         extractKey = true;
       }
       // Equals sign indicated end of key name
       else if (extractKey && ('=' == tagChars.charAt(i) || StringUtil.isWhitespace(tagChars.charAt(i)))) {
         extractKey = false;
       }
       // Inside key name, extract all chars
       else if (extractKey) {
         key.append(tagChars.charAt(i));
       }
       // " Indicates begin or end of value chars
       else if ('"' == tagChars.charAt(i)) {

         if (extractValue) {
           attributes.put(key.toString(), value.toString());

           // clear key and value buffers
           key.setLength(0);
           value.setLength(0);
         }

         extractValue = !extractValue;
       }
       // Inside value, extract all chars
       else if (extractValue) {
         value.append(tagChars.charAt(i));
       }
     }

     return attributes;
   }

   public void parse(Reader in, ContentHandler handler) throws IOException {

     StringBuilder buffer = new StringBuilder();

     boolean isInsideTag = false;
     boolean isStartTag = true;

     int lastChar = -1;
     int c;
     while ((c = in.read()) != -1) {

       if ('<' == c) {
         if (isInsideTag) {
           throw new InvalidFormatException("Did not expect < char!");
         }

         if (buffer.toString().trim().length() > 0) {
           handler.characters(buffer.toString().trim());
         }

         buffer.setLength(0);

         isInsideTag = true;
         isStartTag = true;
       }

       buffer.appendCodePoint(c);

       if ('/' == c && lastChar == '<') {
         isStartTag = false;
       }

       if ('>' == c) {

         if (!isInsideTag) {
           throw new InvalidFormatException("Did not expect > char!");
         }

         if (isStartTag) {
           handler.startElement(extractTagName(buffer), getAttributes(buffer));
         }
         else {
           handler.endElement(extractTagName(buffer));
         }

         buffer.setLength(0);

         isInsideTag = false;
       }

       lastChar = c;
     }

     if (isInsideTag) {
       throw new InvalidFormatException("Did not find matching > char!");
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.formats.muc;

	import java.io.IOException;
	import java.io.Reader;
	import java.util.HashMap;
	import java.util.Map;

	import opennlp.tools.util.InvalidFormatException;
	import opennlp.tools.util.StringUtil;

	/**
	* SAX style SGML parser.
	* <p>
	* Note:<br>
	* The implementation is very limited, but good enough to
	* parse the MUC corpora. Its must very likely be extended/improved/fixed to parse
	* a different SGML corpora.
	*/
	public class SgmlParser {

	public static abstract class ContentHandler {

	public void startElement(String name, Map<String, String> attributes) throws InvalidFormatException {
	}

	public void characters(CharSequence chars) throws InvalidFormatException{
	}

	public void endElement(String name) throws InvalidFormatException {
	}
	}

	private static String extractTagName(CharSequence tagChars) throws InvalidFormatException {

	int fromOffset = 1;

	if (tagChars.length() > 1 && tagChars.charAt(1) == '/') {
	fromOffset = 2;
	}

	for (int ci = 1; ci < tagChars.length(); ci++) {

	if (tagChars.charAt(ci) == '>' \|\| StringUtil.isWhitespace(tagChars.charAt(ci))) {
	return tagChars.subSequence(fromOffset, ci).toString();
	}
	}

	throw new InvalidFormatException("Failed to extract tag name!");
	}

	private static Map<String, String> getAttributes(CharSequence tagChars) {

	// format:
	// space
	// key
	// =
	// " <- begin
	// value chars
	// " <- end

	Map<String, String> attributes = new HashMap<>();

	StringBuilder key = new StringBuilder();
	StringBuilder value = new StringBuilder();

	boolean extractKey = false;
	boolean extractValue = false;

	for (int i = 0; i < tagChars.length(); i++) {

	// White space indicates begin of new key name
	if (StringUtil.isWhitespace(tagChars.charAt(i)) && !extractValue) {
	extractKey = true;
	}
	// Equals sign indicated end of key name
	else if (extractKey && ('=' == tagChars.charAt(i) \|\| StringUtil.isWhitespace(tagChars.charAt(i)))) {
	extractKey = false;
	}
	// Inside key name, extract all chars
	else if (extractKey) {
	key.append(tagChars.charAt(i));
	}
	// " Indicates begin or end of value chars
	else if ('"' == tagChars.charAt(i)) {

	if (extractValue) {
	attributes.put(key.toString(), value.toString());

	// clear key and value buffers
	key.setLength(0);
	value.setLength(0);
	}

	extractValue = !extractValue;
	}
	// Inside value, extract all chars
	else if (extractValue) {
	value.append(tagChars.charAt(i));
	}
	}

	return attributes;
	}

	public void parse(Reader in, ContentHandler handler) throws IOException {

	StringBuilder buffer = new StringBuilder();

	boolean isInsideTag = false;
	boolean isStartTag = true;

	int lastChar = -1;
	int c;
	while ((c = in.read()) != -1) {

	if ('<' == c) {
	if (isInsideTag) {
	throw new InvalidFormatException("Did not expect < char!");
	}

	if (buffer.toString().trim().length() > 0) {
	handler.characters(buffer.toString().trim());
	}

	buffer.setLength(0);

	isInsideTag = true;
	isStartTag = true;
	}

	buffer.appendCodePoint(c);

	if ('/' == c && lastChar == '<') {
	isStartTag = false;
	}

	if ('>' == c) {

	if (!isInsideTag) {
	throw new InvalidFormatException("Did not expect > char!");
	}

	if (isStartTag) {
	handler.startElement(extractTagName(buffer), getAttributes(buffer));
	}
	else {
	handler.endElement(extractTagName(buffer));
	}

	buffer.setLength(0);

	isInsideTag = false;
	}

	lastChar = c;
	}

	if (isInsideTag) {
	throw new InvalidFormatException("Did not find matching > char!");
	}
	}
	}