blob: a1b2a6ac10ae2265588ccc613054605f4f27dc65 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cocoon.slop.parsing;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.xml.XMLUtils;
import org.apache.cocoon.slop.interfaces.SlopParser;
import org.apache.cocoon.slop.interfaces.SlopConstants;
/**
 * Simplistic SLOP parser, turns each input line into one XML element.
 *
 * The following constructs are recognized:
 *
 * Field: a line whose text before the first colon consists only of letters
 * (and the configurable extra characters, "-" and "_" by default) is
 * output as an element named after that text, containing the rest of the line.
 *
 * Empty lines (null or whitespace only) are output as empty-line elements.
 *
 * All other lines are output as line elements.
 *
 * Every generated element carries a line number attribute, counted from 1
 * for each document.
 *
 * This is sufficient for basic parsing of RFC 822 headers,
 * but a configurable rfc822 mode would be good to differentiate
 * between the header and body of the email message and parse them
 * with different rules.
 *
 * @author <a href="mailto:bdelacretaz@apache.org">Bertrand Delacretaz</a>
 * @version $Id$
 */
public class SimpleSlopParser implements SlopParser,SlopConstants {
    /** destination of the generated SAX events, null outside of startDocument()/endDocument() */
    private ContentHandler contentHandler;

    /** chars that can be part of a field name (other than letters) */
    private final static String DEFAULT_TAGNAME_CHARS = "-_";
    private String tagnameChars = DEFAULT_TAGNAME_CHARS;

    /** valid characters in an XML element name (in addition to letters and digits) */
    final static String VALID_TAGNAME_CHARS = "_-";

    /** subset of VALID_TAGNAME_CHARS that may also be the first character of an XML element name */
    private final static String VALID_TAGNAME_START_CHARS = "_";

    /** replaces characters which are not acceptable in an element name */
    final static String TAGNAME_REPLACEMENT_CHAR = "_";

    /** error message used when events are requested without an active content handler */
    private final static String NO_HANDLER_MESSAGE =
        "SimpleSlopParser content handler is null (startDocument not called?)";

    /** optionally preserve whitespace in input */
    private boolean preserveSpace = false;

    /** count lines, restarted for every document */
    private int lineCounter;

    /** result of parsing a line: the (filtered) element name and its text contents */
    static class ParsedLine {
        final String name;
        final String contents;

        /**
         * @param elementName raw element name, passed through filterElementName()
         * @param elementContents text to output inside the element
         */
        ParsedLine(String elementName, String elementContents) {
            name = filterElementName(elementName);
            contents = elementContents;
        }
    }

    /**
     * Make sure element names are valid XML.
     *
     * Letters are kept as is. Digits and "-" are kept except in first
     * position, where an XML name does not allow them. "_" is kept everywhere.
     * Any other character is replaced by TAGNAME_REPLACEMENT_CHAR.
     *
     * @param str the candidate element name
     * @return a name of the same length as str, or TAGNAME_REPLACEMENT_CHAR
     *         if str is null or empty (an element name cannot be empty)
     */
    static String filterElementName(String str) {
        if(str == null || str.length() == 0) {
            return TAGNAME_REPLACEMENT_CHAR;
        }

        final StringBuffer sb = new StringBuffer(str.length());
        for(int i=0; i < str.length(); i++) {
            final char c = str.charAt(i);
            final boolean accepted;
            if(Character.isLetter(c)) {
                accepted = true;
            } else if(i == 0) {
                // digits and '-' are valid name characters but cannot start a name
                accepted = VALID_TAGNAME_START_CHARS.indexOf(c) >= 0;
            } else {
                accepted = Character.isDigit(c) || VALID_TAGNAME_CHARS.indexOf(c) >= 0;
            }

            if(accepted) {
                sb.append(c);
            } else {
                sb.append(TAGNAME_REPLACEMENT_CHAR);
            }
        }
        return sb.toString();
    }

    /**
     * Set the list of valid chars for tag names (in addition to letters).
     *
     * @param str the additional characters, trimmed before use;
     *        null restores the default list
     */
    public void setValidTagnameChars(String str) {
        tagnameChars = (str == null ? DEFAULT_TAGNAME_CHARS : str.trim());
    }

    /**
     * Optionally preserve whitespace in input.
     *
     * @param b if false (the default), field contents and plain lines are trimmed
     */
    public void setPreserveWhitespace(boolean b) {
        preserveSpace = b;
    }

    /**
     * Must be called before any call to processLine().
     * Starts the output document, opens the root element and restarts line numbering.
     *
     * @param destination receives the SAX events generated by this parser
     */
    public void startDocument(ContentHandler destination)
        throws SAXException, ProcessingException {
        contentHandler = destination;
        // restart numbering so that a reused parser does not continue
        // counting from the previous document
        lineCounter = 0;
        contentHandler.startDocument();
        contentHandler.startPrefixMapping("", SLOP_NAMESPACE_URI);
        contentHandler.startElement(SLOP_NAMESPACE_URI, SLOP_ROOT_ELEMENT, SLOP_ROOT_ELEMENT, XMLUtils.EMPTY_ATTRIBUTES);
    }

    /**
     * Must be called once all calls to processLine() are done.
     * Closes the root element and the document, then releases the content handler.
     *
     * @throws ProcessingException if no document is currently started
     */
    public void endDocument()
        throws SAXException, ProcessingException {
        if(contentHandler == null) {
            // same diagnostic as processLine() instead of a bare NullPointerException
            throw new ProcessingException(NO_HANDLER_MESSAGE);
        }
        contentHandler.endElement(SLOP_NAMESPACE_URI, SLOP_ROOT_ELEMENT, SLOP_ROOT_ELEMENT);
        contentHandler.endPrefixMapping("");
        contentHandler.endDocument();
        contentHandler = null;
    }

    /** add simple name-value attribute (no namespace) to attr */
    private void setAttribute(AttributesImpl attr,String name,String value) {
        final String ATTR_TYPE = "NMTOKEN";
        attr.addAttribute("",name,name,ATTR_TYPE,value);
    }

    /**
     * Call this to process input lines, does the actual parsing.
     * Generates one element per call, with the line number as an attribute.
     *
     * @param line the input line, may be null or empty
     * @throws ProcessingException if startDocument() has not been called
     */
    public void processLine(String line)
        throws SAXException, ProcessingException {
        if(contentHandler == null) {
            throw new ProcessingException(NO_HANDLER_MESSAGE);
        }

        // find out which element name to use, based on the contents of the line
        final ParsedLine p = parseLine(line);

        // generate the element and its contents
        lineCounter++;
        final AttributesImpl atts = new AttributesImpl();
        setAttribute(atts,SLOP_ATTR_LINENUMBER,String.valueOf(lineCounter));
        contentHandler.startElement(SLOP_NAMESPACE_URI, p.name, p.name, atts);
        contentHandler.characters(p.contents.toCharArray(),0,p.contents.length());
        contentHandler.endElement(SLOP_NAMESPACE_URI, p.name, p.name);
    }

    /**
     * Parse a line, extract element name and contents.
     *
     * @param line the input line, may be null
     * @return the element name and contents to output for this line, never null
     */
    protected ParsedLine parseLine(String line) {
        // empty lines
        if(line == null || line.trim().length() == 0) {
            return new ParsedLine(SLOP_EMPTY_LINE_ELEMENT,"");
        }

        // simple extraction of field names, lines starting with alpha chars followed
        // by a colon are parsed as follows:
        //
        // input:
        //   field-name: this line is a field
        // output:
        //   <field-name>this line is a field</field-name>
        final int colonPos = line.indexOf(':');
        if(colonPos > 0 && isFieldName(line, colonPos)) {
            // substring() yields "" when the colon is the last character
            final String str = line.substring(colonPos + 1);
            final String contents = (preserveSpace ? str : str.trim());
            return new ParsedLine(line.substring(0,colonPos),contents);
        }

        // default: output a line element
        final String str = (preserveSpace ? line : line.trim());
        return new ParsedLine(SLOP_LINE_ELEMENT,str);
    }

    /** true if the first length chars of line are all letters or configured tag name chars */
    private boolean isFieldName(String line, int length) {
        for(int i=0; i < length; i++) {
            final char c = line.charAt(i);
            if(!Character.isLetter(c) && tagnameChars.indexOf(c) < 0) {
                return false;
            }
        }
        return true;
    }
}