| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.cocoon.slop.parsing; |
| |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.AttributesImpl; |
| import org.apache.cocoon.ProcessingException; |
| import org.apache.cocoon.xml.XMLUtils; |
| import org.apache.cocoon.slop.interfaces.SlopParser; |
| import org.apache.cocoon.slop.interfaces.SlopConstants; |
| |
| /** |
| * Simplistic SLOP parser, recognizes the following constructs: |
| * |
| * Field: a line starting with letters and : is considered a field |
| * |
| * Empty lines are detected. |
| * Other lines are output as line elements |
| * |
| * This is sufficient for basic parsing of RFC 822 headers, |
| * but a configurable rfc822 mode would be good to differentiate |
| * between the header and body of the email message and parse them |
| * with different rules. |
| * |
| * @author <a href="mailto:bdelacretaz@apache.org">Bertrand Delacretaz</a> |
| * @version $Id$ |
| */ |
| public class SimpleSlopParser implements SlopParser,SlopConstants { |
| |
| private ContentHandler contentHandler; |
| |
| /** chars that can be part of a field name (other than letters) */ |
| private final static String DEFAULT_TAGNAME_CHARS = "-_"; |
| private String tagnameChars = DEFAULT_TAGNAME_CHARS; |
| |
| /** valid characters in an XML element name (in addition to letters and digits) */ |
| final static String VALID_TAGNAME_CHARS = "_-"; |
| final static String TAGNAME_REPLACEMENT_CHAR = "_"; |
| |
| /** optionally preserve whitespace in input */ |
| private boolean preserveSpace = false; |
| |
| /** count lines */ |
| private int lineCounter; |
| |
| /** result of parsing a line */ |
| static class ParsedLine { |
| final String name; |
| final String contents; |
| |
| ParsedLine(String elementName, String elementContents) { |
| name = filterElementName(elementName); |
| contents = elementContents; |
| } |
| } |
| |
| /** make sure element names are valid XML */ |
| static String filterElementName(String str) { |
| final StringBuffer sb = new StringBuffer(); |
| for(int i=0; i < str.length(); i++) { |
| final char c = str.charAt(i); |
| if(Character.isLetter(c)) { |
| sb.append(c); |
| } else if(Character.isDigit(c) && i > 0) { |
| sb.append(c); |
| } else if(VALID_TAGNAME_CHARS.indexOf(c) >= 0) { |
| sb.append(c); |
| } else { |
| sb.append(TAGNAME_REPLACEMENT_CHAR); |
| } |
| } |
| return sb.toString(); |
| } |
| |
| /** set the list of valid chars for tag names (in addition to letters) */ |
| public void setValidTagnameChars(String str) { |
| tagnameChars = (str == null ? DEFAULT_TAGNAME_CHARS : str.trim()); |
| } |
| |
| /** optionally preserve whitespace in input */ |
| public void setPreserveWhitespace(boolean b) { |
| preserveSpace = b; |
| } |
| |
| /** must be called before any call to processLine() */ |
| public void startDocument(ContentHandler destination) |
| throws SAXException, ProcessingException { |
| contentHandler = destination; |
| contentHandler.startDocument(); |
| contentHandler.startPrefixMapping("", SLOP_NAMESPACE_URI); |
| contentHandler.startElement(SLOP_NAMESPACE_URI, SLOP_ROOT_ELEMENT, SLOP_ROOT_ELEMENT, XMLUtils.EMPTY_ATTRIBUTES); |
| } |
| |
| /** must be called once all calls to processLine() are done */ |
| public void endDocument() |
| throws SAXException, ProcessingException { |
| contentHandler.endElement(SLOP_NAMESPACE_URI, SLOP_ROOT_ELEMENT, SLOP_ROOT_ELEMENT); |
| contentHandler.endPrefixMapping(""); |
| contentHandler.endDocument(); |
| contentHandler = null; |
| } |
| |
| /** add simple name-value attribute to attr */ |
| private void setAttribute(AttributesImpl attr,String name,String value) { |
| final String ATTR_TYPE = "NMTOKEN"; |
| attr.addAttribute("",name,name,ATTR_TYPE,value); |
| } |
| |
| /** call this to process input lines, does the actual parsing */ |
| public void processLine(String line) |
| throws SAXException, ProcessingException { |
| if(contentHandler == null) { |
| throw new ProcessingException("SimpleSlopParser content handler is null (startDocument not called?)"); |
| } |
| |
| // find out which element name to use, based on the contents of the line |
| final ParsedLine p = parseLine(line); |
| |
| // generate the element and its contents |
| lineCounter++; |
| final AttributesImpl atts = new AttributesImpl(); |
| setAttribute(atts,SLOP_ATTR_LINENUMBER,String.valueOf(lineCounter)); |
| contentHandler.startElement(SLOP_NAMESPACE_URI, p.name, p.name, atts); |
| contentHandler.characters(p.contents.toCharArray(),0,p.contents.length()); |
| contentHandler.endElement(SLOP_NAMESPACE_URI, p.name, p.name); |
| } |
| |
| /** parse a line, extract element name and contents */ |
| protected ParsedLine parseLine(String line) { |
| ParsedLine result = null; |
| |
| // empty lines |
| if(line == null || line.trim().length()==0) { |
| result = new ParsedLine(SLOP_EMPTY_LINE_ELEMENT,""); |
| } |
| |
| // simple extraction of field names, lines starting with alpha chars followed |
| // by a colon are parsed as follows: |
| // |
| // input: |
| // field-name: this line is a field |
| // output: |
| // <field-name>this line is a field</field-name> |
| if(result == null) { |
| final int colonPos = line.indexOf(':'); |
| if(colonPos > 0) { |
| boolean fieldFound = true; |
| for(int i=0; i < colonPos; i++) { |
| final char c = line.charAt(i); |
| final boolean isFieldChar = Character.isLetter(c) || tagnameChars.indexOf(c) >= 0; |
| if(!isFieldChar) { |
| fieldFound = false; |
| break; |
| } |
| } |
| |
| if(fieldFound) { |
| String contents = ""; |
| if(line.length() > colonPos + 1) { |
| final String str = line.substring(colonPos+1); |
| contents = (preserveSpace ? str : str.trim()); |
| } |
| result = new ParsedLine(line.substring(0,colonPos),contents); |
| } |
| } |
| } |
| |
| // default: output a line element |
| if(result == null) { |
| final String str = (preserveSpace ? line : line.trim()); |
| result = new ParsedLine(SLOP_LINE_ELEMENT,str); |
| } |
| |
| return result; |
| } |
| } |