blob: 19ef7e32e048d7ba45a7129ee60b6d0669f040dc [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.tika.handler;
import java.io.Writer;
import org.apache.tika.sax.ToTextContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
/**
* Small extension to the default {@link ToTextContentHandler}. It can
* optionally <ul>
* <li>skip ignorable whitespace
* <li>skip line breaks within literals
* </ul>
*
* @author Rupert Westenthaler
*
*/
public class PlainTextHandler extends ToTextContentHandler {

    /** Single space emitted in place of a line break that directly follows text. */
    private static final char[] SPACE = new char[]{' '};

    /** When {@code true}, ignorable whitespace events are dropped. */
    private final boolean skipWhitespaces;

    /** When {@code true}, {@code '\n'} characters in character data are not forwarded. */
    private final boolean skipLinebreaks;

    /**
     * Set to {@code true} by every {@link #characters(char[], int, int)} call and
     * reset to {@code false} once ignorable whitespace has been forwarded, so that
     * consecutive ignorable whitespace events are forwarded at most once.
     */
    boolean addedText = false;

    /**
     * Creates a handler that forwards text events to the parent handler
     * constructed with the given writer.
     *
     * @param writer the writer handed to the parent {@link ToTextContentHandler}
     * @param skipIgnoreableWhitespaces if {@code true}, ignorable whitespace
     *        events are dropped
     * @param skipLinebreaksWithinLiterals if {@code true}, {@code '\n'}
     *        characters within character data are not forwarded
     */
    public PlainTextHandler(Writer writer, boolean skipIgnoreableWhitespaces, boolean skipLinebreaksWithinLiterals) {
        super(writer);
        this.skipWhitespaces = skipIgnoreableWhitespaces;
        this.skipLinebreaks = skipLinebreaksWithinLiterals;
    }

    /**
     * Forwards ignorable whitespace to the parent handler only if whitespace
     * skipping is disabled and text was added since the last forwarded
     * whitespace; otherwise the event is ignored.
     *
     * @param ch the character buffer
     * @param start index of the first whitespace character in {@code ch}
     * @param length number of whitespace characters
     * @throws SAXException if the parent handler fails
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        if (!skipWhitespaces && addedText) {
            // Call the parent directly: the whitespace must not pass through
            // the line-break filtering of this class' characters(..) method.
            super.characters(ch, start, length);
            addedText = false;
        } // else ignore
    }

    /**
     * Forwards character data to the parent handler. If line-break skipping is
     * enabled, the data is split at every {@code '\n'}: each non-empty segment
     * that is terminated by a line break is forwarded followed by a single
     * space, the line break itself is dropped, and the remaining text after the
     * last line break is forwarded unchanged. A line break that is not preceded
     * by text within the same call is dropped without emitting a space.
     *
     * @param ch the character buffer
     * @param start index of the first character in {@code ch}
     * @param length number of characters to process
     * @throws SAXException if the parent handler fails
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (skipLinebreaks) {
            final int end = start + length;
            // Start of the segment that has not been forwarded yet.
            int segmentStart = start;
            for (int pos = start; pos < end; pos++) {
                if (ch[pos] == '\n') {
                    if (pos > segmentStart) {
                        super.characters(ch, segmentStart, pos - segmentStart);
                        super.characters(SPACE, 0, 1);
                    }
                    segmentStart = pos + 1; // skip the line break itself
                }
            }
            // The remaining length is measured against the end of the range,
            // not against the original length: otherwise text after a second
            // line break, or any text when start > 0, would be lost.
            if (end > segmentStart) {
                super.characters(ch, segmentStart, end - segmentStart);
            }
        } else if (length > 0) {
            super.characters(ch, start, length);
        }
        addedText = true;
    }

    /**
     * Delegates to the parent handler without additional processing.
     *
     * @param uri the namespace URI of the element
     * @param localName the local name of the element
     * @param qName the qualified name of the element
     * @throws SAXException if the parent handler fails
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        super.endElement(uri, localName, qName);
    }
}