| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.tika.handler; |
| |
| import java.io.Writer; |
| |
| import org.apache.tika.sax.ToTextContentHandler; |
| import org.xml.sax.Attributes; |
| import org.xml.sax.SAXException; |
| /** |
| * Small extensions to the default {@link ToTextContentHandler}. This allows |
| * to <ul> |
| * <li>skip ignoreable whitespaces |
| * <li>skip linebreaks within literals |
| * </ul> |
| * |
| * @author Rupert Westenthaler |
| * |
| */ |
| public class PlainTextHandler extends ToTextContentHandler { |
| |
| private static char[] SPACE = new char[]{' '}; |
| |
| |
| private final boolean skipWhitespaces; |
| private final boolean skipLinebreakes; |
| boolean addedText = false; |
| public PlainTextHandler(Writer writer, boolean skipIgnoreableWhitespaces, boolean skipLinebreaksWithinLiterals) { |
| super(writer); |
| this.skipWhitespaces = skipIgnoreableWhitespaces; |
| this.skipLinebreakes = skipLinebreaksWithinLiterals; |
| } |
| |
| @Override |
| public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { |
| if(!skipWhitespaces && addedText){ |
| super.characters(ch, start, length); |
| addedText = false; |
| } //else ignore |
| } |
| @Override |
| public void characters(char[] ch, int start, int length) throws SAXException { |
| if(skipLinebreakes){ |
| int end = start+length; |
| for(int pos = start; pos<end;pos++){ |
| if(ch[pos] == '\n'){ |
| if(pos > start){ |
| super.characters(ch, start, pos-start); |
| super.characters(SPACE, 0, 1); |
| } |
| start = pos+1; |
| length = length-start; |
| } //ignore line breaks |
| } |
| } |
| if(length > 0) { |
| super.characters(ch, start, length); |
| } |
| addedText = true; |
| } |
| |
| @Override |
| public void endElement(String uri, String localName, String qName) throws SAXException { |
| // if(skipLinebreakes & addedText){ |
| // characters(LINEBREAK, 0, 1); |
| // addedText = false; |
| // } |
| super.endElement(uri, localName, qName); |
| } |
| } |