| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.wordperfect; |
| |
| import java.io.IOException; |
| |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.sax.XHTMLContentHandler; |
| |
| /** |
| * Extracts WordPerfect Document Area text from a WordPerfect document. |
| * |
| * @author Pascal Essiembre |
| */ |
| abstract class WPDocumentAreaExtractor { |
| |
| boolean startedP = false; |
| |
| public void extract(WPInputStream in, XHTMLContentHandler xhtml) |
| throws IOException, SAXException { |
| int chunk = 4096; |
| StringBuilder out = new StringBuilder(chunk); |
| int c; |
| while ((c = in.read()) != -1) { |
| extract(c, in, out, xhtml); |
| if (out.length() >= chunk) { |
| xhtml.characters(out.toString()); |
| out.setLength(0); |
| } |
| } |
| endParagraph(out, xhtml); |
| } |
| |
| protected abstract void extract(int c, WPInputStream in, StringBuilder out, |
| XHTMLContentHandler xhtml) throws IOException, SAXException; |
| |
| |
| protected void lazilyStartParagraph(XHTMLContentHandler xhtml) throws SAXException { |
| if (!startedP) { |
| xhtml.startElement("p"); |
| } |
| startedP = true; |
| } |
| |
| /** |
| * This assumes that the <p> was started before you get here. |
| * And the user is required to close the <p> at the end of the document. |
| * <p> |
| * These are currently handled by {@link #extract(WPInputStream, XHTMLContentHandler)}. |
| * |
| * @param xhtml |
| * @throws SAXException |
| */ |
| protected void endParagraph(StringBuilder buffer, XHTMLContentHandler xhtml) |
| throws SAXException { |
| lazilyStartParagraph(xhtml); |
| |
| xhtml.characters(buffer.toString()); |
| buffer.setLength(0); |
| xhtml.endElement("p"); |
| startedP = false; |
| } |
| |
| // Skips until the given character is encountered. |
| protected int skipUntilChar(WPInputStream in, int targetChar) throws IOException { |
| int count = 0; |
| int c; |
| while ((c = in.read()) != -1) { |
| count++; |
| if (c == targetChar) { |
| return count; |
| } |
| } |
| return count; |
| } |
| } |