| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.wordperfect; |
| |
| import java.io.IOException; |
| import java.util.HashMap; |
| import java.util.Map; |
| |
| import org.apache.commons.collections4.MapUtils; |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.sax.XHTMLContentHandler; |
| |
| /** |
| * Extracts WordPerfect Document Area text from a WordPerfect document |
| * version 5.x. |
| * |
| * @author Pascal Essiembre |
| */ |
| class WP5DocumentAreaExtractor extends WPDocumentAreaExtractor { |
| |
| /* 192-207 characters represent fixed-length multi-byte functions. |
| * Those that are not handled explicitely in the code below should be |
| * skipped according to their size (minus the first char if already read). |
| */ |
| private static final Map<Integer, Integer> FIXED_LENGTH_FUNCTION_SIZES = |
| MapUtils.putAll(new HashMap<>(), |
| new Integer[]{192, 4, // Extended character |
| 193, 9, // Center/Align/ Tab/Left Margin Release |
| 194, 11, // Indent |
| 195, 3, // Attribute ON |
| 196, 3, // Attribute OFF |
| 197, 5, // Block Protect |
| 198, 6, // End of Indent |
| 199, 7, // Different Display Character when Hyphenated |
| 200, 4, // (Reserved) |
| 201, 5, // (Reserved) |
| 202, 6, // (Reserved) |
| 203, 6, // (Reserved) |
| 204, 8, // (Reserved) |
| 205, 10, // (Reserved) |
| 206, 10, // (Reserved) |
| 207, 12, // (Reserved) |
| }); |
| |
| protected void extract(int c, WPInputStream in, StringBuilder out, XHTMLContentHandler xhtml) |
| throws IOException, SAXException { |
| |
| // 0-31: control characters |
| if (c == 10) { |
| endParagraph(out, xhtml);// hard return ("Enter") |
| } else if (c == 11) { |
| out.append(' '); // soft page break |
| } else if (c == 12) { |
| endParagraph(out, xhtml);// hard page break |
| } else if (c == 13) { |
| out.append(' '); // soft return (line wrap) |
| |
| // 32-126: ASCII characters |
| } else if (c >= 32 && c <= 126) { |
| out.append((char) c); // ASCII character |
| |
| // 128-191: single-byte functions |
| } else if (c == 140) { |
| endParagraph(out, xhtml);// combination hard return/soft page (WP5.1) |
| } else if (c >= 144 && c <= 149) { |
| out.append(' '); // deletable/invisible soft return/page |
| } else if (c == 153) { |
| endParagraph(out, xhtml);// Dormant Hard return (WP5.1) |
| } else if (c == 160) { |
| out.append('\u00A0'); // Hard space |
| } else if (c >= 169 && c <= 171) { |
| out.append('-'); // Hard hyphen |
| |
| // 192-207: fixed-length multi-byte function |
| } else if (c == 192) { |
| // extended char |
| int charval = in.readWP(); |
| int charset = in.readWP(); |
| in.readWP(); // closing character |
| WP5Charsets.append(out, charset, charval); |
| } else if (c >= 193 && c <= 207) { |
| // removing 1 from function length since first char already read |
| in.skipWPByte(FIXED_LENGTH_FUNCTION_SIZES.get(c) - 1); |
| |
| // 208-255: variable-length multi-byte function |
| } else if (c >= 208 && c <= 255) { |
| // Variable-Length Multi-Byte Functions |
| in.readWP(); // subgroup (the function code) |
| int functionSize = in.readWPShort(); |
| for (int i = 0; i < functionSize; i++) { |
| in.readWP(); |
| } |
| //TODO Are there functions containing data? Like footnotes? |
| } |
| } |
| } |