/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.wordperfect;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.collections4.MapUtils;
import org.xml.sax.SAXException;
import org.apache.tika.sax.XHTMLContentHandler;
/**
 * Extracts the text of the WordPerfect Document Area from documents in
 * WordPerfect format version 6 and later.
 * <p>
 * The extractor is fed one function/character code at a time through
 * {@link #extract(int, WPInputStream, StringBuilder, XHTMLContentHandler)}
 * and consumes any additional bytes belonging to that code from the stream.
 *
 * @author Pascal Essiembre
 */
class WP6DocumentAreaExtractor extends WPDocumentAreaExtractor {

    /*
     * Possible values of the second byte of the undo/insert function (241).
     * Only the two UNDO markers are acted upon by this class.
     */
    private static final byte START_UNDO = 0;
    private static final byte END_UNDO = 1;
    private static final byte START_INSERT = 3;
    private static final byte END_INSERT = 4;

    /*
     * Codes 240-254 are fixed-length multi-byte functions. This table maps
     * each code to its total size in bytes. Functions that are not handled
     * explicitly below are skipped according to that size, minus whatever
     * bytes have already been read.
     */
    private static final Map<Integer, Integer> FIXED_LENGTH_FUNCTION_SIZES =
            MapUtils.putAll(new HashMap<>(),
                    new Integer[]{
                            240, 4, // Extended Character
                            241, 5, // Undo
                            242, 3, // Attribute On
                            243, 3, // Attribute Off
                            244, 3, // (Reserved)
                            245, 3, // (Reserved)
                            246, 4, // (Reserved)
                            247, 4, // (Reserved)
                            248, 4, // (Reserved)
                            249, 5, // (Reserved)
                            250, 5, // (Reserved)
                            251, 6, // (Reserved)
                            252, 6, // (Reserved)
                            253, 8, // (Reserved)
                            254, 8, // (Reserved)
                    });

    /** When false, everything between a start-undo and an end-undo marker is dropped. */
    private final boolean includeDeletedContent;

    /** True while the stream position is between a start-undo and an end-undo marker. */
    private boolean inUndo = false;

    /**
     * @param includeDeletedContent whether content enclosed in undo markers
     *                              should be part of the extracted text
     */
    public WP6DocumentAreaExtractor(boolean includeDeletedContent) {
        this.includeDeletedContent = includeDeletedContent;
    }

    /**
     * Processes a single code read from the document area.
     *
     * @param c     the code that was just read
     * @param in    stream from which the remaining bytes of a multi-byte
     *              function are consumed
     * @param out   buffer receiving the extracted characters
     * @param xhtml handler passed on to {@code endParagraph}
     * @throws IOException  if reading from the stream fails
     * @throws SAXException if closing a paragraph fails
     */
    protected void extract(int c, WPInputStream in, StringBuilder out, XHTMLContentHandler xhtml)
            throws IOException, SAXException {
        // Undo handling has to come first: while deleted content is being
        // suppressed, only the undo function itself (241) may get through.
        if (c != 241 && inUndo && !includeDeletedContent) {
            return;
        }

        // Codes that are matched by their exact value.
        switch (c) {
            case 241: // undo/insert marker
                handleUndoFunction(c, in);
                return;
            case 128: // soft space
                out.append(' ');
                return;
            case 129: // hard space
                out.append('\u00A0');
                return;
            case 132: // hard hyphen
                out.append('-');
                return;
            case 135:
            case 137: // dormant hard return
                endParagraph(out, xhtml);
                return;
            case 138: // skip to the closing pair surrounding a page number
                skipUntilChar(in, 139);
                return;
            case 198: // end of cell
                out.append('\t');
                return;
            case 240: // extended character
                appendExtendedChar(in, out);
                return;
            case 255:
                // Should not be used, so this should never be reached. Kept
                // in case a future version of the format uses it.
                skipUntilChar(in, c);
                return;
            default:
                break;
        }

        // Codes that are matched by range.
        if (c >= 1 && c <= 32) {
            out.append(WP6Charsets.DEFAULT_EXTENDED_INTL_CHARS[c]);
        } else if (c >= 33 && c <= 126) {
            out.append((char) c);
        } else if (c >= 180 && c <= 207) {
            // 198 (end of cell) was already dealt with above
            endParagraph(out, xhtml);
        } else if (c >= 208 && c <= 239) {
            // variable-length multi-byte function
            handleVariableLengthFunction(c, in, out, xhtml);
        } else if (c >= 241 && c <= 254) {
            // fixed-length multi-byte function; only the first byte was read
            in.skipWPByte(remainingFixedLength(c, 1));
        }

        // Codes above 127 that are deliberately ignored:
        // 130,131,133: soft hyphens
        // 134: invisible return in line
        // 136: soft end of center/align
        // 140: style separator mark
        // 141,142: start/end of text to skip
        // 143: exited hyphenation
        // 144: cancel hyphenation
        // 145-151: match functions
        // 152-179: unknown/ignored
        // 255: reserved, cannot be used
    }

    /**
     * Number of bytes of a fixed-length function still to be consumed after
     * {@code alreadyRead} of its bytes have been read.
     */
    private static int remainingFixedLength(int code, int alreadyRead) {
        return FIXED_LENGTH_FUNCTION_SIZES.get(code) - alreadyRead;
    }

    /**
     * Handles the fixed-length undo/insert function. Its second byte tells
     * which kind of marker this is; the meaning of the third byte is not
     * understood and the remaining bytes are simply skipped.
     */
    private void handleUndoFunction(int c, WPInputStream in) throws IOException, SAXException {
        byte marker = in.readWPByte();
        if (marker == START_UNDO || marker == END_UNDO) {
            inUndo = (marker == START_UNDO);
        }
        // the function code and the marker byte have both been read already
        in.skipWPByte(remainingFixedLength(c, 2));
    }

    /**
     * Reads the character value, the character set and the closing byte of
     * an extended character function and appends the resulting character.
     */
    private void appendExtendedChar(WPInputStream in, StringBuilder out)
            throws IOException, SAXException {
        int charval = in.readWP();
        int charset = in.readWP();
        in.readWP(); // closing character
        WP6Charsets.append(out, charset, charval);
    }

    /**
     * Consumes a variable-length multi-byte function (codes 208-239) and
     * emits the whitespace or paragraph break it stands for, if any.
     */
    private void handleVariableLengthFunction(int c, WPInputStream in, StringBuilder out,
                                              XHTMLContentHandler xhtml)
            throws IOException, SAXException {
        int subgroup = in.readWP();
        int functionSize = in.readWPShort();
        // Four bytes are accounted for at this point (function code,
        // subgroup and the two size bytes); read past the rest.
        for (int left = functionSize - 4; left > 0; left--) {
            in.readWP();
        }

        switch (c) {
            case 208: // End-of-Line group
                appendEndOfLine(subgroup, out, xhtml);
                break;
            case 213:
                out.append(' ');
                break;
            case 224:
                out.append('\t');
                break;
            default:
                break;
        }
        //TODO Are there functions containing data? Like footnotes?
    }

    /**
     * Translates a subgroup of the End-of-Line group (208) into a space, a
     * tab or a paragraph break. Subgroups outside 1-28 produce nothing.
     */
    private void appendEndOfLine(int subgroup, StringBuilder out, XHTMLContentHandler xhtml)
            throws IOException, SAXException {
        if (subgroup == 10) {
            // end of cell
            out.append('\t');
        } else if (subgroup < 1 || subgroup > 28) {
            return;
        } else if (subgroup <= 3 || (subgroup >= 20 && subgroup <= 22)) {
            out.append(' ');
        } else {
            // 4-19 (other than 10) and 23-28
            endParagraph(out, xhtml);
        }
    }
}