| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.strings; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.UnsupportedEncodingException; |
| import java.util.HashSet; |
| import java.util.Set; |
| |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.parser.AbstractParser; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| |
| /** |
| * Parser to extract printable Latin1 strings from arbitrary files with pure java |
| * without running any external process. Useful for binary or unknown files, for |
| * files without a specific parser and for corrupted ones causing a TikaException |
| * as a fallback parser. To enable the parsing of unknown or files without a |
| * specific parser with AutoDetectParser: |
| * <p> |
| * AutoDetectParser parser = new AutoDetectParser(); |
| * parser.setFallback(new Latin1StringsParser()); |
| * </p> |
| * Currently the parser does a best effort to extract Latin1 strings, used by |
| * Western European languages, encoded with ISO-8859-1, UTF-8 or UTF-16 charsets |
| * mixed within the same file. |
| * <p> |
| * The implementation is optimized for fast parsing with only one pass. |
| */ |
| public class Latin1StringsParser extends AbstractParser { |
| |
| private static final long serialVersionUID = 1L; |
| |
| /** |
| * The set of supported types |
| */ |
| private static final Set<MediaType> SUPPORTED_TYPES = getTypes(); |
| |
| /** |
| * The valid ISO-8859-1 character map. |
| */ |
| private static final boolean[] isChar = getCharMap(); |
| |
| /** |
| * The size of the internal buffers. |
| */ |
| private static int BUF_SIZE = 64 * 1024; |
| |
| /** |
| * The minimum size of a character sequence to be extracted. |
| */ |
| private int minSize = 4; |
| |
| /** |
| * The output buffer. |
| */ |
| private byte[] output = new byte[BUF_SIZE]; |
| |
| /** |
| * The input buffer. |
| */ |
| private byte[] input = new byte[BUF_SIZE]; |
| |
| /** |
| * The temporary position into the output buffer. |
| */ |
| private int tmpPos = 0; |
| |
| /** |
| * The current position into the output buffer. |
| */ |
| private int outPos = 0; |
| |
| /** |
| * The number of bytes into the input buffer. |
| */ |
| private int inSize = 0; |
| |
| /** |
| * The position into the input buffer. |
| */ |
| private int inPos = 0; |
| |
| /** |
| * The output content handler. |
| */ |
| private XHTMLContentHandler xhtml; |
| |
| /** |
| * Populates the valid ISO-8859-1 character map. |
| * |
| * @return the valid ISO-8859-1 character map. |
| */ |
| private static boolean[] getCharMap() { |
| |
| boolean[] isChar = new boolean[256]; |
| for (int c = Byte.MIN_VALUE; c <= Byte.MAX_VALUE; c++) |
| if ((c >= 0x20 && c <= 0x7E) || (c >= (byte) 0xC0 && c <= (byte) 0xFE) || c == 0x0A || |
| c == 0x0D || c == 0x09) { |
| isChar[c & 0xFF] = true; |
| } |
| return isChar; |
| |
| } |
| |
| /** |
| * Returns the set of supported types. |
| * |
| * @return the set of supported types |
| */ |
| private static Set<MediaType> getTypes() { |
| HashSet<MediaType> supportedTypes = new HashSet<>(); |
| supportedTypes.add(MediaType.OCTET_STREAM); |
| return supportedTypes; |
| } |
| |
| /** |
| * Tests if the byte is a ISO-8859-1 char. |
| * |
| * @param c the byte to test. |
| * @return if the byte is a char. |
| */ |
| private static final boolean isChar(byte c) { |
| return isChar[c & 0xFF]; |
| } |
| |
| /** |
| * Returns the minimum size of a character sequence to be extracted. |
| * |
| * @return the minimum size of a character sequence |
| */ |
| public int getMinSize() { |
| return minSize; |
| } |
| |
| /** |
| * Sets the minimum size of a character sequence to be extracted. |
| * |
| * @param minSize the minimum size of a character sequence |
| */ |
| public void setMinSize(int minSize) { |
| this.minSize = minSize; |
| } |
| |
| /** |
| * Flushes the internal output buffer to the content handler. |
| * |
| * @throws UnsupportedEncodingException |
| * @throws SAXException |
| */ |
| private void flushBuffer() throws UnsupportedEncodingException, SAXException { |
| if (tmpPos - outPos >= minSize) { |
| outPos = tmpPos - minSize; |
| } |
| |
| xhtml.characters(new String(output, 0, outPos, "windows-1252")); |
| |
| if (tmpPos - outPos >= 0) { |
| System.arraycopy(output, outPos, output, 0, tmpPos - outPos); |
| } |
| tmpPos = tmpPos - outPos; |
| outPos = 0; |
| } |
| |
| @Override |
| public Set<MediaType> getSupportedTypes(ParseContext arg0) { |
| return SUPPORTED_TYPES; |
| } |
| |
| /** |
| * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, |
| * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, |
| * org.apache.tika.parser.ParseContext) |
| */ |
| @Override |
| public void parse(InputStream stream, ContentHandler handler, Metadata metadata, |
| ParseContext context) throws IOException, SAXException { |
| /* |
| * Creates a new instance because the object is not immutable. |
| */ |
| new Latin1StringsParser().doParse(stream, handler, metadata, context); |
| } |
| |
| /** |
| * Does a best effort to extract Latin1 strings encoded with ISO-8859-1, |
| * UTF-8 or UTF-16. Valid chars are saved into the output buffer and the |
| * temporary buffer position is incremented. When an invalid char is read, |
| * the difference of the temporary and current buffer position is checked. |
| * If it is greater than the minimum string size, the current buffer |
| * position is updated to the temp position. If it is not, the temp position |
| * is reseted to the current position. |
| * |
| * @param stream the input stream. |
| * @param handler the output content handler |
| * @param metadata the metadata of the file |
| * @param context the parsing context |
| * @throws IOException if an io error occurs |
| * @throws SAXException if a sax error occurs |
| */ |
| private void doParse(InputStream stream, ContentHandler handler, Metadata metadata, |
| ParseContext context) throws IOException, SAXException { |
| |
| tmpPos = 0; |
| outPos = 0; |
| |
| xhtml = new XHTMLContentHandler(handler, metadata); |
| xhtml.startDocument(); |
| |
| int i = 0; |
| do { |
| inSize = 0; |
| while ((i = stream.read(input, inSize, BUF_SIZE - inSize)) > 0) { |
| inSize += i; |
| } |
| inPos = 0; |
| while (inPos < inSize) { |
| byte c = input[inPos++]; |
| boolean utf8 = false; |
| /* |
| * Test for a possible UTF8 encoded char |
| */ |
| if (c == (byte) 0xC3) { |
| byte c_ = inPos < inSize ? input[inPos++] : (byte) stream.read(); |
| /* |
| * Test if the next byte is in the valid UTF8 range |
| */ |
| if (c_ >= (byte) 0x80 && c_ <= (byte) 0xBF) { |
| utf8 = true; |
| output[tmpPos++] = (byte) (c_ + 0x40); |
| } else { |
| output[tmpPos++] = c; |
| c = c_; |
| } |
| if (tmpPos == BUF_SIZE) { |
| flushBuffer(); |
| } |
| |
| /* |
| * Test for a possible UTF8 encoded char |
| */ |
| } else if (c == (byte) 0xC2) { |
| byte c_ = inPos < inSize ? input[inPos++] : (byte) stream.read(); |
| /* |
| * Test if the next byte is in the valid UTF8 range |
| */ |
| if (c_ >= (byte) 0xA0 && c_ <= (byte) 0xBF) { |
| utf8 = true; |
| output[tmpPos++] = c_; |
| } else { |
| output[tmpPos++] = c; |
| c = c_; |
| } |
| if (tmpPos == BUF_SIZE) { |
| flushBuffer(); |
| } |
| } |
| if (!utf8) |
| /* |
| * Test if the byte is a valid char. |
| */ { |
| if (isChar(c)) { |
| output[tmpPos++] = c; |
| if (tmpPos == BUF_SIZE) { |
| flushBuffer(); |
| } |
| } else { |
| /* |
| * Test if the byte is an invalid char, marking a string |
| * end. If it is a zero, test 2 positions before or |
| * ahead for a valid char, meaning it marks the |
| * transition between ISO-8859-1 and UTF16 sequences. |
| */ |
| if (c != 0 || (inPos >= 3 && isChar(input[inPos - 3])) || |
| (inPos + 1 < inSize && isChar(input[inPos + 1]))) { |
| |
| if (tmpPos - outPos >= minSize) { |
| output[tmpPos++] = 0x0A; |
| outPos = tmpPos; |
| |
| if (tmpPos == BUF_SIZE) { |
| flushBuffer(); |
| } |
| } else { |
| tmpPos = outPos; |
| } |
| |
| } |
| } |
| } |
| } |
| } while (i != -1 && !Thread.currentThread().isInterrupted()); |
| |
| if (tmpPos - outPos >= minSize) { |
| output[tmpPos++] = 0x0A; |
| outPos = tmpPos; |
| } |
| xhtml.characters(new String(output, 0, outPos, "windows-1252")); |
| |
| xhtml.endDocument(); |
| |
| } |
| |
| } |