blob: 6b973fc269255518ca2f4873116dbfa635795ecc [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.sax;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Content handler decorator that makes sure that the character events
 * ({@link #characters(char[], int, int)} or
 * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
 * content handler contain only valid XML characters. All invalid characters
 * are replaced with spaces.
 * <p>
 * The XML standard defines the following Unicode character ranges as
 * valid XML characters:
 * <pre>
 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 * </pre>
 * <p>
 * Note that currently this class only detects those invalid characters whose
 * UTF-16 representation fits a single char. Also, this class does not ensure
 * that the UTF-16 encoding of incoming characters is correct: surrogate code
 * units are passed through unchanged, whether or not they are properly paired.
 */
public class SafeContentHandler extends ContentHandlerDecorator {

    /**
     * Replacement for invalid characters.
     */
    private static final char[] REPLACEMENT = new char[] { ' ' };

    /**
     * Internal interface that allows both character and
     * ignorable whitespace content to be filtered the same way.
     */
    protected interface Output {
        void write(char[] ch, int start, int length) throws SAXException;
    }

    /**
     * Output through the {@link ContentHandler#characters(char[], int, int)}
     * method of the decorated content handler.
     */
    private final Output charactersOutput = new Output() {
        public void write(char[] ch, int start, int length)
                throws SAXException {
            // Call the superclass directly so the data is not filtered twice
            SafeContentHandler.super.characters(ch, start, length);
        }
    };

    /**
     * Output through the
     * {@link ContentHandler#ignorableWhitespace(char[], int, int)}
     * method of the decorated content handler.
     */
    private final Output ignorableWhitespaceOutput = new Output() {
        public void write(char[] ch, int start, int length)
                throws SAXException {
            // Call the superclass directly so the data is not filtered twice
            SafeContentHandler.super.ignorableWhitespace(ch, start, length);
        }
    };

    /**
     * Creates a decorator that filters the character events sent to the
     * given content handler.
     *
     * @param handler the content handler to decorate
     */
    public SafeContentHandler(ContentHandler handler) {
        super(handler);
    }

    /**
     * Filters and outputs the contents of the given input buffer. Any
     * invalid characters in the input buffer are handled by sending a
     * replacement (a space character by default) to the given output. Any
     * sequences of valid characters are passed as-is to the given output.
     * <p>
     * If the buffer contains no invalid characters, it is forwarded in a
     * single call exactly as received (even when it is empty). If the
     * buffer ends with an invalid character, no additional zero-length
     * event is sent after the replacement.
     *
     * @param ch input buffer
     * @param start start offset within the buffer
     * @param length number of characters to read from the buffer
     * @param output output channel
     * @throws SAXException if the filtered characters could not be written out
     */
    private void filter(char[] ch, int start, int length, Output output)
            throws SAXException {
        int end = start + length;
        boolean replaced = false;

        for (int i = start; i < end; i++) {
            if (isInvalid(ch[i])) {
                // Output any preceding valid characters
                if (i > start) {
                    output.write(ch, start, i - start);
                }

                // Output the replacement for this invalid character
                writeReplacement(output);
                replaced = true;

                // Continue with the rest of the array
                start = i + 1;
            }
        }

        // Output any remaining valid characters. An untouched buffer is
        // always forwarded; after a replacement an empty remainder is
        // skipped to avoid emitting a pointless zero-length event.
        if (!replaced || start < end) {
            output.write(ch, start, end - start);
        }
    }

    /**
     * Checks whether the given character (more accurately a UTF-16 code unit)
     * is an invalid XML character and should be replaced for output.
     * Subclasses can override this method to use an alternative definition
     * of which characters should be replaced in the XML output.
     * <p>
     * This implementation rejects the control characters below 0x20 other
     * than tab, line feed and carriage return, and the code units 0xFFFE
     * and 0xFFFF. Everything else, including surrogate code units, is
     * accepted.
     *
     * @param ch character
     * @return <code>true</code> if the character should be replaced,
     *         <code>false</code> otherwise
     */
    protected boolean isInvalid(char ch) {
        // TODO: Correct handling of multi-word characters
        if (ch < 0x20) {
            return ch != 0x09 && ch != 0x0A && ch != 0x0D;
        } else {
            return ch >= 0xFFFE;
        }
    }

    /**
     * Outputs the replacement for an invalid character. Subclasses can
     * override this method to use a custom replacement.
     *
     * @param output where the replacement is written to
     * @throws SAXException if the replacement could not be written
     */
    protected void writeReplacement(Output output) throws SAXException {
        output.write(REPLACEMENT, 0, REPLACEMENT.length);
    }

    //------------------------------------------------------< ContentHandler >

    /**
     * Filters the given character data and forwards the result to the
     * <code>characters</code> method of the superclass.
     *
     * @param ch input buffer
     * @param start start offset within the buffer
     * @param length number of characters to read from the buffer
     * @throws SAXException if the filtered characters could not be written out
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        filter(ch, start, length, charactersOutput);
    }

    /**
     * Filters the given ignorable whitespace and forwards the result to the
     * <code>ignorableWhitespace</code> method of the superclass.
     *
     * @param ch input buffer
     * @param start start offset within the buffer
     * @param length number of characters to read from the buffer
     * @throws SAXException if the filtered characters could not be written out
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        filter(ch, start, length, ignorableWhitespaceOutput);
    }

}