// blob: 2c483951034e7bfd6474613a3abecdbdd5f642c5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.mime.MediaType;
/**
 * Parses {@code data:} URIs (for example {@code data:image/png;base64,....}) out of a
 * single string, or extracts every quoted {@code data:} URI found in free text.
 * <p>
 * Not thread safe: each instance reuses its own {@link Matcher} objects.
 * Create a separate util for each thread.
 */
public class DataURISchemeUtil {

    // NOTE(review): left non-final so that existing code assigning to it keeps compiling;
    // treat it as a constant.
    public static String UNSPECIFIED_MEDIA_TYPE = "text/plain;charset=US-ASCII";

    /** Matches a whole string that is a data URI; everything after the first comma is data. */
    private static final Pattern PARSE_PATTERN =
            Pattern.compile("(?s)data:([^,]*?)(base64)?,(.*)$");

    /** Matches a data URI embedded in text; the data runs up to the next quote character. */
    private static final Pattern EXTRACT_PATTERN =
            Pattern.compile("(?s)data:([^,]*?)(base64)?,([^\"\']*)[\"\']");

    // Reused across calls to avoid re-allocating matchers; this is why the class is
    // not thread safe.
    private final Matcher parseMatcher = PARSE_PATTERN.matcher("");
    private final Matcher extractMatcher = EXTRACT_PATTERN.matcher("");

    Base64 base64 = new Base64();

    /**
     * Parses a single data URI.
     *
     * @param string text expected to contain a data URI
     * @return the parsed data URI
     * @throws DataURISchemeParseException if {@code string} is {@code null} or does not
     *                                     contain the expected {@code data:...,...} pattern
     */
    public DataURIScheme parse(String string) throws DataURISchemeParseException {
        if (string == null) {
            // Matcher.reset(null) would otherwise surface as an undeclared NullPointerException
            throw new DataURISchemeParseException("Couldn't find expected pattern");
        }
        parseMatcher.reset(string);
        if (parseMatcher.find()) {
            return build(parseMatcher.group(1), parseMatcher.group(2), parseMatcher.group(3));
        }
        throw new DataURISchemeParseException("Couldn't find expected pattern");
    }

    /**
     * Builds a {@link DataURIScheme} from the three pieces captured by the patterns.
     *
     * @param mediaTypeString raw text between {@code data:} and the optional {@code base64}
     *                        marker; may be empty
     * @param isBase64        the captured {@code base64} marker, or {@code null} if absent
     * @param dataString      raw text after the comma; may be {@code null} or empty
     * @return the data URI; its payload is empty for missing data, base64-decoded when the
     *         marker is present, and otherwise the bytes of the text in the declared charset
     */
    private DataURIScheme build(String mediaTypeString, String isBase64, String dataString) {
        // Backslashes (e.g. line continuations in css) are replaced with spaces, not removed.
        String cleaned = (dataString == null) ? null : dataString.replace('\\', ' ');

        byte[] data;
        if (cleaned == null || cleaned.isEmpty()) {
            data = new byte[0];
        } else if (isBase64 != null) {
            data = base64.decode(cleaned);
        } else {
            //TODO: handle encodings
            data = cleaned.getBytes(resolveCharset(mediaTypeString));
        }
        return new DataURIScheme(mediaTypeString, (isBase64 != null), data);
    }

    /**
     * Picks the charset named by the media type's {@code charset} parameter.
     *
     * @param mediaTypeString raw media type text; may be empty or unparseable
     * @return the declared charset if it is present, well formed and supported by this JVM;
     *         UTF-8 otherwise
     */
    private static Charset resolveCharset(String mediaTypeString) {
        MediaType mediaType = MediaType.parse(mediaTypeString);
        // An unparseable media type (e.g. the empty one in "data:,hello") must not fail
        // the whole parse; fall back to the default charset.
        if (mediaType == null || !mediaType.hasParameters()) {
            return StandardCharsets.UTF_8;
        }
        String charsetName = mediaType.getParameters().get("charset");
        if (charsetName == null) {
            return StandardCharsets.UTF_8;
        }
        try {
            // isSupported itself throws on a malformed name, so it has to be inside the try
            if (Charset.isSupported(charsetName)) {
                return Charset.forName(charsetName);
            }
        } catch (IllegalCharsetNameException ignored) {
            // malformed charset label: default to UTF-8
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Extracts DataURISchemes from free text, as in javascript.
     *
     * @param string text to scan; {@code null} is treated as containing no data URIs
     * @return list of extracted DataURISchemes; an immutable empty list if none were found
     */
    public List<DataURIScheme> extract(String string) {
        if (string == null) {
            return Collections.emptyList();
        }
        extractMatcher.reset(string);
        List<DataURIScheme> list = null;
        while (extractMatcher.find()) {
            DataURIScheme dataURIScheme = build(extractMatcher.group(1), extractMatcher.group(2),
                    extractMatcher.group(3));
            // allocate lazily: most scanned text contains no data URIs
            if (list == null) {
                list = new ArrayList<>();
            }
            list.add(dataURIScheme);
        }
        return (list == null) ? Collections.<DataURIScheme>emptyList() : list;
    }
}