blob: 17bc9e920743d04c24dbd44f97e84662417dcb47 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.util.HashSet;
import java.util.Locale;
import org.apache.tika.utils.StringUtils;
public class FilenameUtils {
/**
 * Characters that are reserved in file names on at least one common
 * OS or file system: the control characters 0x00-0x1F plus
 * <code>? : * &lt; &gt; |</code>.
 * <p>
 * Note that the path separators <code>/</code> and <code>\</code> are
 * NOT part of this list.
 */
public static final char[] RESERVED_FILENAME_CHARACTERS =
        {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
                0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A,
                0x1B, 0x1C, 0x1D, 0x1E, 0x1F, '?', ':', '*', '<', '>', '|'};

/**
 * Lookup set built once from {@link #RESERVED_FILENAME_CHARACTERS}
 * so that {@link #normalize(String)} can test membership in constant time.
 */
private static final HashSet<Character> RESERVED = new HashSet<>(38);

static {
    for (char reservedFilenameCharacter : RESERVED_FILENAME_CHARACTERS) {
        RESERVED.add(reservedFilenameCharacter);
    }
}

/**
 * Scans the given file name for reserved characters on different OSs and
 * file systems and returns a sanitized version of the name with the
 * reserved chars replaced by their hexadecimal value.
 * <p>
 * Each reserved character is written as <code>%</code> followed by its
 * two-digit, upper-case hexadecimal code. For example
 * <code>why?.zip</code> will be converted into <code>why%3F.zip</code>.
 * <p>
 * Path separators are not reserved characters and are left untouched;
 * consider calling {@link #getName(String)} first.
 *
 * @param name the file name to be normalized - NOT NULL
 * @return the normalized file name
 * @throws IllegalArgumentException if name is null
 */
public static String normalize(final String name) {
    if (name == null) {
        throw new IllegalArgumentException("name cannot be null");
    }

    // Most names contain no reserved characters, so the input length is a good initial size.
    final StringBuilder sb = new StringBuilder(name.length());

    for (int i = 0; i < name.length(); i++) {
        final char c = name.charAt(i);
        if (RESERVED.contains(c)) {
            // Pad values below 0x10 so every escape is exactly two hex digits.
            sb.append('%').append((c < 16) ? "0" : "")
                    .append(Integer.toHexString(c).toUpperCase(Locale.ROOT));
        } else {
            sb.append(c);
        }
    }

    return sb.toString();
}
* This is a duplication of the algorithm and functionality
* available in commons io FilenameUtils. If Java's File were
* able handle Windows file paths correctly in linux,
* we wouldn't need this.
* <p>
* The goal of this is to get a filename from a path.
* The package parsers and some other embedded doc
* extractors could put anything into TikaCoreProperties.RESOURCE_NAME_KEY.
* <p>
* If a careless client used that filename as if it were a
* filename and not a path when writing embedded files,
* bad things could happen. Consider: "../../../my_ppt.ppt".
* <p>
* Consider using this in combination with {@link #normalize(String)}.
* @param path path to strip
* @return empty string or a filename, never null
public static String getName(final String path) {
if (path == null || path.length() == 0) {
return StringUtils.EMPTY;
int unix = path.lastIndexOf("/");
int windows = path.lastIndexOf("\\");
//some macintosh file names are stored with : as the delimiter
//also necessary to properly handle C:somefilename
int colon = path.lastIndexOf(":");
String cand = path.substring(Math.max(colon, Math.max(unix, windows)) + 1);
if (cand.equals("..") || cand.equals(".")) {
return StringUtils.EMPTY;
return cand;
* This includes the period, e.g. ".pdf"
* @param path
* @return the suffix or an empty string if one could not be found
public static String getSuffixFromPath(String path) {
String n = getName(path);
int i = n.lastIndexOf(".");
//arbitrarily sets max extension length
if (i > -1 && n.length() - i < 6) {
return n.substring(i);
return StringUtils.EMPTY;