tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.io;

 import java.util.HashSet;
 import java.util.Locale;

 import org.apache.tika.utils.StringUtils;


 public class FilenameUtils {

     /**
      * Characters treated as reserved in file names: the control characters
      * 0x00 through 0x1F plus <code>? : * &lt; &gt; |</code>.
      */
     public static final char[] RESERVED_FILENAME_CHARACTERS =
             {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
                     0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A,
                     0x1B, 0x1C, 0x1D, 0x1E, 0x1F, '?', ':', '*', '<', '>', '|'};

     /** Lookup set built once from {@link #RESERVED_FILENAME_CHARACTERS}. */
     private static final HashSet<Character> RESERVED = new HashSet<>(38);

     static {
         for (int idx = 0; idx < RESERVED_FILENAME_CHARACTERS.length; idx++) {
             RESERVED.add(RESERVED_FILENAME_CHARACTERS[idx]);
         }
     }

     /**
      * Produces a sanitized copy of the given file name in which every
      * reserved character is replaced by a percent sign followed by its
      * two-digit, upper-case hexadecimal code. All other characters are
      * copied through unchanged.
      * <p>
      * For example <code>why?.zip</code> becomes <code>why%3F.zip</code>.
      *
      * @param name the file name to be normalized - NOT NULL
      * @return the normalized file name
      * @throws IllegalArgumentException if name is null
      */
     public static String normalize(final String name) {
         if (name == null) {
             throw new IllegalArgumentException("name cannot be null");
         }

         final StringBuilder escaped = new StringBuilder();
         final int length = name.length();

         for (int pos = 0; pos < length; pos++) {
             final char ch = name.charAt(pos);
             if (!RESERVED.contains(ch)) {
                 escaped.append(ch);
                 continue;
             }
             escaped.append('%');
             // toHexString yields a single digit below 0x10, so pad to two digits
             if (ch < 16) {
                 escaped.append("0");
             }
             escaped.append(Integer.toHexString(ch).toUpperCase(Locale.ROOT));
         }

         return escaped.toString();
     }

     /**
      * Extracts the last path segment, i.e. the file name, from a path.
      * <p>
      * This duplicates the algorithm and functionality available in
      * commons io FilenameUtils. If Java's File were able to handle Windows
      * file paths correctly on linux, we wouldn't need this.
      * <p>
      * The package parsers and some other embedded doc extractors could put
      * anything into TikaCoreProperties.RESOURCE_NAME_KEY. If a careless
      * client used that value as if it were a filename and not a path when
      * writing embedded files, bad things could happen.
      * Consider: "../../../my_ppt.ppt".
      * <p>
      * Consider using this in combination with {@link #normalize(String)}.
      *
      * @param path path to strip
      * @return empty string or a filename, never null
      */
     public static String getName(final String path) {

         if (path == null || path.isEmpty()) {
             return StringUtils.EMPTY;
         }

         // Position of the right-most separator of any flavour, -1 if there is none.
         int lastSeparator = path.lastIndexOf("/");
         lastSeparator = Math.max(lastSeparator, path.lastIndexOf("\\"));
         // some macintosh file names are stored with : as the delimiter;
         // the colon is also needed to properly handle C:somefilename
         lastSeparator = Math.max(lastSeparator, path.lastIndexOf(":"));

         final String candidate = path.substring(lastSeparator + 1);
         final boolean isDotEntry = ".".equals(candidate) || "..".equals(candidate);
         return isDotEntry ? StringUtils.EMPTY : candidate;
     }

     /**
      * Returns the suffix of the file name found in the given path.
      * The suffix includes the period, e.g. ".pdf".
      *
      * @param path path whose file name is inspected
      * @return the suffix or an empty string if one could not be found
      */
     public static String getSuffixFromPath(String path) {
         final String fileName = getName(path);
         final int dot = fileName.lastIndexOf(".");
         if (dot < 0) {
             return StringUtils.EMPTY;
         }
         final String suffix = fileName.substring(dot);
         // arbitrary cap: suffixes of six or more characters (period included) are rejected
         return suffix.length() < 6 ? suffix : StringUtils.EMPTY;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.io;

	import java.util.HashSet;
	import java.util.Locale;

	import org.apache.tika.utils.StringUtils;


	public class FilenameUtils {


	/**
	* Reserved characters
	*/
	public final static char[] RESERVED_FILENAME_CHARACTERS =
	{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
	0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A,
	0x1B, 0x1C, 0x1D, 0x1E, 0x1F, '?', ':', '*', '<', '>', '\|'};

	private final static HashSet<Character> RESERVED = new HashSet<>(38);


	static {
	for (char reservedFilenameCharacter : RESERVED_FILENAME_CHARACTERS) {
	RESERVED.add(reservedFilenameCharacter);
	}
	}


	/**
	* Scans the given file name for reserved characters on different OSs and
	* file systems and returns a sanitized version of the name with the
	* reserved chars replaced by their hexadecimal value.
	* <p>
	* For example <code>why?.zip</code> will be converted into <code>why%3F.zip</code>
	*
	* @param name the file name to be normalized - NOT NULL
	* @return the normalized file name
	* @throws IllegalArgumentException if name is null
	*/
	public static String normalize(final String name) {
	if (name == null) {
	throw new IllegalArgumentException("name cannot be null");
	}

	StringBuilder sb = new StringBuilder();

	for (char c : name.toCharArray()) {
	if (RESERVED.contains(c)) {
	sb.append('%').append((c < 16) ? "0" : "")
	.append(Integer.toHexString(c).toUpperCase(Locale.ROOT));
	} else {
	sb.append(c);
	}
	}

	return sb.toString();
	}

	/**
	* This is a duplication of the algorithm and functionality
	* available in commons io FilenameUtils. If Java's File were
	* able handle Windows file paths correctly in linux,
	* we wouldn't need this.
	* <p>
	* The goal of this is to get a filename from a path.
	* The package parsers and some other embedded doc
	* extractors could put anything into TikaCoreProperties.RESOURCE_NAME_KEY.
	* <p>
	* If a careless client used that filename as if it were a
	* filename and not a path when writing embedded files,
	* bad things could happen. Consider: "../../../my_ppt.ppt".
	* <p>
	* Consider using this in combination with {@link #normalize(String)}.
	*
	* @param path path to strip
	* @return empty string or a filename, never null
	*/
	public static String getName(final String path) {

	if (path == null \|\| path.length() == 0) {
	return StringUtils.EMPTY;
	}
	int unix = path.lastIndexOf("/");
	int windows = path.lastIndexOf("\\");
	//some macintosh file names are stored with : as the delimiter
	//also necessary to properly handle C:somefilename
	int colon = path.lastIndexOf(":");
	String cand = path.substring(Math.max(colon, Math.max(unix, windows)) + 1);
	if (cand.equals("..") \|\| cand.equals(".")) {
	return StringUtils.EMPTY;
	}
	return cand;
	}

	/**
	* This includes the period, e.g. ".pdf"
	* @param path
	* @return the suffix or an empty string if one could not be found
	*/
	public static String getSuffixFromPath(String path) {
	String n = getName(path);
	int i = n.lastIndexOf(".");
	//arbitrarily sets max extension length
	if (i > -1 && n.length() - i < 6) {
	return n.substring(i);
	}
	return StringUtils.EMPTY;
	}
	}