blob: 0bfdfe67d50fc36a7ed967018c088f430fe8097a [file] [log] [blame]
/*
* Copyright The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.chukwa.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.chukwa.extraction.CHUKWA_CONSTANT;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.log4j.Logger;
/**
* Supports the hierarchyDataType introduced by CHUKWA-648, which is quite
* similar to the idea of Hive's partitions. For example, the user can define
* the dataType as "datatypeLevel1/dataTypeLevel2/dataTypeLevel3" instead of a
* flat structure like "datatypeLevel1_dataTypeLevel2_dataTypeLevel3". <BR>
* <BR>
* The hierarchyDataType makes filtering much easier when running analysis
* jobs. For example, a user who is interested in all data under the
* "datatypeLevel1/dataTypeLevel2" category only needs to go through the
* level2-related sub-directories.
*/
public class HierarchyDataType {
  static Logger log = Logger.getLogger(HierarchyDataType.class);

  /**
   * Collects the files matching {@code filter} under a path pattern and,
   * when {@code recursive} is set, under every directory reached from it.
   * <p>
   * The pattern is expanded with {@code fs.globStatus(path)}. Each matched
   * directory is descended into (as {@code <dir>/*}) when {@code recursive}
   * is true; every other match is re-checked with
   * {@code fs.globStatus(p, filter)} and added to the result if it passes.
   * An {@link IOException} is logged and whatever was collected so far is
   * returned.
   *
   * @param fs The file system
   * @param path The parent folder or glob pattern to expand
   * @param filter The pattern matcher to filter the required files
   * @param recursive whether matched directories are descended into
   * @return list of matching FileStatus entries; never null, possibly empty
   */
  public static List<FileStatus> globStatus(FileSystem fs, Path path,
      PathFilter filter, boolean recursive) {
    List<FileStatus> results = new ArrayList<FileStatus>();
    try {
      FileStatus[] candidates = fs.globStatus(path);
      // globStatus may hand back null when nothing matches; treat it as
      // "no candidates" instead of failing with a NullPointerException.
      if (candidates != null) {
        for (FileStatus candidate : candidates) {
          log.debug("candidate is:" + candidate);
          Path p = candidate.getPath();
          if (candidate.isDir() && recursive) {
            log.debug("subfolder is:" + p);
            Path children = new Path(p.toString() + "/*");
            results.addAll(globStatus(fs, children, filter, recursive));
          } else {
            log.debug("Eventfile is:" + p);
            FileStatus[] qualifiedfiles = fs.globStatus(p, filter);
            if (qualifiedfiles != null && qualifiedfiles.length > 0) {
              log.debug("qualified Eventfile is:" + p);
              Collections.addAll(results, qualifiedfiles);
            }
          }
        }
      }
    } catch (IOException e) {
      // Keep the best-effort contract (return what we have) but make the
      // failure visible in the application log rather than on stderr.
      log.warn("Unable to list files under " + path, e);
    }
    log.debug("results.length: " + results.size());
    return results;
  }

  /**
   * Walks the directory tree below {@code path} and collects the paths whose
   * listing is empty.
   * <p>
   * For a path with a non-empty listing, each child directory is visited
   * recursively when {@code recursive} is true; non-directory children are
   * not added to the result. For a path whose listing is empty, the status
   * of the path itself (first entry of {@code fs.globStatus(path)}) is
   * added. An {@link IOException} is logged and whatever was collected so
   * far is returned.
   *
   * @param fs The file system
   * @param path The parent folder
   * @param recursive whether child directories are descended into
   * @return list of FileStatus entries found as described above; never null
   */
  public static List<FileStatus> globStatus(FileSystem fs, Path path,
      boolean recursive) {
    List<FileStatus> results = new ArrayList<FileStatus>();
    try {
      FileStatus[] candidates = fs.listStatus(path);
      if (candidates == null) {
        // Nothing could be listed for this path; there is nothing to add.
        log.debug("nothing listed for path:" + path);
      } else if (candidates.length > 0) {
        for (FileStatus candidate : candidates) {
          log.debug("candidate is:" + candidate);
          Path p = candidate.getPath();
          if (candidate.isDir() && recursive) {
            results.addAll(globStatus(fs, p, recursive));
          }
        }
      } else {
        log.debug("path is:" + path);
        FileStatus[] self = fs.globStatus(path);
        // Guard the [0] access: globStatus may return null or no entries.
        if (self != null && self.length > 0) {
          results.add(self[0]);
        }
      }
    } catch (IOException e) {
      // Keep the best-effort contract (return what we have) but make the
      // failure visible in the application log rather than on stderr.
      log.warn("Unable to list sub-directories under " + path, e);
    }
    return results;
  }

  /**
   * Get the hierarchyDataType format from the directory by removing the
   * cluster folder (with a trailing slash) from the data path.
   * <p>
   * The cluster folder is matched literally, so characters that are special
   * in regular expressions are safe in either path. Only the first
   * occurrence is removed; if the cluster folder does not occur in the data
   * path, the data path is returned unchanged.
   *
   * @param path The data path
   * @param cluster The cluster's folder
   * @return The hierarchyDataType
   */
  public static String getDataType(Path path, Path cluster) {
    log.debug("datasource path: " + path + " cluster path: " + cluster);
    String clusterPrefix = cluster.toString();
    if (!clusterPrefix.endsWith("/")) {
      clusterPrefix = clusterPrefix + "/";
    }
    String dataType = path.toString();
    int start = dataType.indexOf(clusterPrefix);
    if (start >= 0) {
      // Literal removal of the first occurrence (no regex interpretation).
      dataType = dataType.substring(0, start)
          + dataType.substring(start + clusterPrefix.length());
    }
    log.debug("The datatype is: " + dataType);
    return dataType;
  }

  /**
   * Get the directory without first and last slash mark. At most one
   * leading and one trailing slash are removed.
   *
   * @param datasource is a string
   * @return the string without its leading and trailing slash
   */
  public static String trimSlash(String datasource) {
    String results = datasource;
    if (results.startsWith("/")) {
      results = results.substring(1);
    }
    if (results.endsWith("/")) {
      results = results.substring(0, results.length() - 1);
    }
    return results;
  }

  /**
   * Transform the hierarchyDataType directory into its filename (without any
   * slash mark) by replacing every "/" with
   * {@code CHUKWA_CONSTANT.HIERARCHY_CONNECTOR}.
   *
   * @param datasource the directory-style hierarchyDataType
   * @return the filename-style hierarchyDataType
   */
  public static String getHierarchyDataTypeFileName(String datasource) {
    return datasource.replace("/", CHUKWA_CONSTANT.HIERARCHY_CONNECTOR);
  }

  /**
   * Transform the hierarchyDataType filename into its directory name (with
   * slash mark) by replacing every
   * {@code CHUKWA_CONSTANT.HIERARCHY_CONNECTOR} with "/".
   *
   * @param datasource the filename-style hierarchyDataType
   * @return the directory-style hierarchyDataType
   */
  public static String getHierarchyDataTypeDirectory(String datasource) {
    return datasource.replace(CHUKWA_CONSTANT.HIERARCHY_CONNECTOR, "/");
  }
}