blob: 68797927e26964505a28a412e18c991afe20b3af [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.apache.iceberg.hadoop;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.iceberg.BaseMetastoreCatalog;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.Schema;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.SupportsNamespaces;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.exceptions.AlreadyExistsException;
import org.apache.iceberg.exceptions.NamespaceNotEmptyException;
import org.apache.iceberg.exceptions.NoSuchNamespaceException;
import org.apache.iceberg.exceptions.NoSuchTableException;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * HadoopCatalog provides a way to use table names like db.table to work with path-based tables under a common
 * location. It uses a specified directory under a specified filesystem as the warehouse directory, and organizes
 * multiple levels of directories that map to the database, namespace and table respectively. The HadoopCatalog
 * takes a location as the warehouse directory. When creating a table such as $db.$tbl, it creates the $db/$tbl
 * directory under the warehouse directory, and puts the table metadata into that directory.
 * <p>
 * The HadoopCatalog now supports {@link org.apache.iceberg.catalog.Catalog#createTable} and
 * {@link org.apache.iceberg.catalog.Catalog#dropTable}; {@link org.apache.iceberg.catalog.Catalog#renameTable}
 * is not supported yet.
 * <p>
 * Note: The HadoopCatalog requires that the underlying file system supports atomic rename.
 */
public class HadoopCatalog extends BaseMetastoreCatalog implements Closeable, SupportsNamespaces, Configurable {
// Logger used for warnings about suppressed permission errors.
private static final Logger LOG = LoggerFactory.getLogger(HadoopCatalog.class);
// Warehouse directory appended to fs.defaultFS by the HadoopCatalog(Configuration) constructor.
private static final String ICEBERG_HADOOP_WAREHOUSE_BASE = "iceberg/warehouse";
// File name suffix matched by TABLE_FILTER.
private static final String TABLE_METADATA_FILE_EXTENSION = ".metadata.json";
// Joins namespace levels into a relative path under the warehouse location.
private static final Joiner SLASH = Joiner.on("/");
// Accepts paths whose name ends with TABLE_METADATA_FILE_EXTENSION; used by isTableDir.
private static final PathFilter TABLE_FILTER = path -> path.getName().endsWith(TABLE_METADATA_FILE_EXTENSION);
// Catalog property key read in initialize() to populate suppressPermissionError.
private static final String HADOOP_SUPPRESS_PERMISSION_ERROR = "suppress-permission-error";
// Catalog name returned by name(); assigned in initialize().
private String catalogName;
// Hadoop configuration; assigned by setConf().
private Configuration conf;
// Warehouse root with trailing slashes removed; assigned in initialize().
private String warehouseLocation;
// File system resolved from warehouseLocation in initialize().
private FileSystem fs;
// FileIO handed to table operations; assigned in initialize().
private FileIO fileIO;
// When true, IOExceptions mentioning "AuthorizationPermissionMismatch" are logged instead of rethrown.
private boolean suppressPermissionError = false;
/**
 * Creates an uninitialized catalog. The deprecation notes on the other constructors direct callers to use this
 * constructor together with {@code setConf} and {@code initialize}.
 */
public HadoopCatalog() {
}
/**
 * The constructor of the HadoopCatalog. It uses the passed location as its warehouse directory and delegates to
 * the all-arg constructor with an empty property map.
 *
 * @param name The catalog name
 * @param conf The Hadoop configuration
 * @param warehouseLocation The location used as warehouse directory
 * @deprecated please use the no-arg constructor, setConf and initialize to construct the catalog. Will be removed in
 * v0.12.0
 */
public HadoopCatalog(String name, Configuration conf, String warehouseLocation) {
  this(name, conf, warehouseLocation, Maps.newHashMap());
}
/**
 * The all-arg constructor of the HadoopCatalog.
 * <p>
 * The warehouse location is copied into the catalog properties under
 * {@link CatalogProperties#WAREHOUSE_LOCATION} before the catalog is initialized.
 *
 * @param name The catalog name
 * @param conf The Hadoop configuration
 * @param warehouseLocation The location used as warehouse directory
 * @param properties catalog properties
 * @throws IllegalArgumentException if {@code warehouseLocation} is null or empty
 * @deprecated please use the no-arg constructor, setConf and initialize to construct the catalog. Will be removed in
 * v0.12.0
 */
public HadoopCatalog(String name, Configuration conf, String warehouseLocation, Map<String, String> properties) {
  Preconditions.checkArgument(warehouseLocation != null && !warehouseLocation.equals(""),
      "Cannot instantiate hadoop catalog. No location provided for warehouse");

  // The configuration must be stored before initialize(), which uses it to resolve the FileSystem and the FileIO.
  setConf(conf);

  // Copy so that the caller's map is not modified.
  Map<String, String> props = Maps.newHashMap(properties);
  props.put(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation);
  initialize(name, props);
}
public void initialize(String name, Map<String, String> properties) {
String inputWarehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION);
Preconditions.checkArgument(inputWarehouseLocation != null && !inputWarehouseLocation.equals(""),
"Cannot instantiate hadoop catalog. No location provided for warehouse (Set warehouse config)");
this.catalogName = name;
this.warehouseLocation = inputWarehouseLocation.replaceAll("/*$", "");
this.fs = Util.getFs(new Path(warehouseLocation), conf);
String fileIOImpl = properties.get(CatalogProperties.FILE_IO_IMPL);
this.fileIO = fileIOImpl == null ? new HadoopFileIO(conf) : CatalogUtil.loadFileIO(fileIOImpl, properties, conf);
this.suppressPermissionError = Boolean.parseBoolean(properties.get(HADOOP_SUPPRESS_PERMISSION_ERROR));
/**
 * The constructor of the HadoopCatalog. It uses the passed location as its warehouse directory and
 * {@code "hadoop"} as the catalog name.
 *
 * @param conf The Hadoop configuration
 * @param warehouseLocation The location used as warehouse directory
 */
public HadoopCatalog(Configuration conf, String warehouseLocation) {
  this("hadoop", conf, warehouseLocation);
}
/**
 * The constructor of the HadoopCatalog. It gets the value of the <code>fs.defaultFS</code> property
 * from the passed Hadoop configuration as its default file system, and uses the default directory
 * <code>iceberg/warehouse</code> under it as the warehouse directory. The catalog name is {@code "hadoop"}.
 *
 * @param conf The Hadoop configuration
 */
public HadoopCatalog(Configuration conf) {
  this("hadoop", conf, conf.get("fs.defaultFS") + "/" + ICEBERG_HADOOP_WAREHOUSE_BASE);
}
public String name() {
return catalogName;
private boolean shouldSuppressPermissionError(IOException ioException) {
if (suppressPermissionError) {
return ioException.getMessage() != null && ioException.getMessage().contains("AuthorizationPermissionMismatch");
return false;
private boolean isTableDir(Path path) {
Path metadataPath = new Path(path, "metadata");
// Only the path which contains metadata is the path for table, otherwise it could be
// still a namespace.
try {
return fs.listStatus(metadataPath, TABLE_FILTER).length >= 1;
} catch (FileNotFoundException e) {
return false;
} catch (IOException e) {
if (shouldSuppressPermissionError(e)) {
LOG.warn("Unable to list metadata directory {}: {}", metadataPath, e);
return false;
} else {
throw new UncheckedIOException(e);
private boolean isDirectory(Path path) {
try {
return fs.getFileStatus(path).isDirectory();
} catch (FileNotFoundException e) {
return false;
} catch (IOException e) {
if (shouldSuppressPermissionError(e)) {
LOG.warn("Unable to list directory {}: {}", path, e);
return false;
} else {
throw new UncheckedIOException(e);
public List<TableIdentifier> listTables(Namespace namespace) {
Preconditions.checkArgument(namespace.levels().length >= 1,
"Missing database in table identifier: %s", namespace);
Path nsPath = new Path(warehouseLocation, SLASH.join(namespace.levels()));
Set<TableIdentifier> tblIdents = Sets.newHashSet();
try {
if (!isDirectory(nsPath)) {
throw new NoSuchNamespaceException("Namespace does not exist: %s", namespace);
for (FileStatus s : fs.listStatus(nsPath)) {
if (!s.isDirectory()) {
// Ignore the path which is not a directory.
Path path = s.getPath();
if (isTableDir(path)) {
TableIdentifier tblIdent = TableIdentifier.of(namespace, path.getName());
} catch (IOException ioe) {
throw new RuntimeIOException(ioe, "Failed to list tables under: %s", namespace);
return Lists.newArrayList(tblIdents);
protected boolean isValidIdentifier(TableIdentifier identifier) {
return true;
protected TableOperations newTableOps(TableIdentifier identifier) {
return new HadoopTableOperations(new Path(defaultWarehouseLocation(identifier)), fileIO, conf);
protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) {
String tableName =;
StringBuilder sb = new StringBuilder();
for (String level : tableIdentifier.namespace().levels()) {
return sb.toString();
public boolean dropTable(TableIdentifier identifier, boolean purge) {
if (!isValidIdentifier(identifier)) {
throw new NoSuchTableException("Invalid identifier: %s", identifier);
Path tablePath = new Path(defaultWarehouseLocation(identifier));
TableOperations ops = newTableOps(identifier);
TableMetadata lastMetadata;
if (purge && ops.current() != null) {
lastMetadata = ops.current();
} else {
lastMetadata = null;
try {
if (purge && lastMetadata != null) {
// Since the data files and the metadata files may store in different locations,
// so it has to call dropTableData to force delete the data file.
CatalogUtil.dropTableData(, lastMetadata);
fs.delete(tablePath, true /* recursive */);
return true;
} catch (IOException e) {
throw new RuntimeIOException(e, "Failed to delete file: %s", tablePath);
public void renameTable(TableIdentifier from, TableIdentifier to) {
throw new UnsupportedOperationException("Cannot rename Hadoop tables");
public void createNamespace(Namespace namespace, Map<String, String> meta) {
"Cannot create namespace with invalid name: %s", namespace);
if (!meta.isEmpty()) {
throw new UnsupportedOperationException("Cannot create namespace " + namespace + ": metadata is not supported");
Path nsPath = new Path(warehouseLocation, SLASH.join(namespace.levels()));
if (isNamespace(nsPath)) {
throw new AlreadyExistsException("Namespace already exists: %s", namespace);
try {
} catch (IOException e) {
throw new RuntimeIOException(e, "Create namespace failed: %s", namespace);
public List<Namespace> listNamespaces(Namespace namespace) {
Path nsPath = namespace.isEmpty() ? new Path(warehouseLocation)
: new Path(warehouseLocation, SLASH.join(namespace.levels()));
if (!isNamespace(nsPath)) {
throw new NoSuchNamespaceException("Namespace does not exist: %s", namespace);
try {
return Stream.of(fs.listStatus(nsPath))
.map(path -> append(namespace, path.getName()))
} catch (IOException ioe) {
throw new RuntimeIOException(ioe, "Failed to list namespace under: %s", namespace);
private Namespace append(Namespace ns, String name) {
String[] levels = Arrays.copyOfRange(ns.levels(), 0, ns.levels().length + 1);
levels[ns.levels().length] = name;
return Namespace.of(levels);
public boolean dropNamespace(Namespace namespace) {
Path nsPath = new Path(warehouseLocation, SLASH.join(namespace.levels()));
if (!isNamespace(nsPath) || namespace.isEmpty()) {
return false;
try {
if (fs.listStatusIterator(nsPath).hasNext()) {
throw new NamespaceNotEmptyException("Namespace %s is not empty.", namespace);
return fs.delete(nsPath, false /* recursive */);
} catch (IOException e) {
throw new RuntimeIOException(e, "Namespace delete failed: %s", namespace);
public boolean setProperties(Namespace namespace, Map<String, String> properties) {
throw new UnsupportedOperationException(
"Cannot set namespace properties " + namespace + " : setProperties is not supported");
public boolean removeProperties(Namespace namespace, Set<String> properties) {
throw new UnsupportedOperationException(
"Cannot remove properties " + namespace + " : removeProperties is not supported");
public Map<String, String> loadNamespaceMetadata(Namespace namespace) {
Path nsPath = new Path(warehouseLocation, SLASH.join(namespace.levels()));
if (!isNamespace(nsPath) || namespace.isEmpty()) {
throw new NoSuchNamespaceException("Namespace does not exist: %s", namespace);
return ImmutableMap.of("location", nsPath.toString());
private boolean isNamespace(Path path) {
return isDirectory(path) && !isTableDir(path);
public void close() throws IOException {
public String toString() {
return MoreObjects.toStringHelper(this)
.add("name", catalogName)
.add("location", warehouseLocation)
public TableBuilder buildTable(TableIdentifier identifier, Schema schema) {
return new HadoopCatalogTableBuilder(identifier, schema);
public void setConf(Configuration conf) {
this.conf = conf;
public Configuration getConf() {
return conf;
// Table builder for this catalog; its withLocation only accepts the table's default warehouse location.
private class HadoopCatalogTableBuilder extends BaseMetastoreCatalogTableBuilder {
// Location computed by defaultWarehouseLocation for the identifier being built.
private final String defaultLocation;
/**
 * Creates a builder and records the default warehouse location of the table being built.
 *
 * @param identifier the identifier of the table to build
 * @param schema the table schema
 */
private HadoopCatalogTableBuilder(TableIdentifier identifier, Schema schema) {
  super(identifier, schema);
  this.defaultLocation = defaultWarehouseLocation(identifier);
}
public TableBuilder withLocation(String location) {
Preconditions.checkArgument(location == null || location.equals(defaultLocation),
"Cannot set a custom location for a path-based table. Expected " + defaultLocation + " but got " + location);
return this;