/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.accumulo.server.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.accumulo.core.client.Accumulo;
import org.apache.accumulo.core.client.AccumuloClient;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.clientImpl.ClientContext;
import org.apache.accumulo.core.clientImpl.Tables;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.TableId;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.dataImpl.KeyExtent;
import org.apache.accumulo.core.metadata.MetadataTable;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.DataFileColumnFamily;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.util.NumUtil;
import org.apache.accumulo.server.cli.ServerUtilOpts;
import org.apache.accumulo.server.fs.VolumeManager;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.htrace.TraceScope;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.beust.jcommander.Parameter;
public class TableDiskUsage {
// Class logger; calculateUsage() writes trace-level dumps of the bookkeeping below.
private static final Logger log = LoggerFactory.getLogger(TableDiskUsage.class);
// Next internal id to hand out; addTable() assigns these sequentially starting at 0.
private int nextInternalId = 0;
// Table id -> internal id assigned by addTable().
private Map<TableId,Integer> internalIds = new HashMap<>();
// Internal id -> table id (the reverse direction of internalIds).
private Map<Integer,TableId> externalIds = new HashMap<>();
// File name -> one 0/1 flag per internal id; a 1 at index i means that table links the file.
private Map<String,Integer[]> tableFiles = new HashMap<>();
// File name -> size recorded through addFileSize().
private Map<String,Long> fileSizes = new HashMap<>();
void addTable(TableId tableId) {
if (internalIds.containsKey(tableId))
throw new IllegalArgumentException("Already added table " + tableId);
// Keep an internal counter for each table added
int iid = nextInternalId++;
// Store the table id to the internal id
internalIds.put(tableId, iid);
// Store the internal id to the table id
externalIds.put(iid, tableId);
void linkFileAndTable(TableId tableId, String file) {
// get the internal id for this table
int internalId = internalIds.get(tableId);
// Initialize a bitset for tables (internal IDs) that reference this file
Integer[] tables = tableFiles.get(file);
if (tables == null) {
tables = new Integer[internalIds.size()];
for (int i = 0; i < tables.length; i++)
tables[i] = 0;
tableFiles.put(file, tables);
// Update the bitset to track that this table has seen this file
tables[internalId] = 1;
void addFileSize(String file, long size) {
fileSizes.put(file, size);
Map<List<TableId>,Long> calculateUsage() {
// Bitset of tables that contain a file and total usage by all files that share that usage
Map<List<Integer>,Long> usage = new HashMap<>();
if (log.isTraceEnabled()) {
log.trace("fileSizes {}", fileSizes);
// For each file w/ referenced-table bitset
for (Entry<String,Integer[]> entry : tableFiles.entrySet()) {
if (log.isTraceEnabled()) {
log.trace("file {} table bitset {}", entry.getKey(), Arrays.toString(entry.getValue()));
List<Integer> key = Arrays.asList(entry.getValue());
Long size = fileSizes.get(entry.getKey());
Long tablesUsage = usage.get(key);
if (tablesUsage == null)
tablesUsage = 0L;
tablesUsage += size;
usage.put(key, tablesUsage);
Map<List<TableId>,Long> externalUsage = new HashMap<>();
for (Entry<List<Integer>,Long> entry : usage.entrySet()) {
List<TableId> externalKey = new ArrayList<>();
List<Integer> key = entry.getKey();
// table bitset
for (int i = 0; i < key.size(); i++)
if (key.get(i) != 0)
// Convert by internal id to the table id
// list of table ids and size of files shared across the tables
externalUsage.put(externalKey, entry.getValue());
// mapping of all enumerations of files being referenced by tables and total size of files who
// share the same reference
return externalUsage;
/**
 * Sink for the report lines produced by {@code printDiskUsage}. It has a single method, so a
 * lambda or method reference such as {@code System.out::println} can be supplied.
 */
@FunctionalInterface
public interface Printer {
  /**
   * Emits one line of output.
   *
   * @param line
   *          the text to emit
   */
  void print(String line);
}
public static void printDiskUsage(Collection<String> tableNames, VolumeManager fs,
AccumuloClient client, boolean humanReadable) throws TableNotFoundException, IOException {
printDiskUsage(tableNames, fs, client, System.out::println, humanReadable);
public static Map<TreeSet<String>,Long> getDiskUsage(Set<TableId> tableIds, VolumeManager fs,
AccumuloClient client) throws IOException {
TableDiskUsage tdu = new TableDiskUsage();
// Add each tableID
for (TableId tableId : tableIds)
HashSet<TableId> tablesReferenced = new HashSet<>(tableIds);
HashSet<TableId> emptyTableIds = new HashSet<>();
HashSet<String> nameSpacesReferenced = new HashSet<>();
// For each table ID
for (TableId tableId : tableIds) {
Scanner mdScanner;
try {
mdScanner = client.createScanner(MetadataTable.NAME, Authorizations.EMPTY);
} catch (TableNotFoundException e) {
throw new RuntimeException(e);
mdScanner.setRange(new KeyExtent(tableId, null, null).toMetadataRange());
if (!mdScanner.iterator().hasNext()) {
// Read each file referenced by that table
for (Entry<Key,Value> entry : mdScanner) {
String file = entry.getKey().getColumnQualifier().toString();
String[] parts = file.split("/");
// the filename
String uniqueName = parts[parts.length - 1];
if (file.contains(":") || file.startsWith("../")) {
String ref = parts[parts.length - 3];
// Track any tables which are referenced externally by the current table
if (!ref.equals(tableId.canonical())) {
if (file.contains(":") && parts.length > 3) {
List<String> base = Arrays.asList(Arrays.copyOf(parts, parts.length - 3));
// add this file to this table
tdu.linkFileAndTable(tableId, uniqueName);
// Each table seen (provided by user, or reference by table the user provided)
for (TableId tableId : tablesReferenced) {
for (String tableDir : nameSpacesReferenced) {
// Find each file and add its size
FileStatus[] files = fs.globStatus(new Path(tableDir + "/" + tableId + "/*/*"));
if (files != null) {
for (FileStatus fileStatus : files) {
// Assumes that all filenames are unique
String name = fileStatus.getPath().getName();
tdu.addFileSize(name, fileStatus.getLen());
Map<TableId,String> reverseTableIdMap = Tables.getIdToNameMap((ClientContext) client);
TreeMap<TreeSet<String>,Long> usage = new TreeMap<>((o1, o2) -> {
int len1 = o1.size();
int len2 = o2.size();
int min = Math.min(len1, len2);
Iterator<String> iter1 = o1.iterator();
Iterator<String> iter2 = o2.iterator();
int count = 0;
while (count < min) {
String s1 =;
String s2 =;
int cmp = s1.compareTo(s2);
if (cmp != 0)
return cmp;
return len1 - len2;
for (Entry<List<TableId>,Long> entry : tdu.calculateUsage().entrySet()) {
TreeSet<String> tableNames = new TreeSet<>();
// Convert size shared by each table id into size shared by each table name
for (TableId tableId : entry.getKey())
// Make table names to shared file size
usage.put(tableNames, entry.getValue());
if (!emptyTableIds.isEmpty()) {
TreeSet<String> emptyTables = new TreeSet<>();
for (TableId tableId : emptyTableIds) {
usage.put(emptyTables, 0L);
return usage;
public static void printDiskUsage(Collection<String> tableNames, VolumeManager fs,
AccumuloClient client, Printer printer, boolean humanReadable)
throws TableNotFoundException, IOException {
HashSet<TableId> tableIds = new HashSet<>();
// Get table IDs for all tables requested to be 'du'
for (String tableName : tableNames) {
TableId tableId = Tables.getTableId((ClientContext) client, tableName);
if (tableId == null)
throw new TableNotFoundException(null, tableName, "Table " + tableName + " not found");
Map<TreeSet<String>,Long> usage = getDiskUsage(tableIds, fs, client);
String valueFormat = humanReadable ? "%9s" : "%,24d";
for (Entry<TreeSet<String>,Long> entry : usage.entrySet()) {
Object value = humanReadable ? NumUtil.bigNumberForSize(entry.getValue()) : entry.getValue();
printer.print(String.format(valueFormat + " %s", value, entry.getKey()));
static class Opts extends ServerUtilOpts {
@Parameter(description = " <table> { <table> ... } ")
List<String> tables = new ArrayList<>();
public static void main(String[] args) throws Exception {
Opts opts = new Opts();
try (TraceScope clientSpan = opts.parseArgsAndTrace(TableDiskUsage.class.getName(), args)) {
try (AccumuloClient client = Accumulo.newClient().from(opts.getClientProps()).build()) {
VolumeManager fs = opts.getServerContext().getVolumeManager();
org.apache.accumulo.server.util.TableDiskUsage.printDiskUsage(opts.tables, fs, client,