/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.upgrade.acid;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.CompactionResponse;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.ShowCompactResponse;
import org.apache.hadoop.hive.metastore.api.ShowCompactResponseElement;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.metastore.txn.TxnStore;
import org.apache.hadoop.hive.metastore.txn.TxnUtils;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.shims.HadoopShims;
import org.apache.hive.common.util.HiveVersionInfo;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.impl.AcidStats;
import org.apache.orc.impl.OrcAcidUtils;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import static org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.escapeSQLString;
/**
* This utility is designed to help with upgrading to Hive 3.0. The on-disk layout of transactional
* tables has changed in 3.0 and requires pre-processing before the upgrade to ensure the tables are
* readable by Hive 3.0. Some transactional tables (identified by this utility) require a major
* compaction to be run on them before upgrading to 3.0. Once this compaction starts, no more
* update/delete/merge statements may be executed on these tables until the upgrade is finished.
*
* Additionally, a new type of transactional table was added in 3.0 - insert-only tables. These
* tables support ACID semantics and work with any Input/OutputFormat. Any managed table may be
* made an insert-only transactional table. Such tables do not support Update/Delete/Merge commands.
*
* This utility works in 2 modes: preUpgrade and postUpgrade.
* In preUpgrade mode it must have the 2.x Hive jars on the classpath. It will analyze the
* existing transactional tables, determine which ones require compaction and generate a set of SQL
* commands to launch all of these compactions.
*
* Note that depending on the number of tables/partitions and the amount of data in them,
* compactions may take a significant amount of time and resources. The script output by this
* utility includes some heuristics that may help estimate the time required. If no script is
* produced, no action is needed. For compactions to run, an instance of standalone Hive Metastore
* must be running. Please make sure hive.compactor.worker.threads is sufficiently high - it
* specifies the limit of concurrent compactions that may be run. Each compaction job is a
* Map-Reduce job. hive.compactor.job.queue may be used to set a YARN queue name to which all
* compaction jobs will be submitted.
*
* In postUpgrade mode, the Hive 3.0 jars/hive-site.xml should be on the classpath. This utility
* will find all the tables that may be made transactional (with full CRUD support) and generate
* Alter Table commands to do so. It will also find all tables that cannot support full CRUD but
* can be made insert-only transactional tables and generate the corresponding Alter Table commands.
*
* TODO: rename files
*
* "execute" option may be supplied in both modes to have the utility automatically execute the
* equivalent of the generated commands
*
* "location" option may be supplied followed by a path to set the location for the generated
* scripts.
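*
* Example invocation (illustrative only - the exact classpath/launcher setup depends on the
* installation; only the class name and the options are taken from this class):
* <pre>
*   java -cp &lt;hive and hadoop jars&gt;:&lt;dir containing hive-site.xml&gt; \
*     org.apache.hadoop.hive.upgrade.acid.UpgradeTool -preUpgrade -location /tmp/upgrade-scripts
* </pre>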
*/
public class UpgradeTool {
private static final Logger LOG = LoggerFactory.getLogger(UpgradeTool.class);
private static final int PARTITION_BATCH_SIZE = 10000;
private final Options cmdLineOptions = new Options();
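/**
* Command-line entry point. See the class javadoc for the supported modes and options.
*/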
public static void main(String[] args) throws Exception {
UpgradeTool tool = new UpgradeTool();
tool.init();
CommandLineParser parser = new GnuParser();
CommandLine line;
String outputDir = ".";
boolean preUpgrade = false, postUpgrade = false, execute = false, nonBlocking = false;
try {
line = parser.parse(tool.cmdLineOptions, args);
} catch (ParseException e) {
System.err.println("UpgradeTool: Parsing failed. Reason: " + e.getLocalizedMessage());
printAndExit(tool);
return;
}
if (line.hasOption("help")) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("upgrade-acid", tool.cmdLineOptions);
return;
}
if(line.hasOption("location")) {
outputDir = line.getOptionValue("location");
}
if(line.hasOption("execute")) {
execute = true;
}
if(line.hasOption("preUpgrade")) {
preUpgrade = true;
}
if(line.hasOption("postUpgrade")) {
postUpgrade = true;
}
LOG.info("Starting with preUpgrade=" + preUpgrade + ", postUpgrade=" + postUpgrade +
", execute=" + execute + ", location=" + outputDir);
if(preUpgrade && postUpgrade) {
throw new IllegalArgumentException("Cannot specify both preUpgrade and postUpgrade");
}
try {
String hiveVer = HiveVersionInfo.getShortVersion();
if(preUpgrade) {
if(!hiveVer.startsWith("2.")) {
throw new IllegalStateException("preUpgrade requires Hive 2.x. Actual: " + hiveVer);
}
}
if(postUpgrade && execute && !isTestMode) {
if(!hiveVer.startsWith("3.")) {
throw new IllegalStateException("postUpgrade w/execute requires Hive 3.x. Actual: " +
hiveVer);
}
}
tool.prepareAcidUpgradeInternal(outputDir, preUpgrade, postUpgrade, execute);
}
catch(Exception ex) {
LOG.error("UpgradeTool failed", ex);
throw ex;
}
}
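/**
* Prints usage information and terminates the JVM with a non-zero exit code.
*/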
private static void printAndExit(UpgradeTool tool) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("upgrade-acid", tool.cmdLineOptions);
System.exit(1);
}
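/**
* Registers the supported command line options: help, preUpgrade, postUpgrade, execute and location.
*/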
private void init() {
try {
cmdLineOptions.addOption(new Option("help", "print this message"));
cmdLineOptions.addOption(new Option("preUpgrade",
"Generates a script to execute on 2.x cluster. This requires 2.x binaries" +
" on the classpath and hive-site.xml."));
cmdLineOptions.addOption(new Option("postUpgrade",
"Generates a script to execute on 3.x cluster. This requires 3.x binaries" +
" on the classpath and hive-site.xml."));
Option exec = new Option("execute",
"Executes commands equivalent to generated scrips");
exec.setOptionalArg(true);
cmdLineOptions.addOption(exec);
cmdLineOptions.addOption(new Option("location", true,
"Location to write scripts to. Default is CWD."));
}
catch(Exception ex) {
LOG.error("init()", ex);
throw ex;
}
}
/**
* todo: this should accept a file of table names to exclude from non-acid to acid conversion
* todo: change script comments to a preamble instead of a footer
*
* How does the rename script work? "hadoop fs -mv oldname newname" - and what about S3?
* How does this actually get executed?
* All other actions are done via embedded JDBC.
*
*
*/
private void prepareAcidUpgradeInternal(String scriptLocation, boolean preUpgrade,
boolean postUpgrade, boolean execute) throws HiveException, TException, IOException {
HiveConf conf = hiveConf != null ? hiveConf : new HiveConf();
boolean isAcidEnabled = isAcidEnabled(conf);
HiveMetaStoreClient hms = new HiveMetaStoreClient(conf);//MetaException
LOG.debug("Looking for databases");
List<String> databases = hms.getAllDatabases();//TException
LOG.debug("Found " + databases.size() + " databases to process");
List<String> compactions = new ArrayList<>();
List<String> convertToAcid = new ArrayList<>();
List<String> convertToMM = new ArrayList<>();
final CompactionMetaInfo compactionMetaInfo = new CompactionMetaInfo();
ValidTxnList txns = null;
Hive db = null;
if(execute) {
db = Hive.get(conf);
}
for(String dbName : databases) {
List<String> tables = hms.getAllTables(dbName);
LOG.debug("found " + tables.size() + " tables in " + dbName);
for(String tableName : tables) {
Table t = hms.getTable(dbName, tableName);
LOG.debug("processing table " + Warehouse.getQualifiedName(t));
if(preUpgrade && isAcidEnabled) {
//if acid is off, there can't be any acid tables - nothing to compact
if(execute && txns == null) {
/*
This API changed from 2.x to 3.0, so this won't even compile with 3.0,
but it doesn't need to since we only run this in preUpgrade mode
*/
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
txns = TxnUtils.createValidCompactTxnList(txnHandler.getOpenTxnsInfo());
}
List<String> compactionCommands =
getCompactionCommands(t, conf, hms, compactionMetaInfo, execute, db, txns);
compactions.addAll(compactionCommands);
}
if(postUpgrade && isAcidEnabled) {
//if acid is off post upgrade, you can't make any tables acid - will throw
processConversion(t, convertToAcid, convertToMM, hms, db, execute);
}
/*todo: handle renaming files somewhere*/
}
}
makeCompactionScript(compactions, scriptLocation, compactionMetaInfo);
makeConvertTableScript(convertToAcid, convertToMM, scriptLocation);
makeRenameFileScript(scriptLocation);//todo: is this pre or post upgrade?
//todo: can different tables be in different FileSystems?
if(preUpgrade && execute) {
while(compactionMetaInfo.compactionIds.size() > 0) {
LOG.debug("Will wait for " + compactionMetaInfo.compactionIds.size() +
" compactions to complete");
ShowCompactResponse resp = db.showCompactions();
for(ShowCompactResponseElement e : resp.getCompacts()) {
final String state = e.getState();
boolean removed;
switch (state) {
case TxnStore.CLEANING_RESPONSE:
case TxnStore.SUCCEEDED_RESPONSE:
removed = compactionMetaInfo.compactionIds.remove(e.getId());
if(removed) {
LOG.debug("Required compaction succeeded: " + e.toString());
}
break;
case TxnStore.ATTEMPTED_RESPONSE:
case TxnStore.FAILED_RESPONSE:
removed = compactionMetaInfo.compactionIds.remove(e.getId());
if(removed) {
LOG.warn("Required compaction failed: " + e.toString());
}
break;
case TxnStore.INITIATED_RESPONSE:
//may flood the log
//LOG.debug("Still waiting on: " + e.toString());
break;
case TxnStore.WORKING_RESPONSE:
LOG.debug("Still working on: " + e.toString());
break;
default://shouldn't be any others
LOG.error("Unexpected state for : " + e.toString());
}
}
if(compactionMetaInfo.compactionIds.size() > 0) {
try {
if (callback != null) {
callback.onWaitForCompaction();
}
Thread.sleep(pollIntervalMs);
} catch (InterruptedException ex) {
;//this only responds to ^C
}
}
}
}
}
/**
* Actually makes the table transactional.
*/
private static void alterTable(Table t, Hive db, boolean isMM)
throws HiveException, InvalidOperationException {
org.apache.hadoop.hive.ql.metadata.Table metaTable =
//clone to make sure new prop doesn't leak
new org.apache.hadoop.hive.ql.metadata.Table(t.deepCopy());
metaTable.getParameters().put(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, "true");
if(isMM) {
metaTable.getParameters()
.put(hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES, "insert_only");
}
db.alterTable(Warehouse.getQualifiedName(t), metaTable, false, null);
}
/**
* todo: handle exclusion list
* Figures out which tables to make Acid or MM and (optionally) performs the operation.
*/
private static void processConversion(Table t, List<String> convertToAcid,
List<String> convertToMM, HiveMetaStoreClient hms, Hive db, boolean execute)
throws TException, HiveException {
if(isFullAcidTable(t)) {
return;
}
if(!TableType.MANAGED_TABLE.name().equalsIgnoreCase(t.getTableType())) {
return;
}
String fullTableName = Warehouse.getQualifiedName(t);
if(t.getPartitionKeysSize() <= 0) {
if(canBeMadeAcid(fullTableName, t.getSd())) {
convertToAcid.add("ALTER TABLE " + Warehouse.getQualifiedName(t) + " SET TBLPROPERTIES (" +
"'transactional'='true')");
if(execute) {
alterTable(t, db, false);
}
}
else {
convertToMM.add("ALTER TABLE " + Warehouse.getQualifiedName(t) + " SET TBLPROPERTIES (" +
"'transactional'='true', 'transactional_properties'='insert_only')");
if(execute) {
alterTable(t, db, true);
}
}
}
else {
/*
each Partition may have different I/O Format so have to check them all before deciding to
make a full CRUD table.
Run in batches to prevent OOM
*/
List<String> partNames = hms.listPartitionNames(t.getDbName(), t.getTableName(), (short)-1);
int batchSize = PARTITION_BATCH_SIZE;
int numWholeBatches = partNames.size()/batchSize;
for(int i = 0; i < numWholeBatches; i++) {
List<Partition> partitionList = hms.getPartitionsByNames(t.getDbName(), t.getTableName(),
partNames.subList(i * batchSize, (i + 1) * batchSize));
if(alterTable(fullTableName, partitionList, convertToMM, t, db, execute)) {
return;
}
}
if(numWholeBatches * batchSize < partNames.size()) {
//last partial batch
List<Partition> partitionList = hms.getPartitionsByNames(t.getDbName(), t.getTableName(),
partNames.subList(numWholeBatches * batchSize, partNames.size()));
if(alterTable(fullTableName, partitionList, convertToMM, t, db, execute)) {
return;
}
}
//if here checked all parts and they are Acid compatible - make it acid
convertToAcid.add("ALTER TABLE " + Warehouse.getQualifiedName(t) + " SET TBLPROPERTIES (" +
"'transactional'='true')");
if(execute) {
alterTable(t, db, false);
}
}
}
/**
* @return true if table was converted/command generated
*/
private static boolean alterTable(String fullTableName, List<Partition> partitionList,
List<String> convertToMM, Table t, Hive db, boolean execute)
throws InvalidOperationException, HiveException {
for(Partition p : partitionList) {
if(!canBeMadeAcid(fullTableName, p.getSd())) {
convertToMM.add("ALTER TABLE " + Warehouse.getQualifiedName(t) + " SET TBLPROPERTIES (" +
"'transactional'='true', 'transactional_properties'='insert_only')");
if(execute) {
alterTable(t, db, true);
}
return true;
}
}
return false;
}
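/**
* A table/partition can be converted to a full CRUD acid table only if its Input/OutputFormats
* are acid capable and it has no SORTED BY columns.
*/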
private static boolean canBeMadeAcid(String fullTableName, StorageDescriptor sd) {
return isAcidInputOutputFormat(fullTableName, sd) && sd.getSortColsSize() <= 0;
}
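/**
* Checks whether the declared InputFormat/OutputFormat classes (resolved by name via reflection)
* implement Hive's AcidInputFormat/AcidOutputFormat.
*/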
private static boolean isAcidInputOutputFormat(String fullTableName, StorageDescriptor sd) {
try {
Class inputFormatClass = sd.getInputFormat() == null ? null :
Class.forName(sd.getInputFormat());
Class outputFormatClass = sd.getOutputFormat() == null ? null :
Class.forName(sd.getOutputFormat());
if (inputFormatClass != null && outputFormatClass != null &&
Class.forName("org.apache.hadoop.hive.ql.io.AcidInputFormat")
.isAssignableFrom(inputFormatClass) &&
Class.forName("org.apache.hadoop.hive.ql.io.AcidOutputFormat")
.isAssignableFrom(outputFormatClass)) {
return true;
}
} catch (ClassNotFoundException e) {
//if a table is using some custom I/O format and it's not in the classpath, we won't mark
//the table for Acid, but today (Hive 3.1 and earlier) OrcInput/OutputFormat is the only
//Acid format
LOG.error("Could not determine if " + fullTableName +
" can be made Acid due to: " + e.getMessage(), e);
return false;
}
return false;
}
/**
* Generates a set of compaction commands to run on a pre-Hive 3 cluster.
*/
private static void makeCompactionScript(List<String> commands, String scriptLocation,
CompactionMetaInfo compactionMetaInfo) throws IOException {
if (commands.isEmpty()) {
LOG.info("No compaction is necessary");
return;
}
String fileName = "compacts_" + System.currentTimeMillis() + ".sql";
LOG.debug("Writing compaction commands to " + fileName);
try(PrintWriter pw = createScript(commands, fileName, scriptLocation)) {
//add post script
pw.println("-- Generated total of " + commands.size() + " compaction commands");
if(compactionMetaInfo.numberOfBytes < Math.pow(2, 20)) {
//print small volumes with enough precision to be visible in unit tests
pw.println("-- The total volume of data to be compacted is " +
String.format("%.6fMB", compactionMetaInfo.numberOfBytes/Math.pow(2, 20)));
}
else {
pw.println("-- The total volume of data to be compacted is " +
String.format("%.3fGB", compactionMetaInfo.numberOfBytes/Math.pow(2, 30)));
}
pw.println();
//todo: should be at the top of the file...
pw.println(
"-- Please note that compaction may be a heavyweight and time consuming process.\n" +
"-- Submitting all of these commands will enqueue them to a scheduling queue from\n" +
"-- which they will be picked up by compactor Workers. The max number of\n" +
"-- concurrent Workers is controlled by hive.compactor.worker.threads configured\n" +
"-- for the standalone metastore process. Compaction itself is a Map-Reduce job\n" +
"-- which is submitted to the YARN queue identified by hive.compactor.job.queue\n" +
"-- property if defined or 'default' if not defined. It's advisable to set the\n" +
"-- capacity of this queue appropriately");
}
}
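/**
* Writes the generated ALTER TABLE statements for full CRUD and insert-only conversions to
* timestamped .sql scripts in {@code scriptLocation}.
*/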
private static void makeConvertTableScript(List<String> alterTableAcid, List<String> alterTableMm,
String scriptLocation) throws IOException {
if (alterTableAcid.isEmpty()) {
LOG.info("No acid conversion is necessary");
} else {
String fileName = "convertToAcid_" + System.currentTimeMillis() + ".sql";
LOG.debug("Writing CRUD conversion commands to " + fileName);
try(PrintWriter pw = createScript(alterTableAcid, fileName, scriptLocation)) {
//todo: fix this - it has to run in 3.0 since tables may be unbucketed
pw.println("-- These commands may be executed by Hive 1.x later");
}
}
if (alterTableMm.isEmpty()) {
LOG.info("No managed table conversion is necessary");
} else {
String fileName = "convertToMM_" + System.currentTimeMillis() + ".sql";
LOG.debug("Writing managed table conversion commands to " + fileName);
try(PrintWriter pw = createScript(alterTableMm, fileName, scriptLocation)) {
pw.println("-- These commands must be executed by Hive 3.0 or later");
}
}
}
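/**
* Writes each command, terminated by ';', to a new file under {@code scriptLocation} and returns
* the still-open writer so the caller can append trailing comments; the caller must close it.
*/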
private static PrintWriter createScript(List<String> commands, String fileName,
String scriptLocation) throws IOException {
FileWriter fw = new FileWriter(scriptLocation + "/" + fileName);
PrintWriter pw = new PrintWriter(fw);
for(String cmd : commands) {
pw.println(cmd + ";");
}
return pw;
}
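/**
* Placeholder for generating a file renaming script - the command list is currently always empty
* (see the todos about renaming files above).
*/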
private static void makeRenameFileScript(String scriptLocation) throws IOException {
List<String> commands = Collections.emptyList();
if (commands.isEmpty()) {
LOG.info("No file renaming is necessary");
} else {
String fileName = "normalizeFileNames_" + System.currentTimeMillis() + ".sh";
LOG.debug("Writing file renaming commands to " + fileName);
PrintWriter pw = createScript(commands, fileName, scriptLocation);
pw.close();
}
}
/**
* @return any compaction commands to run for {@code Table t}
*/
private static List<String> getCompactionCommands(Table t, HiveConf conf,
HiveMetaStoreClient hms, CompactionMetaInfo compactionMetaInfo, boolean execute, Hive db,
ValidTxnList txns) throws IOException, TException, HiveException {
if(!isFullAcidTable(t)) {
return Collections.emptyList();
}
if(t.getPartitionKeysSize() <= 0) {
//not partitioned
if(!needsCompaction(new Path(t.getSd().getLocation()), conf, compactionMetaInfo, txns)) {
return Collections.emptyList();
}
List<String> cmds = new ArrayList<>();
cmds.add(getCompactionCommand(t, null));
if(execute) {
scheduleCompaction(t, null, db, compactionMetaInfo);
}
return cmds;
}
List<String> partNames = hms.listPartitionNames(t.getDbName(), t.getTableName(), (short)-1);
int batchSize = PARTITION_BATCH_SIZE;
int numWholeBatches = partNames.size()/batchSize;
List<String> compactionCommands = new ArrayList<>();
for(int i = 0; i < numWholeBatches; i++) {
List<Partition> partitionList = hms.getPartitionsByNames(t.getDbName(), t.getTableName(),
partNames.subList(i * batchSize, (i + 1) * batchSize));
getCompactionCommands(t, partitionList, db, execute, compactionCommands,
compactionMetaInfo, conf, txns);
}
if(numWholeBatches * batchSize < partNames.size()) {
//last partial batch
List<Partition> partitionList = hms.getPartitionsByNames(t.getDbName(), t.getTableName(),
partNames.subList(numWholeBatches * batchSize, partNames.size()));
getCompactionCommands(t, partitionList, db, execute, compactionCommands,
compactionMetaInfo, conf, txns);
}
return compactionCommands;
}
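/**
* Checks each partition in the batch and, for those that need compaction, adds the corresponding
* command (and schedules the compaction if {@code execute} is set).
*/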
private static void getCompactionCommands(Table t, List<Partition> partitionList, Hive db,
boolean execute, List<String> compactionCommands, CompactionMetaInfo compactionMetaInfo,
HiveConf conf, ValidTxnList txns)
throws IOException, TException, HiveException {
for (Partition p : partitionList) {
if (needsCompaction(new Path(p.getSd().getLocation()), conf, compactionMetaInfo, txns)) {
compactionCommands.add(getCompactionCommand(t, p));
if (execute) {
scheduleCompaction(t, p, db, compactionMetaInfo);
}
}
}
}
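/**
* Submits a major compaction request and records the returned compaction id so that the tool can
* later wait only for the compactions it launched itself.
*/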
private static void scheduleCompaction(Table t, Partition p, Hive db,
CompactionMetaInfo compactionMetaInfo) throws HiveException, MetaException {
String partName = p == null ? null :
Warehouse.makePartName(t.getPartitionKeys(), p.getValues());
CompactionResponse resp =
//this gives an easy way to get at compaction ID so we can only wait for those this
//utility started
db.compact2(t.getDbName(), t.getTableName(), partName, "major", null);
if(!resp.isAccepted()) {
LOG.info(Warehouse.getQualifiedName(t) + (p == null ? "" : "/" + partName) +
" is already being compacted with id=" + resp.getId());
}
else {
LOG.info("Scheduled compaction for " + Warehouse.getQualifiedName(t) +
(p == null ? "" : "/" + partName) + " with id=" + resp.getId());
}
compactionMetaInfo.compactionIds.add(resp.getId());
}
/**
*
* @param location - path to a partition (or table if not partitioned) dir
*/
private static boolean needsCompaction2(Path location, HiveConf conf,
CompactionMetaInfo compactionMetaInfo) throws IOException {
FileSystem fs = location.getFileSystem(conf);
FileStatus[] deltas = fs.listStatus(location, new PathFilter() {
@Override
public boolean accept(Path path) {
//checking for delete_delta is only so that this functionality can be exercised by 3.0 code,
//which cannot produce any deltas with a mix of update/insert events
return path.getName().startsWith("delta_") || path.getName().startsWith("delete_delta_");
}
});
if(deltas == null || deltas.length == 0) {
//base_n cannot contain update/delete. Original files are all 'insert' and we need to compact
//only if there are update/delete events.
return false;
}
deltaLoop: for(FileStatus delta : deltas) {
if(!delta.isDirectory()) {
//should never happen - just in case
continue;
}
FileStatus[] buckets = fs.listStatus(delta.getPath(), new PathFilter() {
@Override
public boolean accept(Path path) {
//since this is inside a delta dir created by Hive 2.x or earlier it can only contain
//bucket_x or bucket_x__flush_length
return path.getName().startsWith("bucket_");
}
});
for(FileStatus bucket : buckets) {
if(bucket.getPath().getName().endsWith("_flush_length")) {
//streaming ingest dir - cannot have update/delete events
continue deltaLoop;
}
if(needsCompaction(bucket, fs)) {
//found delete events - this 'location' needs compacting
compactionMetaInfo.numberOfBytes += getDataSize(location, conf);
//todo: this is not remotely accurate if you have many (relevant) original files
return true;
}
}
}
return false;
}
/**
*
* @param location - path to a partition (or table if not partitioned) dir
*/
private static boolean needsCompaction(Path location, HiveConf conf,
CompactionMetaInfo compactionMetaInfo, ValidTxnList txns) throws IOException {
FileSystem fs = location.getFileSystem(conf);
FileStatus[] deltas = fs.listStatus(location, new PathFilter() {
@Override
public boolean accept(Path path) {
//checking for delete_delta is only so that this functionality can be exercised by 3.0 code,
//which cannot produce any deltas with a mix of update/insert events
return path.getName().startsWith("delta_") || path.getName().startsWith("delete_delta_");
}
});
if(deltas == null || deltas.length == 0) {
//base_n cannot contain update/delete. Original files are all 'insert' and we need to compact
//only if there are update/delete events.
return false;
}
/*getAcidState() is smart enough not to return any deltas in the current directories if there is
* a base that covers them, i.e. if they were compacted but not yet cleaned. This means
* re-checking whether compaction is needed should be cheap(er)*/
AcidUtils.Directory dir = AcidUtils.getAcidState(location, conf, txns);
deltaLoop: for(AcidUtils.ParsedDelta delta : dir.getCurrentDirectories()) {
FileStatus[] buckets = fs.listStatus(delta.getPath(), new PathFilter() {
@Override
public boolean accept(Path path) {
//since this is inside a delta dir created by Hive 2.x or earlier it can only contain
//bucket_x or bucket_x__flush_length
return path.getName().startsWith("bucket_");
}
});
for(FileStatus bucket : buckets) {
if(bucket.getPath().getName().endsWith("_flush_length")) {
//streaming ingest dir - cannot have update/delete events
continue deltaLoop;
}
if(needsCompaction(bucket, fs)) {
//found delete events - this 'location' needs compacting
compactionMetaInfo.numberOfBytes += getDataSize(location, conf);
//if there are un-compacted original files, they will be included in compaction, so
//count their size for the 'cost' estimate later
for(HadoopShims.HdfsFileStatusWithId origFile : dir.getOriginalFiles()) {
FileStatus fileStatus = origFile.getFileStatus();
if(fileStatus != null) {
compactionMetaInfo.numberOfBytes += fileStatus.getLen();
}
}
return true;
}
}
}
return false;
}
/**
* @param location - path to a partition (or table if not partitioned) dir
*/
private static long getDataSize(Path location, HiveConf conf) throws IOException {
FileSystem fs = location.getFileSystem(conf);
ContentSummary cs = fs.getContentSummary(location);
return cs.getLength();
}
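/**
* Opens the bucket file and checks the acid stats recorded by the writer to see whether it
* contains any update/delete events.
*/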
private static boolean needsCompaction(FileStatus bucket, FileSystem fs) throws IOException {
//create reader, look at footer
//no need to check side file since it can only be in a streaming ingest delta
Reader orcReader = OrcFile.createReader(bucket.getPath(), OrcFile.readerOptions(fs.getConf())
.filesystem(fs));
AcidStats as = OrcAcidUtils.parseAcidStats(orcReader);
if(as == null) {
//should never happen since we are reading bucket_x written by acid write
throw new IllegalStateException("AcidStats missing in " + bucket.getPath());
}
return as.deletes > 0 || as.updates > 0;
}
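/**
* Builds the "ALTER TABLE ... [PARTITION(...)] COMPACT 'major'" statement for the given
* table/partition.
*/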
private static String getCompactionCommand(Table t, Partition p) {
StringBuilder sb = new StringBuilder("ALTER TABLE ").append(Warehouse.getQualifiedName(t));
if(t.getPartitionKeysSize() > 0) {
assert p != null : "must supply partition for partitioned table " +
Warehouse.getQualifiedName(t);
sb.append(" PARTITION(");
for (int i = 0; i < t.getPartitionKeysSize(); i++) {
sb.append(t.getPartitionKeys().get(i).getName()).append('=').append(
genPartValueString(t.getPartitionKeys().get(i).getType(), p.getValues().get(i))).
append(",");
}
sb.setCharAt(sb.length() - 1, ')');//replace trailing ','
}
return sb.append(" COMPACT 'major'").toString();
}
/**
* This is copy-pasted from {@link org.apache.hadoop.hive.ql.parse.ColumnStatsSemanticAnalyzer},
* which can't be refactored since this is linked against Hive 2.x
*/
private static String genPartValueString(String partColType, String partVal) {
String returnVal = partVal;
if (partColType.equals(serdeConstants.STRING_TYPE_NAME) ||
partColType.contains(serdeConstants.VARCHAR_TYPE_NAME) ||
partColType.contains(serdeConstants.CHAR_TYPE_NAME)) {
returnVal = "'" + escapeSQLString(partVal) + "'";
} else if (partColType.equals(serdeConstants.TINYINT_TYPE_NAME)) {
returnVal = partVal + "Y";
} else if (partColType.equals(serdeConstants.SMALLINT_TYPE_NAME)) {
returnVal = partVal + "S";
} else if (partColType.equals(serdeConstants.INT_TYPE_NAME)) {
returnVal = partVal;
} else if (partColType.equals(serdeConstants.BIGINT_TYPE_NAME)) {
returnVal = partVal + "L";
} else if (partColType.contains(serdeConstants.DECIMAL_TYPE_NAME)) {
returnVal = partVal + "BD";
} else if (partColType.equals(serdeConstants.DATE_TYPE_NAME) ||
partColType.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
returnVal = partColType + " '" + escapeSQLString(partVal) + "'";
} else {
//for other, rarely used types, just quote the value
returnVal = "'" + escapeSQLString(partVal) + "'";
}
return returnVal;
}
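/**
* Returns true if the table already has the 'transactional'='true' table property set.
*/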
private static boolean isFullAcidTable(Table t) {
if (t.getParametersSize() <= 0) {
//cannot be acid
return false;
}
String transactionalValue = t.getParameters()
.get(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL);
if (transactionalValue != null && "true".equalsIgnoreCase(transactionalValue)) {
System.out.println("Found Acid table: " + Warehouse.getQualifiedName(t));
return true;
}
return false;
}
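/**
* Acid is considered enabled when DbTxnManager is the configured transaction manager and
* concurrency support is on.
*/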
private static boolean isAcidEnabled(HiveConf hiveConf) {
String txnMgr = hiveConf.getVar(HiveConf.ConfVars.HIVE_TXN_MANAGER);
boolean concurrency = hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY);
String dbTxnMgr = "org.apache.hadoop.hive.ql.lockmgr.DbTxnManager";
return txnMgr.equals(dbTxnMgr) && concurrency;
}
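/**
* Aggregate information about the compactions this utility generated/launched.
*/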
private static class CompactionMetaInfo {
/**
* total number of bytes to be compacted across all compaction commands
*/
long numberOfBytes;
/**
* IDs of compactions launched by this utility
*/
Set<Long> compactionIds = new HashSet<>();
}
@VisibleForTesting
static abstract class Callback {
/**
* This is a hack to enable unit testing. Derby can't handle multiple concurrent threads, but
* the Compactor somehow needs to run to test "execute" mode. This callback can be used
* to run the Worker. For TESTING ONLY.
*/
void onWaitForCompaction() throws MetaException {}
}
@VisibleForTesting
static Callback callback;
@VisibleForTesting
static int pollIntervalMs = 1000*30;
/**
* Also to enable testing until I set up Maven profiles to be able to run with 3.0 jars
*/
@VisibleForTesting
static boolean isTestMode = false;
/**
* Can be set from tests when the config needs something other than the default values.
*/
@VisibleForTesting
static HiveConf hiveConf = null;
}