blob: 3f7dc74f1de8242db9f766c210076f6538fd3db5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.accumulo.master;
import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly;
import static java.lang.Math.min;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
import org.apache.accumulo.core.Constants;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.BatchWriter;
import org.apache.accumulo.core.client.BatchWriterConfig;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.MutationsRejectedException;
import org.apache.accumulo.core.client.RowIterator;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.PartialKey;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.data.impl.KeyExtent;
import org.apache.accumulo.core.master.state.tables.TableState;
import org.apache.accumulo.core.master.thrift.MasterState;
import org.apache.accumulo.core.master.thrift.TabletServerStatus;
import org.apache.accumulo.core.metadata.MetadataTable;
import org.apache.accumulo.core.metadata.RootTable;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.ChoppedColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.CurrentLocationColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.DataFileColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.FutureLocationColumnFamily;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.tabletserver.thrift.NotServingTabletException;
import org.apache.accumulo.core.util.Daemon;
import org.apache.accumulo.master.Master.TabletGoalState;
import org.apache.accumulo.master.state.MergeStats;
import org.apache.accumulo.master.state.TableCounts;
import org.apache.accumulo.master.state.TableStats;
import org.apache.accumulo.server.ServerConstants;
import org.apache.accumulo.server.conf.TableConfiguration;
import org.apache.accumulo.server.fs.FileRef;
import org.apache.accumulo.server.fs.VolumeManager.FileType;
import org.apache.accumulo.server.log.WalStateManager;
import org.apache.accumulo.server.log.WalStateManager.WalMarkerException;
import org.apache.accumulo.server.master.LiveTServerSet.TServerConnection;
import org.apache.accumulo.server.master.state.Assignment;
import org.apache.accumulo.server.master.state.ClosableIterator;
import org.apache.accumulo.server.master.state.DistributedStoreException;
import org.apache.accumulo.server.master.state.MergeInfo;
import org.apache.accumulo.server.master.state.MergeState;
import org.apache.accumulo.server.master.state.TServerInstance;
import org.apache.accumulo.server.master.state.TabletLocationState;
import org.apache.accumulo.server.master.state.TabletLocationState.BadLocationStateException;
import org.apache.accumulo.server.master.state.TabletState;
import org.apache.accumulo.server.master.state.TabletStateStore;
import org.apache.accumulo.server.tables.TableManager;
import org.apache.accumulo.server.tablets.TabletTime;
import org.apache.accumulo.server.util.MetadataTableUtil;
import org.apache.accumulo.server.zookeeper.ZooReaderWriter;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.thrift.TException;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Iterators;
abstract class TabletGroupWatcher extends Daemon {
// Constants used to make sure assignment logging isn't excessive in quantity or size
private static final String ASSIGNMENT_BUFFER_SEPARATOR = ", ";
private static final int ASSINGMENT_BUFFER_MAX_LENGTH = 4096;
private final Master master;
final TabletStateStore store;
final TabletGroupWatcher dependentWatcher;
private MasterState masterState;
final TableStats stats = new TableStats();
private SortedSet<TServerInstance> lastScanServers = ImmutableSortedSet.of();
TabletGroupWatcher(Master master, TabletStateStore store, TabletGroupWatcher dependentWatcher) {
this.master = master;
this.store = store;
this.dependentWatcher = dependentWatcher;
}
/** Should this {@code TabletGroupWatcher} suspend tablets? */
abstract boolean canSuspendTablets();
Map<String,TableCounts> getStats() {
return stats.getLast();
}
// returns the master state under which stats were collected
MasterState statsState() {
return masterState;
}
TableCounts getStats(String tableId) {
return stats.getLast(tableId);
}
/** True if the collection of live tservers specified in 'candidates' hasn't changed since the last time an assignment scan was started. */
public synchronized boolean isSameTserversAsLastScan(Set<TServerInstance> candidates) {
return candidates.equals(lastScanServers);
}
@Override
public void run() {
Thread.currentThread().setName("Watching " + store.name());
int[] oldCounts = new int[TabletState.values().length];
EventCoordinator.Listener eventListener = this.master.nextEvent.getListener();
WalStateManager wals = new WalStateManager(master.getInstance(), ZooReaderWriter.getInstance());
while (this.master.stillMaster()) {
// slow things down a little, otherwise we spam the logs when there are many wake-up events
sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
masterState = master.getMasterState();
int totalUnloaded = 0;
int unloaded = 0;
ClosableIterator<TabletLocationState> iter = null;
try {
Map<String,MergeStats> mergeStatsCache = new HashMap<>();
Map<String,MergeStats> currentMerges = new HashMap<>();
for (MergeInfo merge : master.merges()) {
if (merge.getExtent() != null) {
currentMerges.put(merge.getExtent().getTableId(), new MergeStats(merge));
}
}
// Get the current status for the current list of tservers
SortedMap<TServerInstance,TabletServerStatus> currentTServers = new TreeMap<>();
for (TServerInstance entry : this.master.tserverSet.getCurrentServers()) {
currentTServers.put(entry, this.master.tserverStatus.get(entry));
}
if (currentTServers.size() == 0) {
eventListener.waitForEvents(Master.TIME_TO_WAIT_BETWEEN_SCANS);
synchronized (this) {
lastScanServers = ImmutableSortedSet.of();
}
continue;
}
// Don't move tablets to servers that are shutting down
SortedMap<TServerInstance,TabletServerStatus> destinations = new TreeMap<>(currentTServers);
destinations.keySet().removeAll(this.master.serversToShutdown);
List<Assignment> assignments = new ArrayList<Assignment>();
List<Assignment> assigned = new ArrayList<Assignment>();
List<TabletLocationState> assignedToDeadServers = new ArrayList<>();
List<TabletLocationState> suspendedToGoneServers = new ArrayList<>();
Map<KeyExtent,TServerInstance> unassigned = new HashMap<>();
Map<TServerInstance,List<Path>> logsForDeadServers = new TreeMap<>();
MasterState masterState = master.getMasterState();
int[] counts = new int[TabletState.values().length];
stats.begin();
// Walk through the tablets in our store, and work tablets
// towards their goal
iter = store.iterator();
while (iter.hasNext()) {
TabletLocationState tls = iter.next();
if (tls == null) {
continue;
}
Master.log.debug(store.name() + " location State: " + tls);
// ignore entries for tables that do not exist in zookeeper
if (TableManager.getInstance().getTableState(tls.extent.getTableId()) == null)
continue;
if (Master.log.isTraceEnabled())
Master.log.trace(tls + " walogs " + tls.walogs.size());
// Don't overwhelm the tablet servers with work
if (unassigned.size() + unloaded > Master.MAX_TSERVER_WORK_CHUNK * currentTServers.size()) {
flushChanges(destinations, assignments, assigned, assignedToDeadServers, logsForDeadServers, suspendedToGoneServers, unassigned);
assignments.clear();
assigned.clear();
assignedToDeadServers.clear();
suspendedToGoneServers.clear();
unassigned.clear();
unloaded = 0;
eventListener.waitForEvents(Master.TIME_TO_WAIT_BETWEEN_SCANS);
}
String tableId = tls.extent.getTableId();
TableConfiguration tableConf = this.master.getConfigurationFactory().getTableConfiguration(tableId);
MergeStats mergeStats = mergeStatsCache.get(tableId);
if (mergeStats == null) {
mergeStats = currentMerges.get(tableId);
if (mergeStats == null) {
mergeStats = new MergeStats(new MergeInfo());
}
mergeStatsCache.put(tableId, mergeStats);
}
TabletGoalState goal = this.master.getGoalState(tls, mergeStats.getMergeInfo());
TServerInstance server = tls.getServer();
TabletState state = tls.getState(currentTServers.keySet());
if (Master.log.isTraceEnabled()) {
Master.log.trace("Goal state " + goal + " current " + state + " for " + tls.extent);
}
stats.update(tableId, state);
mergeStats.update(tls.extent, state, tls.chopped, !tls.walogs.isEmpty());
sendChopRequest(mergeStats.getMergeInfo(), state, tls);
sendSplitRequest(mergeStats.getMergeInfo(), state, tls);
// Always follow through with assignments
if (state == TabletState.ASSIGNED) {
goal = TabletGoalState.HOSTED;
}
// if we are shutting down all the tabletservers, we have to do it in order
if (goal == TabletGoalState.SUSPENDED && state == TabletState.HOSTED) {
if (this.master.serversToShutdown.equals(currentTServers.keySet())) {
if (dependentWatcher != null && dependentWatcher.assignedOrHosted() > 0) {
goal = TabletGoalState.HOSTED;
}
}
}
if (goal == TabletGoalState.HOSTED) {
if (state != TabletState.HOSTED && !tls.walogs.isEmpty()) {
if (this.master.recoveryManager.recoverLogs(tls.extent, tls.walogs))
continue;
}
switch (state) {
case HOSTED:
if (server.equals(this.master.migrations.get(tls.extent)))
this.master.migrations.remove(tls.extent);
break;
case ASSIGNED_TO_DEAD_SERVER:
assignedToDeadServers.add(tls);
if (server.equals(this.master.migrations.get(tls.extent)))
this.master.migrations.remove(tls.extent);
TServerInstance tserver = tls.futureOrCurrent();
if (!logsForDeadServers.containsKey(tserver)) {
logsForDeadServers.put(tserver, wals.getWalsInUse(tserver));
}
break;
case SUSPENDED:
if (master.getSteadyTime() - tls.suspend.suspensionTime < tableConf.getTimeInMillis(Property.TABLE_SUSPEND_DURATION)) {
// Tablet is suspended. See if its tablet server is back.
TServerInstance returnInstance = null;
Iterator<TServerInstance> find = destinations.tailMap(new TServerInstance(tls.suspend.server, " ")).keySet().iterator();
if (find.hasNext()) {
TServerInstance found = find.next();
if (found.getLocation().equals(tls.suspend.server)) {
returnInstance = found;
}
}
// Old tablet server is back. Return this tablet to its previous owner.
if (returnInstance != null) {
assignments.add(new Assignment(tls.extent, returnInstance));
} else {
// leave suspended, don't ask for a new assignment.
}
} else {
// Treat as unassigned, ask for a new assignment.
unassigned.put(tls.extent, server);
}
break;
case UNASSIGNED:
// maybe it's a finishing migration
TServerInstance dest = this.master.migrations.get(tls.extent);
if (dest != null) {
// if destination is still good, assign it
if (destinations.keySet().contains(dest)) {
assignments.add(new Assignment(tls.extent, dest));
} else {
// get rid of this migration
this.master.migrations.remove(tls.extent);
unassigned.put(tls.extent, server);
}
} else {
unassigned.put(tls.extent, server);
}
break;
case ASSIGNED:
// Send another reminder
assigned.add(new Assignment(tls.extent, tls.future));
break;
}
} else {
switch (state) {
case SUSPENDED:
// Request a move to UNASSIGNED, so as to allow balancing to continue.
suspendedToGoneServers.add(tls);
cancelOfflineTableMigrations(tls);
break;
case UNASSIGNED:
cancelOfflineTableMigrations(tls);
break;
case ASSIGNED_TO_DEAD_SERVER:
assignedToDeadServers.add(tls);
if (!logsForDeadServers.containsKey(tls.futureOrCurrent())) {
logsForDeadServers.put(tls.futureOrCurrent(), wals.getWalsInUse(tls.futureOrCurrent()));
}
break;
case HOSTED:
TServerConnection conn = this.master.tserverSet.getConnection(server);
if (conn != null) {
conn.unloadTablet(this.master.masterLock, tls.extent, goal.howUnload(), master.getSteadyTime());
unloaded++;
totalUnloaded++;
} else {
Master.log.warn("Could not connect to server " + server);
}
break;
case ASSIGNED:
break;
}
}
counts[state.ordinal()]++;
}
flushChanges(destinations, assignments, assigned, assignedToDeadServers, logsForDeadServers, suspendedToGoneServers, unassigned);
// provide stats after flushing changes to avoid race conditions w/ delete table
stats.end(masterState);
// Report changes
for (TabletState state : TabletState.values()) {
int i = state.ordinal();
if (counts[i] > 0 && counts[i] != oldCounts[i]) {
this.master.nextEvent.event("[%s]: %d tablets are %s", store.name(), counts[i], state.name());
}
}
Master.log.debug(String.format("[%s]: scan time %.2f seconds", store.name(), stats.getScanTime() / 1000.));
oldCounts = counts;
if (totalUnloaded > 0) {
this.master.nextEvent.event("[%s]: %d tablets unloaded", store.name(), totalUnloaded);
}
updateMergeState(mergeStatsCache);
synchronized (this) {
lastScanServers = ImmutableSortedSet.copyOf(currentTServers.keySet());
}
if (this.master.tserverSet.getCurrentServers().equals(currentTServers.keySet())) {
Master.log.debug(String.format("[%s] sleeping for %.2f seconds", store.name(), Master.TIME_TO_WAIT_BETWEEN_SCANS / 1000.));
eventListener.waitForEvents(Master.TIME_TO_WAIT_BETWEEN_SCANS);
} else {
Master.log.info("Detected change in current tserver set, re-running state machine.");
}
} catch (Exception ex) {
Master.log.error("Error processing table state for store " + store.name(), ex);
if (ex.getCause() != null && ex.getCause() instanceof BadLocationStateException) {
repairMetadata(((BadLocationStateException) ex.getCause()).getEncodedEndRow());
} else {
sleepUninterruptibly(Master.WAIT_BETWEEN_ERRORS, TimeUnit.MILLISECONDS);
}
} finally {
if (iter != null) {
try {
iter.close();
} catch (IOException ex) {
Master.log.warn("Error closing TabletLocationState iterator: " + ex, ex);
}
}
}
}
}
private void cancelOfflineTableMigrations(TabletLocationState tls) {
TServerInstance dest = this.master.migrations.get(tls.extent);
TableState tableState = TableManager.getInstance().getTableState(tls.extent.getTableId());
if (dest != null && tableState == TableState.OFFLINE) {
this.master.migrations.remove(tls.extent);
}
}
private void repairMetadata(Text row) {
Master.log.debug("Attempting repair on " + row);
// ACCUMULO-2261 if a dying tserver writes a location before its lock information propagates, it may cause duplicate assignment.
// Attempt to find the dead server entry and remove it.
try {
Map<Key,Value> future = new HashMap<>();
Map<Key,Value> assigned = new HashMap<>();
KeyExtent extent = new KeyExtent(row, new Value(new byte[] {0}));
String table = MetadataTable.NAME;
if (extent.isMeta())
table = RootTable.NAME;
Scanner scanner = this.master.getConnector().createScanner(table, Authorizations.EMPTY);
scanner.fetchColumnFamily(CurrentLocationColumnFamily.NAME);
scanner.fetchColumnFamily(FutureLocationColumnFamily.NAME);
scanner.setRange(new Range(row));
for (Entry<Key,Value> entry : scanner) {
if (entry.getKey().getColumnFamily().equals(CurrentLocationColumnFamily.NAME)) {
assigned.put(entry.getKey(), entry.getValue());
} else if (entry.getKey().getColumnFamily().equals(FutureLocationColumnFamily.NAME)) {
future.put(entry.getKey(), entry.getValue());
}
}
if (future.size() > 0 && assigned.size() > 0) {
Master.log.warn("Found a tablet assigned and hosted, attempting to repair");
} else if (future.size() > 1 && assigned.size() == 0) {
Master.log.warn("Found a tablet assigned to multiple servers, attempting to repair");
} else if (future.size() == 0 && assigned.size() > 1) {
Master.log.warn("Found a tablet hosted on multiple servers, attempting to repair");
} else {
Master.log.info("Attempted a repair, but nothing seems to be obviously wrong. " + assigned + " " + future);
return;
}
Iterator<Entry<Key,Value>> iter = Iterators.concat(future.entrySet().iterator(), assigned.entrySet().iterator());
while (iter.hasNext()) {
Entry<Key,Value> entry = iter.next();
TServerInstance alive = master.tserverSet.find(entry.getValue().toString());
if (alive == null) {
Master.log.info("Removing entry " + entry);
BatchWriter bw = this.master.getConnector().createBatchWriter(table, new BatchWriterConfig());
Mutation m = new Mutation(entry.getKey().getRow());
m.putDelete(entry.getKey().getColumnFamily(), entry.getKey().getColumnQualifier());
bw.addMutation(m);
bw.close();
return;
}
}
Master.log.error("Metadata table is inconsistent at " + row + " and all assigned/future tservers are still online.");
} catch (Throwable e) {
Master.log.error("Error attempting repair of metadata " + row + ": " + e, e);
}
}
private int assignedOrHosted() {
int result = 0;
for (TableCounts counts : stats.getLast().values()) {
result += counts.assigned() + counts.hosted();
}
return result;
}
private void sendSplitRequest(MergeInfo info, TabletState state, TabletLocationState tls) {
// Already split?
if (!info.getState().equals(MergeState.SPLITTING))
return;
// Merges don't split
if (!info.isDelete())
return;
// Online and ready to split?
if (!state.equals(TabletState.HOSTED))
return;
// Does this extent cover the end points of the delete?
KeyExtent range = info.getExtent();
if (tls.extent.overlaps(range)) {
for (Text splitPoint : new Text[] {range.getPrevEndRow(), range.getEndRow()}) {
if (splitPoint == null)
continue;
if (!tls.extent.contains(splitPoint))
continue;
if (splitPoint.equals(tls.extent.getEndRow()))
continue;
if (splitPoint.equals(tls.extent.getPrevEndRow()))
continue;
try {
TServerConnection conn;
conn = this.master.tserverSet.getConnection(tls.current);
if (conn != null) {
Master.log.info("Asking " + tls.current + " to split " + tls.extent + " at " + splitPoint);
conn.splitTablet(this.master.masterLock, tls.extent, splitPoint);
} else {
Master.log.warn("Not connected to server " + tls.current);
}
} catch (NotServingTabletException e) {
Master.log.debug("Error asking tablet server to split a tablet: " + e);
} catch (Exception e) {
Master.log.warn("Error asking tablet server to split a tablet: " + e);
}
}
}
}
private void sendChopRequest(MergeInfo info, TabletState state, TabletLocationState tls) {
// Don't bother if we're in the wrong state
if (!info.getState().equals(MergeState.WAITING_FOR_CHOPPED))
return;
// Tablet must be online
if (!state.equals(TabletState.HOSTED))
return;
// Tablet isn't already chopped
if (tls.chopped)
return;
// Tablet ranges intersect
if (info.needsToBeChopped(tls.extent)) {
TServerConnection conn;
try {
conn = this.master.tserverSet.getConnection(tls.current);
if (conn != null) {
Master.log.info("Asking " + tls.current + " to chop " + tls.extent);
conn.chop(this.master.masterLock, tls.extent);
} else {
Master.log.warn("Could not connect to server " + tls.current);
}
} catch (TException e) {
Master.log.warn("Communications error asking tablet server to chop a tablet");
}
}
}
private void updateMergeState(Map<String,MergeStats> mergeStatsCache) {
for (MergeStats stats : mergeStatsCache.values()) {
try {
MergeState update = stats.nextMergeState(this.master.getConnector(), this.master);
// when next state is MERGING, its important to persist this before
// starting the merge... the verification check that is done before
// moving into the merging state could fail if merge starts but does
// not finish
if (update == MergeState.COMPLETE)
update = MergeState.NONE;
if (update != stats.getMergeInfo().getState()) {
this.master.setMergeState(stats.getMergeInfo(), update);
}
if (update == MergeState.MERGING) {
try {
if (stats.getMergeInfo().isDelete()) {
deleteTablets(stats.getMergeInfo());
} else {
mergeMetadataRecords(stats.getMergeInfo());
}
this.master.setMergeState(stats.getMergeInfo(), update = MergeState.COMPLETE);
} catch (Exception ex) {
Master.log.error("Unable merge metadata table records", ex);
}
}
} catch (Exception ex) {
Master.log.error("Unable to update merge state for merge " + stats.getMergeInfo().getExtent(), ex);
}
}
}
private void deleteTablets(MergeInfo info) throws AccumuloException {
KeyExtent extent = info.getExtent();
String targetSystemTable = extent.isMeta() ? RootTable.NAME : MetadataTable.NAME;
Master.log.debug("Deleting tablets for " + extent);
char timeType = '\0';
KeyExtent followingTablet = null;
if (extent.getEndRow() != null) {
Key nextExtent = new Key(extent.getEndRow()).followingKey(PartialKey.ROW);
followingTablet = getHighTablet(new KeyExtent(extent.getTableId(), nextExtent.getRow(), extent.getEndRow()));
Master.log.debug("Found following tablet " + followingTablet);
}
try {
Connector conn = this.master.getConnector();
Text start = extent.getPrevEndRow();
if (start == null) {
start = new Text();
}
Master.log.debug("Making file deletion entries for " + extent);
Range deleteRange = new Range(KeyExtent.getMetadataEntry(extent.getTableId(), start), false, KeyExtent.getMetadataEntry(extent.getTableId(),
extent.getEndRow()), true);
Scanner scanner = conn.createScanner(targetSystemTable, Authorizations.EMPTY);
scanner.setRange(deleteRange);
TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.fetch(scanner);
TabletsSection.ServerColumnFamily.TIME_COLUMN.fetch(scanner);
scanner.fetchColumnFamily(DataFileColumnFamily.NAME);
scanner.fetchColumnFamily(TabletsSection.CurrentLocationColumnFamily.NAME);
Set<FileRef> datafiles = new TreeSet<>();
for (Entry<Key,Value> entry : scanner) {
Key key = entry.getKey();
if (key.compareColumnFamily(DataFileColumnFamily.NAME) == 0) {
datafiles.add(new FileRef(this.master.fs, key));
if (datafiles.size() > 1000) {
MetadataTableUtil.addDeleteEntries(extent, datafiles, master);
datafiles.clear();
}
} else if (TabletsSection.ServerColumnFamily.TIME_COLUMN.hasColumns(key)) {
timeType = entry.getValue().toString().charAt(0);
} else if (key.compareColumnFamily(TabletsSection.CurrentLocationColumnFamily.NAME) == 0) {
throw new IllegalStateException("Tablet " + key.getRow() + " is assigned during a merge!");
} else if (TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.hasColumns(key)) {
// ACCUMULO-2974 Need to include the TableID when converting a relative path to an absolute path.
// The value has the leading path separator already included so it doesn't need it included.
String path = entry.getValue().toString();
if (path.contains(":")) {
datafiles.add(new FileRef(path));
} else {
datafiles.add(new FileRef(path, this.master.fs.getFullPath(FileType.TABLE, Path.SEPARATOR + extent.getTableId() + path)));
}
if (datafiles.size() > 1000) {
MetadataTableUtil.addDeleteEntries(extent, datafiles, master);
datafiles.clear();
}
}
}
MetadataTableUtil.addDeleteEntries(extent, datafiles, master);
BatchWriter bw = conn.createBatchWriter(targetSystemTable, new BatchWriterConfig());
try {
deleteTablets(info, deleteRange, bw, conn);
} finally {
bw.close();
}
if (followingTablet != null) {
Master.log.debug("Updating prevRow of " + followingTablet + " to " + extent.getPrevEndRow());
bw = conn.createBatchWriter(targetSystemTable, new BatchWriterConfig());
try {
Mutation m = new Mutation(followingTablet.getMetadataEntry());
TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.put(m, KeyExtent.encodePrevEndRow(extent.getPrevEndRow()));
ChoppedColumnFamily.CHOPPED_COLUMN.putDelete(m);
bw.addMutation(m);
bw.flush();
} finally {
bw.close();
}
} else {
// Recreate the default tablet to hold the end of the table
Master.log.debug("Recreating the last tablet to point to " + extent.getPrevEndRow());
String tdir = master.getFileSystem().choose(Optional.of(extent.getTableId()), ServerConstants.getBaseUris()) + Constants.HDFS_TABLES_DIR
+ Path.SEPARATOR + extent.getTableId() + Constants.DEFAULT_TABLET_LOCATION;
MetadataTableUtil.addTablet(new KeyExtent(extent.getTableId(), null, extent.getPrevEndRow()), tdir, master, timeType, this.master.masterLock);
}
} catch (RuntimeException | IOException | TableNotFoundException | AccumuloSecurityException ex) {
throw new AccumuloException(ex);
}
}
private void mergeMetadataRecords(MergeInfo info) throws AccumuloException {
KeyExtent range = info.getExtent();
Master.log.debug("Merging metadata for " + range);
KeyExtent stop = getHighTablet(range);
Master.log.debug("Highest tablet is " + stop);
Value firstPrevRowValue = null;
Text stopRow = stop.getMetadataEntry();
Text start = range.getPrevEndRow();
if (start == null) {
start = new Text();
}
Range scanRange = new Range(KeyExtent.getMetadataEntry(range.getTableId(), start), false, stopRow, false);
String targetSystemTable = MetadataTable.NAME;
if (range.isMeta()) {
targetSystemTable = RootTable.NAME;
}
BatchWriter bw = null;
try {
long fileCount = 0;
Connector conn = this.master.getConnector();
// Make file entries in highest tablet
bw = conn.createBatchWriter(targetSystemTable, new BatchWriterConfig());
Scanner scanner = conn.createScanner(targetSystemTable, Authorizations.EMPTY);
scanner.setRange(scanRange);
TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.fetch(scanner);
TabletsSection.ServerColumnFamily.TIME_COLUMN.fetch(scanner);
TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.fetch(scanner);
scanner.fetchColumnFamily(DataFileColumnFamily.NAME);
Mutation m = new Mutation(stopRow);
String maxLogicalTime = null;
for (Entry<Key,Value> entry : scanner) {
Key key = entry.getKey();
Value value = entry.getValue();
if (key.getColumnFamily().equals(DataFileColumnFamily.NAME)) {
m.put(key.getColumnFamily(), key.getColumnQualifier(), value);
fileCount++;
} else if (TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.hasColumns(key) && firstPrevRowValue == null) {
Master.log.debug("prevRow entry for lowest tablet is " + value);
firstPrevRowValue = new Value(value);
} else if (TabletsSection.ServerColumnFamily.TIME_COLUMN.hasColumns(key)) {
maxLogicalTime = TabletTime.maxMetadataTime(maxLogicalTime, value.toString());
} else if (TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.hasColumns(key)) {
bw.addMutation(MetadataTableUtil.createDeleteMutation(range.getTableId(), entry.getValue().toString()));
}
}
// read the logical time from the last tablet in the merge range, it is not included in
// the loop above
scanner = conn.createScanner(targetSystemTable, Authorizations.EMPTY);
scanner.setRange(new Range(stopRow));
TabletsSection.ServerColumnFamily.TIME_COLUMN.fetch(scanner);
for (Entry<Key,Value> entry : scanner) {
if (TabletsSection.ServerColumnFamily.TIME_COLUMN.hasColumns(entry.getKey())) {
maxLogicalTime = TabletTime.maxMetadataTime(maxLogicalTime, entry.getValue().toString());
}
}
if (maxLogicalTime != null)
TabletsSection.ServerColumnFamily.TIME_COLUMN.put(m, new Value(maxLogicalTime.getBytes()));
if (!m.getUpdates().isEmpty()) {
bw.addMutation(m);
}
bw.flush();
Master.log.debug("Moved " + fileCount + " files to " + stop);
if (firstPrevRowValue == null) {
Master.log.debug("tablet already merged");
return;
}
stop.setPrevEndRow(KeyExtent.decodePrevEndRow(firstPrevRowValue));
Mutation updatePrevRow = stop.getPrevRowUpdateMutation();
Master.log.debug("Setting the prevRow for last tablet: " + stop);
bw.addMutation(updatePrevRow);
bw.flush();
deleteTablets(info, scanRange, bw, conn);
// Clean-up the last chopped marker
m = new Mutation(stopRow);
ChoppedColumnFamily.CHOPPED_COLUMN.putDelete(m);
bw.addMutation(m);
bw.flush();
} catch (Exception ex) {
throw new AccumuloException(ex);
} finally {
if (bw != null)
try {
bw.close();
} catch (Exception ex) {
throw new AccumuloException(ex);
}
}
}
private void deleteTablets(MergeInfo info, Range scanRange, BatchWriter bw, Connector conn) throws TableNotFoundException, MutationsRejectedException {
Scanner scanner;
Mutation m;
// Delete everything in the other tablets
// group all deletes into tablet into one mutation, this makes tablets
// either disappear entirely or not all.. this is important for the case
// where the process terminates in the loop below...
scanner = conn.createScanner(info.getExtent().isMeta() ? RootTable.NAME : MetadataTable.NAME, Authorizations.EMPTY);
Master.log.debug("Deleting range " + scanRange);
scanner.setRange(scanRange);
RowIterator rowIter = new RowIterator(scanner);
while (rowIter.hasNext()) {
Iterator<Entry<Key,Value>> row = rowIter.next();
m = null;
while (row.hasNext()) {
Entry<Key,Value> entry = row.next();
Key key = entry.getKey();
if (m == null)
m = new Mutation(key.getRow());
m.putDelete(key.getColumnFamily(), key.getColumnQualifier());
Master.log.debug("deleting entry " + key);
}
bw.addMutation(m);
}
bw.flush();
}
private KeyExtent getHighTablet(KeyExtent range) throws AccumuloException {
try {
Connector conn = this.master.getConnector();
Scanner scanner = conn.createScanner(range.isMeta() ? RootTable.NAME : MetadataTable.NAME, Authorizations.EMPTY);
TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.fetch(scanner);
KeyExtent start = new KeyExtent(range.getTableId(), range.getEndRow(), null);
scanner.setRange(new Range(start.getMetadataEntry(), null));
Iterator<Entry<Key,Value>> iterator = scanner.iterator();
if (!iterator.hasNext()) {
throw new AccumuloException("No last tablet for a merge " + range);
}
Entry<Key,Value> entry = iterator.next();
KeyExtent highTablet = new KeyExtent(entry.getKey().getRow(), KeyExtent.decodePrevEndRow(entry.getValue()));
if (!highTablet.getTableId().equals(range.getTableId())) {
throw new AccumuloException("No last tablet for merge " + range + " " + highTablet);
}
return highTablet;
} catch (Exception ex) {
throw new AccumuloException("Unexpected failure finding the last tablet for a merge " + range, ex);
}
}
private void flushChanges(SortedMap<TServerInstance,TabletServerStatus> currentTServers, List<Assignment> assignments, List<Assignment> assigned,
List<TabletLocationState> assignedToDeadServers, Map<TServerInstance,List<Path>> logsForDeadServers, List<TabletLocationState> suspendedToGoneServers,
Map<KeyExtent,TServerInstance> unassigned) throws DistributedStoreException, TException, WalMarkerException {
boolean tabletsSuspendable = canSuspendTablets();
if (!assignedToDeadServers.isEmpty()) {
int maxServersToShow = min(assignedToDeadServers.size(), 100);
Master.log.debug(assignedToDeadServers.size() + " assigned to dead servers: " + assignedToDeadServers.subList(0, maxServersToShow) + "...");
Master.log.debug("logs for dead servers: " + logsForDeadServers);
if (tabletsSuspendable) {
store.suspend(assignedToDeadServers, logsForDeadServers, master.getSteadyTime());
} else {
store.unassign(assignedToDeadServers, logsForDeadServers);
}
this.master.markDeadServerLogsAsClosed(logsForDeadServers);
this.master.nextEvent.event("Marked %d tablets as suspended because they don't have current servers", assignedToDeadServers.size());
}
if (!suspendedToGoneServers.isEmpty()) {
int maxServersToShow = min(assignedToDeadServers.size(), 100);
Master.log.debug(assignedToDeadServers.size() + " suspended to gone servers: " + assignedToDeadServers.subList(0, maxServersToShow) + "...");
store.unsuspend(suspendedToGoneServers);
}
if (!currentTServers.isEmpty()) {
Map<KeyExtent,TServerInstance> assignedOut = new HashMap<>();
final StringBuilder builder = new StringBuilder(64);
this.master.tabletBalancer.getAssignments(Collections.unmodifiableSortedMap(currentTServers), Collections.unmodifiableMap(unassigned), assignedOut);
for (Entry<KeyExtent,TServerInstance> assignment : assignedOut.entrySet()) {
if (unassigned.containsKey(assignment.getKey())) {
if (assignment.getValue() != null) {
if (!currentTServers.containsKey(assignment.getValue())) {
Master.log.warn("balancer assigned " + assignment.getKey() + " to a tablet server that is not current " + assignment.getValue() + " ignoring");
continue;
}
if (builder.length() > 0) {
builder.append(ASSIGNMENT_BUFFER_SEPARATOR);
}
builder.append(assignment);
// Don't let the log message get too gigantic
if (builder.length() > ASSINGMENT_BUFFER_MAX_LENGTH) {
builder.append("]");
Master.log.debug(store.name() + " assigning tablets: [" + builder.toString());
builder.setLength(0);
}
assignments.add(new Assignment(assignment.getKey(), assignment.getValue()));
}
} else {
Master.log.warn(store.name() + " load balancer assigning tablet that was not nominated for assignment " + assignment.getKey());
}
}
if (builder.length() > 0) {
// Make sure to log any leftover assignments
builder.append("]");
Master.log.debug(store.name() + " assigning tablets: [" + builder.toString());
}
if (!unassigned.isEmpty() && assignedOut.isEmpty())
Master.log.warn("Load balancer failed to assign any tablets");
}
if (assignments.size() > 0) {
Master.log.info(String.format("Assigning %d tablets", assignments.size()));
store.setFutureLocations(assignments);
}
assignments.addAll(assigned);
for (Assignment a : assignments) {
TServerConnection conn = this.master.tserverSet.getConnection(a.server);
if (conn != null) {
conn.assignTablet(this.master.masterLock, a.tablet);
} else {
Master.log.warn("Could not connect to server " + a.server);
}
master.assignedTablet(a.tablet);
}
}
}