blob: 6e024351688ee2647a30b2e01e476f2936aced6e [file] [log] [blame]
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
package org.apache.cassandra.service.paxos;
import java.util.concurrent.locks.Lock;
import com.google.common.util.concurrent.Striped;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.*;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.UUIDGen;
public class PaxosState
{
private static final Striped<Lock> LOCKS = Striped.lazyWeakLock(DatabaseDescriptor.getConcurrentWriters() * 1024);
private final Commit promised;
private final Commit accepted;
private final Commit mostRecentCommit;
public PaxosState(DecoratedKey key, TableMetadata metadata)
{
this(Commit.emptyCommit(key, metadata), Commit.emptyCommit(key, metadata), Commit.emptyCommit(key, metadata));
}
public PaxosState(Commit promised, Commit accepted, Commit mostRecentCommit)
{
assert promised.update.partitionKey().equals(accepted.update.partitionKey()) && accepted.update.partitionKey().equals(mostRecentCommit.update.partitionKey());
assert promised.update.metadata().id.equals(accepted.update.metadata().id) && accepted.update.metadata().id.equals(mostRecentCommit.update.metadata().id);
this.promised = promised;
this.accepted = accepted;
this.mostRecentCommit = mostRecentCommit;
}
public static PrepareResponse prepare(Commit toPrepare)
{
long start = System.nanoTime();
try
{
Lock lock = LOCKS.get(toPrepare.update.partitionKey());
lock.lock();
try
{
// When preparing, we need to use the same time as "now" (that's the time we use to decide if something
// is expired or not) accross nodes otherwise we may have a window where a Most Recent Commit shows up
// on some replica and not others during a new proposal (in StorageProxy.beginAndRepairPaxos()), and no
// amount of re-submit will fix this (because the node on which the commit has expired will have a
// tombstone that hides any re-submit). See CASSANDRA-12043 for details.
int nowInSec = UUIDGen.unixTimestampInSec(toPrepare.ballot);
PaxosState state = SystemKeyspace.loadPaxosState(toPrepare.update.partitionKey(), toPrepare.update.metadata(), nowInSec);
if (toPrepare.isAfter(state.promised))
{
Tracing.trace("Promising ballot {}", toPrepare.ballot);
SystemKeyspace.savePaxosPromise(toPrepare);
return new PrepareResponse(true, state.accepted, state.mostRecentCommit);
}
else
{
Tracing.trace("Promise rejected; {} is not sufficiently newer than {}", toPrepare, state.promised);
// return the currently promised ballot (not the last accepted one) so the coordinator can make sure it uses newer ballot next time (#5667)
return new PrepareResponse(false, state.promised, state.mostRecentCommit);
}
}
finally
{
lock.unlock();
}
}
finally
{
Keyspace.open(toPrepare.update.metadata().keyspace).getColumnFamilyStore(toPrepare.update.metadata().id).metric.casPrepare.addNano(System.nanoTime() - start);
}
}
public static Boolean propose(Commit proposal)
{
long start = System.nanoTime();
try
{
Lock lock = LOCKS.get(proposal.update.partitionKey());
lock.lock();
try
{
int nowInSec = UUIDGen.unixTimestampInSec(proposal.ballot);
PaxosState state = SystemKeyspace.loadPaxosState(proposal.update.partitionKey(), proposal.update.metadata(), nowInSec);
if (proposal.hasBallot(state.promised.ballot) || proposal.isAfter(state.promised))
{
Tracing.trace("Accepting proposal {}", proposal);
SystemKeyspace.savePaxosProposal(proposal);
return true;
}
else
{
Tracing.trace("Rejecting proposal for {} because inProgress is now {}", proposal, state.promised);
return false;
}
}
finally
{
lock.unlock();
}
}
finally
{
Keyspace.open(proposal.update.metadata().keyspace).getColumnFamilyStore(proposal.update.metadata().id).metric.casPropose.addNano(System.nanoTime() - start);
}
}
public static void commit(Commit proposal)
{
long start = System.nanoTime();
try
{
// There is no guarantee we will see commits in the right order, because messages
// can get delayed, so a proposal can be older than our current most recent ballot/commit.
// Committing it is however always safe due to column timestamps, so always do it. However,
// if our current in-progress ballot is strictly greater than the proposal one, we shouldn't
// erase the in-progress update.
// The table may have been truncated since the proposal was initiated. In that case, we
// don't want to perform the mutation and potentially resurrect truncated data
if (UUIDGen.unixTimestamp(proposal.ballot) >= SystemKeyspace.getTruncatedAt(proposal.update.metadata().id))
{
Tracing.trace("Committing proposal {}", proposal);
Mutation mutation = proposal.makeMutation();
Keyspace.open(mutation.getKeyspaceName()).apply(mutation, true);
}
else
{
Tracing.trace("Not committing proposal {} as ballot timestamp predates last truncation time", proposal);
}
// We don't need to lock, we're just blindly updating
SystemKeyspace.savePaxosCommit(proposal);
}
finally
{
Keyspace.open(proposal.update.metadata().keyspace).getColumnFamilyStore(proposal.update.metadata().id).metric.casCommit.addNano(System.nanoTime() - start);
}
}
}