blob: f733619509277259b510ac4be29faa491038a568 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.io.sstable.format.big;
import java.io.*;
import java.util.Map;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.transform.Transformation;
import org.apache.cassandra.io.sstable.*;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.io.sstable.format.SSTableWriter;
import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.io.FSWriteError;
import org.apache.cassandra.io.compress.CompressedSequentialWriter;
import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
import org.apache.cassandra.io.sstable.metadata.MetadataType;
import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
import org.apache.cassandra.io.util.*;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.FilterFactory;
import org.apache.cassandra.utils.IFilter;
import org.apache.cassandra.utils.concurrent.Transactional;
import org.apache.cassandra.utils.SyncUtil;
/**
 * {@link SSTableWriter} for the "big" sstable format: streams partitions into the data file
 * while building the primary index, bloom filter, index summary and statistics components,
 * and supports opening a reader over the already-flushed prefix of the sstable ("early open").
 *
 * Partitions must be appended in strictly increasing key order (enforced in
 * {@link #beforeAppend}); the writer is driven by a single thread.
 * Commit/abort semantics are provided via the {@link Transactional} machinery of the
 * superclass, specialised in {@link TransactionalProxy}.
 */
public class BigTableWriter extends SSTableWriter
{
    private static final Logger logger = LoggerFactory.getLogger(BigTableWriter.class);

    // Writes the primary index, bloom filter and index summary alongside the data file.
    private final IndexWriter iwriter;
    // Builds the SegmentedFile view over the data file handed to readers (compressed or plain).
    private final SegmentedFile.Builder dbuilder;
    // The -Data.db writer; a CompressedSequentialWriter when compression is enabled.
    protected final SequentialWriter dataFile;
    // Last key appended, used to enforce sort order and to populate 'last' for readers.
    private DecoratedKey lastWrittenKey;
    // Data-file position captured by mark(), restored by resetAndTruncate().
    private DataPosition dataMark;

    /**
     * @param descriptor          identifies the sstable's location, generation and version
     * @param keyCount            estimated key count; used to size the bloom filter and index summary
     * @param repairedAt          repair timestamp recorded in the sstable's metadata (via the superclass)
     * @param metadata            table schema
     * @param metadataCollector   accumulates per-sstable statistics while partitions are appended
     * @param header              serialization header written with the sstable
     * @param lifecycleNewTracker notified of this new sstable before any on-disk files are created
     */
    public BigTableWriter(Descriptor descriptor,
                          Long keyCount,
                          Long repairedAt,
                          CFMetaData metadata,
                          MetadataCollector metadataCollector,
                          SerializationHeader header,
                          LifecycleNewTracker lifecycleNewTracker)
    {
        super(descriptor, keyCount, repairedAt, metadata, metadataCollector, header);
        lifecycleNewTracker.trackNew(this); // must track before any files are created

        if (compression)
        {
            // Compressed data file: chunk offsets/checksums go into the COMPRESSION_INFO component.
            dataFile = SequentialWriter.open(getFilename(),
                                             descriptor.filenameFor(Component.COMPRESSION_INFO),
                                             metadata.params.compression,
                                             metadataCollector);
            dbuilder = SegmentedFile.getCompressedBuilder((CompressedSequentialWriter) dataFile);
        }
        else
        {
            // Uncompressed data file is paired with a separate CRC component.
            dataFile = SequentialWriter.open(new File(getFilename()), new File(descriptor.filenameFor(Component.CRC)));
            dbuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode(), false);
        }
        iwriter = new IndexWriter(keyCount, dataFile);
    }

    /**
     * Records the current position of both the data file and the index file so a failed
     * append can be rolled back with {@link #resetAndTruncate()}.
     */
    public void mark()
    {
        dataMark = dataFile.mark();
        iwriter.mark();
    }

    /**
     * Rewinds the data and index files to the positions captured by the last {@link #mark()},
     * truncating anything written since.
     */
    public void resetAndTruncate()
    {
        dataFile.resetAndTruncate(dataMark);
        iwriter.resetAndTruncate();
    }

    /**
     * Perform sanity checks on @param decoratedKey and @return the position in the data file before any data is written
     */
    protected long beforeAppend(DecoratedKey decoratedKey)
    {
        assert decoratedKey != null : "Keys must not be null"; // empty keys ARE allowed b/c of indexed column values
        // Keys must arrive in strictly increasing order; equal or descending keys indicate a bug upstream.
        if (lastWrittenKey != null && lastWrittenKey.compareTo(decoratedKey) >= 0)
            throw new RuntimeException("Last written key " + lastWrittenKey + " >= current key " + decoratedKey + " writing into " + getFilename());
        return (lastWrittenKey == null) ? 0 : dataFile.position();
    }

    /**
     * Book-keeping after a partition has been written: records the key in the stats collector,
     * updates first/last bounds, and appends the primary-index entry.
     *
     * @param dataEnd the data-file position immediately after the partition's data
     */
    private void afterAppend(DecoratedKey decoratedKey, long dataEnd, RowIndexEntry index) throws IOException
    {
        metadataCollector.addKey(decoratedKey.getKey());
        lastWrittenKey = decoratedKey;
        last = lastWrittenKey;
        if (first == null)
            first = lastWrittenKey;

        if (logger.isTraceEnabled())
            logger.trace("wrote {} at {}", decoratedKey, dataEnd);
        iwriter.append(decoratedKey, index, dataEnd);
    }

    /**
     * Appends partition data to this writer.
     *
     * @param iterator the partition to write
     * @return the created index entry if something was written, that is if {@code iterator}
     * wasn't empty, {@code null} otherwise.
     *
     * @throws FSWriteError if a write to the dataFile fails
     */
    public RowIndexEntry append(UnfilteredRowIterator iterator)
    {
        DecoratedKey key = iterator.partitionKey();

        // Keys are serialized with a 2-byte length prefix (see IndexWriter.append's
        // writeWithShortLength), so oversized keys cannot be written and are skipped.
        if (key.getKey().remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
        {
            logger.error("Key size {} exceeds maximum of {}, skipping row", key.getKey().remaining(), FBUtilities.MAX_UNSIGNED_SHORT);
            return null;
        }

        if (iterator.isEmpty())
            return null;

        long startPosition = beforeAppend(key);
        // Wrap the iterator so statistics are gathered as a side effect of writing it out.
        try (UnfilteredRowIterator collecting = Transformation.apply(iterator, new StatsCollector(metadataCollector)))
        {
            ColumnIndex index = ColumnIndex.writeAndBuildIndex(collecting, dataFile, header, descriptor.version);
            RowIndexEntry entry = RowIndexEntry.create(startPosition, collecting.partitionLevelDeletion(), index);

            long endPosition = dataFile.position();
            long rowSize = endPosition - startPosition;
            maybeLogLargePartitionWarning(key, rowSize);
            metadataCollector.addPartitionSizeInBytes(rowSize);
            afterAppend(key, endPosition, entry);
            return entry;
        }
        catch (IOException e)
        {
            throw new FSWriteError(e, dataFile.getPath());
        }
    }

    // Warn when a partition's on-disk size exceeds the configured large-partition threshold.
    private void maybeLogLargePartitionWarning(DecoratedKey key, long rowSize)
    {
        if (rowSize > DatabaseDescriptor.getCompactionLargePartitionWarningThreshold())
        {
            String keyString = metadata.getKeyValidator().getString(key.getKey());
            logger.warn("Writing large partition {}/{}:{} ({} bytes to sstable {}) ", metadata.ksName, metadata.cfName, keyString, rowSize, getFilename());
        }
    }

    /**
     * Transformation that feeds every row, range-tombstone marker and deletion it sees into a
     * {@link MetadataCollector} while passing the data through unchanged. Applied around each
     * partition written in {@link #append}.
     */
    private static class StatsCollector extends Transformation
    {
        private final MetadataCollector collector;
        // Cells seen in the current partition; reported to the collector on partition close.
        private int cellCount;

        StatsCollector(MetadataCollector collector)
        {
            this.collector = collector;
        }

        @Override
        public Row applyToStatic(Row row)
        {
            // Note: the static row does not contribute clustering values, only cell stats.
            if (!row.isEmpty())
                cellCount += Rows.collectStats(row, collector);
            return row;
        }

        @Override
        public Row applyToRow(Row row)
        {
            collector.updateClusteringValues(row.clustering());
            cellCount += Rows.collectStats(row, collector);
            return row;
        }

        @Override
        public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
        {
            collector.updateClusteringValues(marker.clustering());
            if (marker.isBoundary())
            {
                // A boundary carries two deletion times: the range it closes and the one it opens.
                RangeTombstoneBoundaryMarker bm = (RangeTombstoneBoundaryMarker)marker;
                collector.update(bm.endDeletionTime());
                collector.update(bm.startDeletionTime());
            }
            else
            {
                collector.update(((RangeTombstoneBoundMarker)marker).deletionTime());
            }
            return marker;
        }

        @Override
        public void onPartitionClose()
        {
            collector.addCellPerPartitionCount(cellCount);
        }

        @Override
        public DeletionTime applyToDeletion(DeletionTime deletionTime)
        {
            collector.update(deletionTime);
            return deletionTime;
        }
    }

    /**
     * Opens a reader over the portion of this sstable that has already been flushed to disk,
     * bounded by the last key fully synced in both the index and data files.
     *
     * @return an EARLY-open reader, or {@code null} if nothing is readable yet
     */
    @SuppressWarnings("resource")
    public SSTableReader openEarly()
    {
        // find the max (exclusive) readable key
        IndexSummaryBuilder.ReadableBoundary boundary = iwriter.getMaxReadable();
        if (boundary == null)
            return null;

        StatsMetadata stats = statsMetadata();
        assert boundary.indexLength > 0 && boundary.dataLength > 0;
        // open the reader early
        IndexSummary indexSummary = iwriter.summary.build(metadata.partitioner, boundary);
        SegmentedFile ifile = iwriter.builder.buildIndex(descriptor, indexSummary, boundary);
        SegmentedFile dfile = dbuilder.buildData(descriptor, stats, boundary);
        SSTableReader sstable = SSTableReader.internalOpen(descriptor,
                                                           components, metadata,
                                                           ifile, dfile, indexSummary,
                                                           iwriter.bf.sharedCopy(), maxDataAge, stats, SSTableReader.OpenReason.EARLY, header);

        // now it's open, find the ACTUAL last readable key (i.e. for which the data file has also been flushed)
        sstable.first = getMinimalKey(first);
        sstable.last = getMinimalKey(boundary.lastKey);
        return sstable;
    }

    /**
     * Opens a reader over the complete sstable, forcing a sync of the data and index files first
     * so the whole contents are readable. Still opened with reason EARLY; the final rename to
     * NORMAL happens during commit (see {@link TransactionalProxy#doPrepare()}).
     */
    public SSTableReader openFinalEarly()
    {
        // we must ensure the data is completely flushed to disk
        dataFile.sync();
        iwriter.indexFile.sync();
        return openFinal(descriptor, SSTableReader.OpenReason.EARLY);
    }

    /**
     * Builds a reader over the finished sstable from the in-memory summary/filter state.
     *
     * @param desc       descriptor to open under (NOTE(review): callers here always pass this
     *                   writer's own descriptor)
     * @param openReason recorded on the returned reader (EARLY or NORMAL)
     */
    @SuppressWarnings("resource")
    private SSTableReader openFinal(Descriptor desc, SSTableReader.OpenReason openReason)
    {
        // maxDataAge defaults to -1 until set; fall back to "now" for the reader.
        if (maxDataAge < 0)
            maxDataAge = System.currentTimeMillis();

        StatsMetadata stats = statsMetadata();
        // finalize in-memory state for the reader
        IndexSummary indexSummary = iwriter.summary.build(this.metadata.partitioner);
        SegmentedFile ifile = iwriter.builder.buildIndex(desc, indexSummary);
        SegmentedFile dfile = dbuilder.buildData(desc, stats);
        SSTableReader sstable = SSTableReader.internalOpen(desc,
                                                           components,
                                                           this.metadata,
                                                           ifile,
                                                           dfile,
                                                           indexSummary,
                                                           iwriter.bf.sharedCopy(),
                                                           maxDataAge,
                                                           stats,
                                                           openReason,
                                                           header);
        sstable.first = getMinimalKey(first);
        sstable.last = getMinimalKey(last);
        return sstable;
    }

    protected SSTableWriter.TransactionalProxy txnProxy()
    {
        return new TransactionalProxy();
    }

    /**
     * Transactional lifecycle for this writer: prepares, commits or aborts the data file and the
     * index writer together, and writes the STATS and TOC components during prepare.
     */
    class TransactionalProxy extends SSTableWriter.TransactionalProxy
    {
        // finalise our state on disk, including renaming
        protected void doPrepare()
        {
            iwriter.prepareToCommit();

            // write sstable statistics
            dataFile.setDescriptor(descriptor).prepareToCommit();
            writeMetadata(descriptor, finalizeMetadata());

            // save the table of components
            SSTable.appendTOC(descriptor, components);

            if (openResult)
                finalReader = openFinal(descriptor, SSTableReader.OpenReason.NORMAL);
        }

        protected Throwable doCommit(Throwable accumulate)
        {
            // Commit both files, accumulating (not throwing) any failures.
            accumulate = dataFile.commit(accumulate);
            accumulate = iwriter.commit(accumulate);
            return accumulate;
        }

        @Override
        protected Throwable doPostCleanup(Throwable accumulate)
        {
            accumulate = dbuilder.close(accumulate);
            return accumulate;
        }

        protected Throwable doAbort(Throwable accumulate)
        {
            // Abort in the reverse order of creation: index writer first, then the data file.
            accumulate = iwriter.abort(accumulate);
            accumulate = dataFile.abort(accumulate);
            return accumulate;
        }
    }

    /**
     * Serializes the collected metadata components into the STATS component file.
     *
     * @throws FSWriteError wrapping any IOException from the write
     */
    private static void writeMetadata(Descriptor desc, Map<MetadataType, MetadataComponent> components)
    {
        File file = new File(desc.filenameFor(Component.STATS));
        try (SequentialWriter out = SequentialWriter.open(file))
        {
            desc.getMetadataSerializer().serialize(components, out, desc.version);
            out.setDescriptor(desc).finish();
        }
        catch (IOException e)
        {
            throw new FSWriteError(e, file.getPath());
        }
    }

    /** @return the current logical write position in the (possibly buffered) data file */
    public long getFilePointer()
    {
        return dataFile.position();
    }

    /** @return the number of bytes of the data file actually on disk so far */
    public long getOnDiskFilePointer()
    {
        return dataFile.getOnDiskFilePointer();
    }

    /**
     * Encapsulates writing the index and filter for an SSTable. The state of this object is not valid until it has been closed.
     */
    class IndexWriter extends AbstractTransactional implements Transactional
    {
        private final SequentialWriter indexFile;
        public final SegmentedFile.Builder builder;
        public final IndexSummaryBuilder summary;
        public final IFilter bf;
        // Index-file position captured by mark(), restored by resetAndTruncate().
        private DataPosition mark;

        IndexWriter(long keyCount, final SequentialWriter dataFile)
        {
            indexFile = SequentialWriter.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)));
            builder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode(), false);
            summary = new IndexSummaryBuilder(keyCount, metadata.params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL);
            bf = FilterFactory.getFilter(keyCount, metadata.params.bloomFilterFpChance, true, descriptor.version.hasOldBfHashOrder());
            // register listeners to be alerted when the data files are flushed
            // (the summary uses these sync offsets to compute the early-open readable boundary)
            indexFile.setPostFlushListener(new Runnable()
            {
                public void run()
                {
                    summary.markIndexSynced(indexFile.getLastFlushOffset());
                }
            });
            dataFile.setPostFlushListener(new Runnable()
            {
                public void run()
                {
                    summary.markDataSynced(dataFile.getLastFlushOffset());
                }
            });
        }

        // finds the last (-offset) decorated key that can be guaranteed to occur fully in the flushed portion of the index file
        IndexSummaryBuilder.ReadableBoundary getMaxReadable()
        {
            return summary.getLastReadableBoundary();
        }

        /**
         * Appends one primary-index entry: adds the key to the bloom filter, writes the
         * short-length-prefixed key plus its serialized {@link RowIndexEntry}, and offers the
         * entry to the index summary.
         *
         * @param dataEnd end position of the partition's data, passed to the summary so it can
         *                track which entries are fully covered by flushed data
         */
        public void append(DecoratedKey key, RowIndexEntry indexEntry, long dataEnd) throws IOException
        {
            bf.add(key);
            long indexStart = indexFile.position();
            try
            {
                ByteBufferUtil.writeWithShortLength(key.getKey(), indexFile);
                rowIndexEntrySerializer.serialize(indexEntry, indexFile);
            }
            catch (IOException e)
            {
                throw new FSWriteError(e, indexFile.getPath());
            }
            long indexEnd = indexFile.position();

            if (logger.isTraceEnabled())
                logger.trace("wrote index entry: {} at {}", indexEntry, indexStart);

            summary.maybeAddEntry(key, indexStart, indexEnd, dataEnd);
        }

        /**
         * Serializes the bloom filter into the FILTER component and syncs it to disk.
         * No-op if this sstable has no FILTER component.
         */
        void flushBf()
        {
            if (components.contains(Component.FILTER))
            {
                String path = descriptor.filenameFor(Component.FILTER);
                try (FileOutputStream fos = new FileOutputStream(path);
                     DataOutputStreamPlus stream = new BufferedDataOutputStreamPlus(fos))
                {
                    // bloom filter
                    FilterFactory.serialize(bf, stream);
                    stream.flush();
                    SyncUtil.sync(fos);
                }
                catch (IOException e)
                {
                    throw new FSWriteError(e, path);
                }
            }
        }

        public void mark()
        {
            mark = indexFile.mark();
        }

        public void resetAndTruncate()
        {
            // we can't un-set the bloom filter addition, but extra keys in there are harmless.
            // we can't reset dbuilder either, but that is the last thing called in afterappend so
            // we assume that if that worked then we won't be trying to reset.
            indexFile.resetAndTruncate(mark);
        }

        /**
         * Finalizes the index's on-disk state: writes the bloom filter, commits and truncates the
         * index file to its logical end, and saves the index summary component.
         */
        protected void doPrepare()
        {
            flushBf();

            // truncate index file
            long position = iwriter.indexFile.position();
            iwriter.indexFile.setDescriptor(descriptor).prepareToCommit();
            FileUtils.truncate(iwriter.indexFile.getPath(), position);

            // save summary
            summary.prepareToCommit();
            try (IndexSummary summary = iwriter.summary.build(getPartitioner()))
            {
                SSTableReader.saveSummary(descriptor, first, last, iwriter.builder, dbuilder, summary);
            }
        }

        protected Throwable doCommit(Throwable accumulate)
        {
            return indexFile.commit(accumulate);
        }

        protected Throwable doAbort(Throwable accumulate)
        {
            return indexFile.abort(accumulate);
        }

        @Override
        protected Throwable doPostCleanup(Throwable accumulate)
        {
            accumulate = summary.close(accumulate);
            accumulate = bf.close(accumulate);
            accumulate = builder.close(accumulate);
            return accumulate;
        }
    }
}