blob: 9df4c81255cd4bd4dc45d31cd63d842f86dcb8c7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.io.util;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.util.function.Supplier;
import com.google.common.util.concurrent.RateLimiter;
import org.apache.cassandra.config.Config;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.io.compress.CompressedSequentialWriter;
import org.apache.cassandra.io.sstable.Component;
import org.apache.cassandra.io.sstable.Descriptor;
import org.apache.cassandra.io.sstable.IndexSummary;
import org.apache.cassandra.io.sstable.IndexSummaryBuilder;
import org.apache.cassandra.io.sstable.format.Version;
import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
import org.apache.cassandra.utils.NativeLibrary;
import org.apache.cassandra.utils.concurrent.RefCounted;
import org.apache.cassandra.utils.concurrent.SharedCloseableImpl;
import static org.apache.cassandra.utils.Throwables.maybeFail;
/**
* Abstracts a read-only file that has been split into segments, each of which can be represented by an independent
* FileDataInput. Allows for iteration over the FileDataInputs, or random access to the FileDataInput for a given
* position.
*
* The JVM can only map up to 2GB at a time, so each segment is at most that size when using mmap i/o. If a segment
* would need to be longer than 2GB, that segment will not be mmap'd, and a new RandomAccessFile will be created for
* each access to that segment.
*/
public abstract class SegmentedFile extends SharedCloseableImpl
{
// Proxy for the underlying file: readers are built on top of it and path() reports its file path.
public final ChannelProxy channel;
// Buffer size handed to every RandomAccessReader created by createReader().
public final int bufferSize;
// Length handed to the readers as their length override.
public final long length;
// This differs from length for compressed files (but we still need length for
// SegmentIterator because offsets in the file are relative to the uncompressed size)
public final long onDiskLength;
/**
 * Creates a SegmentedFile whose on-disk length equals its logical length.
 * Use getBuilder to get a Builder to construct a SegmentedFile.
 *
 * @param cleanup    tidy action handed to the SharedCloseableImpl superclass
 * @param channel    proxy for the underlying file
 * @param bufferSize buffer size used by readers created from this file
 * @param length     length of the file, used for both the logical and the on-disk length
 */
SegmentedFile(Cleanup cleanup, ChannelProxy channel, int bufferSize, long length)
{
    super(cleanup);
    this.channel = channel;
    this.bufferSize = bufferSize;
    this.length = length;
    // no separate on-disk length was supplied, so it is the same as the logical length
    this.onDiskLength = length;
}
/**
 * Creates a SegmentedFile with distinct logical and on-disk lengths.
 *
 * @param cleanup      tidy action handed to the SharedCloseableImpl superclass
 * @param channel      proxy for the underlying file
 * @param bufferSize   buffer size used by readers created from this file
 * @param length       logical length of the file
 * @param onDiskLength length of the file as stored on disk
 */
protected SegmentedFile(Cleanup cleanup, ChannelProxy channel, int bufferSize, long length, long onDiskLength)
{
    super(cleanup);
    this.onDiskLength = onDiskLength;
    this.length = length;
    this.bufferSize = bufferSize;
    this.channel = channel;
}
/**
 * Copy constructor: the new instance takes the channel, buffer size and both lengths from {@code copy}.
 *
 * @param copy the instance to copy; also handed to the SharedCloseableImpl copy constructor
 */
protected SegmentedFile(SegmentedFile copy)
{
    super(copy);
    this.channel = copy.channel;
    this.bufferSize = copy.bufferSize;
    this.length = copy.length;
    this.onDiskLength = copy.onDiskLength;
}
/**
 * @return the file path reported by the underlying channel
 */
public String path()
{
    return this.channel.filePath();
}
/**
 * Tidy action for a SegmentedFile: closes the channel it was created with.
 */
protected static class Cleanup implements RefCounted.Tidy
{
    final ChannelProxy channel;

    protected Cleanup(ChannelProxy channel)
    {
        this.channel = channel;
    }

    /**
     * @return the file path of the channel this action closes
     */
    public String name()
    {
        return this.channel.filePath();
    }

    /**
     * Closes the channel.
     */
    public void tidy()
    {
        this.channel.close();
    }
}
/**
 * @return a copy of this SegmentedFile.
 * NOTE(review): presumably the copy shares the underlying channel with this instance (the copy
 * constructor reuses the same channel reference) — confirm against SharedCloseableImpl.
 */
public abstract SegmentedFile sharedCopy();
/**
 * Builds a reader over the channel, using this file's length as the length override
 * and this file's buffer size.
 *
 * @return a new RandomAccessReader
 */
public RandomAccessReader createReader()
{
    RandomAccessReader.Builder readerBuilder = new RandomAccessReader.Builder(channel);
    readerBuilder = readerBuilder.overrideLength(length);
    readerBuilder = readerBuilder.bufferSize(bufferSize);
    return readerBuilder.build();
}
/**
 * Builds a reader like {@link #createReader()}, additionally configured with the given limiter.
 *
 * @param limiter the rate limiter handed to the reader builder
 * @return a new RandomAccessReader
 */
public RandomAccessReader createReader(RateLimiter limiter)
{
    RandomAccessReader.Builder readerBuilder = new RandomAccessReader.Builder(channel);
    readerBuilder = readerBuilder.overrideLength(length);
    readerBuilder = readerBuilder.bufferSize(bufferSize);
    readerBuilder = readerBuilder.limiter(limiter);
    return readerBuilder.build();
}
/**
 * Creates a reader positioned at the given offset.
 *
 * If the seek fails, the freshly created reader is closed before the failure is rethrown, so the
 * caller (who never receives the reader in that case) does not leak it.
 *
 * @param position the offset to seek to
 * @return a reader positioned at {@code position}
 */
public FileDataInput createReader(long position)
{
    RandomAccessReader reader = createReader();
    try
    {
        reader.seek(position);
        return reader;
    }
    catch (Throwable t)
    {
        // Nobody else holds this reader yet: release it here, keeping the seek failure as the primary error.
        try
        {
            reader.close();
        }
        catch (Throwable closeFailure)
        {
            t.addSuppressed(closeFailure);
        }
        throw t;
    }
}
/**
 * Asks the native library to try to skip the cache for the range starting at offset 0 of this file.
 *
 * @param before the third argument of NativeLibrary.trySkipCache, following the offset 0
 */
public void dropPageCache(long before)
{
    NativeLibrary.trySkipCache(channel.getFileDescriptor(),
                               0,
                               before,
                               path());
}
/**
 * Picks the Builder implementation for the given access mode and compression setting.
 * Compressed files always get a CompressedSegmentedFile.Builder (with no writer), whatever the mode;
 * uncompressed files get the mmap builder in mmap mode and the buffered builder otherwise.
 *
 * @return A SegmentedFile.Builder.
 */
public static Builder getBuilder(Config.DiskAccessMode mode, boolean compressed)
{
    if (compressed)
        return new CompressedSegmentedFile.Builder(null);

    if (mode == Config.DiskAccessMode.mmap)
        return new MmappedSegmentedFile.Builder();

    return new BufferedSegmentedFile.Builder();
}
/**
 * @param writer the compressed writer handed to the builder
 * @return a CompressedSegmentedFile.Builder created with {@code writer}
 */
public static Builder getCompressedBuilder(CompressedSequentialWriter writer)
{
    Builder compressedBuilder = new CompressedSegmentedFile.Builder(writer);
    return compressedBuilder;
}
/**
* Collects potential segmentation points in an underlying file, and builds a SegmentedFile to represent it.
*/
public static abstract class Builder implements AutoCloseable
{
// Channel cached by getChannel(): shared copies of it are handed out while the requested path matches,
// it is replaced when a different path is requested, and it is closed when this Builder is closed.
private ChannelProxy channel;
/**
 * Called after all potential boundaries have been added to apply this Builder to a concrete file on disk.
 * @param channel The channel to the file on disk. The path-based wrapper in this class closes it if
 * this method throws.
 * @param bufferSize The buffer size for the file; the build methods in this class derive it from
 * the sstable statistics or from the index summary.
 * @param overrideLength Length override; the build methods in this class pass a boundary length when
 * a readable boundary is supplied and -1 otherwise.
 * @return The SegmentedFile built over the channel.
 */
protected abstract SegmentedFile complete(ChannelProxy channel, int bufferSize, long overrideLength);
/**
 * Opens (or reuses) the channel for {@code path} and builds the SegmentedFile over a shared copy of it.
 *
 * If building fails, the shared copy is closed before the failure is rethrown. A failure while closing
 * is attached to the original failure as a suppressed exception instead of replacing it.
 *
 * @param path           path of the file to build over
 * @param bufferSize     buffer size passed through to the abstract complete method
 * @param overrideLength length override passed through to the abstract complete method
 * @return the SegmentedFile built by the abstract complete method
 */
@SuppressWarnings("resource") // SegmentedFile owns channel
private SegmentedFile complete(String path, int bufferSize, long overrideLength)
{
    ChannelProxy channelCopy = getChannel(path);
    try
    {
        return complete(channelCopy, bufferSize, overrideLength);
    }
    catch (Throwable t)
    {
        // The SegmentedFile was never created, so nobody else will release this copy.
        try
        {
            channelCopy.close();
        }
        catch (Throwable closeFailure)
        {
            // keep the original failure as the primary error
            t.addSuppressed(closeFailure);
        }
        throw t;
    }
}
/**
 * Builds the SegmentedFile for the DATA component, limited to the data length of the given boundary.
 *
 * @param desc     descriptor used to resolve the data file name
 * @param stats    statistics used to pick the buffer size
 * @param boundary readable boundary whose dataLength is used as the length override
 */
public SegmentedFile buildData(Descriptor desc, StatsMetadata stats, IndexSummaryBuilder.ReadableBoundary boundary)
{
    final String dataPath = desc.filenameFor(Component.DATA);
    final int readBufferSize = bufferSize(stats);
    return complete(dataPath, readBufferSize, boundary.dataLength);
}
/**
 * Builds the SegmentedFile for the DATA component, passing -1 as the length override.
 *
 * @param desc  descriptor used to resolve the data file name
 * @param stats statistics used to pick the buffer size
 */
public SegmentedFile buildData(Descriptor desc, StatsMetadata stats)
{
    final String dataPath = desc.filenameFor(Component.DATA);
    final int readBufferSize = bufferSize(stats);
    return complete(dataPath, readBufferSize, -1L);
}
/**
 * Builds the SegmentedFile for the PRIMARY_INDEX component, limited to the index length of the given boundary.
 *
 * @param desc         descriptor used to resolve the index file name
 * @param indexSummary summary used to pick the buffer size
 * @param boundary     readable boundary whose indexLength is used as the length override
 */
public SegmentedFile buildIndex(Descriptor desc, IndexSummary indexSummary, IndexSummaryBuilder.ReadableBoundary boundary)
{
    final String indexPath = desc.filenameFor(Component.PRIMARY_INDEX);
    final int readBufferSize = bufferSize(desc, indexSummary);
    return complete(indexPath, readBufferSize, boundary.indexLength);
}
/**
 * Builds the SegmentedFile for the PRIMARY_INDEX component, passing -1 as the length override.
 *
 * @param desc         descriptor used to resolve the index file name
 * @param indexSummary summary used to pick the buffer size
 */
public SegmentedFile buildIndex(Descriptor desc, IndexSummary indexSummary)
{
    final String indexPath = desc.filenameFor(Component.PRIMARY_INDEX);
    final int readBufferSize = bufferSize(desc, indexSummary);
    return complete(indexPath, readBufferSize, -1L);
}
/**
 * Picks the buffer size for a data file from the configured percentile of the estimated partition size.
 */
private static int bufferSize(StatsMetadata stats)
{
    final long estimatedRecordSize =
        stats.estimatedPartitionSize.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile());
    return bufferSize(estimatedRecordSize);
}
/**
 * Picks the buffer size for the primary index file from the average number of index file bytes
 * per index summary entry.
 *
 * An empty summary is treated as a record size of 0 rather than dividing by zero, which would
 * otherwise surface as an ArithmeticException.
 *
 * @param desc         descriptor used to locate the primary index file
 * @param indexSummary summary whose entry count is the divisor
 */
private static int bufferSize(Descriptor desc, IndexSummary indexSummary)
{
    long summaryEntries = indexSummary.size();
    if (summaryEntries <= 0)
        return bufferSize(0L);

    File indexFile = new File(desc.filenameFor(Component.PRIMARY_INDEX));
    return bufferSize(indexFile.length() / summaryEntries);
}
/**
 * Return the buffer size for a given record size. For spinning disks always add one page.
 * For solid state disks only add one page if the chance of crossing to the next page is at least
 * a predefined value, @see Config.disk_optimization_page_cross_chance.
 *
 * @throws IllegalStateException if the configured strategy is neither ssd nor spinning
 */
static int bufferSize(long recordSize)
{
    final Config.DiskOptimizationStrategy strategy = DatabaseDescriptor.getDiskOptimizationStrategy();

    if (strategy == Config.DiskOptimizationStrategy.spinning)
        return roundBufferSize(recordSize + 4096);

    if (strategy != Config.DiskOptimizationStrategy.ssd)
        throw new IllegalStateException("Unsupported disk optimization strategy: " + strategy);

    // Assuming record start positions are uniformly distributed within a page, the chance that a
    // record crosses into the next page is (record size modulo page size) / page size.
    double pageCrossProbability = (recordSize % 4096) / 4096.;

    // "equal or bigger than the configured chance", with a small tolerance for floating point error
    boolean likelyToCross = (pageCrossProbability - DatabaseDescriptor.getDiskOptimizationPageCrossChance()) > -1e-16;

    long paddedSize = likelyToCross ? recordSize + 4096 : recordSize;
    return roundBufferSize(paddedSize);
}
/**
 * Round up to the next multiple of 4k but no more than 64k.
 *
 * Sizes that are zero or negative yield a single 4k page. Sizes of 64k or more are clamped before
 * rounding, because adding 4095 to a value close to Long.MAX_VALUE would overflow.
 *
 * @param size the requested size in bytes
 * @return a multiple of 4096 between 4096 and 65536 inclusive
 */
static int roundBufferSize(long size)
{
    if (size <= 0)
        return 4096;

    // clamp first so the rounding below can never overflow
    if (size >= 1 << 16)
        return 1 << 16;

    return (int) ((size + 4095) & ~4095L);
}
/**
 * Writes the name of the currently configured disk access mode, but only for versions that have boundaries.
 *
 * @param out     destination of the serialized mode name
 * @param version sstable version; nothing is written unless it has boundaries
 */
public void serializeBounds(DataOutput out, Version version) throws IOException
{
    if (version.hasBoundaries())
    {
        String accessMode = DatabaseDescriptor.getDiskAccessMode().name();
        out.writeUTF(accessMode);
    }
}
/**
 * Reads back the disk access mode name written by serializeBounds and checks that it matches the
 * currently configured mode. Nothing is read for versions without boundaries.
 *
 * @param in      source of the serialized mode name
 * @param version sstable version; nothing is read unless it has boundaries
 * @throws IOException if the recorded mode differs from the configured one
 */
public void deserializeBounds(DataInput in, Version version) throws IOException
{
    if (version.hasBoundaries())
    {
        String recordedMode = in.readUTF();
        String configuredMode = DatabaseDescriptor.getDiskAccessMode().name();
        if (!recordedMode.equals(configuredMode))
            throw new IOException("Cannot deserialize SSTable Summary component because the DiskAccessMode was changed!");
    }
}
/**
 * Closes the cached channel, if one was ever opened.
 *
 * @param accumulate failure carried in from earlier steps
 * @return the result of closing the channel with {@code accumulate}, or {@code accumulate} itself
 * when there is no channel
 */
public Throwable close(Throwable accumulate)
{
    return channel == null ? accumulate : channel.close(accumulate);
}
/**
 * Closes this Builder, passing whatever close(Throwable) returns to maybeFail.
 */
public void close()
{
    Throwable failure = close(null);
    maybeFail(failure);
}
/**
 * Returns a shared copy of the channel for {@code path}, reusing the cached channel when its
 * path matches and otherwise replacing it with a newly opened one.
 */
private ChannelProxy getChannel(String path)
{
    // Fragile: the cached channel is only recognised when path and channel.filePath() agree
    // textually, i.e. both absolute or both relative. Eventually the file path should be passed
    // to the builder constructor so this comparison can go away.
    boolean reusable = channel != null && channel.filePath().equals(path);
    if (!reusable)
    {
        // a channel opened for another path is released before it is replaced
        if (channel != null)
            channel.close();
        channel = new ChannelProxy(path);
    }
    return channel.sharedCopy();
}
}
/**
 * @return the simple class name followed by the file path and the length
 */
@Override
public String toString()
{
    StringBuilder description = new StringBuilder(getClass().getSimpleName());
    description.append("(path='").append(path()).append('\'');
    description.append(", length=").append(length);
    description.append(')');
    return description.toString();
}
}