| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.iceberg; |
| |
| import java.nio.ByteBuffer; |
| import java.util.List; |
| import java.util.Map; |
| import org.apache.hadoop.fs.FileStatus; |
| import org.apache.iceberg.encryption.EncryptedOutputFile; |
| import org.apache.iceberg.encryption.EncryptionKeyMetadata; |
| import org.apache.iceberg.hadoop.HadoopInputFile; |
| import org.apache.iceberg.io.InputFile; |
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; |
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; |
| import org.apache.iceberg.types.Conversions; |
| import org.apache.iceberg.util.ArrayUtil; |
| import org.apache.iceberg.util.ByteBuffers; |
| |
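/**
 * Helper methods and a builder for creating {@link DataFile} instances to add to a table.
 *
 * <p>A minimal usage sketch (illustrative only): it assumes a table with an identity partition on
 * a hypothetical {@code date} column and a data file that has already been written, with its size
 * and record count available in local variables.
 *
 * <pre>{@code
 * PartitionSpec spec = table.spec();
 * DataFile dataFile =
 *     DataFiles.builder(spec)
 *         .withPath("/path/to/data-file.parquet")
 *         .withFileSizeInBytes(fileSizeInBytes)
 *         .withPartitionPath("date=2021-01-01") // hypothetical identity partition value
 *         .withRecordCount(recordCount)
 *         .build();
 * }</pre>
 */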
| public class DataFiles { |
| |
| private DataFiles() {} |
| |
| static PartitionData newPartitionData(PartitionSpec spec) { |
| return new PartitionData(spec.partitionType()); |
| } |
| |
| static PartitionData copyPartitionData( |
| PartitionSpec spec, StructLike partitionData, PartitionData reuse) { |
    Preconditions.checkArgument(
        spec.isPartitioned(), "Can't copy partition data to an unpartitioned table");
| PartitionData data = reuse; |
| if (data == null) { |
| data = newPartitionData(spec); |
| } |
| |
| Class<?>[] javaClasses = spec.javaClasses(); |
| List<PartitionField> fields = spec.fields(); |
| for (int i = 0; i < fields.size(); i += 1) { |
| data.set(i, partitionData.get(i, javaClasses[i])); |
| } |
| |
| return data; |
| } |
| |
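  /**
   * Fills partition data from a Hive-style partition path of the form
   * {@code field1=value1/field2=value2}. Entries are matched positionally against the spec's
   * fields, and each entry's name must equal the corresponding partition field name.
   */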
| static PartitionData fillFromPath(PartitionSpec spec, String partitionPath, PartitionData reuse) { |
| PartitionData data = reuse; |
| if (data == null) { |
| data = newPartitionData(spec); |
| } |
| |
| String[] partitions = partitionPath.split("/", -1); |
| Preconditions.checkArgument( |
| partitions.length <= spec.fields().size(), |
| "Invalid partition data, too many fields (expecting %s): %s", |
| spec.fields().size(), |
| partitionPath); |
| Preconditions.checkArgument( |
| partitions.length >= spec.fields().size(), |
| "Invalid partition data, not enough fields (expecting %s): %s", |
| spec.fields().size(), |
| partitionPath); |
| |
| for (int i = 0; i < partitions.length; i += 1) { |
| PartitionField field = spec.fields().get(i); |
| String[] parts = partitions[i].split("=", 2); |
| Preconditions.checkArgument( |
| parts.length == 2 && parts[0] != null && field.name().equals(parts[0]), |
| "Invalid partition: %s", |
| partitions[i]); |
| |
| data.set(i, Conversions.fromPartitionString(data.getType(i), parts[1])); |
| } |
| |
| return data; |
| } |
| |
| static PartitionData fillFromValues( |
| PartitionSpec spec, List<String> partitionValues, PartitionData reuse) { |
| PartitionData data = reuse; |
| if (data == null) { |
| data = newPartitionData(spec); |
| } |
| |
| Preconditions.checkArgument( |
| partitionValues.size() == spec.fields().size(), |
| "Invalid partition data, expecting %s fields, found %s", |
| spec.fields().size(), |
| partitionValues.size()); |
| |
| for (int i = 0; i < partitionValues.size(); i += 1) { |
| data.set(i, Conversions.fromPartitionString(data.getType(i), partitionValues.get(i))); |
| } |
| |
| return data; |
| } |
| |
| public static PartitionData data(PartitionSpec spec, String partitionPath) { |
| return fillFromPath(spec, partitionPath, null); |
| } |
| |
| public static PartitionData copy(PartitionSpec spec, StructLike partition) { |
| return copyPartitionData(spec, partition, null); |
| } |
| |
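  /**
   * Wraps a {@link ManifestFile} as an unpartitioned Avro {@link DataFile} whose record count is
   * the sum of the manifest's added and existing data file counts and whose size is the manifest
   * length. Both counts must be present in the manifest's metadata.
   */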
| public static DataFile fromManifest(ManifestFile manifest) { |
| Preconditions.checkArgument( |
| manifest.addedFilesCount() != null && manifest.existingFilesCount() != null, |
| "Cannot create data file from manifest: data file counts are missing."); |
| |
| return DataFiles.builder(PartitionSpec.unpartitioned()) |
| .withPath(manifest.path()) |
| .withFormat(FileFormat.AVRO) |
| .withRecordCount(manifest.addedFilesCount() + manifest.existingFilesCount()) |
| .withFileSizeInBytes(manifest.length()) |
| .build(); |
| } |
| |
| public static Builder builder(PartitionSpec spec) { |
| return new Builder(spec); |
| } |
| |
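  /**
   * A mutable, reusable builder for {@link DataFile} instances. Use {@link #clear()} to reset the
   * builder between files, or {@link #copy(DataFile)} to start from the metadata of an existing
   * file. {@link #build()} copies the partition data, so the same builder instance can be cleared
   * and reused to describe multiple files.
   */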
| public static class Builder { |
| private final PartitionSpec spec; |
| private final boolean isPartitioned; |
| private final int specId; |
| private PartitionData partitionData; |
| private String filePath = null; |
| private FileFormat format = null; |
| private long recordCount = -1L; |
| private long fileSizeInBytes = -1L; |
| |
| // optional fields |
| private Map<Integer, Long> columnSizes = null; |
| private Map<Integer, Long> valueCounts = null; |
| private Map<Integer, Long> nullValueCounts = null; |
| private Map<Integer, Long> nanValueCounts = null; |
| private Map<Integer, ByteBuffer> lowerBounds = null; |
| private Map<Integer, ByteBuffer> upperBounds = null; |
| private ByteBuffer keyMetadata = null; |
| private List<Long> splitOffsets = null; |
| private List<Integer> equalityFieldIds = null; |
| private Integer sortOrderId = SortOrder.unsorted().orderId(); |
| |
| public Builder(PartitionSpec spec) { |
| this.spec = spec; |
| this.specId = spec.specId(); |
| this.isPartitioned = spec.isPartitioned(); |
| this.partitionData = isPartitioned ? newPartitionData(spec) : null; |
| } |
| |
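    /** Resets the builder state so that it can be reused to describe another data file. */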
| public void clear() { |
| if (isPartitioned) { |
| partitionData.clear(); |
| } |
| this.filePath = null; |
| this.format = null; |
| this.recordCount = -1L; |
| this.fileSizeInBytes = -1L; |
| this.columnSizes = null; |
| this.valueCounts = null; |
| this.nullValueCounts = null; |
| this.nanValueCounts = null; |
| this.lowerBounds = null; |
| this.upperBounds = null; |
      // also reset key metadata and equality field ids so a reused builder does not carry over
      // state from the previous file
      this.keyMetadata = null;
      this.splitOffsets = null;
      this.equalityFieldIds = null;
      this.sortOrderId = SortOrder.unsorted().orderId();
| } |
| |
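    /**
     * Copies the path, format, size, metrics, key metadata, split offsets, sort order id, and (for
     * a partitioned spec) the partition data of an existing {@link DataFile}. When the spec is
     * partitioned, the copied file must use the same partition spec id.
     */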
| public Builder copy(DataFile toCopy) { |
| if (isPartitioned) { |
| Preconditions.checkState( |
| specId == toCopy.specId(), "Cannot copy a DataFile with a different spec"); |
| this.partitionData = copyPartitionData(spec, toCopy.partition(), partitionData); |
| } |
| this.filePath = toCopy.path().toString(); |
| this.format = toCopy.format(); |
| this.recordCount = toCopy.recordCount(); |
| this.fileSizeInBytes = toCopy.fileSizeInBytes(); |
| this.columnSizes = toCopy.columnSizes(); |
| this.valueCounts = toCopy.valueCounts(); |
| this.nullValueCounts = toCopy.nullValueCounts(); |
| this.nanValueCounts = toCopy.nanValueCounts(); |
| this.lowerBounds = toCopy.lowerBounds(); |
| this.upperBounds = toCopy.upperBounds(); |
| this.keyMetadata = |
| toCopy.keyMetadata() == null ? null : ByteBuffers.copy(toCopy.keyMetadata()); |
| this.splitOffsets = |
| toCopy.splitOffsets() == null ? null : ImmutableList.copyOf(toCopy.splitOffsets()); |
| this.sortOrderId = toCopy.sortOrderId(); |
| return this; |
| } |
| |
| public Builder withStatus(FileStatus stat) { |
| this.filePath = stat.getPath().toString(); |
| this.fileSizeInBytes = stat.getLen(); |
| return this; |
| } |
| |
| public Builder withInputFile(InputFile file) { |
| if (file instanceof HadoopInputFile) { |
| return withStatus(((HadoopInputFile) file).getStat()); |
| } |
| |
| this.filePath = file.location(); |
| this.fileSizeInBytes = file.getLength(); |
| return this; |
| } |
| |
| public Builder withEncryptedOutputFile(EncryptedOutputFile newEncryptedFile) { |
| withInputFile(newEncryptedFile.encryptingOutputFile().toInputFile()); |
| withEncryptionKeyMetadata(newEncryptedFile.keyMetadata()); |
| return this; |
| } |
| |
| public Builder withPath(String newFilePath) { |
| this.filePath = newFilePath; |
| return this; |
| } |
| |
| public Builder withFormat(String newFormat) { |
| this.format = FileFormat.fromString(newFormat); |
| return this; |
| } |
| |
| public Builder withFormat(FileFormat newFormat) { |
| this.format = newFormat; |
| return this; |
| } |
| |
| public Builder withPartition(StructLike newPartition) { |
| if (isPartitioned) { |
| this.partitionData = copyPartitionData(spec, newPartition, partitionData); |
| } |
| return this; |
| } |
| |
| public Builder withRecordCount(long newRecordCount) { |
| this.recordCount = newRecordCount; |
| return this; |
| } |
| |
| public Builder withFileSizeInBytes(long newFileSizeInBytes) { |
| this.fileSizeInBytes = newFileSizeInBytes; |
| return this; |
| } |
| |
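    /**
     * Sets partition data from a Hive-style partition path such as {@code date=2021-01-01/hour=03}
     * (the {@code date} and {@code hour} fields here are hypothetical). Field names and their
     * order must match the partition spec. Pass an empty string for an unpartitioned spec.
     */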
| public Builder withPartitionPath(String newPartitionPath) { |
| Preconditions.checkArgument( |
| isPartitioned || newPartitionPath.isEmpty(), |
| "Cannot add partition data for an unpartitioned table"); |
| if (!newPartitionPath.isEmpty()) { |
| this.partitionData = fillFromPath(spec, newPartitionPath, partitionData); |
| } |
| return this; |
| } |
| |
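    /**
     * Sets partition data from a list of value strings, one per partition field in spec order;
     * each value is parsed from its partition string representation. The list must be empty for an
     * unpartitioned spec.
     */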
| public Builder withPartitionValues(List<String> partitionValues) { |
| Preconditions.checkArgument( |
| isPartitioned ^ partitionValues.isEmpty(), |
| "Table must be partitioned or partition values must be empty"); |
| if (!partitionValues.isEmpty()) { |
| this.partitionData = fillFromValues(spec, partitionValues, partitionData); |
| } |
| return this; |
| } |
| |
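    /**
     * Sets the record count and column-level stats (column sizes, value/null/NaN counts, and lower
     * and upper bounds) from a {@link Metrics} instance, typically the metrics reported by the
     * file writer.
     */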
| public Builder withMetrics(Metrics metrics) { |
| // check for null to avoid NPE when unboxing |
| this.recordCount = metrics.recordCount() == null ? -1 : metrics.recordCount(); |
| this.columnSizes = metrics.columnSizes(); |
| this.valueCounts = metrics.valueCounts(); |
| this.nullValueCounts = metrics.nullValueCounts(); |
| this.nanValueCounts = metrics.nanValueCounts(); |
| this.lowerBounds = metrics.lowerBounds(); |
| this.upperBounds = metrics.upperBounds(); |
| return this; |
| } |
| |
| public Builder withSplitOffsets(List<Long> offsets) { |
| if (offsets != null) { |
| this.splitOffsets = ImmutableList.copyOf(offsets); |
| } else { |
| this.splitOffsets = null; |
| } |
| return this; |
| } |
| |
| public Builder withEqualityFieldIds(List<Integer> equalityIds) { |
| if (equalityIds != null) { |
| this.equalityFieldIds = ImmutableList.copyOf(equalityIds); |
| } |
| |
| return this; |
| } |
| |
| public Builder withEncryptionKeyMetadata(ByteBuffer newKeyMetadata) { |
| this.keyMetadata = newKeyMetadata; |
| return this; |
| } |
| |
| public Builder withEncryptionKeyMetadata(EncryptionKeyMetadata newKeyMetadata) { |
| return withEncryptionKeyMetadata(newKeyMetadata.buffer()); |
| } |
| |
| public Builder withSortOrder(SortOrder newSortOrder) { |
| if (newSortOrder != null) { |
| this.sortOrderId = newSortOrder.orderId(); |
| } |
| return this; |
| } |
| |
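    /**
     * Builds a {@link DataFile}. The file path, file size, and record count are required; when no
     * format was set, it is inferred from the file name. Partition data is copied into the result,
     * so the builder may be cleared and reused afterwards.
     */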
| public DataFile build() { |
| Preconditions.checkArgument(filePath != null, "File path is required"); |
| if (format == null) { |
| this.format = FileFormat.fromFileName(filePath); |
| } |
| Preconditions.checkArgument(format != null, "File format is required"); |
| Preconditions.checkArgument(fileSizeInBytes >= 0, "File size is required"); |
| Preconditions.checkArgument(recordCount >= 0, "Record count is required"); |
| |
| return new GenericDataFile( |
| specId, |
| filePath, |
| format, |
| isPartitioned ? partitionData.copy() : null, |
| fileSizeInBytes, |
| new Metrics( |
| recordCount, |
| columnSizes, |
| valueCounts, |
| nullValueCounts, |
| nanValueCounts, |
| lowerBounds, |
| upperBounds), |
| keyMetadata, |
| splitOffsets, |
| ArrayUtil.toIntArray(equalityFieldIds), |
| sortOrderId); |
| } |
| } |
| } |