blob: 07118be23e17216685b18f2b913ad9080b6ee499 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;
import java.util.Arrays;
import java.util.Set;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Types;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
/**
 * Tests for {@link FindFiles}, run once per table format version supplied by {@link #parameters()}.
 *
 * <p>Each test appends data files to the {@code table} fixture inherited from
 * {@link TableTestBase} and checks which file paths {@code FindFiles} returns for a given
 * combination of filters.
 */
@RunWith(Parameterized.class)
public class TestFindFiles extends TableTestBase {
  /**
   * Supplies the format versions the whole suite is executed against.
   *
   * @return the format versions, one test-class instance is created per entry
   */
  @Parameterized.Parameters(name = "formatVersion = {0}")
  public static Object[] parameters() {
    return new Object[] { 1, 2 };
  }

  /**
   * @param formatVersion table format version forwarded to {@link TableTestBase}
   */
  public TestFindFiles(int formatVersion) {
    super(formatVersion);
  }

  /** With no filters, every file appended to the table is returned. */
  @Test
  public void testBasicBehavior() {
    table.newAppend()
        .appendFile(FILE_A)
        .appendFile(FILE_B)
        .commit();

    Iterable<DataFile> files = FindFiles.in(table).collect();

    Assert.assertEquals(pathSet(FILE_A, FILE_B), pathSet(files));
  }

  /** A metadata filter on {@code file_path} keeps only the file whose path has the given prefix. */
  @Test
  public void testWithMetadataMatching() {
    table.newAppend()
        .appendFile(FILE_A)
        .appendFile(FILE_B)
        .appendFile(FILE_C)
        .appendFile(FILE_D)
        .commit();

    Iterable<DataFile> files = FindFiles.in(table)
        .withMetadataMatching(Expressions.startsWith("file_path", "/path/to/data-a"))
        .collect();

    Assert.assertEquals(pathSet(FILE_A), pathSet(files));
  }

  /**
   * A record filter matches a file whose column metrics admit the requested value: the file is
   * registered with bounds [1, 5] for field id 1 and the filter asks for {@code id == 1}.
   */
  @Test
  public void testWithRecordsMatching() {
    table.newAppend()
        .appendFile(DataFiles.builder(SPEC)
            .withInputFile(Files.localInput("/path/to/data-e.parquet"))
            .withPartitionPath("data_bucket=4")
            .withMetrics(new Metrics(3L,
                null, // no column sizes
                ImmutableMap.of(1, 3L), // value count
                ImmutableMap.of(1, 0L), // null count
                ImmutableMap.of(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1)), // lower bounds
                ImmutableMap.of(1, Conversions.toByteBuffer(Types.IntegerType.get(), 5)))) // upper bounds
            .build())
        .commit();

    Iterable<DataFile> files = FindFiles.in(table)
        .withRecordsMatching(Expressions.equal("id", 1))
        .collect();

    Assert.assertEquals(Sets.newHashSet("/path/to/data-e.parquet"), pathSet(files));
  }

  /** Repeated {@code inPartition} calls accumulate: files from both requested buckets are returned. */
  @Test
  public void testInPartition() {
    table.newAppend()
        .appendFile(FILE_A) // bucket 0
        .appendFile(FILE_B) // bucket 1
        .appendFile(FILE_C) // bucket 2
        .appendFile(FILE_D) // bucket 3
        .commit();

    Iterable<DataFile> files = FindFiles.in(table)
        .inPartition(table.spec(), StaticDataTask.Row.of(1))
        .inPartition(table.spec(), StaticDataTask.Row.of(2))
        .collect();

    Assert.assertEquals(pathSet(FILE_B, FILE_C), pathSet(files));
  }

  /** A single {@code inPartitions} call with several partitions returns files from each of them. */
  @Test
  public void testInPartitions() {
    table.newAppend()
        .appendFile(FILE_A) // bucket 0
        .appendFile(FILE_B) // bucket 1
        .appendFile(FILE_C) // bucket 2
        .appendFile(FILE_D) // bucket 3
        .commit();

    Iterable<DataFile> files = FindFiles.in(table)
        .inPartitions(table.spec(), StaticDataTask.Row.of(1), StaticDataTask.Row.of(2))
        .collect();

    Assert.assertEquals(pathSet(FILE_B, FILE_C), pathSet(files));
  }

  /**
   * {@code asOfTime} returns only the files committed up to the given timestamp; files appended
   * afterwards are excluded.
   */
  @Test
  public void testAsOfTimestamp() {
    table.newAppend()
        .appendFile(FILE_A)
        .commit();

    table.newAppend()
        .appendFile(FILE_B)
        .commit();

    long timestamp = System.currentTimeMillis();

    // Make sure the following commits cannot land in the same millisecond as the cut-off.
    // NOTE(review): assumes snapshot timestamps are taken from the wall clock and that asOfTime
    // treats a snapshot at exactly `timestamp` as included -- confirm against FindFiles.
    awaitClockPast(timestamp);

    table.newAppend()
        .appendFile(FILE_C)
        .commit();

    table.newAppend()
        .appendFile(FILE_D)
        .commit();

    Iterable<DataFile> files = FindFiles.in(table).asOfTime(timestamp).collect();

    Assert.assertEquals(pathSet(FILE_A, FILE_B), pathSet(files));
  }

  /** {@code inSnapshot} returns the files visible in that snapshot, ignoring later appends. */
  @Test
  public void testSnapshotId() {
    table.newAppend()
        .appendFile(FILE_A)
        .appendFile(FILE_B)
        .commit();

    table.newAppend()
        .appendFile(FILE_C)
        .commit();

    // remember the snapshot that contains A, B and C before D is added
    long snapshotId = table.currentSnapshot().snapshotId();

    table.newAppend()
        .appendFile(FILE_D)
        .commit();

    Iterable<DataFile> files = FindFiles.in(table).inSnapshot(snapshotId).collect();

    Assert.assertEquals(pathSet(FILE_A, FILE_B, FILE_C), pathSet(files));
  }

  /** With {@code caseInsensitive()}, an upper-case column name in the filter still resolves. */
  @Test
  public void testCaseSensitivity() {
    table.newAppend()
        .appendFile(FILE_A)
        .appendFile(FILE_B)
        .appendFile(FILE_C)
        .appendFile(FILE_D)
        .commit();

    Iterable<DataFile> files = FindFiles.in(table)
        .caseInsensitive()
        .withMetadataMatching(Expressions.startsWith("FILE_PATH", "/path/to/data-a"))
        .collect();

    Assert.assertEquals(pathSet(FILE_A), pathSet(files));
  }

  /** Collecting from a table that has no snapshot yields an empty result instead of failing. */
  @Test
  public void testNoSnapshot() {
    // a table has no snapshot when it just gets created and no data is loaded yet
    // if not handled properly, NPE will be thrown in collect()
    Iterable<DataFile> files = FindFiles.in(table).collect();

    // verify an empty collection of data file is returned
    Assert.assertEquals(0, Sets.newHashSet(files).size());
  }

  /**
   * Blocks until {@link System#currentTimeMillis()} is strictly greater than the given value.
   *
   * @param timestampMillis wall-clock reading that must be in the past when this method returns
   */
  private static void awaitClockPast(long timestampMillis) {
    while (System.currentTimeMillis() <= timestampMillis) {
      Thread.yield();
    }
  }

  /**
   * Collects the paths of the given files into a set.
   *
   * @param files data files whose paths are wanted
   * @return the string form of each file's path
   */
  private Set<String> pathSet(DataFile... files) {
    return pathSet(Arrays.asList(files));
  }

  /**
   * Collects the paths of the given files into a set.
   *
   * @param files data files whose paths are wanted
   * @return the string form of each file's path
   */
  private Set<String> pathSet(Iterable<DataFile> files) {
    return Sets.newHashSet(Iterables.transform(files, file -> file.path().toString()));
  }
}