/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch.kafka.inputformat;
import kafka.api.OffsetRequest;
import org.apache.crunch.Pair;
import org.apache.crunch.kafka.ClusterTest;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.common.TopicPartition;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import org.junit.runner.RunWith;
import org.mockito.Matchers;
import org.mockito.Mock;
import org.mockito.runners.MockitoJUnitRunner;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import static org.apache.crunch.kafka.KafkaUtils.getBrokerOffsets;
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertThat;
import static org.junit.matchers.JUnitMatchers.hasItem;
import static org.mockito.Mockito.when;
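/**
 * Integration tests for {@link KafkaRecordReader}, covering normal reads of a topic as well as
 * the reader's retry behavior when the consumer's poll returns null or empty results.
 */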
@RunWith(MockitoJUnitRunner.class)
public class KafkaRecordReaderIT {
@Mock
private TaskAttemptContext context;
@Mock
private Consumer<String, String> consumer;
@Rule
public TestName testName = new TestName();
private Properties consumerProps;
private Configuration config;
@BeforeClass
public static void setup() throws Exception {
ClusterTest.startTest();
}
@AfterClass
public static void cleanup() throws Exception {
ClusterTest.endTest();
}
private String topic;
@Before
public void setupTest() {
topic = testName.getMethodName();
consumerProps = ClusterTest.getConsumerProperties();
config = ClusterTest.getConsumerConfig();
when(context.getConfiguration()).thenReturn(config);
when(consumer.poll(Matchers.anyLong())).thenReturn(null);
}
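/**
 * Verifies that the reader returns every record written to each partition of the topic.
 */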
@Test
public void readData() throws IOException, InterruptedException {
List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(), topic);
Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);
Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
Long endingOffset = endOffsets.get(entry.getKey());
offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
}
KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);
Set<String> keysRead = new HashSet<>();
//read all data from all splits
for (Map.Entry<TopicPartition, Pair<Long, Long>> partitionInfo : offsets.entrySet()) {
KafkaInputSplit split = new KafkaInputSplit(partitionInfo.getKey().topic(), partitionInfo.getKey().partition(),
partitionInfo.getValue().first(), partitionInfo.getValue().second());
KafkaRecordReader<String, String> recordReader = new KafkaRecordReader<>();
recordReader.initialize(split, context);
int numRecordsFound = 0;
while (recordReader.nextKeyValue()) {
keysRead.add(recordReader.getCurrentKey());
assertThat(keys, hasItem(recordReader.getCurrentKey()));
assertThat(recordReader.getCurrentValue(), is(notNullValue()));
numRecordsFound++;
}
recordReader.close();
//assert that it encountered a partition's worth of data
assertThat(((long) numRecordsFound), is(partitionInfo.getValue().second() - partitionInfo.getValue().first()));
}
//validate that the number of unique keys read matches the number written
assertThat(keysRead.size(), is(keys.size()));
}
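/**
 * Verifies that the reader still returns all records when the consumer's poll returns null
 * for the first few attempts.
 */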
@Test
public void pollReturnsNullAtStart() throws IOException, InterruptedException {
List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(), topic);
Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);
Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
Long endingOffset = endOffsets.get(entry.getKey());
offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
}
KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);
Set<String> keysRead = new HashSet<>();
//read all data from all splits
for (Map.Entry<TopicPartition, Pair<Long, Long>> partitionInfo : offsets.entrySet()) {
KafkaInputSplit split = new KafkaInputSplit(partitionInfo.getKey().topic(), partitionInfo.getKey().partition(),
partitionInfo.getValue().first(), partitionInfo.getValue().second());
KafkaRecordReader<String, String> recordReader = new NullAtStartKafkaRecordReader<>(consumer, 3);
recordReader.initialize(split, context);
int numRecordsFound = 0;
while (recordReader.nextKeyValue()) {
keysRead.add(recordReader.getCurrentKey());
assertThat(keys, hasItem(recordReader.getCurrentKey()));
assertThat(recordReader.getCurrentValue(), is(notNullValue()));
numRecordsFound++;
}
recordReader.close();
//assert that it encountered a partition's worth of data
assertThat(((long) numRecordsFound), is(partitionInfo.getValue().second() - partitionInfo.getValue().first()));
}
//validate that the number of unique keys read matches the number written
assertThat(keysRead.size(), is(keys.size()));
}
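/**
 * Verifies that the reader still returns all records when the consumer's poll returns empty
 * results for the first few attempts.
 */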
@Test
public void pollReturnsEmptyAtStart() throws IOException, InterruptedException {
List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(), topic);
Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);
Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
Long endingOffset = endOffsets.get(entry.getKey());
offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
}
KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);
Set<String> keysRead = new HashSet<>();
//read all data from all splits
for (Map.Entry<TopicPartition, Pair<Long, Long>> partitionInfo : offsets.entrySet()) {
KafkaInputSplit split = new KafkaInputSplit(partitionInfo.getKey().topic(), partitionInfo.getKey().partition(),
partitionInfo.getValue().first(), partitionInfo.getValue().second());
when(consumer.poll(Matchers.anyLong())).thenReturn(ConsumerRecords.<String, String>empty());
KafkaRecordReader<String, String> recordReader = new NullAtStartKafkaRecordReader<>(consumer, 3);
recordReader.initialize(split, context);
int numRecordsFound = 0;
while (recordReader.nextKeyValue()) {
keysRead.add(recordReader.getCurrentKey());
assertThat(keys, hasItem(recordReader.getCurrentKey()));
assertThat(recordReader.getCurrentValue(), is(notNullValue()));
numRecordsFound++;
}
recordReader.close();
//assert that it encountered a partition's worth of data
assertThat(((long) numRecordsFound), is(partitionInfo.getValue().second() - partitionInfo.getValue().first()));
}
//validate that the number of unique keys read matches the number written
assertThat(keysRead.size(), is(keys.size()));
}
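/**
 * Verifies that the reader still returns all records when the consumer's poll returns null
 * partway through the read.
 */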
@Test
public void pollReturnsNullInMiddle() throws IOException, InterruptedException {
List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(), topic);
Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);
Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
Long endingOffset = endOffsets.get(entry.getKey());
offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
}
KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);
Set<String> keysRead = new HashSet<>();
//read all data from all splits
for (Map.Entry<TopicPartition, Pair<Long, Long>> partitionInfo : offsets.entrySet()) {
KafkaInputSplit split = new KafkaInputSplit(partitionInfo.getKey().topic(), partitionInfo.getKey().partition(),
partitionInfo.getValue().first(), partitionInfo.getValue().second());
KafkaRecordReader<String, String> recordReader = new InjectableKafkaRecordReader<>(consumer, 1);
recordReader.initialize(split, context);
int numRecordsFound = 0;
while (recordReader.nextKeyValue()) {
keysRead.add(recordReader.getCurrentKey());
assertThat(keys, hasItem(recordReader.getCurrentKey()));
assertThat(recordReader.getCurrentValue(), is(notNullValue()));
numRecordsFound++;
}
recordReader.close();
//assert that it encountered a partition's worth of data
assertThat(((long) numRecordsFound), is(partitionInfo.getValue().second() - partitionInfo.getValue().first()));
}
//validate that the number of unique keys read matches the number written
assertThat(keysRead.size(), is(keys.size()));
}
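/**
 * Verifies that the reader still returns all records when the consumer's poll returns empty
 * results partway through the read.
 */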
@Test
public void pollReturnsEmptyInMiddle() throws IOException, InterruptedException {
List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(), topic);
Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);
Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
Long endingOffset = endOffsets.get(entry.getKey());
offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
}
KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);
Set<String> keysRead = new HashSet<>();
//read all data from all splits
for (Map.Entry<TopicPartition, Pair<Long, Long>> partitionInfo : offsets.entrySet()) {
KafkaInputSplit split = new KafkaInputSplit(partitionInfo.getKey().topic(), partitionInfo.getKey().partition(),
partitionInfo.getValue().first(), partitionInfo.getValue().second());
when(consumer.poll(Matchers.anyLong())).thenReturn(ConsumerRecords.<String, String>empty());
KafkaRecordReader<String, String> recordReader = new InjectableKafkaRecordReader<>(consumer, 1);
recordReader.initialize(split, context);
int numRecordsFound = 0;
while (recordReader.nextKeyValue()) {
keysRead.add(recordReader.getCurrentKey());
assertThat(keys, hasItem(recordReader.getCurrentKey()));
assertThat(recordReader.getCurrentValue(), is(notNullValue()));
numRecordsFound++;
}
recordReader.close();
//assert that it encountered a partition's worth of data
assertThat(((long) numRecordsFound), is(partitionInfo.getValue().second() - partitionInfo.getValue().first()));
}
//validate that the number of unique keys read matches the number written
assertThat(keysRead.size(), is(keys.size()));
}
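/**
 * Verifies that the reader returns no records when the earliest available offset already
 * equals the ending offset for the split.
 */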
@Test
public void pollEarliestEqualsEnding() throws IOException, InterruptedException {
List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(), topic);
Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);
Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
Long endingOffset = endOffsets.get(entry.getKey());
offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
}
KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);
Set<String> keysRead = new HashSet<>();
//read all data from all splits
for (Map.Entry<TopicPartition, Pair<Long, Long>> partitionInfo : offsets.entrySet()) {
KafkaInputSplit split = new KafkaInputSplit(partitionInfo.getKey().topic(), partitionInfo.getKey().partition(),
partitionInfo.getValue().first(), partitionInfo.getValue().second());
when(consumer.poll(Matchers.anyLong())).thenReturn(ConsumerRecords.<String, String>empty());
KafkaRecordReader<String, String> recordReader = new EarliestRecordReader<>(consumer,
partitionInfo.getValue().second());
recordReader.initialize(split, context);
int numRecordsFound = 0;
while (recordReader.nextKeyValue()) {
keysRead.add(recordReader.getCurrentKey());
numRecordsFound++;
}
recordReader.close();
//assert that no records were read since the earliest offset equals the ending offset
assertThat(numRecordsFound, is(0));
}
//validate that no keys were read
assertThat(keysRead.size(), is(0));
}
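/**
 * Test reader that returns the mocked consumer for the first few calls to
 * {@link #getConsumer()} before delegating to the real consumer.
 */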
private static class NullAtStartKafkaRecordReader<K, V> extends KafkaRecordReader<K, V>{
private final Consumer consumer;
private final int callAttempts;
private int attempts;
public NullAtStartKafkaRecordReader(Consumer consumer, int callAttempts){
this.consumer = consumer;
this.callAttempts = callAttempts;
attempts = 0;
}
@Override
protected Consumer<K, V> getConsumer() {
if(attempts > callAttempts){
return super.getConsumer();
}
attempts++;
return consumer;
}
}
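/**
 * Test reader that returns the mocked consumer on a single, specific call to
 * {@link #getConsumer()} and the real consumer on every other call.
 */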
private static class InjectableKafkaRecordReader<K, V> extends KafkaRecordReader<K, V>{
private final Consumer consumer;
private final int failAttempt;
private int attempts;
public InjectableKafkaRecordReader(Consumer consumer, int failAttempt){
this.consumer = consumer;
this.failAttempt = failAttempt;
attempts = 0;
}
@Override
protected Consumer<K, V> getConsumer() {
if(attempts == failAttempt){
attempts++;
return consumer;
}
attempts++;
return super.getConsumer();
}
}
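/**
 * Test reader that always returns the mocked consumer and reports a fixed earliest offset,
 * simulating a split whose earliest available offset already matches the ending offset.
 */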
private static class EarliestRecordReader<K,V> extends KafkaRecordReader<K, V>{
private final long earliest;
private final Consumer consumer;
public EarliestRecordReader(Consumer consumer, long earliest){
this.earliest = earliest;
this.consumer = consumer;
}
@Override
protected Consumer<K, V> getConsumer() {
return consumer;
}
@Override
protected long getEarliestOffset() {
return earliest;
}
}
}