/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.avro.mapred;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.EncoderFactory;
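
/*
 * Rough usage sketch, assuming AvroJob registers this Serialization under
 * io.serializations when an Avro map output schema is configured (the exact
 * wiring is an illustration, not taken from this file):
 *
 *   JobConf job = new JobConf();
 *   // Map output is an Avro Pair<string, long>; keys and values are then
 *   // shuffled as AvroKey/AvroValue via this Serialization.
 *   AvroJob.setMapOutputSchema(job,
 *       Pair.getPairSchema(Schema.create(Schema.Type.STRING),
 *                          Schema.create(Schema.Type.LONG)));
 */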
/** The {@link Serialization} used by jobs configured with {@link AvroJob}. */
public class AvroSerialization<T> extends Configured
  implements Serialization<AvroWrapper<T>> {

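  /** Accepts {@link AvroWrapper} and its subclasses, {@link AvroKey} and
   * {@link AvroValue}. */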
  public boolean accept(Class<?> c) {
    return AvroWrapper.class.isAssignableFrom(c);
  }

  /** Returns the specified map output deserializer.  Defaults to the final
   * output deserializer if no map output schema was specified. */
  public Deserializer<AvroWrapper<T>> getDeserializer(Class<AvroWrapper<T>> c) {
    Configuration conf = getConf();
    boolean isKey = AvroKey.class.isAssignableFrom(c);
    Schema schema = isKey
      ? Pair.getKeySchema(AvroJob.getMapOutputSchema(conf))
      : Pair.getValueSchema(AvroJob.getMapOutputSchema(conf));
    GenericData dataModel = AvroJob.createMapOutputDataModel(conf);
    DatumReader<T> datumReader = dataModel.createDatumReader(schema);
    return new AvroWrapperDeserializer(datumReader, isKey);
  }

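  // Shared decoder factory; each deserializer reuses its BinaryDecoder across open() calls.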
  private static final DecoderFactory FACTORY = DecoderFactory.get();

  private class AvroWrapperDeserializer
    implements Deserializer<AvroWrapper<T>> {

    private DatumReader<T> reader;
    private BinaryDecoder decoder;
    private boolean isKey;

    public AvroWrapperDeserializer(DatumReader<T> reader, boolean isKey) {
      this.reader = reader;
      this.isKey = isKey;
    }

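    /** Opens an unbuffered decoder that reads directly from the stream,
     * reusing the previous decoder instance when one exists. */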
    public void open(InputStream in) {
      this.decoder = FACTORY.directBinaryDecoder(in, decoder);
    }

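    /** Reads one datum, reusing the supplied wrapper and its datum when
     * possible rather than allocating new objects for every record. */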
    public AvroWrapper<T> deserialize(AvroWrapper<T> wrapper)
      throws IOException {
      T datum = reader.read(wrapper == null ? null : wrapper.datum(), decoder);
      if (wrapper == null) {
        wrapper = isKey ? new AvroKey<T>(datum) : new AvroValue<T>(datum);
      } else {
        wrapper.datum(datum);
      }
      return wrapper;
    }

    public void close() throws IOException {
      decoder.inputStream().close();
    }

  }

  /** Returns the specified output serializer. */
  public Serializer<AvroWrapper<T>> getSerializer(Class<AvroWrapper<T>> c) {
    // AvroWrapper used for final output, AvroKey or AvroValue for map output
    boolean isFinalOutput = c.equals(AvroWrapper.class);
    Configuration conf = getConf();
    Schema schema = isFinalOutput
      ? AvroJob.getOutputSchema(conf)
      : (AvroKey.class.isAssignableFrom(c)
         ? Pair.getKeySchema(AvroJob.getMapOutputSchema(conf))
         : Pair.getValueSchema(AvroJob.getMapOutputSchema(conf)));
    GenericData dataModel = AvroJob.createDataModel(conf);
    return new AvroWrapperSerializer(dataModel.createDatumWriter(schema));
  }

  private class AvroWrapperSerializer implements Serializer<AvroWrapper<T>> {

    private DatumWriter<T> writer;
    private OutputStream out;
    private BinaryEncoder encoder;

    public AvroWrapperSerializer(DatumWriter<T> writer) {
      this.writer = writer;
    }

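    /** Wraps the output stream in a buffered binary encoder; the encoder is
     * flushed after every record in {@link #serialize} (see comment there). */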
    public void open(OutputStream out) {
      this.out = out;
      this.encoder = new EncoderFactory().configureBlockSize(512)
        .binaryEncoder(out, null);
    }

    public void serialize(AvroWrapper<T> wrapper) throws IOException {
      writer.write(wrapper.datum(), encoder);
      // would be a lot faster if the Serializer interface had a flush()
      // method and the Hadoop framework called it when needed rather
      // than for every record.
      encoder.flush();
    }

    public void close() throws IOException {
      out.close();
    }

  }

}