Merge pull request #531 from sebastian-nagel/NUTCH-2787
NUTCH-2787 CrawlDb JSON dump does not export metadata primitive data types correctly
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 603b2e3..1bb8160 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -79,8 +79,11 @@
import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.util.MinimalPrettyPrinter;
+import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.module.SimpleModule;
/**
* Read utility for the CrawlDB.
@@ -243,6 +246,9 @@
this.out = out;
jsonMapper.getFactory()
.configure(JsonGenerator.Feature.ESCAPE_NON_ASCII, true);
+ SimpleModule module = new SimpleModule();
+ module.addSerializer(Writable.class, new WritableSerializer());
+ jsonMapper.registerModule(module);
jsonWriter = jsonMapper.writer(new JsonIndenter());
}
@@ -295,6 +301,36 @@
DataOutputStream fileOut = fs.create(new Path(dir, name), context);
return new LineRecordWriter(fileOut);
}
+
+ public static class WritableSerializer extends JsonSerializer<Writable> {
+ @Override
+ public void serialize(Writable obj, JsonGenerator jgen,
+ SerializerProvider provider) throws IOException {
+ if (obj instanceof org.apache.hadoop.io.NullWritable) {
+ jgen.writeNull();
+ } else if (obj instanceof org.apache.hadoop.io.BooleanWritable) {
+ jgen.writeBoolean(((org.apache.hadoop.io.BooleanWritable) obj).get());
+ } else if (obj instanceof org.apache.hadoop.io.IntWritable) {
+ jgen.writeNumber(((org.apache.hadoop.io.IntWritable) obj).get());
+ } else if (obj instanceof org.apache.hadoop.io.VIntWritable) {
+ jgen.writeNumber(((org.apache.hadoop.io.VIntWritable) obj).get());
+ } else if (obj instanceof org.apache.hadoop.io.LongWritable) {
+ jgen.writeNumber(((org.apache.hadoop.io.LongWritable) obj).get());
+ } else if (obj instanceof org.apache.hadoop.io.VLongWritable) {
+ jgen.writeNumber(((org.apache.hadoop.io.VLongWritable) obj).get());
+ } else if (obj instanceof org.apache.hadoop.io.ByteWritable) {
+ jgen.writeNumber(((org.apache.hadoop.io.ByteWritable) obj).get());
+ } else if (obj instanceof org.apache.hadoop.io.FloatWritable) {
+ jgen.writeNumber(((org.apache.hadoop.io.FloatWritable) obj).get());
+ } else if (obj instanceof org.apache.hadoop.io.DoubleWritable) {
+ jgen.writeNumber(((org.apache.hadoop.io.DoubleWritable) obj).get());
+ } else if (obj instanceof org.apache.hadoop.io.BytesWritable) {
+ jgen.writeBinary(((org.apache.hadoop.io.BytesWritable) obj).getBytes());
+ } else {
+ jgen.writeString(obj.toString());
+ }
+ }
+ }
}
public static class CrawlDbStatMapper