CHUKWA-784. Improve CharFileTailingAdaptorUTF8NewLineEscaped and LocalWriter
            logic to send proper data chunks. (Eric Yang)
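
            The adaptor previously emitted the whole read buffer even when it
            ended in a partial record; the chunk is now trimmed to the last
            complete record boundary. LocalWriter is also converted from
            SequenceFile output to Snappy-compressed Parquet using the shared
            Chukwa Avro schema. A minimal sketch of the trimming idea
            (illustrative only, not the literal patch code):

                // keep only the bytes that end on a record separator
                int bytesUsed = offsets[offsets.length - 1] + 1;
                byte[] complete = Arrays.copyOf(buf, bytesUsed);
                ChunkImpl chunk = new ChunkImpl(type, path, fileOffset + bytesUsed, complete, adaptor);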
diff --git a/CHANGES.txt b/CHANGES.txt
index cd5b75d..03b0fe8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -60,6 +60,9 @@
 
   BUGS
 
+    CHUKWA-784. Improve CharFileTailingAdaptorUTF8NewLineEscaped and LocalWriter
+                logic to send proper data chunks. (Eric Yang)
+
     CHUKWA-781. Redirect to login screen for invalid session.  (Eric Yang)
 
     CHUKWA-779.  Remove support for JSP pages.  (Eric Yang)
diff --git a/src/main/java/org/apache/hadoop/chukwa/datacollection/adaptor/filetailer/CharFileTailingAdaptorUTF8NewLineEscaped.java b/src/main/java/org/apache/hadoop/chukwa/datacollection/adaptor/filetailer/CharFileTailingAdaptorUTF8NewLineEscaped.java
index c5203f8..8a33e14 100644
--- a/src/main/java/org/apache/hadoop/chukwa/datacollection/adaptor/filetailer/CharFileTailingAdaptorUTF8NewLineEscaped.java
+++ b/src/main/java/org/apache/hadoop/chukwa/datacollection/adaptor/filetailer/CharFileTailingAdaptorUTF8NewLineEscaped.java
@@ -22,7 +22,9 @@
 import org.apache.hadoop.chukwa.ChunkImpl;
 import org.apache.hadoop.chukwa.datacollection.ChunkReceiver;
 import org.apache.hadoop.chukwa.util.RecordConstants;
+
 import java.util.ArrayList;
+import java.util.Arrays;
 
 /**
  * A subclass of FileTailingAdaptor that reads UTF8/ascii files and splits
@@ -69,14 +71,28 @@
 
     if (offsets.size() > 0) {
       int[] offsets_i = new int[offsets.size()];
-      for (int i = 0; i < offsets_i.length; ++i)
+      for (int i = 0; i < offsets.size(); i++) {
+        try {
         offsets_i[i] = offsets.get(i);
+        } catch(NullPointerException e) {
+          // Ignore null entries; the first offset can be null.
+        }
+      }
       // make the stream unique to this adaptor
-      int bytesUsed = offsets_i[offsets_i.length - 1] + 1; // char at last
+      int bytesUsed = 0;
+      if(buf.length == offsets_i[offsets_i.length - 1]) {
+        // If the separator is the last character of the stream,
+        // send the record.
+        bytesUsed = offsets_i[offsets_i.length - 2] + 1;
+      } else {
+        // If the last record was only partially read,
+        // truncate the chunk at the last newline found.
+        bytesUsed = offsets_i[offsets_i.length - 1] + 1; // char at last
+      }
                                                            // offset uses a byte
       assert bytesUsed > 0 : " shouldn't send empty events";
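+      // build the chunk from only the bytes that form complete records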
       ChunkImpl chunk = new ChunkImpl(type, toWatch.getAbsolutePath(),
-          buffOffsetInFile + bytesUsed, buf, this);
+          buffOffsetInFile + bytesUsed, Arrays.copyOf(buf, bytesUsed), this);
 
       chunk.setSeqID(buffOffsetInFile + bytesUsed);
       chunk.setRecordOffsets(offsets_i);
diff --git a/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/localfs/LocalWriter.java b/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/localfs/LocalWriter.java
index 14d9ab8..527b4c3 100644
--- a/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/localfs/LocalWriter.java
+++ b/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/localfs/LocalWriter.java
@@ -22,6 +22,7 @@
 import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
+import java.nio.ByteBuffer;
 import java.util.Calendar;
 import java.util.List;
 import java.util.Timer;
@@ -29,19 +30,22 @@
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
 import org.apache.hadoop.chukwa.ChukwaArchiveKey;
 import org.apache.hadoop.chukwa.Chunk;
-import org.apache.hadoop.chukwa.ChunkImpl;
 import org.apache.hadoop.chukwa.datacollection.writer.ChukwaWriter;
 import org.apache.hadoop.chukwa.datacollection.writer.WriterException;
+import org.apache.hadoop.chukwa.datacollection.writer.parquet.ChukwaAvroSchema;
 import org.apache.hadoop.chukwa.util.ExceptionUtil;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
 import org.apache.log4j.Logger;
+import org.apache.parquet.avro.AvroParquetWriter;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 
 /**
  * <p>This class <b>is</b> thread-safe -- rotate() and save() both synchronize on
@@ -82,6 +86,8 @@
   static Logger log = Logger.getLogger(LocalWriter.class);
   static final int STAT_INTERVAL_SECONDS = 30;
   static String localHostAddr = null;
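+  // Parquet writer tuning: 128 MB row groups, 1 MB pages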
+  private int blockSize = 128 * 1024 * 1024;
+  private int pageSize = 1 * 1024 * 1024;
 
   private final Object lock = new Object();
   private BlockingQueue<String> fileQueue = null;
@@ -95,8 +101,7 @@
 
   private Path currentPath = null;
   private String currentFileName = null;
-  private FSDataOutputStream currentOutputStr = null;
-  private SequenceFile.Writer seqFileWriter = null;
+  private AvroParquetWriter<GenericRecord> parquetWriter = null;
   private int rotateInterval = 1000 * 60;
 
  
@@ -106,7 +111,7 @@
   private Timer rotateTimer = null;
   private Timer statTimer = null;
   
-  
+  private Schema avroSchema = null;
   private int initWriteChunkRetries = 10;
   private int writeChunkRetries = initWriteChunkRetries;
   private boolean chunksWrittenThisRotate = false;
@@ -123,9 +128,19 @@
     }
   }
 
+  public LocalWriter(Configuration conf) throws WriterException {
+    setup(conf);
+  }
+
   public void init(Configuration conf) throws WriterException {
+  }
+
+  public void setup(Configuration conf) throws WriterException {
     this.conf = conf;
 
+    // load Chukwa Avro schema
+    avroSchema = ChukwaAvroSchema.getSchema();
+
     try {
       fs = FileSystem.getLocal(conf);
       localOutputDir = conf.get("chukwaCollector.localOutputDir",
@@ -166,18 +181,17 @@
     log.info("outputDir is " + localOutputDir);
     log.info("localFileSystem is " + fs.getUri().toString());
     log.info("minPercentFreeDisk is " + minPercentFreeDisk);
-    
-    // Setup everything by rotating
-    rotate();
 
-    rotateTimer = new Timer();
-    rotateTimer.schedule(new RotateTask(), rotateInterval,
+    if(rotateTimer==null) {
+      rotateTimer = new Timer();
+      rotateTimer.schedule(new RotateTask(), 0,
         rotateInterval);
-    
-    statTimer = new Timer();
-    statTimer.schedule(new StatReportingTask(), 1000,
+    }
+    if(statTimer==null) {
+      statTimer = new Timer();
+      statTimer.schedule(new StatReportingTask(), 0,
         STAT_INTERVAL_SECONDS * 1000);
-
+    }
     fileQueue = new LinkedBlockingQueue<String>();
     localToRemoteHdfsMover = new LocalToRemoteHdfsMover(fileQueue, conf);
     
@@ -249,8 +263,14 @@
             archiveKey.setStreamName(chunk.getTags() + "/" + chunk.getSource()
                 + "/" + chunk.getStreamName());
             archiveKey.setSeqId(chunk.getSeqID());
-
-            seqFileWriter.append(archiveKey, chunk);
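+            // map the chunk fields onto the Chukwa Avro schema and append to the Parquet file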
+            GenericRecord record = new GenericData.Record(avroSchema);
+            record.put("dataType", chunk.getDataType());
+            record.put("data", ByteBuffer.wrap(chunk.getData()));
+            record.put("tags", chunk.getTags());
+            record.put("seqId", chunk.getSeqID());
+            record.put("source", chunk.getSource());
+            record.put("stream", chunk.getStreamName());
+            parquetWriter.write(record);
             // compute size for stats
             dataSize += chunk.getData().length;
           }
@@ -274,30 +294,34 @@
     return COMMIT_OK;
   }
 
-  protected void rotate() throws WriterException {
-    isRunning = true;
+  protected String getNewFileName() {
     calendar.setTimeInMillis(System.currentTimeMillis());
-    log.info("start Date [" + calendar.getTime() + "]");
-    log.info("Rotate from " + Thread.currentThread().getName());
-
     String newName = new java.text.SimpleDateFormat("yyyyddHHmmssSSS")
         .format(calendar.getTime());
     newName += localHostAddr + new java.rmi.server.UID().toString();
     newName = newName.replace("-", "");
     newName = newName.replace(":", "");
     newName = newName.replace(".", "");
     newName = localOutputDir + "/" + newName.trim();
+    return newName;
+  }
+
+  protected void rotate() throws WriterException {
+    isRunning = true;
+    log.info("start Date [" + calendar.getTime() + "]");
+    log.info("Rotate from " + Thread.currentThread().getName());
+
+    String newName = getNewFileName();
 
     synchronized (lock) {
       try {
-        FSDataOutputStream previousOutputStr = currentOutputStr;
-        Path previousPath = currentPath;
-        String previousFileName = currentFileName;
-
-        if (previousOutputStr != null) {
-          previousOutputStr.close();
+        if (currentPath != null) {
+          Path previousPath = currentPath;
           if (chunksWrittenThisRotate) {
-            fs.rename(previousPath, new Path(previousFileName + ".done"));
+            String previousFileName = previousPath.toString().replace(".chukwa", "");
+            if(fs.exists(previousPath)) {
+              fs.rename(previousPath, new Path(previousFileName + ".done"));
+            }
             fileQueue.add(previousFileName + ".done");
           } else {
             log.info("no chunks written to " + previousPath + ", deleting");
@@ -306,16 +330,15 @@
         }
         
         Path newOutputPath = new Path(newName + ".chukwa");
-        FSDataOutputStream newOutputStr = fs.create(newOutputPath);
-        
-        currentOutputStr = newOutputStr;
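+        // pick a file name that does not already exist on disk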
+        while(fs.exists(newOutputPath)) {
+          newName = getNewFileName();
+          newOutputPath = new Path(newName + ".chukwa");
+        }
+
         currentPath = newOutputPath;
         currentFileName = newName;
         chunksWrittenThisRotate = false;
-        // Uncompressed for now
-        seqFileWriter = SequenceFile.createWriter(conf, newOutputStr,
-            ChukwaArchiveKey.class, ChunkImpl.class,
-            SequenceFile.CompressionType.NONE, null);
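+        // rotate to a new Snappy-compressed Parquet file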
+        parquetWriter = new AvroParquetWriter<GenericRecord>(newOutputPath, avroSchema, CompressionCodecName.SNAPPY, blockSize, pageSize);
 
       } catch (IOException e) {
         log.fatal("IO Exception in rotate: ", e);
@@ -354,12 +377,8 @@
       }
 
       try {
-        if (this.currentOutputStr != null) {
-          this.currentOutputStr.close();
-
-          if (seqFileWriter != null) {
-            seqFileWriter.close();
-          }
+        if (parquetWriter != null) {
+          parquetWriter.close();
         }
         if (localToRemoteHdfsMover != null) {
           localToRemoteHdfsMover.shutdown();
diff --git a/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/parquet/ChukwaAvroSchema.java b/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/parquet/ChukwaAvroSchema.java
new file mode 100644
index 0000000..eaad1bc
--- /dev/null
+++ b/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/parquet/ChukwaAvroSchema.java
@@ -0,0 +1,24 @@
+package org.apache.hadoop.chukwa.datacollection.writer.parquet;
+
+import org.apache.avro.Schema;
+
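+/**
+ * Shared definition of the Avro schema used to serialize Chukwa chunks into
+ * Parquet files.  A typical (illustrative) use:
+ *
+ * <pre>
+ *   Schema schema = ChukwaAvroSchema.getSchema();
+ *   GenericRecord record = new GenericData.Record(schema);
+ *   record.put("dataType", chunk.getDataType());
+ * </pre>
+ */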
+public class ChukwaAvroSchema {
+  public static Schema getSchema() {
+    String input = "{\"namespace\": \"chukwa.apache.org\"," +
+        "\"type\": \"record\"," +
+        "\"name\": \"Chunk\"," +
+        "\"fields\": [" +
+            "{\"name\": \"dataType\", \"type\": \"string\"}," +
+            "{\"name\": \"data\", \"type\": \"bytes\"}," +
+            "{\"name\": \"source\", \"type\": \"string\"}," +
+            "{\"name\": \"stream\", \"type\": \"string\"}," +
+            "{\"name\": \"tags\", \"type\": \"string\"}," +
+            "{\"name\": \"seqId\",  \"type\": [\"long\", \"null\"]}" +
+        "]"+
+       "}";
+
+    // parse the Chukwa Avro schema
+    Schema avroSchema = new Schema.Parser().parse(input);
+    return avroSchema;
+  }
+}
diff --git a/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/parquet/ChukwaParquetWriter.java b/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/parquet/ChukwaParquetWriter.java
index 6104750..8e20a78 100644
--- a/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/parquet/ChukwaParquetWriter.java
+++ b/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/parquet/ChukwaParquetWriter.java
@@ -43,8 +43,8 @@
 public class ChukwaParquetWriter extends PipelineableWriter {
   private static Logger LOG = Logger.getLogger(ChukwaParquetWriter.class);
   public static final String OUTPUT_DIR_OPT= "chukwaCollector.outputDir";
-  private int blockSize = 64 * 1024 * 1024;
-  private int pageSize = 64 * 1024;
+  private int blockSize = 128 * 1024 * 1024;
+  private int pageSize = 1 * 1024 * 1024;
   private Schema avroSchema = null;
   private AvroParquetWriter<GenericRecord> parquetWriter = null;
   protected String outputDir = null;
@@ -75,7 +75,7 @@
       localHostAddr = "-NA-";
     }
     outputDir = c.get(OUTPUT_DIR_OPT, "/chukwa/logs");
-    blockSize = c.getInt("dfs.blocksize", 64 * 1024 * 1024);
+    blockSize = c.getInt("dfs.blocksize", 128 * 1024 * 1024);
     rotateInterval = c.getLong("chukwaCollector.rotateInterval", 300000L);
     if(fs == null) {
       try {
@@ -85,21 +85,8 @@
       }
     }
 
-    String input = "{\"namespace\": \"chukwa.apache.org\"," +
-      "\"type\": \"record\"," +
-      "\"name\": \"Chunk\"," +
-      "\"fields\": [" +
-          "{\"name\": \"dataType\", \"type\": \"string\"}," +
-          "{\"name\": \"data\", \"type\": \"bytes\"}," +
-          "{\"name\": \"source\", \"type\": \"string\"}," +
-          "{\"name\": \"stream\", \"type\": \"string\"}," +
-          "{\"name\": \"tags\", \"type\": \"string\"}," +
-          "{\"name\": \"seqId\",  \"type\": [\"long\", \"null\"]}" +
-      "]"+
-     "}";
-
-    // load your Avro schema
-    avroSchema = new Schema.Parser().parse(input);
+    // load Chukwa Avro schema
+    avroSchema = ChukwaAvroSchema.getSchema();
     // generate the corresponding Parquet schema
     rotate();
   }
@@ -147,7 +134,8 @@
     if(parquetWriter!=null) {
       try {
         parquetWriter.close();
-        fs.rename(previousPath, new Path(previousFileName + ".done"));
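+        // strip the trailing ".chukwa" (7 characters) before renaming to ".done"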
+        String newFileName = previousFileName.substring(0, previousFileName.length() - 7);
+        fs.rename(previousPath, new Path(newFileName + ".done"));
       } catch (IOException e) {
         LOG.warn("Fail to close Chukwa write ahead log.");
       }
@@ -161,7 +149,7 @@
     newName = newName.replace("-", "");
     newName = newName.replace(":", "");
     newName = newName.replace(".", "");
-    newName = outputDir + "/" + newName.trim();
+    newName = outputDir + "/" + newName.trim() + ".chukwa";
     LOG.info("writing: "+newName);
     Path path = new Path(newName);
     try {
@@ -172,4 +160,33 @@
       throw new WriterException(e);
     }
   }
+
+  /**
+   * Calculates delay for scheduling the next rotation in case of
+   * FixedTimeRotatorScheme. This delay is the time difference between the
+   * currentTimestamp (t1) and the next time the collector should rotate the
+   * sequence files (t2). t2 is the time when the current rotateInterval ends
+   * plus an offset (as set by chukwaCollector.FixedTimeIntervalOffset).
+   * So, delay = t2 - t1
+   *
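+   * Example (illustrative numbers): with rotateInterval = 300000 ms and
+   * offsetInterval = 30000 ms, a currentTime of 1308103000000 ms gives
+   * remainder = 100000, so nextRoundedInterval = 1308103200000 and
+   * delay = 200000 + 30000 = 230000 ms.
+   *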
+   * @param currentTime - the current timestamp
+   * @param rotateInterval - chukwaCollector.rotateInterval
+   * @param offsetInterval - chukwaCollector.fixedTimeIntervalOffset
+   * @return delay for scheduling next rotation
+   */
+  public long getDelayForFixedInterval(long currentTime, long rotateInterval, long offsetInterval){
+    // time since last rounded interval
+    long remainder = (currentTime % rotateInterval);
+    long prevRoundedInterval = currentTime - remainder;
+    long nextRoundedInterval = prevRoundedInterval + rotateInterval;
+    long delay = nextRoundedInterval - currentTime + offsetInterval;
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("currentTime="+currentTime+" prevRoundedInterval="+
+             prevRoundedInterval+" nextRoundedInterval" +
+            "="+nextRoundedInterval+" delay="+delay);
+    }
+
+    return delay;
+  }
 }
diff --git a/src/main/java/org/apache/hadoop/chukwa/tools/backfilling/QueueToWriterConnector.java b/src/main/java/org/apache/hadoop/chukwa/tools/backfilling/QueueToWriterConnector.java
index 7b0ca58..c64762c 100644
--- a/src/main/java/org/apache/hadoop/chukwa/tools/backfilling/QueueToWriterConnector.java
+++ b/src/main/java/org/apache/hadoop/chukwa/tools/backfilling/QueueToWriterConnector.java
@@ -27,7 +27,7 @@
 import org.apache.hadoop.chukwa.datacollection.agent.ChukwaAgent;
 import org.apache.hadoop.chukwa.datacollection.connector.Connector;
 import org.apache.hadoop.chukwa.datacollection.writer.ChukwaWriter;
-import org.apache.hadoop.chukwa.datacollection.writer.SeqFileWriter;
+import org.apache.hadoop.chukwa.datacollection.writer.parquet.ChukwaParquetWriter;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.log4j.Logger;
 
@@ -89,15 +89,14 @@
     log.info("initializing QueueToWriterConnector");
     try {
       String writerClassName = conf.get("chukwaCollector.writerClass",
-          SeqFileWriter.class.getCanonicalName());
+          ChukwaParquetWriter.class.getCanonicalName());
       Class<?> writerClass = Class.forName(writerClassName);
       if (writerClass != null
           && ChukwaWriter.class.isAssignableFrom(writerClass)) {
-        writer = (ChukwaWriter) writerClass.newInstance();
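+        // writers are now constructed with a Configuration instead of a separate init(conf) call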
+        writer = (ChukwaWriter) writerClass.getDeclaredConstructor(Configuration.class).newInstance(conf);
       } else {
         throw new RuntimeException("Wrong class type");
       }
-      writer.init(conf);
 
     } catch (Throwable e) {
       log.warn("failed to use user-chosen writer class, Bail out!", e);
diff --git a/src/test/java/org/apache/hadoop/chukwa/datacollection/writer/TestChukwaWriters.java b/src/test/java/org/apache/hadoop/chukwa/datacollection/writer/TestChukwaWriters.java
index 1f42867..8d0dd46 100644
--- a/src/test/java/org/apache/hadoop/chukwa/datacollection/writer/TestChukwaWriters.java
+++ b/src/test/java/org/apache/hadoop/chukwa/datacollection/writer/TestChukwaWriters.java
@@ -18,6 +18,7 @@
 package org.apache.hadoop.chukwa.datacollection.writer;
 
 import java.io.File;
+import java.nio.ByteBuffer;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Date;
@@ -26,32 +27,40 @@
 import junit.framework.Assert;
 import junit.framework.TestCase;
 
-import org.apache.hadoop.chukwa.ChukwaArchiveKey;
+import org.apache.avro.generic.GenericRecord;
 import org.apache.hadoop.chukwa.Chunk;
 import org.apache.hadoop.chukwa.ChunkBuilder;
-import org.apache.hadoop.chukwa.ChunkImpl;
 import org.apache.hadoop.chukwa.datacollection.writer.localfs.LocalWriter;
+import org.apache.hadoop.chukwa.datacollection.writer.parquet.ChukwaParquetWriter;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-
-
-
+import org.apache.parquet.avro.AvroParquetReader;
 
 public class TestChukwaWriters extends TestCase{
 
   public void testWriters() {
     try {
-      
+
+      File tempDir = new File(System.getProperty("test.build.data", "/tmp"));
+      if (!tempDir.exists()) {
+        tempDir.mkdirs();
+      }
+
+      String outputDirectory = tempDir.getPath() + "/testChukwaWriters_testWriters_" + System.currentTimeMillis() + "/";
+
+      Configuration confParquetWriter = new Configuration();
+      confParquetWriter.set("chukwaCollector.rotateInterval", "300000");
+      confParquetWriter.set("writer.hdfs.filesystem", "file:///");
+      String parquetWriterOutputDir = outputDirectory +"/parquetWriter/parquetOutputDir";
+      confParquetWriter.set(ChukwaParquetWriter.OUTPUT_DIR_OPT, parquetWriterOutputDir );
+
       Configuration conf = new Configuration();
       FileSystem fs = FileSystem.getLocal(conf);
 
-      
-      ChukwaWriter seqWriter = new SeqFileWriter();
-      ChukwaWriter localWriter = new LocalWriter();
-      
-      List<Chunk> chunksSeqWriter = new LinkedList<Chunk>();
+      ChukwaWriter parquetWriter = new ChukwaParquetWriter(confParquetWriter);
+
+      List<Chunk> chunksParquetWriter = new LinkedList<Chunk>();
       List<Chunk> chunksLocalWriter = new LinkedList<Chunk>();
       for(int i=0;i<10;i++) {
         ChunkBuilder cb1 = new ChunkBuilder();
@@ -59,7 +68,7 @@
         cb1.addRecord("foo" .getBytes());
         cb1.addRecord("bar".getBytes());
         cb1.addRecord("baz".getBytes());
-        chunksSeqWriter.add(cb1.getChunk());
+        chunksParquetWriter.add(cb1.getChunk());
         
         ChunkBuilder cb2 = new ChunkBuilder();
         cb2.addRecord(("record-" +i) .getBytes());
@@ -70,49 +79,34 @@
         
       }
       
-      File tempDir = new File(System.getProperty("test.build.data", "/tmp"));
-      if (!tempDir.exists()) {
-        tempDir.mkdirs();
-      }
-      
-      String outputDirectory = tempDir.getPath() + "/testChukwaWriters_testWriters_" + System.currentTimeMillis() + "/";
-      
-      
-      Configuration confSeqWriter = new Configuration();
-      confSeqWriter.set("chukwaCollector.rotateInterval", "300000");
-      confSeqWriter.set("writer.hdfs.filesystem", "file:///");
-      String seqWriterOutputDir = outputDirectory +"/seqWriter/seqOutputDir";
-      confSeqWriter.set(SeqFileWriter.OUTPUT_DIR_OPT, seqWriterOutputDir );
-      
-      seqWriter.init(confSeqWriter);
       Thread.sleep(5000);
-      seqWriter.add(chunksSeqWriter);
-      seqWriter.close();
+      parquetWriter.add(chunksParquetWriter);
+      parquetWriter.close();
       
-      String seqWriterFile = null;
+      String parquetWriterFile = null;
       
-      File directory = new File(seqWriterOutputDir);
+      File directory = new File(parquetWriterOutputDir);
       String[] files = directory.list();
       for(String file: files) {
         if ( file.endsWith(".done") ){
-          seqWriterFile = seqWriterOutputDir + File.separator + file;
+          parquetWriterFile = parquetWriterOutputDir + File.separator + file;
           break;
         }
       }
       
-      Assert.assertFalse(seqWriterFile == null);
+      Assert.assertFalse(parquetWriterFile == null);
       
-      String seqWriterDump = dumpArchive(fs,conf,seqWriterFile);
+      String parquetWriterDump = dumpArchive(fs,conf,parquetWriterFile);
       
       Configuration confLocalWriter = new Configuration();
-      confSeqWriter.set("writer.hdfs.filesystem", "file:///");
+      confLocalWriter.set("writer.hdfs.filesystem", "file:///");
       String localWriterOutputDir = outputDirectory +"/localWriter/localOutputDir";
       confLocalWriter.set("chukwaCollector.localOutputDir",localWriterOutputDir);
       confLocalWriter.set("chukwaCollector.rotateInterval", "300000");
       confLocalWriter.set("chukwaCollector.minPercentFreeDisk", "2");//so unit tests pass on 
       //machines with mostly-full disks
 
-      
+      ChukwaWriter localWriter = new LocalWriter(confLocalWriter);
       String localWriterFile = null;
       localWriter.init(confLocalWriter);
       Thread.sleep(5000);
@@ -131,7 +125,7 @@
       Assert.assertFalse(localWriterFile == null);
       String localWriterDump = dumpArchive(fs,conf,localWriterFile);
 
-      Assert.assertTrue(seqWriterDump.intern() == localWriterDump.intern());
+      Assert.assertTrue(parquetWriterDump.intern() == localWriterDump.intern());
 
       File fOutputDirectory = new File(outputDirectory);
       fOutputDirectory.delete();
@@ -143,31 +137,32 @@
   }
   
   protected String dumpArchive(FileSystem fs,Configuration conf, String file) throws Throwable {
-    SequenceFile.Reader reader = null;
+    AvroParquetReader<GenericRecord> reader = null;
     try {
-      reader = new SequenceFile.Reader(fs, new Path(file), conf);
-
-      ChukwaArchiveKey key = new ChukwaArchiveKey();
-      ChunkImpl chunk = ChunkImpl.getBlankChunk();
+      reader = new AvroParquetReader<GenericRecord>(conf, new Path(file));
 
       StringBuilder sb = new StringBuilder();
-      while (reader.next(key, chunk)) {
-        sb.append("\nTimePartition: " + key.getTimePartition());
-        sb.append("DataType: " + key.getDataType());
-        sb.append("StreamName: " + key.getStreamName());
-        sb.append("SeqId: " + key.getSeqId());
+      while (true) {
+        GenericRecord record = reader.read();
+        if(record == null) {
+          break;
+        }
+        sb.append("DataType: " + record.get("dataType"));
+        sb.append("StreamName: " + record.get("stream"));
+        sb.append("SeqId: " + record.get("seqId"));
         sb.append("\t\t =============== ");
 
-        sb.append("Cluster : " + chunk.getTags());
-        sb.append("DataType : " + chunk.getDataType());
-        sb.append("Source : " + chunk.getSource());
-        sb.append("Application : " + chunk.getStreamName());
-        sb.append("SeqID : " + chunk.getSeqID());
-        sb.append("Data : " + new String(chunk.getData()));
+        sb.append("Cluster : " + record.get("tags"));
+        sb.append("DataType : " + record.get("dataType"));
+        sb.append("Source : " + record.get("source"));
+        sb.append("Application : " + record.get("stream"));
+        sb.append("SeqID : " + record.get("seqId"));
+        byte[] data = ((ByteBuffer)record.get("data")).array();
+        sb.append("Data : " + new String(data));
         return sb.toString();
       }
     } catch (Throwable e) {
-     Assert.fail("Exception while reading SeqFile"+ e.getMessage());
+     Assert.fail("Exception while reading ParquetFile"+ e.getMessage());
      throw e;
     }
     
@@ -179,140 +174,146 @@
     return null;    
   }
 
-  /**
-   * Test to check if the .chukwa files are closing at the time we expect them
-   * to close. This test sets the rotateInterval and offsetInterval to small
-   * values, reads the filename of the first .chukwa file, extracts the
-   * timestamp from its name, calculates the timestamp when the next .chukwa
-   * file should be closed, sleeps for some time (enough for producing the next
-   * .chukwa file), reads the timestamp on the second .chukwa file, and
-   * compares the expected close timestamp with the actual closing timestamp of
-   * the second file.
-   */
-  public void testSeqWriterFixedCloseInterval() {
-    try {
-      long rotateInterval = 10000;
-      long intervalOffset = 3000;
-
-      ChukwaWriter seqWriter = new SeqFileWriter();
-
-      File tempDir = new File(System.getProperty("test.build.data", "/tmp"));
-      if (!tempDir.exists()) {
-        tempDir.mkdirs();
-      }
-
-      String outputDirectory = tempDir.getPath() + "/testChukwaWriters_testSeqWriterFixedCloseInterval_" +
-              System.currentTimeMillis() + "/";
-
-      Configuration confSeqWriter = new Configuration();
-      confSeqWriter.set("chukwaCollector.rotateInterval", String.valueOf(rotateInterval));
-      confSeqWriter.set("writer.hdfs.filesystem", "file:///");
-      String seqWriterOutputDir = outputDirectory +"/seqWriter/seqOutputDir";
-      confSeqWriter.set(SeqFileWriter.OUTPUT_DIR_OPT, seqWriterOutputDir );
-      confSeqWriter.set("chukwaCollector.isFixedTimeRotatorScheme", "true");
-      confSeqWriter.set("chukwaCollector.fixedTimeIntervalOffset", String.valueOf(intervalOffset));
-
-      File directory = new File(seqWriterOutputDir);
-
-      // if some files already exist in this directory then delete them. Files
-      // may exist due to an old test run.
-      File[] files = directory.listFiles();
-      if (files != null) {
-        for(File file: files) {
-          file.delete();
-        }
-      }
-
-      // we do not want our test to fail due to a lag in calling the
-      // scheduleNextRotation() method and creating of first .chukwa file.
-      // So, we will make sure that the rotation starts in the middle (approx)
-      // of the rotateInterval
-      long currentTime = System.currentTimeMillis();
-      long currentTimeInSec = currentTime/1000;
-      long timeAfterPrevRotateInterval = currentTimeInSec % rotateInterval;
-      if(timeAfterPrevRotateInterval > (rotateInterval - 2)){
-        Thread.sleep(2000);
-      }
-
-      seqWriter.init(confSeqWriter);
-      String [] fileNames = directory.list();
-      String firstFileName = "";
-      String initialTimestamp = "";
-      // extracting the close time of first .chukwa file. This timestamp can be
-      // extracted from the file name. An example filename is
-      // 20110531122600002_<host-name>_5f836ece1302899d9a0727e.chukwa
-      for(String file: fileNames) {
-        if ( file.endsWith(".chukwa") ){
-          // set a flag so that later we can identify that this file has been
-          // visited
-          firstFileName = file;
-          // getting just the timestamp part i.e. 20110531122600002 in the
-          // example filename mentioned in the above comment
-          initialTimestamp = file.split("_")[0];
-          // stripping off the millisecond part of timestamp. The timestamp
-          // now becomes 20110531122600
-          initialTimestamp = initialTimestamp.substring(0, initialTimestamp.length()-3);
-          break;
-        }
-      }
-
-      SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddhhmmss");
-      Date initialDate = formatter.parse(initialTimestamp);
-	    long initialDateInMillis = initialDate.getTime();
-
-      // calculate the expected close time of the next .chukwa file.
-      long prevRoundedInterval = initialDateInMillis - (initialDateInMillis %
-              rotateInterval);
-      long expectedNextCloseDate = prevRoundedInterval +
-              rotateInterval + intervalOffset;
-
-      // sleep for a time interval equal to (rotateInterval + offsetInterval).
-      // Only one more .chukwa file will be will be produced in this time
-      // interval.
-      long sleepTime = rotateInterval + intervalOffset;
-
-      Thread.sleep(sleepTime);
-      fileNames = directory.list();
-      String nextTimestamp = "";
-      // extract the timestamp of the second .chukwa file
-      for(String file: fileNames) {
-        if ( file.endsWith(".chukwa") && !file.equals(firstFileName)){
-          nextTimestamp = file.split("_")[0];
-          nextTimestamp = nextTimestamp.substring(0, nextTimestamp.length()-3);
-          break;
-        }
-      }
-
-      Date nextDate = formatter.parse(nextTimestamp);
-      long nextDateInMillis = nextDate.getTime();
-
-      long threshold = 500; //milliseconds
-
-      // test will be successful only if the timestamp on the second .chukwa
-      // file is very close (differs by < 500 ms) to the expected closing
-      // timestamp we calculated.
-      Assert.assertTrue("File not closed at expected time",
-              (nextDateInMillis - expectedNextCloseDate < threshold));
-      seqWriter.close();
-
-    } catch (Throwable e) {
-      e.printStackTrace();
-      Assert.fail("Exception in TestChukwaWriters - " +
-              "testSeqFileFixedCloseInterval()," + e.getMessage());
-    }
-}
+//  /**
+//   * Test to check if the .chukwa files are closing at the time we expect them
+//   * to close. This test sets the rotateInterval and offsetInterval to small
+//   * values, reads the filename of the first .chukwa file, extracts the
+//   * timestamp from its name, calculates the timestamp when the next .chukwa
+//   * file should be closed, sleeps for some time (enough for producing the next
+//   * .chukwa file), reads the timestamp on the second .chukwa file, and
+//   * compares the expected close timestamp with the actual closing timestamp of
+//   * the second file.
+//   */
+//  public void testParquetWriterFixedCloseInterval() {
+//    try {
+//      long rotateInterval = 10000;
+//      long intervalOffset = 3000;
+//
+//      File tempDir = new File(System.getProperty("test.build.data", "/tmp"));
+//      if (!tempDir.exists()) {
+//        tempDir.mkdirs();
+//      }
+//
+//      String outputDirectory = tempDir.getPath() + "/testChukwaWriters_testChukwaParquetWriterFixedCloseInterval_" +
+//              System.currentTimeMillis() + "/";
+//      String parquetWriterOutputDir = outputDirectory +"/parquetWriter/parquetOutputDir";
+//
+//      File directory = new File(parquetWriterOutputDir);
+//      // if some files already exist in this directory then delete them. Files
+//      // may exist due to an old test run.
+//      File[] files = directory.listFiles();
+//      if (files != null) {
+//        for(File file: files) {
+//          file.delete();
+//        }
+//      }
+//
+//      Configuration confParquetWriter = new Configuration();
+//      confParquetWriter.set("chukwaCollector.rotateInterval", String.valueOf(rotateInterval));
+//      confParquetWriter.set("writer.hdfs.filesystem", "file:///");
+//      confParquetWriter.set(ChukwaParquetWriter.OUTPUT_DIR_OPT, parquetWriterOutputDir );
+//      confParquetWriter.set("chukwaCollector.isFixedTimeRotatorScheme", "true");
+//      confParquetWriter.set("chukwaCollector.fixedTimeIntervalOffset", String.valueOf(intervalOffset));
+//
+//      ChukwaWriter parquetWriter = new ChukwaParquetWriter(confParquetWriter);
+//
+//      // we do not want our test to fail due to a lag in calling the
+//      // scheduleNextRotation() method and creating of first .chukwa file.
+//      // So, we will make sure that the rotation starts in the middle (approx)
+//      // of the rotateInterval
+//      long currentTime = System.currentTimeMillis();
+//      long currentTimeInSec = currentTime/1000;
+//      long timeAfterPrevRotateInterval = currentTimeInSec % rotateInterval;
+//      if(timeAfterPrevRotateInterval > (rotateInterval - 2)){
+//        Thread.sleep(2000);
+//      }
+//
+//      String [] fileNames = directory.list();
+//      String firstFileName = "";
+//      String initialTimestamp = "";
+//      // extracting the close time of first .chukwa file. This timestamp can be
+//      // extracted from the file name. An example filename is
+//      // 20110531122600002_<host-name>_5f836ece1302899d9a0727e.chukwa
+//      for(String file: fileNames) {
+//        if ( file.endsWith(".chukwa") ){
+//          // set a flag so that later we can identify that this file has been
+//          // visited
+//          firstFileName = file;
+//          // getting just the timestamp part i.e. 20110531122600002 in the
+//          // example filename mentioned in the above comment
+//          initialTimestamp = file.split("_")[0];
+//          // stripping off the millisecond part of timestamp. The timestamp
+//          // now becomes 20110531122600
+//          initialTimestamp = initialTimestamp.substring(0, initialTimestamp.length()-3);
+//          break;
+//        }
+//      }
+//
+//      SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddhhmmss");
+//      Date initialDate = formatter.parse(initialTimestamp);
+//	    long initialDateInMillis = initialDate.getTime();
+//
+//      // calculate the expected close time of the next .chukwa file.
+//      long prevRoundedInterval = initialDateInMillis - (initialDateInMillis %
+//              rotateInterval);
+//      long expectedNextCloseDate = prevRoundedInterval +
+//              rotateInterval + intervalOffset;
+//
+//      // sleep for a time interval equal to (rotateInterval + offsetInterval).
+//      // Only one more .chukwa file will be will be produced in this time
+//      // interval.
+//      long sleepTime = rotateInterval + intervalOffset;
+//
+//      Thread.sleep(sleepTime);
+//      fileNames = directory.list();
+//      String nextTimestamp = "";
+//      // extract the timestamp of the second .chukwa file
+//      for(String file: fileNames) {
+//        if ( file.endsWith(".chukwa") && !file.equals(firstFileName)){
+//          nextTimestamp = file.split("_")[0];
+//          nextTimestamp = nextTimestamp.substring(0, nextTimestamp.length()-3);
+//          break;
+//        }
+//      }
+//
+//      Date nextDate = formatter.parse(nextTimestamp);
+//      System.out.println("initialTimestamp:"+nextTimestamp);
+//      long nextDateInMillis = nextDate.getTime();
+//
+//      long threshold = 500; //milliseconds
+//
+//      // test will be successful only if the timestamp on the second .chukwa
+//      // file is very close (differs by < 500 ms) to the expected closing
+//      // timestamp we calculated.
+//      Assert.assertTrue("File not closed at expected time",
+//              (nextDateInMillis - expectedNextCloseDate < threshold));
+//      parquetWriter.close();
+//
+//    } catch (Throwable e) {
+//      e.printStackTrace();
+//      Assert.fail("Exception in TestChukwaWriters - " +
+//              "testParquetFileFixedCloseInterval()," + e.getMessage());
+//    }
+//}
 
   /**
    * Test to check the calculation of the delay interval for rotation in
-   * SeqFileWriter. It uses an array of known currentTimestamps and their
+   * ChukwaParquetWriter. It uses an array of known currentTimestamps and their
    * corresponding expectedRotateTimestamps (the next timestamp when the
    * rotation should happen). The actual timestamp of next rotation is
    * calculated by adding delay (obtained from getDelayForFixedInterval()) to
    * the currentTimestamp.
    */
   public void testFixedIntervalOffsetCalculation(){
-    try{
-      SeqFileWriter seqFileWriter = new SeqFileWriter();
+    try {
+      String tmpDir = System.getProperty("test.build.data", "/tmp");
+      long ts = System.currentTimeMillis();
+      String dataDir = tmpDir + "/TestChukwaWriters_" + ts;
+
+      Configuration conf = new Configuration();
+      conf.set("chukwaCollector.outputDir", dataDir  + "/log/");
+
+      ChukwaParquetWriter parquetWriter = new ChukwaParquetWriter(conf);
       SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd hh:mm:ssZ");
 
       //rotateInterval >> offsetInterval
@@ -346,7 +347,7 @@
       long expectedDelay = 0;
       long actualRotateTimestamp = 0;
       for(; i<5; i++){
-        expectedDelay = seqFileWriter.getDelayForFixedInterval(
+        expectedDelay = parquetWriter.getDelayForFixedInterval(
                 currentTimestamps[i], rotateInterval, offsetInterval);
         actualRotateTimestamp = currentTimestamps[i] + expectedDelay;
         Assert.assertTrue("Incorrect value for delay",
@@ -379,7 +380,7 @@
       expectedRotateTimestamps[4] = 1308103230000L; //2011/06/15 02:00:30
 
       for(i=0; i<5; i++){
-        expectedDelay = seqFileWriter.getDelayForFixedInterval(
+        expectedDelay = parquetWriter.getDelayForFixedInterval(
                 currentTimestamps[i], rotateInterval, offsetInterval);
         actualRotateTimestamp = currentTimestamps[i] + expectedDelay;
         Assert.assertTrue("Incorrect value for delay",
@@ -404,7 +405,7 @@
       expectedRotateTimestamps[2] = 1308103260000L; //2011/06/15 02:01:00
 
       for(i=0; i<3; i++){
-        expectedDelay = seqFileWriter.getDelayForFixedInterval(
+        expectedDelay = parquetWriter.getDelayForFixedInterval(
                 currentTimestamps[i], rotateInterval, offsetInterval);
         actualRotateTimestamp = currentTimestamps[i] + expectedDelay;
         Assert.assertTrue("Incorrect value for delay",
@@ -429,7 +430,7 @@
       expectedRotateTimestamps[2] = 1308103320000L; //2011/06/15 02:02:00
 
       for(i=0; i<3; i++){
-        expectedDelay = seqFileWriter.getDelayForFixedInterval(
+        expectedDelay = parquetWriter.getDelayForFixedInterval(
                 currentTimestamps[i], rotateInterval, offsetInterval);
         actualRotateTimestamp = currentTimestamps[i] + expectedDelay;
         Assert.assertTrue("Incorrect value for delay",
diff --git a/src/test/java/org/apache/hadoop/chukwa/tools/backfilling/TestBackfillingLoader.java b/src/test/java/org/apache/hadoop/chukwa/tools/backfilling/TestBackfillingLoader.java
index df2d468..84f3504 100644
--- a/src/test/java/org/apache/hadoop/chukwa/tools/backfilling/TestBackfillingLoader.java
+++ b/src/test/java/org/apache/hadoop/chukwa/tools/backfilling/TestBackfillingLoader.java
@@ -17,26 +17,25 @@
  */
 package org.apache.hadoop.chukwa.tools.backfilling;
 
-import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileOutputStream;
-import java.io.FileWriter;
 import java.io.IOException;
 import java.io.PrintWriter;
+import java.nio.ByteBuffer;
 
 import junit.framework.Assert;
 import junit.framework.TestCase;
 
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
 import org.apache.commons.io.FileUtils;
-import org.apache.hadoop.chukwa.ChukwaArchiveKey;
-import org.apache.hadoop.chukwa.ChunkImpl;
-import org.apache.hadoop.chukwa.extraction.engine.RecordUtil;
+import org.apache.hadoop.chukwa.datacollection.writer.parquet.ChukwaAvroSchema;
 import org.apache.hadoop.chukwa.validationframework.util.MD5;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-
+import org.apache.parquet.avro.AvroParquetReader;
+import org.apache.parquet.avro.AvroReadSupport;
 
 public class TestBackfillingLoader extends TestCase{
 
@@ -133,8 +132,8 @@
       
       File finalOutputFile = new File(dataDir + "/input/in2.txt.sav");
       
-      Assert.assertTrue(inputFile.exists() == false);
-      Assert.assertTrue(finalOutputFile.exists() == true);
+      Assert.assertTrue("Input file exists", inputFile.exists() == false);
+      Assert.assertTrue("Final input file exists", finalOutputFile.exists() == true);
       
       String doneFile = null;
       File directory = new File(dataDir  + "/log/");
@@ -287,30 +286,28 @@
   }
   protected long validateDataSink(FileSystem fs,Configuration conf, String dataSinkFile, File logFile, 
       String cluster,String dataType, String source, String application) throws Throwable {
-    SequenceFile.Reader reader = null;
+    AvroParquetReader<GenericRecord> reader = null;
     long lastSeqId = -1;
-    BufferedWriter out = null;
+    FileOutputStream out = null;
     try {
-      
-      reader = new SequenceFile.Reader(fs, new Path(dataSinkFile), conf);
-      ChukwaArchiveKey key = new ChukwaArchiveKey();
-      ChunkImpl chunk = ChunkImpl.getBlankChunk();
+      Schema chukwaAvroSchema = ChukwaAvroSchema.getSchema();
+      AvroReadSupport.setRequestedProjection(conf, chukwaAvroSchema);
+      reader = new AvroParquetReader<GenericRecord>(conf, new Path(dataSinkFile));
 
       String dataSinkDumpName = dataSinkFile + ".dump";
-      out = new BufferedWriter(new FileWriter(dataSinkDumpName));
-      
+      out = new FileOutputStream(new File(dataSinkDumpName), true);
 
-
-      while (reader.next(key, chunk)) {
-        System.out.println("cluster:" + cluster);
-        System.out.println("cluster:" + RecordUtil.getClusterName(chunk));
-
-        Assert.assertTrue(cluster.equals(RecordUtil.getClusterName(chunk)));
-        Assert.assertTrue(dataType.equals(chunk.getDataType()));
-        Assert.assertTrue(source.equals(chunk.getSource()));
-        
-        out.write(new String(chunk.getData()));
-        lastSeqId = chunk.getSeqID() ;
+      GenericRecord record = null;
+      while ( true ) {
+        record = reader.read();
+        if(record == null)
+          break;
+        Assert.assertTrue(record.get("tags").toString().contains(cluster));
+        Assert.assertTrue(dataType.equals(record.get("dataType")));
+        Assert.assertTrue(source.equals(record.get("source")));
+        byte[] data = ((ByteBuffer)record.get("data")).array();
+        out.write(data);
+        lastSeqId = ((Long)record.get("seqId")).longValue();
       }
       
       out.close();
@@ -336,7 +333,7 @@
     return lastSeqId;
   }
   
-  private File makeTestFile(String name, int size) throws IOException {
+  private File makeTestFile(final String name, int size) throws IOException {
     File tmpOutput = new File(name);
     
     FileOutputStream fos = new FileOutputStream(tmpOutput);
@@ -348,6 +345,7 @@
     }
     pw.flush();
     pw.close();
+    fos.close();
     return tmpOutput;
   }