Add debug timing logs for GPU buffer copy and kernel computation
diff --git a/processing/src/main/java/io/druid/query/aggregation/gpu/AbstractFloatKernelAggregator.java b/processing/src/main/java/io/druid/query/aggregation/gpu/AbstractFloatKernelAggregator.java
index 5723cdf..52f3ec0 100644
--- a/processing/src/main/java/io/druid/query/aggregation/gpu/AbstractFloatKernelAggregator.java
+++ b/processing/src/main/java/io/druid/query/aggregation/gpu/AbstractFloatKernelAggregator.java
@@ -41,6 +41,7 @@
   protected final CLBuffer<Float> totalBuffer;
   protected List<CLEvent> copyEvents;
   private int totalBufferOffset = 0;
+  protected long t = 0;
 
   public AbstractFloatKernelAggregator(
       CLQueue queue,
@@ -64,10 +65,13 @@
     CLBuffer<Float> buf = context.createFloatBuffer(CLMem.Usage.Input, Pointer.pointerToFloats(currentBuffer));
 
     int bufRemaining = currentBuffer.remaining();
+
+    long t0 = System.nanoTime();
     CLEvent copyEvent = buf.copyTo(queue, 0, bufRemaining, totalBuffer, totalBufferOffset);
-    totalBufferOffset += bufRemaining;
-    copyEvents.add(copyEvent);
     copyEvent.waitFor();
+    t += System.nanoTime() - t0;
+    copyEvents.add(copyEvent);
+    totalBufferOffset += bufRemaining;
   }
 
   @Override
diff --git a/processing/src/main/java/io/druid/query/aggregation/gpu/FloatKernelAggregator.java b/processing/src/main/java/io/druid/query/aggregation/gpu/FloatKernelAggregator.java
index a8b98df..bbaf21b 100644
--- a/processing/src/main/java/io/druid/query/aggregation/gpu/FloatKernelAggregator.java
+++ b/processing/src/main/java/io/druid/query/aggregation/gpu/FloatKernelAggregator.java
@@ -19,6 +19,7 @@
 
 package io.druid.query.aggregation.gpu;
 
+import com.metamx.common.logger.Logger;
 import com.nativelibs4java.opencl.CLBuffer;
 import com.nativelibs4java.opencl.CLContext;
 import com.nativelibs4java.opencl.CLEvent;
@@ -37,6 +38,8 @@
   private final CLProgram program;
   private final CLKernel kernel;
 
+  private static final Logger log = new Logger(FloatKernelAggregator.class);
+
   public FloatKernelAggregator(
       FloatBufferSelector selector,
       CLContext context,
@@ -53,6 +56,7 @@
   @Override
   public void run(IntBuffer buckets, ByteBuffer out, int position)
   {
+    long t0 = System.nanoTime();
     final int nBuckets = buckets.remaining() / 2;
     final int n = (int)totalBuffer.getElementCount();
 
@@ -66,6 +70,9 @@
     final Pointer<Float> outPtr = Pointer.pointerToFloats(out.asFloatBuffer());
     CLEvent readEvt = kernelOut.read(queue, outPtr, false, addEvt);
     readEvt.waitFor();
+
+    log.debug("Memory copy took: %d ns for %d bytes", this.t, totalBuffer.getByteCount());
+    log.debug("Computation took: %d ns", System.nanoTime() - t0);
   }
 
   @Override