Several bug fixes in HHJ, NLJ, and tokenizer

    - in HHJ handle the case when it spills and skipInMemoryHJ is set to false,
    - check for memsize in NLJ and correctly set memsize in HHJ,
    - make counthashed-ngram-token() to skip the bits for length & type

Change-Id: I908345f993019b0bfd0ac0bcb3e497a42295b623
Reviewed-on: http://fulliautomatix.ics.uci.edu:8443/96
Reviewed-by: Pouria Pirzadeh <pouria.pirzadeh@gmail.com>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/NestedLoopJoin.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/NestedLoopJoin.java
index 52f1198..eab60bc 100644
--- a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/NestedLoopJoin.java
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/NestedLoopJoin.java
@@ -64,6 +64,9 @@
         this.appender.reset(outBuffer, true);
         this.outBuffers = new ArrayList<ByteBuffer>();
         this.memSize = memSize;
+        if (memSize < 3) {
+            throw new HyracksDataException("Not enough memory is available for Nested Loop Join");
+        }
         this.predEvaluator = predEval;
         this.isReversed = false;
         this.ctx = ctx;
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
index 276f60f..744f939 100644
--- a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
@@ -437,30 +437,33 @@
                     LOGGER.fine("\n>>>Joining Partition Pairs (pid "+pid+") - (level "+level+") - wasReversed "+wasReversed+" - BuildSize:\t"+buildPartSize+"\tProbeSize:\t"+probePartSize+" - MemForJoin "+(state.memForJoin)+"  - LeftOuter is "+isLeftOuter);
                                       
                     //Apply in-Mem HJ if possible
-                    if(!skipInMemoryHJ){
-                    	if ((buildPartSize < state.memForJoin) || (probePartSize < state.memForJoin && !isLeftOuter)) {
-                            int tabSize = -1;
-                            if (!forceRR && (isLeftOuter || (buildPartSize < probePartSize)) ) {	//Case 1.1 - InMemHJ (wout Role-Reversal)
-                            	LOGGER.fine("\t>>>Case 1.1 (IsLeftOuter || buildSize<probe) AND ApplyInMemHJ - [Level "+level+"]");
-                                tabSize = wasReversed ? ohhj.getProbePartitionSizeInTup(pid) : ohhj.getBuildPartitionSizeInTup(pid);
-                                if (tabSize == 0) {
-                                    throw new HyracksDataException(
-                                            "Trying to join an empty partition. Invalid table size for inMemoryHashJoin.");
-                                }
-                                	//Build Side is smaller
-                                applyInMemHashJoin(buildKeys, probeKeys, tabSize, probeRd, buildRd, hpcRep0, hpcRep1,
-                                        buildSideReader, probeSideReader, false, pid);	//checked-confirmed
-                            } else { //Case 1.2 - InMemHJ with Role Reversal
-                            	LOGGER.fine("\t>>>Case 1.2. (NoIsLeftOuter || probe<build) AND ApplyInMemHJ WITH RoleReversal - [Level "+level+"]");
-                                tabSize = wasReversed ? ohhj.getBuildPartitionSizeInTup(pid) : ohhj.getProbePartitionSizeInTup(pid);
-                                if (tabSize == 0) {
-                                    throw new HyracksDataException(
-                                            "Trying to join an empty partition. Invalid table size for inMemoryHashJoin.");
-                                }
-                                	//Probe Side is smaller
-                                applyInMemHashJoin(probeKeys, buildKeys, tabSize, buildRd, probeRd, hpcRep1, hpcRep0,
-                                        probeSideReader, buildSideReader, true, pid);	//checked-confirmed
+                    if (!skipInMemoryHJ && (buildPartSize < state.memForJoin)
+                            || (probePartSize < state.memForJoin && !isLeftOuter)) {
+                        int tabSize = -1;
+                        if (!forceRR && (isLeftOuter || (buildPartSize < probePartSize))) { //Case 1.1 - InMemHJ (wout Role-Reversal)
+                            LOGGER.fine("\t>>>Case 1.1 (IsLeftOuter || buildSize<probe) AND ApplyInMemHJ - [Level "
+                                    + level + "]");
+                            tabSize = wasReversed ? ohhj.getProbePartitionSizeInTup(pid) : ohhj
+                                    .getBuildPartitionSizeInTup(pid);
+                            if (tabSize == 0) {
+                                throw new HyracksDataException(
+                                        "Trying to join an empty partition. Invalid table size for inMemoryHashJoin.");
                             }
+                            //Build Side is smaller
+                            applyInMemHashJoin(buildKeys, probeKeys, tabSize, probeRd, buildRd, hpcRep0, hpcRep1,
+                                    buildSideReader, probeSideReader, false, pid); //checked-confirmed
+                        } else { //Case 1.2 - InMemHJ with Role Reversal
+                            LOGGER.fine("\t>>>Case 1.2. (NoIsLeftOuter || probe<build) AND ApplyInMemHJ WITH RoleReversal - [Level "
+                                    + level + "]");
+                            tabSize = wasReversed ? ohhj.getBuildPartitionSizeInTup(pid) : ohhj
+                                    .getProbePartitionSizeInTup(pid);
+                            if (tabSize == 0) {
+                                throw new HyracksDataException(
+                                        "Trying to join an empty partition. Invalid table size for inMemoryHashJoin.");
+                            }
+                            //Probe Side is smaller
+                            applyInMemHashJoin(probeKeys, buildKeys, tabSize, buildRd, probeRd, hpcRep1, hpcRep0,
+                                    probeSideReader, buildSideReader, true, pid); //checked-confirmed
                         }
                     }
                     //Apply (Recursive) HHJ
@@ -523,10 +526,10 @@
                                     int buildSideInTups = rHHj.getBuildPartitionSizeInTup(rPid);
                                     int probeSideInTups = rHHj.getProbePartitionSizeInTup(rPid);
                                     if (isLeftOuter || buildSideInTups < probeSideInTups) {
-                                        applyNestedLoopJoin(buildRd, probeRd, state.memForJoin, rprfw, rbrfw,
+                                        applyNestedLoopJoin(buildRd, probeRd, memsize, rprfw, rbrfw,
                                                 nljComparator0, false); //checked-modified
                                     } else {
-                                        applyNestedLoopJoin(probeRd, buildRd, state.memForJoin, rbrfw, rprfw,
+                                        applyNestedLoopJoin(probeRd, buildRd, memsize, rbrfw, rprfw,
                                                 nljComparator1, true); //checked-modified
                                     }
                                 }
@@ -585,10 +588,10 @@
                                     long buildSideSize = rbrfw.getFileSize();
                                     long probeSideSize = rprfw.getFileSize();
                                     if (buildSideSize > probeSideSize) {
-                                        applyNestedLoopJoin(buildRd, probeRd, state.memForJoin, rbrfw, rprfw,
+                                        applyNestedLoopJoin(buildRd, probeRd, memsize, rbrfw, rprfw,
                                                 nljComparator0, true);	//checked-modified
                                     } else {
-                                        applyNestedLoopJoin(probeRd, buildRd, state.memForJoin, rprfw, rbrfw,
+                                        applyNestedLoopJoin(probeRd, buildRd, memsize, rprfw, rbrfw,
                                                 nljComparator1, true);	//checked-modified
                                     }
                                 }
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
index 847a923..b1d722e 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -64,7 +64,10 @@
         // compute token count
         // ignore pre and post grams for duplicate detection
         if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
-            int tmpIndex = start;
+            int tmpIndex = start + 2; // skip utf8 length indicator
+            if (sourceHasTypeTag) {
+                tmpIndex++; // skip type tag
+            }
             while (tmpIndex < currentTokenStart) {
                 tokenCount++; // assume found
                 int offset = 0;