fix bug where nextZeroRow was set 1 too large, always leaving an empty row, and finish SISVD for both libraries
diff --git a/src/main/java/com/yahoo/sketches/vector/decomposition/FrequentDirections.java b/src/main/java/com/yahoo/sketches/vector/decomposition/FrequentDirections.java
index 4191897..d6f3f97 100644
--- a/src/main/java/com/yahoo/sketches/vector/decomposition/FrequentDirections.java
+++ b/src/main/java/com/yahoo/sketches/vector/decomposition/FrequentDirections.java
@@ -521,7 +521,7 @@
       for (int i = k_ - 1; i < S_.countColumns(); ++i) {
         S_.set(i, i, 0.0);
       }
-      nextZeroRow_ = k_;
+      nextZeroRow_ = k_ - 1;
     } else {
       for (int i = 0; i < sv_.length; ++i) {
         S_.set(i, i, sv_[i]);
diff --git a/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOps.java b/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOps.java
index f97070b..7923b14 100644
--- a/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOps.java
+++ b/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOps.java
@@ -13,7 +13,7 @@
 public abstract class MatrixOps {
 
   // number of iterations for SISVD
-  static final int DEFAULT_NUM_ITER = 8;
+  static final int DEFAULT_NUM_ITER = 200;
 
   /**
    * Matrix dimensions
diff --git a/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOpsImplMTJ.java b/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOpsImplMTJ.java
index a356d43..58fd8aa 100644
--- a/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOpsImplMTJ.java
+++ b/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOpsImplMTJ.java
@@ -6,6 +6,7 @@
 
 import java.util.concurrent.ThreadLocalRandom;
 
+import com.github.fommil.netlib.BLAS;
 import org.netlib.util.intW;
 
 import com.github.fommil.netlib.LAPACK;
@@ -76,6 +77,7 @@
 
     if (computeVectors && Vt_ == null) {
       Vt_ = new DenseMatrix(n_, d_);
+      //Vt_ = new DenseMatrix(k_, d_);
 
       final int[] diag = {0}; // only need the main diagonal
       S_ = new CompDiagMatrix(n_, n_, diag);
@@ -213,7 +215,7 @@
   private void allocateSpaceSISVD() {
     block_ = new DenseMatrix(d_, k_);
     T_ = new DenseMatrix(n_, k_);
-    // TODO: should allocate space for QR and final SVD here
+    // TODO: should allocate space for QR and final SVD here?
   }
 
   private MatrixOps computeFullSVD(final DenseMatrix A, final boolean computeVectors) {
@@ -247,20 +249,20 @@
     for (MatrixEntry entry : block_) {
       entry.set(rand.nextGaussian());
     }
-    // TODO: in-line QR
+    // TODO: in-line QR with direct LAPACK call
     final QR qr = new QR(block_.numRows(), block_.numColumns());
-    block_ = qr.factor(A).getQ(); // important for numeric stability
+    block_ = qr.factor(block_).getQ(); // important for numeric stability
 
     for (int i = 0; i < DEFAULT_NUM_ITER; ++i) {
       A.mult(block_, T_);
-      A.transABmult(T_, block_);
+      A.transAmult(T_, block_);
       block_ = qr.factor(block_).getQ(); // again, for stability
     }
 
     // Rayleigh-Ritz postprocessing
     A.mult(block_, T_);
 
-    // TODO: use full SVD code
+    // TODO: use LAPACK directly
     final SVD svd = new SVD(T_.numRows(), T_.numColumns(), computeVectors);
     try {
       svd.factor(T_);
@@ -270,7 +272,11 @@
     System.arraycopy(svd.getS(), 0, sv_, 0, svd.getS().length); // sv_ is final
 
     if (computeVectors) {
-      block_.mult(svd.getVt(), Vt_);
+      // V^T = (block * V^T)^T = (V^T)^T * block^T
+      // using BLAS directly since Vt is (n_ x d_) but result here is only (k_ x d_)
+      BLAS.getInstance().dgemm("T", "T", k_, d_, k_,
+              1.0, svd.getVt().getData(), k_, block_.getData(), d_,
+              0.0, Vt_.getData(), n_);
     }
 
     return this;
diff --git a/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOpsImplOjAlgo.java b/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOpsImplOjAlgo.java
index 91dda43..589377a 100644
--- a/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOpsImplOjAlgo.java
+++ b/src/main/java/com/yahoo/sketches/vector/decomposition/MatrixOpsImplOjAlgo.java
@@ -172,7 +172,9 @@
     svd.getSingularValues(sv_);
 
     if (computeVectors) {
-      block_.multiply(svd.getQ2().transpose()).supplyTo(Vt_);
+      // V = block * Q2^T so V^T = Q2 * block^T
+      // and ojAlgo figures out that it only needs to fill the first k_ rows of Vt_
+      svd.getQ2().multiply(block_.transpose()).supplyTo(Vt_);
     }
 
     return this;
diff --git a/src/main/java/com/yahoo/sketches/vector/decomposition/NewFrequentDirections.java b/src/main/java/com/yahoo/sketches/vector/decomposition/NewFrequentDirections.java
index 4ce118e..28c765f 100644
--- a/src/main/java/com/yahoo/sketches/vector/decomposition/NewFrequentDirections.java
+++ b/src/main/java/com/yahoo/sketches/vector/decomposition/NewFrequentDirections.java
@@ -517,8 +517,9 @@
 
     final double newSvAdjustment = svd_.reduceRank(B_);
     svAdjustment_ += newSvAdjustment;
-    nextZeroRow_ = (int) Math.min(k_, n_); //svd_.getSingularValues().length;
+    nextZeroRow_ = k_ - 1; //(int) Math.min(k_, n_); //svd_.getSingularValues().length;
 
     //System.out.println(toString(true, true, false));
+    //System.exit(0);
   }
 }