Put sensible bounds on quantized scores (#15411)
Quantization loss can cause the corrected dot product to yield unexpected, out-of-bounds
values. This is more likely when the inputs are "extreme" in the sense that they are very far
from the segment-level centroid.
Bound the Euclidean distance to a non-negative value -- negative values make no sense for a distance.
Clamp dot product/cosine score to [-1,1] as the normalized dot product should always return values in this range.
This works well enough for 4+ bit quantization but may not work as well for 1-bit quantization since the loss is so great.
Fix testSingleVectorCase to L2-normalize all vectors for DOT_PRODUCT similarity.
This is a partial fix for #15408
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorScorer.java
index 642d882..22ba079 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorScorer.java
@@ -263,7 +263,9 @@
queryCorrections.additionalCorrection()
+ indexCorrections.additionalCorrection()
- 2 * score;
- return Math.max(1 / (1f + score), 0);
+ // Ensure that 'score' (the squared euclidean distance) is non-negative. The computed value
+ // may be negative as a result of quantization loss.
+ return 1 / (1f + Math.max(score, 0f));
} else {
// For cosine and max inner product, we need to apply the additional correction, which is
// assumed to be the non-centered dot-product between the vector and the centroid
@@ -274,7 +276,10 @@
if (similarityFunction == MAXIMUM_INNER_PRODUCT) {
return VectorUtil.scaleMaxInnerProductScore(score);
}
- return Math.max((1f + score) / 2f, 0);
+ // Ensure that 'score' (a normalized dot product) is in [-1,1]. The computed value may be out
+ // of bounds as a result of quantization loss.
+ score = Math.clamp(score, -1, 1);
+ return (1f + score) / 2f;
}
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene104/TestLucene104HnswScalarQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene104/TestLucene104HnswScalarQuantizedVectorsFormat.java
index d5bc94a..4a4652e 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene104/TestLucene104HnswScalarQuantizedVectorsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene104/TestLucene104HnswScalarQuantizedVectorsFormat.java
@@ -47,7 +47,9 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.SameThreadExecutorService;
+import org.apache.lucene.util.VectorUtil;
import org.junit.Before;
public class TestLucene104HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {
@@ -109,7 +111,11 @@
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
- doc.add(new KnnFloatVectorField("f", vector, similarityFunction));
+ float[] docVector =
+ similarityFunction == VectorSimilarityFunction.DOT_PRODUCT
+ ? VectorUtil.l2normalize(ArrayUtil.copyArray(vector))
+ : vector;
+ doc.add(new KnnFloatVectorField("f", docVector, similarityFunction));
w.addDocument(doc);
w.commit();
try (IndexReader reader = DirectoryReader.open(w)) {
@@ -118,10 +124,14 @@
KnnVectorValues.DocIndexIterator docIndexIterator = vectorValues.iterator();
assert (vectorValues.size() == 1);
while (docIndexIterator.nextDoc() != NO_MORE_DOCS) {
- assertArrayEquals(vector, vectorValues.vectorValue(docIndexIterator.index()), 0.00001f);
+ assertArrayEquals(
+ docVector, vectorValues.vectorValue(docIndexIterator.index()), 0.00001f);
}
- float[] randomVector = randomVector(vector.length);
- float trueScore = similarityFunction.compare(vector, randomVector);
+ float[] randomVector =
+ similarityFunction == VectorSimilarityFunction.DOT_PRODUCT
+ ? randomNormalizedVector(vector.length)
+ : randomVector(vector.length);
+ float trueScore = similarityFunction.compare(docVector, randomVector);
TopDocs td =
r.searchNearestVectors(
"f",