stop using SPECIES_PREFERRED except at the top of this file

commit: cce3cf106af03393f8e165724b19eaa57d079bc2 [log] [tgz]
author: Robert Muir <rmuir@apache.org> Sat Oct 14 18:08:33 2023 -0400
committer: Robert Muir <rmuir@apache.org> Sat Oct 14 18:08:33 2023 -0400
tree: 2754e16ffedf28f576fb94bf4f6fa12d374bc014
parent: 3ec9c26d672262762f4213c827699bf735409eeb [diff]
diff --git a/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java
index a7ca073..9129259 100644
--- a/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java
+++ b/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java

@@ -31,24 +31,22 @@
 
 final class PanamaVectorUtilSupport implements VectorUtilSupport {
 
-  private static final int INT_SPECIES_PREF_BIT_SIZE = IntVector.SPECIES_PREFERRED.vectorBitSize();
-
-  // we always use the platform's maximum floating point vector size
+  // we always use the platform's maximum floating point/int vector size
   private static final VectorSpecies<Float> FLOAT_SPECIES = FloatVector.SPECIES_PREFERRED;
+  private static final VectorSpecies<Integer> INT_SPECIES = IntVector.SPECIES_PREFERRED;
 
   // for integer methods, it is more complicated due to conversions
+  private static final int INT_SPECIES_VSIZE = INT_SPECIES.vectorBitSize();
   private static final VectorSpecies<Byte> BYTE_SPECIES;
   private static final VectorSpecies<Short> SHORT_SPECIES;
 
   // compute BYTE/SHORT sizes relative to preferred integer vector size
   static {
-    if (INT_SPECIES_PREF_BIT_SIZE >= 256) {
+    if (INT_SPECIES_VSIZE >= 256) {
       BYTE_SPECIES =
-          ByteVector.SPECIES_MAX.withShape(
-              VectorShape.forBitSize(IntVector.SPECIES_PREFERRED.vectorBitSize() >> 2));
+          ByteVector.SPECIES_MAX.withShape(VectorShape.forBitSize(INT_SPECIES_VSIZE >> 2));
       SHORT_SPECIES =
-          ShortVector.SPECIES_MAX.withShape(
-              VectorShape.forBitSize(IntVector.SPECIES_PREFERRED.vectorBitSize() >> 1));
+          ShortVector.SPECIES_MAX.withShape(VectorShape.forBitSize(INT_SPECIES_VSIZE >> 1));
     } else {
       BYTE_SPECIES = null;
       SHORT_SPECIES = null;
@@ -306,10 +304,10 @@
     // vectors (256-bit on intel to dodge performance landmines)
     if (a.length >= 16 && useIntegerVectors) {
       // compute vectorized dot product consistent with VPDPBUSD instruction
-      if (INT_SPECIES_PREF_BIT_SIZE >= 512) {
+      if (INT_SPECIES_VSIZE >= 512) {
         i += BYTE_SPECIES.loopBound(a.length);
         res += dotProductBody512(a, b, i);
-      } else if (INT_SPECIES_PREF_BIT_SIZE == 256) {
+      } else if (INT_SPECIES_VSIZE == 256) {
         i += BYTE_SPECIES.loopBound(a.length);
         res += dotProductBody256(a, b, i);
       } else {
@@ -328,7 +326,7 @@
 
   /** vectorized dot product body (512 bit vectors) */
   private int dotProductBody512(byte[] a, byte[] b, int limit) {
-    IntVector acc = IntVector.zero(IntVector.SPECIES_PREFERRED);
+    IntVector acc = IntVector.zero(INT_SPECIES);
     for (int i = 0; i < limit; i += BYTE_SPECIES.length()) {
       ByteVector va8 = ByteVector.fromArray(BYTE_SPECIES, a, i);
       ByteVector vb8 = ByteVector.fromArray(BYTE_SPECIES, b, i);
@@ -339,7 +337,7 @@
       Vector<Short> prod16 = va16.mul(vb16);
 
       // 32-bit add
-      Vector<Integer> prod32 = prod16.convertShape(S2I, IntVector.SPECIES_PREFERRED, 0);
+      Vector<Integer> prod32 = prod16.convertShape(S2I, INT_SPECIES, 0);
       acc = acc.add(prod32);
     }
     // reduce
@@ -348,14 +346,14 @@
 
   /** vectorized dot product body (256 bit vectors) */
   private int dotProductBody256(byte[] a, byte[] b, int limit) {
-    IntVector acc = IntVector.zero(IntVector.SPECIES_PREFERRED);
-    for (int i = 0; i < limit; i += BYTE_SPECIES.length()) {
-      ByteVector va8 = ByteVector.fromArray(BYTE_SPECIES, a, i);
-      ByteVector vb8 = ByteVector.fromArray(BYTE_SPECIES, b, i);
+    IntVector acc = IntVector.zero(IntVector.SPECIES_256);
+    for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length()) {
+      ByteVector va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i);
+      ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i);
 
       // 32-bit multiply and add into accumulator
-      Vector<Integer> va32 = va8.convertShape(B2I, IntVector.SPECIES_PREFERRED, 0);
-      Vector<Integer> vb32 = vb8.convertShape(B2I, IntVector.SPECIES_PREFERRED, 0);
+      Vector<Integer> va32 = va8.convertShape(B2I, IntVector.SPECIES_256, 0);
+      Vector<Integer> vb32 = vb8.convertShape(B2I, IntVector.SPECIES_256, 0);
       acc = acc.add(va32.mul(vb32));
     }
     // reduce
@@ -394,10 +392,10 @@
     // vectors (256-bit on intel to dodge performance landmines)
     if (a.length >= 16 && useIntegerVectors) {
       final float[] ret;
-      if (INT_SPECIES_PREF_BIT_SIZE >= 512) {
+      if (INT_SPECIES_VSIZE >= 512) {
         i += BYTE_SPECIES.loopBound(a.length);
         ret = cosineBody512(a, b, i);
-      } else if (INT_SPECIES_PREF_BIT_SIZE == 256) {
+      } else if (INT_SPECIES_VSIZE == 256) {
         i += BYTE_SPECIES.loopBound(a.length);
         ret = cosineBody256(a, b, i);
       } else {
@@ -423,9 +421,9 @@
 
   /** vectorized cosine body (512 bit vectors) */
   private float[] cosineBody512(byte[] a, byte[] b, int limit) {
-    IntVector accSum = IntVector.zero(IntVector.SPECIES_PREFERRED);
-    IntVector accNorm1 = IntVector.zero(IntVector.SPECIES_PREFERRED);
-    IntVector accNorm2 = IntVector.zero(IntVector.SPECIES_PREFERRED);
+    IntVector accSum = IntVector.zero(INT_SPECIES);
+    IntVector accNorm1 = IntVector.zero(INT_SPECIES);
+    IntVector accNorm2 = IntVector.zero(INT_SPECIES);
     for (int i = 0; i < limit; i += BYTE_SPECIES.length()) {
       ByteVector va8 = ByteVector.fromArray(BYTE_SPECIES, a, i);
       ByteVector vb8 = ByteVector.fromArray(BYTE_SPECIES, b, i);
@@ -438,9 +436,9 @@
       Vector<Short> prod16 = va16.mul(vb16);
 
       // sum into accumulators: 32-bit add
-      Vector<Integer> norm1_32 = norm1_16.convertShape(S2I, IntVector.SPECIES_PREFERRED, 0);
-      Vector<Integer> norm2_32 = norm2_16.convertShape(S2I, IntVector.SPECIES_PREFERRED, 0);
-      Vector<Integer> prod32 = prod16.convertShape(S2I, IntVector.SPECIES_PREFERRED, 0);
+      Vector<Integer> norm1_32 = norm1_16.convertShape(S2I, INT_SPECIES, 0);
+      Vector<Integer> norm2_32 = norm2_16.convertShape(S2I, INT_SPECIES, 0);
+      Vector<Integer> prod32 = prod16.convertShape(S2I, INT_SPECIES, 0);
       accNorm1 = accNorm1.add(norm1_32);
       accNorm2 = accNorm2.add(norm2_32);
       accSum = accSum.add(prod32);
@@ -453,16 +451,16 @@
 
   /** vectorized cosine body (256 bit vectors) */
   private float[] cosineBody256(byte[] a, byte[] b, int limit) {
-    IntVector accSum = IntVector.zero(IntVector.SPECIES_PREFERRED);
-    IntVector accNorm1 = IntVector.zero(IntVector.SPECIES_PREFERRED);
-    IntVector accNorm2 = IntVector.zero(IntVector.SPECIES_PREFERRED);
-    for (int i = 0; i < limit; i += BYTE_SPECIES.length()) {
-      ByteVector va8 = ByteVector.fromArray(BYTE_SPECIES, a, i);
-      ByteVector vb8 = ByteVector.fromArray(BYTE_SPECIES, b, i);
+    IntVector accSum = IntVector.zero(IntVector.SPECIES_256);
+    IntVector accNorm1 = IntVector.zero(IntVector.SPECIES_256);
+    IntVector accNorm2 = IntVector.zero(IntVector.SPECIES_256);
+    for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length()) {
+      ByteVector va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i);
+      ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i);
 
       // 16-bit multiply, and add into accumulators
-      Vector<Integer> va32 = va8.convertShape(B2I, IntVector.SPECIES_PREFERRED, 0);
-      Vector<Integer> vb32 = vb8.convertShape(B2I, IntVector.SPECIES_PREFERRED, 0);
+      Vector<Integer> va32 = va8.convertShape(B2I, IntVector.SPECIES_256, 0);
+      Vector<Integer> vb32 = vb8.convertShape(B2I, IntVector.SPECIES_256, 0);
       Vector<Integer> norm1_32 = va32.mul(va32);
       Vector<Integer> norm2_32 = vb32.mul(vb32);
       Vector<Integer> prod32 = va32.mul(vb32);
@@ -511,7 +509,7 @@
     // only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit
     // vectors (256-bit on intel to dodge performance landmines)
     if (a.length >= 16 && useIntegerVectors) {
-      if (INT_SPECIES_PREF_BIT_SIZE >= 256) {
+      if (INT_SPECIES_VSIZE >= 256) {
         i += BYTE_SPECIES.loopBound(a.length);
         res += squareDistanceBody256(a, b, i);
       } else {
@@ -530,15 +528,15 @@
 
   /** vectorized square distance body (256+ bit vectors) */
   private int squareDistanceBody256(byte[] a, byte[] b, int limit) {
-    IntVector acc = IntVector.zero(IntVector.SPECIES_PREFERRED);
+    IntVector acc = IntVector.zero(INT_SPECIES);
     for (int i = 0; i < limit; i += BYTE_SPECIES.length()) {
       ByteVector va8 = ByteVector.fromArray(BYTE_SPECIES, a, i);
       ByteVector vb8 = ByteVector.fromArray(BYTE_SPECIES, b, i);
 
       // 32-bit sub, multiply, and add into accumulators
       // TODO: uses AVX-512 heavy multiply on zmm, should we just use 256-bit vectors on AVX-512?
-      Vector<Integer> va32 = va8.convertShape(B2I, IntVector.SPECIES_PREFERRED, 0);
-      Vector<Integer> vb32 = vb8.convertShape(B2I, IntVector.SPECIES_PREFERRED, 0);
+      Vector<Integer> va32 = va8.convertShape(B2I, INT_SPECIES, 0);
+      Vector<Integer> vb32 = vb8.convertShape(B2I, INT_SPECIES, 0);
       Vector<Integer> diff32 = va32.sub(vb32);
       acc = acc.add(diff32.mul(diff32));
     }
commit	cce3cf106af03393f8e165724b19eaa57d079bc2	[log] [tgz]
author	Robert Muir <rmuir@apache.org>	Sat Oct 14 18:08:33 2023 -0400
committer	Robert Muir <rmuir@apache.org>	Sat Oct 14 18:08:33 2023 -0400
tree	2754e16ffedf28f576fb94bf4f6fa12d374bc014
parent	3ec9c26d672262762f4213c827699bf735409eeb [diff]