// This file contains a set of fairly generic utility functions when working
// with SIMD vectors.
//
// SAFETY: All of the routines below are unsafe to call because they assume
// the necessary CPU target features in order to use particular vendor
// intrinsics. Calling these routines when the underlying CPU does not support
// the appropriate target features is NOT safe. Callers must ensure this
// themselves.
//
// Note that it may not look like this safety invariant is being upheld when
// these routines are called. Namely, the CPU feature check is typically pretty
// far away from when these routines are used. Instead, we rely on the fact
// that certain types serve as a guaranteed receipt that pertinent target
// features are enabled. For example, the only way TeddySlim3Mask256 can be
// constructed is if the AVX2 CPU feature is available. Thus, any code running
// inside of TeddySlim3Mask256 can use any of the functions below without any
// additional checks: its very existence *is* the check.

use core::arch::x86_64::*;
/// Shift `a` to the left by two bytes (removing its two most significant
/// bytes), and concatenate it with the two most significant bytes of `b`.
///
/// In terms of little-endian byte indices, the result `r` satisfies
/// `r[0..2] == b[30..32]` and `r[2..32] == a[0..30]`.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i {
    // Credit goes to jneem for figuring this out.
    //
    // TL;DR avx2's PALIGNR instruction is actually just two 128-bit PALIGNR
    // instructions, which is not what we want, so we need to do some extra
    // shuffling.

    // This permute gives us the low 16 bytes of a concatenated with the high
    // 16 bytes of b, in order of most significant to least significant. So
    // `v = a[15:0] b[31:16]`.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    // This effectively does this (where we deal in terms of byte-indexing
    // and byte-shifting, and use inclusive ranges):
    //
    //   ret[15:0]  := ((a[15:0] << 16) | v[15:0]) >> 14
    //               = ((a[15:0] << 16) | b[31:16]) >> 14
    //   ret[31:16] := ((a[31:16] << 16) | v[31:16]) >> 14
    //               = ((a[31:16] << 16) | a[15:0]) >> 14
    //
    // Which therefore results in:
    //
    //   ret[31:0]  := a[29:16] a[15:14] a[13:0] b[31:30]
    //
    // The end result is that we've effectively done this:
    //
    //   (a << 2) | (b >> 30)
    //
    // When `A` and `B` are strings---where the beginning of the string is in
    // the least significant bits---we effectively result in the following
    // semantic operation:
    //
    //   (A >> 2) | (B << 30)
    //
    // The reversal being attributed to the fact that we are in little-endian.
    _mm256_alignr_epi8(a, v, 14)
}
/// Shift `a` to the left by three bytes (removing its three most significant
/// bytes), and concatenate it with the three most significant bytes of `b`.
///
/// In terms of little-endian byte indices, the result `r` satisfies
/// `r[0..3] == b[29..32]` and `r[3..32] == a[0..29]`.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_13(a: __m256i, b: __m256i) -> __m256i {
    // For explanation, see alignr256_14.
    //
    // `v` holds the high 16 bytes of `b` in its low lane and the low 16 bytes
    // of `a` in its high lane, so that the per-lane PALIGNR below pulls the
    // right neighbouring bytes into each lane.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    _mm256_alignr_epi8(a, v, 13)
}
/// Shift `a` to the left by one byte (removing its most significant byte), and
/// concatenate it with the most significant byte of `b`.
///
/// In terms of little-endian byte indices, the result `r` satisfies
/// `r[0] == b[31]` and `r[1..32] == a[0..31]`.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i {
    // For explanation, see alignr256_14.
    //
    // `v` holds the high 16 bytes of `b` in its low lane and the low 16 bytes
    // of `a` in its high lane, so that the per-lane PALIGNR below pulls the
    // right neighbouring byte into each lane.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    _mm256_alignr_epi8(a, v, 15)
}
/// Unpack the given 128-bit vector into its 64-bit components. The first
/// element of the array returned corresponds to the least significant 64-bit
/// lane in `a`.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports SSSE3.
#[target_feature(enable = "ssse3")]
pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] {
    // The low lane can be read directly; the high lane is first moved down
    // by shifting the whole vector right by 8 bytes.
    let low = _mm_cvtsi128_si64(a) as u64;
    let high = _mm_cvtsi128_si64(_mm_srli_si128(a, 8)) as u64;
    [low, high]
}
/// Unpack the given 256-bit vector into its 64-bit components. The first
/// element of the array returned corresponds to the least significant 64-bit
/// lane in `a`.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] {
    // Using transmute here is precisely equivalent, but actually slower. It's
    // not quite clear why.
    let lo = _mm256_extracti128_si256(a, 0);
    let hi = _mm256_extracti128_si256(a, 1);
    [
        _mm_cvtsi128_si64(lo) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
        _mm_cvtsi128_si64(hi) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
    ]
}
/// Unpack the low 128-bits of `a` and `b`, and return them as 4 64-bit
/// integers.
///
/// More precisely, if a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where each element
/// is a 64-bit integer and a1/b1 correspond to the least significant 64 bits,
/// then the return value is `b2 b1 a2 a1`. As an array, that is
/// `[a1, a2, b1, b2]`.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] {
    // The casts merely reinterpret the low 128 bits of each input; the upper
    // halves of `a` and `b` are ignored.
    let a_lo = _mm256_castsi256_si128(a);
    let b_lo = _mm256_castsi256_si128(b);
    [
        _mm_cvtsi128_si64(a_lo) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(a_lo, 8)) as u64,
        _mm_cvtsi128_si64(b_lo) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(b_lo, 8)) as u64,
    ]
}
/// Returns true if and only if all bits in the given 128-bit vector are 0.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports SSSE3.
#[target_feature(enable = "ssse3")]
pub unsafe fn is_all_zeroes128(a: __m128i) -> bool {
    // Each byte of `cmp` is 0xFF where the corresponding byte of `a` is zero.
    let cmp = _mm_cmpeq_epi8(a, _mm_setzero_si128());
    // The movemask gathers the top bit of each of the 16 bytes, so every byte
    // of `a` is zero exactly when all 16 mask bits are set.
    _mm_movemask_epi8(cmp) as u32 == 0xFFFF
}
/// Returns true if and only if all bits in the given 256-bit vector are 0.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn is_all_zeroes256(a: __m256i) -> bool {
    // Each byte of `cmp` is 0xFF where the corresponding byte of `a` is zero.
    let cmp = _mm256_cmpeq_epi8(a, _mm256_setzero_si256());
    // The movemask gathers the top bit of each of the 32 bytes, so every byte
    // of `a` is zero exactly when all 32 mask bits are set.
    _mm256_movemask_epi8(cmp) as u32 == 0xFFFF_FFFF
}
/// Load a 128-bit vector from slice at the given position. The slice does
/// not need to be aligned.
///
/// Since this code assumes little-endian (there is no big-endian x86), the
/// bytes starting in `slice[at..]` will be at the least significant bits of
/// the returned vector. This is important for the surrounding code, since for
/// example, shifting the resulting vector right is equivalent to logically
/// shifting the bytes in `slice` left.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports SSE2 and that
/// `slice[at..]` contains at least 16 bytes. No bounds check is performed in
/// release builds.
#[target_feature(enable = "sse2")]
pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i {
    // Written to avoid overflow in `at + 16`.
    debug_assert!(
        at <= slice.len() && slice.len() - at >= 16,
        "loadu128 requires at least 16 bytes at the given offset",
    );
    let ptr = slice.get_unchecked(at..).as_ptr();
    _mm_loadu_si128(ptr as *const __m128i)
}
/// Load a 256-bit vector from slice at the given position. The slice does
/// not need to be aligned.
///
/// Since this code assumes little-endian (there is no big-endian x86), the
/// bytes starting in `slice[at..]` will be at the least significant bits of
/// the returned vector. This is important for the surrounding code, since for
/// example, shifting the resulting vector right is equivalent to logically
/// shifting the bytes in `slice` left.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2 and that
/// `slice[at..]` contains at least 32 bytes. No bounds check is performed in
/// release builds.
#[target_feature(enable = "avx2")]
pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i {
    // Written to avoid overflow in `at + 32`.
    debug_assert!(
        at <= slice.len() && slice.len() - at >= 32,
        "loadu256 requires at least 32 bytes at the given offset",
    );
    let ptr = slice.get_unchecked(at..).as_ptr();
    _mm256_loadu_si256(ptr as *const __m256i)
}
/// Returns a 128-bit vector with all bits set to 0.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn zeroes128() -> __m128i {
    _mm_setzero_si128()
}
/// Returns a 256-bit vector with all bits set to 0.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn zeroes256() -> __m256i {
    _mm256_setzero_si256()
}
/// Returns a 128-bit vector with all bits set to 1.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn ones128() -> __m128i {
    // -1 as an i8 is the bit pattern 0xFF, i.e. every bit of every byte set.
    _mm_set1_epi8(-1)
}
/// Returns a 256-bit vector with all bits set to 1.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn ones256() -> __m256i {
    // -1 as an i8 is the bit pattern 0xFF, i.e. every bit of every byte set.
    _mm256_set1_epi8(-1)
}
