Optimising SDI CRC, part 2

Yesterday, we looked at optimising an SDI CRC computation. After a bunch of mathematical optimisations, we reached this point:

uint64_t pack60(const uint16_t* data) {
    return (((uint64_t)data[ 0]) <<  4) ^
           (((uint64_t)data[ 2]) << 14) ^
           (((uint64_t)data[ 4]) << 24) ^
           (((uint64_t)data[ 6]) << 34) ^
           (((uint64_t)data[ 8]) << 44) ^
           (((uint64_t)data[10]) << 54);
}

__m128i pack120(const uint16_t* data) {
    return _mm_set_epi64x(pack60(data + 12) >> 4, pack60(data));
}

__m128i xor_clmul(__m128i a, __m128i b) {
    return _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x00),
                         _mm_clmulepi64_si128(a, b, 0x11));
}

void crc_sdi(uint32_t* crcs, const uint16_t* data, size_t n) {
    __m128i c = _mm_cvtsi32_si128(crcs[0]);
    __m128i y = _mm_cvtsi32_si128(crcs[1]);
    { // *= x^-14 semi-mod P
        __m128i k = _mm_cvtsi32_si128(
            0x9f380000 /* x^-14-(64-18)-32-1 mod P */);
        c = _mm_clmulepi64_si128(c, k, 0x00);
        y = _mm_clmulepi64_si128(y, k, 0x00);
    }
    for (size_t i = 0; i < n; i += 24) {
        { // *= x^120 semi-mod P
            __m128i k = _mm_setr_epi32(
                0, 0x4b334000 /* x^120+64-1 mod P */,
                0, 0x96d30000 /* x^120-1    mod P */);
            c = xor_clmul(c, k);
            y = xor_clmul(y, k);
        }
        { // +=
            c = _mm_xor_si128(c, pack120(data + i));
            y = _mm_xor_si128(y, pack120(data + i + 1));
        }
    }
    { // *= x^14 semi-mod P
        __m128i k = _mm_setr_epi32(
            0, 0x14980000 /* x^14+64-1 mod P */,
            0, 0x00040000 /* x^14-1    mod P */);
        c = xor_clmul(c, k);
        y = xor_clmul(y, k);
    }
    { // mod P
        __m128i k = _mm_setr_epi32( /* x^128-1 div P */
            0x14980559, 0x4c9bb5d5,
            0x80040000, 0x5e405011);
        c = _mm_xor_si128(_mm_srli_si128(xor_clmul(c, k), 8),
                          _mm_clmulepi64_si128(c, k, 0x01));
        y = _mm_xor_si128(_mm_srli_si128(xor_clmul(y, k), 8),
                          _mm_clmulepi64_si128(y, k, 0x01));
        __m128i P = _mm_cvtsi32_si128(0x46001 /* P */);
        c = _mm_clmulepi64_si128(c, P, 0x00);
        y = _mm_clmulepi64_si128(y, P, 0x00);
    }
    crcs[0] = _mm_cvtsi128_si32(c);
    crcs[1] = _mm_cvtsi128_si32(y);
}

This ran at approximately 11 bits per cycle:

Version              | Broadwell | Skylake | Ice Lake
_mm_clmulepi64_si128 | 10.8      | 11.0    | 12.8

From here, the question is how best to implement pack120. Compared to yesterday's approach, there is a very different one based around _mm_shuffle_epi8:

void crc_sdi_pack(uint32_t* crcs, const uint16_t* data, size_t n) {
...
        { // +=
            __m128i d1 = _mm_loadu_si128((__m128i*)(data + i));
            __m128i d2 = _mm_loadu_si128((__m128i*)(data + i + 8));
            __m128i d3 = _mm_loadu_si128((__m128i*)(data + i + 16));
            __m128i k1 = _mm_setr_epi16(16, 16, 64, 64, 1, 1, 4, 4);
            __m128i k2 = _mm_setr_epi16(16,  0, 64,  0, 1, 0, 4, 0);
            __m128i k3 = _mm_setr_epi16( 0, 16,  0, 64, 0, 1, 0, 4);
            d1 = _mm_mullo_epi16(k1, d1);
            __m128i k4=_mm_setr_epi8( 0, 1, -1,  8,  9, -1, -1, -1,
                                      2, 3, -1, 10, 11, -1, -1, -1);
            __m128i k5=_mm_setr_epi8(-1, 4,  5, -1, 12, 13, -1, -1,
                                     -1, 6,  7, -1, 14, 15, -1, -1);
            d1 = _mm_shuffle_epi8(d1, k4) ^ _mm_shuffle_epi8(d1, k5);
            __m128i d1m; // Force a store to memory
            asm("vmovdqa %1, %0" : "=m"(d1m) : "x"(d1) : "memory");
            __m128i cd = _mm_packus_epi32(_mm_madd_epi16(d2, k2),
                                          _mm_madd_epi16(d3, k2));
            __m128i yd = _mm_packus_epi32(_mm_madd_epi16(d2, k3),
                                          _mm_madd_epi16(d3, k3));
            __m128i k6=_mm_setr_epi8(-1, -1, -1, -1, -1,  0,  1, -1,
                                      4,  5,  8,  9, -1, 12, 13, -1);
            __m128i k7=_mm_setr_epi8(-1, -1, -1, -1, -1, -1,  2,  3,
                                     -1,  6,  7, 10, 11, -1, 14, 15);
            cd = _mm_shuffle_epi8(cd, k6) ^ _mm_shuffle_epi8(cd, k7)
               ^ _mm_loadu_si64(&d1m);
            yd = _mm_shuffle_epi8(yd, k6) ^ _mm_shuffle_epi8(yd, k7)
               ^ _mm_loadu_si64(8 + (char*)&d1m);
            c = _mm_xor_si128(c, cd);
            y = _mm_xor_si128(y, yd);
        }
...
}

This is very impressive, getting us to the region of 20 bits per cycle:

Version          | Broadwell | Skylake | Ice Lake
_mm_shuffle_epi8 | 20.2      | 19.7    | 23.4

If targeting AVX2, some pairs of operations can be collapsed:

__m256i broadcast128(const uint16_t* data) {
    __m256i result;
    asm("vbroadcasti128 %1, %0" : "=x"(result) :
                                   "m"(*(const __m128i*)data));
    return result;
}

void crc_sdi_pack(uint32_t* crcs, const uint16_t* data, size_t n) {
...
        { // +=
            __m128i d1 = _mm_loadu_si128((__m128i*)(data + i));
            __m256i d2 = broadcast128(data + i + 8);
            __m256i d3 = broadcast128(data + i + 16);
            __m128i k1 = _mm_setr_epi16(
                16, 16, 64, 64, 1, 1, 4, 4);
            __m256i k23 = _mm256_setr_epi16(
                16,  0, 64,  0, 1, 0, 4, 0,
                 0, 16,  0, 64, 0, 1, 0, 4);
            d1 = _mm_mullo_epi16(k1, d1);
            __m128i k4 = _mm_setr_epi8(
                0,  1, -1,  8,  9, -1, -1, -1,
                2,  3, -1, 10, 11, -1, -1, -1);
            __m128i k5 = _mm_setr_epi8(
                -1, 4,  5, -1, 12, 13, -1, -1,
                -1, 6,  7, -1, 14, 15, -1, -1);
            d1 = _mm_shuffle_epi8(d1, k4) ^ _mm_shuffle_epi8(d1, k5);
            __m128i d1m; // Force a store to memory
            asm("vmovdqa %1, %0" : "=m"(d1m) : "x"(d1) : "memory");
            __m256i cdyd = _mm256_packus_epi32(
                _mm256_madd_epi16(d2, k23),
                _mm256_madd_epi16(d3, k23));
            __m256i k6 = _mm256_setr_epi8(
                -1, -1, -1, -1, -1,  0,  1, -1,
                 4,  5,  8,  9, -1, 12, 13, -1,
                -1, -1, -1, -1, -1,  0,  1, -1, 
                 4,  5,  8,  9, -1, 12, 13, -1);
            __m256i k7 = _mm256_setr_epi8(
                -1, -1, -1, -1, -1, -1,  2,  3,
                -1,  6,  7, 10, 11, -1, 14, 15,
                -1, -1, -1, -1, -1, -1,  2,  3,
                -1,  6,  7, 10, 11, -1, 14, 15);
            cdyd = _mm256_shuffle_epi8(cdyd, k6)
                 ^ _mm256_shuffle_epi8(cdyd, k7);
            __m256i m; // Force a store to memory
            asm("vmovdqa %1, %0" : "=m"(m) : "x"(cdyd) : "memory");
            __m128i cd = _mm256_castsi256_si128(cdyd)
                       ^ _mm_loadu_si64(&d1m);
            __m128i yd = _mm_loadu_si128(1 + (__m128i*)&m)
                       ^ _mm_loadu_si64(8 + (char*)&d1m);
            c = _mm_xor_si128(c, cd);
            y = _mm_xor_si128(y, yd);
        }
...
}

This gains us a few bits per cycle on all three processors:

Version | Broadwell | Skylake | Ice Lake
AVX2    | 22.3      | 23.2    | 25.4

If targeting AVX512, the xor3 trick from yesterday can be applied on top:

__m128i xor3(__m128i a, __m128i b, __m128i c) {
    return _mm_ternarylogic_epi64(a, b, c, 0x96);
}

void crc_sdi_pack(uint32_t* crcs, const uint16_t* data, size_t n) {
...
    for (size_t i = 0; i < n; i += 24) {
        // *= x^120 semi-mod P
        // +=
        __m128i k = _mm_setr_epi32(
            0, 0x4b334000 /* x^120+64-1 mod P */,
            0, 0x96d30000 /* x^120-1    mod P */);
        __m128i d1 = _mm_loadu_si128((__m128i*)(data + i));
        __m256i d2 = broadcast128(data + i + 8);
        __m256i d3 = broadcast128(data + i + 16);
        __m128i k1 = _mm_setr_epi16(
            16, 16, 64, 64, 1, 1, 4, 4);
        __m256i k23 = _mm256_setr_epi16(
            16,  0, 64,  0, 1, 0, 4, 0,
             0, 16,  0, 64, 0, 1, 0, 4);
        d1 = _mm_mullo_epi16(k1, d1);
        __m128i k4 = _mm_setr_epi8(
            0,  1, -1,  8,  9, -1, -1, -1,
            2,  3, -1, 10, 11, -1, -1, -1);
        __m128i k5 = _mm_setr_epi8(
            -1, 4,  5, -1, 12, 13, -1, -1,
            -1, 6,  7, -1, 14, 15, -1, -1);
        d1 = _mm_shuffle_epi8(d1, k4) ^ _mm_shuffle_epi8(d1, k5);
        __m128i d1m; // Force a store to memory
        asm("vmovdqa %1, %0" : "=m"(d1m) : "x"(d1) : "memory");
        __m256i cdyd = _mm256_packus_epi32(
            _mm256_madd_epi16(d2, k23),
            _mm256_madd_epi16(d3, k23));
        __m256i k6 = _mm256_setr_epi8(
            -1, -1, -1, -1, -1,  0,  1, -1,
             4,  5,  8,  9, -1, 12, 13, -1,
            -1, -1, -1, -1, -1,  0,  1, -1, 
             4,  5,  8,  9, -1, 12, 13, -1);
        __m256i k7 = _mm256_setr_epi8(
            -1, -1, -1, -1, -1, -1,  2,  3,
            -1,  6,  7, 10, 11, -1, 14, 15,
            -1, -1, -1, -1, -1, -1,  2,  3,
            -1,  6,  7, 10, 11, -1, 14, 15);
        cdyd = _mm256_shuffle_epi8(cdyd, k6)
             ^ _mm256_shuffle_epi8(cdyd, k7);
        __m256i m; // Force a store to memory
        asm("vmovdqa %1, %0" : "=m"(m) : "x"(cdyd) : "memory");
        __m128i cd = _mm256_castsi256_si128(cdyd)
                   ^ _mm_loadu_si64(&d1m);
        __m128i yd = _mm_loadu_si128(1 + (__m128i*)&m)
                   ^ _mm_loadu_si64(8 + (char*)&d1m);
        c = xor3(_mm_clmulepi64_si128(c, k, 0x00),
                 _mm_clmulepi64_si128(c, k, 0x11), cd);
        y = xor3(_mm_clmulepi64_si128(y, k, 0x00),
                 _mm_clmulepi64_si128(y, k, 0x11), yd);
    }
...
}

On applicable processors, this gains another few bits per cycle:

Version | Broadwell | Skylake | Ice Lake
AVX512  | N/A       | 25.8    | 28.9

Optimising SDI CRC

An SDI data stream consists of 10-bit data. In hardware, this is not a problem. In contemporary software, 10-bit values are slightly awkward, and are generally represented in the low 10 bits of a uint16_t. SDI has two separate streams of 10-bit data (called c and y), which software might choose to represent in an interleaved fashion. Finally, each stream has an 18-bit CRC computed over it (polynomial x^18 + x^5 + x^4 + x^0), where the 18-bit value might be represented in the low 18 bits of a uint32_t. Given all this, the CRCs could be computed using the following code:

void crc_sdi(uint32_t* crcs, const uint16_t* data, size_t n) {
    uint32_t c = crcs[0];
    uint32_t y = crcs[1];
    for (size_t i = 0; i < n; i += 2) {
        c ^= data[i];
        y ^= data[i+1];
        for (int k = 0; k < 10; k++) {
            c = c & 1 ? (c >> 1) ^ 0x23000 : c >> 1;
            y = y & 1 ? (y >> 1) ^ 0x23000 : y >> 1;
        }
    }
    crcs[0] = c;
    crcs[1] = y;
}

The above code is short, but it isn't fast: on my three test systems, it manages approximately 0.5 bits per cycle. Expressed differently, that is 40 cycles per iteration of the outer loop (which is ingesting 20 bits per iteration).

Version        | Broadwell | Skylake | Ice Lake
Starting point | 0.536     | 0.528   | 0.477
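
As an aside, the 0x23000 constant in the bit-at-a-time loop is just P in reversed bit order: the loop keeps the CRC reflected, so the 18 coefficients of x^0 through x^17 get bit-reversed within 18 bits, and the x^18 term is left implicit. A minimal sketch confirming this (not needed for anything that follows):

#include <stdint.h>
#include <stdio.h>

int main() {
    // Low 18 coefficients of P = x^18 + x^5 + x^4 + x^0; x^18 is implicit.
    uint32_t poly = (1u << 5) | (1u << 4) | 1u;
    uint32_t reflected = 0;
    for (int bit = 0; bit < 18; bit++) {
        if (poly & (1u << bit)) reflected |= 1u << (17 - bit);
    }
    printf("0x%x\n", reflected); // prints 0x23000
    return 0;
}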

At the cost of a 2KiB table, this can be improved to approximately 2.1 bits per cycle, which is better, but still not great:

uint16_t table[1u << 10];

void make_table() {
    for (uint32_t idx = 0; idx < (1u << 10); idx++) {
        uint32_t c = idx;
        for (int k = 0; k < 10; k++) {
            c = c & 1 ? (c >> 1) ^ 0x23000 : c >> 1;
        }
        table[idx] = (c >> 2);
    }
}

void crc_sdi(uint32_t* crcs, const uint16_t* data, size_t n) {
    uint32_t c = crcs[0];
    uint32_t y = crcs[1];
    for (size_t i = 0; i < n; i += 2) {
        c ^= data[i];
        y ^= data[i+1];
        c = (table[c & 0x3ff] << 2) ^ (c >> 10);
        y = (table[y & 0x3ff] << 2) ^ (y >> 10);
    }
    crcs[0] = c;
    crcs[1] = y;
}

Version    | Broadwell | Skylake | Ice Lake
2KiB table | 2.10      | 2.15    | 2.05

To do better, some mathematical tricks need to be pulled. The polynomial ring ℤ2[x] is relevant, along with an isomorphism between bit strings and said polynomials. The original code can be re-expressed using some notation borrowed from polynomials:

void crc_sdi(polynomial* crcs, const polynomial* data, size_t n) {
    polynomial c = crcs[0];
    polynomial y = crcs[1];
    for (size_t i = 0; i < n; i += 2) {
        c += data[i]   * x^8;
        y += data[i+1] * x^8;
        for (int k = 0; k < 10; k++) {
            c = (c * x^1) mod P; // P is x^18 + x^5 + x^4 + x^0
            y = (y * x^1) mod P;
        }
    }
    crcs[0] = c;
    crcs[1] = y;
}

The first mathematical trick is eliminating the inner loop:

void crc_sdi(polynomial* crcs, const polynomial* data, size_t n) {
    polynomial c = crcs[0];
    polynomial y = crcs[1];
    for (size_t i = 0; i < n; i += 2) {
        c = (c * x^10 + data[i]   * x^18) mod P;
        y = (y * x^10 + data[i+1] * x^18) mod P;
    }
    crcs[0] = c;
    crcs[1] = y;
}

The next mathematical trick is pulling * x^18 and mod P out of the loop:

void crc_sdi(polynomial* crcs, const polynomial* data, size_t n) {
    polynomial c = crcs[0] * x^-18;
    polynomial y = crcs[1] * x^-18;
    for (size_t i = 0; i < n; i += 2) {
        c = c * x^10 + data[i];
        y = y * x^10 + data[i+1];
    }
    crcs[0] = (c * x^18) mod P;
    crcs[1] = (y * x^18) mod P;
}

At this point, a step back towards practical implementation is required, which takes the form of unrolling the loop 12 times:

void crc_sdi(polynomial* crcs, const polynomial* data, size_t n) {
    polynomial c = crcs[0] * x^-18;
    polynomial y = crcs[1] * x^-18;
    for (size_t i = 0; i < n; i += 24) {
        c = c * x^120 + pack120(data + i);
        y = y * x^120 + pack120(data + i + 1);
    }
    crcs[0] = (c * x^18) mod P;
    crcs[1] = (y * x^18) mod P;
}

polynomial pack120(const polynomial* data) { return pack60(data) * x^60 + pack60(data + 12); }

polynomial pack60(const polynomial* data) { polynomial result = 0; for (size_t i = 0; i < 12; i += 2) { result = result * x^10 + data[i]; } return result; }

The * x^60 in the middle of pack120 is slightly awkward, and would be more convenient as * x^64. This can be achieved by shuffling a few things around:

void crc_sdi(polynomial* crcs, const polynomial* data, size_t n) {
    polynomial c = crcs[0] * x^-14;
    polynomial y = crcs[1] * x^-14;
    for (size_t i = 0; i < n; i += 24) {
        c = c * x^120 + pack120(data + i);
        y = y * x^120 + pack120(data + i + 1);
    }
    crcs[0] = (c * x^14) mod P;
    crcs[1] = (y * x^14) mod P;
}

polynomial pack120(const polynomial* data) { return pack60(data) * x^64 + pack60(data + 12) * x^4; }

polynomial pack60(const polynomial* data) { polynomial result = 0; for (size_t i = 0; i < 12; i += 2) { result = result * x^10 + data[i]; } return result; }

To continue back towards a practical implementation, the size of each polynomial variable needs to be bounded. For that, I'll introduce the semi-mod operator, where for a polynomial F, (F * x^n) semi-mod P is defined as (F mod x^64) * (x^n mod P) + (F div x^64) * (x^(n+64) mod P). This is then tactically deployed in a few places:

void crc_sdi(polynomial* crcs, const polynomial* data, size_t n) {
    polynomial c = (crcs[0] * x^-14) semi-mod P;
    polynomial y = (crcs[1] * x^-14) semi-mod P;
    for (size_t i = 0; i < n; i += 24) {
        c = (c * x^120) semi-mod P;
        y = (y * x^120) semi-mod P;
        c = c + pack120(data + i);
        y = y + pack120(data + i + 1);
    }
    c = (c * x^14) semi-mod P;
    y = (y * x^14) semi-mod P;
    crcs[0] = c mod P;
    crcs[1] = y mod P;
}

polynomial pack120(const polynomial* data) { return pack60(data) * x^64 + pack60(data + 12) * x^4; }

polynomial pack60(const polynomial* data) { polynomial result = 0; for (size_t i = 0; i < 12; i += 2) { result = result * x^10 + data[i]; } return result; }

The next transformation is unrolling pack60:

void crc_sdi(polynomial* crcs, const polynomial* data, size_t n) {
    polynomial c = (crcs[0] * x^-14) semi-mod P;
    polynomial y = (crcs[1] * x^-14) semi-mod P;
    for (size_t i = 0; i < n; i += 24) {
        c = (c * x^120) semi-mod P;
        y = (y * x^120) semi-mod P;
        c = c + pack120(data + i);
        y = y + pack120(data + i + 1);
    }
    c = (c * x^14) semi-mod P;
    y = (y * x^14) semi-mod P;
    crcs[0] = c mod P;
    crcs[1] = y mod P;
}

polynomial pack120(const polynomial* data) { return pack60(data) * x^64 + pack60(data + 12) * x^4; }

polynomial pack60(const polynomial* data) { return data[ 0] * x^50 + data[ 2] * x^40 + data[ 4] * x^30 + data[ 6] * x^20 + data[ 8] * x^10 + data[10]; }

Moving back toward the practical realm, pack60 can return a 64-bit value. This value will be a polynomial in reversed bit order, meaning that the MSB represents x^0 and the LSB represents x^63. pack60 thus becomes:

uint64_t pack60(const uint16_t* data) {
    return (((uint64_t)data[ 0]) <<  4) ^
           (((uint64_t)data[ 2]) << 14) ^
           (((uint64_t)data[ 4]) << 24) ^
           (((uint64_t)data[ 6]) << 34) ^
           (((uint64_t)data[ 8]) << 44) ^
           (((uint64_t)data[10]) << 54);
}

Next up is pack120, which can return a 128-bit value. Again, this value will be a polynomial in reversed bit order, so the MSB represents x^0 and the LSB represents x^127. pack120 thus becomes:

__m128i pack120(const uint16_t* data) {
    return _mm_set_epi64x(pack60(data + 12) >> 4, pack60(data));
}

The c and y polynomials within the main function can also be 128-bit values, in the same reversed bit order. At this point, _mm_clmulepi64_si128 needs to be introduced, which can perform multiplication of two 64-bit polynomials, either in regular bit order or in reversed bit order. For polynomials F and G in regular bit order, it performs F * G with the result in regular order. For polynomials F and G in reversed bit order, it performs F * G * x^1 with the result in reversed order. Accordingly, _mm_clmulepi64_si128 can be used to implement the semi-mod operator. It can also be used to implement mod P. It is neither short nor readable, but the resultant code is:

__m128i xor_clmul(__m128i a, __m128i b) {
    return _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x00),
                         _mm_clmulepi64_si128(a, b, 0x11));
}

void crc_sdi(uint32_t* crcs, const uint16_t* data, size_t n) {
    __m128i c = _mm_cvtsi32_si128(crcs[0]);
    __m128i y = _mm_cvtsi32_si128(crcs[1]);
    { // *= x^-14 semi-mod P
        __m128i k = _mm_cvtsi32_si128(
            0x9f380000 /* x^-14-(64-18)-32-1 mod P */);
        c = _mm_clmulepi64_si128(c, k, 0x00);
        y = _mm_clmulepi64_si128(y, k, 0x00);
    }
    for (size_t i = 0; i < n; i += 24) {
        { // *= x^120 semi-mod P
            __m128i k = _mm_setr_epi32(
                0, 0x4b334000 /* x^120+64-1 mod P */,
                0, 0x96d30000 /* x^120-1    mod P */);
            c = xor_clmul(c, k);
            y = xor_clmul(y, k);
        }
        { // +=
            c = _mm_xor_si128(c, pack120(data + i));
            y = _mm_xor_si128(y, pack120(data + i + 1));
        }
    }
    { // *= x^14 semi-mod P
        __m128i k = _mm_setr_epi32(
            0, 0x14980000 /* x^14+64-1 mod P */,
            0, 0x00040000 /* x^14-1    mod P */);
        c = xor_clmul(c, k);
        y = xor_clmul(y, k);
    }
    { // mod P
        __m128i k = _mm_setr_epi32( /* x^128-1 div P */
            0x14980559, 0x4c9bb5d5,
            0x80040000, 0x5e405011);
        c = _mm_xor_si128(_mm_srli_si128(xor_clmul(c, k), 8),
                          _mm_clmulepi64_si128(c, k, 0x01));
        y = _mm_xor_si128(_mm_srli_si128(xor_clmul(y, k), 8),
                          _mm_clmulepi64_si128(y, k, 0x01));
        __m128i P = _mm_cvtsi32_si128(0x46001 /* P */);
        c = _mm_clmulepi64_si128(c, P, 0x00);
        y = _mm_clmulepi64_si128(y, P, 0x00);
    }
    crcs[0] = _mm_cvtsi128_si32(c);
    crcs[1] = _mm_cvtsi128_si32(y);
}

This version comes in at approximately 11 bits per cycle:

Version              | Broadwell | Skylake | Ice Lake
_mm_clmulepi64_si128 | 10.8      | 11.0    | 12.8
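
As an aside on the x^1 factor mentioned above, a minimal standalone sketch (the values are arbitrary illustrations, not CRC constants) shows where the extra x^1 comes from when both inputs are in reversed bit order:

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

int main() {
    // In reversed bit order over 64 bits, bit 63 is x^0, so bit 60 is x^3
    // and bit 58 is x^5.
    __m128i f = _mm_set_epi64x(0, 1ull << 60); // x^3
    __m128i g = _mm_set_epi64x(0, 1ull << 58); // x^5
    __m128i p = _mm_clmulepi64_si128(f, g, 0x00);
    // The carry-less product has bit 118 set, i.e. bit 54 of the high half.
    // Read in reversed bit order over 128 bits (bit 127 is x^0), bit 118 is
    // x^9 = x^3 * x^5 * x^1, hence the extra factor of x^1.
    uint64_t hi = (uint64_t)_mm_extract_epi64(p, 1);
    printf("%d\n", hi == (1ull << 54)); // prints 1
    return 0;
}

(Build with -mpclmul -msse4.1, or just -march=native.)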

This is good, but we can do better still. At this point, all the mathematical tricks have been pulled, so the next avenue of exploration is vectorisation. In particular, there are four pack60 calls per iteration, and it should be possible to perform two calls at once by operating at 128-bit vector width:

__m128i pmovzxwq(const uint16_t* data) {
    __m128i out;
    asm("vpmovzxwq %1, %0" : "=x"(out) :
                              "m"(*(const uint32_t*)data));
    return out;
}

__m128i pack60x2(const uint16_t* data) {
    __m128i result;
    result  = _mm_slli_epi64(pmovzxwq(data     ),  4);
    result ^= _mm_slli_epi64(pmovzxwq(data +  2), 14);
    result ^= _mm_slli_epi64(pmovzxwq(data +  4), 24);
    result ^= _mm_slli_epi64(pmovzxwq(data +  6), 34);
    result ^= _mm_slli_epi64(pmovzxwq(data +  8), 44);
    result ^= _mm_slli_epi64(pmovzxwq(data + 10), 54);
    return result;
}

void crc_sdi_pack(uint32_t* crcs, const uint16_t* data, size_t n) {
...
        { // +=
            __m128i lo = pack60x2(data + i);
            __m128i hi = _mm_srli_epi64(pack60x2(data + i + 12), 4);
            c = _mm_xor_si128(c, _mm_unpacklo_epi64(lo, hi));
            y = _mm_xor_si128(y, _mm_unpackhi_epi64(lo, hi));
        }
...
}

This gives an extra 0.7 bits per cycle on Broadwell. On Skylake and Ice Lake, it improves things by more than 2 bits per cycle:

Version  | Broadwell | Skylake | Ice Lake
pack60x2 | 11.5      | 13.2    | 15.2

From here, a few shift instructions can be eliminated by having two variants of pack60x2:

__m128i pack60x2_4(const uint16_t* data) {
    __m128i result;
    result  = _mm_slli_epi64(pmovzxwq(data     ),  4);
    result ^= _mm_slli_epi64(pmovzxwq(data +  2), 14);
    result ^= _mm_slli_epi64(pmovzxwq(data +  4), 24);
    result ^= _mm_slli_epi64(pmovzxwq(data +  6), 34);
    result ^= _mm_slli_epi64(pmovzxwq(data +  8), 44);
    result ^= _mm_slli_epi64(pmovzxwq(data + 10), 54);
    return result;
}

__m128i pack60x2_0(const uint16_t* data) {
    __m128i result;
    result  =                pmovzxwq(data     );
    result ^= _mm_slli_epi64(pmovzxwq(data +  2), 10);
    result ^= _mm_slli_epi64(pmovzxwq(data +  4), 20);
    result ^= _mm_slli_epi64(pmovzxwq(data +  6), 30);
    result ^= _mm_slli_epi64(pmovzxwq(data +  8), 40);
    result ^= _mm_slli_epi64(pmovzxwq(data + 10), 50);
    return result;
}

void crc_sdi_pack(uint32_t* crcs, const uint16_t* data, size_t n) {
...
        { // +=
            __m128i lo = pack60x2_4(data + i);
            __m128i hi = pack60x2_0(data + i + 12);
            c = _mm_xor_si128(c, _mm_unpacklo_epi64(lo, hi));
            y = _mm_xor_si128(y, _mm_unpackhi_epi64(lo, hi));
        }
...
}

This makes basically no difference on Skylake, but gives a minor improvement on both Broadwell and Ice Lake:

Version                   | Broadwell | Skylake | Ice Lake
pack60x2_4 and pack60x2_0 | 12.6      | 13.2    | 15.9

At this point it is worth reviewing exactly which micro-ops are being executed per iteration of the loop (which is now ingesting 240 bits per iteration):

Instruction                  | Count | BDW μops | SKL μops | ICL μops
_mm_clmulepi64_si128 x, x, i | 4     | p0       | p5       | p5
pmovzxwq x, m                | 12    | p5 + p23 | p5 + p23 | p15 + p23
_mm_slli_epi64 x, i          | 11    | p0       | p01      | p01
_mm_xor_si128 x, x           | 14    | p015     | p015     | p015
_mm_unpack(lo|hi)_epi64 x, x | 2     | p5       | p5       | p15

On Skylake, we're bound by port 5 (which is why eliminating the shifts didn't help): 240 bits require 18 μops executed by port 5, and 240 bits divided by 18 cycles is 13.3 bits per cycle, which is pretty much the measured 13.2 bits per cycle. Things are slightly less neat on Broadwell and Ice Lake, as we're hitting suboptimal dispatch of μops to execution ports, causing us to be bound on a port even though clairvoyant dispatch could do slightly better. Broadwell could be described as being bound on port 0 (due to some xor μops being dispatched to port 0, even though it would be better to dispatch them all to port 1), and Ice Lake could be described as being bound on a mix of port 1 and port 5 (due to some xor μops not being dispatched to port 0).

To get past this bound, some work needs to be removed from port 5. Each pair of pmovzxwq and _mm_slli_epi64 causes a memory μop on port 2 or 3, a shuffle μop on port 5 (or port 1 on Ice Lake), and an arithmetic μop on port 0 (or port 1 on Skylake / Ice Lake). The shuffle μop can be removed by using a different pair of instructions: broadcastss (one memory μop) followed by _mm_madd_epi16 against a constant (one arithmetic μop). This different pair can only be used when the shift amount, modulo 32, is between 0 and 14 (inclusive), as the constant values in question are int16_t values. Half of the cases satisfy this constraint on the shift amount, and can thus be replaced:

__m128i broadcastss(const uint16_t* data) {
    __m128i result;
    asm("vbroadcastss %1, %0" : "=x"(result) :
                                 "m"(*(const uint32_t*)data));
    return result;
}

__m128i pack60x2_4(const uint16_t* data) {
    __m128i shift4  = _mm_setr_epi32(1u <<  4, 0, 1u << ( 4+16), 0);
    __m128i shift14 = _mm_setr_epi32(1u << 14, 0, 1u << (14+16), 0);
    __m128i shift34 = _mm_setr_epi32(0, 1u <<  2, 0, 1u << ( 2+16));
    __m128i shift44 = _mm_setr_epi32(0, 1u << 12, 0, 1u << (12+16));
    __m128i result;
    result  = _mm_madd_epi16(broadcastss(data     ), shift4);
    result ^= _mm_madd_epi16(broadcastss(data +  2), shift14);
    result ^= _mm_slli_epi64(   pmovzxwq(data +  4), 24);
    result ^= _mm_madd_epi16(broadcastss(data +  6), shift34);
    result ^= _mm_madd_epi16(broadcastss(data +  8), shift44);
    result ^= _mm_slli_epi64(   pmovzxwq(data + 10), 54);
    return result;
}

__m128i pack60x2_0(const uint16_t* data) {
    __m128i shift10 = _mm_setr_epi32(1u << 10, 0, 1u << (10+16), 0);
    __m128i shift40 = _mm_setr_epi32(0, 1u << 8, 0, 1u << (8+16));
    __m128i result;
    result  =                   pmovzxwq(data     );
    result ^= _mm_madd_epi16(broadcastss(data +  2), shift10);
    result ^= _mm_slli_epi64(   pmovzxwq(data +  4), 20);
    result ^= _mm_slli_epi64(   pmovzxwq(data +  6), 30);
    result ^= _mm_madd_epi16(broadcastss(data +  8), shift40);
    result ^= _mm_slli_epi64(   pmovzxwq(data + 10), 50);
    return result;
}

This gives a nice improvement across the board, getting us to at least 18 bits per cycle on Skylake and Ice Lake:

Version                      | Broadwell | Skylake | Ice Lake
broadcastss + _mm_madd_epi16 | 13.5      | 18.0    | 18.8

A few more pairs of pmovzxwq and _mm_slli_epi64 can be replaced with broadcastss and _mm_madd_epi16, albeit at the cost of one extra arithmetic μop for every two removed shuffle μops:

__m128i pack60x2_4(const uint16_t* data) {
    __m128i shift4  = _mm_setr_epi32(1u <<  4, 0, 1u << ( 4+16), 0);
    __m128i shift14 = _mm_setr_epi32(1u << 14, 0, 1u << (14+16), 0);
    __m128i shift34 = _mm_setr_epi32(0, 1u <<  2, 0, 1u << ( 2+16));
    __m128i shift44 = _mm_setr_epi32(0, 1u << 12, 0, 1u << (12+16));
    __m128i result, hi;
    result  = _mm_madd_epi16(broadcastss(data     ), shift4);
    result ^= _mm_madd_epi16(broadcastss(data +  2), shift14);
    hi      = _mm_madd_epi16(broadcastss(data +  4), shift4);
    result ^= _mm_madd_epi16(broadcastss(data +  6), shift34);
    result ^= _mm_madd_epi16(broadcastss(data +  8), shift44);
    hi     ^= _mm_madd_epi16(broadcastss(data + 10), shift34);
    result ^= _mm_slli_epi64(hi, 20);
    return result;
}

__m128i pack60x2_0(const uint16_t* data) {
    __m128i shift10 = _mm_setr_epi32(1u << 10, 0, 1u << (10+16), 0);
    __m128i shift40 = _mm_setr_epi32(0, 1u << 8, 0, 1u << (8+16));
    __m128i result, hi;
    result  =                   pmovzxwq(data     );
    result ^= _mm_madd_epi16(broadcastss(data +  2), shift10);
    hi      =                   pmovzxwq(data +  4);
    hi     ^= _mm_madd_epi16(broadcastss(data +  6), shift10);
    result ^= _mm_slli_epi64(hi, 20);
    result ^= _mm_madd_epi16(broadcastss(data +  8), shift40);
    result ^= _mm_slli_epi64(   pmovzxwq(data + 10), 50);
    return result;
}

This is a minor regression on Broadwell (because at this point we're bound by port 0, and that is where the extra arithmetic μop goes), and a small improvement on Skylake and Ice Lake (where the bound is shuffle rather than arithmetic):

Version                           | Broadwell | Skylake | Ice Lake
More broadcastss + _mm_madd_epi16 | 13.2      | 18.8    | 19.7

To break through the barrier of 20 bits per cycle, AVX512's ternary bitwise instruction can be used to merge together pairs of xor instructions:

__m128i xor3(__m128i a, __m128i b, __m128i c) {
    return _mm_ternarylogic_epi64(a, b, c, 0x96);
}

__m128i pack60x2_4(const uint16_t* data) {
    __m128i shift4  = _mm_setr_epi32(1u <<  4, 0, 1u << ( 4+16), 0);
    __m128i shift14 = _mm_setr_epi32(1u << 14, 0, 1u << (14+16), 0);
    __m128i shift34 = _mm_setr_epi32(0, 1u <<  2, 0, 1u << ( 2+16));
    __m128i shift44 = _mm_setr_epi32(0, 1u << 12, 0, 1u << (12+16));
    __m128i result, hi;
    result = xor3(_mm_madd_epi16(broadcastss(data     ), shift4),
                  _mm_madd_epi16(broadcastss(data +  2), shift14),
                  _mm_madd_epi16(broadcastss(data +  6), shift34));
    hi     =      _mm_madd_epi16(broadcastss(data +  4), shift4)
           ^      _mm_madd_epi16(broadcastss(data + 10), shift34);
    result = xor3(result,
                  _mm_madd_epi16(broadcastss(data +  8), shift44),
                  _mm_slli_epi64(hi, 20));
    return result;
}

__m128i pack60x2_0(const uint16_t* data) {
    __m128i shift10 = _mm_setr_epi32(1u << 10, 0, 1u << (10+16), 0);
    __m128i shift40 = _mm_setr_epi32(0, 1u << 8, 0, 1u << (8+16));
    __m128i result, hi;
    hi     =                        pmovzxwq(data +  4)
           ^      _mm_madd_epi16(broadcastss(data +  6), shift10);
    result = xor3(                  pmovzxwq(data     ),
                  _mm_madd_epi16(broadcastss(data +  2), shift10),
                  _mm_slli_epi64(hi, 20));
    result = xor3(result,
                  _mm_madd_epi16(broadcastss(data +  8), shift40),
                  _mm_slli_epi64(   pmovzxwq(data + 10), 50));
    return result;
}

void crc_sdi(uint32_t* crcs, const uint16_t* data, size_t n) {
    __m128i c = _mm_cvtsi32_si128(crcs[0]);
    __m128i y = _mm_cvtsi32_si128(crcs[1]);
    { // *= x^-14 semi-mod P
        __m128i k = _mm_cvtsi32_si128(
            0x9f380000 /* x^-14-(64-18)-32-1 mod P */);
        c = _mm_clmulepi64_si128(c, k, 0x00);
        y = _mm_clmulepi64_si128(y, k, 0x00);
    }
    for (size_t i = 0; i < n; i += 24) {
        // *= x^120 semi-mod P
        // +=
        __m128i lo = pack60x2_4(data + i);
        __m128i hi = pack60x2_0(data + i + 12); 
        __m128i k = _mm_setr_epi32(
            0, 0x4b334000 /* x^120+64-1 mod P */,
            0, 0x96d30000 /* x^120-1    mod P */);
        c = xor3(_mm_clmulepi64_si128(c, k, 0x00),
                 _mm_clmulepi64_si128(c, k, 0x11),
                 _mm_unpacklo_epi64(lo, hi));
        y = xor3(_mm_clmulepi64_si128(y, k, 0x00),
                 _mm_clmulepi64_si128(y, k, 0x11),
                 _mm_unpackhi_epi64(lo, hi));
    }
    { // *= x^14 semi-mod P
        __m128i k = _mm_setr_epi32(
            0, 0x14980000 /* x^14+64-1 mod P */,
            0, 0x00040000 /* x^14-1    mod P */);
        c = xor_clmul(c, k);
        y = xor_clmul(y, k);
    }
    { // mod P
        __m128i k = _mm_setr_epi32( /* x^128-1 div P */
            0x14980559, 0x4c9bb5d5,
            0x80040000, 0x5e405011);
        c = _mm_xor_si128(_mm_srli_si128(xor_clmul(c, k), 8),
                          _mm_clmulepi64_si128(c, k, 0x01));
        y = _mm_xor_si128(_mm_srli_si128(xor_clmul(y, k), 8),
                          _mm_clmulepi64_si128(y, k, 0x01));
        __m128i P = _mm_cvtsi32_si128(0x46001 /* P */);
        c = _mm_clmulepi64_si128(c, P, 0x00);
        y = _mm_clmulepi64_si128(y, P, 0x00);
    }
    crcs[0] = _mm_cvtsi128_si32(c);
    crcs[1] = _mm_cvtsi128_si32(y);
}

AVX512 isn't available on Broadwell, nor on the client variant of Skylake, but where it is available it pushes us well above 20 bits per cycle:

Version                | Broadwell | Skylake | Ice Lake
_mm_ternarylogic_epi64 | N/A       | 21.9    | 23.5

To recap the journey, when measuring bits per cycle, we've got a 25x overall improvement on Broadwell, more than 40x overall improvement on Skylake, and nearly a 50x overall improvement on Ice Lake:

Version                           | Broadwell | Skylake | Ice Lake
Starting point                    | 0.536     | 0.528   | 0.477
2KiB table                        | 2.10      | 2.15    | 2.05
_mm_clmulepi64_si128              | 10.8      | 11.0    | 12.8
pack60x2                          | 11.5      | 13.2    | 15.2
pack60x2_4 and pack60x2_0         | 12.6      | 13.2    | 15.9
broadcastss + _mm_madd_epi16      | 13.5      | 18.0    | 18.8
More broadcastss + _mm_madd_epi16 | 13.2      | 18.8    | 19.7
_mm_ternarylogic_epi64            | N/A       | 21.9    | 23.5

Contrasting Intel AMX and Apple AMX

Intel has an x64 instruction set extension called AMX, while Apple has a totally different aarch64 instruction set extension that is also called AMX.

Register file

Intel AMX: 8 tmm registers, each a 16 by 16 matrix of 32-bit elements (technically, each can be configured to be any size - square or rectangular - between 1 by 1 and 16 by 16, though element size is fixed at 32 bits regardless). Total architectural state 8 kilobytes.

Apple AMX: Total architectural state 5 kilobytes, broken down as 8 x registers of 64 bytes each, 8 y registers of 64 bytes each, and 64 z registers of 64 bytes each.

The vectors have 8/16/32/64-bit elements, like regular SIMD registers. Note that Intel AMX does not need vector registers, as Intel already has AVX512 with 64-byte vectors (32 of which are in the AVX512 architectural register file).

Data types

Intel AMX: Multiplicands are always 32-bit, either i8[4] or u8[4] or bf16[2], combined via dot product. Accumulators are always 32-bit, either i32 or u32 or f32.

Apple AMX: Multiplicands are scalars, any of i8 / u8 / i16 / u16 / f16 / f32 / f64. Accumulators are any of i16 / u16 / i32 / u32 / f16 / f32 / f64. Note f16 (i.e. IEEE 754 half-precision with 5-bit exponent and 10-bit fraction) rather than bf16 (8-bit exponent, 7-bit fraction), though bf16 support was added on M2 and later.

Computational operations

Intel AMX: Matrix multiplication of any two tmm registers, accumulating onto a third tmm register. For the multiplication, matrix elements are themselves (very small) vectors, combined via dot product. This is the only operation. Viewed differently, this is doing 16×64 by 64×16 (int8) or 16×32 by 32×16 (bf16) matmul, then adding onto a 16×16 matrix.

Apple AMX: Outer product of any x register with any y register (viewed as a column vector), accumulating onto any (matrix view) z register. For the multiplication, x / y elements are scalars (depending on the data type, this might be viewed as doing 16×1 by 1×16 matmul then adding onto a 16×16 matrix). Alternatively, pointwise product of any x register with any y register (viewed as a row vector), accumulating onto any (vector view) z register. Many more operations as well, though the general theme is {outer or pointwise} {multiplication or addition or subtraction}, possibly followed by right-shift, possibly followed by integer saturation. The most exotic exceptions to the theme are min / max / popcnt.
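
To make the Intel side concrete, here is a minimal sketch of one such accumulation, C (16×16, int32) += A (16×64, int8) * B (64×16, int8), using the AMX intrinsics. The struct mirrors the tile configuration format, the tile numbers and the function name are arbitrary choices for this sketch, and it assumes a compiler given -mamx-tile -mamx-int8 plus an OS that has already granted AMX permission (on Linux, via arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)):

#include <stdint.h>
#include <string.h>
#include <immintrin.h>

typedef struct {
    uint8_t  palette_id;   // palette 1 is the only one currently defined
    uint8_t  start_row;
    uint8_t  reserved[14];
    uint16_t colsb[16];    // bytes per row, per tile register
    uint8_t  rows[16];     // rows, per tile register
} tile_config;

// C[16][16] (int32) += A[16][64] (int8) * B_packed (int8).
// B_packed is the 64x16 B matrix re-laid-out as 16 rows of 64 bytes, where
// row k holds, for each of the 16 columns, the four consecutive-in-K values
// B[4k..4k+3][column] - the layout the dot-product instruction expects.
void amx_matmul_16x64x16(int32_t C[16][16],
                         const int8_t A[16][64],
                         const int8_t B_packed[16][64]) {
    _Alignas(64) tile_config cfg;
    memset(&cfg, 0, sizeof(cfg));
    cfg.palette_id = 1;
    for (int t = 0; t < 3; t++) { cfg.colsb[t] = 64; cfg.rows[t] = 16; }
    _tile_loadconfig(&cfg);

    _tile_loadd(0, C, 64);        // tmm0 = accumulator (16 rows of 64 bytes)
    _tile_loadd(1, A, 64);        // tmm1 = A
    _tile_loadd(2, B_packed, 64); // tmm2 = B (packed)
    _tile_dpbssd(0, 1, 2);        // tmm0 += tmm1 * tmm2 (4-way int8 dot products)
    _tile_stored(0, C, 64);
    _tile_release();
}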

Memory operations

Intel AMX: Matrix load or store (up to 1 kilobyte), configurable with a base address (register + immediate offset) and a row stride (register or zero, optionally shifted left by 1-3 bits).

Apple AMX: Vector load or store (64 bytes), configurable with a base address (register). Also load or store pair (128 bytes), though the two registers must be consecutive, and the row stride is fixed at 64 bytes, and the base address must be 128-byte aligned. Loads or stores with y effectively give a free vector transpose, as y registers can be viewed as column vectors.

Masking modes

Intel AMX: Each tmm register can be configured to any size - square or rectangular - between 1 by 1 and 16 by 16. This is (mostly) equivalent to saying that a tmm register is always 16 by 16, but has an associated mask on each dimension to only enable some number of leading rows and columns. These per-register masks are architectural state.

Apple AMX: Per-dimension masking is available on a per-instruction basis (though notably not for loads / stores). Available masking modes are: all rows (columns), even/odd rows (columns) only, first N rows (columns) only, last N rows (columns) only, row (column) N only.

Note that both of these approaches are different to Intel's AVX512 approach, which is a separate register file containing 8 mask registers (k0 through k7) and every operation optionally taking a mask register as an input.

Other

Apple AMX contains a very interesting instruction called genlut. In the forward direction ("lookup"), it is somewhere between AVX512's vpshufb and vgatherps. In the backward direction ("generate") it is something like an inverse vpshufb, performing arbitrary 2/3/4/5-bit quantisation. When used in both directions, it can be useful for piecewise linear interpolation, or as an alternative to AVX512's vfpclassps / vfixupimmps.

Computing parity using CRC

It is a party trick rather than anything practically useful, but the following four functions all compute the same thing:

uint8_t parity_b(const uint64_t* data, size_t n) {
    uint8_t acc = 0;
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < 64; ++j) {
            if (data[i] & (1ull << j)) acc ^= 1;
        }
    }
    return acc;
}
uint8_t parity_p(const uint64_t* data, size_t n) {
    uint8_t acc = 0;
    for (size_t i = 0; i < n; ++i) {
        acc += _mm_popcnt_u64(data[i]);
    }
    return acc & 1;
}
uint8_t parity_x(const uint64_t* data, size_t n) {
    uint64_t acc = 0;
    for (size_t i = 0; i < n; ++i) {
        acc ^= data[i];
    }
    return _mm_popcnt_u64(acc) & 1;
}
uint8_t parity_c(const uint64_t* data, size_t n) {
    uint32_t acc = 0;
    for (size_t i = 0; i < n; ++i) {
        acc = _mm_crc32_u64(acc, data[i]);
    }
    return _mm_popcnt_u64(acc) & 1;
}

Going from parity_b to parity_p is an easy step: replace the inner loop with a popcnt. Going from parity_p to parity_x is also an easy step: lift the popcnt out of the loop. Hence it is not too surprising that parity_b and parity_p and parity_x all compute the same thing. The parity_c function is like parity_x, except replacing ^= with crc, and the surprising thing is that parity_c computes the same thing as all the other functions. To understand why, we need to look at the polynomial used by the crc instruction (the CRC32-C polynomial):

x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 + x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + 1

This polynomial can be expressed as the product of two separate polynomials:

(x + 1)(x^31 + x^30 + x^29 + x^28 + x^26 + x^24 + x^23 + x^21 + x^20 + x^18 + x^13 + x^10 + x^8 + x^5 + x^4 + x^3 + x^2 + x^1 + 1)
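
As a quick check of this factorisation, carry-less multiplying the two factors reproduces the CRC32-C polynomial. In the sketch below, each constant encodes a polynomial with bit i holding the coefficient of x^i:

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

int main() {
    __m128i a = _mm_set_epi64x(0, 0x3);        // x + 1
    __m128i b = _mm_set_epi64x(0, 0xf5b4253f); // the degree-31 factor above
    __m128i p = _mm_clmulepi64_si128(a, b, 0x00);
    uint64_t prod = (uint64_t)_mm_cvtsi128_si64(p);
    printf("0x%llx\n", (unsigned long long)prod); // prints 0x11edc6f41
    return 0;
}

The printed value is the polynomial from above, with its x^32 term as the leading 1.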

Accordingly, CRC32-C isn't really a 32-bit checksum; it is really two separate things packed into 32 bits: a 1-bit checksum and a 31-bit checksum. The method of packing is slightly funky, and thus the method of unpacking is also slightly funky: from a mathematical point of view, to get the 1-bit checksum out of the 32-bit value, we need to view the 32-bit value as a ℤ2 polynomial and then compute the remainder modulo x + 1. This is equivalent to iteratively applying the reduction rule x = 1 until nothing more can be reduced. If we were to start with the 4-bit polynomial b3 * x^3 + b2 * x^2 + b1 * x + b0 and iteratively apply x = 1, we'd get b3 * 1*1*1 + b2 * 1*1 + b1 * 1 + b0, which is just b3 + b2 + b1 + b0, which is just another way of saying popcnt (mod 2). The same thing is true starting with a 32-bit polynomial: the remainder modulo x + 1 is just popcnt (mod 2). In other words, we can use popcnt to unpack the 1-bit checksum out of the 32-bit value, and a 1-bit checksum is just parity. Hence why parity_c computes the same thing as all the other functions.

This combination of a 1-bit checksum and a 31-bit checksum can be seen as a good thing for checksumming purposes, as the two parts protect against different problems. Notably, every bit-flip in the data being checksummed will invert the 1-bit checksum, and hence any odd number of bit-flips will cause the 1-bit checksum to mismatch. The (ahem) flip-side is that the 1-bit checksum will match for any even number of bit-flips, which is where the 31-bit checksum comes in: it needs to protect against as many of those even-numbered cases as possible.
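
For completeness, a small harness along these lines (assuming the four functions above are in scope, and compiling with -msse4.2 or -march=native) confirms that parity_c agrees with parity_x on arbitrary inputs:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main() {
    uint64_t data[64];
    for (int trial = 0; trial < 1000; trial++) {
        for (size_t i = 0; i < 64; i++) {
            data[i] = ((uint64_t)rand() << 32) ^ (uint64_t)rand();
        }
        if (parity_x(data, 64) != parity_c(data, 64)) {
            printf("mismatch on trial %d\n", trial);
            return 1;
        }
    }
    printf("all trials agree\n");
    return 0;
}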

Polynomial remainders by direct computation

Last time, we had a need to compute mod_P(A * x^32) where A was a polynomial with degree at most 31, and mod_P took the remainder after dividing by a polynomial P with degree 32. The traditional Barrett reduction approach involved finding the quotient and then calculating the remainder from that:

Q = ((A * x^32) * (x^63 // P)) // x^63
R =  (A * x^32) - Q * P
return R

Via a combination of the * x^32, and the degree of P being 32, and the context being ℤ2 polynomials, this simplified to:

Q = ((A * x^32) * (x^63 // P)) // x^63
return (Q * P)[0:32]

The resultant C code (for bit-reversed polynomials) was very short:

uint32_t mul_x32_mod_P(uint32_t A) {
    __m128i k = _mm_setr_epi32(0, 0xdea713f1, 0x05ec76f1, 1);
    __m128i a = _mm_cvtsi32_si128(A);
    __m128i b = _mm_clmulepi64_si128(a, k, 0x00);
    __m128i c = _mm_clmulepi64_si128(b, k, 0x10);
    return _mm_extract_epi32(c, 2);
}

Though our particular case permitted this simplification, it is not permitted in general. In Faster Remainder by Direct Computation (D. Lemire, O. Kaser, and N. Kurz, 2018), a different approach was presented: the remainder can be computed directly, rather than computing the quotient and getting the remainder from that. Said paper is about integers, and its main Theorem 1 is effectively:

Provided that d in [1, 2^N) and n in [0, 2^N) and F ≥ N + log2(d), then:
n % d equals (((n * ceil(2^F / d)) % 2^F) * d) // 2^F.
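
To make the integer version concrete, here is a minimal sketch with N = 32 and F = 64, so that F ≥ N + log2(d) holds for any 32-bit d and the % 2^F and // 2^F steps come for free; it relies on the unsigned 128-bit type that GCC and Clang provide, and the helper names are just for this sketch:

#include <stdint.h>

// c = ceil(2^64 / d), precomputed once per divisor d (1 < d < 2^32).
uint64_t precompute_c(uint32_t d) {
    return UINT64_MAX / d + 1;
}

// n % d, computed directly as (((n * c) % 2^64) * d) // 2^64.
uint32_t direct_mod(uint32_t n, uint64_t c, uint32_t d) {
    uint64_t lowbits = c * (uint64_t)n;                  // (n * ceil(2^F / d)) % 2^F
    return (uint32_t)(((__uint128_t)lowbits * d) >> 64); // * d, then // 2^F
}

For example, with d = 10, precompute_c gives 0x199999999999999a, and direct_mod(13, 0x199999999999999aull, 10) returns 3.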

Translated from the language of integers to the language of polynomials, this theorem would be:

Provided that S, P, T are polynomials satisfying deg((S * P) // T) ≤ 0, then:
S % P equals (((S * (T // P)) % T) * P) // T.

This lets us turn % P into multiplication by T // P followed by % T followed by * P followed by // T. The % T and // T steps are trivial if T is chosen to be x^n (with n ≥ deg(S) + deg(P) to make the precondition hold), and T // P can be precomputed if P is fixed.

It turns out that the translated theorem is true, and that the proof isn't too hard. The proof builds upon the Barrett reduction property, which is:

Provided that deg(S // T) ≤ 0, S // P equals (S * (T // P)) // T.

We start with the complicated side of our (supposed) equality:

(((S * (T // P)) % T) * P) // T

We're going to be overwhelmed by parentheses if we're not careful, so I'm going to say that a sequence of mixed * and % and // operations always happens left-to-right, thus allowing some of the parentheses to be dropped:

S * (T // P) % T * P // T

From here, the % can be expanded: (F % G equals F - (F // G * G))

((S * (T // P)) - (S * (T // P) // T * T)) * P // T

Then push the - to the top level:

(S * (T // P) * P // T) - (S * (T // P) // T * T * P // T)

Making use of our deg((S * P) // T) ≤ 0 precondition, by the Barrett reduction property, (S * P) // P equals (S * P) * (T // P) // T:

(S * P // P) - (S * (T // P) // T * T * P // T)

We can then simplify away * G // G for any G:

S - (S * (T // P) // T * P)

Our deg((S * P) // T) ≤ 0 precondition implies deg(S // T) ≤ 0, so by the Barrett reduction property, S // P equals S * (T // P) // T:

S - (S // P * P)

Which is the definition of:

S % P

Thus our supposed equality is an actual equality.
