doxygen/html/simd_8c_source.html

 /* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
 /*
  * SIMD implementation.
  *
  * ==========================(LICENSE BEGIN)============================
  *
  * Copyright (c) 2007-2010  Projet RNRT SAPHIR
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
  * without limitation the rights to use, copy, modify, merge, publish,
  * distribute, sublicense, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be
  * included in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * ===========================(LICENSE END)=============================
  *
  * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
  */

 #include <stddef.h>
 #include <string.h>
 #include <limits.h>

 #include "sph_simd.h"

 #ifdef __cplusplus
 extern "C"{
 #endif

 #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
 #define SPH_SMALL_FOOTPRINT_SIMD   1
 #endif

 #ifdef _MSC_VER
 #pragma warning (disable: 4146)
 #endif

 typedef sph_u32 u32;
 typedef sph_s32 s32;
 #define C32     SPH_C32
 #define T32     SPH_T32
 #define ROL32   SPH_ROTL32

 #define XCAT(x, y)    XCAT_(x, y)
 #define XCAT_(x, y)   x ## y

 /*
  * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
  */
 static const s32 alpha_tab[] = {
           1,  41, 139,  45,  46,  87, 226,  14,  60, 147, 116, 130,
         190,  80, 196,  69,   2,  82,  21,  90,  92, 174, 195,  28,
         120,  37, 232,   3, 123, 160, 135, 138,   4, 164,  42, 180,
         184,  91, 133,  56, 240,  74, 207,   6, 246,  63,  13,  19,
           8,  71,  84, 103, 111, 182,   9, 112, 223, 148, 157,  12,
         235, 126,  26,  38,  16, 142, 168, 206, 222, 107,  18, 224,
         189,  39,  57,  24, 213, 252,  52,  76,  32,  27,  79, 155,
         187, 214,  36, 191, 121,  78, 114,  48, 169, 247, 104, 152,
          64,  54, 158,  53, 117, 171,  72, 125, 242, 156, 228,  96,
          81, 237, 208,  47, 128, 108,  59, 106, 234,  85, 144, 250,
         227,  55, 199, 192, 162, 217, 159,  94, 256, 216, 118, 212,
         211, 170,  31, 243, 197, 110, 141, 127,  67, 177,  61, 188,
         255, 175, 236, 167, 165,  83,  62, 229, 137, 220,  25, 254,
         134,  97, 122, 119, 253,  93, 215,  77,  73, 166, 124, 201,
          17, 183,  50, 251,  11, 194, 244, 238, 249, 186, 173, 154,
         146,  75, 248, 145,  34, 109, 100, 245,  22, 131, 231, 219,
         241, 115,  89,  51,  35, 150, 239,  33,  68, 218, 200, 233,
          44,   5, 205, 181, 225, 230, 178, 102,  70,  43, 221,  66,
         136, 179, 143, 209,  88,  10, 153, 105, 193, 203,  99, 204,
         140,  86, 185, 132,  15, 101,  29, 161, 176,  20,  49, 210,
         129, 149, 198, 151,  23, 172, 113,   7,  30, 202,  58,  65,
          95,  40,  98, 163
 };

 /*
  * Ranges:
  *   REDS1: from -32768..98302 to -383..383
  *   REDS2: from -2^31..2^31-1 to -32768..98302
  */
 #define REDS1(x)    (((x) & 0xFF) - ((x) >> 8))
 #define REDS2(x)    (((x) & 0xFFFF) + ((x) >> 16))

 /*
  * If, upon entry, the values of q[] are all in the -N..N range (where
  * N >= 98302) then the new values of q[] are in the -2N..2N range.
  *
  * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
  */
 #define FFT_LOOP(rb, hk, as, id)   do { \
                 size_t u, v; \
                 s32 m = q[(rb)]; \
                 s32 n = q[(rb) + (hk)]; \
                 q[(rb)] = m + n; \
                 q[(rb) + (hk)] = m - n; \
                 u = v = 0; \
                 goto id; \
                 for (; u < (hk); u += 4, v += 4 * (as)) { \
                         s32 t; \
                         m = q[(rb) + u + 0]; \
                         n = q[(rb) + u + 0 + (hk)]; \
                         t = REDS2(n * alpha_tab[v + 0 * (as)]); \
                         q[(rb) + u + 0] = m + t; \
                         q[(rb) + u + 0 + (hk)] = m - t; \
                 id: \
                         m = q[(rb) + u + 1]; \
                         n = q[(rb) + u + 1 + (hk)]; \
                         t = REDS2(n * alpha_tab[v + 1 * (as)]); \
                         q[(rb) + u + 1] = m + t; \
                         q[(rb) + u + 1 + (hk)] = m - t; \
                         m = q[(rb) + u + 2]; \
                         n = q[(rb) + u + 2 + (hk)]; \
                         t = REDS2(n * alpha_tab[v + 2 * (as)]); \
                         q[(rb) + u + 2] = m + t; \
                         q[(rb) + u + 2 + (hk)] = m - t; \
                         m = q[(rb) + u + 3]; \
                         n = q[(rb) + u + 3 + (hk)]; \
                         t = REDS2(n * alpha_tab[v + 3 * (as)]); \
                         q[(rb) + u + 3] = m + t; \
                         q[(rb) + u + 3 + (hk)] = m - t; \
                 } \
         } while (0)

 /*
  * Output ranges:
  *   d0:   min=    0   max= 1020
  *   d1:   min=  -67   max= 4587
  *   d2:   min=-4335   max= 4335
  *   d3:   min=-4147   max=  507
  *   d4:   min= -510   max=  510
  *   d5:   min= -252   max= 4402
  *   d6:   min=-4335   max= 4335
  *   d7:   min=-4332   max=  322
  */
 #define FFT8(xb, xs, d)   do { \
                 s32 x0 = x[(xb)]; \
                 s32 x1 = x[(xb) + (xs)]; \
                 s32 x2 = x[(xb) + 2 * (xs)]; \
                 s32 x3 = x[(xb) + 3 * (xs)]; \
                 s32 a0 = x0 + x2; \
                 s32 a1 = x0 + (x2 << 4); \
                 s32 a2 = x0 - x2; \
                 s32 a3 = x0 - (x2 << 4); \
                 s32 b0 = x1 + x3; \
                 s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
                 s32 b2 = (x1 << 4) - (x3 << 4); \
                 s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
                 d ## 0 = a0 + b0; \
                 d ## 1 = a1 + b1; \
                 d ## 2 = a2 + b2; \
                 d ## 3 = a3 + b3; \
                 d ## 4 = a0 - b0; \
                 d ## 5 = a1 - b1; \
                 d ## 6 = a2 - b2; \
                 d ## 7 = a3 - b3; \
         } while (0)

 /*
  * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
  * to some shifting.
  *
  * Output: within -591471..591723
  */
 #define FFT16(xb, xs, rb)   do { \
                 s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
                 s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
                 FFT8(xb, (xs) << 1, d1_); \
                 FFT8((xb) + (xs), (xs) << 1, d2_); \
                 q[(rb) +  0] = d1_0 + d2_0; \
                 q[(rb) +  1] = d1_1 + (d2_1 << 1); \
                 q[(rb) +  2] = d1_2 + (d2_2 << 2); \
                 q[(rb) +  3] = d1_3 + (d2_3 << 3); \
                 q[(rb) +  4] = d1_4 + (d2_4 << 4); \
                 q[(rb) +  5] = d1_5 + (d2_5 << 5); \
                 q[(rb) +  6] = d1_6 + (d2_6 << 6); \
                 q[(rb) +  7] = d1_7 + (d2_7 << 7); \
                 q[(rb) +  8] = d1_0 - d2_0; \
                 q[(rb) +  9] = d1_1 - (d2_1 << 1); \
                 q[(rb) + 10] = d1_2 - (d2_2 << 2); \
                 q[(rb) + 11] = d1_3 - (d2_3 << 3); \
                 q[(rb) + 12] = d1_4 - (d2_4 << 4); \
                 q[(rb) + 13] = d1_5 - (d2_5 << 5); \
                 q[(rb) + 14] = d1_6 - (d2_6 << 6); \
                 q[(rb) + 15] = d1_7 - (d2_7 << 7); \
         } while (0)

 /*
  * Output range: |q| <= 1183446
  */
 #define FFT32(xb, xs, rb, id)   do { \
                 FFT16(xb, (xs) << 1, rb); \
                 FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
                 FFT_LOOP(rb, 16, 8, id); \
         } while (0)

 /*
  * Output range: |q| <= 2366892
  */
 #define FFT64(xb, xs, rb, id)   do { \
                 FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
                 FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
                 FFT_LOOP(rb, 32, 4, id); \
         } while (0)

 #if SPH_SMALL_FOOTPRINT_SIMD

 static void
 fft32(unsigned char *x, size_t xs, s32 *q)
 {
         size_t xd;

         xd = xs << 1;
         FFT16(0, xd, 0);
         FFT16(xs, xd, 16);
         FFT_LOOP(0, 16, 8, label_);
 }

 #define FFT128(xb, xs, rb, id)   do { \
                 fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +  0]); \
                 fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
                 FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
                 fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
                 fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
                 FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
                 FFT_LOOP(rb, 64, 2, XCAT(id, a)); \
         } while (0)

 #else

 /*
  * Output range: |q| <= 4733784
  */
 #define FFT128(xb, xs, rb, id)   do { \
                 FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
                 FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
                 FFT_LOOP(rb, 64, 2, id); \
         } while (0)

 #endif

 /*
  * For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression
  * function which does not fit in the 32 kB L1 cache of a typical x86
  * Intel. We therefore add a function call layer at the FFT64 level.
  */

 static void
 fft64(unsigned char *x, size_t xs, s32 *q)
 {
         size_t xd;

         xd = xs << 1;
         FFT32(0, xd, 0, label_a);
         FFT32(xs, xd, 32, label_b);
         FFT_LOOP(0, 32, 4, label_);
 }

 /*
  * Output range: |q| <= 9467568
  */
 #define FFT256(xb, xs, rb, id)   do { \
                 fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +   0]); \
                 fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) +  64]); \
                 FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
                 fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
                 fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
                 FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
                 FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
         } while (0)

 /*
  * alpha^(127*i) mod 257
  */
 static const unsigned short yoff_s_n[] = {
           1,  98,  95,  58,  30, 113,  23, 198, 129,  49, 176,  29,
          15, 185, 140,  99, 193, 153,  88, 143, 136, 221,  70, 178,
         225, 205,  44, 200,  68, 239,  35,  89, 241, 231,  22, 100,
          34, 248, 146, 173, 249, 244,  11,  50,  17, 124,  73, 215,
         253, 122, 134,  25, 137,  62, 165, 236, 255,  61,  67, 141,
         197,  31, 211, 118, 256, 159, 162, 199, 227, 144, 234,  59,
         128, 208,  81, 228, 242,  72, 117, 158,  64, 104, 169, 114,
         121,  36, 187,  79,  32,  52, 213,  57, 189,  18, 222, 168,
          16,  26, 235, 157, 223,   9, 111,  84,   8,  13, 246, 207,
         240, 133, 184,  42,   4, 135, 123, 232, 120, 195,  92,  21,
           2, 196, 190, 116,  60, 226,  46, 139
 };

 /*
  * alpha^(127*i) + alpha^(125*i) mod 257
  */
 static const unsigned short yoff_s_f[] = {
           2, 156, 118, 107,  45, 212, 111, 162,  97, 249, 211,   3,
          49, 101, 151, 223, 189, 178, 253, 204,  76,  82, 232,  65,
          96, 176, 161,  47, 189,  61, 248, 107,   0, 131, 133, 113,
          17,  33,  12, 111, 251, 103,  57, 148,  47,  65, 249, 143,
         189,   8, 204, 230, 205, 151, 187, 227, 247, 111, 140,   6,
          77,  10,  21, 149, 255, 101, 139, 150, 212,  45, 146,  95,
         160,   8,  46, 254, 208, 156, 106,  34,  68,  79,   4,  53,
         181, 175,  25, 192, 161,  81,  96, 210,  68, 196,   9, 150,
           0, 126, 124, 144, 240, 224, 245, 146,   6, 154, 200, 109,
         210, 192,   8, 114,  68, 249,  53,  27,  52, 106,  70,  30,
          10, 146, 117, 251, 180, 247, 236, 108
 };

 /*
  * beta^(255*i) mod 257
  */
 static const unsigned short yoff_b_n[] = {
           1, 163,  98,  40,  95,  65,  58, 202,  30,   7, 113, 172,
          23, 151, 198, 149, 129, 210,  49,  20, 176, 161,  29, 101,
          15, 132, 185,  86, 140, 204,  99, 203, 193, 105, 153,  10,
          88, 209, 143, 179, 136,  66, 221,  43,  70, 102, 178, 230,
         225, 181, 205,   5,  44, 233, 200, 218,  68,  33, 239, 150,
          35,  51,  89, 115, 241, 219, 231, 131,  22, 245, 100, 109,
          34, 145, 248,  75, 146, 154, 173, 186, 249, 238, 244, 194,
          11, 251,  50, 183,  17, 201, 124, 166,  73,  77, 215,  93,
         253, 119, 122,  97, 134, 254,  25, 220, 137, 229,  62,  83,
         165, 167, 236, 175, 255, 188,  61, 177,  67, 127, 141, 110,
         197, 243,  31, 170, 211, 212, 118, 216, 256,  94, 159, 217,
         162, 192, 199,  55, 227, 250, 144,  85, 234, 106,  59, 108,
         128,  47, 208, 237,  81,  96, 228, 156, 242, 125,  72, 171,
         117,  53, 158,  54,  64, 152, 104, 247, 169,  48, 114,  78,
         121, 191,  36, 214, 187, 155,  79,  27,  32,  76,  52, 252,
         213,  24,  57,  39, 189, 224,  18, 107, 222, 206, 168, 142,
          16,  38,  26, 126, 235,  12, 157, 148, 223, 112,   9, 182,
         111, 103,  84,  71,   8,  19,  13,  63, 246,   6, 207,  74,
         240,  56, 133,  91, 184, 180,  42, 164,   4, 138, 135, 160,
         123,   3, 232,  37, 120,  28, 195, 174,  92,  90,  21,  82,
           2,  69, 196,  80, 190, 130, 116, 147,  60,  14, 226,  87,
          46,  45, 139,  41
 };

 /*
  * beta^(255*i) + beta^(253*i) mod 257
  */
 static const unsigned short yoff_b_f[] = {
           2, 203, 156,  47, 118, 214, 107, 106,  45,  93, 212,  20,
         111,  73, 162, 251,  97, 215, 249,  53, 211,  19,   3,  89,
          49, 207, 101,  67, 151, 130, 223,  23, 189, 202, 178, 239,
         253, 127, 204,  49,  76, 236,  82, 137, 232, 157,  65,  79,
          96, 161, 176, 130, 161,  30,  47,   9, 189, 247,  61, 226,
         248,  90, 107,  64,   0,  88, 131, 243, 133,  59, 113, 115,
          17, 236,  33, 213,  12, 191, 111,  19, 251,  61, 103, 208,
          57,  35, 148, 248,  47, 116,  65, 119, 249, 178, 143,  40,
         189, 129,   8, 163, 204, 227, 230, 196, 205, 122, 151,  45,
         187,  19, 227,  72, 247, 125, 111, 121, 140, 220,   6, 107,
          77,  69,  10, 101,  21,  65, 149, 171, 255,  54, 101, 210,
         139,  43, 150, 151, 212, 164,  45, 237, 146, 184,  95,   6,
         160,  42,   8, 204,  46, 238, 254, 168, 208,  50, 156, 190,
         106, 127,  34, 234,  68,  55,  79,  18,   4, 130,  53, 208,
         181,  21, 175, 120,  25, 100, 192, 178, 161,  96,  81, 127,
          96, 227, 210, 248,  68,  10, 196,  31,   9, 167, 150, 193,
           0, 169, 126,  14, 124, 198, 144, 142, 240,  21, 224,  44,
         245,  66, 146, 238,   6, 196, 154,  49, 200, 222, 109,   9,
         210, 141, 192, 138,   8,  79, 114, 217,  68, 128, 249,  94,
          53,  30,  27,  61,  52, 135, 106, 212,  70, 238,  30, 185,
          10, 132, 146, 136, 117,  37, 251, 150, 180, 188, 247, 156,
         236, 192, 108,  86
 };

 #define INNER(l, h, mm)   (((u32)((l) * (mm)) & 0xFFFFU) \
                           + ((u32)((h) * (mm)) << 16))

 #define W_SMALL(sb, o1, o2, mm) \
         (INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
          INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
          INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
          INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)

 #define WS_0_0   W_SMALL( 4,    0,    1, 185)
 #define WS_0_1   W_SMALL( 6,    0,    1, 185)
 #define WS_0_2   W_SMALL( 0,    0,    1, 185)
 #define WS_0_3   W_SMALL( 2,    0,    1, 185)
 #define WS_0_4   W_SMALL( 7,    0,    1, 185)
 #define WS_0_5   W_SMALL( 5,    0,    1, 185)
 #define WS_0_6   W_SMALL( 3,    0,    1, 185)
 #define WS_0_7   W_SMALL( 1,    0,    1, 185)
 #define WS_1_0   W_SMALL(15,    0,    1, 185)
 #define WS_1_1   W_SMALL(11,    0,    1, 185)
 #define WS_1_2   W_SMALL(12,    0,    1, 185)
 #define WS_1_3   W_SMALL( 8,    0,    1, 185)
 #define WS_1_4   W_SMALL( 9,    0,    1, 185)
 #define WS_1_5   W_SMALL(13,    0,    1, 185)
 #define WS_1_6   W_SMALL(10,    0,    1, 185)
 #define WS_1_7   W_SMALL(14,    0,    1, 185)
 #define WS_2_0   W_SMALL(17, -128,  -64, 233)
 #define WS_2_1   W_SMALL(18, -128,  -64, 233)
 #define WS_2_2   W_SMALL(23, -128,  -64, 233)
 #define WS_2_3   W_SMALL(20, -128,  -64, 233)
 #define WS_2_4   W_SMALL(22, -128,  -64, 233)
 #define WS_2_5   W_SMALL(21, -128,  -64, 233)
 #define WS_2_6   W_SMALL(16, -128,  -64, 233)
 #define WS_2_7   W_SMALL(19, -128,  -64, 233)
 #define WS_3_0   W_SMALL(30, -191, -127, 233)
 #define WS_3_1   W_SMALL(24, -191, -127, 233)
 #define WS_3_2   W_SMALL(25, -191, -127, 233)
 #define WS_3_3   W_SMALL(31, -191, -127, 233)
 #define WS_3_4   W_SMALL(27, -191, -127, 233)
 #define WS_3_5   W_SMALL(29, -191, -127, 233)
 #define WS_3_6   W_SMALL(28, -191, -127, 233)
 #define WS_3_7   W_SMALL(26, -191, -127, 233)

 #define W_BIG(sb, o1, o2, mm) \
         (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
          INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
          INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
          INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
          INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
          INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
          INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
          INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)

 #define WB_0_0   W_BIG( 4,    0,    1, 185)
 #define WB_0_1   W_BIG( 6,    0,    1, 185)
 #define WB_0_2   W_BIG( 0,    0,    1, 185)
 #define WB_0_3   W_BIG( 2,    0,    1, 185)
 #define WB_0_4   W_BIG( 7,    0,    1, 185)
 #define WB_0_5   W_BIG( 5,    0,    1, 185)
 #define WB_0_6   W_BIG( 3,    0,    1, 185)
 #define WB_0_7   W_BIG( 1,    0,    1, 185)
 #define WB_1_0   W_BIG(15,    0,    1, 185)
 #define WB_1_1   W_BIG(11,    0,    1, 185)
 #define WB_1_2   W_BIG(12,    0,    1, 185)
 #define WB_1_3   W_BIG( 8,    0,    1, 185)
 #define WB_1_4   W_BIG( 9,    0,    1, 185)
 #define WB_1_5   W_BIG(13,    0,    1, 185)
 #define WB_1_6   W_BIG(10,    0,    1, 185)
 #define WB_1_7   W_BIG(14,    0,    1, 185)
 #define WB_2_0   W_BIG(17, -256, -128, 233)
 #define WB_2_1   W_BIG(18, -256, -128, 233)
 #define WB_2_2   W_BIG(23, -256, -128, 233)
 #define WB_2_3   W_BIG(20, -256, -128, 233)
 #define WB_2_4   W_BIG(22, -256, -128, 233)
 #define WB_2_5   W_BIG(21, -256, -128, 233)
 #define WB_2_6   W_BIG(16, -256, -128, 233)
 #define WB_2_7   W_BIG(19, -256, -128, 233)
 #define WB_3_0   W_BIG(30, -383, -255, 233)
 #define WB_3_1   W_BIG(24, -383, -255, 233)
 #define WB_3_2   W_BIG(25, -383, -255, 233)
 #define WB_3_3   W_BIG(31, -383, -255, 233)
 #define WB_3_4   W_BIG(27, -383, -255, 233)
 #define WB_3_5   W_BIG(29, -383, -255, 233)
 #define WB_3_6   W_BIG(28, -383, -255, 233)
 #define WB_3_7   W_BIG(26, -383, -255, 233)

 #define IF(x, y, z)    ((((y) ^ (z)) & (x)) ^ (z))
 #define MAJ(x, y, z)   (((x) & (y)) | (((x) | (y)) & (z)))

 #define PP4_0_0   1
 #define PP4_0_1   0
 #define PP4_0_2   3
 #define PP4_0_3   2
 #define PP4_1_0   2
 #define PP4_1_1   3
 #define PP4_1_2   0
 #define PP4_1_3   1
 #define PP4_2_0   3
 #define PP4_2_1   2
 #define PP4_2_2   1
 #define PP4_2_3   0

 #define PP8_0_0   1
 #define PP8_0_1   0
 #define PP8_0_2   3
 #define PP8_0_3   2
 #define PP8_0_4   5
 #define PP8_0_5   4
 #define PP8_0_6   7
 #define PP8_0_7   6

 #define PP8_1_0   6
 #define PP8_1_1   7
 #define PP8_1_2   4
 #define PP8_1_3   5
 #define PP8_1_4   2
 #define PP8_1_5   3
 #define PP8_1_6   0
 #define PP8_1_7   1

 #define PP8_2_0   2
 #define PP8_2_1   3
 #define PP8_2_2   0
 #define PP8_2_3   1
 #define PP8_2_4   6
 #define PP8_2_5   7
 #define PP8_2_6   4
 #define PP8_2_7   5

 #define PP8_3_0   3
 #define PP8_3_1   2
 #define PP8_3_2   1
 #define PP8_3_3   0
 #define PP8_3_4   7
 #define PP8_3_5   6
 #define PP8_3_6   5
 #define PP8_3_7   4

 #define PP8_4_0   5
 #define PP8_4_1   4
 #define PP8_4_2   7
 #define PP8_4_3   6
 #define PP8_4_4   1
 #define PP8_4_5   0
 #define PP8_4_6   3
 #define PP8_4_7   2

 #define PP8_5_0   7
 #define PP8_5_1   6
 #define PP8_5_2   5
 #define PP8_5_3   4
 #define PP8_5_4   3
 #define PP8_5_5   2
 #define PP8_5_6   1
 #define PP8_5_7   0

 #define PP8_6_0   4
 #define PP8_6_1   5
 #define PP8_6_2   6
 #define PP8_6_3   7
 #define PP8_6_4   0
 #define PP8_6_5   1
 #define PP8_6_6   2
 #define PP8_6_7   3

 #if SPH_SIMD_NOCOPY

 #define DECL_STATE_SMALL
 #define READ_STATE_SMALL(sc)
 #define WRITE_STATE_SMALL(sc)
 #define DECL_STATE_BIG
 #define READ_STATE_BIG(sc)
 #define WRITE_STATE_BIG(sc)

 #else

 #define DECL_STATE_SMALL   \
         u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;

 #define READ_STATE_SMALL(sc)   do { \
                 A0 = (sc)->state[ 0]; \
                 A1 = (sc)->state[ 1]; \
                 A2 = (sc)->state[ 2]; \
                 A3 = (sc)->state[ 3]; \
                 B0 = (sc)->state[ 4]; \
                 B1 = (sc)->state[ 5]; \
                 B2 = (sc)->state[ 6]; \
                 B3 = (sc)->state[ 7]; \
                 C0 = (sc)->state[ 8]; \
                 C1 = (sc)->state[ 9]; \
                 C2 = (sc)->state[10]; \
                 C3 = (sc)->state[11]; \
                 D0 = (sc)->state[12]; \
                 D1 = (sc)->state[13]; \
                 D2 = (sc)->state[14]; \
                 D3 = (sc)->state[15]; \
         } while (0)

 #define WRITE_STATE_SMALL(sc)   do { \
                 (sc)->state[ 0] = A0; \
                 (sc)->state[ 1] = A1; \
                 (sc)->state[ 2] = A2; \
                 (sc)->state[ 3] = A3; \
                 (sc)->state[ 4] = B0; \
                 (sc)->state[ 5] = B1; \
                 (sc)->state[ 6] = B2; \
                 (sc)->state[ 7] = B3; \
                 (sc)->state[ 8] = C0; \
                 (sc)->state[ 9] = C1; \
                 (sc)->state[10] = C2; \
                 (sc)->state[11] = C3; \
                 (sc)->state[12] = D0; \
                 (sc)->state[13] = D1; \
                 (sc)->state[14] = D2; \
                 (sc)->state[15] = D3; \
         } while (0)

 #define DECL_STATE_BIG   \
         u32 A0, A1, A2, A3, A4, A5, A6, A7; \
         u32 B0, B1, B2, B3, B4, B5, B6, B7; \
         u32 C0, C1, C2, C3, C4, C5, C6, C7; \
         u32 D0, D1, D2, D3, D4, D5, D6, D7;

 #define READ_STATE_BIG(sc)   do { \
                 A0 = (sc)->state[ 0]; \
                 A1 = (sc)->state[ 1]; \
                 A2 = (sc)->state[ 2]; \
                 A3 = (sc)->state[ 3]; \
                 A4 = (sc)->state[ 4]; \
                 A5 = (sc)->state[ 5]; \
                 A6 = (sc)->state[ 6]; \
                 A7 = (sc)->state[ 7]; \
                 B0 = (sc)->state[ 8]; \
                 B1 = (sc)->state[ 9]; \
                 B2 = (sc)->state[10]; \
                 B3 = (sc)->state[11]; \
                 B4 = (sc)->state[12]; \
                 B5 = (sc)->state[13]; \
                 B6 = (sc)->state[14]; \
                 B7 = (sc)->state[15]; \
                 C0 = (sc)->state[16]; \
                 C1 = (sc)->state[17]; \
                 C2 = (sc)->state[18]; \
                 C3 = (sc)->state[19]; \
                 C4 = (sc)->state[20]; \
                 C5 = (sc)->state[21]; \
                 C6 = (sc)->state[22]; \
                 C7 = (sc)->state[23]; \
                 D0 = (sc)->state[24]; \
                 D1 = (sc)->state[25]; \
                 D2 = (sc)->state[26]; \
                 D3 = (sc)->state[27]; \
                 D4 = (sc)->state[28]; \
                 D5 = (sc)->state[29]; \
                 D6 = (sc)->state[30]; \
                 D7 = (sc)->state[31]; \
         } while (0)

 #define WRITE_STATE_BIG(sc)   do { \
                 (sc)->state[ 0] = A0; \
                 (sc)->state[ 1] = A1; \
                 (sc)->state[ 2] = A2; \
                 (sc)->state[ 3] = A3; \
                 (sc)->state[ 4] = A4; \
                 (sc)->state[ 5] = A5; \
                 (sc)->state[ 6] = A6; \
                 (sc)->state[ 7] = A7; \
                 (sc)->state[ 8] = B0; \
                 (sc)->state[ 9] = B1; \
                 (sc)->state[10] = B2; \
                 (sc)->state[11] = B3; \
                 (sc)->state[12] = B4; \
                 (sc)->state[13] = B5; \
                 (sc)->state[14] = B6; \
                 (sc)->state[15] = B7; \
                 (sc)->state[16] = C0; \
                 (sc)->state[17] = C1; \
                 (sc)->state[18] = C2; \
                 (sc)->state[19] = C3; \
                 (sc)->state[20] = C4; \
                 (sc)->state[21] = C5; \
                 (sc)->state[22] = C6; \
                 (sc)->state[23] = C7; \
                 (sc)->state[24] = D0; \
                 (sc)->state[25] = D1; \
                 (sc)->state[26] = D2; \
                 (sc)->state[27] = D3; \
                 (sc)->state[28] = D4; \
                 (sc)->state[29] = D5; \
                 (sc)->state[30] = D6; \
                 (sc)->state[31] = D7; \
         } while (0)

 #endif

 #define STEP_ELT(n, w, fun, s, ppb)   do { \
                 u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
                 A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
                 D ## n = C ## n; \
                 C ## n = B ## n; \
                 B ## n = tA ## n; \
         } while (0)

 #define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { \
                 u32 tA0 = ROL32(A0, r); \
                 u32 tA1 = ROL32(A1, r); \
                 u32 tA2 = ROL32(A2, r); \
                 u32 tA3 = ROL32(A3, r); \
                 STEP_ELT(0, w0, fun, s, pp4b); \
                 STEP_ELT(1, w1, fun, s, pp4b); \
                 STEP_ELT(2, w2, fun, s, pp4b); \
                 STEP_ELT(3, w3, fun, s, pp4b); \
         } while (0)

 #define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { \
                 u32 tA0 = ROL32(A0, r); \
                 u32 tA1 = ROL32(A1, r); \
                 u32 tA2 = ROL32(A2, r); \
                 u32 tA3 = ROL32(A3, r); \
                 u32 tA4 = ROL32(A4, r); \
                 u32 tA5 = ROL32(A5, r); \
                 u32 tA6 = ROL32(A6, r); \
                 u32 tA7 = ROL32(A7, r); \
                 STEP_ELT(0, w0, fun, s, pp8b); \
                 STEP_ELT(1, w1, fun, s, pp8b); \
                 STEP_ELT(2, w2, fun, s, pp8b); \
                 STEP_ELT(3, w3, fun, s, pp8b); \
                 STEP_ELT(4, w4, fun, s, pp8b); \
                 STEP_ELT(5, w5, fun, s, pp8b); \
                 STEP_ELT(6, w6, fun, s, pp8b); \
                 STEP_ELT(7, w7, fun, s, pp8b); \
         } while (0)

 #define M3_0_0   0_
 #define M3_1_0   1_
 #define M3_2_0   2_
 #define M3_3_0   0_
 #define M3_4_0   1_
 #define M3_5_0   2_
 #define M3_6_0   0_
 #define M3_7_0   1_

 #define M3_0_1   1_
 #define M3_1_1   2_
 #define M3_2_1   0_
 #define M3_3_1   1_
 #define M3_4_1   2_
 #define M3_5_1   0_
 #define M3_6_1   1_
 #define M3_7_1   2_

 #define M3_0_2   2_
 #define M3_1_2   0_
 #define M3_2_2   1_
 #define M3_3_2   2_
 #define M3_4_2   0_
 #define M3_5_2   1_
 #define M3_6_2   2_
 #define M3_7_2   0_

 #define STEP_SMALL_(w, fun, r, s, pp4b)   STEP_SMALL w, fun, r, s, pp4b)

 #define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3)   do { \
                 STEP_SMALL_(WS_ ## ri ## 0, \
                         IF,  p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
                 STEP_SMALL_(WS_ ## ri ## 1, \
                         IF,  p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
                 STEP_SMALL_(WS_ ## ri ## 2, \
                         IF,  p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
                 STEP_SMALL_(WS_ ## ri ## 3, \
                         IF,  p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
                 STEP_SMALL_(WS_ ## ri ## 4, \
                         MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
                 STEP_SMALL_(WS_ ## ri ## 5, \
                         MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
                 STEP_SMALL_(WS_ ## ri ## 6, \
                         MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
                 STEP_SMALL_(WS_ ## ri ## 7, \
                         MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
         } while (0)

 #define M7_0_0   0_
 #define M7_1_0   1_
 #define M7_2_0   2_
 #define M7_3_0   3_
 #define M7_4_0   4_
 #define M7_5_0   5_
 #define M7_6_0   6_
 #define M7_7_0   0_

 #define M7_0_1   1_
 #define M7_1_1   2_
 #define M7_2_1   3_
 #define M7_3_1   4_
 #define M7_4_1   5_
 #define M7_5_1   6_
 #define M7_6_1   0_
 #define M7_7_1   1_

 #define M7_0_2   2_
 #define M7_1_2   3_
 #define M7_2_2   4_
 #define M7_3_2   5_
 #define M7_4_2   6_
 #define M7_5_2   0_
 #define M7_6_2   1_
 #define M7_7_2   2_

 #define M7_0_3   3_
 #define M7_1_3   4_
 #define M7_2_3   5_
 #define M7_3_3   6_
 #define M7_4_3   0_
 #define M7_5_3   1_
 #define M7_6_3   2_
 #define M7_7_3   3_

 #define STEP_BIG_(w, fun, r, s, pp8b)   STEP_BIG w, fun, r, s, pp8b)

 #define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3)   do { \
                 STEP_BIG_(WB_ ## ri ## 0, \
                         IF,  p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
                 STEP_BIG_(WB_ ## ri ## 1, \
                         IF,  p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
                 STEP_BIG_(WB_ ## ri ## 2, \
                         IF,  p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
                 STEP_BIG_(WB_ ## ri ## 3, \
                         IF,  p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
                 STEP_BIG_(WB_ ## ri ## 4, \
                         MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
                 STEP_BIG_(WB_ ## ri ## 5, \
                         MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
                 STEP_BIG_(WB_ ## ri ## 6, \
                         MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
                 STEP_BIG_(WB_ ## ri ## 7, \
                         MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
         } while (0)

 #if SPH_SMALL_FOOTPRINT_SIMD

 #define A0   state[ 0]
 #define A1   state[ 1]
 #define A2   state[ 2]
 #define A3   state[ 3]
 #define B0   state[ 4]
 #define B1   state[ 5]
 #define B2   state[ 6]
 #define B3   state[ 7]
 #define C0   state[ 8]
 #define C1   state[ 9]
 #define C2   state[10]
 #define C3   state[11]
 #define D0   state[12]
 #define D1   state[13]
 #define D2   state[14]
 #define D3   state[15]

 #define STEP2_ELT(n, w, fun, s, ppb)   do { \
                 u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
                 A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
                 D ## n = C ## n; \
                 C ## n = B ## n; \
                 B ## n = tA[n]; \
         } while (0)

 #define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { \
                 u32 tA[4]; \
                 tA[0] = ROL32(A0, r); \
                 tA[1] = ROL32(A1, r); \
                 tA[2] = ROL32(A2, r); \
                 tA[3] = ROL32(A3, r); \
                 STEP2_ELT(0, w0, fun, s, pp4b); \
                 STEP2_ELT(1, w1, fun, s, pp4b); \
                 STEP2_ELT(2, w2, fun, s, pp4b); \
                 STEP2_ELT(3, w3, fun, s, pp4b); \
         } while (0)

 static void
 one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
 {
         static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };

         STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 3], IF,  p0, p1, pp4k[isp + 0]);
         STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF,  p1, p2, pp4k[isp + 1]);
         STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF,  p2, p3, pp4k[isp + 2]);
         STEP2_SMALL(w[12], w[13], w[14], w[15], IF,  p3, p0, pp4k[isp + 3]);
         STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]);
         STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]);
         STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]);
         STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]);
 }

 static void
 compress_small(sph_simd_small_context *sc, int last)
 {
         unsigned char *x;
         s32 q[128];
         int i;
         u32 w[32];
         u32 state[16];
         size_t u;

         static const size_t wsp[32] = {
                  4 << 3,  6 << 3,  0 << 3,  2 << 3,
                  7 << 3,  5 << 3,  3 << 3,  1 << 3,
                 15 << 3, 11 << 3, 12 << 3,  8 << 3,
                  9 << 3, 13 << 3, 10 << 3, 14 << 3,
                 17 << 3, 18 << 3, 23 << 3, 20 << 3,
                 22 << 3, 21 << 3, 16 << 3, 19 << 3,
                 30 << 3, 24 << 3, 25 << 3, 31 << 3,
                 27 << 3, 29 << 3, 28 << 3, 26 << 3
         };

         x = sc->buf;
         FFT128(0, 1, 0, ll);
         if (last) {
                 for (i = 0; i < 128; i ++) {
                         s32 tq;

                         tq = q[i] + yoff_s_f[i];
                         tq = REDS2(tq);
                         tq = REDS1(tq);
                         tq = REDS1(tq);
                         q[i] = (tq <= 128 ? tq : tq - 257);
                 }
         } else {
                 for (i = 0; i < 128; i ++) {
                         s32 tq;

                         tq = q[i] + yoff_s_n[i];
                         tq = REDS2(tq);
                         tq = REDS1(tq);
                         tq = REDS1(tq);
                         q[i] = (tq <= 128 ? tq : tq - 257);
                 }
         }

         for (i = 0; i < 16; i += 4) {
                 state[i + 0] = sc->state[i + 0]
                         ^ sph_dec32le_aligned(x + 4 * (i + 0));
                 state[i + 1] = sc->state[i + 1]
                         ^ sph_dec32le_aligned(x + 4 * (i + 1));
                 state[i + 2] = sc->state[i + 2]
                         ^ sph_dec32le_aligned(x + 4 * (i + 2));
                 state[i + 3] = sc->state[i + 3]
                         ^ sph_dec32le_aligned(x + 4 * (i + 3));
         }

 #define WSREAD(sb, o1, o2, mm)   do { \
                 for (u = 0; u < 32; u += 4) { \
                         size_t v = wsp[(u >> 2) + (sb)]; \
                         w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
                                 q[v + 2 * 0 + (o2)], mm); \
                         w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
                                 q[v + 2 * 1 + (o2)], mm); \
                         w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
                                 q[v + 2 * 2 + (o2)], mm); \
                         w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
                                 q[v + 2 * 3 + (o2)], mm); \
                 } \
         } while (0)

         WSREAD( 0,    0,    1, 185);
         one_round_small(state, w, 0,  3, 23, 17, 27);
         WSREAD( 8,    0,    1, 185);
         one_round_small(state, w, 2, 28, 19, 22,  7);
         WSREAD(16, -128,  -64, 233);
         one_round_small(state, w, 1, 29,  9, 15,  5);
         WSREAD(24, -191, -127, 233);
         one_round_small(state, w, 0,  4, 13, 10, 25);

 #undef WSREAD

         STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
                 IF,  4, 13, PP4_2_);
         STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
                 IF, 13, 10, PP4_0_);
         STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
                 IF, 10, 25, PP4_1_);
         STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
                 IF, 25,  4, PP4_2_);

         memcpy(sc->state, state, sizeof state);
 }

 #undef A0
 #undef A1
 #undef A2
 #undef A3
 #undef B0
 #undef B1
 #undef B2
 #undef B3
 #undef C0
 #undef C1
 #undef C2
 #undef C3
 #undef D0
 #undef D1
 #undef D2
 #undef D3

 #else

 #if SPH_SIMD_NOCOPY
 #define A0   (sc->state[ 0])
 #define A1   (sc->state[ 1])
 #define A2   (sc->state[ 2])
 #define A3   (sc->state[ 3])
 #define B0   (sc->state[ 4])
 #define B1   (sc->state[ 5])
 #define B2   (sc->state[ 6])
 #define B3   (sc->state[ 7])
 #define C0   (sc->state[ 8])
 #define C1   (sc->state[ 9])
 #define C2   (sc->state[10])
 #define C3   (sc->state[11])
 #define D0   (sc->state[12])
 #define D1   (sc->state[13])
 #define D2   (sc->state[14])
 #define D3   (sc->state[15])
 #endif

 static void
 compress_small(sph_simd_small_context *sc, int last)
 {
         unsigned char *x;
         s32 q[128];
         int i;
         DECL_STATE_SMALL
 #if SPH_SIMD_NOCOPY
         sph_u32 saved[16];
 #endif

 #if SPH_SIMD_NOCOPY
         memcpy(saved, sc->state, sizeof saved);
 #endif
         x = sc->buf;
         FFT128(0, 1, 0, ll);
         if (last) {
                 for (i = 0; i < 128; i ++) {
                         s32 tq;

                         tq = q[i] + yoff_s_f[i];
                         tq = REDS2(tq);
                         tq = REDS1(tq);
                         tq = REDS1(tq);
                         q[i] = (tq <= 128 ? tq : tq - 257);
                 }
         } else {
                 for (i = 0; i < 128; i ++) {
                         s32 tq;

                         tq = q[i] + yoff_s_n[i];
                         tq = REDS2(tq);
                         tq = REDS1(tq);
                         tq = REDS1(tq);
                         q[i] = (tq <= 128 ? tq : tq - 257);
                 }
         }
         READ_STATE_SMALL(sc);
         A0 ^= sph_dec32le_aligned(x +  0);
         A1 ^= sph_dec32le_aligned(x +  4);
         A2 ^= sph_dec32le_aligned(x +  8);
         A3 ^= sph_dec32le_aligned(x + 12);
         B0 ^= sph_dec32le_aligned(x + 16);
         B1 ^= sph_dec32le_aligned(x + 20);
         B2 ^= sph_dec32le_aligned(x + 24);
         B3 ^= sph_dec32le_aligned(x + 28);
         C0 ^= sph_dec32le_aligned(x + 32);
         C1 ^= sph_dec32le_aligned(x + 36);
         C2 ^= sph_dec32le_aligned(x + 40);
         C3 ^= sph_dec32le_aligned(x + 44);
         D0 ^= sph_dec32le_aligned(x + 48);
         D1 ^= sph_dec32le_aligned(x + 52);
         D2 ^= sph_dec32le_aligned(x + 56);
         D3 ^= sph_dec32le_aligned(x + 60);
         ONE_ROUND_SMALL(0_, 0,  3, 23, 17, 27);
         ONE_ROUND_SMALL(1_, 2, 28, 19, 22,  7);
         ONE_ROUND_SMALL(2_, 1, 29,  9, 15,  5);
         ONE_ROUND_SMALL(3_, 0,  4, 13, 10, 25);
 #if SPH_SIMD_NOCOPY
         STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3],
                 IF,  4, 13, PP4_2_);
         STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7],
                 IF, 13, 10, PP4_0_);
         STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11],
                 IF, 10, 25, PP4_1_);
         STEP_SMALL(saved[12], saved[13], saved[14], saved[15],
                 IF, 25,  4, PP4_2_);
 #else
         STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
                 IF,  4, 13, PP4_2_);
         STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
                 IF, 13, 10, PP4_0_);
         STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
                 IF, 10, 25, PP4_1_);
         STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
                 IF, 25,  4, PP4_2_);
         WRITE_STATE_SMALL(sc);
 #endif
 }

 #if SPH_SIMD_NOCOPY
 #undef A0
 #undef A1
 #undef A2
 #undef A3
 #undef B0
 #undef B1
 #undef B2
 #undef B3
 #undef C0
 #undef C1
 #undef C2
 #undef C3
 #undef D0
 #undef D1
 #undef D2
 #undef D3
 #endif

 #endif

 #if SPH_SMALL_FOOTPRINT_SIMD

 #define A0   state[ 0]
 #define A1   state[ 1]
 #define A2   state[ 2]
 #define A3   state[ 3]
 #define A4   state[ 4]
 #define A5   state[ 5]
 #define A6   state[ 6]
 #define A7   state[ 7]
 #define B0   state[ 8]
 #define B1   state[ 9]
 #define B2   state[10]
 #define B3   state[11]
 #define B4   state[12]
 #define B5   state[13]
 #define B6   state[14]
 #define B7   state[15]
 #define C0   state[16]
 #define C1   state[17]
 #define C2   state[18]
 #define C3   state[19]
 #define C4   state[20]
 #define C5   state[21]
 #define C6   state[22]
 #define C7   state[23]
 #define D0   state[24]
 #define D1   state[25]
 #define D2   state[26]
 #define D3   state[27]
 #define D4   state[28]
 #define D5   state[29]
 #define D6   state[30]
 #define D7   state[31]

 /*
  * Not needed -- already defined for SIMD-224 / SIMD-256
  *
 #define STEP2_ELT(n, w, fun, s, ppb)   do { \
                 u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
                 A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
                 D ## n = C ## n; \
                 C ## n = B ## n; \
                 B ## n = tA[n]; \
         } while (0)
  */

 #define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { \
                 u32 tA[8]; \
                 tA[0] = ROL32(A0, r); \
                 tA[1] = ROL32(A1, r); \
                 tA[2] = ROL32(A2, r); \
                 tA[3] = ROL32(A3, r); \
                 tA[4] = ROL32(A4, r); \
                 tA[5] = ROL32(A5, r); \
                 tA[6] = ROL32(A6, r); \
                 tA[7] = ROL32(A7, r); \
                 STEP2_ELT(0, w0, fun, s, pp8b); \
                 STEP2_ELT(1, w1, fun, s, pp8b); \
                 STEP2_ELT(2, w2, fun, s, pp8b); \
                 STEP2_ELT(3, w3, fun, s, pp8b); \
                 STEP2_ELT(4, w4, fun, s, pp8b); \
                 STEP2_ELT(5, w5, fun, s, pp8b); \
                 STEP2_ELT(6, w6, fun, s, pp8b); \
                 STEP2_ELT(7, w7, fun, s, pp8b); \
         } while (0)

 static void
 one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
 {
         static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };

         STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7],
                 IF,  p0, p1, pp8k[isp + 0]);
         STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15],
                 IF,  p1, p2, pp8k[isp + 1]);
         STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23],
                 IF,  p2, p3, pp8k[isp + 2]);
         STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31],
                 IF,  p3, p0, pp8k[isp + 3]);
         STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39],
                 MAJ, p0, p1, pp8k[isp + 4]);
         STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47],
                 MAJ, p1, p2, pp8k[isp + 5]);
         STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55],
                 MAJ, p2, p3, pp8k[isp + 6]);
         STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63],
                 MAJ, p3, p0, pp8k[isp + 7]);
 }

 static void
 compress_big(sph_simd_big_context *sc, int last)
 {
         unsigned char *x;
         s32 q[256];
         int i;
         u32 w[64];
         u32 state[32];
         size_t u;

         static const size_t wbp[32] = {
                  4 << 4,  6 << 4,  0 << 4,  2 << 4,
                  7 << 4,  5 << 4,  3 << 4,  1 << 4,
                 15 << 4, 11 << 4, 12 << 4,  8 << 4,
                  9 << 4, 13 << 4, 10 << 4, 14 << 4,
                 17 << 4, 18 << 4, 23 << 4, 20 << 4,
                 22 << 4, 21 << 4, 16 << 4, 19 << 4,
                 30 << 4, 24 << 4, 25 << 4, 31 << 4,
                 27 << 4, 29 << 4, 28 << 4, 26 << 4
         };

         x = sc->buf;
         FFT256(0, 1, 0, ll);
         if (last) {
                 for (i = 0; i < 256; i ++) {
                         s32 tq;

                         tq = q[i] + yoff_b_f[i];
                         tq = REDS2(tq);
                         tq = REDS1(tq);
                         tq = REDS1(tq);
                         q[i] = (tq <= 128 ? tq : tq - 257);
                 }
         } else {
                 for (i = 0; i < 256; i ++) {
                         s32 tq;

                         tq = q[i] + yoff_b_n[i];
                         tq = REDS2(tq);
                         tq = REDS1(tq);
                         tq = REDS1(tq);
                         q[i] = (tq <= 128 ? tq : tq - 257);
                 }
         }

         for (i = 0; i < 32; i += 8) {
                 state[i + 0] = sc->state[i + 0]
                         ^ sph_dec32le_aligned(x + 4 * (i + 0));
                 state[i + 1] = sc->state[i + 1]
                         ^ sph_dec32le_aligned(x + 4 * (i + 1));
                 state[i + 2] = sc->state[i + 2]
                         ^ sph_dec32le_aligned(x + 4 * (i + 2));
                 state[i + 3] = sc->state[i + 3]
                         ^ sph_dec32le_aligned(x + 4 * (i + 3));
                 state[i + 4] = sc->state[i + 4]
                         ^ sph_dec32le_aligned(x + 4 * (i + 4));
                 state[i + 5] = sc->state[i + 5]
                         ^ sph_dec32le_aligned(x + 4 * (i + 5));
                 state[i + 6] = sc->state[i + 6]
                         ^ sph_dec32le_aligned(x + 4 * (i + 6));
                 state[i + 7] = sc->state[i + 7]
                         ^ sph_dec32le_aligned(x + 4 * (i + 7));
         }

 #define WBREAD(sb, o1, o2, mm)   do { \
                 for (u = 0; u < 64; u += 8) { \
                         size_t v = wbp[(u >> 3) + (sb)]; \
                         w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
                                 q[v + 2 * 0 + (o2)], mm); \
                         w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
                                 q[v + 2 * 1 + (o2)], mm); \
                         w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
                                 q[v + 2 * 2 + (o2)], mm); \
                         w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
                                 q[v + 2 * 3 + (o2)], mm); \
                         w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
                                 q[v + 2 * 4 + (o2)], mm); \
                         w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
                                 q[v + 2 * 5 + (o2)], mm); \
                         w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
                                 q[v + 2 * 6 + (o2)], mm); \
                         w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
                                 q[v + 2 * 7 + (o2)], mm); \
                 } \
         } while (0)

         WBREAD( 0,    0,    1, 185);
         one_round_big(state, w, 0,  3, 23, 17, 27);
         WBREAD( 8,    0,    1, 185);
         one_round_big(state, w, 1, 28, 19, 22,  7);
         WBREAD(16, -256, -128, 233);
         one_round_big(state, w, 2, 29,  9, 15,  5);
         WBREAD(24, -383, -255, 233);
         one_round_big(state, w, 3,  4, 13, 10, 25);

 #undef WBREAD

         STEP_BIG(
                 sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
                 sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
                 IF,  4, 13, PP8_4_);
         STEP_BIG(
                 sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
                 sc->state[12], sc->state[13], sc->state[14], sc->state[15],
                 IF, 13, 10, PP8_5_);
         STEP_BIG(
                 sc->state[16], sc->state[17], sc->state[18], sc->state[19],
                 sc->state[20], sc->state[21], sc->state[22], sc->state[23],
                 IF, 10, 25, PP8_6_);
         STEP_BIG(
                 sc->state[24], sc->state[25], sc->state[26], sc->state[27],
                 sc->state[28], sc->state[29], sc->state[30], sc->state[31],
                 IF, 25,  4, PP8_0_);

         memcpy(sc->state, state, sizeof state);
 }

 #undef A0
 #undef A1
 #undef A2
 #undef A3
 #undef A4
 #undef A5
 #undef A6
 #undef A7
 #undef B0
 #undef B1
 #undef B2
 #undef B3
 #undef B4
 #undef B5
 #undef B6
 #undef B7
 #undef C0
 #undef C1
 #undef C2
 #undef C3
 #undef C4
 #undef C5
 #undef C6
 #undef C7
 #undef D0
 #undef D1
 #undef D2
 #undef D3
 #undef D4
 #undef D5
 #undef D6
 #undef D7

 #else

 #if SPH_SIMD_NOCOPY
 #define A0   (sc->state[ 0])
 #define A1   (sc->state[ 1])
 #define A2   (sc->state[ 2])
 #define A3   (sc->state[ 3])
 #define A4   (sc->state[ 4])
 #define A5   (sc->state[ 5])
 #define A6   (sc->state[ 6])
 #define A7   (sc->state[ 7])
 #define B0   (sc->state[ 8])
 #define B1   (sc->state[ 9])
 #define B2   (sc->state[10])
 #define B3   (sc->state[11])
 #define B4   (sc->state[12])
 #define B5   (sc->state[13])
 #define B6   (sc->state[14])
 #define B7   (sc->state[15])
 #define C0   (sc->state[16])
 #define C1   (sc->state[17])
 #define C2   (sc->state[18])
 #define C3   (sc->state[19])
 #define C4   (sc->state[20])
 #define C5   (sc->state[21])
 #define C6   (sc->state[22])
 #define C7   (sc->state[23])
 #define D0   (sc->state[24])
 #define D1   (sc->state[25])
 #define D2   (sc->state[26])
 #define D3   (sc->state[27])
 #define D4   (sc->state[28])
 #define D5   (sc->state[29])
 #define D6   (sc->state[30])
 #define D7   (sc->state[31])
 #endif

 static void
 compress_big(sph_simd_big_context *sc, int last)
 {
         unsigned char *x;
         s32 q[256];
         int i;
         DECL_STATE_BIG
 #if SPH_SIMD_NOCOPY
         sph_u32 saved[32];
 #endif

 #if SPH_SIMD_NOCOPY
         memcpy(saved, sc->state, sizeof saved);
 #endif

         x = sc->buf;
         FFT256(0, 1, 0, ll);
         if (last) {
                 for (i = 0; i < 256; i ++) {
                         s32 tq;

                         tq = q[i] + yoff_b_f[i];
                         tq = REDS2(tq);
                         tq = REDS1(tq);
                         tq = REDS1(tq);
                         q[i] = (tq <= 128 ? tq : tq - 257);
                 }
         } else {
                 for (i = 0; i < 256; i ++) {
                         s32 tq;

                         tq = q[i] + yoff_b_n[i];
                         tq = REDS2(tq);
                         tq = REDS1(tq);
                         tq = REDS1(tq);
                         q[i] = (tq <= 128 ? tq : tq - 257);
                 }
         }
         READ_STATE_BIG(sc);
         A0 ^= sph_dec32le_aligned(x +   0);
         A1 ^= sph_dec32le_aligned(x +   4);
         A2 ^= sph_dec32le_aligned(x +   8);
         A3 ^= sph_dec32le_aligned(x +  12);
         A4 ^= sph_dec32le_aligned(x +  16);
         A5 ^= sph_dec32le_aligned(x +  20);
         A6 ^= sph_dec32le_aligned(x +  24);
         A7 ^= sph_dec32le_aligned(x +  28);
         B0 ^= sph_dec32le_aligned(x +  32);
         B1 ^= sph_dec32le_aligned(x +  36);
         B2 ^= sph_dec32le_aligned(x +  40);
         B3 ^= sph_dec32le_aligned(x +  44);
         B4 ^= sph_dec32le_aligned(x +  48);
         B5 ^= sph_dec32le_aligned(x +  52);
         B6 ^= sph_dec32le_aligned(x +  56);
         B7 ^= sph_dec32le_aligned(x +  60);
         C0 ^= sph_dec32le_aligned(x +  64);
         C1 ^= sph_dec32le_aligned(x +  68);
         C2 ^= sph_dec32le_aligned(x +  72);
         C3 ^= sph_dec32le_aligned(x +  76);
         C4 ^= sph_dec32le_aligned(x +  80);
         C5 ^= sph_dec32le_aligned(x +  84);
         C6 ^= sph_dec32le_aligned(x +  88);
         C7 ^= sph_dec32le_aligned(x +  92);
         D0 ^= sph_dec32le_aligned(x +  96);
         D1 ^= sph_dec32le_aligned(x + 100);
         D2 ^= sph_dec32le_aligned(x + 104);
         D3 ^= sph_dec32le_aligned(x + 108);
         D4 ^= sph_dec32le_aligned(x + 112);
         D5 ^= sph_dec32le_aligned(x + 116);
         D6 ^= sph_dec32le_aligned(x + 120);
         D7 ^= sph_dec32le_aligned(x + 124);

         ONE_ROUND_BIG(0_, 0,  3, 23, 17, 27);
         ONE_ROUND_BIG(1_, 1, 28, 19, 22,  7);
         ONE_ROUND_BIG(2_, 2, 29,  9, 15,  5);
         ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);
 #if SPH_SIMD_NOCOPY
         STEP_BIG(
                 saved[ 0], saved[ 1], saved[ 2], saved[ 3],
                 saved[ 4], saved[ 5], saved[ 6], saved[ 7],
                 IF,  4, 13, PP8_4_);
         STEP_BIG(
                 saved[ 8], saved[ 9], saved[10], saved[11],
                 saved[12], saved[13], saved[14], saved[15],
                 IF, 13, 10, PP8_5_);
         STEP_BIG(
                 saved[16], saved[17], saved[18], saved[19],
                 saved[20], saved[21], saved[22], saved[23],
                 IF, 10, 25, PP8_6_);
         STEP_BIG(
                 saved[24], saved[25], saved[26], saved[27],
                 saved[28], saved[29], saved[30], saved[31],
                 IF, 25,  4, PP8_0_);
 #else
         STEP_BIG(
                 sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
                 sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
                 IF,  4, 13, PP8_4_);
         STEP_BIG(
                 sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
                 sc->state[12], sc->state[13], sc->state[14], sc->state[15],
                 IF, 13, 10, PP8_5_);
         STEP_BIG(
                 sc->state[16], sc->state[17], sc->state[18], sc->state[19],
                 sc->state[20], sc->state[21], sc->state[22], sc->state[23],
                 IF, 10, 25, PP8_6_);
         STEP_BIG(
                 sc->state[24], sc->state[25], sc->state[26], sc->state[27],
                 sc->state[28], sc->state[29], sc->state[30], sc->state[31],
                 IF, 25,  4, PP8_0_);
         WRITE_STATE_BIG(sc);
 #endif
 }

 #if SPH_SIMD_NOCOPY
 #undef A0
 #undef A1
 #undef A2
 #undef A3
 #undef A4
 #undef A5
 #undef A6
 #undef A7
 #undef B0
 #undef B1
 #undef B2
 #undef B3
 #undef B4
 #undef B5
 #undef B6
 #undef B7
 #undef C0
 #undef C1
 #undef C2
 #undef C3
 #undef C4
 #undef C5
 #undef C6
 #undef C7
 #undef D0
 #undef D1
 #undef D2
 #undef D3
 #undef D4
 #undef D5
 #undef D6
 #undef D7
 #endif

 #endif

 static const u32 IV224[] = {
         C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
         C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
         C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
         C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
 };

 static const u32 IV256[] = {
         C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
         C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
         C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
         C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
 };

 static const u32 IV384[] = {
         C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
         C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
         C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
         C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
         C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
         C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
         C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
         C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
 };

 static const u32 IV512[] = {
         C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
         C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
         C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
         C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
         C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
         C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
         C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
         C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
 };

 static void
 init_small(void *cc, const u32 *iv)
 {
         sph_simd_small_context *sc;

         sc = cc;
         memcpy(sc->state, iv, sizeof sc->state);
         sc->count_low = sc->count_high = 0;
         sc->ptr = 0;
 }

 static void
 init_big(void *cc, const u32 *iv)
 {
         sph_simd_big_context *sc;

         sc = cc;
         memcpy(sc->state, iv, sizeof sc->state);
         sc->count_low = sc->count_high = 0;
         sc->ptr = 0;
 }

 static void
 update_small(void *cc, const void *data, size_t len)
 {
         sph_simd_small_context *sc;

         sc = cc;
         while (len > 0) {
                 size_t clen;

                 clen = (sizeof sc->buf) - sc->ptr;
                 if (clen > len)
                         clen = len;
                 memcpy(sc->buf + sc->ptr, data, clen);
                 data = (const unsigned char *)data + clen;
                 len -= clen;
                 if ((sc->ptr += clen) == sizeof sc->buf) {
                         compress_small(sc, 0);
                         sc->ptr = 0;
                         sc->count_low = T32(sc->count_low + 1);
                         if (sc->count_low == 0)
                                 sc->count_high ++;
                 }
         }
 }

 static void
 update_big(void *cc, const void *data, size_t len)
 {
         sph_simd_big_context *sc;

         sc = cc;
         while (len > 0) {
                 size_t clen;

                 clen = (sizeof sc->buf) - sc->ptr;
                 if (clen > len)
                         clen = len;
                 memcpy(sc->buf + sc->ptr, data, clen);
                 data = (const unsigned char *)data + clen;
                 len -= clen;
                 if ((sc->ptr += clen) == sizeof sc->buf) {
                         compress_big(sc, 0);
                         sc->ptr = 0;
                         sc->count_low = T32(sc->count_low + 1);
                         if (sc->count_low == 0)
                                 sc->count_high ++;
                 }
         }
 }

 static void
 encode_count_small(unsigned char *dst,
         u32 low, u32 high, size_t ptr, unsigned n)
 {
         low = T32(low << 9);
         high = T32(high << 9) + (low >> 23);
         low += (ptr << 3) + n;
         sph_enc32le(dst, low);
         sph_enc32le(dst + 4, high);
 }

 static void
 encode_count_big(unsigned char *dst,
         u32 low, u32 high, size_t ptr, unsigned n)
 {
         low = T32(low << 10);
         high = T32(high << 10) + (low >> 22);
         low += (ptr << 3) + n;
         sph_enc32le(dst, low);
         sph_enc32le(dst + 4, high);
 }

 static void
 finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
 {
         sph_simd_small_context *sc;
         unsigned char *d;
         size_t u;

         sc = cc;
         if (sc->ptr > 0 || n > 0) {
                 memset(sc->buf + sc->ptr, 0,
                         (sizeof sc->buf) - sc->ptr);
                 sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
                 compress_small(sc, 0);
         }
         memset(sc->buf, 0, sizeof sc->buf);
         encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
         compress_small(sc, 1);
         d = dst;
         for (d = dst, u = 0; u < dst_len; u ++)
                 sph_enc32le(d + (u << 2), sc->state[u]);
 }

 static void
 finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
 {
         sph_simd_big_context *sc;
         unsigned char *d;
         size_t u;

         sc = cc;
         if (sc->ptr > 0 || n > 0) {
                 memset(sc->buf + sc->ptr, 0,
                         (sizeof sc->buf) - sc->ptr);
                 sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
                 compress_big(sc, 0);
         }
         memset(sc->buf, 0, sizeof sc->buf);
         encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
         compress_big(sc, 1);
         d = dst;
         for (d = dst, u = 0; u < dst_len; u ++)
                 sph_enc32le(d + (u << 2), sc->state[u]);
 }

 void
 sph_simd224_init(void *cc)
 {
         init_small(cc, IV224);
 }

 void
 sph_simd224(void *cc, const void *data, size_t len)
 {
         update_small(cc, data, len);
 }

 void
 sph_simd224_close(void *cc, void *dst)
 {
         sph_simd224_addbits_and_close(cc, 0, 0, dst);
 }

 void
 sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
         finalize_small(cc, ub, n, dst, 7);
         sph_simd224_init(cc);
 }

 void
 sph_simd256_init(void *cc)
 {
         init_small(cc, IV256);
 }

 void
 sph_simd256(void *cc, const void *data, size_t len)
 {
         update_small(cc, data, len);
 }

 void
 sph_simd256_close(void *cc, void *dst)
 {
         sph_simd256_addbits_and_close(cc, 0, 0, dst);
 }

 void
 sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
         finalize_small(cc, ub, n, dst, 8);
         sph_simd256_init(cc);
 }

 void
 sph_simd384_init(void *cc)
 {
         init_big(cc, IV384);
 }

 void
 sph_simd384(void *cc, const void *data, size_t len)
 {
         update_big(cc, data, len);
 }

 void
 sph_simd384_close(void *cc, void *dst)
 {
         sph_simd384_addbits_and_close(cc, 0, 0, dst);
 }

 void
 sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
         finalize_big(cc, ub, n, dst, 12);
         sph_simd384_init(cc);
 }

 void
 sph_simd512_init(void *cc)
 {
         init_big(cc, IV512);
 }

 void
 sph_simd512(void *cc, const void *data, size_t len)
 {
         update_big(cc, data, len);
 }

 void
 sph_simd512_close(void *cc, void *dst)
 {
         sph_simd512_addbits_and_close(cc, 0, 0, dst);
 }

 void
 sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
         finalize_big(cc, ub, n, dst, 16);
         sph_simd512_init(cc);
 }
 #ifdef __cplusplus
 }
 #endif
sph_simd_small_context::state
sph_u32 state[16]
Definition: sph_simd.h:81

u32
sph_u32 u32
Definition: simd.c:51

sph_simd224
void sph_simd224(void *cc, const void *data, size_t len)
Process some data bytes.
Definition: simd.c:1705

FFT32
#define FFT32(xb, xs, rb, id)
Definition: simd.c:202

READ_STATE_BIG
#define READ_STATE_BIG(sc)
Definition: simd.c:595

update_big
static void update_big(void *cc, const void *data, size_t len)
Definition: simd.c:1608

FFT_LOOP
#define FFT_LOOP(rb, hk, as, id)
Definition: simd.c:102

sph_simd384_close
void sph_simd384_close(void *cc, void *dst)
Terminate the current SIMD-384 computation and output the result into the provided buffer...
Definition: simd.c:1761

sph_simd_small_context
This structure is a context for SIMD computations: it contains the intermediate values and some data ...
Definition: sph_simd.h:77

sph_simd_big_context::state
sph_u32 state[32]
Definition: sph_simd.h:101

yoff_s_n
static const unsigned short yoff_s_n[]
Definition: simd.c:286

sph_simd384_init
void sph_simd384_init(void *cc)
Initialize an SIMD-384 context.
Definition: simd.c:1749

sph_simd512
void sph_simd512(void *cc, const void *data, size_t len)
Process some data bytes.
Definition: simd.c:1780

sph_s32
long sph_s32
Definition: sph_types.h:871

DECL_STATE_SMALL
#define DECL_STATE_SMALL
Definition: simd.c:548

sph_simd512_close
void sph_simd512_close(void *cc, void *dst)
Terminate the current SIMD-512 computation and output the result into the provided buffer...
Definition: simd.c:1786

sph_simd256_init
void sph_simd256_init(void *cc)
Initialize an SIMD-256 context.
Definition: simd.c:1724

finalize_big
static void finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
Definition: simd.c:1677

IV512
static const u32 IV512[]
Definition: simd.c:1549

ONE_ROUND_BIG
#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3)
Definition: simd.c:791

IF
#define IF(x, y, z)
Definition: simd.c:458

high
sph_u32 high
Definition: keccak.c:370

WRITE_STATE_BIG
#define WRITE_STATE_BIG(sc)
Definition: simd.c:630

sph_simd384_addbits_and_close
void sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
Add a few additional bits (0 to 7) to the current computation, then terminate it and output the resul...
Definition: simd.c:1767

STEP_BIG
#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)
Definition: simd.c:686

REDS2
#define REDS2(x)
Definition: simd.c:94

sph_dec32le_aligned
static SPH_INLINE sph_u32 sph_dec32le_aligned(const void *src)
Decode a 32-bit value from the provided buffer (little endian convention).
Definition: sph_types.h:1615

IV256
static const u32 IV256[]
Definition: simd.c:1531

encode_count_small
static void encode_count_small(unsigned char *dst, u32 low, u32 high, size_t ptr, unsigned n)
Definition: simd.c:1633

sph_simd_small_context::count_high
sph_u32 count_high
Definition: sph_simd.h:82

sph_simd.h
SIMD interface.

yoff_b_f
static const unsigned short yoff_b_f[]
Definition: simd.c:348

REDS1
#define REDS1(x)
Definition: simd.c:93

sph_simd_small_context::buf
unsigned char buf[64]
Definition: sph_simd.h:79

T32
#define T32
Definition: simd.c:54

READ_STATE_SMALL
#define READ_STATE_SMALL(sc)
Definition: simd.c:551

C32
#define C32
Definition: simd.c:53

sph_simd_big_context::buf
unsigned char buf[128]
Definition: sph_simd.h:99

WRITE_STATE_SMALL
#define WRITE_STATE_SMALL(sc)
Definition: simd.c:570

sph_simd_big_context
This structure is a context for SIMD computations: it contains the intermediate values and some data ...
Definition: sph_simd.h:97

IV224
static const u32 IV224[]
Definition: simd.c:1524

sph_simd512_addbits_and_close
void sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
Add a few additional bits (0 to 7) to the current computation, then terminate it and output the resul...
Definition: simd.c:1792

compress_big
static void compress_big(sph_simd_big_context *sc, int last)
Definition: simd.c:1374

ONE_ROUND_SMALL
#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3)
Definition: simd.c:734

yoff_b_n
static const unsigned short yoff_b_n[]
Definition: simd.c:320

encode_count_big
static void encode_count_big(unsigned char *dst, u32 low, u32 high, size_t ptr, unsigned n)
Definition: simd.c:1644

STEP_SMALL
#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)
Definition: simd.c:675

alpha_tab
static const s32 alpha_tab[]
Definition: simd.c:63

MAJ
#define MAJ(x, y, z)
Definition: simd.c:459

sph_enc32le
static SPH_INLINE void sph_enc32le(void *dst, sph_u32 val)
Encode a 32-bit value into the provided buffer (little endian convention).
Definition: sph_types.h:1485

sph_simd256_addbits_and_close
void sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
Add a few additional bits (0 to 7) to the current computation, then terminate it and output the resul...
Definition: simd.c:1742

sph_simd256_close
void sph_simd256_close(void *cc, void *dst)
Terminate the current SIMD-256 computation and output the result into the provided buffer...
Definition: simd.c:1736

init_small
static void init_small(void *cc, const u32 *iv)
Definition: simd.c:1561

sph_simd_big_context::ptr
size_t ptr
Definition: sph_simd.h:100

yoff_s_f
static const unsigned short yoff_s_f[]
Definition: simd.c:303

sph_simd224_addbits_and_close
void sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
Add a few additional bits (0 to 7) to the current computation, then terminate it and output the resul...
Definition: simd.c:1717

finalize_small
static void finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
Definition: simd.c:1655

sph_simd_small_context::count_low
sph_u32 count_low
Definition: sph_simd.h:82

sph_simd384
void sph_simd384(void *cc, const void *data, size_t len)
Process some data bytes.
Definition: simd.c:1755

sph_simd224_init
void sph_simd224_init(void *cc)
Initialize an SIMD-224 context.
Definition: simd.c:1699

FFT256
#define FFT256(xb, xs, rb, id)
Definition: simd.c:273

update_small
static void update_small(void *cc, const void *data, size_t len)
Definition: simd.c:1583

sph_simd256
void sph_simd256(void *cc, const void *data, size_t len)
Process some data bytes.
Definition: simd.c:1730

sph_simd_small_context::ptr
size_t ptr
Definition: sph_simd.h:80

memcpy
void * memcpy(void *a, const void *b, size_t c)
Definition: glibc_compat.cpp:18

FFT128
#define FFT128(xb, xs, rb, id)
Definition: simd.c:245

low
sph_u32 low
Definition: keccak.c:370

sph_u32
unsigned long sph_u32
Definition: sph_types.h:870

IV384
static const u32 IV384[]
Definition: simd.c:1538

compress_small
static void compress_small(sph_simd_small_context *sc, int last)
Definition: simd.c:996

sph_simd_big_context::count_low
sph_u32 count_low
Definition: sph_simd.h:102

FFT16
#define FFT16(xb, xs, rb)
Definition: simd.c:176

sph_simd512_init
void sph_simd512_init(void *cc)
Initialize an SIMD-512 context.
Definition: simd.c:1774

s32
sph_s32 s32
Definition: simd.c:52

DECL_STATE_BIG
#define DECL_STATE_BIG
Definition: simd.c:589

_
std::string _(const char *psz)
Translation function: Call Translate signal on UI interface, which returns a boost::optional result...
Definition: util.h:92

init_big
static void init_big(void *cc, const u32 *iv)
Definition: simd.c:1572

fft64
static void fft64(unsigned char *x, size_t xs, s32 *q)
Definition: simd.c:260

sph_simd_big_context::count_high
sph_u32 count_high
Definition: sph_simd.h:102

sph_simd224_close
void sph_simd224_close(void *cc, void *dst)
Terminate the current SIMD-224 computation and output the result into the provided buffer...
Definition: simd.c:1711