1 /*
   2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   3  *
   4  * This code is free software; you can redistribute it and/or modify it
   5  * under the terms of the GNU General Public License version 2 only, as
   6  * published by the Free Software Foundation.  Oracle designates this
   7  * particular file as subject to the "Classpath" exception as provided
   8  * by Oracle in the LICENSE file that accompanied this code.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  */
  24 
  25 /* crc32.c -- compute the CRC-32 of a data stream
  26  * Copyright (C) 1995-2022 Mark Adler
  27  * For conditions of distribution and use, see copyright notice in zlib.h
  28  *
  29  * This interleaved implementation of a CRC makes use of pipelined multiple
  30  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
  31  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
  32  */
  33 
  34 /* @(#) $Id$ */
  35 
  36 /*
  37   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
  38   protection on the static variables used to control the first-use generation
  39   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
  40   first call get_crc_table() to initialize the tables before allowing more than
  41   one thread to use crc32().
  42 
  43   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
  44   produced, so that this one source file can be compiled to an executable.
  45  */
  46 
  47 #ifdef MAKECRCH
  48 #  include <stdio.h>
  49 #  ifndef DYNAMIC_CRC_TABLE
  50 #    define DYNAMIC_CRC_TABLE
  51 #  endif /* !DYNAMIC_CRC_TABLE */
  52 #endif /* MAKECRCH */
  53 
  54 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
  55 
  56  /*
  57   A CRC of a message is computed on N braids of words in the message, where
  58   each word consists of W bytes (4 or 8). If N is 3, for example, then three
  59   running sparse CRCs are calculated respectively on each braid, at these
  60   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
  61   This is done starting at a word boundary, and continues until as many blocks
  62   of N * W bytes as are available have been processed. The results are combined
  63   into a single CRC at the end. For this code, N must be in the range 1..6 and
  64   W must be 4 or 8. The upper limit on N can be increased if desired by adding
  65   more #if blocks, extending the patterns apparent in the code. In addition,
  66   crc32.h would need to be regenerated, if the maximum N value is increased.
  67 
  68   N and W are chosen empirically by benchmarking the execution time on a given
  69   processor. The choices for N and W below were based on testing on Intel Kaby
  70   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
  71   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
  72   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
  73   They were all tested with either gcc or clang, all using the -O3 optimization
  74   level. Your mileage may vary.
  75  */
  76 
  77 /* Define N */
  78 #ifdef Z_TESTN
  79 #  define N Z_TESTN
  80 #else
  81 #  define N 5
  82 #endif
  83 #if N < 1 || N > 6
  84 #  error N must be in 1..6
  85 #endif
  86 
  87 /*
  88   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
  89   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
  90   that bytes are eight bits.
  91  */
  92 
  93 /*
  94   Define W and the associated z_word_t type. If W is not defined, then a
  95   braided calculation is not used, and the associated tables and code are not
  96   compiled.
  97  */
  98 #ifdef Z_TESTW
  99 #  if Z_TESTW-1 != -1
 100 #    define W Z_TESTW
 101 #  endif
 102 #else
 103 #  ifdef MAKECRCH
 104 #    define W 8         /* required for MAKECRCH */
 105 #  else
 106 #    if defined(__x86_64__) || defined(__aarch64__)
 107 #      define W 8
 108 #    else
 109 #      define W 4
 110 #    endif
 111 #  endif
 112 #endif
 113 #ifdef W
 114 #  if W == 8 && defined(Z_U8)
 115      typedef Z_U8 z_word_t;
 116 #  elif defined(Z_U4)
 117 #    undef W
 118 #    define W 4
 119      typedef Z_U4 z_word_t;
 120 #  else
 121 #    undef W
 122 #  endif
 123 #endif
 124 
 125 /* If available, use the ARM processor CRC32 instruction. */
 126 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
 127 #  define ARMCRC32
 128 #endif
 129 
 130 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
 131 /*
 132   Swap the bytes in a z_word_t to convert between little and big endian. Any
 133   self-respecting compiler will optimize this to a single machine byte-swap
 134   instruction, if one is available. This assumes that word_t is either 32 bits
 135   or 64 bits.
 136  */
 137 local z_word_t byte_swap(z_word_t word) {
 138 #  if W == 8
 139     return
 140         (word & 0xff00000000000000) >> 56 |
 141         (word & 0xff000000000000) >> 40 |
 142         (word & 0xff0000000000) >> 24 |
 143         (word & 0xff00000000) >> 8 |
 144         (word & 0xff000000) << 8 |
 145         (word & 0xff0000) << 24 |
 146         (word & 0xff00) << 40 |
 147         (word & 0xff) << 56;
 148 #  else   /* W == 4 */
 149     return
 150         (word & 0xff000000) >> 24 |
 151         (word & 0xff0000) >> 8 |
 152         (word & 0xff00) << 8 |
 153         (word & 0xff) << 24;
 154 #  endif
 155 }
 156 #endif
 157 
 158 #ifdef DYNAMIC_CRC_TABLE
 159 /* =========================================================================
 160  * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
 161  * below.
 162  */
 163    local z_crc_t FAR x2n_table[32];
 164 #else
 165 /* =========================================================================
 166  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
 167  * of x for combining CRC-32s, all made by make_crc_table().
 168  */
 169 #  include "crc32.h"
 170 #endif
 171 
 172 /* CRC polynomial. */
 173 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
 174 
 175 /*
 176   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
 177   reflected. For speed, this requires that a not be zero.
 178  */
 179 local z_crc_t multmodp(z_crc_t a, z_crc_t b) {
 180     z_crc_t m, p;
 181 
 182     m = (z_crc_t)1 << 31;
 183     p = 0;
 184     for (;;) {
 185         if (a & m) {
 186             p ^= b;
 187             if ((a & (m - 1)) == 0)
 188                 break;
 189         }
 190         m >>= 1;
 191         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
 192     }
 193     return p;
 194 }
 195 
 196 /*
 197   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
 198   initialized.
 199  */
 200 local z_crc_t x2nmodp(z_off64_t n, unsigned k) {
 201     z_crc_t p;
 202 
 203     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
 204     while (n) {
 205         if (n & 1)
 206             p = multmodp(x2n_table[k & 31], p);
 207         n >>= 1;
 208         k++;
 209     }
 210     return p;
 211 }
 212 
 213 #ifdef DYNAMIC_CRC_TABLE
 214 /* =========================================================================
 215  * Build the tables for byte-wise and braided CRC-32 calculations, and a table
 216  * of powers of x for combining CRC-32s.
 217  */
 218 local z_crc_t FAR crc_table[256];
 219 #ifdef W
 220    local z_word_t FAR crc_big_table[256];
 221    local z_crc_t FAR crc_braid_table[W][256];
 222    local z_word_t FAR crc_braid_big_table[W][256];
 223    local void braid(z_crc_t [][256], z_word_t [][256], int, int);
 224 #endif
 225 #ifdef MAKECRCH
 226    local void write_table(FILE *, const z_crc_t FAR *, int);
 227    local void write_table32hi(FILE *, const z_word_t FAR *, int);
 228    local void write_table64(FILE *, const z_word_t FAR *, int);
 229 #endif /* MAKECRCH */
 230 
 231 /*
 232   Define a once() function depending on the availability of atomics. If this is
 233   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
 234   multiple threads, and if atomics are not available, then get_crc_table() must
 235   be called to initialize the tables and must return before any threads are
 236   allowed to compute or combine CRCs.
 237  */
 238 
 239 /* Definition of once functionality. */
 240 typedef struct once_s once_t;
 241 
 242 /* Check for the availability of atomics. */
 243 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
 244     !defined(__STDC_NO_ATOMICS__)
 245 
 246 #include <stdatomic.h>
 247 
 248 /* Structure for once(), which must be initialized with ONCE_INIT. */
 249 struct once_s {
 250     atomic_flag begun;
 251     atomic_int done;
 252 };
 253 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
 254 
 255 /*
 256   Run the provided init() function exactly once, even if multiple threads
 257   invoke once() at the same time. The state must be a once_t initialized with
 258   ONCE_INIT.
 259  */
 260 local void once(once_t *state, void (*init)(void)) {
 261     if (!atomic_load(&state->done)) {
 262         if (atomic_flag_test_and_set(&state->begun))
 263             while (!atomic_load(&state->done))
 264                 ;
 265         else {
 266             init();
 267             atomic_store(&state->done, 1);
 268         }
 269     }
 270 }
 271 
 272 #else   /* no atomics */
 273 
 274 /* Structure for once(), which must be initialized with ONCE_INIT. */
 275 struct once_s {
 276     volatile int begun;
 277     volatile int done;
 278 };
 279 #define ONCE_INIT {0, 0}
 280 
 281 /* Test and set. Alas, not atomic, but tries to minimize the period of
 282    vulnerability. */
 283 local int test_and_set(int volatile *flag) {
 284     int was;
 285 
 286     was = *flag;
 287     *flag = 1;
 288     return was;
 289 }
 290 
 291 /* Run the provided init() function once. This is not thread-safe. */
 292 local void once(once_t *state, void (*init)(void)) {
 293     if (!state->done) {
 294         if (test_and_set(&state->begun))
 295             while (!state->done)
 296                 ;
 297         else {
 298             init();
 299             state->done = 1;
 300         }
 301     }
 302 }
 303 
 304 #endif
 305 
 306 /* State for once(). */
 307 local once_t made = ONCE_INIT;
 308 
 309 /*
 310   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
 311   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
 312 
 313   Polynomials over GF(2) are represented in binary, one bit per coefficient,
 314   with the lowest powers in the most significant bit. Then adding polynomials
 315   is just exclusive-or, and multiplying a polynomial by x is a right shift by
 316   one. If we call the above polynomial p, and represent a byte as the
 317   polynomial q, also with the lowest power in the most significant bit (so the
 318   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
 319   where a mod b means the remainder after dividing a by b.
 320 
 321   This calculation is done using the shift-register method of multiplying and
 322   taking the remainder. The register is initialized to zero, and for each
 323   incoming bit, x^32 is added mod p to the register if the bit is a one (where
 324   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
 325   (which is shifting right by one and adding x^32 mod p if the bit shifted out
 326   is a one). We start with the highest power (least significant bit) of q and
 327   repeat for all eight bits of q.
 328 
 329   The table is simply the CRC of all possible eight bit values. This is all the
 330   information needed to generate CRCs on data a byte at a time for all
 331   combinations of CRC register values and incoming bytes.
 332  */
 333 
 334 local void make_crc_table(void) {
 335     unsigned i, j, n;
 336     z_crc_t p;
 337 
 338     /* initialize the CRC of bytes tables */
 339     for (i = 0; i < 256; i++) {
 340         p = i;
 341         for (j = 0; j < 8; j++)
 342             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
 343         crc_table[i] = p;
 344 #ifdef W
 345         crc_big_table[i] = byte_swap(p);
 346 #endif
 347     }
 348 
 349     /* initialize the x^2^n mod p(x) table */
 350     p = (z_crc_t)1 << 30;         /* x^1 */
 351     x2n_table[0] = p;
 352     for (n = 1; n < 32; n++)
 353         x2n_table[n] = p = multmodp(p, p);
 354 
 355 #ifdef W
 356     /* initialize the braiding tables -- needs x2n_table[] */
 357     braid(crc_braid_table, crc_braid_big_table, N, W);
 358 #endif
 359 
 360 #ifdef MAKECRCH
 361     {
 362         /*
 363           The crc32.h header file contains tables for both 32-bit and 64-bit
 364           z_word_t's, and so requires a 64-bit type be available. In that case,
 365           z_word_t must be defined to be 64-bits. This code then also generates
 366           and writes out the tables for the case that z_word_t is 32 bits.
 367          */
 368 #if !defined(W) || W != 8
 369 #  error Need a 64-bit integer type in order to generate crc32.h.
 370 #endif
 371         FILE *out;
 372         int k, n;
 373         z_crc_t ltl[8][256];
 374         z_word_t big[8][256];
 375 
 376         out = fopen("crc32.h", "w");
 377         if (out == NULL) return;
 378 
 379         /* write out little-endian CRC table to crc32.h */
 380         fprintf(out,
 381             "/* crc32.h -- tables for rapid CRC calculation\n"
 382             " * Generated automatically by crc32.c\n */\n"
 383             "\n"
 384             "local const z_crc_t FAR crc_table[] = {\n"
 385             "    ");
 386         write_table(out, crc_table, 256);
 387         fprintf(out,
 388             "};\n");
 389 
 390         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
 391         fprintf(out,
 392             "\n"
 393             "#ifdef W\n"
 394             "\n"
 395             "#if W == 8\n"
 396             "\n"
 397             "local const z_word_t FAR crc_big_table[] = {\n"
 398             "    ");
 399         write_table64(out, crc_big_table, 256);
 400         fprintf(out,
 401             "};\n");
 402 
 403         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
 404         fprintf(out,
 405             "\n"
 406             "#else /* W == 4 */\n"
 407             "\n"
 408             "local const z_word_t FAR crc_big_table[] = {\n"
 409             "    ");
 410         write_table32hi(out, crc_big_table, 256);
 411         fprintf(out,
 412             "};\n"
 413             "\n"
 414             "#endif\n");
 415 
 416         /* write out braid tables for each value of N */
 417         for (n = 1; n <= 6; n++) {
 418             fprintf(out,
 419             "\n"
 420             "#if N == %d\n", n);
 421 
 422             /* compute braid tables for this N and 64-bit word_t */
 423             braid(ltl, big, n, 8);
 424 
 425             /* write out braid tables for 64-bit z_word_t to crc32.h */
 426             fprintf(out,
 427             "\n"
 428             "#if W == 8\n"
 429             "\n"
 430             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
 431             for (k = 0; k < 8; k++) {
 432                 fprintf(out, "   {");
 433                 write_table(out, ltl[k], 256);
 434                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
 435             }
 436             fprintf(out,
 437             "};\n"
 438             "\n"
 439             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
 440             for (k = 0; k < 8; k++) {
 441                 fprintf(out, "   {");
 442                 write_table64(out, big[k], 256);
 443                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
 444             }
 445             fprintf(out,
 446             "};\n");
 447 
 448             /* compute braid tables for this N and 32-bit word_t */
 449             braid(ltl, big, n, 4);
 450 
 451             /* write out braid tables for 32-bit z_word_t to crc32.h */
 452             fprintf(out,
 453             "\n"
 454             "#else /* W == 4 */\n"
 455             "\n"
 456             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
 457             for (k = 0; k < 4; k++) {
 458                 fprintf(out, "   {");
 459                 write_table(out, ltl[k], 256);
 460                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
 461             }
 462             fprintf(out,
 463             "};\n"
 464             "\n"
 465             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
 466             for (k = 0; k < 4; k++) {
 467                 fprintf(out, "   {");
 468                 write_table32hi(out, big[k], 256);
 469                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
 470             }
 471             fprintf(out,
 472             "};\n"
 473             "\n"
 474             "#endif\n"
 475             "\n"
 476             "#endif\n");
 477         }
 478         fprintf(out,
 479             "\n"
 480             "#endif\n");
 481 
 482         /* write out zeros operator table to crc32.h */
 483         fprintf(out,
 484             "\n"
 485             "local const z_crc_t FAR x2n_table[] = {\n"
 486             "    ");
 487         write_table(out, x2n_table, 32);
 488         fprintf(out,
 489             "};\n");
 490         fclose(out);
 491     }
 492 #endif /* MAKECRCH */
 493 }
 494 
 495 #ifdef MAKECRCH
 496 
 497 /*
 498    Write the 32-bit values in table[0..k-1] to out, five per line in
 499    hexadecimal separated by commas.
 500  */
 501 local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
 502     int n;
 503 
 504     for (n = 0; n < k; n++)
 505         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
 506                 (unsigned long)(table[n]),
 507                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
 508 }
 509 
 510 /*
 511    Write the high 32-bits of each value in table[0..k-1] to out, five per line
 512    in hexadecimal separated by commas.
 513  */
 514 local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
 515     int n;
 516 
 517     for (n = 0; n < k; n++)
 518         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
 519                 (unsigned long)(table[n] >> 32),
 520                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
 521 }
 522 
 523 /*
 524   Write the 64-bit values in table[0..k-1] to out, three per line in
 525   hexadecimal separated by commas. This assumes that if there is a 64-bit
 526   type, then there is also a long long integer type, and it is at least 64
 527   bits. If not, then the type cast and format string can be adjusted
 528   accordingly.
 529  */
 530 local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
 531     int n;
 532 
 533     for (n = 0; n < k; n++)
 534         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
 535                 (unsigned long long)(table[n]),
 536                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
 537 }
 538 
 539 /* Actually do the deed. */
 540 int main(void) {
 541     make_crc_table();
 542     return 0;
 543 }
 544 
 545 #endif /* MAKECRCH */
 546 
 547 #ifdef W
 548 /*
 549   Generate the little and big-endian braid tables for the given n and z_word_t
 550   size w. Each array must have room for w blocks of 256 elements.
 551  */
 552 local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w) {
 553     int k;
 554     z_crc_t i, p, q;
 555     for (k = 0; k < w; k++) {
 556         p = x2nmodp((n * w + 3 - k) << 3, 0);
 557         ltl[k][0] = 0;
 558         big[w - 1 - k][0] = 0;
 559         for (i = 1; i < 256; i++) {
 560             ltl[k][i] = q = multmodp(i << 24, p);
 561             big[w - 1 - k][i] = byte_swap(q);
 562         }
 563     }
 564 }
 565 #endif
 566 
 567 #endif /* DYNAMIC_CRC_TABLE */
 568 
 569 /* =========================================================================
 570  * This function can be used by asm versions of crc32(), and to force the
 571  * generation of the CRC tables in a threaded application.
 572  */
 573 const z_crc_t FAR * ZEXPORT get_crc_table(void) {
 574 #ifdef DYNAMIC_CRC_TABLE
 575     once(&made, make_crc_table);
 576 #endif /* DYNAMIC_CRC_TABLE */
 577     return (const z_crc_t FAR *)crc_table;
 578 }
 579 
 580 /* =========================================================================
 581  * Use ARM machine instructions if available. This will compute the CRC about
 582  * ten times faster than the braided calculation. This code does not check for
 583  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
 584  * only be defined if the compilation specifies an ARM processor architecture
 585  * that has the instructions. For example, compiling with -march=armv8.1-a or
 586  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
 587  * instructions.
 588  */
 589 #ifdef ARMCRC32
 590 
 591 /*
 592    Constants empirically determined to maximize speed. These values are from
 593    measurements on a Cortex-A57. Your mileage may vary.
 594  */
 595 #define Z_BATCH 3990                /* number of words in a batch */
 596 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
 597 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
 598 
 599 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
 600                               z_size_t len) {
 601     z_crc_t val;
 602     z_word_t crc1, crc2;
 603     const z_word_t *word;
 604     z_word_t val0, val1, val2;
 605     z_size_t last, last2, i;
 606     z_size_t num;
 607 
 608     /* Return initial CRC, if requested. */
 609     if (buf == Z_NULL) return 0;
 610 
 611 #ifdef DYNAMIC_CRC_TABLE
 612     once(&made, make_crc_table);
 613 #endif /* DYNAMIC_CRC_TABLE */
 614 
 615     /* Pre-condition the CRC */
 616     crc = (~crc) & 0xffffffff;
 617 
 618     /* Compute the CRC up to a word boundary. */
 619     while (len && ((z_size_t)buf & 7) != 0) {
 620         len--;
 621         val = *buf++;
 622         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
 623     }
 624 
 625     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
 626     word = (z_word_t const *)buf;
 627     num = len >> 3;
 628     len &= 7;
 629 
 630     /* Do three interleaved CRCs to realize the throughput of one crc32x
 631        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
 632        three CRCs are combined into a single CRC after each set of batches. */
 633     while (num >= 3 * Z_BATCH) {
 634         crc1 = 0;
 635         crc2 = 0;
 636         for (i = 0; i < Z_BATCH; i++) {
 637             val0 = word[i];
 638             val1 = word[i + Z_BATCH];
 639             val2 = word[i + 2 * Z_BATCH];
 640             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
 641             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
 642             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
 643         }
 644         word += 3 * Z_BATCH;
 645         num -= 3 * Z_BATCH;
 646         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
 647         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
 648     }
 649 
 650     /* Do one last smaller batch with the remaining words, if there are enough
 651        to pay for the combination of CRCs. */
 652     last = num / 3;
 653     if (last >= Z_BATCH_MIN) {
 654         last2 = last << 1;
 655         crc1 = 0;
 656         crc2 = 0;
 657         for (i = 0; i < last; i++) {
 658             val0 = word[i];
 659             val1 = word[i + last];
 660             val2 = word[i + last2];
 661             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
 662             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
 663             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
 664         }
 665         word += 3 * last;
 666         num -= 3 * last;
 667         val = x2nmodp(last, 6);
 668         crc = multmodp(val, crc) ^ crc1;
 669         crc = multmodp(val, crc) ^ crc2;
 670     }
 671 
 672     /* Compute the CRC on any remaining words. */
 673     for (i = 0; i < num; i++) {
 674         val0 = word[i];
 675         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
 676     }
 677     word += num;
 678 
 679     /* Complete the CRC on any remaining bytes. */
 680     buf = (const unsigned char FAR *)word;
 681     while (len) {
 682         len--;
 683         val = *buf++;
 684         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
 685     }
 686 
 687     /* Return the CRC, post-conditioned. */
 688     return crc ^ 0xffffffff;
 689 }
 690 
 691 #else
 692 
 693 #ifdef W
 694 
 695 /*
 696   Return the CRC of the W bytes in the word_t data, taking the
 697   least-significant byte of the word as the first byte of data, without any pre
 698   or post conditioning. This is used to combine the CRCs of each braid.
 699  */
 700 local z_crc_t crc_word(z_word_t data) {
 701     int k;
 702     for (k = 0; k < W; k++)
 703         data = (data >> 8) ^ crc_table[data & 0xff];
 704     return (z_crc_t)data;
 705 }
 706 
 707 local z_word_t crc_word_big(z_word_t data) {
 708     int k;
 709     for (k = 0; k < W; k++)
 710         data = (data << 8) ^
 711             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
 712     return data;
 713 }
 714 
 715 #endif
 716 
 717 /* ========================================================================= */
 718 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
 719                               z_size_t len) {
 720     /* Return initial CRC, if requested. */
 721     if (buf == Z_NULL) return 0;
 722 
 723 #ifdef DYNAMIC_CRC_TABLE
 724     once(&made, make_crc_table);
 725 #endif /* DYNAMIC_CRC_TABLE */
 726 
 727     /* Pre-condition the CRC */
 728     crc = (~crc) & 0xffffffff;
 729 
 730 #ifdef W
 731 
 732     /* If provided enough bytes, do a braided CRC calculation. */
 733     if (len >= N * W + W - 1) {
 734         z_size_t blks;
 735         z_word_t const *words;
 736         unsigned endian;
 737         int k;
 738 
 739         /* Compute the CRC up to a z_word_t boundary. */
 740         while (len && ((z_size_t)buf & (W - 1)) != 0) {
 741             len--;
 742             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
 743         }
 744 
 745         /* Compute the CRC on as many N z_word_t blocks as are available. */
 746         blks = len / (N * W);
 747         len -= blks * N * W;
 748         words = (z_word_t const *)buf;
 749 
 750         /* Do endian check at execution time instead of compile time, since ARM
 751            processors can change the endianness at execution time. If the
 752            compiler knows what the endianness will be, it can optimize out the
 753            check and the unused branch. */
 754         endian = 1;
 755         if (*(unsigned char *)&endian) {
 756             /* Little endian. */
 757 
 758             z_crc_t crc0;
 759             z_word_t word0;
 760 #if N > 1
 761             z_crc_t crc1;
 762             z_word_t word1;
 763 #if N > 2
 764             z_crc_t crc2;
 765             z_word_t word2;
 766 #if N > 3
 767             z_crc_t crc3;
 768             z_word_t word3;
 769 #if N > 4
 770             z_crc_t crc4;
 771             z_word_t word4;
 772 #if N > 5
 773             z_crc_t crc5;
 774             z_word_t word5;
 775 #endif
 776 #endif
 777 #endif
 778 #endif
 779 #endif
 780 
 781             /* Initialize the CRC for each braid. */
 782             crc0 = crc;
 783 #if N > 1
 784             crc1 = 0;
 785 #if N > 2
 786             crc2 = 0;
 787 #if N > 3
 788             crc3 = 0;
 789 #if N > 4
 790             crc4 = 0;
 791 #if N > 5
 792             crc5 = 0;
 793 #endif
 794 #endif
 795 #endif
 796 #endif
 797 #endif
 798 
 799             /*
 800               Process the first blks-1 blocks, computing the CRCs on each braid
 801               independently.
 802              */
 803             while (--blks) {
 804                 /* Load the word for each braid into registers. */
 805                 word0 = crc0 ^ words[0];
 806 #if N > 1
 807                 word1 = crc1 ^ words[1];
 808 #if N > 2
 809                 word2 = crc2 ^ words[2];
 810 #if N > 3
 811                 word3 = crc3 ^ words[3];
 812 #if N > 4
 813                 word4 = crc4 ^ words[4];
 814 #if N > 5
 815                 word5 = crc5 ^ words[5];
 816 #endif
 817 #endif
 818 #endif
 819 #endif
 820 #endif
 821                 words += N;
 822 
 823                 /* Compute and update the CRC for each word. The loop should
 824                    get unrolled. */
 825                 crc0 = crc_braid_table[0][word0 & 0xff];
 826 #if N > 1
 827                 crc1 = crc_braid_table[0][word1 & 0xff];
 828 #if N > 2
 829                 crc2 = crc_braid_table[0][word2 & 0xff];
 830 #if N > 3
 831                 crc3 = crc_braid_table[0][word3 & 0xff];
 832 #if N > 4
 833                 crc4 = crc_braid_table[0][word4 & 0xff];
 834 #if N > 5
 835                 crc5 = crc_braid_table[0][word5 & 0xff];
 836 #endif
 837 #endif
 838 #endif
 839 #endif
 840 #endif
 841                 for (k = 1; k < W; k++) {
 842                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
 843 #if N > 1
 844                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
 845 #if N > 2
 846                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
 847 #if N > 3
 848                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
 849 #if N > 4
 850                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
 851 #if N > 5
 852                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
 853 #endif
 854 #endif
 855 #endif
 856 #endif
 857 #endif
 858                 }
 859             }
 860 
 861             /*
 862               Process the last block, combining the CRCs of the N braids at the
 863               same time.
 864              */
 865             crc = crc_word(crc0 ^ words[0]);
 866 #if N > 1
 867             crc = crc_word(crc1 ^ words[1] ^ crc);
 868 #if N > 2
 869             crc = crc_word(crc2 ^ words[2] ^ crc);
 870 #if N > 3
 871             crc = crc_word(crc3 ^ words[3] ^ crc);
 872 #if N > 4
 873             crc = crc_word(crc4 ^ words[4] ^ crc);
 874 #if N > 5
 875             crc = crc_word(crc5 ^ words[5] ^ crc);
 876 #endif
 877 #endif
 878 #endif
 879 #endif
 880 #endif
 881             words += N;
 882         }
 883         else {
 884             /* Big endian. */
 885 
 886             z_word_t crc0, word0, comb;
 887 #if N > 1
 888             z_word_t crc1, word1;
 889 #if N > 2
 890             z_word_t crc2, word2;
 891 #if N > 3
 892             z_word_t crc3, word3;
 893 #if N > 4
 894             z_word_t crc4, word4;
 895 #if N > 5
 896             z_word_t crc5, word5;
 897 #endif
 898 #endif
 899 #endif
 900 #endif
 901 #endif
 902 
 903             /* Initialize the CRC for each braid. */
 904             crc0 = byte_swap(crc);
 905 #if N > 1
 906             crc1 = 0;
 907 #if N > 2
 908             crc2 = 0;
 909 #if N > 3
 910             crc3 = 0;
 911 #if N > 4
 912             crc4 = 0;
 913 #if N > 5
 914             crc5 = 0;
 915 #endif
 916 #endif
 917 #endif
 918 #endif
 919 #endif
 920 
 921             /*
 922               Process the first blks-1 blocks, computing the CRCs on each braid
 923               independently.
 924              */
 925             while (--blks) {
 926                 /* Load the word for each braid into registers. */
 927                 word0 = crc0 ^ words[0];
 928 #if N > 1
 929                 word1 = crc1 ^ words[1];
 930 #if N > 2
 931                 word2 = crc2 ^ words[2];
 932 #if N > 3
 933                 word3 = crc3 ^ words[3];
 934 #if N > 4
 935                 word4 = crc4 ^ words[4];
 936 #if N > 5
 937                 word5 = crc5 ^ words[5];
 938 #endif
 939 #endif
 940 #endif
 941 #endif
 942 #endif
 943                 words += N;
 944 
 945                 /* Compute and update the CRC for each word. The loop should
 946                    get unrolled. */
 947                 crc0 = crc_braid_big_table[0][word0 & 0xff];
 948 #if N > 1
 949                 crc1 = crc_braid_big_table[0][word1 & 0xff];
 950 #if N > 2
 951                 crc2 = crc_braid_big_table[0][word2 & 0xff];
 952 #if N > 3
 953                 crc3 = crc_braid_big_table[0][word3 & 0xff];
 954 #if N > 4
 955                 crc4 = crc_braid_big_table[0][word4 & 0xff];
 956 #if N > 5
 957                 crc5 = crc_braid_big_table[0][word5 & 0xff];
 958 #endif
 959 #endif
 960 #endif
 961 #endif
 962 #endif
 963                 for (k = 1; k < W; k++) {
 964                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
 965 #if N > 1
 966                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
 967 #if N > 2
 968                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
 969 #if N > 3
 970                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
 971 #if N > 4
 972                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
 973 #if N > 5
 974                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
 975 #endif
 976 #endif
 977 #endif
 978 #endif
 979 #endif
 980                 }
 981             }
 982 
 983             /*
 984               Process the last block, combining the CRCs of the N braids at the
 985               same time.
 986              */
 987             comb = crc_word_big(crc0 ^ words[0]);
 988 #if N > 1
 989             comb = crc_word_big(crc1 ^ words[1] ^ comb);
 990 #if N > 2
 991             comb = crc_word_big(crc2 ^ words[2] ^ comb);
 992 #if N > 3
 993             comb = crc_word_big(crc3 ^ words[3] ^ comb);
 994 #if N > 4
 995             comb = crc_word_big(crc4 ^ words[4] ^ comb);
 996 #if N > 5
 997             comb = crc_word_big(crc5 ^ words[5] ^ comb);
 998 #endif
 999 #endif
1000 #endif
1001 #endif
1002 #endif
1003             words += N;
1004             crc = byte_swap(comb);
1005         }
1006 
1007         /*
1008           Update the pointer to the remaining bytes to process.
1009          */
1010         buf = (unsigned char const *)words;
1011     }
1012 
1013 #endif /* W */
1014 
1015     /* Complete the computation of the CRC on any remaining bytes. */
1016     while (len >= 8) {
1017         len -= 8;
1018         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1019         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1020         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1021         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1022         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1023         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1024         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1025         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1026     }
1027     while (len) {
1028         len--;
1029         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1030     }
1031 
1032     /* Return the CRC, post-conditioned. */
1033     return crc ^ 0xffffffff;
1034 }
1035 
1036 #endif
1037 
1038 /* ========================================================================= */
1039 unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1040                             uInt len) {
1041     return crc32_z(crc, buf, len);
1042 }
1043 
1044 /* ========================================================================= */
1045 uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
1046 #ifdef DYNAMIC_CRC_TABLE
1047     once(&made, make_crc_table);
1048 #endif /* DYNAMIC_CRC_TABLE */
1049     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1050 }
1051 
1052 /* ========================================================================= */
1053 uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
1054     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1055 }
1056 
1057 /* ========================================================================= */
1058 uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
1059 #ifdef DYNAMIC_CRC_TABLE
1060     once(&made, make_crc_table);
1061 #endif /* DYNAMIC_CRC_TABLE */
1062     return x2nmodp(len2, 3);
1063 }
1064 
1065 /* ========================================================================= */
1066 uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
1067     return crc32_combine_gen64((z_off64_t)len2);
1068 }
1069 
1070 /* ========================================================================= */
1071 uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
1072     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1073 }