
bundle libdeflate to make it optional
author Ivailo Monev <xakepa10@gmail.com>
Sat, 5 Feb 2022 21:08:55 +0000 (23:08 +0200)
committer Ivailo Monev <xakepa10@gmail.com>
Sat, 5 Feb 2022 21:08:55 +0000 (23:08 +0200)
Signed-off-by: Ivailo Monev <xakepa10@gmail.com>
47 files changed:
CMakeLists.txt
package/netbsd/Makefile
package/openbsd/Makefile
src/3rdparty/libdeflate/COPYING [new file with mode: 0644]
src/3rdparty/libdeflate/NOTE [new file with mode: 0644]
src/3rdparty/libdeflate/README.md [new file with mode: 0644]
src/3rdparty/libdeflate/common/common_defs.h [new file with mode: 0644]
src/3rdparty/libdeflate/common/compiler_gcc.h [new file with mode: 0644]
src/3rdparty/libdeflate/common/compiler_msc.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/adler32.c [new file with mode: 0644]
src/3rdparty/libdeflate/lib/adler32_vec_template.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/arm/adler32_impl.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/arm/cpu_features.c [new file with mode: 0644]
src/3rdparty/libdeflate/lib/arm/cpu_features.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/arm/crc32_impl.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/arm/matchfinder_impl.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/bt_matchfinder.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/cpu_features_common.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/crc32.c [new file with mode: 0644]
src/3rdparty/libdeflate/lib/crc32_table.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/crc32_vec_template.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/decompress_template.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/deflate_compress.c [new file with mode: 0644]
src/3rdparty/libdeflate/lib/deflate_compress.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/deflate_constants.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/deflate_decompress.c [new file with mode: 0644]
src/3rdparty/libdeflate/lib/gzip_compress.c [new file with mode: 0644]
src/3rdparty/libdeflate/lib/gzip_constants.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/gzip_decompress.c [new file with mode: 0644]
src/3rdparty/libdeflate/lib/hc_matchfinder.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/ht_matchfinder.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/lib_common.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/matchfinder_common.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/unaligned.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/utils.c [new file with mode: 0644]
src/3rdparty/libdeflate/lib/x86/adler32_impl.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/x86/cpu_features.c [new file with mode: 0644]
src/3rdparty/libdeflate/lib/x86/cpu_features.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/x86/crc32_impl.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/x86/crc32_pclmul_template.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/x86/decompress_impl.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/x86/matchfinder_impl.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/zlib_compress.c [new file with mode: 0644]
src/3rdparty/libdeflate/lib/zlib_constants.h [new file with mode: 0644]
src/3rdparty/libdeflate/lib/zlib_decompress.c [new file with mode: 0644]
src/3rdparty/libdeflate/libdeflate.h [new file with mode: 0644]
src/core/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 54d683b..dadafce 100644 (file)
@@ -138,7 +138,11 @@ set(KATIE_PIXMAPS_PATH "${CMAKE_INSTALL_FULL_DATADIR}/pixmaps" CACHE PATH "Deskt
 set(KATIE_PKGCONFIG_PATH "${KATIE_LIBRARIES_PATH}/pkgconfig" CACHE PATH "pkg-config installation path")
 set(KATIE_TOOLS_SUFFIX "" CACHE STRING "Tools (moc, uic, rcc, etc.) suffix")
 
-# optional package
+# bundled packages
+option(WITH_DEFLATE "Build with external libdeflate" ON)
+add_feature_info(deflate WITH_DEFLATE "build with external libdeflate")
+
+# optional packages
 option(WITH_CUPS "Build CUPS support" ON)
 add_feature_info(cups WITH_CUPS "build CUPS support")
 
@@ -164,15 +168,6 @@ add_feature_info(benchmarks KATIE_BENCHMARKS "build automatic benchmarks")
 option(KATIE_UTILS "Build maintainance utilities" OFF)
 add_feature_info(utils KATIE_UTILS "build maintainance utilities")
 
-# v0.4+ required for inflateInit2() but that is pre-release
-find_package(Deflate)
-set_package_properties(Deflate PROPERTIES
-    PURPOSE "Required for compression and decompression support"
-    DESCRIPTION "Heavily optimized library for DEFLATE/zlib/gzip compression and decompression"
-    URL "https://github.com/ebiggers/libdeflate"
-    TYPE REQUIRED
-)
-
 # v4.6+ required for unorm2_getDecomposition()
 find_package(ICU 4.6)
 set_package_properties(ICU PROPERTIES
@@ -243,6 +238,14 @@ set_package_properties(PNG PROPERTIES
     TYPE REQUIRED
 )
 
+find_package(Deflate)
+set_package_properties(Deflate PROPERTIES
+    PURPOSE "Required for compression and decompression support"
+    DESCRIPTION "Heavily optimized library for DEFLATE/zlib/gzip compression and decompression"
+    URL "https://github.com/ebiggers/libdeflate"
+    TYPE RECOMMENDED
+)
+
 find_package(Cups)
 set_package_properties(Cups PROPERTIES
     PURPOSE "Required for printing support"
diff --git a/package/netbsd/Makefile b/package/netbsd/Makefile
index cc57750..099b648 100644 (file)
@@ -26,7 +26,6 @@ DEPENDS += freefont-ttf-[0-9]*:../../fonts/freefont-ttf
 BUILD_DEPENDS = unifdef-[0-9]*:../../devel/unifdef
 
 .include "../../sysutils/desktop-file-utils/desktopdb.mk"
-.include "../../archivers/libdeflate_is_not_available/buildlink3.mk"
 .include "../../textproc/jansson/buildlink3.mk"
 .include "../../textproc/icu/buildlink3.mk"
 .include "../../devel/pcre/buildlink3.mk"
diff --git a/package/openbsd/Makefile b/package/openbsd/Makefile
index f04dc83..369b309 100644 (file)
@@ -21,10 +21,10 @@ COMPILER = base-clang ports-gcc
 MODULES = devel/cmake
 BUILD_DEPENDS = devel/gettext,-tools
 RUN_DEPENDS = devel/desktop-file-utils devel/xdg-utils fonts/freefont-ttf
-LIB_DEPENDS = archivers/libdeflate_is_not_available textproc/icu4c devel/jansson devel/pcre \
+LIB_DEPENDS = textproc/icu4c devel/jansson devel/pcre \
        graphics/png x11/dbus print/cups,-libs devel/gettext,-runtime
 WANTLIB = ${COMPILER_LIBCXX} ICE SM X11 Xcursor Xext Xfixes Xinerama Xrandr \
-       Xrender fontconfig freetype deflate icui18n icuuc pcre png \
+       Xrender fontconfig freetype icui18n icuuc pcre png \
        dbus-1 cups intl ssl z c crypto m
 SEPARATE_BUILD = Yes
 CONFIGURE_ARGS = -DKATIE_TOOLS_SUFFIX="-katie" -Wno-dev
diff --git a/src/3rdparty/libdeflate/COPYING b/src/3rdparty/libdeflate/COPYING
new file mode 100644 (file)
index 0000000..1f1b81c
--- /dev/null
@@ -0,0 +1,21 @@
+Copyright 2016 Eric Biggers
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation files
+(the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/3rdparty/libdeflate/NOTE b/src/3rdparty/libdeflate/NOTE
new file mode 100644 (file)
index 0000000..de5ff07
--- /dev/null
@@ -0,0 +1,2 @@
+This is Git checkout 3cc3608e9c340e4996dff3d0633acf2ec537e12a
+from https://github.com/ebiggers/libdeflate that has not been modified.
diff --git a/src/3rdparty/libdeflate/README.md b/src/3rdparty/libdeflate/README.md
new file mode 100644 (file)
index 0000000..6c2f2f2
--- /dev/null
@@ -0,0 +1,285 @@
+# Overview
+
+libdeflate is a library for fast, whole-buffer DEFLATE-based compression and
+decompression.
+
+The supported formats are:
+
+- DEFLATE (raw)
+- zlib (a.k.a. DEFLATE with a zlib wrapper)
+- gzip (a.k.a. DEFLATE with a gzip wrapper)
+
+libdeflate is heavily optimized.  It is significantly faster than the zlib
+library, both for compression and decompression, and especially on x86
+processors.  In addition, libdeflate provides optional high compression modes
+that provide a better compression ratio than zlib's "level 9".
+
+libdeflate itself is a library, but the following command-line programs which
+use this library are also provided:
+
+* gzip (or gunzip), a program which mostly behaves like the standard equivalent,
+  except that it does not yet have good streaming support and therefore does not
+  yet support very large files
+* benchmark, a program for benchmarking in-memory compression and decompression
+
+For the release notes, see the [NEWS file](NEWS.md).
+
+## Table of Contents
+
+- [Building](#building)
+  - [For UNIX](#for-unix)
+  - [For macOS](#for-macos)
+  - [For Windows](#for-windows)
+    - [Using Cygwin](#using-cygwin)
+    - [Using MSYS2](#using-msys2)
+- [API](#api)
+- [Bindings for other programming languages](#bindings-for-other-programming-languages)
+- [DEFLATE vs. zlib vs. gzip](#deflate-vs-zlib-vs-gzip)
+- [Compression levels](#compression-levels)
+- [Motivation](#motivation)
+- [License](#license)
+
+
+# Building
+
+## For UNIX
+
+Just run `make`, then (if desired) `make install`.  You need GNU Make and either
+GCC or Clang.  GCC is recommended because it builds slightly faster binaries.
+
+By default, the following targets are built: the static library `libdeflate.a`,
+the shared library `libdeflate.so`, the `gzip` program, and the `gunzip` program
+(which is actually just a hard link to `gzip`).  Benchmarking and test programs
+such as `benchmark` are not built by default.  You can run `make help` to
+display the available build targets.
+
+There are also many options which can be set on the `make` command line, e.g. to
+omit library features or to customize the directories into which `make install`
+installs files.  See the Makefile for details.
+
+## For macOS
+
+Prebuilt macOS binaries can be installed with [Homebrew](https://brew.sh):
+
+    brew install libdeflate
+
+But if you need to build the binaries yourself, see the section for UNIX above.
+
+## For Windows
+
+Prebuilt Windows binaries can be downloaded from
+https://github.com/ebiggers/libdeflate/releases.  But if you need to build the
+binaries yourself, MinGW (gcc) is the recommended compiler to use.  If you're
+performing the build *on* Windows (as opposed to cross-compiling for Windows on
+Linux, for example), you'll need to follow the directions in **one** of the two
+sections below to set up a minimal UNIX-compatible environment using either
+Cygwin or MSYS2, then do the build.  (Other MinGW distributions may not work, as
+they often omit basic UNIX tools such as `sh`.)
+
+Alternatively, libdeflate may be built using the Visual Studio toolchain by
+running `nmake /f Makefile.msc`.  However, while this is supported in the sense
+that it will produce working binaries, it is not recommended because the
+binaries built with MinGW will be significantly faster.
+
+Also note that 64-bit binaries are faster than 32-bit binaries and should be
+preferred whenever possible.
+
+### Using Cygwin
+
+Run the Cygwin installer, available from https://cygwin.com/setup-x86_64.exe.
+When you get to the package selection screen, choose the following additional
+packages from category "Devel":
+
+- git
+- make
+- mingw64-i686-binutils
+- mingw64-i686-gcc-g++
+- mingw64-x86_64-binutils
+- mingw64-x86_64-gcc-g++
+
+(You may skip the mingw64-i686 packages if you don't need to build 32-bit
+binaries.)
+
+After the installation finishes, open a Cygwin terminal.  Then download
+libdeflate's source code (if you haven't already) and `cd` into its directory:
+
+    git clone https://github.com/ebiggers/libdeflate
+    cd libdeflate
+
+(Note that it's not required to use `git`; an alternative is to extract a .zip
+or .tar.gz archive of the source code downloaded from the releases page.
+Also, in case you need to find it in the file browser, note that your home
+directory in Cygwin is usually located at `C:\cygwin64\home\<your username>`.)
+
+Then, to build 64-bit binaries:
+
+    make CC=x86_64-w64-mingw32-gcc
+
+or to build 32-bit binaries:
+
+    make CC=i686-w64-mingw32-gcc
+
+### Using MSYS2
+
+Run the MSYS2 installer, available from http://www.msys2.org/.  After
+installing, open an MSYS2 shell and run:
+
+    pacman -Syu
+
+Say `y`, then when it's finished, close the shell window and open a new one.
+Then run the same command again:
+
+    pacman -Syu
+
+Then, install the packages needed to build libdeflate:
+
+    pacman -S git \
+              make \
+              mingw-w64-i686-binutils \
+              mingw-w64-i686-gcc \
+              mingw-w64-x86_64-binutils \
+              mingw-w64-x86_64-gcc
+
+(You may skip the mingw-w64-i686 packages if you don't need to build 32-bit
+binaries.)
+
+Then download libdeflate's source code (if you haven't already):
+
+    git clone https://github.com/ebiggers/libdeflate
+
+(Note that it's not required to use `git`; an alternative is to extract a .zip
+or .tar.gz archive of the source code downloaded from the releases page.
+Also, in case you need to find it in the file browser, note that your home
+directory in MSYS2 is usually located at `C:\msys64\home\<your username>`.)
+
+Then, to build 64-bit binaries, open "MSYS2 MinGW 64-bit" from the Start menu
+and run the following commands:
+
+    cd libdeflate
+    make clean
+    make
+
+Or to build 32-bit binaries, do the same but use "MSYS2 MinGW 32-bit" instead.
+
+# API
+
+libdeflate has a simple API that is not zlib-compatible.  You can create
+compressors and decompressors and use them to compress or decompress buffers.
+See libdeflate.h for details.
+
+There is currently no support for streaming.  This has been considered, but it
+always significantly increases complexity and slows down fast paths.
+Unfortunately, at this point it remains a future TODO.  So: if your application
+compresses data in "chunks", say, less than 1 MB in size, then libdeflate is a
+great choice for you; that's what it's designed to do.  This is perfect for
+certain use cases such as transparent filesystem compression.  But if your
+application compresses large files as a single compressed stream, similarly to
+the `gzip` program, then libdeflate isn't for you.
+
+Note that with chunk-based compression, you generally should have the
+uncompressed size of each chunk stored outside of the compressed data itself.
+This enables you to allocate an output buffer of the correct size without
+guessing.  However, libdeflate's decompression routines do optionally provide
+the actual number of output bytes in case you need it.
+
+Windows developers: note that the calling convention of libdeflate.dll is
+"stdcall" -- the same as the Win32 API.  If you call into libdeflate.dll using a
+non-C/C++ language, or dynamically using LoadLibrary(), make sure to use the
+stdcall convention.  Using the wrong convention may crash your application.
+(Note: older versions of libdeflate used the "cdecl" convention instead.)
+
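As a concrete illustration of the API described above (not part of this commit): a minimal whole-buffer gzip round trip. It assumes the `libdeflate_alloc_compressor()`, `libdeflate_gzip_compress_bound()`, `libdeflate_gzip_compress()`, `libdeflate_alloc_decompressor()` and `libdeflate_gzip_decompress()` declarations from the bundled `libdeflate.h`, and keeps error handling to a bare minimum:

    /* Illustrative sketch only; assumes the libdeflate.h bundled in this patch. */
    #include <stdio.h>
    #include <stdlib.h>
    #include "libdeflate.h"

    int main(void)
    {
        const char in[] = "example payload, typically one chunk well under 1 MB";
        size_t in_size = sizeof(in);

        struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
        size_t bound = libdeflate_gzip_compress_bound(c, in_size);
        void *comp = malloc(bound);
        size_t comp_size = libdeflate_gzip_compress(c, in, in_size, comp, bound);
        libdeflate_free_compressor(c);
        if (comp_size == 0)
            return 1;  /* output buffer too small (cannot happen with the bound) */

        /* The uncompressed size is stored out of band by the caller, so the
         * output buffer can be allocated exactly, as recommended above. */
        struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
        void *out = malloc(in_size);
        size_t actual;
        enum libdeflate_result r =
            libdeflate_gzip_decompress(d, comp, comp_size, out, in_size, &actual);
        libdeflate_free_decompressor(d);

        printf("%zu -> %zu -> %zu bytes, ok=%d\n",
               in_size, comp_size, actual, r == LIBDEFLATE_SUCCESS);
        free(comp);
        free(out);
        return r == LIBDEFLATE_SUCCESS ? 0 : 1;
    }
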
+# Bindings for other programming languages
+
+The libdeflate project itself only provides a C library.  If you need to use
+libdeflate from a programming language other than C or C++, consider using the
+following bindings:
+
+* C#: [LibDeflate.NET](https://github.com/jzebedee/LibDeflate.NET)
+* Go: [go-libdeflate](https://github.com/4kills/go-libdeflate)
+* Java: [libdeflate-java](https://github.com/astei/libdeflate-java)
+* Julia: [LibDeflate.jl](https://github.com/jakobnissen/LibDeflate.jl)
+* Python: [deflate](https://github.com/dcwatson/deflate)
+* Ruby: [libdeflate-ruby](https://github.com/kaorimatz/libdeflate-ruby)
+* Rust: [libdeflater](https://github.com/adamkewley/libdeflater)
+
+Note: these are third-party projects which haven't necessarily been vetted by
+the authors of libdeflate.  Please direct all questions, bugs, and improvements
+for these bindings to their authors.
+
+# DEFLATE vs. zlib vs. gzip
+
+The DEFLATE format ([rfc1951](https://www.ietf.org/rfc/rfc1951.txt)), the zlib
+format ([rfc1950](https://www.ietf.org/rfc/rfc1950.txt)), and the gzip format
+([rfc1952](https://www.ietf.org/rfc/rfc1952.txt)) are commonly confused with
+each other as well as with the [zlib software library](http://zlib.net), which
+actually supports all three formats.  libdeflate (this library) also supports
+all three formats.
+
+Briefly, DEFLATE is a raw compressed stream, whereas zlib and gzip are different
+wrappers for this stream.  Both zlib and gzip include checksums, but gzip can
+include extra information such as the original filename.  Generally, you should
+choose a format as follows:
+
+- If you are compressing whole files with no subdivisions, similar to the `gzip`
+  program, you probably should use the gzip format.
+- Otherwise, if you don't need the features of the gzip header and footer but do
+  still want a checksum for corruption detection, you probably should use the
+  zlib format.
+- Otherwise, you probably should use raw DEFLATE.  This is ideal if you don't
+  need checksums, e.g. because they're simply not needed for your use case or
+  because you already compute your own checksums that are stored separately from
+  the compressed stream.
+
+Note that gzip and zlib streams can be distinguished from each other based on
+their starting bytes, but this is not necessarily true of raw DEFLATE streams.
+
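For illustration (not part of this commit), a sketch of distinguishing the two wrapped formats by their leading bytes, per RFC 1952 (gzip magic `0x1F 0x8B`) and RFC 1950 (zlib `CMF`/`FLG` check); raw DEFLATE has no magic number, so it can only be assumed by elimination:

    /* Illustrative sketch only. */
    #include <stddef.h>

    enum fmt { FMT_GZIP, FMT_ZLIB, FMT_MAYBE_RAW_DEFLATE, FMT_UNKNOWN };

    static enum fmt sniff(const unsigned char *p, size_t n)
    {
        if (n >= 2 && p[0] == 0x1F && p[1] == 0x8B)
            return FMT_GZIP;                       /* gzip magic bytes */
        if (n >= 2 && (p[0] & 0x0F) == 8 &&        /* CM == 8 (deflate) */
            ((p[0] << 8 | p[1]) % 31) == 0)        /* zlib FCHECK passes */
            return FMT_ZLIB;
        return n ? FMT_MAYBE_RAW_DEFLATE : FMT_UNKNOWN;
    }
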
+# Compression levels
+
+An often-underappreciated fact of compression formats such as DEFLATE is that
+there are an enormous number of different ways that a given input could be
+compressed.  Different algorithms and different amounts of computation time will
+result in different compression ratios, while remaining equally compatible with
+the decompressor.
+
+For this reason, the commonly used zlib library provides nine compression
+levels.  Level 1 is the fastest but provides the worst compression; level 9
+provides the best compression but is the slowest.  It defaults to level 6.
+libdeflate uses this same design but is designed to improve on both zlib's
+performance *and* compression ratio at every compression level.  In addition,
+libdeflate's levels go [up to 12](https://xkcd.com/670/) to make room for a
+minimum-cost-path based algorithm (sometimes called "optimal parsing") that can
+significantly improve on zlib's compression ratio.
+
+If you are using DEFLATE (or zlib, or gzip) in your application, you should test
+different levels to see which works best for your application.
+
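As an illustration of such a test (not part of this commit), a sketch that sweeps the levels described above using the bundled API; it assumes `libdeflate_deflate_compress_bound()` and `libdeflate_deflate_compress()` from `libdeflate.h`:

    /* Illustrative sketch only; prints the compressed size at each level. */
    #include <stdio.h>
    #include <stdlib.h>
    #include "libdeflate.h"

    static void sweep_levels(const void *in, size_t in_size)
    {
        for (int level = 1; level <= 12; level++) {
            struct libdeflate_compressor *c = libdeflate_alloc_compressor(level);
            size_t bound = libdeflate_deflate_compress_bound(c, in_size);
            void *out = malloc(bound);
            size_t out_size = libdeflate_deflate_compress(c, in, in_size, out, bound);
            printf("level %2d: %zu -> %zu bytes\n", level, in_size, out_size);
            free(out);
            libdeflate_free_compressor(c);
        }
    }
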
+# Motivation
+
+Despite DEFLATE's widespread use mainly through the zlib library, in the
+compression community this format from the early 1990s is often considered
+obsolete.  And in a few significant ways, it is.
+
+So why implement DEFLATE at all, instead of focusing entirely on
+bzip2/LZMA/xz/LZ4/LZX/ZSTD/Brotli/LZHAM/LZFSE/[insert cool new format here]?
+
+To do something better, you need to understand what came before.  And it turns
+out that most ideas from DEFLATE are still relevant.  Many of the newer formats
+share a structure similar to DEFLATE's, with different tweaks.  The effects of
+trivial but very useful tweaks, such as increasing the sliding window size, are
+often confused with the effects of nontrivial but less useful tweaks.  And
+actually, many of these formats are similar enough that common algorithms and
+optimizations (e.g. those dealing with LZ77 matchfinding) can be reused.
+
+In addition, comparing compressors fairly is difficult because the performance
+of a compressor depends heavily on optimizations which are not intrinsic to the
+compression format itself.  In this respect, the zlib library sometimes compares
+poorly to certain newer code because zlib is not well optimized for modern
+processors.  libdeflate addresses this by providing an optimized DEFLATE
+implementation which can be used for benchmarking purposes.  And, of course,
+real applications can use it as well.
+
+# License
+
+libdeflate is [MIT-licensed](COPYING).
+
+I am not aware of any patents or patent applications relevant to libdeflate.
diff --git a/src/3rdparty/libdeflate/common/common_defs.h b/src/3rdparty/libdeflate/common/common_defs.h
new file mode 100644 (file)
index 0000000..d56c5cf
--- /dev/null
@@ -0,0 +1,334 @@
+/*
+ * common_defs.h
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef COMMON_COMMON_DEFS_H
+#define COMMON_COMMON_DEFS_H
+
+#ifdef __GNUC__
+#  include "compiler_gcc.h"
+#elif defined(_MSC_VER)
+#  include "compiler_msc.h"
+#else
+#  pragma message("Unrecognized compiler.  Please add a header file for your compiler.  Compilation will proceed, but performance may suffer!")
+#endif
+
+/* ========================================================================== */
+/*                              Type definitions                              */
+/* ========================================================================== */
+
+#include <stddef.h> /* size_t */
+
+#ifndef __bool_true_false_are_defined
+#  include <stdbool.h> /* bool */
+#endif
+
+/* Fixed-width integer types */
+#include <stdint.h>
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef int64_t s64;
+
+/*
+ * Word type of the target architecture.  Use 'size_t' instead of 'unsigned
+ * long' to account for platforms such as Windows that use 32-bit 'unsigned
+ * long' on 64-bit architectures.
+ */
+typedef size_t machine_word_t;
+
+/* Number of bytes in a word */
+#define WORDBYTES      ((int)sizeof(machine_word_t))
+
+/* Number of bits in a word */
+#define WORDBITS       (8 * WORDBYTES)
+
+/* ========================================================================== */
+/*                         Optional compiler features                         */
+/* ========================================================================== */
+
+/* LIBEXPORT - export a function from a shared library */
+#ifndef LIBEXPORT
+#  define LIBEXPORT
+#endif
+
+/* inline - suggest that a function be inlined */
+#ifndef inline
+#  define inline
+#endif
+
+/* forceinline - force a function to be inlined, if possible */
+#ifndef forceinline
+#  define forceinline inline
+#endif
+
+/* restrict - annotate a non-aliased pointer */
+#ifndef restrict
+#  define restrict
+#endif
+
+/* likely(expr) - hint that an expression is usually true */
+#ifndef likely
+#  define likely(expr)         (expr)
+#endif
+
+/* unlikely(expr) - hint that an expression is usually false */
+#ifndef unlikely
+#  define unlikely(expr)       (expr)
+#endif
+
+/* prefetchr(addr) - prefetch into L1 cache for read */
+#ifndef prefetchr
+#  define prefetchr(addr)
+#endif
+
+/* prefetchw(addr) - prefetch into L1 cache for write */
+#ifndef prefetchw
+#  define prefetchw(addr)
+#endif
+
+/* Does the compiler support the 'target' function attribute? */
+#ifndef COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
+#  define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0
+#endif
+
+/* Which targets are supported with the 'target' function attribute? */
+#ifndef COMPILER_SUPPORTS_BMI2_TARGET
+#  define COMPILER_SUPPORTS_BMI2_TARGET 0
+#endif
+#ifndef COMPILER_SUPPORTS_AVX_TARGET
+#  define COMPILER_SUPPORTS_AVX_TARGET 0
+#endif
+#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET
+#  define COMPILER_SUPPORTS_AVX512BW_TARGET 0
+#endif
+
+/*
+ * Which targets are supported with the 'target' function attribute and have
+ * intrinsics that work within 'target'-ed functions?
+ */
+#ifndef COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS
+#  define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 0
+#endif
+#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS
+#  define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS 0
+#endif
+#ifndef COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS
+#  define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS 0
+#endif
+#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS
+#  define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS 0
+#endif
+#ifndef COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS
+#  define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS 0
+#endif
+#ifndef COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS
+#  define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS 0
+#endif
+#ifndef COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS
+#  define COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS 0
+#endif
+
+/* _aligned_attribute(n) - declare that the annotated variable, or variables of
+ * the annotated type, are to be aligned on n-byte boundaries */
+#ifndef _aligned_attribute
+#endif
+
+/* ========================================================================== */
+/*                          Miscellaneous macros                              */
+/* ========================================================================== */
+
+#define ARRAY_LEN(A)           (sizeof(A) / sizeof((A)[0]))
+#define MIN(a, b)              ((a) <= (b) ? (a) : (b))
+#define MAX(a, b)              ((a) >= (b) ? (a) : (b))
+#define DIV_ROUND_UP(n, d)     (((n) + (d) - 1) / (d))
+#define STATIC_ASSERT(expr)    ((void)sizeof(char[1 - 2 * !(expr)]))
+#define ALIGN(n, a)            (((n) + (a) - 1) & ~((a) - 1))
+
+/* ========================================================================== */
+/*                           Endianness handling                              */
+/* ========================================================================== */
+
+/*
+ * CPU_IS_LITTLE_ENDIAN() - a macro which evaluates to 1 if the CPU is little
+ * endian or 0 if it is big endian.  The macro should be defined in a way such
+ * that the compiler can evaluate it at compilation time.  If not defined, a
+ * fallback is used.
+ */
+#ifndef CPU_IS_LITTLE_ENDIAN
+static forceinline int CPU_IS_LITTLE_ENDIAN(void)
+{
+       union {
+               unsigned int v;
+               unsigned char b;
+       } u;
+       u.v = 1;
+       return u.b;
+}
+#endif
+
+/* bswap16(n) - swap the bytes of a 16-bit integer */
+#ifndef bswap16
+static forceinline u16 bswap16(u16 n)
+{
+       return (n << 8) | (n >> 8);
+}
+#endif
+
+/* bswap32(n) - swap the bytes of a 32-bit integer */
+#ifndef bswap32
+static forceinline u32 bswap32(u32 n)
+{
+       return ((n & 0x000000FF) << 24) |
+              ((n & 0x0000FF00) << 8) |
+              ((n & 0x00FF0000) >> 8) |
+              ((n & 0xFF000000) >> 24);
+}
+#endif
+
+/* bswap64(n) - swap the bytes of a 64-bit integer */
+#ifndef bswap64
+static forceinline u64 bswap64(u64 n)
+{
+       return ((n & 0x00000000000000FF) << 56) |
+              ((n & 0x000000000000FF00) << 40) |
+              ((n & 0x0000000000FF0000) << 24) |
+              ((n & 0x00000000FF000000) << 8) |
+              ((n & 0x000000FF00000000) >> 8) |
+              ((n & 0x0000FF0000000000) >> 24) |
+              ((n & 0x00FF000000000000) >> 40) |
+              ((n & 0xFF00000000000000) >> 56);
+}
+#endif
+
+#define le16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap16(n))
+#define le32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap32(n))
+#define le64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap64(n))
+#define be16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap16(n) : (n))
+#define be32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap32(n) : (n))
+#define be64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap64(n) : (n))
+
+/* ========================================================================== */
+/*                          Unaligned memory accesses                         */
+/* ========================================================================== */
+
+/*
+ * UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses
+ * can be performed efficiently on the target platform.
+ */
+#ifndef UNALIGNED_ACCESS_IS_FAST
+#  define UNALIGNED_ACCESS_IS_FAST 0
+#endif
+
+/* ========================================================================== */
+/*                             Bit scan functions                             */
+/* ========================================================================== */
+
+/*
+ * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
+ * significant end) of the *most* significant 1 bit in the input value.  The
+ * input value must be nonzero!
+ */
+
+#ifndef bsr32
+static forceinline unsigned
+bsr32(u32 n)
+{
+       unsigned i = 0;
+       while ((n >>= 1) != 0)
+               i++;
+       return i;
+}
+#endif
+
+#ifndef bsr64
+static forceinline unsigned
+bsr64(u64 n)
+{
+       unsigned i = 0;
+       while ((n >>= 1) != 0)
+               i++;
+       return i;
+}
+#endif
+
+static forceinline unsigned
+bsrw(machine_word_t n)
+{
+       STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+       if (WORDBITS == 32)
+               return bsr32(n);
+       else
+               return bsr64(n);
+}
+
+/*
+ * Bit Scan Forward (BSF) - find the 0-based index (relative to the least
+ * significant end) of the *least* significant 1 bit in the input value.  The
+ * input value must be nonzero!
+ */
+
+#ifndef bsf32
+static forceinline unsigned
+bsf32(u32 n)
+{
+       unsigned i = 0;
+       while ((n & 1) == 0) {
+               i++;
+               n >>= 1;
+       }
+       return i;
+}
+#endif
+
+#ifndef bsf64
+static forceinline unsigned
+bsf64(u64 n)
+{
+       unsigned i = 0;
+       while ((n & 1) == 0) {
+               i++;
+               n >>= 1;
+       }
+       return i;
+}
+#endif
+
+static forceinline unsigned
+bsfw(machine_word_t n)
+{
+       STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+       if (WORDBITS == 32)
+               return bsf32(n);
+       else
+               return bsf64(n);
+}
+
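For illustration (not part of this commit), the bit-scan semantics described above expressed with the GCC builtins that compiler_gcc.h maps `bsr32`/`bsf32` to:

    /* Illustrative sketch only: bsr finds the index of the highest set bit,
     * bsf the lowest; the input must be nonzero. */
    #include <assert.h>

    int main(void)
    {
        unsigned n = 0x00001400;              /* bits 10 and 12 set */
        assert(31 - __builtin_clz(n) == 12);  /* bsr32(n) */
        assert(__builtin_ctz(n) == 10);       /* bsf32(n) */
        return 0;
    }
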
+#endif /* COMMON_COMMON_DEFS_H */
diff --git a/src/3rdparty/libdeflate/common/compiler_gcc.h b/src/3rdparty/libdeflate/common/compiler_gcc.h
new file mode 100644 (file)
index 0000000..2a45b05
--- /dev/null
@@ -0,0 +1,201 @@
+/*
+ * compiler_gcc.h - definitions for the GNU C Compiler.  This also handles clang
+ * and the Intel C Compiler (icc).
+ *
+ * TODO: icc is not well tested, so some things are currently disabled even
+ * though they maybe can be enabled on some icc versions.
+ */
+
+#if !defined(__clang__) && !defined(__INTEL_COMPILER)
+#  define GCC_PREREQ(major, minor)             \
+       (__GNUC__ > (major) ||                  \
+        (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#else
+#  define GCC_PREREQ(major, minor)     0
+#endif
+
+/* Note: only check the clang version when absolutely necessary!
+ * "Vendors" such as Apple can use different version numbers. */
+#ifdef __clang__
+#  ifdef __apple_build_version__
+#    define CLANG_PREREQ(major, minor, apple_version)  \
+       (__apple_build_version__ >= (apple_version))
+#  else
+#    define CLANG_PREREQ(major, minor, apple_version)  \
+       (__clang_major__ > (major) ||                   \
+        (__clang_major__ == (major) && __clang_minor__ >= (minor)))
+#  endif
+#else
+#  define CLANG_PREREQ(major, minor, apple_version)    0
+#endif
+
+#ifndef __has_attribute
+#  define __has_attribute(attribute)   0
+#endif
+#ifndef __has_feature
+#  define __has_feature(feature)       0
+#endif
+#ifndef __has_builtin
+#  define __has_builtin(builtin)       0
+#endif
+
+#ifdef _WIN32
+#  define LIBEXPORT __declspec(dllexport)
+#else
+#  define LIBEXPORT __attribute__((visibility("default")))
+#endif
+
+#define inline                 inline
+#define forceinline            inline __attribute__((always_inline))
+#define restrict               __restrict__
+#define likely(expr)           __builtin_expect(!!(expr), 1)
+#define unlikely(expr)         __builtin_expect(!!(expr), 0)
+#define prefetchr(addr)                __builtin_prefetch((addr), 0)
+#define prefetchw(addr)                __builtin_prefetch((addr), 1)
+#define _aligned_attribute(n)  __attribute__((aligned(n)))
+
+#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE    \
+       (GCC_PREREQ(4, 4) || __has_attribute(target))
+
+#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
+
+#  if defined(__i386__) || defined(__x86_64__)
+
+#    define COMPILER_SUPPORTS_PCLMUL_TARGET    \
+       (GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128))
+
+#    define COMPILER_SUPPORTS_AVX_TARGET       \
+       (GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256))
+
+#    define COMPILER_SUPPORTS_BMI2_TARGET      \
+       (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))
+
+#    define COMPILER_SUPPORTS_AVX2_TARGET      \
+       (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_psadbw256))
+
+#    define COMPILER_SUPPORTS_AVX512BW_TARGET  \
+       (GCC_PREREQ(5, 1) || __has_builtin(__builtin_ia32_psadbw512))
+
+       /*
+        * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics
+        * not available in the main target could not be used in 'target'
+        * attribute functions.  Unfortunately clang has no feature test macro
+        * for this so we have to check its version.
+        */
+#    if GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000)
+#      define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 1
+#      define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS       \
+               COMPILER_SUPPORTS_PCLMUL_TARGET
+#      define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS \
+               COMPILER_SUPPORTS_AVX2_TARGET
+#      define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS     \
+               COMPILER_SUPPORTS_AVX512BW_TARGET
+#    endif
+
+#  elif defined(__arm__) || defined(__aarch64__)
+
+    /*
+     * Determine whether NEON and crypto intrinsics are supported.
+     *
+     * With gcc prior to 6.1, (r230411 for arm32, r226563 for arm64), neither
+     * was available unless enabled in the main target.
+     *
+     * But even after that, to include <arm_neon.h> (which contains both the
+     * basic NEON intrinsics and the crypto intrinsics) the main target still
+     * needs to have:
+     *   - gcc: hardware floating point support
+     *   - clang: NEON support (but not necessarily crypto support)
+     */
+#    if (GCC_PREREQ(6, 1) && defined(__ARM_FP)) || \
+        (defined(__clang__) && defined(__ARM_NEON))
+#      define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS 1
+       /*
+        * The crypto intrinsics are broken on arm32 with clang, even when using
+        * -mfpu=crypto-neon-fp-armv8, because clang's <arm_neon.h> puts them
+        * behind __aarch64__.  Undefine __ARM_FEATURE_CRYPTO in that case...
+        */
+#      if defined(__clang__) && defined(__arm__)
+#        undef __ARM_FEATURE_CRYPTO
+#      elif __has_builtin(__builtin_neon_vmull_p64) || !defined(__clang__)
+#        define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS 1
+#      endif
+#    endif
+
+     /*
+      * Determine whether CRC32 intrinsics are supported.
+      *
+      * With gcc r274827 or later (gcc 10.1+, 9.3+, or 8.4+), or with clang,
+      * they work as expected.  (Well, not quite.  There's still a bug, but we
+      * have to work around it later when including arm_acle.h.)
+      */
+#    if GCC_PREREQ(10, 1) || \
+        (GCC_PREREQ(9, 3) && !GCC_PREREQ(10, 0)) || \
+        (GCC_PREREQ(8, 4) && !GCC_PREREQ(9, 0)) || \
+        (defined(__clang__) && __has_builtin(__builtin_arm_crc32b))
+#      define COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS 1
+#    endif
+
+#  endif /* __arm__ || __aarch64__ */
+
+#endif /* COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE */
+
+/*
+ * Prior to gcc 5.1 and clang 3.9, emmintrin.h only defined vectors of signed
+ * integers (e.g. __v4si), not vectors of unsigned integers (e.g.  __v4su).  But
+ * we need the unsigned ones in order to avoid signed integer overflow, which is
+ * undefined behavior.  Add the missing definitions for the unsigned ones if
+ * needed.
+ */
+#if (GCC_PREREQ(4, 0) && !GCC_PREREQ(5, 1)) || \
+    (defined(__clang__) && !CLANG_PREREQ(3, 9, 8020000)) || \
+    defined(__INTEL_COMPILER)
+typedef unsigned long long  __v2du __attribute__((__vector_size__(16)));
+typedef unsigned int        __v4su __attribute__((__vector_size__(16)));
+typedef unsigned short      __v8hu __attribute__((__vector_size__(16)));
+typedef unsigned char      __v16qu __attribute__((__vector_size__(16)));
+typedef unsigned long long  __v4du __attribute__((__vector_size__(32)));
+typedef unsigned int        __v8su __attribute__((__vector_size__(32)));
+typedef unsigned short     __v16hu __attribute__((__vector_size__(32)));
+typedef unsigned char      __v32qu __attribute__((__vector_size__(32)));
+#endif
+
+#ifdef __INTEL_COMPILER
+typedef int   __v16si __attribute__((__vector_size__(64)));
+typedef short __v32hi __attribute__((__vector_size__(64)));
+typedef char  __v64qi __attribute__((__vector_size__(64)));
+#endif
+
+/* Newer gcc supports __BYTE_ORDER__.  Older gcc doesn't. */
+#ifdef __BYTE_ORDER__
+#  define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#endif
+
+#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#  define bswap16      __builtin_bswap16
+#endif
+
+#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+#  define bswap32      __builtin_bswap32
+#endif
+
+#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#  define bswap64      __builtin_bswap64
+#endif
+
+#if defined(__x86_64__) || defined(__i386__) || \
+    defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \
+    /*
+     * For all compilation purposes, WebAssembly behaves like any other CPU
+     * instruction set. Even though WebAssembly engine might be running on top
+     * of different actual CPU architectures, the WebAssembly spec itself
+     * permits unaligned access and it will be fast on most of those platforms,
+     * and simulated at the engine level on others, so it's worth treating it
+     * as a CPU architecture with fast unaligned access.
+    */ defined(__wasm__)
+#  define UNALIGNED_ACCESS_IS_FAST 1
+#endif
+
+#define bsr32(n)       (31 - __builtin_clz(n))
+#define bsr64(n)       (63 - __builtin_clzll(n))
+#define bsf32(n)       __builtin_ctz(n)
+#define bsf64(n)       __builtin_ctzll(n)
diff --git a/src/3rdparty/libdeflate/common/compiler_msc.h b/src/3rdparty/libdeflate/common/compiler_msc.h
new file mode 100644 (file)
index 0000000..18cfa12
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ * compiler_msc.h - definitions for the Microsoft C Compiler
+ */
+
+#include <stdint.h>
+#include <stdlib.h> /* for _byteswap_*() */
+
+#define LIBEXPORT      __declspec(dllexport)
+
+/*
+ * Old versions (e.g. VS2010) of MSC don't have the C99 header stdbool.h.
+ * Beware: the below replacement isn't fully standard, since normally any value
+ * != 0 should be implicitly cast to a bool with value 1... but that doesn't
+ * happen if bool is really just an 'int'.
+ */
+typedef int bool;
+#define true 1
+#define false 0
+#define __bool_true_false_are_defined 1
+
+/* Define ssize_t */
+#ifdef _WIN64
+typedef long long ssize_t;
+#else
+typedef int ssize_t;
+#endif
+
+/* Assume a little endian architecture with fast unaligned access */
+#define CPU_IS_LITTLE_ENDIAN()         1
+#define UNALIGNED_ACCESS_IS_FAST       1
+
+/* __restrict has nonstandard behavior; don't use it */
+#define restrict
+
+/* ... but we can use __inline and __forceinline */
+#define inline         __inline
+#define forceinline    __forceinline
+
+/* Byte swap functions */
+#define bswap16        _byteswap_ushort
+#define bswap32        _byteswap_ulong
+#define bswap64        _byteswap_uint64
+
+/* Bit scan functions (32-bit) */
+
+static forceinline unsigned
+bsr32(uint32_t n)
+{
+       _BitScanReverse(&n, n);
+       return n;
+}
+#define bsr32 bsr32
+
+static forceinline unsigned
+bsf32(uint32_t n)
+{
+       _BitScanForward(&n, n);
+       return n;
+}
+#define bsf32 bsf32
+
+#ifdef _M_X64 /* Bit scan functions (64-bit) */
+
+static forceinline unsigned
+bsr64(uint64_t n)
+{
+       _BitScanReverse64(&n, n);
+       return n;
+}
+#define bsr64 bsr64
+
+static forceinline unsigned
+bsf64(uint64_t n)
+{
+       _BitScanForward64(&n, n);
+       return n;
+}
+#define bsf64 bsf64
+
+#endif /* _M_X64 */
diff --git a/src/3rdparty/libdeflate/lib/adler32.c b/src/3rdparty/libdeflate/lib/adler32.c
new file mode 100644 (file)
index 0000000..32ab0ce
--- /dev/null
@@ -0,0 +1,130 @@
+/*
+ * adler32.c - Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "lib_common.h"
+#include "libdeflate.h"
+
+/* The Adler-32 divisor, or "base", value. */
+#define DIVISOR 65521
+
+/*
+ * MAX_CHUNK_SIZE is the most bytes that can be processed without the
+ * possibility of s2 overflowing when it is represented as an unsigned 32-bit
+ * integer.  This value was computed using the following Python script:
+ *
+ *     divisor = 65521
+ *     count = 0
+ *     s1 = divisor - 1
+ *     s2 = divisor - 1
+ *     while True:
+ *             s1 += 0xFF
+ *             s2 += s1
+ *             if s2 > 0xFFFFFFFF:
+ *                     break
+ *             count += 1
+ *     print(count)
+ *
+ * Note that to get the correct worst-case value, we must assume that every byte
+ * has value 0xFF and that s1 and s2 started with the highest possible values
+ * modulo the divisor.
+ */
+#define MAX_CHUNK_SIZE 5552
+
+typedef u32 (*adler32_func_t)(u32, const u8 *, size_t);
+
+/* Include architecture-specific implementations if available */
+#undef DEFAULT_IMPL
+#undef DISPATCH
+#if defined(__arm__) || defined(__aarch64__)
+#  include "arm/adler32_impl.h"
+#elif defined(__i386__) || defined(__x86_64__)
+#  include "x86/adler32_impl.h"
+#endif
+
+/* Define a generic implementation if needed */
+#ifndef DEFAULT_IMPL
+#define DEFAULT_IMPL adler32_generic
+static u32 adler32_generic(u32 adler, const u8 *p, size_t size)
+{
+       u32 s1 = adler & 0xFFFF;
+       u32 s2 = adler >> 16;
+       const u8 * const end = p + size;
+
+       while (p != end) {
+               size_t chunk_size = MIN(end - p, MAX_CHUNK_SIZE);
+               const u8 *chunk_end = p + chunk_size;
+               size_t num_unrolled_iterations = chunk_size / 4;
+
+               while (num_unrolled_iterations--) {
+                       s1 += *p++;
+                       s2 += s1;
+                       s1 += *p++;
+                       s2 += s1;
+                       s1 += *p++;
+                       s2 += s1;
+                       s1 += *p++;
+                       s2 += s1;
+               }
+               while (p != chunk_end) {
+                       s1 += *p++;
+                       s2 += s1;
+               }
+               s1 %= DIVISOR;
+               s2 %= DIVISOR;
+       }
+
+       return (s2 << 16) | s1;
+}
+#endif /* !DEFAULT_IMPL */
+
+#ifdef DISPATCH
+static u32 dispatch(u32, const u8 *, size_t);
+
+static volatile adler32_func_t adler32_impl = dispatch;
+
+/* Choose the fastest implementation at runtime */
+static u32 dispatch(u32 adler, const u8 *buffer, size_t size)
+{
+       adler32_func_t f = arch_select_adler32_func();
+
+       if (f == NULL)
+               f = DEFAULT_IMPL;
+
+       adler32_impl = f;
+       return adler32_impl(adler, buffer, size);
+}
+#else
+#  define adler32_impl DEFAULT_IMPL /* only one implementation, use it */
+#endif
+
+LIBDEFLATEEXPORT u32 LIBDEFLATEAPI
+libdeflate_adler32(u32 adler, const void *buffer, size_t size)
+{
+       if (buffer == NULL) /* return initial value */
+               return 1;
+       return adler32_impl(adler, buffer, size);
+}
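For illustration (not part of this commit), the scalar Adler-32 recurrence that `adler32_generic()` above unrolls, applied to the string "abc"; the expected value 0x024D0127 follows directly from the definition (s1 = 1+97+98+99 = 0x127, s2 = 98+196+295 = 0x24D):

    /* Illustrative sketch only: straightforward reference implementation. */
    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    static uint32_t adler32_ref(const unsigned char *p, size_t n)
    {
        uint32_t s1 = 1, s2 = 0;
        for (size_t i = 0; i < n; i++) {
            s1 = (s1 + p[i]) % 65521;
            s2 = (s2 + s1) % 65521;
        }
        return (s2 << 16) | s1;
    }

    int main(void)
    {
        assert(adler32_ref((const unsigned char *)"abc", 3) == 0x024D0127);
        return 0;
    }
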
diff --git a/src/3rdparty/libdeflate/lib/adler32_vec_template.h b/src/3rdparty/libdeflate/lib/adler32_vec_template.h
new file mode 100644 (file)
index 0000000..4eb8c2a
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * adler32_vec_template.h - template for vectorized Adler-32 implementations
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This file contains a template for vectorized Adler-32 implementations.
+ *
+ * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
+ * implementation looks something like this:
+ *
+ *     do {
+ *             s1 += *p;
+ *             s2 += s1;
+ *     } while (++p != chunk_end);
+ *
+ * For vectorized calculation of s1, we only need to sum the input bytes.  They
+ * can be accumulated into multiple counters which are eventually summed
+ * together.
+ *
+ * For vectorized calculation of s2, the basic idea is that for each iteration
+ * that processes N bytes, we can perform the following vectorizable
+ * calculation:
+ *
+ *     s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
+ *
+ * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
+ * separate counters, then do the multiplications by N...1 just once at the end
+ * rather than once per iteration.
+ *
+ * Also, we must account for how previous bytes will affect s2 by doing the
+ * following at beginning of each iteration:
+ *
+ *     s2 += s1 * N
+ *
+ * Furthermore, like s1, "s2" can actually be multiple counters which are
+ * eventually summed together.
+ */
+
+static u32 ATTRIBUTES
+FUNCNAME(u32 adler, const u8 *p, size_t size)
+{
+       u32 s1 = adler & 0xFFFF;
+       u32 s2 = adler >> 16;
+       const u8 * const end = p + size;
+       const u8 *vend;
+       const size_t max_chunk_size =
+               MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) -
+               (MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) %
+                IMPL_SEGMENT_SIZE);
+
+       /* Process a byte at a time until the needed alignment is reached */
+       if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) {
+               do {
+                       s1 += *p++;
+                       s2 += s1;
+               } while (p != end && (uintptr_t)p % IMPL_ALIGNMENT);
+               s1 %= DIVISOR;
+               s2 %= DIVISOR;
+       }
+
+       /*
+        * Process "chunks" of bytes using vector instructions.  Chunk sizes are
+        * limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2 never
+        * overflow before being reduced modulo DIVISOR.  For vector processing,
+        * chunk sizes are also made evenly divisible by IMPL_SEGMENT_SIZE and
+        * may be further limited to IMPL_MAX_CHUNK_SIZE.
+        */
+       STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0);
+       vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE);
+       while (p != vend) {
+               size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size);
+
+               s2 += s1 * chunk_size;
+
+               FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_size),
+                              &s1, &s2);
+
+               p += chunk_size;
+               s1 %= DIVISOR;
+               s2 %= DIVISOR;
+       }
+
+       /* Process any remaining bytes */
+       if (p != end) {
+               do {
+                       s1 += *p++;
+                       s2 += s1;
+               } while (p != end);
+               s1 %= DIVISOR;
+               s2 %= DIVISOR;
+       }
+
+       return (s2 << 16) | s1;
+}
+
+#undef FUNCNAME
+#undef FUNCNAME_CHUNK
+#undef ATTRIBUTES
+#undef IMPL_ALIGNMENT
+#undef IMPL_SEGMENT_SIZE
+#undef IMPL_MAX_CHUNK_SIZE
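For illustration (not part of this commit), a standalone check of the per-chunk identity described in the template's comment: adding `N * s1` up front and then summing the bytes with weights N..1 gives the same (s1, s2) as the byte-at-a-time loop.

    /* Illustrative sketch only, for one chunk of N = 8 bytes.  Vectorized
     * implementations compute the closed form; the scalar loop is the
     * reference. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint8_t b[8] = { 3, 1, 4, 1, 5, 9, 2, 6 };
        const unsigned N = 8;
        uint32_t s1 = 100, s2 = 200;            /* arbitrary starting state */

        /* Scalar: byte at a time, as in the unvectorized inner loop. */
        uint32_t s1_ref = s1, s2_ref = s2;
        for (unsigned i = 0; i < N; i++) {
            s1_ref += b[i];
            s2_ref += s1_ref;
        }

        /* Closed form: s2 += N*s1 + N*b[0] + (N-1)*b[1] + ... + 1*b[N-1]. */
        uint32_t s1_vec = s1, s2_vec = s2 + N * s1;
        for (unsigned i = 0; i < N; i++) {
            s1_vec += b[i];
            s2_vec += (N - i) * b[i];
        }

        assert(s1_vec == s1_ref && s2_vec == s2_ref);
        return 0;
    }
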
diff --git a/src/3rdparty/libdeflate/lib/arm/adler32_impl.h b/src/3rdparty/libdeflate/lib/arm/adler32_impl.h
new file mode 100644 (file)
index 0000000..17e56c0
--- /dev/null
@@ -0,0 +1,125 @@
+/*
+ * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_ARM_ADLER32_IMPL_H
+#define LIB_ARM_ADLER32_IMPL_H
+
+#include "cpu_features.h"
+
+/* NEON implementation */
+#undef DISPATCH_NEON
+#if !defined(DEFAULT_IMPL) &&  \
+       (defined(__ARM_NEON) || (ARM_CPU_FEATURES_ENABLED &&    \
+                                COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS))
+#  define FUNCNAME             adler32_neon
+#  define FUNCNAME_CHUNK       adler32_neon_chunk
+#  define IMPL_ALIGNMENT       16
+#  define IMPL_SEGMENT_SIZE    32
+/* Prevent unsigned overflow of the 16-bit precision byte counters */
+#  define IMPL_MAX_CHUNK_SIZE  (32 * (0xFFFF / 0xFF))
+#  ifdef __ARM_NEON
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL       adler32_neon
+#  else
+#    ifdef __arm__
+#      define ATTRIBUTES       __attribute__((target("fpu=neon")))
+#    else
+#      define ATTRIBUTES       __attribute__((target("+simd")))
+#    endif
+#    define DISPATCH           1
+#    define DISPATCH_NEON      1
+#  endif
+#  include <arm_neon.h>
+static forceinline ATTRIBUTES void
+adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end,
+                  u32 *s1, u32 *s2)
+{
+       uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
+       uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
+       uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+       uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+       uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+       uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+       do {
+               const uint8x16_t bytes1 = *p++;
+               const uint8x16_t bytes2 = *p++;
+               uint16x8_t tmp;
+
+               v_s2 += v_s1;
+
+               /* Vector Pairwise Add Long (u8 => u16) */
+               tmp = vpaddlq_u8(bytes1);
+
+               /* Vector Pairwise Add and Accumulate Long (u8 => u16) */
+               tmp = vpadalq_u8(tmp, bytes2);
+
+               /* Vector Pairwise Add and Accumulate Long (u16 => u32) */
+               v_s1 = vpadalq_u16(v_s1, tmp);
+
+               /* Vector Add Wide (u8 => u16) */
+               v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
+               v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
+               v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
+               v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
+
+       } while (p != end);
+
+       /* Vector Shift Left (u32) */
+       v_s2 = vqshlq_n_u32(v_s2, 5);
+
+       /* Vector Multiply Accumulate Long (u16 => u32) */
+       v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a),  (uint16x4_t) { 32, 31, 30, 29 });
+       v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
+       v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b),  (uint16x4_t) { 24, 23, 22, 21 });
+       v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
+       v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c),  (uint16x4_t) { 16, 15, 14, 13 });
+       v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10,  9 });
+       v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_d), (uint16x4_t) {  8,  7,  6,  5 });
+       v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) {  4,  3,  2,  1 });
+
+       *s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
+       *s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
+}
+#  include "../adler32_vec_template.h"
+#endif /* NEON implementation */
+
+#ifdef DISPATCH
+static inline adler32_func_t
+arch_select_adler32_func(void)
+{
+       u32 features = get_cpu_features();
+
+#ifdef DISPATCH_NEON
+       if (features & ARM_CPU_FEATURE_NEON)
+               return adler32_neon;
+#endif
+       return NULL;
+}
+#endif /* DISPATCH */
+
+#endif /* LIB_ARM_ADLER32_IMPL_H */
diff --git a/src/3rdparty/libdeflate/lib/arm/cpu_features.c b/src/3rdparty/libdeflate/lib/arm/cpu_features.c
new file mode 100644 (file)
index 0000000..60b1be3
--- /dev/null
@@ -0,0 +1,133 @@
+/*
+ * arm/cpu_features.c - feature detection for ARM processors
+ *
+ * Copyright 2018 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * ARM processors don't have a standard way for unprivileged programs to detect
+ * processor features.  However, on Linux we can read the AT_HWCAP and AT_HWCAP2
+ * values from /proc/self/auxv.
+ *
+ * Ideally we'd use the C library function getauxval(), but it's not guaranteed
+ * to be available: it was only added to glibc in 2.16, and in Android it was
+ * added to API level 18 for ARM and level 21 for AArch64.
+ */
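+
+/*
+ * For illustration only (a minimal sketch assuming a libc that does provide
+ * getauxval(), e.g. glibc >= 2.16; get_hwcaps() is a hypothetical helper):
+ *
+ *	#include <sys/auxv.h>
+ *
+ *	static void get_hwcaps(unsigned long *hwcap, unsigned long *hwcap2)
+ *	{
+ *		*hwcap = getauxval(AT_HWCAP);
+ *		*hwcap2 = getauxval(AT_HWCAP2);
+ *	}
+ *
+ * Because that guarantee is missing here, scan_auxv() below reads
+ * /proc/self/auxv directly instead.
+ */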
+
+#include "../cpu_features_common.h" /* must be included first */
+#include "cpu_features.h"
+
+#if ARM_CPU_FEATURES_ENABLED
+
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#define AT_HWCAP       16
+#define AT_HWCAP2      26
+
+volatile u32 _cpu_features = 0;
+
+static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2)
+{
+       int fd;
+       unsigned long auxbuf[32];
+       int filled = 0;
+       int i;
+
+       fd = open("/proc/self/auxv", O_RDONLY);
+       if (fd < 0)
+               return;
+
+       for (;;) {
+               do {
+                       int ret = read(fd, &((char *)auxbuf)[filled],
+                                      sizeof(auxbuf) - filled);
+                       if (ret <= 0) {
+                               if (ret < 0 && errno == EINTR)
+                                       continue;
+                               goto out;
+                       }
+                       filled += ret;
+               } while (filled < 2 * sizeof(long));
+
+               i = 0;
+               do {
+                       unsigned long type = auxbuf[i];
+                       unsigned long value = auxbuf[i + 1];
+
+                       if (type == AT_HWCAP)
+                               *hwcap = value;
+                       else if (type == AT_HWCAP2)
+                               *hwcap2 = value;
+                       i += 2;
+                       filled -= 2 * sizeof(long);
+               } while (filled >= 2 * sizeof(long));
+
+               memmove(auxbuf, &auxbuf[i], filled);
+       }
+out:
+       close(fd);
+}
+
+static const struct cpu_feature arm_cpu_feature_table[] = {
+       {ARM_CPU_FEATURE_NEON,          "neon"},
+       {ARM_CPU_FEATURE_PMULL,         "pmull"},
+       {ARM_CPU_FEATURE_CRC32,         "crc32"},
+};
+
+void setup_cpu_features(void)
+{
+       u32 features = 0;
+       unsigned long hwcap = 0;
+       unsigned long hwcap2 = 0;
+
+       scan_auxv(&hwcap, &hwcap2);
+
+#ifdef __arm__
+       STATIC_ASSERT(sizeof(long) == 4);
+       if (hwcap & (1 << 12))  /* HWCAP_NEON */
+               features |= ARM_CPU_FEATURE_NEON;
+       if (hwcap2 & (1 << 1))  /* HWCAP2_PMULL */
+               features |= ARM_CPU_FEATURE_PMULL;
+       if (hwcap2 & (1 << 4))  /* HWCAP2_CRC32 */
+               features |= ARM_CPU_FEATURE_CRC32;
+#else
+       STATIC_ASSERT(sizeof(long) == 8);
+       if (hwcap & (1 << 1))   /* HWCAP_ASIMD */
+               features |= ARM_CPU_FEATURE_NEON;
+       if (hwcap & (1 << 4))   /* HWCAP_PMULL */
+               features |= ARM_CPU_FEATURE_PMULL;
+       if (hwcap & (1 << 7))   /* HWCAP_CRC32 */
+               features |= ARM_CPU_FEATURE_CRC32;
+#endif
+
+       disable_cpu_features_for_testing(&features, arm_cpu_feature_table,
+                                        ARRAY_LEN(arm_cpu_feature_table));
+
+       _cpu_features = features | ARM_CPU_FEATURES_KNOWN;
+}
+
+#endif /* ARM_CPU_FEATURES_ENABLED */
diff --git a/src/3rdparty/libdeflate/lib/arm/cpu_features.h b/src/3rdparty/libdeflate/lib/arm/cpu_features.h
new file mode 100644 (file)
index 0000000..69d7235
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * arm/cpu_features.h - feature detection for ARM processors
+ */
+
+#ifndef LIB_ARM_CPU_FEATURES_H
+#define LIB_ARM_CPU_FEATURES_H
+
+#include "../lib_common.h"
+
+#if (defined(__arm__) || defined(__aarch64__)) && \
+       defined(__linux__) && \
+       COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \
+       !defined(FREESTANDING)
+#  define ARM_CPU_FEATURES_ENABLED 1
+#else
+#  define ARM_CPU_FEATURES_ENABLED 0
+#endif
+
+#if ARM_CPU_FEATURES_ENABLED
+
+#define ARM_CPU_FEATURE_NEON           0x00000001
+#define ARM_CPU_FEATURE_PMULL          0x00000002
+#define ARM_CPU_FEATURE_CRC32          0x00000004
+
+#define ARM_CPU_FEATURES_KNOWN         0x80000000
+
+extern volatile u32 _cpu_features;
+
+void setup_cpu_features(void);
+
+static inline u32 get_cpu_features(void)
+{
+       if (_cpu_features == 0)
+               setup_cpu_features();
+       return _cpu_features;
+}
+
+#endif /* ARM_CPU_FEATURES_ENABLED */
+
+#endif /* LIB_ARM_CPU_FEATURES_H */
diff --git a/src/3rdparty/libdeflate/lib/arm/crc32_impl.h b/src/3rdparty/libdeflate/lib/arm/crc32_impl.h
new file mode 100644 (file)
index 0000000..238a85a
--- /dev/null
@@ -0,0 +1,247 @@
+/*
+ * arm/crc32_impl.h
+ *
+ * Copyright 2017 Jun He <jun.he@linaro.org>
+ * Copyright 2018 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_ARM_CRC32_IMPL_H
+#define LIB_ARM_CRC32_IMPL_H
+
+#include "cpu_features.h"
+
+/* Implementation using ARM CRC32 instructions */
+#undef DISPATCH_ARM
+#if !defined(DEFAULT_IMPL) && \
+    (defined(__ARM_FEATURE_CRC32) || \
+     (ARM_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS))
+#  ifdef __ARM_FEATURE_CRC32
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL       crc32_arm
+#  else
+#    ifdef __arm__
+#      ifdef __clang__
+#        define ATTRIBUTES     __attribute__((target("armv8-a,crc")))
+#      else
+#        define ATTRIBUTES     __attribute__((target("arch=armv8-a+crc")))
+#      endif
+#    else
+#      ifdef __clang__
+#        define ATTRIBUTES     __attribute__((target("crc")))
+#      else
+#        define ATTRIBUTES     __attribute__((target("+crc")))
+#      endif
+#    endif
+#    define DISPATCH           1
+#    define DISPATCH_ARM       1
+#  endif
+
+/*
+ * gcc's (as of 10.1) version of arm_acle.h for arm32, and clang's (as of
+ * 10.0.1) version of arm_acle.h for both arm32 and arm64, have a bug where they
+ * only define the CRC32 functions like __crc32b() when __ARM_FEATURE_CRC32 is
+ * defined.  That prevents them from being used via __attribute__((target)) when
+ * the main target doesn't have CRC32 support enabled.  The actual built-ins
+ * like __builtin_arm_crc32b() are available and work, however; it's just the
+ * wrappers in arm_acle.h like __crc32b() that erroneously don't get defined.
+ * Work around this by manually defining __ARM_FEATURE_CRC32.
+ */
+#ifndef __ARM_FEATURE_CRC32
+#  define __ARM_FEATURE_CRC32  1
+#endif
+#include <arm_acle.h>
+
+static u32 ATTRIBUTES
+crc32_arm(u32 remainder, const u8 *p, size_t size)
+{
+       while (size != 0 && (uintptr_t)p & 7) {
+               remainder = __crc32b(remainder, *p++);
+               size--;
+       }
+
+       while (size >= 32) {
+               remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 0)));
+               remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 1)));
+               remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 2)));
+               remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 3)));
+               p += 32;
+               size -= 32;
+       }
+
+       while (size >= 8) {
+               remainder = __crc32d(remainder, le64_bswap(*(u64 *)p));
+               p += 8;
+               size -= 8;
+       }
+
+       while (size != 0) {
+               remainder = __crc32b(remainder, *p++);
+               size--;
+       }
+
+       return remainder;
+}
+#undef ATTRIBUTES
+#endif /* Implementation using ARM CRC32 instructions */
+
+/*
+ * CRC-32 folding with ARM Crypto extension-PMULL
+ *
+ * This works the same way as the x86 PCLMUL version.
+ * See x86/crc32_pclmul_template.h for an explanation.
+ */
+#undef DISPATCH_PMULL
+#if !defined(DEFAULT_IMPL) && \
+    (defined(__ARM_FEATURE_CRYPTO) ||  \
+     (ARM_CPU_FEATURES_ENABLED &&      \
+      COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS)) && \
+      /* not yet tested on big endian, probably needs changes to work there */ \
+    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#  define FUNCNAME             crc32_pmull
+#  define FUNCNAME_ALIGNED     crc32_pmull_aligned
+#  ifdef __ARM_FEATURE_CRYPTO
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL       crc32_pmull
+#  else
+#    ifdef __arm__
+#      define ATTRIBUTES       __attribute__((target("fpu=crypto-neon-fp-armv8")))
+#    else
+#      ifdef __clang__
+#        define ATTRIBUTES     __attribute__((target("crypto")))
+#      else
+#        define ATTRIBUTES     __attribute__((target("+crypto")))
+#      endif
+#    endif
+#    define DISPATCH           1
+#    define DISPATCH_PMULL     1
+#  endif
+
+#include <arm_neon.h>
+
+static forceinline ATTRIBUTES uint8x16_t
+clmul_00(uint8x16_t a, uint8x16_t b)
+{
+       return (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(a),
+                                    (poly64_t)vget_low_u8(b));
+}
+
+static forceinline ATTRIBUTES uint8x16_t
+clmul_10(uint8x16_t a, uint8x16_t b)
+{
+       return (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(a),
+                                    (poly64_t)vget_high_u8(b));
+}
+
+static forceinline ATTRIBUTES uint8x16_t
+clmul_11(uint8x16_t a, uint8x16_t b)
+{
+       return (uint8x16_t)vmull_high_p64((poly64x2_t)a, (poly64x2_t)b);
+}
+
+static forceinline ATTRIBUTES uint8x16_t
+fold_128b(uint8x16_t dst, uint8x16_t src, uint8x16_t multipliers)
+{
+       return dst ^ clmul_00(src, multipliers) ^ clmul_11(src, multipliers);
+}
+
+static forceinline ATTRIBUTES u32
+crc32_pmull_aligned(u32 remainder, const uint8x16_t *p, size_t nr_segs)
+{
+       /* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
+       const uint8x16_t multipliers_4 =
+               (uint8x16_t)(uint64x2_t){ 0x8F352D95, 0x1D9513D7 };
+       const uint8x16_t multipliers_1 =
+               (uint8x16_t)(uint64x2_t){ 0xAE689191, 0xCCAA009E };
+       const uint8x16_t final_multiplier =
+               (uint8x16_t)(uint64x2_t){ 0xB8BC6765 };
+       const uint8x16_t mask32 = (uint8x16_t)(uint32x4_t){ 0xFFFFFFFF };
+       const uint8x16_t barrett_reduction_constants =
+                       (uint8x16_t)(uint64x2_t){ 0x00000001F7011641,
+                                                 0x00000001DB710641 };
+       const uint8x16_t zeroes = (uint8x16_t){ 0 };
+
+       const uint8x16_t * const end = p + nr_segs;
+       const uint8x16_t * const end512 = p + (nr_segs & ~3);
+       uint8x16_t x0, x1, x2, x3;
+
+       x0 = *p++ ^ (uint8x16_t)(uint32x4_t){ remainder };
+       if (nr_segs >= 4) {
+               x1 = *p++;
+               x2 = *p++;
+               x3 = *p++;
+
+               /* Fold 512 bits at a time */
+               while (p != end512) {
+                       x0 = fold_128b(*p++, x0, multipliers_4);
+                       x1 = fold_128b(*p++, x1, multipliers_4);
+                       x2 = fold_128b(*p++, x2, multipliers_4);
+                       x3 = fold_128b(*p++, x3, multipliers_4);
+               }
+
+               /* Fold 512 bits => 128 bits */
+               x1 = fold_128b(x1, x0, multipliers_1);
+               x2 = fold_128b(x2, x1, multipliers_1);
+               x0 = fold_128b(x3, x2, multipliers_1);
+       }
+
+       /* Fold 128 bits at a time */
+       while (p != end)
+               x0 = fold_128b(*p++, x0, multipliers_1);
+
+       /* Fold 128 => 96 bits, implicitly appending 32 zeroes */
+       x0 = vextq_u8(x0, zeroes, 8) ^ clmul_10(x0, multipliers_1);
+
+       /* Fold 96 => 64 bits */
+       x0 = vextq_u8(x0, zeroes, 4) ^ clmul_00(x0 & mask32, final_multiplier);
+
+       /* Reduce 64 => 32 bits using Barrett reduction */
+       x1 = x0;
+       x0 = clmul_00(x0 & mask32, barrett_reduction_constants);
+       x0 = clmul_10(x0 & mask32, barrett_reduction_constants);
+       return vgetq_lane_u32((uint32x4_t)(x0 ^ x1), 1);
+}
+#define IMPL_ALIGNMENT         16
+#define IMPL_SEGMENT_SIZE      16
+#include "../crc32_vec_template.h"
+#endif /* PMULL implementation */
+
+#ifdef DISPATCH
+static inline crc32_func_t
+arch_select_crc32_func(void)
+{
+       u32 features = get_cpu_features();
+
+#ifdef DISPATCH_ARM
+       if (features & ARM_CPU_FEATURE_CRC32)
+               return crc32_arm;
+#endif
+#ifdef DISPATCH_PMULL
+       if (features & ARM_CPU_FEATURE_PMULL)
+               return crc32_pmull;
+#endif
+       return NULL;
+}
+#endif /* DISPATCH */
+
+#endif /* LIB_ARM_CRC32_IMPL_H */
diff --git a/src/3rdparty/libdeflate/lib/arm/matchfinder_impl.h b/src/3rdparty/libdeflate/lib/arm/matchfinder_impl.h
new file mode 100644 (file)
index 0000000..da0d2fd
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ * arm/matchfinder_impl.h - ARM implementations of matchfinder functions
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_ARM_MATCHFINDER_IMPL_H
+#define LIB_ARM_MATCHFINDER_IMPL_H
+
+#ifdef __ARM_NEON
+#  include <arm_neon.h>
+static forceinline void
+matchfinder_init_neon(mf_pos_t *data, size_t size)
+{
+       int16x8_t *p = (int16x8_t *)data;
+       int16x8_t v = (int16x8_t) {
+               MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
+               MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
+               MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
+       };
+
+       STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+       STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+       STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+       do {
+               p[0] = v;
+               p[1] = v;
+               p[2] = v;
+               p[3] = v;
+               p += 4;
+               size -= 4 * sizeof(*p);
+       } while (size != 0);
+}
+#define matchfinder_init matchfinder_init_neon
+
+static forceinline void
+matchfinder_rebase_neon(mf_pos_t *data, size_t size)
+{
+       int16x8_t *p = (int16x8_t *)data;
+       int16x8_t v = (int16x8_t) {
+               (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
+               (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
+               (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
+               (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
+       };
+
+       STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+       STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+       STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+       do {
+               p[0] = vqaddq_s16(p[0], v);
+               p[1] = vqaddq_s16(p[1], v);
+               p[2] = vqaddq_s16(p[2], v);
+               p[3] = vqaddq_s16(p[3], v);
+               p += 4;
+               size -= 4 * sizeof(*p);
+       } while (size != 0);
+}
+#define matchfinder_rebase matchfinder_rebase_neon
+
+#endif /* __ARM_NEON */
+
+#endif /* LIB_ARM_MATCHFINDER_IMPL_H */
diff --git a/src/3rdparty/libdeflate/lib/bt_matchfinder.h b/src/3rdparty/libdeflate/lib/bt_matchfinder.h
new file mode 100644 (file)
index 0000000..8817141
--- /dev/null
@@ -0,0 +1,343 @@
+/*
+ * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * This is a Binary Trees (bt) based matchfinder.
+ *
+ * The main data structure is a hash table where each hash bucket contains a
+ * binary tree of sequences whose first 4 bytes share the same hash code.  Each
+ * sequence is identified by its starting position in the input buffer.  Each
+ * binary tree is always sorted such that each left child represents a sequence
+ * lexicographically lesser than its parent and each right child represents a
+ * sequence lexicographically greater than its parent.
+ *
+ * The algorithm processes the input buffer sequentially.  At each byte
+ * position, the hash code of the first 4 bytes of the sequence beginning at
+ * that position (the sequence being matched against) is computed.  This
+ * identifies the hash bucket to use for that position.  Then, a new binary tree
+ * node is created to represent the current sequence.  Then, in a single tree
+ * traversal, the hash bucket's binary tree is searched for matches and is
+ * re-rooted at the new node.
+ *
+ * Compared to the simpler algorithm that uses linked lists instead of binary
+ * trees (see hc_matchfinder.h), the binary tree version gains more information
+ * at each node visitation.  Ideally, the binary tree version will examine only
+ * 'log(n)' nodes to find the same matches that the linked list version will
+ * find by examining 'n' nodes.  In addition, the binary tree version can
+ * examine fewer bytes at each node by taking advantage of the common prefixes
+ * that result from the sort order, whereas the linked list version may have to
+ * examine up to the full length of the match at each node.
+ *
+ * However, it is not always best to use the binary tree version.  It requires
+ * nearly twice as much memory as the linked list version, and it takes time to
+ * keep the binary trees sorted, even at positions where the compressor does not
+ * need matches.  Generally, when doing fast compression on small buffers,
+ * binary trees are the wrong approach.  They are best suited for thorough
+ * compression and/or large buffers.
+ *
+ * ----------------------------------------------------------------------------
+ */
+
+#ifndef LIB_BT_MATCHFINDER_H
+#define LIB_BT_MATCHFINDER_H
+
+#include "matchfinder_common.h"
+
+#define BT_MATCHFINDER_HASH3_ORDER 16
+#define BT_MATCHFINDER_HASH3_WAYS  2
+#define BT_MATCHFINDER_HASH4_ORDER 16
+
+#define BT_MATCHFINDER_TOTAL_HASH_SIZE         \
+       (((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
+         (1UL << BT_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t))
+
+/* Representation of a match found by the bt_matchfinder  */
+struct lz_match {
+
+       /* The number of bytes matched.  */
+       u16 length;
+
+       /* The offset back from the current position that was matched.  */
+       u16 offset;
+};
+
+struct bt_matchfinder {
+
+       /* The hash table for finding length 3 matches  */
+       mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
+
+       /* The hash table which contains the roots of the binary trees for
+        * finding length 4+ matches  */
+       mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
+
+       /* The child node references for the binary trees.  The left and right
+        * children of the node for the sequence with position 'pos' are
+        * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively.  */
+       mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
+
+} MATCHFINDER_ALIGNED;
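+
+/*
+ * A rough size estimate (illustrative only, assuming the DEFLATE window of
+ * 2^15 positions, i.e. MATCHFINDER_WINDOW_SIZE == 32768, and the 16-bit
+ * mf_pos_t that this library asserts elsewhere):
+ *
+ *	hash3_tab: (1 << 16) buckets * 2 ways * 2 bytes = 256 KiB
+ *	hash4_tab: (1 << 16) buckets * 2 bytes          = 128 KiB
+ *	child_tab: 2 * (1 << 15) links * 2 bytes        = 128 KiB
+ *	total                                           = 512 KiB
+ *
+ * Keeping two child links per window position, rather than a single "next"
+ * link, is what makes this matchfinder need nearly twice the memory of the
+ * linked-list variant mentioned above.
+ */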
+
+/* Prepare the matchfinder for a new input buffer.  */
+static forceinline void
+bt_matchfinder_init(struct bt_matchfinder *mf)
+{
+       STATIC_ASSERT(BT_MATCHFINDER_TOTAL_HASH_SIZE %
+                     MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+       matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_SIZE);
+}
+
+static forceinline void
+bt_matchfinder_slide_window(struct bt_matchfinder *mf)
+{
+       STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+       matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf));
+}
+
+static forceinline mf_pos_t *
+bt_left_child(struct bt_matchfinder *mf, s32 node)
+{
+       return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0];
+}
+
+static forceinline mf_pos_t *
+bt_right_child(struct bt_matchfinder *mf, s32 node)
+{
+       return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1];
+}
+
+/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
+ * and bt_matchfinder_skip_byte().  There must be sufficiently many bytes
+ * remaining to load a 32-bit integer from the *next* position.  */
+#define BT_MATCHFINDER_REQUIRED_NBYTES 5
+
+/* Advance the binary tree matchfinder by one byte, optionally recording
+ * matches.  @record_matches should be a compile-time constant.  */
+static forceinline struct lz_match *
+bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
+                               const u8 * const restrict in_base,
+                               const ptrdiff_t cur_pos,
+                               const u32 max_len,
+                               const u32 nice_len,
+                               const u32 max_search_depth,
+                               u32 * const restrict next_hashes,
+                               struct lz_match * restrict lz_matchptr,
+                               const bool record_matches)
+{
+       const u8 *in_next = in_base + cur_pos;
+       u32 depth_remaining = max_search_depth;
+       const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
+       u32 next_hashseq;
+       u32 hash3;
+       u32 hash4;
+       s32 cur_node;
+#if BT_MATCHFINDER_HASH3_WAYS >= 2
+       s32 cur_node_2;
+#endif
+       const u8 *matchptr;
+       mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
+       u32 best_lt_len, best_gt_len;
+       u32 len;
+       u32 best_len = 3;
+
+       STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
+                     BT_MATCHFINDER_HASH3_WAYS <= 2);
+
+       next_hashseq = get_unaligned_le32(in_next + 1);
+
+       hash3 = next_hashes[0];
+       hash4 = next_hashes[1];
+
+       next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER);
+       next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER);
+       prefetchw(&mf->hash3_tab[next_hashes[0]]);
+       prefetchw(&mf->hash4_tab[next_hashes[1]]);
+
+       cur_node = mf->hash3_tab[hash3][0];
+       mf->hash3_tab[hash3][0] = cur_pos;
+#if BT_MATCHFINDER_HASH3_WAYS >= 2
+       cur_node_2 = mf->hash3_tab[hash3][1];
+       mf->hash3_tab[hash3][1] = cur_node;
+#endif
+       if (record_matches && cur_node > cutoff) {
+               u32 seq3 = load_u24_unaligned(in_next);
+               if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
+                       lz_matchptr->length = 3;
+                       lz_matchptr->offset = in_next - &in_base[cur_node];
+                       lz_matchptr++;
+               }
+       #if BT_MATCHFINDER_HASH3_WAYS >= 2
+               else if (cur_node_2 > cutoff &&
+                       seq3 == load_u24_unaligned(&in_base[cur_node_2]))
+               {
+                       lz_matchptr->length = 3;
+                       lz_matchptr->offset = in_next - &in_base[cur_node_2];
+                       lz_matchptr++;
+               }
+       #endif
+       }
+
+       cur_node = mf->hash4_tab[hash4];
+       mf->hash4_tab[hash4] = cur_pos;
+
+       pending_lt_ptr = bt_left_child(mf, cur_pos);
+       pending_gt_ptr = bt_right_child(mf, cur_pos);
+
+       if (cur_node <= cutoff) {
+               *pending_lt_ptr = MATCHFINDER_INITVAL;
+               *pending_gt_ptr = MATCHFINDER_INITVAL;
+               return lz_matchptr;
+       }
+
+       best_lt_len = 0;
+       best_gt_len = 0;
+       len = 0;
+
+       for (;;) {
+               matchptr = &in_base[cur_node];
+
+               if (matchptr[len] == in_next[len]) {
+                       len = lz_extend(in_next, matchptr, len + 1, max_len);
+                       if (!record_matches || len > best_len) {
+                               if (record_matches) {
+                                       best_len = len;
+                                       lz_matchptr->length = len;
+                                       lz_matchptr->offset = in_next - matchptr;
+                                       lz_matchptr++;
+                               }
+                               if (len >= nice_len) {
+                                       *pending_lt_ptr = *bt_left_child(mf, cur_node);
+                                       *pending_gt_ptr = *bt_right_child(mf, cur_node);
+                                       return lz_matchptr;
+                               }
+                       }
+               }
+
+               if (matchptr[len] < in_next[len]) {
+                       *pending_lt_ptr = cur_node;
+                       pending_lt_ptr = bt_right_child(mf, cur_node);
+                       cur_node = *pending_lt_ptr;
+                       best_lt_len = len;
+                       if (best_gt_len < len)
+                               len = best_gt_len;
+               } else {
+                       *pending_gt_ptr = cur_node;
+                       pending_gt_ptr = bt_left_child(mf, cur_node);
+                       cur_node = *pending_gt_ptr;
+                       best_gt_len = len;
+                       if (best_lt_len < len)
+                               len = best_lt_len;
+               }
+
+               if (cur_node <= cutoff || !--depth_remaining) {
+                       *pending_lt_ptr = MATCHFINDER_INITVAL;
+                       *pending_gt_ptr = MATCHFINDER_INITVAL;
+                       return lz_matchptr;
+               }
+       }
+}
+
+/*
+ * Retrieve a list of matches found at the current position.
+ *
+ * @mf
+ *     The matchfinder structure.
+ * @in_base
+ *     Pointer to the next byte in the input buffer to process _at the last
+ *     time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_.
+ * @cur_pos
+ *     The current position in the input buffer relative to @in_base (the
+ *     position of the sequence being matched against).
+ * @max_len
+ *     The maximum permissible match length at this position.  Must be >=
+ *     BT_MATCHFINDER_REQUIRED_NBYTES.
+ * @nice_len
+ *     Stop searching if a match of at least this length is found.
+ *     Must be <= @max_len.
+ * @max_search_depth
+ *     Limit on the number of potential matches to consider.  Must be >= 1.
+ * @next_hashes
+ *     The precomputed hash codes for the sequence beginning at the current
+ *     position (@in_base + @cur_pos).  These will be used and then updated
+ *     with the precomputed hash codes for the sequence beginning at the next
+ *     position.
+ * @lz_matchptr
+ *     An array in which this function will record the matches.  The recorded
+ *     matches will be sorted by strictly increasing length and (non-strictly)
+ *     increasing offset.  The maximum number of matches that may be found is
+ *     'nice_len - 2'.
+ *
+ * The return value is a pointer to the next available slot in the @lz_matchptr
+ * array.  (If no matches were found, this will be the same as @lz_matchptr.)
+ */
+static forceinline struct lz_match *
+bt_matchfinder_get_matches(struct bt_matchfinder *mf,
+                          const u8 *in_base,
+                          ptrdiff_t cur_pos,
+                          u32 max_len,
+                          u32 nice_len,
+                          u32 max_search_depth,
+                          u32 next_hashes[2],
+                          struct lz_match *lz_matchptr)
+{
+       return bt_matchfinder_advance_one_byte(mf,
+                                              in_base,
+                                              cur_pos,
+                                              max_len,
+                                              nice_len,
+                                              max_search_depth,
+                                              next_hashes,
+                                              lz_matchptr,
+                                              true);
+}
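+
+/*
+ * A minimal usage sketch (illustrative only; the array bound of 64 and the
+ * zero-initialized next_hashes are assumptions, not taken from the real
+ * compressor):
+ *
+ *	struct lz_match matches[64];	// must hold up to nice_len - 2 entries
+ *	u32 next_hashes[2] = { 0, 0 };
+ *	ptrdiff_t pos;
+ *
+ *	bt_matchfinder_init(mf);
+ *	for (pos = 0; pos < in_nbytes - BT_MATCHFINDER_REQUIRED_NBYTES; pos++) {
+ *		struct lz_match *end = bt_matchfinder_get_matches(mf, in, pos,
+ *					max_len, nice_len, max_search_depth,
+ *					next_hashes, matches);
+ *		// matches[0 .. (end - matches) - 1] are sorted by strictly
+ *		// increasing length, per the contract documented above.
+ *	}
+ */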
+
+/*
+ * Advance the matchfinder, but don't record any matches.
+ *
+ * This is very similar to bt_matchfinder_get_matches() because both functions
+ * must do hashing and tree re-rooting.
+ */
+static forceinline void
+bt_matchfinder_skip_byte(struct bt_matchfinder *mf,
+                        const u8 *in_base,
+                        ptrdiff_t cur_pos,
+                        u32 nice_len,
+                        u32 max_search_depth,
+                        u32 next_hashes[2])
+{
+       bt_matchfinder_advance_one_byte(mf,
+                                       in_base,
+                                       cur_pos,
+                                       nice_len,
+                                       nice_len,
+                                       max_search_depth,
+                                       next_hashes,
+                                       NULL,
+                                       false);
+}
+
+#endif /* LIB_BT_MATCHFINDER_H */
diff --git a/src/3rdparty/libdeflate/lib/cpu_features_common.h b/src/3rdparty/libdeflate/lib/cpu_features_common.h
new file mode 100644 (file)
index 0000000..570b62d
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * cpu_features_common.h - code shared by all lib/$arch/cpu_features.c
+ *
+ * Copyright 2020 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_CPU_FEATURES_COMMON_H
+#define LIB_CPU_FEATURES_COMMON_H
+
+#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING)
+#  define _GNU_SOURCE 1 /* for strdup() and strtok_r() */
+#  include <stdio.h>
+#  include <stdlib.h>
+#  include <string.h>
+#endif
+
+#include "lib_common.h"
+
+struct cpu_feature {
+       u32 bit;
+       const char *name;
+};
+
+#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING)
+/* Disable any features that are listed in $LIBDEFLATE_DISABLE_CPU_FEATURES. */
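+/*
+ * For example (an illustrative invocation; the feature names come from the
+ * per-architecture tables, e.g. "neon", "pmull" and "crc32" on ARM):
+ *
+ *	LIBDEFLATE_DISABLE_CPU_FEATURES=neon,pmull ./some_test_program
+ *
+ * clears the NEON and PMULL bits so that the generic code paths get exercised.
+ */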
+static inline void
+disable_cpu_features_for_testing(u32 *features,
+                                const struct cpu_feature *feature_table,
+                                size_t feature_table_length)
+{
+       char *env_value, *strbuf, *p, *saveptr = NULL;
+       size_t i;
+
+       env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES");
+       if (!env_value)
+               return;
+       strbuf = strdup(env_value);
+       if (!strbuf)
+               abort();
+       p = strtok_r(strbuf, ",", &saveptr);
+       while (p) {
+               for (i = 0; i < feature_table_length; i++) {
+                       if (strcmp(p, feature_table[i].name) == 0) {
+                               *features &= ~feature_table[i].bit;
+                               break;
+                       }
+               }
+               if (i == feature_table_length) {
+                       fprintf(stderr,
+                               "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n",
+                               p);
+                       abort();
+               }
+               p = strtok_r(NULL, ",", &saveptr);
+       }
+       free(strbuf);
+}
+#else /* TEST_SUPPORT__DO_NOT_USE */
+static inline void
+disable_cpu_features_for_testing(u32 *features,
+                                const struct cpu_feature *feature_table,
+                                size_t feature_table_length)
+{
+}
+#endif /* !TEST_SUPPORT__DO_NOT_USE */
+
+#endif /* LIB_CPU_FEATURES_COMMON_H */
diff --git a/src/3rdparty/libdeflate/lib/crc32.c b/src/3rdparty/libdeflate/lib/crc32.c
new file mode 100644 (file)
index 0000000..6adacc5
--- /dev/null
@@ -0,0 +1,313 @@
+/*
+ * crc32.c - CRC-32 checksum algorithm for the gzip format
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * High-level description of CRC
+ * =============================
+ *
+ * Consider a bit sequence 'bits[1...len]'.  Interpret 'bits' as the "message"
+ * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
+ * where the coefficient of 'x^i' is 'bits[len - i]'.  Then, compute:
+ *
+ *                     R(x) = M(x)*x^n mod G(x)
+ *
+ * where G(x) is a selected "generator" polynomial of degree 'n'.  The remainder
+ * R(x) is a polynomial of max degree 'n - 1'.  The CRC of 'bits' is R(x)
+ * interpreted as a bitstring of length 'n'.
+ *
+ * CRC used in gzip
+ * ================
+ *
+ * In the gzip format (RFC 1952):
+ *
+ *     - The bitstring to checksum is formed from the bytes of the uncompressed
+ *       data by concatenating the bits from the bytes in order, proceeding
+ *       from the low-order bit to the high-order bit within each byte.
+ *
+ *     - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
+ *       x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
+ *       Consequently, the CRC length is 32 bits ("CRC-32").
+ *
+ *     - The highest order 32 coefficients of M(x)*x^n are inverted.
+ *
+ *     - All 32 coefficients of R(x) are inverted.
+ *
+ * The two inversions cause added leading and trailing zero bits to affect the
+ * resulting CRC, whereas with a regular CRC such bits would have no effect on
+ * the CRC.
+ *
+ * Computation and optimizations
+ * =============================
+ *
+ * We can compute R(x) through "long division", maintaining only 32 bits of
+ * state at any given time.  Multiplication by 'x' can be implemented as
+ * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
+ * highest order bit represents the coefficient of x^0), and both addition and
+ * subtraction can be implemented as bitwise exclusive OR (since we are working
+ * in GF(2)).  Here is an unoptimized implementation:
+ *
+ *     static u32 crc32_gzip(const u8 *buffer, size_t size)
+ *     {
+ *             u32 remainder = 0;
+ *             const u32 divisor = 0xEDB88320;
+ *
+ *             for (size_t i = 0; i < size * 8 + 32; i++) {
+ *                     int bit;
+ *                     u32 multiple;
+ *
+ *                     if (i < size * 8)
+ *                             bit = (buffer[i / 8] >> (i % 8)) & 1;
+ *                     else
+ *                             bit = 0; // one of the 32 appended 0 bits
+ *
+ *                     if (i < 32) // the first 32 bits are inverted
+ *                             bit ^= 1;
+ *
+ *                     if (remainder & 1)
+ *                             multiple = divisor;
+ *                     else
+ *                             multiple = 0;
+ *
+ *                     remainder >>= 1;
+ *                     remainder |= (u32)bit << 31;
+ *                     remainder ^= multiple;
+ *             }
+ *
+ *             return ~remainder;
+ *     }
+ *
+ * In this implementation, the 32-bit integer 'remainder' maintains the
+ * remainder of the currently processed portion of the message (with 32 zero
+ * bits appended) when divided by the generator polynomial.  'remainder' is the
+ * representation of R(x), and 'divisor' is the representation of G(x) excluding
+ * the x^32 coefficient.  For each bit to process, we multiply R(x) by 'x^1',
+ * then add 'x^0' if the new bit is a 1.  If this causes R(x) to gain a nonzero
+ * x^32 term, then we subtract G(x) from R(x).
+ *
+ * We can speed this up by taking advantage of the fact that XOR is commutative
+ * and associative, so the order in which we combine the inputs into 'remainder'
+ * is unimportant.  And since each message bit we add doesn't affect the choice
+ * of 'multiple' until 32 bits later, we need not actually add each message bit
+ * until that point:
+ *
+ *     static u32 crc32_gzip(const u8 *buffer, size_t size)
+ *     {
+ *             u32 remainder = ~0;
+ *             const u32 divisor = 0xEDB88320;
+ *
+ *             for (size_t i = 0; i < size * 8; i++) {
+ *                     int bit;
+ *                     u32 multiple;
+ *
+ *                     bit = (buffer[i / 8] >> (i % 8)) & 1;
+ *                     remainder ^= bit;
+ *                     if (remainder & 1)
+ *                             multiple = divisor;
+ *                     else
+ *                             multiple = 0;
+ *                     remainder >>= 1;
+ *                     remainder ^= multiple;
+ *             }
+ *
+ *             return ~remainder;
+ *     }
+ *
+ * With the above implementation we get the effect of 32 appended 0 bits for
+ * free; they never affect the choice of a divisor, nor would they change the
+ * value of 'remainder' if they were to be actually XOR'ed in.  And by starting
+ * with a remainder of all 1 bits, we get the effect of complementing the first
+ * 32 message bits.
+ *
+ * The next optimization is to process the input in multi-bit units.  Suppose
+ * that we insert the next 'n' message bits into the remainder.  Then we get an
+ * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
+ * bits is the amount by which the low 32 bits of the remainder will change as a
+ * result of cancelling out those 'n' bits.  Taking n=8 (one byte) and
+ * precomputing a table containing the CRC of each possible byte, we get
+ * crc32_slice1() defined below.
+ *
+ * As a further optimization, we could increase the multi-bit unit size to 16.
+ * However, that is inefficient because the table size explodes from 256 entries
+ * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
+ * fit in L1 cache on typical processors.
+ *
+ * However, we can actually process 4 bytes at a time using 4 different tables
+ * with 256 entries each.  Logically, we form a 64-bit intermediate remainder
+ * and cancel out the high 32 bits in 8-bit chunks.  Bits 32-39 are cancelled
+ * out by the CRC of those bits, whereas bits 40-47 are cancelled out by the
+ * CRC of those bits with 8 zero bits appended, and so on.  This method is
+ * implemented in crc32_slice4(), defined below.
+ *
+ * In crc32_slice8(), this method is extended to 8 bytes at a time.  The
+ * intermediate remainder (which we never actually store explicitly) is 96 bits.
+ *
+ * On CPUs that support fast carryless multiplication, CRCs can be computed even
+ * more quickly via "folding".  See e.g. the x86 PCLMUL implementation.
+ */
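+
+/*
+ * As a further illustration (a sketch of the standard table construction, not
+ * necessarily the exact gen_crc32_table.c that produced crc32_table.h): the
+ * first 256 entries used by crc32_slice1() hold, for each byte value 'b', the
+ * CRC of that single byte, computed bit-by-bit in the same way as the loop
+ * above:
+ *
+ *	static void build_crc32_table(u32 table[256])
+ *	{
+ *		for (u32 b = 0; b < 256; b++) {
+ *			u32 rem = b;
+ *			for (int i = 0; i < 8; i++)
+ *				rem = (rem >> 1) ^ ((rem & 1) ? 0xEDB88320 : 0);
+ *			table[b] = rem;
+ *		}
+ *	}
+ *
+ * Each additional sub-table 'k' used by crc32_slice4()/crc32_slice8() then
+ * holds the CRC of a byte followed by 8*k zero bits, which is the same as
+ * running entry 'b' of the base table through the byte-at-a-time update 'k'
+ * more times with zero bytes.
+ */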
+
+#include "lib_common.h"
+#include "libdeflate.h"
+
+typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
+
+/* Include architecture-specific implementations if available */
+#undef CRC32_SLICE1
+#undef CRC32_SLICE4
+#undef CRC32_SLICE8
+#undef DEFAULT_IMPL
+#undef DISPATCH
+#if defined(__arm__) || defined(__aarch64__)
+#  include "arm/crc32_impl.h"
+#elif defined(__i386__) || defined(__x86_64__)
+#  include "x86/crc32_impl.h"
+#endif
+
+/*
+ * Define a generic implementation (crc32_slice8()) if needed.  crc32_slice1()
+ * may also be needed as a fallback for architecture-specific implementations.
+ */
+
+#ifndef DEFAULT_IMPL
+#  define CRC32_SLICE8 1
+#  define DEFAULT_IMPL crc32_slice8
+#endif
+
+#if defined(CRC32_SLICE1) || defined(CRC32_SLICE4) || defined(CRC32_SLICE8)
+#include "crc32_table.h"
+static forceinline u32
+crc32_update_byte(u32 remainder, u8 next_byte)
+{
+       return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
+}
+#endif
+
+#ifdef CRC32_SLICE1
+static u32
+crc32_slice1(u32 remainder, const u8 *buffer, size_t size)
+{
+       size_t i;
+
+       STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
+
+       for (i = 0; i < size; i++)
+               remainder = crc32_update_byte(remainder, buffer[i]);
+       return remainder;
+}
+#endif /* CRC32_SLICE1 */
+
+#ifdef CRC32_SLICE4
+static u32
+crc32_slice4(u32 remainder, const u8 *buffer, size_t size)
+{
+       const u8 *p = buffer;
+       const u8 *end = buffer + size;
+       const u8 *end32;
+
+       STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
+
+       for (; ((uintptr_t)p & 3) && p != end; p++)
+               remainder = crc32_update_byte(remainder, *p);
+
+       end32 = p + ((end - p) & ~3);
+       for (; p != end32; p += 4) {
+               u32 v = le32_bswap(*(const u32 *)p);
+               remainder =
+                   crc32_table[0x300 + (u8)((remainder ^ v) >>  0)] ^
+                   crc32_table[0x200 + (u8)((remainder ^ v) >>  8)] ^
+                   crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
+                   crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
+       }
+
+       for (; p != end; p++)
+               remainder = crc32_update_byte(remainder, *p);
+
+       return remainder;
+}
+#endif /* CRC32_SLICE4 */
+
+#ifdef CRC32_SLICE8
+static u32
+crc32_slice8(u32 remainder, const u8 *buffer, size_t size)
+{
+       const u8 *p = buffer;
+       const u8 *end = buffer + size;
+       const u8 *end64;
+
+       STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
+
+       for (; ((uintptr_t)p & 7) && p != end; p++)
+               remainder = crc32_update_byte(remainder, *p);
+
+       end64 = p + ((end - p) & ~7);
+       for (; p != end64; p += 8) {
+               u32 v1 = le32_bswap(*(const u32 *)(p + 0));
+               u32 v2 = le32_bswap(*(const u32 *)(p + 4));
+               remainder =
+                   crc32_table[0x700 + (u8)((remainder ^ v1) >>  0)] ^
+                   crc32_table[0x600 + (u8)((remainder ^ v1) >>  8)] ^
+                   crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
+                   crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
+                   crc32_table[0x300 + (u8)(v2 >>  0)] ^
+                   crc32_table[0x200 + (u8)(v2 >>  8)] ^
+                   crc32_table[0x100 + (u8)(v2 >> 16)] ^
+                   crc32_table[0x000 + (u8)(v2 >> 24)];
+       }
+
+       for (; p != end; p++)
+               remainder = crc32_update_byte(remainder, *p);
+
+       return remainder;
+}
+#endif /* CRC32_SLICE8 */
+
+#ifdef DISPATCH
+static u32 dispatch(u32, const u8 *, size_t);
+
+static volatile crc32_func_t crc32_impl = dispatch;
+
+/*
+ * Choose the fastest implementation at runtime.  The first call reaches this
+ * function through the crc32_impl pointer; it picks an architecture-specific
+ * implementation if one is available (falling back to DEFAULT_IMPL), caches
+ * the choice in crc32_impl, and forwards the call, so later calls bypass the
+ * selection entirely.
+ */
+static u32 dispatch(u32 remainder, const u8 *buffer, size_t size)
+{
+       crc32_func_t f = arch_select_crc32_func();
+
+       if (f == NULL)
+               f = DEFAULT_IMPL;
+
+       crc32_impl = f;
+       return crc32_impl(remainder, buffer, size);
+}
+#else
+#  define crc32_impl DEFAULT_IMPL /* only one implementation, use it */
+#endif
+
+LIBDEFLATEEXPORT u32 LIBDEFLATEAPI
+libdeflate_crc32(u32 remainder, const void *buffer, size_t size)
+{
+       if (buffer == NULL) /* return initial value */
+               return 0;
+       return ~crc32_impl(~remainder, buffer, size);
+}
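+
+/*
+ * Typical usage (an illustrative sketch based on the NULL-buffer behaviour
+ * above): the CRC of a stream can be computed in one call, or in chunks by
+ * feeding each return value back in as the next 'remainder':
+ *
+ *	u32 crc = libdeflate_crc32(0, NULL, 0);		// initial value (0)
+ *	crc = libdeflate_crc32(crc, chunk1, chunk1_len);
+ *	crc = libdeflate_crc32(crc, chunk2, chunk2_len);
+ */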
diff --git a/src/3rdparty/libdeflate/lib/crc32_table.h b/src/3rdparty/libdeflate/lib/crc32_table.h
new file mode 100644 (file)
index 0000000..05421b9
--- /dev/null
@@ -0,0 +1,526 @@
+/*
+ * crc32_table.h - data table to accelerate CRC-32 computation
+ *
+ * THIS FILE WAS AUTOMATICALLY GENERATED BY gen_crc32_table.c.  DO NOT EDIT.
+ */
+
+#include <stdint.h>
+
+static const uint32_t crc32_table[] = {
+       0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
+       0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
+       0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+       0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
+       0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+       0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+       0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
+       0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
+       0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+       0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+       0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
+       0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+       0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
+       0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
+       0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+       0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
+       0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
+       0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+       0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
+       0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+       0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+       0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
+       0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
+       0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+       0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+       0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
+       0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+       0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
+       0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
+       0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+       0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
+       0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
+       0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+       0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
+       0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+       0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+       0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
+       0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
+       0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+       0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+       0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
+       0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+       0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
+       0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
+       0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+       0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
+       0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
+       0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+       0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
+       0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+       0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+       0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
+       0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
+       0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+       0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+       0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
+       0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+       0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
+       0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
+       0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+       0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
+       0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
+       0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+       0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
+#if defined(CRC32_SLICE4) || defined(CRC32_SLICE8)
+       0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3,
+       0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7,
+       0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb,
+       0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf,
+       0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192,
+       0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496,
+       0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a,
+       0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e,
+       0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761,
+       0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265,
+       0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69,
+       0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d,
+       0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530,
+       0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034,
+       0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38,
+       0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c,
+       0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6,
+       0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2,
+       0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce,
+       0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca,
+       0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97,
+       0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93,
+       0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f,
+       0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b,
+       0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864,
+       0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60,
+       0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c,
+       0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768,
+       0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35,
+       0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31,
+       0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d,
+       0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539,
+       0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88,
+       0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c,
+       0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180,
+       0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484,
+       0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9,
+       0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd,
+       0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1,
+       0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5,
+       0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a,
+       0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e,
+       0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522,
+       0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026,
+       0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b,
+       0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f,
+       0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773,
+       0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277,
+       0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d,
+       0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189,
+       0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85,
+       0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81,
+       0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc,
+       0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8,
+       0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4,
+       0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0,
+       0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f,
+       0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b,
+       0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27,
+       0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23,
+       0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e,
+       0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a,
+       0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876,
+       0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72,
+       0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59,
+       0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685,
+       0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1,
+       0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d,
+       0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29,
+       0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5,
+       0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91,
+       0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d,
+       0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9,
+       0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065,
+       0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901,
+       0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd,
+       0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9,
+       0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315,
+       0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71,
+       0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad,
+       0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399,
+       0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45,
+       0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221,
+       0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd,
+       0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9,
+       0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835,
+       0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151,
+       0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d,
+       0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579,
+       0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5,
+       0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1,
+       0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d,
+       0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609,
+       0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5,
+       0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1,
+       0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d,
+       0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9,
+       0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05,
+       0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461,
+       0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd,
+       0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9,
+       0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75,
+       0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711,
+       0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd,
+       0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339,
+       0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5,
+       0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281,
+       0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d,
+       0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049,
+       0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895,
+       0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1,
+       0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d,
+       0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819,
+       0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5,
+       0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1,
+       0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d,
+       0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69,
+       0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5,
+       0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1,
+       0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d,
+       0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9,
+       0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625,
+       0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41,
+       0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d,
+       0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89,
+       0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555,
+       0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31,
+       0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed,
+       0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee,
+       0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9,
+       0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701,
+       0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056,
+       0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871,
+       0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26,
+       0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e,
+       0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9,
+       0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0,
+       0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787,
+       0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f,
+       0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68,
+       0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f,
+       0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018,
+       0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0,
+       0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7,
+       0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3,
+       0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084,
+       0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c,
+       0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b,
+       0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c,
+       0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b,
+       0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3,
+       0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4,
+       0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed,
+       0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba,
+       0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002,
+       0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755,
+       0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72,
+       0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825,
+       0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d,
+       0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca,
+       0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5,
+       0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82,
+       0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a,
+       0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d,
+       0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a,
+       0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d,
+       0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5,
+       0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2,
+       0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb,
+       0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc,
+       0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04,
+       0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953,
+       0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174,
+       0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623,
+       0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b,
+       0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc,
+       0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8,
+       0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf,
+       0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907,
+       0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50,
+       0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677,
+       0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120,
+       0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98,
+       0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf,
+       0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6,
+       0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981,
+       0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639,
+       0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e,
+       0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949,
+       0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e,
+       0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6,
+       0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1,
+#endif /* CRC32_SLICE4 || CRC32_SLICE8 */
+#if defined(CRC32_SLICE8)
+       0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0,
+       0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10,
+       0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111,
+       0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1,
+       0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52,
+       0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92,
+       0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693,
+       0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053,
+       0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4,
+       0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314,
+       0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15,
+       0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5,
+       0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256,
+       0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496,
+       0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997,
+       0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57,
+       0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299,
+       0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459,
+       0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958,
+       0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98,
+       0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b,
+       0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db,
+       0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda,
+       0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a,
+       0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d,
+       0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d,
+       0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c,
+       0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c,
+       0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f,
+       0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf,
+       0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de,
+       0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e,
+       0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42,
+       0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82,
+       0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183,
+       0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743,
+       0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0,
+       0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00,
+       0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601,
+       0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1,
+       0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546,
+       0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386,
+       0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87,
+       0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847,
+       0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4,
+       0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404,
+       0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905,
+       0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5,
+       0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b,
+       0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb,
+       0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca,
+       0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a,
+       0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589,
+       0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349,
+       0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48,
+       0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888,
+       0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f,
+       0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf,
+       0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce,
+       0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e,
+       0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d,
+       0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d,
+       0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c,
+       0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c,
+       0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae,
+       0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8,
+       0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3,
+       0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5,
+       0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035,
+       0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223,
+       0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258,
+       0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e,
+       0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798,
+       0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e,
+       0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5,
+       0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3,
+       0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503,
+       0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715,
+       0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e,
+       0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578,
+       0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2,
+       0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4,
+       0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf,
+       0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9,
+       0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59,
+       0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f,
+       0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834,
+       0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22,
+       0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4,
+       0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2,
+       0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99,
+       0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f,
+       0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f,
+       0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79,
+       0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02,
+       0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14,
+       0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676,
+       0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460,
+       0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b,
+       0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d,
+       0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed,
+       0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb,
+       0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680,
+       0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496,
+       0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340,
+       0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156,
+       0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d,
+       0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b,
+       0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db,
+       0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd,
+       0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6,
+       0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0,
+       0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a,
+       0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c,
+       0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77,
+       0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61,
+       0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81,
+       0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97,
+       0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec,
+       0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa,
+       0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c,
+       0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a,
+       0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41,
+       0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957,
+       0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7,
+       0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1,
+       0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da,
+       0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc,
+       0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d,
+       0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e,
+       0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa,
+       0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9,
+       0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653,
+       0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240,
+       0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834,
+       0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27,
+       0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301,
+       0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712,
+       0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66,
+       0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975,
+       0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf,
+       0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc,
+       0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8,
+       0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb,
+       0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4,
+       0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7,
+       0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183,
+       0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590,
+       0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a,
+       0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739,
+       0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d,
+       0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e,
+       0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678,
+       0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b,
+       0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f,
+       0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c,
+       0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6,
+       0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5,
+       0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1,
+       0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2,
+       0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f,
+       0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c,
+       0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08,
+       0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b,
+       0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1,
+       0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2,
+       0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6,
+       0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5,
+       0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3,
+       0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0,
+       0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794,
+       0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387,
+       0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d,
+       0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e,
+       0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a,
+       0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49,
+       0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516,
+       0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105,
+       0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71,
+       0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62,
+       0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8,
+       0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb,
+       0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf,
+       0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac,
+       0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a,
+       0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899,
+       0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed,
+       0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe,
+       0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044,
+       0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457,
+       0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23,
+       0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30,
+       0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3,
+       0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919,
+       0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56,
+       0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac,
+       0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8,
+       0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832,
+       0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d,
+       0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387,
+       0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5,
+       0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f,
+       0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00,
+       0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa,
+       0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e,
+       0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64,
+       0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b,
+       0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1,
+       0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e,
+       0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4,
+       0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb,
+       0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041,
+       0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425,
+       0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf,
+       0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90,
+       0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a,
+       0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758,
+       0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2,
+       0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced,
+       0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217,
+       0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673,
+       0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889,
+       0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6,
+       0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c,
+       0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239,
+       0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3,
+       0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c,
+       0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776,
+       0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312,
+       0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8,
+       0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7,
+       0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d,
+       0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f,
+       0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95,
+       0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda,
+       0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520,
+       0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144,
+       0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe,
+       0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1,
+       0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b,
+       0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4,
+       0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e,
+       0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61,
+       0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b,
+       0x061d761c, 0xcab77682, 0x44387161, 0x889271ff,
+       0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05,
+       0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a,
+       0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0,
+       0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282,
+       0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78,
+       0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937,
+       0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd,
+       0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9,
+       0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53,
+       0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c,
+       0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6,
+#endif /* CRC32_SLICE8 */
+};
diff --git a/src/3rdparty/libdeflate/lib/crc32_vec_template.h b/src/3rdparty/libdeflate/lib/crc32_vec_template.h
new file mode 100644 (file)
index 0000000..9a2ad5b
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * crc32_vec_template.h - template for vectorized CRC-32 implementations
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define CRC32_SLICE1   1
+static u32 crc32_slice1(u32, const u8 *, size_t);
+
+/*
+ * Template for vectorized CRC-32 implementations.
+ *
+ * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead
+ * of crc32_slice8() because only a few bytes need to be processed, so a smaller
+ * table is preferable.
+ */
+static u32 ATTRIBUTES
+FUNCNAME(u32 remainder, const u8 *p, size_t size)
+{
+       if ((uintptr_t)p % IMPL_ALIGNMENT) {
+               size_t n = MIN(size, -(uintptr_t)p % IMPL_ALIGNMENT);
+
+               remainder = crc32_slice1(remainder, p, n);
+               p += n;
+               size -= n;
+       }
+       if (size >= IMPL_SEGMENT_SIZE) {
+               remainder = FUNCNAME_ALIGNED(remainder, (const void *)p,
+                                            size / IMPL_SEGMENT_SIZE);
+               p += size - (size % IMPL_SEGMENT_SIZE);
+               size %= IMPL_SEGMENT_SIZE;
+       }
+       return crc32_slice1(remainder, p, size);
+}
+
+#undef FUNCNAME
+#undef FUNCNAME_ALIGNED
+#undef ATTRIBUTES
+#undef IMPL_ALIGNMENT
+#undef IMPL_SEGMENT_SIZE
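+
+/*
+ * Illustrative sketch only: roughly how an architecture-specific header is
+ * expected to instantiate this template.  The macro values below are
+ * hypothetical placeholders, not the definitions actually used by the
+ * x86/ARM implementation headers:
+ *
+ *     #define FUNCNAME           crc32_example_vec
+ *     #define FUNCNAME_ALIGNED   crc32_example_vec_aligned
+ *     #define ATTRIBUTES         __attribute__((target("pclmul")))
+ *     #define IMPL_ALIGNMENT     16
+ *     #define IMPL_SEGMENT_SIZE  64
+ *     #include "crc32_vec_template.h"
+ */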
diff --git a/src/3rdparty/libdeflate/lib/decompress_template.h b/src/3rdparty/libdeflate/lib/decompress_template.h
new file mode 100644 (file)
index 0000000..16dfeed
--- /dev/null
@@ -0,0 +1,421 @@
+/*
+ * decompress_template.h
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This is the actual DEFLATE decompression routine, lifted out of
+ * deflate_decompress.c so that it can be compiled multiple times with different
+ * target instruction sets.
+ */
+
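+/*
+ * Illustrative sketch only: how deflate_decompress.c is expected to expand
+ * this template once per target.  The names below are hypothetical
+ * placeholders rather than the exact ones used there:
+ *
+ *     #define FUNCNAME   deflate_decompress_default
+ *     #define ATTRIBUTES
+ *     #include "decompress_template.h"
+ *
+ *     #define FUNCNAME   deflate_decompress_bmi2
+ *     #define ATTRIBUTES __attribute__((target("bmi2")))
+ *     #include "decompress_template.h"
+ */
+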
+static enum libdeflate_result ATTRIBUTES
+FUNCNAME(struct libdeflate_decompressor * restrict d,
+        const void * restrict in, size_t in_nbytes,
+        void * restrict out, size_t out_nbytes_avail,
+        size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
+{
+       u8 *out_next = out;
+       u8 * const out_end = out_next + out_nbytes_avail;
+       const u8 *in_next = in;
+       const u8 * const in_end = in_next + in_nbytes;
+       bitbuf_t bitbuf = 0;
+       unsigned bitsleft = 0;
+       size_t overread_count = 0;
+       unsigned i;
+       unsigned is_final_block;
+       unsigned block_type;
+       u16 len;
+       u16 nlen;
+       unsigned num_litlen_syms;
+       unsigned num_offset_syms;
+       u16 tmp16;
+       u32 tmp32;
+
+next_block:
+       /* Starting to read the next block.  */
+       ;
+
+       STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
+       ENSURE_BITS(1 + 2 + 5 + 5 + 4);
+
+       /* BFINAL: 1 bit  */
+       is_final_block = POP_BITS(1);
+
+       /* BTYPE: 2 bits  */
+       block_type = POP_BITS(2);
+
+       if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
+
+               /* Dynamic Huffman block.  */
+
+               /* The order in which precode lengths are stored.  */
+               static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
+                       16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
+               };
+
+               unsigned num_explicit_precode_lens;
+
+               /* Read the codeword length counts.  */
+
+               STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
+               num_litlen_syms = POP_BITS(5) + 257;
+
+               STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
+               num_offset_syms = POP_BITS(5) + 1;
+
+               STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
+               num_explicit_precode_lens = POP_BITS(4) + 4;
+
+               d->static_codes_loaded = false;
+
+               /* Read the precode codeword lengths.  */
+               STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
+               for (i = 0; i < num_explicit_precode_lens; i++) {
+                       ENSURE_BITS(3);
+                       d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
+               }
+
+               for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
+                       d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
+
+               /* Build the decode table for the precode.  */
+               SAFETY_CHECK(build_precode_decode_table(d));
+
+               /* Expand the literal/length and offset codeword lengths.  */
+               for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
+                       u32 entry;
+                       unsigned presym;
+                       u8 rep_val;
+                       unsigned rep_count;
+
+                       ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
+
+                       /* (The code below assumes that the precode decode table
+                        * does not have any subtables.)  */
+                       STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
+
+                       /* Read the next precode symbol.  */
+                       entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
+                       REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+                       presym = entry >> HUFFDEC_RESULT_SHIFT;
+
+                       if (presym < 16) {
+                               /* Explicit codeword length  */
+                               d->u.l.lens[i++] = presym;
+                               continue;
+                       }
+
+                       /* Run-length encoded codeword lengths  */
+
+                       /* Note: we don't need to verify that the repeat count
+                        * doesn't overflow the number of elements, since we
+                        * have enough extra entries to allow for the worst-case
+                        * overrun (138 zeroes when only 1 length was
+                        * remaining).
+                        *
+                        * In the case of the small repeat counts (presyms 16
+                        * and 17), it is fastest to always write the maximum
+                        * number of entries.  That gets rid of branches that
+                        * would otherwise be required.
+                        *
+                        * It is not just because of the numerical order that
+                        * our checks go in the order 'presym < 16', 'presym ==
+                        * 16', and 'presym == 17'.  For typical data this is
+                        * ordered from most frequent to least frequent case.
+                        */
+                       STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
+
+                       if (presym == 16) {
+                               /* Repeat the previous length 3 - 6 times  */
+                               SAFETY_CHECK(i != 0);
+                               rep_val = d->u.l.lens[i - 1];
+                               STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
+                               rep_count = 3 + POP_BITS(2);
+                               d->u.l.lens[i + 0] = rep_val;
+                               d->u.l.lens[i + 1] = rep_val;
+                               d->u.l.lens[i + 2] = rep_val;
+                               d->u.l.lens[i + 3] = rep_val;
+                               d->u.l.lens[i + 4] = rep_val;
+                               d->u.l.lens[i + 5] = rep_val;
+                               i += rep_count;
+                       } else if (presym == 17) {
+                               /* Repeat zero 3 - 10 times  */
+                               STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
+                               rep_count = 3 + POP_BITS(3);
+                               d->u.l.lens[i + 0] = 0;
+                               d->u.l.lens[i + 1] = 0;
+                               d->u.l.lens[i + 2] = 0;
+                               d->u.l.lens[i + 3] = 0;
+                               d->u.l.lens[i + 4] = 0;
+                               d->u.l.lens[i + 5] = 0;
+                               d->u.l.lens[i + 6] = 0;
+                               d->u.l.lens[i + 7] = 0;
+                               d->u.l.lens[i + 8] = 0;
+                               d->u.l.lens[i + 9] = 0;
+                               i += rep_count;
+                       } else {
+                               /* Repeat zero 11 - 138 times  */
+                               STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
+                               rep_count = 11 + POP_BITS(7);
+                               memset(&d->u.l.lens[i], 0,
+                                      rep_count * sizeof(d->u.l.lens[i]));
+                               i += rep_count;
+                       }
+               }
+       } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
+
+               /* Uncompressed block: copy 'len' bytes literally from the input
+                * buffer to the output buffer.  */
+
+               ALIGN_INPUT();
+
+               SAFETY_CHECK(in_end - in_next >= 4);
+
+               len = READ_U16();
+               nlen = READ_U16();
+
+               SAFETY_CHECK(len == (u16)~nlen);
+               if (unlikely(len > out_end - out_next))
+                       return LIBDEFLATE_INSUFFICIENT_SPACE;
+               SAFETY_CHECK(len <= in_end - in_next);
+
+               memcpy(out_next, in_next, len);
+               in_next += len;
+               out_next += len;
+
+               goto block_done;
+
+       } else {
+               SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
+
+               /*
+                * Static Huffman block: build the decode tables for the static
+                * codes.  Skip doing so if the tables are already set up from
+                * an earlier static block; this speeds up decompression of
+                * degenerate input of many empty or very short static blocks.
+                *
+                * Afterwards, the remainder is the same as decompressing a
+                * dynamic Huffman block.
+                */
+
+               if (d->static_codes_loaded)
+                       goto have_decode_tables;
+
+               d->static_codes_loaded = true;
+
+               STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
+               STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
+
+               for (i = 0; i < 144; i++)
+                       d->u.l.lens[i] = 8;
+               for (; i < 256; i++)
+                       d->u.l.lens[i] = 9;
+               for (; i < 280; i++)
+                       d->u.l.lens[i] = 7;
+               for (; i < 288; i++)
+                       d->u.l.lens[i] = 8;
+
+               for (; i < 288 + 32; i++)
+                       d->u.l.lens[i] = 5;
+
+               num_litlen_syms = 288;
+               num_offset_syms = 32;
+       }
+
+       /* Decompressing a Huffman block (either dynamic or static)  */
+
+       SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
+       SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
+have_decode_tables:
+
+       /* The main DEFLATE decode loop  */
+       for (;;) {
+               u32 entry;
+               u32 length;
+               u32 offset;
+               const u8 *src;
+               u8 *dst;
+
+               /* Decode a litlen symbol.  */
+               ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
+               entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)];
+               if (entry & HUFFDEC_SUBTABLE_POINTER) {
+                       /* Litlen subtable required (uncommon case)  */
+                       REMOVE_BITS(LITLEN_TABLEBITS);
+                       entry = d->u.litlen_decode_table[
+                               ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
+                               BITS(entry & HUFFDEC_LENGTH_MASK)];
+               }
+               REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+               if (entry & HUFFDEC_LITERAL) {
+                       /* Literal  */
+                       if (unlikely(out_next == out_end))
+                               return LIBDEFLATE_INSUFFICIENT_SPACE;
+                       *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
+                       continue;
+               }
+
+               /* Match or end-of-block  */
+
+               entry >>= HUFFDEC_RESULT_SHIFT;
+               ENSURE_BITS(MAX_ENSURE);
+
+               /* Pop the extra length bits and add them to the length base to
+                * produce the full length.  */
+               length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
+                        POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
+
+               /* The match destination must not end after the end of the
+                * output buffer.  For efficiency, combine this check with the
+                * end-of-block check.  We're using 0 for the special
+                * end-of-block length, so subtracting 1 turns it into
+                * SIZE_MAX, which always fails the bounds check.  */
+               STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
+               if (unlikely((size_t)length - 1 >= out_end - out_next)) {
+                       if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH))
+                               return LIBDEFLATE_INSUFFICIENT_SPACE;
+                       goto block_done;
+               }
+
+               /* Decode the match offset.  */
+
+               entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
+               if (entry & HUFFDEC_SUBTABLE_POINTER) {
+                       /* Offset subtable required (uncommon case)  */
+                       REMOVE_BITS(OFFSET_TABLEBITS);
+                       entry = d->offset_decode_table[
+                               ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
+                               BITS(entry & HUFFDEC_LENGTH_MASK)];
+               }
+               REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+               entry >>= HUFFDEC_RESULT_SHIFT;
+
+               STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
+                                        DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
+                             CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
+               if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
+                               DEFLATE_MAX_OFFSET_CODEWORD_LEN +
+                               DEFLATE_MAX_EXTRA_OFFSET_BITS))
+                       ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
+
+               /* Pop the extra offset bits and add them to the offset base to
+                * produce the full offset.  */
+               offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
+                        POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
+
+               /* The match source must not begin before the beginning of the
+                * output buffer.  */
+               SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+
+               /*
+                * Copy the match: 'length' bytes at 'out_next - offset' to
+                * 'out_next', possibly overlapping.  If the match doesn't end
+                * too close to the end of the buffer and either offset >=
+                * WORDBYTES or offset == 1, take a fast path which copies a
+                * word at a time
+                * -- potentially more than the length of the match, but that's
+                * fine as long as we check for enough extra space.
+                *
+                * The remaining cases are not performance-critical so are
+                * handled by a simple byte-by-byte copy.
+                */
+
+               src = out_next - offset;
+               dst = out_next;
+               out_next += length;
+
+               if (UNALIGNED_ACCESS_IS_FAST &&
+                   /* max overrun is writing 3 words for a min length match */
+                   likely(out_end - out_next >=
+                          3 * WORDBYTES - DEFLATE_MIN_MATCH_LEN)) {
+                       if (offset >= WORDBYTES) { /* words don't overlap? */
+                               copy_word_unaligned(src, dst);
+                               src += WORDBYTES;
+                               dst += WORDBYTES;
+                               copy_word_unaligned(src, dst);
+                               src += WORDBYTES;
+                               dst += WORDBYTES;
+                               do {
+                                       copy_word_unaligned(src, dst);
+                                       src += WORDBYTES;
+                                       dst += WORDBYTES;
+                               } while (dst < out_next);
+                       } else if (offset == 1) {
+                               /* RLE encoding of previous byte, common if the
+                                * data contains many repeated bytes */
+                               machine_word_t v = repeat_byte(*src);
+
+                               store_word_unaligned(v, dst);
+                               dst += WORDBYTES;
+                               store_word_unaligned(v, dst);
+                               dst += WORDBYTES;
+                               do {
+                                       store_word_unaligned(v, dst);
+                                       dst += WORDBYTES;
+                               } while (dst < out_next);
+                       } else {
+                               *dst++ = *src++;
+                               *dst++ = *src++;
+                               do {
+                                       *dst++ = *src++;
+                               } while (dst < out_next);
+                       }
+               } else {
+                       STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
+                       *dst++ = *src++;
+                       *dst++ = *src++;
+                       do {
+                               *dst++ = *src++;
+                       } while (dst < out_next);
+               }
+       }
+
+block_done:
+       /* Finished decoding a block.  */
+
+       if (!is_final_block)
+               goto next_block;
+
+       /* That was the last block.  */
+
+       /* Discard any readahead bits and check for excessive overread */
+       ALIGN_INPUT();
+
+       /* Optionally return the actual number of bytes read */
+       if (actual_in_nbytes_ret)
+               *actual_in_nbytes_ret = in_next - (u8 *)in;
+
+       /* Optionally return the actual number of bytes written */
+       if (actual_out_nbytes_ret) {
+               *actual_out_nbytes_ret = out_next - (u8 *)out;
+       } else {
+               if (out_next != out_end)
+                       return LIBDEFLATE_SHORT_OUTPUT;
+       }
+       return LIBDEFLATE_SUCCESS;
+}
+
+#undef FUNCNAME
+#undef ATTRIBUTES
diff --git a/src/3rdparty/libdeflate/lib/deflate_compress.c b/src/3rdparty/libdeflate/lib/deflate_compress.c
new file mode 100644 (file)
index 0000000..377b51f
--- /dev/null
@@ -0,0 +1,3793 @@
+/*
+ * deflate_compress.c - a compressor for DEFLATE
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "deflate_compress.h"
+#include "deflate_constants.h"
+#include "unaligned.h"
+
+#include "libdeflate.h"
+
+/******************************************************************************/
+
+/*
+ * The following parameters can be changed at build time to customize the
+ * compression algorithms slightly:
+ *
+ * (Note, not all customizable parameters are here.  Some others can be found in
+ * libdeflate_alloc_compressor() and in *_matchfinder.h.)
+ */
+
+/*
+ * If this parameter is defined to 1, then the near-optimal parsing algorithm
+ * will be included, and compression levels 10-12 will use it.  This algorithm
+ * usually produces a compression ratio significantly better than the other
+ * algorithms.  However, it is slow.  If this parameter is defined to 0, then
+ * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm.
+ */
+#define SUPPORT_NEAR_OPTIMAL_PARSING   1
+
+/*
+ * This is the minimum block length that the compressor will use, in
+ * uncompressed bytes.  It is also approximately the amount by which the final
+ * block is allowed to grow past the soft maximum length in order to avoid using
+ * a very short block at the end.  This should be a value below which using
+ * shorter blocks is unlikely to be worthwhile, due to the per-block overhead.
+ *
+ * Defining a fixed minimum block length is needed in order to guarantee a
+ * reasonable upper bound on the compressed size.  It's also needed because our
+ * block splitting algorithm doesn't work well on very short blocks.
+ */
+#define MIN_BLOCK_LENGTH       5000
+
+/*
+ * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft
+ * maximum block length, in uncompressed bytes.  The compressor will try to end
+ * blocks at this length, but it may go slightly past it if there is a match
+ * that straddles this limit or if the input data ends soon after this limit.
+ * This parameter doesn't apply to uncompressed blocks, which the DEFLATE format
+ * limits to 65535 bytes.
+ *
+ * This should be a value above which it is very likely that splitting the block
+ * would produce a better compression ratio.  For the near-optimal compressor,
+ * increasing/decreasing this parameter will increase/decrease per-compressor
+ * memory usage linearly.
+ */
+#define SOFT_MAX_BLOCK_LENGTH  300000
+
+/*
+ * For the greedy, lazy, and lazy2 compressors: this is the length of the
+ * sequence store, which is an array where the compressor temporarily stores
+ * matches that it's going to use in the current block.  This value is the
+ * maximum number of matches that can be used in a block.  If the sequence store
+ * fills up, then the compressor will be forced to end the block early.  This
+ * value should be large enough so that this rarely happens, due to the block
+ * being ended normally before then.  Increasing/decreasing this value will
+ * increase/decrease per-compressor memory usage linearly.
+ */
+#define SEQ_STORE_LENGTH       50000
+
+/*
+ * For deflate_compress_fastest(): This is the soft maximum block length.
+ * deflate_compress_fastest() doesn't use the regular block splitting algorithm;
+ * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or
+ * FAST_SEQ_STORE_LENGTH matches.  Therefore, this value should be lower than
+ * the regular SOFT_MAX_BLOCK_LENGTH.
+ */
+#define FAST_SOFT_MAX_BLOCK_LENGTH     65535
+
+/*
+ * For deflate_compress_fastest(): this is the length of the sequence store.
+ * This is like SEQ_STORE_LENGTH, but this should be a lower value.
+ */
+#define FAST_SEQ_STORE_LENGTH  8192
+
+/*
+ * These are the maximum codeword lengths, in bits, the compressor will use for
+ * each Huffman code.  The DEFLATE format defines limits for these.  However,
+ * further limiting litlen codewords to 14 bits is beneficial, since it has
+ * negligible effect on compression ratio but allows some optimizations when
+ * outputting bits.  (It allows 4 literals to be written at once rather than 3.)
+ */
+#define MAX_LITLEN_CODEWORD_LEN                14
+#define MAX_OFFSET_CODEWORD_LEN                DEFLATE_MAX_OFFSET_CODEWORD_LEN
+#define MAX_PRE_CODEWORD_LEN           DEFLATE_MAX_PRE_CODEWORD_LEN
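+
+/*
+ * Rough arithmetic behind the "4 literals rather than 3" remark above,
+ * assuming a 64-bit output bit buffer that can carry up to 7 leftover bits
+ * after a byte-granular flush (an assumption about the output routine, which
+ * is not shown here): 7 + 4*14 = 63 <= 64 bits, whereas 7 + 4*15 = 67 > 64.
+ */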
+
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+
+/* Parameters specific to the near-optimal parsing algorithm */
+
+/*
+ * BIT_COST is a scaling factor that allows the near-optimal compressor to
+ * consider fractional bit costs when deciding which literal/match sequence to
+ * use.  This is useful when the true symbol costs are unknown.  For example, if
+ * the compressor thinks that a symbol has 6.5 bits of entropy, it can set its
+ * cost to 6.5 bits rather than have to use 6 or 7 bits.  Although in the end
+ * each symbol will use a whole number of bits due to the Huffman coding,
+ * considering fractional bits can be helpful due to the limited information.
+ *
+ * BIT_COST should be a power of 2.  A value of 8 or 16 works well.  A higher
+ * value isn't very useful since the calculations are approximate anyway.
+ *
+ * BIT_COST doesn't apply to deflate_flush_block(), which considers whole bits.
+ */
+#define BIT_COST       16
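+
+/*
+ * Worked example: with BIT_COST == 16, a symbol estimated to carry 6.5 bits
+ * of entropy can be given a cost of 6.5 * 16 = 104 cost units, rather than
+ * being rounded to a whole 6 or 7 bits.
+ */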
+
+/*
+ * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
+ * be needed to output a symbol that was unused in the previous optimization
+ * pass.  Assigning a default cost allows the symbol to be used in the next
+ * optimization pass.  However, the cost should be relatively high because the
+ * symbol probably won't be used very many times (if at all).
+ */
+#define LITERAL_NOSTAT_BITS    13
+#define LENGTH_NOSTAT_BITS     13
+#define OFFSET_NOSTAT_BITS     10
+
+/*
+ * This is (slightly less than) the maximum number of matches that the
+ * near-optimal compressor will cache per block.  This behaves similarly to
+ * SEQ_STORE_LENGTH for the other compressors.
+ */
+#define MATCH_CACHE_LENGTH     (SOFT_MAX_BLOCK_LENGTH * 5)
+
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+/******************************************************************************/
+
+/* Include the needed matchfinders. */
+#define MATCHFINDER_WINDOW_ORDER       DEFLATE_WINDOW_ORDER
+#include "hc_matchfinder.h"
+#include "ht_matchfinder.h"
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+#  include "bt_matchfinder.h"
+/*
+ * This is the maximum number of matches the binary trees matchfinder can find
+ * at a single position.  Since the matchfinder never finds more than one match
+ * for the same length, assuming one match of each possible length gives an
+ * upper bound.  (This says nothing about whether it is worthwhile to
+ * consider so many matches; this is just defining the worst case.)
+ */
+#define MAX_MATCHES_PER_POS    \
+       (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
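+/*
+ * With the standard DEFLATE limits (DEFLATE_MIN_MATCH_LEN == 3 and
+ * DEFLATE_MAX_MATCH_LEN == 258), this works out to 258 - 3 + 1 = 256
+ * possible matches per position.
+ */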
+#endif
+
+/*
+ * The largest block length we will ever use is when the final block is of
+ * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of
+ * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN.  The latter case
+ * occurs when the lazy2 compressor chooses two literals and a maximum-length
+ * match, starting at SOFT_MAX_BLOCK_LENGTH - 1.
+ */
+#define MAX_BLOCK_LENGTH       \
+       MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1,       \
+           SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN)
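+/*
+ * With the values chosen above (SOFT_MAX_BLOCK_LENGTH == 300000,
+ * MIN_BLOCK_LENGTH == 5000, DEFLATE_MAX_MATCH_LEN == 258), this evaluates to
+ * MAX(304999, 300259) == 304999 uncompressed bytes.
+ */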
+
+static forceinline void
+check_buildtime_parameters(void)
+{
+       /*
+        * Verify that MIN_BLOCK_LENGTH is being honored, as
+        * libdeflate_deflate_compress_bound() depends on it.
+        */
+       STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH);
+       STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH);
+       STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >=
+                     MIN_BLOCK_LENGTH);
+       STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >=
+                     MIN_BLOCK_LENGTH);
+
+       /* The definition of MAX_BLOCK_LENGTH assumes this. */
+       STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH);
+
+       /* Verify that the sequence stores aren't uselessly large. */
+       STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <=
+                     SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH);
+       STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <=
+                     FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH);
+
+       /* Verify that the maximum codeword lengths are valid. */
+       STATIC_ASSERT(
+               MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
+       STATIC_ASSERT(
+               MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
+       STATIC_ASSERT(
+               MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
+       STATIC_ASSERT(
+               (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS);
+       STATIC_ASSERT(
+               (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS);
+       STATIC_ASSERT(
+               (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS);
+}
+
+/******************************************************************************/
+
+/* Table: length slot => length slot base value */
+static const unsigned deflate_length_slot_base[] = {
+       3,    4,    5,    6,    7,    8,    9,    10,
+       11,   13,   15,   17,   19,   23,   27,   31,
+       35,   43,   51,   59,   67,   83,   99,   115,
+       131,  163,  195,  227,  258,
+};
+
+/* Table: length slot => number of extra length bits */
+static const u8 deflate_extra_length_bits[] = {
+       0,    0,    0,    0,    0,    0,    0,    0,
+       1,    1,    1,    1,    2,    2,    2,    2,
+       3,    3,    3,    3,    4,    4,    4,    4,
+       5,    5,    5,    5,    0,
+};
+
+/* Table: offset slot => offset slot base value */
+static const unsigned deflate_offset_slot_base[] = {
+       1,     2,     3,     4,     5,     7,     9,     13,
+       17,    25,    33,    49,    65,    97,    129,   193,
+       257,   385,   513,   769,   1025,  1537,  2049,  3073,
+       4097,  6145,  8193,  12289, 16385, 24577,
+};
+
+/* Table: offset slot => number of extra offset bits */
+static const u8 deflate_extra_offset_bits[] = {
+       0,     0,     0,     0,     1,     1,     2,     2,
+       3,     3,     4,     4,     5,     5,     6,     6,
+       7,     7,     8,     8,     9,     9,     10,    10,
+       11,    11,    12,    12,    13,    13,
+};
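+
+/*
+ * Worked example using the four tables above: a match of length 100 falls in
+ * length slot 22 (base 99, 4 extra bits), so it is encoded as the symbol for
+ * slot 22 plus the extra-bit value 100 - 99 = 1.  Likewise, a match offset of
+ * 1500 falls in offset slot 20 (base 1025, 9 extra bits), encoded as the
+ * symbol for slot 20 plus the extra-bit value 1500 - 1025 = 475.
+ */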
+
+/* Table: length => length slot */
+static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
+       0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
+       12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
+       16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
+       18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+       20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+       21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+       22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+       23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+       24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
+       25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+       25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
+       26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+       26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+       27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+       27, 27, 28,
+};
+
+/*
+ * A condensed table which maps offset => offset slot as follows:
+ *
+ *     offset <= 256: deflate_offset_slot[offset]
+ *     offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)]
+ *
+ * This table was generated by scripts/gen_offset_slot_map.py.
+ */
+static const u8 deflate_offset_slot[512] = {
+       0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7,
+       7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9,
+       9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+       10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+       11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+       12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+       13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+       13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+       14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+       14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+       14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+       14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+       15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+       15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+       15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+       15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+       22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+       24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+       25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+       26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+       26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+       27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+       27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+};
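+
+/*
+ * For illustration (not part of the original source): offset 5 takes the first
+ * branch and reads deflate_offset_slot[5] == 4, while offset 300 takes the
+ * second branch and reads deflate_offset_slot[256 + ((300 - 1) >> 7)], i.e.
+ * deflate_offset_slot[258] == 16.
+ */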
+
+/* The order in which precode codeword lengths are stored */
+static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
+       16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
+};
+
+/* Codewords for the DEFLATE Huffman codes */
+struct deflate_codewords {
+       u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
+       u32 offset[DEFLATE_NUM_OFFSET_SYMS];
+};
+
+/*
+ * Codeword lengths (in bits) for the DEFLATE Huffman codes.
+ * A zero length means the corresponding symbol had zero frequency.
+ */
+struct deflate_lens {
+       u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
+       u8 offset[DEFLATE_NUM_OFFSET_SYMS];
+};
+
+/* Codewords and lengths for the DEFLATE Huffman codes */
+struct deflate_codes {
+       struct deflate_codewords codewords;
+       struct deflate_lens lens;
+};
+
+/* Symbol frequency counters for the DEFLATE Huffman codes */
+struct deflate_freqs {
+       u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
+       u32 offset[DEFLATE_NUM_OFFSET_SYMS];
+};
+
+/*
+ * Represents a run of literals followed by a match or end-of-block.  This
+ * struct is needed to temporarily store items chosen by the parser, since items
+ * cannot be written until all items for the block have been chosen and the
+ * block's Huffman codes have been computed.
+ */
+struct deflate_sequence {
+
+       /*
+        * Bits 0..22: the number of literals in this run.  This may be 0 and
+        * can be at most MAX_BLOCK_LENGTH.  The literals are not stored
+        * explicitly in this structure; instead, they are read directly from
+        * the uncompressed data.
+        *
+        * Bits 23..31: the length of the match which follows the literals, or 0
+        * if this literal run was the last in the block, so there is no match
+        * which follows it.
+        */
+#define SEQ_LENGTH_SHIFT 23
+#define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1)
+       u32 litrunlen_and_length;
+
+       /*
+        * If 'length' doesn't indicate end-of-block, then this is the offset of
+        * the match which follows the literals.
+        */
+       u16 offset;
+
+       /*
+        * If 'length' doesn't indicate end-of-block, then this is the offset
+        * symbol of the match which follows the literals.
+        */
+       u8 offset_symbol;
+
+       /*
+        * If 'length' doesn't indicate end-of-block, then this is the length
+        * slot of the match which follows the literals.
+        */
+       u8 length_slot;
+};
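+
+/*
+ * Illustrative sketch (not part of the original source): how the packed
+ * 'litrunlen_and_length' field above can be built and taken apart using the
+ * SEQ_LENGTH_SHIFT / SEQ_LITRUNLEN_MASK constants.  The helper names are
+ * hypothetical and the block is disabled.
+ */
+#if 0
+static forceinline u32
+seq_pack_example(u32 litrunlen, u32 length)
+{
+       /* literal run length in bits 0..22, match length in bits 23..31 */
+       return litrunlen | (length << SEQ_LENGTH_SHIFT);
+}
+
+static forceinline void
+seq_unpack_example(u32 packed, u32 *litrunlen, u32 *length)
+{
+       *litrunlen = packed & SEQ_LITRUNLEN_MASK;
+       *length = packed >> SEQ_LENGTH_SHIFT;
+}
+#endif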
+
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+
+/* Costs for the near-optimal parsing algorithm */
+struct deflate_costs {
+
+       /* The cost to output each possible literal */
+       u32 literal[DEFLATE_NUM_LITERALS];
+
+       /* The cost to output each possible match length */
+       u32 length[DEFLATE_MAX_MATCH_LEN + 1];
+
+       /* The cost to output a match offset of each possible offset slot */
+       u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
+};
+
+/*
+ * This structure represents a byte position in the input data and a node in the
+ * graph of possible match/literal choices for the current block.
+ *
+ * Logically, each incoming edge to this node is labeled with a literal or a
+ * match that can be taken to reach this position from an earlier position; and
+ * each outgoing edge from this node is labeled with a literal or a match that
+ * can be taken to advance from this position to a later position.
+ *
+ * But these "edges" are actually stored elsewhere (in 'match_cache').  Here we
+ * associate with each node just two pieces of information:
+ *
+ *     'cost_to_end' is the minimum cost to reach the end of the block from
+ *     this position.
+ *
+ *     'item' represents the literal or match that must be chosen from here to
+ *     reach the end of the block with the minimum cost.  Equivalently, this
+ *     can be interpreted as the label of the outgoing edge on the minimum-cost
+ *     path to the "end of block" node from this node.
+ */
+struct deflate_optimum_node {
+
+       u32 cost_to_end;
+
+       /*
+        * Notes on the match/literal representation used here:
+        *
+        *      The low bits of 'item' are the length: 1 if this is a literal,
+        *      or the match length if this is a match.
+        *
+        *      The high bits of 'item' are the actual literal byte if this is a
+        *      literal, or the match offset if this is a match.
+        */
+#define OPTIMUM_OFFSET_SHIFT 9
+#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
+       u32 item;
+
+};
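+
+/*
+ * For illustration (not part of the original source): a literal byte 0x41
+ * would be stored as item == (0x41 << OPTIMUM_OFFSET_SHIFT) | 1, while a
+ * length-10 match at offset 100 would be stored as
+ * item == (100 << OPTIMUM_OFFSET_SHIFT) | 10.
+ */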
+
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+/* Block split statistics.  See "Block splitting algorithm" below. */
+#define NUM_LITERAL_OBSERVATION_TYPES 8
+#define NUM_MATCH_OBSERVATION_TYPES 2
+#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
+                              NUM_MATCH_OBSERVATION_TYPES)
+#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
+struct block_split_stats {
+       u32 new_observations[NUM_OBSERVATION_TYPES];
+       u32 observations[NUM_OBSERVATION_TYPES];
+       u32 num_new_observations;
+       u32 num_observations;
+};
+
+/* The main DEFLATE compressor structure */
+struct libdeflate_compressor {
+
+       /* Pointer to the compress() implementation chosen at allocation time */
+       size_t (*impl)(struct libdeflate_compressor *c, const u8 *in,
+                      size_t in_nbytes, u8 *out, size_t out_nbytes_avail);
+
+       /* The compression level with which this compressor was created */
+       unsigned compression_level;
+
+       /* Anything smaller than this we won't bother trying to compress. */
+       unsigned min_size_to_compress;
+
+       /*
+        * The maximum search depth: consider at most this many potential
+        * matches at each position
+        */
+       unsigned max_search_depth;
+
+       /*
+        * The "nice" match length: if a match of this length is found, choose
+        * it immediately without further consideration
+        */
+       unsigned nice_match_length;
+
+       /* Frequency counters for the current block */
+       struct deflate_freqs freqs;
+
+       /* Block split statistics for the current block */
+       struct block_split_stats split_stats;
+
+       /* Dynamic Huffman codes for the current block */
+       struct deflate_codes codes;
+
+       /* The static Huffman codes defined by the DEFLATE format */
+       struct deflate_codes static_codes;
+
+       /* Temporary space for Huffman code output */
+       u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
+       u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
+       u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
+       unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS +
+                              DEFLATE_NUM_OFFSET_SYMS];
+       unsigned num_litlen_syms;
+       unsigned num_offset_syms;
+       unsigned num_explicit_lens;
+       unsigned num_precode_items;
+
+       union {
+               /* Data for greedy or lazy parsing */
+               struct {
+                       /* Hash chains matchfinder */
+                       struct hc_matchfinder hc_mf;
+
+                       /* Matches and literals chosen for the current block */
+                       struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1];
+
+               } g; /* (g)reedy */
+
+               /* Data for fastest parsing */
+               struct {
+                       /* Hash table matchfinder */
+                       struct ht_matchfinder ht_mf;
+
+                       /* Matches and literals chosen for the current block */
+                       struct deflate_sequence sequences[
+                                               FAST_SEQ_STORE_LENGTH + 1];
+
+               } f; /* (f)astest */
+
+       #if SUPPORT_NEAR_OPTIMAL_PARSING
+               /* Data for near-optimal parsing */
+               struct {
+
+                       /* Binary tree matchfinder */
+                       struct bt_matchfinder bt_mf;
+
+                       /*
+                        * Cached matches for the current block.  This array
+                        * contains the matches that were found at each position
+                        * in the block.  Specifically, for each position, there
+                        * is a list of matches found at that position, if any,
+                        * sorted by strictly increasing length.  In addition,
+                        * following the matches for each position, there is a
+                        * special 'struct lz_match' whose 'length' member
+                        * contains the number of matches found at that
+                        * position, and whose 'offset' member contains the
+                        * literal at that position.
+                        *
+                        * Note: in rare cases, there will be a very high number
+                        * of matches in the block and this array will overflow.
+                        * If this happens, we force the end of the current
+                        * block.  MATCH_CACHE_LENGTH is the length at which we
+                        * actually check for overflow.  The extra slots beyond
+                        * this are enough to absorb the worst case overflow,
+                        * which occurs if starting at
+                        * &match_cache[MATCH_CACHE_LENGTH - 1], we write
+                        * MAX_MATCHES_PER_POS matches and a match count header,
+                        * then skip searching for matches at
+                        * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the
+                        * match count header for each.
+                        */
+                       struct lz_match match_cache[MATCH_CACHE_LENGTH +
+                                                   MAX_MATCHES_PER_POS +
+                                                   DEFLATE_MAX_MATCH_LEN - 1];
+
+                       /*
+                        * Array of nodes, one per position, for running the
+                        * minimum-cost path algorithm.
+                        *
+                        * This array must be large enough to accommodate the
+                        * worst-case number of nodes, which is MAX_BLOCK_LENGTH
+                        * plus 1 for the end-of-block node.
+                        */
+                       struct deflate_optimum_node optimum_nodes[
+                               MAX_BLOCK_LENGTH + 1];
+
+                       /* The current cost model being used */
+                       struct deflate_costs costs;
+
+                       /*
+                        * A table that maps match offset to offset slot.  This
+                        * differs from deflate_offset_slot[] in that this is a
+                        * full map, not a condensed one.  The full map is more
+                        * appropriate for the near-optimal parser, since the
+                        * near-optimal parser does more offset => offset_slot
+                        * translations, it doesn't intersperse them with
+                        * matchfinding (so cache evictions are less of a
+                        * concern), and it uses more memory anyway.
+                        */
+                       u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1];
+
+                       /* Literal/match statistics saved from previous block */
+                       u32 prev_observations[NUM_OBSERVATION_TYPES];
+                       u32 prev_num_observations;
+
+                       /*
+                        * Approximate match length frequencies based on a
+                        * greedy parse, gathered during matchfinding.  This is
+                        * used for setting the initial symbol costs.
+                        */
+                       u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
+                       u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
+
+                       unsigned num_optim_passes;
+               } n; /* (n)ear-optimal */
+       #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+       } p; /* (p)arser */
+};
+
+/*
+ * The type for the bitbuffer variable, which temporarily holds bits that are
+ * being packed into bytes and written to the output buffer.  For best
+ * performance, this should have size equal to a machine word.
+ */
+typedef machine_word_t bitbuf_t;
+#define BITBUF_NBITS   (8 * sizeof(bitbuf_t))
+
+/*
+ * Can the specified number of bits always be added to 'bitbuf' after any
+ * pending bytes have been flushed?
+ */
+#define CAN_BUFFER(n)  ((n) <= BITBUF_NBITS - 7)
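+
+/*
+ * For illustration (not part of the original source): on a 64-bit target
+ * BITBUF_NBITS is 64, so CAN_BUFFER(n) holds for n <= 57; on a 32-bit target
+ * it holds only for n <= 25.  The conditional flushes further below are
+ * compiled in or out based on CAN_BUFFER() of the worst-case number of bits
+ * being added.
+ */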
+
+/*
+ * Structure to keep track of the current state of sending bits to the
+ * compressed output buffer
+ */
+struct deflate_output_bitstream {
+
+       /* Bits that haven't yet been written to the output buffer */
+       bitbuf_t bitbuf;
+
+       /* Number of bits currently held in @bitbuf */
+       unsigned bitcount;
+
+       /* Pointer to the beginning of the output buffer */
+       u8 *begin;
+
+       /*
+        * Pointer to the position in the output buffer at which the next byte
+        * should be written
+        */
+       u8 *next;
+
+       /* Pointer to just past the end of the output buffer */
+       u8 *end;
+};
+
+/*
+ * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
+ * present following os->end, in order to not overrun the buffer when generating
+ * output.  When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
+ * bytes for put_unaligned_leword().  Otherwise we need only 1 byte.  However,
+ * to make the compression algorithm produce the same result on all CPU
+ * architectures (which is sometimes desirable), we have to unconditionally use
+ * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
+ */
+#define OUTPUT_END_PADDING     8
+
+/*
+ * Initialize the output bitstream.  'size' must be at least OUTPUT_END_PADDING.
+ */
+static void
+deflate_init_output(struct deflate_output_bitstream *os,
+                   void *buffer, size_t size)
+{
+       os->bitbuf = 0;
+       os->bitcount = 0;
+       os->begin = buffer;
+       os->next = os->begin;
+       os->end = os->begin + size - OUTPUT_END_PADDING;
+}
+
+/*
+ * Add some bits to the bitbuffer variable of the output bitstream.  The caller
+ * must ensure that os->bitcount + num_bits <= BITBUF_NBITS, by calling
+ * deflate_flush_bits() frequently enough.
+ */
+static forceinline void
+deflate_add_bits(struct deflate_output_bitstream *os,
+                bitbuf_t bits, unsigned num_bits)
+{
+       os->bitbuf |= bits << os->bitcount;
+       os->bitcount += num_bits;
+}
+
+/* Flush bits from the bitbuffer variable to the output buffer. */
+static forceinline void
+deflate_flush_bits(struct deflate_output_bitstream *os)
+{
+       if (UNALIGNED_ACCESS_IS_FAST) {
+               /* Flush a whole word (branchlessly). */
+               put_unaligned_leword(os->bitbuf, os->next);
+               os->bitbuf >>= os->bitcount & ~7;
+               os->next += MIN(os->end - os->next, os->bitcount >> 3);
+               os->bitcount &= 7;
+       } else {
+               /* Flush a byte at a time. */
+               while (os->bitcount >= 8) {
+                       *os->next = os->bitbuf;
+                       if (os->next != os->end)
+                               os->next++;
+                       os->bitcount -= 8;
+                       os->bitbuf >>= 8;
+               }
+       }
+}
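+
+/*
+ * For illustration (not part of the original source): in the fast path above,
+ * a bitcount of 13 shifts 13 & ~7 == 8 bits out of 'bitbuf', advances 'next'
+ * by 13 >> 3 == 1 byte (clamped so it never passes 'end'), and leaves
+ * bitcount == 13 & 7 == 5.
+ */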
+
+/*
+ * Add bits, then flush right away.  Only use this where it is difficult to
+ * batch up calls to deflate_add_bits().
+ */
+static forceinline void
+deflate_write_bits(struct deflate_output_bitstream *os,
+                  bitbuf_t bits, unsigned num_bits)
+{
+       deflate_add_bits(os, bits, num_bits);
+       deflate_flush_bits(os);
+}
+
+/* Align the bitstream on a byte boundary. */
+static forceinline void
+deflate_align_bitstream(struct deflate_output_bitstream *os)
+{
+       os->bitcount += -os->bitcount & 7;
+       deflate_flush_bits(os);
+}
+
+/*
+ * Flush any remaining bits to the output buffer if needed.  Return the total
+ * number of bytes that have been written to the output buffer since
+ * deflate_init_output(), or 0 if an overflow occurred.
+ */
+static size_t
+deflate_flush_output(struct deflate_output_bitstream *os)
+{
+       if (os->next == os->end) /* overflow? */
+               return 0;
+
+       while ((int)os->bitcount > 0) {
+               *os->next++ = os->bitbuf;
+               os->bitcount -= 8;
+               os->bitbuf >>= 8;
+       }
+
+       return os->next - os->begin;
+}
+
+/*
+ * Given the binary tree node A[subtree_idx] whose children already satisfy the
+ * maxheap property, swap the node with its greater child until it is greater
+ * than or equal to both of its children, so that the maxheap property is
+ * satisfied in the subtree rooted at A[subtree_idx].  'A' uses 1-based indices.
+ */
+static void
+heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
+{
+       unsigned parent_idx;
+       unsigned child_idx;
+       u32 v;
+
+       v = A[subtree_idx];
+       parent_idx = subtree_idx;
+       while ((child_idx = parent_idx * 2) <= length) {
+               if (child_idx < length && A[child_idx + 1] > A[child_idx])
+                       child_idx++;
+               if (v >= A[child_idx])
+                       break;
+               A[parent_idx] = A[child_idx];
+               parent_idx = child_idx;
+       }
+       A[parent_idx] = v;
+}
+
+/*
+ * Rearrange the array 'A' so that it satisfies the maxheap property.
+ * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
+ */
+static void
+heapify_array(u32 A[], unsigned length)
+{
+       unsigned subtree_idx;
+
+       for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
+               heapify_subtree(A, length, subtree_idx);
+}
+
+/*
+ * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
+ *
+ * Note: name this function heap_sort() instead of heapsort() to avoid colliding
+ * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
+ * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
+ */
+static void
+heap_sort(u32 A[], unsigned length)
+{
+       A--; /* Use 1-based indices  */
+
+       heapify_array(A, length);
+
+       while (length >= 2) {
+               u32 tmp = A[length];
+
+               A[length] = A[1];
+               A[1] = tmp;
+               length--;
+               heapify_subtree(A, length, 1);
+       }
+}
+
+#define NUM_SYMBOL_BITS 10
+#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
+
+#define GET_NUM_COUNTERS(num_syms)     (num_syms)
+
+/*
+ * Sort the symbols primarily by frequency and secondarily by symbol value.
+ * Discard symbols with zero frequency and fill in an array with the remaining
+ * symbols, along with their frequencies.  The low NUM_SYMBOL_BITS bits of each
+ * array entry will contain the symbol value, and the remaining bits will
+ * contain the frequency.
+ *
+ * @num_syms
+ *     Number of symbols in the alphabet.
+ *     Can't be greater than (1 << NUM_SYMBOL_BITS).
+ *
+ * @freqs[num_syms]
+ *     The frequency of each symbol.
+ *
+ * @lens[num_syms]
+ *     An array that eventually will hold the length of each codeword.  This
+ *     function only fills in the codeword lengths for symbols that have zero
+ *     frequency, which are not well defined per se but will be set to 0.
+ *
+ * @symout[num_syms]
+ *     The output array, described above.
+ *
+ * Returns the number of entries in 'symout' that were filled.  This is the
+ * number of symbols that have nonzero frequency.
+ */
+static unsigned
+sort_symbols(unsigned num_syms, const u32 freqs[restrict],
+            u8 lens[restrict], u32 symout[restrict])
+{
+       unsigned sym;
+       unsigned i;
+       unsigned num_used_syms;
+       unsigned num_counters;
+       unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
+
+       /*
+        * We use heapsort, but with an added optimization.  Since often most
+        * symbol frequencies are low, we first do a count sort using a limited
+        * number of counters.  High frequencies are counted in the last
+        * counter, and only they will be sorted with heapsort.
+        *
+        * Note: with more symbols, it is generally beneficial to have more
+        * counters.  About 1 counter per symbol seems fastest.
+        */
+
+       num_counters = GET_NUM_COUNTERS(num_syms);
+
+       memset(counters, 0, num_counters * sizeof(counters[0]));
+
+       /* Count the frequencies. */
+       for (sym = 0; sym < num_syms; sym++)
+               counters[MIN(freqs[sym], num_counters - 1)]++;
+
+       /*
+        * Make the counters cumulative, ignoring the zero-th, which counted
+        * symbols with zero frequency.  As a side effect, this calculates the
+        * number of symbols with nonzero frequency.
+        */
+       num_used_syms = 0;
+       for (i = 1; i < num_counters; i++) {
+               unsigned count = counters[i];
+
+               counters[i] = num_used_syms;
+               num_used_syms += count;
+       }
+
+       /*
+        * Sort nonzero-frequency symbols using the counters.  At the same time,
+        * set the codeword lengths of zero-frequency symbols to 0.
+        */
+       for (sym = 0; sym < num_syms; sym++) {
+               u32 freq = freqs[sym];
+
+               if (freq != 0) {
+                       symout[counters[MIN(freq, num_counters - 1)]++] =
+                               sym | (freq << NUM_SYMBOL_BITS);
+               } else {
+                       lens[sym] = 0;
+               }
+       }
+
+       /* Sort the symbols counted in the last counter. */
+       heap_sort(symout + counters[num_counters - 2],
+                 counters[num_counters - 1] - counters[num_counters - 2]);
+
+       return num_used_syms;
+}
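+
+/*
+ * For illustration (not part of the original source): with NUM_SYMBOL_BITS ==
+ * 10, symbol 65 with frequency 3 is stored in 'symout' as (3 << 10) | 65.
+ * Comparing these packed values numerically therefore orders primarily by
+ * frequency and secondarily by symbol value, which is exactly the order needed
+ * for canonical code generation.
+ */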
+
+/*
+ * Build a Huffman tree.
+ *
+ * This is an optimized implementation that
+ *     (a) takes advantage of the frequencies being already sorted;
+ *     (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman
+ *         tree are sufficient to generate a canonical code;
+ *     (c) only stores parent pointers, not child pointers;
+ *     (d) produces the nodes in the same memory used for input frequency
+ *         information.
+ *
+ * Array 'A', which contains 'sym_count' entries, is used for both input and
+ * output.  For this function, 'sym_count' must be at least 2.
+ *
+ * For input, the array must contain the frequencies of the symbols, sorted in
+ * increasing order.  Specifically, each entry must contain a frequency left
+ * shifted by NUM_SYMBOL_BITS bits.  Any data in the low NUM_SYMBOL_BITS bits of
+ * the entries will be ignored by this function.  Although these bits will, in
+ * fact, contain the symbols that correspond to the frequencies, this function
+ * is concerned with frequencies only and keeps the symbols as-is.
+ *
+ * For output, this function will produce the non-leaf nodes of the Huffman
+ * tree.  These nodes will be stored in the first (sym_count - 1) entries of the
+ * array.  Entry A[sym_count - 2] will represent the root node.  Each other node
+ * will contain the zero-based index of its parent node in 'A', left shifted by
+ * NUM_SYMBOL_BITS bits.  The low NUM_SYMBOL_BITS bits of each entry in A will
+ * be kept as-is.  Again, note that although these low bits will, in fact,
+ * contain a symbol value, this symbol will have *no relationship* with the
+ * Huffman tree node that happens to occupy the same slot.  This is because this
+ * implementation only generates the non-leaf nodes of the tree.
+ */
+static void
+build_tree(u32 A[], unsigned sym_count)
+{
+       /*
+        * Index, in 'A', of next lowest frequency symbol that has not yet been
+        * processed.
+        */
+       unsigned i = 0;
+
+       /*
+        * Index, in 'A', of next lowest frequency parentless non-leaf node; or,
+        * if equal to 'e', then no such node exists yet.
+        */
+       unsigned b = 0;
+
+       /* Index, in 'A', of next node to allocate as a non-leaf. */
+       unsigned e = 0;
+
+       do {
+               unsigned m, n;
+               u32 freq_shifted;
+
+               /* Choose the two next lowest frequency entries. */
+
+               if (i != sym_count &&
+                   (b == e ||
+                    (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
+                       m = i++;
+               else
+                       m = b++;
+
+               if (i != sym_count &&
+                   (b == e ||
+                    (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
+                       n = i++;
+               else
+                       n = b++;
+
+               /*
+                * Allocate a non-leaf node and link the entries to it.
+                *
+                * If we link an entry that we're visiting for the first time
+                * (via index 'i'), then we're actually linking a leaf node and
+                * it will have no effect, since the leaf will be overwritten
+                * with a non-leaf when index 'e' catches up to it.  But it's
+                * not any slower to unconditionally set the parent index.
+                *
+                * We also compute the frequency of the non-leaf node as the sum
+                * of its two children's frequencies.
+                */
+
+               freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
+
+               A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
+               A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
+               A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
+               e++;
+       } while (sym_count - e > 1);
+               /*
+                * When just one entry remains, it is a "leaf" that was linked
+                * to some other node.  We ignore it, since the rest of the
+                * array contains the non-leaves which we need.  (Note that
+                * we're assuming the cases with 0 or 1 symbols were handled
+                * separately.)
+                */
+}
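+
+/*
+ * For illustration (not part of the original source): for four symbols with
+ * sorted frequencies 1, 2, 3 and 4, the merges above create internal nodes
+ * with frequencies 3 (1+2), 6 (3+3) and 10 (6+4).  When the loop finishes,
+ * A[0] and A[1] hold the indices of their parents (1 and 2), and A[2], the
+ * root, holds the total frequency 10 in its high bits.
+ */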
+
+/*
+ * Given the stripped-down Huffman tree constructed by build_tree(), determine
+ * the number of codewords that should be assigned each possible length, taking
+ * into account the length-limited constraint.
+ *
+ * @A
+ *     The array produced by build_tree(), containing parent index information
+ *     for the non-leaf nodes of the Huffman tree.  Each entry in this array is
+ *     a node; a node's parent always has a greater index than that node
+ *     itself.  This function will overwrite the parent index information in
+ *     this array, so essentially it will destroy the tree.  However, the data
+ *     in the low NUM_SYMBOL_BITS of each entry will be preserved.
+ *
+ * @root_idx
+ *     The 0-based index of the root node in 'A', and consequently one less
+ *     than the number of tree node entries in 'A'.  (Or, really 2 less than
+ *     the actual length of 'A'.)
+ *
+ * @len_counts
+ *     An array of length ('max_codeword_len' + 1) in which the number of
+ *     codewords having each length <= max_codeword_len will be returned.
+ *
+ * @max_codeword_len
+ *     The maximum permissible codeword length.
+ */
+static void
+compute_length_counts(u32 A[restrict], unsigned root_idx,
+                     unsigned len_counts[restrict], unsigned max_codeword_len)
+{
+       unsigned len;
+       int node;
+
+       /*
+        * The key observations are:
+        *
+        * (1) We can traverse the non-leaf nodes of the tree, always visiting a
+        *     parent before its children, by simply iterating through the array
+        *     in reverse order.  Consequently, we can compute the depth of each
+        *     node in one pass, overwriting the parent indices with depths.
+        *
+        * (2) We can initially assume that in the real Huffman tree, both
+        *     children of the root are leaves.  This corresponds to two
+        *     codewords of length 1.  Then, whenever we visit a (non-leaf) node
+        *     during the traversal, we modify this assumption to account for
+        *     the current node *not* being a leaf, but rather its two children
+        *     being leaves.  This causes the loss of one codeword for the
+        *     current depth and the addition of two codewords for the current
+        *     depth plus one.
+        *
+        * (3) We can handle the length-limited constraint fairly easily by
+        *     simply using the largest length available when a depth exceeds
+        *     max_codeword_len.
+        */
+
+       for (len = 0; len <= max_codeword_len; len++)
+               len_counts[len] = 0;
+       len_counts[1] = 2;
+
+       /* Set the root node's depth to 0. */
+       A[root_idx] &= SYMBOL_MASK;
+
+       for (node = root_idx - 1; node >= 0; node--) {
+
+               /* Calculate the depth of this node. */
+
+               unsigned parent = A[node] >> NUM_SYMBOL_BITS;
+               unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
+               unsigned depth = parent_depth + 1;
+               unsigned len = depth;
+
+               /*
+                * Set the depth of this node so that it is available when its
+                * children (if any) are processed.
+                */
+               A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
+
+               /*
+                * If needed, decrease the length to meet the length-limited
+                * constraint.  This is not the optimal method for generating
+                * length-limited Huffman codes!  But it should be good enough.
+                */
+               if (len >= max_codeword_len) {
+                       len = max_codeword_len;
+                       do {
+                               len--;
+                       } while (len_counts[len] == 0);
+               }
+
+               /*
+                * Account for the fact that we have a non-leaf node at the
+                * current depth.
+                */
+               len_counts[len]--;
+               len_counts[len + 1] += 2;
+       }
+}
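+
+/*
+ * For illustration (not part of the original source): continuing the 4-symbol
+ * example from build_tree() (frequencies 1, 2, 3, 4), the root gets depth 0,
+ * the next node depth 1 and the last node depth 2, so len_counts ends up
+ * describing one codeword of length 1, one of length 2 and two of length 3.
+ */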
+
+/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */
+static u32
+reverse_codeword(u32 codeword, u8 len)
+{
+       /*
+        * The following branchless algorithm is faster than going bit by bit.
+        * Note: since no codewords are longer than 16 bits, we only need to
+        * reverse the low 16 bits of the 'u32'.
+        */
+       STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
+
+       /* Flip adjacent 1-bit fields. */
+       codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
+
+       /* Flip adjacent 2-bit fields. */
+       codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
+
+       /* Flip adjacent 4-bit fields. */
+       codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
+
+       /* Flip adjacent 8-bit fields. */
+       codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
+
+       /* Return the high 'len' bits of the bit-reversed 16 bit value. */
+       return codeword >> (16 - len);
+}
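+
+/*
+ * For illustration (not part of the original source):
+ * reverse_codeword(0b0011, 4) == 0b1100 and reverse_codeword(0b110, 3) == 0b011.
+ */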
+
+/*
+ * Generate the codewords for a canonical Huffman code.
+ *
+ * @A
+ *     The output array for codewords.  In addition, initially this
+ *     array must contain the symbols, sorted primarily by frequency and
+ *     secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
+ *     each entry.
+ *
+ * @len
+ *     Output array for codeword lengths.
+ *
+ * @len_counts
+ *     An array that provides the number of codewords that will have
+ *     each possible length <= max_codeword_len.
+ *
+ * @max_codeword_len
+ *     Maximum length, in bits, of each codeword.
+ *
+ * @num_syms
+ *     Number of symbols in the alphabet, including symbols with zero
+ *     frequency.  This is the length of the 'A' and 'len' arrays.
+ */
+static void
+gen_codewords(u32 A[restrict], u8 lens[restrict],
+             const unsigned len_counts[restrict],
+             unsigned max_codeword_len, unsigned num_syms)
+{
+       u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
+       unsigned i;
+       unsigned len;
+       unsigned sym;
+
+       /*
+        * Given the number of codewords that will have each length, assign
+        * codeword lengths to symbols.  We do this by assigning the lengths in
+        * decreasing order to the symbols sorted primarily by increasing
+        * frequency and secondarily by increasing symbol value.
+        */
+       for (i = 0, len = max_codeword_len; len >= 1; len--) {
+               unsigned count = len_counts[len];
+
+               while (count--)
+                       lens[A[i++] & SYMBOL_MASK] = len;
+       }
+
+       /*
+        * Generate the codewords themselves.  We initialize the
+        * 'next_codewords' array to provide the lexicographically first
+        * codeword of each length, then assign codewords in symbol order.  This
+        * produces a canonical code.
+        */
+       next_codewords[0] = 0;
+       next_codewords[1] = 0;
+       for (len = 2; len <= max_codeword_len; len++)
+               next_codewords[len] =
+                       (next_codewords[len - 1] + len_counts[len - 1]) << 1;
+
+       for (sym = 0; sym < num_syms; sym++) {
+               u8 len = lens[sym];
+               u32 codeword = next_codewords[len]++;
+
+               /* DEFLATE requires bit-reversed codewords. */
+               A[sym] = reverse_codeword(codeword, len);
+       }
+}
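+
+/*
+ * For illustration (not part of the original source): with one codeword of
+ * length 1, one of length 2 and two of length 3, 'next_codewords' becomes
+ * [1] = 0, [2] = 0b10, [3] = 0b110, so the canonical codewords assigned are
+ * 0, 10, 110 and 111 (each then bit-reversed for DEFLATE's bit order).
+ */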
+
+/*
+ * ---------------------------------------------------------------------
+ *                     deflate_make_huffman_code()
+ * ---------------------------------------------------------------------
+ *
+ * Given an alphabet and the frequency of each symbol in it, construct a
+ * length-limited canonical Huffman code.
+ *
+ * @num_syms
+ *     The number of symbols in the alphabet.  The symbols are the integers in
+ *     the range [0, num_syms - 1].  This parameter must be at least 2 and
+ *     can't be greater than (1 << NUM_SYMBOL_BITS).
+ *
+ * @max_codeword_len
+ *     The maximum permissible codeword length.
+ *
+ * @freqs
+ *     An array of @num_syms entries, each of which specifies the frequency of
+ *     the corresponding symbol.  It is valid for some, none, or all of the
+ *     frequencies to be 0.
+ *
+ * @lens
+ *     An array of @num_syms entries in which this function will return the
+ *     length, in bits, of the codeword assigned to each symbol.  Symbols with
+ *     0 frequency will not have codewords per se, but their entries in this
+ *     array will be set to 0.  No lengths greater than @max_codeword_len will
+ *     be assigned.
+ *
+ * @codewords
+ *     An array of @num_syms entries in which this function will return the
+ *     codeword for each symbol, right-justified and padded on the left with
+ *     zeroes.  Codewords for symbols with 0 frequency will be undefined.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * This function builds a length-limited canonical Huffman code.
+ *
+ * A length-limited Huffman code contains no codewords longer than some
+ * specified length, and has exactly (with some algorithms) or approximately
+ * (with the algorithm used here) the minimum weighted path length from the
+ * root, given this constraint.
+ *
+ * A canonical Huffman code satisfies the properties that a longer codeword
+ * never lexicographically precedes a shorter codeword, and the lexicographic
+ * ordering of codewords of the same length is the same as the lexicographic
+ * ordering of the corresponding symbols.  A canonical Huffman code, or more
+ * generally a canonical prefix code, can be reconstructed from only a list
+ * containing the codeword length of each symbol.
+ *
+ * The classic algorithm to generate a Huffman code creates a node for each
+ * symbol, then inserts these nodes into a min-heap keyed by symbol frequency.
+ * Then, repeatedly, the two lowest-frequency nodes are removed from the
+ * min-heap and added as the children of a new node having frequency equal to
+ * the sum of its two children, which is then inserted into the min-heap.  When
+ * only a single node remains in the min-heap, it is the root of the Huffman
+ * tree.  The codeword for each symbol is determined by the path needed to reach
+ * the corresponding node from the root.  Descending to the left child appends a
+ * 0 bit, whereas descending to the right child appends a 1 bit.
+ *
+ * The classic algorithm is relatively easy to understand, but it is subject to
+ * a number of inefficiencies.  In practice, it is fastest to first sort the
+ * symbols by frequency.  (This itself can be subject to an optimization based
+ * on the fact that most frequencies tend to be low.)  At the same time, we sort
+ * secondarily by symbol value, which aids the process of generating a canonical
+ * code.  Then, during tree construction, no heap is necessary because both the
+ * leaf nodes and the unparented non-leaf nodes can be easily maintained in
+ * sorted order.  Consequently, there can never be more than two possibilities
+ * for the next-lowest-frequency node.
+ *
+ * In addition, because we're generating a canonical code, we actually don't
+ * need the leaf nodes of the tree at all, only the non-leaf nodes.  This is
+ * because for canonical code generation we don't need to know where the symbols
+ * are in the tree.  Rather, we only need to know how many leaf nodes have each
+ * depth (codeword length).  And this information can, in fact, be quickly
+ * generated from the tree of non-leaves only.
+ *
+ * Furthermore, we can build this stripped-down Huffman tree directly in the
+ * array in which the codewords are to be generated, provided that these array
+ * slots are large enough to hold a symbol and frequency value.
+ *
+ * Still furthermore, we don't even need to maintain explicit child pointers.
+ * We only need the parent pointers, and even those can be overwritten in-place
+ * with depth information as part of the process of extracting codeword lengths
+ * from the tree.  So in summary, we do NOT need a big structure like:
+ *
+ *     struct huffman_tree_node {
+ *             unsigned int symbol;
+ *             unsigned int frequency;
+ *             unsigned int depth;
+ *             struct huffman_tree_node *left_child;
+ *             struct huffman_tree_node *right_child;
+ *     };
+ *
+ * ... which often gets used in "naive" implementations of Huffman code
+ * generation.
+ *
+ * Many of these optimizations are based on the implementation in 7-Zip (source
+ * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov.
+ */
+static void
+deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
+                         const u32 freqs[restrict],
+                         u8 lens[restrict], u32 codewords[restrict])
+{
+       u32 *A = codewords;
+       unsigned num_used_syms;
+
+       STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
+
+       /*
+        * We begin by sorting the symbols primarily by frequency and
+        * secondarily by symbol value.  As an optimization, the array used for
+        * this purpose ('A') shares storage with the space in which we will
+        * eventually return the codewords.
+        */
+       num_used_syms = sort_symbols(num_syms, freqs, lens, A);
+
+       /*
+        * 'num_used_syms' is the number of symbols with nonzero frequency.
+        * This may be less than @num_syms.  'num_used_syms' is also the number
+        * of entries in 'A' that are valid.  Each entry consists of a distinct
+        * symbol and a nonzero frequency packed into a 32-bit integer.
+        */
+
+       /*
+        * Handle special cases where only 0 or 1 symbols were used (had nonzero
+        * frequency).
+        */
+
+       if (unlikely(num_used_syms == 0)) {
+               /*
+                * Code is empty.  sort_symbols() already set all lengths to 0,
+                * so there is nothing more to do.
+                */
+               return;
+       }
+
+       if (unlikely(num_used_syms == 1)) {
+               /*
+                * Only one symbol was used, so we only need one codeword.  But
+                * two codewords are needed to form the smallest complete
+                * Huffman code, which uses codewords 0 and 1.  Therefore, we
+                * choose another symbol to which to assign a codeword.  We use
+                * 0 (if the used symbol is not 0) or 1 (if the used symbol is
+                * 0).  In either case, the lesser-valued symbol must be
+                * assigned codeword 0 so that the resulting code is canonical.
+                */
+
+               unsigned sym = A[0] & SYMBOL_MASK;
+               unsigned nonzero_idx = sym ? sym : 1;
+
+               codewords[0] = 0;
+               lens[0] = 1;
+               codewords[nonzero_idx] = 1;
+               lens[nonzero_idx] = 1;
+               return;
+       }
+
+       /*
+        * Build a stripped-down version of the Huffman tree, sharing the array
+        * 'A' with the symbol values.  Then extract length counts from the tree
+        * and use them to generate the final codewords.
+        */
+
+       build_tree(A, num_used_syms);
+
+       {
+               unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
+
+               compute_length_counts(A, num_used_syms - 2,
+                                     len_counts, max_codeword_len);
+
+               gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
+       }
+}
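+
+/*
+ * Illustrative sketch (not part of the original source): building a code for a
+ * tiny 4-symbol alphabet with the function above.  The function and variable
+ * names inside the disabled block are hypothetical.
+ */
+#if 0
+static void
+huffman_code_example(void)
+{
+       static const u32 freqs[4] = { 1, 2, 3, 4 };
+       u8 lens[4];
+       u32 codewords[4];
+
+       /* Expected result: lens == { 3, 3, 2, 1 } (see the worked example
+        * above), with canonical, bit-reversed codewords in 'codewords'. */
+       deflate_make_huffman_code(4, DEFLATE_MAX_CODEWORD_LEN,
+                                 freqs, lens, codewords);
+}
+#endif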
+
+/*
+ * Clear the Huffman symbol frequency counters.  This must be called when
+ * starting a new DEFLATE block.
+ */
+static void
+deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
+{
+       memset(&c->freqs, 0, sizeof(c->freqs));
+}
+
+/*
+ * Build the literal/length and offset Huffman codes for a DEFLATE block.
+ *
+ * This takes as input the frequency tables for each alphabet and produces as
+ * output a set of tables that map symbols to codewords and codeword lengths.
+ */
+static void
+deflate_make_huffman_codes(const struct deflate_freqs *freqs,
+                          struct deflate_codes *codes)
+{
+       deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
+                                 MAX_LITLEN_CODEWORD_LEN,
+                                 freqs->litlen,
+                                 codes->lens.litlen,
+                                 codes->codewords.litlen);
+
+       deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
+                                 MAX_OFFSET_CODEWORD_LEN,
+                                 freqs->offset,
+                                 codes->lens.offset,
+                                 codes->codewords.offset);
+}
+
+/* Initialize c->static_codes. */
+static void
+deflate_init_static_codes(struct libdeflate_compressor *c)
+{
+       unsigned i;
+
+       for (i = 0; i < 144; i++)
+               c->freqs.litlen[i] = 1 << (9 - 8);
+       for (; i < 256; i++)
+               c->freqs.litlen[i] = 1 << (9 - 9);
+       for (; i < 280; i++)
+               c->freqs.litlen[i] = 1 << (9 - 7);
+       for (; i < 288; i++)
+               c->freqs.litlen[i] = 1 << (9 - 8);
+
+       for (i = 0; i < 32; i++)
+               c->freqs.offset[i] = 1 << (5 - 5);
+
+       deflate_make_huffman_codes(&c->freqs, &c->static_codes);
+}
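+
+/*
+ * For illustration (not part of the original source): the frequencies above
+ * are chosen as 1 << (9 - len) so that the Huffman construction reproduces the
+ * fixed code lengths from the DEFLATE specification: 8 bits for litlen symbols
+ * 0..143, 9 bits for 144..255, 7 bits for 256..279, 8 bits for 280..287, and
+ * 5 bits for every offset symbol.
+ */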
+
+/* Return the offset slot for the given match offset, using the small map. */
+static forceinline unsigned
+deflate_get_offset_slot(unsigned offset)
+{
+#if 1
+       if (offset <= 256)
+               return deflate_offset_slot[offset];
+       else
+               return deflate_offset_slot[256 + ((offset - 1) >> 7)];
+#else /* Branchless version */
+       u32 i1 = offset;
+       u32 i2 = 256 + ((offset - 1) >> 7);
+       u32 is_small = (s32)(offset - 257) >> 31;
+
+       return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)];
+#endif
+}
+
+/* Write the header fields common to all DEFLATE block types. */
+static void
+deflate_write_block_header(struct deflate_output_bitstream *os,
+                          bool is_final_block, unsigned block_type)
+{
+       deflate_add_bits(os, is_final_block, 1);
+       deflate_add_bits(os, block_type, 2);
+       deflate_flush_bits(os);
+}
+
+static unsigned
+deflate_compute_precode_items(const u8 lens[restrict],
+                             const unsigned num_lens,
+                             u32 precode_freqs[restrict],
+                             unsigned precode_items[restrict])
+{
+       unsigned *itemptr;
+       unsigned run_start;
+       unsigned run_end;
+       unsigned extra_bits;
+       u8 len;
+
+       memset(precode_freqs, 0,
+              DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));
+
+       itemptr = precode_items;
+       run_start = 0;
+       do {
+               /* Find the next run of codeword lengths. */
+
+               /* len = the length being repeated */
+               len = lens[run_start];
+
+               /* Extend the run. */
+               run_end = run_start;
+               do {
+                       run_end++;
+               } while (run_end != num_lens && len == lens[run_end]);
+
+               if (len == 0) {
+                       /* Run of zeroes. */
+
+                       /* Symbol 18: RLE 11 to 138 zeroes at a time. */
+                       while ((run_end - run_start) >= 11) {
+                               extra_bits = MIN((run_end - run_start) - 11,
+                                                0x7F);
+                               precode_freqs[18]++;
+                               *itemptr++ = 18 | (extra_bits << 5);
+                               run_start += 11 + extra_bits;
+                       }
+
+                       /* Symbol 17: RLE 3 to 10 zeroes at a time. */
+                       if ((run_end - run_start) >= 3) {
+                               extra_bits = MIN((run_end - run_start) - 3,
+                                                0x7);
+                               precode_freqs[17]++;
+                               *itemptr++ = 17 | (extra_bits << 5);
+                               run_start += 3 + extra_bits;
+                       }
+               } else {
+
+                       /* A run of nonzero lengths. */
+
+                       /* Symbol 16: RLE 3 to 6 of the previous length. */
+                       if ((run_end - run_start) >= 4) {
+                               precode_freqs[len]++;
+                               *itemptr++ = len;
+                               run_start++;
+                               do {
+                                       extra_bits = MIN((run_end - run_start) -
+                                                        3, 0x3);
+                                       precode_freqs[16]++;
+                                       *itemptr++ = 16 | (extra_bits << 5);
+                                       run_start += 3 + extra_bits;
+                               } while ((run_end - run_start) >= 3);
+                       }
+               }
+
+               /* Output any remaining lengths without RLE. */
+               while (run_start != run_end) {
+                       precode_freqs[len]++;
+                       *itemptr++ = len;
+                       run_start++;
+               }
+       } while (run_start != num_lens);
+
+       return itemptr - precode_items;
+}
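+
+/*
+ * For illustration (not part of the original source): a run of 20 zero lengths
+ * becomes the single item 18 | (9 << 5) ("11 + 9 zeroes"), and a run of five
+ * codeword lengths equal to 5 becomes the two items 5 and 16 | (1 << 5) ("a 5,
+ * then repeat the previous length 3 + 1 times").
+ */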
+
+/*
+ * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
+ * separate Huffman code, the "precode", which contains a symbol for each
+ * possible codeword length in the larger code as well as several special
+ * symbols to represent repeated codeword lengths (a form of run-length
+ * encoding).  The precode is itself constructed in canonical form, and its
+ * codeword lengths are represented literally in 19 3-bit fields that
+ * immediately precede the compressed codeword lengths of the larger code.
+ */
+
+/* Precompute the information needed to output Huffman codes. */
+static void
+deflate_precompute_huffman_header(struct libdeflate_compressor *c)
+{
+       /* Compute how many litlen and offset symbols are needed. */
+
+       for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
+            c->num_litlen_syms > 257;
+            c->num_litlen_syms--)
+               if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
+                       break;
+
+       for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
+            c->num_offset_syms > 1;
+            c->num_offset_syms--)
+               if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
+                       break;
+
+       /*
+        * If we're not using the full set of literal/length codeword lengths,
+        * then temporarily move the offset codeword lengths over so that the
+        * literal/length and offset codeword lengths are contiguous.
+        */
+       STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
+                     DEFLATE_NUM_LITLEN_SYMS);
+       if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
+               memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
+                       (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
+                       c->num_offset_syms);
+       }
+
+       /*
+        * Compute the "items" (RLE / literal tokens and extra bits) with which
+        * the codeword lengths in the larger code will be output.
+        */
+       c->num_precode_items =
+               deflate_compute_precode_items((u8 *)&c->codes.lens,
+                                             c->num_litlen_syms +
+                                                       c->num_offset_syms,
+                                             c->precode_freqs,
+                                             c->precode_items);
+
+       /* Build the precode. */
+       deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
+                                 MAX_PRE_CODEWORD_LEN,
+                                 c->precode_freqs, c->precode_lens,
+                                 c->precode_codewords);
+
+       /* Count how many precode lengths we actually need to output. */
+       for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
+            c->num_explicit_lens > 4;
+            c->num_explicit_lens--)
+               if (c->precode_lens[deflate_precode_lens_permutation[
+                                               c->num_explicit_lens - 1]] != 0)
+                       break;
+
+       /* Restore the offset codeword lengths if needed. */
+       if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
+               memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
+                       (u8 *)&c->codes.lens + c->num_litlen_syms,
+                       c->num_offset_syms);
+       }
+}
+
+/* Output the Huffman codes. */
+static void
+deflate_write_huffman_header(struct libdeflate_compressor *c,
+                            struct deflate_output_bitstream *os)
+{
+       unsigned i;
+
+       deflate_add_bits(os, c->num_litlen_syms - 257, 5);
+       deflate_add_bits(os, c->num_offset_syms - 1, 5);
+       deflate_add_bits(os, c->num_explicit_lens - 4, 4);
+       deflate_flush_bits(os);
+
+       /* Output the lengths of the codewords in the precode. */
+       for (i = 0; i < c->num_explicit_lens; i++) {
+               deflate_write_bits(os, c->precode_lens[
+                                      deflate_precode_lens_permutation[i]], 3);
+       }
+
+       /* Output the encoded lengths of the codewords in the larger code. */
+       for (i = 0; i < c->num_precode_items; i++) {
+               unsigned precode_item = c->precode_items[i];
+               unsigned precode_sym = precode_item & 0x1F;
+
+               deflate_add_bits(os, c->precode_codewords[precode_sym],
+                                c->precode_lens[precode_sym]);
+               if (precode_sym >= 16) {
+                       if (precode_sym == 16)
+                               deflate_add_bits(os, precode_item >> 5, 2);
+                       else if (precode_sym == 17)
+                               deflate_add_bits(os, precode_item >> 5, 3);
+                       else
+                               deflate_add_bits(os, precode_item >> 5, 7);
+               }
+               STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7));
+               deflate_flush_bits(os);
+       }
+}
+
+static forceinline void
+deflate_write_literal_run(struct deflate_output_bitstream *os,
+                         const u8 *in_next, u32 litrunlen,
+                         const struct deflate_codes *codes)
+{
+#if 1
+       while (litrunlen >= 4) {
+               unsigned lit0 = in_next[0];
+               unsigned lit1 = in_next[1];
+               unsigned lit2 = in_next[2];
+               unsigned lit3 = in_next[3];
+
+               deflate_add_bits(os, codes->codewords.litlen[lit0],
+                                codes->lens.litlen[lit0]);
+               if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
+                       deflate_flush_bits(os);
+
+               deflate_add_bits(os, codes->codewords.litlen[lit1],
+                                codes->lens.litlen[lit1]);
+               if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
+                       deflate_flush_bits(os);
+
+               deflate_add_bits(os, codes->codewords.litlen[lit2],
+                                codes->lens.litlen[lit2]);
+               if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
+                       deflate_flush_bits(os);
+
+               deflate_add_bits(os, codes->codewords.litlen[lit3],
+                                codes->lens.litlen[lit3]);
+               deflate_flush_bits(os);
+               in_next += 4;
+               litrunlen -= 4;
+       }
+       if (litrunlen-- != 0) {
+               deflate_add_bits(os, codes->codewords.litlen[*in_next],
+                                codes->lens.litlen[*in_next]);
+               if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
+                       deflate_flush_bits(os);
+               in_next++;
+               if (litrunlen-- != 0) {
+                       deflate_add_bits(os, codes->codewords.litlen[*in_next],
+                                        codes->lens.litlen[*in_next]);
+                       if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
+                               deflate_flush_bits(os);
+                       in_next++;
+                       if (litrunlen-- != 0) {
+                               deflate_add_bits(os,
+                                       codes->codewords.litlen[*in_next],
+                                       codes->lens.litlen[*in_next]);
+                               if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
+                                       deflate_flush_bits(os);
+                               in_next++;
+                       }
+               }
+               if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
+                       deflate_flush_bits(os);
+       }
+#else
+       do {
+               unsigned lit = *in_next++;
+
+               deflate_write_bits(os, codes->codewords.litlen[lit],
+                                  codes->lens.litlen[lit]);
+       } while (--litrunlen);
+#endif
+}
+
+static forceinline void
+deflate_write_match(struct deflate_output_bitstream * restrict os,
+                   unsigned length, unsigned length_slot,
+                   unsigned offset, unsigned offset_symbol,
+                   const struct deflate_codes * restrict codes)
+{
+       unsigned litlen_symbol = DEFLATE_FIRST_LEN_SYM + length_slot;
+
+       /* Litlen symbol */
+       deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
+                        codes->lens.litlen[litlen_symbol]);
+
+       /* Extra length bits */
+       STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
+                                DEFLATE_MAX_EXTRA_LENGTH_BITS));
+       deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
+                        deflate_extra_length_bits[length_slot]);
+
+       if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
+                       DEFLATE_MAX_EXTRA_LENGTH_BITS +
+                       MAX_OFFSET_CODEWORD_LEN +
+                       DEFLATE_MAX_EXTRA_OFFSET_BITS))
+               deflate_flush_bits(os);
+
+       /* Offset symbol */
+       deflate_add_bits(os, codes->codewords.offset[offset_symbol],
+                        codes->lens.offset[offset_symbol]);
+
+       if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
+                       DEFLATE_MAX_EXTRA_OFFSET_BITS))
+               deflate_flush_bits(os);
+
+       /* Extra offset bits */
+       deflate_add_bits(os, offset - deflate_offset_slot_base[offset_symbol],
+                        deflate_extra_offset_bits[offset_symbol]);
+
+       deflate_flush_bits(os);
+}
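+
+/*
+ * Worked example of the encoding above (slot values per the standard RFC 1951
+ * tables, which deflate_length_slot and deflate_offset_slot_base are assumed
+ * to follow): a match of length 10 at offset 100 is written as litlen symbol
+ * DEFLATE_FIRST_LEN_SYM + 7 (264 with the standard base of 257) with no extra
+ * length bits, then offset symbol 13 with five extra bits holding
+ * 100 - 97 == 3.
+ */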
+
+static void
+deflate_write_sequences(struct deflate_output_bitstream * restrict os,
+                       const struct deflate_codes * restrict codes,
+                       const struct deflate_sequence sequences[restrict],
+                       const u8 * restrict in_next)
+{
+       const struct deflate_sequence *seq = sequences;
+
+       for (;;) {
+               u32 litrunlen = seq->litrunlen_and_length & SEQ_LITRUNLEN_MASK;
+               unsigned length = seq->litrunlen_and_length >> SEQ_LENGTH_SHIFT;
+
+               if (litrunlen) {
+                       deflate_write_literal_run(os, in_next, litrunlen,
+                                                 codes);
+                       in_next += litrunlen;
+               }
+
+               if (length == 0)
+                       return;
+
+               deflate_write_match(os, length, seq->length_slot,
+                                   seq->offset, seq->offset_symbol, codes);
+
+               in_next += length;
+               seq++;
+       }
+}
+
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+/*
+ * Follow the minimum-cost path in the graph of possible match/literal choices
+ * for the current block and write out the matches/literals using the specified
+ * Huffman codes.
+ */
+static void
+deflate_write_item_list(struct deflate_output_bitstream *os,
+                       const struct deflate_codes *codes,
+                       struct libdeflate_compressor *c,
+                       u32 block_length)
+{
+       struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
+       struct deflate_optimum_node * const end_node =
+               &c->p.n.optimum_nodes[block_length];
+       do {
+               unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
+               unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
+
+               if (length == 1) {
+                       /* Literal */
+                       deflate_write_bits(os, codes->codewords.litlen[offset],
+                                          codes->lens.litlen[offset]);
+               } else {
+                       /* Match */
+                       deflate_write_match(os, length,
+                                           deflate_length_slot[length],
+                                           offset,
+                                           c->p.n.offset_slot_full[offset],
+                                           codes);
+               }
+               cur_node += length;
+       } while (cur_node != end_node);
+}
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+/* Output the end-of-block symbol. */
+static void
+deflate_write_end_of_block(struct deflate_output_bitstream *os,
+                          const struct deflate_codes *codes)
+{
+       deflate_write_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
+                          codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
+}
+
+static void
+deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
+                                const u8 *data, u16 len,
+                                bool is_final_block)
+{
+       deflate_write_block_header(os, is_final_block,
+                                  DEFLATE_BLOCKTYPE_UNCOMPRESSED);
+       deflate_align_bitstream(os);
+
+       if (4 + (u32)len >= os->end - os->next) {
+               os->next = os->end;
+               return;
+       }
+
+       put_unaligned_le16(len, os->next);
+       os->next += 2;
+       put_unaligned_le16(~len, os->next);
+       os->next += 2;
+       memcpy(os->next, data, len);
+       os->next += len;
+}
+
+static void
+deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
+                                 const u8 *data, size_t data_length,
+                                 bool is_final_block)
+{
+       do {
+               u16 len = MIN(data_length, UINT16_MAX);
+
+               deflate_write_uncompressed_block(os, data, len,
+                                       is_final_block && len == data_length);
+               data += len;
+               data_length -= len;
+       } while (data_length != 0);
+}
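+
+/*
+ * For example, flushing 100000 bytes this way produces two stored blocks of
+ * 65535 and 34465 bytes; only the second one can carry the final-block flag,
+ * since is_final_block is forwarded only when len == data_length.
+ */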
+
+/*
+ * Choose the best type of block to use (dynamic Huffman, static Huffman, or
+ * uncompressed), then output it.
+ */
+static void
+deflate_flush_block(struct libdeflate_compressor * restrict c,
+                   struct deflate_output_bitstream * restrict os,
+                   const u8 * restrict block_begin, u32 block_length,
+                   const struct deflate_sequence *sequences,
+                   bool is_final_block)
+{
+       static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
+               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
+       };
+
+       /* Costs are measured in bits */
+       u32 dynamic_cost = 0;
+       u32 static_cost = 0;
+       u32 uncompressed_cost = 0;
+       struct deflate_codes *codes;
+       int block_type;
+       unsigned sym;
+
+       if (sequences != NULL /* !near_optimal */ ||
+           !SUPPORT_NEAR_OPTIMAL_PARSING) {
+               /* Tally the end-of-block symbol. */
+               c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
+
+               /* Build dynamic Huffman codes. */
+               deflate_make_huffman_codes(&c->freqs, &c->codes);
+       } /* Else, this was already done. */
+
+       /* Account for the cost of sending dynamic Huffman codes. */
+       deflate_precompute_huffman_header(c);
+       dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
+       for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
+               u32 extra = deflate_extra_precode_bits[sym];
+
+               dynamic_cost += c->precode_freqs[sym] *
+                               (extra + c->precode_lens[sym]);
+       }
+
+       /* Account for the cost of encoding literals. */
+       for (sym = 0; sym < 256; sym++) {
+               dynamic_cost += c->freqs.litlen[sym] *
+                               c->codes.lens.litlen[sym];
+       }
+       for (sym = 0; sym < 144; sym++)
+               static_cost += c->freqs.litlen[sym] * 8;
+       for (; sym < 256; sym++)
+               static_cost += c->freqs.litlen[sym] * 9;
+
+       /* Account for the cost of encoding the end-of-block symbol. */
+       dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK];
+       static_cost += 7;
+
+       /* Account for the cost of encoding lengths. */
+       for (sym = DEFLATE_FIRST_LEN_SYM;
+            sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits);
+            sym++) {
+               u32 extra = deflate_extra_length_bits[
+                                       sym - DEFLATE_FIRST_LEN_SYM];
+
+               dynamic_cost += c->freqs.litlen[sym] *
+                               (extra + c->codes.lens.litlen[sym]);
+               static_cost += c->freqs.litlen[sym] *
+                               (extra + c->static_codes.lens.litlen[sym]);
+       }
+
+       /* Account for the cost of encoding offsets. */
+       for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
+               u32 extra = deflate_extra_offset_bits[sym];
+
+               dynamic_cost += c->freqs.offset[sym] *
+                               (extra + c->codes.lens.offset[sym]);
+               static_cost += c->freqs.offset[sym] * (extra + 5);
+       }
+
+       /* Compute the cost of using uncompressed blocks. */
+       uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
+                            (40 * (DIV_ROUND_UP(block_length,
+                                                UINT16_MAX) - 1)) +
+                            (8 * block_length);
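+       /*
+        * Illustrative numbers for the expression above: with
+        * block_length == 100000 and os->bitcount == 5, this adds
+        * 0 + 32 + 40*1 + 800000 == 800072 bits, i.e. the stored-block
+        * header/alignment overhead plus 8 bits per input byte.
+        */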
+
+       /* Choose the cheapest block type. */
+       if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
+               block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
+               codes = &c->codes;
+       } else if (static_cost < uncompressed_cost) {
+               block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
+               codes = &c->static_codes;
+       } else {
+               block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
+       }
+
+       /* Now actually output the block. */
+
+       if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
+               /*
+                * Note: the length being flushed may exceed the maximum length
+                * of an uncompressed block (65535 bytes).  Therefore, more than
+                * one uncompressed block might be needed.
+                */
+               deflate_write_uncompressed_blocks(os, block_begin, block_length,
+                                                 is_final_block);
+       } else {
+               /* Output the block header. */
+               deflate_write_block_header(os, is_final_block, block_type);
+
+               /* Output the Huffman codes (dynamic Huffman blocks only). */
+               if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
+                       deflate_write_huffman_header(c, os);
+
+               /* Output the literals, matches, and end-of-block symbol. */
+       #if SUPPORT_NEAR_OPTIMAL_PARSING
+               if (sequences == NULL)
+                       deflate_write_item_list(os, codes, c, block_length);
+               else
+       #endif
+                       deflate_write_sequences(os, codes, sequences,
+                                               block_begin);
+               deflate_write_end_of_block(os, codes);
+       }
+}
+
+/******************************************************************************/
+
+/*
+ * Block splitting algorithm.  The problem is to decide when it is worthwhile to
+ * start a new block with new Huffman codes.  There is a theoretically optimal
+ * solution: recursively consider every possible block split, considering the
+ * exact cost of each block, and choose the minimum cost approach.  But this is
+ * far too slow.  Instead, as an approximation, we can count symbols and after
+ * every N symbols, compare the expected distribution of symbols based on the
+ * previous data with the actual distribution.  If they differ "by enough", then
+ * start a new block.
+ *
+ * As an optimization and heuristic, we don't distinguish between every symbol
+ * but rather we combine many symbols into a single "observation type".  For
+ * literals we only look at the high bits and low bits, and for matches we only
+ * look at whether the match is long or not.  The assumption is that for typical
+ * "real" data, places that are good block boundaries will tend to be noticeable
+ * based only on changes in these aggregate probabilities, without looking for
+ * subtle differences in individual symbols.  For example, a change from ASCII
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
+ * to many matches (generally more compressible), would be easily noticed based
+ * on the aggregates.
+ *
+ * For determining whether the probability distributions are "different enough"
+ * to start a new block, the simple heuristic of splitting when the sum of
+ * absolute differences exceeds a constant seems to be good enough.  We also add
+ * a number proportional to the block length so that the algorithm is more
+ * likely to end long blocks than short blocks.  This reflects the general
+ * expectation that it will become increasingly beneficial to start a new block
+ * as the current block grows longer.
+ *
+ * Finally, for an approximation, it is not strictly necessary that the exact
+ * symbols being used are considered.  With "near-optimal parsing", for example,
+ * the actual symbols that will be used are unknown until after the block
+ * boundary is chosen and the block has been optimized.  Since the final choices
+ * cannot be used, we can use preliminary "greedy" choices instead.
+ */
+
+/* Initialize the block split statistics when starting a new block. */
+static void
+init_block_split_stats(struct block_split_stats *stats)
+{
+       int i;
+
+       for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+               stats->new_observations[i] = 0;
+               stats->observations[i] = 0;
+       }
+       stats->num_new_observations = 0;
+       stats->num_observations = 0;
+}
+
+/*
+ * Literal observation.  Heuristic: use the top 2 bits and the low bit of the
+ * literal, for 8 possible literal observation types.
+ */
+static forceinline void
+observe_literal(struct block_split_stats *stats, u8 lit)
+{
+       stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
+       stats->num_new_observations++;
+}
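+
+/*
+ * For example, the literal 'A' (0x41) has top two bits 01 and low bit 1, so it
+ * maps to observation type (01 << 1) | 1 == 3; every odd byte value in the
+ * range 0x40-0x7f lands in that same bucket.
+ */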
+
+/*
+ * Match observation.  Heuristic: use one observation type for "short match" and
+ * one observation type for "long match".
+ */
+static forceinline void
+observe_match(struct block_split_stats *stats, unsigned length)
+{
+       stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES +
+                               (length >= 9)]++;
+       stats->num_new_observations++;
+}
+
+static void
+merge_new_observations(struct block_split_stats *stats)
+{
+       int i;
+
+       for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+               stats->observations[i] += stats->new_observations[i];
+               stats->new_observations[i] = 0;
+       }
+       stats->num_observations += stats->num_new_observations;
+       stats->num_new_observations = 0;
+}
+
+static bool
+do_end_block_check(struct block_split_stats *stats, u32 block_length)
+{
+       if (stats->num_observations > 0) {
+               /*
+                * Compute the sum of absolute differences of probabilities.  To
+                * avoid needing to use floating point arithmetic or do slow
+                * divisions, we do all arithmetic with the probabilities
+                * multiplied by num_observations * num_new_observations.  E.g.,
+                * for the "old" observations the probabilities would be
+                * (double)observations[i] / num_observations, but since we
+                * multiply by both num_observations and num_new_observations we
+                * really do observations[i] * num_new_observations.
+                */
+               u32 total_delta = 0;
+               u32 num_items;
+               u32 cutoff;
+               int i;
+
+               for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+                       u32 expected = stats->observations[i] *
+                                      stats->num_new_observations;
+                       u32 actual = stats->new_observations[i] *
+                                    stats->num_observations;
+                       u32 delta = (actual > expected) ? actual - expected :
+                                                         expected - actual;
+
+                       total_delta += delta;
+               }
+
+               num_items = stats->num_observations +
+                           stats->num_new_observations;
+               /*
+                * Heuristic: the cutoff is when the sum of absolute differences
+                * of probabilities becomes at least 200/512.  As above, the
+                * probability is multiplied by both num_new_observations and
+                * num_observations.  Be careful to avoid integer overflow.
+                */
+               cutoff = stats->num_new_observations * 200 / 512 *
+                        stats->num_observations;
+               /*
+                * Very short blocks have a lot of overhead for the Huffman
+                * codes, so only use them if it clearly seems worthwhile.
+                * (This is an additional penalty, on top of the smaller penalty
+                * below, which scales more slowly.)
+                */
+               if (block_length < 10000 && num_items < 8192)
+                       cutoff += (u64)cutoff * (8192 - num_items) / 8192;
+
+               /* Ready to end the block? */
+               if (total_delta +
+                   (block_length / 4096) * stats->num_observations >= cutoff)
+                       return true;
+       }
+       merge_new_observations(stats);
+       return false;
+}
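+
+/*
+ * A rough worked example of the check above: with num_observations == 512,
+ * num_new_observations == 512 and block_length == 40000, the cutoff is
+ * 512 * 200 / 512 * 512 == 102400 (i.e. a summed probability difference of
+ * 200/512), the length-based term is (40000 / 4096) * 512 == 4608, and the
+ * block is ended once total_delta reaches 102400 - 4608 == 97792.
+ */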
+
+static forceinline bool
+ready_to_check_block(const struct block_split_stats *stats,
+                    const u8 *in_block_begin, const u8 *in_next,
+                    const u8 *in_end)
+{
+       return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK
+               && in_next - in_block_begin >= MIN_BLOCK_LENGTH
+               && in_end - in_next >= MIN_BLOCK_LENGTH;
+}
+
+static forceinline bool
+should_end_block(struct block_split_stats *stats,
+                const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
+{
+       /* Ready to try to end the block (again)? */
+       if (!ready_to_check_block(stats, in_block_begin, in_next, in_end))
+               return false;
+
+       return do_end_block_check(stats, in_next - in_block_begin);
+}
+
+/******************************************************************************/
+
+static void
+deflate_begin_sequences(struct libdeflate_compressor *c,
+                       struct deflate_sequence *first_seq)
+{
+       deflate_reset_symbol_frequencies(c);
+       first_seq->litrunlen_and_length = 0;
+}
+
+static forceinline void
+deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
+                      bool gather_split_stats, struct deflate_sequence *seq)
+{
+       c->freqs.litlen[literal]++;
+
+       if (gather_split_stats)
+               observe_literal(&c->split_stats, literal);
+
+       STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK);
+       seq->litrunlen_and_length++;
+}
+
+static forceinline void
+deflate_choose_match(struct libdeflate_compressor *c,
+                    unsigned length, unsigned offset, bool gather_split_stats,
+                    struct deflate_sequence **seq_p)
+{
+       struct deflate_sequence *seq = *seq_p;
+       unsigned length_slot = deflate_length_slot[length];
+       unsigned offset_slot = deflate_get_offset_slot(offset);
+
+       c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++;
+       c->freqs.offset[offset_slot]++;
+       if (gather_split_stats)
+               observe_match(&c->split_stats, length);
+
+       seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT;
+       seq->offset = offset;
+       seq->length_slot = length_slot;
+       seq->offset_symbol = offset_slot;
+
+       seq++;
+       seq->litrunlen_and_length = 0;
+       *seq_p = seq;
+}
+
+/*
+ * Decrease the maximum and nice match lengths if we're approaching the end of
+ * the input buffer.
+ */
+static forceinline void
+adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining)
+{
+       if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
+               *max_len = remaining;
+               *nice_len = MIN(*nice_len, *max_len);
+       }
+}
+
+/*
+ * Choose the minimum match length for the greedy and lazy parsers.
+ *
+ * By default the minimum match length is 3, which is the smallest length the
+ * DEFLATE format allows.  However, with greedy and lazy parsing, some data
+ * (e.g. DNA sequencing data) benefits greatly from a longer minimum length.
+ * Typically, this is because literals are very cheap.  In general, the
+ * near-optimal parser handles this case naturally, but the greedy and lazy
+ * parsers need a heuristic to decide when to use short matches.
+ *
+ * The heuristic we use is to make the minimum match length depend on the number
+ * of different literals that exist in the data.  If there are many different
+ * literals, then literals will probably be expensive, so short matches will
+ * probably be worthwhile.  Conversely, if not many literals are used, then
+ * probably literals will be cheap and short matches won't be worthwhile.
+ */
+static unsigned
+choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth)
+{
+       /* map from num_used_literals to min_len */
+       static const u8 min_lens[] = {
+               9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6,
+               5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+               5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
+               4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+               4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+               /* The rest is implicitly 3. */
+       };
+       unsigned min_len;
+
+       STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3);
+       STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1);
+
+       if (num_used_literals >= ARRAY_LEN(min_lens))
+               return 3;
+       min_len = min_lens[num_used_literals];
+       /*
+        * With a low max_search_depth, it may be too hard to find long matches.
+        */
+       if (max_search_depth < 16) {
+               if (max_search_depth < 5)
+                       min_len = MIN(min_len, 4);
+               else if (max_search_depth < 10)
+                       min_len = MIN(min_len, 5);
+               else
+                       min_len = MIN(min_len, 7);
+       }
+       return min_len;
+}
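+
+/*
+ * For example, data that uses only 4 distinct byte values (such as the four
+ * DNA bases) gives min_lens[4] == 9, so with max_search_depth >= 16 the greedy
+ * and lazy parsers ignore matches shorter than 9 bytes; data that uses 80 or
+ * more distinct byte values falls off the end of the table and keeps the
+ * DEFLATE minimum of 3.
+ */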
+
+static unsigned
+calculate_min_match_len(const u8 *data, size_t data_len,
+                       unsigned max_search_depth)
+{
+       u8 used[256] = { 0 };
+       unsigned num_used_literals = 0;
+       size_t i;
+
+       /*
+        * For an initial approximation, scan the first 4 KiB of data.  The
+        * caller may use recalculate_min_match_len() to update min_len later.
+        */
+       data_len = MIN(data_len, 4096);
+       for (i = 0; i < data_len; i++)
+               used[data[i]] = 1;
+       for (i = 0; i < 256; i++)
+               num_used_literals += used[i];
+       return choose_min_match_len(num_used_literals, max_search_depth);
+}
+
+/*
+ * Recalculate the minimum match length for a block, now that we know the
+ * distribution of literals that are actually being used (freqs->litlen).
+ */
+static unsigned
+recalculate_min_match_len(const struct deflate_freqs *freqs,
+                         unsigned max_search_depth)
+{
+       u32 literal_freq = 0;
+       u32 cutoff;
+       unsigned num_used_literals = 0;
+       int i;
+
+       for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
+               literal_freq += freqs->litlen[i];
+
+       cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */
+
+       for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
+               if (freqs->litlen[i] > cutoff)
+                       num_used_literals++;
+       }
+       return choose_min_match_len(num_used_literals, max_search_depth);
+}
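+
+/*
+ * For example, if the block has tallied 100000 literals so far, the cutoff is
+ * 100000 >> 10 == 97, so byte values seen 97 times or fewer are not counted as
+ * "used" when re-choosing the minimum match length.
+ */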
+
+static forceinline const u8 *
+choose_max_block_end(const u8 *in_block_begin, const u8 *in_end,
+                    size_t soft_max_len)
+{
+       if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH)
+               return in_end;
+       return in_block_begin + soft_max_len;
+}
+
+/*
+ * This is the level 0 "compressor".  It always outputs uncompressed blocks.
+ */
+static size_t
+deflate_compress_none(struct libdeflate_compressor * restrict c,
+                     const u8 * restrict in, size_t in_nbytes,
+                     u8 * restrict out, size_t out_nbytes_avail)
+{
+       struct deflate_output_bitstream os;
+
+       deflate_init_output(&os, out, out_nbytes_avail);
+
+       deflate_write_uncompressed_blocks(&os, in, in_nbytes, true);
+
+       return deflate_flush_output(&os);
+}
+
+/*
+ * This is a faster variant of deflate_compress_greedy().  It uses the
+ * ht_matchfinder rather than the hc_matchfinder.  It also skips the block
+ * splitting algorithm and just uses fixed length blocks.  c->max_search_depth
+ * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h.
+ */
+static size_t
+deflate_compress_fastest(struct libdeflate_compressor * restrict c,
+                        const u8 * restrict in, size_t in_nbytes,
+                        u8 * restrict out, size_t out_nbytes_avail)
+{
+       const u8 *in_next = in;
+       const u8 *in_end = in_next + in_nbytes;
+       struct deflate_output_bitstream os;
+       const u8 *in_cur_base = in_next;
+       unsigned max_len = DEFLATE_MAX_MATCH_LEN;
+       unsigned nice_len = MIN(c->nice_match_length, max_len);
+       u32 next_hash = 0;
+
+       deflate_init_output(&os, out, out_nbytes_avail);
+       ht_matchfinder_init(&c->p.f.ht_mf);
+
+       do {
+               /* Starting a new DEFLATE block */
+
+               const u8 * const in_block_begin = in_next;
+               const u8 * const in_max_block_end = choose_max_block_end(
+                               in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH);
+               struct deflate_sequence *seq = c->p.f.sequences;
+
+               deflate_begin_sequences(c, seq);
+
+               do {
+                       u32 length;
+                       u32 offset;
+                       size_t remaining = in_end - in_next;
+
+                       if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
+                               max_len = remaining;
+                               if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) {
+                                       do {
+                                               deflate_choose_literal(c,
+                                                       *in_next++, false, seq);
+                                       } while (--max_len);
+                                       break;
+                               }
+                               nice_len = MIN(nice_len, max_len);
+                       }
+                       length = ht_matchfinder_longest_match(&c->p.f.ht_mf,
+                                                             &in_cur_base,
+                                                             in_next,
+                                                             max_len,
+                                                             nice_len,
+                                                             &next_hash,
+                                                             &offset);
+                       if (length) {
+                               /* Match found */
+                               deflate_choose_match(c, length, offset, false,
+                                                    &seq);
+                               ht_matchfinder_skip_bytes(&c->p.f.ht_mf,
+                                                         &in_cur_base,
+                                                         in_next + 1,
+                                                         in_end,
+                                                         length - 1,
+                                                         &next_hash);
+                               in_next += length;
+                       } else {
+                               /* No match found */
+                               deflate_choose_literal(c, *in_next++, false,
+                                                      seq);
+                       }
+
+                       /* Check if it's time to output another block. */
+               } while (in_next < in_max_block_end &&
+                        seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]);
+
+               deflate_flush_block(c, &os, in_block_begin,
+                                   in_next - in_block_begin,
+                                   c->p.f.sequences, in_next == in_end);
+       } while (in_next != in_end);
+
+       return deflate_flush_output(&os);
+}
+
+/*
+ * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
+ */
+static size_t
+deflate_compress_greedy(struct libdeflate_compressor * restrict c,
+                       const u8 * restrict in, size_t in_nbytes,
+                       u8 * restrict out, size_t out_nbytes_avail)
+{
+       const u8 *in_next = in;
+       const u8 *in_end = in_next + in_nbytes;
+       struct deflate_output_bitstream os;
+       const u8 *in_cur_base = in_next;
+       unsigned max_len = DEFLATE_MAX_MATCH_LEN;
+       unsigned nice_len = MIN(c->nice_match_length, max_len);
+       u32 next_hashes[2] = {0, 0};
+
+       deflate_init_output(&os, out, out_nbytes_avail);
+       hc_matchfinder_init(&c->p.g.hc_mf);
+
+       do {
+               /* Starting a new DEFLATE block */
+
+               const u8 * const in_block_begin = in_next;
+               const u8 * const in_max_block_end = choose_max_block_end(
+                               in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
+               struct deflate_sequence *seq = c->p.g.sequences;
+               unsigned min_len;
+
+               init_block_split_stats(&c->split_stats);
+               deflate_begin_sequences(c, seq);
+               min_len = calculate_min_match_len(in_next,
+                                                 in_max_block_end - in_next,
+                                                 c->max_search_depth);
+               do {
+                       u32 length;
+                       u32 offset;
+
+                       adjust_max_and_nice_len(&max_len, &nice_len,
+                                               in_end - in_next);
+                       length = hc_matchfinder_longest_match(
+                                               &c->p.g.hc_mf,
+                                               &in_cur_base,
+                                               in_next,
+                                               min_len - 1,
+                                               max_len,
+                                               nice_len,
+                                               c->max_search_depth,
+                                               next_hashes,
+                                               &offset);
+
+                       if (length >= min_len &&
+                           (length > DEFLATE_MIN_MATCH_LEN ||
+                            offset <= 4096)) {
+                               /* Match found */
+                               deflate_choose_match(c, length, offset, true,
+                                                    &seq);
+                               hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
+                                                         &in_cur_base,
+                                                         in_next + 1,
+                                                         in_end,
+                                                         length - 1,
+                                                         next_hashes);
+                               in_next += length;
+                       } else {
+                               /* No match found */
+                               deflate_choose_literal(c, *in_next++, true,
+                                                      seq);
+                       }
+
+                       /* Check if it's time to output another block. */
+               } while (in_next < in_max_block_end &&
+                        seq < &c->p.g.sequences[SEQ_STORE_LENGTH] &&
+                        !should_end_block(&c->split_stats,
+                                          in_block_begin, in_next, in_end));
+
+               deflate_flush_block(c, &os, in_block_begin,
+                                   in_next - in_block_begin,
+                                   c->p.g.sequences, in_next == in_end);
+       } while (in_next != in_end);
+
+       return deflate_flush_output(&os);
+}
+
+static forceinline size_t
+deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
+                             const u8 * restrict in, size_t in_nbytes,
+                             u8 * restrict out, size_t out_nbytes_avail,
+                             bool lazy2)
+{
+       const u8 *in_next = in;
+       const u8 *in_end = in_next + in_nbytes;
+       struct deflate_output_bitstream os;
+       const u8 *in_cur_base = in_next;
+       unsigned max_len = DEFLATE_MAX_MATCH_LEN;
+       unsigned nice_len = MIN(c->nice_match_length, max_len);
+       u32 next_hashes[2] = {0, 0};
+
+       deflate_init_output(&os, out, out_nbytes_avail);
+       hc_matchfinder_init(&c->p.g.hc_mf);
+
+       do {
+               /* Starting a new DEFLATE block */
+
+               const u8 * const in_block_begin = in_next;
+               const u8 * const in_max_block_end = choose_max_block_end(
+                               in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
+               const u8 *next_recalc_min_len =
+                       in_next + MIN(in_end - in_next, 10000);
+               struct deflate_sequence *seq = c->p.g.sequences;
+               unsigned min_len;
+
+               init_block_split_stats(&c->split_stats);
+               deflate_begin_sequences(c, seq);
+               min_len = calculate_min_match_len(in_next,
+                                                 in_max_block_end - in_next,
+                                                 c->max_search_depth);
+               do {
+                       unsigned cur_len;
+                       unsigned cur_offset;
+                       unsigned next_len;
+                       unsigned next_offset;
+
+                       /*
+                        * Recalculate the minimum match length if it hasn't
+                        * been done recently.
+                        */
+                       if (in_next >= next_recalc_min_len) {
+                               min_len = recalculate_min_match_len(
+                                               &c->freqs,
+                                               c->max_search_depth);
+                               next_recalc_min_len +=
+                                       MIN(in_end - next_recalc_min_len,
+                                           in_next - in_block_begin);
+                       }
+
+                       /* Find the longest match at the current position. */
+                       adjust_max_and_nice_len(&max_len, &nice_len,
+                                               in_end - in_next);
+                       cur_len = hc_matchfinder_longest_match(
+                                               &c->p.g.hc_mf,
+                                               &in_cur_base,
+                                               in_next,
+                                               min_len - 1,
+                                               max_len,
+                                               nice_len,
+                                               c->max_search_depth,
+                                               next_hashes,
+                                               &cur_offset);
+                       if (cur_len < min_len ||
+                           (cur_len == DEFLATE_MIN_MATCH_LEN &&
+                            cur_offset > 8192)) {
+                               /* No match found.  Choose a literal. */
+                               deflate_choose_literal(c, *in_next++, true,
+                                                      seq);
+                               continue;
+                       }
+                       in_next++;
+
+have_cur_match:
+                       /*
+                        * We have a match at the current position.
+                        * If it's very long, choose it immediately.
+                        */
+                       if (cur_len >= nice_len) {
+                               deflate_choose_match(c, cur_len, cur_offset,
+                                                    true, &seq);
+                               hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
+                                                         &in_cur_base,
+                                                         in_next,
+                                                         in_end,
+                                                         cur_len - 1,
+                                                         next_hashes);
+                               in_next += cur_len - 1;
+                               continue;
+                       }
+
+                       /*
+                        * Try to find a better match at the next position.
+                        *
+                        * Note: since we already have a match at the *current*
+                        * position, we use only half the 'max_search_depth'
+                        * when checking the *next* position.  This is a useful
+                        * trade-off because it's more worthwhile to use a
+                        * greater search depth on the initial match.
+                        *
+                        * Note: it's possible to structure the code such that
+                        * there's only one call to longest_match(), which
+                        * handles both the "find the initial match" and "try to
+                        * find a better match" cases.  However, it is faster to
+                        * have two call sites, with longest_match() inlined at
+                        * each.
+                        */
+                       adjust_max_and_nice_len(&max_len, &nice_len,
+                                               in_end - in_next);
+                       next_len = hc_matchfinder_longest_match(
+                                               &c->p.g.hc_mf,
+                                               &in_cur_base,
+                                               in_next++,
+                                               cur_len - 1,
+                                               max_len,
+                                               nice_len,
+                                               c->max_search_depth >> 1,
+                                               next_hashes,
+                                               &next_offset);
+                       if (next_len >= cur_len &&
+                           4 * (int)(next_len - cur_len) +
+                           ((int)bsr32(cur_offset) -
+                            (int)bsr32(next_offset)) > 2) {
+                               /*
+                                * Found a better match at the next position.
+                                * Output a literal.  Then the next match
+                                * becomes the current match.
+                                */
+                               deflate_choose_literal(c, *(in_next - 2), true,
+                                                      seq);
+                               cur_len = next_len;
+                               cur_offset = next_offset;
+                               goto have_cur_match;
+                       }
+
+                       if (lazy2) {
+                               /* In lazy2 mode, look ahead another position */
+                               adjust_max_and_nice_len(&max_len, &nice_len,
+                                                       in_end - in_next);
+                               next_len = hc_matchfinder_longest_match(
+                                               &c->p.g.hc_mf,
+                                               &in_cur_base,
+                                               in_next++,
+                                               cur_len - 1,
+                                               max_len,
+                                               nice_len,
+                                               c->max_search_depth >> 2,
+                                               next_hashes,
+                                               &next_offset);
+                               if (next_len >= cur_len &&
+                                   4 * (int)(next_len - cur_len) +
+                                   ((int)bsr32(cur_offset) -
+                                    (int)bsr32(next_offset)) > 6) {
+                                       /*
+                                        * There's a much better match two
+                                        * positions ahead, so use two literals.
+                                        */
+                                       deflate_choose_literal(
+                                               c, *(in_next - 3), true, seq);
+                                       deflate_choose_literal(
+                                               c, *(in_next - 2), true, seq);
+                                       cur_len = next_len;
+                                       cur_offset = next_offset;
+                                       goto have_cur_match;
+                               }
+                               /*
+                                * No better match at either of the next 2
+                                * positions.  Output the current match.
+                                */
+                               deflate_choose_match(c, cur_len, cur_offset,
+                                                    true, &seq);
+                               if (cur_len > 3) {
+                                       hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
+                                                                 &in_cur_base,
+                                                                 in_next,
+                                                                 in_end,
+                                                                 cur_len - 3,
+                                                                 next_hashes);
+                                       in_next += cur_len - 3;
+                               }
+                       } else { /* !lazy2 */
+                               /*
+                                * No better match at the next position.  Output
+                                * the current match.
+                                */
+                               deflate_choose_match(c, cur_len, cur_offset,
+                                                    true, &seq);
+                               hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
+                                                         &in_cur_base,
+                                                         in_next,
+                                                         in_end,
+                                                         cur_len - 2,
+                                                         next_hashes);
+                               in_next += cur_len - 2;
+                       }
+                       /* Check if it's time to output another block. */
+               } while (in_next < in_max_block_end &&
+                        seq < &c->p.g.sequences[SEQ_STORE_LENGTH] &&
+                        !should_end_block(&c->split_stats,
+                                          in_block_begin, in_next, in_end));
+
+               deflate_flush_block(c, &os, in_block_begin,
+                                   in_next - in_block_begin,
+                                   c->p.g.sequences, in_next == in_end);
+       } while (in_next != in_end);
+
+       return deflate_flush_output(&os);
+}
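+
+/*
+ * Example of the lookahead comparison used above: if the current match has
+ * cur_len == 5 at cur_offset == 3000 (bsr32 == 11) and the next position
+ * offers next_len == 6 at next_offset == 200 (bsr32 == 7), the score is
+ * 4 * (6 - 5) + (11 - 7) == 8 > 2, so a literal is output and the longer,
+ * closer match becomes the current match.
+ */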
+
+/*
+ * This is the "lazy" DEFLATE compressor.  Before choosing a match, it checks to
+ * see if there's a better match at the next position.  If yes, it outputs a
+ * literal and continues to the next position.  If no, it outputs the match.
+ */
+static size_t
+deflate_compress_lazy(struct libdeflate_compressor * restrict c,
+                     const u8 * restrict in, size_t in_nbytes,
+                     u8 * restrict out, size_t out_nbytes_avail)
+{
+       return deflate_compress_lazy_generic(c, in, in_nbytes, out,
+                                            out_nbytes_avail, false);
+}
+
+/*
+ * The lazy2 compressor.  This is similar to the regular lazy one, but it looks
+ * for a better match at the next 2 positions rather than the next 1.  This
+ * makes it take slightly more time, but compress some inputs slightly more.
+ */
+static size_t
+deflate_compress_lazy2(struct libdeflate_compressor * restrict c,
+                      const u8 * restrict in, size_t in_nbytes,
+                      u8 * restrict out, size_t out_nbytes_avail)
+{
+       return deflate_compress_lazy_generic(c, in, in_nbytes, out,
+                                            out_nbytes_avail, true);
+}
+
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+
+/*
+ * Follow the minimum-cost path in the graph of possible match/literal choices
+ * for the current block and compute the frequencies of the Huffman symbols that
+ * would be needed to output those matches and literals.
+ */
+static void
+deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
+{
+       struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
+       struct deflate_optimum_node *end_node =
+               &c->p.n.optimum_nodes[block_length];
+
+       do {
+               unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
+               unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
+
+               if (length == 1) {
+                       /* Literal */
+                       c->freqs.litlen[offset]++;
+               } else {
+                       /* Match */
+                       c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
+                                       deflate_length_slot[length]]++;
+                       c->freqs.offset[c->p.n.offset_slot_full[offset]]++;
+               }
+               cur_node += length;
+       } while (cur_node != end_node);
+
+       /* Tally the end-of-block symbol. */
+       c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
+}
+
+/* Set the current cost model from the codeword lengths specified in @lens. */
+static void
+deflate_set_costs_from_codes(struct libdeflate_compressor *c,
+                            const struct deflate_lens *lens)
+{
+       unsigned i;
+
+       /* Literals */
+       for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
+               u32 bits = (lens->litlen[i] ?
+                           lens->litlen[i] : LITERAL_NOSTAT_BITS);
+
+               c->p.n.costs.literal[i] = bits * BIT_COST;
+       }
+
+       /* Lengths */
+       for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
+               unsigned length_slot = deflate_length_slot[i];
+               unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot;
+               u32 bits = (lens->litlen[litlen_sym] ?
+                           lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
+
+               bits += deflate_extra_length_bits[length_slot];
+               c->p.n.costs.length[i] = bits * BIT_COST;
+       }
+
+       /* Offset slots */
+       for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
+               u32 bits = (lens->offset[i] ?
+                           lens->offset[i] : OFFSET_NOSTAT_BITS);
+
+               bits += deflate_extra_offset_bits[i];
+               c->p.n.costs.offset_slot[i] = bits * BIT_COST;
+       }
+}
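+
+/*
+ * For example, with BIT_COST == 16 (asserted later in this file), a literal
+ * whose codeword is 8 bits long costs 8 * 16 == 128, and a match length whose
+ * slot carries 2 extra bits adds 2 * 16 == 32 on top of its litlen codeword
+ * cost; symbols with no codeword fall back to the *_NOSTAT_BITS placeholder
+ * lengths.
+ */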
+
+/*
+ * This lookup table gives the default cost of a literal symbol and of a length
+ * symbol, depending on the characteristics of the input data.  It was generated
+ * by scripts/gen_default_litlen_costs.py.
+ *
+ * This table is indexed first by the estimated match probability:
+ *
+ *     i=0: data doesn't contain many matches  [match_prob=0.25]
+ *     i=1: neutral                            [match_prob=0.50]
+ *     i=2: data contains lots of matches      [match_prob=0.75]
+ *
+ * This lookup produces a subtable which maps the number of distinct used
+ * literals to the default cost of a literal symbol, i.e.:
+ *
+ *     int(-log2((1 - match_prob) / num_used_literals) * BIT_COST)
+ *
+ * ... for num_used_literals in [1, 256] (and 0, which is copied from 1).  This
+ * accounts for literals usually getting cheaper as the number of distinct
+ * literals decreases, and as the proportion of literals to matches increases.
+ *
+ * The lookup also produces the cost of a length symbol, which is:
+ *
+ *     int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST)
+ *
+ * Note: we don't currently assign different costs to different literal symbols,
+ * or to different length symbols, as this is hard to do in a useful way.
+ */
+static const struct {
+       u8 used_lits_to_lit_cost[257];
+       u8 len_sym_cost;
+} default_litlen_costs[] = {
+       { /* match_prob = 0.25 */
+               .used_lits_to_lit_cost = {
+                       6, 6, 22, 32, 38, 43, 48, 51,
+                       54, 57, 59, 61, 64, 65, 67, 69,
+                       70, 72, 73, 74, 75, 76, 77, 79,
+                       80, 80, 81, 82, 83, 84, 85, 85,
+                       86, 87, 88, 88, 89, 89, 90, 91,
+                       91, 92, 92, 93, 93, 94, 95, 95,
+                       96, 96, 96, 97, 97, 98, 98, 99,
+                       99, 99, 100, 100, 101, 101, 101, 102,
+                       102, 102, 103, 103, 104, 104, 104, 105,
+                       105, 105, 105, 106, 106, 106, 107, 107,
+                       107, 108, 108, 108, 108, 109, 109, 109,
+                       109, 110, 110, 110, 111, 111, 111, 111,
+                       112, 112, 112, 112, 112, 113, 113, 113,
+                       113, 114, 114, 114, 114, 114, 115, 115,
+                       115, 115, 115, 116, 116, 116, 116, 116,
+                       117, 117, 117, 117, 117, 118, 118, 118,
+                       118, 118, 118, 119, 119, 119, 119, 119,
+                       120, 120, 120, 120, 120, 120, 121, 121,
+                       121, 121, 121, 121, 121, 122, 122, 122,
+                       122, 122, 122, 123, 123, 123, 123, 123,
+                       123, 123, 124, 124, 124, 124, 124, 124,
+                       124, 125, 125, 125, 125, 125, 125, 125,
+                       125, 126, 126, 126, 126, 126, 126, 126,
+                       127, 127, 127, 127, 127, 127, 127, 127,
+                       128, 128, 128, 128, 128, 128, 128, 128,
+                       128, 129, 129, 129, 129, 129, 129, 129,
+                       129, 129, 130, 130, 130, 130, 130, 130,
+                       130, 130, 130, 131, 131, 131, 131, 131,
+                       131, 131, 131, 131, 131, 132, 132, 132,
+                       132, 132, 132, 132, 132, 132, 132, 133,
+                       133, 133, 133, 133, 133, 133, 133, 133,
+                       133, 134, 134, 134, 134, 134, 134, 134,
+                       134,
+               },
+               .len_sym_cost = 109,
+       }, { /* match_prob = 0.5 */
+               .used_lits_to_lit_cost = {
+                       16, 16, 32, 41, 48, 53, 57, 60,
+                       64, 66, 69, 71, 73, 75, 76, 78,
+                       80, 81, 82, 83, 85, 86, 87, 88,
+                       89, 90, 91, 92, 92, 93, 94, 95,
+                       96, 96, 97, 98, 98, 99, 99, 100,
+                       101, 101, 102, 102, 103, 103, 104, 104,
+                       105, 105, 106, 106, 107, 107, 108, 108,
+                       108, 109, 109, 110, 110, 110, 111, 111,
+                       112, 112, 112, 113, 113, 113, 114, 114,
+                       114, 115, 115, 115, 115, 116, 116, 116,
+                       117, 117, 117, 118, 118, 118, 118, 119,
+                       119, 119, 119, 120, 120, 120, 120, 121,
+                       121, 121, 121, 122, 122, 122, 122, 122,
+                       123, 123, 123, 123, 124, 124, 124, 124,
+                       124, 125, 125, 125, 125, 125, 126, 126,
+                       126, 126, 126, 127, 127, 127, 127, 127,
+                       128, 128, 128, 128, 128, 128, 129, 129,
+                       129, 129, 129, 129, 130, 130, 130, 130,
+                       130, 130, 131, 131, 131, 131, 131, 131,
+                       131, 132, 132, 132, 132, 132, 132, 133,
+                       133, 133, 133, 133, 133, 133, 134, 134,
+                       134, 134, 134, 134, 134, 134, 135, 135,
+                       135, 135, 135, 135, 135, 135, 136, 136,
+                       136, 136, 136, 136, 136, 136, 137, 137,
+                       137, 137, 137, 137, 137, 137, 138, 138,
+                       138, 138, 138, 138, 138, 138, 138, 139,
+                       139, 139, 139, 139, 139, 139, 139, 139,
+                       140, 140, 140, 140, 140, 140, 140, 140,
+                       140, 141, 141, 141, 141, 141, 141, 141,
+                       141, 141, 141, 142, 142, 142, 142, 142,
+                       142, 142, 142, 142, 142, 142, 143, 143,
+                       143, 143, 143, 143, 143, 143, 143, 143,
+                       144,
+               },
+               .len_sym_cost = 93,
+       }, { /* match_prob = 0.75 */
+               .used_lits_to_lit_cost = {
+                       32, 32, 48, 57, 64, 69, 73, 76,
+                       80, 82, 85, 87, 89, 91, 92, 94,
+                       96, 97, 98, 99, 101, 102, 103, 104,
+                       105, 106, 107, 108, 108, 109, 110, 111,
+                       112, 112, 113, 114, 114, 115, 115, 116,
+                       117, 117, 118, 118, 119, 119, 120, 120,
+                       121, 121, 122, 122, 123, 123, 124, 124,
+                       124, 125, 125, 126, 126, 126, 127, 127,
+                       128, 128, 128, 129, 129, 129, 130, 130,
+                       130, 131, 131, 131, 131, 132, 132, 132,
+                       133, 133, 133, 134, 134, 134, 134, 135,
+                       135, 135, 135, 136, 136, 136, 136, 137,
+                       137, 137, 137, 138, 138, 138, 138, 138,
+                       139, 139, 139, 139, 140, 140, 140, 140,
+                       140, 141, 141, 141, 141, 141, 142, 142,
+                       142, 142, 142, 143, 143, 143, 143, 143,
+                       144, 144, 144, 144, 144, 144, 145, 145,
+                       145, 145, 145, 145, 146, 146, 146, 146,
+                       146, 146, 147, 147, 147, 147, 147, 147,
+                       147, 148, 148, 148, 148, 148, 148, 149,
+                       149, 149, 149, 149, 149, 149, 150, 150,
+                       150, 150, 150, 150, 150, 150, 151, 151,
+                       151, 151, 151, 151, 151, 151, 152, 152,
+                       152, 152, 152, 152, 152, 152, 153, 153,
+                       153, 153, 153, 153, 153, 153, 154, 154,
+                       154, 154, 154, 154, 154, 154, 154, 155,
+                       155, 155, 155, 155, 155, 155, 155, 155,
+                       156, 156, 156, 156, 156, 156, 156, 156,
+                       156, 157, 157, 157, 157, 157, 157, 157,
+                       157, 157, 157, 158, 158, 158, 158, 158,
+                       158, 158, 158, 158, 158, 158, 159, 159,
+                       159, 159, 159, 159, 159, 159, 159, 159,
+                       160,
+               },
+               .len_sym_cost = 84,
+       },
+};
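+
+/*
+ * Sanity check of the table above, assuming NUM_LEN_SLOTS == 29 (the number of
+ * DEFLATE length codes) and BIT_COST == 16: for match_prob = 0.5 and 16 used
+ * literals, int(-log2(0.5 / 16) * 16) == 80, which matches
+ * used_lits_to_lit_cost[16] in the middle subtable, and
+ * int(-log2(0.5 / 29) * 16) == 93 matches its len_sym_cost.
+ */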
+
+/*
+ * Choose the default costs for literal and length symbols.  These symbols are
+ * both part of the litlen alphabet.
+ */
+static void
+deflate_choose_default_litlen_costs(struct libdeflate_compressor *c,
+                                   const u8 *block_begin, u32 block_length,
+                                   u32 *lit_cost, u32 *len_sym_cost)
+{
+       unsigned num_used_literals = 0;
+       u32 literal_freq = block_length;
+       u32 match_freq = 0;
+       u32 cutoff;
+       u32 i;
+
+       /* Calculate the number of distinct literals that exist in the data. */
+       memset(c->freqs.litlen, 0,
+              DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
+       cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */
+       for (i = 0; i < block_length; i++)
+               c->freqs.litlen[block_begin[i]]++;
+       for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
+               if (c->freqs.litlen[i] > cutoff)
+                       num_used_literals++;
+       }
+       if (num_used_literals == 0)
+               num_used_literals = 1;
+
+       /*
+        * Estimate the relative frequency of literals and matches in the
+        * optimal parsing solution.  We don't know the optimal solution, so
+        * this can only be a very rough estimate.  Therefore, we basically use
+        * the match frequency from a greedy parse.  We also apply the min_len
+        * heuristic used by the greedy and lazy parsers, to avoid counting too
+        * many matches when literals are cheaper than short matches.
+        */
+       match_freq = 0;
+       i = choose_min_match_len(num_used_literals, c->max_search_depth);
+       for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
+               match_freq += c->p.n.match_len_freqs[i];
+               literal_freq -= i * c->p.n.match_len_freqs[i];
+       }
+       if ((s32)literal_freq < 0) /* shouldn't happen */
+               literal_freq = 0;
+
+       if (match_freq > literal_freq)
+               i = 2; /* many matches */
+       else if (match_freq * 4 > literal_freq)
+               i = 1; /* neutral */
+       else
+               i = 0; /* few matches */
+
+       STATIC_ASSERT(BIT_COST == 16);
+       *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[
+                                                       num_used_literals];
+       *len_sym_cost = default_litlen_costs[i].len_sym_cost;
+}
+
+static forceinline u32
+deflate_default_length_cost(unsigned len, u32 len_sym_cost)
+{
+       unsigned slot = deflate_length_slot[len];
+       u32 num_extra_bits = deflate_extra_length_bits[slot];
+
+       return len_sym_cost + (num_extra_bits * BIT_COST);
+}
+
+static forceinline u32
+deflate_default_offset_slot_cost(unsigned slot)
+{
+       u32 num_extra_bits = deflate_extra_offset_bits[slot];
+       /*
+        * Assume that all offset symbols are equally probable.
+        * The resulting cost is 'int(-log2(1/30) * BIT_COST)',
+        * where 30 is the number of potentially-used offset symbols.
+        */
+       u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000;
+
+       return offset_sym_cost + (num_extra_bits * BIT_COST);
+}
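+
+/*
+ * Illustrative check of the constant above: with BIT_COST == 16,
+ * -log2(1/30) = log2(30) ~= 4.907, so the ideal symbol cost is about
+ * 4.907 * 16 ~= 78.5 bit-cost units, and the integer expression
+ * 4*BIT_COST + (907*BIT_COST)/1000 = 64 + 14 = 78 is its truncated form.
+ */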
+
+/* Set default symbol costs for the first block's first optimization pass. */
+static void
+deflate_set_default_costs(struct libdeflate_compressor *c,
+                         u32 lit_cost, u32 len_sym_cost)
+{
+       unsigned i;
+
+       /* Literals */
+       for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
+               c->p.n.costs.literal[i] = lit_cost;
+
+       /* Lengths */
+       for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
+               c->p.n.costs.length[i] =
+                       deflate_default_length_cost(i, len_sym_cost);
+
+       /* Offset slots */
+       for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
+               c->p.n.costs.offset_slot[i] =
+                       deflate_default_offset_slot_cost(i);
+}
+
+static forceinline void
+deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount)
+{
+       if (change_amount == 0)
+               /* Block is very similar to previous; prefer previous costs. */
+               *cost_p = (default_cost + 3 * *cost_p) / 4;
+       else if (change_amount == 1)
+               *cost_p = (default_cost + *cost_p) / 2;
+       else if (change_amount == 2)
+               *cost_p = (5 * default_cost + 3 * *cost_p) / 8;
+       else
+               /* Block differs greatly from previous; prefer default costs. */
+               *cost_p = (3 * default_cost + *cost_p) / 4;
+}
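+
+/*
+ * Worked example of the blending above (values chosen only for illustration):
+ * with default_cost = 100 and a previous cost of 60, change_amount 0 keeps
+ * most of the old cost ((100 + 3*60)/4 = 70), while change_amount 3 moves
+ * most of the way toward the default ((3*100 + 60)/4 = 90).
+ */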
+
+static forceinline void
+deflate_adjust_costs_impl(struct libdeflate_compressor *c,
+                         u32 lit_cost, u32 len_sym_cost, int change_amount)
+{
+       unsigned i;
+
+       /* Literals */
+       for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
+               deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost,
+                                   change_amount);
+
+       /* Lengths */
+       for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
+               deflate_adjust_cost(&c->p.n.costs.length[i],
+                                   deflate_default_length_cost(i,
+                                                               len_sym_cost),
+                                   change_amount);
+
+       /* Offset slots */
+       for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
+               deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
+                                   deflate_default_offset_slot_cost(i),
+                                   change_amount);
+}
+
+/*
+ * Adjust the costs when beginning a new block.
+ *
+ * Since the current costs have been optimized for the data, it's undesirable to
+ * throw them away and start over with the default costs.  At the same time, we
+ * don't want to bias the parse by assuming that the next block will be similar
+ * to the current block.  As a compromise, make the costs closer to the
+ * defaults, but don't simply set them to the defaults.
+ */
+static void
+deflate_adjust_costs(struct libdeflate_compressor *c,
+                    u32 lit_cost, u32 len_sym_cost)
+{
+       u64 total_delta = 0;
+       u64 cutoff;
+       int i;
+
+       /*
+        * Decide how different the current block is from the previous block,
+        * using the block splitting statistics from the current and previous
+        * blocks.  The more different the current block is, the more we prefer
+        * the default costs rather than the previous block's costs.
+        *
+        * The algorithm here is similar to the one used for the end-of-block
+        * check, but here we compare two entire blocks rather than a partial
+        * block with a small extra part, so 64-bit arithmetic is needed in
+        * some places.
+        */
+       for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+               u64 prev = (u64)c->p.n.prev_observations[i] *
+                           c->split_stats.num_observations;
+               u64 cur = (u64)c->split_stats.observations[i] *
+                         c->p.n.prev_num_observations;
+
+               total_delta += prev > cur ? prev - cur : cur - prev;
+       }
+       cutoff = ((u64)c->p.n.prev_num_observations *
+                 c->split_stats.num_observations * 200) / 512;
+
+       if (4 * total_delta > 9 * cutoff)
+               deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3);
+       else if (2 * total_delta > 3 * cutoff)
+               deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2);
+       else if (2 * total_delta > cutoff)
+               deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1);
+       else
+               deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0);
+}
+
+/*
+ * Find the minimum-cost path through the graph of possible match/literal
+ * choices for this block.
+ *
+ * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
+ * represents the node at the beginning of the block, to
+ * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
+ * the block.  Edge costs are evaluated using the cost model 'c->p.n.costs'.
+ *
+ * The algorithm works backwards, starting at the end node and proceeding
+ * backwards one node at a time.  At each node, the minimum cost to reach the
+ * end node is computed and the match/literal choice that begins that path is
+ * saved.
+ */
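+/*
+ * Equivalently, in recurrence form (illustrative notation; 'slot' and 'len'
+ * denote the offset slot and length of a cached match at position i):
+ *
+ *     cost_to_end[i] = min(costs.literal[data[i]] + cost_to_end[i + 1],
+ *                          min over matches at i of
+ *                              costs.offset_slot[slot] + costs.length[len] +
+ *                              cost_to_end[i + len])
+ */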
+static void
+deflate_find_min_cost_path(struct libdeflate_compressor *c,
+                          const u32 block_length,
+                          const struct lz_match *cache_ptr)
+{
+       struct deflate_optimum_node *end_node =
+               &c->p.n.optimum_nodes[block_length];
+       struct deflate_optimum_node *cur_node = end_node;
+
+       cur_node->cost_to_end = 0;
+       do {
+               unsigned num_matches;
+               unsigned literal;
+               u32 best_cost_to_end;
+
+               cur_node--;
+               cache_ptr--;
+
+               num_matches = cache_ptr->length;
+               literal = cache_ptr->offset;
+
+               /* It's always possible to choose a literal. */
+               best_cost_to_end = c->p.n.costs.literal[literal] +
+                                  (cur_node + 1)->cost_to_end;
+               cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
+
+               /* Also consider matches if there are any. */
+               if (num_matches) {
+                       const struct lz_match *match;
+                       unsigned len;
+                       unsigned offset;
+                       unsigned offset_slot;
+                       u32 offset_cost;
+                       u32 cost_to_end;
+
+                       /*
+                        * Consider each length from the minimum
+                        * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
+                        * match found at this position.  For each length, we
+                        * consider only the smallest offset for which that
+                        * length is available.  Although this is not guaranteed
+                        * to be optimal due to the possibility of a larger
+                        * offset costing less than a smaller offset to code,
+                        * this is a very useful heuristic.
+                        */
+                       match = cache_ptr - num_matches;
+                       len = DEFLATE_MIN_MATCH_LEN;
+                       do {
+                               offset = match->offset;
+                               offset_slot = c->p.n.offset_slot_full[offset];
+                               offset_cost =
+                                       c->p.n.costs.offset_slot[offset_slot];
+                               do {
+                                       cost_to_end = offset_cost +
+                                               c->p.n.costs.length[len] +
+                                               (cur_node + len)->cost_to_end;
+                                       if (cost_to_end < best_cost_to_end) {
+                                               best_cost_to_end = cost_to_end;
+                                               cur_node->item = len |
+                                                       ((u32)offset <<
+                                                        OPTIMUM_OFFSET_SHIFT);
+                                       }
+                               } while (++len <= match->length);
+                       } while (++match != cache_ptr);
+                       cache_ptr -= num_matches;
+               }
+               cur_node->cost_to_end = best_cost_to_end;
+       } while (cur_node != &c->p.n.optimum_nodes[0]);
+}
+
+/*
+ * Choose the literal/match sequence to use for the current block.  The basic
+ * algorithm finds a minimum-cost path through the block's graph of
+ * literal/match choices, given a cost model.  However, the cost of each symbol
+ * is unknown until the Huffman codes have been built, but at the same time the
+ * Huffman codes depend on the frequencies of chosen symbols.  Consequently,
+ * multiple passes must be used to try to approximate an optimal solution.  The
+ * first pass uses default costs, mixed with the costs from the previous block
+ * if any.  Later passes use the Huffman codeword lengths from the previous pass
+ * as the costs.
+ */
+static void
+deflate_optimize_block(struct libdeflate_compressor *c,
+                      const u8 *block_begin, u32 block_length,
+                      const struct lz_match *cache_ptr, bool is_first_block,
+                      bool is_final_block)
+{
+       unsigned num_passes_remaining = c->p.n.num_optim_passes;
+       u32 lit_cost, len_sym_cost;
+       u32 i;
+
+       /*
+        * Force the block to really end at the desired length, even if some
+        * matches extend beyond it.
+        */
+       for (i = block_length;
+            i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
+                     ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
+               c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
+
+       /* Set the initial costs. */
+       deflate_choose_default_litlen_costs(c, block_begin, block_length,
+                                           &lit_cost, &len_sym_cost);
+       if (is_first_block)
+               deflate_set_default_costs(c, lit_cost, len_sym_cost);
+       else
+               deflate_adjust_costs(c, lit_cost, len_sym_cost);
+
+       do {
+               /* Find the minimum cost path for this pass. */
+               deflate_find_min_cost_path(c, block_length, cache_ptr);
+
+               /* Compute frequencies of the chosen symbols. */
+               deflate_reset_symbol_frequencies(c);
+               deflate_tally_item_list(c, block_length);
+
+               /* Make the Huffman codes. */
+               deflate_make_huffman_codes(&c->freqs, &c->codes);
+
+               /*
+                * Update the costs.  After the last optimization pass, the
+                * final costs won't be needed for this block, but they will be
+                * used in determining the initial costs for the next block.
+                */
+               if (--num_passes_remaining || !is_final_block)
+                       deflate_set_costs_from_codes(c, &c->codes.lens);
+       } while (num_passes_remaining);
+}
+
+static void
+deflate_near_optimal_init_stats(struct libdeflate_compressor *c)
+{
+       init_block_split_stats(&c->split_stats);
+       memset(c->p.n.new_match_len_freqs, 0,
+              sizeof(c->p.n.new_match_len_freqs));
+       memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
+}
+
+static void
+deflate_near_optimal_merge_stats(struct libdeflate_compressor *c)
+{
+       unsigned i;
+
+       merge_new_observations(&c->split_stats);
+       for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
+               c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i];
+               c->p.n.new_match_len_freqs[i] = 0;
+       }
+}
+
+/*
+ * Save some literal/match statistics from the previous block so that
+ * deflate_adjust_costs() will be able to decide how much the current block
+ * differs from the previous one.
+ */
+static void
+deflate_near_optimal_save_stats(struct libdeflate_compressor *c)
+{
+       int i;
+
+       for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
+               c->p.n.prev_observations[i] = c->split_stats.observations[i];
+       c->p.n.prev_num_observations = c->split_stats.num_observations;
+}
+
+static void
+deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c)
+{
+       int i;
+
+       for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
+               c->split_stats.observations[i] = 0;
+       c->split_stats.num_observations = 0;
+       memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
+}
+
+/*
+ * This is the "near-optimal" DEFLATE compressor.  It computes the optimal
+ * representation of each DEFLATE block using a minimum-cost path search over
+ * the graph of possible match/literal choices for that block, assuming a
+ * certain cost for each Huffman symbol.
+ *
+ * For several reasons, the end result is not guaranteed to be optimal:
+ *
+ * - Nonoptimal choice of blocks
+ * - Heuristic limitations on which matches are actually considered
+ * - Symbol costs are unknown until the symbols have already been chosen
+ *   (so iterative optimization must be used)
+ */
+static size_t
+deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
+                             const u8 * restrict in, size_t in_nbytes,
+                             u8 * restrict out, size_t out_nbytes_avail)
+{
+       const u8 *in_next = in;
+       const u8 *in_block_begin = in_next;
+       const u8 *in_end = in_next + in_nbytes;
+       struct deflate_output_bitstream os;
+       const u8 *in_cur_base = in_next;
+       const u8 *in_next_slide =
+               in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
+       unsigned max_len = DEFLATE_MAX_MATCH_LEN;
+       unsigned nice_len = MIN(c->nice_match_length, max_len);
+       struct lz_match *cache_ptr = c->p.n.match_cache;
+       u32 next_hashes[2] = {0, 0};
+
+       deflate_init_output(&os, out, out_nbytes_avail);
+       bt_matchfinder_init(&c->p.n.bt_mf);
+       deflate_near_optimal_init_stats(c);
+
+       do {
+               /* Starting a new DEFLATE block */
+               const u8 * const in_max_block_end = choose_max_block_end(
+                               in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH);
+               const u8 *prev_end_block_check = NULL;
+               bool change_detected = false;
+               const u8 *next_observation = in_next;
+               unsigned min_len;
+
+               /*
+                * Use the minimum match length heuristic to improve the
+                * literal/match statistics gathered during matchfinding.
+                * However, the actual near-optimal parse won't respect min_len,
+                * as it can accurately assess the costs of different matches.
+                */
+               min_len = calculate_min_match_len(
+                                       in_block_begin,
+                                       in_max_block_end - in_block_begin,
+                                       c->max_search_depth);
+
+               /*
+                * Find matches until we decide to end the block.  We end the
+                * block if any of the following is true:
+                *
+                * (1) Maximum block length has been reached
+                * (2) Match cache may overflow.
+                * (3) Block split heuristic says to split now.
+                */
+               for (;;) {
+                       struct lz_match *matches;
+                       unsigned best_len;
+                       size_t remaining = in_end - in_next;
+
+                       /* Slide the window forward if needed. */
+                       if (in_next == in_next_slide) {
+                               bt_matchfinder_slide_window(&c->p.n.bt_mf);
+                               in_cur_base = in_next;
+                               in_next_slide = in_next +
+                                       MIN(remaining, MATCHFINDER_WINDOW_SIZE);
+                       }
+
+                       /*
+                        * Find matches with the current position using the
+                        * binary tree matchfinder and save them in match_cache.
+                        *
+                        * Note: the binary tree matchfinder is more suited for
+                        * optimal parsing than the hash chain matchfinder.  The
+                        * reasons for this include:
+                        *
+                        * - The binary tree matchfinder can find more matches
+                        *   in the same number of steps.
+                        * - One of the major advantages of hash chains is that
+                        *   skipping positions (not searching for matches at
+                        *   them) is faster; however, with optimal parsing we
+                        *   search for matches at almost all positions, so this
+                        *   advantage of hash chains is negated.
+                        */
+                       matches = cache_ptr;
+                       best_len = 0;
+                       adjust_max_and_nice_len(&max_len, &nice_len, remaining);
+                       if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
+                               cache_ptr = bt_matchfinder_get_matches(
+                                               &c->p.n.bt_mf,
+                                               in_cur_base,
+                                               in_next - in_cur_base,
+                                               max_len,
+                                               nice_len,
+                                               c->max_search_depth,
+                                               next_hashes,
+                                               matches);
+                               if (cache_ptr > matches)
+                                       best_len = cache_ptr[-1].length;
+                       }
+                       if (in_next >= next_observation) {
+                               if (best_len >= min_len) {
+                                       observe_match(&c->split_stats,
+                                                     best_len);
+                                       next_observation = in_next + best_len;
+                                       c->p.n.new_match_len_freqs[best_len]++;
+                               } else {
+                                       observe_literal(&c->split_stats,
+                                                       *in_next);
+                                       next_observation = in_next + 1;
+                               }
+                       }
+
+                       cache_ptr->length = cache_ptr - matches;
+                       cache_ptr->offset = *in_next;
+                       in_next++;
+                       cache_ptr++;
+
+                       /*
+                        * If there was a very long match found, don't cache any
+                        * matches for the bytes covered by that match.  This
+                        * avoids degenerate behavior when compressing highly
+                        * redundant data, where the number of matches can be
+                        * very large.
+                        *
+                        * This heuristic doesn't actually hurt the compression
+                        * ratio very much.  If there's a long match, then the
+                        * data must be highly compressible, so it doesn't
+                        * matter much what we do.
+                        */
+                       if (best_len >= DEFLATE_MIN_MATCH_LEN &&
+                           best_len >= nice_len) {
+                               --best_len;
+                               do {
+                                       remaining = in_end - in_next;
+                                       if (in_next == in_next_slide) {
+                                               bt_matchfinder_slide_window(
+                                                       &c->p.n.bt_mf);
+                                               in_cur_base = in_next;
+                                               in_next_slide = in_next +
+                                                       MIN(remaining,
+                                                           MATCHFINDER_WINDOW_SIZE);
+                                       }
+                                       adjust_max_and_nice_len(&max_len,
+                                                               &nice_len,
+                                                               remaining);
+                                       if (max_len >=
+                                           BT_MATCHFINDER_REQUIRED_NBYTES) {
+                                               bt_matchfinder_skip_byte(
+                                                       &c->p.n.bt_mf,
+                                                       in_cur_base,
+                                                       in_next - in_cur_base,
+                                                       nice_len,
+                                                       c->max_search_depth,
+                                                       next_hashes);
+                                       }
+                                       cache_ptr->length = 0;
+                                       cache_ptr->offset = *in_next;
+                                       in_next++;
+                                       cache_ptr++;
+                               } while (--best_len);
+                       }
+                       /* Maximum block length or end of input reached? */
+                       if (in_next >= in_max_block_end)
+                               break;
+                       /* Match cache overflowed? */
+                       if (cache_ptr >=
+                           &c->p.n.match_cache[MATCH_CACHE_LENGTH])
+                               break;
+                       /* Not ready to try to end the block (again)? */
+                       if (!ready_to_check_block(&c->split_stats,
+                                                 in_block_begin, in_next,
+                                                 in_end))
+                               continue;
+                       /* Check if it would be worthwhile to end the block. */
+                       if (do_end_block_check(&c->split_stats,
+                                              in_next - in_block_begin)) {
+                               change_detected = true;
+                               break;
+                       }
+                       /* Ending the block doesn't seem worthwhile here. */
+                       deflate_near_optimal_merge_stats(c);
+                       prev_end_block_check = in_next;
+               }
+               /*
+                * All the matches for this block have been cached.  Now choose
+                * the precise end of the block and the sequence of items to
+                * output to represent it, then flush the block.
+                */
+               if (change_detected && prev_end_block_check != NULL) {
+                       /*
+                        * The block is being ended because a recent chunk of
+                        * data differs from the rest of the block.  We could
+                        * end the block at 'in_next' like the greedy and lazy
+                        * compressors do, but that's not ideal since it would
+                        * include the differing chunk in the block.  The
+                        * near-optimal compressor has time to do a better job.
+                        * Therefore, we rewind to just before the chunk, and
+                        * output a block that only goes up to there.
+                        *
+                        * We then set things up to correctly start the next
+                        * block, considering that some work has already been
+                        * done on it (some matches found and stats gathered).
+                        */
+                       struct lz_match *orig_cache_ptr = cache_ptr;
+                       const u8 *in_block_end = prev_end_block_check;
+                       u32 block_length = in_block_end - in_block_begin;
+                       bool is_first = (in_block_begin == in);
+                       bool is_final = false;
+                       u32 num_bytes_to_rewind = in_next - in_block_end;
+                       size_t cache_len_rewound;
+
+                       /* Rewind the match cache. */
+                       do {
+                               cache_ptr--;
+                               cache_ptr -= cache_ptr->length;
+                       } while (--num_bytes_to_rewind);
+                       cache_len_rewound = orig_cache_ptr - cache_ptr;
+
+                       deflate_optimize_block(c, in_block_begin, block_length,
+                                              cache_ptr, is_first, is_final);
+                       deflate_flush_block(c, &os, in_block_begin,
+                                           block_length, NULL, is_final);
+                       memmove(c->p.n.match_cache, cache_ptr,
+                               cache_len_rewound * sizeof(*cache_ptr));
+                       cache_ptr = &c->p.n.match_cache[cache_len_rewound];
+                       deflate_near_optimal_save_stats(c);
+                       /*
+                        * Clear the stats for the just-flushed block, leaving
+                        * just the stats for the beginning of the next block.
+                        */
+                       deflate_near_optimal_clear_old_stats(c);
+                       in_block_begin = in_block_end;
+               } else {
+                       /*
+                        * The block is being ended for a reason other than a
+                        * differing data chunk being detected.  Don't rewind at
+                        * all; just end the block at the current position.
+                        */
+                       u32 block_length = in_next - in_block_begin;
+                       bool is_first = (in_block_begin == in);
+                       bool is_final = (in_next == in_end);
+
+                       deflate_near_optimal_merge_stats(c);
+                       deflate_optimize_block(c, in_block_begin, block_length,
+                                              cache_ptr, is_first, is_final);
+                       deflate_flush_block(c, &os, in_block_begin,
+                                           block_length, NULL, is_final);
+                       cache_ptr = &c->p.n.match_cache[0];
+                       deflate_near_optimal_save_stats(c);
+                       deflate_near_optimal_init_stats(c);
+                       in_block_begin = in_next;
+               }
+       } while (in_next != in_end);
+
+       return deflate_flush_output(&os);
+}
+
+/* Initialize c->p.n.offset_slot_full. */
+static void
+deflate_init_offset_slot_full(struct libdeflate_compressor *c)
+{
+       unsigned offset_slot;
+       unsigned offset;
+       unsigned offset_end;
+
+       for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base);
+            offset_slot++) {
+               offset = deflate_offset_slot_base[offset_slot];
+               offset_end = offset +
+                            (1 << deflate_extra_offset_bits[offset_slot]);
+               do {
+                       c->p.n.offset_slot_full[offset] = offset_slot;
+               } while (++offset != offset_end);
+       }
+}
+
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
+libdeflate_alloc_compressor(int compression_level)
+{
+       struct libdeflate_compressor *c;
+       size_t size = offsetof(struct libdeflate_compressor, p);
+
+       check_buildtime_parameters();
+
+       if (compression_level < 0 || compression_level > 12)
+               return NULL;
+
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+       if (compression_level >= 10)
+               size += sizeof(c->p.n);
+       else
+#endif
+       {
+               if (compression_level >= 2)
+                       size += sizeof(c->p.g);
+               else if (compression_level == 1)
+                       size += sizeof(c->p.f);
+       }
+
+       c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size);
+       if (!c)
+               return NULL;
+
+       c->compression_level = compression_level;
+
+       /*
+        * The higher the compression level, the more we should bother trying to
+        * compress very small inputs.
+        */
+       c->min_size_to_compress = 56 - (compression_level * 4);
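+       /* e.g. 52 bytes at level 1, 32 bytes at level 6, 8 bytes at level 12 */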
+
+       switch (compression_level) {
+       case 0:
+               c->impl = deflate_compress_none;
+               break;
+       case 1:
+               c->impl = deflate_compress_fastest;
+               /* max_search_depth is unused. */
+               c->nice_match_length = 32;
+               break;
+       case 2:
+               c->impl = deflate_compress_greedy;
+               c->max_search_depth = 6;
+               c->nice_match_length = 10;
+               break;
+       case 3:
+               c->impl = deflate_compress_greedy;
+               c->max_search_depth = 12;
+               c->nice_match_length = 14;
+               break;
+       case 4:
+               c->impl = deflate_compress_greedy;
+               c->max_search_depth = 16;
+               c->nice_match_length = 30;
+               break;
+       case 5:
+               c->impl = deflate_compress_lazy;
+               c->max_search_depth = 16;
+               c->nice_match_length = 30;
+               break;
+       case 6:
+               c->impl = deflate_compress_lazy;
+               c->max_search_depth = 35;
+               c->nice_match_length = 65;
+               break;
+       case 7:
+               c->impl = deflate_compress_lazy;
+               c->max_search_depth = 100;
+               c->nice_match_length = 130;
+               break;
+       case 8:
+               c->impl = deflate_compress_lazy2;
+               c->max_search_depth = 300;
+               c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
+               break;
+       case 9:
+#if !SUPPORT_NEAR_OPTIMAL_PARSING
+       default:
+#endif
+               c->impl = deflate_compress_lazy2;
+               c->max_search_depth = 600;
+               c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
+               break;
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+       case 10:
+               c->impl = deflate_compress_near_optimal;
+               c->max_search_depth = 35;
+               c->nice_match_length = 75;
+               c->p.n.num_optim_passes = 2;
+               deflate_init_offset_slot_full(c);
+               break;
+       case 11:
+               c->impl = deflate_compress_near_optimal;
+               c->max_search_depth = 70;
+               c->nice_match_length = 150;
+               c->p.n.num_optim_passes = 3;
+               deflate_init_offset_slot_full(c);
+               break;
+       case 12:
+       default:
+               c->impl = deflate_compress_near_optimal;
+               c->max_search_depth = 150;
+               c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
+               c->p.n.num_optim_passes = 4;
+               deflate_init_offset_slot_full(c);
+               break;
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+       }
+
+       deflate_init_static_codes(c);
+
+       return c;
+}
+
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_deflate_compress(struct libdeflate_compressor *c,
+                           const void *in, size_t in_nbytes,
+                           void *out, size_t out_nbytes_avail)
+{
+       if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
+               return 0;
+
+       /* For extremely small inputs, just use a single uncompressed block. */
+       if (unlikely(in_nbytes < c->min_size_to_compress)) {
+               struct deflate_output_bitstream os;
+               deflate_init_output(&os, out, out_nbytes_avail);
+               if (in_nbytes == 0)
+                       in = &os; /* Avoid passing NULL to memcpy(). */
+               deflate_write_uncompressed_block(&os, in, in_nbytes, true);
+               return deflate_flush_output(&os);
+       }
+
+       return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
+}
+
+LIBDEFLATEEXPORT void LIBDEFLATEAPI
+libdeflate_free_compressor(struct libdeflate_compressor *c)
+{
+       libdeflate_aligned_free(c);
+}
+
+unsigned int
+deflate_get_compression_level(struct libdeflate_compressor *c)
+{
+       return c->compression_level;
+}
+
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
+                                 size_t in_nbytes)
+{
+       /*
+        * The worst case is all uncompressed blocks where one block has length
+        * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
+        * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
+        * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
+        */
+       size_t max_num_blocks =
+               MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
+
+       return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
+}
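+
+/*
+ * Usage sketch (illustration only, not compiled): the intended calling
+ * sequence for the one-shot compression API defined above.  The input buffer
+ * and its length are assumed to be supplied by the caller.
+ */
+#if 0
+#include <stdlib.h>
+
+static size_t
+example_compress(const void *data, size_t data_len, void **out_ret)
+{
+       struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
+       size_t bound, out_len = 0;
+       void *out;
+
+       *out_ret = NULL;
+       if (c == NULL)
+               return 0;
+       /* Worst-case compressed size for an input of this length */
+       bound = libdeflate_deflate_compress_bound(c, data_len);
+       out = malloc(bound);
+       if (out != NULL) {
+               /* Returns 0 if the output buffer is too small */
+               out_len = libdeflate_deflate_compress(c, data, data_len,
+                                                     out, bound);
+               *out_ret = out;
+       }
+       libdeflate_free_compressor(c);
+       return out_len;
+}
+#endif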
diff --git a/src/3rdparty/libdeflate/lib/deflate_compress.h b/src/3rdparty/libdeflate/lib/deflate_compress.h
new file mode 100644 (file)
index 0000000..8bb6cb9
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef LIB_DEFLATE_COMPRESS_H
+#define LIB_DEFLATE_COMPRESS_H
+
+#include "lib_common.h"
+
+/*
+ * DEFLATE compression is private to deflate_compress.c, but we do need to be
+ * able to query the compression level for zlib and gzip header generation.
+ */
+
+struct libdeflate_compressor;
+
+unsigned int deflate_get_compression_level(struct libdeflate_compressor *c);
+
+#endif /* LIB_DEFLATE_COMPRESS_H */
diff --git a/src/3rdparty/libdeflate/lib/deflate_constants.h b/src/3rdparty/libdeflate/lib/deflate_constants.h
new file mode 100644 (file)
index 0000000..5982c15
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * deflate_constants.h - constants for the DEFLATE compression format
+ */
+
+#ifndef LIB_DEFLATE_CONSTANTS_H
+#define LIB_DEFLATE_CONSTANTS_H
+
+/* Valid block types  */
+#define DEFLATE_BLOCKTYPE_UNCOMPRESSED         0
+#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN       1
+#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN      2
+
+/* Minimum and maximum supported match lengths (in bytes)  */
+#define DEFLATE_MIN_MATCH_LEN                  3
+#define DEFLATE_MAX_MATCH_LEN                  258
+
+/* Maximum supported match offset (in bytes) */
+#define DEFLATE_MAX_MATCH_OFFSET               32768
+
+/* log2 of DEFLATE_MAX_MATCH_OFFSET */
+#define DEFLATE_WINDOW_ORDER                   15
+
+/* Number of symbols in each Huffman code.  Note: for the literal/length
+ * and offset codes, these are actually the maximum values; a given block
+ * might use fewer symbols.  */
+#define DEFLATE_NUM_PRECODE_SYMS               19
+#define DEFLATE_NUM_LITLEN_SYMS                        288
+#define DEFLATE_NUM_OFFSET_SYMS                        32
+
+/* The maximum number of symbols across all codes  */
+#define DEFLATE_MAX_NUM_SYMS                   288
+
+/* Division of symbols in the literal/length code  */
+#define DEFLATE_NUM_LITERALS                   256
+#define DEFLATE_END_OF_BLOCK                   256
+#define DEFLATE_FIRST_LEN_SYM                  257
+
+/* Maximum codeword length, in bits, within each Huffman code  */
+#define DEFLATE_MAX_PRE_CODEWORD_LEN           7
+#define DEFLATE_MAX_LITLEN_CODEWORD_LEN                15
+#define DEFLATE_MAX_OFFSET_CODEWORD_LEN                15
+
+/* The maximum codeword length across all codes  */
+#define DEFLATE_MAX_CODEWORD_LEN               15
+
+/* Maximum possible overrun when decoding codeword lengths  */
+#define DEFLATE_MAX_LENS_OVERRUN               137
+
+/*
+ * Maximum number of extra bits that may be required to represent a match
+ * length or offset.
+ *
+ * TODO: are we going to have full DEFLATE64 support?  If so, up to 16
+ * length bits must be supported.
+ */
+#define DEFLATE_MAX_EXTRA_LENGTH_BITS          5
+#define DEFLATE_MAX_EXTRA_OFFSET_BITS          14
+
+#endif /* LIB_DEFLATE_CONSTANTS_H */
diff --git a/src/3rdparty/libdeflate/lib/deflate_decompress.c b/src/3rdparty/libdeflate/lib/deflate_decompress.c
new file mode 100644 (file)
index 0000000..6138206
--- /dev/null
@@ -0,0 +1,997 @@
+/*
+ * deflate_decompress.c - a decompressor for DEFLATE
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ---------------------------------------------------------------------------
+ *
+ * This is a highly optimized DEFLATE decompressor.  When compiled with gcc on
+ * x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2
+ * instructions are available).  On other architectures it should still be
+ * significantly faster than zlib, but the difference may be smaller.
+ *
+ * Why this is faster than zlib's implementation:
+ *
+ * - Word accesses rather than byte accesses when reading input
+ * - Word accesses rather than byte accesses when copying matches
+ * - Faster Huffman decoding combined with various DEFLATE-specific tricks
+ * - Larger bitbuffer variable that doesn't need to be filled as often
+ * - Other optimizations to remove unnecessary branches
+ * - Only full-buffer decompression is supported, so the code doesn't need to
+ *   support stopping and resuming decompression.
+ * - On x86_64, compile a version of the decompression routine using BMI2
+ *   instructions and use it automatically at runtime when supported.
+ */
+
+#include <limits.h>
+
+#include "deflate_constants.h"
+#include "unaligned.h"
+
+#include "libdeflate.h"
+
+/*
+ * If the expression passed to SAFETY_CHECK() evaluates to false, then the
+ * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the
+ * compressed data is invalid.
+ *
+ * Theoretically, these checks could be disabled for specialized applications
+ * where all input to the decompressor will be trusted.
+ */
+#if 0
+#  pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!")
+#  define SAFETY_CHECK(expr)   (void)(expr)
+#else
+#  define SAFETY_CHECK(expr)   if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
+#endif
+
+/*
+ * Each TABLEBITS number is the base-2 logarithm of the number of entries in the
+ * main portion of the corresponding decode table.  Each number should be large
+ * enough to ensure that for typical data, the vast majority of symbols can be
+ * decoded by a direct lookup of the next TABLEBITS bits of compressed data.
+ * However, this must be balanced against the fact that a larger table requires
+ * more memory and requires more time to fill.
+ *
+ * Note: you cannot change a TABLEBITS number without also changing the
+ * corresponding ENOUGH number!
+ */
+#define PRECODE_TABLEBITS      7
+#define LITLEN_TABLEBITS       10
+#define OFFSET_TABLEBITS       8
+
+/*
+ * Each ENOUGH number is the maximum number of decode table entries that may be
+ * required for the corresponding Huffman code, including the main table and all
+ * subtables.  Each number depends on three parameters:
+ *
+ *     (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
+ *     (2) the number of main table bits (the TABLEBITS numbers defined above)
+ *     (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
+ *
+ * The ENOUGH numbers were computed using the utility program 'enough' from
+ * zlib.  This program enumerates all possible relevant Huffman codes to find
+ * the worst-case usage of decode table entries.
+ */
+#define PRECODE_ENOUGH         128     /* enough 19 7 7        */
+#define LITLEN_ENOUGH          1334    /* enough 288 10 15     */
+#define OFFSET_ENOUGH          402     /* enough 32 8 15       */
+
+/*
+ * The main DEFLATE decompressor structure.  Since this implementation only
+ * supports full buffer decompression, this structure does not store the entire
+ * decompression state, but rather only some arrays that are too large to
+ * comfortably allocate on the stack.
+ */
+struct libdeflate_decompressor {
+
+       /*
+        * The arrays aren't all needed at the same time.  'precode_lens' and
+        * 'precode_decode_table' are unneeded after 'lens' has been filled.
+        * Furthermore, 'lens' need not be retained after building the litlen
+        * and offset decode tables.  In fact, 'lens' can be in union with
+        * 'litlen_decode_table' provided that 'offset_decode_table' is separate
+        * and is built first.
+        */
+
+       union {
+               u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
+
+               struct {
+                       u8 lens[DEFLATE_NUM_LITLEN_SYMS +
+                               DEFLATE_NUM_OFFSET_SYMS +
+                               DEFLATE_MAX_LENS_OVERRUN];
+
+                       u32 precode_decode_table[PRECODE_ENOUGH];
+               } l;
+
+               u32 litlen_decode_table[LITLEN_ENOUGH];
+       } u;
+
+       u32 offset_decode_table[OFFSET_ENOUGH];
+
+       /* used only during build_decode_table() */
+       u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
+
+       bool static_codes_loaded;
+};
+
+/*****************************************************************************
+ *                             Input bitstream                              *
+ *****************************************************************************/
+
+/*
+ * The state of the "input bitstream" consists of the following variables:
+ *
+ *     - in_next: pointer to the next unread byte in the input buffer
+ *
+ *     - in_end: pointer just past the end of the input buffer
+ *
+ *     - bitbuf: a word-sized variable containing bits that have been read from
+ *               the input buffer.  The buffered bits are right-aligned
+ *               (they're the low-order bits).
+ *
+ *     - bitsleft: number of bits in 'bitbuf' that are valid.
+ *
+ * To make it easier for the compiler to optimize the code by keeping variables
+ * in registers, these are declared as normal variables and manipulated using
+ * macros.
+ */
+
+/*
+ * The type for the bitbuffer variable ('bitbuf' described above).  For best
+ * performance, this should have size equal to a machine word.
+ *
+ * 64-bit platforms have a significant advantage: they get a bigger bitbuffer
+ * which they have to fill less often.
+ */
+typedef machine_word_t bitbuf_t;
+
+/*
+ * Number of bits the bitbuffer variable can hold.
+ *
+ * This is one less than the obvious value because of the optimized arithmetic
+ * in FILL_BITS_WORDWISE() that leaves 'bitsleft' in the range
+ * [WORDBITS - 8, WORDBITS - 1] rather than [WORDBITS - 7, WORDBITS].
+ */
+#define BITBUF_NBITS   (8 * sizeof(bitbuf_t) - 1)
+
+/*
+ * The maximum number of bits that can be ensured in the bitbuffer variable,
+ * i.e. the maximum value of 'n' that can be passed ENSURE_BITS(n).  The decoder
+ * only reads whole bytes from memory, so this is the lowest value of 'bitsleft'
+ * at which another byte cannot be read without first consuming some bits.
+ */
+#define MAX_ENSURE     (BITBUF_NBITS - 7)
+
+/*
+ * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if
+ * 'n' is too large to be passed to ENSURE_BITS(n).  Note: if 'n' is a compile
+ * time constant, then this expression will be a compile-time constant.
+ * Therefore, CAN_ENSURE() can be used to choose between alternative
+ * implementations at compile time.
+ */
+#define CAN_ENSURE(n)  ((n) <= MAX_ENSURE)
+
+/*
+ * Fill the bitbuffer variable, reading one byte at a time.
+ *
+ * If we would overread the input buffer, we just don't read anything, leaving
+ * the bits zeroed but marking them filled.  This simplifies the decompressor
+ * because it removes the need to always be able to distinguish between real
+ * overreads and overreads caused only by the decompressor's own lookahead.
+ *
+ * We do still keep track of the number of bytes that have been overread, for
+ * two reasons.  First, it allows us to determine the exact number of bytes that
+ * were consumed once the stream ends or an uncompressed block is reached.
+ * Second, it allows us to stop early if the overread amount gets so large (more
+ * than sizeof bitbuf) that it can only be caused by a real overread.  (The
+ * second part is arguably unneeded, since libdeflate is buffer-based; given
+ * infinite zeroes, it will eventually either completely fill the output buffer
+ * or return an error.  However, we do it to be slightly more friendly to the
+ * not-recommended use case of decompressing with an unknown output size.)
+ */
+#define FILL_BITS_BYTEWISE()                                   \
+do {                                                           \
+       if (likely(in_next != in_end)) {                        \
+               bitbuf |= (bitbuf_t)*in_next++ << bitsleft;     \
+       } else {                                                \
+               overread_count++;                               \
+               SAFETY_CHECK(overread_count <= sizeof(bitbuf)); \
+       }                                                       \
+       bitsleft += 8;                                          \
+} while (bitsleft <= BITBUF_NBITS - 8)
+
+/*
+ * Fill the bitbuffer variable by reading the next word from the input buffer
+ * and branchlessly updating 'in_next' and 'bitsleft' based on how many bits
+ * were filled.  This can be significantly faster than FILL_BITS_BYTEWISE().
+ * However, for this to work correctly, the word must be interpreted in
+ * little-endian format.  In addition, the memory access may be unaligned.
+ * Therefore, this method is most efficient on little-endian architectures that
+ * support fast unaligned access, such as x86 and x86_64.
+ *
+ * For faster updating of 'bitsleft', we consider the bitbuffer size in bits to
+ * be 1 less than the word size and therefore be all 1 bits.  Then the number of
+ * bits filled is the value of the 0 bits in position >= 3 when changed to 1.
+ * E.g. if words are 64 bits and bitsleft = 16 = b010000 then we refill b101000
+ * = 40 bits = 5 bytes.  This uses only 4 operations to update 'in_next' and
+ * 'bitsleft': one each of +, ^, >>, and |.  (Not counting operations the
+ * compiler optimizes out.)  In contrast, the alternative of:
+ *
+ *     in_next += (BITBUF_NBITS - bitsleft) >> 3;
+ *     bitsleft += (BITBUF_NBITS - bitsleft) & ~7;
+ *
+ * (where BITBUF_NBITS would be WORDBITS rather than WORDBITS - 1) would on
+ * average refill an extra bit, but uses 5 operations: two +, and one each of
+ * -, >>, and &.  Also the - and & must be completed before 'bitsleft' can be
+ * updated, while the current solution updates 'bitsleft' with no dependencies.
+ */
+#define FILL_BITS_WORDWISE()                                   \
+do {                                                           \
+       /* BITBUF_NBITS must be all 1's in binary, see above */ \
+       STATIC_ASSERT((BITBUF_NBITS & (BITBUF_NBITS + 1)) == 0);\
+                                                               \
+       bitbuf |= get_unaligned_leword(in_next) << bitsleft;    \
+       in_next += (bitsleft ^ BITBUF_NBITS) >> 3;              \
+       bitsleft |= BITBUF_NBITS & ~7;                          \
+} while (0)
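+
+/*
+ * Worked example of the wordwise refill (64-bit words, so BITBUF_NBITS == 63):
+ * with bitsleft = 16, 'in_next' advances by (16 ^ 63) >> 3 = 47 >> 3 = 5 bytes,
+ * and bitsleft becomes 16 | (63 & ~7) = 16 | 56 = 56, i.e. exactly 40 bits
+ * (5 bytes) were added, matching the numbers in the comment above.
+ */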
+
+/*
+ * Does the bitbuffer variable currently contain at least 'n' bits?
+ */
+#define HAVE_BITS(n) (bitsleft >= (n))
+
+/*
+ * Load more bits from the input buffer until the specified number of bits is
+ * present in the bitbuffer variable.  'n' cannot be too large; see MAX_ENSURE
+ * and CAN_ENSURE().
+ */
+#define ENSURE_BITS(n)                                         \
+if (!HAVE_BITS(n)) {                                           \
+       if (CPU_IS_LITTLE_ENDIAN() &&                           \
+           UNALIGNED_ACCESS_IS_FAST &&                         \
+           likely(in_end - in_next >= sizeof(bitbuf_t)))       \
+               FILL_BITS_WORDWISE();                           \
+       else                                                    \
+               FILL_BITS_BYTEWISE();                           \
+}
+
+/*
+ * Return the next 'n' bits from the bitbuffer variable without removing them.
+ */
+#define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1))
+
+/*
+ * Remove the next 'n' bits from the bitbuffer variable.
+ */
+#define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n))
+
+/*
+ * Remove and return the next 'n' bits from the bitbuffer variable.
+ */
+#define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32)
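+
+/*
+ * Usage sketch (assumes that the locals used by these macros, e.g. 'bitbuf',
+ * 'bitsleft', 'in_next', 'in_end', 'overread_count' and 'tmp32', are in
+ * scope): a DEFLATE block header, which consists of the 1-bit BFINAL flag
+ * followed by the 2-bit BTYPE field, could be read as
+ *
+ *     ENSURE_BITS(1 + 2);
+ *     is_final_block = POP_BITS(1);
+ *     block_type = POP_BITS(2);
+ */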
+
+/*
+ * Verify that the input buffer hasn't been overread, then align the input to
+ * the next byte boundary, discarding any remaining bits in the current byte.
+ *
+ * Note that if the bitbuffer variable currently contains more than 7 bits, then
+ * we must rewind 'in_next', effectively putting those bits back.  Only the bits
+ * in what would be the "current" byte if we were reading one byte at a time can
+ * be actually discarded.
+ */
+#define ALIGN_INPUT()                                                  \
+do {                                                                   \
+       SAFETY_CHECK(overread_count <= (bitsleft >> 3));                \
+       in_next -= (bitsleft >> 3) - overread_count;                    \
+       overread_count = 0;                                             \
+       bitbuf = 0;                                                     \
+       bitsleft = 0;                                                   \
+} while (0)
+
+/*
+ * Read a 16-bit value from the input.  This must have been preceded by a call
+ * to ALIGN_INPUT(), and the caller must have already checked for overrun.
+ */
+#define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16)
+
+/*****************************************************************************
+ *                              Huffman decoding                             *
+ *****************************************************************************/
+
+/*
+ * A decode table for order TABLEBITS consists of a main table of (1 <<
+ * TABLEBITS) entries followed by a variable number of subtables.
+ *
+ * The decoding algorithm takes the next TABLEBITS bits of compressed data and
+ * uses them as an index into the decode table.  The resulting entry is either a
+ * "direct entry", meaning that it contains the value desired, or a "subtable
+ * pointer", meaning that the entry references a subtable that must be indexed
+ * using more bits of the compressed data to decode the symbol.
+ *
+ * Each decode table (a main table along with its subtables, if any) is
+ * associated with a Huffman code.  Logically, the result of a decode table
+ * lookup is a symbol from the alphabet from which the corresponding Huffman
+ * code was constructed.  A symbol with codeword length n <= TABLEBITS is
+ * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a
+ * symbol with codeword length n > TABLEBITS is associated with one or more
+ * subtable entries.
+ *
+ * On top of this basic design, we implement several optimizations:
+ *
+ * - We store the length of each codeword directly in each of its decode table
+ *   entries.  This allows the codeword length to be produced without indexing
+ *   an additional table.
+ *
+ * - When beneficial, we don't store the Huffman symbol itself, but instead data
+ *   generated from it.  For example, when decoding an offset symbol in DEFLATE,
+ *   it's more efficient if we can decode the offset base and number of extra
+ *   offset bits directly rather than decoding the offset symbol and then
+ *   looking up both of those values in an additional table or tables.
+ *
+ * The size of each decode table entry is 32 bits, which provides slightly
+ * better performance than 16-bit entries on 32-bit and 64-bit processors, provided
+ * that the table doesn't get so large that it takes up too much memory and
+ * starts generating cache misses.  The bits of each decode table entry are
+ * defined as follows:
+ *
+ * - Bits 30 -- 31: flags (see below)
+ * - Bits 8 -- 29: decode result: a Huffman symbol or related data
+ * - Bits 0 -- 7: codeword length
+ */
+
+/*
+ * This flag is set in all main decode table entries that represent subtable
+ * pointers.
+ */
+#define HUFFDEC_SUBTABLE_POINTER       0x80000000
+
+/*
+ * This flag is set in all entries in the litlen decode table that represent
+ * literals.
+ */
+#define HUFFDEC_LITERAL                        0x40000000
+
+/* Mask for extracting the codeword length from a decode table entry.  */
+#define HUFFDEC_LENGTH_MASK            0xFF
+
+/* Shift to extract the decode result from a decode table entry.  */
+#define HUFFDEC_RESULT_SHIFT           8
+
+/* Shift a decode result into its position in the decode table entry.  */
+#define HUFFDEC_RESULT_ENTRY(result)   ((u32)(result) << HUFFDEC_RESULT_SHIFT)
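+
+/*
+ * Illustrative example of the entry layout (values are only an example): a
+ * direct litlen table entry for literal 0x41 ('A') with a 9-bit codeword
+ * would be HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(0x41) | 9 = 0x40004109;
+ * the decoder takes the codeword length from the low 8 bits
+ * (HUFFDEC_LENGTH_MASK) and the literal from bits 8 and up
+ * (HUFFDEC_RESULT_SHIFT).
+ */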
+
+/* The decode result for each precode symbol.  There is no special optimization
+ * for the precode; the decode result is simply the symbol value.  */
+static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
+#define ENTRY(presym)  HUFFDEC_RESULT_ENTRY(presym)
+       ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
+       ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
+       ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
+       ENTRY(12)  , ENTRY(13)  , ENTRY(14)  , ENTRY(15)  ,
+       ENTRY(16)  , ENTRY(17)  , ENTRY(18)  ,
+#undef ENTRY
+};
+
+/* The decode result for each litlen symbol.  For literals, this is the literal
+ * value itself and the HUFFDEC_LITERAL flag.  For lengths, this is the length
+ * base and the number of extra length bits.  */
+static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
+
+       /* Literals  */
+#define ENTRY(literal) (HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(literal))
+       ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
+       ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
+       ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
+       ENTRY(12)  , ENTRY(13)  , ENTRY(14)  , ENTRY(15)  ,
+       ENTRY(16)  , ENTRY(17)  , ENTRY(18)  , ENTRY(19)  ,
+       ENTRY(20)  , ENTRY(21)  , ENTRY(22)  , ENTRY(23)  ,
+       ENTRY(24)  , ENTRY(25)  , ENTRY(26)  , ENTRY(27)  ,
+       ENTRY(28)  , ENTRY(29)  , ENTRY(30)  , ENTRY(31)  ,
+       ENTRY(32)  , ENTRY(33)  , ENTRY(34)  , ENTRY(35)  ,
+       ENTRY(36)  , ENTRY(37)  , ENTRY(38)  , ENTRY(39)  ,
+       ENTRY(40)  , ENTRY(41)  , ENTRY(42)  , ENTRY(43)  ,
+       ENTRY(44)  , ENTRY(45)  , ENTRY(46)  , ENTRY(47)  ,
+       ENTRY(48)  , ENTRY(49)  , ENTRY(50)  , ENTRY(51)  ,
+       ENTRY(52)  , ENTRY(53)  , ENTRY(54)  , ENTRY(55)  ,
+       ENTRY(56)  , ENTRY(57)  , ENTRY(58)  , ENTRY(59)  ,
+       ENTRY(60)  , ENTRY(61)  , ENTRY(62)  , ENTRY(63)  ,
+       ENTRY(64)  , ENTRY(65)  , ENTRY(66)  , ENTRY(67)  ,
+       ENTRY(68)  , ENTRY(69)  , ENTRY(70)  , ENTRY(71)  ,
+       ENTRY(72)  , ENTRY(73)  , ENTRY(74)  , ENTRY(75)  ,
+       ENTRY(76)  , ENTRY(77)  , ENTRY(78)  , ENTRY(79)  ,
+       ENTRY(80)  , ENTRY(81)  , ENTRY(82)  , ENTRY(83)  ,
+       ENTRY(84)  , ENTRY(85)  , ENTRY(86)  , ENTRY(87)  ,
+       ENTRY(88)  , ENTRY(89)  , ENTRY(90)  , ENTRY(91)  ,
+       ENTRY(92)  , ENTRY(93)  , ENTRY(94)  , ENTRY(95)  ,
+       ENTRY(96)  , ENTRY(97)  , ENTRY(98)  , ENTRY(99)  ,
+       ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) ,
+       ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) ,
+       ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) ,
+       ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) ,
+       ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) ,
+       ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) ,
+       ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) ,
+       ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) ,
+       ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) ,
+       ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) ,
+       ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) ,
+       ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) ,
+       ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) ,
+       ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) ,
+       ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) ,
+       ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) ,
+       ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) ,
+       ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) ,
+       ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) ,
+       ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) ,
+       ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) ,
+       ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) ,
+       ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) ,
+       ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) ,
+       ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) ,
+       ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) ,
+       ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) ,
+       ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) ,
+       ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) ,
+       ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) ,
+       ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) ,
+       ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) ,
+       ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) ,
+       ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) ,
+       ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) ,
+       ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) ,
+       ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) ,
+       ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) ,
+       ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
+#undef ENTRY
+
+#define HUFFDEC_EXTRA_LENGTH_BITS_MASK 0xFF
+#define HUFFDEC_LENGTH_BASE_SHIFT      8
+#define HUFFDEC_END_OF_BLOCK_LENGTH    0
+
+#define ENTRY(length_base, num_extra_bits)     HUFFDEC_RESULT_ENTRY(   \
+       ((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits))
+
+       /* End of block  */
+       ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0),
+
+       /* Lengths  */
+       ENTRY(3  , 0) , ENTRY(4  , 0) , ENTRY(5  , 0) , ENTRY(6  , 0),
+       ENTRY(7  , 0) , ENTRY(8  , 0) , ENTRY(9  , 0) , ENTRY(10 , 0),
+       ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
+       ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2),
+       ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3),
+       ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4),
+       ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5),
+       ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) ,
+#undef ENTRY
+};
+
+/* The decode result for each offset symbol.  This is the offset base and the
+ * number of extra offset bits.  */
+static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
+
+#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16
+#define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1)
+
+#define ENTRY(offset_base, num_extra_bits)     HUFFDEC_RESULT_ENTRY(   \
+               ((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) | \
+               (offset_base))
+       ENTRY(1     , 0)  , ENTRY(2     , 0)  , ENTRY(3     , 0)  , ENTRY(4     , 0)  ,
+       ENTRY(5     , 1)  , ENTRY(7     , 1)  , ENTRY(9     , 2)  , ENTRY(13    , 2) ,
+       ENTRY(17    , 3)  , ENTRY(25    , 3)  , ENTRY(33    , 4)  , ENTRY(49    , 4)  ,
+       ENTRY(65    , 5)  , ENTRY(97    , 5)  , ENTRY(129   , 6)  , ENTRY(193   , 6)  ,
+       ENTRY(257   , 7)  , ENTRY(385   , 7)  , ENTRY(513   , 8)  , ENTRY(769   , 8)  ,
+       ENTRY(1025  , 9)  , ENTRY(1537  , 9)  , ENTRY(2049  , 10) , ENTRY(3073  , 10) ,
+       ENTRY(4097  , 11) , ENTRY(6145  , 11) , ENTRY(8193  , 12) , ENTRY(12289 , 12) ,
+       ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) ,
+#undef ENTRY
+};
+
+/*
+ * Build a table for fast decoding of symbols from a Huffman code.  As input,
+ * this function takes the codeword length of each symbol which may be used in
+ * the code.  As output, it produces a decode table for the canonical Huffman
+ * code described by the codeword lengths.  The decode table is built with the
+ * assumption that it will be indexed with "bit-reversed" codewords, where the
+ * low-order bit is the first bit of the codeword.  This format is used for all
+ * Huffman codes in DEFLATE.
+ *
+ * @decode_table
+ *     The array in which the decode table will be generated.  This array must
+ *     have sufficient length; see the definition of the ENOUGH numbers.
+ * @lens
+ *     An array which provides, for each symbol, the length of the
+ *     corresponding codeword in bits, or 0 if the symbol is unused.  This may
+ *     alias @decode_table, since nothing is written to @decode_table until all
+ *     @lens have been consumed.  All codeword lengths are assumed to be <=
+ *     @max_codeword_len but are otherwise considered untrusted.  If they do
+ *     not form a valid Huffman code, then the decode table is not built and
+ *     %false is returned.
+ * @num_syms
+ *     The number of symbols in the code, including all unused symbols.
+ * @decode_results
+ *     An array which provides, for each symbol, the actual value to store into
+ *     the decode table.  This value will be directly produced as the result of
+ *     decoding that symbol, thereby moving the indirection out of the decode
+ *     loop and into the table initialization.
+ * @table_bits
+ *     The log base-2 of the number of main table entries to use.
+ * @max_codeword_len
+ *     The maximum allowed codeword length for this Huffman code.
+ *     Must be <= DEFLATE_MAX_CODEWORD_LEN.
+ * @sorted_syms
+ *     A temporary array of length @num_syms.
+ *
+ * Returns %true if successful; %false if the codeword lengths do not form a
+ * valid Huffman code.
+ */
+static bool
+build_decode_table(u32 decode_table[],
+                  const u8 lens[],
+                  const unsigned num_syms,
+                  const u32 decode_results[],
+                  const unsigned table_bits,
+                  const unsigned max_codeword_len,
+                  u16 *sorted_syms)
+{
+       unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
+       unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
+       unsigned sym;           /* current symbol */
+       unsigned codeword;      /* current codeword, bit-reversed */
+       unsigned len;           /* current codeword length in bits */
+       unsigned count;         /* num codewords remaining with this length */
+       u32 codespace_used;     /* codespace used out of '2^max_codeword_len' */
+       unsigned cur_table_end; /* end index of current table */
+       unsigned subtable_prefix; /* codeword prefix of current subtable */
+       unsigned subtable_start;  /* start index of current subtable */
+       unsigned subtable_bits;   /* log2 of current subtable length */
+
+       /* Count how many codewords have each length, including 0. */
+       for (len = 0; len <= max_codeword_len; len++)
+               len_counts[len] = 0;
+       for (sym = 0; sym < num_syms; sym++)
+               len_counts[lens[sym]]++;
+
+       /*
+        * Sort the symbols primarily by increasing codeword length and
+        * secondarily by increasing symbol value; or equivalently by their
+        * codewords in lexicographic order, since a canonical code is assumed.
+        *
+        * For efficiency, also compute 'codespace_used' in the same pass over
+        * 'len_counts[]' used to build 'offsets[]' for sorting.
+        */
+
+       /* Ensure that 'codespace_used' cannot overflow. */
+       STATIC_ASSERT(sizeof(codespace_used) == 4);
+       STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >=
+                     DEFLATE_MAX_NUM_SYMS);
+
+       offsets[0] = 0;
+       offsets[1] = len_counts[0];
+       codespace_used = 0;
+       for (len = 1; len < max_codeword_len; len++) {
+               offsets[len + 1] = offsets[len] + len_counts[len];
+               codespace_used = (codespace_used << 1) + len_counts[len];
+       }
+       codespace_used = (codespace_used << 1) + len_counts[len];
+
+       for (sym = 0; sym < num_syms; sym++)
+               sorted_syms[offsets[lens[sym]]++] = sym;
+
+       sorted_syms += offsets[0]; /* Skip unused symbols */
+
+       /* lens[] is done being used, so we can write to decode_table[] now. */
+
+       /*
+        * Check whether the lengths form a complete code (exactly fills the
+        * codespace), an incomplete code (doesn't fill the codespace), or an
+        * overfull code (overflows the codespace).  A codeword of length 'n'
+        * uses proportion '1/(2^n)' of the codespace.  An overfull code is
+        * nonsensical, so is considered invalid.  An incomplete code is
+        * considered valid only in two specific cases; see below.
+        */
+
+       /* overfull code? */
+       if (unlikely(codespace_used > (1U << max_codeword_len)))
+               return false;
+
+       /* incomplete code? */
+       if (unlikely(codespace_used < (1U << max_codeword_len))) {
+               u32 entry;
+               unsigned i;
+
+               if (codespace_used == 0) {
+                       /*
+                        * An empty code is allowed.  This can happen for the
+                        * offset code in DEFLATE, since a dynamic Huffman block
+                        * need not contain any matches.
+                        */
+
+                       /* sym=0, len=1 (arbitrary) */
+                       entry = decode_results[0] | 1;
+               } else {
+                       /*
+                        * Allow codes with a single used symbol, with codeword
+                        * length 1.  The DEFLATE RFC is unclear regarding this
+                        * case.  What zlib's decompressor does is permit this
+                        * for the litlen and offset codes and assume the
+                        * codeword is '0' rather than '1'.  We do the same
+                        * except we allow this for precodes too, since there's
+                        * no convincing reason to treat the codes differently.
+                        * We also assign both codewords '0' and '1' to the
+                        * symbol to avoid having to handle '1' specially.
+                        */
+                       if (codespace_used != (1U << (max_codeword_len - 1)) ||
+                           len_counts[1] != 1)
+                               return false;
+                       entry = decode_results[*sorted_syms] | 1;
+               }
+               /*
+                * Note: the decode table still must be fully initialized, in
+                * case the stream is malformed and contains bits from the part
+                * of the codespace the incomplete code doesn't use.
+                */
+               for (i = 0; i < (1U << table_bits); i++)
+                       decode_table[i] = entry;
+               return true;
+       }
+
+       /*
+        * The lengths form a complete code.  Now, enumerate the codewords in
+        * lexicographic order and fill the decode table entries for each one.
+        *
+        * First, process all codewords with len <= table_bits.  Each one gets
+        * '2^(table_bits-len)' direct entries in the table.
+        *
+        * Since DEFLATE uses bit-reversed codewords, these entries aren't
+        * consecutive but rather are spaced '2^len' entries apart.  This makes
+        * filling them naively somewhat awkward and inefficient, since strided
+        * stores are less cache-friendly and preclude the use of word or
+        * vector-at-a-time stores to fill multiple entries per instruction.
+        *
+        * To optimize this, we incrementally double the table size.  When
+        * processing codewords with length 'len', the table is treated as
+        * having only '2^len' entries, so each codeword uses just one entry.
+        * Then, each time 'len' is incremented, the table size is doubled and
+        * the first half is copied to the second half.  This significantly
+        * improves performance over naively doing strided stores.
+        *
+        * Note that some entries copied for each table doubling may not have
+        * been initialized yet, but it doesn't matter since they're guaranteed
+        * to be initialized later (because the Huffman code is complete).
+        */
+       codeword = 0;
+       len = 1;
+       while ((count = len_counts[len]) == 0)
+               len++;
+       cur_table_end = 1U << len;
+       while (len <= table_bits) {
+               /* Process all 'count' codewords with length 'len' bits. */
+               do {
+                       unsigned bit;
+
+                       /* Fill the first entry for the current codeword. */
+                       decode_table[codeword] =
+                               decode_results[*sorted_syms++] | len;
+
+                       if (codeword == cur_table_end - 1) {
+                               /* Last codeword (all 1's) */
+                               for (; len < table_bits; len++) {
+                                       memcpy(&decode_table[cur_table_end],
+                                              decode_table,
+                                              cur_table_end *
+                                               sizeof(decode_table[0]));
+                                       cur_table_end <<= 1;
+                               }
+                               return true;
+                       }
+                       /*
+                        * To advance to the lexicographically next codeword in
+                        * the canonical code, the codeword must be incremented,
+                        * then 0's must be appended to the codeword as needed
+                        * to match the next codeword's length.
+                        *
+                        * Since the codeword is bit-reversed, appending 0's is
+                        * a no-op.  However, incrementing it is nontrivial.  To
+                        * do so efficiently, use the 'bsr' instruction to find
+                        * the last (highest order) 0 bit in the codeword, set
+                        * it, and clear any later (higher order) 1 bits.  But
+                        * 'bsr' actually finds the highest order 1 bit, so to
+                        * use it first flip all bits in the codeword by XOR'ing
+                        * it with (1U << len) - 1 == cur_table_end - 1.
+                        */
+                       bit = 1U << bsr32(codeword ^ (cur_table_end - 1));
+                       codeword &= bit - 1;
+                       codeword |= bit;
+               } while (--count);
+
+               /* Advance to the next codeword length. */
+               do {
+                       if (++len <= table_bits) {
+                               memcpy(&decode_table[cur_table_end],
+                                      decode_table,
+                                      cur_table_end * sizeof(decode_table[0]));
+                               cur_table_end <<= 1;
+                       }
+               } while ((count = len_counts[len]) == 0);
+       }
+
+       /* Process codewords with len > table_bits.  These require subtables. */
+       cur_table_end = 1U << table_bits;
+       subtable_prefix = -1;
+       subtable_start = 0;
+       for (;;) {
+               u32 entry;
+               unsigned i;
+               unsigned stride;
+               unsigned bit;
+
+               /*
+                * Start a new subtable if the first 'table_bits' bits of the
+                * codeword don't match the prefix of the current subtable.
+                */
+               if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) {
+                       subtable_prefix = (codeword & ((1U << table_bits) - 1));
+                       subtable_start = cur_table_end;
+                       /*
+                        * Calculate the subtable length.  If the codeword has
+                        * length 'table_bits + n', then the subtable needs
+                        * '2^n' entries.  But it may need more; if fewer than
+                        * '2^n' codewords of length 'table_bits + n' remain,
+                        * then the length will need to be incremented to bring
+                        * in longer codewords until the subtable can be
+                        * completely filled.  Note that because the Huffman
+                        * code is complete, it will always be possible to fill
+                        * the subtable eventually.
+                        */
+                       subtable_bits = len - table_bits;
+                       codespace_used = count;
+                       while (codespace_used < (1U << subtable_bits)) {
+                               subtable_bits++;
+                               codespace_used = (codespace_used << 1) +
+                                       len_counts[table_bits + subtable_bits];
+                       }
+                       cur_table_end = subtable_start + (1U << subtable_bits);
+
+                       /*
+                        * Create the entry that points from the main table to
+                        * the subtable.  This entry contains the index of the
+                        * start of the subtable and the number of bits with
+                        * which the subtable is indexed (the log base 2 of the
+                        * number of entries it contains).
+                        */
+                       decode_table[subtable_prefix] =
+                               HUFFDEC_SUBTABLE_POINTER |
+                               HUFFDEC_RESULT_ENTRY(subtable_start) |
+                               subtable_bits;
+               }
+
+               /* Fill the subtable entries for the current codeword. */
+               entry = decode_results[*sorted_syms++] | (len - table_bits);
+               i = subtable_start + (codeword >> table_bits);
+               stride = 1U << (len - table_bits);
+               do {
+                       decode_table[i] = entry;
+                       i += stride;
+               } while (i < cur_table_end);
+
+               /* Advance to the next codeword. */
+               if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */
+                       return true;
+               bit = 1U << bsr32(codeword ^ ((1U << len) - 1));
+               codeword &= bit - 1;
+               codeword |= bit;
+               count--;
+               while (count == 0)
+                       count = len_counts[++len];
+       }
+}
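
The bit-reversed codeword increment used twice in build_decode_table() can be seen in isolation with the small sketch below. It is only an illustration, assuming a bsr32() equivalent built on the GCC/Clang __builtin_clz builtin; the real bsr32() comes from the library's common headers.

#include <stdint.h>
#include <stdio.h>

/* Index of the highest set bit; stands in for the library's bsr32(). */
static unsigned bsr32(uint32_t v)
{
        return 31 - __builtin_clz(v);
}

/* Advance a bit-reversed codeword of length 'len' to the lexicographically
 * next codeword of the same length. */
static uint32_t next_bitreversed(uint32_t codeword, unsigned len)
{
        uint32_t bit = 1U << bsr32(codeword ^ ((1U << len) - 1));
        return (codeword & (bit - 1)) | bit;
}

int main(void)
{
        /* Prints the 3-bit codewords 000, 001, ..., 111 in lexicographic
         * order while 'cw' steps through them in bit-reversed form. */
        uint32_t cw = 0;
        for (;;) {
                printf("%u%u%u\n", cw & 1, (cw >> 1) & 1, (cw >> 2) & 1);
                if (cw == 7)    /* all 1's: last codeword */
                        break;
                cw = next_bitreversed(cw, 3);
        }
        return 0;
}
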
+
+/* Build the decode table for the precode.  */
+static bool
+build_precode_decode_table(struct libdeflate_decompressor *d)
+{
+       /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+       STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
+
+       return build_decode_table(d->u.l.precode_decode_table,
+                                 d->u.precode_lens,
+                                 DEFLATE_NUM_PRECODE_SYMS,
+                                 precode_decode_results,
+                                 PRECODE_TABLEBITS,
+                                 DEFLATE_MAX_PRE_CODEWORD_LEN,
+                                 d->sorted_syms);
+}
+
+/* Build the decode table for the literal/length code.  */
+static bool
+build_litlen_decode_table(struct libdeflate_decompressor *d,
+                         unsigned num_litlen_syms, unsigned num_offset_syms)
+{
+       /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+       STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334);
+
+       return build_decode_table(d->u.litlen_decode_table,
+                                 d->u.l.lens,
+                                 num_litlen_syms,
+                                 litlen_decode_results,
+                                 LITLEN_TABLEBITS,
+                                 DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+                                 d->sorted_syms);
+}
+
+/* Build the decode table for the offset code.  */
+static bool
+build_offset_decode_table(struct libdeflate_decompressor *d,
+                         unsigned num_litlen_syms, unsigned num_offset_syms)
+{
+       /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+       STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
+
+       return build_decode_table(d->offset_decode_table,
+                                 d->u.l.lens + num_litlen_syms,
+                                 num_offset_syms,
+                                 offset_decode_results,
+                                 OFFSET_TABLEBITS,
+                                 DEFLATE_MAX_OFFSET_CODEWORD_LEN,
+                                 d->sorted_syms);
+}
+
+static forceinline machine_word_t
+repeat_byte(u8 b)
+{
+       machine_word_t v;
+
+       STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+
+       v = b;
+       v |= v << 8;
+       v |= v << 16;
+       v |= v << ((WORDBITS == 64) ? 32 : 0);
+       return v;
+}
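
A 64-bit-only analogue of repeat_byte(), for illustration: broadcasting a byte into a machine word allows byte runs to be written word-at-a-time, presumably when copying matches with very small offsets (the consuming code lives in decompress_template.h, which is not in this hunk).

#include <stdint.h>
#include <stdio.h>

static uint64_t repeat_byte64(uint8_t b)
{
        uint64_t v = b;
        v |= v << 8;
        v |= v << 16;
        v |= v << 32;
        return v;
}

int main(void)
{
        /* Prints 0xabababababababab */
        printf("0x%016llx\n", (unsigned long long)repeat_byte64(0xAB));
        return 0;
}
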
+
+static forceinline void
+copy_word_unaligned(const void *src, void *dst)
+{
+       store_word_unaligned(load_word_unaligned(src), dst);
+}
+
+/*****************************************************************************
+ *                         Main decompression routine
+ *****************************************************************************/
+
+typedef enum libdeflate_result (*decompress_func_t)
+       (struct libdeflate_decompressor * restrict d,
+        const void * restrict in, size_t in_nbytes,
+        void * restrict out, size_t out_nbytes_avail,
+        size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
+
+#undef DEFAULT_IMPL
+#undef DISPATCH
+#if defined(__i386__) || defined(__x86_64__)
+#  include "x86/decompress_impl.h"
+#endif
+
+#ifndef DEFAULT_IMPL
+#  define FUNCNAME deflate_decompress_default
+#  define ATTRIBUTES
+#  include "decompress_template.h"
+#  define DEFAULT_IMPL deflate_decompress_default
+#endif
+
+#ifdef DISPATCH
+static enum libdeflate_result
+dispatch(struct libdeflate_decompressor * restrict d,
+        const void * restrict in, size_t in_nbytes,
+        void * restrict out, size_t out_nbytes_avail,
+        size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
+
+static volatile decompress_func_t decompress_impl = dispatch;
+
+/* Choose the fastest implementation at runtime */
+static enum libdeflate_result
+dispatch(struct libdeflate_decompressor * restrict d,
+        const void * restrict in, size_t in_nbytes,
+        void * restrict out, size_t out_nbytes_avail,
+        size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
+{
+       decompress_func_t f = arch_select_decompress_func();
+
+       if (f == NULL)
+               f = DEFAULT_IMPL;
+
+       decompress_impl = f;
+       return (*f)(d, in, in_nbytes, out, out_nbytes_avail,
+                   actual_in_nbytes_ret, actual_out_nbytes_ret);
+}
+#else
+#  define decompress_impl DEFAULT_IMPL /* only one implementation, use it */
+#endif
+
+
+/*
+ * This is the main DEFLATE decompression routine.  See libdeflate.h for the
+ * documentation.
+ *
+ * Note that the real code is in decompress_template.h.  The part here just
+ * handles calling the appropriate implementation depending on the CPU features
+ * at runtime.
+ */
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d,
+                                const void * restrict in, size_t in_nbytes,
+                                void * restrict out, size_t out_nbytes_avail,
+                                size_t *actual_in_nbytes_ret,
+                                size_t *actual_out_nbytes_ret)
+{
+       return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail,
+                              actual_in_nbytes_ret, actual_out_nbytes_ret);
+}
+
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d,
+                             const void * restrict in, size_t in_nbytes,
+                             void * restrict out, size_t out_nbytes_avail,
+                             size_t *actual_out_nbytes_ret)
+{
+       return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
+                                               out, out_nbytes_avail,
+                                               NULL, actual_out_nbytes_ret);
+}
+
+LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI
+libdeflate_alloc_decompressor(void)
+{
+       /*
+        * Note that only certain parts of the decompressor actually must be
+        * initialized here:
+        *
+        * - 'static_codes_loaded' must be initialized to false.
+        *
+        * - The first half of the main portion of each decode table must be
+        *   initialized to any value, to avoid reading from uninitialized
+        *   memory during table expansion in build_decode_table().  (Although,
+        *   this is really just to avoid warnings with dynamic tools like
+        *   valgrind, since build_decode_table() is guaranteed to initialize
+        *   all entries eventually anyway.)
+        *
+        * But for simplicity, we currently just zero the whole decompressor.
+        */
+       struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d));
+
+       if (d == NULL)
+               return NULL;
+       memset(d, 0, sizeof(*d));
+       return d;
+}
+
+LIBDEFLATEEXPORT void LIBDEFLATEAPI
+libdeflate_free_decompressor(struct libdeflate_decompressor *d)
+{
+       libdeflate_free(d);
+}
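
A minimal usage sketch of the public functions implemented above (libdeflate_alloc_decompressor, libdeflate_deflate_decompress, libdeflate_free_decompressor). This is only an illustration, not code from the commit; out_nbytes is assumed to be the exact uncompressed size, since passing a NULL actual-size pointer makes a short output an error, and error handling is kept to a minimum.

#include <stddef.h>
#include "libdeflate.h"

/* Decompress a raw DEFLATE stream whose uncompressed size is known. */
static int decompress_raw(const void *in, size_t in_nbytes,
                          void *out, size_t out_nbytes)
{
        struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
        enum libdeflate_result res;

        if (d == NULL)
                return -1;
        res = libdeflate_deflate_decompress(d, in, in_nbytes,
                                            out, out_nbytes, NULL);
        libdeflate_free_decompressor(d);
        return (res == LIBDEFLATE_SUCCESS) ? 0 : -1;
}
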
diff --git a/src/3rdparty/libdeflate/lib/gzip_compress.c b/src/3rdparty/libdeflate/lib/gzip_compress.c
new file mode 100644 (file)
index 0000000..755fe5f
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * gzip_compress.c - compress with a gzip wrapper
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "deflate_compress.h"
+#include "gzip_constants.h"
+#include "unaligned.h"
+
+#include "libdeflate.h"
+
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_gzip_compress(struct libdeflate_compressor *c,
+                        const void *in, size_t in_nbytes,
+                        void *out, size_t out_nbytes_avail)
+{
+       u8 *out_next = out;
+       unsigned compression_level;
+       u8 xfl;
+       size_t deflate_size;
+
+       if (out_nbytes_avail <= GZIP_MIN_OVERHEAD)
+               return 0;
+
+       /* ID1 */
+       *out_next++ = GZIP_ID1;
+       /* ID2 */
+       *out_next++ = GZIP_ID2;
+       /* CM */
+       *out_next++ = GZIP_CM_DEFLATE;
+       /* FLG */
+       *out_next++ = 0;
+       /* MTIME */
+       put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next);
+       out_next += 4;
+       /* XFL */
+       xfl = 0;
+       compression_level = deflate_get_compression_level(c);
+       if (compression_level < 2)
+               xfl |= GZIP_XFL_FASTEST_COMPRESSION;
+       else if (compression_level >= 8)
+               xfl |= GZIP_XFL_SLOWEST_COMPRESSION;
+       *out_next++ = xfl;
+       /* OS */
+       *out_next++ = GZIP_OS_UNKNOWN;
+
+       /* Compressed data  */
+       deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next,
+                                       out_nbytes_avail - GZIP_MIN_OVERHEAD);
+       if (deflate_size == 0)
+               return 0;
+       out_next += deflate_size;
+
+       /* CRC32 */
+       put_unaligned_le32(libdeflate_crc32(0, in, in_nbytes), out_next);
+       out_next += 4;
+
+       /* ISIZE */
+       put_unaligned_le32((u32)in_nbytes, out_next);
+       out_next += 4;
+
+       return out_next - (u8 *)out;
+}
+
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_gzip_compress_bound(struct libdeflate_compressor *c,
+                              size_t in_nbytes)
+{
+       return GZIP_MIN_OVERHEAD +
+              libdeflate_deflate_compress_bound(c, in_nbytes);
+}
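
A hedged usage sketch pairing libdeflate_gzip_compress() with libdeflate_gzip_compress_bound() from above. libdeflate_alloc_compressor() and libdeflate_free_compressor() are assumed from libdeflate.h (declared elsewhere in this commit, not in this hunk); the helper name is made up for the example.

#include <stdlib.h>
#include "libdeflate.h"

/* Compress 'in' into a gzip stream; returns a malloc'd buffer or NULL. */
static void *gzip_compress_buf(const void *in, size_t in_nbytes,
                               size_t *out_nbytes, int level)
{
        struct libdeflate_compressor *c = libdeflate_alloc_compressor(level);
        size_t bound, n;
        void *out;

        if (c == NULL)
                return NULL;
        bound = libdeflate_gzip_compress_bound(c, in_nbytes);
        out = malloc(bound);
        if (out != NULL) {
                n = libdeflate_gzip_compress(c, in, in_nbytes, out, bound);
                if (n == 0) {           /* did not fit; unexpected with 'bound' */
                        free(out);
                        out = NULL;
                } else {
                        *out_nbytes = n;
                }
        }
        libdeflate_free_compressor(c);
        return out;
}
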
diff --git a/src/3rdparty/libdeflate/lib/gzip_constants.h b/src/3rdparty/libdeflate/lib/gzip_constants.h
new file mode 100644 (file)
index 0000000..35e4728
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * gzip_constants.h - constants for the gzip wrapper format
+ */
+
+#ifndef LIB_GZIP_CONSTANTS_H
+#define LIB_GZIP_CONSTANTS_H
+
+#define GZIP_MIN_HEADER_SIZE   10
+#define GZIP_FOOTER_SIZE       8
+#define GZIP_MIN_OVERHEAD      (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE)
+
+#define GZIP_ID1               0x1F
+#define GZIP_ID2               0x8B
+
+#define GZIP_CM_DEFLATE                8
+
+#define GZIP_FTEXT             0x01
+#define GZIP_FHCRC             0x02
+#define GZIP_FEXTRA            0x04
+#define GZIP_FNAME             0x08
+#define GZIP_FCOMMENT          0x10
+#define GZIP_FRESERVED         0xE0
+
+#define GZIP_MTIME_UNAVAILABLE 0
+
+#define GZIP_XFL_SLOWEST_COMPRESSION   0x02
+#define GZIP_XFL_FASTEST_COMPRESSION   0x04
+
+#define GZIP_OS_FAT            0
+#define GZIP_OS_AMIGA          1
+#define GZIP_OS_VMS            2
+#define GZIP_OS_UNIX           3
+#define GZIP_OS_VM_CMS         4
+#define GZIP_OS_ATARI_TOS      5
+#define GZIP_OS_HPFS           6
+#define GZIP_OS_MACINTOSH      7
+#define GZIP_OS_Z_SYSTEM       8
+#define GZIP_OS_CP_M           9
+#define GZIP_OS_TOPS_20                10
+#define GZIP_OS_NTFS           11
+#define GZIP_OS_QDOS           12
+#define GZIP_OS_RISCOS         13
+#define GZIP_OS_UNKNOWN                255
+
+#endif /* LIB_GZIP_CONSTANTS_H */
diff --git a/src/3rdparty/libdeflate/lib/gzip_decompress.c b/src/3rdparty/libdeflate/lib/gzip_decompress.c
new file mode 100644 (file)
index 0000000..2a96a4e
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * gzip_decompress.c - decompress with a gzip wrapper
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "gzip_constants.h"
+#include "unaligned.h"
+
+#include "libdeflate.h"
+
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d,
+                             const void *in, size_t in_nbytes,
+                             void *out, size_t out_nbytes_avail,
+                             size_t *actual_in_nbytes_ret,
+                             size_t *actual_out_nbytes_ret)
+{
+       const u8 *in_next = in;
+       const u8 * const in_end = in_next + in_nbytes;
+       u8 flg;
+       size_t actual_in_nbytes;
+       size_t actual_out_nbytes;
+       enum libdeflate_result result;
+
+       if (in_nbytes < GZIP_MIN_OVERHEAD)
+               return LIBDEFLATE_BAD_DATA;
+
+       /* ID1 */
+       if (*in_next++ != GZIP_ID1)
+               return LIBDEFLATE_BAD_DATA;
+       /* ID2 */
+       if (*in_next++ != GZIP_ID2)
+               return LIBDEFLATE_BAD_DATA;
+       /* CM */
+       if (*in_next++ != GZIP_CM_DEFLATE)
+               return LIBDEFLATE_BAD_DATA;
+       flg = *in_next++;
+       /* MTIME */
+       in_next += 4;
+       /* XFL */
+       in_next += 1;
+       /* OS */
+       in_next += 1;
+
+       if (flg & GZIP_FRESERVED)
+               return LIBDEFLATE_BAD_DATA;
+
+       /* Extra field */
+       if (flg & GZIP_FEXTRA) {
+               u16 xlen = get_unaligned_le16(in_next);
+               in_next += 2;
+
+               if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE)
+                       return LIBDEFLATE_BAD_DATA;
+
+               in_next += xlen;
+       }
+
+       /* Original file name (zero terminated) */
+       if (flg & GZIP_FNAME) {
+               while (*in_next++ != 0 && in_next != in_end)
+                       ;
+               if (in_end - in_next < GZIP_FOOTER_SIZE)
+                       return LIBDEFLATE_BAD_DATA;
+       }
+
+       /* File comment (zero terminated) */
+       if (flg & GZIP_FCOMMENT) {
+               while (*in_next++ != 0 && in_next != in_end)
+                       ;
+               if (in_end - in_next < GZIP_FOOTER_SIZE)
+                       return LIBDEFLATE_BAD_DATA;
+       }
+
+       /* CRC16 for gzip header */
+       if (flg & GZIP_FHCRC) {
+               in_next += 2;
+               if (in_end - in_next < GZIP_FOOTER_SIZE)
+                       return LIBDEFLATE_BAD_DATA;
+       }
+
+       /* Compressed data  */
+       result = libdeflate_deflate_decompress_ex(d, in_next,
+                                       in_end - GZIP_FOOTER_SIZE - in_next,
+                                       out, out_nbytes_avail,
+                                       &actual_in_nbytes,
+                                       actual_out_nbytes_ret);
+       if (result != LIBDEFLATE_SUCCESS)
+               return result;
+
+       if (actual_out_nbytes_ret)
+               actual_out_nbytes = *actual_out_nbytes_ret;
+       else
+               actual_out_nbytes = out_nbytes_avail;
+
+       in_next += actual_in_nbytes;
+
+       /* CRC32 */
+       if (libdeflate_crc32(0, out, actual_out_nbytes) !=
+           get_unaligned_le32(in_next))
+               return LIBDEFLATE_BAD_DATA;
+       in_next += 4;
+
+       /* ISIZE */
+       if ((u32)actual_out_nbytes != get_unaligned_le32(in_next))
+               return LIBDEFLATE_BAD_DATA;
+       in_next += 4;
+
+       if (actual_in_nbytes_ret)
+               *actual_in_nbytes_ret = in_next - (u8 *)in;
+
+       return LIBDEFLATE_SUCCESS;
+}
+
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
+                          const void *in, size_t in_nbytes,
+                          void *out, size_t out_nbytes_avail,
+                          size_t *actual_out_nbytes_ret)
+{
+       return libdeflate_gzip_decompress_ex(d, in, in_nbytes,
+                                            out, out_nbytes_avail,
+                                            NULL, actual_out_nbytes_ret);
+}
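
A usage sketch for the gzip path: size the output buffer from the trailing ISIZE field (last four bytes, little-endian) before calling libdeflate_gzip_decompress() as implemented above. This is an illustration only; because ISIZE is the size modulo 2^32, it is valid only for members that decompress to less than 4 GiB, and the helper name is made up.

#include <stdint.h>
#include <stdlib.h>
#include "libdeflate.h"

static void *gunzip_buf(const uint8_t *in, size_t in_nbytes, size_t *out_nbytes)
{
        struct libdeflate_decompressor *d;
        enum libdeflate_result res;
        uint32_t isize;
        void *out;

        if (in_nbytes < 18)     /* GZIP_MIN_OVERHEAD */
                return NULL;

        /* ISIZE: little-endian uncompressed size (mod 2^32) from the footer. */
        isize = (uint32_t)in[in_nbytes - 4] |
                ((uint32_t)in[in_nbytes - 3] << 8) |
                ((uint32_t)in[in_nbytes - 2] << 16) |
                ((uint32_t)in[in_nbytes - 1] << 24);

        out = malloc(isize ? isize : 1);
        if (out == NULL)
                return NULL;
        d = libdeflate_alloc_decompressor();
        if (d == NULL) {
                free(out);
                return NULL;
        }
        res = libdeflate_gzip_decompress(d, in, in_nbytes, out, isize,
                                         out_nbytes);
        libdeflate_free_decompressor(d);
        if (res != LIBDEFLATE_SUCCESS) {
                free(out);
                return NULL;
        }
        return out;
}
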
diff --git a/src/3rdparty/libdeflate/lib/hc_matchfinder.h b/src/3rdparty/libdeflate/lib/hc_matchfinder.h
new file mode 100644 (file)
index 0000000..c656913
--- /dev/null
@@ -0,0 +1,402 @@
+/*
+ * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ---------------------------------------------------------------------------
+ *
+ *                                Algorithm
+ *
+ * This is a Hash Chains (hc) based matchfinder.
+ *
+ * The main data structure is a hash table where each hash bucket contains a
+ * linked list (or "chain") of sequences whose first 4 bytes share the same hash
+ * code.  Each sequence is identified by its starting position in the input
+ * buffer.
+ *
+ * The algorithm processes the input buffer sequentially.  At each byte
+ * position, the hash code of the first 4 bytes of the sequence beginning at
+ * that position (the sequence being matched against) is computed.  This
+ * identifies the hash bucket to use for that position.  Then, this hash
+ * bucket's linked list is searched for matches.  Then, a new linked list node
+ * is created to represent the current sequence and is prepended to the list.
+ *
+ * This algorithm has several useful properties:
+ *
+ * - It only finds true Lempel-Ziv matches; i.e., those where the matching
+ *   sequence occurs prior to the sequence being matched against.
+ *
+ * - The sequences in each linked list are always sorted by decreasing starting
+ *   position.  Therefore, the closest (smallest offset) matches are found
+ *   first, which in many compression formats tend to be the cheapest to encode.
+ *
+ * - Although fast running time is not guaranteed due to the possibility of the
+ *   lists getting very long, the worst degenerate behavior can be easily
+ *   prevented by capping the number of nodes searched at each position.
+ *
+ * - If the compressor decides not to search for matches at a certain position,
+ *   then that position can be quickly inserted without searching the list.
+ *
+ * - The algorithm is adaptable to sliding windows: just store the positions
+ *   relative to a "base" value that is updated from time to time, and stop
+ *   searching each list when the sequences get too far away.
+ *
+ * ----------------------------------------------------------------------------
+ *
+ *                              Optimizations
+ *
+ * The main hash table and chains handle length 4+ matches.  Length 3 matches
+ * are handled by a separate hash table with no chains.  This works well for
+ * typical "greedy" or "lazy"-style compressors, where length 3 matches are
+ * often only helpful if they have small offsets.  Instead of searching a full
+ * chain for length 3+ matches, the algorithm just checks for one close length 3
+ * match, then focuses on finding length 4+ matches.
+ *
+ * The longest_match() and skip_bytes() functions are inlined into the
+ * compressors that use them.  This isn't just about saving the overhead of a
+ * function call.  These functions are intended to be called from the inner
+ * loops of compressors, where giving the compiler more control over register
+ * allocation is very helpful.  There is also significant benefit to be gained
+ * from allowing the CPU to predict branches independently at each call site.
+ * For example, "lazy"-style compressors can be written with two calls to
+ * longest_match(), each of which starts with a different 'best_len' and
+ * therefore has significantly different performance characteristics.
+ *
+ * Although any hash function can be used, a multiplicative hash is fast and
+ * works well.
+ *
+ * On some processors, it is significantly faster to extend matches by whole
+ * words (32 or 64 bits) instead of by individual bytes.  For this to be the
+ * case, the processor must implement unaligned memory accesses efficiently and
+ * must have either a fast "find first set bit" instruction or a fast "find last
+ * set bit" instruction, depending on the processor's endianness.
+ *
+ * The code uses one loop for finding the first match and one loop for finding a
+ * longer match.  Each loop is tuned for its respective task, and together they
+ * are faster than a single generalized loop that handles both tasks.
+ *
+ * The code also uses a tight inner loop that only compares the last and first
+ * bytes of a potential match.  It is only when these bytes match that a full
+ * match extension is attempted.
+ *
+ * ----------------------------------------------------------------------------
+ */
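
The "multiplicative hash" mentioned above is implemented by lz_hash() in matchfinder_common.h, which is not part of this hunk. The sketch below only shows the general shape, multiply by a large odd constant and keep the top 'order' bits; the multiplier here is an arbitrary example (2^32 divided by the golden ratio), not necessarily the one the library uses.

#include <stdint.h>
#include <stdio.h>

static uint32_t mul_hash(uint32_t seq, unsigned order)
{
        /* Keep the top 'order' bits of the 32-bit product. */
        return (seq * 0x9E3779B1u) >> (32 - order);
}

int main(void)
{
        /* Hash the first 4 bytes "abcd" into a 15-bit bucket index. */
        uint32_t seq = 'a' | ('b' << 8) | ('c' << 16) | ((uint32_t)'d' << 24);
        printf("bucket = %u\n", (unsigned)mul_hash(seq, 15));
        return 0;
}
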
+
+#ifndef LIB_HC_MATCHFINDER_H
+#define LIB_HC_MATCHFINDER_H
+
+#include "matchfinder_common.h"
+
+#define HC_MATCHFINDER_HASH3_ORDER     15
+#define HC_MATCHFINDER_HASH4_ORDER     16
+
+#define HC_MATCHFINDER_TOTAL_HASH_SIZE                 \
+       (((1UL << HC_MATCHFINDER_HASH3_ORDER) +         \
+         (1UL << HC_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t))
+
+struct hc_matchfinder {
+
+       /* The hash table for finding length 3 matches  */
+       mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER];
+
+       /* The hash table which contains the first nodes of the linked lists for
+        * finding length 4+ matches  */
+       mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER];
+
+       /* The "next node" references for the linked lists.  The "next node" of
+        * the node for the sequence with position 'pos' is 'next_tab[pos]'.  */
+       mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE];
+
+} MATCHFINDER_ALIGNED;
+
+/* Prepare the matchfinder for a new input buffer.  */
+static forceinline void
+hc_matchfinder_init(struct hc_matchfinder *mf)
+{
+       STATIC_ASSERT(HC_MATCHFINDER_TOTAL_HASH_SIZE %
+                     MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+       matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_SIZE);
+}
+
+static forceinline void
+hc_matchfinder_slide_window(struct hc_matchfinder *mf)
+{
+       STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+       matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf));
+}
+
+/*
+ * Find the longest match longer than 'best_len' bytes.
+ *
+ * @mf
+ *     The matchfinder structure.
+ * @in_base_p
+ *     Location of a pointer which points to the place in the input data the
+ *     matchfinder currently stores positions relative to.  This may be updated
+ *     by this function.
+ * @in_next
+ *     Pointer to the next position in the input buffer, i.e. the sequence
+ *     being matched against.
+ * @best_len
+ *     Require a match longer than this length.
+ * @max_len
+ *     The maximum permissible match length at this position.
+ * @nice_len
+ *     Stop searching if a match of at least this length is found.
+ *     Must be <= @max_len.
+ * @max_search_depth
+ *     Limit on the number of potential matches to consider.  Must be >= 1.
+ * @next_hashes
+ *     The precomputed hash codes for the sequence beginning at @in_next.
+ *     These will be used and then updated with the precomputed hashcodes for
+ *     the sequence beginning at @in_next + 1.
+ * @offset_ret
+ *     If a match is found, its offset is returned in this location.
+ *
+ * Return the length of the match found, or 'best_len' if no match longer than
+ * 'best_len' was found.
+ */
+static forceinline u32
+hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
+                            const u8 ** const restrict in_base_p,
+                            const u8 * const restrict in_next,
+                            u32 best_len,
+                            const u32 max_len,
+                            const u32 nice_len,
+                            const u32 max_search_depth,
+                            u32 * const restrict next_hashes,
+                            u32 * const restrict offset_ret)
+{
+       u32 depth_remaining = max_search_depth;
+       const u8 *best_matchptr = in_next;
+       mf_pos_t cur_node3, cur_node4;
+       u32 hash3, hash4;
+       u32 next_hashseq;
+       u32 seq4;
+       const u8 *matchptr;
+       u32 len;
+       u32 cur_pos = in_next - *in_base_p;
+       const u8 *in_base;
+       mf_pos_t cutoff;
+
+       if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
+               hc_matchfinder_slide_window(mf);
+               *in_base_p += MATCHFINDER_WINDOW_SIZE;
+               cur_pos = 0;
+       }
+
+       in_base = *in_base_p;
+       cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
+
+       if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */
+               goto out;
+
+       /* Get the precomputed hash codes.  */
+       hash3 = next_hashes[0];
+       hash4 = next_hashes[1];
+
+       /* From the hash buckets, get the first node of each linked list.  */
+       cur_node3 = mf->hash3_tab[hash3];
+       cur_node4 = mf->hash4_tab[hash4];
+
+       /* Update for length 3 matches.  This replaces the singleton node in the
+        * 'hash3' bucket with the node for the current sequence.  */
+       mf->hash3_tab[hash3] = cur_pos;
+
+       /* Update for length 4 matches.  This prepends the node for the current
+        * sequence to the linked list in the 'hash4' bucket.  */
+       mf->hash4_tab[hash4] = cur_pos;
+       mf->next_tab[cur_pos] = cur_node4;
+
+       /* Compute the next hash codes.  */
+       next_hashseq = get_unaligned_le32(in_next + 1);
+       next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
+       next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
+       prefetchw(&mf->hash3_tab[next_hashes[0]]);
+       prefetchw(&mf->hash4_tab[next_hashes[1]]);
+
+       if (best_len < 4) {  /* No match of length >= 4 found yet?  */
+
+               /* Check for a length 3 match if needed.  */
+
+               if (cur_node3 <= cutoff)
+                       goto out;
+
+               seq4 = load_u32_unaligned(in_next);
+
+               if (best_len < 3) {
+                       matchptr = &in_base[cur_node3];
+                       if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) {
+                               best_len = 3;
+                               best_matchptr = matchptr;
+                       }
+               }
+
+               /* Check for a length 4 match.  */
+
+               if (cur_node4 <= cutoff)
+                       goto out;
+
+               for (;;) {
+                       /* No length 4 match found yet.  Check the first 4 bytes.  */
+                       matchptr = &in_base[cur_node4];
+
+                       if (load_u32_unaligned(matchptr) == seq4)
+                               break;
+
+                       /* The first 4 bytes did not match.  Keep trying.  */
+                       cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
+                       if (cur_node4 <= cutoff || !--depth_remaining)
+                               goto out;
+               }
+
+               /* Found a match of length >= 4.  Extend it to its full length.  */
+               best_matchptr = matchptr;
+               best_len = lz_extend(in_next, best_matchptr, 4, max_len);
+               if (best_len >= nice_len)
+                       goto out;
+               cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
+               if (cur_node4 <= cutoff || !--depth_remaining)
+                       goto out;
+       } else {
+               if (cur_node4 <= cutoff || best_len >= nice_len)
+                       goto out;
+       }
+
+       /* Check for matches of length >= 5.  */
+
+       for (;;) {
+               for (;;) {
+                       matchptr = &in_base[cur_node4];
+
+                       /* Already found a length 4 match.  Try for a longer
+                        * match; start by checking either the last 4 bytes and
+                        * the first 4 bytes, or the last byte.  (The last byte,
+                        * the one which would extend the match length by 1, is
+                        * the most important.)  */
+               #if UNALIGNED_ACCESS_IS_FAST
+                       if ((load_u32_unaligned(matchptr + best_len - 3) ==
+                            load_u32_unaligned(in_next + best_len - 3)) &&
+                           (load_u32_unaligned(matchptr) ==
+                            load_u32_unaligned(in_next)))
+               #else
+                       if (matchptr[best_len] == in_next[best_len])
+               #endif
+                               break;
+
+                       /* Continue to the next node in the list.  */
+                       cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
+                       if (cur_node4 <= cutoff || !--depth_remaining)
+                               goto out;
+               }
+
+       #if UNALIGNED_ACCESS_IS_FAST
+               len = 4;
+       #else
+               len = 0;
+       #endif
+               len = lz_extend(in_next, matchptr, len, max_len);
+               if (len > best_len) {
+                       /* This is the new longest match.  */
+                       best_len = len;
+                       best_matchptr = matchptr;
+                       if (best_len >= nice_len)
+                               goto out;
+               }
+
+               /* Continue to the next node in the list.  */
+               cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
+               if (cur_node4 <= cutoff || !--depth_remaining)
+                       goto out;
+       }
+out:
+       *offset_ret = in_next - best_matchptr;
+       return best_len;
+}
+
+/*
+ * Advance the matchfinder, but don't search for matches.
+ *
+ * @mf
+ *     The matchfinder structure.
+ * @in_base_p
+ *     Location of a pointer which points to the place in the input data the
+ *     matchfinder currently stores positions relative to.  This may be updated
+ *     by this function.
+ * @in_next
+ *     Pointer to the next position in the input buffer.
+ * @in_end
+ *     Pointer to the end of the input buffer.
+ * @count
+ *     The number of bytes to advance.  Must be > 0.
+ * @next_hashes
+ *     The precomputed hash codes for the sequence beginning at @in_next.
+ *     These will be used and then updated with the precomputed hashcodes for
+ *     the sequence beginning at @in_next + @count.
+ */
+static forceinline void
+hc_matchfinder_skip_bytes(struct hc_matchfinder * const restrict mf,
+                         const u8 ** const restrict in_base_p,
+                         const u8 *in_next,
+                         const u8 * const in_end,
+                         const u32 count,
+                         u32 * const restrict next_hashes)
+{
+       u32 cur_pos;
+       u32 hash3, hash4;
+       u32 next_hashseq;
+       u32 remaining = count;
+
+       if (unlikely(count + 5 > in_end - in_next))
+               return;
+
+       cur_pos = in_next - *in_base_p;
+       hash3 = next_hashes[0];
+       hash4 = next_hashes[1];
+       do {
+               if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
+                       hc_matchfinder_slide_window(mf);
+                       *in_base_p += MATCHFINDER_WINDOW_SIZE;
+                       cur_pos = 0;
+               }
+               mf->hash3_tab[hash3] = cur_pos;
+               mf->next_tab[cur_pos] = mf->hash4_tab[hash4];
+               mf->hash4_tab[hash4] = cur_pos;
+
+               next_hashseq = get_unaligned_le32(++in_next);
+               hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
+               hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
+               cur_pos++;
+       } while (--remaining);
+
+       prefetchw(&mf->hash3_tab[hash3]);
+       prefetchw(&mf->hash4_tab[hash4]);
+       next_hashes[0] = hash3;
+       next_hashes[1] = hash4;
+}
+
+#endif /* LIB_HC_MATCHFINDER_H */
diff --git a/src/3rdparty/libdeflate/lib/ht_matchfinder.h b/src/3rdparty/libdeflate/lib/ht_matchfinder.h
new file mode 100644 (file)
index 0000000..e8323c3
--- /dev/null
@@ -0,0 +1,234 @@
+/*
+ * ht_matchfinder.h - Lempel-Ziv matchfinding with a hash table
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ---------------------------------------------------------------------------
+ *
+ * This is a Hash Table (ht) matchfinder.
+ *
+ * This is a variant of the Hash Chains (hc) matchfinder that is optimized for
+ * very fast compression.  The ht_matchfinder stores the hash chains inline in
+ * the hash table, whereas the hc_matchfinder stores them in a separate array.
+ * Storing the hash chains inline is the faster method when max_search_depth
+ * (the maximum chain length) is very small.  It is not appropriate when
+ * max_search_depth is larger, as then it uses too much memory.
+ *
+ * Due to its focus on speed, the ht_matchfinder doesn't support length 3
+ * matches.  It also doesn't allow max_search_depth to vary at runtime; it is
+ * fixed at build time as HT_MATCHFINDER_BUCKET_SIZE.
+ *
+ * See hc_matchfinder.h for more information.
+ */
+
+#ifndef LIB_HT_MATCHFINDER_H
+#define LIB_HT_MATCHFINDER_H
+
+#include "matchfinder_common.h"
+
+#define HT_MATCHFINDER_HASH_ORDER      15
+#define HT_MATCHFINDER_BUCKET_SIZE     2
+
+#define HT_MATCHFINDER_MIN_MATCH_LEN   4
+/* Minimum value of max_len for ht_matchfinder_longest_match() */
+#define HT_MATCHFINDER_REQUIRED_NBYTES 5
+
+struct ht_matchfinder {
+       mf_pos_t hash_tab[1UL << HT_MATCHFINDER_HASH_ORDER]
+                        [HT_MATCHFINDER_BUCKET_SIZE];
+} MATCHFINDER_ALIGNED;
+
+static forceinline void
+ht_matchfinder_init(struct ht_matchfinder *mf)
+{
+       STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+       matchfinder_init((mf_pos_t *)mf, sizeof(*mf));
+}
+
+static forceinline void
+ht_matchfinder_slide_window(struct ht_matchfinder *mf)
+{
+       matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf));
+}
+
+/* Note: max_len must be >= HT_MATCHFINDER_REQUIRED_NBYTES */
+static forceinline u32
+ht_matchfinder_longest_match(struct ht_matchfinder * const restrict mf,
+                            const u8 ** const restrict in_base_p,
+                            const u8 * const restrict in_next,
+                            const u32 max_len,
+                            const u32 nice_len,
+                            u32 * const restrict next_hash,
+                            u32 * const restrict offset_ret)
+{
+       u32 best_len = 0;
+       const u8 *best_matchptr = in_next;
+       u32 cur_pos = in_next - *in_base_p;
+       const u8 *in_base;
+       mf_pos_t cutoff;
+       u32 hash;
+       u32 seq;
+       mf_pos_t cur_node;
+       const u8 *matchptr;
+#if HT_MATCHFINDER_BUCKET_SIZE > 1
+       mf_pos_t to_insert;
+       u32 len;
+#endif
+#if HT_MATCHFINDER_BUCKET_SIZE > 2
+       int i;
+#endif
+
+       /* This is assumed throughout this function. */
+       STATIC_ASSERT(HT_MATCHFINDER_MIN_MATCH_LEN == 4);
+
+       if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
+               ht_matchfinder_slide_window(mf);
+               *in_base_p += MATCHFINDER_WINDOW_SIZE;
+               cur_pos = 0;
+       }
+       in_base = *in_base_p;
+       cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
+
+       hash = *next_hash;
+       STATIC_ASSERT(HT_MATCHFINDER_REQUIRED_NBYTES == 5);
+       *next_hash = lz_hash(get_unaligned_le32(in_next + 1),
+                            HT_MATCHFINDER_HASH_ORDER);
+       seq = load_u32_unaligned(in_next);
+       prefetchw(&mf->hash_tab[*next_hash]);
+#if HT_MATCHFINDER_BUCKET_SIZE == 1
+       /* Hand-unrolled version for BUCKET_SIZE == 1 */
+       cur_node = mf->hash_tab[hash][0];
+       mf->hash_tab[hash][0] = cur_pos;
+       if (cur_node <= cutoff)
+               goto out;
+       matchptr = &in_base[cur_node];
+       if (load_u32_unaligned(matchptr) == seq) {
+               best_len = lz_extend(in_next, matchptr, 4, max_len);
+               best_matchptr = matchptr;
+       }
+#elif HT_MATCHFINDER_BUCKET_SIZE == 2
+       /*
+        * Hand-unrolled version for BUCKET_SIZE == 2.  The logic here also
+        * differs slightly in that it copies the first entry to the second even
+        * if nice_len is reached on the first, as this can be slightly faster.
+        */
+       cur_node = mf->hash_tab[hash][0];
+       mf->hash_tab[hash][0] = cur_pos;
+       if (cur_node <= cutoff)
+               goto out;
+       matchptr = &in_base[cur_node];
+
+       to_insert = cur_node;
+       cur_node = mf->hash_tab[hash][1];
+       mf->hash_tab[hash][1] = to_insert;
+
+       if (load_u32_unaligned(matchptr) == seq) {
+               best_len = lz_extend(in_next, matchptr, 4, max_len);
+               best_matchptr = matchptr;
+               if (cur_node <= cutoff || best_len >= nice_len)
+                       goto out;
+               matchptr = &in_base[cur_node];
+               if (load_u32_unaligned(matchptr) == seq &&
+                   load_u32_unaligned(matchptr + best_len - 3) ==
+                   load_u32_unaligned(in_next + best_len - 3)) {
+                       len = lz_extend(in_next, matchptr, 4, max_len);
+                       if (len > best_len) {
+                               best_len = len;
+                               best_matchptr = matchptr;
+                       }
+               }
+       } else {
+               if (cur_node <= cutoff)
+                       goto out;
+               matchptr = &in_base[cur_node];
+               if (load_u32_unaligned(matchptr) == seq) {
+                       best_len = lz_extend(in_next, matchptr, 4, max_len);
+                       best_matchptr = matchptr;
+               }
+       }
+#else
+       /* Generic version for HT_MATCHFINDER_BUCKET_SIZE > 2 */
+       to_insert = cur_pos;
+       for (i = 0; i < HT_MATCHFINDER_BUCKET_SIZE; i++) {
+               cur_node = mf->hash_tab[hash][i];
+               mf->hash_tab[hash][i] = to_insert;
+               if (cur_node <= cutoff)
+                       goto out;
+               matchptr = &in_base[cur_node];
+               if (load_u32_unaligned(matchptr) == seq) {
+                       len = lz_extend(in_next, matchptr, 4, max_len);
+                       if (len > best_len) {
+                               best_len = len;
+                               best_matchptr = matchptr;
+                               if (best_len >= nice_len)
+                                       goto out;
+                       }
+               }
+               to_insert = cur_node;
+       }
+#endif
+out:
+       *offset_ret = in_next - best_matchptr;
+       return best_len;
+}
+
+static forceinline void
+ht_matchfinder_skip_bytes(struct ht_matchfinder * const restrict mf,
+                         const u8 ** const restrict in_base_p,
+                         const u8 *in_next,
+                         const u8 * const in_end,
+                         const u32 count,
+                         u32 * const restrict next_hash)
+{
+       s32 cur_pos = in_next - *in_base_p;
+       u32 hash;
+       u32 remaining = count;
+       int i;
+
+       if (unlikely(count + HT_MATCHFINDER_REQUIRED_NBYTES > in_end - in_next))
+               return;
+
+       if (cur_pos + count - 1 >= MATCHFINDER_WINDOW_SIZE) {
+               ht_matchfinder_slide_window(mf);
+               *in_base_p += MATCHFINDER_WINDOW_SIZE;
+               cur_pos -= MATCHFINDER_WINDOW_SIZE;
+       }
+
+       hash = *next_hash;
+       do {
+               for (i = HT_MATCHFINDER_BUCKET_SIZE - 1; i > 0; i--)
+                       mf->hash_tab[hash][i] = mf->hash_tab[hash][i - 1];
+               mf->hash_tab[hash][0] = cur_pos;
+
+               hash = lz_hash(get_unaligned_le32(++in_next),
+                              HT_MATCHFINDER_HASH_ORDER);
+               cur_pos++;
+       } while (--remaining);
+
+       prefetchw(&mf->hash_tab[hash]);
+       *next_hash = hash;
+}
+
+#endif /* LIB_HT_MATCHFINDER_H */
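
To make the "hash chains stored inline in the hash table" idea concrete, here is a self-contained toy with a two-entry bucket per hash, in the spirit of HT_MATCHFINDER_BUCKET_SIZE above: lookups verify each candidate with a 4-byte compare and then extend the match, and inserts shift the older entry down a slot. All names and sizes are illustrative, there is no sliding window, and this is not the library's API.

/*
 * Toy inline-bucket matchfinder -- illustration only, not the library's API.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HASH_ORDER   12
#define BUCKET_SIZE  2                  /* like HT_MATCHFINDER_BUCKET_SIZE */

static int32_t bucket_tab[1 << HASH_ORDER][BUCKET_SIZE];

static uint32_t hash4(const uint8_t *p)
{
        uint32_t seq = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
        return (uint32_t)(seq * 0x1E35A7BDu) >> (32 - HASH_ORDER);
}

/* Report the best match for position 'pos', then insert 'pos' into the bucket. */
static uint32_t longest_match(const uint8_t *buf, size_t len, size_t pos,
                              size_t *offset_ret)
{
        uint32_t h = hash4(&buf[pos]);
        uint32_t best_len = 0;
        int i;

        for (i = 0; i < BUCKET_SIZE; i++) {
                int32_t cand = bucket_tab[h][i];
                if (cand < 0)
                        break;                          /* empty slot */
                if (memcmp(&buf[cand], &buf[pos], 4) == 0) {
                        uint32_t l = 4;
                        while (pos + l < len && buf[cand + l] == buf[pos + l])
                                l++;                    /* like lz_extend() */
                        if (l > best_len) {
                                best_len = l;
                                *offset_ret = pos - (size_t)cand;
                        }
                }
        }
        /* Insert: newest position goes in slot 0, older entries shift down. */
        for (i = BUCKET_SIZE - 1; i > 0; i--)
                bucket_tab[h][i] = bucket_tab[h][i - 1];
        bucket_tab[h][0] = (int32_t)pos;
        return best_len;
}

int main(void)
{
        const uint8_t buf[] = "abcdefgh_abcdefgh";
        size_t off = 0, pos;
        uint32_t len;

        memset(bucket_tab, 0xFF, sizeof(bucket_tab));   /* all slots = -1 (empty) */
        for (pos = 0; pos + 4 <= sizeof(buf) - 1; pos++) {
                len = longest_match(buf, sizeof(buf) - 1, pos, &off);
                if (len)
                        printf("pos %zu: match of length %u at offset %zu\n",
                               pos, (unsigned)len, off);
        }
        return 0;
}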
diff --git a/src/3rdparty/libdeflate/lib/lib_common.h b/src/3rdparty/libdeflate/lib/lib_common.h
new file mode 100644 (file)
index 0000000..2eea56c
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * lib_common.h - internal header included by all library code
+ */
+
+#ifndef LIB_LIB_COMMON_H
+#define LIB_LIB_COMMON_H
+
+#ifdef LIBDEFLATE_H
+#  error "lib_common.h must always be included before libdeflate.h"
+   /* because BUILDING_LIBDEFLATE must be set first */
+#endif
+
+#define BUILDING_LIBDEFLATE
+
+#include "../common/common_defs.h"
+
+/*
+ * Prefix with "_libdeflate_" all global symbols which are not part of the API
+ * and don't already have a "libdeflate" prefix.  This avoids exposing overly
+ * generic names when libdeflate is built as a static library.
+ *
+ * Note that the chosen prefix is not really important and can be changed
+ * without breaking library users.  It was just chosen so that the resulting
+ * symbol names are unlikely to conflict with those from any other software.
+ * Also note that this fixup has no useful effect when libdeflate is built as a
+ * shared library, since these symbols are not exported.
+ */
+#define SYM_FIXUP(sym)                 _libdeflate_##sym
+#define deflate_get_compression_level  SYM_FIXUP(deflate_get_compression_level)
+#define _cpu_features                  SYM_FIXUP(_cpu_features)
+#define setup_cpu_features             SYM_FIXUP(setup_cpu_features)
+
+void *libdeflate_malloc(size_t size);
+void libdeflate_free(void *ptr);
+
+void *libdeflate_aligned_malloc(size_t alignment, size_t size);
+void libdeflate_aligned_free(void *ptr);
+
+#ifdef FREESTANDING
+/*
+ * With -ffreestanding, <string.h> may be missing, and we must provide
+ * implementations of memset(), memcpy(), memmove(), and memcmp().
+ * See https://gcc.gnu.org/onlinedocs/gcc/Standards.html
+ *
+ * Also, -ffreestanding disables interpreting calls to these functions as
+ * built-ins.  E.g., calling memcpy(&v, p, WORDBYTES) will make a function call,
+ * not be optimized to a single load instruction.  For performance reasons we
+ * don't want that.  So, declare these functions as macros that expand to the
+ * corresponding built-ins.  This approach is recommended in the gcc man page.
+ * We still need the actual function definitions in case gcc calls them.
+ */
+void *memset(void *s, int c, size_t n);
+#define memset(s, c, n)                __builtin_memset((s), (c), (n))
+
+void *memcpy(void *dest, const void *src, size_t n);
+#define memcpy(dest, src, n)   __builtin_memcpy((dest), (src), (n))
+
+void *memmove(void *dest, const void *src, size_t n);
+#define memmove(dest, src, n)  __builtin_memmove((dest), (src), (n))
+
+int memcmp(const void *s1, const void *s2, size_t n);
+#define memcmp(s1, s2, n)      __builtin_memcmp((s1), (s2), (n))
+#else
+#include <string.h>
+#endif
+
+#endif /* LIB_LIB_COMMON_H */
diff --git a/src/3rdparty/libdeflate/lib/matchfinder_common.h b/src/3rdparty/libdeflate/lib/matchfinder_common.h
new file mode 100644 (file)
index 0000000..5aa3325
--- /dev/null
@@ -0,0 +1,179 @@
+/*
+ * matchfinder_common.h - common code for Lempel-Ziv matchfinding
+ */
+
+#ifndef LIB_MATCHFINDER_COMMON_H
+#define LIB_MATCHFINDER_COMMON_H
+
+#include "lib_common.h"
+#include "unaligned.h"
+
+#ifndef MATCHFINDER_WINDOW_ORDER
+#  error "MATCHFINDER_WINDOW_ORDER must be defined!"
+#endif
+
+#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER)
+
+typedef s16 mf_pos_t;
+
+#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE)
+
+/*
+ * Required alignment of the matchfinder buffer pointer and size.  The values
+ * here come from the AVX-2 implementation, which is the worst case.
+ */
+#define MATCHFINDER_MEM_ALIGNMENT      32
+#define MATCHFINDER_SIZE_ALIGNMENT     128
+
+#undef matchfinder_init
+#undef matchfinder_rebase
+#ifdef _aligned_attribute
+#  define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT)
+#  if defined(__arm__) || defined(__aarch64__)
+#    include "arm/matchfinder_impl.h"
+#  elif defined(__i386__) || defined(__x86_64__)
+#    include "x86/matchfinder_impl.h"
+#  endif
+#else
+#  define MATCHFINDER_ALIGNED
+#endif
+
+/*
+ * Initialize the hash table portion of the matchfinder.
+ *
+ * Essentially, this is an optimized memset().
+ *
+ * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and
+ * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT.
+ */
+#ifndef matchfinder_init
+static forceinline void
+matchfinder_init(mf_pos_t *data, size_t size)
+{
+       size_t num_entries = size / sizeof(*data);
+       size_t i;
+
+       for (i = 0; i < num_entries; i++)
+               data[i] = MATCHFINDER_INITVAL;
+}
+#endif
+
+/*
+ * Slide the matchfinder by WINDOW_SIZE bytes.
+ *
+ * This must be called just after each WINDOW_SIZE bytes have been run through
+ * the matchfinder.
+ *
+ * This will subtract WINDOW_SIZE bytes from each entry in the array specified.
+ * The effect is that all entries are updated to be relative to the current
+ * position, rather than the position WINDOW_SIZE bytes prior.
+ *
+ * Underflow is detected and replaced with signed saturation.  This ensures that
+ * once the sliding window has passed over a position, that position forever
+ * remains out of bounds.
+ *
+ * The array passed in must contain all matchfinder data that is
+ * position-relative.  Concretely, this will include the hash table as well as
+ * the table of positions that is used to link together the sequences in each
+ * hash bucket.  Note that in the latter table, the links are 1-ary in the case
+ * of "hash chains", and 2-ary in the case of "binary trees".  In either case,
+ * the links need to be rebased in the same way.
+ *
+ * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and
+ * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT.
+ */
+#ifndef matchfinder_rebase
+static forceinline void
+matchfinder_rebase(mf_pos_t *data, size_t size)
+{
+       size_t num_entries = size / sizeof(*data);
+       size_t i;
+
+       if (MATCHFINDER_WINDOW_SIZE == 32768) {
+               /* Branchless version for 32768 byte windows.  If the value was
+                * already negative, clear all bits except the sign bit; this
+                * changes the value to -32768.  Otherwise, set the sign bit;
+                * this is equivalent to subtracting 32768.  */
+               for (i = 0; i < num_entries; i++) {
+                       u16 v = data[i];
+                       u16 sign_bit = v & 0x8000;
+                       v &= sign_bit - ((sign_bit >> 15) ^ 1);
+                       v |= 0x8000;
+                       data[i] = v;
+               }
+               return;
+       }
+
+       for (i = 0; i < num_entries; i++) {
+               if (data[i] >= 0)
+                       data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
+               else
+                       data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
+       }
+}
+#endif
+
+/*
+ * The hash function: given a sequence prefix held in the low-order bits of a
+ * 32-bit value, multiply by a carefully-chosen large constant.  Discard any
+ * bits of the product that don't fit in a 32-bit value, but take the
+ * next-highest @num_bits bits of the product as the hash value, as those have
+ * the most randomness.
+ */
+static forceinline u32
+lz_hash(u32 seq, unsigned num_bits)
+{
+       return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits);
+}
+
+/*
+ * Return the number of bytes at @matchptr that match the bytes at @strptr, up
+ * to a maximum of @max_len.  Initially, @start_len bytes are matched.
+ */
+static forceinline unsigned
+lz_extend(const u8 * const strptr, const u8 * const matchptr,
+         const unsigned start_len, const unsigned max_len)
+{
+       unsigned len = start_len;
+       machine_word_t v_word;
+
+       if (UNALIGNED_ACCESS_IS_FAST) {
+
+               if (likely(max_len - len >= 4 * WORDBYTES)) {
+
+               #define COMPARE_WORD_STEP                               \
+                       v_word = load_word_unaligned(&matchptr[len]) ^  \
+                                load_word_unaligned(&strptr[len]);     \
+                       if (v_word != 0)                                \
+                               goto word_differs;                      \
+                       len += WORDBYTES;                               \
+
+                       COMPARE_WORD_STEP
+                       COMPARE_WORD_STEP
+                       COMPARE_WORD_STEP
+                       COMPARE_WORD_STEP
+               #undef COMPARE_WORD_STEP
+               }
+
+               while (len + WORDBYTES <= max_len) {
+                       v_word = load_word_unaligned(&matchptr[len]) ^
+                                load_word_unaligned(&strptr[len]);
+                       if (v_word != 0)
+                               goto word_differs;
+                       len += WORDBYTES;
+               }
+       }
+
+       while (len < max_len && matchptr[len] == strptr[len])
+               len++;
+       return len;
+
+word_differs:
+       if (CPU_IS_LITTLE_ENDIAN())
+               len += (bsfw(v_word) >> 3);
+       else
+               len += (WORDBITS - 1 - bsrw(v_word)) >> 3;
+       return len;
+}
+
+#endif /* LIB_MATCHFINDER_COMMON_H */
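
The branchless path in matchfinder_rebase() relies on a bit trick: for a negative entry, everything but the sign bit is cleared (yielding -32768), while for a non-negative entry, setting the sign bit is the two's-complement equivalent of subtracting 32768. A standalone check that this matches a plain saturating subtraction for every 16-bit value; it leans on the same two's-complement conversion behaviour that the library itself assumes.

/*
 * Standalone check (not part of the patch) that the branchless 32768-byte
 * rebase above computes max(v - 32768, -32768) for every 16-bit value.
 * Assumes two's-complement wrap-around on the int16_t conversion.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static int16_t rebase_branchless(int16_t x)
{
        uint16_t v = (uint16_t)x;
        uint16_t sign_bit = v & 0x8000;

        v &= sign_bit - ((sign_bit >> 15) ^ 1);  /* negative -> keep only sign bit */
        v |= 0x8000;                             /* non-negative -> subtract 32768 */
        return (int16_t)v;
}

static int16_t rebase_reference(int16_t x)
{
        int32_t r = (int32_t)x - 32768;

        return (int16_t)(r < -32768 ? -32768 : r);
}

int main(void)
{
        int32_t x;

        for (x = -32768; x <= 32767; x++)
                assert(rebase_branchless((int16_t)x) == rebase_reference((int16_t)x));
        printf("branchless rebase matches saturating subtraction for all values\n");
        return 0;
}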
diff --git a/src/3rdparty/libdeflate/lib/unaligned.h b/src/3rdparty/libdeflate/lib/unaligned.h
new file mode 100644 (file)
index 0000000..bb48bf8
--- /dev/null
@@ -0,0 +1,228 @@
+/*
+ * unaligned.h - inline functions for unaligned memory accesses
+ */
+
+#ifndef LIB_UNALIGNED_H
+#define LIB_UNALIGNED_H
+
+#include "lib_common.h"
+
+/***** Unaligned loads and stores without endianness conversion *****/
+
+/*
+ * memcpy() is portable, and it usually gets optimized appropriately by modern
+ * compilers.  I.e., each memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled
+ * to a load or store instruction, not to an actual function call.
+ *
+ * We no longer use the "packed struct" approach, as that is nonstandard, has
+ * unclear semantics, and doesn't receive enough testing
+ * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994).
+ *
+ * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception
+ * where memcpy() generates inefficient code
+ * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366).  However, we no longer
+ * consider that one case important enough to maintain different code for.
+ * If you run into it, please just use a newer version of gcc (or use clang).
+ */
+
+#define DEFINE_UNALIGNED_TYPE(type)                            \
+static forceinline type                                                \
+load_##type##_unaligned(const void *p)                         \
+{                                                              \
+       type v;                                                 \
+       memcpy(&v, p, sizeof(v));                               \
+       return v;                                               \
+}                                                              \
+                                                               \
+static forceinline void                                                \
+store_##type##_unaligned(type v, void *p)                      \
+{                                                              \
+       memcpy(p, &v, sizeof(v));                               \
+}
+
+DEFINE_UNALIGNED_TYPE(u16)
+DEFINE_UNALIGNED_TYPE(u32)
+DEFINE_UNALIGNED_TYPE(u64)
+DEFINE_UNALIGNED_TYPE(machine_word_t)
+
+#define load_word_unaligned    load_machine_word_t_unaligned
+#define store_word_unaligned   store_machine_word_t_unaligned
+
+/***** Unaligned loads with endianness conversion *****/
+
+static forceinline u16
+get_unaligned_le16(const u8 *p)
+{
+       if (UNALIGNED_ACCESS_IS_FAST)
+               return le16_bswap(load_u16_unaligned(p));
+       else
+               return ((u16)p[1] << 8) | p[0];
+}
+
+static forceinline u16
+get_unaligned_be16(const u8 *p)
+{
+       if (UNALIGNED_ACCESS_IS_FAST)
+               return be16_bswap(load_u16_unaligned(p));
+       else
+               return ((u16)p[0] << 8) | p[1];
+}
+
+static forceinline u32
+get_unaligned_le32(const u8 *p)
+{
+       if (UNALIGNED_ACCESS_IS_FAST)
+               return le32_bswap(load_u32_unaligned(p));
+       else
+               return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
+                       ((u32)p[1] << 8) | p[0];
+}
+
+static forceinline u32
+get_unaligned_be32(const u8 *p)
+{
+       if (UNALIGNED_ACCESS_IS_FAST)
+               return be32_bswap(load_u32_unaligned(p));
+       else
+               return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
+                       ((u32)p[2] << 8) | p[3];
+}
+
+static forceinline u64
+get_unaligned_le64(const u8 *p)
+{
+       if (UNALIGNED_ACCESS_IS_FAST)
+               return le64_bswap(load_u64_unaligned(p));
+       else
+               return ((u64)p[7] << 56) | ((u64)p[6] << 48) |
+                       ((u64)p[5] << 40) | ((u64)p[4] << 32) |
+                       ((u64)p[3] << 24) | ((u64)p[2] << 16) |
+                       ((u64)p[1] << 8) | p[0];
+}
+
+static forceinline machine_word_t
+get_unaligned_leword(const u8 *p)
+{
+       STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+       if (WORDBITS == 32)
+               return get_unaligned_le32(p);
+       else
+               return get_unaligned_le64(p);
+}
+
+/***** Unaligned stores with endianness conversion *****/
+
+static forceinline void
+put_unaligned_le16(u16 v, u8 *p)
+{
+       if (UNALIGNED_ACCESS_IS_FAST) {
+               store_u16_unaligned(le16_bswap(v), p);
+       } else {
+               p[0] = (u8)(v >> 0);
+               p[1] = (u8)(v >> 8);
+       }
+}
+
+static forceinline void
+put_unaligned_be16(u16 v, u8 *p)
+{
+       if (UNALIGNED_ACCESS_IS_FAST) {
+               store_u16_unaligned(be16_bswap(v), p);
+       } else {
+               p[0] = (u8)(v >> 8);
+               p[1] = (u8)(v >> 0);
+       }
+}
+
+static forceinline void
+put_unaligned_le32(u32 v, u8 *p)
+{
+       if (UNALIGNED_ACCESS_IS_FAST) {
+               store_u32_unaligned(le32_bswap(v), p);
+       } else {
+               p[0] = (u8)(v >> 0);
+               p[1] = (u8)(v >> 8);
+               p[2] = (u8)(v >> 16);
+               p[3] = (u8)(v >> 24);
+       }
+}
+
+static forceinline void
+put_unaligned_be32(u32 v, u8 *p)
+{
+       if (UNALIGNED_ACCESS_IS_FAST) {
+               store_u32_unaligned(be32_bswap(v), p);
+       } else {
+               p[0] = (u8)(v >> 24);
+               p[1] = (u8)(v >> 16);
+               p[2] = (u8)(v >> 8);
+               p[3] = (u8)(v >> 0);
+       }
+}
+
+static forceinline void
+put_unaligned_le64(u64 v, u8 *p)
+{
+       if (UNALIGNED_ACCESS_IS_FAST) {
+               store_u64_unaligned(le64_bswap(v), p);
+       } else {
+               p[0] = (u8)(v >> 0);
+               p[1] = (u8)(v >> 8);
+               p[2] = (u8)(v >> 16);
+               p[3] = (u8)(v >> 24);
+               p[4] = (u8)(v >> 32);
+               p[5] = (u8)(v >> 40);
+               p[6] = (u8)(v >> 48);
+               p[7] = (u8)(v >> 56);
+       }
+}
+
+static forceinline void
+put_unaligned_leword(machine_word_t v, u8 *p)
+{
+       STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+       if (WORDBITS == 32)
+               put_unaligned_le32(v, p);
+       else
+               put_unaligned_le64(v, p);
+}
+
+/***** 24-bit loads *****/
+
+/*
+ * Given a 32-bit value that was loaded with the platform's native endianness,
+ * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24
+ * bits contain the first 3 bytes at the memory location from which the input
+ * 32-bit value was loaded, arranged in a platform-dependent octet order.
+ */
+static forceinline u32
+loaded_u32_to_u24(u32 v)
+{
+       if (CPU_IS_LITTLE_ENDIAN())
+               return v & 0xFFFFFF;
+       else
+               return v >> 8;
+}
+
+/*
+ * Load the next 3 bytes from the memory location @p into the 24 low-order bits
+ * of a 32-bit value.  The order in which the 3 bytes will be arranged as octets
+ * in the 24 bits is platform-dependent.  At least LOAD_U24_REQUIRED_NBYTES
+ * bytes must be available at @p; note that this may be more than 3.
+ */
+static forceinline u32
+load_u24_unaligned(const u8 *p)
+{
+#if UNALIGNED_ACCESS_IS_FAST
+#  define LOAD_U24_REQUIRED_NBYTES 4
+       return loaded_u32_to_u24(load_u32_unaligned(p));
+#else
+#  define LOAD_U24_REQUIRED_NBYTES 3
+       if (CPU_IS_LITTLE_ENDIAN())
+               return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
+       else
+               return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16);
+#endif
+}
+
+#endif /* LIB_UNALIGNED_H */
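
The header above builds everything on the memcpy idiom because dereferencing a misaligned uint32_t * is undefined behaviour, while a fixed-size memcpy is well defined and is typically compiled down to a single load or store. A minimal sketch of that idiom in isolation:

/*
 * Portable unaligned 32-bit load (native byte order), illustration only.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t load_u32(const void *p)
{
        uint32_t v;

        memcpy(&v, p, sizeof(v));       /* any alignment is fine */
        return v;
}

int main(void)
{
        uint8_t buf[8] = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88 };

        /* &buf[1] is almost certainly not 4-byte aligned; this is still valid. */
        printf("0x%08x\n", load_u32(&buf[1]));
        return 0;
}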
diff --git a/src/3rdparty/libdeflate/lib/utils.c b/src/3rdparty/libdeflate/lib/utils.c
new file mode 100644 (file)
index 0000000..c626af1
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ * utils.c - utility functions for libdeflate
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "lib_common.h"
+
+#include "libdeflate.h"
+
+#ifdef FREESTANDING
+#  define malloc NULL
+#  define free NULL
+#else
+#  include <stdlib.h>
+#endif
+
+static void *(*libdeflate_malloc_func)(size_t) = malloc;
+static void (*libdeflate_free_func)(void *) = free;
+
+void *
+libdeflate_malloc(size_t size)
+{
+       return (*libdeflate_malloc_func)(size);
+}
+
+void
+libdeflate_free(void *ptr)
+{
+       (*libdeflate_free_func)(ptr);
+}
+
+void *
+libdeflate_aligned_malloc(size_t alignment, size_t size)
+{
+       void *ptr = libdeflate_malloc(sizeof(void *) + alignment - 1 + size);
+       if (ptr) {
+               void *orig_ptr = ptr;
+               ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
+               ((void **)ptr)[-1] = orig_ptr;
+       }
+       return ptr;
+}
+
+void
+libdeflate_aligned_free(void *ptr)
+{
+       if (ptr)
+               libdeflate_free(((void **)ptr)[-1]);
+}
+
+LIBDEFLATEEXPORT void LIBDEFLATEAPI
+libdeflate_set_memory_allocator(void *(*malloc_func)(size_t),
+                               void (*free_func)(void *))
+{
+       libdeflate_malloc_func = malloc_func;
+       libdeflate_free_func = free_func;
+}
+
+/*
+ * Implementations of libc functions for freestanding library builds.
+ * Normal library builds don't use these.  Not optimized yet; usually the
+ * compiler expands these functions and doesn't actually call them anyway.
+ */
+#ifdef FREESTANDING
+#undef memset
+void * __attribute__((weak))
+memset(void *s, int c, size_t n)
+{
+       u8 *p = s;
+       size_t i;
+
+       for (i = 0; i < n; i++)
+               p[i] = c;
+       return s;
+}
+
+#undef memcpy
+void * __attribute__((weak))
+memcpy(void *dest, const void *src, size_t n)
+{
+       u8 *d = dest;
+       const u8 *s = src;
+       size_t i;
+
+       for (i = 0; i < n; i++)
+               d[i] = s[i];
+       return dest;
+}
+
+#undef memmove
+void * __attribute__((weak))
+memmove(void *dest, const void *src, size_t n)
+{
+       u8 *d = dest;
+       const u8 *s = src;
+       size_t i;
+
+       if (d <= s)
+               return memcpy(d, s, n);
+
+       for (i = n; i > 0; i--)
+               d[i - 1] = s[i - 1];
+       return dest;
+}
+
+#undef memcmp
+int __attribute__((weak))
+memcmp(const void *s1, const void *s2, size_t n)
+{
+       const u8 *p1 = s1;
+       const u8 *p2 = s2;
+       size_t i;
+
+       for (i = 0; i < n; i++) {
+               if (p1[i] != p2[i])
+                       return (int)p1[i] - (int)p2[i];
+       }
+       return 0;
+}
+#endif /* FREESTANDING */
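
libdeflate_aligned_malloc() above over-allocates, rounds the pointer up to the requested alignment, and stashes the pointer returned by the allocator in the slot just before the aligned block so that libdeflate_aligned_free() can recover it. A self-contained sketch of the same bookkeeping, with ROUND_UP standing in for the library's ALIGN() helper and assuming a power-of-two alignment:

/*
 * Aligned allocation bookkeeping, illustration only.
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* Round x up to a multiple of a; a must be a power of two. */
#define ROUND_UP(x, a)  (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

static void *aligned_malloc(size_t alignment, size_t size)
{
        void *orig = malloc(sizeof(void *) + alignment - 1 + size);
        void *ptr;

        if (!orig)
                return NULL;
        ptr = (void *)ROUND_UP((uintptr_t)orig + sizeof(void *), alignment);
        ((void **)ptr)[-1] = orig;      /* remember what to pass to free() */
        return ptr;
}

static void aligned_free(void *ptr)
{
        if (ptr)
                free(((void **)ptr)[-1]);
}

int main(void)
{
        void *p = aligned_malloc(64, 1000);

        assert(p != NULL && ((uintptr_t)p % 64) == 0);
        aligned_free(p);
        return 0;
}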
diff --git a/src/3rdparty/libdeflate/lib/x86/adler32_impl.h b/src/3rdparty/libdeflate/lib/x86/adler32_impl.h
new file mode 100644 (file)
index 0000000..f89bde5
--- /dev/null
@@ -0,0 +1,337 @@
+/*
+ * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_ADLER32_IMPL_H
+#define LIB_X86_ADLER32_IMPL_H
+
+#include "cpu_features.h"
+
+/*
+ * The following macros horizontally sum the s1 counters and add them to the
+ * real s1, and likewise for s2.  They do this via a series of reductions, each
+ * of which halves the vector length, until just one counter remains.
+ *
+ * The s1 reductions don't depend on the s2 reductions and vice versa, so for
+ * efficiency they are interleaved.  Also, every other s1 counter is 0 due to
+ * the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than
+ * 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits.
+ */
+
+#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2)                   \
+{                                                                          \
+       __v4su s1_last = (v_s1), s2_last = (v_s2);                          \
+                                                                           \
+       /* 128 => 32 bits */                                                \
+       s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x31);       \
+       s1_last += (__v4su)_mm_shuffle_epi32((__m128i)s1_last, 0x02);       \
+       s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x02);       \
+                                                                           \
+       *(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last);                  \
+       *(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last);                  \
+}
+
+#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2)                   \
+{                                                                          \
+       __v4su s1_128bit, s2_128bit;                                        \
+                                                                           \
+       /* 256 => 128 bits */                                               \
+       s1_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 0) +  \
+                   (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 1);   \
+       s2_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 0) +  \
+                   (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 1);   \
+                                                                           \
+       ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit);     \
+}
+
+#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2)                   \
+{                                                                          \
+       __v8su s1_256bit, s2_256bit;                                        \
+                                                                           \
+       /* 512 => 256 bits */                                               \
+       s1_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 0) + \
+                   (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 1);  \
+       s2_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 0) + \
+                   (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 1);  \
+                                                                           \
+       ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit);     \
+}
+
+/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */
+#undef DISPATCH_AVX512BW
+#if !defined(DEFAULT_IMPL) &&                                          \
+    /*
+     * clang before v3.9 is missing some AVX-512BW intrinsics including
+     * _mm512_sad_epu8(), a.k.a. __builtin_ia32_psadbw512.  So just make using
+     * AVX-512BW, even when __AVX512BW__ is defined, conditional on
+     * COMPILER_SUPPORTS_AVX512BW_TARGET where we check for that builtin.
+     */                                                                        \
+    COMPILER_SUPPORTS_AVX512BW_TARGET &&                               \
+    (defined(__AVX512BW__) || (X86_CPU_FEATURES_ENABLED &&             \
+                              COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS))
+#  define FUNCNAME             adler32_avx512bw
+#  define FUNCNAME_CHUNK       adler32_avx512bw_chunk
+#  define IMPL_ALIGNMENT       64
+#  define IMPL_SEGMENT_SIZE    64
+#  define IMPL_MAX_CHUNK_SIZE  MAX_CHUNK_SIZE
+#  ifdef __AVX512BW__
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL       adler32_avx512bw
+#  else
+#    define ATTRIBUTES         __attribute__((target("avx512bw")))
+#    define DISPATCH           1
+#    define DISPATCH_AVX512BW  1
+#  endif
+#  include <immintrin.h>
+static forceinline ATTRIBUTES void
+adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
+                      u32 *s1, u32 *s2)
+{
+       const __m512i zeroes = _mm512_setzero_si512();
+       const __v64qi multipliers = (__v64qi){
+               64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+               48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+               32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+               16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
+       };
+       const __v32hi ones = (__v32hi)_mm512_set1_epi16(1);
+       __v16si v_s1 = (__v16si)zeroes;
+       __v16si v_s1_sums = (__v16si)zeroes;
+       __v16si v_s2 = (__v16si)zeroes;
+
+       do {
+               /* Load the next 64-byte segment */
+               __m512i bytes = *p++;
+
+               /* Multiply the bytes by 64...1 (the number of times they need
+                * to be added to s2) and add adjacent products */
+               __v32hi sums = (__v32hi)_mm512_maddubs_epi16(
+                                               bytes, (__m512i)multipliers);
+
+               /* Keep sum of all previous s1 counters, for adding to s2 later.
+                * This allows delaying the multiplication by 64 to the end. */
+               v_s1_sums += v_s1;
+
+               /* Add the sum of each group of 8 bytes to the corresponding s1
+                * counter */
+               v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes);
+
+               /* Add the sum of each group of 4 products of the bytes by
+                * 64...1 to the corresponding s2 counter */
+               v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums,
+                                                  (__m512i)ones);
+       } while (p != end);
+
+       /* Finish the s2 counters by adding the sum of the s1 values at the
+        * beginning of each segment, multiplied by the segment size (64) */
+       v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6);
+
+       /* Add the counters to the real s1 and s2 */
+       ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
+}
+#  include "../adler32_vec_template.h"
+#endif /* AVX-512BW implementation */
+
+/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */
+#undef DISPATCH_AVX2
+#if !defined(DEFAULT_IMPL) &&  \
+       (defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED &&      \
+                              COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS))
+#  define FUNCNAME             adler32_avx2
+#  define FUNCNAME_CHUNK       adler32_avx2_chunk
+#  define IMPL_ALIGNMENT       32
+#  define IMPL_SEGMENT_SIZE    32
+#  define IMPL_MAX_CHUNK_SIZE  MAX_CHUNK_SIZE
+#  ifdef __AVX2__
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL       adler32_avx2
+#  else
+#    define ATTRIBUTES         __attribute__((target("avx2")))
+#    define DISPATCH           1
+#    define DISPATCH_AVX2      1
+#  endif
+#  include <immintrin.h>
+static forceinline ATTRIBUTES void
+adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
+{
+       const __m256i zeroes = _mm256_setzero_si256();
+       const __v32qu multipliers = (__v32qu){
+               32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+               16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
+       };
+       const __v16hu ones = (__v16hu)_mm256_set1_epi16(1);
+       __v8su v_s1 = (__v8su)zeroes;
+       __v8su v_s1_sums = (__v8su)zeroes;
+       __v8su v_s2 = (__v8su)zeroes;
+
+       do {
+               /* Load the next 32-byte segment */
+               __m256i bytes = *p++;
+
+               /* Multiply the bytes by 32...1 (the number of times they need
+                * to be added to s2) and add adjacent products */
+               __v16hu sums = (__v16hu)_mm256_maddubs_epi16(
+                                               bytes, (__m256i)multipliers);
+
+               /* Keep sum of all previous s1 counters, for adding to s2 later.
+                * This allows delaying the multiplication by 32 to the end. */
+               v_s1_sums += v_s1;
+
+               /* Add the sum of each group of 8 bytes to the corresponding s1
+                * counter */
+               v_s1 += (__v8su)_mm256_sad_epu8(bytes, zeroes);
+
+               /* Add the sum of each group of 4 products of the bytes by
+                * 32...1 to the corresponding s2 counter */
+               v_s2 += (__v8su)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
+       } while (p != end);
+
+       /* Finish the s2 counters by adding the sum of the s1 values at the
+        * beginning of each segment, multiplied by the segment size (32) */
+       v_s2 += (__v8su)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
+
+       /* Add the counters to the real s1 and s2 */
+       ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
+}
+#  include "../adler32_vec_template.h"
+#endif /* AVX2 implementation */
+
+/* SSE2 implementation */
+#undef DISPATCH_SSE2
+#if !defined(DEFAULT_IMPL) &&  \
+       (defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED &&      \
+                              COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS))
+#  define FUNCNAME             adler32_sse2
+#  define FUNCNAME_CHUNK       adler32_sse2_chunk
+#  define IMPL_ALIGNMENT       16
+#  define IMPL_SEGMENT_SIZE    32
+/*
+ * The 16-bit precision byte counters must not be allowed to undergo *signed*
+ * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16)
+ * would behave incorrectly.
+ */
+#  define IMPL_MAX_CHUNK_SIZE  (32 * (0x7FFF / 0xFF))
+#  ifdef __SSE2__
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL       adler32_sse2
+#  else
+#    define ATTRIBUTES         __attribute__((target("sse2")))
+#    define DISPATCH           1
+#    define DISPATCH_SSE2      1
+#  endif
+#  include <emmintrin.h>
+static forceinline ATTRIBUTES void
+adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
+{
+       const __m128i zeroes = _mm_setzero_si128();
+
+       /* s1 counters: 32-bit, sum of bytes */
+       __v4su v_s1 = (__v4su)zeroes;
+
+       /* s2 counters: 32-bit, sum of s1 values */
+       __v4su v_s2 = (__v4su)zeroes;
+
+       /*
+        * Thirty-two 16-bit counters for byte sums.  Each accumulates the bytes
+        * that eventually need to be multiplied by a number 32...1 for addition
+        * into s2.
+        */
+       __v8hu v_byte_sums_a = (__v8hu)zeroes;
+       __v8hu v_byte_sums_b = (__v8hu)zeroes;
+       __v8hu v_byte_sums_c = (__v8hu)zeroes;
+       __v8hu v_byte_sums_d = (__v8hu)zeroes;
+
+       do {
+               /* Load the next 32 bytes */
+               const __m128i bytes1 = *p++;
+               const __m128i bytes2 = *p++;
+
+               /*
+                * Accumulate the previous s1 counters into the s2 counters.
+                * Logically, this really should be v_s2 += v_s1 * 32, but we
+                * can do the multiplication (or left shift) later.
+                */
+               v_s2 += v_s1;
+
+               /*
+                * s1 update: use "Packed Sum of Absolute Differences" to add
+                * the bytes horizontally with 8 bytes per sum.  Then add the
+                * sums to the s1 counters.
+                */
+               v_s1 += (__v4su)_mm_sad_epu8(bytes1, zeroes);
+               v_s1 += (__v4su)_mm_sad_epu8(bytes2, zeroes);
+
+               /*
+                * Also accumulate the bytes into 32 separate counters that have
+                * 16-bit precision.
+                */
+               v_byte_sums_a += (__v8hu)_mm_unpacklo_epi8(bytes1, zeroes);
+               v_byte_sums_b += (__v8hu)_mm_unpackhi_epi8(bytes1, zeroes);
+               v_byte_sums_c += (__v8hu)_mm_unpacklo_epi8(bytes2, zeroes);
+               v_byte_sums_d += (__v8hu)_mm_unpackhi_epi8(bytes2, zeroes);
+
+       } while (p != end);
+
+       /* Finish calculating the s2 counters */
+       v_s2 = (__v4su)_mm_slli_epi32((__m128i)v_s2, 5);
+       v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_a,
+                                      (__m128i)(__v8hu){ 32, 31, 30, 29, 28, 27, 26, 25 });
+       v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_b,
+                                      (__m128i)(__v8hu){ 24, 23, 22, 21, 20, 19, 18, 17 });
+       v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_c,
+                                      (__m128i)(__v8hu){ 16, 15, 14, 13, 12, 11, 10, 9 });
+       v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_d,
+                                      (__m128i)(__v8hu){ 8,  7,  6,  5,  4,  3,  2,  1 });
+
+       /* Add the counters to the real s1 and s2 */
+       ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
+}
+#  include "../adler32_vec_template.h"
+#endif /* SSE2 implementation */
+
+#ifdef DISPATCH
+static inline adler32_func_t
+arch_select_adler32_func(void)
+{
+       u32 features = get_cpu_features();
+
+#ifdef DISPATCH_AVX512BW
+       if (features & X86_CPU_FEATURE_AVX512BW)
+               return adler32_avx512bw;
+#endif
+#ifdef DISPATCH_AVX2
+       if (features & X86_CPU_FEATURE_AVX2)
+               return adler32_avx2;
+#endif
+#ifdef DISPATCH_SSE2
+       if (features & X86_CPU_FEATURE_SSE2)
+               return adler32_sse2;
+#endif
+       return NULL;
+}
+#endif /* DISPATCH */
+
+#endif /* LIB_X86_ADLER32_IMPL_H */
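
All of the SIMD variants above accumulate the same two running sums: s1 is 1 plus the byte sum and s2 is the sum of the successive s1 values, both modulo 65521 (the Adler-32 modulus from RFC 1950), with s2 packed into the high 16 bits of the result. A naive scalar reference that pins down that definition, useful as an oracle but not meant to be fast:

/*
 * Scalar Adler-32 reference, illustration only.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t adler32_ref(const uint8_t *p, size_t n)
{
        uint32_t s1 = 1, s2 = 0;

        while (n--) {
                s1 = (s1 + *p++) % 65521;
                s2 = (s2 + s1) % 65521;
        }
        return (s2 << 16) | s1;
}

int main(void)
{
        const char *msg = "Wikipedia";

        /* Well-known test vector: Adler-32("Wikipedia") == 0x11E60398 */
        assert(adler32_ref((const uint8_t *)msg, strlen(msg)) == 0x11E60398);
        printf("0x%08x\n", adler32_ref((const uint8_t *)msg, strlen(msg)));
        return 0;
}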
diff --git a/src/3rdparty/libdeflate/lib/x86/cpu_features.c b/src/3rdparty/libdeflate/lib/x86/cpu_features.c
new file mode 100644 (file)
index 0000000..e3471d4
--- /dev/null
@@ -0,0 +1,152 @@
+/*
+ * x86/cpu_features.c - feature detection for x86 processors
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "../cpu_features_common.h" /* must be included first */
+#include "cpu_features.h"
+
+#if X86_CPU_FEATURES_ENABLED
+
+volatile u32 _cpu_features = 0;
+
+/* With old GCC versions we have to manually save and restore the x86_32 PIC
+ * register (ebx).  See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602  */
+#if defined(__i386__) && defined(__PIC__)
+#  define EBX_CONSTRAINT "=&r"
+#else
+#  define EBX_CONSTRAINT "=b"
+#endif
+
+/* Execute the CPUID instruction.  */
+static inline void
+cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
+{
+       __asm__(".ifnc %%ebx, %1; mov  %%ebx, %1; .endif\n"
+               "cpuid                                  \n"
+               ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
+               : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
+               : "a" (leaf), "c" (subleaf));
+}
+
+/* Read an extended control register.  */
+static inline u64
+read_xcr(u32 index)
+{
+       u32 edx, eax;
+
+       /* Execute the "xgetbv" instruction.  Old versions of binutils do not
+        * recognize this instruction, so list the raw bytes instead.  */
+       __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index));
+
+       return ((u64)edx << 32) | eax;
+}
+
+#undef BIT
+#define BIT(nr)                        (1UL << (nr))
+
+#define XCR0_BIT_SSE           BIT(1)
+#define XCR0_BIT_AVX           BIT(2)
+#define XCR0_BIT_OPMASK                BIT(5)
+#define XCR0_BIT_ZMM_HI256     BIT(6)
+#define XCR0_BIT_HI16_ZMM      BIT(7)
+
+#define IS_SET(reg, nr)                ((reg) & BIT(nr))
+#define IS_ALL_SET(reg, mask)  (((reg) & (mask)) == (mask))
+
+static const struct cpu_feature x86_cpu_feature_table[] = {
+       {X86_CPU_FEATURE_SSE2,          "sse2"},
+       {X86_CPU_FEATURE_PCLMUL,        "pclmul"},
+       {X86_CPU_FEATURE_AVX,           "avx"},
+       {X86_CPU_FEATURE_AVX2,          "avx2"},
+       {X86_CPU_FEATURE_BMI2,          "bmi2"},
+       {X86_CPU_FEATURE_AVX512BW,      "avx512bw"},
+};
+
+/* Initialize _cpu_features with bits for interesting processor features. */
+void setup_cpu_features(void)
+{
+       u32 features = 0;
+       u32 dummy1, dummy2, dummy3, dummy4;
+       u32 max_function;
+       u32 features_1, features_2, features_3, features_4;
+       bool os_avx_support = false;
+       bool os_avx512_support = false;
+
+       /* Get maximum supported function  */
+       cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
+       if (max_function < 1)
+               goto out;
+
+       /* Standard feature flags  */
+       cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1);
+
+       if (IS_SET(features_1, 26))
+               features |= X86_CPU_FEATURE_SSE2;
+
+       if (IS_SET(features_2, 1))
+               features |= X86_CPU_FEATURE_PCLMUL;
+
+       if (IS_SET(features_2, 27)) { /* OSXSAVE set? */
+               u64 xcr0 = read_xcr(0);
+
+               os_avx_support = IS_ALL_SET(xcr0,
+                                           XCR0_BIT_SSE |
+                                           XCR0_BIT_AVX);
+
+               os_avx512_support = IS_ALL_SET(xcr0,
+                                              XCR0_BIT_SSE |
+                                              XCR0_BIT_AVX |
+                                              XCR0_BIT_OPMASK |
+                                              XCR0_BIT_ZMM_HI256 |
+                                              XCR0_BIT_HI16_ZMM);
+       }
+
+       if (os_avx_support && IS_SET(features_2, 28))
+               features |= X86_CPU_FEATURE_AVX;
+
+       if (max_function < 7)
+               goto out;
+
+       /* Extended feature flags  */
+       cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4);
+
+       if (os_avx_support && IS_SET(features_3, 5))
+               features |= X86_CPU_FEATURE_AVX2;
+
+       if (IS_SET(features_3, 8))
+               features |= X86_CPU_FEATURE_BMI2;
+
+       if (os_avx512_support && IS_SET(features_3, 30))
+               features |= X86_CPU_FEATURE_AVX512BW;
+
+out:
+       disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
+                                        ARRAY_LEN(x86_cpu_feature_table));
+
+       _cpu_features = features | X86_CPU_FEATURES_KNOWN;
+}
+
+#endif /* X86_CPU_FEATURES_ENABLED */
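
The feature bits gathered above are consumed by dispatchers such as arch_select_adler32_func(): detect once, cache the result, then route calls through a function pointer. A compact sketch of that shape, using GCC/Clang's __builtin_cpu_supports() on x86 in place of the library's own cpuid code:

/*
 * Detect-once, dispatch-through-a-pointer pattern, illustration only.
 * Requires GCC or Clang on x86.
 */
#include <stdio.h>

static int impl_generic(void) { return 1; }
static int impl_avx2(void)    { return 2; }     /* stand-in for a SIMD version */

static int (*impl)(void);                       /* selected on first use */

static int (*select_impl(void))(void)
{
        if (__builtin_cpu_supports("avx2"))
                return impl_avx2;
        return impl_generic;
}

static int do_work(void)
{
        if (!impl)
                impl = select_impl();
        return impl();
}

int main(void)
{
        printf("using implementation %d\n", do_work());
        return 0;
}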
diff --git a/src/3rdparty/libdeflate/lib/x86/cpu_features.h b/src/3rdparty/libdeflate/lib/x86/cpu_features.h
new file mode 100644 (file)
index 0000000..4c02353
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * x86/cpu_features.h - feature detection for x86 processors
+ */
+
+#ifndef LIB_X86_CPU_FEATURES_H
+#define LIB_X86_CPU_FEATURES_H
+
+#include "../lib_common.h"
+
+#if (defined(__i386__) || defined(__x86_64__)) && \
+       COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
+#  define X86_CPU_FEATURES_ENABLED 1
+#else
+#  define X86_CPU_FEATURES_ENABLED 0
+#endif
+
+#if X86_CPU_FEATURES_ENABLED
+
+#define X86_CPU_FEATURE_SSE2           0x00000001
+#define X86_CPU_FEATURE_PCLMUL         0x00000002
+#define X86_CPU_FEATURE_AVX            0x00000004
+#define X86_CPU_FEATURE_AVX2           0x00000008
+#define X86_CPU_FEATURE_BMI2           0x00000010
+#define X86_CPU_FEATURE_AVX512BW       0x00000020
+
+#define X86_CPU_FEATURES_KNOWN         0x80000000
+
+extern volatile u32 _cpu_features;
+
+void setup_cpu_features(void);
+
+static inline u32 get_cpu_features(void)
+{
+       if (_cpu_features == 0)
+               setup_cpu_features();
+       return _cpu_features;
+}
+
+#endif /* X86_CPU_FEATURES_ENABLED */
+
+#endif /* LIB_X86_CPU_FEATURES_H */
diff --git a/src/3rdparty/libdeflate/lib/x86/crc32_impl.h b/src/3rdparty/libdeflate/lib/x86/crc32_impl.h
new file mode 100644 (file)
index 0000000..14a6867
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * x86/crc32_impl.h - x86 implementations of CRC-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_CRC32_IMPL_H
+#define LIB_X86_CRC32_IMPL_H
+
+#include "cpu_features.h"
+
+/*
+ * Include the PCLMUL/AVX implementation?  Although our PCLMUL-optimized CRC-32
+ * function doesn't use any AVX intrinsics specifically, it can benefit a lot
+ * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
+ * MB/s.  I expect this is related to the PCLMULQDQ instructions being assembled
+ * in the newer three-operand form rather than the older two-operand form.
+ *
+ * Note: this is only needed if __AVX__ is *not* defined, since otherwise the
+ * "regular" PCLMUL implementation would already be AVX enabled.
+ */
+#undef DISPATCH_PCLMUL_AVX
+#if !defined(DEFAULT_IMPL) && !defined(__AVX__) &&     \
+       X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET &&     \
+       (defined(__PCLMUL__) || COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS)
+#  define FUNCNAME             crc32_pclmul_avx
+#  define FUNCNAME_ALIGNED     crc32_pclmul_avx_aligned
+#  define ATTRIBUTES           __attribute__((target("pclmul,avx")))
+#  define DISPATCH             1
+#  define DISPATCH_PCLMUL_AVX  1
+#  include "crc32_pclmul_template.h"
+#endif
+
+/* PCLMUL implementation */
+#undef DISPATCH_PCLMUL
+#if !defined(DEFAULT_IMPL) &&  \
+       (defined(__PCLMUL__) || (X86_CPU_FEATURES_ENABLED &&    \
+                                COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS))
+#  define FUNCNAME             crc32_pclmul
+#  define FUNCNAME_ALIGNED     crc32_pclmul_aligned
+#  ifdef __PCLMUL__
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL       crc32_pclmul
+#  else
+#    define ATTRIBUTES         __attribute__((target("pclmul")))
+#    define DISPATCH           1
+#    define DISPATCH_PCLMUL    1
+#  endif
+#  include "crc32_pclmul_template.h"
+#endif
+
+#ifdef DISPATCH
+static inline crc32_func_t
+arch_select_crc32_func(void)
+{
+       u32 features = get_cpu_features();
+
+#ifdef DISPATCH_PCLMUL_AVX
+       if ((features & X86_CPU_FEATURE_PCLMUL) &&
+           (features & X86_CPU_FEATURE_AVX))
+               return crc32_pclmul_avx;
+#endif
+#ifdef DISPATCH_PCLMUL
+       if (features & X86_CPU_FEATURE_PCLMUL)
+               return crc32_pclmul;
+#endif
+       return NULL;
+}
+#endif /* DISPATCH */
+
+#endif /* LIB_X86_CRC32_IMPL_H */
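
The PCLMUL paths selected above are drop-in replacements for the generic CRC-32 used by the gzip and zlib formats. As an oracle for what any of them must compute, here is the standard bit-at-a-time version with the reflected polynomial 0xEDB88320 and the usual initial/final inversion:

/*
 * Bit-at-a-time CRC-32 reference (gzip/zlib CRC), illustration only.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_ref(uint32_t crc, const uint8_t *p, size_t n)
{
        int i;

        crc = ~crc;
        while (n--) {
                crc ^= *p++;
                for (i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320 : 0);
        }
        return ~crc;
}

int main(void)
{
        const char *msg = "123456789";

        /* Standard CRC-32 check value */
        assert(crc32_ref(0, (const uint8_t *)msg, strlen(msg)) == 0xCBF43926);
        printf("0x%08x\n", crc32_ref(0, (const uint8_t *)msg, strlen(msg)));
        return 0;
}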
diff --git a/src/3rdparty/libdeflate/lib/x86/crc32_pclmul_template.h b/src/3rdparty/libdeflate/lib/x86/crc32_pclmul_template.h
new file mode 100644 (file)
index 0000000..a5eda9b
--- /dev/null
@@ -0,0 +1,262 @@
+/*
+ * x86/crc32_pclmul_template.h
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <immintrin.h>
+
+/*
+ * CRC-32 folding with PCLMULQDQ.
+ *
+ * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
+ * producing an abbreviated message which is congruent to the original message
+ * modulo the generator polynomial G(x).
+ *
+ * Folding each 512 bits is implemented as eight 64-bit folds, each of which
+ * uses one carryless multiplication instruction.  It's expected that CPUs may
+ * be able to execute some of these multiplications in parallel.
+ *
+ * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
+ * be 95 bits from a constant distance D later in the message.  The relevant
+ * portion of the message can be written as:
+ *
+ *     M(x) = A(x)*x^D + B(x)
+ *
+ * ... where + and * represent addition and multiplication, respectively, of
+ * polynomials over GF(2).  Note that when implemented on a computer, these
+ * operations are equivalent to XOR and carryless multiplication, respectively.
+ *
+ * For the purpose of CRC calculation, only the remainder modulo the generator
+ * polynomial G(x) matters:
+ *
+ *     M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
+ *
+ * Since the modulo operation can be applied anywhere in a sequence of additions
+ * and multiplications without affecting the result, this is equivalent to:
+ *
+ *     M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
+ *
+ * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
+ * a 32-bit quantity.  So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
+ * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
+ * product.  Then, adding (XOR-ing) the product to B(x) produces a polynomial
+ * with the same length as B(x) but with the same remainder as 'A(x)*x^D +
+ * B(x)'.  This is the basic fold operation with 64 bits.
+ *
+ * Note that the carryless multiplication instruction PCLMULQDQ actually takes
+ * two 64-bit inputs and produces a 127-bit product in the low-order bits of a
+ * 128-bit XMM register.  This works fine, but care must be taken to account for
+ * "bit endianness".  With the CRC version implemented here, bits are always
+ * ordered such that the lowest-order bit represents the coefficient of highest
+ * power of x and the highest-order bit represents the coefficient of the lowest
+ * power of x.  This is backwards from the more intuitive order.  Still,
+ * carryless multiplication works essentially the same either way.  It just must
+ * be accounted for that when we XOR the 95-bit product in the low-order 95 bits
+ * of a 128-bit XMM register into 128-bits of later data held in another XMM
+ * register, we'll really be XOR-ing the product into the mathematically higher
+ * degree end of those later bits, not the lower degree end as may be expected.
+ *
+ * So given that caveat and the fact that we process 512 bits per iteration, the
+ * 'D' values we need for the two 64-bit halves of each 128 bits of data are:
+ *
+ *     D = (512 + 95) - 64      for the higher-degree half of each 128 bits,
+ *                              i.e. the lower order bits in the XMM register
+ *
+ *     D = (512 + 95) - 128     for the lower-degree half of each 128 bits,
+ *                              i.e. the higher order bits in the XMM register
+ *
+ * The required 'x^D mod G(x)' values were precomputed.
+ *
+ * When <= 512 bits remain in the message, we finish up by folding across
+ * smaller distances.  This works similarly; the distance D is just different,
+ * so different constant multipliers must be used.  Finally, once the remaining
+ * message is just 64 bits, it is reduced to the CRC-32 using Barrett reduction
+ * (explained later).
+ *
+ * For more information see the original paper from Intel:
+ *     "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ *     December 2009
+ *     http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ */
+static u32 ATTRIBUTES
+FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t nr_segs)
+{
+       /* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
+       const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 };
+       const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 };
+       const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E };
+       const __v2di final_multiplier = (__v2di){ 0xB8BC6765 };
+       const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF };
+       const __v2di barrett_reduction_constants =
+                       (__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
+
+       const __m128i * const end = p + nr_segs;
+       const __m128i * const end512 = p + (nr_segs & ~3);
+       __m128i x0, x1, x2, x3;
+
+       /*
+        * Account for the current 'remainder', i.e. the CRC of the part of the
+        * message already processed.  Explanation: rewrite the message
+        * polynomial M(x) in terms of the first part A(x), the second part
+        * B(x), and the length of the second part in bits |B(x)| >= 32:
+        *
+        *      M(x) = A(x)*x^|B(x)| + B(x)
+        *
+        * Then the CRC of M(x) is:
+        *
+        *      CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
+        *                = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
+        *                = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
+        *
+        * Note: all arithmetic is modulo G(x), the generator polynomial; that's
+        * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
+        *
+        * So the CRC of the full message is the CRC of the second part of the
+        * message where the first 32 bits of the second part of the message
+        * have been XOR'ed with the CRC of the first part of the message.
+        */
+       x0 = *p++;
+       x0 ^= (__m128i)(__v4si){ remainder };
+
+       if (p > end512) /* only 128, 256, or 384 bits of input? */
+               goto _128_bits_at_a_time;
+       x1 = *p++;
+       x2 = *p++;
+       x3 = *p++;
+
+       /* Fold 512 bits at a time */
+       for (; p != end512; p += 4) {
+               __m128i y0, y1, y2, y3;
+
+               y0 = p[0];
+               y1 = p[1];
+               y2 = p[2];
+               y3 = p[3];
+
+               /*
+                * Note: the immediate constant for PCLMULQDQ specifies which
+                * 64-bit halves of the 128-bit vectors to multiply:
+                *
+                * 0x00 means low halves (higher degree polynomial terms for us)
+                * 0x11 means high halves (lower degree polynomial terms for us)
+                */
+               y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
+               y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
+               y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
+               y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
+               y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
+               y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
+               y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
+               y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
+
+               x0 = y0;
+               x1 = y1;
+               x2 = y2;
+               x3 = y3;
+       }
+
+       /* Fold 512 bits => 128 bits */
+       x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
+       x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
+       x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
+       x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
+       x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
+       x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
+       x0 = x3;
+
+_128_bits_at_a_time:
+       while (p != end) {
+               /* Fold 128 bits into next 128 bits */
+               x1 = *p++;
+               x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
+               x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
+               x0 = x1;
+       }
+
+       /* Now there are just 128 bits left, stored in 'x0'. */
+
+       /*
+        * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
+        * which is equivalent to multiplying by x^32.  This is needed because
+        * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
+        */
+       x0 = _mm_srli_si128(x0, 8) ^
+            _mm_clmulepi64_si128(x0, multipliers_1, 0x10);
+
+       /* Fold 96 => 64 bits */
+       x0 = _mm_srli_si128(x0, 4) ^
+            _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
+
+        /*
+        * Finally, reduce 64 => 32 bits using Barrett reduction.
+        *
+        * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
+        * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
+        *
+        *      R(x) = (A(x)*x^32 + B(x)) mod G(x)
+        *           = (A(x)*x^32) mod G(x) + B(x)
+        *
+        * Then, by the Division Algorithm there exists a unique q(x) such that:
+        *
+        *      A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
+        *
+        * Since the left-hand side is of maximum degree 31, the right-hand side
+        * must be too.  This implies that we can apply 'mod x^32' to the
+        * right-hand side without changing its value:
+        *
+        *      (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
+        *
+        * Note that '+' is equivalent to '-' in polynomials over GF(2).
+        *
+        * We also know that:
+        *
+        *                    / A(x)*x^32 \
+        *      q(x) = floor (  ---------  )
+        *                    \    G(x)   /
+        *
+        * To compute this efficiently, we can multiply the top and bottom by
+        * x^32 and move the division by G(x) to the top:
+        *
+        *                    / A(x) * floor(x^64 / G(x)) \
+        *      q(x) = floor (  -------------------------  )
+        *                    \           x^32            /
+        *
+        * Note that floor(x^64 / G(x)) is a constant.
+        *
+        * So finally we have:
+        *
+        *                                / A(x) * floor(x^64 / G(x)) \
+        *      R(x) = B(x) + G(x)*floor (  -------------------------  )
+        *                                \           x^32            /
+        */
+       x1 = x0;
+       x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
+       x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
+       return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
+}
+
+#define IMPL_ALIGNMENT         16
+#define IMPL_SEGMENT_SIZE      16
+#include "../crc32_vec_template.h"
diff --git a/src/3rdparty/libdeflate/lib/x86/decompress_impl.h b/src/3rdparty/libdeflate/lib/x86/decompress_impl.h
new file mode 100644 (file)
index 0000000..de6d236
--- /dev/null
@@ -0,0 +1,31 @@
+#ifndef LIB_X86_DECOMPRESS_IMPL_H
+#define LIB_X86_DECOMPRESS_IMPL_H
+
+#include "cpu_features.h"
+
+/* Include the BMI2-optimized version? */
+#undef DISPATCH_BMI2
+#if !defined(__BMI2__) && X86_CPU_FEATURES_ENABLED && \
+       COMPILER_SUPPORTS_BMI2_TARGET
+#  define FUNCNAME     deflate_decompress_bmi2
+#  define ATTRIBUTES   __attribute__((target("bmi2")))
+#  define DISPATCH     1
+#  define DISPATCH_BMI2        1
+#  include "../decompress_template.h"
+#endif
+
+#ifdef DISPATCH
+static inline decompress_func_t
+arch_select_decompress_func(void)
+{
+       u32 features = get_cpu_features();
+
+#ifdef DISPATCH_BMI2
+       if (features & X86_CPU_FEATURE_BMI2)
+               return deflate_decompress_bmi2;
+#endif
+       return NULL;
+}
+#endif /* DISPATCH */
+
+#endif /* LIB_X86_DECOMPRESS_IMPL_H */
diff --git a/src/3rdparty/libdeflate/lib/x86/matchfinder_impl.h b/src/3rdparty/libdeflate/lib/x86/matchfinder_impl.h
new file mode 100644 (file)
index 0000000..99fbebe
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * x86/matchfinder_impl.h - x86 implementations of matchfinder functions
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_MATCHFINDER_IMPL_H
+#define LIB_X86_MATCHFINDER_IMPL_H
+
+#ifdef __AVX2__
+#  include <immintrin.h>
+static forceinline void
+matchfinder_init_avx2(mf_pos_t *data, size_t size)
+{
+       __m256i *p = (__m256i *)data;
+       __m256i v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
+
+       STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+       STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+       STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+       do {
+               p[0] = v;
+               p[1] = v;
+               p[2] = v;
+               p[3] = v;
+               p += 4;
+               size -= 4 * sizeof(*p);
+       } while (size != 0);
+}
+#define matchfinder_init matchfinder_init_avx2
+
+static forceinline void
+matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
+{
+       __m256i *p = (__m256i *)data;
+       __m256i v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+       STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+       STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+       STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+       do {
+               /* PADDSW: Add Packed Signed Integers With Signed Saturation  */
+               p[0] = _mm256_adds_epi16(p[0], v);
+               p[1] = _mm256_adds_epi16(p[1], v);
+               p[2] = _mm256_adds_epi16(p[2], v);
+               p[3] = _mm256_adds_epi16(p[3], v);
+               p += 4;
+               size -= 4 * sizeof(*p);
+       } while (size != 0);
+}
+#define matchfinder_rebase matchfinder_rebase_avx2
+
+#elif defined(__SSE2__)
+#  include <emmintrin.h>
+static forceinline void
+matchfinder_init_sse2(mf_pos_t *data, size_t size)
+{
+       __m128i *p = (__m128i *)data;
+       __m128i v = _mm_set1_epi16(MATCHFINDER_INITVAL);
+
+       STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+       STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+       STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+       do {
+               p[0] = v;
+               p[1] = v;
+               p[2] = v;
+               p[3] = v;
+               p += 4;
+               size -= 4 * sizeof(*p);
+       } while (size != 0);
+}
+#define matchfinder_init matchfinder_init_sse2
+
+static forceinline void
+matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
+{
+       __m128i *p = (__m128i *)data;
+       __m128i v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+       STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+       STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+       STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+       do {
+               /* PADDSW: Add Packed Signed Integers With Signed Saturation  */
+               p[0] = _mm_adds_epi16(p[0], v);
+               p[1] = _mm_adds_epi16(p[1], v);
+               p[2] = _mm_adds_epi16(p[2], v);
+               p[3] = _mm_adds_epi16(p[3], v);
+               p += 4;
+               size -= 4 * sizeof(*p);
+       } while (size != 0);
+}
+#define matchfinder_rebase matchfinder_rebase_sse2
+#endif /* __SSE2__ */
+
+#endif /* LIB_X86_MATCHFINDER_IMPL_H */
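matchfinder_rebase_*() above slides the window by adding -MATCHFINDER_WINDOW_SIZE to every stored position with signed saturation, so entries that are already too old clamp at the most negative value instead of wrapping around into apparently valid positions. A scalar sketch of the same operation (the window size 32768 is assumed here to match the DEFLATE window; the real constant comes from matchfinder_common.h):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef int16_t mf_pos_t;
#define MATCHFINDER_WINDOW_SIZE 32768

/* Scalar equivalent of the PADDSW-based rebase: subtract the window size
 * from every stored position, saturating at INT16_MIN instead of wrapping. */
static void matchfinder_rebase_scalar(mf_pos_t *data, size_t size)
{
        size_t n = size / sizeof(mf_pos_t);

        for (size_t i = 0; i < n; i++) {
                int32_t v = (int32_t)data[i] - MATCHFINDER_WINDOW_SIZE;
                data[i] = (v < INT16_MIN) ? INT16_MIN : (mf_pos_t)v;
        }
}

int main(void)
{
        mf_pos_t tab[4] = { -32768, -1, 0, 32767 };

        matchfinder_rebase_scalar(tab, sizeof(tab));
        /* Prints "-32768 -32768 -32768 -1": stale entries clamp instead of wrapping. */
        printf("%d %d %d %d\n", tab[0], tab[1], tab[2], tab[3]);
        return 0;
}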
diff --git a/src/3rdparty/libdeflate/lib/zlib_compress.c b/src/3rdparty/libdeflate/lib/zlib_compress.c
new file mode 100644 (file)
index 0000000..f066d52
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * zlib_compress.c - compress with a zlib wrapper
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "deflate_compress.h"
+#include "unaligned.h"
+#include "zlib_constants.h"
+
+#include "libdeflate.h"
+
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_zlib_compress(struct libdeflate_compressor *c,
+                        const void *in, size_t in_nbytes,
+                        void *out, size_t out_nbytes_avail)
+{
+       u8 *out_next = out;
+       u16 hdr;
+       unsigned compression_level;
+       unsigned level_hint;
+       size_t deflate_size;
+
+       if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD)
+               return 0;
+
+       /* 2 byte header: CMF and FLG  */
+       hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12);
+       compression_level = deflate_get_compression_level(c);
+       if (compression_level < 2)
+               level_hint = ZLIB_FASTEST_COMPRESSION;
+       else if (compression_level < 6)
+               level_hint = ZLIB_FAST_COMPRESSION;
+       else if (compression_level < 8)
+               level_hint = ZLIB_DEFAULT_COMPRESSION;
+       else
+               level_hint = ZLIB_SLOWEST_COMPRESSION;
+       hdr |= level_hint << 6;
+       hdr |= 31 - (hdr % 31);
+
+       put_unaligned_be16(hdr, out_next);
+       out_next += 2;
+
+       /* Compressed data  */
+       deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next,
+                                       out_nbytes_avail - ZLIB_MIN_OVERHEAD);
+       if (deflate_size == 0)
+               return 0;
+       out_next += deflate_size;
+
+       /* ADLER32  */
+       put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), out_next);
+       out_next += 4;
+
+       return out_next - (u8 *)out;
+}
+
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_zlib_compress_bound(struct libdeflate_compressor *c,
+                              size_t in_nbytes)
+{
+       return ZLIB_MIN_OVERHEAD +
+              libdeflate_deflate_compress_bound(c, in_nbytes);
+}
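The header bytes written above are the standard zlib CMF/FLG pair: CM and CINFO in the high byte, a compression-level hint in bits 6-7, and an FCHECK value chosen so the whole 16-bit header is divisible by 31, which is exactly what the decompressor's 'hdr % 31' test verifies. A quick standalone check of that arithmetic, using the same constant values (8 = CM deflate, 7 = 32K-window CINFO, 2 = default-compression hint):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint16_t hdr = (8 << 8) | (7 << 12); /* CM = deflate, CINFO = 32K window */

        hdr |= 2 << 6;           /* FLEVEL hint: default compression */
        hdr |= 31 - (hdr % 31);  /* FCHECK: make the header divisible by 31 */

        /* Prints "78 9C 0": the familiar default zlib header, with hdr % 31 == 0. */
        printf("%02X %02X %u\n", hdr >> 8, hdr & 0xFF, hdr % 31);
        return 0;
}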
diff --git a/src/3rdparty/libdeflate/lib/zlib_constants.h b/src/3rdparty/libdeflate/lib/zlib_constants.h
new file mode 100644 (file)
index 0000000..f304310
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * zlib_constants.h - constants for the zlib wrapper format
+ */
+
+#ifndef LIB_ZLIB_CONSTANTS_H
+#define LIB_ZLIB_CONSTANTS_H
+
+#define ZLIB_MIN_HEADER_SIZE   2
+#define ZLIB_FOOTER_SIZE       4
+#define ZLIB_MIN_OVERHEAD      (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE)
+
+#define ZLIB_CM_DEFLATE                8
+
+#define ZLIB_CINFO_32K_WINDOW  7
+
+#define ZLIB_FASTEST_COMPRESSION       0
+#define ZLIB_FAST_COMPRESSION          1
+#define ZLIB_DEFAULT_COMPRESSION       2
+#define ZLIB_SLOWEST_COMPRESSION       3
+
+#endif /* LIB_ZLIB_CONSTANTS_H */
diff --git a/src/3rdparty/libdeflate/lib/zlib_decompress.c b/src/3rdparty/libdeflate/lib/zlib_decompress.c
new file mode 100644 (file)
index 0000000..11fc7a8
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * zlib_decompress.c - decompress with a zlib wrapper
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "unaligned.h"
+#include "zlib_constants.h"
+
+#include "libdeflate.h"
+
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d,
+                             const void *in, size_t in_nbytes,
+                             void *out, size_t out_nbytes_avail,
+                             size_t *actual_in_nbytes_ret,
+                             size_t *actual_out_nbytes_ret)
+{
+       const u8 *in_next = in;
+       const u8 * const in_end = in_next + in_nbytes;
+       u16 hdr;
+       size_t actual_in_nbytes;
+       size_t actual_out_nbytes;
+       enum libdeflate_result result;
+
+       if (in_nbytes < ZLIB_MIN_OVERHEAD)
+               return LIBDEFLATE_BAD_DATA;
+
+       /* 2 byte header: CMF and FLG  */
+       hdr = get_unaligned_be16(in_next);
+       in_next += 2;
+
+       /* FCHECK */
+       if ((hdr % 31) != 0)
+               return LIBDEFLATE_BAD_DATA;
+
+       /* CM */
+       if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE)
+               return LIBDEFLATE_BAD_DATA;
+
+       /* CINFO */
+       if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW)
+               return LIBDEFLATE_BAD_DATA;
+
+       /* FDICT */
+       if ((hdr >> 5) & 1)
+               return LIBDEFLATE_BAD_DATA;
+
+       /* Compressed data  */
+       result = libdeflate_deflate_decompress_ex(d, in_next,
+                                       in_end - ZLIB_FOOTER_SIZE - in_next,
+                                       out, out_nbytes_avail,
+                                       &actual_in_nbytes, actual_out_nbytes_ret);
+       if (result != LIBDEFLATE_SUCCESS)
+               return result;
+
+       if (actual_out_nbytes_ret)
+               actual_out_nbytes = *actual_out_nbytes_ret;
+       else
+               actual_out_nbytes = out_nbytes_avail;
+
+       in_next += actual_in_nbytes;
+
+       /* ADLER32  */
+       if (libdeflate_adler32(1, out, actual_out_nbytes) !=
+           get_unaligned_be32(in_next))
+               return LIBDEFLATE_BAD_DATA;
+       in_next += 4;
+
+       if (actual_in_nbytes_ret)
+               *actual_in_nbytes_ret = in_next - (u8 *)in;
+
+       return LIBDEFLATE_SUCCESS;
+}
+
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_zlib_decompress(struct libdeflate_decompressor *d,
+                          const void *in, size_t in_nbytes,
+                          void *out, size_t out_nbytes_avail,
+                          size_t *actual_out_nbytes_ret)
+{
+       return libdeflate_zlib_decompress_ex(d, in, in_nbytes,
+                                            out, out_nbytes_avail,
+                                            NULL, actual_out_nbytes_ret);
+}
diff --git a/src/3rdparty/libdeflate/libdeflate.h b/src/3rdparty/libdeflate/libdeflate.h
new file mode 100644 (file)
index 0000000..4c72ef9
--- /dev/null
@@ -0,0 +1,366 @@
+/*
+ * libdeflate.h - public header for libdeflate
+ */
+
+#ifndef LIBDEFLATE_H
+#define LIBDEFLATE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBDEFLATE_VERSION_MAJOR       1
+#define LIBDEFLATE_VERSION_MINOR       9
+#define LIBDEFLATE_VERSION_STRING      "1.9"
+
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * On Windows, if you want to link to the DLL version of libdeflate, then
+ * #define LIBDEFLATE_DLL.  Note that the calling convention is "stdcall".
+ */
+#ifdef LIBDEFLATE_DLL
+#  ifdef BUILDING_LIBDEFLATE
+#    define LIBDEFLATEEXPORT   LIBEXPORT
+#  elif defined(_WIN32) || defined(__CYGWIN__)
+#    define LIBDEFLATEEXPORT   __declspec(dllimport)
+#  endif
+#endif
+#ifndef LIBDEFLATEEXPORT
+#  define LIBDEFLATEEXPORT
+#endif
+
+#if defined(_WIN32) && !defined(_WIN64)
+#  define LIBDEFLATEAPI_ABI    __stdcall
+#else
+#  define LIBDEFLATEAPI_ABI
+#endif
+
+#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && \
+       defined(_WIN32) && !defined(_WIN64)
+    /*
+     * On 32-bit Windows, gcc assumes 16-byte stack alignment but MSVC only 4.
+     * Realign the stack when entering libdeflate to avoid crashing in SSE/AVX
+     * code when called from an MSVC-compiled application.
+     */
+#  define LIBDEFLATEAPI_STACKALIGN     __attribute__((force_align_arg_pointer))
+#else
+#  define LIBDEFLATEAPI_STACKALIGN
+#endif
+
+#define LIBDEFLATEAPI  LIBDEFLATEAPI_ABI LIBDEFLATEAPI_STACKALIGN
+
+/* ========================================================================== */
+/*                             Compression                                    */
+/* ========================================================================== */
+
+struct libdeflate_compressor;
+
+/*
+ * libdeflate_alloc_compressor() allocates a new compressor that supports
+ * DEFLATE, zlib, and gzip compression.  'compression_level' is the compression
+ * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 =
+ * medium/default, 9 = slow, 12 = slowest).  Level 0 is also supported and means
+ * "no compression", specifically "create a valid stream, but only emit
+ * uncompressed blocks" (this will expand the data slightly).
+ *
+ * The return value is a pointer to the new compressor, or NULL if out of memory
+ * or if the compression level is invalid (i.e. outside the range [0, 12]).
+ *
+ * Note: for compression, the sliding window size is defined at compilation time
+ * to 32768, the largest size permissible in the DEFLATE format.  It cannot be
+ * changed at runtime.
+ *
+ * A single compressor is not safe to use by multiple threads concurrently.
+ * However, different threads may use different compressors concurrently.
+ */
+LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
+libdeflate_alloc_compressor(int compression_level);
+
+/*
+ * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of
+ * data.  The function attempts to compress 'in_nbytes' bytes of data located at
+ * 'in' and write the results to 'out', which has space for 'out_nbytes_avail'
+ * bytes.  The return value is the compressed size in bytes, or 0 if the data
+ * could not be compressed to 'out_nbytes_avail' bytes or fewer.
+ */
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_deflate_compress(struct libdeflate_compressor *compressor,
+                           const void *in, size_t in_nbytes,
+                           void *out, size_t out_nbytes_avail);
+
+/*
+ * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the
+ * number of bytes of compressed data that may be produced by compressing any
+ * buffer of length less than or equal to 'in_nbytes' using
+ * libdeflate_deflate_compress() with the specified compressor.  Mathematically,
+ * this bound will necessarily be a number greater than or equal to 'in_nbytes'.
+ * It may be an overestimate of the true upper bound.  The return value is
+ * guaranteed to be the same for all invocations with the same compressor and
+ * same 'in_nbytes'.
+ *
+ * As a special case, 'compressor' may be NULL.  This causes the bound to be
+ * taken across *any* libdeflate_compressor that could ever be allocated with
+ * this build of the library, with any options.
+ *
+ * Note that this function is not necessary in many applications.  With
+ * block-based compression, it is usually preferable to separately store the
+ * uncompressed size of each block and to store any blocks that did not compress
+ * to less than their original size uncompressed.  In that scenario, there is no
+ * need to know the worst-case compressed size, since the maximum number of
+ * bytes of compressed data that may be used would always be one less than the
+ * input length.  You can just pass a buffer of that size to
+ * libdeflate_deflate_compress() and store the data uncompressed if
+ * libdeflate_deflate_compress() returns 0, indicating that the compressed data
+ * did not fit into the provided output buffer.
+ */
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor,
+                                 size_t in_nbytes);
+
+/*
+ * Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper
+ * format.
+ */
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_zlib_compress(struct libdeflate_compressor *compressor,
+                        const void *in, size_t in_nbytes,
+                        void *out, size_t out_nbytes_avail);
+
+/*
+ * Like libdeflate_deflate_compress_bound(), but assumes the data will be
+ * compressed with libdeflate_zlib_compress() rather than with
+ * libdeflate_deflate_compress().
+ */
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor,
+                              size_t in_nbytes);
+
+/*
+ * Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper
+ * format.
+ */
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_gzip_compress(struct libdeflate_compressor *compressor,
+                        const void *in, size_t in_nbytes,
+                        void *out, size_t out_nbytes_avail);
+
+/*
+ * Like libdeflate_deflate_compress_bound(), but assumes the data will be
+ * compressed with libdeflate_gzip_compress() rather than with
+ * libdeflate_deflate_compress().
+ */
+LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
+libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor,
+                              size_t in_nbytes);
+
+/*
+ * libdeflate_free_compressor() frees a compressor that was allocated with
+ * libdeflate_alloc_compressor().  If a NULL pointer is passed in, no action is
+ * taken.
+ */
+LIBDEFLATEEXPORT void LIBDEFLATEAPI
+libdeflate_free_compressor(struct libdeflate_compressor *compressor);
+
+/* ========================================================================== */
+/*                             Decompression                                  */
+/* ========================================================================== */
+
+struct libdeflate_decompressor;
+
+/*
+ * libdeflate_alloc_decompressor() allocates a new decompressor that can be used
+ * for DEFLATE, zlib, and gzip decompression.  The return value is a pointer to
+ * the new decompressor, or NULL if out of memory.
+ *
+ * This function takes no parameters, and the returned decompressor is valid for
+ * decompressing data that was compressed at any compression level and with any
+ * sliding window size.
+ *
+ * A single decompressor is not safe to use by multiple threads concurrently.
+ * However, different threads may use different decompressors concurrently.
+ */
+LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI
+libdeflate_alloc_decompressor(void);
+
+/*
+ * Result of a call to libdeflate_deflate_decompress(),
+ * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress().
+ */
+enum libdeflate_result {
+       /* Decompression was successful.  */
+       LIBDEFLATE_SUCCESS = 0,
+
+       /* Decompression failed because the compressed data was invalid, corrupt,
+        * or otherwise unsupported.  */
+       LIBDEFLATE_BAD_DATA = 1,
+
+       /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have
+        * decompressed to fewer than 'out_nbytes_avail' bytes.  */
+       LIBDEFLATE_SHORT_OUTPUT = 2,
+
+       /* The data would have decompressed to more than 'out_nbytes_avail'
+        * bytes.  */
+       LIBDEFLATE_INSUFFICIENT_SPACE = 3,
+};
+
+/*
+ * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream
+ * from the buffer 'in' with compressed size up to 'in_nbytes' bytes.  The
+ * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail'
+ * bytes.  If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned.
+ * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned.  If
+ * a nonzero result code is returned, then the contents of the output buffer are
+ * undefined.
+ *
+ * Decompression stops at the end of the DEFLATE stream (as indicated by the
+ * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes.
+ *
+ * libdeflate_deflate_decompress() can be used in cases where the actual
+ * uncompressed size is known (recommended) or unknown (not recommended):
+ *
+ *   - If the actual uncompressed size is known, then pass the actual
+ *     uncompressed size as 'out_nbytes_avail' and pass NULL for
+ *     'actual_out_nbytes_ret'.  This makes libdeflate_deflate_decompress() fail
+ *     with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the
+ *     specified number of bytes.
+ *
+ *   - If the actual uncompressed size is unknown, then provide a non-NULL
+ *     'actual_out_nbytes_ret' and provide a buffer with some size
+ *     'out_nbytes_avail' that you think is large enough to hold all the
+ *     uncompressed data.  In this case, if the data decompresses to less than
+ *     or equal to 'out_nbytes_avail' bytes, then
+ *     libdeflate_deflate_decompress() will write the actual uncompressed size
+ *     to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS).  Otherwise,
+ *     it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was
+ *     not large enough but no other problems were encountered, or another
+ *     nonzero result code if decompression failed for another reason.
+ */
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor,
+                             const void *in, size_t in_nbytes,
+                             void *out, size_t out_nbytes_avail,
+                             size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument.  If decompression succeeds and 'actual_in_nbytes_ret' is not NULL,
+ * then the actual compressed size of the DEFLATE stream (aligned to the next
+ * byte boundary) is written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor,
+                                const void *in, size_t in_nbytes,
+                                void *out, size_t out_nbytes_avail,
+                                size_t *actual_in_nbytes_ret,
+                                size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format
+ * instead of raw DEFLATE.
+ *
+ * Decompression will stop at the end of the zlib stream, even if it is shorter
+ * than 'in_nbytes'.  If you need to know exactly where the zlib stream ended,
+ * use libdeflate_zlib_decompress_ex().
+ */
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor,
+                          const void *in, size_t in_nbytes,
+                          void *out, size_t out_nbytes_avail,
+                          size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_zlib_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument.  If 'actual_in_nbytes_ret' is not NULL and the decompression
+ * succeeds (indicating that the first zlib-compressed stream in the input
+ * buffer was decompressed), then the actual number of input bytes consumed is
+ * written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor,
+                             const void *in, size_t in_nbytes,
+                             void *out, size_t out_nbytes_avail,
+                             size_t *actual_in_nbytes_ret,
+                             size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format
+ * instead of raw DEFLATE.
+ *
+ * If multiple gzip-compressed members are concatenated, then only the first
+ * will be decompressed.  Use libdeflate_gzip_decompress_ex() if you need
+ * multi-member support.
+ */
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
+                          const void *in, size_t in_nbytes,
+                          void *out, size_t out_nbytes_avail,
+                          size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument.  If 'actual_in_nbytes_ret' is not NULL and the decompression
+ * succeeds (indicating that the first gzip-compressed member in the input
+ * buffer was decompressed), then the actual number of input bytes consumed is
+ * written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
+libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor,
+                             const void *in, size_t in_nbytes,
+                             void *out, size_t out_nbytes_avail,
+                             size_t *actual_in_nbytes_ret,
+                             size_t *actual_out_nbytes_ret);
+
+/*
+ * libdeflate_free_decompressor() frees a decompressor that was allocated with
+ * libdeflate_alloc_decompressor().  If a NULL pointer is passed in, no action
+ * is taken.
+ */
+LIBDEFLATEEXPORT void LIBDEFLATEAPI
+libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor);
+
+/* ========================================================================== */
+/*                                Checksums                                   */
+/* ========================================================================== */
+
+/*
+ * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of
+ * data and returns the updated checksum.  When starting a new checksum, the
+ * required initial value for 'adler' is 1.  This value is also returned when
+ * 'buffer' is specified as NULL.
+ */
+LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI
+libdeflate_adler32(uint32_t adler, const void *buffer, size_t len);
+
+
+/*
+ * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data
+ * and returns the updated checksum.  When starting a new checksum, the required
+ * initial value for 'crc' is 0.  This value is also returned when 'buffer' is
+ * specified as NULL.
+ */
+LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI
+libdeflate_crc32(uint32_t crc, const void *buffer, size_t len);
+
+/* ========================================================================== */
+/*                           Custom memory allocator                          */
+/* ========================================================================== */
+
+/*
+ * Install a custom memory allocator which libdeflate will use for all memory
+ * allocations.  'malloc_func' is a function that must behave like malloc(), and
+ * 'free_func' is a function that must behave like free().
+ *
+ * There must not be any libdeflate_compressor or libdeflate_decompressor
+ * structures in existence when calling this function.
+ */
+LIBDEFLATEEXPORT void LIBDEFLATEAPI
+libdeflate_set_memory_allocator(void *(*malloc_func)(size_t),
+                               void (*free_func)(void *));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIBDEFLATE_H */
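Taken together, the declarations above cover the typical round trip: allocate a compressor and a decompressor, size the output buffer with libdeflate_zlib_compress_bound(), compress, and decompress with the exact uncompressed size when it is known. A minimal usage sketch, assuming the bundled libdeflate.h is on the include path and the library sources are linked in:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "libdeflate.h"

int main(void)
{
        static const char msg[] = "hello hello hello hello hello hello hello";
        struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
        struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
        char out[sizeof(msg)];

        if (c == NULL || d == NULL)
                return 1;

        /* Size the output buffer with the worst-case bound, then compress. */
        size_t bound = libdeflate_zlib_compress_bound(c, sizeof(msg));
        void *comp = malloc(bound);
        size_t comp_size = comp ? libdeflate_zlib_compress(c, msg, sizeof(msg),
                                                           comp, bound) : 0;
        if (comp_size == 0)
                return 1;  /* out of memory, or the data did not fit in 'bound' bytes */

        /* The uncompressed size is known, so pass it exactly and pass NULL for
         * actual_out_nbytes_ret; a short result then reports LIBDEFLATE_SHORT_OUTPUT. */
        enum libdeflate_result res = libdeflate_zlib_decompress(d, comp, comp_size,
                                                                out, sizeof(out), NULL);

        printf("%zu -> %zu bytes, result %d, roundtrip %s, crc32 %08X\n",
               sizeof(msg), comp_size, (int)res,
               (res == LIBDEFLATE_SUCCESS && memcmp(msg, out, sizeof(msg)) == 0)
                       ? "ok" : "FAILED",
               (unsigned)libdeflate_crc32(0, msg, sizeof(msg)));

        free(comp);
        libdeflate_free_compressor(c);
        libdeflate_free_decompressor(d);
        return 0;
}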
index 43a2a96..1362a27 100644 (file)
@@ -1,7 +1,6 @@
 # add_definitions()
 set(EXTRA_CORE_LIBS
     ${ICU_LIBRARIES}
-    ${DEFLATE_LIBRARIES}
     ${JANSSON_LIBRARIES}
     ${CMAKE_THREAD_LIBS_INIT}
     # libm should be present by default because this is C++
@@ -108,7 +107,6 @@ include_directories(
     ${CMAKE_BINARY_DIR}/privateinclude
     ${CMAKE_BINARY_DIR}/include/QtCore
     ${ICU_INCLUDES}
-    ${DEFLATE_INCLUDES}
     ${JANSSON_INCLUDES}
 )
 
@@ -420,6 +418,33 @@ if(WITH_EXECINFO AND EXECINFO_FOUND)
     include_directories(${EXECINFO_INCLUDES})
 endif()
 
+if(WITH_DEFLATE AND DEFLATE_FOUND)
+    set(EXTRA_CORE_LIBS
+        ${EXTRA_CORE_LIBS}
+        ${DEFLATE_LIBRARIES}
+    )
+    include_directories(${DEFLATE_INCLUDES})
+else()
+    set(CORE_SOURCES
+        ${CORE_SOURCES}
+        # common files
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/deflate_decompress.c
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/deflate_compress.c
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/utils.c
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/arm/cpu_features.c
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/x86/cpu_features.c
+        # zlib wrapper files
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/adler32.c
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/zlib_decompress.c
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/zlib_compress.c
+        # gzip wrapper files
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/crc32.c
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/gzip_decompress.c
+        ${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate/lib/gzip_compress.c
+    )
+    include_directories(${CMAKE_SOURCE_DIR}/src/3rdparty/libdeflate)
+endif()
+
 katie_unity_exclude(
     ${CMAKE_CURRENT_SOURCE_DIR}/global/qt_error_string.cpp
 )