em_inflate.c

Fast, small, in-memory inflate (zlib, deflate and gzip decompression)
git clone git://git.finwo.net/lib/em_inflate.c
Log | Files | Refs | README | LICENSE

commit 2c4488f0852805d3a8aed1d91bb64506f0526b24
parent cb9d062e35b73d0f25849fff36d21a3a0a758b13
Author: Yersa Nordman <finwo@pm.me>
Date:   Sun, 19 May 2024 23:30:36 +0200

Using dynamic buffers as decompression target

Diffstat:
M Makefile | 25 ++++++++++---------------
M example/main.c | 38 +++++++++++++++++++-------------------
M src/em_inflate.c | 138 ++++++++++++++++++++++++++++++++-----------------------------------------------
M src/em_inflate.h | 18 ++++++++++++------
4 files changed, 97 insertions(+), 122 deletions(-)

diff --git a/Makefile b/Makefile @@ -1,27 +1,22 @@ +APP := em_inflate + CC=cc CFLAGS=-O3 -fomit-frame-pointer -Isrc -Iexample OBJDIR=obj LDFLAGS= -include lib/.dep/config.mk - -$(OBJDIR)/%.o: example/../%.c - @mkdir -p '$(@D)' - $(CC) $(CFLAGS) -c $< -o $@ +SRC:=$(wildcard example/*.c) +SRC+=$(wildcard src/*.c) -$(OBJDIR)/%.o: src/../%.c - @mkdir -p '$(@D)' - $(CC) $(CFLAGS) -c $< -o $@ - -APP := em_inflate +include lib/.dep/config.mk -OBJS := $(OBJDIR)/example/main.o -OBJS += $(OBJDIR)/src/em_inflate.o +CFLAGS+=$(INCLUDES) +OBJ=$(SRC:.c=.o) -all: $(APP) +default: $(APP) -$(APP): $(OBJS) - $(CC) $^ $(LDFLAGS) -o $(APP) +$(APP): $(OBJ) + $(CC) $(LDFLAGS) $(OBJ) -o $@ clean: @rm -rf $(APP) $(OBJDIR) diff --git a/example/main.c b/example/main.c @@ -24,11 +24,14 @@ #include <stdlib.h> #include <string.h> #include <memory.h> + +#include "tidwall/buf.h" + #include "em_inflate.h" int main(int argc, char **argv) { - unsigned char *pCompressedData, *pOutData; - size_t nCompressedDataSize, nMaxOutDataSize, nActualOutDataSize; + unsigned char *pCompressedData; + size_t nCompressedDataSize; FILE *f; if (argc != 3) { @@ -68,44 +71,41 @@ int main(int argc, char **argv) { /* Decompress */ - nMaxOutDataSize = 200000000; - pOutData = (unsigned char*)malloc(nMaxOutDataSize); - if (!pOutData) { - fprintf(stderr, "out of memory, %zu bytes needed\n", nMaxOutDataSize); - free(pCompressedData); - return 100; - } + struct buf *inflated = em_inflate(&(struct buf){ + .data = (void*)pCompressedData, + .len = nCompressedDataSize, + }); - nActualOutDataSize = em_inflate(pCompressedData, nCompressedDataSize, pOutData, nMaxOutDataSize); - if (nActualOutDataSize == -1) { - fprintf(stderr, "decompression error\n"); - free(pOutData); - free(pCompressedData); + if (!inflated) { + fprintf(stderr, "Error inflating data\n"); return 100; } - fprintf(stdout, "decompressed %zu bytes\n", nActualOutDataSize); + fprintf(stdout, "decompressed %zu bytes\n", inflated->len); /* Write decompressed file out all at 
once */ f = fopen(argv[2], "wb"); if (!f) { - free(pOutData); + buf_clear(inflated); + free(inflated); free(pCompressedData); fprintf(stderr, "error opening '%s' for writing\n", argv[1]); return 100; } - if (fwrite(pOutData, 1, nActualOutDataSize, f) != nActualOutDataSize) { + if (fwrite(inflated->data, 1, inflated->len, f) != inflated->len) { fclose(f); - free(pOutData); + buf_clear(inflated); + free(inflated); free(pCompressedData); fprintf(stderr, "I/O error writing '%s'\n", argv[1]); return 100; } fclose(f); - free(pOutData); + buf_clear(inflated); + free(inflated); free(pCompressedData); return 0; diff --git a/src/em_inflate.c b/src/em_inflate.c @@ -26,6 +26,9 @@ #include <sys/types.h> #include <stdlib.h> #include <string.h> + +#include "tidwall/buf.h" + #include "em_inflate.h" /* Comment out this define to disable checksum verification */ @@ -450,7 +453,7 @@ static int em_lsb_huffman_decoder_read_var_lengths(em_lsb_huffman_decoder_t *pTa /*-- zlib stored blocks copier --*/ -static size_t em_inflate_copy_stored(em_lsb_bitreader_t *pBitReader, unsigned char *pOutData, size_t nOutDataOffset, size_t nBlockMaxSize) { +static size_t em_inflate_copy_stored(em_lsb_bitreader_t *pBitReader, struct buf *inflated) { /* Align on byte */ if (em_lsb_bitreader_byte_align(pBitReader) < 0) return -1; @@ -467,11 +470,8 @@ static size_t em_inflate_copy_stored(em_lsb_bitreader_t *pBitReader, unsigned ch /* Make sure that the len and the two's complement match */ if (nStoredLen != ((~nNegStoredLen) & 0xffff)) return -1; - /* Make sure there is room */ - if (nStoredLen > nBlockMaxSize) return -1; - /* Copy stored data */ - memcpy(pOutData + nOutDataOffset, pBitReader->pInBlock, nStoredLen); + buf_append(inflated, (const char *) pBitReader->pInBlock, nStoredLen); pBitReader->pInBlock += nStoredLen; return (size_t)nStoredLen; @@ -516,13 +516,14 @@ static const unsigned int em_inflate_offset_code[NOFFSETSYMS] = { * @param nOutDataOffset starting index of where to store decompressed 
bytes in output buffer (and size of previously decompressed bytes) * @param nBlockMaxSize total size of output decompression buffer, in bytes * - * @return size of decompressed data in bytes, or -1 for error + * @return decompressed block size on success, -1 on error */ -static size_t em_inflate_decompress_block(em_lsb_bitreader_t *pBitReader, int nDynamicBlock, unsigned char *pOutData, size_t nOutDataOffset, size_t nBlockMaxSize) { +static size_t em_inflate_decompress_block(em_lsb_bitreader_t *pBitReader, int nDynamicBlock, struct buf *inflated) { em_lsb_huffman_decoder_t literalsDecoder; em_lsb_huffman_decoder_t offsetDecoder; unsigned int nLiteralsRevSymbolTable[NLITERALSYMS * 2]; unsigned int nOffsetRevSymbolTable[NOFFSETSYMS * 2]; + size_t originalLength = inflated->len; int i; if (nDynamicBlock) { @@ -617,9 +618,9 @@ static size_t em_inflate_decompress_block(em_lsb_bitreader_t *pBitReader, int nD /* Finally, loop to read all the literals/match len codewords in the block to decompress it */ - unsigned char *pCurOutData = pOutData + nOutDataOffset; - const unsigned char *pOutDataEnd = pCurOutData + nBlockMaxSize; - const unsigned char *pOutDataFastEnd = pOutDataEnd - 15; + /*unsigned char *pCurOutData = pOutData + nOutDataOffset;*/ + /*const unsigned char *pOutDataEnd = pCurOutData + nBlockMaxSize;*/ + /*const unsigned char *pOutDataFastEnd = pOutDataEnd - 15;*/ while (1) { em_lsb_bitreader_refill_32(pBitReader); @@ -627,12 +628,8 @@ static size_t em_inflate_decompress_block(em_lsb_bitreader_t *pBitReader, int nD unsigned int nLiteralsCodeword = em_lsb_huffman_decoder_read_value(&literalsDecoder, nLiteralsRevSymbolTable, pBitReader); if (nLiteralsCodeword < 256) { /* 0..255: literal. copy. 
*/ - if (pCurOutData < pOutDataEnd) - *pCurOutData++ = nLiteralsCodeword; - else - return -1; - } - else { + buf_append_byte(inflated, nLiteralsCodeword); + } else { if (nLiteralsCodeword == NEODMARKERSYM) break; /* EOD marker, all done */ if (nLiteralsCodeword == -1) return -1; @@ -655,35 +652,13 @@ static size_t em_inflate_decompress_block(em_lsb_bitreader_t *pBitReader, int nD /* Copy match */ - const unsigned char *pSrc = pCurOutData - nMatchOffset; - if (pSrc >= pOutData) { - if (nMatchOffset >= 16 && (pCurOutData + nMatchLen) <= pOutDataFastEnd) { - const unsigned char *pCopySrc = pSrc; - unsigned char *pCopyDst = pCurOutData; - const unsigned char *pCopyEndDst = pCurOutData + nMatchLen; - - do { - memcpy(pCopyDst, pCopySrc, 16); - pCopySrc += 16; - pCopyDst += 16; - } while (pCopyDst < pCopyEndDst); - - pCurOutData += nMatchLen; - } - else { - if ((pCurOutData + nMatchLen) > pOutDataEnd) return -1; - - while (nMatchLen--) { - *pCurOutData++ = *pSrc++; - } - } - } - else - return -1; + const char *pSrc = inflated->data + inflated->len - nMatchOffset; + if (pSrc < (inflated->data)) return -1; + buf_append(inflated, pSrc, nMatchLen); } } - return (size_t)(pCurOutData - (pOutData + nOutDataOffset)); + return inflated->len - originalLength; } /*-- zlib adler32 calculation --*/ @@ -725,7 +700,8 @@ static size_t em_inflate_decompress_block(em_lsb_bitreader_t *pBitReader, int nD # define MOD28(a) a %= BASE # define MOD63(a) a %= BASE -static unsigned int em_inflate_adler32_z(unsigned int adler, const unsigned char *buf, size_t len) { +static unsigned int em_inflate_adler32_z(unsigned int adler, const void *buf, size_t len) { + const char *pBuf = buf; unsigned long sum2; unsigned n; @@ -735,7 +711,7 @@ static unsigned int em_inflate_adler32_z(unsigned int adler, const unsigned char /* in case user likes doing a byte at a time, keep it fast */ if (len == 1) { - adler += buf[0]; + adler += pBuf[0]; if (adler >= BASE) adler -= BASE; sum2 += adler; @@ -745,13 +721,13 @@ 
static unsigned int em_inflate_adler32_z(unsigned int adler, const unsigned char } /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (buf == NULL) + if (pBuf == NULL) return 1L; /* in case short lengths are provided, keep it somewhat fast */ if (len < 16) { while (len--) { - adler += *buf++; + adler += *pBuf++; sum2 += adler; } if (adler >= BASE) @@ -765,8 +741,8 @@ static unsigned int em_inflate_adler32_z(unsigned int adler, const unsigned char len -= NMAX; n = NMAX / 16; /* NMAX is divisible by 16 */ do { - DO16(buf); /* 16 sums unrolled */ - buf += 16; + DO16(pBuf); /* 16 sums unrolled */ + pBuf += 16; } while (--n); MOD(adler); MOD(sum2); @@ -776,11 +752,11 @@ static unsigned int em_inflate_adler32_z(unsigned int adler, const unsigned char if (len) { /* avoid modulos if none remaining */ while (len >= 16) { len -= 16; - DO16(buf); - buf += 16; + DO16(pBuf); + pBuf += 16; } while (len--) { - adler += *buf++; + adler += *pBuf++; sum2 += adler; } MOD(adler); @@ -1015,30 +991,29 @@ typedef enum { EM_INFLATE_CHECKSUM_NONE = 0, EM_INFLATE_CHECKSUM_GZIP, EM_INFLAT /** * Inflate gzip or zlib data * - * @param pCompressedData pointer to start of zlib data - * @param nCompressedDataSize size of zlib data, in bytes - * @param pOutData pointer to start of decompression buffer - * @param nMaxOutDataSize maximum size of decompression buffer, in bytes + * @param compressed pointer to a buffer containing the compressed data * - * @return number of bytes decompressed, or -1 in case of an error + * @return pointer to a buffer containing the decompressed data, or NULL in case of an error */ -size_t em_inflate(const void *pCompressedData, size_t nCompressedDataSize, unsigned char *pOutData, size_t nMaxOutDataSize) { - const unsigned char *pCurCompressedData = (const unsigned char *)pCompressedData; - const unsigned char *pEndCompressedData = pCurCompressedData + nCompressedDataSize; +struct buf * em_inflate(const struct buf *compressed) { + struct buf *inflated 
= calloc(1, sizeof(struct buf)); + + const unsigned char *pCurCompressedData = (const unsigned char *)(compressed->data); + const unsigned char *pEndCompressedData = pCurCompressedData + (compressed->len); em_lsb_bitreader_t bitReader; unsigned int nIsFinalBlock; size_t nCurOutOffset; em_inflate_checksum_type_t nCheckSumType = EM_INFLATE_CHECKSUM_NONE; unsigned long nCheckSum = 0; - if ((pCurCompressedData + 2) > pEndCompressedData) return -1; + if ((pCurCompressedData + 2) > pEndCompressedData) return NULL; /* Check header */ if (pCurCompressedData[0] == 0x1f && pCurCompressedData[1] == 0x8b) { /* gzip wrapper */ pCurCompressedData += 2; if ((pCurCompressedData + 8) > pEndCompressedData || pCurCompressedData[0] != 0x08 /* deflate */) - return -1; + return NULL; pCurCompressedData++; /* Read flags and skip over the rest of the header */ @@ -1046,40 +1021,39 @@ size_t em_inflate(const void *pCompressedData, size_t nCompressedDataSize, unsig pCurCompressedData += 6; if (flags & 0x02) { /* Part number present */ - if ((pCurCompressedData + 2) > pEndCompressedData) return -1; + if ((pCurCompressedData + 2) > pEndCompressedData) return NULL; pCurCompressedData += 2; } if (flags & 0x04) { /* Extra field present, starts with two-byte length */ - if ((pCurCompressedData + 2) > pEndCompressedData) return -1; + if ((pCurCompressedData + 2) > pEndCompressedData) return NULL; unsigned short nExtraFieldLen = ((unsigned short)pCurCompressedData[0]) | (((unsigned short)pCurCompressedData[1]) << 8); pCurCompressedData += 2; - if ((pCurCompressedData + nExtraFieldLen) > pEndCompressedData) return -1; + if ((pCurCompressedData + nExtraFieldLen) > pEndCompressedData) return NULL; pCurCompressedData += nExtraFieldLen; } if (flags & 0x08) { /* Original filename present, zero terminated */ do { - if (pCurCompressedData >= pEndCompressedData) return -1; + if (pCurCompressedData >= pEndCompressedData) return NULL; pCurCompressedData++; } while (pCurCompressedData[-1]); } if (flags & 0x10) 
{ /* File comment present, zero terminated */ do { - if (pCurCompressedData >= pEndCompressedData) return -1; + if (pCurCompressedData >= pEndCompressedData) return NULL; pCurCompressedData++; } while (pCurCompressedData[-1]); } if (flags & 0x20) { /* Encryption header present */ - return -1; + return NULL; } nCheckSumType = EM_INFLATE_CHECKSUM_GZIP; - } - else if ((pCurCompressedData[0] & 0x0f) == 0x08) { + } else if ((pCurCompressedData[0] & 0x0f) == 0x08) { /* zlib wrapper? */ unsigned char CMF = pCurCompressedData[0]; unsigned char FLG = pCurCompressedData[1]; @@ -1089,7 +1063,7 @@ size_t em_inflate(const void *pCompressedData, size_t nCompressedDataSize, unsig /* Looks like a valid zlib wrapper */ pCurCompressedData += 2; if (FLG & 0x20) { /* Preset dictionary present */ - if ((pCurCompressedData + 4) > pEndCompressedData) return -1; + if ((pCurCompressedData + 4) > pEndCompressedData) return NULL; pCurCompressedData += 4; } } @@ -1118,29 +1092,29 @@ size_t em_inflate(const void *pCompressedData, size_t nCompressedDataSize, unsig switch (nBlockType) { case 0: /* Stored */ - nBlockResult = em_inflate_copy_stored(&bitReader, pOutData, nCurOutOffset, nMaxOutDataSize - nCurOutOffset); + nBlockResult = em_inflate_copy_stored(&bitReader, inflated); break; case 1: /* Static huffman */ - nBlockResult = em_inflate_decompress_block(&bitReader, 0 /* static */, pOutData, nCurOutOffset, nMaxOutDataSize - nCurOutOffset); + nBlockResult = em_inflate_decompress_block(&bitReader, 0 /* static */, inflated); break; case 2: /* Dynamic huffman */ - nBlockResult = em_inflate_decompress_block(&bitReader, 1 /* dynamic */, pOutData, nCurOutOffset, nMaxOutDataSize - nCurOutOffset); + nBlockResult = em_inflate_decompress_block(&bitReader, 1 /* dynamic */, inflated); break; case 3: /* Invalid */ - return -1; + return NULL; } - if (nBlockResult == -1) return -1; + if (nBlockResult == -1) return NULL; #ifdef EM_INFLATE_VERIFY_CHECKSUM /* Update checksum with the decompressed block's 
contents */ if (nCheckSumType == EM_INFLATE_CHECKSUM_GZIP) - nCheckSum = em_inflate_crc32_4bytes(pOutData + nCurOutOffset, nBlockResult, nCheckSum); + nCheckSum = em_inflate_crc32_4bytes(inflated->data + inflated->len - nBlockResult, nBlockResult, nCheckSum); else if (nCheckSumType == EM_INFLATE_CHECKSUM_ZLIB) - nCheckSum = em_inflate_adler32_z(nCheckSum, pOutData + nCurOutOffset, nBlockResult); + nCheckSum = em_inflate_adler32_z(nCheckSum, inflated->data + inflated->len - nBlockResult, nBlockResult); #endif /* EM_INFLATE_VERIFY_CHECKSUM */ nCurOutOffset += nBlockResult; @@ -1156,22 +1130,22 @@ size_t em_inflate(const void *pCompressedData, size_t nCompressedDataSize, unsig unsigned int nStoredCheckSum; switch (nCheckSumType) { case EM_INFLATE_CHECKSUM_GZIP: /* gzip - little endian crc32 */ - if ((pCurCompressedData + 4) > pEndCompressedData) return -1; + if ((pCurCompressedData + 4) > pEndCompressedData) return NULL; nStoredCheckSum = ((unsigned int)pCurCompressedData[0]); nStoredCheckSum |= ((unsigned int)pCurCompressedData[1]) << 8; nStoredCheckSum |= ((unsigned int)pCurCompressedData[2]) << 16; nStoredCheckSum |= ((unsigned int)pCurCompressedData[3]) << 24; - if (nStoredCheckSum != nCheckSum) return -1; + if (nStoredCheckSum != nCheckSum) return NULL; /* pCurCompressedData += 4; */ break; case EM_INFLATE_CHECKSUM_ZLIB: /* zlib - big endian adler32 */ - if ((pCurCompressedData + 4) > pEndCompressedData) return -1; + if ((pCurCompressedData + 4) > pEndCompressedData) return NULL; nStoredCheckSum = ((unsigned int)pCurCompressedData[0]) << 24; nStoredCheckSum |= ((unsigned int)pCurCompressedData[1]) << 16; nStoredCheckSum |= ((unsigned int)pCurCompressedData[2]) << 8; nStoredCheckSum |= ((unsigned int)pCurCompressedData[3]); - if (nStoredCheckSum != nCheckSum) return -1; + if (nStoredCheckSum != nCheckSum) return NULL; /* pCurCompressedData += 4; */ break; @@ -1181,5 +1155,5 @@ size_t em_inflate(const void *pCompressedData, size_t nCompressedDataSize, unsig #endif 
/* EM_INFLATE_VERIFY_CHECKSUM */ /* Success, return decompressed size */ - return nCurOutOffset; + return inflated; } diff --git a/src/em_inflate.h b/src/em_inflate.h @@ -18,6 +18,13 @@ * 2. Altered source versions must be plainly marked as such, and must not be * misrepresented as being the original software. * 3. This notice may not be removed or altered from any source distribution. + * + * CHANGELOG + * + * 2024-05-19 Finwo + * - Move source files from lib/ to src/ + * - Auto-formatting of source files + * - Using dynamic buffer as decompression target */ #ifndef _EM_INFLATE_H @@ -26,6 +33,8 @@ #include <sys/types.h> #include <stdlib.h> +#include "tidwall/buf.h" + #ifdef __cplusplus extern "C" { #endif @@ -33,14 +42,11 @@ extern "C" { /** * Inflate gzip or zlib data * - * @param pCompressedData pointer to start of zlib data - * @param nCompressedDataSize size of zlib data, in bytes - * @param pOutData pointer to start of decompression buffer - * @param nMaxOutDataSize maximum size of decompression buffer, in bytes + * @param compressed pointer to a buffer containing the compressed data * - * @return number of bytes decompressed, or -1 in case of an error + * @return pointer to a buffer containing the decompressed data, or NULL in case of an error */ -size_t em_inflate(const void *pCompressedData, size_t nCompressedDataSize, unsigned char *pOutData, size_t nMaxOutDataSize); +struct buf * em_inflate(const struct buf *compressed); #ifdef __cplusplus }