添加lzw压缩算法
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,4 +2,4 @@
|
||||
.vs/
|
||||
*.exe
|
||||
*.pkt
|
||||
|
||||
zlib-1.3/
|
||||
|
3
.vscode/settings.json
vendored
3
.vscode/settings.json
vendored
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"files.associations": {
|
||||
"coder_lib.h": "c",
|
||||
"huffman_.h": "c"
|
||||
"huffman_.h": "c",
|
||||
"random": "c"
|
||||
}
|
||||
}
|
23
lzw-ab/.vscode/c_cpp_properties.json
vendored
Normal file
23
lzw-ab/.vscode/c_cpp_properties.json
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Win32",
|
||||
"includePath": [
|
||||
"${workspaceFolder}/**",
|
||||
"C:/MinGW/include/"
|
||||
],
|
||||
"defines": [
|
||||
"_DEBUG",
|
||||
"UNICODE",
|
||||
"_UNICODE",
|
||||
"__WIN32__",
|
||||
"DLL_EXPORT"
|
||||
],
|
||||
"cStandard": "c17",
|
||||
"cppStandard": "gnu++17",
|
||||
"intelliSenseMode": "windows-gcc-x64",
|
||||
"compilerPath": "C:/MinGW/bin/gcc.exe"
|
||||
}
|
||||
],
|
||||
"version": 4
|
||||
}
|
6
lzw-ab/.vscode/settings.json
vendored
Normal file
6
lzw-ab/.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"files.associations": {
|
||||
"coder_lib.h": "c",
|
||||
"huffman_.h": "c"
|
||||
}
|
||||
}
|
15
lzw-ab/Makefile
Normal file
15
lzw-ab/Makefile
Normal file
@@ -0,0 +1,15 @@
|
||||
|
||||
|
||||
# Builds every C source file in this directory into a single executable.

CC = gcc
SRCS = $(wildcard *.c)
TARGET = hello

# "all" and "clean" are commands, not files; without this a stray file with
# either name would make the corresponding target a no-op.
.PHONY: all clean

all:
	$(CC) $(SRCS) -o $(TARGET)

# Remove Windows executables as before, plus the extension-less binary that
# the "all" target produces on other platforms.
clean:
	rm -rf *.exe $(TARGET)
|
83
lzw-ab/README
Normal file
83
lzw-ab/README
Normal file
@@ -0,0 +1,83 @@
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// **** LZW-AB **** //
|
||||
// Adjusted Binary LZW Compressor/Decompressor //
|
||||
// Copyright (c) 2016-2020 David Bryant //
|
||||
// All Rights Reserved //
|
||||
// Distributed under the BSD Software License (see license.txt) //
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
This is an implementation of the Lempel-Ziv-Welch general-purpose data
|
||||
compression algorithm. It is targeted at embedded applications that require
|
||||
high speed compression or decompression facilities where lots of RAM for
|
||||
large dictionaries might not be available. I have used this in several
|
||||
projects for storing compressed firmware images, and once I even coded the
|
||||
decompressor in Z-80 assembly language for speed! Depending on the maximum
|
||||
symbol size selected, the implementation can require from 2368 to 335616
|
||||
bytes of RAM for decoding (and about half again more for encoding).
|
||||
|
||||
This is a streaming compressor in that the data is not divided into blocks
|
||||
and no context information like dictionaries or Huffman tables are sent
|
||||
ahead of the compressed data (except for one byte to signal the maximum
|
||||
bit depth). This limits the maximum possible compression ratio compared to
|
||||
algorithms that significantly preprocess the data, but with the help of
|
||||
some enhancements to the LZW algorithm (described below) it is able to
|
||||
compress better than the UNIX "compress" utility (which is also LZW) and
|
||||
is in fact closer to and sometimes beats the compression level of "gzip".
|
||||
|
||||
The symbols are stored in "adjusted binary" which provides somewhat better
|
||||
compression (with virtually no speed penalty) compared to the fixed word
|
||||
sizes normally used. Once the dictionary is full, the encoder returns to
|
||||
the beginning and recycles string codes that have not been used yet for
|
||||
longer strings. In this way the dictionary constantly "churns" based on the
|
||||
incoming stream, thereby improving and adapting to optimal compression.
|
||||
The compression performance is constantly monitored and a dictionary flush
|
||||
is forced on stretches of negative compression which limits worst-case
|
||||
performance to about 8% inflation.
|
||||
|
||||
LZW-AB consists of three standard C files: the library, a command-line
|
||||
filter demo using pipes, and a command-line test harness. Each program
|
||||
builds with a single command on most platforms. It has been designed with
|
||||
maximum portability in mind and should work correctly on big-endian as well
|
||||
as little-endian machines.
|
||||
|
||||
Linux:
|
||||
% gcc -O3 lzwfilter.c lzwlib.c -o lzwfilter
|
||||
% gcc -O3 lzwtester.c lzwlib.c -o lzwtester
|
||||
|
||||
Darwin/Mac:
|
||||
% clang -O3 lzwfilter.c lzwlib.c -o lzwfilter
|
||||
% clang -O3 lzwtester.c lzwlib.c -o lzwtester
|
||||
|
||||
MS Visual Studio:
|
||||
cl -O2 lzwfilter.c lzwlib.c
|
||||
cl -O2 lzwtester.c lzwlib.c
|
||||
|
||||
There are Windows binaries (built on MinGW) for the filter and the tester on the
|
||||
GitHub release page (v3). The "help" display for the filter looks like this:
|
||||
|
||||
Usage: lzwfilter [-options] [< infile] [> outfile]
|
||||
|
||||
Operation: compression is default, use -d to decompress
|
||||
|
||||
Options: -d = decompress
|
||||
-h = display this "help" message
|
||||
-1 = maximum symbol size = 9 bits
|
||||
-2 = maximum symbol size = 10 bits
|
||||
-3 = maximum symbol size = 11 bits
|
||||
-4 = maximum symbol size = 12 bits
|
||||
-5 = maximum symbol size = 13 bits
|
||||
-6 = maximum symbol size = 14 bits
|
||||
-7 = maximum symbol size = 15 bits
|
||||
-8 = maximum symbol size = 16 bits (default)
|
||||
-v = verbose (display ratio and checksum)
|
||||
|
||||
Here's the "help" display for the tester:
|
||||
|
||||
Usage: lzwtester [options] file [...]
|
||||
|
||||
Options: -1 ... -8 = test using only specified max symbol size (9 - 16)
|
||||
-0 = cycle through all maximum symbol sizes (default)
|
||||
-e = exhaustive test (by successive truncation)
|
||||
-f = fuzz test (randomly corrupt compressed data)
|
||||
-q = quiet mode (only reports errors and summary)
|
||||
|
25
lzw-ab/license.txt
Normal file
25
lzw-ab/license.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
Copyright (c) David Bryant
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of Conifer Software nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
191
lzw-ab/lzwfilter.ctt
Normal file
191
lzw-ab/lzwfilter.ctt
Normal file
@@ -0,0 +1,191 @@
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// **** LZW-AB **** //
|
||||
// Adjusted Binary LZW Compressor/Decompressor //
|
||||
// Copyright (c) 2016-2020 David Bryant //
|
||||
// All Rights Reserved //
|
||||
// Distributed under the BSD Software License (see license.txt) //
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
|
||||
#include "lzwlib.h"
|
||||
|
||||
/* This module provides a command-line filter for testing the lzw library.
|
||||
* It can also optionally calculate and display the compression ratio and
|
||||
* a simple checksum for informational purposes. Other command-line
|
||||
* arguments select decoding mode or the maximum symbol size (9 to 16 bits)
|
||||
* for encoding.
|
||||
*/
|
||||
|
||||
/* Help text written to stderr for -h and after any command-line error.
 * Declared as a const array (not a pointer) so it can never be re-pointed.
 */
static const char usage [] =
    " Usage: lzwfilter [-options] [< infile] [> outfile]\n\n"
    " Operation: compression is default, use -d to decompress\n\n"
    " Options: -d = decompress\n"
    " -h = display this \"help\" message\n"
    " -1 = maximum symbol size = 9 bits\n"
    " -2 = maximum symbol size = 10 bits\n"
    " -3 = maximum symbol size = 11 bits\n"
    " -4 = maximum symbol size = 12 bits\n"
    " -5 = maximum symbol size = 13 bits\n"
    " -6 = maximum symbol size = 14 bits\n"
    " -7 = maximum symbol size = 15 bits\n"
    " -8 = maximum symbol size = 16 bits (default)\n"
    " -v = verbose (display ratio and checksum)\n\n"
    " Web: Visit www.github.com/dbry/lzw-ab for latest version and info\n\n";
|
||||
|
||||
/* Buffered byte stream used as the context for the read_buff() and write_buff()
 * callbacks. One instance wraps stdin (reader) and another wraps stdout (writer).
 */
typedef struct {
    unsigned char buffer [65536];   // staging area between the codec and stdin/stdout
    unsigned int checksum;          // running checksum; unsigned so that the "* 3" update
                                    // wraps instead of overflowing a signed int (undefined)
    int head, tail;                 // head: next buffer index to read or write
                                    // tail: end of valid data (only used when reading)
    size_t byte_count;              // total number of bytes passed through the stream
} streamer;

/* Source callback: returns the next byte from stdin (0-255), refilling the buffer
 * with fread() whenever it has been fully consumed, or EOF once no more data can
 * be read. Each byte returned is folded into the checksum and counted.
 */
static int read_buff (void *ctx)
{
    streamer *stream = ctx;
    int value;

    if (stream->head == stream->tail) {         // buffer exhausted (or never filled)
        stream->head = 0;
        stream->tail = (int) fread (stream->buffer, 1, sizeof (stream->buffer), stdin);
    }

    if (stream->head >= stream->tail)           // fread() delivered nothing
        return EOF;

    value = stream->buffer [stream->head++];
    stream->checksum = stream->checksum * 3 + (unsigned char) value;
    stream->byte_count++;

    return value;
}

/* Destination callback: appends one byte to the buffer and writes the buffer to
 * stdout when it becomes full. Passing EOF writes out whatever is pending and
 * empties the buffer; EOF itself is neither counted nor added to the checksum.
 *
 * NOTE(review): the result of fwrite() is not checked, so a failed write to
 * stdout goes unreported (the callback has no way to return an error).
 */
static void write_buff (int value, void *ctx)
{
    streamer *stream = ctx;

    if (value == EOF) {
        fwrite (stream->buffer, 1, stream->head, stdout);
        stream->head = 0;                       // a repeated flush must not write the data again
        return;
    }

    stream->buffer [stream->head++] = (unsigned char) value;

    if (stream->head == sizeof (stream->buffer)) {
        fwrite (stream->buffer, 1, stream->head, stdout);
        stream->head = 0;
    }

    stream->checksum = stream->checksum * 3 + (unsigned char) value;
    stream->byte_count++;
}
|
||||
|
||||
int main (int argc, char **argv)
|
||||
{
|
||||
int decompress = 0, maxbits = 16, verbose = 0, error = 0;
|
||||
streamer reader, writer;
|
||||
|
||||
memset (&reader, 0, sizeof (reader));
|
||||
memset (&writer, 0, sizeof (writer));
|
||||
reader.checksum = writer.checksum = -1;
|
||||
|
||||
while (--argc) {
|
||||
if ((**++argv == '-') && (*argv)[1])
|
||||
while (*++*argv)
|
||||
switch (**argv) {
|
||||
case '1':
|
||||
maxbits = 9;
|
||||
break;
|
||||
|
||||
case '2':
|
||||
maxbits = 10;
|
||||
break;
|
||||
|
||||
case '3':
|
||||
maxbits = 11;
|
||||
break;
|
||||
|
||||
case '4':
|
||||
maxbits = 12;
|
||||
break;
|
||||
|
||||
case '5':
|
||||
maxbits = 13;
|
||||
break;
|
||||
|
||||
case '6':
|
||||
maxbits = 14;
|
||||
break;
|
||||
|
||||
case '7':
|
||||
maxbits = 15;
|
||||
break;
|
||||
|
||||
case '8':
|
||||
maxbits = 16;
|
||||
break;
|
||||
|
||||
case 'D': case 'd':
|
||||
decompress = 1;
|
||||
break;
|
||||
|
||||
case 'H': case 'h':
|
||||
fprintf (stderr, "%s", usage);
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 'V': case 'v':
|
||||
verbose = 1;
|
||||
break;
|
||||
|
||||
default:
|
||||
fprintf (stderr, "illegal option: %c !\n", **argv);
|
||||
error = 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
fprintf (stderr, "unknown argument: %s\n", *argv);
|
||||
error = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (error) {
|
||||
fprintf (stderr, "%s", usage);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
setmode (fileno (stdin), O_BINARY);
|
||||
setmode (fileno (stdout), O_BINARY);
|
||||
#endif
|
||||
|
||||
if (decompress) {
|
||||
if (lzw_decompress (write_buff, &writer, read_buff, &reader)) {
|
||||
fprintf (stderr, "lzw_decompress() returned non-zero!\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
write_buff (EOF, &writer);
|
||||
|
||||
if (verbose && writer.byte_count)
|
||||
fprintf (stderr, "output checksum = %x, ratio = %.2f%%\n", writer.checksum, reader.byte_count * 100.0 / writer.byte_count);
|
||||
}
|
||||
else {
|
||||
if (lzw_compress (write_buff, &writer, read_buff, &reader, maxbits)) {
|
||||
fprintf (stderr, "lzw_compress() returned non-zero!\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
write_buff (EOF, &writer);
|
||||
|
||||
if (verbose && reader.byte_count)
|
||||
fprintf (stderr, "source checksum = %x, ratio = %.2f%%\n", reader.checksum, writer.byte_count * 100.0 / reader.byte_count);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
513
lzw-ab/lzwlib.c
Normal file
513
lzw-ab/lzwlib.c
Normal file
@@ -0,0 +1,513 @@
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// **** LZW-AB **** //
|
||||
// Adjusted Binary LZW Compressor/Decompressor //
|
||||
// Copyright (c) 2016-2020 David Bryant //
|
||||
// All Rights Reserved //
|
||||
// Distributed under the BSD Software License (see license.txt) //
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "lzwlib.h"
|
||||
|
||||
/* This library implements the LZW general-purpose data compression algorithm.
|
||||
* The algorithm was originally described as a hardware implementation by
|
||||
* Terry Welch here:
|
||||
*
|
||||
* Welch, T.A. “A Technique for High-Performance Data Compression.”
|
||||
* IEEE Computer 17,6 (June 1984), pp. 8-19.
|
||||
*
|
||||
* Since then there have been innumerable refinements and variations on the
|
||||
* basic technique, and this implementation is no different. The target of
|
||||
* the present implementation is embedded systems, and so emphasis was placed
|
||||
* on simplicity, fast execution, and minimal RAM usage.
|
||||
*
|
||||
* This is a streaming compressor in that the data is not divided into blocks
|
||||
* and no context information like dictionaries or Huffman tables are sent
|
||||
* ahead of the compressed data (except for one byte to signal the maximum
|
||||
* bit depth). This limits the maximum possible compression ratio compared to
|
||||
* algorithms that significantly preprocess the data, but with the help of
|
||||
* some enhancements to the LZW algorithm (described below) it is able to
|
||||
* compress better than the UNIX "compress" utility (which is also LZW) and
|
||||
* is in fact closer to and sometimes beats the compression level of "gzip".
|
||||
*
|
||||
* The symbols are stored in "adjusted binary" which provides somewhat better
|
||||
* compression, with virtually no speed penalty, compared to the fixed word
|
||||
* sizes normally used. These are sometimes called "phased-in" binary codes
|
||||
* and their use in LZW is described here:
|
||||
*
|
||||
* R. N. Horspool, "Improving LZW (data compression algorithm)", Data
|
||||
* Compression Conference, pp. 332-341, 1991.
|
||||
*
|
||||
* Earlier versions of this compressor would reset as soon as the dictionary
|
||||
* became full to ensure good performance on heterogeneous data (such as tar
|
||||
* files or executable images). While trivial to implement, this is not
|
||||
* particularly efficient with homogeneous data (or in general) because we
|
||||
* spend a lot of time sending short symbols where the compression is poor.
|
||||
*
|
||||
* This newer version utilizes a technique such that once the dictionary is
|
||||
* full, we restart at the beginning and recycle only those codes that were
|
||||
* seen only once. We know this because they are not referenced by longer
|
||||
* strings, and are easy to replace in the dictionary for the same reason.
|
||||
* Since they have only been seen once it's also more likely that we will
|
||||
* be replacing them with a more common string, and this is especially
|
||||
* true if the data characteristics are changing.
|
||||
*
|
||||
* Replacing string codes in this manner has the interesting side effect that
|
||||
* some older shorter strings that the removed strings were based on will
|
||||
* possibly become unreferenced themselves and be recycled on the next pass.
|
||||
* In this way, the entire dictionary constantly "churns" based on the
|
||||
* incoming stream, thereby improving and adapting to optimal compression.
|
||||
*
|
||||
* Even with this technique there is still a possibility that a sudden change
|
||||
* in the data characteristics will appear, resulting in significant negative
|
||||
* compression (up to 100% for 16-bit codes). To detect this case we generate
|
||||
* an exponentially decaying average of the current compression ratio and reset
|
||||
* when this hits about 1.06, which limits worst case inflation to about 8%.
|
||||
*
|
||||
* The maximum symbol size is configurable on the encode side (from 9 bits to
|
||||
* 16 bits) and determines the RAM footprint required by both sides and, to a
|
||||
* large extent, the compression performance. This information is communicated
|
||||
* to the decoder in the first stream byte so that it can allocate accordingly.
|
||||
* The RAM requirements are as follows:
|
||||
*
|
||||
* maximum encoder RAM decoder RAM
|
||||
* symbol size requirement requirement
|
||||
* -----------------------------------------
|
||||
* 9-bit 4096 bytes 2368 bytes
|
||||
* 10-bit 8192 bytes 4992 bytes
|
||||
* 11-bit 16384 bytes 10240 bytes
|
||||
* 12-bit 32768 bytes 20736 bytes
|
||||
* 13-bit 65536 bytes 41728 bytes
|
||||
* 14-bit 131072 bytes 83712 bytes
|
||||
* 15-bit 262144 bytes 167680 bytes
|
||||
* 16-bit 524288 bytes 335616 bytes
|
||||
*
|
||||
* This implementation uses malloc(), but obviously an embedded version could
|
||||
* use static arrays instead if desired (assuming that the maxbits was
|
||||
* controlled outside).
|
||||
*/
|
||||
|
||||
#define NULL_CODE       65535   // indicates a NULL prefix (must be unsigned short)
#define CLEAR_CODE      256     // code to flush dictionary and restart decoder
#define FIRST_STRING    257     // code of first dictionary string

/* CODE_BITS(n) gives the number of bits required to represent the value "n",
 * not counting the implied MSB. For GNU C the count-leading-zeros built-in is
 * used (the argument must be non-zero). Otherwise a comparison tree is employed,
 * which evaluates its argument more than once and only gives the right answer
 * for values from 256 to 65535 (8 to 15 bits): anything below 512 yields 8 and
 * anything from 32768 up yields 15.
 */

#ifdef __GNUC__
#define CODE_BITS(n) (31 - __builtin_clz ((unsigned int)(n)))
#else
#define CODE_BITS(n) ((n) < 4096 ? \
    ((n) < 1024 ? 8 + ((n) >= 512) : 10 + ((n) >= 2048)) : \
    ((n) < 16384 ? 12 + ((n) >= 8192) : 14 + ((n) >= 32768)))
#endif
|
||||
|
||||
/* This macro writes the adjusted-binary symbol "code" given the maximum
|
||||
* symbol "maxcode". A macro is used here just to avoid the duplication in
|
||||
* the lzw_compress() function. The idea is that if "maxcode" is not one
|
||||
* less than a power of two (which it rarely will be) then this code can
|
||||
* often send fewer bits than would be required with a fixed-sized code.
|
||||
*
|
||||
* For example, the first code we send will have a "maxcode" of 257, so
|
||||
* every "code" would normally consume 9 bits. But with adjusted binary we
|
||||
* can actually represent any code from 0 to 253 with just 8 bits -- only
|
||||
* the 4 codes from 254 to 257 take 9 bits.
|
||||
*/
|
||||
|
||||
/* WRITE_CODE(code, maxcode) appends the adjusted-binary encoding of "code" to the
 * bit accumulator and then sends every complete byte to the output callback.
 *
 * The macro operates directly on these variables of the enclosing function:
 * "shifter" and "bits" (the bit accumulator and its fill level), "dst" and
 * "dstctx" (the output callback and its context) and "output_bytes", which is
 * advanced by 256 for every byte written. The macro's own temporaries carry a
 * "wc_" prefix so that they cannot capture identifiers used in the arguments,
 * and "code" is evaluated exactly once.
 */

#define WRITE_CODE(code,maxcode) do {                                           \
    unsigned int wc_code = (code);                                              \
    unsigned int wc_bits = CODE_BITS (maxcode);                                 \
    unsigned int wc_extras = (2 << wc_bits) - (maxcode) - 1;                    \
    if (wc_code < wc_extras) {              /* short form: wc_bits bits */      \
        shifter |= wc_code << bits;                                             \
        bits += wc_bits;                                                        \
    }                                                                           \
    else {                                  /* long form: one additional bit */ \
        unsigned int wc_adjusted = wc_code + wc_extras;                         \
        shifter |= (wc_adjusted >> 1) << bits;                                  \
        bits += wc_bits;                                                        \
        shifter |= (wc_adjusted & 1) << bits;                                   \
        bits++;                                                                 \
    }                                                                           \
    do {                                                                        \
        (*dst)(shifter, dstctx);                                                \
        shifter >>= 8;                                                          \
        output_bytes += 256;                                                    \
        bits -= 8;                                                              \
    } while (bits >= 8);                                                        \
} while (0)
|
||||
|
||||
/* LZW compression function. Bytes (8-bit) are read and written through callbacks and the
|
||||
* "maxbits" parameter specifies the maximum symbol size (9-16), which in turn determines
|
||||
* the RAM requirement and, to a large extent, the level of compression achievable. A return
|
||||
* value of EOF from the "src" callback terminates the compression process. A non-zero return
|
||||
* value indicates one of the two possible errors -- bad "maxbits" param or failed malloc().
|
||||
* There are contexts (void pointers) that are passed to the callbacks to easily facilitate
|
||||
* multiple instances of the compression operation (but simple applications can ignore these).
|
||||
*/
|
||||
|
||||
/* One entry of the encoder dictionary. Entries are linked by code value (0 means
 * "none") so that lzw_compress() can look up the strings built on a given prefix
 * and unlink an entry again when its code is recycled.
 */
typedef struct {
    unsigned short first_reference;     // first longer string built on this entry
    unsigned short next_reference;      // next string in the chain searched for the same prefix
    unsigned short back_reference;      // entry whose first/next reference points at this one
    unsigned char terminator;           // final byte of the string this entry represents
} encoder_entry_t;
|
||||
|
||||
int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx, int maxbits)
|
||||
{
|
||||
unsigned int maxcode = FIRST_STRING, next_string = FIRST_STRING, prefix = NULL_CODE, total_codes;
|
||||
unsigned int dictionary_full = 0, available_entries, max_available_entries, max_available_code;
|
||||
unsigned int input_bytes = 65536, output_bytes = 65536;
|
||||
unsigned int shifter = 0, bits = 0;
|
||||
encoder_entry_t *dictionary;
|
||||
int c;
|
||||
|
||||
if (maxbits < 9 || maxbits > 16) // check for valid "maxbits" setting
|
||||
return 1;
|
||||
|
||||
// based on the "maxbits" parameter, compute total codes and allocate dictionary storage
|
||||
|
||||
total_codes = 1 << maxbits;
|
||||
dictionary = malloc (total_codes * sizeof (encoder_entry_t));
|
||||
max_available_entries = total_codes - FIRST_STRING - 1;
|
||||
max_available_code = total_codes - 2;
|
||||
|
||||
if (!dictionary)
|
||||
return 1; // failed malloc()
|
||||
|
||||
// clear the dictionary
|
||||
|
||||
available_entries = max_available_entries;
|
||||
memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
|
||||
|
||||
(*dst)(maxbits - 9, dstctx); // first byte in output stream indicates the maximum symbol bits
|
||||
|
||||
// This is the main loop where we read input bytes and compress them. We always keep track of the
|
||||
// "prefix", which represents a pending byte (if < 256) or string entry (if >= FIRST_STRING) that
|
||||
// has not been sent to the decoder yet. The output symbols are kept in the "shifter" and "bits"
|
||||
// variables and are sent to the output every time 8 bits are available (done in the macro).
|
||||
|
||||
while ((c = (*src)(srcctx)) != EOF) {
|
||||
unsigned int cti; // coding table index
|
||||
|
||||
input_bytes += 256;
|
||||
|
||||
if (prefix == NULL_CODE) { // this only happens the very first byte when we don't yet have a prefix
|
||||
prefix = c;
|
||||
continue;
|
||||
}
|
||||
|
||||
memset (dictionary + next_string, 0, sizeof (encoder_entry_t));
|
||||
|
||||
if ((cti = dictionary [prefix].first_reference)) { // if any longer strings are built on the current prefix...
|
||||
while (1)
|
||||
if (dictionary [cti].terminator == c) { // we found a matching string, so we just update the prefix
|
||||
prefix = cti; // to that string and continue without sending anything
|
||||
break;
|
||||
}
|
||||
else if (!dictionary [cti].next_reference) { // this string did not match the new character and
|
||||
dictionary [cti].next_reference = next_string; // there aren't any more, so we'll add a new string,
|
||||
// point to it with "next_reference", and also make the
|
||||
dictionary [next_string].back_reference = cti; // "back_reference" which is used for recycling entries
|
||||
cti = 0;
|
||||
break;
|
||||
}
|
||||
else
|
||||
cti = dictionary [cti].next_reference; // there are more possible matches to check, so loop back
|
||||
}
|
||||
else { // no longer strings are based on the current prefix, so now
|
||||
dictionary [prefix].first_reference = next_string; // the current prefix plus the new byte will be the next string
|
||||
dictionary [next_string].back_reference = prefix; // also make the back_reference used for recycling
|
||||
if (prefix >= FIRST_STRING) available_entries--; // the codes 0-255 are never available for recycling
|
||||
}
|
||||
|
||||
// If "cti" is zero, we could not simply extend our "prefix" to a longer string because we did not find a
|
||||
// dictionary match, so we send the symbol representing the current "prefix" and add the new string to the
|
||||
// dictionary. Since the current byte "c" was not included in the prefix, that now becomes our new prefix.
|
||||
|
||||
if (!cti) {
|
||||
WRITE_CODE (prefix, maxcode); // send symbol for current prefix (0 to maxcode-1)
|
||||
dictionary [next_string].terminator = c; // newly created string has current byte as the terminator
|
||||
prefix = c; // current byte also becomes new prefix for next string
|
||||
|
||||
// If the dictionary is not full yet, we bump the maxcode and next_string and check to see if the
|
||||
// dictionary is now full. If it is we set the dictionary_full flag and leave maxcode set to two
|
||||
// less than total_codes because every string entry is now available for matching, but the actual
|
||||
// maximum code is reserved for EOF.
|
||||
|
||||
if (!dictionary_full) {
|
||||
dictionary_full = (++next_string > max_available_code);
|
||||
maxcode++;
|
||||
}
|
||||
|
||||
// If the dictionary is full we look for an entry to recycle starting at next_string (the one we
|
||||
// just created or recycled) plus one (with check for wrap check). We know there is one because at
|
||||
// a minimum the string we just added. This also takes care of removing the entry to be recycled
|
||||
// (which is possible/easy because no longer strings have been based on it).
|
||||
|
||||
if (dictionary_full) {
|
||||
for (next_string++; next_string <= max_available_code || (next_string = FIRST_STRING); next_string++)
|
||||
if (!dictionary [next_string].first_reference)
|
||||
break;
|
||||
|
||||
cti = dictionary [next_string].back_reference; // dictionary [cti] references the entry we're
|
||||
// trying to recycle (either as a first or a next)
|
||||
|
||||
if (dictionary [cti].first_reference == next_string) {
|
||||
dictionary [cti].first_reference = dictionary [next_string].next_reference;
|
||||
|
||||
// if we just cleared a first reference, and that string is not 0-255,
|
||||
// then that's a newly available entry
|
||||
if (!dictionary [cti].first_reference && cti >= FIRST_STRING)
|
||||
available_entries++;
|
||||
}
|
||||
else if (dictionary [cti].next_reference == next_string) // fixup a "next_reference"
|
||||
dictionary [cti].next_reference = dictionary [next_string].next_reference;
|
||||
|
||||
// If the entry we're recycling had a next reference, then update the back reference
|
||||
// so it's completely out of the chain. Of course we know it didn't have a first
|
||||
// reference because then we wouldn't be recycling it.
|
||||
|
||||
if (dictionary [next_string].next_reference)
|
||||
dictionary [dictionary [next_string].next_reference].back_reference = cti;
|
||||
|
||||
// This check is technically not needed because there will always be an available entry
|
||||
// (the last string we added at a minimum) but we don't want to get in a situation where
|
||||
// we only have a few entries that we're cycling though. I pulled the limits (16 entries
|
||||
// or 1% of total) out of a hat.
|
||||
|
||||
if (available_entries < 16 || available_entries * 100 < max_available_entries) {
|
||||
// clear the dictionary and reset the byte counters -- basically everything starts over
|
||||
// except that we keep the last pending "prefix" (which, of course, was never sent)
|
||||
|
||||
WRITE_CODE (CLEAR_CODE, maxcode);
|
||||
memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
|
||||
available_entries = max_available_entries;
|
||||
next_string = maxcode = FIRST_STRING;
|
||||
input_bytes = output_bytes = 65536;
|
||||
dictionary_full = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// This is similar to the above check, except that it's used whether the dictionary is full or not.
|
||||
// It uses an exponentially decaying average of the current compression ratio, so it can terminate
|
||||
// very early if the incoming data is uncompressible or it can terminate any later time that the
|
||||
// dictionary no longer compresses the incoming stream.
|
||||
|
||||
if (output_bytes > input_bytes + (input_bytes >> 4)) {
|
||||
WRITE_CODE (CLEAR_CODE, maxcode);
|
||||
memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
|
||||
available_entries = max_available_entries;
|
||||
next_string = maxcode = FIRST_STRING;
|
||||
input_bytes = output_bytes = 65536;
|
||||
dictionary_full = 0;
|
||||
}
|
||||
else {
|
||||
output_bytes -= output_bytes >> 8;
|
||||
input_bytes -= input_bytes >> 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// we're done with input, so if we've received anything we still need to send that pesky pending prefix...
|
||||
|
||||
if (prefix != NULL_CODE) {
|
||||
WRITE_CODE (prefix, maxcode);
|
||||
|
||||
if (!dictionary_full)
|
||||
maxcode++;
|
||||
}
|
||||
|
||||
WRITE_CODE (maxcode, maxcode); // the maximum possible code is always reserved for our END_CODE
|
||||
|
||||
if (bits) // finally, flush any pending bits from the shifter
|
||||
(*dst)(shifter, dstctx);
|
||||
|
||||
free (dictionary);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* LZW decompression function. Bytes (8-bit) are read and written through callbacks. The
|
||||
* "maxbits" parameter is read as the first byte in the stream and controls how much memory
|
||||
* is allocated for decoding. A return value of EOF from the "src" callback terminates the
|
||||
* decompression process (although this should not normally occur). A non-zero return value
|
||||
* indicates an error, which in this case can be a bad "maxbits" read from the stream, a
|
||||
* failed malloc(), or if an EOF is read from the input stream before the decompression
|
||||
* terminates naturally with END_CODE. There are contexts (void pointers) that are passed
|
||||
* to the callbacks to easily facilitate multiple instances of the decompression operation
|
||||
* (but simple applications can ignore these).
|
||||
*/
|
||||
|
||||
/* One entry of the decoder dictionary. The entries for the codes 0-255 are set
 * up once by lzw_decompress() with a prefix of NULL_CODE and the byte value
 * itself as the terminator.
 */
typedef struct {
    unsigned char terminator;           // byte stored for this entry
    unsigned char extra_references;     // count of "extra" references (beyond one) to this entry
    unsigned short prefix;              // code this entry is built on, or NULL_CODE for 0-255
} decoder_entry_t;
|
||||
|
||||
int lzw_decompress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx)
|
||||
{
|
||||
unsigned int maxcode = FIRST_STRING, next_string = FIRST_STRING - 1, prefix = CLEAR_CODE;
|
||||
unsigned int dictionary_full = 0, max_available_code, total_codes;
|
||||
unsigned int shifter = 0, bits = 0, read_byte, i;
|
||||
unsigned char *reverse_buffer, *referenced;
|
||||
decoder_entry_t *dictionary;
|
||||
|
||||
if ((read_byte = ((*src)(srcctx))) == EOF || (read_byte & 0xf8)) //sanitize first byte
|
||||
return 1;
|
||||
|
||||
// based on the "maxbits" parameter, compute total codes and allocate dictionary storage
|
||||
|
||||
total_codes = 512 << (read_byte & 0x7);
|
||||
max_available_code = total_codes - 2;
|
||||
dictionary = malloc (total_codes * sizeof (decoder_entry_t));
|
||||
reverse_buffer = malloc (total_codes - 256);
|
||||
referenced = malloc (total_codes / 8); // bitfield indicating code is referenced at least once
|
||||
|
||||
// Note that to implement the dictionary entry recycling we have to keep track of how many
|
||||
// longer strings are based on each string in the dictionary. This can be between 0 (no
|
||||
// references) to 256 (every possible next byte), but unfortunately that's one more value
|
||||
// than what can be stored in a byte. The solution is to have a single bit for each entry
|
||||
// indicating any references (i.e., the code cannot be recycled) and an additional byte
|
||||
// in the dictionary entry struct counting the "extra" references (beyond one).
|
||||
|
||||
if (!reverse_buffer || !dictionary) // check for malloc() failure
|
||||
return 1;
|
||||
|
||||
for (i = 0; i < 256; ++i) { // these never change
|
||||
dictionary [i].prefix = NULL_CODE;
|
||||
dictionary [i].terminator = i;
|
||||
}
|
||||
|
||||
// This is the main loop where we read input symbols. The values range from 0 to the code value
|
||||
// of the "next" string in the dictionary (although the actual "next" code cannot be used yet,
|
||||
// and so we reserve that code for the END_CODE). Note that receiving an EOF from the input
|
||||
// stream is actually an error because we should have gotten the END_CODE first.
|
||||
|
||||
while (1) {
|
||||
unsigned int code_bits = CODE_BITS (maxcode), code;
|
||||
unsigned int extras = (2 << code_bits) - maxcode - 1;
|
||||
|
||||
do {
|
||||
if ((read_byte = ((*src)(srcctx))) == EOF) {
|
||||
free (dictionary); free (reverse_buffer); free (referenced);
|
||||
return 1;
|
||||
}
|
||||
|
||||
shifter |= read_byte << bits;
|
||||
} while ((bits += 8) < code_bits);
|
||||
|
||||
// first we assume the code will fit in the minimum number of required bits
|
||||
|
||||
code = shifter & ((1 << code_bits) - 1);
|
||||
shifter >>= code_bits;
|
||||
bits -= code_bits;
|
||||
|
||||
// but if code >= extras, then we need to read another bit to calculate the real code
|
||||
// (this is the "adjusted binary" part)
|
||||
|
||||
if (code >= extras) {
|
||||
if (!bits) {
|
||||
if ((read_byte = ((*src)(srcctx))) == EOF) {
|
||||
free (dictionary); free (reverse_buffer); free (referenced);
|
||||
return 1;
|
||||
}
|
||||
|
||||
shifter = read_byte;
|
||||
bits = 8;
|
||||
}
|
||||
|
||||
code = (code << 1) - extras + (shifter & 1);
|
||||
shifter >>= 1;
|
||||
bits--;
|
||||
}
|
||||
|
||||
if (code == maxcode) // sending the maximum code is reserved for the end of the file
|
||||
break;
|
||||
else if (code == CLEAR_CODE) { // otherwise check for a CLEAR_CODE to start over early
|
||||
next_string = FIRST_STRING - 1;
|
||||
maxcode = FIRST_STRING;
|
||||
dictionary_full = 0;
|
||||
}
|
||||
else if (prefix == CLEAR_CODE) { // this only happens at the first symbol which is always sent
|
||||
(*dst)(code, dstctx); // literally and becomes our initial prefix
|
||||
next_string++;
|
||||
maxcode++;
|
||||
}
|
||||
// Otherwise we have a valid prefix so we step through the string from end to beginning storing the
|
||||
// bytes in the "reverse_buffer", and then we send them out in the proper order. One corner-case
|
||||
// we have to handle here is that the string might be the same one that is actually being defined
|
||||
// now (code == next_string).
|
||||
else {
|
||||
unsigned int cti = (code == next_string) ? prefix : code;
|
||||
unsigned char *rbp = reverse_buffer, c;
|
||||
|
||||
do {
|
||||
*rbp++ = dictionary [cti].terminator;
|
||||
if (rbp == reverse_buffer + total_codes - 256) {
|
||||
free (dictionary); free (reverse_buffer); free (referenced);
|
||||
return 1;
|
||||
}
|
||||
} while ((cti = dictionary [cti].prefix) != NULL_CODE);
|
||||
|
||||
c = *--rbp; // the first byte in this string is the terminator for the last string, which is
|
||||
// the one that we'll create a new dictionary entry for this time
|
||||
|
||||
do // send string in corrected order (except for the terminator which we don't know yet)
|
||||
(*dst)(*rbp, dstctx);
|
||||
while (rbp-- != reverse_buffer);
|
||||
|
||||
if (code == next_string) {
|
||||
(*dst)(c,dstctx);
|
||||
}
|
||||
|
||||
// This should always execute (the conditional is to catch corruptions) and is where we add a new string to
|
||||
// the dictionary, either at the end or elsewhere when we are "recycling" entries that were never referenced
|
||||
|
||||
if (next_string >= FIRST_STRING && next_string < total_codes) {
|
||||
if (referenced [prefix >> 3] & (1 << (prefix & 7))) // increment reference count on prefix
|
||||
dictionary [prefix].extra_references++;
|
||||
else
|
||||
referenced [prefix >> 3] |= 1 << (prefix & 7);
|
||||
|
||||
dictionary [next_string].prefix = prefix; // now update the next dictionary entry with the new string
|
||||
dictionary [next_string].terminator = c; // (but we're always one behind, so it's not the string just sent)
|
||||
dictionary [next_string].extra_references = 0; // newly created string has not been referenced
|
||||
referenced [next_string >> 3] &= ~(1 << (next_string & 7));
|
||||
}
|
||||
|
||||
// If the dictionary is not full yet, we bump the maxcode and next_string and check to see if the
|
||||
// dictionary is now full. If it is we set the dictionary_full flag and set next_string back to the
|
||||
// beginning of the dictionary strings to start recycling them. Note that then maxcode will remain
|
||||
// two less than total_codes because every string entry is available for matching, and the actual
|
||||
// maximum code is reserved for EOF.
|
||||
|
||||
if (!dictionary_full) {
|
||||
maxcode++;
|
||||
|
||||
if (++next_string > max_available_code) {
|
||||
dictionary_full = 1;
|
||||
maxcode--;
|
||||
}
|
||||
}
|
||||
|
||||
// If the dictionary is full we look for an entry to recycle starting at next_string (the one we
|
||||
// created or recycled) plus one. We know there is one because at a minimum the string we just added
|
||||
// has not been referenced). This also takes care of removing the entry to be recycled (which is
|
||||
// possible/easy because no longer strings have been based on it).
|
||||
|
||||
if (dictionary_full) {
|
||||
for (next_string++; next_string <= max_available_code || (next_string = FIRST_STRING); next_string++)
|
||||
if (!(referenced [next_string >> 3] & (1 << (next_string & 7))))
|
||||
break;
|
||||
|
||||
if (dictionary [dictionary [next_string].prefix].extra_references)
|
||||
dictionary [dictionary [next_string].prefix].extra_references--;
|
||||
else
|
||||
referenced [dictionary [next_string].prefix >> 3] &= ~(1 << (dictionary [next_string].prefix & 7));
|
||||
}
|
||||
}
|
||||
|
||||
prefix = code; // the code we just received becomes the prefix for the next dictionary string entry
|
||||
// (which we'll create once we find out the terminator)
|
||||
}
|
||||
|
||||
free (dictionary); free (reverse_buffer); free (referenced);
|
||||
return 0;
|
||||
}
|
15
lzw-ab/lzwlib.h
Normal file
15
lzw-ab/lzwlib.h
Normal file
@@ -0,0 +1,15 @@
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// **** LZW-AB **** //
|
||||
// Adjusted Binary LZW Compressor/Decompressor //
|
||||
// Copyright (c) 2016-2020 David Bryant //
|
||||
// All Rights Reserved //
|
||||
// Distributed under the BSD Software License (see license.txt) //
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef LZWLIB_H_
|
||||
#define LZWLIB_H_
|
||||
|
||||
int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx, int maxbits);
|
||||
int lzw_decompress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx);
|
||||
|
||||
#endif /* LZWLIB_H_ */
|
317
lzw-ab/lzwtester.c
Normal file
317
lzw-ab/lzwtester.c
Normal file
@@ -0,0 +1,317 @@
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// **** LZW-AB **** //
|
||||
// Adjusted Binary LZW Compressor/Decompressor //
|
||||
// Copyright (c) 2016-2020 David Bryant //
|
||||
// All Rights Reserved //
|
||||
// Distributed under the BSD Software License (see license.txt) //
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include "lzwlib.h"
|
||||
|
||||
/* This module provides a command-line test harness for the lzw library.
|
||||
* Given a list of files, it will read each one and byte-for-byte verify
|
||||
* the data after a round-trip through a compression / decompression cycle
|
||||
* at each of the 8 available maximum symbol size settings.
|
||||
*
|
||||
* It can also optionally perform fuzz testing by randomly corrupting the
|
||||
* compressed bitstream. Obviously this will introduce integrity failures,
|
||||
* but it should not cause a crash. It also has an "exhaustive" mode that
|
||||
* creates hundreds of simulated images from each input file by successive
|
||||
* truncation from both ends.
|
||||
*/
|
||||
|
||||
/* Help text printed by main() when no arguments are supplied. */
static const char *usage =
    " Usage: lzwtester [options] file [...]\n"
    "\n"
    " Options: -1 ... -8 = test using only specified max symbol size (9 - 16)\n"
    " -0 = cycle through all maximum symbol sizes (default)\n"
    " -e = exhaustive test (by successive truncation)\n"
    " -f = fuzz test (randomly corrupt compressed data)\n"
    " -q = quiet mode (only reports errors and summary)\n"
    "\n"
    " Web: Visit www.github.com/dbry/lzw-ab for latest version and info\n"
    "\n";
|
||||
|
||||
/* In-memory byte stream handed (as the context pointer) to the read, write and
 * verify callbacks below.
 */
typedef struct {
    unsigned int size;          /* number of bytes addressable through "buffer" */
    unsigned int index;         /* current position within "buffer" */
    unsigned int wrapped;       /* bumped whenever a callback is invoked with index == size */
    unsigned int byte_errors;   /* mismatching bytes counted by check_buff() */
    unsigned int first_error;   /* index at which the first mismatch was seen */
    unsigned int fuzz_testing;  /* non-zero lets write_buff() corrupt occasional bytes */
    unsigned char *buffer;      /* the data itself (not owned by the struct) */
} streamer;
|
||||
|
||||
static int read_buff (void *ctx)
|
||||
{
|
||||
streamer *stream = ctx;
|
||||
|
||||
if (stream->index == stream->size)
|
||||
return EOF;
|
||||
|
||||
return stream->buffer [stream->index++];
|
||||
}
|
||||
|
||||
static void write_buff (int value, void *ctx)
|
||||
{
|
||||
streamer *stream = ctx;
|
||||
|
||||
// for fuzz testing, randomly corrupt 1 byte in every 65536 (on average)
|
||||
|
||||
if (stream->fuzz_testing) {
|
||||
static unsigned long long kernel = 0x3141592653589793;
|
||||
kernel = ((kernel << 4) - kernel) ^ 1;
|
||||
kernel = ((kernel << 4) - kernel) ^ 1;
|
||||
kernel = ((kernel << 4) - kernel) ^ 1;
|
||||
|
||||
if (!(kernel >> 48))
|
||||
value ^= (int)(kernel >> 40);
|
||||
}
|
||||
|
||||
if (stream->index == stream->size) {
|
||||
stream->index = 0;
|
||||
stream->wrapped++;
|
||||
}
|
||||
|
||||
stream->buffer [stream->index++] = value;
|
||||
}
|
||||
|
||||
static void check_buff (int value, void *ctx)
|
||||
{
|
||||
streamer *stream = ctx;
|
||||
|
||||
if (stream->index == stream->size) {
|
||||
stream->wrapped++;
|
||||
return;
|
||||
}
|
||||
|
||||
if (stream->buffer [stream->index] != value)
|
||||
if (!stream->byte_errors++)
|
||||
stream->first_error = stream->index;
|
||||
|
||||
stream->index++;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
long long DoGetFileSize (FILE *hFile)
|
||||
{
|
||||
LARGE_INTEGER Size;
|
||||
HANDLE fHandle;
|
||||
|
||||
if (hFile == NULL)
|
||||
return 0;
|
||||
|
||||
fHandle = (HANDLE)_get_osfhandle(_fileno(hFile));
|
||||
if (fHandle == INVALID_HANDLE_VALUE)
|
||||
return 0;
|
||||
|
||||
Size.u.LowPart = GetFileSize(fHandle, &Size.u.HighPart);
|
||||
|
||||
if (Size.u.LowPart == INVALID_FILE_SIZE && GetLastError() != NO_ERROR)
|
||||
return 0;
|
||||
|
||||
return (long long)Size.QuadPart;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* POSIX variant: returns the size in bytes of an open regular file, or 0 when
 * the stream is NULL, fstat() fails, or the stream is not a regular file.
 */
long long DoGetFileSize (FILE *hFile)
{
    struct stat info;

    if (hFile == NULL)
        return 0;

    if (fstat (fileno (hFile), &info) != 0)
        return 0;

    if (!S_ISREG(info.st_mode))
        return 0;

    return (long long) info.st_size;
}
|
||||
|
||||
#endif
|
||||
|
||||
int main (int argc, char **argv)
|
||||
{
|
||||
int index, checked = 0, tests = 0, skipped = 0, errors = 0;
|
||||
int set_maxbits = 0, quiet_mode = 0, exhaustive_mode = 0;
|
||||
long long total_input_bytes = 0, total_output_bytes = 0;
|
||||
streamer reader, writer, checker;
|
||||
|
||||
memset (&reader, 0, sizeof (reader));
|
||||
memset (&writer, 0, sizeof (writer));
|
||||
memset (&checker, 0, sizeof (checker));
|
||||
|
||||
if (argc < 2) {
|
||||
printf ("%s", usage);
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (index = 1; index < argc; ++index) {
|
||||
const char *filename = argv [index];
|
||||
int test_size, bytes_read, maxbits;
|
||||
unsigned char *file_buffer;
|
||||
long long file_size;
|
||||
FILE *infile;
|
||||
|
||||
if (!strcmp (filename, "-q")) {
|
||||
quiet_mode = 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!strcmp (filename, "-e")) {
|
||||
exhaustive_mode = 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!strcmp (filename, "-f")) {
|
||||
writer.fuzz_testing = 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strlen (filename) == 2 && filename [0] == '-' && filename [1] >= '0' && filename [1] <= '8') {
|
||||
if (filename [1] > '0')
|
||||
set_maxbits = filename [1] - '0' + 8;
|
||||
else
|
||||
set_maxbits = 0;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
infile = fopen (filename, "rb");
|
||||
|
||||
if (!infile) {
|
||||
printf ("\ncan't open file %s!\n", filename);
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
file_size = DoGetFileSize (infile);
|
||||
|
||||
if (!file_size) {
|
||||
printf ("\ncan't get file size of %s (may be zero)!\n", filename);
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (file_size > 1024LL * 1024LL * 1024LL) {
|
||||
printf ("\nfile %s is too big!\n", filename);
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
file_buffer = malloc (file_size);
|
||||
writer.size = (unsigned int)(file_size * 2 + 10);
|
||||
writer.buffer = malloc (writer.size);
|
||||
|
||||
if (!file_buffer || !writer.buffer) {
|
||||
printf ("\nfile %s is too big!\n", filename);
|
||||
if (writer.buffer) free (writer.buffer);
|
||||
if (file_buffer) free (file_buffer);
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
bytes_read = fread (file_buffer, 1, (int) file_size, infile);
|
||||
fclose (infile);
|
||||
|
||||
if (bytes_read != (int) file_size) {
|
||||
printf ("\nfile %s could not be read!\n", filename);
|
||||
free (writer.buffer);
|
||||
free (file_buffer);
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!quiet_mode)
|
||||
printf ("\n");
|
||||
|
||||
test_size = file_size;
|
||||
checked++;
|
||||
|
||||
do {
|
||||
for (maxbits = set_maxbits ? set_maxbits : 9; maxbits <= (set_maxbits ? set_maxbits : 16); ++maxbits) {
|
||||
int res, got_error = 0;
|
||||
|
||||
reader.buffer = file_buffer + (file_size - test_size) / 2;
|
||||
reader.size = test_size;
|
||||
|
||||
reader.index = writer.index = writer.wrapped = 0;
|
||||
|
||||
if (lzw_compress (write_buff, &writer, read_buff, &reader, maxbits)) {
|
||||
printf ("\nlzw_compress() returned error on file %s, maxbits = %d\n", filename, maxbits);
|
||||
errors++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (writer.wrapped) {
|
||||
printf ("\nover 100%% inflation on file %s, maxbits = %d!\n", filename, maxbits);
|
||||
errors++;
|
||||
continue;
|
||||
}
|
||||
|
||||
checker.buffer = reader.buffer;
|
||||
checker.size = reader.size;
|
||||
checker.wrapped = checker.byte_errors = checker.index = 0;
|
||||
|
||||
reader.buffer = writer.buffer;
|
||||
reader.size = writer.index;
|
||||
reader.index = 0;
|
||||
|
||||
res = lzw_decompress (check_buff, &checker, read_buff, &reader);
|
||||
|
||||
reader.buffer = checker.buffer;
|
||||
reader.size = checker.size;
|
||||
|
||||
got_error = res || checker.index != checker.size || checker.wrapped || checker.byte_errors;
|
||||
|
||||
if (!quiet_mode || got_error)
|
||||
printf ("file %s, maxbits = %2d: %u bytes --> %u bytes, %.2f%%\n", filename, maxbits,
|
||||
reader.size, writer.index, writer.index * 100.0 / reader.size);
|
||||
|
||||
if (got_error) {
|
||||
if (res)
|
||||
printf ("decompressor returned an error\n");
|
||||
|
||||
if (!checker.index)
|
||||
printf ("decompression didn't generate any data\n");
|
||||
else if (checker.index != checker.size)
|
||||
printf ("decompression terminated %u bytes early\n", checker.size - checker.index);
|
||||
else if (checker.wrapped)
|
||||
printf ("decompression generated %u extra bytes\n", checker.wrapped);
|
||||
|
||||
if (checker.byte_errors)
|
||||
printf ("there were %u byte data errors starting at index %u\n",
|
||||
checker.byte_errors, checker.first_error);
|
||||
else if (checker.index != checker.size || checker.wrapped)
|
||||
printf ("(but the data generated was all correct)\n");
|
||||
|
||||
printf ("\n");
|
||||
errors++;
|
||||
}
|
||||
else {
|
||||
total_input_bytes += reader.size;
|
||||
total_output_bytes += writer.index;
|
||||
}
|
||||
|
||||
tests++;
|
||||
|
||||
if (exhaustive_mode)
|
||||
test_size -= (test_size + 98) / 100;
|
||||
}
|
||||
|
||||
} while (exhaustive_mode && test_size > 1 && test_size > file_size / 100);
|
||||
|
||||
free (writer.buffer);
|
||||
free (file_buffer);
|
||||
}
|
||||
|
||||
if (errors)
|
||||
printf ("\n***** %d errors detected in %d tests using %d files (%d skipped) *****\n\n", errors, tests, checked, skipped);
|
||||
else {
|
||||
printf ("\nsuccessfully ran %d tests using %d files (%d skipped) with no errors detected\n", tests, checked, skipped);
|
||||
printf ("cumulative results: %llu bytes --> %llu bytes, %.2f%%\n\n", total_input_bytes, total_output_bytes,
|
||||
total_output_bytes * 100.0 / total_input_bytes);
|
||||
}
|
||||
|
||||
return errors;
|
||||
}
|
BIN
other/lzw-ab-master.zip
Normal file
BIN
other/lzw-ab-master.zip
Normal file
Binary file not shown.
15
zl77/Makefile
Normal file
15
zl77/Makefile
Normal file
@@ -0,0 +1,15 @@
|
||||
|
||||
|
||||
CC = gcc

# SRCS = $(wildcard *.c)
SRCS = zl77.c ../huffman/huffman_.c

STR = $(subst from,to,from your heart)

# "all" and "clean" are commands, not files: without .PHONY a file with either
# name in this directory would make the rule appear up to date and never run.
.PHONY: all clean

all:
	$(CC) $(SRCS) -o hello

clean:
	rm -rf *.exe
|
362
zl77/zl77.c
Normal file
362
zl77/zl77.c
Normal file
@@ -0,0 +1,362 @@
|
||||
|
||||
|
||||
#include "zl77.h"
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "string.h"
|
||||
#include "../huffman/huffman_.h"
|
||||
|
||||
// Implementation of the zl77 algorithm
|
||||
|
||||
#define DBG_WARN printf
|
||||
#define DBG_LOG printf
|
||||
|
||||
|
||||
|
||||
// Step size of the data buffers (number of bytes held by each chunk)
|
||||
#define LZ77_BUFF_STEP_SIZE 10
|
||||
|
||||
|
||||
typedef struct _buff_item{
|
||||
uint8_t data[LZ77_BUFF_STEP_SIZE];
|
||||
struct _buff_item *next;
|
||||
struct _buff_item *prev;
|
||||
}buff_item;
|
||||
|
||||
|
||||
typedef struct _buff_def{
|
||||
buff_item *current;
|
||||
buff_item *head;
|
||||
int used;
|
||||
int all;
|
||||
int current_index;
|
||||
int bit_used;
|
||||
}buff_def;
|
||||
|
||||
|
||||
|
||||
typedef struct _zl77_def
|
||||
{
|
||||
int dict_len;// 字典长度
|
||||
int tran_len;// 转换区长度
|
||||
int index;// 窗口位置
|
||||
buff_def buff_chars;//字符编码区
|
||||
buff_def buff_pos;//标号编码区
|
||||
buff_def buff_bits;//编码类型标识区(1,字符;0,标号)
|
||||
const uint8_t *in;
|
||||
int in_len;
|
||||
uint8_t cmp_pos;// 匹配到的pos距离
|
||||
uint8_t cmp_len;// 匹配到的长度
|
||||
uint8_t cmp_skip;// 窗口移动的距离
|
||||
}zl77_def;
|
||||
|
||||
|
||||
|
||||
uint8_t zl77_buff_get_byte(buff_def *buff,int index);
|
||||
void zl77_buff_set_byte(buff_def *buff,int index,uint8_t d);
|
||||
void zl77_buff_append_bit(buff_def *buff,int bit);
|
||||
void zl77_buff_append_byte(buff_def *buff, const uint8_t d);
|
||||
int zl77_buff_get_bit(buff_def *buff, int index);
|
||||
|
||||
|
||||
|
||||
zl77_def *zl77_creat(void)
|
||||
{
|
||||
zl77_def *z=calloc(1,sizeof(zl77_def));
|
||||
z->dict_len=5;
|
||||
z->tran_len=3;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// 删除缓存
|
||||
void zl77_del_buff(buff_def *buff)
|
||||
{
|
||||
buff_item *t=buff->head;
|
||||
buff_item *o;
|
||||
while(t){
|
||||
o=t;
|
||||
t=t->next;
|
||||
free(o);
|
||||
}
|
||||
}
|
||||
|
||||
// 添加一个字节
|
||||
void zl77_buff_append_byte(buff_def *buff, const uint8_t d)
|
||||
{
|
||||
if(buff->used>=buff->all){
|
||||
buff_item *t=buff->head;
|
||||
buff_item *t_old=0;
|
||||
while (t)
|
||||
{
|
||||
t_old=t;
|
||||
t=t->next;
|
||||
}
|
||||
t=calloc(1,sizeof(buff_item));
|
||||
if(t_old){
|
||||
t_old->next=t;
|
||||
t->prev=t_old;
|
||||
}else{
|
||||
buff->head=t;
|
||||
}
|
||||
buff->all+=LZ77_BUFF_STEP_SIZE;
|
||||
buff->current=t;
|
||||
buff->current_index=buff->used;
|
||||
}
|
||||
while((buff->used/LZ77_BUFF_STEP_SIZE)>(buff->current_index/LZ77_BUFF_STEP_SIZE)){
|
||||
buff->current=buff->current->next;
|
||||
buff->current_index+=LZ77_BUFF_STEP_SIZE;
|
||||
}
|
||||
buff->current->data[buff->used%LZ77_BUFF_STEP_SIZE]=d;
|
||||
buff->used++;
|
||||
}
|
||||
|
||||
// 添加一个位
|
||||
void zl77_buff_append_bit(buff_def *buff,int bit)
|
||||
{
|
||||
if(buff->bit_used/8>=buff->used){
|
||||
zl77_buff_append_byte(buff,0);
|
||||
}
|
||||
uint8_t d=zl77_buff_get_byte(buff,buff->bit_used/8);
|
||||
d|=bit<<(buff->bit_used%8);
|
||||
zl77_buff_set_byte(buff,-1,d);
|
||||
buff->bit_used++;
|
||||
}
|
||||
|
||||
// 调整最近使用的缓冲区
|
||||
static void zl77_buff_adjust_current(buff_def *buff,int index){
|
||||
while((index/LZ77_BUFF_STEP_SIZE)>(buff->current_index/LZ77_BUFF_STEP_SIZE)){
|
||||
buff->current=buff->current->next;
|
||||
buff->current_index+=LZ77_BUFF_STEP_SIZE;
|
||||
}
|
||||
while((index/LZ77_BUFF_STEP_SIZE)<(buff->current_index/LZ77_BUFF_STEP_SIZE)){
|
||||
buff->current=buff->current->prev;
|
||||
buff->current_index-=LZ77_BUFF_STEP_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// 获取指定字节
|
||||
uint8_t zl77_buff_get_byte(buff_def *buff,int index){
|
||||
if(index<0) index=buff->used+index;
|
||||
if(index>=buff->used||index<0) return 0;
|
||||
zl77_buff_adjust_current(buff,index);
|
||||
return buff->current->data[index%LZ77_BUFF_STEP_SIZE];
|
||||
}
|
||||
|
||||
// 设置指定字节
|
||||
void zl77_buff_set_byte(buff_def *buff,int index,uint8_t d){
|
||||
if(index<0) index=buff->used+index;
|
||||
if(index>=buff->used||index<0) return ;
|
||||
zl77_buff_adjust_current(buff,index);
|
||||
buff->current->data[index%LZ77_BUFF_STEP_SIZE]=d;
|
||||
}
|
||||
|
||||
|
||||
// 获取指定位
|
||||
int zl77_buff_get_bit(buff_def *buff, int index){
|
||||
uint8_t d=zl77_buff_get_byte(buff,index/8);
|
||||
return (d&(1<<(index%8)))?1:0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Debug helper: log every stored byte of the buffer as two hex digits.
void zl77_buff_print(buff_def *buff)
{
    int i=0;
    DBG_LOG("buff:[");
    while(i<buff->used){
        DBG_LOG("%02x ",zl77_buff_get_byte(buff,i));
        i++;
    }
    DBG_LOG("]\n");
}
|
||||
|
||||
|
||||
|
||||
// Read input byte "index"; positions outside the input read as 0.
static uint8_t zl77_get_char(zl77_def *z,int index)
{
    return (index>=0&&index<z->in_len)?z->in[index]:0;
}
|
||||
|
||||
// 比对,找到了返回0没找到返回1
|
||||
// 0记录标号,1记录原始数据
|
||||
static int zl77_cmp(zl77_def *z,int index){
|
||||
uint8_t pos=0;
|
||||
uint8_t len=0;
|
||||
// DBG_LOG("index=%d\n",index);
|
||||
for(int i=z->dict_len;i>0;i--){
|
||||
if(zl77_get_char(z,index-i)==zl77_get_char(z,index)){
|
||||
pos=i;
|
||||
len=0;
|
||||
for(int j=0;j<i;j++){
|
||||
if(zl77_get_char(z,index-i+j)==zl77_get_char(z,index+j))
|
||||
{
|
||||
// DBG_LOG("%c|%c \n",zl77_get_char(z,index-i+j),zl77_get_char(z,index+j));
|
||||
len++;
|
||||
if(len>z->cmp_len){
|
||||
z->cmp_len=len;
|
||||
z->cmp_pos=pos;
|
||||
}
|
||||
}else{
|
||||
len=0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if((pos|len)==0){
|
||||
z->cmp_skip=1;
|
||||
return 1;
|
||||
}
|
||||
else{
|
||||
// for(int i=0;i<z->cmp_len;i++){
|
||||
// DBG_LOG("%02x|%02x ",zl77_get_char(z,index-z->cmp_pos+i),zl77_get_char(z,index+i));
|
||||
// }
|
||||
z->cmp_skip=z->cmp_len;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
static inline void zl77_append_u32(uint8_t *data,int *index,uint32_t value){
|
||||
data[(*index)++]=value&0xff;
|
||||
data[(*index)++]=(value>>8)&0xff;
|
||||
data[(*index)++]=(value>>16)&0xff;
|
||||
data[(*index)++]=(value>>24)&0xff;
|
||||
}
|
||||
|
||||
// Read four little-endian bytes starting at data[index] as a 32-bit value.
static inline uint32_t zl77_get_u32(const uint8_t *data,int index){
    uint32_t ret=0;
    for(int i=0;i<4;i++){
        // widen before shifting: a uint8_t promotes to signed int, and shifting
        // a byte >= 0x80 left by 24 would overflow it (undefined behaviour)
        ret|=(uint32_t)data[index+i]<<(8*i);
    }
    return ret;
}
|
||||
|
||||
|
||||
int zl77_encode(const uint8_t *in,const int in_len,uint8_t **out,int *out_len)
|
||||
{
|
||||
int ret;
|
||||
zl77_def *z=zl77_creat();
|
||||
z->in=in;
|
||||
z->in_len=in_len;
|
||||
for(int i=0;i<z->in_len;){
|
||||
z->cmp_pos=0;
|
||||
z->cmp_len=0;
|
||||
ret=zl77_cmp(z,i);
|
||||
if(ret){
|
||||
zl77_buff_append_byte(&z->buff_chars,zl77_get_char(z,i));
|
||||
// DBG_LOG("char(%c);",zl77_get_char(z,i));
|
||||
}else{
|
||||
zl77_buff_append_byte(&z->buff_pos,((z->cmp_pos&0xf)<<4)|(z->cmp_len&0xf));
|
||||
// DBG_LOG("pos(%d,%d);",z->cmp_pos,z->cmp_len);
|
||||
if((z->cmp_pos|z->cmp_len)==0){
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
zl77_buff_append_bit(&z->buff_bits,ret);
|
||||
i+=z->cmp_skip;
|
||||
}
|
||||
// DBG_LOG("\n");
|
||||
// zl77_buff_print(&z->buff_chars);
|
||||
// zl77_buff_print(&z->buff_pos);
|
||||
// zl77_buff_print(&z->buff_bits);
|
||||
uint32_t size_chars=z->buff_chars.used;
|
||||
uint32_t size_pos=z->buff_pos.used;
|
||||
uint32_t size_bits=z->buff_bits.used;
|
||||
uint32_t size_unpack=z->in_len;
|
||||
int index=0;
|
||||
(*out_len)=16+size_chars+size_pos+size_bits;
|
||||
(*out)=calloc(*out_len,sizeof(uint8_t));
|
||||
zl77_append_u32(*out,&index,size_chars);
|
||||
zl77_append_u32(*out,&index,size_pos);
|
||||
zl77_append_u32(*out,&index,size_bits);
|
||||
zl77_append_u32(*out,&index,size_unpack);
|
||||
for(int i=0;i<size_chars;i++){
|
||||
(*out)[index++]=zl77_buff_get_byte(&z->buff_chars,i);
|
||||
}
|
||||
for(int i=0;i<size_pos;i++){
|
||||
(*out)[index++]=zl77_buff_get_byte(&z->buff_pos,i);
|
||||
}
|
||||
for(int i=0;i<size_bits;i++){
|
||||
(*out)[index++]=zl77_buff_get_byte(&z->buff_bits,i);
|
||||
}
|
||||
zl77_del_buff(&z->buff_chars);
|
||||
zl77_del_buff(&z->buff_pos);
|
||||
zl77_del_buff(&z->buff_bits);
|
||||
free(z);
|
||||
|
||||
DBG_LOG("in_len=%d,out_len=%d\n",in_len,*out_len);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// Return bit "index" of a packed bit array (bit 0 is the least significant bit
// of data[0]) as 1 or 0.
static inline int zl77_get_bit(const uint8_t *data,int index){
    const uint8_t holder=data[index/8];
    return (holder>>(index%8))&1;
}
|
||||
|
||||
// Decompress a stream produced by zl77_encode().
// On success returns 0, stores a newly allocated buffer in *out (the caller
// frees it; one extra zero byte follows the data) and the unpacked length in
// *out_len. On invalid arguments, a truncated or inconsistent stream, or
// allocation failure returns -1 with *out == NULL and *out_len == 0.
int zl77_decode(const uint8_t* in, const int in_len, uint8_t** out, int* out_len)
{
    if(!out||!out_len){
        return -1;
    }
    *out=NULL;
    *out_len=0;
    // the header alone is four u32 fields
    if(!in||in_len<16){
        return -1;
    }
    const uint32_t size_chars=zl77_get_u32(in,0);
    const uint32_t size_pos=zl77_get_u32(in,4);
    const uint32_t size_bits=zl77_get_u32(in,8);
    const uint32_t size_unpack=zl77_get_u32(in,12);

    // the three areas must lie inside the input, and the unpacked size must be
    // representable in *out_len
    const uint64_t payload=(uint64_t)size_chars+size_pos+size_bits;
    if(payload>(uint64_t)(in_len-16)||size_unpack>=(uint32_t)INT32_MAX){
        return -1;
    }
    const uint8_t *chars=in+16;
    const uint8_t *pos=chars+size_chars;
    const uint8_t *bits=pos+size_pos;

    uint8_t *dst=calloc((size_t)size_unpack+1,sizeof(uint8_t));
    if(!dst){
        return -1;
    }
    uint32_t produced=0;
    uint32_t index_chars=0;
    uint32_t index_pos=0;
    uint64_t index_bits=0;
    while(produced<size_unpack){
        if(index_bits/8>=size_bits){
            goto corrupt;   // ran out of type flags
        }
        const int is_char=(bits[index_bits/8]>>(index_bits%8))&1;
        index_bits++;
        if(is_char){
            if(index_chars>=size_chars){
                goto corrupt;
            }
            dst[produced++]=chars[index_chars++];
        }else{
            if(index_pos>=size_pos){
                goto corrupt;
            }
            const uint32_t cmp_pos=pos[index_pos]>>4;
            uint32_t cmp_len=pos[index_pos]&0xf;
            index_pos++;
            // a zero length would never advance; a zero distance refers to nothing
            if(cmp_len==0||cmp_pos==0){
                goto corrupt;
            }
            // never write past the announced size
            if(cmp_len>size_unpack-produced){
                cmp_len=size_unpack-produced;
            }
            // Byte-wise copy. Source positions before the start of the output
            // read as 0, which is how the encoder's zl77_get_char() sees input
            // before its first byte.
            for(uint32_t k=0;k<cmp_len;k++){
                const uint32_t at=produced+k;
                dst[at]=(at>=cmp_pos)?dst[at-cmp_pos]:0;
            }
            produced+=cmp_len;
        }
    }
    *out=dst;
    *out_len=(int)size_unpack;
    return 0;

corrupt:
    free(dst);
    return -1;
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Manual test entry point: runs the text given as the first command-line
// argument through hm_encode().
// Returns 0 on completion, 1 when no argument was supplied.
int main(int argc,const char *argv[])
{
    // argv[1] was previously dereferenced even when absent
    if(argc<2){
        DBG_WARN("usage: %s <text>\n",argc>0?argv[0]:"zl77");
        return 1;
    }
    uint8_t *encode_data=0;
    int encode_len=0;
    uint8_t *decode_data=0;
    int decode_len=0;
    hm_encode(argv[1],strlen(argv[1]),&encode_data,&encode_len);
    // NOTE(review): this second call encodes the already encoded data although
    // the result lands in decode_data; a decode call looks intended. Confirm
    // against huffman_.h. The zl77 round trip used previously was:
    //   zl77_decode(encode_data,encode_len,&decode_data,&decode_len);
    hm_encode(encode_data,encode_len,&decode_data,&decode_len);
    // NOTE(review): encode_data/decode_data are presumably allocated by
    // hm_encode and owned by the caller; confirm before adding free() here.
    return 0;
}
|
||||
|
||||
|
||||
|
||||
|
25
zl77/zl77.h
Normal file
25
zl77/zl77.h
Normal file
@@ -0,0 +1,25 @@
|
||||
|
||||
#ifndef zl77_h__
|
||||
#define zl77_h__
|
||||
|
||||
|
||||
#include "stdint.h"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
int zl77_encode(const uint8_t *in,const int in_len,uint8_t **out,int *out_len);
|
||||
|
||||
int zl77_decode(const uint8_t* in, const int in_len, uint8_t** out, int* out_len);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
Reference in New Issue
Block a user