diff --git a/.gitignore b/.gitignore index 0e8781b..da0ba03 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ .vs/ *.exe *.pkt - +zlib-1.3/ diff --git a/.vscode/settings.json b/.vscode/settings.json index 854061e..104bafc 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,7 @@ { "files.associations": { "coder_lib.h": "c", - "huffman_.h": "c" + "huffman_.h": "c", + "random": "c" } } \ No newline at end of file diff --git a/hello.exe b/hello.exe deleted file mode 100644 index 0576a1c..0000000 Binary files a/hello.exe and /dev/null differ diff --git a/Makefile b/huffman/Makefile similarity index 100% rename from Makefile rename to huffman/Makefile diff --git a/hello.c b/huffman/hello.c similarity index 100% rename from hello.c rename to huffman/hello.c diff --git a/huffman_.c b/huffman/huffman_.c similarity index 100% rename from huffman_.c rename to huffman/huffman_.c diff --git a/huffman_.h b/huffman/huffman_.h similarity index 100% rename from huffman_.h rename to huffman/huffman_.h diff --git a/lzw-ab/.vscode/c_cpp_properties.json b/lzw-ab/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..f3679c6 --- /dev/null +++ b/lzw-ab/.vscode/c_cpp_properties.json @@ -0,0 +1,23 @@ +{ + "configurations": [ + { + "name": "Win32", + "includePath": [ + "${workspaceFolder}/**", + "C:/MinGW/include/" + ], + "defines": [ + "_DEBUG", + "UNICODE", + "_UNICODE", + "__WIN32__", + "DLL_EXPORT" + ], + "cStandard": "c17", + "cppStandard": "gnu++17", + "intelliSenseMode": "windows-gcc-x64", + "compilerPath": "C:/MinGW/bin/gcc.exe" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/lzw-ab/.vscode/settings.json b/lzw-ab/.vscode/settings.json new file mode 100644 index 0000000..854061e --- /dev/null +++ b/lzw-ab/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "files.associations": { + "coder_lib.h": "c", + "huffman_.h": "c" + } +} \ No newline at end of file diff --git a/lzw-ab/Makefile b/lzw-ab/Makefile new file mode 100644 index 0000000..1a1dd1f --- /dev/null +++ b/lzw-ab/Makefile @@ -0,0 +1,15 @@ + + +CC = gcc + + +SRCS = $(wildcard *.c) + + +STR = $(subst from,to,from your heart) + +all: + $(CC) $(SRCS) -o hello + +clean: + rm -rf *.exe diff --git a/lzw-ab/README b/lzw-ab/README new file mode 100644 index 0000000..e77444e --- /dev/null +++ b/lzw-ab/README @@ -0,0 +1,83 @@ +//////////////////////////////////////////////////////////////////////////// +// **** LZW-AB **** // +// Adjusted Binary LZW Compressor/Decompressor // +// Copyright (c) 2016-2020 David Bryant // +// All Rights Reserved // +// Distributed under the BSD Software License (see license.txt) // +//////////////////////////////////////////////////////////////////////////// + +This is an implementation of the Lempel-Ziv-Welch general-purpose data +compression algorithm. It is targeted at embedded applications that require +high speed compression or decompression facilities where lots of RAM for +large dictionaries might not be available. I have used this in several +projects for storing compressed firmware images, and once I even coded the +decompressor in Z-80 assembly language for speed! Depending on the maximum +symbol size selected, the implementation can require from 2368 to 335616 +bytes of RAM for decoding (and about half again more for encoding). + +This is a streaming compressor in that the data is not divided into blocks +and no context information like dictionaries or Huffman tables are sent +ahead of the compressed data (except for one byte to signal the maximum +bit depth). 
This limits the maximum possible compression ratio compared to +algorithms that significantly preprocess the data, but with the help of +some enhancements to the LZW algorithm (described below) it is able to +compress better than the UNIX "compress" utility (which is also LZW) and +is in fact closer to and sometimes beats the compression level of "gzip". + +The symbols are stored in "adjusted binary" which provides somewhat better +compression (with virtually no speed penalty) compared to the fixed word +sizes normally used. Once the dictionary is full, the encoder returns to +the beginning and recycles string codes that have not been used yet for +longer strings. In this way the dictionary constantly "churns" based on the +the incoming stream, thereby improving and adapting to optimal compression. +The compression performance is constantly monitored and a dictionary flush +is forced on stretches of negative compression which limits worst-case +performance to about 8% inflation. + +LZW-AB consists of three standard C files: the library, a command-line +filter demo using pipes, and a command-line test harness. Each program +builds with a single command on most platforms. It has been designed with +maximum portability in mind and should work correctly on big-endian as well +as little-endian machines. + +Linux: +% gcc -O3 lzwfilter.c lzwlib.c -o lzwfilter +% gcc -O3 lzwtester.c lzwlib.c -o lzwtester + +Darwin/Mac: +% clang -O3 lzwfilter.c lzwlib.c -o lzwfilter +% clang -O3 lzwtester.c lzwlib.c -o lzwtester + +MS Visual Studio: +cl -O2 lzwfilter.c lzwlib.c +cl -O2 lzwtester.c lzwlib.c + +There are Windows binaries (built on MinGW) for the filter and the tester on the +GitHub release page (v3). The "help" display for the filter looks like this: + + Usage: lzwfilter [-options] [< infile] [> outfile] + + Operation: compression is default, use -d to decompress + + Options: -d = decompress + -h = display this "help" message + -1 = maximum symbol size = 9 bits + -2 = maximum symbol size = 10 bits + -3 = maximum symbol size = 11 bits + -4 = maximum symbol size = 12 bits + -5 = maximum symbol size = 13 bits + -6 = maximum symbol size = 14 bits + -7 = maximum symbol size = 15 bits + -8 = maximum symbol size = 16 bits (default) + -v = verbose (display ratio and checksum) + +Here's the "help" display for the tester: + + Usage: lzwtester [options] file [...] + + Options: -1 ... -8 = test using only specified max symbol size (9 - 16) + -0 = cycle through all maximum symbol sizes (default) + -e = exhaustive test (by successive truncation) + -f = fuzz test (randomly corrupt compressed data) + -q = quiet mode (only reports errors and summary) + diff --git a/lzw-ab/license.txt b/lzw-ab/license.txt new file mode 100644 index 0000000..65d4a2e --- /dev/null +++ b/lzw-ab/license.txt @@ -0,0 +1,25 @@ + Copyright (c) David Bryant + All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+  * Neither the name of Conifer Software nor the names of its contributors
+    may be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/lzw-ab/lzwfilter.ctt b/lzw-ab/lzwfilter.ctt
new file mode 100644
index 0000000..40920a7
--- /dev/null
+++ b/lzw-ab/lzwfilter.ctt
@@ -0,0 +1,191 @@
+////////////////////////////////////////////////////////////////////////////
+// **** LZW-AB **** //
+// Adjusted Binary LZW Compressor/Decompressor //
+// Copyright (c) 2016-2020 David Bryant //
+// All Rights Reserved //
+// Distributed under the BSD Software License (see license.txt) //
+////////////////////////////////////////////////////////////////////////////
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+#include <io.h>
+#include <fcntl.h>
+#endif
+
+#include "lzwlib.h"
+
+/* This module provides a command-line filter for testing the lzw library.
+ * It can also optionally calculate and display the compression ratio and
+ * a simple checksum for informational purposes. Other command-line
+ * arguments select decoding mode or the maximum symbol size (9 to 16 bits)
+ * for encoding.
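+ *
+ * As a rough illustration of typical use (the file names here are just
+ * hypothetical examples, not part of the program), a round trip through
+ * the filter via pipes looks like this:
+ *
+ *     lzwfilter -v < firmware.bin > firmware.lzw
+ *     lzwfilter -d -v < firmware.lzw > firmware2.bin
+ *
+ * With -v each run prints a checksum and the compression ratio to stderr,
+ * so matching checksums from the two runs are a quick confirmation that
+ * the round trip was lossless.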
+ */ + +static const char *usage = +" Usage: lzwfilter [-options] [< infile] [> outfile]\n\n" +" Operation: compression is default, use -d to decompress\n\n" +" Options: -d = decompress\n" +" -h = display this \"help\" message\n" +" -1 = maximum symbol size = 9 bits\n" +" -2 = maximum symbol size = 10 bits\n" +" -3 = maximum symbol size = 11 bits\n" +" -4 = maximum symbol size = 12 bits\n" +" -5 = maximum symbol size = 13 bits\n" +" -6 = maximum symbol size = 14 bits\n" +" -7 = maximum symbol size = 15 bits\n" +" -8 = maximum symbol size = 16 bits (default)\n" +" -v = verbose (display ratio and checksum)\n\n" +" Web: Visit www.github.com/dbry/lzw-ab for latest version and info\n\n"; + +typedef struct { + unsigned char buffer [65536]; + int checksum, head, tail; + size_t byte_count; +} streamer; + +static int read_buff (void *ctx) +{ + streamer *stream = ctx; + int value; + + if (stream->head == stream->tail) + stream->tail = (stream->head = 0) + fread (stream->buffer, 1, sizeof (stream->buffer), stdin); + + if (stream->head < stream->tail) { + value = stream->buffer [stream->head++]; + stream->checksum = stream->checksum * 3 + (unsigned char) value; + stream->byte_count++; + } + else + value = EOF; + + return value; +} + +static void write_buff (int value, void *ctx) +{ + streamer *stream = ctx; + + if (value == EOF) { + fwrite (stream->buffer, 1, stream->head, stdout); + return; + } + + stream->buffer [stream->head++] = value; + + if (stream->head == sizeof (stream->buffer)) { + fwrite (stream->buffer, 1, stream->head, stdout); + stream->head = 0; + } + + stream->checksum = stream->checksum * 3 + (unsigned char) value; + stream->byte_count++; +} + +int main (int argc, char **argv) +{ + int decompress = 0, maxbits = 16, verbose = 0, error = 0; + streamer reader, writer; + + memset (&reader, 0, sizeof (reader)); + memset (&writer, 0, sizeof (writer)); + reader.checksum = writer.checksum = -1; + + while (--argc) { + if ((**++argv == '-') && (*argv)[1]) + while (*++*argv) + switch (**argv) { + case '1': + maxbits = 9; + break; + + case '2': + maxbits = 10; + break; + + case '3': + maxbits = 11; + break; + + case '4': + maxbits = 12; + break; + + case '5': + maxbits = 13; + break; + + case '6': + maxbits = 14; + break; + + case '7': + maxbits = 15; + break; + + case '8': + maxbits = 16; + break; + + case 'D': case 'd': + decompress = 1; + break; + + case 'H': case 'h': + fprintf (stderr, "%s", usage); + return 0; + break; + + case 'V': case 'v': + verbose = 1; + break; + + default: + fprintf (stderr, "illegal option: %c !\n", **argv); + error = 1; + break; + } + else { + fprintf (stderr, "unknown argument: %s\n", *argv); + error = 1; + } + } + + if (error) { + fprintf (stderr, "%s", usage); + return 0; + } + +#ifdef _WIN32 + setmode (fileno (stdin), O_BINARY); + setmode (fileno (stdout), O_BINARY); +#endif + + if (decompress) { + if (lzw_decompress (write_buff, &writer, read_buff, &reader)) { + fprintf (stderr, "lzw_decompress() returned non-zero!\n"); + return 1; + } + + write_buff (EOF, &writer); + + if (verbose && writer.byte_count) + fprintf (stderr, "output checksum = %x, ratio = %.2f%%\n", writer.checksum, reader.byte_count * 100.0 / writer.byte_count); + } + else { + if (lzw_compress (write_buff, &writer, read_buff, &reader, maxbits)) { + fprintf (stderr, "lzw_compress() returned non-zero!\n"); + return 1; + } + + write_buff (EOF, &writer); + + if (verbose && reader.byte_count) + fprintf (stderr, "source checksum = %x, ratio = %.2f%%\n", reader.checksum, writer.byte_count * 100.0 / 
reader.byte_count);
+    }
+
+    return 0;
+}
diff --git a/lzw-ab/lzwlib.c b/lzw-ab/lzwlib.c
new file mode 100644
index 0000000..636472b
--- /dev/null
+++ b/lzw-ab/lzwlib.c
@@ -0,0 +1,513 @@
+////////////////////////////////////////////////////////////////////////////
+// **** LZW-AB **** //
+// Adjusted Binary LZW Compressor/Decompressor //
+// Copyright (c) 2016-2020 David Bryant //
+// All Rights Reserved //
+// Distributed under the BSD Software License (see license.txt) //
+////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lzwlib.h"
+
+/* This library implements the LZW general-purpose data compression algorithm.
+ * The algorithm was originally described as a hardware implementation by
+ * Terry Welch here:
+ *
+ *    Welch, T.A. “A Technique for High-Performance Data Compression.”
+ *    IEEE Computer 17,6 (June 1984), pp. 8-19.
+ *
+ * Since then there have been innumerable refinements and variations on the
+ * basic technique, and this implementation is no different. The target of
+ * the present implementation is embedded systems, and so emphasis was placed
+ * on simplicity, fast execution, and minimal RAM usage.
+ *
+ * This is a streaming compressor in that the data is not divided into blocks
+ * and no context information like dictionaries or Huffman tables are sent
+ * ahead of the compressed data (except for one byte to signal the maximum
+ * bit depth). This limits the maximum possible compression ratio compared to
+ * algorithms that significantly preprocess the data, but with the help of
+ * some enhancements to the LZW algorithm (described below) it is able to
+ * compress better than the UNIX "compress" utility (which is also LZW) and
+ * is in fact closer to and sometimes beats the compression level of "gzip".
+ *
+ * The symbols are stored in "adjusted binary" which provides somewhat better
+ * compression, with virtually no speed penalty, compared to the fixed word
+ * sizes normally used. These are sometimes called "phased-in" binary codes
+ * and their use in LZW is described here:
+ *
+ *    R. N. Horspool, "Improving LZW (data compression algorithm)", Data
+ *    Compression Conference, pp. 332-341, 1991.
+ *
+ * Earlier versions of this compressor would reset as soon as the dictionary
+ * became full to ensure good performance on heterogeneous data (such as tar
+ * files or executable images). While trivial to implement, this is not
+ * particularly efficient with homogeneous data (or in general) because we
+ * spend a lot of time sending short symbols where the compression is poor.
+ *
+ * This newer version utilizes a technique such that once the dictionary is
+ * full, we restart at the beginning and recycle only those codes that were
+ * seen only once. We know this because they are not referenced by longer
+ * strings, and are easy to replace in the dictionary for the same reason.
+ * Since they have only been seen once it's also more likely that we will
+ * be replacing them with a more common string, and this is especially
+ * true if the data characteristics are changing.
+ *
+ * Replacing string codes in this manner has the interesting side effect that
+ * some older shorter strings that the removed strings were based on will
+ * possibly become unreferenced themselves and be recycled on the next pass.
+ * In this way, the entire dictionary constantly "churns" based on the
+ * incoming stream, thereby improving and adapting to optimal compression.
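+ *
+ * (Illustrative note, tying this to the data structures below: a string
+ * entry whose first_reference field is still zero was never used as the
+ * prefix of any longer string, and those are exactly the entries the
+ * recycling pass hands out again; codes 0-255 and any still-referenced
+ * strings are never recycled.)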
+ * + * Even with this technique there is still a possibility that a sudden change + * in the data characteristics will appear, resulting in significant negative + * compression (up to 100% for 16-bit codes). To detect this case we generate + * an exponentially decaying average of the current compression ratio and reset + * when this hits about 1.06, which limits worst case inflation to about 8%. + * + * The maximum symbol size is configurable on the encode side (from 9 bits to + * 16 bits) and determines the RAM footprint required by both sides and, to a + * large extent, the compression performance. This information is communicated + * to the decoder in the first stream byte so that it can allocate accordingly. + * The RAM requirements are as follows: + * + * maximum encoder RAM decoder RAM + * symbol size requirement requirement + * ----------------------------------------- + * 9-bit 4096 bytes 2368 bytes + * 10-bit 8192 bytes 4992 bytes + * 11-bit 16384 bytes 10240 bytes + * 12-bit 32768 bytes 20736 bytes + * 13-bit 65536 bytes 41728 bytes + * 14-bit 131072 bytes 83712 bytes + * 15-bit 262144 bytes 167680 bytes + * 16-bit 524288 bytes 335616 bytes + * + * This implementation uses malloc(), but obviously an embedded version could + * use static arrays instead if desired (assuming that the maxbits was + * controlled outside). + */ + +#define NULL_CODE 65535 // indicates a NULL prefix (must be unsigned short) +#define CLEAR_CODE 256 // code to flush dictionary and restart decoder +#define FIRST_STRING 257 // code of first dictionary string + +/* This macro determines the number of bits required to represent the given value, + * not counting the implied MSB. For GNU C it will use the provided built-in, + * otherwise a comparison tree is employed. Note that in the non-GNU case, only + * values up to 65535 (15 bits) are supported. + */ + +#ifdef __GNUC__ +#define CODE_BITS(n) (31 - __builtin_clz(n)) +#else +#define CODE_BITS(n) ((n) < 4096 ? \ + ((n) < 1024 ? 8 + ((n) >= 512) : 10 + ((n) >= 2048)) : \ + ((n) < 16384 ? 12 + ((n) >= 8192) : 14 + ((n) >= 32768))) +#endif + +/* This macro writes the adjusted-binary symbol "code" given the maximum + * symbol "maxcode". A macro is used here just to avoid the duplication in + * the lzw_compress() function. The idea is that if "maxcode" is not one + * less than a power of two (which it rarely will be) then this code can + * often send fewer bits that would be required with a fixed-sized code. + * + * For example, the first code we send will have a "maxcode" of 257, so + * every "code" would normally consume 9 bits. But with adjusted binary we + * can actually represent any code from 0 to 253 with just 8 bits -- only + * the 4 codes from 254 to 257 take 9 bits. + */ + +#define WRITE_CODE(code,maxcode) do { \ + unsigned int code_bits = CODE_BITS (maxcode); \ + unsigned int extras = (2 << code_bits) - (maxcode) - 1; \ + if ((code) < extras) { \ + shifter |= ((code) << bits); \ + bits += code_bits; \ + } \ + else { \ + shifter |= ((((code) + extras) >> 1) << bits); \ + bits += code_bits; \ + shifter |= ((((code) + extras) & 1) << bits++); \ + } \ + do { (*dst)(shifter,dstctx); shifter >>= 8; \ + output_bytes += 256; \ + } while ((bits -= 8) >= 8); \ +} while (0) + +/* LZW compression function. Bytes (8-bit) are read and written through callbacks and the + * "maxbits" parameter specifies the maximum symbol size (9-16), which in turn determines + * the RAM requirement and, to a large extent, the level of compression achievable. 
A return + * value of EOF from the "src" callback terminates the compression process. A non-zero return + * value indicates one of the two possible errors -- bad "maxbits" param or failed malloc(). + * There are contexts (void pointers) that are passed to the callbacks to easily facilitate + * multiple instances of the compression operation (but simple applications can ignore these). + */ + +typedef struct { + unsigned short first_reference, next_reference, back_reference; + unsigned char terminator; +} encoder_entry_t; + +int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx, int maxbits) +{ + unsigned int maxcode = FIRST_STRING, next_string = FIRST_STRING, prefix = NULL_CODE, total_codes; + unsigned int dictionary_full = 0, available_entries, max_available_entries, max_available_code; + unsigned int input_bytes = 65536, output_bytes = 65536; + unsigned int shifter = 0, bits = 0; + encoder_entry_t *dictionary; + int c; + + if (maxbits < 9 || maxbits > 16) // check for valid "maxbits" setting + return 1; + + // based on the "maxbits" parameter, compute total codes and allocate dictionary storage + + total_codes = 1 << maxbits; + dictionary = malloc (total_codes * sizeof (encoder_entry_t)); + max_available_entries = total_codes - FIRST_STRING - 1; + max_available_code = total_codes - 2; + + if (!dictionary) + return 1; // failed malloc() + + // clear the dictionary + + available_entries = max_available_entries; + memset (dictionary, 0, 256 * sizeof (encoder_entry_t)); + + (*dst)(maxbits - 9, dstctx); // first byte in output stream indicates the maximum symbol bits + + // This is the main loop where we read input bytes and compress them. We always keep track of the + // "prefix", which represents a pending byte (if < 256) or string entry (if >= FIRST_STRING) that + // has not been sent to the decoder yet. The output symbols are kept in the "shifter" and "bits" + // variables and are sent to the output every time 8 bits are available (done in the macro). + + while ((c = (*src)(srcctx)) != EOF) { + unsigned int cti; // coding table index + + input_bytes += 256; + + if (prefix == NULL_CODE) { // this only happens the very first byte when we don't yet have a prefix + prefix = c; + continue; + } + + memset (dictionary + next_string, 0, sizeof (encoder_entry_t)); + + if ((cti = dictionary [prefix].first_reference)) { // if any longer strings are built on the current prefix... 
+ while (1) + if (dictionary [cti].terminator == c) { // we found a matching string, so we just update the prefix + prefix = cti; // to that string and continue without sending anything + break; + } + else if (!dictionary [cti].next_reference) { // this string did not match the new character and + dictionary [cti].next_reference = next_string; // there aren't any more, so we'll add a new string, + // point to it with "next_reference", and also make the + dictionary [next_string].back_reference = cti; // "back_reference" which is used for recycling entries + cti = 0; + break; + } + else + cti = dictionary [cti].next_reference; // there are more possible matches to check, so loop back + } + else { // no longer strings are based on the current prefix, so now + dictionary [prefix].first_reference = next_string; // the current prefix plus the new byte will be the next string + dictionary [next_string].back_reference = prefix; // also make the back_reference used for recycling + if (prefix >= FIRST_STRING) available_entries--; // the codes 0-255 are never available for recycling + } + + // If "cti" is zero, we could not simply extend our "prefix" to a longer string because we did not find a + // dictionary match, so we send the symbol representing the current "prefix" and add the new string to the + // dictionary. Since the current byte "c" was not included in the prefix, that now becomes our new prefix. + + if (!cti) { + WRITE_CODE (prefix, maxcode); // send symbol for current prefix (0 to maxcode-1) + dictionary [next_string].terminator = c; // newly created string has current byte as the terminator + prefix = c; // current byte also becomes new prefix for next string + + // If the dictionary is not full yet, we bump the maxcode and next_string and check to see if the + // dictionary is now full. If it is we set the dictionary_full flag and leave maxcode set to two + // less than total_codes because every string entry is now available for matching, but the actual + // maximum code is reserved for EOF. + + if (!dictionary_full) { + dictionary_full = (++next_string > max_available_code); + maxcode++; + } + + // If the dictionary is full we look for an entry to recycle starting at next_string (the one we + // just created or recycled) plus one (with check for wrap check). We know there is one because at + // a minimum the string we just added. This also takes care of removing the entry to be recycled + // (which is possible/easy because no longer strings have been based on it). + + if (dictionary_full) { + for (next_string++; next_string <= max_available_code || (next_string = FIRST_STRING); next_string++) + if (!dictionary [next_string].first_reference) + break; + + cti = dictionary [next_string].back_reference; // dictionary [cti] references the entry we're + // trying to recycle (either as a first or a next) + + if (dictionary [cti].first_reference == next_string) { + dictionary [cti].first_reference = dictionary [next_string].next_reference; + + // if we just cleared a first reference, and that string is not 0-255, + // then that's a newly available entry + if (!dictionary [cti].first_reference && cti >= FIRST_STRING) + available_entries++; + } + else if (dictionary [cti].next_reference == next_string) // fixup a "next_reference" + dictionary [cti].next_reference = dictionary [next_string].next_reference; + + // If the entry we're recycling had a next reference, then update the back reference + // so it's completely out of the chain. 
Of course we know it didn't have a first + // reference because then we wouldn't be recycling it. + + if (dictionary [next_string].next_reference) + dictionary [dictionary [next_string].next_reference].back_reference = cti; + + // This check is technically not needed because there will always be an available entry + // (the last string we added at a minimum) but we don't want to get in a situation where + // we only have a few entries that we're cycling though. I pulled the limits (16 entries + // or 1% of total) out of a hat. + + if (available_entries < 16 || available_entries * 100 < max_available_entries) { + // clear the dictionary and reset the byte counters -- basically everything starts over + // except that we keep the last pending "prefix" (which, of course, was never sent) + + WRITE_CODE (CLEAR_CODE, maxcode); + memset (dictionary, 0, 256 * sizeof (encoder_entry_t)); + available_entries = max_available_entries; + next_string = maxcode = FIRST_STRING; + input_bytes = output_bytes = 65536; + dictionary_full = 0; + } + } + + // This is similar to the above check, except that it's used whether the dictionary is full or not. + // It uses an exponentially decaying average of the current compression ratio, so it can terminate + // very early if the incoming data is uncompressible or it can terminate any later time that the + // dictionary no longer compresses the incoming stream. + + if (output_bytes > input_bytes + (input_bytes >> 4)) { + WRITE_CODE (CLEAR_CODE, maxcode); + memset (dictionary, 0, 256 * sizeof (encoder_entry_t)); + available_entries = max_available_entries; + next_string = maxcode = FIRST_STRING; + input_bytes = output_bytes = 65536; + dictionary_full = 0; + } + else { + output_bytes -= output_bytes >> 8; + input_bytes -= input_bytes >> 8; + } + } + } + + // we're done with input, so if we've received anything we still need to send that pesky pending prefix... + + if (prefix != NULL_CODE) { + WRITE_CODE (prefix, maxcode); + + if (!dictionary_full) + maxcode++; + } + + WRITE_CODE (maxcode, maxcode); // the maximum possible code is always reserved for our END_CODE + + if (bits) // finally, flush any pending bits from the shifter + (*dst)(shifter, dstctx); + + free (dictionary); + return 0; +} + +/* LZW decompression function. Bytes (8-bit) are read and written through callbacks. The + * "maxbits" parameter is read as the first byte in the stream and controls how much memory + * is allocated for decoding. A return value of EOF from the "src" callback terminates the + * decompression process (although this should not normally occur). A non-zero return value + * indicates an error, which in this case can be a bad "maxbits" read from the stream, a + * failed malloc(), or if an EOF is read from the input stream before the decompression + * terminates naturally with END_CODE. There are contexts (void pointers) that are passed + * to the callbacks to easily facilitate multiple instances of the decompression operation + * (but simple applications can ignore these). 
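+ *
+ * A minimal calling sketch (the mem_stream type and its fields are purely
+ * hypothetical here, not part of this library -- any context that the two
+ * callbacks agree on will do):
+ *
+ *     typedef struct { unsigned char *data; size_t pos, size; } mem_stream;
+ *
+ *     static int get_byte (void *ctx) {               // src callback
+ *         mem_stream *in = ctx;
+ *         return in->pos < in->size ? in->data [in->pos++] : EOF;
+ *     }
+ *
+ *     static void put_byte (int value, void *ctx) {   // dst callback
+ *         mem_stream *out = ctx;                      // assumes out->data is
+ *         out->data [out->pos++] = value;             // already large enough
+ *     }
+ *
+ *     if (lzw_decompress (put_byte, &out_stream, get_byte, &in_stream))
+ *         ...handle the error...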
+ */ + +typedef struct { + unsigned char terminator, extra_references; + unsigned short prefix; +} decoder_entry_t; + +int lzw_decompress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx) +{ + unsigned int maxcode = FIRST_STRING, next_string = FIRST_STRING - 1, prefix = CLEAR_CODE; + unsigned int dictionary_full = 0, max_available_code, total_codes; + unsigned int shifter = 0, bits = 0, read_byte, i; + unsigned char *reverse_buffer, *referenced; + decoder_entry_t *dictionary; + + if ((read_byte = ((*src)(srcctx))) == EOF || (read_byte & 0xf8)) //sanitize first byte + return 1; + + // based on the "maxbits" parameter, compute total codes and allocate dictionary storage + + total_codes = 512 << (read_byte & 0x7); + max_available_code = total_codes - 2; + dictionary = malloc (total_codes * sizeof (decoder_entry_t)); + reverse_buffer = malloc (total_codes - 256); + referenced = malloc (total_codes / 8); // bitfield indicating code is referenced at least once + + // Note that to implement the dictionary entry recycling we have to keep track of how many + // longer strings are based on each string in the dictionary. This can be between 0 (no + // references) to 256 (every possible next byte), but unfortunately that's one more value + // than what can be stored in a byte. The solution is to have a single bit for each entry + // indicating any references (i.e., the code cannot be recycled) and an additional byte + // in the dictionary entry struct counting the "extra" references (beyond one). + + if (!reverse_buffer || !dictionary) // check for malloc() failure + return 1; + + for (i = 0; i < 256; ++i) { // these never change + dictionary [i].prefix = NULL_CODE; + dictionary [i].terminator = i; + } + + // This is the main loop where we read input symbols. The values range from 0 to the code value + // of the "next" string in the dictionary (although the actual "next" code cannot be used yet, + // and so we reserve that code for the END_CODE). Note that receiving an EOF from the input + // stream is actually an error because we should have gotten the END_CODE first. 
+ + while (1) { + unsigned int code_bits = CODE_BITS (maxcode), code; + unsigned int extras = (2 << code_bits) - maxcode - 1; + + do { + if ((read_byte = ((*src)(srcctx))) == EOF) { + free (dictionary); free (reverse_buffer); free (referenced); + return 1; + } + + shifter |= read_byte << bits; + } while ((bits += 8) < code_bits); + + // first we assume the code will fit in the minimum number of required bits + + code = shifter & ((1 << code_bits) - 1); + shifter >>= code_bits; + bits -= code_bits; + + // but if code >= extras, then we need to read another bit to calculate the real code + // (this is the "adjusted binary" part) + + if (code >= extras) { + if (!bits) { + if ((read_byte = ((*src)(srcctx))) == EOF) { + free (dictionary); free (reverse_buffer); free (referenced); + return 1; + } + + shifter = read_byte; + bits = 8; + } + + code = (code << 1) - extras + (shifter & 1); + shifter >>= 1; + bits--; + } + + if (code == maxcode) // sending the maximum code is reserved for the end of the file + break; + else if (code == CLEAR_CODE) { // otherwise check for a CLEAR_CODE to start over early + next_string = FIRST_STRING - 1; + maxcode = FIRST_STRING; + dictionary_full = 0; + } + else if (prefix == CLEAR_CODE) { // this only happens at the first symbol which is always sent + (*dst)(code, dstctx); // literally and becomes our initial prefix + next_string++; + maxcode++; + } + // Otherwise we have a valid prefix so we step through the string from end to beginning storing the + // bytes in the "reverse_buffer", and then we send them out in the proper order. One corner-case + // we have to handle here is that the string might be the same one that is actually being defined + // now (code == next_string). + else { + unsigned int cti = (code == next_string) ? prefix : code; + unsigned char *rbp = reverse_buffer, c; + + do { + *rbp++ = dictionary [cti].terminator; + if (rbp == reverse_buffer + total_codes - 256) { + free (dictionary); free (reverse_buffer); free (referenced); + return 1; + } + } while ((cti = dictionary [cti].prefix) != NULL_CODE); + + c = *--rbp; // the first byte in this string is the terminator for the last string, which is + // the one that we'll create a new dictionary entry for this time + + do // send string in corrected order (except for the terminator which we don't know yet) + (*dst)(*rbp, dstctx); + while (rbp-- != reverse_buffer); + + if (code == next_string) { + (*dst)(c,dstctx); + } + + // This should always execute (the conditional is to catch corruptions) and is where we add a new string to + // the dictionary, either at the end or elsewhere when we are "recycling" entries that were never referenced + + if (next_string >= FIRST_STRING && next_string < total_codes) { + if (referenced [prefix >> 3] & (1 << (prefix & 7))) // increment reference count on prefix + dictionary [prefix].extra_references++; + else + referenced [prefix >> 3] |= 1 << (prefix & 7); + + dictionary [next_string].prefix = prefix; // now update the next dictionary entry with the new string + dictionary [next_string].terminator = c; // (but we're always one behind, so it's not the string just sent) + dictionary [next_string].extra_references = 0; // newly created string has not been referenced + referenced [next_string >> 3] &= ~(1 << (next_string & 7)); + } + + // If the dictionary is not full yet, we bump the maxcode and next_string and check to see if the + // dictionary is now full. 
If it is we set the dictionary_full flag and set next_string back to the
+            // beginning of the dictionary strings to start recycling them. Note that then maxcode will remain
+            // two less than total_codes because every string entry is available for matching, and the actual
+            // maximum code is reserved for EOF.
+
+            if (!dictionary_full) {
+                maxcode++;
+
+                if (++next_string > max_available_code) {
+                    dictionary_full = 1;
+                    maxcode--;
+                }
+            }
+
+            // If the dictionary is full we look for an entry to recycle starting at next_string (the one we
+            // created or recycled) plus one. We know there is one because at a minimum the string we just added
+            // has not been referenced. This also takes care of removing the entry to be recycled (which is
+            // possible/easy because no longer strings have been based on it).
+
+            if (dictionary_full) {
+                for (next_string++; next_string <= max_available_code || (next_string = FIRST_STRING); next_string++)
+                    if (!(referenced [next_string >> 3] & (1 << (next_string & 7))))
+                        break;
+
+                if (dictionary [dictionary [next_string].prefix].extra_references)
+                    dictionary [dictionary [next_string].prefix].extra_references--;
+                else
+                    referenced [dictionary [next_string].prefix >> 3] &= ~(1 << (dictionary [next_string].prefix & 7));
+            }
+        }
+
+        prefix = code;      // the code we just received becomes the prefix for the next dictionary string entry
+                            // (which we'll create once we find out the terminator)
+    }
+
+    free (dictionary); free (reverse_buffer); free (referenced);
+    return 0;
+}
diff --git a/lzw-ab/lzwlib.h b/lzw-ab/lzwlib.h
new file mode 100644
index 0000000..246ca24
--- /dev/null
+++ b/lzw-ab/lzwlib.h
@@ -0,0 +1,15 @@
+////////////////////////////////////////////////////////////////////////////
+// **** LZW-AB **** //
+// Adjusted Binary LZW Compressor/Decompressor //
+// Copyright (c) 2016-2020 David Bryant //
+// All Rights Reserved //
+// Distributed under the BSD Software License (see license.txt) //
+////////////////////////////////////////////////////////////////////////////
+
+#ifndef LZWLIB_H_
+#define LZWLIB_H_
+
+int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx, int maxbits);
+int lzw_decompress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx);
+
+#endif /* LZWLIB_H_ */
diff --git a/lzw-ab/lzwtester.c b/lzw-ab/lzwtester.c
new file mode 100644
index 0000000..35007d7
--- /dev/null
+++ b/lzw-ab/lzwtester.c
@@ -0,0 +1,317 @@
+////////////////////////////////////////////////////////////////////////////
+// **** LZW-AB **** //
+// Adjusted Binary LZW Compressor/Decompressor //
+// Copyright (c) 2016-2020 David Bryant //
+// All Rights Reserved //
+// Distributed under the BSD Software License (see license.txt) //
+////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <io.h>
+#endif
+
+#include "lzwlib.h"
+
+/* This module provides a command-line test harness for the lzw library.
+ * Given a list of files, it will read each one and byte-for-byte verify
+ * the data after a round-trip through a compression / decompression cycle
+ * at each of the 8 available maximum symbol size settings.
+ *
+ * It can also optionally perform fuzz testing by randomly corrupting the
+ * compressed bitstream. Obviously this will introduce integrity failures,
+ * but it should not cause a crash.
It also has an "exhaustive" mode that + * creates hundreds of simulated images from each input file by successive + * truncation from both ends. + */ + +static const char *usage = +" Usage: lzwtester [options] file [...]\n\n" +" Options: -1 ... -8 = test using only specified max symbol size (9 - 16)\n" +" -0 = cycle through all maximum symbol sizes (default)\n" +" -e = exhaustive test (by successive truncation)\n" +" -f = fuzz test (randomly corrupt compressed data)\n" +" -q = quiet mode (only reports errors and summary)\n\n" +" Web: Visit www.github.com/dbry/lzw-ab for latest version and info\n\n"; + +typedef struct { + unsigned int size, index, wrapped, byte_errors, first_error, fuzz_testing; + unsigned char *buffer; +} streamer; + +static int read_buff (void *ctx) +{ + streamer *stream = ctx; + + if (stream->index == stream->size) + return EOF; + + return stream->buffer [stream->index++]; +} + +static void write_buff (int value, void *ctx) +{ + streamer *stream = ctx; + + // for fuzz testing, randomly corrupt 1 byte in every 65536 (on average) + + if (stream->fuzz_testing) { + static unsigned long long kernel = 0x3141592653589793; + kernel = ((kernel << 4) - kernel) ^ 1; + kernel = ((kernel << 4) - kernel) ^ 1; + kernel = ((kernel << 4) - kernel) ^ 1; + + if (!(kernel >> 48)) + value ^= (int)(kernel >> 40); + } + + if (stream->index == stream->size) { + stream->index = 0; + stream->wrapped++; + } + + stream->buffer [stream->index++] = value; +} + +static void check_buff (int value, void *ctx) +{ + streamer *stream = ctx; + + if (stream->index == stream->size) { + stream->wrapped++; + return; + } + + if (stream->buffer [stream->index] != value) + if (!stream->byte_errors++) + stream->first_error = stream->index; + + stream->index++; +} + +#ifdef _WIN32 + +long long DoGetFileSize (FILE *hFile) +{ + LARGE_INTEGER Size; + HANDLE fHandle; + + if (hFile == NULL) + return 0; + + fHandle = (HANDLE)_get_osfhandle(_fileno(hFile)); + if (fHandle == INVALID_HANDLE_VALUE) + return 0; + + Size.u.LowPart = GetFileSize(fHandle, &Size.u.HighPart); + + if (Size.u.LowPart == INVALID_FILE_SIZE && GetLastError() != NO_ERROR) + return 0; + + return (long long)Size.QuadPart; +} + +#else + +long long DoGetFileSize (FILE *hFile) +{ + struct stat statbuf; + + if (!hFile || fstat (fileno (hFile), &statbuf) || !S_ISREG(statbuf.st_mode)) + return 0; + + return (long long) statbuf.st_size; +} + +#endif + +int main (int argc, char **argv) +{ + int index, checked = 0, tests = 0, skipped = 0, errors = 0; + int set_maxbits = 0, quiet_mode = 0, exhaustive_mode = 0; + long long total_input_bytes = 0, total_output_bytes = 0; + streamer reader, writer, checker; + + memset (&reader, 0, sizeof (reader)); + memset (&writer, 0, sizeof (writer)); + memset (&checker, 0, sizeof (checker)); + + if (argc < 2) { + printf ("%s", usage); + return 0; + } + + for (index = 1; index < argc; ++index) { + const char *filename = argv [index]; + int test_size, bytes_read, maxbits; + unsigned char *file_buffer; + long long file_size; + FILE *infile; + + if (!strcmp (filename, "-q")) { + quiet_mode = 1; + continue; + } + + if (!strcmp (filename, "-e")) { + exhaustive_mode = 1; + continue; + } + + if (!strcmp (filename, "-f")) { + writer.fuzz_testing = 1; + continue; + } + + if (strlen (filename) == 2 && filename [0] == '-' && filename [1] >= '0' && filename [1] <= '8') { + if (filename [1] > '0') + set_maxbits = filename [1] - '0' + 8; + else + set_maxbits = 0; + + continue; + } + + infile = fopen (filename, "rb"); + + if (!infile) { + printf 
("\ncan't open file %s!\n", filename); + skipped++; + continue; + } + + file_size = DoGetFileSize (infile); + + if (!file_size) { + printf ("\ncan't get file size of %s (may be zero)!\n", filename); + skipped++; + continue; + } + + if (file_size > 1024LL * 1024LL * 1024LL) { + printf ("\nfile %s is too big!\n", filename); + skipped++; + continue; + } + + file_buffer = malloc (file_size); + writer.size = (unsigned int)(file_size * 2 + 10); + writer.buffer = malloc (writer.size); + + if (!file_buffer || !writer.buffer) { + printf ("\nfile %s is too big!\n", filename); + if (writer.buffer) free (writer.buffer); + if (file_buffer) free (file_buffer); + skipped++; + continue; + } + + bytes_read = fread (file_buffer, 1, (int) file_size, infile); + fclose (infile); + + if (bytes_read != (int) file_size) { + printf ("\nfile %s could not be read!\n", filename); + free (writer.buffer); + free (file_buffer); + skipped++; + continue; + } + + if (!quiet_mode) + printf ("\n"); + + test_size = file_size; + checked++; + + do { + for (maxbits = set_maxbits ? set_maxbits : 9; maxbits <= (set_maxbits ? set_maxbits : 16); ++maxbits) { + int res, got_error = 0; + + reader.buffer = file_buffer + (file_size - test_size) / 2; + reader.size = test_size; + + reader.index = writer.index = writer.wrapped = 0; + + if (lzw_compress (write_buff, &writer, read_buff, &reader, maxbits)) { + printf ("\nlzw_compress() returned error on file %s, maxbits = %d\n", filename, maxbits); + errors++; + continue; + } + + if (writer.wrapped) { + printf ("\nover 100%% inflation on file %s, maxbits = %d!\n", filename, maxbits); + errors++; + continue; + } + + checker.buffer = reader.buffer; + checker.size = reader.size; + checker.wrapped = checker.byte_errors = checker.index = 0; + + reader.buffer = writer.buffer; + reader.size = writer.index; + reader.index = 0; + + res = lzw_decompress (check_buff, &checker, read_buff, &reader); + + reader.buffer = checker.buffer; + reader.size = checker.size; + + got_error = res || checker.index != checker.size || checker.wrapped || checker.byte_errors; + + if (!quiet_mode || got_error) + printf ("file %s, maxbits = %2d: %u bytes --> %u bytes, %.2f%%\n", filename, maxbits, + reader.size, writer.index, writer.index * 100.0 / reader.size); + + if (got_error) { + if (res) + printf ("decompressor returned an error\n"); + + if (!checker.index) + printf ("decompression didn't generate any data\n"); + else if (checker.index != checker.size) + printf ("decompression terminated %u bytes early\n", checker.size - checker.index); + else if (checker.wrapped) + printf ("decompression generated %u extra bytes\n", checker.wrapped); + + if (checker.byte_errors) + printf ("there were %u byte data errors starting at index %u\n", + checker.byte_errors, checker.first_error); + else if (checker.index != checker.size || checker.wrapped) + printf ("(but the data generated was all correct)\n"); + + printf ("\n"); + errors++; + } + else { + total_input_bytes += reader.size; + total_output_bytes += writer.index; + } + + tests++; + + if (exhaustive_mode) + test_size -= (test_size + 98) / 100; + } + + } while (exhaustive_mode && test_size > 1 && test_size > file_size / 100); + + free (writer.buffer); + free (file_buffer); + } + + if (errors) + printf ("\n***** %d errors detected in %d tests using %d files (%d skipped) *****\n\n", errors, tests, checked, skipped); + else { + printf ("\nsuccessfully ran %d tests using %d files (%d skipped) with no errors detected\n", tests, checked, skipped); + printf ("cumulative results: %llu 
bytes --> %llu bytes, %.2f%%\n\n", total_input_bytes, total_output_bytes, + total_output_bytes * 100.0 / total_input_bytes); + } + + return errors; +} diff --git a/bytearray.c b/other/bytearray.c similarity index 100% rename from bytearray.c rename to other/bytearray.c diff --git a/bytearray.h b/other/bytearray.h similarity index 100% rename from bytearray.h rename to other/bytearray.h diff --git a/coder_lib.c b/other/coder_lib.c similarity index 100% rename from coder_lib.c rename to other/coder_lib.c diff --git a/coder_lib.h b/other/coder_lib.h similarity index 100% rename from coder_lib.h rename to other/coder_lib.h diff --git a/crc.c b/other/crc.c similarity index 100% rename from crc.c rename to other/crc.c diff --git a/crc.h b/other/crc.h similarity index 100% rename from crc.h rename to other/crc.h diff --git a/for_hello.c b/other/for_hello.c similarity index 100% rename from for_hello.c rename to other/for_hello.c diff --git a/huffman.ctt b/other/huffman.ctt similarity index 100% rename from huffman.ctt rename to other/huffman.ctt diff --git a/huffman.htt b/other/huffman.htt similarity index 100% rename from huffman.htt rename to other/huffman.htt diff --git a/other/lzw-ab-master.zip b/other/lzw-ab-master.zip new file mode 100644 index 0000000..809259d Binary files /dev/null and b/other/lzw-ab-master.zip differ diff --git a/mystring.c b/other/mystring.c similarity index 100% rename from mystring.c rename to other/mystring.c diff --git a/mystring.h b/other/mystring.h similarity index 100% rename from mystring.h rename to other/mystring.h diff --git a/zl77/Makefile b/zl77/Makefile new file mode 100644 index 0000000..a2d3d43 --- /dev/null +++ b/zl77/Makefile @@ -0,0 +1,15 @@ + + +CC = gcc + + +# SRCS = $(wildcard *.c) +SRCS = zl77.c ../huffman/huffman_.c + +STR = $(subst from,to,from your heart) + +all: + $(CC) $(SRCS) -o hello + +clean: + rm -rf *.exe diff --git a/zl77/zl77.c b/zl77/zl77.c new file mode 100644 index 0000000..dcb3936 --- /dev/null +++ b/zl77/zl77.c @@ -0,0 +1,362 @@ + + +#include "zl77.h" +#include "stdio.h" +#include "stdlib.h" +#include "string.h" +#include "../huffman/huffman_.h" + +// zl77 算法的实现 + +#define DBG_WARN printf +#define DBG_LOG printf + + + +// 定义数据缓冲区步长 +#define LZ77_BUFF_STEP_SIZE 10 + + +typedef struct _buff_item{ + uint8_t data[LZ77_BUFF_STEP_SIZE]; + struct _buff_item *next; + struct _buff_item *prev; +}buff_item; + + +typedef struct _buff_def{ + buff_item *current; + buff_item *head; + int used; + int all; + int current_index; + int bit_used; +}buff_def; + + + +typedef struct _zl77_def +{ + int dict_len;// 字典长度 + int tran_len;// 转换区长度 + int index;// 窗口位置 + buff_def buff_chars;//字符编码区 + buff_def buff_pos;//标号编码区 + buff_def buff_bits;//编码类型标识区(1,字符;0,标号) + const uint8_t *in; + int in_len; + uint8_t cmp_pos;// 匹配到的pos距离 + uint8_t cmp_len;// 匹配到的长度 + uint8_t cmp_skip;// 窗口移动的距离 +}zl77_def; + + + +uint8_t zl77_buff_get_byte(buff_def *buff,int index); +void zl77_buff_set_byte(buff_def *buff,int index,uint8_t d); +void zl77_buff_append_bit(buff_def *buff,int bit); +void zl77_buff_append_byte(buff_def *buff, const uint8_t d); +int zl77_buff_get_bit(buff_def *buff, int index); + + + +zl77_def *zl77_creat(void) +{ + zl77_def *z=calloc(1,sizeof(zl77_def)); + z->dict_len=5; + z->tran_len=3; +} + + + +// 删除缓存 +void zl77_del_buff(buff_def *buff) +{ + buff_item *t=buff->head; + buff_item *o; + while(t){ + o=t; + t=t->next; + free(o); + } +} + +// 添加一个字节 +void zl77_buff_append_byte(buff_def *buff, const uint8_t d) +{ + if(buff->used>=buff->all){ + buff_item 
*t=buff->head; + buff_item *t_old=0; + while (t) + { + t_old=t; + t=t->next; + } + t=calloc(1,sizeof(buff_item)); + if(t_old){ + t_old->next=t; + t->prev=t_old; + }else{ + buff->head=t; + } + buff->all+=LZ77_BUFF_STEP_SIZE; + buff->current=t; + buff->current_index=buff->used; + } + while((buff->used/LZ77_BUFF_STEP_SIZE)>(buff->current_index/LZ77_BUFF_STEP_SIZE)){ + buff->current=buff->current->next; + buff->current_index+=LZ77_BUFF_STEP_SIZE; + } + buff->current->data[buff->used%LZ77_BUFF_STEP_SIZE]=d; + buff->used++; +} + +// 添加一个位 +void zl77_buff_append_bit(buff_def *buff,int bit) +{ + if(buff->bit_used/8>=buff->used){ + zl77_buff_append_byte(buff,0); + } + uint8_t d=zl77_buff_get_byte(buff,buff->bit_used/8); + d|=bit<<(buff->bit_used%8); + zl77_buff_set_byte(buff,-1,d); + buff->bit_used++; +} + +// 调整最近使用的缓冲区 +static void zl77_buff_adjust_current(buff_def *buff,int index){ + while((index/LZ77_BUFF_STEP_SIZE)>(buff->current_index/LZ77_BUFF_STEP_SIZE)){ + buff->current=buff->current->next; + buff->current_index+=LZ77_BUFF_STEP_SIZE; + } + while((index/LZ77_BUFF_STEP_SIZE)<(buff->current_index/LZ77_BUFF_STEP_SIZE)){ + buff->current=buff->current->prev; + buff->current_index-=LZ77_BUFF_STEP_SIZE; + } +} + + +// 获取指定字节 +uint8_t zl77_buff_get_byte(buff_def *buff,int index){ + if(index<0) index=buff->used+index; + if(index>=buff->used||index<0) return 0; + zl77_buff_adjust_current(buff,index); + return buff->current->data[index%LZ77_BUFF_STEP_SIZE]; +} + +// 设置指定字节 +void zl77_buff_set_byte(buff_def *buff,int index,uint8_t d){ + if(index<0) index=buff->used+index; + if(index>=buff->used||index<0) return ; + zl77_buff_adjust_current(buff,index); + buff->current->data[index%LZ77_BUFF_STEP_SIZE]=d; +} + + +// 获取指定位 +int zl77_buff_get_bit(buff_def *buff, int index){ + uint8_t d=zl77_buff_get_byte(buff,index/8); + return (d&(1<<(index%8)))?1:0; +} + + + + +void zl77_buff_print(buff_def *buff) +{ + DBG_LOG("buff:["); + for(int i=0;iused;i++){ + DBG_LOG("%02x ",zl77_buff_get_byte(buff,i)); + } + DBG_LOG("]\n"); +} + + + +static uint8_t zl77_get_char(zl77_def *z,int index) +{ + // DBG_LOG("get_char:[%d]\n",index); + if(index<0||index>=z->in_len) return 0; + return z->in[index]; +} + +// 比对,找到了返回0没找到返回1 +// 0记录标号,1记录原始数据 +static int zl77_cmp(zl77_def *z,int index){ + uint8_t pos=0; + uint8_t len=0; + // DBG_LOG("index=%d\n",index); + for(int i=z->dict_len;i>0;i--){ + if(zl77_get_char(z,index-i)==zl77_get_char(z,index)){ + pos=i; + len=0; + for(int j=0;jz->cmp_len){ + z->cmp_len=len; + z->cmp_pos=pos; + } + }else{ + len=0; + break; + } + } + } + } + if((pos|len)==0){ + z->cmp_skip=1; + return 1; + } + else{ + // for(int i=0;icmp_len;i++){ + // DBG_LOG("%02x|%02x ",zl77_get_char(z,index-z->cmp_pos+i),zl77_get_char(z,index+i)); + // } + z->cmp_skip=z->cmp_len; + return 0; + } +} + + + +static inline void zl77_append_u32(uint8_t *data,int *index,uint32_t value){ + data[(*index)++]=value&0xff; + data[(*index)++]=(value>>8)&0xff; + data[(*index)++]=(value>>16)&0xff; + data[(*index)++]=(value>>24)&0xff; +} + +static inline uint32_t zl77_get_u32(const uint8_t *data,int index){ + uint32_t ret=0; + for(int i=0;i<4;i++){ + ret|=data[index+i]<<(8*i); + } + return ret; +} + + +int zl77_encode(const uint8_t *in,const int in_len,uint8_t **out,int *out_len) +{ + int ret; + zl77_def *z=zl77_creat(); + z->in=in; + z->in_len=in_len; + for(int i=0;iin_len;){ + z->cmp_pos=0; + z->cmp_len=0; + ret=zl77_cmp(z,i); + if(ret){ + zl77_buff_append_byte(&z->buff_chars,zl77_get_char(z,i)); + // 
DBG_LOG("char(%c);",zl77_get_char(z,i)); + }else{ + zl77_buff_append_byte(&z->buff_pos,((z->cmp_pos&0xf)<<4)|(z->cmp_len&0xf)); + // DBG_LOG("pos(%d,%d);",z->cmp_pos,z->cmp_len); + if((z->cmp_pos|z->cmp_len)==0){ + exit(1); + } + } + zl77_buff_append_bit(&z->buff_bits,ret); + i+=z->cmp_skip; + } + // DBG_LOG("\n"); + // zl77_buff_print(&z->buff_chars); + // zl77_buff_print(&z->buff_pos); + // zl77_buff_print(&z->buff_bits); + uint32_t size_chars=z->buff_chars.used; + uint32_t size_pos=z->buff_pos.used; + uint32_t size_bits=z->buff_bits.used; + uint32_t size_unpack=z->in_len; + int index=0; + (*out_len)=16+size_chars+size_pos+size_bits; + (*out)=calloc(*out_len,sizeof(uint8_t)); + zl77_append_u32(*out,&index,size_chars); + zl77_append_u32(*out,&index,size_pos); + zl77_append_u32(*out,&index,size_bits); + zl77_append_u32(*out,&index,size_unpack); + for(int i=0;ibuff_chars,i); + } + for(int i=0;ibuff_pos,i); + } + for(int i=0;ibuff_bits,i); + } + zl77_del_buff(&z->buff_chars); + zl77_del_buff(&z->buff_pos); + zl77_del_buff(&z->buff_bits); + free(z); + + DBG_LOG("in_len=%d,out_len=%d\n",in_len,*out_len); + return 0; +} + + +static inline int zl77_get_bit(const uint8_t *data,int index){ + uint8_t c=data[index/8]; + return c&(1<<(index%8))?1:0; +} + +int zl77_decode(const uint8_t* in, const int in_len, uint8_t** out, int* out_len) +{ + int ret; + int index_chars=0; + int index_pos=0; + int index_bits=0; + uint8_t cmp_pos,cmp_len,ch; + zl77_def *z=zl77_creat(); + uint32_t size_chars=zl77_get_u32(in,0); + uint32_t size_pos=zl77_get_u32(in,4); + uint32_t size_bits=zl77_get_u32(in,8); + uint32_t size_unpack=zl77_get_u32(in,12); + const uint8_t *chars=in+16; + const uint8_t *pos=in+16+size_chars; + const uint8_t *bits=in+16+size_chars+size_pos; + (*out)=calloc(size_unpack+1,sizeof(uint8_t)); + for(int i=0;i>4; + cmp_len=pos[index_pos]&0xf;index_pos++; + // DBG_LOG("pos(%d,%d)",cmp_pos,cmp_len); + memcpy(&(*out)[i],&(*out)[i-cmp_pos],cmp_len); + i+=cmp_len; + } + } + // DBG_LOG("\n"); + free(z); + return 0; +} + + + + + +void main(int argc,const char *argv[]) +{ + uint8_t *encode_data=0; + int encode_len=0; + uint8_t *decode_data=0; + int decode_len=0; + hm_encode(argv[1],strlen(argv[1]),&encode_data,&encode_len); + // for(int i=0;i