c_progarm/lzw-ab/lzwlib.c

////////////////////////////////////////////////////////////////////////////
//                            **** LZW-AB ****                            //
//               Adjusted Binary LZW Compressor/Decompressor              //
//                  Copyright (c) 2016-2020 David Bryant                  //
//                           All Rights Reserved                          //
//      Distributed under the BSD Software License (see license.txt)      //
////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lzwlib.h"

/* This library implements the LZW general-purpose data compression algorithm.
 * The algorithm was originally described as a hardware implementation by
 * Terry Welsh here:
 *
 *   Welch, T.A. “A Technique for High-Performance Data Compression.”
 *   IEEE Computer 17,6 (June 1984), pp. 8-19.
 *
 * Since then there have been enumerable refinements and variations on the
 * basic technique, and this implementation is no different. The target of
 * the present implementation is embedded systems, and so emphasis was placed
 * on simplicity, fast execution, and minimal RAM usage.
 * 
 * This is a streaming compressor in that the data is not divided into blocks
 * and no context information like dictionaries or Huffman tables are sent
 * ahead of the compressed data (except for one byte to signal the maximum
 * bit depth). This limits the maximum possible compression ratio compared to
 * algorithms that significantly preprocess the data, but with the help of
 * some enhancements to the LZW algorithm (described below) it is able to
 * compress better than the UNIX "compress" utility (which is also LZW) and
 * is in fact closer to and sometimes beats the compression level of "gzip".
 *
 * The symbols are stored in "adjusted binary" which provides somewhat better
 * compression, with virtually no speed penalty, compared to the fixed word
 * sizes normally used. These are sometimes called "phased-in" binary codes
 * and their use in LZW is described here:
 *
 *   R. N. Horspool, "Improving LZW (data compression algorithm)", Data
 *   Compression Conference, pp. 332-341, 1991.
 *
 * Earlier versions of this compressor would reset as soon as the dictionary
 * became full to ensure good performance on heterogenous data (such as tar
 * files or executable images). While trivial to implement, this is not
 * particularly efficient with homogeneous data (or in general) because we
 * spend a lot of time sending short symbols where the compression is poor.
 *
 * This newer version utilizes a technique such that once the dictionary is
 * full, we restart at the beginning and recycle only those codes that were
 * seen only once. We know this because they are not referenced by longer
 * strings, and are easy to replace in the dictionary for the same reason.
 * Since they have only been seen once it's also more likely that we will
 * be replacing them with a more common string, and this is especially
 * true if the data characteristics are changing.
 *
 * Replacing string codes in this manner has the interesting side effect that
 * some older shorter strings that the removed strings were based on will
 * possibly become unreferenced themselves and be recycled on the next pass.
 * In this way, the entire dictionary constantly "churns" based on the
 * incoming stream, thereby improving and adapting to optimal compression.
 *
 * Even with this technique there is still a possibility that a sudden change
 * in the data characteristics will appear, resulting in significant negative
 * compression (up to 100% for 16-bit codes). To detect this case we generate
 * an exponentially decaying average of the current compression ratio and reset
 * when this hits about 1.06, which limits worst case inflation to about 8%.
 *
 * The maximum symbol size is configurable on the encode side (from 9 bits to
 * 16 bits) and determines the RAM footprint required by both sides and, to a
 * large extent, the compression performance. This information is communicated
 * to the decoder in the first stream byte so that it can allocate accordingly.
 * The RAM requirements are as follows:
 *
 *    maximum    encoder RAM   decoder RAM
 *  symbol size  requirement   requirement
 * -----------------------------------------
 *     9-bit      4096 bytes    2368 bytes
 *    10-bit      8192 bytes    4992 bytes
 *    11-bit     16384 bytes   10240 bytes
 *    12-bit     32768 bytes   20736 bytes
 *    13-bit     65536 bytes   41728 bytes
 *    14-bit    131072 bytes   83712 bytes
 *    15-bit    262144 bytes  167680 bytes
 *    16-bit    524288 bytes  335616 bytes
 *
 * This implementation uses malloc(), but obviously an embedded version could
 * use static arrays instead if desired (assuming that the maxbits was
 * controlled outside).
 */

#define NULL_CODE       65535   // indicates a NULL prefix (must be unsigned short)
#define CLEAR_CODE      256     // code to flush dictionary and restart decoder
#define FIRST_STRING    257     // code of first dictionary string

/* This macro determines the number of bits required to represent the given value,
 * not counting the implied MSB. For GNU C it will use the provided built-in,
 * otherwise a comparison tree is employed. Note that in the non-GNU case, only
 * values up to 65535 (15 bits) are supported.
 */

#ifdef __GNUC__
#define CODE_BITS(n) (31 - __builtin_clz(n))
#else
#define CODE_BITS(n) ((n) < 4096 ?                                      \
            ((n) < 1024  ? 8  + ((n) >= 512)  : 10 + ((n) >= 2048)) :   \
            ((n) < 16384 ? 12 + ((n) >= 8192) : 14 + ((n) >= 32768)))
#endif

/* This macro writes the adjusted-binary symbol "code" given the maximum
 * symbol "maxcode". A macro is used here just to avoid the duplication in
 * the lzw_compress() function. The idea is that if "maxcode" is not one
 * less than a power of two (which it rarely will be) then this code can
 * often send fewer bits that would be required with a fixed-sized code.
 *
 * For example, the first code we send will have a "maxcode" of 257, so
 * every "code" would normally consume 9 bits. But with adjusted binary we
 * can actually represent any code from 0 to 253 with just 8 bits -- only
 * the 4 codes from 254 to 257 take 9 bits.
 */

#define WRITE_CODE(code,maxcode) do {                               \
    unsigned int code_bits = CODE_BITS (maxcode);                   \
    unsigned int extras = (2 << code_bits) - (maxcode) - 1;         \
    if ((code) < extras) {                                          \
        shifter |= ((code) << bits);                                \
        bits += code_bits;                                          \
    }                                                               \
    else {                                                          \
        shifter |= ((((code) + extras) >> 1) << bits);              \
        bits += code_bits;                                          \
        shifter |= ((((code) + extras) & 1) << bits++);             \
    }                                                               \
    do { (*dst)(shifter,dstctx); shifter >>= 8;                     \
        output_bytes += 256;                                        \
    } while ((bits -= 8) >= 8);                                     \
} while (0)

/* LZW compression function. Bytes (8-bit) are read and written through callbacks and the
 * "maxbits" parameter specifies the maximum symbol size (9-16), which in turn determines
 * the RAM requirement and, to a large extent, the level of compression achievable. A return
 * value of EOF from the "src" callback terminates the compression process. A non-zero return
 * value indicates one of the two possible errors -- bad "maxbits" param or failed malloc().
 * There are contexts (void pointers) that are passed to the callbacks to easily facilitate
 * multiple instances of the compression operation (but simple applications can ignore these).
 */

typedef struct {
    unsigned short first_reference, next_reference, back_reference;
    unsigned char terminator;
} encoder_entry_t;

int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx, int maxbits)
{
    unsigned int maxcode = FIRST_STRING, next_string = FIRST_STRING, prefix = NULL_CODE, total_codes;
    unsigned int dictionary_full = 0, available_entries, max_available_entries, max_available_code;
    unsigned int input_bytes = 65536, output_bytes = 65536;
    unsigned int shifter = 0, bits = 0;
    encoder_entry_t *dictionary;
    int c;

    if (maxbits < 9 || maxbits > 16)    // check for valid "maxbits" setting
        return 1;

    // based on the "maxbits" parameter, compute total codes and allocate dictionary storage

    total_codes = 1 << maxbits;
    dictionary = malloc (total_codes * sizeof (encoder_entry_t));
    max_available_entries = total_codes - FIRST_STRING - 1;
    max_available_code = total_codes - 2;

    if (!dictionary)
        return 1;                       // failed malloc()

    // clear the dictionary

    available_entries = max_available_entries;
    memset (dictionary, 0, 256 * sizeof (encoder_entry_t));

    (*dst)(maxbits - 9, dstctx);    // first byte in output stream indicates the maximum symbol bits

    // This is the main loop where we read input bytes and compress them. We always keep track of the
    // "prefix", which represents a pending byte (if < 256) or string entry (if >= FIRST_STRING) that
    // has not been sent to the decoder yet. The output symbols are kept in the "shifter" and "bits"
    // variables and are sent to the output every time 8 bits are available (done in the macro).

    while ((c = (*src)(srcctx)) != EOF) {
        unsigned int cti;                   // coding table index

        input_bytes += 256;

        if (prefix == NULL_CODE) {          // this only happens the very first byte when we don't yet have a prefix
            prefix = c;
            continue;
        }

        memset (dictionary + next_string, 0, sizeof (encoder_entry_t));

        if ((cti = dictionary [prefix].first_reference)) {          // if any longer strings are built on the current prefix...
            while (1)
                if (dictionary [cti].terminator == c) {             // we found a matching string, so we just update the prefix
                    prefix = cti;                                   // to that string and continue without sending anything
                    break;
                }
                else if (!dictionary [cti].next_reference) {        // this string did not match the new character and
                    dictionary [cti].next_reference = next_string;  // there aren't any more, so we'll add a new string,
                                                                    // point to it with "next_reference", and also make the
                    dictionary [next_string].back_reference = cti;  // "back_reference" which is used for recycling entries
                    cti = 0;
                    break;
                }
                else
                    cti = dictionary [cti].next_reference;          // there are more possible matches to check, so loop back
        }
        else {                                                      // no longer strings are based on the current prefix, so now
            dictionary [prefix].first_reference = next_string;      // the current prefix plus the new byte will be the next string
            dictionary [next_string].back_reference = prefix;       // also make the back_reference used for recycling
            if (prefix >= FIRST_STRING) available_entries--;        // the codes 0-255 are never available for recycling
        }

        // If "cti" is zero, we could not simply extend our "prefix" to a longer string because we did not find a
        // dictionary match, so we send the symbol representing the current "prefix" and add the new string to the
        // dictionary. Since the current byte "c" was not included in the prefix, that now becomes our new prefix.

        if (!cti) {
            WRITE_CODE (prefix, maxcode);               // send symbol for current prefix (0 to maxcode-1)
            dictionary [next_string].terminator = c;    // newly created string has current byte as the terminator
            prefix = c;                                 // current byte also becomes new prefix for next string

            // If the dictionary is not full yet, we bump the maxcode and next_string and check to see if the
            // dictionary is now full. If it is we set the dictionary_full flag and leave maxcode set to two
            // less than total_codes because every string entry is now available for matching, but the actual
            // maximum code is reserved for EOF.

            if (!dictionary_full) {
                dictionary_full = (++next_string > max_available_code);
                maxcode++;
            }

            // If the dictionary is full we look for an entry to recycle starting at next_string (the one we
            // just created or recycled) plus one (with check for wrap check). We know there is one because at
            // a minimum the string we just added. This also takes care of removing the entry to be recycled
            // (which is possible/easy because no longer strings have been based on it).

            if (dictionary_full) {
                for (next_string++; next_string <= max_available_code || (next_string = FIRST_STRING); next_string++)
                    if (!dictionary [next_string].first_reference)
                        break;

                cti = dictionary [next_string].back_reference;  // dictionary [cti] references the entry we're
                                                                // trying to recycle (either as a first or a next)

                if (dictionary [cti].first_reference == next_string) {
                    dictionary [cti].first_reference = dictionary [next_string].next_reference;

                    // if we just cleared a first reference, and that string is not 0-255,
                    // then that's a newly available entry
                    if (!dictionary [cti].first_reference && cti >= FIRST_STRING)
                        available_entries++;
                }
                else if (dictionary [cti].next_reference == next_string)    // fixup a "next_reference"
                    dictionary [cti].next_reference = dictionary [next_string].next_reference;

                // If the entry we're recycling had a next reference, then update the back reference
                // so it's completely out of the chain. Of course we know it didn't have a first
                // reference because then we wouldn't be recycling it.

                if (dictionary [next_string].next_reference)
                    dictionary [dictionary [next_string].next_reference].back_reference = cti;

                // This check is technically not needed because there will always be an available entry
                // (the last string we added at a minimum) but we don't want to get in a situation where
                // we only have a few entries that we're cycling though. I pulled the limits (16 entries
                // or 1% of total) out of a hat.

                if (available_entries < 16 || available_entries * 100 < max_available_entries) {
                    // clear the dictionary and reset the byte counters -- basically everything starts over
                    // except that we keep the last pending "prefix" (which, of course, was never sent)

                    WRITE_CODE (CLEAR_CODE, maxcode);
                    memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
                    available_entries = max_available_entries;
                    next_string = maxcode = FIRST_STRING;
                    input_bytes = output_bytes = 65536;
                    dictionary_full = 0;
                }
            }

            // This is similar to the above check, except that it's used whether the dictionary is full or not.
            // It uses an exponentially decaying average of the current compression ratio, so it can terminate
            // very early if the incoming data is uncompressible or it can terminate any later time that the
            // dictionary no longer compresses the incoming stream.

            if (output_bytes > input_bytes + (input_bytes >> 4)) {
                WRITE_CODE (CLEAR_CODE, maxcode);
                memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
                available_entries = max_available_entries;
                next_string = maxcode = FIRST_STRING;
                input_bytes = output_bytes = 65536;
                dictionary_full = 0;
            }
            else {
                output_bytes -= output_bytes >> 8;
                input_bytes -= input_bytes >> 8;
            }
        }
    }

    // we're done with input, so if we've received anything we still need to send that pesky pending prefix...

    if (prefix != NULL_CODE) {
        WRITE_CODE (prefix, maxcode);

        if (!dictionary_full)
            maxcode++;
    }

    WRITE_CODE (maxcode, maxcode);  // the maximum possible code is always reserved for our END_CODE

    if (bits)                       // finally, flush any pending bits from the shifter
        (*dst)(shifter, dstctx);

    free (dictionary);
    return 0;
}

/* LZW decompression function. Bytes (8-bit) are read and written through callbacks. The
 * "maxbits" parameter is read as the first byte in the stream and controls how much memory
 * is allocated for decoding. A return value of EOF from the "src" callback terminates the
 * decompression process (although this should not normally occur). A non-zero return value
 * indicates an error, which in this case can be a bad "maxbits" read from the stream, a
 * failed malloc(), or if an EOF is read from the input stream before the decompression
 * terminates naturally with END_CODE. There are contexts (void pointers) that are passed
 * to the callbacks to easily facilitate multiple instances of the decompression operation
 * (but simple applications can ignore these).
 */

typedef struct {
    unsigned char terminator, extra_references;
    unsigned short prefix;
} decoder_entry_t;

int lzw_decompress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx)
{
    unsigned int maxcode = FIRST_STRING, next_string = FIRST_STRING - 1, prefix = CLEAR_CODE;
    unsigned int dictionary_full = 0, max_available_code, total_codes;
    unsigned int shifter = 0, bits = 0, read_byte, i;
    unsigned char *reverse_buffer, *referenced;
    decoder_entry_t *dictionary;

    if ((read_byte = ((*src)(srcctx))) == EOF || (read_byte & 0xf8))  //sanitize first byte
        return 1;

    // based on the "maxbits" parameter, compute total codes and allocate dictionary storage

    total_codes = 512 << (read_byte & 0x7);
    max_available_code = total_codes - 2;
    dictionary = malloc (total_codes * sizeof (decoder_entry_t));
    reverse_buffer = malloc (total_codes - 256);
    referenced = malloc (total_codes / 8);  // bitfield indicating code is referenced at least once

    // Note that to implement the dictionary entry recycling we have to keep track of how many
    // longer strings are based on each string in the dictionary. This can be between 0 (no
    // references) to 256 (every possible next byte), but unfortunately that's one more value
    // than what can be stored in a byte. The solution is to have a single bit for each entry
    // indicating any references (i.e., the code cannot be recycled) and an additional byte
    // in the dictionary entry struct counting the "extra" references (beyond one).

    if (!reverse_buffer || !dictionary)         // check for malloc() failure
        return 1;

    for (i = 0; i < 256; ++i) {                 // these never change
        dictionary [i].prefix = NULL_CODE;
        dictionary [i].terminator = i;
    }

    // This is the main loop where we read input symbols. The values range from 0 to the code value
    // of the "next" string in the dictionary (although the actual "next" code cannot be used yet,
    // and so we reserve that code for the END_CODE). Note that receiving an EOF from the input
    // stream is actually an error because we should have gotten the END_CODE first.

    while (1) {
        unsigned int code_bits = CODE_BITS (maxcode), code;
        unsigned int extras = (2 << code_bits) - maxcode - 1;

        do {
            if ((read_byte = ((*src)(srcctx))) == EOF) {
                free (dictionary); free (reverse_buffer); free (referenced);
                return 1;
            }

            shifter |= read_byte << bits;
        } while ((bits += 8) < code_bits);

        // first we assume the code will fit in the minimum number of required bits

        code = shifter & ((1 << code_bits) - 1);
        shifter >>= code_bits;
        bits -= code_bits;

        // but if code >= extras, then we need to read another bit to calculate the real code
        // (this is the "adjusted binary" part)

        if (code >= extras) {
            if (!bits) {
                if ((read_byte = ((*src)(srcctx))) == EOF) {
                    free (dictionary); free (reverse_buffer); free (referenced);
                    return 1;
                }

                shifter = read_byte;
                bits = 8;
            }

            code = (code << 1) - extras + (shifter & 1);
            shifter >>= 1;
            bits--;
        }

        if (code == maxcode)                // sending the maximum code is reserved for the end of the file
            break;
        else if (code == CLEAR_CODE) {      // otherwise check for a CLEAR_CODE to start over early
            next_string = FIRST_STRING - 1;
            maxcode = FIRST_STRING;
            dictionary_full = 0;
        }
        else if (prefix == CLEAR_CODE) {    // this only happens at the first symbol which is always sent
            (*dst)(code, dstctx);           // literally and becomes our initial prefix
            next_string++;
            maxcode++;
        }
        // Otherwise we have a valid prefix so we step through the string from end to beginning storing the
        // bytes in the "reverse_buffer", and then we send them out in the proper order. One corner-case
        // we have to handle here is that the string might be the same one that is actually being defined
        // now (code == next_string).
        else {
            unsigned int cti = (code == next_string) ? prefix : code;
            unsigned char *rbp = reverse_buffer, c;

            do {
                *rbp++ = dictionary [cti].terminator;
                if (rbp == reverse_buffer + total_codes - 256) {
                    free (dictionary); free (reverse_buffer); free (referenced);
                    return 1;
                }
            } while ((cti = dictionary [cti].prefix) != NULL_CODE);

            c = *--rbp;     // the first byte in this string is the terminator for the last string, which is
                            // the one that we'll create a new dictionary entry for this time

            do      // send string in corrected order (except for the terminator which we don't know yet)
                (*dst)(*rbp, dstctx);
            while (rbp-- != reverse_buffer);

            if (code == next_string) {
                (*dst)(c,dstctx);
            }

            // This should always execute (the conditional is to catch corruptions) and is where we add a new string to
            // the dictionary, either at the end or elsewhere when we are "recycling" entries that were never referenced

            if (next_string >= FIRST_STRING && next_string < total_codes) {
                if (referenced [prefix >> 3] & (1 << (prefix & 7)))     // increment reference count on prefix
                    dictionary [prefix].extra_references++;
                else
                    referenced [prefix >> 3] |= 1 << (prefix & 7);

                dictionary [next_string].prefix = prefix;       // now update the next dictionary entry with the new string
                dictionary [next_string].terminator = c;        // (but we're always one behind, so it's not the string just sent)
                dictionary [next_string].extra_references = 0;  // newly created string has not been referenced
                referenced [next_string >> 3] &= ~(1 << (next_string & 7));
            }

            // If the dictionary is not full yet, we bump the maxcode and next_string and check to see if the
            // dictionary is now full. If it is we set the dictionary_full flag and set next_string back to the
            // beginning of the dictionary strings to start recycling them. Note that then maxcode will remain
            // two less than total_codes because every string entry is available for matching, and the actual
            // maximum code is reserved for EOF.

            if (!dictionary_full) {
                maxcode++;

                if (++next_string > max_available_code) {
                    dictionary_full = 1;
                    maxcode--;
                }
            }

            // If the dictionary is full we look for an entry to recycle starting at next_string (the one we
            // created or recycled) plus one. We know there is one because at a minimum the string we just added
            // has not been referenced). This also takes care of removing the entry to be recycled (which is
            // possible/easy because no longer strings have been based on it).

            if (dictionary_full) {
                for (next_string++; next_string <= max_available_code || (next_string = FIRST_STRING); next_string++)
                    if (!(referenced [next_string >> 3] & (1 << (next_string & 7))))
                        break;

                if (dictionary [dictionary [next_string].prefix].extra_references)
                    dictionary [dictionary [next_string].prefix].extra_references--;
                else
                    referenced [dictionary [next_string].prefix >> 3] &= ~(1 << (dictionary [next_string].prefix & 7));
            }
        }

        prefix = code;      // the code we just received becomes the prefix for the next dictionary string entry
                            // (which we'll create once we find out the terminator)
    }

    free (dictionary); free (reverse_buffer); free (referenced);
    return 0;
}