HUFF.C

/************************** Start of HUFF.C *************************
 *
 * This is the Huffman coding module
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <windows.h>
#include "bitio.h"
#include "errhand.h"

/*
 * The NODE structure is a node in the Huffman decoding tree.  It has a
 * count, which is its weight in the tree, and the node numbers of its
 * two children.  The saved_count member of the structure is only
 * there for debugging purposes, and can be safely taken out at any
 * time.  It just holds the intial count for each of the symbols, since
 * the count member is continually being modified as the tree grows.
 */
typedef struct tree_node
{
    unsigned int count;
    unsigned int saved_count;
    int child_0;
    int child_1;
} NODE;

/*
 * A Huffman tree is set up for decoding, not encoding.  When encoding,
 * I first walk through the tree and build up a table of codes for
 * each symbol.  The codes are stored in this CODE structure.
 */
typedef struct code
{
    unsigned int code;
    int code_bits;
} CODE;

/*
 * The special EOS symbol is 256, the first available symbol after all
 * of the possible bytes.  When decoding, reading this symbols
 * indicates that all of the data has been read in.
 */
#define END_OF_STREAM 256

/*
 * Local function prototypes, defined with or without ANSI prototypes.
 */

void count_bytes(FILE *input, unsigned long *long_counts);
void scale_counts(unsigned long *long_counts, NODE *nodes);
int build_tree(NODE *nodes);
void convert_tree_to_code(NODE *nodes,
                          CODE *codes,
                          unsigned int code_so_far,
                          int bits,
                          int node);
void output_counts(BIT_FILE *output, NODE *nodes);
void input_counts(BIT_FILE *input, NODE *nodes);
void compress_data(FILE *input, BIT_FILE *output, CODE *codes);
void expand_data(BIT_FILE *input, FILE *output, NODE *nodes,
                 int root_node);

/*
 * These two strings are used by MAIN-C.C and MAIN-E.C to print
 * messages of importance to the user of the program.
 */
// char *CompressionName = "static order 0 model with Huffman coding";
// char *Usage = "infile outfile\n\n";

/*
 * CompressFile is the compression routine called by MAIN-C.C.  It
 * looks for a single additional argument to be passed to it from
 * the command line:  "-d".  If a "-d" is present, it means the
 * user wants to see the model data dumped out for debugging
 * purposes.
 *
 * This routine works in a fairly straightforward manner.  First,
 * it has to allocate storage for three different arrays of data.
 * Next, it counts all the bytes in the input file.  The counts
 * are all stored in long int, so the next step is scale them down
 * to single byte counts in the NODE array.  After the counts are
 * scaled, the Huffman decoding tree is built on top of the NODE
 * array.  Another routine walks through the tree to build a table
 * of codes, one per symbol.  Finally, when the codes are all ready,
 * compressing the file is a simple matter.  After the file is
 * compressed, the storage is freed up, and the routine returns.
 *
 */
int CompressFile_huffman(FILE *input, BIT_FILE *output)
{
    //分配空间
    unsigned long *counts;
    NODE *nodes;
    CODE *codes;
    int root_node;

    DWORD milliseconds = GetTickCount();  //返回毫秒数

    /* calloc 在内存的动态存储区中分配n个长度为size的连续空间
     * 返回一个指向分配起始地址的指针
     */
    counts = (unsigned long *)calloc(256, sizeof(unsigned long));
    if (counts == NULL)
        fatal_error("Error allocating counts array\n");
    if ((nodes = (NODE *)calloc(514, sizeof(NODE))) == NULL)
        fatal_error("Error allocating nodes array\n");
    if ((codes = (CODE *)calloc(257, sizeof(CODE))) == NULL)
        fatal_error("Error allocating codes array\n");
    count_bytes(input, counts);  //计算字节数
    scale_counts(counts, nodes);  //等比例缩小计数 并匹配权重
    output_counts(output, nodes);  //压缩文件中的符号计数
    root_node = build_tree(nodes);  //Huffman Tree
    convert_tree_to_code(nodes, codes, 0, 0, root_node);  //得到编码
    compress_data(input, output, codes);  //compress
    //释放内存空间
    free((char *)counts);
    free((char *)nodes);
    free((char *)codes);
    return (GetTickCount() - milliseconds);
}

/*
 * ExpandFile is the routine called by MAIN-E.C to expand a file that
 * has been compressed with order 0 Huffman coding.  This routine has
 * a simpler job than that of the Compression routine.  All it has to
 * do is read in the counts that have been stored in the compressed
 * file, then build the Huffman tree.  The data can then be expanded
 * by reading in a bit at a time from the compressed file.  Finally,
 * the node array is freed and the routine returns.
 *
 */
/* ? ? ? what is order 0 Huffman coding ? ? ? */
int ExpandFile_huffman(BIT_FILE *input, FILE *output)
{
    NODE *nodes;
    int root_node;

    DWORD milliseconds = GetTickCount();
    if ((nodes = (NODE *)calloc(514, sizeof(NODE))) == NULL)
        fatal_error("Error allocating nodes array\n");
    input_counts(input, nodes);
    root_node = build_tree(nodes);
    expand_data(input, output, nodes, root_node);
    free((char *)nodes);
    return (GetTickCount() - milliseconds);
}

/*
 * In order for the compressor to build the same model, I have to store
 * the symbol counts in the compressed file so the expander can read
 * them in.  In order to save space, I don't save all 256 symbols
 * unconditionally.  The format used to store counts looks like this:
 *
 *  start, stop, counts, start, stop, counts, ... 0
 *
 * This means that I store runs of counts, until all the non-zero
 * counts have been stored.  At this time the list is terminated by
 * storing a start value of 0.  Note that at least 1 run of counts has
 * to be stored, so even if the first start value is 0, I read it in.
 * It also means that even in an empty file that has no counts, I have
 * to pass at least one count.
 *
 * In order to efficiently use this format, I have to identify runs of
 * non-zero counts.  Because of the format used, I don't want to stop a
 * run because of just one or two zeros in the count stream.  So I have
 * to sit in a loop looking for strings of three or more zero values in
 * a row.
 *
 * This is simple in concept, but it ends up being one of the most
 * complicated routines in the whole program.  A routine that just
 * writes out 256 values without attempting to optimize would be much
 * simpler, but would hurt compression quite a bit on small files.
 *
 */
void output_counts(BIT_FILE *output, NODE *nodes)
{
    int first;
    int last;
    int next;
    int i;

    first = 0;
    while (first < 255 && nodes[first].count == 0)
        first++;
    /*
     * Each time I hit the start of the loop, I assume that first is the
     * number for a run of non-zero values.  The rest of the loop is
     * concerned with finding the value for last, which is the end of the
     * run, and the value of next, which is the start of the next run.
     * At the end of the loop, I assign next to first, so it starts in on
     * the next run.
     */
    for (; first < 256; first = next)
    {
        last = first + 1;
        for (;;)
        {
            for (; last < 256; last++)
                if (nodes[last].count == 0)
                    break;
            last--;
            for (next = last + 1; next < 256; next++)
                if (nodes[next].count != 0)
                    break;
            if (next > 255)
                break;
            if ((next - last) > 3)
                break;
            last = next;
        };
        /*
         * Here is where I output first, last, and all the counts in between.
         */
        if (putc(first, output->file) != first)
            fatal_error("Error writing byte counts\n");
        if (putc(last, output->file) != last)
            fatal_error("Error writing byte counts\n");
        for (i = first; i <= last; i++)
        {
            if (putc(nodes[i].count, output->file) !=
                (int)nodes[i].count)
                fatal_error("Error writing byte counts\n");
        }
    }
    if (putc(0, output->file) != 0)
        fatal_error("Error writing byte counts\n");
}

/*
 * When expanding, I have to read in the same set of counts.  This is
 * quite a bit easier that the process of writing them out, since no
 * decision making needs to be done.  All I do is read in first, check
 * to see if I am all done, and if not, read in last and a string of
 * counts.
 */

void input_counts(BIT_FILE *input, NODE *nodes)
{
    int first;
    int last;
    int i;
    int c;

    for (i = 0; i < 256; i++)
        nodes[i].count = 0;
    if ((first = getc(input->file)) == EOF)
        fatal_error("Error reading byte counts\n");
    if ((last = getc(input->file)) == EOF)
        fatal_error("Error reading byte counts\n");
    for (;;)
    {
        for (i = first; i <= last; i++)
            if ((c = getc(input->file)) == EOF)
                fatal_error("Error reading byte counts\n");
            else
                nodes[i].count = (unsigned int)c;
        if ((first = getc(input->file)) == EOF)
            fatal_error("Error reading byte counts\n");
        if (first == 0)
            break;
        if ((last = getc(input->file)) == EOF)
            fatal_error("Error reading byte counts\n");
    }
    nodes[END_OF_STREAM].count = 1;
}

/*
 * This routine counts the frequency of occurence of every byte in
 * the input file.  It marks the place in the input stream where it
 * started, counts up all the bytes, then returns to the place where
 * it started.  In most C implementations, the length of a file
 * cannot exceed an unsigned long, so this routine should always
 * work.
 */

#ifndef SEEK_SET
#define SEEK_SET 0
#endif

void count_bytes(FILE *input, unsigned long *counts)
{
    long input_marker;
    int c;

    //ftell()返回指定流的文件指示器指针的当前值
    input_marker = ftell(input);
    while ((c = getc(input)) != EOF)
        counts[c]++;
    //fseek()设置流的文件位置为给定的偏移
    //input_marker--相对于SEEK_SET(文件的开头)的偏移量
    fseek(input, input_marker, SEEK_SET);
}

/*
 * In order to limit the size of my Huffman codes to 16 bits, I scale
 * my counts down so they fit in an unsigned char, and then store them
 * all as initial weights in my NODE array.  The only thing to be
 * careful of is to make sure that a node with a non-zero count doesn't
 * get scaled down to 0.  Nodes with values of 0 don't get codes.
 */
void scale_counts(unsigned long *counts, NODE *nodes)
{
    unsigned long max_count;
    int i;

    max_count = 0;
    for (i = 0; i < 256; i++)
        if (counts[i] > max_count)
            max_count = counts[i];
    if (max_count == 0)
    {
        counts[0] = 1;
        max_count = 1;
    }
    max_count = max_count / 255;
    max_count = max_count + 1;
    for (i = 0; i < 256; i++)
    {
        nodes[i].count = (unsigned int)(counts[i] / max_count);
        if (nodes[i].count == 0 && counts[i] != 0)
            nodes[i].count = 1;
    }
    nodes[END_OF_STREAM].count = 1;
}
/*
 * Building the Huffman tree is fairly simple.  All of the active nodes
 * are scanned in order to locate the two nodes with the minimum
 * weights.  These two weights are added together and assigned to a new
 * node.  The new node makes the two minimum nodes into its 0 child
 * and 1 child.  The two minimum nodes are then marked as inactive.
 * This process repeats until their is only one node left, which is the
 * root node.  The tree is done, and the root node is passed back
 * to the calling routine.
 *
 * Node 513 is used here to arbitratily provide a node with a guaranteed
 * maximum value.  It starts off being min_1 and min_2.  After all active
 * nodes have been scanned, I can tell if there is only one active node
 * left by checking to see if min_1 is still 513.
 */
int build_tree(NODE *nodes)
{
    int next_free;
    int i;
    int min_1;
    int min_2;

    nodes[513].count = 0xffff;
    for (next_free = END_OF_STREAM + 1;; next_free++)
    {
        min_1 = 513;
        min_2 = 513;
        for (i = 0; i < next_free; i++)
            if (nodes[i].count != 0)
            {
                if (nodes[i].count < nodes[min_1].count)
                {
                    min_2 = min_1;
                    min_1 = i;
                }
                else if (nodes[i].count < nodes[min_2].count)
                    min_2 = i;
            }
        if (min_2 == 513)
            break;
        nodes[next_free].count = nodes[min_1].count + nodes[min_2].count;
        nodes[min_1].saved_count = nodes[min_1].count;
        nodes[min_1].count = 0;
        nodes[min_2].saved_count = nodes[min_2].count;
        nodes[min_2].count = 0;
        nodes[next_free].child_0 = min_1;
        nodes[next_free].child_1 = min_2;
    }
    next_free--;
    nodes[next_free].saved_count = nodes[next_free].count;
    return (next_free);
}

/*
 * Since the Huffman tree is built as a decoding tree, there is
 * no simple way to get the encoding values for each symbol out of
 * it.  This routine recursively walks through the tree, adding the
 * child bits to each code until it gets to a leaf.  When it gets
 * to a leaf, it stores the code value in the CODE element, and
 * returns.
 */
void convert_tree_to_code(NODE *nodes, CODE *codes, unsigned int code_so_far, int bits, int node)
{
    if (node <= END_OF_STREAM)
    {
        codes[node].code = code_so_far;
        codes[node].code_bits = bits;
        return;
    }
    code_so_far <<= 1;
    bits++;
    convert_tree_to_code(nodes, codes, code_so_far, bits,
                         nodes[node].child_0);
    convert_tree_to_code(nodes, codes, code_so_far | 1,
                         bits, nodes[node].child_1);
}

/*
 * Once the tree gets built, and the CODE table is built, compressing
 * the data is a breeze.  Each byte is read in, and its corresponding
 * Huffman code is sent out.
 */
void compress_data(FILE *input, BIT_FILE *output, CODE *codes)
{
    int c;

    while ((c = getc(input)) != EOF)
        OutputBits(output, (unsigned long)codes[c].code,
                   codes[c].code_bits);
    OutputBits(output, (unsigned long)codes[END_OF_STREAM].code,
               codes[END_OF_STREAM].code_bits);
}

/*
 * Expanding compressed data is a little harder than the compression
 * phase.  As each new symbol is decoded, the tree is traversed,
 * starting at the root node, reading a bit in, and taking either the
 * child_0 or child_1 path.  Eventually, the tree winds down to a
 * leaf node, and the corresponding symbol is output.  If the symbol
 * is the END_OF_STREAM symbol, it doesn't get written out, and
 * instead the whole process terminates.
 */
void expand_data(BIT_FILE *input, FILE *output, NODE *nodes, int root_node)
{
    int node;

    for (;;)
    {
        node = root_node;
        do
        {
            if (InputBit(input))
                node = nodes[node].child_1;
            else
                node = nodes[node].child_0;
        } while (node > END_OF_STREAM);
        if (node == END_OF_STREAM)
            break;
        if ((putc(node, output)) != node)
            fatal_error("Error trying to write expanded byte to output");
    }
}
/*************************** End of HUFF.C **************************/