-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHUFF.C
450 lines (421 loc) · 15.2 KB
/
HUFF.C
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
/************************** Start of HUFF.C *************************
*
* This is the Huffman coding module
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <windows.h>
#include "bitio.h"
#include "errhand.h"
/*
* The NODE structure is a node in the Huffman decoding tree. It has a
* count, which is its weight in the tree, and the node numbers of its
* two children. The saved_count member of the structure is only
* there for debugging purposes, and can be safely taken out at any
* time. It just holds the intial count for each of the symbols, since
* the count member is continually being modified as the tree grows.
*/
typedef struct tree_node
{
unsigned int count;
unsigned int saved_count;
int child_0;
int child_1;
} NODE;
/*
* A Huffman tree is set up for decoding, not encoding. When encoding,
* I first walk through the tree and build up a table of codes for
* each symbol. The codes are stored in this CODE structure.
*/
typedef struct code
{
unsigned int code;
int code_bits;
} CODE;
/*
* The special EOS symbol is 256, the first available symbol after all
* of the possible bytes. When decoding, reading this symbols
* indicates that all of the data has been read in.
*/
#define END_OF_STREAM 256
/*
* Local function prototypes, defined with or without ANSI prototypes.
*/
void count_bytes(FILE *input, unsigned long *long_counts);
void scale_counts(unsigned long *long_counts, NODE *nodes);
int build_tree(NODE *nodes);
void convert_tree_to_code(NODE *nodes,
CODE *codes,
unsigned int code_so_far,
int bits,
int node);
void output_counts(BIT_FILE *output, NODE *nodes);
void input_counts(BIT_FILE *input, NODE *nodes);
void compress_data(FILE *input, BIT_FILE *output, CODE *codes);
void expand_data(BIT_FILE *input, FILE *output, NODE *nodes,
int root_node);
/*
* These two strings are used by MAIN-C.C and MAIN-E.C to print
* messages of importance to the user of the program.
*/
// char *CompressionName = "static order 0 model with Huffman coding";
// char *Usage = "infile outfile\n\n";
/*
* CompressFile is the compression routine called by MAIN-C.C. It
* looks for a single additional argument to be passed to it from
* the command line: "-d". If a "-d" is present, it means the
* user wants to see the model data dumped out for debugging
* purposes.
*
* This routine works in a fairly straightforward manner. First,
* it has to allocate storage for three different arrays of data.
* Next, it counts all the bytes in the input file. The counts
* are all stored in long int, so the next step is scale them down
* to single byte counts in the NODE array. After the counts are
* scaled, the Huffman decoding tree is built on top of the NODE
* array. Another routine walks through the tree to build a table
* of codes, one per symbol. Finally, when the codes are all ready,
* compressing the file is a simple matter. After the file is
* compressed, the storage is freed up, and the routine returns.
*
*/
int CompressFile_huffman(FILE *input, BIT_FILE *output)
{
//分配空间
unsigned long *counts;
NODE *nodes;
CODE *codes;
int root_node;
DWORD milliseconds = GetTickCount(); //返回毫秒数
/* calloc 在内存的动态存储区中分配n个长度为size的连续空间
* 返回一个指向分配起始地址的指针
*/
counts = (unsigned long *)calloc(256, sizeof(unsigned long));
if (counts == NULL)
fatal_error("Error allocating counts array\n");
if ((nodes = (NODE *)calloc(514, sizeof(NODE))) == NULL)
fatal_error("Error allocating nodes array\n");
if ((codes = (CODE *)calloc(257, sizeof(CODE))) == NULL)
fatal_error("Error allocating codes array\n");
count_bytes(input, counts); //计算字节数
scale_counts(counts, nodes); //等比例缩小计数 并匹配权重
output_counts(output, nodes); //压缩文件中的符号计数
root_node = build_tree(nodes); //Huffman Tree
convert_tree_to_code(nodes, codes, 0, 0, root_node); //得到编码
compress_data(input, output, codes); //compress
//释放内存空间
free((char *)counts);
free((char *)nodes);
free((char *)codes);
return (GetTickCount() - milliseconds);
}
/*
* ExpandFile is the routine called by MAIN-E.C to expand a file that
* has been compressed with order 0 Huffman coding. This routine has
* a simpler job than that of the Compression routine. All it has to
* do is read in the counts that have been stored in the compressed
* file, then build the Huffman tree. The data can then be expanded
* by reading in a bit at a time from the compressed file. Finally,
* the node array is freed and the routine returns.
*
*/
/* ? ? ? what is order 0 Huffman coding ? ? ? */
int ExpandFile_huffman(BIT_FILE *input, FILE *output)
{
NODE *nodes;
int root_node;
DWORD milliseconds = GetTickCount();
if ((nodes = (NODE *)calloc(514, sizeof(NODE))) == NULL)
fatal_error("Error allocating nodes array\n");
input_counts(input, nodes);
root_node = build_tree(nodes);
expand_data(input, output, nodes, root_node);
free((char *)nodes);
return (GetTickCount() - milliseconds);
}
/*
* In order for the compressor to build the same model, I have to store
* the symbol counts in the compressed file so the expander can read
* them in. In order to save space, I don't save all 256 symbols
* unconditionally. The format used to store counts looks like this:
*
* start, stop, counts, start, stop, counts, ... 0
*
* This means that I store runs of counts, until all the non-zero
* counts have been stored. At this time the list is terminated by
* storing a start value of 0. Note that at least 1 run of counts has
* to be stored, so even if the first start value is 0, I read it in.
* It also means that even in an empty file that has no counts, I have
* to pass at least one count.
*
* In order to efficiently use this format, I have to identify runs of
* non-zero counts. Because of the format used, I don't want to stop a
* run because of just one or two zeros in the count stream. So I have
* to sit in a loop looking for strings of three or more zero values in
* a row.
*
* This is simple in concept, but it ends up being one of the most
* complicated routines in the whole program. A routine that just
* writes out 256 values without attempting to optimize would be much
* simpler, but would hurt compression quite a bit on small files.
*
*/
void output_counts(BIT_FILE *output, NODE *nodes)
{
int first;
int last;
int next;
int i;
first = 0;
while (first < 255 && nodes[first].count == 0)
first++;
/*
* Each time I hit the start of the loop, I assume that first is the
* number for a run of non-zero values. The rest of the loop is
* concerned with finding the value for last, which is the end of the
* run, and the value of next, which is the start of the next run.
* At the end of the loop, I assign next to first, so it starts in on
* the next run.
*/
for (; first < 256; first = next)
{
last = first + 1;
for (;;)
{
for (; last < 256; last++)
if (nodes[last].count == 0)
break;
last--;
for (next = last + 1; next < 256; next++)
if (nodes[next].count != 0)
break;
if (next > 255)
break;
if ((next - last) > 3)
break;
last = next;
};
/*
* Here is where I output first, last, and all the counts in between.
*/
if (putc(first, output->file) != first)
fatal_error("Error writing byte counts\n");
if (putc(last, output->file) != last)
fatal_error("Error writing byte counts\n");
for (i = first; i <= last; i++)
{
if (putc(nodes[i].count, output->file) !=
(int)nodes[i].count)
fatal_error("Error writing byte counts\n");
}
}
if (putc(0, output->file) != 0)
fatal_error("Error writing byte counts\n");
}
/*
* When expanding, I have to read in the same set of counts. This is
* quite a bit easier that the process of writing them out, since no
* decision making needs to be done. All I do is read in first, check
* to see if I am all done, and if not, read in last and a string of
* counts.
*/
void input_counts(BIT_FILE *input, NODE *nodes)
{
int first;
int last;
int i;
int c;
for (i = 0; i < 256; i++)
nodes[i].count = 0;
if ((first = getc(input->file)) == EOF)
fatal_error("Error reading byte counts\n");
if ((last = getc(input->file)) == EOF)
fatal_error("Error reading byte counts\n");
for (;;)
{
for (i = first; i <= last; i++)
if ((c = getc(input->file)) == EOF)
fatal_error("Error reading byte counts\n");
else
nodes[i].count = (unsigned int)c;
if ((first = getc(input->file)) == EOF)
fatal_error("Error reading byte counts\n");
if (first == 0)
break;
if ((last = getc(input->file)) == EOF)
fatal_error("Error reading byte counts\n");
}
nodes[END_OF_STREAM].count = 1;
}
/*
* This routine counts the frequency of occurence of every byte in
* the input file. It marks the place in the input stream where it
* started, counts up all the bytes, then returns to the place where
* it started. In most C implementations, the length of a file
* cannot exceed an unsigned long, so this routine should always
* work.
*/
#ifndef SEEK_SET
#define SEEK_SET 0
#endif
void count_bytes(FILE *input, unsigned long *counts)
{
long input_marker;
int c;
//ftell()返回指定流的文件指示器指针的当前值
input_marker = ftell(input);
while ((c = getc(input)) != EOF)
counts[c]++;
//fseek()设置流的文件位置为给定的偏移
//input_marker--相对于SEEK_SET(文件的开头)的偏移量
fseek(input, input_marker, SEEK_SET);
}
/*
* In order to limit the size of my Huffman codes to 16 bits, I scale
* my counts down so they fit in an unsigned char, and then store them
* all as initial weights in my NODE array. The only thing to be
* careful of is to make sure that a node with a non-zero count doesn't
* get scaled down to 0. Nodes with values of 0 don't get codes.
*/
void scale_counts(unsigned long *counts, NODE *nodes)
{
unsigned long max_count;
int i;
max_count = 0;
for (i = 0; i < 256; i++)
if (counts[i] > max_count)
max_count = counts[i];
if (max_count == 0)
{
counts[0] = 1;
max_count = 1;
}
max_count = max_count / 255;
max_count = max_count + 1;
for (i = 0; i < 256; i++)
{
nodes[i].count = (unsigned int)(counts[i] / max_count);
if (nodes[i].count == 0 && counts[i] != 0)
nodes[i].count = 1;
}
nodes[END_OF_STREAM].count = 1;
}
/*
* Building the Huffman tree is fairly simple. All of the active nodes
* are scanned in order to locate the two nodes with the minimum
* weights. These two weights are added together and assigned to a new
* node. The new node makes the two minimum nodes into its 0 child
* and 1 child. The two minimum nodes are then marked as inactive.
* This process repeats until their is only one node left, which is the
* root node. The tree is done, and the root node is passed back
* to the calling routine.
*
* Node 513 is used here to arbitratily provide a node with a guaranteed
* maximum value. It starts off being min_1 and min_2. After all active
* nodes have been scanned, I can tell if there is only one active node
* left by checking to see if min_1 is still 513.
*/
int build_tree(NODE *nodes)
{
int next_free;
int i;
int min_1;
int min_2;
nodes[513].count = 0xffff;
for (next_free = END_OF_STREAM + 1;; next_free++)
{
min_1 = 513;
min_2 = 513;
for (i = 0; i < next_free; i++)
if (nodes[i].count != 0)
{
if (nodes[i].count < nodes[min_1].count)
{
min_2 = min_1;
min_1 = i;
}
else if (nodes[i].count < nodes[min_2].count)
min_2 = i;
}
if (min_2 == 513)
break;
nodes[next_free].count = nodes[min_1].count + nodes[min_2].count;
nodes[min_1].saved_count = nodes[min_1].count;
nodes[min_1].count = 0;
nodes[min_2].saved_count = nodes[min_2].count;
nodes[min_2].count = 0;
nodes[next_free].child_0 = min_1;
nodes[next_free].child_1 = min_2;
}
next_free--;
nodes[next_free].saved_count = nodes[next_free].count;
return (next_free);
}
/*
* Since the Huffman tree is built as a decoding tree, there is
* no simple way to get the encoding values for each symbol out of
* it. This routine recursively walks through the tree, adding the
* child bits to each code until it gets to a leaf. When it gets
* to a leaf, it stores the code value in the CODE element, and
* returns.
*/
void convert_tree_to_code(NODE *nodes, CODE *codes, unsigned int code_so_far, int bits, int node)
{
if (node <= END_OF_STREAM)
{
codes[node].code = code_so_far;
codes[node].code_bits = bits;
return;
}
code_so_far <<= 1;
bits++;
convert_tree_to_code(nodes, codes, code_so_far, bits,
nodes[node].child_0);
convert_tree_to_code(nodes, codes, code_so_far | 1,
bits, nodes[node].child_1);
}
/*
* Once the tree gets built, and the CODE table is built, compressing
* the data is a breeze. Each byte is read in, and its corresponding
* Huffman code is sent out.
*/
void compress_data(FILE *input, BIT_FILE *output, CODE *codes)
{
int c;
while ((c = getc(input)) != EOF)
OutputBits(output, (unsigned long)codes[c].code,
codes[c].code_bits);
OutputBits(output, (unsigned long)codes[END_OF_STREAM].code,
codes[END_OF_STREAM].code_bits);
}
/*
* Expanding compressed data is a little harder than the compression
* phase. As each new symbol is decoded, the tree is traversed,
* starting at the root node, reading a bit in, and taking either the
* child_0 or child_1 path. Eventually, the tree winds down to a
* leaf node, and the corresponding symbol is output. If the symbol
* is the END_OF_STREAM symbol, it doesn't get written out, and
* instead the whole process terminates.
*/
void expand_data(BIT_FILE *input, FILE *output, NODE *nodes, int root_node)
{
int node;
for (;;)
{
node = root_node;
do
{
if (InputBit(input))
node = nodes[node].child_1;
else
node = nodes[node].child_0;
} while (node > END_OF_STREAM);
if (node == END_OF_STREAM)
break;
if ((putc(node, output)) != node)
fatal_error("Error trying to write expanded byte to output");
}
}
/*************************** End of HUFF.C **************************/