-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathARITH.C
575 lines (532 loc) · 17.3 KB
/
ARITH.C
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
/************************** Start of ARITH.C *************************
*
* This is the arithmetic coding module
*/
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include "errhand.h"
#include "bitio.h"
/*
* The SYMBOL structure is what is used to define a symbol in
* arithmetic coding terms. A symbol is defined as a range between
* 0 and 1. Since we are using integer math, instead of using 0 and 1
* as our end points, we have an integer scale. The low_count and
* high_count define where the symbol falls in the range.
*/
typedef struct {
unsigned short int low_count;
unsigned short int high_count;
unsigned short int scale;
} SYMBOL;
/*
* Internal function prototypes, with or without ANSI prototypes.
*/
static void build_model( FILE *input, FILE *output );
static void scale_counts( unsigned long counts[], unsigned char scaled_counts[] );
static void build_totals( unsigned char scaled_counts[] );
static void count_bytes( FILE *input, unsigned long counts[] );
static void output_counts( FILE *output, unsigned char scaled_counts[] );
static void input_counts( FILE *stream );
static void convert_int_to_symbol( int symbol, SYMBOL *s );
static void get_symbol_scale( SYMBOL *s );
static int convert_symbol_to_int( int count, SYMBOL *s );
static void initialize_arithmetic_encoder( void );
static void encode_symbol( BIT_FILE *stream, SYMBOL *s );
static void flush_arithmetic_encoder( BIT_FILE *stream );
static short int get_current_count( SYMBOL *s );
static void initialize_arithmetic_decoder( BIT_FILE *stream );
static void remove_symbol_from_stream( BIT_FILE *stream, SYMBOL *s );
#define END_OF_STREAM 256
static short int totals[ 258 ]; /* The cumulative totals */
//char *CompressionName = "Fixed order 0 model with arithmetic coding";
//char *Usage = "in-file out-file\n\n";
/*
* This compress file routine is a fairly orthodox compress routine.
* It first gathers statistics, and initializes the arithmetic
* encoder. It then encodes all the characters in the file, followed
* by the EOF character. The output stream is then flushed, and we exit.
* Note that an extra two bytes are output. When decoding an arithmetic
* stream, we have to read in extra bits. The decoding process takes
* place in the msb of the low and high range ints, so when we are
* decoding our last bit we will still have to have at least 15 junk
* bits loaded into the registers. The extra two bytes account for
* that.
*/
int CompressFile_fixed_order_0_arithmetic(FILE *input, BIT_FILE *output)
{
int c;
SYMBOL s;
DWORD milliseconds = GetTickCount();
build_model( input, output->file );
initialize_arithmetic_encoder();
while ( ( c = getc( input ) ) != EOF ) {
convert_int_to_symbol( c, &s );
encode_symbol( output, &s );
}
convert_int_to_symbol( END_OF_STREAM, &s );
encode_symbol( output, &s );
flush_arithmetic_encoder( output );
OutputBits( output, 0L, 16 );
return (GetTickCount() - milliseconds);
}
/*
* This expand routine is also very conventional. It reads in the
* model, initializes the decoder, then sits in a loop reading in
* characters. When we decode an END_OF_STREAM, it means we can close
* up the files and exit. Note decoding a single character is a three
* step process: first we determine what the scale is for the current
* symbol by looking at the difference between the high an low values.
* We then see where the current input values fall in that range.
* Finally, we look in our totals array to find out what symbol is
* a match. After that is done, we still have to remove that symbol
* from the decoder. Lots of work.
*/
int ExpandFile_fixed_order_0_arithmetic(BIT_FILE *input, FILE *output)
{
SYMBOL s;
int c;
int count;
DWORD milliseconds = GetTickCount();
input_counts( input->file );
initialize_arithmetic_decoder( input );
for ( ; ; ) {
get_symbol_scale( &s );
count = get_current_count( &s );
c = convert_symbol_to_int( count, &s );
if ( c == END_OF_STREAM )
break;
remove_symbol_from_stream( input, &s );
putc( (char) c, output );
}
return (GetTickCount() - milliseconds);
}
/*
* This is the routine that is called to scan the input file, scale
* the counts, build the totals array, the output the scaled counts
* to the output file.
*/
void build_model( input, output )
FILE *input;
FILE *output;
{
unsigned long counts[ 256 ];
unsigned char scaled_counts[ 256 ];
count_bytes( input, counts );
scale_counts( counts, scaled_counts );
output_counts( output, scaled_counts );
build_totals( scaled_counts );
}
/*
* This routine runs through the file and counts the appearances of each
* character.
*/
#ifndef SEEK_SET
#define SEEK_SET 0
#endif
void count_bytes( input, counts )
FILE *input;
unsigned long counts[];
{
long input_marker;
int i;
int c;
for ( i = 0 ; i < 256 ; i++ )
counts[ i ] = 0;
input_marker = ftell( input );
while ( ( c = getc( input )) != EOF )
counts[ c ]++;
fseek( input, input_marker, SEEK_SET );
}
/*
* This routine is called to scale the counts down. There are two types
* of scaling that must be done. First, the counts need to be scaled
* down so that the individual counts fit into a single unsigned char.
* Then, the counts need to be rescaled so that the total of all counts
* is less than 16384.
*/
void scale_counts( counts, scaled_counts )
unsigned long counts[];
unsigned char scaled_counts[];
{
int i;
unsigned long max_count;
unsigned int total;
unsigned long scale;
/*
* The first section of code makes sure each count fits into a single byte.
*/
max_count = 0;
for ( i = 0 ; i < 256 ; i++ )
if ( counts[ i ] > max_count )
max_count = counts[ i ];
scale = max_count / 256;
scale = scale + 1;
for ( i = 0 ; i < 256 ; i++ ) {
scaled_counts[ i ] = (unsigned char ) ( counts[ i ] / scale );
if ( scaled_counts[ i ] == 0 && counts[ i ] != 0 )
scaled_counts[ i ] = 1;
}
/*
* This next section makes sure the total is less than 16384. I initialize
* the total to 1 instead of 0 because there will be an additional 1 added
* in for the END_OF_STREAM symbol;
*/
total = 1;
for ( i = 0 ; i < 256 ; i++ )
total += scaled_counts[ i ];
if ( total > ( 32767 - 256 ) )
scale = 4;
else if ( total > 16383 )
scale = 2;
else
return;
for ( i = 0 ; i < 256 ; i++ )
scaled_counts[ i ] /= scale;
}
/*
* This routine is used by both the encoder and decoder to build the
* table of cumulative totals. The counts for the characters in the
* file are in the counts array, and we know that there will be a single
* instance of the EOF symbol.
*/
void build_totals( scaled_counts )
unsigned char scaled_counts[];
{
int i;
totals[ 0 ] = 0;
for ( i = 0 ; i < END_OF_STREAM ; i++ )
totals[ i + 1 ] = totals[ i ] + scaled_counts[ i ];
totals[ END_OF_STREAM + 1 ] = totals[ END_OF_STREAM ] + 1;
}
/*
* In order for the compressor to build the same model, I have to store
* the symbol counts in the compressed file so the expander can read
* them in. In order to save space, I don't save all 256 symbols
* unconditionally. The format used to store counts looks like this:
*
* start, stop, counts, start, stop, counts, ... 0
*
* This means that I store runs of counts, until all the non-zero
* counts have been stored. At this time the list is terminated by
* storing a start value of 0. Note that at least 1 run of counts has
* to be stored, so even if the first start value is 0, I read it in.
* It also means that even in an empty file that has no counts, I have
* to pass at least one count.
*
* In order to efficiently use this format, I have to identify runs of
* non-zero counts. Because of the format used, I don't want to stop a
* run because of just one or two zeros in the count stream. So I have
* to sit in a loop looking for strings of three or more zero values in
* a row.
*
* This is simple in concept, but it ends up being one of the most
* complicated routines in the whole program. A routine that just
* writes out 256 values without attempting to optimize would be much
* simpler, but would hurt compression quite a bit on small files.
*
*/
void output_counts( output, scaled_counts )
FILE *output;
unsigned char scaled_counts[];
{
int first;
int last;
int next;
int i;
first = 0;
while ( first < 255 && scaled_counts[ first ] == 0 )
first++;
/*
* Each time I hit the start of the loop, I assume that first is the
* number for a run of non-zero values. The rest of the loop is
* concerned with finding the value for last, which is the end of the
* run, and the value of next, which is the start of the next run.
* At the end of the loop, I assign next to first, so it starts in on
* the next run.
*/
for ( ; first < 256 ; first = next ) {
last = first + 1;
for ( ; ; ) {
for ( ; last < 256 ; last++ )
if ( scaled_counts[ last ] == 0 )
break;
last--;
for ( next = last + 1; next < 256 ; next++ )
if ( scaled_counts[ next ] != 0 )
break;
if ( next > 255 )
break;
if ( ( next - last ) > 3 )
break;
last = next;
};
/*
* Here is where I output first, last, and all the counts in between.
*/
if ( putc( first, output ) != first )
fatal_error( "Error writing byte counts\n" );
if ( putc( last, output ) != last )
fatal_error( "Error writing byte counts\n" );
for ( i = first ; i <= last ; i++ ) {
if ( putc( scaled_counts[ i ], output ) !=
(int) scaled_counts[ i ] )
fatal_error( "Error writing byte counts\n" );
}
}
if ( putc( 0, output ) != 0 )
fatal_error( "Error writing byte counts\n" );
}
/*
* When expanding, I have to read in the same set of counts. This is
* quite a bit easier that the process of writing them out, since no
* decision making needs to be done. All I do is read in first, check
* to see if I am all done, and if not, read in last and a string of
* counts.
*/
void input_counts( input )
FILE *input;
{
int first;
int last;
int i;
int c;
unsigned char scaled_counts[ 256 ];
for ( i = 0 ; i < 256 ; i++ )
scaled_counts[ i ] = 0;
if ( ( first = getc( input ) ) == EOF )
fatal_error( "Error reading byte counts\n" );
if ( ( last = getc( input ) ) == EOF )
fatal_error( "Error reading byte counts\n" );
for ( ; ; ) {
for ( i = first ; i <= last ; i++ )
if ( ( c = getc( input ) ) == EOF )
fatal_error( "Error reading byte counts\n" );
else
scaled_counts[ i ] = (unsigned char) c;
if ( ( first = getc( input ) ) == EOF )
fatal_error( "Error reading byte counts\n" );
if ( first == 0 )
break;
if ( ( last = getc( input ) ) == EOF )
fatal_error( "Error reading byte counts\n" );
}
build_totals( scaled_counts );
}
/*
* Everything from here down define the arithmetic coder section
* of the program.
*/
/*
* These four variables define the current state of the arithmetic
* coder/decoder. They are assumed to be 16 bits long. Note that
* by declaring them as short ints, they will actually be 16 bits
* on most 80X86 and 680X0 machines, as well as VAXen.
*/
static unsigned short int code; /* The present input code value */
static unsigned short int low; /* Start of the current code range */
static unsigned short int high; /* End of the current code range */
long underflow_bits; /* Number of underflow bits pending */
/*
* This routine must be called to initialize the encoding process.
* The high register is initialized to all 1s, and it is assumed that
* it has an infinite string of 1s to be shifted into the lower bit
* positions when needed.
*/
void initialize_arithmetic_encoder()
{
low = 0;
high = 0xffff;
underflow_bits = 0;
}
/*
* At the end of the encoding process, there are still significant
* bits left in the high and low registers. We output two bits,
* plus as many underflow bits as are necessary.
*/
void flush_arithmetic_encoder( stream )
BIT_FILE *stream;
{
OutputBit( stream, low & 0x4000 );
underflow_bits++;
while ( underflow_bits-- > 0 )
OutputBit( stream, ~low & 0x4000 );
}
/*
* Finding the low count, high count, and scale for a symbol
* is really easy, because of the way the totals are stored.
* This is the one redeeming feature of the data structure used
* in this implementation.
*/
void convert_int_to_symbol( c, s )
int c;
SYMBOL *s;
{
s->scale = totals[ END_OF_STREAM + 1 ];
s->low_count = totals[ c ];
s->high_count = totals[ c + 1 ];
}
/*
* Getting the scale for the current context is easy.
*/
void get_symbol_scale( s )
SYMBOL *s;
{
s->scale = totals[ END_OF_STREAM + 1 ];
}
/*
* During decompression, we have to search through the table until
* we find the symbol that straddles the "count" parameter. When
* it is found, it is returned. The reason for also setting the
* high count and low count is so that symbol can be properly removed
* from the arithmetic coded input.
*/
int convert_symbol_to_int( count, s)
int count;
SYMBOL *s;
{
int c;
for ( c = END_OF_STREAM ; count < totals[ c ] ; c-- )
;
s->high_count = totals[ c + 1 ];
s->low_count = totals[ c ];
return( c );
}
/*
* This routine is called to encode a symbol. The symbol is passed
* in the SYMBOL structure as a low count, a high count, and a range,
* instead of the more conventional probability ranges. The encoding
* process takes two steps. First, the values of high and low are
* updated to take into account the range restriction created by the
* new symbol. Then, as many bits as possible are shifted out to
* the output stream. Finally, high and low are stable again and
* the routine returns.
*/
void encode_symbol( stream, s )
BIT_FILE *stream;
SYMBOL *s;
{
long range;
/*
* These three lines rescale high and low for the new symbol.
*/
range = (long) ( high-low ) + 1;
high = low + (unsigned short int)
(( range * s->high_count ) / s->scale - 1 );
low = low + (unsigned short int)
(( range * s->low_count ) / s->scale );
/*
* This loop turns out new bits until high and low are far enough
* apart to have stabilized.
*/
for ( ; ; ) {
/*
* If this test passes, it means that the MSDigits match, and can
* be sent to the output stream.
*/
if ( ( high & 0x8000 ) == ( low & 0x8000 ) ) {
OutputBit( stream, high & 0x8000 );
while ( underflow_bits > 0 ) {
OutputBit( stream, ~high & 0x8000 );
underflow_bits--;
}
}
/*
* If this test passes, the numbers are in danger of underflow, because
* the MSDigits don't match, and the 2nd digits are just one apart.
*/
else if ( ( low & 0x4000 ) && !( high & 0x4000 )) {
underflow_bits += 1;
low &= 0x3fff;
high |= 0x4000;
} else
return ;
low <<= 1;
high <<= 1;
high |= 1;
}
}
/*
* When decoding, this routine is called to figure out which symbol
* is presently waiting to be decoded. This routine expects to get
* the current model scale in the s->scale parameter, and it returns
* a count that corresponds to the present floating point code:
*
* code = count / s->scale
*/
short int get_current_count( s )
SYMBOL *s;
{
long range;
short int count;
range = (long) ( high - low ) + 1;
count = (short int)
((((long) ( code - low ) + 1 ) * s->scale-1 ) / range );
return( count );
}
/*
* This routine is called to initialize the state of the arithmetic
* decoder. This involves initializing the high and low registers
* to their conventional starting values, plus reading the first
* 16 bits from the input stream into the code value.
*/
void initialize_arithmetic_decoder( stream )
BIT_FILE *stream;
{
int i;
code = 0;
for ( i = 0 ; i < 16 ; i++ ) {
code <<= 1;
code += InputBit( stream );
}
low = 0;
high = 0xffff;
}
/*
* Just figuring out what the present symbol is doesn't remove
* it from the input bit stream. After the character has been
* decoded, this routine has to be called to remove it from the
* input stream.
*/
void remove_symbol_from_stream( stream, s )
BIT_FILE *stream;
SYMBOL *s;
{
long range;
/*
* First, the range is expanded to account for the symbol removal.
*/
range = (long)( high - low ) + 1;
high = low + (unsigned short int)
(( range * s->high_count ) / s->scale - 1 );
low = low + (unsigned short int)
(( range * s->low_count ) / s->scale );
/*
* Next, any possible bits are shipped out.
*/
for ( ; ; ) {
/*
* If the MSDigits match, the bits will be shifted out.
*/
if ( ( high & 0x8000 ) == ( low & 0x8000 ) ) {
}
/*
* Else, if underflow is threatening, shift out the 2nd MSDigit.
*/
else if ((low & 0x4000) == 0x4000 && (high & 0x4000) == 0 ) {
code ^= 0x4000;
low &= 0x3fff;
high |= 0x4000;
} else
/*
* Otherwise, nothing can be shifted out, so I return.
*/
return;
low <<= 1;
high <<= 1;
high |= 1;
code <<= 1;
code += InputBit( stream );
}
}
/************************** End of ARITH.C ****************************/