1 | /******************************************************************** |
2 | * * |
3 | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | * * |
8 | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
10 | * * |
11 | ******************************************************************** |
12 | |
13 | function: |
14 | last mod: $Id$ |
15 | |
16 | ********************************************************************/ |
17 | |
18 | #include <stdlib.h> |
19 | #include <string.h> |
20 | #include <ogg/ogg.h> |
21 | #include "huffdec.h" |
22 | #include "decint.h" |
23 | |
24 | |
25 | |
26 | /*Instead of storing every branching in the tree, subtrees can be collapsed |
27 | into one node, with a table of size 1<<nbits pointing directly to its |
28 | descedents nbits levels down. |
29 | This allows more than one bit to be read at a time, and avoids following all |
30 | the intermediate branches with next to no increased code complexity once |
31 | the collapsed tree has been built. |
32 | We do _not_ require that a subtree be complete to be collapsed, but instead |
33 | store duplicate pointers in the table, and record the actual depth of the |
34 | node below its parent. |
35 | This tells us the number of bits to advance the stream after reaching it. |
36 | |
37 | This turns out to be equivalent to the method described in \cite{Hash95}, |
38 | without the requirement that codewords be sorted by length. |
39 | If the codewords were sorted by length (so-called ``canonical-codes''), they |
40 | could be decoded much faster via either Lindell and Moffat's approach or |
41 | Hashemian's Condensed Huffman Code approach, the latter of which has an |
42 | extremely small memory footprint. |
43 | We can't use Choueka et al.'s finite state machine approach, which is |
44 | extremely fast, because we can't allow multiple symbols to be output at a |
45 | time; the codebook can and does change between symbols. |
46 | It also has very large memory requirements, which impairs cache coherency. |
47 | |
48 | We store the tree packed in an array of 16-bit integers (words). |
49 | Each node consists of a single word, followed consecutively by two or more |
50 | indices of its children. |
51 | Let n be the value of this first word. |
52 | This is the number of bits that need to be read to traverse the node, and |
53 | must be positive. |
54 | 1<<n entries follow in the array, each an index to a child node. |
55 | If the child is positive, then it is the index of another internal node in |
56 | the table. |
57 | If the child is negative or zero, then it is a leaf node. |
58 | These are stored directly in the child pointer to save space, since they only |
59 | require a single word. |
60 | If a leaf node would have been encountered before reading n bits, then it is |
61 | duplicated the necessary number of times in this table. |
62 | Leaf nodes pack both a token value and their actual depth in the tree. |
63 | The token in the leaf node is (-leaf&255). |
64 | The number of bits that need to be consumed to reach the leaf, starting from |
65 | the current node, is (-leaf>>8). |
66 | |
67 | @ARTICLE{Hash95, |
68 | author="Reza Hashemian", |
69 | title="Memory Efficient and High-Speed Search {Huffman} Coding", |
70 | journal="{IEEE} Transactions on Communications", |
71 | volume=43, |
72 | number=10, |
73 | pages="2576--2581", |
74 | month=Oct, |
75 | year=1995 |
76 | }*/ |
77 | |
78 | |
79 | |
80 | /*The map from external spec-defined tokens to internal tokens. |
81 | This is constructed so that any extra bits read with the original token value |
82 | can be masked off the least significant bits of its internal token index. |
83 | In addition, all of the tokens which require additional extra bits are placed |
84 | at the start of the list, and grouped by type. |
85 | OC_DCT_REPEAT_RUN3_TOKEN is placed first, as it is an extra-special case, so |
86 | giving it index 0 may simplify comparisons on some architectures. |
87 | These requirements require some substantial reordering.*/ |
88 | static const unsigned char OC_DCT_TOKEN_MAP[TH_NDCT_TOKENS]={ |
89 | /*OC_DCT_EOB1_TOKEN (0 extra bits)*/ |
90 | 15, |
91 | /*OC_DCT_EOB2_TOKEN (0 extra bits)*/ |
92 | 16, |
93 | /*OC_DCT_EOB3_TOKEN (0 extra bits)*/ |
94 | 17, |
95 | /*OC_DCT_REPEAT_RUN0_TOKEN (2 extra bits)*/ |
96 | 88, |
97 | /*OC_DCT_REPEAT_RUN1_TOKEN (3 extra bits)*/ |
98 | 80, |
99 | /*OC_DCT_REPEAT_RUN2_TOKEN (4 extra bits)*/ |
100 | 1, |
101 | /*OC_DCT_REPEAT_RUN3_TOKEN (12 extra bits)*/ |
102 | 0, |
103 | /*OC_DCT_SHORT_ZRL_TOKEN (3 extra bits)*/ |
104 | 48, |
105 | /*OC_DCT_ZRL_TOKEN (6 extra bits)*/ |
106 | 14, |
107 | /*OC_ONE_TOKEN (0 extra bits)*/ |
108 | 56, |
109 | /*OC_MINUS_ONE_TOKEN (0 extra bits)*/ |
110 | 57, |
111 | /*OC_TWO_TOKEN (0 extra bits)*/ |
112 | 58, |
113 | /*OC_MINUS_TWO_TOKEN (0 extra bits)*/ |
114 | 59, |
115 | /*OC_DCT_VAL_CAT2 (1 extra bit)*/ |
116 | 60, |
117 | 62, |
118 | 64, |
119 | 66, |
120 | /*OC_DCT_VAL_CAT3 (2 extra bits)*/ |
121 | 68, |
122 | /*OC_DCT_VAL_CAT4 (3 extra bits)*/ |
123 | 72, |
124 | /*OC_DCT_VAL_CAT5 (4 extra bits)*/ |
125 | 2, |
126 | /*OC_DCT_VAL_CAT6 (5 extra bits)*/ |
127 | 4, |
128 | /*OC_DCT_VAL_CAT7 (6 extra bits)*/ |
129 | 6, |
130 | /*OC_DCT_VAL_CAT8 (10 extra bits)*/ |
131 | 8, |
132 | /*OC_DCT_RUN_CAT1A (1 extra bit)*/ |
133 | 18, |
134 | 20, |
135 | 22, |
136 | 24, |
137 | 26, |
138 | /*OC_DCT_RUN_CAT1B (3 extra bits)*/ |
139 | 32, |
140 | /*OC_DCT_RUN_CAT1C (4 extra bits)*/ |
141 | 12, |
142 | /*OC_DCT_RUN_CAT2A (2 extra bits)*/ |
143 | 28, |
144 | /*OC_DCT_RUN_CAT2B (3 extra bits)*/ |
145 | 40 |
146 | }; |
147 | |
148 | /*The log base 2 of number of internal tokens associated with each of the spec |
149 | tokens (i.e., how many of the extra bits are folded into the token value). |
150 | Increasing the maximum value beyond 3 will enlarge the amount of stack |
151 | required for tree construction.*/ |
152 | static const unsigned char OC_DCT_TOKEN_MAP_LOG_NENTRIES[TH_NDCT_TOKENS]={ |
153 | 0,0,0,2,3,0,0,3,0,0,0,0,0,1,1,1,1,2,3,1,1,1,2,1,1,1,1,1,3,1,2,3 |
154 | }; |
155 | |
156 | |
157 | /*The size a lookup table is allowed to grow to relative to the number of |
158 | unique nodes it contains. |
159 | E.g., if OC_HUFF_SLUSH is 4, then at most 75% of the space in the tree is |
160 | wasted (1/4 of the space must be used). |
161 | Larger numbers can decode tokens with fewer read operations, while smaller |
162 | numbers may save more space. |
163 | With a sample file: |
164 | 32233473 read calls are required when no tree collapsing is done (100.0%). |
165 | 19269269 read calls are required when OC_HUFF_SLUSH is 1 (59.8%). |
166 | 11144969 read calls are required when OC_HUFF_SLUSH is 2 (34.6%). |
167 | 10538563 read calls are required when OC_HUFF_SLUSH is 4 (32.7%). |
168 | 10192578 read calls are required when OC_HUFF_SLUSH is 8 (31.6%). |
169 | Since a value of 2 gets us the vast majority of the speed-up with only a |
170 | small amount of wasted memory, this is what we use. |
171 | This value must be less than 128, or you could create a tree with more than |
172 | 32767 entries, which would overflow the 16-bit words used to index it.*/ |
173 | #define OC_HUFF_SLUSH (2) |
174 | /*The root of the tree is on the fast path, and a larger value here is more |
175 | beneficial than elsewhere in the tree. |
176 | 7 appears to give the best performance, trading off between increased use of |
177 | the single-read fast path and cache footprint for the tables, though |
178 | obviously this will depend on your cache size. |
179 | Using 7 here, the VP3 tables are about twice as large compared to using 2.*/ |
180 | #define OC_ROOT_HUFF_SLUSH (7) |
181 | |
182 | |
183 | |
184 | /*Unpacks a Huffman codebook. |
185 | _opb: The buffer to unpack from. |
186 | _tokens: Stores a list of internal tokens, in the order they were found in |
187 | the codebook, and the lengths of their corresponding codewords. |
188 | This is enough to completely define the codebook, while minimizing |
189 | stack usage and avoiding temporary allocations (for platforms |
190 | where free() is a no-op). |
191 | Return: The number of internal tokens in the codebook, or a negative value |
192 | on error.*/ |
193 | int oc_huff_tree_unpack(oc_pack_buf *_opb,unsigned char _tokens[256][2]){ |
194 | ogg_uint32_t code; |
195 | int len; |
196 | int ntokens; |
197 | int nleaves; |
198 | code=0; |
199 | len=ntokens=nleaves=0; |
200 | for(;;){ |
201 | long bits; |
202 | bits=oc_pack_read1(_opb); |
203 | /*Only process nodes so long as there's more bits in the buffer.*/ |
204 | if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER; |
205 | /*Read an internal node:*/ |
206 | if(!bits){ |
207 | len++; |
208 | /*Don't allow codewords longer than 32 bits.*/ |
209 | if(len>32)return TH_EBADHEADER; |
210 | } |
211 | /*Read a leaf node:*/ |
212 | else{ |
213 | ogg_uint32_t code_bit; |
214 | int neb; |
215 | int nentries; |
216 | int token; |
217 | /*Don't allow more than 32 spec-tokens per codebook.*/ |
218 | if(++nleaves>32)return TH_EBADHEADER; |
219 | bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS); |
220 | neb=OC_DCT_TOKEN_MAP_LOG_NENTRIES[bits]; |
221 | token=OC_DCT_TOKEN_MAP[bits]; |
222 | nentries=1<<neb; |
223 | while(nentries-->0){ |
224 | _tokens[ntokens][0]=(unsigned char)token++; |
225 | _tokens[ntokens][1]=(unsigned char)(len+neb); |
226 | ntokens++; |
227 | } |
228 | code_bit=0x80000000U>>len-1; |
229 | while(len>0&&(code&code_bit)){ |
230 | code^=code_bit; |
231 | code_bit<<=1; |
232 | len--; |
233 | } |
234 | if(len<=0)break; |
235 | code|=code_bit; |
236 | } |
237 | } |
238 | return ntokens; |
239 | } |
240 | |
241 | /*Count how many tokens would be required to fill a subtree at depth _depth. |
242 | _tokens: A list of internal tokens, in the order they are found in the |
243 | codebook, and the lengths of their corresponding codewords. |
244 | _depth: The depth of the desired node in the corresponding tree structure. |
245 | Return: The number of tokens that belong to that subtree.*/ |
246 | static int oc_huff_subtree_tokens(unsigned char _tokens[][2],int _depth){ |
247 | ogg_uint32_t code; |
248 | int ti; |
249 | code=0; |
250 | ti=0; |
251 | do{ |
252 | if(_tokens[ti][1]-_depth<32)code+=0x80000000U>>_tokens[ti++][1]-_depth; |
253 | else{ |
254 | /*Because of the expanded internal tokens, we can have codewords as long |
255 | as 35 bits. |
256 | A single recursion here is enough to advance past them.*/ |
257 | code++; |
258 | ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+31); |
259 | } |
260 | } |
261 | while(code<0x80000000U); |
262 | return ti; |
263 | } |
264 | |
265 | /*Compute the number of bits to use for a collapsed tree node at the given |
266 | depth. |
267 | _tokens: A list of internal tokens, in the order they are found in the |
268 | codebook, and the lengths of their corresponding codewords. |
269 | _ntokens: The number of tokens corresponding to this tree node. |
270 | _depth: The depth of this tree node. |
271 | Return: The number of bits to use for a collapsed tree node rooted here. |
272 | This is always at least one, even if this was a leaf node.*/ |
273 | static int oc_huff_tree_collapse_depth(unsigned char _tokens[][2], |
274 | int _ntokens,int _depth){ |
275 | int got_leaves; |
276 | int loccupancy; |
277 | int occupancy; |
278 | int slush; |
279 | int nbits; |
280 | int best_nbits; |
281 | slush=_depth>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH; |
282 | /*It's legal to have a tree with just a single node, which requires no bits |
283 | to decode and always returns the same token. |
284 | However, no encoder actually does this (yet). |
285 | To avoid a special case in oc_huff_token_decode(), we force the number of |
286 | lookahead bits to be at least one. |
287 | This will produce a tree that looks ahead one bit and then advances the |
288 | stream zero bits.*/ |
289 | nbits=1; |
290 | occupancy=2; |
291 | got_leaves=1; |
292 | do{ |
293 | int ti; |
294 | if(got_leaves)best_nbits=nbits; |
295 | nbits++; |
296 | got_leaves=0; |
297 | loccupancy=occupancy; |
298 | for(occupancy=ti=0;ti<_ntokens;occupancy++){ |
299 | if(_tokens[ti][1]<_depth+nbits)ti++; |
300 | else if(_tokens[ti][1]==_depth+nbits){ |
301 | got_leaves=1; |
302 | ti++; |
303 | } |
304 | else ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+nbits); |
305 | } |
306 | } |
307 | while(occupancy>loccupancy&&occupancy*slush>=1<<nbits); |
308 | return best_nbits; |
309 | } |
310 | |
311 | /*Determines the size in words of a Huffman tree node that represents a |
312 | subtree of depth _nbits. |
313 | _nbits: The depth of the subtree. |
314 | This must be greater than zero. |
315 | Return: The number of words required to store the node.*/ |
316 | static size_t oc_huff_node_size(int _nbits){ |
317 | return 1+(1<<_nbits); |
318 | } |
319 | |
320 | /*Produces a collapsed-tree representation of the given token list. |
321 | _tree: The storage for the collapsed Huffman tree. |
322 | This may be NULL to compute the required storage size instead of |
323 | constructing the tree. |
324 | _tokens: A list of internal tokens, in the order they are found in the |
325 | codebook, and the lengths of their corresponding codewords. |
326 | _ntokens: The number of tokens corresponding to this tree node. |
327 | Return: The number of words required to store the tree.*/ |
328 | static size_t oc_huff_tree_collapse(ogg_int16_t *_tree, |
329 | unsigned char _tokens[][2],int _ntokens){ |
330 | ogg_int16_t node[34]; |
331 | unsigned char depth[34]; |
332 | unsigned char last[34]; |
333 | size_t ntree; |
334 | int ti; |
335 | int l; |
336 | depth[0]=0; |
337 | last[0]=(unsigned char)(_ntokens-1); |
338 | ntree=0; |
339 | ti=0; |
340 | l=0; |
341 | do{ |
342 | int nbits; |
343 | nbits=oc_huff_tree_collapse_depth(_tokens+ti,last[l]+1-ti,depth[l]); |
344 | node[l]=(ogg_int16_t)ntree; |
345 | ntree+=oc_huff_node_size(nbits); |
346 | if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)nbits; |
347 | do{ |
348 | while(ti<=last[l]&&_tokens[ti][1]<=depth[l]+nbits){ |
349 | if(_tree!=NULL){ |
350 | ogg_int16_t leaf; |
351 | int nentries; |
352 | nentries=1<<depth[l]+nbits-_tokens[ti][1]; |
353 | leaf=(ogg_int16_t)-(_tokens[ti][1]-depth[l]<<8|_tokens[ti][0]); |
354 | while(nentries-->0)_tree[node[l]++]=leaf; |
355 | } |
356 | ti++; |
357 | } |
358 | if(ti<=last[l]){ |
359 | /*We need to recurse*/ |
360 | depth[l+1]=(unsigned char)(depth[l]+nbits); |
361 | if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)ntree; |
362 | l++; |
363 | last[l]= |
364 | (unsigned char)(ti+oc_huff_subtree_tokens(_tokens+ti,depth[l])-1); |
365 | break; |
366 | } |
367 | /*Pop back up a level of recursion.*/ |
368 | else if(l-->0)nbits=depth[l+1]-depth[l]; |
369 | } |
370 | while(l>=0); |
371 | } |
372 | while(l>=0); |
373 | return ntree; |
374 | } |
375 | |
376 | /*Unpacks a set of Huffman trees, and reduces them to a collapsed |
377 | representation. |
378 | _opb: The buffer to unpack the trees from. |
379 | _nodes: The table to fill with the Huffman trees. |
380 | Return: 0 on success, or a negative value on error. |
381 | The caller is responsible for cleaning up any partially initialized |
382 | _nodes on failure.*/ |
383 | int oc_huff_trees_unpack(oc_pack_buf *_opb, |
384 | ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){ |
385 | int i; |
386 | for(i=0;i<TH_NHUFFMAN_TABLES;i++){ |
387 | unsigned char tokens[256][2]; |
388 | int ntokens; |
389 | ogg_int16_t *tree; |
390 | size_t size; |
391 | /*Unpack the full tree into a temporary buffer.*/ |
392 | ntokens=oc_huff_tree_unpack(_opb,tokens); |
393 | if(ntokens<0)return ntokens; |
394 | /*Figure out how big the collapsed tree will be and allocate space for it.*/ |
395 | size=oc_huff_tree_collapse(NULL,tokens,ntokens); |
396 | /*This should never happen; if it does it means you set OC_HUFF_SLUSH or |
397 | OC_ROOT_HUFF_SLUSH too large.*/ |
398 | if(size>32767)return TH_EIMPL; |
399 | tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree)); |
400 | if(tree==NULL)return TH_EFAULT; |
401 | /*Construct the collapsed the tree.*/ |
402 | oc_huff_tree_collapse(tree,tokens,ntokens); |
403 | _nodes[i]=tree; |
404 | } |
405 | return 0; |
406 | } |
407 | |
408 | /*Determines the size in words of a Huffman subtree. |
409 | _tree: The complete Huffman tree. |
410 | _node: The index of the root of the desired subtree. |
411 | Return: The number of words required to store the tree.*/ |
412 | static size_t oc_huff_tree_size(const ogg_int16_t *_tree,int _node){ |
413 | size_t size; |
414 | int nchildren; |
415 | int n; |
416 | int i; |
417 | n=_tree[_node]; |
418 | size=oc_huff_node_size(n); |
419 | nchildren=1<<n; |
420 | i=0; |
421 | do{ |
422 | int child; |
423 | child=_tree[_node+i+1]; |
424 | if(child<=0)i+=1<<n-(-child>>8); |
425 | else{ |
426 | size+=oc_huff_tree_size(_tree,child); |
427 | i++; |
428 | } |
429 | } |
430 | while(i<nchildren); |
431 | return size; |
432 | } |
433 | |
434 | /*Makes a copy of the given set of Huffman trees. |
435 | _dst: The array to store the copy in. |
436 | _src: The array of trees to copy.*/ |
437 | int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES], |
438 | const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){ |
439 | int total; |
440 | int i; |
441 | total=0; |
442 | for(i=0;i<TH_NHUFFMAN_TABLES;i++){ |
443 | size_t size; |
444 | size=oc_huff_tree_size(_src[i],0); |
445 | total+=size; |
446 | _dst[i]=(ogg_int16_t *)_ogg_malloc(size*sizeof(*_dst[i])); |
447 | if(_dst[i]==NULL){ |
448 | while(i-->0)_ogg_free(_dst[i]); |
449 | return TH_EFAULT; |
450 | } |
451 | memcpy(_dst[i],_src[i],size*sizeof(*_dst[i])); |
452 | } |
453 | return 0; |
454 | } |
455 | |
456 | /*Frees the memory used by a set of Huffman trees. |
457 | _nodes: The array of trees to free.*/ |
458 | void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){ |
459 | int i; |
460 | for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]); |
461 | } |
462 | |
463 | |
464 | /*Unpacks a single token using the given Huffman tree. |
465 | _opb: The buffer to unpack the token from. |
466 | _node: The tree to unpack the token with. |
467 | Return: The token value.*/ |
468 | int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){ |
469 | const unsigned char *ptr; |
470 | const unsigned char *stop; |
471 | oc_pb_window window; |
472 | int available; |
473 | long bits; |
474 | int node; |
475 | int n; |
476 | ptr=_opb->ptr; |
477 | window=_opb->window; |
478 | stop=_opb->stop; |
479 | available=_opb->bits; |
480 | node=0; |
481 | for(;;){ |
482 | n=_tree[node]; |
483 | if(n>available){ |
484 | unsigned shift; |
485 | shift=OC_PB_WINDOW_SIZE-available; |
486 | do{ |
487 | /*We don't bother setting eof because we won't check for it after we've |
488 | started decoding DCT tokens.*/ |
489 | if(ptr>=stop){ |
490 | shift=(unsigned)-OC_LOTS_OF_BITS; |
491 | break; |
492 | } |
493 | shift-=8; |
494 | window|=(oc_pb_window)*ptr++<<shift; |
495 | } |
496 | while(shift>=8); |
497 | /*Note: We never request more than 24 bits, so there's no need to fill in |
498 | the last partial byte here.*/ |
499 | available=OC_PB_WINDOW_SIZE-shift; |
500 | } |
501 | bits=window>>OC_PB_WINDOW_SIZE-n; |
502 | node=_tree[node+1+bits]; |
503 | if(node<=0)break; |
504 | window<<=n; |
505 | available-=n; |
506 | } |
507 | node=-node; |
508 | n=node>>8; |
509 | window<<=n; |
510 | available-=n; |
511 | _opb->ptr=ptr; |
512 | _opb->window=window; |
513 | _opb->bits=available; |
514 | return node&255; |
515 | } |
516 | |