aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZoltan Szabadka <szabadka@google.com>2013-11-28 17:37:13 +0100
committerZoltan Szabadka <szabadka@google.com>2013-11-28 17:37:13 +0100
commitfe79fac8da1ec850d94679705a6f3405153f51dd (patch)
tree57f05331abe77449e06f2a74479e1f69cd134dd3
parent4147ca26463c01f4309d0cd63c617c4876b9ba3e (diff)
downloadsrc-fe79fac8da1ec850d94679705a6f3405153f51dd.tar.gz
Add draft specification of the brotli format
-rw-r--r--brotli/brotlispec.txt1203
-rw-r--r--brotli/dec/decode.c133
-rw-r--r--brotli/enc/encode.cc34
3 files changed, 1268 insertions, 102 deletions
diff --git a/brotli/brotlispec.txt b/brotli/brotlispec.txt
new file mode 100644
index 0000000..55232aa
--- /dev/null
+++ b/brotli/brotlispec.txt
@@ -0,0 +1,1203 @@
+J. Alakuijala
+Z. Szabadka
+ ______ _______ _______ _______ _________
+ ( __ \ ( ____ )( ___ )( ____ \\__ __/
+ | ( \ )| ( )|| ( ) || ( \/ ) (
+ | | ) || (____)|| (___) || (__ | |
+ | | | || __)| ___ || __) | |
+ | | ) || (\ ( | ( ) || ( | |
+ | (__/ )| ) \ \__| ) ( || ) | |
+ (______/ |/ \__/|/ \||/ )_(
+
+
+ DRAFT of
+ Brotli Compression Algorithm Compressed Data Format Specification 1.0
+
+Status of This Memo
+
+ This memo provides information for the Internet community. This memo
+ does not specify an Internet standard of any kind. Distribution of
+ this memo is unlimited.
+
+Notices
+
+ Copyright (c) 2013 J. Alakuijala and Z. Szabadka
+
+ Permission is granted to copy and distribute this document for any
+ purpose and without charge, including translations into other
+ languages and incorporation into compilations, provided that the
+ copyright notice and this notice are preserved, and that any
+ substantive changes or deletions from the original are clearly
+ marked.
+
+Abstract
+
+ This specification defines a lossless compressed data format that
+ compresses data using a combination of the LZ77 algorithm and Huffman
+ coding, with efficiency comparable to the best currently available
+ general-purpose compression methods.
+
+1. Introduction
+
+ 1.1. Purpose
+
+ The purpose of this specification is to define a lossless
+ compressed data format that:
+ * Is independent of CPU type, operating system, file system,
+ and character set, and hence can be used for interchange;
+ * Can be produced or consumed, even for an arbitrarily long
+ sequentially presented input data stream, using only an a
+ priori bounded amount of intermediate storage, and hence
+ can be used in data communications or similar structures
+ such as Unix filters;
+ * Compresses data with efficiency comparable to the best
+ currently available general-purpose compression methods,
+ and in particular considerably better than the gzip
+ program and decompresses much faster than the LZMA
+ implementations.
+
+ The data format defined by this specification does not attempt to:
+ * Allow random access to compressed data;
+ * Compress specialized data (e.g., raster graphics) as well
+ as the best currently available specialized algorithms.
+
+ 1.2. Intended audience
+
+ This specification is intended for use by implementors of software
+ to compress data into "brotli" format and/or decompress data from
+ "brotli" format.
+
+ The text of the specification assumes a basic background in
+ programming at the level of bits and other primitive data
+ representations. Familiarity with the technique of Huffman coding
+ is helpful but not required.
+
+ This specification uses heavily the notations and terminology
+ introduced in the DEFLATE format specification (RFC 1951, see
+ reference [3] below). For the sake of completeness, we always
+ include the whole text of the relevant parts of RFC 1951,
+ therefore familiarity with the DEFLATE format is helpful but not
+ required.
+
+ 1.3. Scope
+
+ The specification specifies a method for representing a sequence
+ of bytes as a (usually shorter) sequence of bits, and a method for
+ packing the latter bit sequence into bytes.
+
+ 1.4. Compliance
+
+ Unless otherwise indicated below, a compliant decompressor must be
+ able to accept and decompress any data set that conforms to all
+ the specifications presented here; a compliant compressor must
+ produce data sets that conform to all the specifications presented
+ here.
+
+ 1.5. Definitions of terms and conventions used
+
+ Byte: 8 bits stored or transmitted as a unit (same as an octet).
+ For this specification, a byte is exactly 8 bits, even on machines
+ which store a character on a number of bits different from eight.
+ See below, for the numbering of bits within a byte.
+
+ String: a sequence of arbitrary bytes.
+
+ Bytes stored within a computer do not have a "bit order", since
+ they are always treated as a unit. However, a byte considered as
+ an integer between 0 and 255 does have a most- and least-
+ significant bit, and since we write numbers with the most-
+ significant digit on the left, we also write bytes with the most-
+ significant bit on the left. In the diagrams below, we number the
+ bits of a byte so that bit 0 is the least-significant bit, i.e.,
+ the bits are numbered:
+
+ +--------+
+ |76543210|
+ +--------+
+
+ Within a computer, a number may occupy multiple bytes. All
+ multi-byte numbers in the format described here are stored with
+ the least-significant byte first (at the lower memory address).
+ For example, the decimal number 520 is stored as:
+
+ 0 1
+ +--------+--------+
+ |00001000|00000010|
+ +--------+--------+
+ ^ ^
+ | |
+ | + more significant byte = 2 x 256
+ + less significant byte = 8
+
+
+ 1.5.1. Packing into bytes
+
+ This document does not address the issue of the order in which
+ bits of a byte are transmitted on a bit-sequential medium,
+ since the final data format described here is byte- rather than
+ bit-oriented. However, we describe the compressed block format
+ in below, as a sequence of data elements of various bit
+ lengths, not a sequence of bytes. We must therefore specify
+ how to pack these data elements into bytes to form the final
+ compressed byte sequence:
+
+ * Data elements are packed into bytes in order of
+ increasing bit number within the byte, i.e., starting
+ with the least-significant bit of the byte.
+ * Data elements other than Huffman codes are packed
+ starting with the least-significant bit of the data
+ element.
+ * Huffman codes are packed starting with the most-
+ significant bit of the code.
+
+ In other words, if one were to print out the compressed data as
+ a sequence of bytes, starting with the first byte at the
+ *right* margin and proceeding to the *left*, with the most-
+ significant bit of each byte on the left as usual, one would be
+ able to parse the result from right to left, with fixed-width
+ elements in the correct MSB-to-LSB order and Huffman codes in
+ bit-reversed order (i.e., with the first bit of the code in the
+ relative LSB position).
+
+2. Compressed representation overview
+
+ A compressed data set consists of a header and a series of meta-
+ blocks, corresponding to successive meta-blocks of input data. The
+ meta-block sizes are limited to bytes and the maximum meta-block size
+ is 268,435,456 bytes.
+
+ The header contains the size of a sliding window on the input data
+ that is sufficient to keep on the intermediate storage at any given
+ point during decoding the stream.
+
+ Each meta-block is compressed using a combination of the LZ77
+ algorithm (Lempel-Ziv 1977, see reference [2] below) and Huffman
+ coding. The Huffman trees for each block are independent of those for
+ previous or subsequent blocks; the LZ77 algorithm may use a
+ reference to a duplicated string occurring in a previous meta-block,
+ up to sliding window size input bytes before.
+
+ Each meta-block consists of two parts: a meta-block header that
+ describes the representation of the compressed data part, and a
+ compressed data part. The compressed data consists of a series of
+ commands. Each command consists of two parts: a sequence of literal
+ bytes (of strings that have not been detected as duplicated within
+ the sliding window), and a pointer to a duplicated string,
+ represented as a pair <length, backward distance>.
+
+ Each command in the compressed data is represented using three kinds
+ of Huffman codes: one kind of code tree for the literal sequence
+ lengths (also referred to as literal insertion lengths) and backward
+ copy lengths (that is, a single code word represents two lengths,
+ one of the literal sequence and one of the backward copy), a separate
+ kind of code tree for literals, and a third kind of code tree for
+ distances. The code trees for each meta-block appear in a compact
+ form just before the compressed data in the meta-block header.
+
+ The sequence of each type of value in the representation of a command
+ (insert-and-copy lengths, literals and distances) within a meta-
+ block is further divided into blocks. In other words, each meta-block
+ has a series of insert-and-copy length blocks, a series of literal
+ blocks and a series of distance blocks. These are also called the
+ three block categories: a meta-block has a series of blocks for each
+ block category. The subsequent blocks within each block category have
+ different block types, but blocks further away in the block sequence
+ can have the same types. The block types are numbered from 0 to the
+ maximum block type number of 253 and the first block of each block
+ category has type 0. The block structure of a meta-block is
+ represented by the sequence of block-switch commands for each block
+ category, where a block-switch command is a pair <block type, block
+ length>. The block-switch commands are represented in the compressed
+ data before the start of each new block using a Huffman code tree for
+ block types and a separate Huffman code tree for block lengths for
+ each block category. The code trees for block types and lengths
+ (total of six Huffman code trees) appear in a compact form in the
+ meta-block header.
+
+ Each type of value (insert-and-copy lengths, literals and distances)
+ can be encoded with any Huffman tree from a collection of Huffman
+ trees of the same kind appearing in the meta-block header. The
+ particual Huffman tree used can depend on two factors: the block type
+ of the block the value appears in, and the context of the value. In
+ the case of the literals, the context is the previous two bytes in
+ the input data, and in the case of distances, the context is the copy
+ length from the same command. For insert-and-copy lengths, no context
+ is used and the Huffman tree depends only on the block type (in fact,
+ the index of the Huffman tree is the block type number). In the case
+ of literals and distances, the context is mapped to a context id in
+ the rage [0, 63] for literals and [0, 3] for distances and the matrix
+ of the Huffman tree indices for each block type and context id,
+ called the context map, is encoded in a compact form in the meta-
+ block header.
+
+ In addition to the parts listed above (Huffman code trees for insert-
+ and-copy lengths, literals, distances, block types and block lengths
+ and the context map), the meta-block header contains the number of
+ input bytes in the meta-block and two additional parameters used in
+ the representation of copy distances (number of "postfix bits" and
+ number of direct distance codes, see later).
+
+3. Compressed representation of Huffman codes
+
+ 3.1. Introduction to prefix and Huffman coding
+
+ Prefix coding represents symbols from an a priori known alphabet
+ by bit sequences (codes), one code for each symbol, in a manner
+ such that different symbols may be represented by bit sequences of
+ different lengths, but a parser can always parse an encoded string
+ unambiguously symbol-by-symbol.
+
+ We define a prefix code in terms of a binary tree in which the two
+ edges descending from each non-leaf node are labeled 0 and 1 and
+ in which the leaf nodes correspond one-for-one with (are labeled
+ with) the symbols of the alphabet; then the code for a symbol is
+ the sequence of 0's and 1's on the edges leading from the root to
+ the leaf labeled with that symbol. For example:
+
+ /\ Symbol Code
+ 0 1 ------ ----
+ / \ A 00
+ /\ B B 1
+ 0 1 C 011
+ / \ D 010
+ A /\
+ 0 1
+ / \
+ D C
+
+ A parser can decode the next symbol from an encoded input stream
+ by walking down the tree from the root, at each step choosing the
+ edge corresponding to the next input bit.
+
+ Given an alphabet with known symbol frequencies, the Huffman
+ algorithm allows the construction of an optimal prefix code (one
+ which represents strings with those symbol frequencies using the
+ fewest bits of any possible prefix codes for that alphabet). Such
+ a code is called a Huffman code. (See reference [1] in Chapter 5,
+ references for additional information on Huffman codes.)
+
+ Note that in the "brotli" format, the Huffman codes for the
+ various alphabets must not exceed certain maximum code lengths.
+ This constraint complicates the algorithm for computing code
+ lengths from symbol frequencies. Again, see Chapter 5, references
+ for details.
+
+ 3.2. Use of Huffman coding in the "brotli" format
+
+ The Huffman codes used for each alphabet in the "brotli" format
+ are canonical Huffman codes, which have two additional rules:
+
+ * All codes of a given bit length have lexicographically
+ consecutive values, in the same order as the symbols they
+ represent;
+
+ * Shorter codes lexicographically precede longer codes.
+
+ We could recode the example above to follow this rule as follows,
+ assuming that the order of the alphabet is ABCD:
+
+ Symbol Code
+ ------ ----
+ A 10
+ B 0
+ C 110
+ D 111
+
+ I.e., 0 precedes 10 which precedes 11x, and 110 and 111 are
+ lexicographically consecutive.
+
+ Given this rule, we can define the canonical Huffman code for an
+ alphabet just by giving the bit lengths of the codes for each
+ symbol of the alphabet in order; this is sufficient to determine
+ the actual codes. In our example, the code is completely defined
+ by the sequence of bit lengths (2, 1, 3, 3). The following
+ algorithm generates the codes as integers, intended to be read
+ from most- to least-significant bit. The code lengths are
+ initially in tree[I].Len; the codes are produced in tree[I].Code.
+
+ 1) Count the number of codes for each code length. Let
+ bl_count[N] be the number of codes of length N, N >= 1.
+
+ 2) Find the numerical value of the smallest code for each
+ code length:
+
+ code = 0;
+ bl_count[0] = 0;
+ for (bits = 1; bits <= MAX_BITS; bits++) {
+ code = (code + bl_count[bits-1]) << 1;
+ next_code[bits] = code;
+ }
+
+ 3) Assign numerical values to all codes, using consecutive
+ values for all codes of the same length with the base
+ values determined at step 2. Codes that are never used
+ (which have a bit length of zero) must not be assigned a
+ value.
+
+ for (n = 0; n <= max_code; n++) {
+ len = tree[n].Len;
+ if (len != 0) {
+ tree[n].Code = next_code[len];
+ next_code[len]++;
+ }
+ }
+
+ Example:
+
+ Consider the alphabet ABCDEFGH, with bit lengths (3, 3, 3, 3, 3,
+ 2, 4, 4). After step 1, we have:
+
+ N bl_count[N]
+ - -----------
+ 2 1
+ 3 5
+ 4 2
+
+ Step 2 computes the following next_code values:
+
+ N next_code[N]
+ - ------------
+ 1 0
+ 2 0
+ 3 2
+ 4 14
+
+ Step 3 produces the following code values:
+
+ Symbol Length Code
+ ------ ------ ----
+ A 3 010
+ B 3 011
+ C 3 100
+ D 3 101
+ E 3 110
+ F 2 00
+ G 4 1110
+ H 4 1111
+
+ 3.3. Alphabet sizes
+
+ Huffman codes are used for different purposes in the "brotli"
+ format, and each purpose has a different alphabet size. For
+ literal codes, the alphabet size is 256. For insert-and-copy
+ length codes, the alphabet size is 704. For block length codes,
+ the alphabet size is 26. For distance codes, block type codes and
+ the Huffman codes used in compressing the context map, the
+ alphabet size is dynamic and is based on other parameters (see
+ later).
+
+ 3.4. Simple Huffman codes
+
+ The first bit of the compressed representation of each Huffman
+ code distinguishes between simple and complex Huffman codes. If
+ the first bit is 1, then a simple, otherwise a complex Huffman
+ code follows.
+
+ A simple Huffman code can have only up to four symbols with non-
+ zero code length. The format of the simple Huffman code is as
+ follows:
+
+ 1 bit: 1, indicating a simple Huffman code
+ 2 bits: NSYM - 1, where NSYM = # of symbols with non-zero
+ code length
+
+ NSYM symbols, each encoded using ALPHABET_BITS bits
+
+ 1 bit: tree-select, present only for NSYM = 4
+
+ The value of ALPHABET_BITS depends on the alphabet of the Huffman
+ code: it is the smallest number of bits that can represent all
+ symbols in the alphabet. E.g. for the alphabet of literal bytes,
+ ALPHABET_BITS is 8.
+
+ The (non-zero) code lengths of the symbols can be reconstructed as
+ follows:
+
+ * if NSYM = 1, the code length for the one symbol is one at
+ this stage, but only to distinguish it from the other zero
+ code length symbols, when encoding this symbol in the
+ compressed data stream using this Huffman code later, no
+ actual bits are emitted. Similarly, when decoding a symbol
+ using this Huffman code, no bits are read and the one symbol
+ is returned.
+
+ * if NSYM = 2, both symbols have code length 1.
+
+ * if NSYM = 3, the code lengths for the symbols are 1, 2, 2 in
+ the order they appear in the representation of the simple
+ Huffman code.
+
+ * if NSYM = 4, the code lengths (in order of symbols decoded)
+ depend on the tree-select bit: 2, 2, 2, 2, (tree-select bit 0)
+ or 1, 2, 3, 3 (tree-select bit 1).
+
+ 3.5. Complex Huffman codes
+
+ A complex Huffman code is a canonical Huffman code, defined by the
+ sequence of code lengths, as discussed in Paragraph 3.2, above.
+ For even greater compactness, the code length sequences themselves
+ are compressed using a Huffman code. The alphabet for code lengths
+ is as follows:
+
+ 0 - 15: Represent code lengths of 0 - 15
+ 16: Copy the previous non-zero code length 3 - 6 times
+ The next 2 bits indicate repeat length
+ (0 = 3, ... , 3 = 6)
+ If this is the first code length, or all previous
+ code lengths are zero, a code length of 8 is
+ repeated 3 - 6 times
+ Example: Codes 7, 16 (+2 bits 11),
+ 16 (+2 bits 10) will expand to
+ 12 code lengths of 7 (1 + 6 + 5)
+ 17: Repeat a code length of 0 for 3 - 10 times.
+ (3 bits of length)
+ 18: Repeat a code length of 0 for 11 - 138 times
+ (7 bits of length)
+
+ A code length of 0 indicates that the corresponding symbol in the
+ alphabet will not occur in the compressed data, and should not
+ participate in the Huffman code construction algorithm given
+ earlier.
+
+ The bit lengths of the Huffman code over the code length alphabet
+ are compressed with the following static Huffman code:
+
+ Symbol Code
+ ------ ----
+ 0 00
+ 1 1010
+ 2 100
+ 3 11
+ 4 01
+ 5 1011
+
+ We can now define the format of the complex Huffman code as
+ follows:
+
+ 1 bit: 0, indicating a complex Huffman code
+ 4 bits: HCLEN, # of code length codes - 4
+ 1 bit : HSKIP, if 1, skip over first two code length codes
+
+ (HCLEN + 4 - 2 * HSKIP) code lengths for symbols in the code
+ length alphabet given just above, in the order: 1, 2, 3,
+ 4, 0, 17, 18, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ If HSKIP is 1, code lengths of code length symbols 1 and
+ 2 are implicit zeros. Code lengths of code length symbols
+ beyond the (HCLEN + 4)th in the ordering above are also
+ implicit zeros.
+
+ The code lengths of code length symbols are between 0 and
+ 5 and they are represented with 2 - 5 bits according to
+ the static Huffman code above. A code length of 0 means
+ the corresponding code length symbol is not used.
+
+ 1 bit: HLENINC, if 1, the number of code length symbols is
+ encoded next
+
+ 3 bits: HNBITPAIRS, (# of bit pairs to represent HLEN) - 2,
+ appears only if HLENINC = 1
+
+ 2 * HNBITPAIRS + 2 bits: HLEN, # of code length symbols - 2,
+ appears only if HLENINC = 1
+
+ Sequence of code lengths symbols, encoded using the code
+ length Huffman code. The number of code length symbols
+ is either HLEN + 2 (in case of HLENINC = 1), or as many
+ as is needed to assign a code length to each symbol in
+ the alphabet (i.e. the alphabet size minus the sum of all
+ the repeat lengths defined by extra bits of code length
+ symbols 16 - 18). In case of HLENINC = 1, all symbols
+ not assigned a code length have implicit code length 0.
+
+ 3.6. Validity of the Huffman code
+
+ There are two kinds of valid Huffman codes:
+ * Huffman code that contains one symbol of length 1, and
+ * Full canonical Huffman code.
+
+ A decoder can check if the Huffman code is full by using integer
+ arithmetic, by computing if the sum of (32768 right-shifted by
+ code-length) over the non-zero code lengths leads to a total sum
+ of 32768. However, if there is only one non-zero code-length, it
+ shall have an implicit code length of one and the code is
+ considered valid.
+
+4. Encoding of distances
+
+ As described in Section 2, one component of a compressed meta-block
+ is a sequence of backward distances. In this section we provide the
+ details to the encoding of distances.
+
+ Each distance in the compressed data part of a meta-block is
+ represented with a pair <distance code, extra bits>. The distance
+ code and the extra bits are encoded back-to-back, the distance code
+ is encoded using a Huffman code over the distance code alphabet,
+ while the extra bits value is encoded as a fixed-width machine
+ integer. The number of extra bits can be 0 - 24, and it is dependent
+ on the distance code.
+
+ To convert a distance code and associated extra bits to a backward
+ distance, we need the sequence of past distances and two additonal
+ parameters, the number of "postfix bits", denoted by NPOSTFIX, and
+ the number of direct distance codes, denoted by NDIRECT. Both of
+ these parameters are encoded in the meta-block header. We will also
+ use the folowing derived parameter:
+
+ POSTFIX_MASK = ((1 << NPOSTFIX) - 1)
+
+ The first 16 distance codes are special short codes that reference
+ past distances as follows:
+
+ 0: last distance
+ 1: second last distance
+ 2: third last distance
+ 3: fourth last distance
+ 4: last distance - 1
+ 5: last distance + 1
+ 6: last distance - 2
+ 7: last distance + 2
+ 8: last distance - 3
+ 9: last disatnce + 3
+ 10: second last distance - 1
+ 11: second last distance + 1
+ 12: second last distance - 2
+ 13: second last distance + 2
+ 14: second last distance - 3
+ 15: second last distance + 3
+
+ The ring-buffer of four last distances is initialized by the values
+ 16, 15, 11 and 4 (i.e. the fourth last is set to 16, the third last
+ to 15, the second last to 11 and the last distance to 4) at the
+ beginning of the *stream* (as opposed to the beginning of the meta-
+ block) and it is not reset at meta-block boundaries. When a distance
+ code 0 appears, the distance it represents (i.e. the last distance
+ in the sequence of distances) is not pushed to the ring-buffer of
+ last distances, in other words, the expression "(second, third,
+ fourth) last distance" means the (second, third, fourth) last
+ distance that was not represented by a 0 distance code.
+
+ The next NDIRECT distance codes, from 16 to 15 + NDIRECT, represent
+ distances from 1 to NDIRECT. Neither the distance short codes, nor
+ the NDIRECT direct distance codes have any extra bits.
+
+ Distance codes 16 + NDIRECT and greater all have extra bits, the
+ number of extra bits for a distance code `dcode' is given by the
+ following formula:
+
+ ndistbits = 1 + ((dcode - NDIRECT - 16) >> (NPOSTFIX + 1))
+
+ The maximum number of extra bits is 24, therefore the size of the
+ distance code alphabet is (16 + NDIRECT + (48 << NPOSTFIX)).
+
+ Given a distance code `dcode' (>= 16 + NDIRECT), and extra bits
+ `dextra', the backward distance is given by the following formula:
+
+ hcode = (dcode - NDIRECT - 16) >> NPOSTFIX
+ lcode = (dcode - NDIRECT - 16) & POSTFIX_MASK
+ offset = ((2 + (hcode & 1)) << ndistbits) - 4;
+ distance = ((offset + dextra) << NPOSTFIX) + lcode + NDIRECT + 1
+
+5. Encoding of literal insertion lengths and copy lengths
+
+ As described in Section 2, the literal insertion lengths and backward
+ copy lengths are encoded using a single Huffman code. This section
+ provides the details to this encoding.
+
+ Each <insertion length, copy length> pair in the compressed data part
+ of a meta-block is represented with the following triplet:
+
+ <insert-and-copy length code, insert extra bits, copy extra bits>
+
+ The insert-and-copy length code, the insert extra bits and the copy
+ extra bits are encoded back-to-back, the insert-and-copy length code
+ is encoded using a Huffman code over the insert-and-copy length code
+ alphabet, while the extra bits values are encoded as fixed-width
+ machine integers. The number of insert and copy extra bits can be
+ 0 - 24, and they are dependent on the insert-and-copy length code.
+
+ Some of the insert-and-copy length codes also express the fact that
+ the distance code of the distance in the same command is 0, i.e. the
+ distance component of the command is the same as that of the previous
+ command. In this case, the distance code and extra bits for the
+ distance are omitted from the compressed data stream.
+
+ We describe the insert-and-copy length code alphabet in terms of the
+ (not directly used) insert length code and copy length code
+ alphabets. The symbols of the insert length code alphabet, along with
+ the number of insert extra bits and the range of the insert lengths
+ are as follows:
+
+ Extra Extra Extra
+ Code Bits Lengths Code Bits Lengths Code Bits Lengths
+ ---- ---- ------ ---- ---- ------- ---- ---- -------
+ 0 0 0 8 2 10-13 16 6 130-193
+ 1 0 1 9 2 14-17 17 7 194-321
+ 2 0 2 10 3 18-25 18 8 322-527
+ 3 0 3 11 3 26-33 19 9 578-1089
+ 4 0 4 12 4 34-49 20 10 1090-2113
+ 5 0 5 13 4 50-65 21 12 2114-6209
+ 6 1 6,7 14 5 66-97 22 14 6210-22593
+ 7 1 8,9 15 5 98-129 23 24 22594-16799809
+
+ The symbols of the copy length code alphabet, along with the number
+ of copy extra bits and the range of copy lengths are as follows:
+
+ Extra Extra Extra
+ Code Bits Lengths Code Bits Lengths Code Bits Lengths
+ ---- ---- ------ ---- ---- ------- ---- ---- -------
+ 0 0 2 8 1 10,11 16 5 70-101
+ 1 0 3 9 1 12,13 17 5 102-133
+ 2 0 4 10 2 14-17 18 6 134-197
+ 3 0 5 11 2 18-21 19 7 198-325
+ 4 0 6 12 3 22-29 20 8 326-581
+ 5 0 7 13 3 30-37 21 9 582-1093
+ 6 0 8 14 4 38-53 22 10 1094-2117
+ 7 0 9 15 4 54-69 23 24 2118-16779333
+
+ To convert an insert-and-copy length code to an insert length code
+ and a copy length code, the following table can be used:
+
+ Insert
+ length Copy length code
+ code 0-7 8-15 16-23
+ +---------+---------+
+ | | |
+ 0-7 | 0-63 | 64-127 | <--- distance code 0
+ | | |
+ +---------+---------+---------+
+ | | | |
+ 0-7 | 128-191 | 192-255 | 383-447 |
+ | | | |
+ +---------+---------+---------+
+ | | | |
+ 8-15 | 256-319 | 320-383 | 512-575 |
+ | | | |
+ +---------+---------+---------+
+ | | | |
+ 16-23 | 448-551 | 576-639 | 640-703 |
+ | | | |
+ +---------+---------+---------+
+
+ First, look up the cell with the 64 value range containing the
+ insert-and-copy length code, this gives the insert length code and
+ and the copy length code ranges, both 8 values long. The copy length
+ code within its range is determined by the lowest 3 bits of the
+ insert-and-copy length code, and the insert length code within its
+ range is determined by bits 3-5 (counted from the LSB) of the insert-
+ and-copy length code. Given the insert length and copy length codes,
+ the actual insert and copy lengths can be obtained by reading the
+ number of extra bits given by the tables above.
+
+ If the insert-and-copy length code is between 0 and 127, the distance
+ code of the command is set to zero (the last distance reused).
+
+6. Encoding of block switch commands
+
+ As described in Section 2, a block-switch command is a pair
+ <block type, block length>. These are encoded in the compressed data
+ part of the meta-block, right before the start of each new block of a
+ particular block category.
+
+ Each block type in the compressed data is represented with a block
+ type code, encoded using a Huffman code over the block type code
+ alphabet. A block type code 0 means that the block type is the same
+ as the type of the second last block from the same block category,
+ while a block type code 1 means that the block type equals the last
+ block type plus one. Block type codes 2 - 255 represent block types
+ 0 - 253. The second last and last block types are initialized with 0
+ and 1, respectively, at the beginning of each meta-block.
+
+ The first block type of each block category must be 0, and the block
+ type of the first block switch command is therefore not encoded in
+ the compressed data.
+
+ The number of different block types in each block category, denoted
+ by NBLTYPESL, NBLTYPESI, and NBLTYPESD for literals, insert-and-copy
+ lengths and distances, respectively, is encoded in the meta-block
+ header, and it must equal to the largest block type plus one in that
+ block category. In other words, the set of literal, insert-and-copy
+ length and distance block types must be [0..NBLTYPESL-1],
+ [0..NBLTYPESI-1], and [0..NBLTYPESD-1], respectively. From this it
+ follows that the alphabet size of literal, insert-and-copy length and
+ distance block type codes is NBLTYPES + 2, NBLTYPESI + 2 and
+ NBLTYPESD + 2, respectively.
+
+ Each block length in the compressed data is represented with a pair
+ <block length code, extra bits>. The block length code and the extra
+ bits are encoded back-to-back, the block length code is encoded using
+ a Huffman code over the block length code alphabet, while the extra
+ bits value is encoded as a fixed-width machine integer. The number of
+ extra bits can be 0 - 24, and it is dependent on the block length
+ code. The symbols of the block length code alphabet, along with the
+ number of extra bits and the range of block lengths are as follows:
+
+ Extra Extra Extra
+ Code Bits Lengths Code Bits Lengths Code Bits Lengths
+ ---- ---- ------ ---- ---- ------- ---- ---- -------
+ 0 2 1-4 9 4 65-80 18 7 369-496
+ 1 2 5-8 10 4 81-96 19 8 497-752
+ 2 2 9-12 11 4 97-112 20 9 753-1264
+ 3 2 13-16 12 5 113-144 21 10 1265-2288
+ 4 3 17-24 13 5 145-176 22 11 2289-4336
+ 5 3 25-32 14 5 177-208 23 12 4337-8432
+ 6 3 33-40 15 5 209-240 24 13 8433-16624
+ 7 3 41-48 16 6 241-304 25 24 16625-16793840
+ 8 4 49-64 17 6 305-368
+
+ The first block switch command of each block category is special in
+ the sense that it is encoded in the meta-block header, and as
+ described earlier the block type code is omitted, since it is an
+ implicit zero.
+
+7. Context modeling
+
+ As described in Section 2, the Huffman tree used to encode a literal
+ byte or a distance code depends on the context id and the block type.
+ This section specifies how to compute the context id for a particular
+ literal and distance code, and how to encode the context map that
+ maps a <context id, block type> pair to the index of a Huffman
+ tree in the array of literal and distance Huffman trees.
+
+ 7.1. Context modes and context id lookup for literals
+
+ The context for encoding the next literal is defined by the last
+ two bytes in the stream (p1, p2, where p1 is the most recent
+ byte), regardless if these bytes are produced by backward
+ references or by literal insertions.
+
+ There are four methods, called context modes, to compute the
+ Context ID:
+ * MSB6, where the Context ID is the value of six most
+ significant bits of p1,
+ * LSB6, where the Context ID is the value of six least
+ significant bits of p1,
+ * UTF8, where the Context ID is a complex function of p1, p2,
+ optimized for text compression, and
+ * Signed, where Context ID is a complex function of p1, p2,
+ optimized for compressing sequences of signed integers.
+
+ The Context ID for the UTF8 and Signed context modes is computed
+ using the following lookup tables Lut0, Lut1, and Lut2.
+
+ Lut0 :=
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
+ 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
+ 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
+ 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
+ 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
+
+ Lut1 :=
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
+ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+ 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+
+ Lut2 :=
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7
+
+ Given p1 = last decoded byte, and p2 = second last decoded byte,
+ the context ids can be computed as follows:
+
+ For LSB6 : Context ID = p1 & 0x3f
+ For MSB6 : Context ID = p1 >> 2
+ For UTF8 : Context ID = Lut0[p1] | Lut1[p2]
+ For Signed: Context ID = (Lut2[p1] << 3) | Lut2[p2]
+
+ The context modes LSB6, MSB6, UTF8, and Signed are denoted by
+ integers 0, 1, 2, 3.
+
+ The context mode is defined for each literal block type, and they
+ are stored in a consequtive array of bits in the meta-block
+ header, always two bits per block type.
+
+ 7.2. Context id for distances
+
+ The context for encoding the next distance code is defined by the
+ copy length corresponding to the distance. The context ids are
+ 0, 1, 2, and 3 for copy lengths 2, 3, 4, and more than 4,
+ respectively.
+
+ 7.3. Encoding of the context map
+
+ There are two kinds of context maps, for literals and for
+ distances. The size of the context map is 64 * NBLTYPESL for
+ literals, and 4 * NBLTYPESD for distances. Each value in the
+ context map is an integer between 0 and 255, indicating the index
+ of the Huffman tree to be used when encoding the next literal or
+ distance.
+
+ The context map is encoded as a one-dimensional array,
+ CMAPL[0..(64 * NBLTYPESL - 1)] and CMAPD[0..(4 * NBLTYPESD - 1)].
+
+ The index of the Huffman tree for encoding a literal or distance
+ code with context id `cid' and block type `bltype' is
+
+ index of literal Huffman tree = CMAPL[bltype * 64 + cid]
+
+ index of distance Huffman tree = CMAPD[bltype * 4 + cid]
+
+ The values of the context map are encoded with the combination
+ of run length encoding for zero values and Huffman coding. Let
+ RLEMAX denote the number of run length codes and NTREES denote the
+ maximum value in the context map plus one. NTREES must equal the
+ number of different values in the context map, in other words,
+ the different values in the context map must be the [0..NTREES-1]
+ interval. The alphabet of the Huffman code has the following
+ RLEMAX + NTREES symbols:
+
+ 0: value zero
+ 1: repeat a zero 2-3 times, read 1 bit for repeat length
+ 2: repeat a zero 4-7 times, read 2 bits for repeat length
+ ...
+ RLEMAX: repeat a zero (2^RLEMAX)-(2^(RLEMAX+1) - 1) times,
+ read RLEMAX bits for repeat length
+ RLEMAX + 1: value 1
+ ...
+ RLEMAX + NTREES - 1: value NTREES - 1
+
+ If RLEMAX = 0, the run length coding is not used, and the symbols
+ of the alphabet are directly the values in the context map. We can
+ now define the format of the context map (the same format is used
+ for literal and distance context maps):
+
+ 8 bits: NTREES - 1, if NTREES = 1 all values in the context
+ map are zeros, and no further bits are needed for
+ the context map encoding. Otherwise,
+ 1-5 bits: RLEMAX, 0 is encoded with one 0 bit, and values
+ 1 - 16 are encoded with bit pattern 1xxxx
+
+ Huffman code with alphabet size NTREES + RLEMAX
+
+ Context map size values encoded with the above Huffman code
+ and run length coding for zero values
+
+ 1 bit: IMTF bit, if set, we do an inverse move-to-front
+ transform on the values in the context map to get
+ the Huffman code indexes
+
+8. Language-based static dictionaries
+
+ At any given point during decoding the compressed data, a reference
+ to a duplicated string in the output produced so far has a maximum
+ backward distance value, which is the minumum of the window size and
+ the number of output bytes produced. However, decoding a distance
+ from the input stream, as described in section 4, can produce
+ distances that are greater than this maximum allowed value. The
+ difference between these distances and the first invalid distance
+ value is treated as reference to a word in one of the language-based
+ static dictionaries given in Appendix A. The id of the static
+ dictionary is determined by the copy length of the command:
+
+ dictionary id = copy length - 4
+ word id = distance - (max allowed distance + 1)
+
+ If the copy length is less than 4, or the dictionary id is invalid,
+ the compressed data set is invalid and must be discarded.
+
+ Each of the static dictionaries has 2^N words, and the index of the
+ referenced word is formed by the N least significant bits of the word
+ id. The word id right-shifted by N gives the index to one of the word
+ transformations given in Appendix B. If this transformation index is
+ greater than the maximum transformation index, the compressed data
+ set is invalid and must be discarded. The string copied to the output
+ stream is computed by applying the transformation to the referenced
+ static dictionary word.
+
+9. Compressed data format
+
+ In this section we describe the format of the compressed data set in
+ terms of the format of the individual data items described in the
+ previous secions.
+
+ 9.1. Format of the stream header
+
+ The stream header has only the following one field:
+
+ 1-4 bits: WBITS, a value in the range 16 - 24, value 16 is
+ encoded with one 0 bit, and values 17 - 24 are
+ encoded with bit pattern 1xxx
+
+ The size of the sliding window, which is the maximum value of any
+ non-dictionary reference backward distance, is given by the
+ following formula:
+
+ window size = (1 << WBITS) - 16
+
+ 9.2. Format of the meta-block header
+
+ A compliant compressed data set has at least one meta-block. Each
+ meta-block contains a header, with information about the
+ uncompressed length of the meta-block, and a bit signaling if the
+ meta-block is the last one. The format of the meta-block header is
+ the following:
+
+ 1 bit: ISLAST, set to 1 if this is the last meta-block
+ 1 bit: ISEMPTY, set to 1 if the meta-block is empty, this
+ field is only present if ISLAST bit is set, since
+ only the last meta-block can be empty
+ 2 bits: MNIBBLES, (# of nibbles to represent the length) - 4
+
+ (MNIBBLES + 4) x 4 bits: MLEN - 1, where MLEN is the length
+ of the meta-block in the input data in bytes
+
+ 1-11 bits: NBLTYPESL, # of literal block types, encoded with
+ the following variable length code:
+
+ Value Bit Pattern
+ ----- -----------
+ 1 0
+ 2 1000
+ 3-4 1001x
+ 5-8 1010xx
+ 9-16 1011xxx
+ 17-32 1100xxxx
+ 33-64 1101xxxxx
+ 65-128 1110xxxxxx
+ 129-256 1111xxxxxxx
+
+ Huffman code over the block type code alphabet for literal
+ block types, appears only if NBLTYPESL >= 2
+
+ Huffman code over the block length code alphabet for literal
+ block lengths, appears only if NBLTYPESL >= 2
+
+ Block length code + Extra bits for first literal block
+ length, appears only if NBLTYPESL >= 2
+
+ 1-11 bits: NBLTYPESI, # of insert-and-copy block types, encoded
+ with the same variable length code as above
+
+ Huffman code over the block type code alphabet for insert-
+ and-copy block types, only if NBLTYPESI >= 2
+
+ Huffman code over the block length code alphabet for insert-
+ and-copy block lengths, only if NBLTYPESI >= 2
+
+ Block length code + Extra bits for first insert-and-copy
+ block length, only if NBLTYPESI >= 2
+
+ 1-11 bits: NBLTYPESD, # of distance block types, encoded with
+ the same variable length code as above
+
+ Huffman code over the block type code alphabet for distance
+ block types, appears only if NBLTYPESD >= 2
+
+ Huffman code over the block length code alphabet for
+ distance block lengths, only if NBLTYPESD >= 2
+
+ Block length code + Extra bits for first distance block
+ length, only if NBLTYPESD >= 2
+
+ 2 bits: NPOSTFIX, parameter used in the distance coding
+
+ 4 bits: four most significant bits of NDIRECT, to get the
+ actual value of the parameter NDIRECT, left-shift
+ this four bit number by NPOSTFIX bits
+
+ NBLTYPESL x 2 bits: context mode for each literal block type
+
+ Literal context map, encoded as described in Paragraph 7.3,
+ the number of Huffman tree indexes is denoted by NHTREESL
+
+ Distance context map, encoded as described in Paragraph 7.3,
+ the number of Huffman tree indexes is denoted by NHTREESD
+
+ NHTREESL Huffman codes for literals
+
+ NBLTYPESI Huffman codes for insert-and-copy lengths
+
+ NHTREESD Huffman codes for distances
+
+ 9.3. Format of the meta-block data
+
+ The compressed data part of a meta-block consists of a series of
+ commands. Each command has the following format:
+
+ Block type code for next insert-and-copy block type, appears
+ only if NBLTYPESI >= 2 and the previous insert-and-copy
+ block has ended
+
+ Block length code + Extra bits for next insert-and-copy
+ block length, appears only if NBLTYPESI >= 2 and the
+ previous insert and-copy block has ended
+
+ Insert-and-copy length, encoded as in section 5, using the
+ insert-and-copy length Huffman code with the current
+ insert-and-copy block type index
+
+ Insert length number of literals, with the following format:
+
+ Block type code for next literal block type, appears
+ only if NBLTYPESL >= 2 and the previous literal
+ block has ended
+
+ Block length code + Extra bits for next literal block
+ length, appears only if NBLTYPESL >= 2 and the
+ previous literal block has ended
+
+ Next byte of the input data, encoded with the literal
+ Huffman code with the index determined by the
+ previuos two bytes of the input data, the current
+ literal block type and the context map, as
+ described in Paragraph 7.3.
+
+ Block type code for next distance block type, appears only
+ if NBLTYPESD >= 2 and the previous distance block has
+ ended
+
+ Block length code + Extra bits for next distance block
+ length, appears only if NBLTYPESD >= 2 and the previous
+ distance block has ended
+
+ Distance code, encoded as in section 4, using the distance
+ Huffman code with the current distance block type index,
+ appears only if the distance code is not an implicit 0,
+ as indicated by the insert-and-copy length code
+
+ The number of commands in the meta-block is such that the sum of
+ insert lengths and copy lengths over all the commands gives the
+ uncompressed length, MLEN encoded in the meta-block header.
+
+10. Decoding algorithm
+
+ The decoding algorithm that produces the output data is as follows:
+
+ read window size
+ do
+ read ISLAST bit
+ if ISLAST
+ read ISEMPTY bit
+ if ISEMPTY
+ break from loop
+ read MLEN
+ loop for each three block categories (i = L, I, D)
+ read NBLTYPESi
+ if NBLTYPESi >= 2
+ read Huffman code for block types, HTREE_BTYPE_i
+ read Huffman code for block lengths, HTREE_BLEN_i
+ read block length, BLEN_i
+ set block type, BTYPE_i to 0
+ initialize second last and last block types to 0 and 1
+ else
+ set block type, BTYPE_i to 0
+ set block length, BLEN_i to 268435456
+ read NPOSTFIX and NDIRECT
+ read array of literal context modes, CMODE[]
+ read literal context map, CMAPL[]
+ read distance context map, CMAPD[]
+ read array of Huffman codes for literals, HTREEL[]
+ read array of Huffman codes for insert-and-copy, HTREEI[]
+ read array of Huffman codes for distances, HTREED[]
+ do
+ if BLEN_I is zero
+ read block type using HTREE_BTYPE_I and set BTYPE_I
+ read block length using HTREE_BLEN_I and set BLEN_I
+ decrement BLEN_I
+ read insert and copy length, ILEN, CLEN with HTREEI[BTYPE_I]
+ loop for ILEN
+ if BLEN_L is zero
+ read block type using HTREE_BTYPE_L and set BTYPE_L
+ read block length using HTREE_BLEN_L and set BLEN_L
+ decrement BLEN_L
+ look up context mode CMODE[BTYPE_L]
+ compute context id, CIDL from last two bytes of output
+ read literal using HTREEL[CMAPL[64 * BTYPE_L + CIDL]]
+ copy literal to output stream
+ if number of output bytes produced in the loop is MLEN
+ break from loop
+ if distance code is implicit zero from insert-and-copy code
+ set backward distance to the last distance
+ else
+ if BLEN_D is zero
+ read block type using HTREE_BTYPE_D and set BTYPE_D
+ read block length using HTREE_BLEN_D and set BLEN_D
+ decrement BLEN_D
+ compute context id, CIDD from CLEN
+ read distance code with HTREED[CMAPD[4 * BTYPE_D + CIDD]]
+ compute distance by distance short code substitution
+ move backwards distance bytes in the output stream, and
+ copy CLEN bytes from this position to the output stream,
+ or look up the static dictionary word and copy it to the
+ output stram
+ while number of output bytes produced in the loop < MLEN
+ while not ISLAST
+
+ Note that a duplicated string reference may refer to a string in a
+ previous meta-block; i.e., the backward distance may cross one or
+ more meta-block boundaries. However a backward copy distance
+ cannot refer past the beginning of the output stream, and it can
+ not be greater than the window size, any such distance must be
+ interpreted as a reference to a static dictionary word. Note also
+ that the referenced string may overlap the current position; for
+ example, if the last 2 bytes decoded have values X and Y, a string
+ reference with <length = 5, distance = 2> adds X,Y,X,Y,X to the
+ output stream.
+
+11. References
+
+ [1] Huffman, D. A., "A Method for the Construction of Minimum
+ Redundancy Codes", Proceedings of the Institute of Radio
+ Engineers, September 1952, Volume 40, Number 9, pp. 1098-1101.
+
+ [2] Ziv J., Lempel A., "A Universal Algorithm for Sequential Data
+ Compression", IEEE Transactions on Information Theory, Vol. 23,
+ No. 3, pp. 337-343.
+
+ [3] Deutsch, P., "DEFLATE Compressed Data Format Specification
+ version 1.3", RFC 1951, Aladdin Enterprises, May 1996.
+ http://www.ietf.org/rfc/rfc1951.txt
+
+12. Source code
+
+ Source code for a C language implementation of a "brotli" compliant
+ decompressor and a C++ language implementation of a compressor is
+ available in the brotli/ directory within the font-compression-
+ reference open-source project:
+ https://code.google.com/p/font-compression-reference/source/browse/
+
+Appendix A. List of dictionary words
+
+ TO BE WRITTEN
+
+Appendix B. List of word transformations
+
+ TO BE WRITTEN
diff --git a/brotli/dec/decode.c b/brotli/dec/decode.c
index 6f08c5a..3bc37cf 100644
--- a/brotli/dec/decode.c
+++ b/brotli/dec/decode.c
@@ -63,53 +63,28 @@ static const int kDistanceShortCodeValueOffset[NUM_DISTANCE_SHORT_CODES] = {
0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
};
-static int64_t DecodeSize(BrotliBitReader* br) {
- int size_bytes = BrotliReadBits(br, 3);
- int i = 0;
- int64_t len = 0;
- if (size_bytes == 0) {
- return -1;
- }
- for (; i < size_bytes; ++i) {
- len |= BrotliReadBits(br, 8) << (i * 8);
+static BROTLI_INLINE int DecodeWindowBits(BrotliBitReader* br) {
+ if (BrotliReadBits(br, 1)) {
+ return 17 + BrotliReadBits(br, 3);
+ } else {
+ return 16;
}
- return len;
}
-static void DecodeMetaBlockLength(int input_size_bits,
- size_t pos,
- int64_t input_size,
- BrotliBitReader* br,
+static void DecodeMetaBlockLength(BrotliBitReader* br,
size_t* meta_block_length,
int* input_end) {
*input_end = BrotliReadBits(br, 1);
- if (input_size < 0) {
- *meta_block_length = 0;
- if (!*input_end) {
- int size_nibbles = BrotliReadBits(br, 3);
- int i;
- for (i = 0; i < size_nibbles; ++i) {
- *meta_block_length |= BrotliReadBits(br, 4) << (i * 4);
- }
- ++(*meta_block_length);
- }
- } else {
- if (*input_end) {
- *meta_block_length = (size_t)input_size - pos;
- } else {
- int shift = 0;
- *meta_block_length = 0;
- while (input_size_bits > 0) {
- *meta_block_length |= BrotliReadBits(br, 8) << shift;
- input_size_bits -= 8;
- shift += 8;
- }
- if (input_size_bits > 0) {
- *meta_block_length |= BrotliReadBits(br, input_size_bits) << shift;
- }
- ++(*meta_block_length);
- }
+ *meta_block_length = 0;
+ if (*input_end && BrotliReadBits(br, 1)) {
+ return;
}
+ int size_nibbles = BrotliReadBits(br, 2) + 4;
+ int i;
+ for (i = 0; i < size_nibbles; ++i) {
+ *meta_block_length |= BrotliReadBits(br, 4) << (i * 4);
+ }
+ ++(*meta_block_length);
}
// Decodes the next Huffman code from bit-stream.
@@ -289,7 +264,7 @@ static int ReadHuffmanCode(int alphabet_size,
for (i = BrotliReadBits(br, 1) * 2; i < num_codes; ++i) {
int code_len_idx = kCodeLengthCodeOrder[i];
int v = BrotliReadBits(br, 2);
- if (v == 3) {
+ if (v == 1) {
v = BrotliReadBits(br, 1);
if (v == 0) {
v = 2;
@@ -301,8 +276,6 @@ static int ReadHuffmanCode(int alphabet_size,
v = 5;
}
}
- } else if (v == 1) {
- v = 3;
} else if (v == 2) {
v = 4;
}
@@ -593,11 +566,14 @@ int BrotliDecompressedSize(size_t encoded_size,
if (!BrotliInitBitReader(&br, input)) {
return 0;
}
- int64_t size = DecodeSize(&br);
- if (size < 0) {
+ DecodeWindowBits(&br);
+ size_t meta_block_len;
+ int input_end;
+ DecodeMetaBlockLength(&br, &meta_block_len, &input_end);
+ if (!input_end) {
return 0;
}
- *decoded_size = (size_t)size;
+ *decoded_size = meta_block_len;
return 1;
}
@@ -618,8 +594,6 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
int ok = 1;
int i;
size_t pos = 0;
- int64_t decoded_size;
- int input_size_bits = 0;
int input_end = 0;
int window_bits = 0;
size_t max_backward_distance;
@@ -629,7 +603,7 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
uint8_t* ringbuffer_end;
// This ring buffer holds a few past copy distances that will be used by
// some special distance codes.
- int dist_rb[4] = { 4, 11, 15, 16 };
+ int dist_rb[4] = { 16, 15, 11, 4 };
size_t dist_rb_idx = 0;
// The previous 2 bytes used for context.
uint8_t prev_byte1 = 0;
@@ -641,42 +615,27 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
return 0;
}
- decoded_size = DecodeSize(&br);
- if (decoded_size == 0) {
- return 1;
- }
-
- if (decoded_size > 0) {
- size_t n = (size_t)decoded_size;
- input_size_bits = (n == (n &~ (n - 1))) ? -1 : 0;
- while (n) {
- ++input_size_bits;
- n >>= 1;
- }
- }
-
// Decode window size.
- if ((decoded_size < 0 || input_size_bits > 16) && BrotliReadBits(&br, 1)) {
- window_bits = 17 + BrotliReadBits(&br, 3);
- } else {
- window_bits = 16;
- }
+ window_bits = DecodeWindowBits(&br);
max_backward_distance = (1 << window_bits) - 16;
+ static const int kRingBufferWriteAheadSlack = 16;
+
+ static const int kMaxDictionaryWordLength = 0;
+
ringbuffer_size = 1 << window_bits;
ringbuffer_mask = ringbuffer_size - 1;
- ringbuffer = (uint8_t*)malloc(ringbuffer_size + 16);
+ ringbuffer = (uint8_t*)malloc(ringbuffer_size +
+ kRingBufferWriteAheadSlack +
+ kMaxDictionaryWordLength);
ringbuffer_end = ringbuffer + ringbuffer_size;
- BROTLI_LOG_UINT(decoded_size);
- BROTLI_LOG_UINT(input_size_bits);
-
while (!input_end && ok) {
size_t meta_block_len = 0;
size_t meta_block_end_pos;
- size_t block_length[3] = { 0 };
+ uint32_t block_length[3] = { UINT32_MAX, UINT32_MAX, UINT32_MAX };
int block_type[3] = { 0 };
- int num_block_types[3] = { 0 };
+ int num_block_types[3] = { 1, 1, 1 };
int block_type_rb[6] = { 0, 1, 0, 1, 0, 1 };
size_t block_type_rb_index[3] = { 0 };
HuffmanTree block_type_trees[3];
@@ -713,8 +672,7 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
goto End;
}
BROTLI_LOG_UINT(pos);
- DecodeMetaBlockLength(input_size_bits, pos, decoded_size,
- &br, &meta_block_len, &input_end);
+ DecodeMetaBlockLength(&br, &meta_block_len, &input_end);
BROTLI_LOG_UINT(meta_block_len);
if (meta_block_len == 0) {
goto End;
@@ -724,7 +682,12 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
block_type_trees[i].root_ = NULL;
block_len_trees[i].root_ = NULL;
if (BrotliReadBits(&br, 1)) {
- num_block_types[i] = BrotliReadBits(&br, 8) + 1;
+ int nbits = BrotliReadBits(&br, 3);
+ if (nbits == 0) {
+ num_block_types[i] = 2;
+ } else {
+ num_block_types[i] = BrotliReadBits(&br, nbits) + (1 << nbits) + 1;
+ }
if (!ReadHuffmanCode(
num_block_types[i] + 2, &block_type_trees[i], &br) ||
!ReadHuffmanCode(kNumBlockLengthCodes, &block_len_trees[i], &br)) {
@@ -733,9 +696,6 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
}
block_length[i] = ReadBlockLength(&block_len_trees[i], &br);
block_type_rb_index[i] = 1;
- } else {
- num_block_types[i] = 1;
- block_length[i] = meta_block_len;
}
}
@@ -891,23 +851,24 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
max_distance = pos;
}
+ copy_dst = &ringbuffer[pos & ringbuffer_mask];
+
if ((size_t)distance > max_distance) {
- printf("Invalid backward reference. pos: %ld distance: %d "
- "len: %d end: %lu\n", pos, distance, copy_length,
+ printf("Invalid backward reference. pos: %lu distance: %d "
+ "len: %d end: %lu\n", (unsigned long)pos, distance, copy_length,
(unsigned long)meta_block_end_pos);
ok = 0;
goto End;
} else {
if (pos + copy_length > meta_block_end_pos) {
- printf("Invalid backward reference. pos: %zu distance: %d "
- "len: %d end: %zu\n", pos, distance, copy_length,
- meta_block_end_pos);
+ printf("Invalid backward reference. pos: %lu distance: %d "
+ "len: %d end: %lu\n", (unsigned long)pos, distance,
+ copy_length, (unsigned long)meta_block_end_pos);
ok = 0;
goto End;
}
copy_src = &ringbuffer[(pos - distance) & ringbuffer_mask];
- copy_dst = &ringbuffer[pos & ringbuffer_mask];
#if (defined(__x86_64__) || defined(_M_X64))
if (copy_src + copy_length <= ringbuffer_end &&
diff --git a/brotli/enc/encode.cc b/brotli/enc/encode.cc
index 88e1c4c..4223ca4 100644
--- a/brotli/enc/encode.cc
+++ b/brotli/enc/encode.cc
@@ -80,15 +80,15 @@ void EncodeMetaBlockLength(size_t meta_block_size,
int* storage_ix, uint8_t* storage) {
WriteBits(1, 0, storage_ix, storage);
int num_bits = Log2Floor(meta_block_size) + 1;
- WriteBits(3, (num_bits + 3) >> 2, storage_ix, storage);
+ if (num_bits < 16) {
+ num_bits = 16;
+ }
+ WriteBits(2, (num_bits - 13) >> 2, storage_ix, storage);
while (num_bits > 0) {
WriteBits(4, meta_block_size & 0xf, storage_ix, storage);
meta_block_size >>= 4;
num_bits -= 4;
}
- if (num_bits > 0) {
- WriteBits(num_bits, meta_block_size, storage_ix, storage);
- }
}
template<int kSize>
@@ -121,7 +121,7 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
for (int i = skip_two_first * 2; i < codes_to_store; ++i) {
uint8_t len[] = { 2, 4, 3, 2, 2, 4 };
- uint8_t bits[] = { 0, 7, 3, 1, 2, 15 };
+ uint8_t bits[] = { 0, 5, 1, 3, 2, 13 };
int v = code_length_bitdepth[kStorageOrder[i]];
WriteBits(len[v], bits[v], storage_ix, storage);
}
@@ -558,6 +558,12 @@ void BuildAndEncodeBlockSplitCode(const BlockSplit& split,
return;
}
WriteBits(1, 1, storage_ix, storage);
+ int nbits = Log2Floor(split.num_types_ - 1);
+ WriteBits(3, nbits, storage_ix, storage);
+ if (nbits > 0) {
+ WriteBits(nbits, split.num_types_ - 1 - (1 << nbits), storage_ix, storage);
+ }
+
HistogramLiteral type_histo;
for (int i = 0; i < split.type_codes_.size(); ++i) {
type_histo.Add(split.type_codes_[i]);
@@ -570,7 +576,6 @@ void BuildAndEncodeBlockSplitCode(const BlockSplit& split,
}
BuildEntropyCode(length_histo, 15, kNumBlockLenPrefixes,
&code->block_len_code);
- WriteBits(8, split.num_types_ - 1, storage_ix, storage);
StoreHuffmanCode(code->block_type_code, split.num_types_ + 2,
storage_ix, storage);
StoreHuffmanCode(code->block_len_code, kNumBlockLenPrefixes,
@@ -769,10 +774,10 @@ BrotliCompressor::BrotliCompressor()
literal_cost_(1 << kRingBufferBits),
storage_ix_(0),
storage_(new uint8_t[2 << kMetaBlockSizeBits]) {
- dist_ringbuffer_[0] = 4;
- dist_ringbuffer_[1] = 11;
- dist_ringbuffer_[2] = 15;
- dist_ringbuffer_[3] = 16;
+ dist_ringbuffer_[0] = 16;
+ dist_ringbuffer_[1] = 15;
+ dist_ringbuffer_[2] = 11;
+ dist_ringbuffer_[3] = 4;
storage_[0] = 0;
}
@@ -782,8 +787,6 @@ BrotliCompressor::~BrotliCompressor() {
}
void BrotliCompressor::WriteStreamHeader() {
- // Don't encode input size.
- WriteBits(3, 0, &storage_ix_, storage_);
// Encode window size.
if (window_bits_ == 16) {
WriteBits(1, 0, &storage_ix_, storage_);
@@ -828,7 +831,7 @@ void BrotliCompressor::WriteMetaBlock(const size_t input_size,
void BrotliCompressor::FinishStream(
size_t* encoded_size, uint8_t* encoded_buffer) {
- WriteBits(1, 1, &storage_ix_, storage_);
+ WriteBits(2, 0x3, &storage_ix_, storage_);
*encoded_size = (storage_ix_ + 7) >> 3;
memcpy(encoded_buffer, storage_, *encoded_size);
}
@@ -839,9 +842,8 @@ int BrotliCompressBuffer(size_t input_size,
size_t* encoded_size,
uint8_t* encoded_buffer) {
if (input_size == 0) {
- encoded_buffer[0] = 1;
- encoded_buffer[1] = 0;
- *encoded_size = 2;
+ encoded_buffer[0] = 6;
+ *encoded_size = 1;
return 1;
}