diff options
Diffstat (limited to 'src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java')
-rw-r--r-- | src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java | 559 |
1 files changed, 559 insertions, 0 deletions
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.lz77support;

import java.io.IOException;
import java.util.Arrays;

/**
 * Helper class for compression algorithms that use the ideas of LZ77.
 *
 * <p>Most LZ77 derived algorithms split input data into blocks of
 * uncompressed data (called literal blocks) and back-references
 * (pairs of offsets and lengths) that state "add <code>length</code>
 * bytes that are the same as those already written starting
 * <code>offset</code> bytes before the current position. The details
 * of how those blocks and back-references are encoded are quite
 * different between the algorithms and some algorithms perform
 * additional steps (Huffman encoding in the case of DEFLATE for
 * example).</p>
 *
 * <p>This class attempts to extract the core logic - finding
 * back-references - so it can be re-used. It follows the algorithm
 * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't
 * implement the "lazy match" optimization. The three-byte hash
 * function used in this class is the same as the one used by zlib and
 * InfoZIP's ZIP implementation of DEFLATE. The whole class is
 * strongly inspired by InfoZIP's implementation.</p>
 *
 * <p>LZ77 is used vaguely here (as well as many other places that
 * talk about it :-), LZSS would likely be closer to the truth but
 * LZ77 has become the synonym for a whole family of algorithms.</p>
 *
 * <p>The API consists of a compressor that is fed <code>byte</code>s
 * and emits {@link Block}s to a registered callback where the blocks
 * represent either {@link LiteralBlock literal blocks}, {@link
 * BackReference back-references} or {@link EOD end of data
 * markers}. In order to ensure the callback receives all information,
 * the {@code #finish} method must be used once all data has been fed
 * into the compressor.</p>
 *
 * <p>Several parameters influence the outcome of the "compression":</p>
 * <dl>
 *
 *  <dt><code>windowSize</code></dt> <dd>the size of the sliding
 *  window, must be a power of two - this determines the maximum
 *  offset a back-reference can take. The compressor maintains a
 *  buffer of twice of <code>windowSize</code> - real world values are
 *  in the area of 32k.</dd>
 *
 *  <dt><code>minBackReferenceLength</code></dt>
 *  <dd>Minimal length of a back-reference found. A true minimum of 3 is
 *  hard-coded inside of this implementation but bigger lengths can be
 *  configured.</dd>
 *
 *  <dt><code>maxBackReferenceLength</code></dt>
 *  <dd>Maximal length of a back-reference found.</dd>
 *
 *  <dt><code>maxOffset</code></dt>
 *  <dd>Maximal offset of a back-reference.</dd>
 *
 *  <dt><code>maxLiteralLength</code></dt>
 *  <dd>Maximal length of a literal block.</dd>
 * </dl>
 *
 * @see "https://tools.ietf.org/html/rfc1951#section-4"
 * @since 1.14
 * @NotThreadSafe
 */
public class LZ77Compressor {

    /**
     * Base class representing blocks the compressor may emit.
     *
     * <p>This class is not supposed to be subclassed by classes
     * outside of Commons Compress so it is considered internal and
     * changes that would break subclasses may get introduced with
     * future releases.</p>
     */
    public static abstract class Block {
        /** Enumeration of the block types the compressor may emit. */
        public enum BlockType {
            LITERAL, BACK_REFERENCE, EOD
        }
        /**
         * Returns the type of the block.
         * @return the block's type
         */
        public abstract BlockType getType();
    }

    /**
     * Represents a literal block of data.
     *
     * <p>For performance reasons this encapsulates the real data, not
     * a copy of it. Don't modify the data and process it inside of
     * {@link Callback#accept} immediately as it will get overwritten
     * sooner or later.</p>
     */
    public static final class LiteralBlock extends Block {
        private final byte[] data;
        private final int offset, length;
        public LiteralBlock(byte[] data, int offset, int length) {
            this.data = data;
            this.offset = offset;
            this.length = length;
        }
        /**
         * The literal data.
         *
         * <p>This returns a live view of the actual data in order to
         * avoid copying, modify the array at your own risk.</p>
         * @return the data
         */
        public byte[] getData() {
            return data;
        }
        /**
         * Offset into data where the literal block starts.
         * @return the offset
         */
        public int getOffset() {
            return offset;
        }
        /**
         * Length of literal block.
         * @return the length
         */
        public int getLength() {
            return length;
        }
        @Override
        public BlockType getType() {
            return BlockType.LITERAL;
        }
        @Override
        public String toString() {
            return "LiteralBlock starting at " + offset + " with length " + length;
        }
    }

    /**
     * Represents a back-reference, i.e. "copy <code>length</code> bytes
     * starting <code>offset</code> bytes before the current position".
     */
    public static final class BackReference extends Block {
        private final int offset, length;
        public BackReference(int offset, int length) {
            this.offset = offset;
            this.length = length;
        }
        /**
         * Provides the offset of the back-reference.
         * @return the offset
         */
        public int getOffset() {
            return offset;
        }
        /**
         * Provides the length of the back-reference.
         * @return the length
         */
        public int getLength() {
            return length;
        }
        @Override
        public BlockType getType() {
            return BlockType.BACK_REFERENCE;
        }
        @Override
        public String toString() {
            return "BackReference with offset " + offset + " and length " + length;
        }
    }

    /** A simple "we are done" marker. */
    public static final class EOD extends Block {
        @Override
        public BlockType getType() {
            return BlockType.EOD;
        }
    }

    // EOD is stateless, so a single shared instance suffices.
    private static final Block THE_EOD = new EOD();

    /**
     * Callback invoked while the compressor processes data.
     *
     * <p>The callback is invoked on the same thread that receives the
     * bytes to compress and may be invoked multiple times during the
     * execution of {@link #compress} or {@link #finish}.</p>
     */
    public interface Callback {
        /**
         * Consumes a block.
         * @param b the block to consume
         * @throws IOException in case of an error
         */
        void accept(Block b) throws IOException;
    }

    // number of consecutive bytes hashed together; this is the
    // hard-coded true minimum back-reference length
    static final int NUMBER_OF_BYTES_IN_HASH = 3;
    // sentinel used in head/prev/matchStart for "no entry"
    private static final int NO_MATCH = -1;

    private final Parameters params;
    private final Callback callback;

    // the sliding window, twice as big as "windowSize" parameter
    private final byte[] window;
    // the head of hash-chain - indexed by hash-code, points to the
    // location inside of window of the latest sequence of bytes with
    // the given hash.
    private final int[] head;
    // for each window-location points to the latest earlier location
    // with the same hash. Only stores values for the latest
    // "windowSize" elements, the index is "window location modulo
    // windowSize".
    private final int[] prev;

    // bit mask used when indexing into prev (windowSize - 1, valid
    // because windowSize is a power of two)
    private final int wMask;

    // true once initialize() has seeded insertHash with the first
    // NUMBER_OF_BYTES_IN_HASH - 1 bytes
    private boolean initialized = false;
    // the position inside of window that shall be encoded right now
    private int currentPosition;
    // the number of bytes available to compress including the one at
    // currentPosition
    private int lookahead = 0;
    // the hash of the three bytes starting at the current position
    private int insertHash = 0;
    // the position inside of the window where the current literal
    // block starts (in case we are inside of a literal block).
    private int blockStart = 0;
    // position of the current match
    private int matchStart = NO_MATCH;
    // number of missed insertString calls for the up to three last
    // bytes of the last match that can only be performed once more
    // data has been read
    private int missedInserts = 0;

    /**
     * Initializes a compressor with parameters and a callback.
     * @param params the parameters
     * @param callback the callback
     * @throws NullPointerException if either parameter is <code>null</code>
     */
    public LZ77Compressor(Parameters params, Callback callback) {
        if (params == null) {
            throw new NullPointerException("params must not be null");
        }
        if (callback == null) {
            throw new NullPointerException("callback must not be null");
        }
        this.params = params;
        this.callback = callback;

        final int wSize = params.getWindowSize();
        // double-sized buffer so a full window of history survives each slide()
        window = new byte[wSize * 2];
        wMask = wSize - 1;
        head = new int[HASH_SIZE];
        Arrays.fill(head, NO_MATCH);
        prev = new int[wSize];
    }

    /**
     * Feeds bytes into the compressor which in turn may emit zero or
     * more blocks to the callback during the execution of this
     * method.
     * @param data the data to compress - must not be null
     * @throws IOException if the callback throws an exception
     */
    public void compress(byte[] data) throws IOException {
        compress(data, 0, data.length);
    }

    /**
     * Feeds bytes into the compressor which in turn may emit zero or
     * more blocks to the callback during the execution of this
     * method.
     * @param data the data to compress - must not be null
     * @param off the start offset of the data
     * @param len the number of bytes to compress
     * @throws IOException if the callback throws an exception
     */
    public void compress(byte[] data, int off, int len) throws IOException {
        final int wSize = params.getWindowSize();
        // doCompress requires len <= windowSize, so larger inputs are
        // chopped into windowSize sized chunks
        while (len > wSize) { // chop into windowSize sized chunks
            doCompress(data, off, wSize);
            off += wSize;
            len -= wSize;
        }
        if (len > 0) {
            doCompress(data, off, len);
        }
    }

    /**
     * Tells the compressor to process all remaining data and signal
     * end of data to the callback.
     *
     * <p>The compressor will in turn emit at least one block ({@link
     * EOD}) but potentially multiple blocks to the callback during
     * the execution of this method.</p>
     * @throws IOException if the callback throws an exception
     */
    public void finish() throws IOException {
        if (blockStart != currentPosition || lookahead > 0) {
            // flush whatever is still buffered as one final literal block
            currentPosition += lookahead;
            flushLiteralBlock();
        }
        callback.accept(THE_EOD);
    }

    /**
     * Adds some initial data to fill the window with.
     *
     * <p>This is used if the stream has been cut into blocks and
     * back-references of one block may refer to data of the previous
     * block(s). One such example is the LZ4 frame format using block
     * dependency.</p>
     *
     * @param data the data to fill the window with.
     * @throws IllegalStateException if the compressor has already started to accept data
     */
    public void prefill(byte[] data) {
        if (currentPosition != 0 || lookahead != 0) {
            throw new IllegalStateException("the compressor has already started to accept data, can't prefill anymore");
        }

        // don't need more than windowSize for back-references
        final int len = Math.min(params.getWindowSize(), data.length);
        // keep only the trailing len bytes - only the most recent
        // history matters for future back-references
        System.arraycopy(data, data.length - len, window, 0, len);

        if (len >= NUMBER_OF_BYTES_IN_HASH) {
            initialize();
            // hash every complete three-byte sequence of the prefilled data
            final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
            for (int i = 0; i < stop; i++) {
                insertString(i);
            }
            // the last two positions lack enough following bytes to hash yet
            missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
        } else { // not enough data to hash anything
            missedInserts = len;
        }
        blockStart = currentPosition = len;
    }

    // we use a 15 bit hashcode as calculated in updateHash
    private static final int HASH_SIZE = 1 << 15;
    private static final int HASH_MASK = HASH_SIZE - 1;
    private static final int H_SHIFT = 5;

    /**
     * Assumes we are calculating the hash for three consecutive bytes
     * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC
     * the new hash for BCD is nextHash(H, D).
     *
     * <p>The hash is shifted by five bits on each update so all
     * effects of A have been swapped after the third update.</p>
     */
    private int nextHash(int oldHash, byte nextByte) {
        final int nextVal = nextByte & 0xFF;
        return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK;
    }

    // performs the actual algorithm with the pre-condition len <= windowSize
    private void doCompress(byte[] data, int off, int len) throws IOException {
        int spaceLeft = window.length - currentPosition - lookahead;
        if (len > spaceLeft) {
            // new chunk won't fit behind the lookahead - shift the
            // window contents down by windowSize first
            slide();
        }
        System.arraycopy(data, off, window, currentPosition + lookahead, len);
        lookahead += len;
        if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
            initialize();
        }
        if (initialized) {
            compress();
        }
    }

    // shifts the upper half of the window into the lower half and
    // rebases all positions (cursors, hash chains) by -windowSize
    private void slide() throws IOException {
        final int wSize = params.getWindowSize();
        if (blockStart != currentPosition && blockStart < wSize) {
            // the pending literal block starts in the half that is about to
            // be discarded - emit it before its data becomes unreachable
            flushLiteralBlock();
            blockStart = currentPosition;
        }
        System.arraycopy(window, wSize, window, 0, wSize);
        currentPosition -= wSize;
        matchStart -= wSize;
        blockStart -= wSize;
        // rebase hash chains; entries pointing into the discarded half
        // become NO_MATCH
        for (int i = 0; i < HASH_SIZE; i++) {
            int h = head[i];
            head[i] = h >= wSize ? h - wSize : NO_MATCH;
        }
        for (int i = 0; i < wSize; i++) {
            int p = prev[i];
            prev[i] = p >= wSize ? p - wSize : NO_MATCH;
        }
    }

    // seeds the rolling hash with the first two bytes; insertString
    // folds in the third byte of each position as it goes
    private void initialize() {
        for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
            insertHash = nextHash(insertHash, window[i]);
        }
        initialized = true;
    }

    // main loop: at each position either emit a back-reference or
    // extend the current literal block by one byte
    private void compress() throws IOException {
        final int minMatch = params.getMinBackReferenceLength();
        final boolean lazy = params.getLazyMatching();
        final int lazyThreshold = params.getLazyMatchingThreshold();

        while (lookahead >= minMatch) {
            catchUpMissedInserts();
            int matchLength = 0;
            int hashHead = insertString(currentPosition);
            // NOTE(review): hashHead is always < currentPosition here, so
            // "hashHead - currentPosition" is negative and this offset guard
            // looks vacuously true; the real maxOffset limit is enforced by
            // minIndex inside longestMatch() - confirm intent upstream.
            if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
                // sets matchStart as a side effect
                matchLength = longestMatch(hashHead);

                if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
                    // try to find a longer match using the next position
                    matchLength = longestMatchForNextPosition(matchLength);
                }
            }
            if (matchLength >= minMatch) {
                if (blockStart != currentPosition) {
                    // emit preceding literal block
                    flushLiteralBlock();
                    blockStart = NO_MATCH;
                }
                flushBackReference(matchLength);
                insertStringsInMatch(matchLength);
                lookahead -= matchLength;
                currentPosition += matchLength;
                blockStart = currentPosition;
            } else {
                // no match, append to current or start a new literal
                lookahead--;
                currentPosition++;
                if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
                    flushLiteralBlock();
                    blockStart = currentPosition;
                }
            }
        }
    }

    /**
     * Inserts the current three byte sequence into the dictionary and
     * returns the previous head of the hash-chain.
     *
     * <p>Updates <code>insertHash</code> and <code>prev</code> as a
     * side effect.</p>
     *
     * <p>Reads <code>window[pos + 2]</code>, i.e. the caller must ensure
     * at least three bytes starting at <code>pos</code> are present.</p>
     */
    private int insertString(int pos) {
        insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
        int hashHead = head[insertHash];
        prev[pos & wMask] = hashHead;
        head[insertHash] = pos;
        return hashHead;
    }

    // lazy matching: peeks at the match starting one byte later and
    // keeps whichever of the two matches is longer
    private int longestMatchForNextPosition(final int prevMatchLength) {
        // save a bunch of values to restore them if the next match isn't better than the current one
        final int prevMatchStart = matchStart;
        final int prevInsertHash = insertHash;

        lookahead--;
        currentPosition++;
        int hashHead = insertString(currentPosition);
        // prev[currentPosition & wMask] was just set by insertString to the
        // old chain head, i.e. the value head[insertHash] held before
        final int prevHashHead = prev[currentPosition & wMask];
        int matchLength = longestMatch(hashHead);

        if (matchLength <= prevMatchLength) {
            // use the first match, as the next one isn't any better
            matchLength = prevMatchLength;
            matchStart = prevMatchStart;

            // restore modified values: undo the head update of the
            // speculative insertString and step the cursor back
            // (the prev[] slot stays dirty but is rewritten when this
            // position is inserted again via missed-insert catch-up)
            head[insertHash] = prevHashHead;
            insertHash = prevInsertHash;
            currentPosition--;
            lookahead++;
        }
        return matchLength;
    }

    private void insertStringsInMatch(int matchLength) {
        // inserts strings contained in current match
        // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
        final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
        // currentPosition has been inserted already
        for (int i = 1; i <= stop; i++) {
            insertString(currentPosition + i);
        }
        // remember how many trailing positions couldn't be hashed yet
        missedInserts = matchLength - stop - 1;
    }

    // performs the insertString calls deferred by insertStringsInMatch
    // now that enough lookahead bytes are available
    private void catchUpMissedInserts() {
        while (missedInserts > 0) {
            insertString(currentPosition - missedInserts--);
        }
    }

    private void flushBackReference(int matchLength) throws IOException {
        callback.accept(new BackReference(currentPosition - matchStart, matchLength));
    }

    // emits window[blockStart, currentPosition) as a literal block;
    // hands out a live view of the window, not a copy
    private void flushLiteralBlock() throws IOException {
        callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
    }

    /**
     * Searches the hash chain for real matches and returns the length
     * of the longest match (0 if none were found) that isn't too far
     * away (with regard to maxOffset).
     *
     * <p>Sets matchStart to the index of the start position of the
     * longest match as a side effect.</p>
     */
    private int longestMatch(int matchHead) {
        final int minLength = params.getMinBackReferenceLength();
        // start below minLength so only matches of at least minLength win
        int longestMatchLength = minLength - 1;
        final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
        // candidates closer to the window start than this are too far away
        final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
        final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
        final int maxCandidates = params.getMaxCandidates();
        for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
            int currentLength = 0;
            for (int i = 0; i < maxPossibleLength; i++) {
                if (window[matchHead + i] != window[currentPosition + i]) {
                    break;
                }
                currentLength++;
            }
            if (currentLength > longestMatchLength) {
                longestMatchLength = currentLength;
                matchStart = matchHead;
                if (currentLength >= niceBackReferenceLength) {
                    // no need to search any further
                    break;
                }
            }
            // follow the chain to the next-older position with the same hash
            matchHead = prev[matchHead & wMask];
        }
        return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
    }
}