aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/apache/commons/compress/compressors/lz77support/AbstractLZ77CompressorInputStream.java
blob: 8a1371af93d41d9937e1941a176eb2975171b373 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.lz77support;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.CountingInputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.compress.utils.InputStreamStatistics;

/**
 * Encapsulates code common to LZ77 decompressors.
 *
 * <p>Assumes the stream consists of blocks of literal data and
 * back-references (called copies) in any order. Of course the first
 * block must be a literal block for the scheme to work - unless the
 * {@link #prefill prefill} method has been used to provide initial
 * data that is never returned by {@link #read read} but only used for
 * back-references.</p>
 *
 * <p>Subclasses must override the three-arg {@link #read read} method
 * as the no-arg version delegates to it and the default
 * implementation delegates to the no-arg version, leading to infinite
 * mutual recursion and a {@code StackOverflowError} otherwise.</p>
 *
 * <p>The contract for subclasses' {@code read} implementation is:</p>
 * <ul>
 *
 *  <li>keep track of the current state of the stream. Is it inside a
 *  literal block or a back-reference or in-between blocks?</li>
 *
 *  <li>Use {@link #readOneByte} to access the underlying stream
 *  directly.</li>
 *
 *  <li>If a new literal block starts, use {@link #startLiteral} to
 *  tell this class about it and read the literal data using {@link
 *  #readLiteral} until it returns {@code 0}. {@link
 *  #hasMoreDataInBlock} will return {@code false} before the next
 *  call to {@link #readLiteral} would return {@code 0}.</li>
 *
 *  <li>If a new back-reference starts, use {@link #startBackReference} to
 *  tell this class about it and read the literal data using {@link
 *  #readBackReference} until it returns {@code 0}. {@link
 *  #hasMoreDataInBlock} will return {@code false} before the next
 *  call to {@link #readBackReference} would return {@code 0}.</li>
 *
 *  <li>If the end of the stream has been reached, return {@code -1}
 *  as this class' methods will never do so themselves.</li>
 *
 * </ul>
 *
 * <p>{@link #readOneByte} and {@link #readLiteral} update the counter
 * for bytes read.</p>
 *
 * @since 1.14
 */
public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream
    implements InputStreamStatistics {

    /** Size of the window - must be bigger than the biggest offset expected. */
    private final int windowSize;

    /**
     * Buffer to write decompressed bytes to for back-references, will
     * be three times windowSize big.
     *
     * <p>Three times so we can slide the whole buffer a windowSize to
     * the left once we've read twice windowSize and still have enough
     * data inside of it to satisfy back-references.</p>
     */
    private final byte[] buf;

    /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */
    private int writeIndex;

    /** Index of the next byte to be read. */
    private int readIndex;

    /** The underlying stream to read compressed data from */
    private final CountingInputStream in;

    /** Number of bytes still to be read from the current literal or back-reference. */
    private long bytesRemaining;

    /** Offset of the current back-reference. */
    private int backReferenceOffset;

    /** uncompressed size */
    private int size = 0;

    // used in no-arg read method
    private final byte[] oneByte = new byte[1];

    /**
     * Supplier that delegates to {@link #readOneByte}.
     */
    protected final ByteUtils.ByteSupplier supplier = new ByteUtils.ByteSupplier() {
        @Override
        public int getAsByte() throws IOException {
            return readOneByte();
        }
    };

    /**
     * Creates a new LZ77 input stream.
     *
     * @param is
     *            An InputStream to read compressed data from
     * @param windowSize
     *            Size of the window kept for back-references, must be bigger than the biggest offset expected.
     *
     * @throws IOException if reading fails
     */
    public AbstractLZ77CompressorInputStream(final InputStream is, int windowSize) throws IOException {
        this.in = new CountingInputStream(is);
        this.windowSize = windowSize;
        buf = new byte[3 * windowSize];
        writeIndex = readIndex = 0;
        bytesRemaining = 0;
    }

    /** {@inheritDoc} */
    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /** {@inheritDoc} */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /** {@inheritDoc} */
    @Override
    public int available() {
        return writeIndex - readIndex;
    }

    /**
     * Get the uncompressed size of the stream
     *
     * @return the uncompressed size
     */
    public int getSize() {
        return size;
    }

    /**
     * Adds some initial data to fill the window with.
     *
     * <p>This is used if the stream has been cut into blocks and
     * back-references of one block may refer to data of the previous
     * block(s). One such example is the LZ4 frame format using block
     * dependency.</p>
     *
     * @param data the data to fill the window with.
     * @throws IllegalStateException if the stream has already started to read data
     */
    public void prefill(byte[] data) {
        if (writeIndex != 0) {
            throw new IllegalStateException("the stream has already been read from, can't prefill anymore");
        }
        // we don't need more data than the big offset could refer to, so cap it
        int len = Math.min(windowSize, data.length);
        // we need the last data as we are dealing with *back*-references
        System.arraycopy(data, data.length - len, buf, 0, len);
        writeIndex += len;
        readIndex += len;
    }

    /**
     * @since 1.17
     */
    @Override
    public long getCompressedCount() {
        return in.getBytesRead();
    }

    /**
     * Used by subclasses to signal the next block contains the given
     * amount of literal data.
     * @param length the length of the block
     */
    protected final void startLiteral(long length) {
        bytesRemaining = length;
    }

    /**
     * Is there still data remaining inside the current block?
     * @return true if there is still data remaining inside the current block.
     */
    protected final boolean hasMoreDataInBlock() {
        return bytesRemaining > 0;
    }

    /**
     * Reads data from the current literal block.
     * @param b buffer to write data to
     * @param off offset to start writing to
     * @param len maximum amount of data to read
     * @return number of bytes read, may be 0. Will never return -1 as
     * EOF-detection is the responsibility of the subclass
     * @throws IOException if the underlying stream throws or signals
     * an EOF before the amount of data promised for the block have
     * been read
     */
    protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException {
        final int avail = available();
        if (len > avail) {
            tryToReadLiteral(len - avail);
        }
        return readFromBuffer(b, off, len);
    }

    private void tryToReadLiteral(int bytesToRead) throws IOException {
        // min of "what is still inside the literal", "what does the user want" and "how muc can fit into the buffer"
        final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining),
                                             buf.length - writeIndex);
        final int bytesRead = reallyTryToRead > 0
            ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead)
            : 0 /* happens for bytesRemaining == 0 */;
        count(bytesRead);
        if (reallyTryToRead != bytesRead) {
            throw new IOException("Premature end of stream reading literal");
        }
        writeIndex += reallyTryToRead;
        bytesRemaining -= reallyTryToRead;
    }

    private int readFromBuffer(final byte[] b, final int off, final int len) {
        final int readable = Math.min(len, available());
        if (readable > 0) {
            System.arraycopy(buf, readIndex, b, off, readable);
            readIndex += readable;
            if (readIndex > 2 * windowSize) {
                slideBuffer();
            }
        }
        size += readable;
        return readable;
    }

    private void slideBuffer() {
        System.arraycopy(buf, windowSize, buf, 0, windowSize * 2);
        writeIndex -= windowSize;
        readIndex -= windowSize;
    }

    /**
     * Used by subclasses to signal the next block contains a back-reference with the given coordinates.
     * @param offset the offset of the back-reference
     * @param length the length of the back-reference
     */
    protected final void startBackReference(int offset, long length) {
        backReferenceOffset = offset;
        bytesRemaining = length;
    }

    /**
     * Reads data from the current back-reference.
     * @param b buffer to write data to
     * @param off offset to start writing to
     * @param len maximum amount of data to read
     * @return number of bytes read, may be 0. Will never return -1 as
     * EOF-detection is the responsibility of the subclass
     */
    protected final int readBackReference(final byte[] b, final int off, final int len) {
        final int avail = available();
        if (len > avail) {
            tryToCopy(len - avail);
        }
        return readFromBuffer(b, off, len);
    }

    private void tryToCopy(int bytesToCopy) {
        // this will fit into the buffer without sliding and not
        // require more than is available inside the back-reference
        int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining),
                            buf.length - writeIndex);
        if (copy == 0) {
            // NOP
        } else if (backReferenceOffset == 1) { // pretty common special case
            final byte last = buf[writeIndex - 1];
            Arrays.fill(buf, writeIndex, writeIndex + copy, last);
            writeIndex += copy;
        } else if (copy < backReferenceOffset) {
            System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy);
            writeIndex += copy;
        } else {
            // back-reference overlaps with the bytes created from it
            // like go back two bytes and then copy six (by copying
            // the last two bytes three time).
            final int fullRots = copy / backReferenceOffset;
            for (int i = 0; i < fullRots; i++) {
                System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset);
                writeIndex += backReferenceOffset;
            }

            final int pad = copy - (backReferenceOffset * fullRots);
            if (pad > 0) {
                System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad);
                writeIndex += pad;
            }
        }
        bytesRemaining -= copy;
    }

    /**
     * Reads a single byte from the real input stream and ensures the data is accounted for.
     *
     * @return the byte read as value between 0 and 255 or -1 if EOF has been reached.
     * @throws IOException if the underlying stream throws
     */
    protected final int readOneByte() throws IOException {
        final int b = in.read();
        if (b != -1) {
            count(1);
            return b & 0xFF;
        }
        return -1;
    }
}