aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/apache/commons/compress/compressors/snappy/FramedSnappyCompressorInputStream.java
blob: f6dc30c0aae59f7fb86f9a0b550538554ff2552d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.snappy;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.util.Arrays;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.BoundedInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.CountingInputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.compress.utils.InputStreamStatistics;

/**
 * CompressorInputStream for the framing Snappy format.
 *
 * <p>Based on the "spec" in the version "Last revised: 2013-10-25"</p>
 *
 * @see <a href="https://github.com/google/snappy/blob/master/framing_format.txt">Snappy framing format description</a>
 * @since 1.7
 */
public class FramedSnappyCompressorInputStream extends CompressorInputStream
    implements InputStreamStatistics {

    /**
     * package private for tests only.
     */
    static final long MASK_OFFSET = 0xa282ead8L;

    private static final int STREAM_IDENTIFIER_TYPE = 0xff;
    static final int COMPRESSED_CHUNK_TYPE = 0;
    private static final int UNCOMPRESSED_CHUNK_TYPE = 1;
    private static final int PADDING_CHUNK_TYPE = 0xfe;
    private static final int MIN_UNSKIPPABLE_TYPE = 2;
    private static final int MAX_UNSKIPPABLE_TYPE = 0x7f;
    private static final int MAX_SKIPPABLE_TYPE = 0xfd;

    // used by FramedSnappyCompressorOutputStream as well
    static final byte[] SZ_SIGNATURE = new byte[] { //NOSONAR
        (byte) STREAM_IDENTIFIER_TYPE, // tag
        6, 0, 0, // length
        's', 'N', 'a', 'P', 'p', 'Y'
    };

    private long unreadBytes;
    private final CountingInputStream countingStream;

    /** The underlying stream to read compressed data from */
    private final PushbackInputStream in;

    /** The dialect to expect */
    private final FramedSnappyDialect dialect;

    private SnappyCompressorInputStream currentCompressedChunk;

    // used in no-arg read method
    private final byte[] oneByte = new byte[1];

    private boolean endReached, inUncompressedChunk;

    private int uncompressedBytesRemaining;
    private long expectedChecksum = -1;
    private final int blockSize;
    private final PureJavaCrc32C checksum = new PureJavaCrc32C();

    private final ByteUtils.ByteSupplier supplier = new ByteUtils.ByteSupplier() {
        @Override
        public int getAsByte() throws IOException {
            return readOneByte();
        }
    };

    /**
     * Constructs a new input stream that decompresses
     * snappy-framed-compressed data from the specified input stream
     * using the {@link FramedSnappyDialect#STANDARD} dialect.
     * @param in  the InputStream from which to read the compressed data
     * @throws IOException if reading fails
     */
    public FramedSnappyCompressorInputStream(final InputStream in) throws IOException {
        this(in, FramedSnappyDialect.STANDARD);
    }

    /**
     * Constructs a new input stream that decompresses snappy-framed-compressed data
     * from the specified input stream.
     * @param in  the InputStream from which to read the compressed data
     * @param dialect the dialect used by the compressed stream
     * @throws IOException if reading fails
     */
    public FramedSnappyCompressorInputStream(final InputStream in,
                                             final FramedSnappyDialect dialect)
        throws IOException {
        this(in, SnappyCompressorInputStream.DEFAULT_BLOCK_SIZE, dialect);
    }

    /**
     * Constructs a new input stream that decompresses snappy-framed-compressed data
     * from the specified input stream.
     * @param in  the InputStream from which to read the compressed data
     * @param blockSize the block size to use for the compressed stream
     * @param dialect the dialect used by the compressed stream
     * @throws IOException if reading fails
     * @since 1.14
     */
    public FramedSnappyCompressorInputStream(final InputStream in,
                                             final int blockSize,
                                             final FramedSnappyDialect dialect)
        throws IOException {
        countingStream = new CountingInputStream(in);
        this.in = new PushbackInputStream(countingStream, 1);
        this.blockSize = blockSize;
        this.dialect = dialect;
        if (dialect.hasStreamIdentifier()) {
            readStreamIdentifier();
        }
    }

    /** {@inheritDoc} */
    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /** {@inheritDoc} */
    @Override
    public void close() throws IOException {
        try {
            if (currentCompressedChunk != null) {
                currentCompressedChunk.close();
                currentCompressedChunk = null;
            }
        } finally {
            in.close();
        }
    }

    /** {@inheritDoc} */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        int read = readOnce(b, off, len);
        if (read == -1) {
            readNextBlock();
            if (endReached) {
                return -1;
            }
            read = readOnce(b, off, len);
        }
        return read;
    }

    /** {@inheritDoc} */
    @Override
    public int available() throws IOException {
        if (inUncompressedChunk) {
            return Math.min(uncompressedBytesRemaining,
                            in.available());
        } else if (currentCompressedChunk != null) {
            return currentCompressedChunk.available();
        }
        return 0;
    }

    /**
     * @since 1.17
     */
    @Override
    public long getCompressedCount() {
        return countingStream.getBytesRead() - unreadBytes;
    }

    /**
     * Read from the current chunk into the given array.
     *
     * @return -1 if there is no current chunk or the number of bytes
     * read from the current chunk (which may be -1 if the end of the
     * chunk is reached).
     */
    private int readOnce(final byte[] b, final int off, final int len) throws IOException {
        int read = -1;
        if (inUncompressedChunk) {
            final int amount = Math.min(uncompressedBytesRemaining, len);
            if (amount == 0) {
                return -1;
            }
            read = in.read(b, off, amount);
            if (read != -1) {
                uncompressedBytesRemaining -= read;
                count(read);
            }
        } else if (currentCompressedChunk != null) {
            final long before = currentCompressedChunk.getBytesRead();
            read = currentCompressedChunk.read(b, off, len);
            if (read == -1) {
                currentCompressedChunk.close();
                currentCompressedChunk = null;
            } else {
                count(currentCompressedChunk.getBytesRead() - before);
            }
        }
        if (read > 0) {
            checksum.update(b, off, read);
        }
        return read;
    }

    private void readNextBlock() throws IOException {
        verifyLastChecksumAndReset();
        inUncompressedChunk = false;
        final int type = readOneByte();
        if (type == -1) {
            endReached = true;
        } else if (type == STREAM_IDENTIFIER_TYPE) {
            in.unread(type);
            unreadBytes++;
            pushedBackBytes(1);
            readStreamIdentifier();
            readNextBlock();
        } else if (type == PADDING_CHUNK_TYPE
                   || (type > MAX_UNSKIPPABLE_TYPE && type <= MAX_SKIPPABLE_TYPE)) {
            skipBlock();
            readNextBlock();
        } else if (type >= MIN_UNSKIPPABLE_TYPE && type <= MAX_UNSKIPPABLE_TYPE) {
            throw new IOException("unskippable chunk with type " + type
                                  + " (hex " + Integer.toHexString(type) + ")"
                                  + " detected.");
        } else if (type == UNCOMPRESSED_CHUNK_TYPE) {
            inUncompressedChunk = true;
            uncompressedBytesRemaining = readSize() - 4 /* CRC */;
            expectedChecksum = unmask(readCrc());
        } else if (type == COMPRESSED_CHUNK_TYPE) {
            final boolean expectChecksum = dialect.usesChecksumWithCompressedChunks();
            final long size = readSize() - (expectChecksum ? 4L : 0L);
            if (expectChecksum) {
                expectedChecksum = unmask(readCrc());
            } else {
                expectedChecksum = -1;
            }
            currentCompressedChunk =
                new SnappyCompressorInputStream(new BoundedInputStream(in, size), blockSize);
            // constructor reads uncompressed size
            count(currentCompressedChunk.getBytesRead());
        } else {
            // impossible as all potential byte values have been covered
            throw new IOException("unknown chunk type " + type
                                  + " detected.");
        }
    }

    private long readCrc() throws IOException {
        final byte[] b = new byte[4];
        final int read = IOUtils.readFully(in, b);
        count(read);
        if (read != 4) {
            throw new IOException("premature end of stream");
        }
        return ByteUtils.fromLittleEndian(b);
    }

    static long unmask(long x) {
        // ugly, maybe we should just have used ints and deal with the
        // overflow
        x -= MASK_OFFSET;
        x &= 0xffffFFFFL;
        return ((x >> 17) | (x << 15)) & 0xffffFFFFL;
    }

    private int readSize() throws IOException {
        return (int) ByteUtils.fromLittleEndian(supplier, 3);
    }

    private void skipBlock() throws IOException {
        final int size = readSize();
        final long read = IOUtils.skip(in, size);
        count(read);
        if (read != size) {
            throw new IOException("premature end of stream");
        }
    }

    private void readStreamIdentifier() throws IOException {
        final byte[] b = new byte[10];
        final int read = IOUtils.readFully(in, b);
        count(read);
        if (10 != read || !matches(b, 10)) {
            throw new IOException("Not a framed Snappy stream");
        }
    }

    private int readOneByte() throws IOException {
        final int b = in.read();
        if (b != -1) {
            count(1);
            return b & 0xFF;
        }
        return -1;
    }

    private void verifyLastChecksumAndReset() throws IOException {
        if (expectedChecksum >= 0 && expectedChecksum != checksum.getValue()) {
            throw new IOException("Checksum verification failed");
        }
        expectedChecksum = -1;
        checksum.reset();
    }

    /**
     * Checks if the signature matches what is expected for a .sz file.
     *
     * <p>.sz files start with a chunk with tag 0xff and content sNaPpY.</p>
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return          true if this is a .sz stream, false otherwise
     */
    public static boolean matches(final byte[] signature, final int length) {

        if (length < SZ_SIGNATURE.length) {
            return false;
        }

        byte[] shortenedSig = signature;
        if (signature.length > SZ_SIGNATURE.length) {
            shortenedSig = new byte[SZ_SIGNATURE.length];
            System.arraycopy(signature, 0, shortenedSig, 0, SZ_SIGNATURE.length);
        }

        return Arrays.equals(shortenedSig, SZ_SIGNATURE);
    }

}