diff options
Diffstat (limited to 'src/block/decompress.rs')
-rw-r--r-- | src/block/decompress.rs | 544 |
1 files changed, 544 insertions, 0 deletions
//! The block decompression algorithm.
use crate::block::{DecompressError, MINMATCH};
use crate::fastcpy_unsafe;
use crate::sink::SliceSink;
use crate::sink::{PtrSink, Sink};
use alloc::vec::Vec;

/// Copies data to output_ptr by self-referential copy from start and match_length
///
/// # Safety
/// `start` must point at least `offset` bytes behind `*output_ptr` inside the same
/// output buffer, and the caller must guarantee (or have bounds-checked) that
/// `match_length` bytes fit between `*output_ptr` and `output_end`.
#[inline]
unsafe fn duplicate(
    output_ptr: &mut *mut u8,
    output_end: *mut u8,
    start: *const u8,
    match_length: usize,
) {
    // We cannot simply use memcpy or `extend_from_slice`, because these do not allow
    // self-referential copies: http://ticki.github.io/img/lz4_runs_encoding_diagram.svg

    // Considering that `wild_copy_match_16` can copy up to `16 - 1` extra bytes.
    // Defer to `duplicate_overlapping` in case of an overlapping match
    // OR if the wild copy would copy beyond the end of the output.
    if (output_ptr.offset_from(start) as usize) < match_length + 16 - 1
        || (output_end.offset_from(*output_ptr) as usize) < match_length + 16 - 1
    {
        duplicate_overlapping(output_ptr, start, match_length);
    } else {
        debug_assert!(
            output_ptr.add(match_length / 16 * 16 + ((match_length % 16) != 0) as usize * 16)
                <= output_end
        );
        wild_copy_from_src_16(start, *output_ptr, match_length);
        *output_ptr = output_ptr.add(match_length);
    }
}

/// Copies `num_items` bytes from `source` to `dst_ptr` in 16-byte steps,
/// intentionally over-copying by up to 15 bytes past `num_items`.
/// Callers must therefore reserve that slack in the destination.
#[inline]
fn wild_copy_from_src_16(mut source: *const u8, mut dst_ptr: *mut u8, num_items: usize) {
    // Note: if the compiler auto-vectorizes this it'll hurt performance!
    // It's not the case for 16 bytes stepsize, but for 8 bytes.
    unsafe {
        let dst_ptr_end = dst_ptr.add(num_items);
        loop {
            core::ptr::copy_nonoverlapping(source, dst_ptr, 16);
            source = source.add(16);
            dst_ptr = dst_ptr.add(16);
            if dst_ptr >= dst_ptr_end {
                break;
            }
        }
    }
}

/// Copy function, if the data start + match_length overlaps into output_ptr
///
/// # Safety
/// `start` and `*output_ptr` must belong to the same allocation and
/// `*output_ptr + match_length` must stay within it (byte-at-a-time copy,
/// so overlap between source and destination is allowed).
#[inline]
#[cfg_attr(nightly, optimize(size))] // to avoid loop unrolling
unsafe fn duplicate_overlapping(
    output_ptr: &mut *mut u8,
    mut start: *const u8,
    match_length: usize,
) {
    // There is an edge case when output_ptr == start, which causes the decoder to potentially
    // expose up to match_length bytes of uninitialized data in the decompression buffer.
    // To prevent that we write a dummy zero to output, which will zero out output in such cases.
    // This is the same strategy used by the reference C implementation https://github.com/lz4/lz4/pull/772
    output_ptr.write(0u8);
    let dst_ptr_end = output_ptr.add(match_length);

    // Copy two bytes per iteration (manual 2x unroll); the tail byte, if any,
    // is handled after the loop.
    while output_ptr.add(1) < dst_ptr_end {
        // Note that this loop unrolling is done, so that the compiler doesn't do it in an awful
        // way.
        // Without that the compiler will unroll/auto-vectorize the copy with a lot of branches.
        // This is not what we want, as large overlapping copies are not that common.
        core::ptr::copy(start, *output_ptr, 1);
        start = start.add(1);
        *output_ptr = output_ptr.add(1);

        core::ptr::copy(start, *output_ptr, 1);
        start = start.add(1);
        *output_ptr = output_ptr.add(1);
    }

    if *output_ptr < dst_ptr_end {
        core::ptr::copy(start, *output_ptr, 1);
        *output_ptr = output_ptr.add(1);
    }
}

/// Copies the dictionary-resident prefix of a match into the output.
/// Returns the number of bytes copied from `ext_dict` (the match may continue
/// in the output buffer itself when it crosses the dict/output boundary).
///
/// # Safety
/// The caller must ensure `offset > output_ptr - output_base` (i.e. the match
/// starts inside `ext_dict`) and that the output has room for the copied bytes.
#[inline]
unsafe fn copy_from_dict(
    output_base: *mut u8,
    output_ptr: &mut *mut u8,
    ext_dict: &[u8],
    offset: usize,
    match_length: usize,
) -> usize {
    // If we're here we know offset > output pos, so we have at least 1 byte to copy from dict
    debug_assert!(output_ptr.offset_from(output_base) >= 0);
    debug_assert!(offset > output_ptr.offset_from(output_base) as usize);
    // If unchecked-decode is not disabled we also know that the offset falls within ext_dict
    debug_assert!(ext_dict.len() + output_ptr.offset_from(output_base) as usize >= offset);

    let dict_offset = ext_dict.len() + output_ptr.offset_from(output_base) as usize - offset;
    // Can't copy past ext_dict len, the match may cross dict and output
    let dict_match_length = match_length.min(ext_dict.len() - dict_offset);
    // TODO test fastcpy_unsafe
    core::ptr::copy_nonoverlapping(
        ext_dict.as_ptr().add(dict_offset),
        *output_ptr,
        dict_match_length,
    );
    *output_ptr = output_ptr.add(dict_match_length);
    dict_match_length
}

/// Read an integer.
///
/// In LZ4, we encode small integers in a way that we can have an arbitrary number of bytes. In
/// particular, we add the bytes repeatedly until we hit a non-0xFF byte. When we do, we add
/// this byte to our sum and terminate the loop.
///
/// # Example
///
/// ```notest
/// 255, 255, 255, 4, 2, 3, 4, 6, 7
/// ```
///
/// is encoded to _255 + 255 + 255 + 4 = 769_. The bytes after the first 4 are ignored, because
/// 4 is the first non-0xFF byte.
#[inline]
fn read_integer_ptr(
    input_ptr: &mut *const u8,
    _input_ptr_end: *const u8,
) -> Result<u32, DecompressError> {
    // We start at zero and count upwards.
    let mut n: u32 = 0;
    // If this byte takes value 255 (the maximum value it can take), another byte is read
    // and added to the sum. This repeats until a byte lower than 255 is read.
    loop {
        // We add the next byte until we get a byte which we add to the counting variable.

        // Bounds check only when the `unchecked-decode` feature is off; in the
        // unchecked build `_input_ptr_end` is intentionally unused.
        #[cfg(not(feature = "unchecked-decode"))]
        {
            if *input_ptr >= _input_ptr_end {
                return Err(DecompressError::ExpectedAnotherByte);
            }
        }
        let extra = unsafe { input_ptr.read() };
        *input_ptr = unsafe { input_ptr.add(1) };
        n += extra as u32;

        // We continue if we got 255, break otherwise.
        if extra != 0xFF {
            break;
        }
    }

    // 255, 255, 255, 8
    // 111, 111, 111, 101

    Ok(n)
}

/// Read a little-endian 16-bit integer from the input stream.
/// Advances `input_ptr` by 2; the caller must have checked that 2 bytes remain.
#[inline]
fn read_u16_ptr(input_ptr: &mut *const u8) -> u16 {
    let mut num: u16 = 0;
    unsafe {
        core::ptr::copy_nonoverlapping(*input_ptr, &mut num as *mut u16 as *mut u8, 2);
        *input_ptr = input_ptr.add(2);
    }

    u16::from_le(num)
}

// A token "fits" when neither nibble is saturated (15), i.e. no extension
// bytes follow for either the literal length or the match length.
const FIT_TOKEN_MASK_LITERAL: u8 = 0b00001111;
const FIT_TOKEN_MASK_MATCH: u8 = 0b11110000;

#[test]
fn check_token() {
    assert!(!does_token_fit(15));
    assert!(does_token_fit(14));
    assert!(does_token_fit(114));
    assert!(!does_token_fit(0b11110000));
    assert!(does_token_fit(0b10110000));
}

/// The token consists of two parts, the literal length (upper 4 bits) and match_length (lower 4
/// bits) if the literal length and match_length are both below 15, we don't need to read additional
/// data, so the token does fit the metadata in a single u8.
#[inline]
fn does_token_fit(token: u8) -> bool {
    !((token & FIT_TOKEN_MASK_LITERAL) == FIT_TOKEN_MASK_LITERAL
        || (token & FIT_TOKEN_MASK_MATCH) == FIT_TOKEN_MASK_MATCH)
}

/// Decompress all bytes of `input` into `output`.
///
/// Returns the number of bytes written (decompressed) into `output`.
#[inline]
pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
    input: &[u8],
    output: &mut S,
    ext_dict: &[u8],
) -> Result<usize, DecompressError> {
    // Prevent segfault for empty input
    if input.is_empty() {
        return Err(DecompressError::ExpectedAnotherByte);
    }

    let ext_dict = if USE_DICT {
        ext_dict
    } else {
        // ensure optimizer knows ext_dict length is 0 if !USE_DICT
        debug_assert!(ext_dict.is_empty());
        &[]
    };
    let output_base = unsafe { output.base_mut_ptr() };
    let output_end = unsafe { output_base.add(output.capacity()) };
    let output_start_pos_ptr = unsafe { output.base_mut_ptr().add(output.pos()) as *mut u8 };
    let mut output_ptr = output_start_pos_ptr;

    let mut input_ptr = input.as_ptr();
    let input_ptr_end = unsafe { input.as_ptr().add(input.len()) };
    // Input pointers at or before `input_ptr_safe` are guaranteed to have enough
    // bytes left for the fast path's unchecked reads.
    let safe_distance_from_end = (16 /* literal copy */ + 2 /* u16 match offset */ + 1 /* The next token to read (we can skip the check) */).min(input.len());
    let input_ptr_safe = unsafe { input_ptr_end.sub(safe_distance_from_end) };

    let safe_output_ptr = unsafe {
        let mut output_num_safe_bytes = output
            .capacity()
            .saturating_sub(16 /* literal copy */ + 18 /* match copy */);
        if USE_DICT {
            // In the dictionary case the output pointer is moved by the match length in the dictionary.
            // This may be up to 17 bytes without exiting the loop. So we need to ensure that we have
            // at least additional 17 bytes of space left in the output buffer in the fast loop.
            output_num_safe_bytes = output_num_safe_bytes.saturating_sub(17);
        };

        output_base.add(output_num_safe_bytes)
    };

    // Exhaust the decoder by reading and decompressing all blocks until the remaining buffer is
    // empty.
    loop {
        // Read the token. The token is the first byte in a block. It is divided into two 4-bit
        // subtokens, the higher and the lower.
        // This token contains two 4-bit "fields", a higher and a lower, representing the literals'
        // length and the back reference's length, respectively.
        let token = unsafe { input_ptr.read() };
        input_ptr = unsafe { input_ptr.add(1) };

        // Checking for hot-loop.
        // In most cases the metadata does fit in a single 1byte token (statistically) and we are in
        // a safe-distance to the end. This enables some optimized handling.
        //
        // Ideally we want to check for safe output pos like: output.pos() <= safe_output_pos; But
        // that doesn't work when the safe_output_ptr is == output_ptr due to insufficient
        // capacity. So we use `<` instead of `<=`, which covers that case.
        if does_token_fit(token)
            && (input_ptr as usize) <= input_ptr_safe as usize
            && output_ptr < safe_output_ptr
        {
            let literal_length = (token >> 4) as usize;
            let mut match_length = MINMATCH + (token & 0xF) as usize;

            // output_ptr <= safe_output_ptr should guarantee we have enough space in output
            debug_assert!(
                unsafe { output_ptr.add(literal_length + match_length) } <= output_end,
                "{literal_length} + {match_length} {} wont fit ",
                literal_length + match_length
            );

            // Copy the literal
            // The literal is at max 16 bytes, and the is_safe_distance check assures
            // that we are far away enough from the end so we can safely copy 16 bytes
            unsafe {
                core::ptr::copy_nonoverlapping(input_ptr, output_ptr, 16);
                input_ptr = input_ptr.add(literal_length);
                output_ptr = output_ptr.add(literal_length);
            }

            // input_ptr <= input_ptr_safe should guarantee we have enough space in input
            debug_assert!(input_ptr_end as usize - input_ptr as usize >= 2);
            let offset = read_u16_ptr(&mut input_ptr) as usize;

            let output_len = unsafe { output_ptr.offset_from(output_base) as usize };
            // Clamp offset so the (checked-by-construction) pointer math below
            // cannot go out of bounds even for malformed input.
            let offset = offset.min(output_len + ext_dict.len());

            // Check if part of the match is in the external dict
            if USE_DICT && offset > output_len {
                let copied = unsafe {
                    copy_from_dict(output_base, &mut output_ptr, ext_dict, offset, match_length)
                };
                if copied == match_length {
                    continue;
                }
                // match crosses ext_dict and output
                match_length -= copied;
            }

            // Calculate the start of this duplicate segment. At this point offset was already
            // checked to be in bounds and the external dictionary copy, if any, was
            // already copied and subtracted from match_length.
            let start_ptr = unsafe { output_ptr.sub(offset) };
            debug_assert!(start_ptr >= output_base);
            debug_assert!(start_ptr < output_end);
            debug_assert!(unsafe { output_end.offset_from(start_ptr) as usize } >= match_length);

            // In this branch we know that match_length is at most 18 (14 + MINMATCH).
            // But the blocks can overlap, so make sure they are at least 18 bytes apart
            // to enable an optimized copy of 18 bytes.
            if offset >= match_length {
                unsafe {
                    // _copy_, not copy_nonoverlapping, as it may overlap.
                    // Compiles to the same assembly on x86_64.
                    core::ptr::copy(start_ptr, output_ptr, 18);
                    output_ptr = output_ptr.add(match_length);
                }
            } else {
                unsafe {
                    duplicate_overlapping(&mut output_ptr, start_ptr, match_length);
                }
            }

            continue;
        }

        // Now, we read the literals section.
        // Literal Section
        // If the initial value is 15, it is indicated that another byte will be read and added to
        // it
        let mut literal_length = (token >> 4) as usize;
        if literal_length != 0 {
            if literal_length == 15 {
                // The literal_length took the maximal value, indicating that there is more
                // than 15 literal_length bytes. We read the extra integer.
                literal_length += read_integer_ptr(&mut input_ptr, input_ptr_end)? as usize;
            }

            #[cfg(not(feature = "unchecked-decode"))]
            {
                // Check if literal is out of bounds for the input, and if there is enough space on
                // the output
                if literal_length > input_ptr_end as usize - input_ptr as usize {
                    return Err(DecompressError::LiteralOutOfBounds);
                }
                if literal_length > unsafe { output_end.offset_from(output_ptr) as usize } {
                    return Err(DecompressError::OutputTooSmall {
                        expected: unsafe { output_ptr.offset_from(output_base) as usize }
                            + literal_length,
                        actual: output.capacity(),
                    });
                }
            }
            unsafe {
                fastcpy_unsafe::slice_copy(input_ptr, output_ptr, literal_length);
                output_ptr = output_ptr.add(literal_length);
                input_ptr = input_ptr.add(literal_length);
            }
        }

        // If the input stream is emptied, we break out of the loop. This is only the case
        // in the end of the stream, since the block is intact otherwise.
        if input_ptr >= input_ptr_end {
            break;
        }

        // Read duplicate section
        #[cfg(not(feature = "unchecked-decode"))]
        {
            if (input_ptr_end as usize) - (input_ptr as usize) < 2 {
                return Err(DecompressError::ExpectedAnotherByte);
            }
        }
        let offset = read_u16_ptr(&mut input_ptr) as usize;
        // Obtain the initial match length. The match length is the length of the duplicate segment
        // which will later be copied from data previously decompressed into the output buffer. The
        // initial length is derived from the second part of the token (the lower nibble), we read
        // earlier. Since having a match length of less than 4 would mean negative compression
        // ratio, we start at 4 (MINMATCH).

        // The initial match length can maximally be 19 (MINMATCH + 15). As with the literal length,
        // this indicates that there are more bytes to read.
        let mut match_length = MINMATCH + (token & 0xF) as usize;
        if match_length == MINMATCH + 15 {
            // The match length took the maximal value, indicating that there are more bytes. We
            // read the extra integer.
            match_length += read_integer_ptr(&mut input_ptr, input_ptr_end)? as usize;
        }

        // We now copy from the already decompressed buffer. This allows us for storing duplicates
        // by simply referencing the other location.
        let output_len = unsafe { output_ptr.offset_from(output_base) as usize };

        // We'll do a bounds check except unchecked-decode is enabled.
        #[cfg(not(feature = "unchecked-decode"))]
        {
            if offset > output_len + ext_dict.len() {
                return Err(DecompressError::OffsetOutOfBounds);
            }
            if match_length > unsafe { output_end.offset_from(output_ptr) as usize } {
                return Err(DecompressError::OutputTooSmall {
                    expected: output_len + match_length,
                    actual: output.capacity(),
                });
            }
        }

        if USE_DICT && offset > output_len {
            let copied = unsafe {
                copy_from_dict(output_base, &mut output_ptr, ext_dict, offset, match_length)
            };
            if copied == match_length {
                #[cfg(not(feature = "unchecked-decode"))]
                {
                    if input_ptr >= input_ptr_end {
                        return Err(DecompressError::ExpectedAnotherByte);
                    }
                }

                continue;
            }
            // match crosses ext_dict and output
            match_length -= copied;
        }

        // Calculate the start of this duplicate segment. At this point offset was already checked
        // to be in bounds and the external dictionary copy, if any, was already copied and
        // subtracted from match_length.
        let start_ptr = unsafe { output_ptr.sub(offset) };
        debug_assert!(start_ptr >= output_base);
        debug_assert!(start_ptr < output_end);
        debug_assert!(unsafe { output_end.offset_from(start_ptr) as usize } >= match_length);
        unsafe {
            duplicate(&mut output_ptr, output_end, start_ptr, match_length);
        }
        #[cfg(not(feature = "unchecked-decode"))]
        {
            if input_ptr >= input_ptr_end {
                return Err(DecompressError::ExpectedAnotherByte);
            }
        }
    }
    unsafe {
        output.set_pos(output_ptr.offset_from(output_base) as usize);
        Ok(output_ptr.offset_from(output_start_pos_ptr) as usize)
    }
}

/// Decompress all bytes of `input` into `output`.
/// `output` should be preallocated with the size of the uncompressed data.
#[inline]
pub fn decompress_into(input: &[u8], output: &mut [u8]) -> Result<usize, DecompressError> {
    decompress_internal::<false, _>(input, &mut SliceSink::new(output, 0), b"")
}

/// Decompress all bytes of `input` into `output`.
///
/// Returns the number of bytes written (decompressed) into `output`.
#[inline]
pub fn decompress_into_with_dict(
    input: &[u8],
    output: &mut [u8],
    ext_dict: &[u8],
) -> Result<usize, DecompressError> {
    decompress_internal::<true, _>(input, &mut SliceSink::new(output, 0), ext_dict)
}

/// Decompress all bytes of `input` into a new vec.
/// The passed parameter `min_uncompressed_size` needs to be equal or larger than the uncompressed size.
///
/// # Panics
/// May panic if the parameter `min_uncompressed_size` is smaller than the
/// uncompressed data.

#[inline]
pub fn decompress_with_dict(
    input: &[u8],
    min_uncompressed_size: usize,
    ext_dict: &[u8],
) -> Result<Vec<u8>, DecompressError> {
    // Allocate a vector to contain the decompressed stream.
    let mut vec = Vec::with_capacity(min_uncompressed_size);
    let decomp_len =
        decompress_internal::<true, _>(input, &mut PtrSink::from_vec(&mut vec, 0), ext_dict)?;
    unsafe {
        // SAFETY(review note): relies on decompress_internal having initialized
        // exactly `decomp_len` bytes of the vec's spare capacity.
        vec.set_len(decomp_len);
    }
    Ok(vec)
}

/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in
/// little endian. Can be used in conjunction with `compress_prepend_size`
#[inline]
pub fn decompress_size_prepended(input: &[u8]) -> Result<Vec<u8>, DecompressError> {
    let (uncompressed_size, input) = super::uncompressed_size(input)?;
    decompress(input, uncompressed_size)
}

/// Decompress all bytes of `input` into a new vec.
/// The passed parameter `min_uncompressed_size` needs to be equal or larger than the uncompressed size.
///
/// # Panics
/// May panic if the parameter `min_uncompressed_size` is smaller than the
/// uncompressed data.
#[inline]
pub fn decompress(input: &[u8], min_uncompressed_size: usize) -> Result<Vec<u8>, DecompressError> {
    // Allocate a vector to contain the decompressed stream.
    let mut vec = Vec::with_capacity(min_uncompressed_size);
    let decomp_len =
        decompress_internal::<true, _>(input, &mut PtrSink::from_vec(&mut vec, 0), b"")?;
    unsafe {
        // SAFETY(review note): relies on decompress_internal having initialized
        // exactly `decomp_len` bytes of the vec's spare capacity.
        vec.set_len(decomp_len);
    }
    Ok(vec)
}

/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in
/// little endian. Can be used in conjunction with `compress_prepend_size_with_dict`
#[inline]
pub fn decompress_size_prepended_with_dict(
    input: &[u8],
    ext_dict: &[u8],
) -> Result<Vec<u8>, DecompressError> {
    let (uncompressed_size, input) = super::uncompressed_size(input)?;
    decompress_with_dict(input, uncompressed_size, ext_dict)
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn all_literal() {
        assert_eq!(decompress(&[0x30, b'a', b'4', b'9'], 3).unwrap(), b"a49");
    }

    // this error test is only valid with checked-decode.
    #[cfg(not(feature = "unchecked-decode"))]
    #[test]
    fn offset_oob() {
        decompress(&[0x10, b'a', 2, 0], 4).unwrap_err();
        decompress(&[0x40, b'a', 1, 0], 4).unwrap_err();
    }
}