Diffstat (limited to 'src/block')
-rw-r--r--  src/block/compress.rs         969
-rw-r--r--  src/block/decompress.rs       544
-rw-r--r--  src/block/decompress_safe.rs  399
-rw-r--r--  src/block/hashtable.rs        161
-rw-r--r--  src/block/mod.rs              157
5 files changed, 2230 insertions, 0 deletions
diff --git a/src/block/compress.rs b/src/block/compress.rs
new file mode 100644
index 0000000..c18474a
--- /dev/null
+++ b/src/block/compress.rs
@@ -0,0 +1,969 @@
+//! The compression algorithm.
+//!
+//! We make use of hash tables to find duplicates. This gives a reasonable compression ratio with
+//! high performance. It has fixed memory usage, which, contrary to other approaches, makes it
+//! less memory hungry.
+
+use crate::block::hashtable::HashTable;
+use crate::block::END_OFFSET;
+use crate::block::LZ4_MIN_LENGTH;
+use crate::block::MAX_DISTANCE;
+use crate::block::MFLIMIT;
+use crate::block::MINMATCH;
+#[cfg(not(feature = "safe-encode"))]
+use crate::sink::PtrSink;
+use crate::sink::Sink;
+use crate::sink::SliceSink;
+#[allow(unused_imports)]
+use alloc::vec;
+use alloc::vec::Vec;
+
+#[cfg(feature = "safe-encode")]
+use core::convert::TryInto;
+
+use super::hashtable::HashTable4K;
+use super::hashtable::HashTable4KU16;
+use super::{CompressError, WINDOW_SIZE};
+
+/// Increase step size after 1<<INCREASE_STEPSIZE_BITSHIFT non-matches
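+/// (i.e. the search step starts at 1 and grows by one for every additional
+/// 1 << INCREASE_STEPSIZE_BITSHIFT (32) probes that fail to find a match, so long
+/// incompressible regions are skipped over progressively faster).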
+const INCREASE_STEPSIZE_BITSHIFT: usize = 5;
+
+/// Read a 4-byte "batch" from some position.
+///
+/// This will read a native-endian 4-byte integer from some position.
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+pub(super) fn get_batch(input: &[u8], n: usize) -> u32 {
+ unsafe { read_u32_ptr(input.as_ptr().add(n)) }
+}
+
+#[inline]
+#[cfg(feature = "safe-encode")]
+pub(super) fn get_batch(input: &[u8], n: usize) -> u32 {
+ u32::from_ne_bytes(input[n..n + 4].try_into().unwrap())
+}
+
+/// Read a usize-sized "batch" from some position.
+///
+/// This will read a native-endian usize from some position.
+#[inline]
+#[allow(dead_code)]
+#[cfg(not(feature = "safe-encode"))]
+pub(super) fn get_batch_arch(input: &[u8], n: usize) -> usize {
+ unsafe { read_usize_ptr(input.as_ptr().add(n)) }
+}
+
+#[inline]
+#[allow(dead_code)]
+#[cfg(feature = "safe-encode")]
+pub(super) fn get_batch_arch(input: &[u8], n: usize) -> usize {
+ const USIZE_SIZE: usize = core::mem::size_of::<usize>();
+ let arr: &[u8; USIZE_SIZE] = input[n..n + USIZE_SIZE].try_into().unwrap();
+ usize::from_ne_bytes(*arr)
+}
+
+#[inline]
+fn token_from_literal(lit_len: usize) -> u8 {
+ if lit_len < 0xF {
+ // Since we can fit the literals length into it, there is no need for saturation.
+ (lit_len as u8) << 4
+ } else {
+ // We were unable to fit the literals into it, so we saturate to 0xF. We will later
+        // write the extension value.
+ 0xF0
+ }
+}
+
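+/// Builds the token byte from the literal length (upper nibble) and the duplicate length
+/// (lower nibble), saturating each nibble at 0xF when the value doesn't fit.
+///
+/// Worked example based on the logic below: `lit_len = 3`, `duplicate_length = 7` yields
+/// `0x37`; `lit_len = 20`, `duplicate_length = 7` yields `0xF7`, and the extra literal length
+/// is then written separately via `write_integer`.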
+#[inline]
+fn token_from_literal_and_match_length(lit_len: usize, duplicate_length: usize) -> u8 {
+ let mut token = if lit_len < 0xF {
+ // Since we can fit the literals length into it, there is no need for saturation.
+ (lit_len as u8) << 4
+ } else {
+ // We were unable to fit the literals into it, so we saturate to 0xF. We will later
+        // write the extension value.
+ 0xF0
+ };
+
+ token |= if duplicate_length < 0xF {
+ // We could fit it in.
+ duplicate_length as u8
+ } else {
+ // We were unable to fit it in, so we default to 0xF, which will later be extended.
+ 0xF
+ };
+
+ token
+}
+
+/// Counts the number of identical bytes in two byte streams.
+/// `input` is the complete input.
+/// `cur` is the current position in the input; it will be incremented by the number of matched
+/// bytes.
+/// `source` is either the same as `input` or an external slice.
+/// `candidate` is the candidate position in `source`.
+///
+/// The function ignores the last END_OFFSET bytes of `input`, as those should be literals.
+#[inline]
+#[cfg(feature = "safe-encode")]
+fn count_same_bytes(input: &[u8], cur: &mut usize, source: &[u8], candidate: usize) -> usize {
+ const USIZE_SIZE: usize = core::mem::size_of::<usize>();
+ let cur_slice = &input[*cur..input.len() - END_OFFSET];
+ let cand_slice = &source[candidate..];
+
+ let mut num = 0;
+ for (block1, block2) in cur_slice
+ .chunks_exact(USIZE_SIZE)
+ .zip(cand_slice.chunks_exact(USIZE_SIZE))
+ {
+ let input_block = usize::from_ne_bytes(block1.try_into().unwrap());
+ let match_block = usize::from_ne_bytes(block2.try_into().unwrap());
+
+ if input_block == match_block {
+ num += USIZE_SIZE;
+ } else {
+ let diff = input_block ^ match_block;
+ num += (diff.to_le().trailing_zeros() / 8) as usize;
+ *cur += num;
+ return num;
+ }
+ }
+
+    // If we're here we may have 1 to 7 bytes left to check close to the end of the input
+    // or source slices. Since this is a rare occurrence we mark it cold, which gives
+    // ~5% better performance.
+ #[cold]
+ fn count_same_bytes_tail(a: &[u8], b: &[u8], offset: usize) -> usize {
+ a.iter()
+ .zip(b)
+ .skip(offset)
+ .take_while(|(a, b)| a == b)
+ .count()
+ }
+ num += count_same_bytes_tail(cur_slice, cand_slice, num);
+
+ *cur += num;
+ num
+}
+
+/// Counts the number of identical bytes in two byte streams.
+/// `input` is the complete input.
+/// `cur` is the current position in the input; it will be incremented by the number of matched
+/// bytes.
+/// `source` is either the same as `input` OR an external slice.
+/// `candidate` is the candidate position in `source`.
+///
+/// The function ignores the last END_OFFSET bytes of `input`, as those should be literals.
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn count_same_bytes(input: &[u8], cur: &mut usize, source: &[u8], candidate: usize) -> usize {
+ let max_input_match = input.len().saturating_sub(*cur + END_OFFSET);
+ let max_candidate_match = source.len() - candidate;
+ // Considering both limits calc how far we may match in input.
+ let input_end = *cur + max_input_match.min(max_candidate_match);
+
+ let start = *cur;
+ let mut source_ptr = unsafe { source.as_ptr().add(candidate) };
+
+ // compare 4/8 bytes blocks depending on the arch
+ const STEP_SIZE: usize = core::mem::size_of::<usize>();
+ while *cur + STEP_SIZE <= input_end {
+ let diff = read_usize_ptr(unsafe { input.as_ptr().add(*cur) }) ^ read_usize_ptr(source_ptr);
+
+ if diff == 0 {
+ *cur += STEP_SIZE;
+ unsafe {
+ source_ptr = source_ptr.add(STEP_SIZE);
+ }
+ } else {
+ *cur += (diff.to_le().trailing_zeros() / 8) as usize;
+ return *cur - start;
+ }
+ }
+
+ // compare 4 bytes block
+ #[cfg(target_pointer_width = "64")]
+ {
+ if input_end - *cur >= 4 {
+ let diff = read_u32_ptr(unsafe { input.as_ptr().add(*cur) }) ^ read_u32_ptr(source_ptr);
+
+ if diff == 0 {
+ *cur += 4;
+ unsafe {
+ source_ptr = source_ptr.add(4);
+ }
+ } else {
+ *cur += (diff.to_le().trailing_zeros() / 8) as usize;
+ return *cur - start;
+ }
+ }
+ }
+
+ // compare 2 bytes block
+ if input_end - *cur >= 2
+ && unsafe { read_u16_ptr(input.as_ptr().add(*cur)) == read_u16_ptr(source_ptr) }
+ {
+ *cur += 2;
+ unsafe {
+ source_ptr = source_ptr.add(2);
+ }
+ }
+
+ if *cur < input_end
+ && unsafe { input.as_ptr().add(*cur).read() } == unsafe { source_ptr.read() }
+ {
+ *cur += 1;
+ }
+
+ *cur - start
+}
+
+/// Write an integer to the output.
+///
+/// Each additional byte represents a value from 0 to 255, which is added to the previous value
+/// to produce a total length. When the byte value is 255, another byte must be read and added,
+/// and so on. There can be any number of bytes of value "255" following the token.
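+///
+/// Worked example of the scheme above: `n = 280` is written as the two bytes `[0xFF, 0x19]`
+/// (255 + 25 = 280), while `n = 15` is written as the single byte `[0x0F]`.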
+#[inline]
+#[cfg(feature = "safe-encode")]
+fn write_integer(output: &mut impl Sink, mut n: usize) {
+ // Note: Since `n` is usually < 0xFF and writing multiple bytes to the output
+ // requires 2 branches of bound check (due to the possibility of add overflows)
+ // the simple byte at a time implementation below is faster in most cases.
+ while n >= 0xFF {
+ n -= 0xFF;
+ push_byte(output, 0xFF);
+ }
+ push_byte(output, n as u8);
+}
+
+/// Write an integer to the output.
+///
+/// Each additional byte represents a value from 0 to 255, which is added to the previous value
+/// to produce a total length. When the byte value is 255, another byte must be read and added,
+/// and so on. There can be any number of bytes of value "255" following the token.
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn write_integer(output: &mut impl Sink, mut n: usize) {
+ // Write the 0xFF bytes as long as the integer is higher than said value.
+ if n >= 4 * 0xFF {
+        // In this unlikely branch we use a fill instead of a loop,
+ // otherwise rustc may output a large unrolled/vectorized loop.
+ let bulk = n / (4 * 0xFF);
+ n %= 4 * 0xFF;
+ unsafe {
+ core::ptr::write_bytes(output.pos_mut_ptr(), 0xFF, 4 * bulk);
+ output.set_pos(output.pos() + 4 * bulk);
+ }
+ }
+
+ // Handle last 1 to 4 bytes
+ push_u32(output, 0xFFFFFFFF);
+ // Updating output len for the remainder
+ unsafe {
+ output.set_pos(output.pos() - 4 + 1 + n / 255);
+ // Write the remaining byte.
+ *output.pos_mut_ptr().sub(1) = (n % 255) as u8;
+ }
+}
+
+/// Handle the last bytes from the input as literals
+#[cold]
+fn handle_last_literals(output: &mut impl Sink, input: &[u8], start: usize) {
+ let lit_len = input.len() - start;
+
+ let token = token_from_literal(lit_len);
+ push_byte(output, token);
+ if lit_len >= 0xF {
+ write_integer(output, lit_len - 0xF);
+ }
+ // Now, write the actual literals.
+ output.extend_from_slice(&input[start..]);
+}
+
+/// Moves the cursors back as long as the bytes match, to find additional bytes in a duplicate
+#[inline]
+#[cfg(feature = "safe-encode")]
+fn backtrack_match(
+ input: &[u8],
+ cur: &mut usize,
+ literal_start: usize,
+ source: &[u8],
+ candidate: &mut usize,
+) {
+    // Note: Even if the iterator version of this loop has fewer branches inside the loop, it has
+    // more branches before the loop. In practice that seems to make it slower than the while
+    // version below. TODO: It should be possible to remove all bounds checks, since we are
+    // walking backwards.
+ while *candidate > 0 && *cur > literal_start && input[*cur - 1] == source[*candidate - 1] {
+ *cur -= 1;
+ *candidate -= 1;
+ }
+}
+
+/// Moves the cursors back as long as the bytes match, to find additional bytes in a duplicate
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn backtrack_match(
+ input: &[u8],
+ cur: &mut usize,
+ literal_start: usize,
+ source: &[u8],
+ candidate: &mut usize,
+) {
+ while unsafe {
+ *candidate > 0
+ && *cur > literal_start
+ && input.get_unchecked(*cur - 1) == source.get_unchecked(*candidate - 1)
+ } {
+ *cur -= 1;
+ *candidate -= 1;
+ }
+}
+
+/// Compress all bytes of `input[input_pos..]` into `output`.
+///
+/// Bytes in `input[..input_pos]` are treated as a preamble and can be used for lookback.
+/// This part is known as the compressor "prefix".
+/// Bytes in `ext_dict` logically precede the bytes in `input` and can also be used for lookback.
+///
+/// `input_stream_offset` is the logical position of the first byte of `input`. This allows same
+/// `dict` to be used for many calls to `compress_internal` as we can "readdress" the first byte of
+/// `input` to be something other than 0.
+///
+/// `dict` is the dictionary of previously encoded sequences.
+///
+/// This is used to find duplicates in the stream so they are not written multiple times.
+///
+/// Every four bytes are hashed, and in the resulting slot their position in the input buffer
+/// is placed in the dict. This way we can easily look up a candidate for back references.
+///
+/// Returns the number of bytes written (compressed) into `output`.
+///
+/// # Const parameters
+/// `USE_DICT`: When false, disables usage of `ext_dict` (the function will panic if a non-empty
+/// slice is passed). In other words, this generates more optimized code when an external
+/// dictionary isn't used.
+///
+/// A similar const argument could be used to disable the Prefix mode (e.g. USE_PREFIX),
+/// which would impose `input_pos == 0 && input_stream_offset == 0`. Experiments didn't
+/// show significant improvement though.
+// Intentionally avoid inlining.
+// Empirical tests revealed it to be rarely better but often significantly detrimental.
+#[inline(never)]
+pub(crate) fn compress_internal<T: HashTable, const USE_DICT: bool, S: Sink>(
+ input: &[u8],
+ input_pos: usize,
+ output: &mut S,
+ dict: &mut T,
+ ext_dict: &[u8],
+ input_stream_offset: usize,
+) -> Result<usize, CompressError> {
+ assert!(input_pos <= input.len());
+ if USE_DICT {
+ assert!(ext_dict.len() <= super::WINDOW_SIZE);
+ assert!(ext_dict.len() <= input_stream_offset);
+ // Check for overflow hazard when using ext_dict
+ assert!(input_stream_offset
+ .checked_add(input.len())
+ .and_then(|i| i.checked_add(ext_dict.len()))
+ .map_or(false, |i| i <= isize::MAX as usize));
+ } else {
+ assert!(ext_dict.is_empty());
+ }
+ if output.capacity() - output.pos() < get_maximum_output_size(input.len() - input_pos) {
+ return Err(CompressError::OutputTooSmall);
+ }
+
+ let output_start_pos = output.pos();
+ if input.len() - input_pos < LZ4_MIN_LENGTH {
+ handle_last_literals(output, input, input_pos);
+ return Ok(output.pos() - output_start_pos);
+ }
+
+ let ext_dict_stream_offset = input_stream_offset - ext_dict.len();
+ let end_pos_check = input.len() - MFLIMIT;
+ let mut literal_start = input_pos;
+ let mut cur = input_pos;
+
+ if cur == 0 && input_stream_offset == 0 {
+ // According to the spec we can't start with a match,
+ // except when referencing another block.
+ let hash = T::get_hash_at(input, 0);
+ dict.put_at(hash, 0);
+ cur = 1;
+ }
+
+ loop {
+ // Read the next block into two sections, the literals and the duplicates.
+ let mut step_size;
+ let mut candidate;
+ let mut candidate_source;
+ let mut offset;
+ let mut non_match_count = 1 << INCREASE_STEPSIZE_BITSHIFT;
+        // The next position in the input to try as the start of a match.
+ let mut next_cur = cur;
+
+        // In this loop we search for duplicates via the hashtable. 4 or 8 bytes are hashed and
+        // compared.
+ loop {
+ step_size = non_match_count >> INCREASE_STEPSIZE_BITSHIFT;
+ non_match_count += 1;
+
+ cur = next_cur;
+ next_cur += step_size;
+
+ // Same as cur + MFLIMIT > input.len()
+ if cur > end_pos_check {
+ handle_last_literals(output, input, literal_start);
+ return Ok(output.pos() - output_start_pos);
+ }
+ // Find a candidate in the dictionary with the hash of the current four bytes.
+ // Unchecked is safe as long as the values from the hash function don't exceed the size
+ // of the table. This is ensured by right shifting the hash values
+ // (`dict_bitshift`) to fit them in the table
+
+ // [Bounds Check]: Can be elided due to `end_pos_check` above
+ let hash = T::get_hash_at(input, cur);
+ candidate = dict.get_at(hash);
+ dict.put_at(hash, cur + input_stream_offset);
+
+ // Sanity check: Matches can't be ahead of `cur`.
+ debug_assert!(candidate <= input_stream_offset + cur);
+
+            // Two requirements for the candidate exist:
+            // - We should not return a position which is merely a hash collision, so that the
+            //   candidate actually matches what we search for.
+            // - We can address up to a 16-bit offset, hence we are only able to address the
+            //   candidate if its offset is less than or equal to 0xFFFF.
+ if input_stream_offset + cur - candidate > MAX_DISTANCE {
+ continue;
+ }
+
+ if candidate >= input_stream_offset {
+ // match within input
+ offset = (input_stream_offset + cur - candidate) as u16;
+ candidate -= input_stream_offset;
+ candidate_source = input;
+ } else if USE_DICT {
+ // Sanity check, which may fail if we lost history beyond MAX_DISTANCE
+ debug_assert!(
+ candidate >= ext_dict_stream_offset,
+ "Lost history in ext dict mode"
+ );
+ // match within ext dict
+ offset = (input_stream_offset + cur - candidate) as u16;
+ candidate -= ext_dict_stream_offset;
+ candidate_source = ext_dict;
+ } else {
+ // Match is not reachable anymore
+                // e.g. compressing an independent block frame w/o clearing
+ // the matches tables, only increasing input_stream_offset.
+ // Sanity check
+ debug_assert!(input_pos == 0, "Lost history in prefix mode");
+ continue;
+ }
+            // [Bounds Check]: The candidate comes from the hash map. It can't be out of bounds,
+            // but that is impossible for the compiler to prove, so the bounds checks remain.
+ let cand_bytes: u32 = get_batch(candidate_source, candidate);
+ // [Bounds Check]: Should be able to be elided due to `end_pos_check`.
+ let curr_bytes: u32 = get_batch(input, cur);
+
+ if cand_bytes == curr_bytes {
+ break;
+ }
+ }
+
+ // Extend the match backwards if we can
+ backtrack_match(
+ input,
+ &mut cur,
+ literal_start,
+ candidate_source,
+ &mut candidate,
+ );
+
+ // The length (in bytes) of the literals section.
+ let lit_len = cur - literal_start;
+
+ // Generate the higher half of the token.
+ cur += MINMATCH;
+ candidate += MINMATCH;
+ let duplicate_length = count_same_bytes(input, &mut cur, candidate_source, candidate);
+
+ // Note: The `- 2` offset was copied from the reference implementation, it could be
+ // arbitrary.
+ let hash = T::get_hash_at(input, cur - 2);
+ dict.put_at(hash, cur - 2 + input_stream_offset);
+
+ let token = token_from_literal_and_match_length(lit_len, duplicate_length);
+
+ // Push the token to the output stream.
+ push_byte(output, token);
+        // If we were unable to fit the literals length into the token, write the extension
+        // part.
+ if lit_len >= 0xF {
+ write_integer(output, lit_len - 0xF);
+ }
+
+ // Now, write the actual literals.
+ //
+        // The unsafe version copies blocks of 8 bytes, and therefore may copy up to 7 bytes more
+        // than needed. This is safe, because the last 12 bytes (MFLIMIT) are handled in
+        // handle_last_literals.
+ copy_literals_wild(output, input, literal_start, lit_len);
+ // write the offset in little endian.
+ push_u16(output, offset);
+
+        // If we were unable to fit the duplicates length into the token, write the
+        // extension part.
+ if duplicate_length >= 0xF {
+ write_integer(output, duplicate_length - 0xF);
+ }
+ literal_start = cur;
+ }
+}
+
+#[inline]
+#[cfg(feature = "safe-encode")]
+fn push_byte(output: &mut impl Sink, el: u8) {
+ output.push(el);
+}
+
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn push_byte(output: &mut impl Sink, el: u8) {
+ unsafe {
+ core::ptr::write(output.pos_mut_ptr(), el);
+ output.set_pos(output.pos() + 1);
+ }
+}
+
+#[inline]
+#[cfg(feature = "safe-encode")]
+fn push_u16(output: &mut impl Sink, el: u16) {
+ output.extend_from_slice(&el.to_le_bytes());
+}
+
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn push_u16(output: &mut impl Sink, el: u16) {
+ unsafe {
+ core::ptr::copy_nonoverlapping(el.to_le_bytes().as_ptr(), output.pos_mut_ptr(), 2);
+ output.set_pos(output.pos() + 2);
+ }
+}
+
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn push_u32(output: &mut impl Sink, el: u32) {
+ unsafe {
+ core::ptr::copy_nonoverlapping(el.to_le_bytes().as_ptr(), output.pos_mut_ptr(), 4);
+ output.set_pos(output.pos() + 4);
+ }
+}
+
+#[inline(always)] // (always) necessary otherwise compiler fails to inline it
+#[cfg(feature = "safe-encode")]
+fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) {
+ output.extend_from_slice_wild(&input[input_start..input_start + len], len)
+}
+
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) {
+ debug_assert!(input_start + len / 8 * 8 + ((len % 8) != 0) as usize * 8 <= input.len());
+ debug_assert!(output.pos() + len / 8 * 8 + ((len % 8) != 0) as usize * 8 <= output.capacity());
+ unsafe {
+ // Note: This used to be a wild copy loop of 8 bytes, but the compiler consistently
+        // transformed it into a call to memcpy, which hurts performance significantly for
+ // small copies, which are common.
+ let start_ptr = input.as_ptr().add(input_start);
+ match len {
+ 0..=8 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 8),
+ 9..=16 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 16),
+ 17..=24 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 24),
+ _ => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), len),
+ }
+ output.set_pos(output.pos() + len);
+ }
+}
+
+/// Compress all bytes of `input` into `output`.
+/// The method chooses an appropriate hashtable to look up duplicates.
+/// `output` should be preallocated with a capacity of `get_maximum_output_size(input.len())`.
+///
+/// Returns the number of bytes written (compressed) into `output`.
+#[inline]
+pub(crate) fn compress_into_sink_with_dict<const USE_DICT: bool>(
+ input: &[u8],
+ output: &mut impl Sink,
+ mut dict_data: &[u8],
+) -> Result<usize, CompressError> {
+ if dict_data.len() + input.len() < u16::MAX as usize {
+ let mut dict = HashTable4KU16::new();
+ init_dict(&mut dict, &mut dict_data);
+ compress_internal::<_, USE_DICT, _>(input, 0, output, &mut dict, dict_data, dict_data.len())
+ } else {
+ let mut dict = HashTable4K::new();
+ init_dict(&mut dict, &mut dict_data);
+ compress_internal::<_, USE_DICT, _>(input, 0, output, &mut dict, dict_data, dict_data.len())
+ }
+}
+
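+/// Seeds the match table with positions from the dictionary data, so the beginning of `input`
+/// can immediately find matches inside the dictionary. Only the last WINDOW_SIZE bytes of the
+/// dictionary are used, since the 16-bit match offset cannot reach further back.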
+#[inline]
+fn init_dict<T: HashTable>(dict: &mut T, dict_data: &mut &[u8]) {
+ if dict_data.len() > WINDOW_SIZE {
+ *dict_data = &dict_data[dict_data.len() - WINDOW_SIZE..];
+ }
+ let mut i = 0usize;
+ while i + core::mem::size_of::<usize>() <= dict_data.len() {
+ let hash = T::get_hash_at(dict_data, i);
+ dict.put_at(hash, i);
+ // Note: The 3 byte step was copied from the reference implementation, it could be
+ // arbitrary.
+ i += 3;
+ }
+}
+
+/// Returns the maximum output size of the compressed data.
+/// Can be used to preallocate capacity for the output vector.
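+///
+/// A minimal usage sketch with `compress_into` from this module:
+///
+/// ```notest
+/// let input = b"hello hello hello";
+/// let mut output = vec![0u8; get_maximum_output_size(input.len())];
+/// let compressed_len = compress_into(input, &mut output).unwrap();
+/// output.truncate(compressed_len);
+/// ```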
+#[inline]
+pub fn get_maximum_output_size(input_len: usize) -> usize {
+ 16 + 4 + (input_len as f64 * 1.1) as usize
+}
+
+/// Compress all bytes of `input` into `output`.
+/// The method chooses an appropriate hashtable to look up duplicates.
+/// `output` should be preallocated with a capacity of `get_maximum_output_size(input.len())`.
+///
+/// Returns the number of bytes written (compressed) into `output`.
+#[inline]
+pub fn compress_into(input: &[u8], output: &mut [u8]) -> Result<usize, CompressError> {
+ compress_into_sink_with_dict::<false>(input, &mut SliceSink::new(output, 0), b"")
+}
+
+/// Compress all bytes of `input` into `output`.
+/// The method chooses an appropriate hashtable to look up duplicates.
+/// `output` should be preallocated with a capacity of `get_maximum_output_size(input.len())`.
+///
+/// Returns the number of bytes written (compressed) into `output`.
+#[inline]
+pub fn compress_into_with_dict(
+ input: &[u8],
+ output: &mut [u8],
+ dict_data: &[u8],
+) -> Result<usize, CompressError> {
+ compress_into_sink_with_dict::<true>(input, &mut SliceSink::new(output, 0), dict_data)
+}
+
+#[inline]
+fn compress_into_vec_with_dict<const USE_DICT: bool>(
+ input: &[u8],
+ prepend_size: bool,
+ mut dict_data: &[u8],
+) -> Vec<u8> {
+ let prepend_size_num_bytes = if prepend_size { 4 } else { 0 };
+ let max_compressed_size = get_maximum_output_size(input.len()) + prepend_size_num_bytes;
+ if dict_data.len() <= 3 {
+ dict_data = b"";
+ }
+ #[cfg(feature = "safe-encode")]
+ let mut compressed = {
+ let mut compressed: Vec<u8> = vec![0u8; max_compressed_size];
+ let out = if prepend_size {
+ compressed[..4].copy_from_slice(&(input.len() as u32).to_le_bytes());
+ &mut compressed[4..]
+ } else {
+ &mut compressed
+ };
+ let compressed_len =
+ compress_into_sink_with_dict::<USE_DICT>(input, &mut SliceSink::new(out, 0), dict_data)
+ .unwrap();
+
+ compressed.truncate(prepend_size_num_bytes + compressed_len);
+ compressed
+ };
+ #[cfg(not(feature = "safe-encode"))]
+ let mut compressed = {
+ let mut vec = Vec::with_capacity(max_compressed_size);
+ let start_pos = if prepend_size {
+ vec.extend_from_slice(&(input.len() as u32).to_le_bytes());
+ 4
+ } else {
+ 0
+ };
+ let compressed_len = compress_into_sink_with_dict::<USE_DICT>(
+ input,
+ &mut PtrSink::from_vec(&mut vec, start_pos),
+ dict_data,
+ )
+ .unwrap();
+ unsafe {
+ vec.set_len(prepend_size_num_bytes + compressed_len);
+ }
+ vec
+ };
+
+ compressed.shrink_to_fit();
+ compressed
+}
+
+/// Compress all bytes of `input` into `output`. The uncompressed size will be prepended as a little
+/// endian u32. Can be used in conjunction with `decompress_size_prepended`
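+///
+/// A minimal round-trip sketch, pairing with `decompress_size_prepended` from the decompress
+/// module:
+///
+/// ```notest
+/// let data: &[u8] = b"Hello people, what's up?";
+/// let compressed = compress_prepend_size(data);
+/// let decompressed = decompress_size_prepended(&compressed).unwrap();
+/// assert_eq!(decompressed, data);
+/// ```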
+#[inline]
+pub fn compress_prepend_size(input: &[u8]) -> Vec<u8> {
+ compress_into_vec_with_dict::<false>(input, true, b"")
+}
+
+/// Compress all bytes of `input`.
+#[inline]
+pub fn compress(input: &[u8]) -> Vec<u8> {
+ compress_into_vec_with_dict::<false>(input, false, b"")
+}
+
+/// Compress all bytes of `input` with an external dictionary.
+#[inline]
+pub fn compress_with_dict(input: &[u8], ext_dict: &[u8]) -> Vec<u8> {
+ compress_into_vec_with_dict::<true>(input, false, ext_dict)
+}
+
+/// Compress all bytes of `input` into `output`. The uncompressed size will be prepended as a little
+/// endian u32. Can be used in conjunction with `decompress_size_prepended_with_dict`
+#[inline]
+pub fn compress_prepend_size_with_dict(input: &[u8], ext_dict: &[u8]) -> Vec<u8> {
+ compress_into_vec_with_dict::<true>(input, true, ext_dict)
+}
+
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn read_u16_ptr(input: *const u8) -> u16 {
+ let mut num: u16 = 0;
+ unsafe {
+ core::ptr::copy_nonoverlapping(input, &mut num as *mut u16 as *mut u8, 2);
+ }
+ num
+}
+
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn read_u32_ptr(input: *const u8) -> u32 {
+ let mut num: u32 = 0;
+ unsafe {
+ core::ptr::copy_nonoverlapping(input, &mut num as *mut u32 as *mut u8, 4);
+ }
+ num
+}
+
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn read_usize_ptr(input: *const u8) -> usize {
+ let mut num: usize = 0;
+ unsafe {
+ core::ptr::copy_nonoverlapping(
+ input,
+ &mut num as *mut usize as *mut u8,
+ core::mem::size_of::<usize>(),
+ );
+ }
+ num
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_count_same_bytes() {
+ // 8byte aligned block, zeros and ones are added because the end/offset
+ let first: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ];
+ let second: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ ];
+ assert_eq!(count_same_bytes(first, &mut 0, second, 0), 16);
+
+ // 4byte aligned block
+ let first: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0,
+ ];
+ let second: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1,
+ ];
+ assert_eq!(count_same_bytes(first, &mut 0, second, 0), 20);
+
+ // 2byte aligned block
+ let first: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0,
+ ];
+ let second: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1,
+ ];
+ assert_eq!(count_same_bytes(first, &mut 0, second, 0), 22);
+
+ // 1byte aligned block
+ let first: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0,
+ ];
+ let second: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1,
+ ];
+ assert_eq!(count_same_bytes(first, &mut 0, second, 0), 23);
+
+ // 1byte aligned block - last byte different
+ let first: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0,
+ ];
+ let second: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 6, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1,
+ ];
+ assert_eq!(count_same_bytes(first, &mut 0, second, 0), 22);
+
+ // 1byte aligned block
+ let first: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 9, 5, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0,
+ ];
+ let second: &[u8] = &[
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 6, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1,
+ ];
+ assert_eq!(count_same_bytes(first, &mut 0, second, 0), 21);
+
+ for diff_idx in 8..100 {
+ let first: Vec<u8> = (0u8..255).cycle().take(100 + 12).collect();
+ let mut second = first.clone();
+ second[diff_idx] = 255;
+ for start in 0..=diff_idx {
+ let same_bytes = count_same_bytes(&first, &mut start.clone(), &second, start);
+ assert_eq!(same_bytes, diff_idx - start);
+ }
+ }
+ }
+
+ #[test]
+ fn test_bug() {
+ let input: &[u8] = &[
+ 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18,
+ ];
+ let _out = compress(input);
+ }
+
+ #[test]
+ fn test_dict() {
+ let input: &[u8] = &[
+ 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18,
+ ];
+ let dict = input;
+ let compressed = compress_with_dict(input, dict);
+ assert_lt!(compressed.len(), compress(input).len());
+
+ assert!(compressed.len() < compress(input).len());
+ let mut uncompressed = vec![0u8; input.len()];
+ let uncomp_size = crate::block::decompress::decompress_into_with_dict(
+ &compressed,
+ &mut uncompressed,
+ dict,
+ )
+ .unwrap();
+ uncompressed.truncate(uncomp_size);
+ assert_eq!(input, uncompressed);
+ }
+
+ #[test]
+ fn test_dict_no_panic() {
+ let input: &[u8] = &[
+ 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18,
+ ];
+ let dict = &[10, 12, 14];
+ let _compressed = compress_with_dict(input, dict);
+ }
+
+ #[test]
+ fn test_dict_match_crossing() {
+ let input: &[u8] = &[
+ 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18,
+ ];
+ let dict = input;
+ let compressed = compress_with_dict(input, dict);
+ assert_lt!(compressed.len(), compress(input).len());
+
+ let mut uncompressed = vec![0u8; input.len() * 2];
+ // copy first half of the input into output
+ let dict_cutoff = dict.len() / 2;
+ let output_start = dict.len() - dict_cutoff;
+ uncompressed[..output_start].copy_from_slice(&dict[dict_cutoff..]);
+ let uncomp_len = {
+ let mut sink = SliceSink::new(&mut uncompressed[..], output_start);
+ crate::block::decompress::decompress_internal::<true, _>(
+ &compressed,
+ &mut sink,
+ &dict[..dict_cutoff],
+ )
+ .unwrap()
+ };
+ assert_eq!(input.len(), uncomp_len);
+ assert_eq!(
+ input,
+ &uncompressed[output_start..output_start + uncomp_len]
+ );
+ }
+
+ #[test]
+ fn test_conformant_last_block() {
+ // From the spec:
+ // The last match must start at least 12 bytes before the end of block.
+ // The last match is part of the penultimate sequence. It is followed by the last sequence,
+ // which contains only literals. Note that, as a consequence, an independent block <
+ // 13 bytes cannot be compressed, because the match must copy "something",
+ // so it needs at least one prior byte.
+ // When a block can reference data from another block, it can start immediately with a match
+ // and no literal, so a block of 12 bytes can be compressed.
+ let aaas: &[u8] = b"aaaaaaaaaaaaaaa";
+
+ // uncompressible
+ let out = compress(&aaas[..12]);
+ assert_gt!(out.len(), 12);
+ // compressible
+ let out = compress(&aaas[..13]);
+ assert_le!(out.len(), 13);
+ let out = compress(&aaas[..14]);
+ assert_le!(out.len(), 14);
+ let out = compress(&aaas[..15]);
+ assert_le!(out.len(), 15);
+
+ // dict uncompressible
+ let out = compress_with_dict(&aaas[..11], aaas);
+ assert_gt!(out.len(), 11);
+ // compressible
+ let out = compress_with_dict(&aaas[..12], aaas);
+        // According to the spec this _could_ compress, but it doesn't in this lib,
+        // as it aborts compression for any input len < LZ4_MIN_LENGTH
+ assert_gt!(out.len(), 12);
+ let out = compress_with_dict(&aaas[..13], aaas);
+ assert_le!(out.len(), 13);
+ let out = compress_with_dict(&aaas[..14], aaas);
+ assert_le!(out.len(), 14);
+ let out = compress_with_dict(&aaas[..15], aaas);
+ assert_le!(out.len(), 15);
+ }
+
+ #[test]
+ fn test_dict_size() {
+ let dict = vec![b'a'; 1024 * 1024];
+ let input = &b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"[..];
+ let compressed = compress_prepend_size_with_dict(input, &dict);
+ let decompressed =
+ crate::block::decompress_size_prepended_with_dict(&compressed, &dict).unwrap();
+ assert_eq!(decompressed, input);
+ }
+}
diff --git a/src/block/decompress.rs b/src/block/decompress.rs
new file mode 100644
index 0000000..62065a9
--- /dev/null
+++ b/src/block/decompress.rs
@@ -0,0 +1,544 @@
+//! The block decompression algorithm.
+use crate::block::{DecompressError, MINMATCH};
+use crate::fastcpy_unsafe;
+use crate::sink::SliceSink;
+use crate::sink::{PtrSink, Sink};
+use alloc::vec::Vec;
+
+/// Copies data to output_ptr by self-referential copy from start and match_length
+#[inline]
+unsafe fn duplicate(
+ output_ptr: &mut *mut u8,
+ output_end: *mut u8,
+ start: *const u8,
+ match_length: usize,
+) {
+ // We cannot simply use memcpy or `extend_from_slice`, because these do not allow
+ // self-referential copies: http://ticki.github.io/img/lz4_runs_encoding_diagram.svg
+
+    // Considering that `wild_copy_from_src_16` can copy up to `16 - 1` extra bytes,
+    // defer to `duplicate_overlapping` in case of an overlapping match
+    // OR if the wild copy would copy beyond the end of the output.
+ if (output_ptr.offset_from(start) as usize) < match_length + 16 - 1
+ || (output_end.offset_from(*output_ptr) as usize) < match_length + 16 - 1
+ {
+ duplicate_overlapping(output_ptr, start, match_length);
+ } else {
+ debug_assert!(
+ output_ptr.add(match_length / 16 * 16 + ((match_length % 16) != 0) as usize * 16)
+ <= output_end
+ );
+ wild_copy_from_src_16(start, *output_ptr, match_length);
+ *output_ptr = output_ptr.add(match_length);
+ }
+}
+
+#[inline]
+fn wild_copy_from_src_16(mut source: *const u8, mut dst_ptr: *mut u8, num_items: usize) {
+ // Note: if the compiler auto-vectorizes this it'll hurt performance!
+ // It's not the case for 16 bytes stepsize, but for 8 bytes.
+ unsafe {
+ let dst_ptr_end = dst_ptr.add(num_items);
+ loop {
+ core::ptr::copy_nonoverlapping(source, dst_ptr, 16);
+ source = source.add(16);
+ dst_ptr = dst_ptr.add(16);
+ if dst_ptr >= dst_ptr_end {
+ break;
+ }
+ }
+ }
+}
+
+/// Copy function, if the data start + match_length overlaps into output_ptr
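+///
+/// Overlapping copies are how LZ4 expresses runs: for example, an offset of 1 with a match
+/// length of 10 repeats the previous byte 10 times, so the copy must proceed byte by byte
+/// from the front instead of as a single block copy.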
+#[inline]
+#[cfg_attr(nightly, optimize(size))] // to avoid loop unrolling
+unsafe fn duplicate_overlapping(
+ output_ptr: &mut *mut u8,
+ mut start: *const u8,
+ match_length: usize,
+) {
+ // There is an edge case when output_ptr == start, which causes the decoder to potentially
+ // expose up to match_length bytes of uninitialized data in the decompression buffer.
+ // To prevent that we write a dummy zero to output, which will zero out output in such cases.
+ // This is the same strategy used by the reference C implementation https://github.com/lz4/lz4/pull/772
+ output_ptr.write(0u8);
+ let dst_ptr_end = output_ptr.add(match_length);
+
+ while output_ptr.add(1) < dst_ptr_end {
+            // Note that this loop unrolling is done manually, so that the compiler doesn't do it
+            // in an awful way.
+ // Without that the compiler will unroll/auto-vectorize the copy with a lot of branches.
+ // This is not what we want, as large overlapping copies are not that common.
+ core::ptr::copy(start, *output_ptr, 1);
+ start = start.add(1);
+ *output_ptr = output_ptr.add(1);
+
+ core::ptr::copy(start, *output_ptr, 1);
+ start = start.add(1);
+ *output_ptr = output_ptr.add(1);
+ }
+
+ if *output_ptr < dst_ptr_end {
+ core::ptr::copy(start, *output_ptr, 1);
+ *output_ptr = output_ptr.add(1);
+ }
+}
+
+#[inline]
+unsafe fn copy_from_dict(
+ output_base: *mut u8,
+ output_ptr: &mut *mut u8,
+ ext_dict: &[u8],
+ offset: usize,
+ match_length: usize,
+) -> usize {
+ // If we're here we know offset > output pos, so we have at least 1 byte to copy from dict
+ debug_assert!(output_ptr.offset_from(output_base) >= 0);
+ debug_assert!(offset > output_ptr.offset_from(output_base) as usize);
+    // If the unchecked-decode feature is not enabled we also know that the offset falls within ext_dict
+ debug_assert!(ext_dict.len() + output_ptr.offset_from(output_base) as usize >= offset);
+
+ let dict_offset = ext_dict.len() + output_ptr.offset_from(output_base) as usize - offset;
+ // Can't copy past ext_dict len, the match may cross dict and output
+ let dict_match_length = match_length.min(ext_dict.len() - dict_offset);
+ // TODO test fastcpy_unsafe
+ core::ptr::copy_nonoverlapping(
+ ext_dict.as_ptr().add(dict_offset),
+ *output_ptr,
+ dict_match_length,
+ );
+ *output_ptr = output_ptr.add(dict_match_length);
+ dict_match_length
+}
+
+/// Read an integer.
+///
+/// In LZ4, we encode small integers in a way that we can have an arbitrary number of bytes. In
+/// particular, we add the bytes repeatedly until we hit a non-0xFF byte. When we do, we add
+/// this byte to our sum and terminate the loop.
+///
+/// # Example
+///
+/// ```notest
+/// 255, 255, 255, 4, 2, 3, 4, 6, 7
+/// ```
+///
+/// is encoded to _255 + 255 + 255 + 4 = 769_. The bytes after the first 4 are ignored, because
+/// 4 is the first non-0xFF byte.
+#[inline]
+fn read_integer_ptr(
+ input_ptr: &mut *const u8,
+ _input_ptr_end: *const u8,
+) -> Result<u32, DecompressError> {
+ // We start at zero and count upwards.
+ let mut n: u32 = 0;
+ // If this byte takes value 255 (the maximum value it can take), another byte is read
+ // and added to the sum. This repeats until a byte lower than 255 is read.
+ loop {
+ // We add the next byte until we get a byte which we add to the counting variable.
+
+ #[cfg(not(feature = "unchecked-decode"))]
+ {
+ if *input_ptr >= _input_ptr_end {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+ }
+ let extra = unsafe { input_ptr.read() };
+ *input_ptr = unsafe { input_ptr.add(1) };
+ n += extra as u32;
+
+ // We continue if we got 255, break otherwise.
+ if extra != 0xFF {
+ break;
+ }
+ }
+
+ // 255, 255, 255, 8
+ // 111, 111, 111, 101
+
+ Ok(n)
+}
+
+/// Read a little-endian 16-bit integer from the input stream.
+#[inline]
+fn read_u16_ptr(input_ptr: &mut *const u8) -> u16 {
+ let mut num: u16 = 0;
+ unsafe {
+ core::ptr::copy_nonoverlapping(*input_ptr, &mut num as *mut u16 as *mut u8, 2);
+ *input_ptr = input_ptr.add(2);
+ }
+
+ u16::from_le(num)
+}
+
+const FIT_TOKEN_MASK_LITERAL: u8 = 0b00001111;
+const FIT_TOKEN_MASK_MATCH: u8 = 0b11110000;
+
+#[test]
+fn check_token() {
+ assert!(!does_token_fit(15));
+ assert!(does_token_fit(14));
+ assert!(does_token_fit(114));
+ assert!(!does_token_fit(0b11110000));
+ assert!(does_token_fit(0b10110000));
+}
+
+/// The token consists of two parts: the literal length (upper 4 bits) and the match length
+/// (lower 4 bits). If the literal length and match length are both below 15, we don't need to
+/// read additional data, so the token fits the metadata in a single u8.
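+/// For example, token `0x37` fits (literal length 3, match length 7 + MINMATCH), while `0x3F`
+/// and `0xF7` do not, since one of the nibbles is saturated and extension bytes follow.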
+#[inline]
+fn does_token_fit(token: u8) -> bool {
+ !((token & FIT_TOKEN_MASK_LITERAL) == FIT_TOKEN_MASK_LITERAL
+ || (token & FIT_TOKEN_MASK_MATCH) == FIT_TOKEN_MASK_MATCH)
+}
+
+/// Decompress all bytes of `input` into `output`.
+///
+/// Returns the number of bytes written (decompressed) into `output`.
+#[inline]
+pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
+ input: &[u8],
+ output: &mut S,
+ ext_dict: &[u8],
+) -> Result<usize, DecompressError> {
+ // Prevent segfault for empty input
+ if input.is_empty() {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+
+ let ext_dict = if USE_DICT {
+ ext_dict
+ } else {
+ // ensure optimizer knows ext_dict length is 0 if !USE_DICT
+ debug_assert!(ext_dict.is_empty());
+ &[]
+ };
+ let output_base = unsafe { output.base_mut_ptr() };
+ let output_end = unsafe { output_base.add(output.capacity()) };
+ let output_start_pos_ptr = unsafe { output.base_mut_ptr().add(output.pos()) as *mut u8 };
+ let mut output_ptr = output_start_pos_ptr;
+
+ let mut input_ptr = input.as_ptr();
+ let input_ptr_end = unsafe { input.as_ptr().add(input.len()) };
+    let safe_distance_from_end = (16 /* literal copy */ + 2 /* u16 match offset */ + 1 /* the next token to read (we can skip the check) */).min(input.len());
+ let input_ptr_safe = unsafe { input_ptr_end.sub(safe_distance_from_end) };
+
+ let safe_output_ptr = unsafe {
+ let mut output_num_safe_bytes = output
+ .capacity()
+ .saturating_sub(16 /* literal copy */ + 18 /* match copy */);
+ if USE_DICT {
+ // In the dictionary case the output pointer is moved by the match length in the dictionary.
+ // This may be up to 17 bytes without exiting the loop. So we need to ensure that we have
+ // at least additional 17 bytes of space left in the output buffer in the fast loop.
+ output_num_safe_bytes = output_num_safe_bytes.saturating_sub(17);
+ };
+
+ output_base.add(output_num_safe_bytes)
+ };
+
+ // Exhaust the decoder by reading and decompressing all blocks until the remaining buffer is
+ // empty.
+ loop {
+        // Read the token. The token is the first byte in a block. It is divided into two 4-bit
+        // subtokens, the higher and the lower, representing the literals' length and the back
+        // reference's length, respectively.
+ let token = unsafe { input_ptr.read() };
+ input_ptr = unsafe { input_ptr.add(1) };
+
+        // Check for the hot loop.
+        // In most cases the metadata does fit in a single 1-byte token (statistically) and we are
+        // at a safe distance from the end. This enables some optimized handling.
+        //
+        // Ideally we would check the safe output position like: output.pos() <= safe_output_pos;
+        // but that doesn't work when safe_output_ptr == output_ptr due to insufficient capacity.
+        // So we use `<` instead of `<=`, which covers that case.
+ if does_token_fit(token)
+ && (input_ptr as usize) <= input_ptr_safe as usize
+ && output_ptr < safe_output_ptr
+ {
+ let literal_length = (token >> 4) as usize;
+ let mut match_length = MINMATCH + (token & 0xF) as usize;
+
+ // output_ptr <= safe_output_ptr should guarantee we have enough space in output
+ debug_assert!(
+ unsafe { output_ptr.add(literal_length + match_length) } <= output_end,
+ "{literal_length} + {match_length} {} wont fit ",
+ literal_length + match_length
+ );
+
+ // Copy the literal
+ // The literal is at max 16 bytes, and the is_safe_distance check assures
+ // that we are far away enough from the end so we can safely copy 16 bytes
+ unsafe {
+ core::ptr::copy_nonoverlapping(input_ptr, output_ptr, 16);
+ input_ptr = input_ptr.add(literal_length);
+ output_ptr = output_ptr.add(literal_length);
+ }
+
+ // input_ptr <= input_ptr_safe should guarantee we have enough space in input
+ debug_assert!(input_ptr_end as usize - input_ptr as usize >= 2);
+ let offset = read_u16_ptr(&mut input_ptr) as usize;
+
+ let output_len = unsafe { output_ptr.offset_from(output_base) as usize };
+ let offset = offset.min(output_len + ext_dict.len());
+
+ // Check if part of the match is in the external dict
+ if USE_DICT && offset > output_len {
+ let copied = unsafe {
+ copy_from_dict(output_base, &mut output_ptr, ext_dict, offset, match_length)
+ };
+ if copied == match_length {
+ continue;
+ }
+ // match crosses ext_dict and output
+ match_length -= copied;
+ }
+
+ // Calculate the start of this duplicate segment. At this point offset was already
+ // checked to be in bounds and the external dictionary copy, if any, was
+ // already copied and subtracted from match_length.
+ let start_ptr = unsafe { output_ptr.sub(offset) };
+ debug_assert!(start_ptr >= output_base);
+ debug_assert!(start_ptr < output_end);
+ debug_assert!(unsafe { output_end.offset_from(start_ptr) as usize } >= match_length);
+
+ // In this branch we know that match_length is at most 18 (14 + MINMATCH).
+ // But the blocks can overlap, so make sure they are at least 18 bytes apart
+ // to enable an optimized copy of 18 bytes.
+ if offset >= match_length {
+ unsafe {
+                    // _copy_, not copy_nonoverlapping, as it may overlap.
+                    // Compiles to the same assembly on x86_64.
+ core::ptr::copy(start_ptr, output_ptr, 18);
+ output_ptr = output_ptr.add(match_length);
+ }
+ } else {
+ unsafe {
+ duplicate_overlapping(&mut output_ptr, start_ptr, match_length);
+ }
+ }
+
+ continue;
+ }
+
+ // Now, we read the literals section.
+ // Literal Section
+ // If the initial value is 15, it is indicated that another byte will be read and added to
+ // it
+ let mut literal_length = (token >> 4) as usize;
+ if literal_length != 0 {
+ if literal_length == 15 {
+                // The literal length took the maximal value, indicating that there are more
+                // than 15 literal bytes. We read the extra integer.
+ literal_length += read_integer_ptr(&mut input_ptr, input_ptr_end)? as usize;
+ }
+
+ #[cfg(not(feature = "unchecked-decode"))]
+ {
+ // Check if literal is out of bounds for the input, and if there is enough space on
+ // the output
+ if literal_length > input_ptr_end as usize - input_ptr as usize {
+ return Err(DecompressError::LiteralOutOfBounds);
+ }
+ if literal_length > unsafe { output_end.offset_from(output_ptr) as usize } {
+ return Err(DecompressError::OutputTooSmall {
+ expected: unsafe { output_ptr.offset_from(output_base) as usize }
+ + literal_length,
+ actual: output.capacity(),
+ });
+ }
+ }
+ unsafe {
+ fastcpy_unsafe::slice_copy(input_ptr, output_ptr, literal_length);
+ output_ptr = output_ptr.add(literal_length);
+ input_ptr = input_ptr.add(literal_length);
+ }
+ }
+
+ // If the input stream is emptied, we break out of the loop. This is only the case
+ // in the end of the stream, since the block is intact otherwise.
+ if input_ptr >= input_ptr_end {
+ break;
+ }
+
+ // Read duplicate section
+ #[cfg(not(feature = "unchecked-decode"))]
+ {
+ if (input_ptr_end as usize) - (input_ptr as usize) < 2 {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+ }
+ let offset = read_u16_ptr(&mut input_ptr) as usize;
+ // Obtain the initial match length. The match length is the length of the duplicate segment
+ // which will later be copied from data previously decompressed into the output buffer. The
+ // initial length is derived from the second part of the token (the lower nibble), we read
+ // earlier. Since having a match length of less than 4 would mean negative compression
+ // ratio, we start at 4 (MINMATCH).
+
+ // The initial match length can maximally be 19 (MINMATCH + 15). As with the literal length,
+ // this indicates that there are more bytes to read.
+ let mut match_length = MINMATCH + (token & 0xF) as usize;
+ if match_length == MINMATCH + 15 {
+            // The match length took the maximal value, indicating that there are more bytes. We
+ // read the extra integer.
+ match_length += read_integer_ptr(&mut input_ptr, input_ptr_end)? as usize;
+ }
+
+ // We now copy from the already decompressed buffer. This allows us for storing duplicates
+ // by simply referencing the other location.
+ let output_len = unsafe { output_ptr.offset_from(output_base) as usize };
+
+ // We'll do a bounds check except unchecked-decode is enabled.
+ #[cfg(not(feature = "unchecked-decode"))]
+ {
+ if offset > output_len + ext_dict.len() {
+ return Err(DecompressError::OffsetOutOfBounds);
+ }
+ if match_length > unsafe { output_end.offset_from(output_ptr) as usize } {
+ return Err(DecompressError::OutputTooSmall {
+ expected: output_len + match_length,
+ actual: output.capacity(),
+ });
+ }
+ }
+
+ if USE_DICT && offset > output_len {
+ let copied = unsafe {
+ copy_from_dict(output_base, &mut output_ptr, ext_dict, offset, match_length)
+ };
+ if copied == match_length {
+ #[cfg(not(feature = "unchecked-decode"))]
+ {
+ if input_ptr >= input_ptr_end {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+ }
+
+ continue;
+ }
+ // match crosses ext_dict and output
+ match_length -= copied;
+ }
+
+ // Calculate the start of this duplicate segment. At this point offset was already checked
+ // to be in bounds and the external dictionary copy, if any, was already copied and
+ // subtracted from match_length.
+ let start_ptr = unsafe { output_ptr.sub(offset) };
+ debug_assert!(start_ptr >= output_base);
+ debug_assert!(start_ptr < output_end);
+ debug_assert!(unsafe { output_end.offset_from(start_ptr) as usize } >= match_length);
+ unsafe {
+ duplicate(&mut output_ptr, output_end, start_ptr, match_length);
+ }
+ #[cfg(not(feature = "unchecked-decode"))]
+ {
+ if input_ptr >= input_ptr_end {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+ }
+ }
+ unsafe {
+ output.set_pos(output_ptr.offset_from(output_base) as usize);
+ Ok(output_ptr.offset_from(output_start_pos_ptr) as usize)
+ }
+}
+
+/// Decompress all bytes of `input` into `output`.
+/// `output` should be preallocated with a size of the uncompressed data.
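+///
+/// A minimal sketch, using `compress` from the compress module and assuming the uncompressed
+/// size is known to the caller:
+///
+/// ```notest
+/// let data: &[u8] = b"Hello people, what's up?";
+/// let compressed = compress(data);
+/// let mut output = vec![0u8; data.len()];
+/// let written = decompress_into(&compressed, &mut output).unwrap();
+/// assert_eq!(&output[..written], data);
+/// ```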
+#[inline]
+pub fn decompress_into(input: &[u8], output: &mut [u8]) -> Result<usize, DecompressError> {
+ decompress_internal::<false, _>(input, &mut SliceSink::new(output, 0), b"")
+}
+
+/// Decompress all bytes of `input` into `output`.
+///
+/// Returns the number of bytes written (decompressed) into `output`.
+#[inline]
+pub fn decompress_into_with_dict(
+ input: &[u8],
+ output: &mut [u8],
+ ext_dict: &[u8],
+) -> Result<usize, DecompressError> {
+ decompress_internal::<true, _>(input, &mut SliceSink::new(output, 0), ext_dict)
+}
+
+/// Decompress all bytes of `input` into a new vec.
+/// The passed parameter `min_uncompressed_size` needs to be equal to or larger than the uncompressed size.
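+///
+/// A minimal sketch, pairing with `compress_with_dict` from the compress module:
+///
+/// ```notest
+/// let dict: &[u8] = b"the quick brown fox";
+/// let data: &[u8] = b"the quick brown fox jumps over the lazy dog";
+/// let compressed = compress_with_dict(data, dict);
+/// let decompressed = decompress_with_dict(&compressed, data.len(), dict).unwrap();
+/// assert_eq!(decompressed, data);
+/// ```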
+///
+/// # Panics
+/// May panic if the parameter `min_uncompressed_size` is smaller than the
+/// uncompressed data.
+
+#[inline]
+pub fn decompress_with_dict(
+ input: &[u8],
+ min_uncompressed_size: usize,
+ ext_dict: &[u8],
+) -> Result<Vec<u8>, DecompressError> {
+ // Allocate a vector to contain the decompressed stream.
+ let mut vec = Vec::with_capacity(min_uncompressed_size);
+ let decomp_len =
+ decompress_internal::<true, _>(input, &mut PtrSink::from_vec(&mut vec, 0), ext_dict)?;
+ unsafe {
+ vec.set_len(decomp_len);
+ }
+ Ok(vec)
+}
+
+/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in
+/// little endian. Can be used in conjunction with `compress_prepend_size`
+#[inline]
+pub fn decompress_size_prepended(input: &[u8]) -> Result<Vec<u8>, DecompressError> {
+ let (uncompressed_size, input) = super::uncompressed_size(input)?;
+ decompress(input, uncompressed_size)
+}
+
+/// Decompress all bytes of `input` into a new vec.
+/// The passed parameter `min_uncompressed_size` needs to be equal to or larger than the uncompressed size.
+///
+/// # Panics
+/// May panic if the parameter `min_uncompressed_size` is smaller than the
+/// uncompressed data.
+#[inline]
+pub fn decompress(input: &[u8], min_uncompressed_size: usize) -> Result<Vec<u8>, DecompressError> {
+ // Allocate a vector to contain the decompressed stream.
+ let mut vec = Vec::with_capacity(min_uncompressed_size);
+ let decomp_len =
+ decompress_internal::<true, _>(input, &mut PtrSink::from_vec(&mut vec, 0), b"")?;
+ unsafe {
+ vec.set_len(decomp_len);
+ }
+ Ok(vec)
+}
+
+/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in
+/// little endian. Can be used in conjunction with `compress_prepend_size_with_dict`
+#[inline]
+pub fn decompress_size_prepended_with_dict(
+ input: &[u8],
+ ext_dict: &[u8],
+) -> Result<Vec<u8>, DecompressError> {
+ let (uncompressed_size, input) = super::uncompressed_size(input)?;
+ decompress_with_dict(input, uncompressed_size, ext_dict)
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn all_literal() {
+ assert_eq!(decompress(&[0x30, b'a', b'4', b'9'], 3).unwrap(), b"a49");
+ }
+
+    // this error test is only valid when the unchecked-decode feature is not enabled.
+ #[cfg(not(feature = "unchecked-decode"))]
+ #[test]
+ fn offset_oob() {
+ decompress(&[0x10, b'a', 2, 0], 4).unwrap_err();
+ decompress(&[0x40, b'a', 1, 0], 4).unwrap_err();
+ }
+}
diff --git a/src/block/decompress_safe.rs b/src/block/decompress_safe.rs
new file mode 100644
index 0000000..cebe3cb
--- /dev/null
+++ b/src/block/decompress_safe.rs
@@ -0,0 +1,399 @@
+//! The block decompression algorithm.
+
+use core::convert::TryInto;
+
+use crate::block::DecompressError;
+use crate::block::MINMATCH;
+use crate::sink::Sink;
+use crate::sink::SliceSink;
+use alloc::vec;
+use alloc::vec::Vec;
+
+/// Read an integer.
+///
+/// In LZ4, we encode small integers in a way that we can have an arbitrary number of bytes. In
+/// particular, we add the bytes repeatedly until we hit a non-0xFF byte. When we do, we add
+/// this byte to our sum and terminate the loop.
+///
+/// # Example
+///
+/// ```notest
+/// 255, 255, 255, 4, 2, 3, 4, 6, 7
+/// ```
+///
+/// is encoded to _255 + 255 + 255 + 4 = 769_. The bytes after the first 4 are ignored, because
+/// 4 is the first non-0xFF byte.
+#[inline]
+fn read_integer(input: &[u8], input_pos: &mut usize) -> Result<u32, DecompressError> {
+ // We start at zero and count upwards.
+ let mut n: u32 = 0;
+ // If this byte takes value 255 (the maximum value it can take), another byte is read
+ // and added to the sum. This repeats until a byte lower than 255 is read.
+ loop {
+ // We add the next byte until we get a byte which we add to the counting variable.
+ let extra: u8 = *input
+ .get(*input_pos)
+ .ok_or(DecompressError::ExpectedAnotherByte)?;
+ *input_pos += 1;
+ n += extra as u32;
+
+ // We continue if we got 255, break otherwise.
+ if extra != 0xFF {
+ break;
+ }
+ }
+
+ // 255, 255, 255, 8
+ // 111, 111, 111, 101
+
+ Ok(n)
+}
+
+/// Read a little-endian 16-bit integer from the input stream.
+#[inline]
+fn read_u16(input: &[u8], input_pos: &mut usize) -> Result<u16, DecompressError> {
+ let dst = input
+ .get(*input_pos..*input_pos + 2)
+ .ok_or(DecompressError::ExpectedAnotherByte)?;
+ *input_pos += 2;
+ Ok(u16::from_le_bytes(dst.try_into().unwrap()))
+}
+
+const FIT_TOKEN_MASK_LITERAL: u8 = 0b00001111;
+const FIT_TOKEN_MASK_MATCH: u8 = 0b11110000;
+
+#[test]
+fn check_token() {
+ assert!(!does_token_fit(15));
+ assert!(does_token_fit(14));
+ assert!(does_token_fit(114));
+ assert!(!does_token_fit(0b11110000));
+ assert!(does_token_fit(0b10110000));
+}
+
+/// The token consists of two parts: the literal length (upper 4 bits) and the match length
+/// (lower 4 bits). If the literal length and match length are both below 15, we don't need to
+/// read additional data, so the token fits the metadata.
+#[inline]
+fn does_token_fit(token: u8) -> bool {
+ !((token & FIT_TOKEN_MASK_LITERAL) == FIT_TOKEN_MASK_LITERAL
+ || (token & FIT_TOKEN_MASK_MATCH) == FIT_TOKEN_MASK_MATCH)
+}
+
+/// Decompress all bytes of `input` into `output`.
+///
+/// Returns the number of bytes written (decompressed) into `output`.
+#[inline(always)] // (always) necessary to get the best performance in non LTO builds
+pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
+ input: &[u8],
+ output: &mut S,
+ ext_dict: &[u8],
+) -> Result<usize, DecompressError> {
+ let mut input_pos = 0;
+ let initial_output_pos = output.pos();
+
+ let safe_input_pos = input
+ .len()
+ .saturating_sub(16 /* literal copy */ + 2 /* u16 match offset */);
+ let mut safe_output_pos = output
+ .capacity()
+ .saturating_sub(16 /* literal copy */ + 18 /* match copy */);
+
+ if USE_DICT {
+ // In the dictionary case the output pointer is moved by the match length in the dictionary.
+ // This may be up to 17 bytes without exiting the loop. So we need to ensure that we have
+ // at least additional 17 bytes of space left in the output buffer in the fast loop.
+ safe_output_pos = safe_output_pos.saturating_sub(17);
+ };
+
+ // Exhaust the decoder by reading and decompressing all blocks until the remaining buffer is
+ // empty.
+ loop {
+        // Read the token. The token is the first byte in a block. It is divided into two 4-bit
+        // subtokens, the higher and the lower, representing the literals' length and the back
+        // reference's length, respectively.
+ let token = *input
+ .get(input_pos)
+ .ok_or(DecompressError::ExpectedAnotherByte)?;
+ input_pos += 1;
+
+        // Check for the hot loop.
+        // In most cases the metadata does fit in a single 1-byte token (statistically) and we are
+        // at a safe distance from the end. This enables some optimized handling.
+        //
+        // Ideally we would check the safe output position like: output.pos() <= safe_output_pos;
+        // but that doesn't work when safe_output_pos is 0 due to saturating_sub. So we use
+        // `<` instead of `<=`, which covers that case.
+ if does_token_fit(token) && input_pos <= safe_input_pos && output.pos() < safe_output_pos {
+ let literal_length = (token >> 4) as usize;
+
+            // casting to [u8; 16] doesn't seem to make a difference vs &[u8] (same assembly)
+ let input: &[u8; 16] = input[input_pos..input_pos + 16].try_into().unwrap();
+
+ // Copy the literal
+ // The literal is at max 14 bytes, and the is_safe_distance check assures
+ // that we are far away enough from the end so we can safely copy 16 bytes
+ output.extend_from_slice_wild(input, literal_length);
+ input_pos += literal_length;
+
+ // clone literal_length, as read_u16 advances its position argument and we don't want to
+ // mutate it here
+ let offset = read_u16(input, &mut literal_length.clone())? as usize;
+ input_pos += 2;
+
+ let mut match_length = MINMATCH + (token & 0xF) as usize;
+
+ if USE_DICT && offset > output.pos() {
+ let copied = copy_from_dict(output, ext_dict, offset, match_length)?;
+ if copied == match_length {
+ continue;
+ }
+ // match crosses ext_dict and output, offset is still correct as output pos
+ // increased
+ match_length -= copied;
+ }
+
+ // In this branch we know that match_length is at most 18 (14 + MINMATCH).
+ // But the blocks can overlap, so make sure they are at least 18 bytes apart
+ // to enable an optimized copy of 18 bytes.
+ let start = output.pos().saturating_sub(offset);
+ if offset >= match_length {
+ output.extend_from_within(start, 18, match_length);
+ } else {
+ output.extend_from_within_overlapping(start, match_length)
+ }
+
+ continue;
+ }
+
+ // Now, we read the literal section.
+ // If the initial value is 15, another byte (or more) will be read and added to it.
+ let mut literal_length = (token >> 4) as usize;
+ if literal_length != 0 {
+ if literal_length == 15 {
+ // The literal length took the maximal value, indicating that there are more than
+ // 15 literal bytes. We read the extra integer.
+ literal_length += read_integer(input, &mut input_pos)? as usize;
+ }
+
+ if literal_length > input.len() - input_pos {
+ return Err(DecompressError::LiteralOutOfBounds);
+ }
+ #[cfg(not(feature = "unchecked-decode"))]
+ if literal_length > output.capacity() - output.pos() {
+ return Err(DecompressError::OutputTooSmall {
+ expected: output.pos() + literal_length,
+ actual: output.capacity(),
+ });
+ }
+ output.extend_from_slice(&input[input_pos..input_pos + literal_length]);
+ input_pos += literal_length;
+ }
+
+ // If the input stream is exhausted, we break out of the loop. This only happens at the
+ // end of the stream, since the block is intact otherwise.
+ if input_pos >= input.len() {
+ break;
+ }
+
+ let offset = read_u16(input, &mut input_pos)? as usize;
+ // Obtain the initial match length. The match length is the length of the duplicate segment
+ // which will later be copied from data previously decompressed into the output buffer. The
+ // initial length is derived from the second part of the token (the lower nibble), we read
+ // earlier. Since having a match length of less than 4 would mean negative compression
+ // ratio, we start at 4 (MINMATCH).
+
+ // The initial match length can be at most 19 (MINMATCH + 15). As with the literal length,
+ // the maximal value indicates that there are more bytes to read.
+ let mut match_length = MINMATCH + (token & 0xF) as usize;
+ if match_length == MINMATCH + 15 {
+ // The match length took the maximal value, indicating that there are more bytes. We
+ // read the extra integer.
+ match_length += read_integer(input, &mut input_pos)? as usize;
+ }
+
+ #[cfg(not(feature = "unchecked-decode"))]
+ if output.pos() + match_length > output.capacity() {
+ return Err(DecompressError::OutputTooSmall {
+ expected: output.pos() + match_length,
+ actual: output.capacity(),
+ });
+ }
+ if USE_DICT && offset > output.pos() {
+ let copied = copy_from_dict(output, ext_dict, offset, match_length)?;
+ if copied == match_length {
+ continue;
+ }
+ // match crosses ext_dict and output, offset is still correct as output_len was
+ // increased
+ match_length -= copied;
+ }
+ // We now copy from the already decompressed buffer. This allows storing duplicates
+ // by simply referencing an earlier location.
+ duplicate_slice(output, offset, match_length)?;
+ }
+ Ok(output.pos() - initial_output_pos)
+}
+
+#[inline]
+fn copy_from_dict(
+ output: &mut impl Sink,
+ ext_dict: &[u8],
+ offset: usize,
+ match_length: usize,
+) -> Result<usize, DecompressError> {
+ // If we're here we know offset > output.pos
+ debug_assert!(offset > output.pos());
+ let (dict_offset, did_overflow) = ext_dict.len().overflowing_sub(offset - output.pos());
+ if did_overflow {
+ return Err(DecompressError::OffsetOutOfBounds);
+ }
+ // Can't copy past ext_dict len, the match may cross dict and output
+ let dict_match_length = match_length.min(ext_dict.len() - dict_offset);
+ let ext_match = &ext_dict[dict_offset..dict_offset + dict_match_length];
+ output.extend_from_slice(ext_match);
+ Ok(dict_match_length)
+}
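+
+// Illustrative sketch (not part of the original sources): a hand-crafted block whose first
+// sequence has no literals and a match of length MINMATCH whose offset (4) exceeds the current
+// output position (0), so the whole match is served from the external dictionary at
+// dict_offset = ext_dict.len() - offset = 10 - 4 = 6, i.e. "6789". A literal-only sequence ("Z")
+// closes the block. A real encoder would not emit such a short block; it is only meant to
+// exercise the dictionary path of this decoder.
+#[test]
+fn copy_from_dict_sketch() {
+    let out = decompress_with_dict(&[0x00, 0x04, 0x00, 0x10, b'Z'], 5, b"0123456789").unwrap();
+    assert_eq!(out, b"6789Z");
+}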
+
+/// Extends output by self-referential copies
+#[inline(always)] // (always) necessary otherwise compiler fails to inline it
+fn duplicate_slice(
+ output: &mut impl Sink,
+ offset: usize,
+ match_length: usize,
+) -> Result<(), DecompressError> {
+ // This function assumes the output will fit match_length; it might panic otherwise.
+ if match_length > offset {
+ duplicate_overlapping_slice(output, offset, match_length)?;
+ } else {
+ let (start, did_overflow) = output.pos().overflowing_sub(offset);
+ if did_overflow {
+ return Err(DecompressError::OffsetOutOfBounds);
+ }
+
+ match match_length {
+ 0..=32 if output.pos() + 32 <= output.capacity() => {
+ output.extend_from_within(start, 32, match_length)
+ }
+ 33..=64 if output.pos() + 64 <= output.capacity() => {
+ output.extend_from_within(start, 64, match_length)
+ }
+ _ => output.extend_from_within(start, match_length, match_length),
+ }
+ }
+ Ok(())
+}
+
+/// Self-referential copy for the case where the copy source (current output position - offset)
+/// plus match_length overlaps with the region being written in the output
+#[inline]
+fn duplicate_overlapping_slice(
+ sink: &mut impl Sink,
+ offset: usize,
+ match_length: usize,
+) -> Result<(), DecompressError> {
+ // This function assumes output will fit match_length, it might panic otherwise.
+ let (start, did_overflow) = sink.pos().overflowing_sub(offset);
+ if did_overflow {
+ return Err(DecompressError::OffsetOutOfBounds);
+ }
+ if offset == 1 {
+ let val = sink.byte_at(start);
+ sink.extend_with_fill(val, match_length);
+ } else {
+ sink.extend_from_within_overlapping(start, match_length);
+ }
+ Ok(())
+}
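+
+// Illustrative sketch (not part of the original sources): with offset == 1 the overlapping copy
+// degenerates into run-length expansion of the last byte. The block below is hand-crafted for
+// this decoder (a spec-conforming encoder would not emit such a short block): token 0x12 means
+// 1 literal ("a") and a match of length MINMATCH + 2 = 6 at offset 1, followed by a final
+// literal-only sequence ("x").
+#[test]
+fn offset_one_rle_sketch() {
+    let out = decompress(&[0x12, b'a', 0x01, 0x00, 0x10, b'x'], 8).unwrap();
+    assert_eq!(out, b"aaaaaaax");
+}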
+
+/// Decompress all bytes of `input` into `output`.
+/// `output` should be preallocated with the size of the uncompressed data.
+///
+/// Returns the number of bytes written (decompressed) into `output`.
+#[inline]
+pub fn decompress_into(input: &[u8], output: &mut [u8]) -> Result<usize, DecompressError> {
+ decompress_internal::<false, _>(input, &mut SliceSink::new(output, 0), b"")
+}
+
+/// Decompress all bytes of `input` into `output`.
+///
+/// Returns the number of bytes written (decompressed) into `output`.
+#[inline]
+pub fn decompress_into_with_dict(
+ input: &[u8],
+ output: &mut [u8],
+ ext_dict: &[u8],
+) -> Result<usize, DecompressError> {
+ decompress_internal::<true, _>(input, &mut SliceSink::new(output, 0), ext_dict)
+}
+
+/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in
+/// little endian. Can be used in conjunction with `compress_prepend_size`.
+#[inline]
+pub fn decompress_size_prepended(input: &[u8]) -> Result<Vec<u8>, DecompressError> {
+ let (uncompressed_size, input) = super::uncompressed_size(input)?;
+ decompress(input, uncompressed_size)
+}
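+
+// Minimal sketch (not part of the original sources): the same "a49" literal-only block used in
+// the test module below, prefixed with its uncompressed size (3) as a little-endian u32.
+#[test]
+fn size_prepended_sketch() {
+    let out = decompress_size_prepended(&[3, 0, 0, 0, 0x30, b'a', b'4', b'9']).unwrap();
+    assert_eq!(out, b"a49");
+}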
+
+/// Decompress all bytes of `input` into a new vec.
+/// The passed parameter `min_uncompressed_size` needs to be equal to or larger than the
+/// uncompressed size.
+///
+/// # Panics
+/// May panic if the parameter `min_uncompressed_size` is smaller than the
+/// uncompressed data.
+#[inline]
+pub fn decompress(input: &[u8], min_uncompressed_size: usize) -> Result<Vec<u8>, DecompressError> {
+ let mut decompressed: Vec<u8> = vec![0; min_uncompressed_size];
+ let decomp_len =
+ decompress_internal::<false, _>(input, &mut SliceSink::new(&mut decompressed, 0), b"")?;
+ decompressed.truncate(decomp_len);
+ Ok(decompressed)
+}
+
+/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in
+/// little endian. Can be used in conjunction with `compress_prepend_size_with_dict`
+#[inline]
+pub fn decompress_size_prepended_with_dict(
+ input: &[u8],
+ ext_dict: &[u8],
+) -> Result<Vec<u8>, DecompressError> {
+ let (uncompressed_size, input) = super::uncompressed_size(input)?;
+ decompress_with_dict(input, uncompressed_size, ext_dict)
+}
+
+/// Decompress all bytes of `input` into a new vec.
+/// The passed parameter `min_uncompressed_size` needs to be equal to or larger than the
+/// uncompressed size.
+///
+/// # Panics
+/// May panic if the parameter `min_uncompressed_size` is smaller than the
+/// uncompressed data.
+#[inline]
+pub fn decompress_with_dict(
+ input: &[u8],
+ min_uncompressed_size: usize,
+ ext_dict: &[u8],
+) -> Result<Vec<u8>, DecompressError> {
+ let mut decompressed: Vec<u8> = vec![0; min_uncompressed_size];
+ let decomp_len =
+ decompress_internal::<true, _>(input, &mut SliceSink::new(&mut decompressed, 0), ext_dict)?;
+ decompressed.truncate(decomp_len);
+ Ok(decompressed)
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn all_literal() {
+ assert_eq!(decompress(&[0x30, b'a', b'4', b'9'], 3).unwrap(), b"a49");
+ }
+
+ // this error test is only valid in safe-decode.
+ #[cfg(feature = "safe-decode")]
+ #[test]
+ fn offset_oob() {
+ decompress(&[0x10, b'a', 2, 0], 4).unwrap_err();
+ decompress(&[0x40, b'a', 1, 0], 4).unwrap_err();
+ }
+}
diff --git a/src/block/hashtable.rs b/src/block/hashtable.rs
new file mode 100644
index 0000000..7c29db7
--- /dev/null
+++ b/src/block/hashtable.rs
@@ -0,0 +1,161 @@
+use alloc::boxed::Box;
+use core::convert::TryInto;
+
+/// The HashTable trait used by the compression to map hashed bytes to their position.
+/// `val` can be at most the size of the input in bytes.
+///
+/// `pos` can have a maximum value of u16::MAX, or 65535.
+/// If the hashtable is smaller, it needs to reduce the pos to its space, e.g. by right
+/// shifting.
+///
+/// Duplication dictionary size.
+///
+/// Every four bytes is assigned an entry. When this number is lower, fewer entries exist, and
+/// thus collisions are more likely, hurting the compression ratio.
+
+/// Hashes and right shifts to a maximum value of 16 bits, 65535.
+/// The right shift is done in order to not exceed the hashtable's capacity.
+#[inline]
+fn hash(sequence: u32) -> u32 {
+ (sequence.wrapping_mul(2654435761_u32)) >> 16
+}
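+
+// Small sanity sketch (not part of the original sources): the multiplicative hash above always
+// fits into 16 bits because of the final `>> 16` on a u32.
+#[test]
+fn hash_fits_in_16_bits() {
+    for &seq in &[0u32, 1, 0xDEAD_BEEF, u32::MAX] {
+        assert!(hash(seq) <= u16::MAX as u32);
+    }
+}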
+
+/// Hashes and right shifts to a maximum value of 16 bits, 65535.
+/// The right shift is done in order to not exceed the hashtable's capacity.
+#[cfg(target_pointer_width = "64")]
+#[inline]
+fn hash5(sequence: usize) -> u32 {
+ let primebytes = if cfg!(target_endian = "little") {
+ 889523592379_usize
+ } else {
+ 11400714785074694791_usize
+ };
+ (((sequence << 24).wrapping_mul(primebytes)) >> 48) as u32
+}
+
+pub trait HashTable {
+ fn get_at(&self, pos: usize) -> usize;
+ fn put_at(&mut self, pos: usize, val: usize);
+ fn clear(&mut self);
+ #[inline]
+ #[cfg(target_pointer_width = "64")]
+ fn get_hash_at(input: &[u8], pos: usize) -> usize {
+ hash5(super::compress::get_batch_arch(input, pos)) as usize
+ }
+ #[inline]
+ #[cfg(target_pointer_width = "32")]
+ fn get_hash_at(input: &[u8], pos: usize) -> usize {
+ hash(super::compress::get_batch(input, pos)) as usize
+ }
+}
+
+const HASHTABLE_SIZE_4K: usize = 4 * 1024;
+const HASHTABLE_BIT_SHIFT_4K: usize = 4;
+
+#[derive(Debug)]
+#[repr(align(64))]
+pub struct HashTable4KU16 {
+ dict: Box<[u16; HASHTABLE_SIZE_4K]>,
+}
+impl HashTable4KU16 {
+ #[inline]
+ pub fn new() -> Self {
+ // This generates more efficient assembly than Box::new(slice), because of an
+ // optimized call to alloc_zeroed, vs. alloc + memset.
+ // The try_into is optimized away.
+ let dict = alloc::vec![0; HASHTABLE_SIZE_4K]
+ .into_boxed_slice()
+ .try_into()
+ .unwrap();
+ Self { dict }
+ }
+}
+impl HashTable for HashTable4KU16 {
+ #[inline]
+ fn get_at(&self, hash: usize) -> usize {
+ self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] as usize
+ }
+ #[inline]
+ fn put_at(&mut self, hash: usize, val: usize) {
+ self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] = val as u16;
+ }
+ #[inline]
+ fn clear(&mut self) {
+ self.dict.fill(0);
+ }
+ #[inline]
+ fn get_hash_at(input: &[u8], pos: usize) -> usize {
+ hash(super::get_batch(input, pos)) as usize
+ }
+}
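+
+// Small sketch (not part of the original sources): a 16-bit hash shifted right by
+// HASHTABLE_BIT_SHIFT_4K always indexes within the 4K table, so the largest possible hash value
+// can be stored and read back.
+#[test]
+fn hash_indexing_sketch() {
+    let mut table = HashTable4KU16::new();
+    let max_hash = u16::MAX as usize; // largest value the 16-bit hash can produce
+    table.put_at(max_hash, 1234);
+    assert_eq!(table.get_at(max_hash), 1234);
+    assert!(max_hash >> HASHTABLE_BIT_SHIFT_4K < HASHTABLE_SIZE_4K);
+}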
+
+#[derive(Debug)]
+pub struct HashTable4K {
+ dict: Box<[u32; HASHTABLE_SIZE_4K]>,
+}
+impl HashTable4K {
+ #[inline]
+ pub fn new() -> Self {
+ let dict = alloc::vec![0; HASHTABLE_SIZE_4K]
+ .into_boxed_slice()
+ .try_into()
+ .unwrap();
+ Self { dict }
+ }
+
+ #[cold]
+ #[allow(dead_code)]
+ pub fn reposition(&mut self, offset: u32) {
+ for i in self.dict.iter_mut() {
+ *i = i.saturating_sub(offset);
+ }
+ }
+}
+impl HashTable for HashTable4K {
+ #[inline]
+ fn get_at(&self, hash: usize) -> usize {
+ self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] as usize
+ }
+ #[inline]
+ fn put_at(&mut self, hash: usize, val: usize) {
+ self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] = val as u32;
+ }
+ #[inline]
+ fn clear(&mut self) {
+ self.dict.fill(0);
+ }
+}
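+
+// Illustrative sketch (not part of the original sources): `reposition` shifts all stored
+// positions back by `offset`, saturating at zero, so entries pointing before the new window
+// start end up at 0.
+#[test]
+fn reposition_sketch() {
+    let mut table = HashTable4K::new();
+    table.put_at(0, 100);
+    table.reposition(30);
+    assert_eq!(table.get_at(0), 70);
+    table.reposition(1000);
+    assert_eq!(table.get_at(0), 0);
+}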
+
+const HASHTABLE_SIZE_8K: usize = 8 * 1024;
+const HASH_TABLE_BIT_SHIFT_8K: usize = 3;
+
+#[derive(Debug)]
+pub struct HashTable8K {
+ dict: Box<[u32; HASHTABLE_SIZE_8K]>,
+}
+#[allow(dead_code)]
+impl HashTable8K {
+ #[inline]
+ pub fn new() -> Self {
+ let dict = alloc::vec![0; HASHTABLE_SIZE_8K]
+ .into_boxed_slice()
+ .try_into()
+ .unwrap();
+
+ Self { dict }
+ }
+}
+impl HashTable for HashTable8K {
+ #[inline]
+ fn get_at(&self, hash: usize) -> usize {
+ self.dict[hash >> HASH_TABLE_BIT_SHIFT_8K] as usize
+ }
+ #[inline]
+ fn put_at(&mut self, hash: usize, val: usize) {
+ self.dict[hash >> HASH_TABLE_BIT_SHIFT_8K] = val as u32;
+ }
+ #[inline]
+ fn clear(&mut self) {
+ self.dict.fill(0);
+ }
+}
diff --git a/src/block/mod.rs b/src/block/mod.rs
new file mode 100644
index 0000000..31c52f4
--- /dev/null
+++ b/src/block/mod.rs
@@ -0,0 +1,157 @@
+//! LZ4 Block Format
+//!
+//! As defined in <https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md>
+//!
+//! Currently, only the block format is supported for no_std builds.
+//!
+//! # Example: block format roundtrip
+//! ```
+//! use lz4_flex::block::{compress_prepend_size, decompress_size_prepended};
+//! let input: &[u8] = b"Hello people, what's up?";
+//! let compressed = compress_prepend_size(input);
+//! let uncompressed = decompress_size_prepended(&compressed).unwrap();
+//! assert_eq!(input, uncompressed);
+//! ```
+//!
+
+#[cfg_attr(feature = "safe-encode", forbid(unsafe_code))]
+pub(crate) mod compress;
+pub(crate) mod hashtable;
+
+#[cfg(feature = "safe-decode")]
+#[cfg_attr(feature = "safe-decode", forbid(unsafe_code))]
+pub(crate) mod decompress_safe;
+#[cfg(feature = "safe-decode")]
+pub(crate) use decompress_safe as decompress;
+
+#[cfg(not(feature = "safe-decode"))]
+pub(crate) mod decompress;
+
+pub use compress::*;
+pub use decompress::*;
+
+use core::convert::TryInto;
+use core::fmt;
+
+pub(crate) const WINDOW_SIZE: usize = 64 * 1024;
+
+/// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md#end-of-block-restrictions
+/// The last match must start at least 12 bytes before the end of block. The last match is part of
+/// the penultimate sequence. It is followed by the last sequence, which contains only literals.
+///
+/// Note that, as a consequence, an independent block < 13 bytes cannot be compressed, because the
+/// match must copy "something", so it needs at least one prior byte.
+///
+/// When a block can reference data from another block, it can start immediately with a match and no
+/// literal, so a block of 12 bytes can be compressed.
+const MFLIMIT: usize = 12;
+
+/// The last 5 bytes of input are always literals. Therefore, the last sequence contains at least 5
+/// bytes.
+const LAST_LITERALS: usize = 5;
+
+/// Due to the way the compression loop is arranged, we may read up to (register_size - 2) bytes
+/// from the current position. So we must end the matches 6 bytes before the end, 1 more than
+/// required by the spec.
+const END_OFFSET: usize = LAST_LITERALS + 1;
+
+/// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md#end-of-block-restrictions
+/// Minimum length of a block
+///
+/// MFLIMIT + 1 for the token.
+const LZ4_MIN_LENGTH: usize = MFLIMIT + 1;
+
+const MAXD_LOG: usize = 16;
+const MAX_DISTANCE: usize = (1 << MAXD_LOG) - 1;
+
+#[allow(dead_code)]
+const MATCH_LENGTH_MASK: u32 = (1_u32 << 4) - 1; // 0b1111 / 15
+
+/// The minimum length of a duplicate
+const MINMATCH: usize = 4;
+
+#[allow(dead_code)]
+const FASTLOOP_SAFE_DISTANCE: usize = 64;
+
+/// Switch for the hashtable size byU16
+#[allow(dead_code)]
+static LZ4_64KLIMIT: usize = (64 * 1024) + (MFLIMIT - 1);
+
+/// An error representing invalid compressed data.
+#[derive(Debug)]
+#[non_exhaustive]
+pub enum DecompressError {
+ /// The provided output is too small
+ OutputTooSmall {
+ /// Minimum expected output size
+ expected: usize,
+ /// Actual size of output
+ actual: usize,
+ },
+ /// Literal is out of bounds of the input
+ LiteralOutOfBounds,
+ /// Expected another byte, but none found.
+ ExpectedAnotherByte,
+ /// Deduplication offset out of bounds (not in buffer).
+ OffsetOutOfBounds,
+}
+
+#[derive(Debug)]
+#[non_exhaustive]
+/// Errors that can happen during compression.
+pub enum CompressError {
+ /// The provided output is too small.
+ OutputTooSmall,
+}
+
+impl fmt::Display for DecompressError {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match self {
+ DecompressError::OutputTooSmall { expected, actual } => {
+ write!(
+ f,
+ "provided output is too small for the decompressed data, actual {actual}, expected \
+ {expected}"
+ )
+ }
+ DecompressError::LiteralOutOfBounds => {
+ f.write_str("literal is out of bounds of the input")
+ }
+ DecompressError::ExpectedAnotherByte => {
+ f.write_str("expected another byte, found none")
+ }
+ DecompressError::OffsetOutOfBounds => {
+ f.write_str("the offset to copy is not contained in the decompressed buffer")
+ }
+ }
+ }
+}
+
+impl fmt::Display for CompressError {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match self {
+ CompressError::OutputTooSmall => f.write_str(
+ "output is too small for the compressed data, use get_maximum_output_size to \
+ reserve enough space",
+ ),
+ }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for DecompressError {}
+
+#[cfg(feature = "std")]
+impl std::error::Error for CompressError {}
+
+/// This can be used in conjunction with `decompress_size_prepended`.
+/// It will read the first 4 bytes as little-endian encoded length, and return
+/// the rest of the bytes after the length encoding.
+#[inline]
+pub fn uncompressed_size(input: &[u8]) -> Result<(usize, &[u8]), DecompressError> {
+ let size = input.get(..4).ok_or(DecompressError::ExpectedAnotherByte)?;
+ let size: &[u8; 4] = size.try_into().unwrap();
+ let uncompressed_size = u32::from_le_bytes(*size) as usize;
+ let rest = &input[4..];
+ Ok((uncompressed_size, rest))
+}
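+
+// Minimal sketch (not part of the original sources): the 4-byte little-endian size prefix is
+// split off and the remaining bytes are returned untouched.
+#[test]
+fn uncompressed_size_sketch() {
+    let (size, rest) = uncompressed_size(&[3, 0, 0, 0, b'a', b'b']).unwrap();
+    assert_eq!(size, 3);
+    assert_eq!(rest, &b"ab"[..]);
+}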