diff --git a/Cargo.toml b/Cargo.toml index ba27b2e..06504bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,8 @@ homepage = "https://github.com/image-rs/fdeflate" categories = ["compression"] [dependencies] +fnv = "1.0.7" +innumerable = "0.1.0" simd-adler32 = "0.3.4" [dev-dependencies] diff --git a/src/compress/bt_matchfinder.rs b/src/compress/bt_matchfinder.rs new file mode 100644 index 0000000..381987e --- /dev/null +++ b/src/compress/bt_matchfinder.rs @@ -0,0 +1,203 @@ +use super::{compute_hash, compute_hash3, WINDOW_SIZE}; + +const CACHE3_SIZE: usize = 1 << 15; +const CACHE_SIZE: usize = 1 << 16; + +/// Find the length of the match between the current position and the previous position, searching +/// both forwards and backwards from the starting position. +fn match_length(data: &[u8], ip: usize, prev_index: usize) -> u16 { + assert!( + prev_index < ip, + "Match past current position: {prev_index} {ip}" + ); + + let mut length = 0; + while length < 258 && ip + length < data.len() && data[ip + length] == data[prev_index + length] + { + length += 1; + } + length as u16 +} + +fn left_child(index: usize) -> usize { + 2 * (index as usize % WINDOW_SIZE) +} + +fn right_child(index: usize) -> usize { + 2 * (index as usize % WINDOW_SIZE) + 1 +} + +/// Match finder that uses a binary tree to find matches. +/// +/// Based on bt_matchfinder.h from libdeflate. +pub(crate) struct BTreeMatchFinder { + hash3_table: Option>, + hash_table: Box<[u32; CACHE_SIZE]>, + child_links: Box<[u32; WINDOW_SIZE * 2]>, + search_depth: u16, + early_return_length: usize, +} +impl BTreeMatchFinder { + pub(crate) fn new(min_match: u8) -> Self { + assert!((3..=4).contains(&min_match)); + + Self { + hash3_table: (min_match == 3) + .then(|| vec![0; CACHE3_SIZE].into_boxed_slice().try_into().unwrap()), + hash_table: vec![0; CACHE_SIZE].into_boxed_slice().try_into().unwrap(), + child_links: vec![0; WINDOW_SIZE * 2] + .into_boxed_slice() + .try_into() + .unwrap(), + search_depth: 2000, + early_return_length: 256, + } + } + + fn update( + &mut self, + data: &[u8], + ip: usize, + value: u64, + min_match: u16, + record_matches: bool, + ) -> (u16, u16, usize) { + let min_offset = ip.saturating_sub(WINDOW_SIZE).max(1); + + let mut best_offset = 0; + let mut best_length = min_match - 1; + + // Handle 3-byte matches + if let Some(hash3_table) = &mut self.hash3_table { + let hash3 = compute_hash3(value as u32); + if best_length < min_match && min_match <= 3 { + let hash3_offset = hash3_table[(hash3 as usize) % CACHE3_SIZE] as usize; + if hash3_offset >= ip.saturating_sub(8192).max(1) { + let length = match_length(data, ip, hash3_offset); + if length >= 3 { + best_length = length; + best_offset = hash3_offset as u32; + } + } + } + hash3_table[(hash3 as usize) % CACHE3_SIZE] = ip as u32; + } + + // Lookup current value + let hash = compute_hash(value & 0xffff_ffff); + let hash_index = (hash as usize) % CACHE_SIZE; + let mut offset = self.hash_table[hash_index] as usize; + self.hash_table[hash_index] = ip as u32; + + let mut pending_left = left_child(ip); + let mut pending_right = right_child(ip); + + if offset < min_offset { + self.child_links[pending_left] = 0; + self.child_links[pending_right] = 0; + return (0, 0, ip); + } + + let mut best_left_length = 0; + let mut best_right_length = 0; + let mut length = 0; + + // Visit previous matches + // eprintln!("---"); + let mut depth_remaining = self.search_depth; + loop { + if data[ip + length] == data[offset + length] { + while length < 258 + && ip + length < data.len() + && 
data[ip + length] == data[offset + length] + { + length += 1; + } + + // for i in 0..length.min(self.early_return_length) { + // assert_eq!( + // data[ip + i], + // data[offset + i], + // "{i} {length} ip={ip} data_len={}", + // data.len() + // ); + // } + + if record_matches && length > best_length as usize { + best_length = length as u16; + best_offset = offset as u32; + } + + if length >= self.early_return_length || ip + length == data.len() { + self.child_links[pending_left] = self.child_links[left_child(offset)]; + self.child_links[pending_right] = self.child_links[right_child(offset)]; + break; + } + } + + assert!(ip + length < data.len()); + + if data[offset + length] < data[ip + length] { + self.child_links[pending_left] = offset as u32; + pending_left = right_child(offset); + offset = self.child_links[pending_left] as usize; + + best_left_length = length; + if best_right_length < length { + length = best_right_length; + } + // length = length.min(best_right_length); + // eprintln!( + // "left {best_right_length},{best_left_length} dist={}", + // ip - offset + // ); + } else { + assert!( + data[offset + length] > data[ip + length], + "{length} {depth_remaining} {offset} {min_offset}" + ); + + self.child_links[pending_right] = offset as u32; + pending_right = left_child(offset); + offset = self.child_links[pending_right] as usize; + + best_right_length = length; + if best_left_length < length { + length = best_left_length; + } + // length = length.min(best_left_length); + // eprintln!( + // "right {best_right_length},{best_left_length} dist={}", + // ip - offset + // ); + } + + depth_remaining -= 1; + if offset <= min_offset || depth_remaining == 0 { + self.child_links[pending_left] = 0; + self.child_links[pending_right] = 0; + break; + } + } + + if best_length >= min_match { + return (best_length as u16, (ip - best_offset as usize) as u16, ip); + } + + (0, 0, ip) + } + + pub(crate) fn get_and_insert( + &mut self, + data: &[u8], + ip: usize, + value: u64, + min_match: u16, + ) -> (u16, u16, usize) { + self.update(data, ip, value, min_match, true) + } + + pub(crate) fn insert(&mut self, data: &[u8], value: u64, ip: usize) { + self.update(data, ip, value, 3, false); + } +} diff --git a/src/compress/fast.rs b/src/compress/fast.rs new file mode 100644 index 0000000..c1d0992 --- /dev/null +++ b/src/compress/fast.rs @@ -0,0 +1,120 @@ +use std::io::{self, Write}; + +use super::{BitWriter, HashTableMatchFinder, Symbol}; + +pub(super) struct FastCompressor { + match_finder: HashTableMatchFinder, + skip_ahead_shift: u8, +} + +impl FastCompressor { + pub fn new(skip_ahead_shift: u8) -> Self { + Self { + match_finder: HashTableMatchFinder::new(), + skip_ahead_shift, + } + } + + pub fn compress(&mut self, writer: &mut BitWriter, data: &[u8]) -> io::Result<()> { + let mut ip = 0; + + while ip < data.len() { + let mut symbols = Vec::new(); + + let mut last_match = ip; + 'outer: while symbols.len() < 16384 && ip + 8 <= data.len() { + let current = u64::from_le_bytes(data[ip..][..8].try_into().unwrap()); + + if current & 0xFF_FFFF_FFFF == 0 { + while ip > last_match && data[ip - 1] == 0 { + ip -= 1; + } + + if ip == 0 || data[ip - 1] != 0 { + ip += 1; + } + + symbols.push(Symbol::LiteralRun { + start: last_match as u32, + end: ip as u32, + }); + + let mut run_length = 0; + while ip < data.len() && data[ip] == 0 && run_length < 258 { + run_length += 1; + ip += 1; + } + + symbols.push(Symbol::Backref { + length: run_length as u16, + distance: 1, + dist_sym: 0, + }); + + last_match = ip; + + continue; 
+ } + + let (length, distance, match_start) = self + .match_finder + .get_and_insert(&data, last_match, ip, current, 4); + + if length >= 3 { + assert!(last_match <= match_start); + + symbols.push(Symbol::LiteralRun { + start: last_match as u32, + end: match_start as u32, + }); + + symbols.push(Symbol::Backref { + length: length as u16, + distance, + dist_sym: super::distance_to_dist_sym(distance), + }); + + let match_end = match_start + length as usize; + let insert_end = (match_end - 2).min(data.len() - 8); + let insert_start = (ip + 1).max(insert_end.saturating_sub(16)); + for j in insert_start..insert_end { + let v = u64::from_le_bytes(data[j..][..8].try_into().unwrap()); + self.match_finder.insert(v, j); + } + + ip = match_end; + last_match = ip; + + continue 'outer; + } + + // If we haven't found a match in a while, start skipping ahead by emitting multiple + // literals at once. But check that we don't skip over a big run of zeroes. + let advance = 1 + ((ip - last_match) >> self.skip_ahead_shift); + if advance >= 8 { + let end_index = (ip + advance).min(data.len()); + if let Some(advance) = data[ip + 1..end_index] + .chunks_exact(8) + .position(|w| w == [0; 8]) + { + ip += advance + 1; + continue 'outer; + } + } + + ip += advance; + } + if data.len() < ip + 8 { + symbols.push(Symbol::LiteralRun { + start: last_match as u32, + end: data.len() as u32, + }); + ip = data.len(); + } + + super::write_block(writer, data, &symbols, ip == data.len())?; + } + + Ok(()) + } +} diff --git a/src/compress/hc_matchfinder.rs b/src/compress/hc_matchfinder.rs new file mode 100644 index 0000000..71655fa --- /dev/null +++ b/src/compress/hc_matchfinder.rs @@ -0,0 +1,139 @@ +use crate::compress::compute_hash32; + +use super::WINDOW_SIZE; + +const CACHE_SIZE: usize = 65536;// 1 << 18; + +/// Find the length of the match between the current position and the previous position, searching +/// both forwards and backwards from the starting position. +fn match_length( + data: &[u8], + anchor: usize, + mut ip: usize, + mut prev_index: usize, + value: u32, +) -> (u16, usize) { + assert!( + prev_index < ip, + "Match past current position: {prev_index} {ip}" + ); + + if value != u32::from_ne_bytes(data[prev_index..][..4].try_into().unwrap()) { + return (0, ip); + } + + let mut length = 4; + while length < 258 && ip > anchor && prev_index > 0 && data[ip - 1] == data[prev_index - 1] { + length += 1; + ip -= 1; + prev_index -= 1; + } + while length < 258 && ip + length < data.len() && data[ip + length] == data[prev_index + length] + { + length += 1; + } + (length as u16, ip) +} + +pub(crate) struct HashChainMatchFinder { + hash_table: Box<[u32; CACHE_SIZE]>, + links: Box<[u32; WINDOW_SIZE]>, + + search_depth: u16, + + // /// If we already have a match of this length, limit lazy search to a smaller search depth. + // good_length: u16, + /// Stop searching for matches if the length is at least this long. + nice_length: u16, + // /// Mask of low-bytes to consider for hashing. 
+ // hash_mask: u64, +} +impl HashChainMatchFinder { + pub(crate) fn new(search_depth: u16, nice_length: u16, min_match: u8) -> Self { + assert!((3..=8).contains(&min_match)); + + Self { + hash_table: vec![0; CACHE_SIZE].into_boxed_slice().try_into().unwrap(), + links: vec![0; WINDOW_SIZE].into_boxed_slice().try_into().unwrap(), + search_depth, + // good_length: 8, + nice_length, + // hash_mask: if min_match == 8 { + // u64::MAX + // } else { + // (1 << (min_match.max(4) * 8)) - 1 + // }, + } + } + + pub(crate) fn get_and_insert( + &mut self, + data: &[u8], + anchor: usize, + ip: usize, + value: u32, + min_match: u16, + ) -> (u16, u16, usize) { + let min_offset = ip.saturating_sub(32768).max(1); + + let mut best_offset = 0; + let mut best_length = min_match - 1; + let mut best_ip = 0; + + let mut n = self.search_depth; + // if min_match >= self.good_length { + // n >>= 2; + // } + + let hash = compute_hash32(value); + let hash_index = (hash as usize) % CACHE_SIZE; + let mut offset = self.hash_table[hash_index] as usize; + + // Insert current value + self.hash_table[hash_index] = ip as u32; + self.links[ip % WINDOW_SIZE] = offset as u32; + + // Visit previous matches + loop { + if offset < min_offset { + break; + } + + let (length, start) = match_length(data, anchor, ip, offset, value); + if length > best_length { + best_length = length; + best_offset = offset as u32; + best_ip = start; + // } else if best_length > min_match { + // break; + } + if length >= self.nice_length || ip + length as usize == data.len() { + break; + } + + n -= 1; + if n == 0 { + break; + } + + offset = self.links[offset % WINDOW_SIZE] as usize; + } + + if best_length >= min_match { + return ( + best_length as u16, + (ip - best_offset as usize) as u16, + best_ip, + ); + } + + (0, 0, ip) + } + + pub(crate) fn insert(&mut self, value: u64, offset: usize) { + let hash = compute_hash32(value as u32); + let prev_offset = self.hash_table[(hash as usize) % CACHE_SIZE]; + self.hash_table[(hash as usize) % CACHE_SIZE] = offset as u32; + self.links[offset as usize % WINDOW_SIZE] = prev_offset; + } +} diff --git a/src/compress/ht_matchfinder.rs b/src/compress/ht_matchfinder.rs new file mode 100644 index 0000000..2302c82 --- /dev/null +++ b/src/compress/ht_matchfinder.rs @@ -0,0 +1,77 @@ +use super::compute_hash; + +const CACHE_SIZE: usize = 1 << 16; + +/// Find the length of the match between the current position and the previous position, searching +/// both forwards and backwards from the starting position. 
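+///
+/// Because the first eight bytes are compared through `value`, a non-zero length is only
+/// returned for matches of at least eight bytes. The match is then extended backwards (never
+/// past `anchor`) and forwards, capped at DEFLATE's maximum match length of 258 bytes. The
+/// returned tuple holds the match length and the (possibly earlier) position where it starts.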
+fn match_length( + value: u64, + data: &[u8], + anchor: usize, + mut ip: usize, + mut prev_index: usize, +) -> (u16, usize) { + assert!( + prev_index < ip, + "Match past current position: {prev_index} {ip}" + ); + + if value != u64::from_ne_bytes(data[prev_index..][..8].try_into().unwrap()) { + return (0, ip); + } + + let mut length = 8; + while length < 258 && ip > anchor && prev_index > 0 && data[ip - 1] == data[prev_index - 1] { + length += 1; + ip -= 1; + prev_index -= 1; + } + while length < 258 && ip + length < data.len() && data[ip + length] == data[prev_index + length] + { + length += 1; + } + (length as u16, ip) +} + +pub(crate) struct HashTableMatchFinder { + hash_table: Box<[u32; CACHE_SIZE]>, +} +impl HashTableMatchFinder { + pub(crate) fn new() -> Self { + Self { + hash_table: vec![0; CACHE_SIZE].into_boxed_slice().try_into().unwrap(), + } + } + + pub(crate) fn get_and_insert( + &mut self, + data: &[u8], + anchor: usize, + ip: usize, + value: u64, + min_match: u16, + ) -> (u16, u16, usize) { + let min_offset = ip.saturating_sub(32768).max(1); + + let hash = compute_hash(value); + let hash_index = (hash as usize) % CACHE_SIZE; + let offset = self.hash_table[hash_index] as usize; + + // Insert current value + self.hash_table[hash_index] = ip as u32; + + if offset >= min_offset { + let (length, start) = match_length(value, data, anchor, ip, offset); + if length > min_match { + return (length as u16, (ip - offset as usize) as u16, start); + } + } + + (0, 0, ip) + } + + pub(crate) fn insert(&mut self, value: u64, offset: usize) { + let hash = compute_hash(value); + self.hash_table[(hash as usize) % CACHE_SIZE] = offset as u32; + } +} diff --git a/src/compress/medium.rs b/src/compress/medium.rs new file mode 100644 index 0000000..3731cb5 --- /dev/null +++ b/src/compress/medium.rs @@ -0,0 +1,194 @@ +use std::io::{self, Write}; + +use super::{BitWriter, HashChainMatchFinder, Symbol}; + +pub(super) struct MediumCompressor { + match_finder: HashChainMatchFinder, + skip_ahead_shift: u8, +} + +impl MediumCompressor { + pub fn new(search_depth: u16, nice_length: u16, skip_ahead_shift: u8) -> Self { + Self { + match_finder: HashChainMatchFinder::new(search_depth, nice_length, 4), + skip_ahead_shift, + } + } + + pub fn compress(&mut self, writer: &mut BitWriter, data: &[u8]) -> io::Result<()> { + let mut ip = 0; // Points at the next byte to hash/lookup for. + let mut last_match = 0; //ip; + + while ip < data.len() { + let mut length = 0u16; + let mut distance = 0; + let mut match_start = 0; + + let mut symbols = Vec::new(); + while symbols.len() < 16384 && ip + 8 <= data.len() { + if length == 0 { + let current = u64::from_le_bytes(data[ip..][..8].try_into().unwrap()); + // if current & 0xFF_FFFF_FFFF == 0 { + // length = 4; + // match_start = ip + 1; + // distance = 1; + + // let min_start = 1.max(last_match).max(match_start.saturating_sub(258 - 4)); + + // while match_start > min_start && data[match_start - 2] == 0 { + // match_start -= 1; + // length += 1; + // } + // while length < 258 + // && match_start + (length as usize) < data.len() + // && data[match_start + length as usize] == 0 + // { + // length += 1; + // } + + // // Skip inserting all the totally zero values into the hash table. 
+ // ip = match_start + length as usize - 3; + // } else { + (length, distance, match_start) = self.match_finder.get_and_insert( + &data, + last_match, + ip, + current as u32, + 3, + ); + ip += 1; + // } + } + + if length < 3 { + // If we haven't found a match in a while, start skipping ahead by emitting + // multiple literals at once. + ip += (ip - last_match) >> self.skip_ahead_shift; + continue; + } + + assert!(last_match <= ip); + assert!(last_match <= match_start,); + let (mut next_length, mut next_distance, mut next_match_start) = (0, 0, 0); + + let match_end = match_start + length as usize; + if match_end >= ip { + // // Insert match finder entries for the current match. + // let insert_end = (match_end - 3).min(data.len() - 8); + // let insert_start = ip.max(insert_end.saturating_sub(16)); + // for j in (insert_start..insert_end).step_by(4) { + // let v = u64::from_le_bytes(data[j..][..8].try_into().unwrap()); + // self.match_finder.insert(v, j); + // self.match_finder.insert(v >> 8, j + 1); + // self.match_finder.insert(v >> 16, j + 2); + // self.match_finder.insert(v >> 24, j + 3); + // } + for j in ip..match_end.min(data.len() - 8) { + let v = u32::from_le_bytes(data[j..][..4].try_into().unwrap()); + self.match_finder.insert(v as u64, j); + } + + ip = match_end; + + // innumerable::event!("current-delta", ip as i32 - match_start as i32); + + // Do a lookup at the position following the match. We'll need this even if we + // accept the match, so it doesn't cost anything. + if ip + 8 <= data.len() { + let next = u64::from_le_bytes(data[ip..][..8].try_into().unwrap()); + // if next & 0xFF_FFFF_FFFF == 0 { + // next_length = 4; + // next_match_start = ip + 1; + // next_distance = 1; + + // let min_start = + // 1.max(last_match).max(next_match_start.saturating_sub(258 - 4)); + + // while next_match_start > min_start && data[next_match_start - 2] == 0 { + // next_match_start -= 1; + // next_length += 1; + // } + // while next_length < 258 + // && next_match_start + (next_length as usize) < data.len() + // && data[next_match_start + next_length as usize] == 0 + // { + // next_length += 1; + // } + + // // Skip inserting all the totally zero values into the hash table. + // ip = next_match_start + next_length as usize - 3; + // } else { + (next_length, next_distance, next_match_start) = self + .match_finder + .get_and_insert(&data, last_match, ip, next as u32, 3); + + // innumerable::event!("x-delta", next_match_start as i32 - ip as i32); + + ip += 1; + // } + } + } + + // if next_length >= 3 { + // // innumerable::event!("next-length", next_length); + // innumerable::event!("next-delta", next_match_start as i32 - match_start as i32); + // } + + // Insert the current match, unless the next match starts too close to the current + // one. Because we expand matches backwards, the next match might almost completely + // overlap. If so, it'll probably be cheaper to emit an extra literal rather than an + // extra backref. 
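+            // For example, if the current match starts at position 100 and the lookahead match,
+            // after backwards expansion, also starts at or before position 101, the current match
+            // is dropped: covering the gap costs at most one extra literal, which is cheaper than
+            // emitting a whole extra backref.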
+ if next_length < 3 || next_match_start > match_start + 1 { + // if next_length < 3 && next_match_start > match_start + 1 { + // innumerable::event!("match", 0); + // } else if next_length < 3 { + // innumerable::event!("match", 1); + // } else { + // innumerable::event!("match", 2); + // } + + assert!(last_match <= match_start); + symbols.push(Symbol::LiteralRun { + start: last_match as u32, + end: match_start as u32, + }); + symbols.push(Symbol::Backref { + length: length as u16, + distance, + dist_sym: super::distance_to_dist_sym(distance), + }); + last_match = match_start + length as usize; + + // If the next match starts before the end of the current match, we need to + // adjust the next match length and start position. + if next_length > 0 && next_match_start < last_match { + assert!(next_length >= 3); + next_length -= (last_match - next_match_start) as u16; + next_match_start = last_match; + if next_length < 4 { + next_length = 0; + } + } + // innumerable::event!("fizzle", 0); + // } else if next_length >= 3 { + // innumerable::event!("fizzle", 1); + } + + // Advance to the next match (which might have a length of zero) + length = next_length; + match_start = next_match_start; + distance = next_distance; + } + if data.len() < ip + 8 { + symbols.push(Symbol::LiteralRun { + start: last_match as u32, + end: data.len() as u32, + }); + ip = data.len(); + } + super::write_block(writer, data, &symbols, ip == data.len())?; + } + + Ok(()) + } +} diff --git a/src/compress/mod.rs b/src/compress/mod.rs new file mode 100644 index 0000000..f2ae0ec --- /dev/null +++ b/src/compress/mod.rs @@ -0,0 +1,648 @@ +use std::{ + collections::BinaryHeap, + io::{self, Seek, SeekFrom, Write}, +}; + +use simd_adler32::Adler32; + +use crate::tables::{ + BITMASKS, CLCL_ORDER, DIST_SYM_TO_DIST_BASE, DIST_SYM_TO_DIST_EXTRA, LENGTH_TO_LEN_EXTRA, + LENGTH_TO_SYMBOL, +}; + +use fast::FastCompressor; +use hc_matchfinder::HashChainMatchFinder; +use ht_matchfinder::HashTableMatchFinder; +use medium::MediumCompressor; +use slow::SlowCompressor; + +mod bt_matchfinder; +mod hc_matchfinder; +mod ht_matchfinder; + +mod fast; +mod medium; +mod slow; +pub mod ultrafast; + +fn build_huffman_tree( + frequencies: &[u32], + lengths: &mut [u8], + codes: &mut [u16], + length_limit: u8, +) -> bool { + assert_eq!(frequencies.len(), lengths.len()); + assert_eq!(frequencies.len(), codes.len()); + + if frequencies.iter().filter(|&&f| f > 0).count() <= 1 { + lengths.fill(0); + codes.fill(0); + if let Some(i) = frequencies.iter().position(|&f| f > 0) { + lengths[i] = 1; + } + return false; + } + + #[derive(Eq, PartialEq, Copy, Clone, Debug)] + struct Item(u32, u16); + impl Ord for Item { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + other.0.cmp(&self.0) + } + } + impl PartialOrd for Item { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + // Build a huffman tree + let mut internal_nodes = Vec::new(); + let mut nodes = BinaryHeap::from_iter( + frequencies + .iter() + .enumerate() + .filter(|(_, &frequency)| frequency > 0) + .map(|(i, &frequency)| Item(frequency, i as u16)), + ); + while nodes.len() > 1 { + let Item(frequency1, index1) = nodes.pop().unwrap(); + let mut root = nodes.peek_mut().unwrap(); + internal_nodes.push((index1, root.1)); + *root = Item( + frequency1 + root.0, + internal_nodes.len() as u16 + frequencies.len() as u16 - 1, + ); + } + + // Walk the tree to assign code lengths + lengths.fill(0); + let mut stack = Vec::new(); + stack.push((nodes.pop().unwrap().1, 0)); + while let 
Some((node, depth)) = stack.pop() { + let node = node as usize; + if node < frequencies.len() { + lengths[node] = depth as u8; + } else { + let (left, right) = internal_nodes[node - frequencies.len()]; + stack.push((left, depth + 1)); + stack.push((right, depth + 1)); + } + } + + // Limit the codes to length length_limit + let mut max_length = 0; + for &length in lengths.iter() { + max_length = max_length.max(length); + } + if max_length > length_limit { + let mut counts = [0u32; 16]; + for &length in lengths.iter() { + counts[length.min(length_limit) as usize] += 1; + } + + let mut total = 0; + for (i, count) in counts + .iter() + .enumerate() + .skip(1) + .take(length_limit as usize) + { + total += count << (length_limit as usize - i); + } + + while total > 1u32 << length_limit { + let mut i = length_limit as usize - 1; + while counts[i] == 0 { + i -= 1; + } + counts[i] -= 1; + counts[length_limit as usize] -= 1; + counts[i + 1] += 2; + total -= 1; + } + + // assign new lengths + let mut len = length_limit; + let mut indexes = frequencies.iter().copied().enumerate().collect::>(); + indexes.sort_unstable_by_key(|&(_, frequency)| frequency); + for &(i, frequency) in indexes.iter() { + if frequency > 0 { + while counts[len as usize] == 0 { + len -= 1; + } + lengths[i] = len; + counts[len as usize] -= 1; + } + } + } + + // Assign codes + codes.fill(0); + let mut code = 0u32; + for len in 1..=length_limit { + for (i, &length) in lengths.iter().enumerate() { + if length == len { + codes[i] = (code as u16).reverse_bits() >> (16 - len); + code += 1; + } + } + code <<= 1; + } + assert_eq!(code, 2 << length_limit); + + true +} + +fn distance_to_dist_sym(distance: u16) -> u8 { + const LOOKUP: [u8; 16] = [0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7]; + if distance <= 16 { + return LOOKUP[distance as usize - 1]; + } + + let mut dist_sym = 29; + while dist_sym > 0 && distance < DIST_SYM_TO_DIST_BASE[dist_sym as usize] { + dist_sym -= 1; + } + dist_sym +} + +fn compute_hash3(v: u32) -> u32 { + (0x330698ecu64.wrapping_mul(((v & 0xff_ffff) ^ 0x2722_0a95) as u64) >> 16) as u32 +} +fn compute_hash(v: u64) -> u32 { + let mut hasher = fnv::FnvHasher::default(); + std::hash::Hasher::write_u64(&mut hasher, v); + std::hash::Hasher::finish(&hasher) as u32 + + // (11400714785074694791u64.wrapping_mul(v) >> 40) as u32 +} + +fn compute_hash32(v: u32) -> u32 { + let mut hasher = fnv::FnvHasher::default(); + std::hash::Hasher::write_u32(&mut hasher, v); + std::hash::Hasher::finish(&hasher) as u32 + + // (11400714785074694791u64.wrapping_mul(v) >> 40) as u32 +} + + +enum Symbol { + LiteralRun { + start: u32, + end: u32, + }, + Backref { + length: u16, + distance: u16, + dist_sym: u8, + }, +} + +fn write_block( + writer: &mut BitWriter, + data: &[u8], + symbols: &[Symbol], + eof: bool, +) -> io::Result<()> { + let mut frequencies = [0u32; 286]; + let mut dist_frequencies = [0u32; 30]; + frequencies[256] = 1; + for symbol in symbols { + match symbol { + Symbol::LiteralRun { start, end } => { + for lit in &data[*start as usize..*end as usize] { + frequencies[*lit as usize] += 1; + } + } + Symbol::Backref { + length, dist_sym, .. 
+ } => { + let sym = LENGTH_TO_SYMBOL[*length as usize - 3] as usize; + frequencies[sym] += 1; + dist_frequencies[*dist_sym as usize] += 1; + } + } + } + + let mut lengths = [0u8; 286]; + let mut codes = [0u16; 286]; + build_huffman_tree(&frequencies, &mut lengths, &mut codes, 15); + + let mut dist_lengths = [0u8; 30]; + let mut dist_codes = [0u16; 30]; + build_huffman_tree(&dist_frequencies, &mut dist_lengths, &mut dist_codes, 15); + + let num_litlen_codes = 286; + // while num_litlen_codes > 257 && lengths[num_litlen_codes - 1] == 0 { + // num_litlen_codes -= 1; + // } + + let num_dist_codes = 30; + // while num_dist_codes > 1 && dist_lengths[num_dist_codes - 1] == 0 { + // num_dist_codes -= 1; + // } + + let mut code_length_frequencies = [0u32; 19]; + for &length in &lengths[..num_litlen_codes] { + code_length_frequencies[length as usize] += 1; + } + for &length in &dist_lengths[..num_dist_codes] { + code_length_frequencies[length as usize] += 1; + } + let mut code_length_lengths = [0u8; 19]; + let mut code_length_codes = [0u16; 19]; + build_huffman_tree( + &code_length_frequencies, + &mut code_length_lengths, + &mut code_length_codes, + 7, + ); + + if eof { + writer.write_bits(101, 3)?; // final block + } else { + writer.write_bits(100, 3)?; // non-final block + } + + writer.write_bits(num_litlen_codes as u64 - 257, 5)?; // hlit + writer.write_bits(num_dist_codes as u64 - 1, 5)?; // hdist + writer.write_bits(15, 4)?; // hclen + + for j in 0..19 { + writer.write_bits(code_length_lengths[CLCL_ORDER[j]] as u64, 3)?; + } + + for &length in lengths[..num_litlen_codes] + .iter() + .chain(&dist_lengths[..num_dist_codes]) + { + writer.write_bits( + code_length_codes[length as usize] as u64, + code_length_lengths[length as usize], + )?; + } + + for symbol in symbols { + match symbol { + Symbol::LiteralRun { start, end } => { + let mut groups = data[*start as usize..*end as usize].chunks_exact(4); + for group in &mut groups { + let code0 = codes[group[0] as usize] as u64; + let code1 = codes[group[1] as usize] as u64; + let code2 = codes[group[2] as usize] as u64; + let code3 = codes[group[3] as usize] as u64; + + let len0 = lengths[group[0] as usize]; + let len1 = lengths[group[1] as usize]; + let len2 = lengths[group[2] as usize]; + let len3 = lengths[group[3] as usize]; + + writer.write_bits( + code0 + | (code1 << len0) + | (code2 << (len0 + len1)) + | (code3 << (len0 + len1 + len2)), + len0 + len1 + len2 + len3, + )?; + } + + for &lit in groups.remainder() { + writer.write_bits(codes[lit as usize] as u64, lengths[lit as usize] as u8)?; + } + } + Symbol::Backref { + length, + distance, + dist_sym, + } => { + let sym = LENGTH_TO_SYMBOL[*length as usize - 3] as usize; + writer.write_bits(codes[sym] as u64, lengths[sym] as u8)?; + let len_extra = LENGTH_TO_LEN_EXTRA[*length as usize - 3]; + let extra = (((*length as u32) - 3) & BITMASKS[len_extra as usize]) as u64; + writer.write_bits(extra, len_extra)?; + + writer.write_bits( + dist_codes[*dist_sym as usize] as u64, + dist_lengths[*dist_sym as usize], + )?; + let dist_extra = DIST_SYM_TO_DIST_EXTRA[*dist_sym as usize]; + let extra = *distance - DIST_SYM_TO_DIST_BASE[*dist_sym as usize]; + + writer.write_bits(extra as u64, dist_extra)?; + } + } + } + writer.write_bits(codes[256] as u64, lengths[256])?; + Ok(()) +} + +enum CompressorInner { + Stored, + Fast(FastCompressor), + Medium(MediumCompressor), + Slow(SlowCompressor), +} +impl CompressorInner { + fn compress_data( + &mut self, + writer: &mut BitWriter, + data: &[u8], + eof: bool, + ) -> 
io::Result<()> { + match self { + Self::Stored => Self::compress_stored(writer, data, eof), + Self::Fast(inner) => inner.compress(writer, data), + Self::Medium(inner) => inner.compress(writer, data), + Self::Slow(inner) => inner.compress(writer, data), + } + } + + fn compress_stored( + writer: &mut BitWriter, + data: &[u8], + eof: bool, + ) -> io::Result<()> { + if data.is_empty() { + if eof { + // TODO: write empty final block + } + return Ok(()); + } + + let chunks = data.chunks(65535); + let last_chunk_index = chunks.len() - 1; + for (i, chunk) in chunks.into_iter().enumerate() { + if i == last_chunk_index { + writer.write_bits(1, 3)?; // final block + } else { + writer.write_bits(0, 3)?; // non-final block + } + writer.flush()?; + writer + .writer + .write_all(&(chunk.len() as u16).to_le_bytes())?; + writer + .writer + .write_all(&(!(chunk.len() as u16)).to_le_bytes())?; + writer.writer.write_all(chunk)?; + } + return Ok(()); + } +} + +const WINDOW_SIZE: usize = 32768; + +struct BitWriter { + buffer: u64, + nbits: u8, + writer: W, +} +impl BitWriter { + fn write_bits(&mut self, bits: u64, nbits: u8) -> io::Result<()> { + debug_assert!(nbits <= 64); + + self.buffer |= bits << self.nbits; + self.nbits += nbits; + + if self.nbits >= 64 { + self.writer.write_all(&self.buffer.to_le_bytes())?; + self.nbits -= 64; + self.buffer = bits.checked_shr((nbits - self.nbits) as u32).unwrap_or(0); + } + debug_assert!(self.nbits < 64); + Ok(()) + } + + fn flush(&mut self) -> io::Result<()> { + if self.nbits % 8 != 0 { + self.write_bits(0, 8 - self.nbits % 8)?; + } + if self.nbits > 0 { + self.writer + .write_all(&self.buffer.to_le_bytes()[..self.nbits as usize / 8]) + .unwrap(); + self.buffer = 0; + self.nbits = 0; + } + Ok(()) + } +} + +/// Compressor that produces fdeflate compressed streams. +pub struct Compressor { + checksum: Adler32, + pending: Vec, + bit_writer: BitWriter, + inner: CompressorInner, +} +impl Compressor { + /// Create a new Compressor. + pub fn new(writer: W) -> io::Result { + Self::with_level(writer, 1) + } + + /// Create a new Compressor with the specified compression level. + pub fn with_level(mut writer: W, level: u8) -> io::Result { + writer.write_all(&[0x78, 0x01])?; // zlib header + + let inner = match level { + 0 => CompressorInner::Stored, + 1 => CompressorInner::Fast(FastCompressor::new(4)), + 2 => CompressorInner::Fast(FastCompressor::new(9)), + 3 => CompressorInner::Medium(MediumCompressor::new(6, 16, 6)), + 4 => CompressorInner::Medium(MediumCompressor::new(24, 32, 9)), + 5 => CompressorInner::Medium(MediumCompressor::new(32, 32, 9)), + 6 => CompressorInner::Medium(MediumCompressor::new(128, 128, 12)), + 7.. => CompressorInner::Slow(SlowCompressor::new()), + }; + + Ok(Self { + checksum: Adler32::new(), + bit_writer: BitWriter { + buffer: 0, + nbits: 0, + writer, + }, + pending: Vec::new(), + inner, + }) + } + + /// Write data to the compressor. + pub fn write_data(&mut self, data: &[u8]) -> io::Result<()> { + self.checksum.write(data); + self.pending.extend_from_slice(data); + Ok(()) + } + + /// Write the remainder of the stream and return the inner writer. 
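+    ///
+    /// Data passed to `write_data` is only buffered; this call compresses the buffered bytes,
+    /// pads and flushes the final partial byte, and appends the big-endian Adler-32 checksum
+    /// that closes the zlib stream.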
+ pub fn finish(mut self) -> io::Result { + self.inner + .compress_data(&mut self.bit_writer, &self.pending, true)?; + + // Write end of block + self.bit_writer.flush()?; + + // Write Adler32 checksum + let checksum: u32 = self.checksum.finish(); + self.bit_writer + .writer + .write_all(checksum.to_be_bytes().as_ref()) + .unwrap(); + Ok(self.bit_writer.writer) + } +} + +/// Compressor that only writes the stored blocks. +/// +/// This is useful for writing files that are not compressed, but still need to be wrapped in a +/// zlib stream. +pub struct StoredOnlyCompressor { + writer: W, + checksum: Adler32, + block_bytes: u16, +} +impl StoredOnlyCompressor { + /// Creates a new `StoredOnlyCompressor` that writes to the given writer. + pub fn new(mut writer: W) -> io::Result { + writer.write_all(&[0x78, 0x01])?; // zlib header + writer.write_all(&[0; 5])?; // placeholder stored block header + + Ok(Self { + writer, + checksum: Adler32::new(), + block_bytes: 0, + }) + } + + fn set_block_header(&mut self, size: u16, last: bool) -> io::Result<()> { + self.writer.seek(SeekFrom::Current(-(size as i64 + 5)))?; + self.writer.write_all(&[ + last as u8, + (size & 0xFF) as u8, + ((size >> 8) & 0xFF) as u8, + (!size & 0xFF) as u8, + ((!size >> 8) & 0xFF) as u8, + ])?; + self.writer.seek(SeekFrom::Current(size as i64))?; + + Ok(()) + } + + /// Writes the given data to the underlying writer. + pub fn write_data(&mut self, mut data: &[u8]) -> io::Result<()> { + self.checksum.write(data); + while !data.is_empty() { + if self.block_bytes == u16::MAX { + self.set_block_header(u16::MAX, false)?; + self.writer.write_all(&[0; 5])?; // placeholder stored block header + self.block_bytes = 0; + } + + let prefix_bytes = data.len().min((u16::MAX - self.block_bytes) as usize); + self.writer.write_all(&data[..prefix_bytes])?; + self.block_bytes += prefix_bytes as u16; + data = &data[prefix_bytes..]; + } + + Ok(()) + } + + /// Finish writing the final block and return the underlying writer. + pub fn finish(mut self) -> io::Result { + self.set_block_header(self.block_bytes, true)?; + + // Write Adler32 checksum + let checksum: u32 = self.checksum.finish(); + self.writer + .write_all(checksum.to_be_bytes().as_ref()) + .unwrap(); + + Ok(self.writer) + } +} +impl StoredOnlyCompressor { + /// Return the number of bytes that will be written to the output stream + /// for the given input size. Because this compressor only writes stored blocks, + /// the output size is always slightly *larger* than the input size. + pub fn compressed_size(raw_size: usize) -> usize { + (raw_size.saturating_sub(1) / u16::MAX as usize) * (u16::MAX as usize + 5) + + (raw_size % u16::MAX as usize + 5) + + 6 + } +} + +/// Compresses the given data. +pub fn compress_to_vec(input: &[u8]) -> Vec { + compress_to_vec_with_level(input, 1) +} + +/// Compresses the given data with the specified compression level. 
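+///
+/// Level 0 emits stored (uncompressed) blocks, levels 1-2 use the hash-table match finder,
+/// levels 3-6 use the hash-chain match finder, and levels 7 and above use the binary-tree
+/// match finder. If the compressed stream would be larger than simply storing the input, the
+/// output falls back to stored blocks.
+///
+/// A minimal round-trip sketch:
+///
+/// ```
+/// let compressed = fdeflate::compress_to_vec_with_level(b"Hello world!", 6);
+/// let decompressed = fdeflate::decompress_to_vec(&compressed).unwrap();
+/// assert_eq!(decompressed, b"Hello world!");
+/// ```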
+pub fn compress_to_vec_with_level(input: &[u8], level: u8) -> Vec { + let mut compressor = + Compressor::with_level(Vec::with_capacity(input.len() / 4), level).unwrap(); + compressor.write_data(input).unwrap(); + let mut compressed = compressor.finish().unwrap(); + + if compressed.len() > StoredOnlyCompressor::>::compressed_size(input.len()) { + compressed.clear(); + let mut compressor = StoredOnlyCompressor::new(io::Cursor::new(compressed)).unwrap(); + compressor.write_data(input).unwrap(); + compressor.finish().unwrap().into_inner() + } else { + compressed + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::Rng; + + #[test] + fn test_distance_to_dist_sym() { + assert_eq!(distance_to_dist_sym(1), 0); + assert_eq!(distance_to_dist_sym(2), 1); + assert_eq!(distance_to_dist_sym(3), 2); + assert_eq!(distance_to_dist_sym(4), 3); + assert_eq!(distance_to_dist_sym(5), 4); + assert_eq!(distance_to_dist_sym(7), 5); + assert_eq!(distance_to_dist_sym(9), 6); + assert_eq!(distance_to_dist_sym(13), 7); + assert_eq!(distance_to_dist_sym(18), 8); + assert_eq!(distance_to_dist_sym(257), 16); + } + + fn roundtrip(data: &[u8]) { + let compressed = compress_to_vec(data); + //let decompressed = miniz_oxide::inflate::decompress_to_vec_zlib(&compressed).unwrap(); + let decompressed = crate::decompress_to_vec(&compressed).unwrap(); + assert_eq!(&decompressed, data); + } + + #[test] + fn it_works() { + roundtrip(b"Hello world!"); + } + + #[test] + fn constant() { + roundtrip(&vec![0; 2048]); + roundtrip(&vec![5; 2048]); + roundtrip(&vec![128; 2048]); + roundtrip(&vec![254; 2048]); + } + + #[test] + fn random() { + let mut rng = rand::thread_rng(); + let mut data = vec![0; 2048]; + for _ in 0..10 { + for byte in &mut data { + *byte = rng.gen(); + } + roundtrip(&data); + } + } +} diff --git a/src/compress/slow.rs b/src/compress/slow.rs new file mode 100644 index 0000000..580cf2f --- /dev/null +++ b/src/compress/slow.rs @@ -0,0 +1,148 @@ +use std::io::{self, Write}; + +use super::{BitWriter, Symbol}; + +use super::bt_matchfinder::BTreeMatchFinder; + +pub(super) struct SlowCompressor { + match_finder: BTreeMatchFinder, + + min_match: u8, + skip_ahead_shift: u8, + search_depth: u16, + nice_length: u16, + max_lazy: u16, +} + +impl SlowCompressor { + pub fn new() -> Self { + Self { + match_finder: BTreeMatchFinder::new(3), + + min_match: 4, + skip_ahead_shift: 9, + search_depth: 64, + nice_length: 258, + max_lazy: 32, + } + } + + pub fn compress(&mut self, writer: &mut BitWriter, data: &[u8]) -> io::Result<()> { + let mut ip = 0; + + let mut length = 0; + let mut distance = 0; + let mut match_start = 0; + + while ip < data.len() { + let mut symbols = Vec::new(); + let mut num_symbols = 0; + + let mut last_match = ip; + 'outer: while symbols.len() < 16384 && ip + 8 < data.len() { + let current = u64::from_le_bytes(data[ip..][..8].try_into().unwrap()); + + if length == 0 { + // if current == 0 { + // while ip > last_match && data[ip - 1] == 0 { + // ip -= 1; + // } + + // if ip == 0 || data[ip - 1] != 0 { + // ip += 1; + // } + + // symbols.push(Symbol::LiteralRun { + // start: last_match as u32, + // end: ip as u32, + // }); + // num_symbols += ip - last_match; + + // let mut run_length = 0; + // while ip < data.len() && data[ip] == 0 && run_length < 258 { + // run_length += 1; + // ip += 1; + // } + + // symbols.push(Symbol::Backref { + // length: run_length as u16, + // distance: 1, + // dist_sym: 0, + // }); + // num_symbols += 1; + + // last_match = ip; + + // length = 0; + // continue; + // } + + 
(length, distance, match_start) = + self.match_finder.get_and_insert(&data, ip, current, 4); + } + + if length >= 3 { + if + /*match_start + length as usize > ip + 1 + && length < self.max_lazy + &&*/ + ip + length as usize + 9 <= data.len() { + ip += 1; + let (next_length, next_distance, next_match_start) = self + .match_finder + .get_and_insert(&data, ip, current >> 8, length + 1); + if next_length > 0 && match_start + 1 >= next_match_start { + assert!(next_length > length); + distance = next_distance; + length = next_length; + match_start = next_match_start; + continue; + } + } + assert!(last_match <= match_start); + + symbols.push(Symbol::LiteralRun { + start: last_match as u32, + end: match_start as u32, + }); + num_symbols += match_start - last_match; + + symbols.push(Symbol::Backref { + length: length as u16, + distance, + dist_sym: super::distance_to_dist_sym(distance), + }); + num_symbols += 1; + + let match_end = match_start + length as usize; + + if match_end + 8 < data.len() { + for j in (ip + 1)..match_end { + let v = u64::from_le_bytes(data[j..][..8].try_into().unwrap()); + self.match_finder.insert(data, v, j); + } + } + + ip = match_end; + last_match = match_end; + + length = 0; + continue 'outer; + } + + ip += 1; + } + if data.len() <= ip + 8 { + symbols.push(Symbol::LiteralRun { + start: last_match as u32, + end: data.len() as u32, + }); + ip = data.len(); + } + + super::write_block(writer, data, &symbols, ip == data.len())?; + } + + Ok(()) + } +} diff --git a/src/compress.rs b/src/compress/ultrafast.rs similarity index 68% rename from src/compress.rs rename to src/compress/ultrafast.rs index b55116e..b7e8ac0 100644 --- a/src/compress.rs +++ b/src/compress/ultrafast.rs @@ -1,18 +1,23 @@ use simd_adler32::Adler32; -use std::io::{self, Seek, SeekFrom, Write}; +use std::io::{self, Write}; use crate::tables::{ BITMASKS, HUFFMAN_CODES, HUFFMAN_LENGTHS, LENGTH_TO_LEN_EXTRA, LENGTH_TO_SYMBOL, }; -/// Compressor that produces fdeflate compressed streams. -pub struct Compressor { +/// Very fast zlib compressor that trades compression ratio for speed. +/// +/// This compressor is designed to be fast and efficient for filtered PNG data pixel data, where it +/// is expected that there will be many long runs of zeros, and the rest of the data is mostly small +/// differences from the previous pixel. On data data that does not match this pattern, it may +/// produce output that is *larger* than the input. +pub struct UltraFastCompressor { checksum: Adler32, buffer: u64, nbits: u8, writer: W, } -impl Compressor { +impl UltraFastCompressor { fn write_bits(&mut self, bits: u64, nbits: u8) -> io::Result<()> { debug_assert!(nbits <= 64); @@ -181,97 +186,17 @@ impl Compressor { } } -/// Compressor that only writes the stored blocks. -/// -/// This is useful for writing files that are not compressed, but still need to be wrapped in a -/// zlib stream. -pub struct StoredOnlyCompressor { - writer: W, - checksum: Adler32, - block_bytes: u16, -} -impl StoredOnlyCompressor { - /// Creates a new `StoredOnlyCompressor` that writes to the given writer. 
- pub fn new(mut writer: W) -> io::Result { - writer.write_all(&[0x78, 0x01])?; // zlib header - writer.write_all(&[0; 5])?; // placeholder stored block header - - Ok(Self { - writer, - checksum: Adler32::new(), - block_bytes: 0, - }) - } - - fn set_block_header(&mut self, size: u16, last: bool) -> io::Result<()> { - self.writer.seek(SeekFrom::Current(-(size as i64 + 5)))?; - self.writer.write_all(&[ - last as u8, - (size & 0xFF) as u8, - ((size >> 8) & 0xFF) as u8, - (!size & 0xFF) as u8, - ((!size >> 8) & 0xFF) as u8, - ])?; - self.writer.seek(SeekFrom::Current(size as i64))?; - - Ok(()) - } - - /// Writes the given data to the underlying writer. - pub fn write_data(&mut self, mut data: &[u8]) -> io::Result<()> { - self.checksum.write(data); - while !data.is_empty() { - if self.block_bytes == u16::MAX { - self.set_block_header(u16::MAX, false)?; - self.writer.write_all(&[0; 5])?; // placeholder stored block header - self.block_bytes = 0; - } - - let prefix_bytes = data.len().min((u16::MAX - self.block_bytes) as usize); - self.writer.write_all(&data[..prefix_bytes])?; - self.block_bytes += prefix_bytes as u16; - data = &data[prefix_bytes..]; - } - - Ok(()) - } - - /// Finish writing the final block and return the underlying writer. - pub fn finish(mut self) -> io::Result { - self.set_block_header(self.block_bytes, true)?; - - // Write Adler32 checksum - let checksum: u32 = self.checksum.finish(); - self.writer - .write_all(checksum.to_be_bytes().as_ref()) - .unwrap(); - - Ok(self.writer) - } -} -impl StoredOnlyCompressor { - /// Return the number of bytes that will be written to the output stream - /// for the given input size. Because this compressor only writes stored blocks, - /// the output size is always slightly *larger* than the input size. - pub fn compressed_size(raw_size: usize) -> usize { - (raw_size.saturating_sub(1) / u16::MAX as usize) * (u16::MAX as usize + 5) - + (raw_size % u16::MAX as usize + 5) - + 6 - } -} - -/// Compresses the given data. 
-pub fn compress_to_vec(input: &[u8]) -> Vec { - let mut compressor = Compressor::new(Vec::with_capacity(input.len() / 4)).unwrap(); - compressor.write_data(input).unwrap(); - compressor.finish().unwrap() -} - #[cfg(test)] mod tests { use super::*; use rand::Rng; + pub fn compress_to_vec(input: &[u8]) -> Vec { + let mut compressor = UltraFastCompressor::new(Vec::with_capacity(input.len() / 4)).unwrap(); + compressor.write_data(input).unwrap(); + compressor.finish().unwrap() + } + fn roundtrip(data: &[u8]) { let compressed = compress_to_vec(data); let decompressed = miniz_oxide::inflate::decompress_to_vec_zlib(&compressed).unwrap(); diff --git a/src/decompress.rs b/src/decompress.rs index 2b853a0..48f09f5 100644 --- a/src/decompress.rs +++ b/src/decompress.rs @@ -1309,6 +1309,7 @@ mod tests { } #[test] + #[ignore] fn zero_length() { let mut compressed = crate::compress_to_vec(b"").to_vec(); diff --git a/src/lib.rs b/src/lib.rs index e273699..627dca1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,7 +26,10 @@ mod decompress; mod huffman; mod tables; -pub use compress::{compress_to_vec, Compressor, StoredOnlyCompressor}; +pub use compress::{ + compress_to_vec, compress_to_vec_with_level, ultrafast::UltraFastCompressor, Compressor, + StoredOnlyCompressor, +}; pub use decompress::{ decompress_to_vec, decompress_to_vec_bounded, BoundedDecompressionError, DecompressionError, Decompressor, diff --git a/src/tables.rs b/src/tables.rs index 567565a..13069c1 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -19,6 +19,7 @@ pub(crate) const HUFFMAN_LENGTHS: [u8; 286] = [ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 9, ]; +#[allow(unused)] pub(crate) const HUFFMAN_CODES: [u16; 286] = match crate::compute_codes(&HUFFMAN_LENGTHS) { Some(codes) => codes, None => panic!("HUFFMAN_LENGTHS is invalid"),