diff --git a/Makefile b/Makefile index 502a851..9b0109d 100644 --- a/Makefile +++ b/Makefile @@ -1,21 +1,25 @@ all: +$(MAKE) -C python +$(MAKE) -C js + +$(MAKE) -C rust +$(MAKE) -C docs html deps: +$(MAKE) -C python deps +$(MAKE) -C js deps + +$(MAKE) -C rust deps +$(MAKE) -C docs deps check: +$(MAKE) -C python check +$(MAKE) -C js check + +$(MAKE) -C rust check +$(MAKE) -C docs doctest clean: +$(MAKE) -C python clean +$(MAKE) -C js clean + +$(MAKE) -C rust clean +$(MAKE) -C docs clean .PHONY: all deps check clean diff --git a/README.rst b/README.rst index 46346ae..1c8c742 100644 --- a/README.rst +++ b/README.rst @@ -27,9 +27,9 @@ For example: Languages --------- -|PyPI version| |npm version| +|PyPI version| |npm version| |crates.io version| -bistring is available in multiple languages, currently `Python `_ and `JavaScript/TypeScript `_. +bistring is available in multiple languages, currently `Python `_, `JavaScript/TypeScript `_, and `Rust `_. Ports to other languages are planned for the near future. The code is structured similarly in each language to make it easy to share algorithms, tests, and fixes between them. 
@@ -65,3 +65,5 @@ For more information see the `Code of Conduct FAQ "] +edition = "2018" +description = "Bidirectionally transformed strings" +readme = "README.md" +repository = "https://github.com/microsoft/bistring" +license = "MIT" +keywords = ["string", "text", "nlp"] +categories = ["text-processing", "data-structures"] + +[dependencies] + +[dev-dependencies] diff --git a/rust/Makefile b/rust/Makefile new file mode 100644 index 0000000..96fcd30 --- /dev/null +++ b/rust/Makefile @@ -0,0 +1,13 @@ +all: + cargo build + +deps: + cargo fetch + +check: + cargo test + +clean: + cargo clean + +.PHONY: all deps check clean diff --git a/rust/README.md b/rust/README.md new file mode 100644 index 0000000..456ca93 --- /dev/null +++ b/rust/README.md @@ -0,0 +1,27 @@ +bistring +======== + +[![crates.io version](https://img.shields.io/crates/v/bistring)](https://crates.io/crates/bistring) + +The bistring library provides non-destructive versions of common string processing operations like normalization, case folding, and find/replace. +Each bistring remembers the original string, and how its substrings map to substrings of the modified version. + +For example: + +```rust +use bistring::BiString; + +let mut s = BiString::from("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶"); +s = s.nfkd(); // Unicode normalization +s = s.casefold(); // Case-insensitivity +s = s.replace("🦊", "fox"); // Replace emoji with text +s = s.replace("🐶", "dog"); +s = s.replace(/[^\w\s]+/g, ""); // Strip everything but letters and spaces +let slice = &s[..19]; // Extract a substring +// The modified substring, after changes +assert_eq!(slice.modified(), "the quick brown fox"); +// The original substring, before changes +assert_eq!(slice.original(), "𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊"); +``` + +This allows you to perform very aggressive text processing completely invisibly. 
diff --git a/rust/src/align.rs b/rust/src/align.rs new file mode 100644 index 0000000..e76baae --- /dev/null +++ b/rust/src/align.rs @@ -0,0 +1,1015 @@ +//! Sequence alignments. + +use crate::bound::Bounds; + +use std::fmt::{self, Debug, Formatter}; +use std::iter::{FromIterator, IntoIterator}; +use std::mem; +use std::ops::{Add, Bound, Range, RangeBounds}; + +/// An alignment between two related sequences. +/// +/// Consider this alignment between two strings: +/// +/// ```text +/// |it's| |aligned!| +/// | \ \ | +/// |it is| |aligned| +/// ``` +/// +/// An alignment stores all the indices that are known to correspond between the original and +/// modified sequences. For the above example, it would be +/// +/// # use bistring::Alignment; +/// # use std::iter::FromIterator; +/// let a = Alignment::from_iter([ +/// (0, 0), +/// (4, 5), +/// (5, 6), +/// (13, 13), +/// ]); +/// +/// Alignments can be used to answer questions like, "what's the smallest range of the original +/// sequence that is guaranteed to contain this part of the modified sequence?" 
For example, the
+/// range `0..5` ("it is") is known to match the range `0..4` ("it's") of the original sequence:
+///
+///     # use bistring::Alignment;
+///     # use std::iter::FromIterator;
+///     # let a = Alignment::from_iter([(0, 0), (4, 5), (5, 6), (13, 13)]);
+///     assert_eq!(a.to_original_range(0..5), 0..4);
+///
+/// Results may be imprecise if the alignment is too coarse to match the exact inputs:
+///
+///     # use bistring::Alignment;
+///     # use std::iter::FromIterator;
+///     # let a = Alignment::from_iter([(0, 0), (4, 5), (5, 6), (13, 13)]);
+///     assert_eq!(a.to_original_range(0..2), 0..4);
+///
+/// A more granular alignment like this:
+///
+/// ```text
+/// |i|t|'s| |a|l|i|g|n|e|d|!|
+/// | | | \ \ \ \ \ \ \ \ \ /
+/// |i|t| is| |a|l|i|g|n|e|d|
+/// ```
+///
+///     # use bistring::Alignment;
+///     # use std::iter::FromIterator;
+///     let a = Alignment::from_iter([
+///         (0, 0), (1, 1), (2, 2), (4, 5), (5, 6), (6, 7), (7, 8),
+///         (8, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 13),
+///     ]);
+///
+/// Can be more precise:
+///
+///     # use bistring::Alignment;
+///     # use std::iter::FromIterator;
+///     # let a = Alignment::from_iter([
+///     #     (0, 0), (1, 1), (2, 2), (4, 5), (5, 6), (6, 7), (7, 8),
+///     #     (8, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 13),
+///     # ]);
+///     assert_eq!(a.to_original_range(0..2), 0..2);
+#[derive(Clone, Default, Eq, PartialEq)]
+pub struct Alignment {
+    indices: Vec<(usize, usize)>, // (original, modified) position pairs; `push()` asserts neither side moves backwards
+}
+
+impl Alignment {
+    /// Create a new empty alignment (no index pairs at all, via `Default`).
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Create an identity alignment.
+    ///
+    /// An identity alignment aligns sequence positions with themselves.
For example, + /// + /// # use bistring::Alignment; + /// let alignment = Alignment::identity(0..=8); + /// assert_eq!(alignment.to_original_range(3..5), 3..5); + /// assert_eq!(alignment.to_modified_range(3..5), 3..5); + pub fn identity(indices: impl IntoIterator) -> Self { + indices.into_iter() + .map(|i| (i, i)) + .collect() + } + + /// Infer the alignment between two sequences with the lowest edit distance. + /// + /// # use bistring::Alignment; + /// assert_eq!( + /// Alignment::infer("color".chars(), "color".chars()), + /// Alignment::identity(0..=5), + /// ); + /// + /// let a = Alignment::infer("color".chars(), "colour".chars()); + /// // "o" <-> "ou" + /// assert_eq!(a.to_original_range(3..5), 3..4); + /// + /// # Warning + /// + /// This operation has time complexity `O(N*M)`, where `N` and `M` are the lengths of the + /// original and modified sequences, and so should only be used for relatively short sequences. + pub fn infer(original: O, modified: M) -> Self + where + O: IntoIterator, + M: IntoIterator, + T: PartialEq, + { + Self::infer_with_costs(original, modified, |e| { + match e { + Edit::Replacement(t, u) => (t != u) as i32, + _ => 1, + } + }) + } + + /// Infer the alignment between two sequences with the lowest edit distance. + /// + /// This function is similar to [`Self::infer()`], but allows a custom cost function to be + /// specified. + /// + /// # Warning + /// + /// This operation has time complexity `O(N*M)`, where `N` and `M` are the lengths of the + /// original and modified sequences, and so should only be used for relatively short sequences. 
+ pub fn infer_with_costs(original: O, modified: M, cost_fn: F) -> Self + where + O: IntoIterator, + M: IntoIterator, + F: Fn(Edit<&T, &U>) -> N, + N: Cost, + { + let original: Vec = original.into_iter().collect(); + let modified: Vec = modified.into_iter().collect(); + + if original.len() < modified.len() { + // Keep the memory consumption bounded by the smaller of the two sequences + Self::infer_recursive(&modified, &original, |e| cost_fn(e.inverse())) + .inverse() + } else { + Self::infer_recursive(&original, &modified, &cost_fn) + } + } + + /// [Hirschberg's algorithm] for computing optimal alignments in linear space. + /// + /// [Hirschberg's algorithm]: https://en.wikipedia.org/wiki/Hirschberg's_algorithm + fn infer_recursive(original: &[T], modified: &[U], cost_fn: F) -> Self + where + F: Copy + Fn(Edit<&T, &U>) -> N, + N: Cost, + { + if original.len() <= 1 || modified.len() <= 1 { + return Self::infer_matrix(original, modified, cost_fn); + } + + let omid = original.len() / 2; + let (oleft, oright) = original.split_at(omid); + + let lcosts = Self::infer_costs(oleft, modified, false, cost_fn); + let rcosts = Self::infer_costs(oright, modified, true, cost_fn); + + let mut mmid = 0; + let mut min = lcosts[0] + rcosts[0]; + // min_by_key would be nice + for i in 1..lcosts.len() { + let cost = lcosts[i] + rcosts[i]; + if cost < min { + mmid = i; + min = cost; + } + } + let (mleft, mright) = modified.split_at(mmid); + + let mut left = Self::infer_recursive(oleft, mleft, cost_fn); + let right = Self::infer_recursive(oright, mright, cost_fn); + left.extend(right.shifted(omid as isize, mmid as isize)); + left + } + + /// The [Needlemanโ€“Wunsch] or [Wagnerโ€“Fischer] algorithm, using the entire matrix to compute the + /// optimal alignment. 
+ /// + /// [Needlemanโ€“Wunsch]: https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm + /// [Wagnerโ€“Fischer]: https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm + fn infer_matrix(original: &[T], modified: &[U], cost_fn: F) -> Self + where + F: Fn(Edit<&T, &U>) -> N, + N: Cost, + { + let rows = 1 + original.len(); + let cols = 1 + modified.len(); + + let mut matrix = Vec::with_capacity(rows * cols); + matrix.push((N::default(), 0, 0)); + for (j, m) in modified.iter().enumerate() { + let cost = matrix[j].0 + cost_fn(Edit::Insertion(m)); + matrix.push((cost, 0, j)); + } + + let mut prev = 0; + for (i, o) in original.iter().enumerate() { + let next = prev + cols; + let cost = matrix[prev].0 + cost_fn(Edit::Deletion(o)); + matrix.push((cost, i, 0)); + + for (j, m) in modified.iter().enumerate() { + let mut cost = matrix[prev + j].0 + cost_fn(Edit::Replacement(o, m)); + let (mut x, mut y) = (i, j); + + let del_cost = matrix[prev + j + 1].0 + cost_fn(Edit::Deletion(o)); + if del_cost < cost { + cost = del_cost; + x = i; + y = j + 1; + } + + let ins_cost = matrix[next + j].0 + cost_fn(Edit::Insertion(m)); + if ins_cost < cost { + cost = ins_cost; + x = i + 1; + y = j; + } + + matrix.push((cost, x, y)); + } + + prev = next; + } + + let mut result = Vec::new(); + let mut i = rows - 1; + let mut j = cols - 1; + loop { + result.push((i, j)); + if i == 0 && j == 0 { + break; + } + let prev = matrix[i * cols + j]; + i = prev.1; + j = prev.2; + } + + result + .into_iter() + .rev() + .collect() + } + + /// Index a sequence from the beginning or end. + fn index(seq: &[T], i: usize, reverse: bool) -> &T { + if reverse { + &seq[seq.len() - i - 1] + } else { + &seq[i] + } + } + + /// The [Needlemanโ€“Wunsch] or [Wagnerโ€“Fischer] algorithm. Here we use it in a way that only + /// computes the final row of costs, without finding the alignment itself. Hirschberg's + /// algorithm uses it as a subroutine to find the optimal alignment in less than O(N*M) space. 
+ /// + /// [Needlemanโ€“Wunsch]: https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm + /// [Wagnerโ€“Fischer]: https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm + fn infer_costs(original: &[T], modified: &[U], reverse: bool, cost_fn: F) -> Vec + where + F: Fn(Edit<&T, &U>) -> N, + N: Cost, + { + let mlen = modified.len(); + + let mut row = Vec::with_capacity(mlen + 1); + row.push(N::default()); + + for j in 0..mlen { + let m = Self::index(modified, j, reverse); + let cost = row[j] + cost_fn(Edit::Insertion(m)); + row.push(cost); + } + + let mut prev = vec![N::default(); row.len()]; + + for i in 0..original.len() { + mem::swap(&mut row, &mut prev); + + let o = Self::index(original, i, reverse); + row[0] = prev[0] + cost_fn(Edit::Deletion(o)); + + for j in 0..mlen { + let m = Self::index(modified, j, reverse); + + let sub_cost = prev[j] + cost_fn(Edit::Replacement(o, m)); + let del_cost = prev[j + 1] + cost_fn(Edit::Deletion(o)); + let ins_cost = row[j] + cost_fn(Edit::Insertion(m)); + + let mut min_cost = sub_cost; + if del_cost < min_cost { + min_cost = del_cost; + } + if ins_cost < min_cost { + min_cost = ins_cost; + } + + row[j + 1] = min_cost; + } + } + + if reverse { + row.reverse(); + } + + row + } + + /// Get the number of indices in this alignment. + /// + /// Note that this is not the same as the size of either the original or modified sequence. For + /// that, see [`original_range()`](#method.original_range) or + /// [`modified_range()`](#method.modified_range). + pub fn len(&self) -> usize { + self.indices.len() + } + + /// Iterate over the indices in this alignment. + pub fn iter(&self) -> Iter<'_> { + (&self).into_iter() + } + + /// Extract a slice of this alignment. 
+ pub fn slice(&self, range: impl RangeBounds) -> Slice<'_> { + let start = match range.start_bound() { + Bound::Included(&n) => n, + Bound::Excluded(&n) => n + 1, + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&n) => n + 1, + Bound::Excluded(&n) => n, + Bound::Unbounded => self.len(), + }; + + Slice::new(&self.indices[start..end]) + } + + /// Add a new pair of indices to this alignment. + /// + /// The original sequence position `o` will be considered to correspond to the modified sequence + /// position `m`. + /// + /// # Panics + /// + /// If either the original or modified sequence position moves backwards. + pub fn push(&mut self, o: usize, m: usize) { + if !self.indices.is_empty() { + let (ol, ml) = self.indices[self.len() - 1]; + assert!(o >= ol); + assert!(m >= ml); + if (o, m) == (ol, ml) { + return; + } + } + + self.indices.push((o, m)); + } + + /// Get the bounds of the original sequence as a [`Range`]. + pub fn original_range(&self) -> Range { + let (start, _) = self.indices[0]; + let (end, _) = self.indices[self.len() - 1]; + start..end + } + + /// Get the bounds of the modified sequence as a [`Range`]. + pub fn modified_range(&self) -> Range { + let (_, start) = self.indices[0]; + let (_, end) = self.indices[self.len() - 1]; + start..end + } + + /// Maps a subrange of the modified sequence to the original sequence. + /// + /// Any [range-like](RangeBounds) type is accepted, for example: + /// + /// # use bistring::Alignment; + /// // (0, 0), (2, 1), (4, 2), ..., (16, 8) + /// let a: Alignment = (0..=8) + /// .map(|i| (2 * i, i)) + /// .collect(); + /// + /// assert_eq!(a.to_original_range(3.. 5), 6..10); + /// assert_eq!(a.to_original_range(3..=5), 6..12); + /// assert_eq!(a.to_original_range(3.. 
), 6..16); + /// assert_eq!(a.to_original_range( ..=5), 0..12); + pub fn to_original_range(&self, range: impl RangeBounds) -> Range { + self.to_range(range, |(o, m)| (o, m)) + } + + /// Maps a subrange of the original sequence to the modified sequence. + /// + /// Any [range-like](RangeBounds) type is accepted, for example: + /// + /// # use bistring::Alignment; + /// // (0, 0), (2, 1), (4, 2), ..., (16, 8) + /// let a: Alignment = (0..=8) + /// .map(|i| (2 * i, i)) + /// .collect(); + /// + /// assert_eq!(a.to_modified_range(6.. 10), 3..5); + /// assert_eq!(a.to_modified_range(6..=10), 3..6); + /// assert_eq!(a.to_modified_range(6.. ), 3..8); + /// assert_eq!(a.to_modified_range( ..=10), 0..6); + pub fn to_modified_range(&self, range: impl RangeBounds) -> Range { + self.to_range(range, |(o, m)| (m, o)) + } + + /// Shared implementation for to_{original,modified}_range(). + fn to_range(&self, range: R, which: F) -> Range + where + R: RangeBounds, + F: Fn((usize, usize)) -> (usize, usize), + { + let (lb, ub) = self.to_bounds(range, |i| which(i).1); + let i = which(self.indices[lb]).0; + let j = which(self.indices[ub]).0; + i..j + } + + /// Find the bounds of an interval on one side of this alignment. + fn to_bounds(&self, range: R, which: F) -> (usize, usize) + where + R: RangeBounds, + F: Copy + Fn((usize, usize)) -> usize, + { + let lb = self.lower_bound(range.start_bound(), which); + let ub = self.upper_bound(range.end_bound(), which); + (lb, ub) + } + + /// Find the lower bound of an interval on one side of this alignment. + fn lower_bound(&self, start: Bound<&usize>, which: impl Fn((usize, usize)) -> usize) -> usize { + let start = match start { + Bound::Included(&n) => n, + Bound::Excluded(&n) => n + 1, + Bound::Unbounded => return 0, + }; + + let lb = self.indices.partition_point(|&i| which(i) <= start); + assert!(lb > 0); + lb - 1 + } + + /// Finds the upper bound of an interval on one side of this alignment. 
+    fn upper_bound(&self, end: Bound<&usize>, which: impl Fn((usize, usize)) -> usize) -> usize {
+        let end = match end { // normalize to an exclusive end position
+            Bound::Included(&n) => n + 1,
+            Bound::Excluded(&n) => n,
+            Bound::Unbounded => return self.len() - 1, // unbounded: index of the last pair
+        };
+
+        let ub = self.indices.partition_point(|&i| which(i) < end); // first pair whose selected side is >= end
+        assert!(ub < self.len()); // panics if `end` lies beyond the last aligned position
+        ub
+    }
+
+    /// Slice this alignment by a range of the original sequence.
+    ///
+    ///     # use bistring::Alignment;
+    ///     let alignment: Alignment = (0..=5)
+    ///         .map(|i| (i + 1, i))
+    ///         .collect();
+    ///     let slice = alignment.slice_by_original(2..4);
+    ///     assert!(slice.iter().eq([(2, 1), (3, 2), (4, 3)]));
+    pub fn slice_by_original(&self, range: impl RangeBounds<usize>) -> Slice<'_> {
+        let bounds = Bounds::new(range);
+        let (lb, ub) = self.to_bounds(bounds, |(o, _m)| o); // search on the original side
+        Slice::clamped(&self.indices[lb..=ub], bounds, ..) // clamp original positions only
+    }
+
+    /// Slice this alignment by a range of the modified sequence.
+    ///
+    ///     # use bistring::Alignment;
+    ///     let alignment: Alignment = (0..=5)
+    ///         .map(|i| (i + 1, i))
+    ///         .collect();
+    ///     let slice = alignment.slice_by_modified(1..3);
+    ///     assert!(slice.iter().eq([(2, 1), (3, 2), (4, 3)]));
+    pub fn slice_by_modified(&self, range: impl RangeBounds<usize>) -> Slice<'_> {
+        let bounds = Bounds::new(range);
+        let (lb, ub) = self.to_bounds(bounds, |(_o, m)| m); // search on the modified side
+        Slice::clamped(&self.indices[lb..=ub], .., bounds) // clamp modified positions only
+    }
+
+    /// Returns a view of this alignment with sequence indices shifted by `o` (original) and `m` (modified).
+    pub fn shifted(&self, o: isize, m: isize) -> Slice<'_> {
+        self.slice(..).shifted(o, m)
+    }
+
+    /// Returns a view of this alignment shifted to start at (0, 0).
+    pub fn shifted_to_origin(&self) -> Slice<'_> {
+        self.slice(..).shifted_to_origin()
+    }
+
+    /// Returns a new alignment equivalent to applying this one first, then the `other`.
+ pub fn compose(&self, other: &Alignment) -> Self { + assert_eq!(self.modified_range(), other.original_range()); + + let mut composed = Self::new(); + + let mut i = 0; + let i_max = self.len(); + + let mut j = 0; + let j_max = other.len(); + + while i < i_max { + // Map self.original[i] to its lower bound in other + while self.indices[i].1 > other.indices[j].0 { + j += 1; + } + while self.indices[i].1 < other.indices[j].0 + && self.indices[i + 1].1 <= other.indices[j].0 + { + i += 1; + } + composed.push(self.indices[i].0, other.indices[j].1); + + // Map self.original[i] to its upper bound in other (if it's different) + while i + 1 < i_max && self.indices[i].0 == self.indices[i + 1].0 { + i += 1; + } + + let mut needs_upper = false; + while j + 1 < j_max && self.indices[i].1 >= other.indices[j + 1].0 { + needs_upper = true; + j += 1; + } + if needs_upper { + composed.push(self.indices[i].0, other.indices[j].1); + } + + i += 1; + } + + composed + } + + /// Returns the inverse of this alignment, swapping the original and modified sequences. + pub fn inverse(&self) -> Self { + self.iter() + .map(|(o, m)| (m, o)) + .collect() + } +} + +impl Debug for Alignment { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "[")?; + + let mut comma = false; + for (o, m) in self.iter() { + if comma { + write!(f, ", ")?; + } + write!(f, "{}โ‡‹{}", o, m)?; + comma = true; + } + + write!(f, "]") + } +} + +impl<'a> From> for Alignment { + fn from(slice: Slice<'a>) -> Alignment { + Self::from_iter(slice) + } +} + +impl FromIterator<(usize, usize)> for Alignment { + fn from_iter>(items: I) -> Self { + let mut alignment = Self::new(); + for (o, m) in items { + alignment.push(o, m); + } + alignment + } +} + +/// An iterator over the indices in an alignment. 
+#[derive(Debug)] +pub struct IntoIter(std::vec::IntoIter<(usize, usize)>); + +impl Iterator for IntoIter { + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + self.0.next() + } +} + +impl IntoIterator for Alignment { + type Item = (usize, usize); + type IntoIter = IntoIter; + + fn into_iter(self) -> Self::IntoIter { + IntoIter(self.indices.into_iter()) + } +} + +/// An iterator over the indices in an alignment. +#[derive(Debug)] +pub struct Iter<'a>(std::slice::Iter<'a, (usize, usize)>); + +impl<'a> Iterator for Iter<'a> { + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + self.0.next().copied() + } +} + +impl<'a> IntoIterator for &'a Alignment { + type Item = (usize, usize); + type IntoIter = Iter<'a>; + + fn into_iter(self) -> Self::IntoIter { + Iter(self.indices.iter()) + } +} + +impl Extend<(usize, usize)> for Alignment { + fn extend>(&mut self, iter: I) { + for (o, m) in iter { + self.push(o, m); + } + } +} + +/// A slice of a sequence alignment. +#[derive(Clone, Copy, Debug)] +pub struct Slice<'a> { + slice: &'a [(usize, usize)], + o_bounds: Bounds, + m_bounds: Bounds, + o_shift: isize, + m_shift: isize, +} + +impl<'a> Slice<'a> { + /// Create a new simple slice. + fn new(slice: &'a [(usize, usize)]) -> Self { + Self::clamped(slice, .., ..) + } + + /// Create a new clamped slice. + fn clamped( + slice: &'a [(usize, usize)], + o_range: impl RangeBounds, + m_range: impl RangeBounds, + ) -> Self { + Self { + slice, + o_bounds: Bounds::new(o_range), + m_bounds: Bounds::new(m_range), + o_shift: 0, + m_shift: 0, + } + } + + /// Get the number of indices in this slice. + /// + /// Note that this is not the same as the size of either the original or modified sequence. For + /// that, see [`original_range()`](#method.original_range) or + /// [`modified_range()`](#method.modified_range). + pub fn len(&self) -> usize { + self.slice.len() + } + + /// Iterate over the indices in this slice. 
+ pub fn iter(&self) -> SliceIter<'_> { + self.into_iter() + } + + /// Get the bounds of this slice of the original sequence as a [`Range`]. + pub fn original_range(&self) -> Range { + let (mut start, _) = self.slice[0]; + let (mut end, _) = self.slice[self.len() - 1]; + + start = self.o_bounds.clamp(start); + end = self.o_bounds.clamp(end); + + start..end + } + + /// Get the bounds of this slice of the modified sequence as a [`Range`]. + pub fn modified_range(&self) -> Range { + let (_, mut start) = self.slice[0]; + let (_, mut end) = self.slice[self.len() - 1]; + + start = self.m_bounds.clamp(start); + end = self.m_bounds.clamp(end); + + start..end + } + + /// Returns a view of this slice with sequence indices shifted. + pub fn shifted(&self, o: isize, m: isize) -> Self { + Self { + o_shift: self.o_shift + o, + m_shift: self.m_shift + m, + ..*self + } + } + + /// Returns a view of this slice shifted to start at (0, 0). + pub fn shifted_to_origin(&self) -> Self { + let (mut o, mut m) = self.slice[0]; + o = self.o_bounds.clamp(o); + m = self.m_bounds.clamp(m); + self.shifted(-(o as isize), -(m as isize)) + } +} + +impl<'a, 'b> PartialEq> for Slice<'a> { + fn eq(&self, rhs: &Slice<'b>) -> bool { + self.iter().eq(rhs) + } +} + +impl<'a> Eq for Slice<'a> {} + +/// An iterator over a Slice. 
+#[derive(Debug)] +pub struct SliceIter<'a> { + iter: std::slice::Iter<'a, (usize, usize)>, + o_bounds: Bounds, + m_bounds: Bounds, + o_shift: isize, + m_shift: isize, +} + +impl<'a> Iterator for SliceIter<'a> { + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + self.iter + .next() + .copied() + .map(|(o, m)| ( + self.o_bounds.clamp(o).wrapping_add(self.o_shift as usize), + self.m_bounds.clamp(m).wrapping_add(self.m_shift as usize), + )) + } +} + +impl<'a> IntoIterator for Slice<'a> { + type Item = (usize, usize); + type IntoIter = SliceIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + let Self { slice, o_bounds, m_bounds, o_shift, m_shift } = self; + SliceIter { + iter: slice.iter(), + o_bounds, + m_bounds, + o_shift, + m_shift, + } + } +} + +impl<'a> IntoIterator for &Slice<'a> { + type Item = (usize, usize); + type IntoIter = SliceIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + (*self).into_iter() + } +} + +/// A type suitable for edit costs when inferring alignments. +pub trait Cost: Add + Copy + Default + PartialOrd {} + +/// Blanket impl for [Cost]. +impl + Copy + Default + PartialOrd> Cost for T {} + +/// An individual edit, for computing [edit distances]. +/// +/// [edit distances]: https://en.wikipedia.org/wiki/Levenshtein_distance +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Edit { + /// A replacement of one item with another. + Replacement(T, U), + /// The deletion of an item. + Deletion(T), + /// The insertion of an item. + Insertion(U), +} + +impl Edit { + /// Returns the edit that inverts this one. 
+ pub fn inverse(self) -> Edit { + match self { + Edit::Replacement(t, u) => Edit::Replacement(u, t), + Edit::Deletion(t) => Edit::Insertion(t), + Edit::Insertion(u) => Edit::Deletion(u), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_empty() { + let alignment = Alignment::identity([0]); + + assert!(alignment.iter().eq([(0, 0)])); + + assert_eq!(alignment.original_range(), 0..0); + assert_eq!(alignment.modified_range(), 0..0); + + assert_eq!(alignment.to_original_range(0..0), 0..0); + assert_eq!(alignment.to_modified_range(0..0), 0..0); + } + + #[test] + fn test_identity() { + let alignment = Alignment::identity(1..=5); + + assert_eq!( + alignment, + Alignment::from_iter([ + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + ]), + ); + + assert_eq!(alignment.original_range(), 1..5); + assert_eq!(alignment.modified_range(), 1..5); + + assert_eq!(alignment.to_original_range(2..4), 2..4); + assert_eq!(alignment.to_modified_range(2..4), 2..4); + } + + #[test] + fn test_aligning() { + let alignment = Alignment::from_iter([(0, 0), (1, 2), (2, 4), (3, 6)]); + + assert_eq!(alignment.original_range(), 0..3); + assert_eq!(alignment.modified_range(), 0..6); + + assert_eq!(alignment.to_original_range(0..0), 0..0); + assert_eq!(alignment.to_original_range(0..1), 0..1); + assert_eq!(alignment.to_original_range(0..2), 0..1); + assert_eq!(alignment.to_original_range(0..3), 0..2); + assert_eq!(alignment.to_original_range(1..1), 0..1); + assert_eq!(alignment.to_original_range(1..3), 0..2); + assert_eq!(alignment.to_original_range(1..4), 0..2); + assert_eq!(alignment.to_original_range(2..2), 1..1); + assert_eq!(alignment.to_original_range(2..4), 1..2); + assert_eq!(alignment.to_original_range(2..5), 1..3); + assert_eq!(alignment.to_original_range(3..3), 1..2); + + assert_eq!(alignment.to_modified_range(0..0), 0..0); + assert_eq!(alignment.to_modified_range(0..1), 0..2); + assert_eq!(alignment.to_modified_range(0..2), 0..4); + 
assert_eq!(alignment.to_modified_range(0..3), 0..6); + assert_eq!(alignment.to_modified_range(1..1), 2..2); + assert_eq!(alignment.to_modified_range(2..2), 4..4); + } + + #[test] + fn test_slice() { + let alignment = Alignment::from_iter([(0, 0), (1, 2), (2, 4), (3, 6), (4, 8)]); + + let slice = alignment.slice(1..4); + assert!(slice.into_iter().eq([(1, 2), (2, 4), (3, 6)])); + } + + #[test] + fn test_canonicalization() { + assert_eq!( + Alignment::from_iter([ + (0, 0), + (1, 2), + (1, 2), + (2, 4), + ]), + Alignment::from_iter([ + (0, 0), + (1, 2), + (2, 4), + ]), + ); + } + + fn test_composition(first: &Alignment, second: &Alignment) { + let composed = first.compose(second); + + let ob = composed.original_range(); + let mb = composed.modified_range(); + + assert_eq!(ob, first.original_range()); + assert_eq!(mb, second.modified_range()); + + let (of, ol) = (ob.start, ob.end); + let (mf, ml) = (mb.start, mb.end); + + for i in of..=ol { + for j in i..=ol { + assert_eq!( + composed.to_modified_range(i..j), + second.to_modified_range(first.to_modified_range(i..j)), + ); + } + } + + for i in mf..=ml { + for j in i..=ml { + assert_eq!( + composed.to_original_range(i..j), + first.to_original_range(second.to_original_range(i..j)), + ); + } + } + } + + #[test] + fn test_compose() { + let first: Alignment = (0..=3) + .map(|i| (i, 2 * i)) + .collect(); + let second = (0..=6) + .map(|i| (i, 2 * i)) + .collect(); + test_composition(&first, &second) + } + + fn test_identity_composition(alignment: &Alignment) { + let or = alignment.original_range(); + let oident = Alignment::identity(or.start..=or.end); + test_composition(&oident, alignment); + + let mr = alignment.modified_range(); + let mident = Alignment::identity(mr.start..=mr.end); + test_composition(alignment, &mident); + } + + #[test] + fn test_compose_identity() { + let alignment = Alignment::from_iter([ + (0, 2), + (2, 2), + (4, 4), + (6, 6), + (8, 6), + ]); + + // Modified sequence is smaller + 
test_identity_composition(&alignment); + + // Original sequence is smaller + test_identity_composition(&alignment.inverse()); + } + + #[test] + fn test_infer() { + assert_eq!(Alignment::infer("test".chars(), "test".chars()), Alignment::identity(0..=4)); + assert_eq!(Alignment::infer("asdf".chars(), "jkl;".chars()), Alignment::identity(0..=4)); + + assert_eq!( + Alignment::infer("color".chars(), "colour".chars()), + Alignment::from_iter([ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (4, 5), + (5, 6), + ]), + ); + + assert_eq!( + Alignment::infer("color".chars(), "colour".chars()), + Alignment::infer("colour".chars(), "color".chars()).inverse(), + ); + + assert_eq!( + Alignment::infer("ab---".chars(), "ab".chars()), + Alignment::from_iter([ + (0, 0), + (1, 1), + (2, 2), + (3, 2), + (4, 2), + (5, 2), + ]), + ); + } +} diff --git a/rust/src/bound.rs b/rust/src/bound.rs new file mode 100644 index 0000000..85609a3 --- /dev/null +++ b/rust/src/bound.rs @@ -0,0 +1,84 @@ +//! Our unified range type. + +use std::ops::{Bound, Range, RangeBounds}; + +/// A unified range type. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Bounds { + pub start: Option, + pub end: Option, +} + +impl Bounds { + /// Create a new Bounds from an existing range. + pub fn new(range: impl RangeBounds) -> Self { + Self { + start: match range.start_bound() { + Bound::Included(&n) => Some(n), + Bound::Excluded(&n) => Some(n + 1), + Bound::Unbounded => None, + }, + end: match range.end_bound() { + Bound::Included(&n) => Some(n + 1), + Bound::Excluded(&n) => Some(n), + Bound::Unbounded => None, + }, + } + } + + /// Convert these bounds to a concrete Range. + pub fn to_range(&self, len: usize) -> Range { + let start = self.start.unwrap_or(0); + let end = self.end.unwrap_or(len); + start..end + } + + /// Index these bounds by another range. 
+ pub fn slice(&self, range: impl RangeBounds) -> Self { + let offset = self.start.unwrap_or(0); + + let start = match range.start_bound() { + Bound::Included(&n) => Bound::Included(offset + n), + Bound::Excluded(&n) => Bound::Excluded(offset + n), + Bound::Unbounded => copy(self.start_bound()), + }; + + let end = match range.end_bound() { + Bound::Included(&n) => Bound::Included(offset + n), + Bound::Excluded(&n) => Bound::Excluded(offset + n), + Bound::Unbounded => copy(self.end_bound()), + }; + + Self::new((start, end)) + } + + /// Clamp a value to within this range. + pub fn clamp(&self, mut n: usize) -> usize { + if let Some(start) = self.start { + n = n.max(start); + } + if let Some(end) = self.end { + n = n.min(end); + } + n + } +} + +/// Waiting for feature(bound_cloned) +fn copy(bound: Bound<&usize>) -> Bound { + match bound { + Bound::Included(&n) => Bound::Included(n), + Bound::Excluded(&n) => Bound::Excluded(n), + Bound::Unbounded => Bound::Unbounded, + } +} + +impl RangeBounds for Bounds { + fn start_bound(&self) -> Bound<&usize> { + self.start.as_ref().map_or(Bound::Unbounded, |n| Bound::Included(n)) + } + + fn end_bound(&self) -> Bound<&usize> { + self.end.as_ref().map_or(Bound::Unbounded, |n| Bound::Excluded(n)) + } +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs new file mode 100644 index 0000000..ede4a7c --- /dev/null +++ b/rust/src/lib.rs @@ -0,0 +1,13 @@ +//! Bidirectionally transformed strings. 
+ +#![warn(rust_2018_idioms)] + +mod bound; +mod owned; +mod slice; + +pub mod align; + +pub use align::Alignment; +pub use owned::BiString; +pub use slice::BiStr; diff --git a/rust/src/owned.rs b/rust/src/owned.rs new file mode 100644 index 0000000..9bf9c63 --- /dev/null +++ b/rust/src/owned.rs @@ -0,0 +1,231 @@ +use crate::align::Alignment; +use crate::slice::BiStr; + +use std::borrow::Borrow; +use std::fmt::{self, Debug, Formatter}; +use std::iter::{self, FromIterator}; +use std::ops::{Add, AddAssign, Deref, Index, RangeBounds}; + +/// A bidirectionally transformed string. +#[derive(Clone, Eq, PartialEq)] +pub struct BiString { + original: String, + modified: String, + alignment: Alignment, +} + +impl BiString { + /// Create a new BiString with an explicit alignment. + pub fn new( + original: impl Into, + modified: impl Into, + alignment: impl Into, + ) -> Self { + Self::new_impl(original.into(), modified.into(), alignment.into()) + } + + /// Outlined non-generic part of new() + fn new_impl(original: String, modified: String, alignment: Alignment) -> Self { + assert_eq!(alignment.original_range(), 0..original.len()); + assert_eq!(alignment.modified_range(), 0..modified.len()); + + Self { + original, + modified, + alignment, + } + } + + /// Create a new BiString with a course alignment. + pub fn chunk(original: impl Into, modified: impl Into) -> Self { + let original = original.into(); + let modified = modified.into(); + let alignment = Alignment::from_iter([(0, 0), (original.len(), modified.len())]); + Self::new(original, modified, alignment) + } + + /// Create a new BiString with identical original and modified strings. 
+ pub fn from_string(string: impl Into) -> Self { + Self::from_string_impl(string.into()) + } + + /// Outlined non-generic part of from_string() + fn from_string_impl(original: String) -> Self { + let modified = original.clone(); + let alignment = Alignment::identity( + original + .char_indices() + .map(|(i, _c)| i) + .chain(iter::once(original.len())), + ); + Self::new(original, modified, alignment) + } + + /// The original string, before any modifications. + pub fn original(&self) -> &str { + &self.original + } + + /// The current value of the string, after all modifications. + pub fn modified(&self) -> &str { + &self.modified + } + + /// The sequence alignment between the original and modified strings. + pub fn alignment(&self) -> &Alignment { + &self.alignment + } + + /// Append a string to this BiString. + pub fn push_str(&mut self, string: &str) { + let ol = self.original.len(); + let ml = self.modified.len(); + for i in 1..=string.len() { + self.alignment.push(ol + i, ml + i); + } + + self.original.push_str(string); + self.modified.push_str(string); + } + + /// Append another BiString slice to this BiString. + pub fn push_bistr(&mut self, bistring: &BiStr) { + let ol = self.original.len(); + let ml = self.modified.len(); + for (o, m) in bistring.alignment() { + self.alignment.push(ol + o, ml + m); + } + + self.original.push_str(bistring.original()); + self.modified.push_str(bistring.modified()); + } + + /// Make a copy of this bistring with its ASCII characters lowercased. + pub fn to_ascii_lowercase(&self) -> Self { + Self::new(&self.original, self.modified.to_ascii_lowercase(), self.alignment.clone()) + } + + /// Make a copy of this bistring with its ASCII characters uppercased. 
+ pub fn to_ascii_uppercase(&self) -> Self { + Self::new(&self.original, self.modified.to_ascii_uppercase(), self.alignment.clone()) + } +} + +impl Add<&str> for BiString { + type Output = Self; + + fn add(mut self, rhs: &str) -> Self { + self.push_str(rhs); + self + } +} + +impl Add<&BiStr> for BiString { + type Output = Self; + + fn add(mut self, rhs: &BiStr) -> Self { + self.push_bistr(rhs); + self + } +} + +impl Add<&BiString> for BiString { + type Output = Self; + + fn add(mut self, rhs: &BiString) -> Self { + self.push_bistr(rhs); + self + } +} + +impl Add for &str { + type Output = BiString; + + fn add(self, rhs: BiString) -> BiString { + // XXX: Re-use allocation? + let mut bs = BiString::from(self); + bs.push_bistr(&rhs); + bs + } +} + +impl AddAssign<&str> for BiString { + fn add_assign(&mut self, rhs: &str) { + self.push_str(rhs); + } +} + +impl AddAssign<&BiStr> for BiString { + fn add_assign(&mut self, rhs: &BiStr) { + self.push_bistr(rhs); + } +} + +impl AddAssign<&BiString> for BiString { + fn add_assign(&mut self, rhs: &Self) { + self.push_bistr(rhs); + } +} + +impl Borrow for BiString { + fn borrow(&self) -> &BiStr { + &*self + } +} + +impl Debug for BiString { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{:?}", &self[..]) + } +} + +impl Deref for BiString { + type Target = BiStr; + + fn deref(&self) -> &BiStr { + BiStr::new(self, ..) + } +} + +impl From<&str> for BiString { + fn from(string: &str) -> Self { + Self::from_string(string) + } +} + +impl From for BiString { + fn from(string: String) -> Self { + Self::from_string(string) + } +} + +impl> Index for BiString { + type Output = BiStr; + + fn index(&self, index: R) -> &BiStr { + BiStr::new(self, index) + } +} + +impl PartialEq for BiString { + fn eq(&self, rhs: &BiStr) -> bool { + &self[..] 
/// Number of bits available to each packed range bound: half of a `usize`.
const WIDTH: u32 = usize::BITS / 2;

/// All-ones value of `WIDTH` bits. It is the largest value a packed bound can hold and is
/// reserved to represent an unbounded side of a range.
const MAX: usize = usize::MAX >> WIDTH;
+fn encode(bounds: Bounds) -> usize { + let start = if let Some(n) = bounds.start { + assert!(n < MAX); + n + } else { + MAX + }; + + let end = if let Some(n) = bounds.end { + assert!(n < MAX); + n + } else { + MAX + }; + + (start << WIDTH) | end +} + +/// Unpack range bounds from a usize. +fn decode(encoded: usize) -> Bounds { + Bounds { + start: match encoded >> WIDTH { + MAX => None, + n => Some(n), + }, + end: match encoded & MAX { + MAX => None, + n => Some(n), + }, + } +} + +/// A slice of a [`BiString`]. +/// +/// Like [`str`], `BiStr` is an unsized type, typically used behind a reference as `&BiStr`. A +/// bistring slice points to both a modified substring and the corresponding original substring. +/// +/// # use bistring::{BiString, BiStr}; +/// let s: BiString = BiString::from("HELLO WORLD").to_ascii_lowercase(); +/// assert_eq!(s.original(), "HELLO WORLD"); +/// assert_eq!(s.modified(), "hello world"); +/// +/// let slice: &BiStr = &s[1..4]; +/// assert_eq!(slice.original(), "ELL"); +/// assert_eq!(slice.modified(), "ell"); +pub struct BiStr([()]); + +impl BiStr { + /// Create a new bistring slice with the given range. + pub(crate) fn new(target: &BiString, range: impl RangeBounds) -> &Self { + let ptr = target as *const BiString as *const (); + let len = encode(Bounds::new(range)); + unsafe { &*(std::ptr::slice_from_raw_parts(ptr, len) as *const BiStr) } + } + + /// Convert this slice reference to a pointer. + fn as_ptr(&self) -> *const [()] { + self as *const BiStr as *const [()] + } + + /// Get the BiString this slice refers to. + fn target(&self) -> &BiString { + unsafe { &*(self.as_ptr() as *const BiString) } + } + + /// Get the bound information from this slice. + fn bounds(&self) -> Bounds { + // Could be safe with feature(slice_ptr_len) + decode(unsafe { &*self.as_ptr() }.len()) + } + + /// Get the bound information from this slice as a Range. 
+ fn range(&self) -> Range { + self.bounds().to_range(self.target().modified().len()) + } + + /// The original substring. + pub fn original(&self) -> &str { + let target = self.target(); + let range = target.alignment().to_original_range(self.bounds()); + &target.original()[range] + } + + /// The modified substring. + pub fn modified(&self) -> &str { + &self.target().modified()[self.range()] + } + + /// The alignment for this slice. + pub fn alignment(&self) -> align::Slice<'_> { + self.target() + .alignment() + .slice_by_modified(self.bounds()) + .shifted_to_origin() + } +} + +impl Debug for BiStr { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let original = self.original(); + let modified = self.modified(); + if original == modified { + write!(f, "โฎŽ{:?}โฎŒ", original) + } else { + write!(f, "({:?} โ‡‹ {:?})", original, modified) + } + } +} + +impl Eq for BiStr {} + +impl PartialEq for BiStr { + fn eq(&self, rhs: &Self) -> bool { + self.original() == rhs.original() + && self.modified() == rhs.modified() + && self.alignment() == rhs.alignment() + } +} + +impl PartialEq for BiStr { + fn eq(&self, rhs: &BiString) -> bool { + self == &rhs[..] + } +} + +impl> Index for BiStr { + type Output = BiStr; + + fn index(&self, index: R) -> &Self { + &self.target()[self.bounds().slice(index)] + } +} + +impl ToOwned for BiStr { + type Owned = BiString; + + fn to_owned(&self) -> BiString { + BiString::new(self.original(), self.modified(), self.alignment()) + } +} + +#[cfg(test)] +mod tests { + use crate::BiString; + + #[test] + fn test_concat() { + let mut bs = BiString::chunk(" ", ""); + bs += "Hello"; + bs += &BiString::chunk(" ", " "); + bs += "world!"; + bs += &BiString::chunk(" ", ""); + + let mut slice = &bs[..]; + slice = &slice[..]; + assert_eq!(slice.original(), " Hello world! 
"); + assert_eq!(slice.modified(), "Hello world!"); + + slice = &slice[4..7]; + assert_eq!(slice.original(), "o w"); + assert_eq!(slice.modified(), "o w"); + assert_eq!(slice, &("o" + BiString::chunk(" ", " ") + "w")); + + slice = &slice[1..2]; + assert_eq!(slice.original(), " "); + assert_eq!(slice.modified(), " "); + } +}