From 36bef763f012ea46bb93c634e556607bed4b6071 Mon Sep 17 00:00:00 2001 From: idrisibrahimerten Date: Thu, 3 Jul 2025 13:09:34 +0300 Subject: [PATCH 1/3] feat(strings): add professional suffix array and LCP implementation --- strings/suffix_array.py | 106 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 strings/suffix_array.py diff --git a/strings/suffix_array.py b/strings/suffix_array.py new file mode 100644 index 000000000000..d8c0ca28769e --- /dev/null +++ b/strings/suffix_array.py @@ -0,0 +1,106 @@ +''' +suffix_array.py + +Professional implementation of Suffix Array and LCP (Longest Common Prefix) array in Python. + +Features: +- Efficient O(n log n) construction using doubling method +- Kasai's algorithm for LCP array in O(n) +- Detailed docstrings and complexity analysis +- Standalone usage example and simple unit tests + +Author: Idris Ibrahim Erten +License: MIT +''' + +def build_suffix_array(s: str) -> list[int]: + """ + Builds the suffix array of the given string using the doubling algorithm. + + Parameters: + s (str): Input string + + Returns: + list[int]: List of starting indices of suffixes in sorted order + + Complexity: + O(n log n) time and O(n) space. + """ + # Append a sentinel that is lexicographically smaller than all other characters + s += '\0' + n = len(s) + # Initial ranking by character code + ranks = [ord(c) for c in s] + sa = list(range(n)) + tmp = [0] * n + k = 1 + # Doubling loop + while k < n: + # Sort by (rank[i], rank[i+k]) pairs + sa.sort(key=lambda i: (ranks[i], ranks[i + k] if i + k < n else -1)) + # Temporary array for new ranks + tmp[sa[0]] = 0 + for i in range(1, n): + prev, curr = sa[i - 1], sa[i] + # Compare pair (rank, next rank) + r_prev = (ranks[prev], ranks[prev + k] if prev + k < n else -1) + r_curr = (ranks[curr], ranks[curr + k] if curr + k < n else -1) + tmp[curr] = tmp[prev] + (1 if r_curr != r_prev else 0) + ranks, tmp = tmp, ranks # reuse lists to save memory + k <<= 1 + if ranks[sa[-1]] == n - 1: + break + # Drop the sentinel index + return sa[1:] + + +def build_lcp_array(s: str, sa: list[int]) -> list[int]: + """ + Builds the LCP (Longest Common Prefix) array using Kasai's algorithm. + + Parameters: + s (str): Original string + sa (list[int]): Suffix array of s + + Returns: + list[int]: LCP array where lcp[i] = LCP(sa[i], sa[i-1]) + + Complexity: + O(n) time and O(n) space. + """ + n = len(sa) + # Inverse of suffix array: pos[i] gives rank of suffix at i + pos = [0] * n + for i, suf in enumerate(sa): + pos[suf] = i + lcp = [0] * n + k = 0 + for i in range(len(s)): + if pos[i] == 0: + k = 0 + continue + j = sa[pos[i] - 1] + # Compare characters starting from k + while i + k < len(s) and j + k < len(s) and s[i + k] == s[j + k]: + k += 1 + lcp[pos[i]] = k + if k: + k -= 1 + return lcp[1:] + + +if __name__ == '__main__': + # Example usage and simple tests + test_strings = ['banana', 'abracadabra', 'mississippi'] + for s in test_strings: + sa = build_suffix_array(s) + lcp = build_lcp_array(s, sa) + print(f"String: {s}") + print(f"Suffix Array: {sa}") + print(f"LCP Array : {lcp}\n") + + # Assertions for correctness + s = 'banana' + expected_sa = [5, 3, 1, 0, 4, 2] # indices of sorted suffixes + assert build_suffix_array(s) == expected_sa, 'SA test failed' + print('All tests passed!') From 732aaf27e6f0f7c00d9f83e8e10d81de25d965e5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 3 Jul 2025 10:13:17 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- strings/suffix_array.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/strings/suffix_array.py b/strings/suffix_array.py index d8c0ca28769e..b86ecd3ebebb 100644 --- a/strings/suffix_array.py +++ b/strings/suffix_array.py @@ -1,4 +1,4 @@ -''' +""" suffix_array.py Professional implementation of Suffix Array and LCP (Longest Common Prefix) array in Python. @@ -11,7 +11,8 @@ Author: Idris Ibrahim Erten License: MIT -''' +""" + def build_suffix_array(s: str) -> list[int]: """ @@ -27,7 +28,7 @@ def build_suffix_array(s: str) -> list[int]: O(n log n) time and O(n) space. """ # Append a sentinel that is lexicographically smaller than all other characters - s += '\0' + s += "\0" n = len(s) # Initial ranking by character code ranks = [ord(c) for c in s] @@ -89,9 +90,9 @@ def build_lcp_array(s: str, sa: list[int]) -> list[int]: return lcp[1:] -if __name__ == '__main__': +if __name__ == "__main__": # Example usage and simple tests - test_strings = ['banana', 'abracadabra', 'mississippi'] + test_strings = ["banana", "abracadabra", "mississippi"] for s in test_strings: sa = build_suffix_array(s) lcp = build_lcp_array(s, sa) @@ -100,7 +101,7 @@ def build_lcp_array(s: str, sa: list[int]) -> list[int]: print(f"LCP Array : {lcp}\n") # Assertions for correctness - s = 'banana' + s = "banana" expected_sa = [5, 3, 1, 0, 4, 2] # indices of sorted suffixes - assert build_suffix_array(s) == expected_sa, 'SA test failed' - print('All tests passed!') + assert build_suffix_array(s) == expected_sa, "SA test failed" + print("All tests passed!") From 9e7827300824d1460723da2a8648a01d91f04e48 Mon Sep 17 00:00:00 2001 From: idrisibrahimerten Date: Thu, 3 Jul 2025 13:18:03 +0300 Subject: [PATCH 3/3] feat(strings): add professional suffix array and LCP implementation --- strings/suffix_array.py | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/strings/suffix_array.py b/strings/suffix_array.py index b86ecd3ebebb..3066630b73cd 100644 --- a/strings/suffix_array.py +++ b/strings/suffix_array.py @@ -1,3 +1,4 @@ +<<<<<<< HEAD """ suffix_array.py @@ -14,19 +15,9 @@ """ +======= +>>>>>>> c176d091 (feat(strings): add professional suffix array and LCP implementation) def build_suffix_array(s: str) -> list[int]: - """ - Builds the suffix array of the given string using the doubling algorithm. - - Parameters: - s (str): Input string - - Returns: - list[int]: List of starting indices of suffixes in sorted order - - Complexity: - O(n log n) time and O(n) space. - """ # Append a sentinel that is lexicographically smaller than all other characters s += "\0" n = len(s) @@ -56,19 +47,6 @@ def build_suffix_array(s: str) -> list[int]: def build_lcp_array(s: str, sa: list[int]) -> list[int]: - """ - Builds the LCP (Longest Common Prefix) array using Kasai's algorithm. - - Parameters: - s (str): Original string - sa (list[int]): Suffix array of s - - Returns: - list[int]: LCP array where lcp[i] = LCP(sa[i], sa[i-1]) - - Complexity: - O(n) time and O(n) space. - """ n = len(sa) # Inverse of suffix array: pos[i] gives rank of suffix at i pos = [0] * n