Skip to content

Commit 0ba8ffa

Browse files
committed
Raise StringLengthException when the vectoriser would be applied to a Series in which every string is shorter than ngram_size
1 parent 0c08cef commit 0ba8ffa

File tree

2 files changed

+28
-1
lines changed

2 files changed

+28
-1
lines changed

string_grouper/string_grouper.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,11 @@ class StringGrouperNotFitException(Exception):
194194
pass
195195

196196

197+
class StringLengthException(Exception):
    """Raised by StringGrouper.fit when an input Series has no string long enough for an n-gram.

    The length check fails only when every string in the Series (master, or duplicates when given)
    is shorter than the configured ngram_size; strings of exactly ngram_size characters are accepted.
    """
200+
201+
197202
class StringGrouper(object):
198203
def __init__(self, master: pd.Series,
199204
duplicates: Optional[pd.Series] = None,
@@ -258,6 +263,13 @@ def n_grams(self, string: str) -> List[str]:
258263

259264
def fit(self) -> 'StringGrouper':
260265
"""Builds the _matches list which contains string matches indices and similarity"""
266+
267+
# Validate match strings length
268+
if not StringGrouper._strings_are_of_sufficient_length(self._master, self._config.ngram_size) or \
269+
(self._duplicates is not None
270+
and not StringGrouper._strings_are_of_sufficient_length(self._duplicates, self._config.ngram_size)):
271+
raise StringLengthException('Input string lengths are not all greater than n_gram length')
272+
261273
master_matrix, duplicate_matrix = self._get_tf_idf_matrices()
262274

263275
# Calculate the matches using the cosine similarity
@@ -697,6 +709,16 @@ def _is_series_of_strings(series_to_test: pd.Series) -> bool:
697709
return False
698710
return True
699711

712+
@staticmethod
def _strings_are_of_sufficient_length(series_to_test: pd.Series, ngram_size: int) -> bool:
    """Check that a Series contains at least one string long enough to form an n-gram.

    Returns False when series_to_test is not a pandas Series, or when every element is shorter
    than ngram_size (so an empty Series also yields False). Returns True as soon as at least one
    element satisfies len(element) >= ngram_size. A TypeError propagates if any element does not
    support len().
    """
    if not isinstance(series_to_test, pd.Series):
        return False
    # Measure every element up front (no short-circuit) so that an unsized element always raises,
    # regardless of where it sits in the Series.
    too_short = [len(text) < ngram_size for text in series_to_test]
    # Only a Series made up entirely of too-short strings is rejected; a mix of short and long
    # strings passes.
    return not all(too_short)
721+
700722
@staticmethod
701723
def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bool:
702724
if duplicates is None and (duplicates_id is not None) \

string_grouper/test/test_string_grouper.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
77
StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \
88
match_most_similar, group_similar_strings, match_strings, \
9-
compute_pairwise_similarities
9+
compute_pairwise_similarities, StringLengthException
1010
from unittest.mock import patch
1111

1212

@@ -822,6 +822,11 @@ def test_prior_matches_added(self):
822822
# All strings should now match to the same "master" string
823823
self.assertEqual(1, len(df.deduped.unique()))
824824

825+
def test_group_similar_strings_stopwords(self):
    """fit() should raise StringLengthException when all strings are shorter than 3 characters.

    NOTE(review): 3 is presumably the default ngram size used by StringGrouper -- confirm.
    """
    too_short_strings = pd.Series(['zz', 'yy', 'xx'])
    with self.assertRaises(StringLengthException):
        grouper = StringGrouper(too_short_strings)
        grouper.fit()
829+
825830

826831
# Run the unittest test runner when this module is executed directly as a script.
if __name__ == '__main__':
827832
unittest.main()

0 commit comments

Comments
 (0)