@@ -194,6 +194,11 @@ class StringGrouperNotFitException(Exception):
194
194
pass
195
195
196
196
197
class StringLengthException(Exception):
    """Signals that the input strings are too short for the configured n-gram size."""
200
+
201
+
197
202
class StringGrouper (object ):
198
203
def __init__ (self , master : pd .Series ,
199
204
duplicates : Optional [pd .Series ] = None ,
@@ -258,6 +263,13 @@ def n_grams(self, string: str) -> List[str]:
258
263
259
264
def fit (self ) -> 'StringGrouper' :
260
265
"""Builds the _matches list which contains string matches indices and similarity"""
266
+
267
+ # Validate match strings length
268
+ if not StringGrouper ._strings_are_of_sufficient_length (self ._master , self ._config .ngram_size ) or \
269
+ (self ._duplicates is not None
270
+ and not StringGrouper ._strings_are_of_sufficient_length (self ._duplicates , self ._config .ngram_size )):
271
+ raise StringLengthException ('Input string lengths are not all greater than n_gram length' )
272
+
261
273
master_matrix , duplicate_matrix = self ._get_tf_idf_matrices ()
262
274
263
275
# Calculate the matches using the cosine similarity
@@ -697,6 +709,16 @@ def _is_series_of_strings(series_to_test: pd.Series) -> bool:
697
709
return False
698
710
return True
699
711
712
+ @staticmethod
713
+ def _strings_are_of_sufficient_length (series_to_test : pd .Series , ngram_size : int ) -> bool :
714
+ if not isinstance (series_to_test , pd .Series ):
715
+ return False
716
+ elif series_to_test .to_frame ().applymap (
717
+ lambda x : not len (x ) >= ngram_size
718
+ ).squeeze (axis = 1 ).all ():
719
+ return False
720
+ return True
721
+
700
722
@staticmethod
701
723
def _is_input_data_combination_valid (duplicates , master_id , duplicates_id ) -> bool :
702
724
if duplicates is None and (duplicates_id is not None ) \
0 commit comments