diff --git a/probing/ud_filter/filtering_probing.py b/probing/ud_filter/filtering_probing.py index d114353..d3d6bfb 100644 --- a/probing/ud_filter/filtering_probing.py +++ b/probing/ud_filter/filtering_probing.py @@ -70,6 +70,7 @@ def upload_files( list_texts = [read(p) for p in self.paths] conllu_data = "\n".join(list_texts) + conllu_data = re.sub(r"\d+\-\d+.*\n", "", conllu_data) self.language = extract_lang_from_udfile_path(self.paths[0], language=language) self.sentences = parse(conllu_data) diff --git a/tests/filter_test/test_filtering_probing.py b/tests/filter_test/test_filtering_probing.py index 01283b7..d664284 100644 --- a/tests/filter_test/test_filtering_probing.py +++ b/tests/filter_test/test_filtering_probing.py @@ -150,7 +150,7 @@ def test_filter_and_convert_all_saved(self): save_dir_path=task_dir.name, task_name="cl", ) - self.assertEqual(queries_sents, self.probing_filter.probing_dict) + self.assertCountEqual(queries_sents, self.probing_filter.probing_dict) with open(f"{task_dir.name}/ru_taiga_cl.csv") as f: self.assertEqual(27, len(f.readlines())) task_dir.cleanup()