From 995c6c225de5436390518facb6f9a02c0dab1d2b Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 6 May 2025 00:07:23 +0200 Subject: [PATCH 1/6] feat(datasets): the UniversalDependenciesCorpus object gets new argument. Support for various Latin Universal Dependency datasets is added. --- flair/datasets/treebanks.py | 126 +++++++++++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 1 deletion(-) diff --git a/flair/datasets/treebanks.py b/flair/datasets/treebanks.py index 21ae327691..2d7b9af633 100644 --- a/flair/datasets/treebanks.py +++ b/flair/datasets/treebanks.py @@ -19,6 +19,7 @@ def __init__( test_file=None, dev_file=None, in_memory: bool = True, + sample_missing_splits=True, split_multiwords: bool = True, ) -> None: """Instantiates a Corpus from CoNLL-U column-formatted task data such as the UD corpora. @@ -28,6 +29,7 @@ def __init__( :param test_file: the name of the test file :param dev_file: the name of the dev file, if None, dev data is sampled from train :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads + :param sample_missing_splits: If set to True, missing splits will be randomly sampled from the training split :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens :return: a Corpus with annotated train, dev and test data """ @@ -55,7 +57,7 @@ def __init__( else None ) - super().__init__(train, dev, test, name=str(data_folder)) + super().__init__(train, dev, test, name=str(data_folder), sample_missing_splits=sample_missing_splits) class UniversalDependenciesDataset(FlairDataset): @@ -581,6 +583,128 @@ def __init__( super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords) +class UD_LATIN_CIRCSE(UniversalDependenciesCorpus): + def __init__( + self, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + split_multiwords: bool = True, + revision: str = "master", + ) -> None: + base_path = 
Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + data_folder = base_path / dataset_name + + # download data if necessary + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-CIRCSE/{revision}/" + cached_path(f"{web_path}/la_circse-ud-test.conllu", Path("datasets") / dataset_name) + + super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords) + + +class UD_LATIN_ITTB(UniversalDependenciesCorpus): + def __init__( + self, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + split_multiwords: bool = True, + revision: str = "master", + ) -> None: + base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + data_folder = base_path / dataset_name + + # download data if necessary + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-ITTB/{revision}/" + + for split in ["train", "dev", "test"]: + cached_path(f"{web_path}/la_ittb-ud-{split}.conllu", Path("datasets") / dataset_name) + + super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords) + + +class UD_LATIN_UDANTE(UniversalDependenciesCorpus): + def __init__( + self, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + split_multiwords: bool = True, + revision: str = "master", + ) -> None: + base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + data_folder = base_path / dataset_name + + # download data if necessary + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-UDANTE/{revision}/" + + for split in ["train", "dev", "test"]: + cached_path(f"{web_path}/la_udante-ud-{split}.conllu", Path("datasets") / dataset_name) + + 
super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords) + + +class UD_LATIN_Perseus(UniversalDependenciesCorpus): + def __init__( + self, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + split_multiwords: bool = True, + revision: str = "master", + ) -> None: + base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + data_folder = base_path / dataset_name + + # download data if necessary + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-Perseus/{revision}/" + + for split in ["train", "test"]: + cached_path(f"{web_path}/la_perseus-ud-{split}.conllu", Path("datasets") / dataset_name) + + super().__init__( + data_folder, in_memory=in_memory, sample_missing_splits=False, split_multiwords=split_multiwords + ) + + +class UD_LATIN_PROIEL(UniversalDependenciesCorpus): + def __init__( + self, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + split_multiwords: bool = True, + revision: str = "master", + ) -> None: + base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + data_folder = base_path / dataset_name + + # download data if necessary + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-PROIEL/{revision}/" + + for split in ["train", "dev", "test"]: + cached_path(f"{web_path}/la_proiel-ud-{split}.conllu", Path("datasets") / dataset_name) + + super().__init__( + data_folder, in_memory=in_memory, sample_missing_splits=False, split_multiwords=split_multiwords + ) + + class UD_SPANISH(UniversalDependenciesCorpus): def __init__( self, From f217723bd826504896e11d70144af2d9fe32ced0 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 6 May 2025 00:08:11 +0200 Subject: [PATCH 2/6] feat: make new Latin UD datasets globally 
available --- flair/datasets/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index f1c1279355..bd0d51662b 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -297,6 +297,11 @@ UD_KAZAKH, UD_KOREAN, UD_LATIN, + UD_LATIN_CIRCSE, + UD_LATIN_ITTB, + UD_LATIN_UDANTE, + UD_LATIN_Perseus, + UD_LATIN_PROIEL, UD_LATVIAN, UD_LITHUANIAN, UD_LIVVI, @@ -536,6 +541,11 @@ "UD_KAZAKH", "UD_KOREAN", "UD_LATIN", + "UD_LATIN_CIRCSE", + "UD_LATIN_ITTB", + "UD_LATIN_UDANTE", + "UD_LATIN_Perseus", + "UD_LATIN_PROIEL", "UD_LATVIAN", "UD_LITHUANIAN", "UD_LIVVI", From 3f15a5e319861353d530b51f1fa67dcbaa20f41d Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 6 May 2025 00:09:01 +0200 Subject: [PATCH 3/6] tests: add test cases for newly added Latin UD datasets --- tests/test_datasets.py | 84 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 121a8521bb..d4c2cf5e28 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1018,6 +1018,90 @@ def test_bavarian_wiki(tasks_base_path): ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!" +@pytest.mark.skip() +def test_ud_latin(): + revision = "f16caaa" + corpus = flair.datasets.UD_LATIN(revision=revision) + + # Taken from: https://github.com/UniversalDependencies/UD_Latin-LLCT/blob/f16caaa3b0c57e3319396a1af74ee2bc7c9b4323/stats.xml#L8 + ref_sentences = 9023 + actual_sentences = len(corpus.train) + len(corpus.dev) + len(corpus.test) + + assert ( + ref_sentences == actual_sentences + ), f"Number of parsed token ({actual_sentences}) does not match with reported number of sentences ({ref_sentences})!" 
+ + +@pytest.mark.skip() +def test_ud_latin_circse(): + revision = "13cc204" + corpus = flair.datasets.UD_LATIN_CIRCSE(revision=revision) + + # Taken from: https://github.com/UniversalDependencies/UD_Latin-CIRCSE/blob/13cc204a1d8910d7f95fd78b23aec93ccb64be5c/stats.xml#L8 + ref_sentences = 1263 + actual_sentences = len(corpus.test) + + assert ( + ref_sentences == actual_sentences + ), f"Number of parsed token ({actual_sentences}) does not match with reported number of sentences ({ref_sentences})!" + + +@pytest.mark.skip() +def test_ud_latin_ittb(): + revision = "9991421" + corpus = flair.datasets.UD_LATIN_ITTB(revision=revision) + + # Taken from: https://github.com/UniversalDependencies/UD_Latin-ITTB/blob/9991421cd858f6603b4f27b26c1f11d4619fc8cc/stats.xml#L8 + ref_sentences = 26977 + actual_sentences = len(corpus.train) + len(corpus.dev) + len(corpus.test) + + assert ( + ref_sentences == actual_sentences + ), f"Number of parsed token ({actual_sentences}) does not match with reported number of sentences ({ref_sentences})!" + + +@pytest.mark.skip() +def test_ud_latin_udante(): + revision = "f817abd" + corpus = flair.datasets.UD_LATIN_UDANTE(revision=revision) + + # Taken from: https://github.com/UniversalDependencies/UD_Latin-UDante/blob/f817abdaeaf3b40250b65d1a6bbbd5c7dcee7836/stats.xml#L8 + ref_sentences = 1723 + actual_sentences = len(corpus.train) + len(corpus.dev) + len(corpus.test) + + assert ( + ref_sentences == actual_sentences + ), f"Number of parsed token ({actual_sentences}) does not match with reported number of sentences ({ref_sentences})!" 
+ + +@pytest.mark.skip() +def test_ud_latin_perseus(): + revision = "b3c7f9b" + corpus = flair.datasets.UD_LATIN_Perseus(revision=revision) + + # Taken from: https://github.com/UniversalDependencies/UD_Latin-Perseus/blob/b3c7f9b6751c404db3b1f9e436ba4557d8b945c5/stats.xml#L8 + ref_sentences = 2273 + actual_sentences = len(corpus.train) + len(corpus.test) + + assert ( + ref_sentences == actual_sentences + ), f"Number of parsed token ({actual_sentences}) does not match with reported number of sentences ({ref_sentences})!" + + +@pytest.mark.skip() +def test_ud_latin_proiel(): + revision = "6d7c717" + corpus = flair.datasets.UD_LATIN_PROIEL(revision=revision) + + # Taken from: https://github.com/UniversalDependencies/UD_Latin-PROIEL/blob/6d7c717f6c9fa971c312fa2071016cbd5f2e6a41/stats.xml#L8 + ref_sentences = 18689 + actual_sentences = len(corpus.train) + len(corpus.dev) + len(corpus.test) + + assert ( + ref_sentences == actual_sentences + ), f"Number of parsed token ({actual_sentences}) does not match with reported number of sentences ({ref_sentences})!" 
+ + def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path): corpus = MultiFileJsonlCorpus( train_files=[tasks_base_path / "jsonl/train.jsonl"], From e6eadd766e327dbcbf510655e69ef2be0c74e536 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 6 May 2025 00:11:20 +0200 Subject: [PATCH 4/6] fix(datasets): rename UD_LATIN_Perseus to UD_LATIN_PERSEUS --- flair/datasets/treebanks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/datasets/treebanks.py b/flair/datasets/treebanks.py index 2d7b9af633..37c6147652 100644 --- a/flair/datasets/treebanks.py +++ b/flair/datasets/treebanks.py @@ -653,7 +653,7 @@ def __init__( super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords) -class UD_LATIN_Perseus(UniversalDependenciesCorpus): +class UD_LATIN_PERSEUS(UniversalDependenciesCorpus): def __init__( self, base_path: Optional[Union[str, Path]] = None, From 5a1497d9d36e6ed0b39143e7562d47fe8ef632f1 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 6 May 2025 00:11:45 +0200 Subject: [PATCH 5/6] fix: rename UD_LATIN_Perseus to UD_LATIN_PERSEUS --- flair/datasets/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index bd0d51662b..31300ee326 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -300,7 +300,7 @@ UD_LATIN_CIRCSE, UD_LATIN_ITTB, UD_LATIN_UDANTE, - UD_LATIN_Perseus, + UD_LATIN_PERSEUS, UD_LATIN_PROIEL, UD_LATVIAN, UD_LITHUANIAN, @@ -544,7 +544,7 @@ "UD_LATIN_CIRCSE", "UD_LATIN_ITTB", "UD_LATIN_UDANTE", - "UD_LATIN_Perseus", + "UD_LATIN_PERSEUS", "UD_LATIN_PROIEL", "UD_LATVIAN", "UD_LITHUANIAN", From 194b2ba3e3841d9ed33a8f7bfdef41bc874623c2 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 6 May 2025 00:12:00 +0200 Subject: [PATCH 6/6] tests: rename UD_LATIN_Perseus to UD_LATIN_PERSEUS --- tests/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/test_datasets.py b/tests/test_datasets.py index d4c2cf5e28..f78f3dfd13 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1077,7 +1077,7 @@ def test_ud_latin_udante(): @pytest.mark.skip() def test_ud_latin_perseus(): revision = "b3c7f9b" - corpus = flair.datasets.UD_LATIN_Perseus(revision=revision) + corpus = flair.datasets.UD_LATIN_PERSEUS(revision=revision) # Taken from: https://github.com/UniversalDependencies/UD_Latin-Perseus/blob/b3c7f9b6751c404db3b1f9e436ba4557d8b945c5/stats.xml#L8 ref_sentences = 2273