From 95937e2aca6cb0ac5731874164f2a5e92bcf3ddf Mon Sep 17 00:00:00 2001 From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Tue, 1 Jul 2025 13:00:51 +0530 Subject: [PATCH 1/4] Update utils.py --- services/worker/src/worker/utils.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 0e528ea29..b651ff7b4 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -17,6 +17,7 @@ from datasets import Dataset, DatasetInfo, DownloadConfig, Features, IterableDataset, load_dataset from datasets.utils.file_utils import SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL from huggingface_hub import HfFileSystem, HfFileSystemFile +from huggingface_hub import revision_exists from huggingface_hub.errors import RepositoryNotFoundError from huggingface_hub.hf_api import HfApi from libcommon.constants import CONFIG_SPLIT_NAMES_KIND, MAX_COLUMN_NAME_LENGTH @@ -176,13 +177,21 @@ def retry_on_arrow_invalid_open_file( def create_branch(dataset: str, target_revision: str, hf_api: HfApi, committer_hf_api: HfApi) -> None: try: - refs = retry(on=[requests.exceptions.ConnectionError], sleeps=LIST_REPO_REFS_RETRY_SLEEPS)( - hf_api.list_repo_refs - )(repo_id=dataset, repo_type=DATASET_TYPE) - if all(ref.ref != target_revision for ref in refs.converts): - initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id + # Check if the target revision (branch) already exists + if not revision_exists(dataset, target_revision): + # If not, take the last entry of the default branch's commit list (the initial commit) + initial_commit = hf_api.list_repo_commits( + repo_id=dataset, + repo_type=DATASET_TYPE + )[-1].commit_id + + # Create the new branch at that initial commit committer_hf_api.create_branch( - repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True + 
repo_id=dataset, + branch=target_revision, + repo_type=DATASET_TYPE, + revision=initial_commit, + exist_ok=True ) except RepositoryNotFoundError as err: raise DatasetNotFoundError("The dataset does not exist on the Hub (was deleted during job).") from err From 802045d1639fae70288df14003560b23fe393a88 Mon Sep 17 00:00:00 2001 From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Tue, 1 Jul 2025 13:40:16 +0530 Subject: [PATCH 2/4] Update utils.py --- services/worker/src/worker/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index b651ff7b4..f479152c7 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -16,8 +16,7 @@ import requests from datasets import Dataset, DatasetInfo, DownloadConfig, Features, IterableDataset, load_dataset from datasets.utils.file_utils import SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL -from huggingface_hub import HfFileSystem, HfFileSystemFile -from huggingface_hub import revision_exists +from huggingface_hub import HfFileSystem, HfFileSystemFile, revision_exists from huggingface_hub.errors import RepositoryNotFoundError from huggingface_hub.hf_api import HfApi from libcommon.constants import CONFIG_SPLIT_NAMES_KIND, MAX_COLUMN_NAME_LENGTH From 9e14b287f89a5b561bb6ebb25ad573f3a249ad40 Mon Sep 17 00:00:00 2001 From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Tue, 1 Jul 2025 16:19:54 +0530 Subject: [PATCH 3/4] chore: remove unused import to fix Ruff code-quality check --- services/worker/src/worker/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index f479152c7..69e0e15f9 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -13,7 +13,6 @@ from urllib.parse import quote import PIL -import requests from datasets 
import Dataset, DatasetInfo, DownloadConfig, Features, IterableDataset, load_dataset from datasets.utils.file_utils import SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL from huggingface_hub import HfFileSystem, HfFileSystemFile, revision_exists From 2c73bfa40295514ea60ff93f1d2a9d68163586f9 Mon Sep 17 00:00:00 2001 From: Arjun Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Tue, 15 Jul 2025 00:39:44 +0530 Subject: [PATCH 4/4] Update utils.py Linting issue --- services/worker/src/worker/utils.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 69e0e15f9..f07f1b39d 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -178,18 +178,11 @@ def create_branch(dataset: str, target_revision: str, hf_api: HfApi, committer_h # Check if the target revision (branch) already exists if not revision_exists(dataset, target_revision): # If not, take the last entry of the default branch's commit list (the initial commit) - initial_commit = hf_api.list_repo_commits( - repo_id=dataset, - repo_type=DATASET_TYPE - )[-1].commit_id + initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id # Create the new branch at that initial commit committer_hf_api.create_branch( - repo_id=dataset, - branch=target_revision, - repo_type=DATASET_TYPE, - revision=initial_commit, - exist_ok=True + repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True ) except RepositoryNotFoundError as err: raise DatasetNotFoundError("The dataset does not exist on the Hub (was deleted during job).") from err