
Commit 1d87c30

refactor: only extract metadata and don't try to calculate offset index
1 parent bb43b7c · commit 1d87c30
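
The commit title describes a change on the Rust side of libviewer: metadata extraction no longer also tries to compute the Parquet offset index. As a rough illustration of the distinction (using pyarrow here rather than the crate's actual API, with a placeholder file path), the footer metadata alone already yields the schema, row counts, and per-column statistics:

import pyarrow.parquet as pq

# Placeholder path; any local Parquet file works for this illustration.
md = pq.read_metadata("example.parquet")

# The footer carries the schema, row-group layout, and column statistics;
# the page/offset indexes stored outside the footer are not decoded here.
print(md.num_rows, md.num_row_groups)
print(md.schema)
print(md.row_group(0).column(0).statistics)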

File tree

7 files changed: +176 −222 lines


libs/libviewer/Cargo.lock

Lines changed: 56 additions & 51 deletions
Generated file; diff not rendered by default.

libs/libviewer/Cargo.toml

Lines changed: 4 additions & 4 deletions
@@ -9,14 +9,14 @@ name = "libviewer"
 crate-type = ["cdylib"]
 
 [dependencies]
-arrow = { version = "55", features = ["pyarrow"] }
+arrow = { version = "56", features = ["pyarrow"] }
 futures = "0.3"
 object_store = "0.12.0"
 object_store_opendal = "0.52.0"
 opendal = { version = "0.53.2", features = ["services-huggingface"] }
-parquet = { version = "55", features = ["async", "object_store"] }
-pyo3 = { version = "0.24", features = ["extension-module"] }
-pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"] }
+parquet = { version = "56", features = ["async", "object_store"] }
+pyo3 = { version = "^0.25", features = ["extension-module"] }
+pyo3-async-runtimes = { version = "^0.25", features = ["tokio-runtime"] }
 tempfile = "3.20.0"
 thiserror = "2.0.12"
 thrift = "0.17"

libs/libviewer/libviewer/__init__.py

Lines changed: 18 additions & 7 deletions
@@ -1,7 +1,10 @@
 import os
-from ._internal import PyDataset
+
 from huggingface_hub import hf_hub_download, list_repo_files
 
+from ._internal import PyDataset
+
+
 __all__ = ["Dataset"]
 
 
@@ -12,9 +15,9 @@ class Dataset(PyDataset):
     # ...
     # ]
 
-    def from_hub(repo, metadata_store):
+    def from_hub(repo, metadata_store, revision=None):
         """Create a Dataset from Hugging Face Hub."""
-        repo_files = list_repo_files(repo, repo_type="dataset")
+        repo_files = list_repo_files(repo, repo_type="dataset", revision=revision)
 
         parquet_files = []
         for filename in repo_files:
@@ -31,12 +34,16 @@ def from_hub(repo, metadata_store):
             raise ValueError(f"No parquet files found in the dataset '{repo}'.")
 
         return Dataset(
-            repo, parquet_files, data_store="hf://", metadata_store=metadata_store
+            repo,
+            parquet_files,
+            revision=revision,
+            data_store=f"hf://datasets/{repo}",
+            metadata_store=metadata_store,
         )
 
-    def from_cache(repo, metadata_store, download=False):
+    def from_cache(repo, metadata_store, revision=None, download=False):
         """Create a Dataset from HF local cache."""
-        repo_files = list_repo_files(repo, repo_type="dataset")
+        repo_files = list_repo_files(repo, repo_type="dataset", revision=revision)
 
         parquet_files = []
         for filename in repo_files:
@@ -57,5 +64,9 @@ def from_cache(repo, metadata_store, download=False):
             raise ValueError(f"No parquet files found in the dataset '{repo}'.")
 
         return Dataset(
-            repo, parquet_files, data_store="file://", metadata_store=metadata_store
+            repo,
+            parquet_files,
+            revision=revision,
+            data_store="file://",
+            metadata_store=metadata_store,
         )
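
For reference, a minimal usage sketch of these constructors after the change; the repository id, metadata_store value, and revision below are placeholders, not values taken from this commit:

from libviewer import Dataset

# All argument values here are placeholders.
ds = Dataset.from_hub(
    "user/my-dataset",
    metadata_store="/tmp/libviewer-metadata",
    revision="main",  # new optional argument; defaults to None
)

# Or build the dataset from files already present in the local HF cache.
ds_cached = Dataset.from_cache(
    "user/my-dataset",
    metadata_store="/tmp/libviewer-metadata",
)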
