Skip to content

Commit 1cafec3

Browse files
authored
Merge pull request #20 from ssl-hep/feat/18-ds-input-file-peeking
Feat/ New DS input for file peeking
2 parents 0800f96 + 04edfef commit 1cafec3

File tree

2 files changed

+74
-13
lines changed

2 files changed

+74
-13
lines changed

servicex_analysis_utils/file_peeking.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
import awkward as ak
3333
import json
3434
import logging
35+
from servicex.dataset_identifier import DataSetIdentifier
3536

3637

3738
def run_query(input_filenames):
@@ -85,8 +86,8 @@ def build_deliver_spec(datasets):
8586
Supports multiple inputs for multiple sample queries.
8687
8788
Parameters:
88-
datasets (str, [str], or dict): Rucio DIDs to be checked by the servicex workers.
89-
If dict, custom names can be inputed
89+
datasets (str, [str], dict, DataSetIdentifier): Rucio DIDs (str) or DataSetIdentifier object.
90+
If dict, custom names can be inputed for each dataset
9091
9192
Returns:
9293
spec_python (dict): The specification for the python function query containing Name, Query, Dataset, NFiles
@@ -98,23 +99,29 @@ def build_deliver_spec(datasets):
9899
dataset_dict = {}
99100
user_in = type(datasets)
100101

102+
# Rucio DID as str
101103
if user_in == str:
102-
dataset_dict.update({datasets: datasets})
104+
dataset_dict.update({datasets: datasets}) # Use dataset ID as sample name
103105
elif user_in == list and type(datasets[0]) is str:
104106
for ds in datasets:
105107
dataset_dict.update({ds: ds})
106-
elif user_in == dict:
108+
elif user_in == dict: # Custom sample names
107109
dataset_dict = datasets
110+
# Single DataSetIdentifier object
111+
elif isinstance(datasets, DataSetIdentifier):
112+
dataset_dict.update({"Dataset": datasets})
108113
else:
109114
raise ValueError(
110-
f"Unsupported dataset input type: {user_in}.\nInput must be dict ('sample_name':'dataset_id'), str or list of str"
115+
f"Unsupported dataset input type: {user_in}.\n"
116+
"Input must be str or list of str of Rucio DIDs, "
117+
"a DataSetIdentifier object or a dict ('sample_name':'dataset_id')"
111118
)
112119

113120
sample_list = [
114121
{
115122
"NFiles": 1,
116123
"Name": name,
117-
"Dataset": dataset.Rucio(did),
124+
"Dataset": dataset.Rucio(did) if isinstance(did, str) else did,
118125
"Query": query_PythonFunction,
119126
}
120127
for name, did in dataset_dict.items()

tests/test_file_peeking.py

Lines changed: 61 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,14 @@
3434
import re
3535
import filecmp
3636
from servicex_analysis_utils import file_peeking
37+
from servicex import dataset
38+
from servicex.python_dataset import PythonFunction
39+
from servicex.dataset_identifier import (
40+
RucioDatasetIdentifier,
41+
FileListDataset,
42+
CERNOpenDataDatasetIdentifier,
43+
XRootDDatasetIdentifier,
44+
)
3745

3846

3947
@pytest.fixture
@@ -146,14 +154,10 @@ def test_spec_builder():
146154
assert isinstance(first_entry["NFiles"], int), "'NFiles' should be an integer"
147155
assert isinstance(first_entry["Name"], str), "'Name' should be a string"
148156

149-
from servicex.dataset_identifier import RucioDatasetIdentifier
150-
151157
assert isinstance(
152158
first_entry["Dataset"], RucioDatasetIdentifier
153159
), "'Dataset' should be a RucioDatasetIdentifier"
154160

155-
from servicex.python_dataset import PythonFunction
156-
157161
assert isinstance(
158162
first_entry["Query"], PythonFunction
159163
), "'Query' should be a PythonFunction"
@@ -173,15 +177,65 @@ def test_spec_builder():
173177

174178
# wrong input type
175179
wrong_did = 1234
180+
expected_msg = (
181+
f"Unsupported dataset input type: {type(wrong_did)}.\n"
182+
"Input must be str or list of str of Rucio DIDs, "
183+
"a DataSetIdentifier object or a dict "
184+
"('sample_name':'dataset_id')"
185+
)
186+
176187
with pytest.raises(
177188
ValueError,
178-
match=re.escape(
179-
f"Unsupported dataset input type: {type(wrong_did)}.\nInput must be dict ('sample_name':'dataset_id'), str or list of str"
180-
),
189+
match=re.escape(expected_msg),
181190
):
182191
file_peeking.build_deliver_spec(wrong_did)
183192

184193

194+
def test_spec_builder_with_dataset_identifier():
195+
# Build multiple types of dataset identifiers
196+
ds1 = dataset.Rucio("random_space:did")
197+
ds2 = dataset.XRootD("root://server/file.root")
198+
ds3 = dataset.CERNOpenData("cernopendata:12345")
199+
ds4 = dataset.FileList(["file1.root", "file2.root"])
200+
201+
ds_list = [ds1, ds2, ds3, ds4]
202+
ds_types = [
203+
RucioDatasetIdentifier,
204+
XRootDDatasetIdentifier,
205+
CERNOpenDataDatasetIdentifier,
206+
FileListDataset,
207+
]
208+
for did, did_type in zip(ds_list, ds_types):
209+
spec = file_peeking.build_deliver_spec(did)
210+
211+
# Check return type
212+
assert isinstance(spec, dict), "build_deliver_spec does not return a dict"
213+
assert "Sample" in spec, "Key 'Sample' is missing in the returned dict"
214+
assert isinstance(spec["Sample"], list), "'Sample' should be a list"
215+
216+
# Get return size
217+
size = len(spec["Sample"])
218+
assert (
219+
size == 1
220+
), f"Only one did given but sample item of spec is not len 1: {size}"
221+
222+
# Check first sample
223+
first_entry = spec["Sample"][0]
224+
assert isinstance(first_entry, dict), "Each entry in 'Sample' should be a dict"
225+
226+
# Check each key type
227+
assert isinstance(first_entry["NFiles"], int), "'NFiles' should be an integer"
228+
assert isinstance(first_entry["Name"], str), "'Name' should be a string"
229+
230+
assert isinstance(
231+
first_entry["Query"], PythonFunction
232+
), "'Query' should be a PythonFunction"
233+
234+
assert isinstance(
235+
first_entry["Dataset"], did_type
236+
), f"Input Dataset identifier {did} should be a {did_type} but is {type(first_entry['Dataset'])}"
237+
238+
185239
def test_decoding_to_array(build_test_samples, array_out=True):
186240
path = build_test_samples
187241
query_output = file_peeking.run_query(path)

0 commit comments

Comments
 (0)