Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ servicex.yaml

#Testing
samples_structure.txt

tmp_test.py
40 changes: 34 additions & 6 deletions servicex_analysis_utils/file_peeking.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@
from servicex.dataset_identifier import DataSetIdentifier


def run_query(input_filenames):
def run_query(
input_filenames,
):
import uproot
import awkward as ak
import json
Expand All @@ -59,13 +61,24 @@ def is_tree(obj):
tree_dict = {}

with uproot.open(input_filenames) as file:

for tree_name in file.keys():
tree_name_clean = tree_name.rstrip(";1")
tree = file[tree_name]

if not is_tree(tree):
continue

if tree_name_clean == "MetaData":
fm_branches = [
b for b in tree.keys() if b.startswith("FileMetaDataAuxDyn.")
]
# remove the prefix in keys
meta_dict = {
p[19:]: str(tree[p].array(library="ak")[0]) for p in fm_branches
}
tree_dict["FileMetaData"] = meta_dict

branch_dict = {}
for branch_name, branch in tree.items():
branch_type = str(branch.interpretation)
Expand Down Expand Up @@ -174,9 +187,6 @@ def print_structure_from_str(
import json

output_lines = []
output_lines.append(
f"\nFile structure of all samples with branch filter '{filter_branch}':"
)

for sample_name, path in deliver_dict.items():
structure_str = open_delivered_file(sample_name, path)
Expand All @@ -186,9 +196,25 @@ def print_structure_from_str(
structure_dict = json.loads(structure_str)

output_lines.append(
f"\n---------------------------\n"
"\n---------------------------\n"
f"\U0001f4c1 Sample: {sample_name}\n"
f"---------------------------"
"---------------------------"
)

# Get the metadata first
output_lines.append("\nFile Metadata \u2139\ufe0f :\n")
if "FileMetaData" not in structure_dict:
output_lines.append("No FileMetaData found in dataset.")
else:
for key, value in structure_dict.get("FileMetaData", {}).items():
output_lines.append(f"── {key}: {value}")
output_lines.append("\n---------------------------")

# drop the File metadata from the trees
structure_dict.pop("FileMetaData", {})

output_lines.append(
f"\nFile structure with branch filter \U0001f33f '{filter_branch}':\n"
)

for tree_name, branches in structure_dict.items():
Expand Down Expand Up @@ -259,6 +285,8 @@ def str_to_array(encoded_json_str):
"""
reconstructed_data = {}
structure_dict = json.loads(encoded_json_str)
# drop the File metadata from the trees
structure_dict.pop("FileMetaData", {})

for treename, branch_dict in structure_dict.items():
branches = {}
Expand Down
19 changes: 19 additions & 0 deletions tests/data/expected_metadata.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

---------------------------
📁 Sample: test_file
---------------------------

File Metadata ℹ️ :

── test_100: 100
── test_abc: abc

---------------------------

File structure with branch filter 🌿 '':


🌳 Tree: MetaData
├── Branches:
│ ├── FileMetaDataAuxDyn.test_100 ; dtype: AsDtype('>i8')
│ ├── FileMetaDataAuxDyn.test_abc ; dtype: AsStrings()
11 changes: 9 additions & 2 deletions tests/data/expected_structure.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@

File structure of all samples with branch filter '':

---------------------------
📁 Sample: test_file
---------------------------

File Metadata ℹ️ :

No FileMetaData found in dataset.

---------------------------

File structure with branch filter 🌿 '':


🌳 Tree: background
├── Branches:
│ ├── branch1 ; dtype: AsDtype('>f8')
Expand Down
88 changes: 88 additions & 0 deletions tests/test_file_peeking_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Copyright (c) 2025, IRIS-HEP
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import pytest
import uproot
import json
import os
from servicex_analysis_utils import file_peeking
from pathlib import Path


@pytest.fixture
def build_test_samples(tmp_path):
    """Write a temporary ROOT file with a ``MetaData`` tree and return its path.

    The tree holds two single-entry branches, one integer and one string,
    both named with the ``FileMetaDataAuxDyn.`` prefix.

    Args:
        tmp_path: pytest-provided temporary directory for this test.

    Returns:
        The path of the created ROOT file, as a string.
    """
    # One entry per branch: an integer value and a string value.
    metadata_branches = {
        "FileMetaDataAuxDyn.test_100": [100],
        "FileMetaDataAuxDyn.test_abc": ["abc"],
    }
    root_file_path = str(tmp_path / "test_metadata.root")

    # Assigning the dict creates the "MetaData" tree inside the new file.
    with uproot.create(root_file_path) as root_file:
        root_file["MetaData"] = metadata_branches

    return root_file_path


# Test run_query and print_structure_from_str
def test_metadata_retrieval(build_test_samples, tmp_path, capsys):
    """Check FileMetaData extraction and its rendering in the structure dump.

    First runs ``file_peeking.run_query`` on the ROOT file produced by the
    ``build_test_samples`` fixture and compares the decoded JSON with the
    expected dictionary. Then stores the query output in a second ROOT file,
    wraps it in a ``servicex.deliver()``-like dict and compares the string
    returned by ``file_peeking.print_structure_from_str`` with the reference
    text in ``data/expected_metadata.txt`` next to this test module.

    Args:
        build_test_samples: path of the ROOT file holding the MetaData tree.
        tmp_path: pytest-provided temporary directory for this test.
        capsys: pytest capture fixture (requested but not read here).
    """
    path = build_test_samples
    query_output = file_peeking.run_query(path)

    # The first element of the query output is decoded as JSON below.
    expected_result = {
        "FileMetaData": {"test_100": "100", "test_abc": "abc"},
        "MetaData": {
            "FileMetaDataAuxDyn.test_100": "AsDtype('>i8')",
            "FileMetaDataAuxDyn.test_abc": "AsStrings()",
        },
    }
    encoded_result = json.loads(query_output[0])

    assert encoded_result == expected_result

    # Produce a servicex.deliver()-like dict, i.e. {"Sample Name": ["Path"]}
    encoded_path = tmp_path / "encoded.root"
    with uproot.create(encoded_path) as file:
        file["servicex"] = {"branch": query_output}
    assert encoded_path.exists(), "servicex-like test file not found."
    deliver_dict = {"test_file": [str(encoded_path)]}

    # Test string formatting
    output_str = file_peeking.print_structure_from_str(deliver_dict)

    # Resolve the reference file relative to this module so the test does not
    # depend on the directory pytest is launched from.
    expected_path = Path(__file__).parent / "data" / "expected_metadata.txt"
    expected = expected_path.read_text(encoding="utf-8")

    assert (
        expected == output_str
    ), f"Output does not match expected.\n Output: {output_str}"