Skip to content

Commit 03faa82

Browse files
committed
Add tarball codec and tests
1 parent ee43511 commit 03faa82

File tree

5 files changed

+158
-72
lines changed

5 files changed

+158
-72
lines changed

src/range_streams/codecs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
The currently supported list of codecs is:
66
- .zip
77
- .conda (zip containing zstd-compressed tarballs, used for the conda package archives)
8+
- .tar (uncompressed only)
9+
- .png
810
911
There are planned extensions to other archive and image formats.
1012
"""

src/range_streams/codecs/tar/data.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ class HeaderData(SimpleDataClass):
5555
_H_CHECKSUM_SIZE = 8
5656
_H_LINK_INDICATOR_SIZE = 1
5757
_H_LINKED_NAME_SIZE = 100
58+
_H_PAD_SIZE = 512
59+
# Standard end-of-file padding is 2 padding records
60+
_H_END_PAD_SIZE = 2 * _H_PAD_SIZE
5861

5962

6063
class TarData:

src/range_streams/codecs/tar/stream.py

Lines changed: 95 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,9 @@
33
import io
44
import struct
55

6-
from pyzstd import ZstdFile
76
from ranges import Range
87

98
from ...range_stream import RangeStream
10-
from ..zstd import ZstdTarFile
119
from .data import COMPRESSIONS, TarData
1210

1311
# Public API of the tar codec stream module. The stream class is referred to
# as ``TarStream`` elsewhere (docstrings and tests); "ZipStream" was a
# leftover from the zip codec and does not name anything in this module.
__all__ = ["TarStream"]
@@ -20,10 +18,10 @@ def __init__(
2018
client=None,
2119
byte_range: Range | tuple[int, int] = Range("[0, 0)"),
2220
pruning_level: int = 0,
23-
scan_header: bool = True,
21+
scan_headers: bool = True,
2422
):
2523
"""
26-
As for RangeStream, but if `scan_header` is True, then immediately call
24+
As for RangeStream, but if `scan_headers` is True, then immediately call
2725
:meth:`check_header_recs` on initialisation (which will perform the necessary
2826
range requests to identify the files in the tar from the header records),
2927
setting :attr:`tarred_files`, and :meth:`~RangeStream.add` their file content
@@ -43,17 +41,86 @@ def __init__(
4341
url=url, client=client, byte_range=byte_range, pruning_level=pruning_level
4442
)
4543
self.data = TarData()
46-
if scan_header:
47-
self.check_header_rec()
48-
# self.add_file_ranges()
44+
if scan_headers:
45+
self.check_header_recs()
46+
self.add_file_ranges()
4947

50-
def check_header_rec(self):
51-
head_byte_range = Range(0, 257) # rest of first 512 bytes is padding
52-
self.add(head_byte_range)
53-
start_bytes = self.active_range_response.read()
48+
def check_header_recs(self):
    """
    Walk the header records of the tarball and populate :attr:`tarred_files`.

    Starting at offset 0, the file name and file size fields of each header
    are fetched (via :meth:`read_file_name` and :meth:`read_file_size`) and a
    :class:`TarredFileInfo` is appended for every entry found. The archived
    file contents themselves are not requested here.

    Scanning stops when the cursor reaches the end-of-file padding region
    (``total_bytes - _H_END_PAD_SIZE``) or when :meth:`read_file_name`
    signals an empty name by raising ``StopIteration``.
    """
    header_consts = self.data.HEADER
    block_size = header_consts._H_PAD_SIZE
    self.tarred_files: list[TarredFileInfo] = []
    cursor = 0
    assert self.total_bytes is not None
    while cursor < (self.total_bytes - header_consts._H_END_PAD_SIZE):
        try:
            name = self.read_file_name(start_pos_offset=cursor)
        except StopIteration:
            # Expected if a tarball has more than 2 end-of-file padding records
            break
        content_size = self.read_file_size(start_pos_offset=cursor)
        # Bytes needed to round the content up to a whole number of blocks
        trailing_pad = -content_size % block_size
        # Header block + content + trailing padding: distance to next header
        record_span = block_size + content_size + trailing_pad
        self.tarred_files.append(
            TarredFileInfo(
                size=content_size,
                padded_size=record_span,
                filename_length=len(name),
                header_offset=cursor,
                filename=name,
            )
        )
        cursor += record_span
83+
84+
def read_file_name(self, start_pos_offset: int = 0) -> str:
    """
    Fetch and decode the file name field of the header at ``start_pos_offset``.

    A range covering ``_H_FILENAME_SIZE`` bytes from
    ``start_pos_offset + _H_FILENAME_START`` is added to the stream and read,
    and trailing NUL padding is stripped.

    Args:
      start_pos_offset: Offset of the start of the header record.

    Returns:
      The file name as text.

    Raises:
      StopIteration: If the field holds only NUL bytes (i.e. padding rather
        than a header).
      UnicodeDecodeError: If the name bytes are not valid UTF-8.
    """
    header_consts = self.data.HEADER
    name_start = start_pos_offset + header_consts._H_FILENAME_START
    name_end = name_start + header_consts._H_FILENAME_SIZE
    self.add(Range(name_start, name_end))
    name_bytes = self.active_range_response.read().rstrip(b"\x00")
    if name_bytes == b"":
        raise StopIteration("Expected file name, got padding bytes")
    # UTF-8 is a superset of ASCII: plain ASCII names decode exactly as
    # before, while non-ASCII member names no longer raise.
    return name_bytes.decode("utf-8")
93+
94+
def read_file_size(self, start_pos_offset: int = 0) -> int:
    """
    Fetch and parse the file size field of the header at ``start_pos_offset``.

    A range covering ``_H_FILE_SIZE_SIZE`` bytes from
    ``start_pos_offset + _H_FILE_SIZE_START`` is added to the stream and
    read; the bytes are interpreted as an octal number.

    Args:
      start_pos_offset: Offset of the start of the header record.

    Returns:
      The archived file's size in bytes.

    Raises:
      ValueError: If the field does not contain an octal number.
    """
    header_consts = self.data.HEADER
    size_start = start_pos_offset + header_consts._H_FILE_SIZE_START
    size_end = size_start + header_consts._H_FILE_SIZE_SIZE
    self.add(Range(size_start, size_end))
    raw_size = self.active_range_response.read()
    # Numeric tar fields may be terminated/padded with NUL or space bytes;
    # int() tolerates surrounding whitespace but rejects NUL, so strip both.
    return int(raw_size.strip(b"\x00 "), 8)  # octal digits -> int
102+
103+
def add_file_ranges(self):
    """
    Add a named range to the stream for every entry in :attr:`tarred_files`.

    Each entry's ``file_range`` is passed to :meth:`add`, using the entry's
    file name as the range name.
    """
    for entry in self.tarred_files:
        entry_name = entry.filename
        assert entry_name is not None
        self.add(entry.file_range, name=entry_name)
107+
108+
@property
def filename_list(self) -> list[str]:
    """
    Names of the files recorded in :attr:`tarred_files`.

    If the headers have not been scanned yet (no ``tarred_files`` attribute),
    :meth:`check_header_recs` is called first. Entries whose ``filename`` is
    ``None`` are left out.
    """
    if not hasattr(self, "tarred_files"):  # pragma: no cover
        self.check_header_recs()
    names = []
    for entry in self.tarred_files:
        if entry.filename is not None:
            names.append(entry.filename)
    return names
54117

55118

56119
class HeaderInfo:
120+
"""
121+
Not used, may be useful if extending the class. Note USTAR format variant.
122+
"""
123+
57124
_H_FILENAME = 0
58125
_H_FILE_MODE = 1
59126
_H_OWNER_UID = 2
@@ -75,71 +142,27 @@ class TarredFileInfo(HeaderInfo):
75142

76143
def __init__(
77144
self,
78-
# signature: bytes | int,
79-
# flags: bytes | int,
80-
# compress_type: bytes | int,
81-
# compressed_size: bytes | int,
82-
# uncompressed_size: bytes | int,
83-
# filename_length: bytes | int,
84-
# extra_field_length: bytes | int,
85-
# comment_length: bytes | int,
86-
# local_header_offset: bytes | int,
87-
# filename: str | None,
145+
size: int, # ignoring header and trailing padding
146+
padded_size: bytes | int, # including both header and trailing padding
147+
filename_length: bytes | int,
148+
header_offset: int,
149+
filename: str | None,
88150
):
89-
pass
90-
# self.signature = signature
91-
# self.flags = flags
92-
# self.compress_type = compress_type
93-
# self.compressed_size = compressed_size
94-
# self.uncompressed_size = uncompressed_size
95-
# self.filename_length = filename_length
96-
# self.extra_field_length = extra_field_length
97-
# self.comment_length = comment_length
98-
# self.local_header_offset = local_header_offset
99-
# self.filename = filename
151+
self.size = size
152+
self.padded_size = padded_size
153+
self.filename_length = filename_length
154+
self.header_offset = header_offset
155+
self.filename = filename
100156

101157
def __repr__(self):
102158
return (
103159
f"{self.__class__.__name__}"
104-
# f" '{self.filename if self.filename is not None else ''}'"
105-
# f" @ {self.local_header_offset!r}: {self.compressed_size!r}B"
160+
f" '{self.filename if self.filename is not None else ''}'"
161+
f" @ {self.header_offset!r}: {self.size!r}B"
106162
)
107163

108-
# @classmethod
109-
# def from_central_directory_entry(
110-
# cls,
111-
# cd_entry: tuple,
112-
# filename: str | None = None,
113-
# ):
114-
# """
115-
# Instantiate directly from an unpacked central directory struct
116-
# (describing the zipped file entry in a standardised entry order).
117-
# """
118-
# signature = cd_entry[cls._CD_SIGNATURE]
119-
# flags = cd_entry[cls._CD_FLAG_BITS]
120-
# compress_type = cd_entry[cls._CD_COMPRESS_TYPE]
121-
# compressed_size = cd_entry[cls._CD_COMPRESSED_SIZE]
122-
# uncompressed_size = cd_entry[cls._CD_UNCOMPRESSED_SIZE]
123-
# filename_length = cd_entry[cls._CD_FILENAME_LENGTH]
124-
# extra_field_length = cd_entry[cls._CD_EXTRA_FIELD_LENGTH]
125-
# comment_length = cd_entry[cls._CD_COMMENT_LENGTH]
126-
# local_header_offset = cd_entry[cls._CD_LOCAL_HEADER_OFFSET]
127-
# return cls(
128-
# signature=signature,
129-
# flags=flags,
130-
# compress_type=compress_type,
131-
# compressed_size=compressed_size,
132-
# uncompressed_size=uncompressed_size,
133-
# filename_length=filename_length,
134-
# extra_field_length=extra_field_length,
135-
# comment_length=comment_length,
136-
# local_header_offset=local_header_offset,
137-
# filename=filename,
138-
# )
139-
140-
# @property
141-
# def file_range(self):
142-
# sig_start = self.local_header_offset
143-
# start = sig_start + ZipData().LOC_F_H.get_size() + self.filename_length
144-
# end = start + self.compressed_size
145-
# return Range(start, end)
164+
@property
def file_range(self):
    """
    Byte range of the archived file's content within the tarball.

    ``header_offset`` marks the start of the entry's header record, so the
    content begins one header record later and runs for ``size`` bytes
    (trailing padding excluded). Previously the range began at the header
    offset itself and therefore covered header bytes instead of content.
    """
    # The header record size is the ``_H_PAD_SIZE`` constant, reached the
    # same way the stream class reads it: ``TarData().HEADER``.
    header_size = TarData().HEADER._H_PAD_SIZE
    start = self.header_offset + header_size
    end = start + self.size
    return Range(start, end)

tests/codecs/data.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
__all__ = [
22
"EXAMPLE_ZIP_URL",
33
"EXAMPLE_CONDA_URL",
4+
"EXAMPLE_TAR_URL",
45
"EXAMPLE_PNG_URL",
56
"EXAMPLE_SEMITRANSPARENT_PNG_URL",
67
]
@@ -9,5 +10,6 @@
910

1011
EXAMPLE_ZIP_URL = f"{data_dir_URL}example_text_file.txt.zip"
1112
EXAMPLE_CONDA_URL = f"{data_dir_URL}tqdm-4.61.1-pyhd3eb1b0_1.conda"
13+
EXAMPLE_TAR_URL = f"{data_dir_URL}data.tar"
1214
EXAMPLE_PNG_URL = f"{data_dir_URL}red_square.png"
1315
EXAMPLE_SEMITRANSPARENT_PNG_URL = f"{data_dir_URL}red_square_rgba_semitransparent.png"

tests/codecs/tar_test.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from __future__ import annotations
2+
3+
from pytest import fixture, mark, raises
4+
from ranges import Range
5+
6+
from range_streams.codecs import TarStream
7+
8+
from .data import EXAMPLE_TAR_URL
9+
10+
11+
@fixture(scope="session")
def example_tar_stream():
    """Session-wide ``TarStream`` opened on the example tarball URL."""
    stream = TarStream(url=EXAMPLE_TAR_URL)
    return stream
14+
15+
16+
@mark.parametrize("expected", [8192])
def test_tar_total_bytes(example_tar_stream, expected):
    """The stream reports the expected total size of the example tarball."""
    reported = example_tar_stream.total_bytes
    assert reported == expected
19+
20+
21+
@mark.parametrize(
    "expected",
    [
        ["red_square_rgba_semitransparent.png", "example_text_file.txt"],
    ],
)
def test_tar_list_files(example_tar_stream, expected):
    """``filename_list`` gives the archived file names in archive order."""
    listed = example_tar_stream.filename_list
    assert listed == expected
26+
27+
28+
@mark.parametrize(
    "file_i,size,padded_size,fname,fname_len,header_offset",
    [
        (0, 5124, 6144, "red_square_rgba_semitransparent.png", 35, 0),
        (1, 11, 1024, "example_text_file.txt", 21, 6144),
    ],
)
def test_tarred_file_contents(
    example_tar_stream, file_i, size, padded_size, fname, fname_len, header_offset
):
    """Each scanned entry carries the expected sizes, name and header offset."""
    entries = example_tar_stream.tarred_files
    assert len(entries) == 2
    entry = entries[file_i]
    observed = (
        entry.size,
        entry.padded_size,
        entry.filename,
        entry.filename_length,
        entry.header_offset,
    )
    wanted = (size, padded_size, fname, fname_len, header_offset)
    assert observed == wanted
46+
47+
48+
@mark.parametrize(
    "file_i,expected",
    [
        (0, "TarredFileInfo 'red_square_rgba_semitransparent.png' @ 0: 5124B"),
        (1, "TarredFileInfo 'example_text_file.txt' @ 6144: 11B"),
    ],
)
def test_tar_repr(example_tar_stream, file_i, expected):
    """The repr of each entry shows its name, header offset and size."""
    entry = example_tar_stream.tarred_files[file_i]
    assert repr(entry) == expected

0 commit comments

Comments
 (0)