3
3
import io
4
4
import struct
5
5
6
- from pyzstd import ZstdFile
7
6
from ranges import Range
8
7
9
8
from ...range_stream import RangeStream
10
- from ..zstd import ZstdTarFile
11
9
from .data import COMPRESSIONS , TarData
12
10
13
11
__all__ = ["ZipStream" ]
@@ -20,10 +18,10 @@ def __init__(
20
18
client = None ,
21
19
byte_range : Range | tuple [int , int ] = Range ("[0, 0)" ),
22
20
pruning_level : int = 0 ,
23
- scan_header : bool = True ,
21
+ scan_headers : bool = True ,
24
22
):
25
23
"""
26
- As for RangeStream, but if `scan_header ` is True, then immediately call
24
+ As for RangeStream, but if `scan_headers ` is True, then immediately call
27
25
:meth:`check_header_rec` on initialisation (which will perform the necessary
28
26
of range request to identify the files in the tar from the header record),
29
27
setting :attr:`tarred_files`, and :meth:`~RangeStream.add` their file content
@@ -43,17 +41,86 @@ def __init__(
43
41
url = url , client = client , byte_range = byte_range , pruning_level = pruning_level
44
42
)
45
43
self .data = TarData ()
46
- if scan_header :
47
- self .check_header_rec ()
48
- # self.add_file_ranges()
44
+ if scan_headers :
45
+ self .check_header_recs ()
46
+ self .add_file_ranges ()
49
47
50
- def check_header_rec (self ):
51
- head_byte_range = Range (0 , 257 ) # rest of first 512 bytes is padding
52
- self .add (head_byte_range )
53
- start_bytes = self .active_range_response .read ()
48
def check_header_recs(self):
    """
    Walk every header record in the tar stream and populate
    :attr:`tarred_files` with one
    :class:`range_streams.codecs.tar.TarredFileInfo` per archived file.

    Only the file name and file size fields of each header are requested
    (via :meth:`read_file_name` and :meth:`read_file_size`); the archived
    file contents themselves are not downloaded here.

    The walk ends when the cursor reaches the end-of-archive padding region,
    or earlier if a header position turns out to contain only padding.
    """
    self.tarred_files: list[TarredFileInfo] = []
    cursor = 0  # offset of the header record currently being inspected
    assert self.total_bytes is not None
    while True:
        if cursor >= self.total_bytes - self.data.HEADER._H_END_PAD_SIZE:
            break
        try:
            name = self.read_file_name(start_pos_offset=cursor)
        except StopIteration:
            # Expected if a tarball has more than 2 end-of-file padding records
            break
        size = self.read_file_size(start_pos_offset=cursor)
        block = self.data.HEADER._H_PAD_SIZE
        # One header block, then the content rounded up to a whole block
        record_span = block + size + (-size % block)
        self.tarred_files.append(
            TarredFileInfo(
                size=size,
                padded_size=record_span,
                filename_length=len(name),
                header_offset=cursor,
                filename=name,
            )
        )
        cursor += record_span  # jump to the next header record
83
+
84
def read_file_name(self, start_pos_offset: int = 0) -> str:
    """
    Request the file name field of the header record beginning at
    ``start_pos_offset`` and return it as a string.

    Raises :class:`StopIteration` if the field holds nothing but padding,
    which signals that there is no header record at this position.
    """
    header = self.data.HEADER
    field_start = start_pos_offset + header._H_FILENAME_START
    field_end = field_start + header._H_FILENAME_SIZE
    self.add(Range(field_start, field_end))
    raw_name = self.active_range_response.read().rstrip(b"\x00 ")
    if not raw_name:
        raise StopIteration("Expected file name, got padding bytes")
    return raw_name.decode("ascii")
93
+
94
def read_file_size(self, start_pos_offset: int = 0) -> int:
    """
    Request the file size field of the header record beginning at
    ``start_pos_offset`` and return the size as an integer.

    The field is read as an octal number in ASCII. Any NUL or space
    terminator/padding bytes around the digits are removed before parsing,
    since ``int`` rejects a bytestring containing NUL bytes.

    Raises :class:`ValueError` if no octal digits remain to be parsed.
    """
    header = self.data.HEADER
    field_start = start_pos_offset + header._H_FILE_SIZE_START
    field_end = field_start + header._H_FILE_SIZE_SIZE
    self.add(Range(field_start, field_end))
    raw_size = self.active_range_response.read()
    # Convert the octal number from a bytestring, ignoring field terminators
    return int(raw_size.strip(b"\x00 "), 8)
102
+
103
def add_file_ranges(self):
    """
    Add the range of each entry in :attr:`tarred_files` to the stream,
    naming each added range after the archived file it belongs to.
    """
    for entry in self.tarred_files:
        member_name = entry.filename
        assert member_name is not None
        self.add(entry.file_range, name=member_name)
107
+
108
@property
def filename_list(self) -> list[str]:
    """
    The file names recorded in
    :attr:`~range_streams.codecs.tar.TarStream.tarred_files`, skipping any
    entry whose name is ``None``. The header records are scanned first if
    that has not happened yet.
    """
    if not hasattr(self, "tarred_files"):  # pragma: no cover
        self.check_header_recs()
    names = []
    for entry in self.tarred_files:
        if entry.filename is not None:
            names.append(entry.filename)
    return names
54
117
55
118
56
119
class HeaderInfo :
120
+ """
121
+ Not used, may be useful if extending the class. Note USTAR format variant.
122
+ """
123
+
57
124
_H_FILENAME = 0
58
125
_H_FILE_MODE = 1
59
126
_H_OWNER_UID = 2
@@ -75,71 +142,27 @@ class TarredFileInfo(HeaderInfo):
75
142
76
143
def __init__ (
77
144
self ,
78
- # signature: bytes | int,
79
- # flags: bytes | int,
80
- # compress_type: bytes | int,
81
- # compressed_size: bytes | int,
82
- # uncompressed_size: bytes | int,
83
- # filename_length: bytes | int,
84
- # extra_field_length: bytes | int,
85
- # comment_length: bytes | int,
86
- # local_header_offset: bytes | int,
87
- # filename: str | None,
145
+ size : int , # ignoring header and trailing padding
146
+ padded_size : bytes | int , # including both header and trailing padding
147
+ filename_length : bytes | int ,
148
+ header_offset : int ,
149
+ filename : str | None ,
88
150
):
89
- pass
90
- # self.signature = signature
91
- # self.flags = flags
92
- # self.compress_type = compress_type
93
- # self.compressed_size = compressed_size
94
- # self.uncompressed_size = uncompressed_size
95
- # self.filename_length = filename_length
96
- # self.extra_field_length = extra_field_length
97
- # self.comment_length = comment_length
98
- # self.local_header_offset = local_header_offset
99
- # self.filename = filename
151
+ self .size = size
152
+ self .padded_size = padded_size
153
+ self .filename_length = filename_length
154
+ self .header_offset = header_offset
155
+ self .filename = filename
100
156
101
157
def __repr__ (self ):
102
158
return (
103
159
f"{ self .__class__ .__name__ } "
104
- # f" '{self.filename if self.filename is not None else ''}'"
105
- # f" @ {self.local_header_offset !r}: {self.compressed_size !r}B"
160
+ f" '{ self .filename if self .filename is not None else '' } '"
161
+ f" @ { self .header_offset !r} : { self .size !r} B"
106
162
)
107
163
108
- # @classmethod
109
- # def from_central_directory_entry(
110
- # cls,
111
- # cd_entry: tuple,
112
- # filename: str | None = None,
113
- # ):
114
- # """
115
- # Instantiate directly from an unpacked central directory struct
116
- # (describing the zipped file entry in a standardised entry order).
117
- # """
118
- # signature = cd_entry[cls._CD_SIGNATURE]
119
- # flags = cd_entry[cls._CD_FLAG_BITS]
120
- # compress_type = cd_entry[cls._CD_COMPRESS_TYPE]
121
- # compressed_size = cd_entry[cls._CD_COMPRESSED_SIZE]
122
- # uncompressed_size = cd_entry[cls._CD_UNCOMPRESSED_SIZE]
123
- # filename_length = cd_entry[cls._CD_FILENAME_LENGTH]
124
- # extra_field_length = cd_entry[cls._CD_EXTRA_FIELD_LENGTH]
125
- # comment_length = cd_entry[cls._CD_COMMENT_LENGTH]
126
- # local_header_offset = cd_entry[cls._CD_LOCAL_HEADER_OFFSET]
127
- # return cls(
128
- # signature=signature,
129
- # flags=flags,
130
- # compress_type=compress_type,
131
- # compressed_size=compressed_size,
132
- # uncompressed_size=uncompressed_size,
133
- # filename_length=filename_length,
134
- # extra_field_length=extra_field_length,
135
- # comment_length=comment_length,
136
- # local_header_offset=local_header_offset,
137
- # filename=filename,
138
- # )
139
-
140
- # @property
141
- # def file_range(self):
142
- # sig_start = self.local_header_offset
143
- # start = sig_start + ZipData().LOC_F_H.get_size() + self.filename_length
144
- # end = start + self.compressed_size
145
- # return Range(start, end)
164
@property
def file_range(self):
    """
    The byte range of the archived file's content, excluding its header
    record and any trailing padding.

    :attr:`header_offset` marks the start of the header record, which takes
    up one whole block ahead of the content, so the content begins one block
    after that offset and runs for :attr:`size` bytes.
    """
    # Skip over the header block so the range covers only the file content
    content_start = self.header_offset + TarData().HEADER._H_PAD_SIZE
    return Range(content_start, content_start + self.size)
0 commit comments