@@ -56,8 +56,13 @@ class BaseCache:
 
     def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
         self.blocksize = blocksize
+        self.nblocks = 0
         self.fetcher = fetcher
         self.size = size
+        self.hit_count = 0
+        self.miss_count = 0
+        # the bytes that we actually requested
+        self.total_requested_bytes = 0
 
     def _fetch(self, start: int | None, stop: int | None) -> bytes:
         if start is None:
@@ -68,6 +73,36 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
             return b""
         return self.fetcher(start, stop)
 
+    def _reset_stats(self) -> None:
+        """Reset hit and miss counts for a more granular report e.g. by file."""
+        self.hit_count = 0
+        self.miss_count = 0
+        self.total_requested_bytes = 0
+
+    def _log_stats(self) -> str:
+        """Return a formatted string of the cache statistics."""
+        if self.hit_count == 0 and self.miss_count == 0:
+            # a cache that does nothing, this is for logs only
+            return ""
+        return " , %s: %d hits, %d misses, %d total requested bytes" % (
+            self.name,
+            self.hit_count,
+            self.miss_count,
+            self.total_requested_bytes,
+        )
+
+    def __repr__(self) -> str:
+        # TODO: use rich for better formatting
+        return f"""
+        <{self.__class__.__name__}:
+            block size  :   {self.blocksize}
+            block count :   {self.nblocks}
+            file size   :   {self.size}
+            cache hits  :   {self.hit_count}
+            cache misses:   {self.miss_count}
+            total requested bytes: {self.total_requested_bytes}>
+        """
+
 
 class MMapCache(BaseCache):
     """memory-mapped sparse file cache
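For illustration (not part of this commit): a minimal sketch of how the new counters surface through `_log_stats()` and the generic `__repr__`, assuming only that `fsspec.caching.BaseCache` is importable. The `CountingCache` subclass and the in-memory `fetcher` are hypothetical, defined just for this example.

# hypothetical illustration: a toy BaseCache subclass over an in-memory "file"
from fsspec.caching import BaseCache

data = b"x" * 4096

def fetcher(start, end):
    # stands in for a real byte-range reader (HTTP range request, seek+read, ...)
    return data[start:end]

class CountingCache(BaseCache):
    name = "counting"  # hypothetical; _log_stats() reports this name

    def _fetch(self, start, end):
        start = start or 0
        end = self.size if end is None else min(end, self.size)
        self.miss_count += 1                       # this toy cache never caches
        self.total_requested_bytes += end - start
        return self.fetcher(start, end)

c = CountingCache(blocksize=1024, fetcher=fetcher, size=len(data))
c._fetch(0, 100)
c._fetch(0, 100)
print(c._log_stats())  # " , counting: 0 hits, 2 misses, 200 total requested bytes"
print(c)               # multi-line summary from the new BaseCache.__repr__
c._reset_stats()       # zero the counters, e.g. when moving on to another file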
@@ -126,13 +161,18 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
         start_block = start // self.blocksize
         end_block = end // self.blocksize
         need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
+        hits = [i for i in range(start_block, end_block + 1) if i in self.blocks]
+        self.miss_count += len(need)
+        self.hit_count += len(hits)
         while need:
             # TODO: not a for loop so we can consolidate blocks later to
             # make fewer fetch calls; this could be parallel
             i = need.pop(0)
+
             sstart = i * self.blocksize
             send = min(sstart + self.blocksize, self.size)
-            logger.debug(f"MMap get block #{i} ({sstart}-{send}")
+            self.total_requested_bytes += send - sstart
+            logger.debug(f"MMap get block #{i} ({sstart}-{send})")
             self.cache[sstart:send] = self.fetcher(sstart, send)
             self.blocks.add(i)
 
@@ -176,16 +216,20 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
         l = end - start
         if start >= self.start and end <= self.end:
             # cache hit
+            self.hit_count += 1
             return self.cache[start - self.start : end - self.start]
         elif self.start <= start < self.end:
             # partial hit
+            self.miss_count += 1
             part = self.cache[start - self.start :]
             l -= len(part)
             start = self.end
         else:
             # miss
+            self.miss_count += 1
             part = b""
         end = min(self.size, end + self.blocksize)
+        self.total_requested_bytes += end - start
         self.cache = self.fetcher(start, end)  # new block replaces old
         self.start = start
         self.end = self.start + len(self.cache)
@@ -202,24 +246,39 @@ class FirstChunkCache(BaseCache):
     name = "first"
 
     def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
+        if blocksize > size:
+            # this will buffer the whole thing
+            blocksize = size
         super().__init__(blocksize, fetcher, size)
         self.cache: bytes | None = None
 
     def _fetch(self, start: int | None, end: int | None) -> bytes:
         start = start or 0
-        end = end or self.size
+        if start > self.size:
+            logger.debug("FirstChunkCache: requested start > file size")
+            return b""
+
+        end = self.size if end is None else min(end, self.size)
+
         if start < self.blocksize:
             if self.cache is None:
+                self.miss_count += 1
                 if end > self.blocksize:
+                    self.total_requested_bytes += end
                     data = self.fetcher(0, end)
                     self.cache = data[: self.blocksize]
                     return data[start:]
                 self.cache = self.fetcher(0, self.blocksize)
+                self.total_requested_bytes += self.blocksize
             part = self.cache[start:end]
             if end > self.blocksize:
+                self.total_requested_bytes += end - self.blocksize
                 part += self.fetcher(self.blocksize, end)
+            self.hit_count += 1
             return part
         else:
+            self.miss_count += 1
+            self.total_requested_bytes += end - start
             return self.fetcher(start, end)
 
 
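For illustration (not part of this commit), a sketch of the accounting this gives FirstChunkCache, with a hypothetical in-memory fetcher. Note that the first read below blocksize records both a miss (the block had to be fetched) and a hit (the slice is then served from the cached block).

from fsspec.caching import FirstChunkCache

def fetcher(start, end):
    return b"x" * (end - start)  # dummy byte-range reader

c = FirstChunkCache(blocksize=1024, fetcher=fetcher, size=4096)
c._fetch(0, 100)       # cache empty: 1 miss + 1 hit, fetches the first 1024 bytes
c._fetch(10, 50)       # served from the cached first block: 1 hit
c._fetch(2000, 2100)   # past the first block: 1 miss, 100 bytes straight from the fetcher
print(c._log_stats())  # " , first: 2 hits, 2 misses, 1124 total requested bytes"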
@@ -256,12 +315,6 @@ def __init__(
         self.maxblocks = maxblocks
         self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)
 
-    def __repr__(self) -> str:
-        return (
-            f"<BlockCache blocksize={self.blocksize}, "
-            f"size={self.size}, nblocks={self.nblocks}>"
-        )
-
     def cache_info(self):
         """
         The statistics on the block cache.
@@ -319,6 +372,8 @@ def _fetch_block(self, block_number: int) -> bytes:
 
         start = block_number * self.blocksize
         end = start + self.blocksize
+        self.total_requested_bytes += end - start
+        self.miss_count += 1
         logger.info("BlockCache fetching block %d", block_number)
         block_contents = super()._fetch(start, end)
         return block_contents
@@ -339,6 +394,7 @@ def _read_cache(
         start_pos = start % self.blocksize
         end_pos = end % self.blocksize
 
+        self.hit_count += 1
         if start_block_number == end_block_number:
             block: bytes = self._fetch_block_cached(start_block_number)
             return block[start_pos:end_pos]
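One thing to note about the accounting in the two hunks above (not spelled out in the commit): for BlockCache, hit_count is incremented once per _read_cache call, while miss_count is incremented in _fetch_block, which only runs when the lru_cache has no entry for that block. Hits and misses are therefore not mutually exclusive; a first read spanning several blocks records one hit and several misses. A hypothetical sketch with an in-memory fetcher:

from fsspec.caching import BlockCache

def fetcher(start, end):
    return b"x" * (end - start)  # dummy byte-range reader

c = BlockCache(blocksize=100, fetcher=fetcher, size=1000, maxblocks=32)
c._fetch(0, 250)  # blocks 0-2 fetched for the first time: 3 misses, 1 hit
c._fetch(0, 250)  # every block already in the LRU cache: 1 more hit, no misses
print(c.hit_count, c.miss_count, c.total_requested_bytes)  # 2 3 300
print(c.cache_info())  # the wrapped lru_cache's own hit/miss statistics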
@@ -404,6 +460,7 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
         ):
             # cache hit: we have all the required data
             offset = start - self.start
+            self.hit_count += 1
             return self.cache[offset : offset + end - start]
 
         if self.blocksize:
@@ -418,27 +475,34 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
             self.end is None or end > self.end
         ):
             # First read, or extending both before and after
+            self.total_requested_bytes += bend - start
+            self.miss_count += 1
             self.cache = self.fetcher(start, bend)
             self.start = start
         else:
             assert self.start is not None
             assert self.end is not None
+            self.miss_count += 1
 
             if start < self.start:
                 if self.end is None or self.end - end > self.blocksize:
+                    self.total_requested_bytes += bend - start
                     self.cache = self.fetcher(start, bend)
                     self.start = start
                 else:
+                    self.total_requested_bytes += self.start - start
                     new = self.fetcher(start, self.start)
                     self.start = start
                     self.cache = new + self.cache
             elif self.end is not None and bend > self.end:
                 if self.end > self.size:
                     pass
                 elif end - self.end > self.blocksize:
+                    self.total_requested_bytes += bend - start
                     self.cache = self.fetcher(start, bend)
                     self.start = start
                 else:
+                    self.total_requested_bytes += bend - self.end
                     new = self.fetcher(self.end, bend)
                     self.cache = self.cache + new
 
@@ -470,10 +534,13 @@ def __init__(
     ) -> None:
         super().__init__(blocksize, fetcher, size)  # type: ignore[arg-type]
         if data is None:
+            self.miss_count += 1
+            self.total_requested_bytes += self.size
             data = self.fetcher(0, self.size)
         self.data = data
 
     def _fetch(self, start: int | None, stop: int | None) -> bytes:
+        self.hit_count += 1
         return self.data[start:stop]
 
 
@@ -551,6 +618,7 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
                     # are allowed to pad reads beyond the
                     # buffer with zero
                     out += b"\x00" * (stop - start - len(out))
+                    self.hit_count += 1
                     return out
                 else:
                     # The request ends outside a known range,
@@ -572,6 +640,8 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
             f"IO/caching performance may be poor!"
         )
         logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
+        self.total_requested_bytes += stop - start
+        self.miss_count += 1
         return out + super()._fetch(start, stop)
 
 
@@ -676,12 +746,6 @@ def __init__(
         self._fetch_future: Future[bytes] | None = None
         self._fetch_future_lock = threading.Lock()
 
-    def __repr__(self) -> str:
-        return (
-            f"<BackgroundBlockCache blocksize={self.blocksize}, "
-            f"size={self.size}, nblocks={self.nblocks}>"
-        )
-
     def cache_info(self) -> UpdatableLRU.CacheInfo:
         """
         The statistics on the block cache.
@@ -799,6 +863,8 @@ def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
         start = block_number * self.blocksize
         end = start + self.blocksize
         logger.info("BlockCache fetching block (%s) %d", log_info, block_number)
+        self.total_requested_bytes += end - start
+        self.miss_count += 1
         block_contents = super()._fetch(start, end)
         return block_contents
 
@@ -818,6 +884,9 @@ def _read_cache(
         start_pos = start % self.blocksize
         end_pos = end % self.blocksize
 
+        # kind of pointless to count this as a hit, but it is
+        self.hit_count += 1
+
         if start_block_number == end_block_number:
             block = self._fetch_block_cached(start_block_number)
             return block[start_pos:end_pos]