Skip to content

Commit 48ce4b6

Browse files
committed
2.9.0 - Added tail, io_tail, reverse_io functions
- Added `common.reverse_io` function, allows reading blocks of bytes from files from the end of the file efficiently. - Added `common.io_tail` function, a pure python generator function, which works similarly to UNIX `tail`, and efficiently reads the file from the end, instead of having to load the entire file to access the last lines. - Added `common.tail` function, which is a simple wrapper around `io_tail` - to simplify usage when tailing a relatively small ( < 10k lines? ) amount of lines. Iterates over `io_tail`, loading each chunk into memory, and correctly orders the lines for immediate usage of the returned list. - Added `io_tail`, `reverse_io` and `tail` to the docs. - Added thorough unit tests for `io_tail` and `tail` - Minor cleanup of whitespace in `common.py`
1 parent 2d05b9c commit 48ce4b6

File tree

4 files changed

+248
-13
lines changed

4 files changed

+248
-13
lines changed

docs/source/helpers/privex.helpers.common.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,17 @@ privex.helpers.common
2424
env_keyval
2525
extract_settings
2626
get_function_params
27+
io_tail
2728
is_false
2829
is_true
2930
inject_items
3031
parse_csv
3132
parse_keyval
33+
random_str
34+
reverse_io
3235
shell_quote
3336
stringify
34-
random_str
37+
tail
3538
_filter_params
3639

3740

privex/helpers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def _setup_logging(level=logging.WARNING):
136136
log = _setup_logging()
137137
name = 'helpers'
138138

139-
VERSION = '2.8.1'
139+
VERSION = '2.9.0'
140140

141141

142142

privex/helpers/common.py

Lines changed: 142 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
"""
2424
import inspect
2525
import math
26+
import os
2627
import random
2728
import re
2829
import shlex
@@ -35,7 +36,7 @@
3536
from decimal import Decimal, getcontext
3637
from os import getenv as env
3738
from subprocess import PIPE, STDOUT
38-
from typing import Sequence, List, Union, Tuple, Type, Dict, Any, Iterable, Optional
39+
from typing import Sequence, List, Union, Tuple, Type, Dict, Any, Iterable, Optional, BinaryIO, Generator
3940

4041
from privex.helpers import settings
4142

@@ -51,8 +52,6 @@
5152
"""All characters from a-z, A-Z, and 0-9 - for random strings where there's no risk of user font confusion"""
5253

5354

54-
55-
5655
def random_str(size: int = 50, chars: Sequence = SAFE_CHARS) -> str:
5756
"""
5857
Generate a random string of arbitrary length using a given character set (string / list / tuple). Uses Python's
@@ -118,9 +117,6 @@ def empty(v, zero: bool = False, itr: bool = False) -> bool:
118117
return False
119118

120119

121-
122-
123-
124120
def empty_if(v: V, is_empty: K = None, not_empty: T = USE_ORIG_VAR, **kwargs) -> Union[T, K, V]:
125121
"""
126122
Syntactic sugar for ``x if empty(y) else z``. If ``not_empty`` isn't specified, then the original value ``v``
@@ -757,8 +753,6 @@ def human_name(class_name: Union[str, bytes, callable, Type[object]]) -> str:
757753
return ''.join(new_name).strip()
758754

759755

760-
761-
762756
def shell_quote(*args: str) -> str:
763757
"""
764758
Takes command line arguments as positional args, and properly quotes each argument to make it safe to
@@ -830,6 +824,146 @@ def call_sys(proc, *args, write: STRBYTES = None, **kwargs) -> Tuple[bytes, byte
830824
return stdout, stderr
831825

832826

827+
def reverse_io(f: BinaryIO, blocksize: int = 4096) -> Generator[bytes, None, None]:
    """
    Read a file as a series of blocks, from the end of the file to the start.

    The data inside each block is in normal order, only the order of the blocks is reversed.
    e.g. ``b"hello world"`` with ``blocksize=3`` -> ``[b"ld", b"wor", b"lo ", b"hel"]``

    The block at the end of the file is the short one, which leaves every other read aligned on a
    ``blocksize`` boundary. Nothing is yielded for an empty file, and no empty block is yielded when
    the file size is an exact multiple of ``blocksize``.

    The size is measured by seeking on ``f`` itself rather than by looking up ``f.name`` on disk, so any
    seekable binary stream can be used - including in-memory streams such as ``io.BytesIO``, which have
    no ``name`` or ``mode`` attribute. The position of ``f`` is moved by this function.

    As this is a generator, the argument checks below only run once iteration starts.

    Original source: https://stackoverflow.com/a/136354

    :param BinaryIO f: A seekable file handle opened in **binary mode** (e.g. ``rb``)
    :param int blocksize: Maximum number of bytes per yielded block (default: 4096 bytes)
    :raises ValueError: If ``blocksize`` is less than 1, or ``f`` reports a non-binary ``mode``
    :return Generator blocks: Blocks of ``bytes``, starting with the block at the end of the file
    """
    blocksize = int(blocksize)
    if blocksize < 1:
        raise ValueError("blocksize must be a positive integer.")

    # Only string modes are inspected - objects with no 'mode' (e.g. io.BytesIO) are taken to be binary.
    mode = getattr(f, 'mode', None)
    if isinstance(mode, str) and 'b' not in mode.lower():
        raise ValueError("File must be opened using binary mode.")

    # Ask the handle for its own length, instead of stat()'ing a path which may not exist
    # or may no longer refer to the file this handle has open.
    f.seek(0, os.SEEK_END)
    size = f.tell()
    fullblocks, lastblock = divmod(size, blocksize)

    # The first (end of file) block will be short, since this leaves the rest aligned on a
    # blocksize boundary. It's skipped entirely when there are no leftover bytes.
    if lastblock:
        f.seek(size - lastblock)
        yield f.read(lastblock)

    for i in reversed(range(fullblocks)):
        f.seek(i * blocksize)
        yield f.read(blocksize)
851+
852+
853+
def io_tail(f: BinaryIO, nlines: int = 20, bsz: int = 4096) -> Generator[List[str], None, None]:
    """
    NOTE: If you're only loading a small amount of lines, e.g. less than 1MB, consider using the much easier :func:`.tail`
    function - it only requires one call and returns the lines as a singular, correctly ordered list.

    This is a generator function which works similarly to ``tail`` on UNIX systems. It efficiently retrieves lines in reverse order using
    the passed file handle ``f``.

    WARNING: This function is a generator which returns "chunks" of lines - while the lines within each chunk are in the correct order,
    the chunks themselves are backwards, i.e. each chunk retrieves lines prior to the previous chunk.

    This function was designed as a generator to allow for **memory efficient handling of large files**, and tailing large amounts of lines.
    It only loads ``bsz`` bytes from the file handle into memory with each iteration, allowing you to process each chunk of lines as
    they're read from the file, instead of having to load all ``nlines`` lines into memory at once.

    To ensure your retrieved lines are in the correct order, with each iteration you must PREPEND the outputted chunk to your final result,
    rather than APPEND. Example::

        >>> from privex.helpers import io_tail
        >>> lines = []
        >>> with open('/tmp/example', 'rb') as fp:
        ...     # We prepend each chunk from 'io_tail' to our result variable 'lines'
        ...     for chunk in io_tail(fp, nlines=10):
        ...         lines = chunk + lines
        >>> print('\\n'.join(lines))

    Lines are split on the raw bytes (``\\n``, ``\\r\\n`` and ``\\r``) and only complete lines are converted to ``str``, so:

      * blank lines are preserved, including a blank line which happens to sit on a block boundary
      * a multi-byte character which straddles a block boundary is never converted in two halves
      * nothing is yielded for an empty file, or when ``nlines`` is less than 1

    Modified to be more memory efficient, but originally based on this SO code snippet: https://stackoverflow.com/a/136354

    :param BinaryIO f: An open file handle for the file to tail, must be in **binary mode** (e.g. ``rb``)
    :param int nlines: Total number of lines to retrieve from the end of the file
    :param int bsz: Block size (in bytes) to load with each iteration (default: 4096 bytes). DON'T CHANGE UNLESS YOU
                    UNDERSTAND WHAT THIS MEANS.
    :return Generator chunks: Generates chunks (in reverse order) of correctly ordered lines as ``List[str]``
    """
    if nlines < 1:
        return

    def _to_text(raw_lines: List[bytes]) -> List[str]:
        # Each item ends with at most one terminator and contains no other CR / LF bytes, so rstrip only
        # removes the terminator. 'stringify' is this module's existing bytes -> str helper.
        return [stringify(raw.rstrip(b'\r\n')) for raw in raw_lines]

    remaining = nlines
    # Raw bytes of the earliest line seen so far, WITH its terminator. It may be incomplete (its start could be
    # in the next block), so it's held back and glued onto the end of the next block that gets loaded.
    # Keeping the terminator is what stops a held back blank line from vanishing when it's re-split.
    pending = b''

    # Load 'bsz' bytes at a time, from file handle 'f' in reverse
    for block in reverse_io(f, blocksize=int(bsz)):
        pieces = (block + pending).splitlines(keepends=True)
        if not pieces:
            continue

        # The first piece may not be a whole line, since we're loading blocks from the bottom of the file.
        pending, complete = pieces[0], pieces[1:]
        if not complete:
            continue

        # If we've retrieved enough lines to meet the requested 'nlines', yield only as many
        # as the caller still needs (those closest to the end of the file), then finish.
        if len(complete) >= remaining:
            yield _to_text(complete[-remaining:])
            return

        remaining -= len(complete)
        yield _to_text(complete)

    # We've reached the start of the file with lines still owed to the caller, so whatever was
    # held back is the first line of the file. It's empty only if no data was read at all.
    if pending:
        yield _to_text([pending])
921+
922+
923+
def tail(filename: str, nlines: int = 20, bsz: int = 4096) -> List[str]:
    """
    Pure python equivalent of the UNIX ``tail`` command. Simply pass a filename and the number of lines you want to load
    from the end of the file, and a ``List[str]`` of lines (in forward order) will be returned.

    This function is simply a wrapper for the highly efficient :func:`.io_tail`, designed for usage with a small (<10,000) amount
    of lines to be tailed. To allow for the lines to be returned in the correct order, it must load all ``nlines`` lines into memory
    before it can return the data.

    If you need to ``tail`` a large amount of data, e.g. 10,000+ lines of a logfile, you should consider using the lower level
    function :func:`.io_tail` - which acts as a generator, only loading a certain amount of bytes into memory per iteration.

    Example file ``/tmp/testing``::

        this is an example 1
        this is an example 2
        this is an example 3
        this is an example 4
        this is an example 5
        this is an example 6

    Example usage::

        >>> from privex.helpers import tail
        >>> lines = tail('/tmp/testing', nlines=3)
        >>> print("\\n".join(lines))
        this is an example 4
        this is an example 5
        this is an example 6


    :param str filename: Path to file to tail. Relative or absolute path. Absolute path is recommended for safety.
    :param int nlines: Total number of lines to retrieve from the end of the file
    :param int bsz: Block size (in bytes) to load with each iteration (default: 4096 bytes). DON'T CHANGE UNLESS YOU
                    UNDERSTAND WHAT THIS MEANS.
    :return List[str] lines: The last 'nlines' lines of the file 'filename' - in forward order.
    """
    with open(filename, 'rb') as fp:
        chunks = list(io_tail(f=fp, nlines=nlines, bsz=bsz))

    # io_tail hands back the chunk nearest the end of the file first. Reversing the chunk order once and
    # flattening avoids re-copying the whole result list every time an earlier chunk is prepended.
    return [line for chunk in reversed(chunks) for line in chunk]
965+
966+
833967
IS_XARGS = re.compile('^\*([a-zA-Z0-9_])+$')
834968
"""Pre-compiled regex for matching catch-all positional argument parameter names like ``*args``"""
835969
IS_XKWARGS = re.compile('^\*\*([a-zA-Z0-9_])+$')

tests/general/test_general.py

Lines changed: 101 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,15 @@
2222
2323
2424
"""
25-
25+
import os
2626
from os import path, makedirs
27-
from tempfile import TemporaryDirectory, NamedTemporaryFile
28-
from typing import Union
27+
from tempfile import TemporaryDirectory, NamedTemporaryFile, mkstemp
28+
from typing import Union, Tuple, List, TextIO, BinaryIO
2929
from privex import helpers
3030
from tests import PrivexBaseCase
31+
import logging
32+
33+
log = logging.getLogger(__name__)
3134

3235

3336
class TestGeneral(PrivexBaseCase):
@@ -309,3 +312,98 @@ def test_extract_settings_case_sensitive_lowercase_keys_fail(self):
309312
self.assertTrue(isinstance(extracted, dict))
310313
self.assertEqual(len(extracted.keys()), 0)
311314

315+
def _create_test_file(self, tfile: BinaryIO, nlines=10) -> List[str]:
316+
"""Helper function for populating a testing temp file with numbered example lines for comparison"""
317+
lines = [f"This is an example line {i}\n".encode('utf-8') for i in range(1, nlines+1)]
318+
tfile.writelines(lines)
319+
tfile.flush()
320+
return [l.decode().strip("\n") for l in lines]
321+
322+
def test_io_tail_500_lines_300(self):
    """
    Tail 300 lines of a 500 line file with :func:`.io_tail`, checking every line of every generated chunk
    against the lines originally written to the file.
    """
    with NamedTemporaryFile() as tfile:
        expected = self._create_test_file(tfile, 500)

        # Chunks arrive last-lines-first, so we walk ``expected`` backwards, starting at its final item (-1)
        pos = -1
        for chunk in helpers.io_tail(tfile, 300):
            # Lines inside a chunk are in forward order - read them back to front to keep stepping backwards
            for got in reversed(chunk):
                want = expected[pos]
                self.assertEqual(got, want, msg=f"l == lines[{pos}] // '{got}' == '{want}'")
                pos -= 1

        # Counting started at -1 rather than 0, so 300 compared lines leaves the position at -301
        self.assertEqual(pos, -301)
341+
342+
def test_tail_10_lines_3(self):
    """
    Tail the last 3 lines of a 10 line test file with :func:`.tail` and compare them to the expected text.
    """
    wanted = (
        "This is an example line 8",
        "This is an example line 9",
        "This is an example line 10",
    )
    with NamedTemporaryFile() as tfile:
        self._create_test_file(tfile, 10)

        tailed = helpers.tail(tfile.name, 3)
        self.assertEqual(len(tailed), 3)
        for idx, want in enumerate(wanted):
            self.assertEqual(tailed[idx], want)
354+
355+
def test_tail_10_lines_5(self):
    """
    Tail 5 lines of a 10 line test file with :func:`.tail`, then compare the first and last tailed lines
    against the matching lines that were written to the file.
    """
    with NamedTemporaryFile() as tfile:
        expected = self._create_test_file(tfile, 10)

        tailed = helpers.tail(tfile.name, 5)
        self.assertEqual(len(tailed), 5)
        # (index into the tailed output, index into the written lines)
        for got_idx, want_idx in ((0, -5), (4, -1)):
            self.assertEqual(tailed[got_idx], expected[want_idx])
366+
367+
def test_tail_10_lines_10(self):
    """
    Check :func:`.tail` when ``nlines`` equals the number of lines in the file: tail 10 lines of a
    10 line test file, then compare every written line against the tailed output.
    """
    with NamedTemporaryFile() as tfile:
        expected = self._create_test_file(tfile, 10)

        tailed = helpers.tail(tfile.name, 10)
        self.assertEqual(len(tailed), 10)
        for idx, want in enumerate(expected):
            got = tailed[idx]
            self.assertEqual(got, want, msg=f"tailed[{idx}] == lines[{idx}] // '{got}' == '{want}'")
379+
380+
def test_tail_500_lines_20(self):
    """
    Check :func:`.tail` against a larger test file, by tailing 20 lines of a 500 line test file.
    """
    with NamedTemporaryFile() as tfile:
        expected = self._create_test_file(tfile, 500)

        tailed = helpers.tail(tfile.name, 20)
        self.assertEqual(len(tailed), 20)
        # The last 20 written lines should line up with ``tailed`` from position 0 onwards
        for idx, want in enumerate(expected[480:]):
            got = tailed[idx]
            self.assertEqual(got, want, msg=f"tailed[i] == l // '{got}' == '{want}'")
394+
395+
def test_tail_500_lines_300(self):
    """
    Check :func:`.tail` with a larger line count, by tailing 300 lines of a 500 line test file.
    """
    with NamedTemporaryFile() as tfile:
        expected = self._create_test_file(tfile, 500)

        tailed = helpers.tail(tfile.name, 300)
        self.assertEqual(len(tailed), 300)
        # The last 300 written lines should line up with ``tailed`` from position 0 onwards
        for idx, want in enumerate(expected[200:]):
            got = tailed[idx]
            self.assertEqual(got, want, msg=f"tailed[i] == l // '{got}' == '{want}'")
409+

0 commit comments

Comments
 (0)