From f76268c7c844f11c70849eb656603a02335ea9ce Mon Sep 17 00:00:00 2001 From: Stelios Voutsinas Date: Tue, 29 Jul 2025 10:38:56 -0700 Subject: [PATCH 1/2] Add converters benchmark and add Bitarray column test for votable --- benchmarks/votable.py | 170 +++++++++++++++++++++++-------- benchmarks/votable_converters.py | 36 +++++++ 2 files changed, 163 insertions(+), 43 deletions(-) create mode 100644 benchmarks/votable_converters.py diff --git a/benchmarks/votable.py b/benchmarks/votable.py index f37f6dc4a1..11b8aa5080 100644 --- a/benchmarks/votable.py +++ b/benchmarks/votable.py @@ -1,8 +1,5 @@ """Benchmarks for VOTable binary/binary2 parsing performance.""" import io -import os -import tempfile - import numpy as np from astropy.io.votable import parse, from_table from astropy.table import Table @@ -20,21 +17,33 @@ id_data = np.arange(LARGE_SIZE, dtype=np.int64) flag_data = np.random.choice([True, False], LARGE_SIZE) quality_data = np.random.randint(0, 256, LARGE_SIZE, dtype=np.uint8) +bool_data = np.random.randint(0, 2, LARGE_SIZE).astype(bool) short_names = np.array([f"OBJ_{i:08d}" for i in range(LARGE_SIZE)]) -filter_names = np.random.choice(['u', 'g', 'r', 'i', 'z', 'Y'], LARGE_SIZE) +filter_names = np.random.choice(["u", "g", "r", "i", "z", "Y"], LARGE_SIZE) classifications = np.random.choice( - ['STAR', 'GALAXY', 'QSO', 'UNKNOWN'], LARGE_SIZE + ["STAR", "GALAXY", "QSO", "UNKNOWN"], LARGE_SIZE) +long_descriptions = np.array( + [ + f"Extend description about a field {i // 1000:04d}" + for i in range(LARGE_SIZE) + ] ) -long_descriptions = np.array([ - f"Extend description about a field {i//1000:04d}" - for i in range(LARGE_SIZE) -]) -def create_votable_bytes(table_data, format_type='binary2'): +def create_votable_bytes( + table_data, + format_type="binary2", + bitarray_size=None): """Helper to create VOTables with a specific serialization.""" votable = from_table(table_data) + + if bitarray_size is not None: + first_table = votable.get_first_table() + for field in first_table.fields: + if field.datatype == "bit": + field.arraysize = str(bitarray_size) + output = io.BytesIO() votable.to_xml(output, tabledata_format=format_type) return output.getvalue() @@ -52,13 +61,15 @@ def setup(self): flux_data[:LARGE_SIZE], count_data[:LARGE_SIZE], id_data[:LARGE_SIZE], - quality_data[:LARGE_SIZE] + quality_data[:LARGE_SIZE], ], - names=['ra', 'dec', 'mag', 'flux', 'counts', 'id', 'quality'] + names=["ra", "dec", "mag", "flux", "counts", "id", "quality"], ) - self.binary_data = create_votable_bytes(table, 'binary') - self.binary2_data = create_votable_bytes(table, 'binary2') + self.binary_data = create_votable_bytes( + table, "binary", bitarray_size=8) + self.binary2_data = create_votable_bytes( + table, "binary2", bitarray_size=8) def time_numeric_binary(self): parse(io.BytesIO(self.binary_data)) @@ -78,13 +89,13 @@ def setup(self): short_names[:LARGE_SIZE], filter_names[:LARGE_SIZE], classifications[:LARGE_SIZE], - mag_data[:LARGE_SIZE] + mag_data[:LARGE_SIZE], ], - names=['ra', 'dec', 'object_id', 'filter', 'class', 'mag'] + names=["ra", "dec", "object_id", "filter", "class", "mag"], ) - self.binary_data = create_votable_bytes(table, 'binary') - self.binary2_data = create_votable_bytes(table, 'binary2') + self.binary_data = create_votable_bytes(table, "binary") + self.binary2_data = create_votable_bytes(table, "binary2") def time_short_strings_binary(self): parse(io.BytesIO(self.binary_data)) @@ -102,13 +113,13 @@ def setup(self): ra_data[:LARGE_SIZE], dec_data[:LARGE_SIZE], long_descriptions[:LARGE_SIZE], - mag_data[:LARGE_SIZE] + mag_data[:LARGE_SIZE], ], - names=['ra', 'dec', 'description', 'mag'] + names=["ra", "dec", "description", "mag"], ) - self.binary_data = create_votable_bytes(table, 'binary') - self.binary2_data = create_votable_bytes(table, 'binary2') + self.binary_data = create_votable_bytes(table, "binary") + self.binary2_data = create_votable_bytes(table, "binary2") def time_long_strings_binary(self): parse(io.BytesIO(self.binary_data)) @@ -126,19 +137,25 @@ def setup(self): short_names[:LARGE_SIZE], filter_names[:LARGE_SIZE], classifications[:LARGE_SIZE], - np.random.choice(['A', 'B', 'C', 'D'], LARGE_SIZE), - np.random.choice(['HIGH', 'MED', 'LOW'], LARGE_SIZE), + np.random.choice(["A", "B", "C", "D"], LARGE_SIZE), + np.random.choice(["HIGH", "MED", "LOW"], LARGE_SIZE), long_descriptions[:LARGE_SIZE], ra_data[:LARGE_SIZE], - dec_data[:LARGE_SIZE] + dec_data[:LARGE_SIZE], ], names=[ - 'id', 'filter', 'class', 'grade', - 'priority', 'desc', 'ra', 'dec' - ] + "id", + "filter", + "class", + "grade", + "priority", + "desc", + "ra", + "dec", + ], ) - self.binary2_data = create_votable_bytes(table, 'binary2') + self.binary2_data = create_votable_bytes(table, "binary2") def time_string_intensive_binary2(self): parse(io.BytesIO(self.binary2_data)) @@ -162,13 +179,21 @@ def setup(self): np.random.choice([True, False], LARGE_SIZE), ], names=[ - 'ra', 'dec', 'saturated', 'flagged', 'edge_pixel', - 'cosmic_ray', 'variable', 'extended', 'public', 'calibrated' - ] + "ra", + "dec", + "saturated", + "flagged", + "edge_pixel", + "cosmic_ray", + "variable", + "extended", + "public", + "calibrated", + ], ) - self.binary_data = create_votable_bytes(table, 'binary') - self.binary2_data = create_votable_bytes(table, 'binary2') + self.binary_data = create_votable_bytes(table, "binary") + self.binary2_data = create_votable_bytes(table, "binary2") def time_booleans_binary(self): parse(io.BytesIO(self.binary_data)) @@ -177,6 +202,57 @@ def time_booleans_binary2(self): parse(io.BytesIO(self.binary2_data)) +class TimeVOTableBitArrayOptimization: + """Benchmark BitArray columns in Binary/Binary2 VOTables.""" + + def setup(self): + table = Table( + [ + ra_data[:LARGE_SIZE], + dec_data[:LARGE_SIZE], + mag_data[:LARGE_SIZE], + np.random.randint(0, 2, LARGE_SIZE).astype(bool), + np.random.randint(0, 2, LARGE_SIZE).astype(bool), + np.random.randint(0, 2, LARGE_SIZE).astype(bool), + np.random.randint(0, 2, LARGE_SIZE).astype(bool), + ], + names=[ + "ra", + "dec", + "mag", + "detected", + "saturated", + "edge_pixel", + "cosmic_ray", + ], + ) + + self.binary_bitarray_8_data = create_votable_bytes( + table, "binary", "8") + self.binary_bitarray_16_data = create_votable_bytes( + table, "binary", "16") + self.binary2_bitarray_8_data = create_votable_bytes( + table, "binary2", "8") + self.binary2_bitarray_16_data = create_votable_bytes( + table, "binary2", "16") + + def time_bitarray_8bit_binary(self): + """Parse BitArray with 8-bit arraysize.""" + parse(io.BytesIO(self.binary_bitarray_8_data)) + + def time_bitarray_16bit_binary(self): + """Parse BitArray with 16-bit arraysize.""" + parse(io.BytesIO(self.binary_bitarray_16_data)) + + def time_bitarray_8bit_binary2(self): + """Parse binary2 BitArray with 8-bit arraysize.""" + parse(io.BytesIO(self.binary2_bitarray_8_data)) + + def time_bitarray_16bit_binary2(self): + """Parse binary2 BitArray with 16-bit arraysize.""" + parse(io.BytesIO(self.binary2_bitarray_16_data)) + + class TimeVOTableMixed: """Benchmark for a table with mixed fields types.""" @@ -195,13 +271,21 @@ def setup(self): flag_data[:LARGE_SIZE], ], names=[ - 'ra', 'dec', 'id', 'mag', 'flux', - 'filter', 'class', 'counts', 'quality', 'detected' - ] + "ra", + "dec", + "id", + "mag", + "flux", + "filter", + "class", + "counts", + "quality", + "detected", + ], ) - self.binary_data = create_votable_bytes(table, 'binary') - self.binary2_data = create_votable_bytes(table, 'binary2') + self.binary_data = create_votable_bytes(table, "binary") + self.binary2_data = create_votable_bytes(table, "binary2") def time_mixed_binary(self): parse(io.BytesIO(self.binary_data)) @@ -218,13 +302,13 @@ def setup(self): [ ra_data[:SMALL_SIZE], dec_data[:SMALL_SIZE], - mag_data[:SMALL_SIZE] + mag_data[:SMALL_SIZE], ], - names=['ra', 'dec', 'mag'] + names=["ra", "dec", "mag"], ) - self.binary_data = create_votable_bytes(table, 'binary') - self.binary2_data = create_votable_bytes(table, 'binary2') + self.binary_data = create_votable_bytes(table, "binary") + self.binary2_data = create_votable_bytes(table, "binary2") def time_small_binary(self): parse(io.BytesIO(self.binary_data)) diff --git a/benchmarks/votable_converters.py b/benchmarks/votable_converters.py new file mode 100644 index 0000000000..95956c65f9 --- /dev/null +++ b/benchmarks/votable_converters.py @@ -0,0 +1,36 @@ +import numpy as np +import numpy.ma as ma +from astropy.io.votable.converters import bool_to_bitarray, bitarray_to_bool + +SMALL_SIZE = 1000 +LARGE_SIZE = 100000 + + +class TimeBitArrayConverters: + """Direct converter function benchmarks.""" + + def setup(self): + + self.small_bool = np.random.randint(0, 2, SMALL_SIZE).astype(bool) + self.large_bool = np.random.randint(0, 2, LARGE_SIZE).astype(bool) + + mask = np.random.random(LARGE_SIZE) < 0.2 + self.masked_bool = ma.array(self.large_bool, mask=mask) + + self.small_bits = bool_to_bitarray(self.small_bool) + self.large_bits = bool_to_bitarray(self.large_bool) + + def time_bool_to_bitarray_small(self): + bool_to_bitarray(self.small_bool) + + def time_bool_to_bitarray_large(self): + bool_to_bitarray(self.large_bool) + + def time_bool_to_bitarray_masked(self): + bool_to_bitarray(self.masked_bool) + + def time_bitarray_to_bool_small(self): + bitarray_to_bool(self.small_bits, len(self.small_bool)) + + def time_bitarray_to_bool_large(self): + bitarray_to_bool(self.large_bits, len(self.large_bool)) From 3247d53bd2bcc3ac3d2b5b2c7fea9e0c745eec8d Mon Sep 17 00:00:00 2001 From: Stelios Voutsinas Date: Thu, 31 Jul 2025 15:06:25 -0700 Subject: [PATCH 2/2] PR Review changes - Remove formatting changes, change to use np rng --- benchmarks/votable.py | 130 ++++++++++++------------------- benchmarks/votable_converters.py | 7 +- 2 files changed, 53 insertions(+), 84 deletions(-) diff --git a/benchmarks/votable.py b/benchmarks/votable.py index 11b8aa5080..533456fd09 100644 --- a/benchmarks/votable.py +++ b/benchmarks/votable.py @@ -5,6 +5,7 @@ from astropy.table import Table np.random.seed(42) +rng = np.random.default_rng(42) SMALL_SIZE = 1000 LARGE_SIZE = 200000 @@ -17,18 +18,17 @@ id_data = np.arange(LARGE_SIZE, dtype=np.int64) flag_data = np.random.choice([True, False], LARGE_SIZE) quality_data = np.random.randint(0, 256, LARGE_SIZE, dtype=np.uint8) -bool_data = np.random.randint(0, 2, LARGE_SIZE).astype(bool) +bool_data = rng.integers(0, 2, LARGE_SIZE, dtype=bool) short_names = np.array([f"OBJ_{i:08d}" for i in range(LARGE_SIZE)]) -filter_names = np.random.choice(["u", "g", "r", "i", "z", "Y"], LARGE_SIZE) +filter_names = np.random.choice(['u', 'g', 'r', 'i', 'z', 'Y'], LARGE_SIZE) classifications = np.random.choice( - ["STAR", "GALAXY", "QSO", "UNKNOWN"], LARGE_SIZE) -long_descriptions = np.array( - [ - f"Extend description about a field {i // 1000:04d}" - for i in range(LARGE_SIZE) - ] + ['STAR', 'GALAXY', 'QSO', 'UNKNOWN'], LARGE_SIZE ) +long_descriptions = np.array([ + f"Extend description about a field {i//1000:04d}" + for i in range(LARGE_SIZE) +]) def create_votable_bytes( @@ -39,8 +39,7 @@ def create_votable_bytes( votable = from_table(table_data) if bitarray_size is not None: - first_table = votable.get_first_table() - for field in first_table.fields: + for field in votable.get_first_table().fields: if field.datatype == "bit": field.arraysize = str(bitarray_size) @@ -61,9 +60,9 @@ def setup(self): flux_data[:LARGE_SIZE], count_data[:LARGE_SIZE], id_data[:LARGE_SIZE], - quality_data[:LARGE_SIZE], + quality_data[:LARGE_SIZE] ], - names=["ra", "dec", "mag", "flux", "counts", "id", "quality"], + names=['ra', 'dec', 'mag', 'flux', 'counts', 'id', 'quality'] ) self.binary_data = create_votable_bytes( @@ -89,13 +88,13 @@ def setup(self): short_names[:LARGE_SIZE], filter_names[:LARGE_SIZE], classifications[:LARGE_SIZE], - mag_data[:LARGE_SIZE], + mag_data[:LARGE_SIZE] ], - names=["ra", "dec", "object_id", "filter", "class", "mag"], + names=['ra', 'dec', 'object_id', 'filter', 'class', 'mag'] ) - self.binary_data = create_votable_bytes(table, "binary") - self.binary2_data = create_votable_bytes(table, "binary2") + self.binary_data = create_votable_bytes(table, 'binary') + self.binary2_data = create_votable_bytes(table, 'binary2') def time_short_strings_binary(self): parse(io.BytesIO(self.binary_data)) @@ -113,13 +112,13 @@ def setup(self): ra_data[:LARGE_SIZE], dec_data[:LARGE_SIZE], long_descriptions[:LARGE_SIZE], - mag_data[:LARGE_SIZE], + mag_data[:LARGE_SIZE] ], - names=["ra", "dec", "description", "mag"], + names=['ra', 'dec', 'description', 'mag'] ) - self.binary_data = create_votable_bytes(table, "binary") - self.binary2_data = create_votable_bytes(table, "binary2") + self.binary_data = create_votable_bytes(table, 'binary') + self.binary2_data = create_votable_bytes(table, 'binary2') def time_long_strings_binary(self): parse(io.BytesIO(self.binary_data)) @@ -137,25 +136,19 @@ def setup(self): short_names[:LARGE_SIZE], filter_names[:LARGE_SIZE], classifications[:LARGE_SIZE], - np.random.choice(["A", "B", "C", "D"], LARGE_SIZE), - np.random.choice(["HIGH", "MED", "LOW"], LARGE_SIZE), + np.random.choice(['A', 'B', 'C', 'D'], LARGE_SIZE), + np.random.choice(['HIGH', 'MED', 'LOW'], LARGE_SIZE), long_descriptions[:LARGE_SIZE], ra_data[:LARGE_SIZE], - dec_data[:LARGE_SIZE], + dec_data[:LARGE_SIZE] ], names=[ - "id", - "filter", - "class", - "grade", - "priority", - "desc", - "ra", - "dec", - ], + 'id', 'filter', 'class', 'grade', + 'priority', 'desc', 'ra', 'dec' + ] ) - self.binary2_data = create_votable_bytes(table, "binary2") + self.binary2_data = create_votable_bytes(table, 'binary2') def time_string_intensive_binary2(self): parse(io.BytesIO(self.binary2_data)) @@ -179,21 +172,13 @@ def setup(self): np.random.choice([True, False], LARGE_SIZE), ], names=[ - "ra", - "dec", - "saturated", - "flagged", - "edge_pixel", - "cosmic_ray", - "variable", - "extended", - "public", - "calibrated", - ], + 'ra', 'dec', 'saturated', 'flagged', 'edge_pixel', + 'cosmic_ray', 'variable', 'extended', 'public', 'calibrated' + ] ) - self.binary_data = create_votable_bytes(table, "binary") - self.binary2_data = create_votable_bytes(table, "binary2") + self.binary_data = create_votable_bytes(table, 'binary') + self.binary2_data = create_votable_bytes(table, 'binary2') def time_booleans_binary(self): parse(io.BytesIO(self.binary_data)) @@ -207,24 +192,15 @@ class TimeVOTableBitArrayOptimization: def setup(self): table = Table( - [ - ra_data[:LARGE_SIZE], - dec_data[:LARGE_SIZE], - mag_data[:LARGE_SIZE], - np.random.randint(0, 2, LARGE_SIZE).astype(bool), - np.random.randint(0, 2, LARGE_SIZE).astype(bool), - np.random.randint(0, 2, LARGE_SIZE).astype(bool), - np.random.randint(0, 2, LARGE_SIZE).astype(bool), - ], - names=[ - "ra", - "dec", - "mag", - "detected", - "saturated", - "edge_pixel", - "cosmic_ray", - ], + { + "ra": ra_data[:LARGE_SIZE], + "dec": dec_data[:LARGE_SIZE], + "mag": mag_data[:LARGE_SIZE], + "detected": rng.integers(0, 2, LARGE_SIZE).astype(bool), + "saturated": rng.integers(0, 2, LARGE_SIZE).astype(bool), + "edge_pixel": rng.integers(0, 2, LARGE_SIZE).astype(bool), + "cosmic_ray": rng.integers(0, 2, LARGE_SIZE).astype(bool), + } ) self.binary_bitarray_8_data = create_votable_bytes( @@ -271,21 +247,13 @@ def setup(self): flag_data[:LARGE_SIZE], ], names=[ - "ra", - "dec", - "id", - "mag", - "flux", - "filter", - "class", - "counts", - "quality", - "detected", - ], + 'ra', 'dec', 'id', 'mag', 'flux', + 'filter', 'class', 'counts', 'quality', 'detected' + ] ) - self.binary_data = create_votable_bytes(table, "binary") - self.binary2_data = create_votable_bytes(table, "binary2") + self.binary_data = create_votable_bytes(table, 'binary') + self.binary2_data = create_votable_bytes(table, 'binary2') def time_mixed_binary(self): parse(io.BytesIO(self.binary_data)) @@ -302,13 +270,13 @@ def setup(self): [ ra_data[:SMALL_SIZE], dec_data[:SMALL_SIZE], - mag_data[:SMALL_SIZE], + mag_data[:SMALL_SIZE] ], - names=["ra", "dec", "mag"], + names=['ra', 'dec', 'mag'] ) - self.binary_data = create_votable_bytes(table, "binary") - self.binary2_data = create_votable_bytes(table, "binary2") + self.binary_data = create_votable_bytes(table, 'binary') + self.binary2_data = create_votable_bytes(table, 'binary2') def time_small_binary(self): parse(io.BytesIO(self.binary_data)) diff --git a/benchmarks/votable_converters.py b/benchmarks/votable_converters.py index 95956c65f9..960781a9b3 100644 --- a/benchmarks/votable_converters.py +++ b/benchmarks/votable_converters.py @@ -10,11 +10,12 @@ class TimeBitArrayConverters: """Direct converter function benchmarks.""" def setup(self): + rng = np.random.default_rng(42) - self.small_bool = np.random.randint(0, 2, SMALL_SIZE).astype(bool) - self.large_bool = np.random.randint(0, 2, LARGE_SIZE).astype(bool) + self.small_bool = rng.integers(0, 2, SMALL_SIZE, dtype=bool) + self.large_bool = rng.integers(0, 2, LARGE_SIZE, dtype=bool) - mask = np.random.random(LARGE_SIZE) < 0.2 + mask = rng.random(LARGE_SIZE) < 0.2 self.masked_bool = ma.array(self.large_bool, mask=mask) self.small_bits = bool_to_bitarray(self.small_bool)