
Commit 100000b

Merge pull request #330 from lsst-dm/tickets/DM-49670

DM-49670: Add option to use a service for Butler database writes

2 parents 72f4e9c + 7bf8843

File tree: 5 files changed, +330 / -49 lines

python/activator/activator.py (47 additions, 4 deletions)
@@ -58,7 +58,8 @@
 from .exception import GracefulShutdownInterrupt, IgnorableVisit, InvalidVisitError, \
     NonRetriableError, RetriableError
 from .middleware_interface import get_central_butler, \
-    make_local_repo, make_local_cache, MiddlewareInterface
+    make_local_repo, make_local_cache, MiddlewareInterface, ButlerWriter, DirectButlerWriter
+from .kafka_butler_writer import KafkaButlerWriter
 from .repo_tracker import LocalRepoTracker

 # Platform that prompt processing will run on
@@ -96,6 +97,23 @@
 # The number of seconds to delay retrying connections to the Redis stream.
 redis_retry = float(os.environ.get("REDIS_RETRY_DELAY", 30))

+# If '1', sends outputs to a service for transfer into the central Butler
+# repository instead of writing to the database directly.
+use_kafka_butler_writer = os.environ.get("USE_KAFKA_BUTLER_WRITER", "0") == "1"
+if use_kafka_butler_writer:
+    # Hostname of the Kafka cluster used by the Butler writer.
+    butler_writer_kafka_cluster = os.environ["BUTLER_WRITER_KAFKA_CLUSTER"]
+    # Username for authentication to BUTLER_WRITER_KAFKA_CLUSTER.
+    butler_writer_kafka_username = os.environ["BUTLER_WRITER_KAFKA_USERNAME"]
+    # Password for authentication to BUTLER_WRITER_KAFKA_CLUSTER.
+    butler_writer_kafka_password = os.environ["BUTLER_WRITER_KAFKA_PASSWORD"]
+    # Topic used to transfer output datasets to the central repository.
+    butler_writer_kafka_topic = os.environ["BUTLER_WRITER_KAFKA_TOPIC"]
+    # URI to the path where output datasets will be written when using the Kafka
+    # writer to transfer outputs to the central Butler repository.
+    # This will generally be in the same S3 bucket used by the central Butler.
+    butler_writer_file_output_path = os.environ["BUTLER_WRITER_FILE_OUTPUT_PATH"]
+
 # Conditionally load keda environment variables
 if platform == "keda":
     # Time to wait for fanned out messages before spawning new pod.
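Because these settings are read at module import time, enabling the Kafka writer is purely a matter of environment configuration. A minimal sketch of how they might be supplied for local testing; all of the values below (broker, credentials, topic, bucket) are hypothetical placeholders, not part of this change, and the module path is assumed from the repository layout:

import os

# Hypothetical values for a local/test run; real deployments set these in the
# service configuration rather than in code.
os.environ["USE_KAFKA_BUTLER_WRITER"] = "1"
os.environ["BUTLER_WRITER_KAFKA_CLUSTER"] = "kafka.example.org:9092"
os.environ["BUTLER_WRITER_KAFKA_USERNAME"] = "pp-writer"
os.environ["BUTLER_WRITER_KAFKA_PASSWORD"] = "not-a-real-password"
os.environ["BUTLER_WRITER_KAFKA_TOPIC"] = "prompt-processing-outputs"
os.environ["BUTLER_WRITER_FILE_OUTPUT_PATH"] = "s3://example-bucket/central-sync/"

# The activator reads the variables at import time, so they must be set first.
from activator import activator  # noqa: E402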
@@ -163,6 +181,18 @@ def _get_consumer():
     })


+@functools.cache
+def _get_producer():
+    """Lazy initialization of Kafka Producer for Butler writer."""
+    return kafka.Producer({
+        "bootstrap.servers": butler_writer_kafka_cluster,
+        "security.protocol": "sasl_plaintext",
+        "sasl.mechanism": "SCRAM-SHA-512",
+        "sasl.username": butler_writer_kafka_username,
+        "sasl.password": butler_writer_kafka_password
+    })
+
+
 @functools.cache
 def _get_storage_client():
     """Lazy initialization of cloud storage reader."""
@@ -189,6 +219,19 @@ def _get_read_butler():
     return _get_write_butler()


+@functools.cache
+def _get_butler_writer() -> ButlerWriter:
+    """Lazy initialization of Butler writer."""
+    if use_kafka_butler_writer:
+        return KafkaButlerWriter(
+            _get_producer(),
+            output_topic=butler_writer_kafka_topic,
+            file_output_path=butler_writer_file_output_path
+        )
+    else:
+        return DirectButlerWriter(_get_write_butler())
+
+
 @functools.cache
 def _get_local_repo():
     """Lazy initialization of local repo.
@@ -461,7 +504,7 @@ def create_app():
     _get_consumer()
     _get_storage_client()
     _get_read_butler()
-    _get_write_butler()
+    _get_butler_writer()
     _get_local_repo()

     app = flask.Flask(__name__)
@@ -510,7 +553,7 @@ def keda_start():
     _get_consumer()
     _get_storage_client()
     _get_read_butler()
-    _get_write_butler()
+    _get_butler_writer()
     _get_local_repo()

     redis_session = RedisStreamSession(
@@ -1002,7 +1045,7 @@ def process_visit(expected_visit: FannedOutVisit):
     # Create a fresh MiddlewareInterface object to avoid accidental
     # "cross-talk" between different visits.
     mwi = MiddlewareInterface(_get_read_butler(),
-                              _get_write_butler(),
+                              _get_butler_writer(),
                               image_bucket,
                               expected_visit,
                               pre_pipelines,
python/activator/kafka_butler_writer.py (new file, 93 additions)

@@ -0,0 +1,93 @@
# This file is part of prompt_processing.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("KafkaButlerWriter",)

from datetime import date
from typing import Literal
from uuid import uuid4

from confluent_kafka import Producer
import pydantic

from lsst.daf.butler import (
    Butler,
    DatasetRef,
    SerializedDimensionRecord,
    SerializedFileDataset,
)
from lsst.resources import ResourcePath

from .middleware_interface import ButlerWriter, GroupedDimensionRecords


class KafkaButlerWriter(ButlerWriter):
    def __init__(self, producer: Producer, *, output_topic: str, file_output_path: str) -> None:
        self._producer = producer
        self._output_topic = output_topic
        self._file_output_path = ResourcePath(file_output_path, forceDirectory=True)

    def transfer_outputs(
        self, local_butler: Butler, dimension_records: GroupedDimensionRecords, datasets: list[DatasetRef]
    ) -> list[DatasetRef]:
        # Create a subdirectory in the output root distinct to this processing
        # run.
        date_string = date.today().strftime("%Y-%m-%d")
        subdirectory = f"{date_string}/{uuid4()}/"
        output_directory = self._file_output_path.join(subdirectory, forceDirectory=True)
        # There is no such thing as a directory in S3, but the Butler complains
        # if there is not an object at the prefix of the export path.
        output_directory.mkdir()

        # Copy files to the output directory, and retrieve metadata required to
        # ingest them into the central Butler.
        file_datasets = local_butler._datastore.export(datasets, directory=output_directory, transfer="copy")

        # Serialize Butler data as a JSON string.
        event = PromptProcessingOutputEvent(
            type="pp-output",
            dimension_records=_serialize_dimension_records(dimension_records),
            datasets=[dataset.to_simple() for dataset in file_datasets],
            root_directory=subdirectory,
        )
        message = event.model_dump_json()

        self._producer.produce(self._output_topic, message)
        self._producer.flush()

        return datasets


class PromptProcessingOutputEvent(pydantic.BaseModel):
    type: Literal["pp-output"]
    root_directory: str
    dimension_records: list[SerializedDimensionRecord]
    datasets: list[SerializedFileDataset]


def _serialize_dimension_records(grouped_records: GroupedDimensionRecords) -> list[SerializedDimensionRecord]:
    output = []
    for records in grouped_records.values():
        for item in records:
            output.append(item.to_simple())
    return output
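The commit title refers to a service that applies these writes to the central Butler database; that service is not part of this diff. A minimal sketch of what the consuming side might look like, assuming only the message schema defined above; the broker, group id, topic name, and module path are placeholders, and the central-repository ingest itself is deliberately left as a comment because it lives outside this repository:

from confluent_kafka import Consumer

from activator.kafka_butler_writer import PromptProcessingOutputEvent

# Hypothetical consumer configuration; real values would mirror the producer settings.
consumer = Consumer({
    "bootstrap.servers": "kafka.example.org:9092",
    "group.id": "butler-writer-service",
    "auto.offset.reset": "earliest",
})
consumer.subscribe(["prompt-processing-outputs"])

while True:
    msg = consumer.poll(1.0)
    if msg is None or msg.error():
        continue
    # Validate the JSON payload against the schema published by KafkaButlerWriter.
    event = PromptProcessingOutputEvent.model_validate_json(msg.value())
    # A real service would now insert event.dimension_records into the central
    # registry and ingest event.datasets from event.root_directory; omitted here.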
