Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ RUN --mount=type=secret,id=csd-activation-key,env=CSD_ACTIVATION_KEY \
--mount=type=bind,source=README.md,target=/opt/csd-optimade/README.md \
--mount=type=bind,source=pyproject.toml,target=/opt/csd-optimade/pyproject.toml \
--mount=type=bind,source=uv.lock,target=/opt/csd-optimade/uv.lock \
--mount=type=bind,source=/tmp,target=/opt/csd-optimade/data,rw=true \
--mount=type=bind,source=/tmp,target=/tmp,rw=true \
--mount=type=tmpfs,target=/tmp,rw=true \
mkdir -p /root/.config/CCDC && \
echo "[licensing_v1]\nlicence_key=${CSD_ACTIVATION_KEY}" > /root/.config/CCDC/ApplicationServices.ini && \
mkdir -p data && \
Expand Down Expand Up @@ -203,13 +202,18 @@ fi

if [ "$CSD_OPTIMADE_INSERT" = "1" ] || [ "$CSD_OPTIMADE_INSERT" = "true" ]; then
# Run the API twice: once to wipe and reinsert the data then exit, the second to run the API
(gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/csd-optimade.jsonl;
exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/csd-optimade.jsonl) &
(gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/optimade.jsonl;
exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/optimade.jsonl) &
fi

# Run CLI with 'fake' file
touch /tmp/csd-optimade.jsonl
exec uv run --no-sync csd-serve --no-insert /tmp/csd-optimade.jsonl
if [ "$OPTIMAKE_DATABASE_BACKEND" = "mongomock" ]; then
gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/optimade.jsonl
exec uv run --no-sync csd-serve --port 5001 /opt/csd-optimade/optimade.jsonl
else
# Run CLI with 'fake' file
touch /tmp/optimade.jsonl
exec uv run --no-sync csd-serve --no-insert /tmp/optimade.jsonl
fi

EOF

Expand Down
24 changes: 22 additions & 2 deletions docker-bake.hcl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
variable "CI" {
// Set to true if running in a CI environment; this affects how secrets are mounted
// Set to true if running in a CI environment; this affects how caching is handled
default = false
}

Expand All @@ -13,6 +13,11 @@ variable "VERSION" {
default = "latest"
}

variable "CSD_REINGEST" {
// Whether to reingest the CSD data even if it already exists in the image
default = "false"
}

variable "CSD_NUM_STRUCTURES" {
// Number of structures to ingest (default: all)
default = ""
Expand Down Expand Up @@ -51,11 +56,26 @@ target "csd-ingester-test" {
secret = ["type=env,id=csd-activation-key,env=CSD_ACTIVATION_KEY", "id=csd-installer-url,env=CSD_INSTALLER_URL"]
}

// Development variant of the server image: builds the same
// "csd-optimade-server" Dockerfile stage, but tags the result as
// "${IMAGE_BASE}-dev:${VERSION}". Build cache is read from the registry
// (the released image and the shared ":cache" ref); `cache-to` is left
// empty, so this target does not export any cache itself.
target "csd-optimade-dev" {
inherits = ["docker-metadata-action"]
context = "."
dockerfile = "Dockerfile"
args = {CSD_NUM_STRUCTURES = CSD_NUM_STRUCTURES, REINGEST = CSD_REINGEST}
target = "csd-optimade-server"
tags = ["${IMAGE_BASE}-dev:${VERSION}"]
cache-from = [
"type=registry,ref=${IMAGE_BASE}:${VERSION}",
"type=registry,ref=${IMAGE_BASE}:cache",
]
cache-to = []
secret = ["type=env,id=csd-activation-key,env=CSD_ACTIVATION_KEY", "id=csd-installer-url,env=CSD_INSTALLER_URL"]
}

target "csd-optimade-server" {
inherits = ["docker-metadata-action"]
context = "."
dockerfile = "Dockerfile"
args = {CSD_NUM_STRUCTURES = CSD_NUM_STRUCTURES}
args = {CSD_NUM_STRUCTURES = CSD_NUM_STRUCTURES, REINGEST = CSD_REINGEST}
target = "csd-optimade-server"
tags = ["${IMAGE_BASE}:${VERSION}"]
cache-from = [
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ classifiers = [
requires-python = ">= 3.11, < 3.12"
dependencies = [
"optimade ~= 1.2",
"optimade-maker ~= 0.4",
"optimade-maker ~= 0.4, < 0.5",
"tqdm ~= 4.66",
"pymongo >= 4, < 5",
]
Expand Down
10 changes: 5 additions & 5 deletions src/csd_optimade/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,18 +78,18 @@ def generate_csd_provider_fields():
},
{
"name": "_csd_inchi",
"type": "string",
"description": "CSD InChI string.",
"type": "list",
"description": "A list of InChI strings for individual components in the structure.",
},
{
"name": "_csd_inchi_key",
"type": "string",
"description": "CSD InChIKey.",
"type": "list",
"description": "A list of InChIKeys for individual components in the structure.",
},
{
"name": "_csd_smiles",
"type": "string",
"description": "CSD SMILES string.",
"description": "A SMILES string computed for the 3D structure.",
},
{
"name": "_csd_z_value",
Expand Down
25 changes: 19 additions & 6 deletions src/csd_optimade/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
import glob
import itertools
import json
import logging
import math
import os
import tempfile
import time
import warnings
from functools import partial
Expand All @@ -39,6 +39,10 @@

from csd_optimade.mappers import from_csd_entry_directly

LOG = logging.getLogger(__name__)
LOG.handlers = [logging.StreamHandler()]
LOG.setLevel(logging.INFO)


def from_csd_database(
reader: ccdc.io.EntryReader,
Expand Down Expand Up @@ -69,7 +73,8 @@ def handle_chunk(args, run_name: str = "test", num_chunks: int | None = None):
bad_count: int = 0
total_count: int = 0
str_chunk_id = f"{chunk_id:0{len(str(num_chunks))}d}"
with open(f"data/{run_name}-optimade-{str_chunk_id}.jsonl", "w") as f:
chunk_path = Path(f"data/{run_name}-optimade-{str_chunk_id}.jsonl")
with open(chunk_path, "w") as f:
try:
for entry in from_csd_database(ccdc.io.EntryReader("CSD"), range_):
total_count += 1
Expand All @@ -84,6 +89,8 @@ def handle_chunk(args, run_name: str = "test", num_chunks: int | None = None):
if total_count == 0 and bad_count != 0:
raise RuntimeError("No good entries found in chunk; something went wrong.")

# Lazy %-style logging args need full conversion specifiers ("%s"); a bare "%"
# makes the logging module report a formatting error instead of the message.
LOG.info("Wrote chunk %s to %s", chunk_id, chunk_path)

return chunk_id, total_count, bad_count


Expand Down Expand Up @@ -176,9 +183,10 @@ def cli():
# Combine all results into a single JSONL file, first temporary
output_dir = Path("data")
output_file = output_dir / f"{run_name}-optimade.jsonl"
tmp_dir = tempfile.TemporaryDirectory()
tmp_jsonl_path = Path(tmp_dir.name) / output_file.name
print(f"Collecting results into {output_file}")
tmp_dir = Path(f"/tmp/csd-optimade/{run_name}")
tmp_dir.mkdir(exist_ok=True, parents=True)
tmp_jsonl_path = tmp_dir / output_file.name
LOG.info(f"Collecting results into {output_file}")

pattern = f"{run_name}-optimade-*.jsonl"
input_files = sorted(
Expand Down Expand Up @@ -245,7 +253,12 @@ def cli():
ids_by_type[_type].add(json_entry["id"])
final_jsonl.write(line_entry)

tmp_dir.cleanup()
# Remove the temporary directory; rmdir() only succeeds once it is empty
try:
    tmp_jsonl_path.unlink()
    tmp_dir.rmdir()
except OSError as exc:
    # Report what is still left in the directory, and chain the original
    # error so the underlying cause (e.g. ENOTEMPTY, EACCES) is not lost.
    raise OSError(f"Dir entries: {os.listdir(tmp_dir)}") from exc

# Final scan to remove duplicates and empty lines
print(
Expand Down
8 changes: 3 additions & 5 deletions src/csd_optimade/mappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,7 @@ def from_csd_entry_directly(
}
}

inchi = entry.crystal.generate_inchi()
if not inchi.success:
inchi = None
inchis = entry.component_inchis

structure_features = []
try:
Expand Down Expand Up @@ -252,8 +250,8 @@ def from_csd_entry_directly(
_csd_crystal_system=entry.crystal.crystal_system,
_csd_space_group_symbol_hermann_mauginn=entry.crystal.spacegroup_symbol, # Need to double-check if this matches OPTIMADE 1.2 definition
_csd_chemical_name=entry.chemical_name,
_csd_inchi=inchi.inchi if inchi else None,
_csd_inchi_key=inchi.key if inchi else None,
_csd_inchi=[inchi.inchi for inchi in inchis] if inchis else None,
_csd_inchi_key=[inchi.key for inchi in inchis] if inchis else None,
_csd_smiles=asym_unit.smiles,
_csd_z_value=entry.crystal.z_value,
_csd_z_prime=entry.crystal.z_prime,
Expand Down
Loading
Loading