
Conversation


aothms (Collaborator) commented on Sep 13, 2025

Not intended for direct merge, but shows how it could be integrated.

Requires:

Notes:

  1. I create the RocksDB file in /tmp because I'm running on WSL and my storage is on the Windows side, which is very slow to interact with from WSL (a configurable variant is sketched right after this note).
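
Purely as an illustration (none of this is in the diff below): the scratch location for the converted RocksDB file could be made configurable instead of hard-coding /tmp. The environment variable name used here is made up.

# Hypothetical sketch: pick a scratch directory for the RocksDB conversion.
# ROCKSDB_SCRATCH_DIR is an invented name; the fallback is the platform temp
# dir (usually /tmp on Linux/WSL), i.e. fast local storage rather than a
# Windows-side mount.
import os
import tempfile

def rocksdb_scratch_dir() -> str:
    return os.environ.get("ROCKSDB_SCRATCH_DIR", tempfile.gettempdir())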

backend/apps/ifc_validation_models

diff --git a/models.py b/models.py
index f85ce3d..ba56e7d 100644
--- a/models.py
+++ b/models.py
@@ -609,6 +609,16 @@ class Model(TimestampedBaseModel, IdObfuscator):
         help_text="Status of the Syntax Validation.",
     )

+    status_rocksdb_conversion = models.CharField(
+        max_length=1,
+        choices=Status.choices,
+        default=Status.NOT_VALIDATED,
+        db_index=True,
+        null=False,
+        blank=False,
+        help_text="Status of the SPF to RocksDB conversion.",
+    )
+
     status_header_syntax = models.CharField(
         max_length=1,
         choices=Status.choices,
@@ -965,6 +975,7 @@ class ValidationTask(TimestampedBaseModel, IdObfuscator):
         INDUSTRY_PRACTICES  = 'INDUSTRY', 'Industry Practices'
         INSTANCE_COMPLETION = 'INST_COMPLETION', 'Instance Completion'
         DIGITAL_SIGNATURES  = 'DIGITAL_SIGNATURES', 'Digital Signatures'
+        ROCKSDB_CONVERSION  = 'ROCKSDB_CONVERSION', 'Conversion from SPF to RocksDB (for large models only)'

     class Status(models.TextChoices):
         """

migrations/0017_model_status_rocksdb_conversion_and_more.py

# Generated by Django 5.1.1 on 2025-09-12 08:35

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('ifc_validation_models', '0016_alter_validationrequest_channel'),
    ]

    operations = [
        migrations.AddField(
            model_name='model',
            name='status_rocksdb_conversion',
            field=models.CharField(choices=[('v', 'Valid'), ('i', 'Invalid'), ('n', 'Not Validated'), ('w', 'Warning'), ('-', 'Not Applicable')], db_index=True, default='n', help_text='Status of the SPF to RocksDB conversion.', max_length=1),
        ),
        migrations.AlterField(
            model_name='validationtask',
            name='type',
            field=models.CharField(choices=[('SYNTAX', 'STEP Physical File Syntax'), ('HEADER_SYNTAX', 'STEP Physical File Syntax (HEADER section)'), ('SCHEMA', 'Schema (EXPRESS language)'), ('MVD', 'Model View Definitions'), ('BSDD', 'bSDD Compliance'), ('INFO', 'Parse Info'), ('PREREQ', 'Prerequisites'), ('HEADER', 'Header Validation'), ('NORMATIVE_IA', 'Implementer Agreements (IA)'), ('NORMATIVE_IP', 'Informal Propositions (IP)'), ('INDUSTRY', 'Industry Practices'), ('INST_COMPLETION', 'Instance Completion'), ('DIGITAL_SIGNATURES', 'Digital Signatures'), ('ROCKSDB_CONVERSION', 'Conversion from SPF to RocksDB (for large models only)')], db_index=True, help_text='Type of the Validation Task.', max_length=25),
        ),
    ]
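
After applying the migration (e.g. python manage.py migrate ifc_validation_models), the new column behaves like the other status fields. A hedged query example, reusing the same naming assumptions as the sketch above:

# Models whose RocksDB conversion has not run yet; NOT_VALIDATED corresponds
# to the 'n' default added by this migration (the member name is an assumption).
from ifc_validation_models.models import Model  # import path assumed, as above

pending = Model.objects.filter(status_rocksdb_conversion=Model.Status.NOT_VALIDATED)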

Measuring memory usage

[image: mem_plot_20250912_205852.png — process RSS memory over time for the tracked processes]

The first behave invocation on prerequisites creates a huge spike btw. This is probably because it runs with --purepythonparser. We should really parse only the header for IFC101.
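
A minimal sketch of what header-only parsing could look like for the IFC101 checks: stream the SPF file only up to the ENDSEC that closes the HEADER section instead of running the full pure-Python parser. This is illustrative, not the existing implementation.

# Illustrative only: read just the HEADER section of an SPF file.
def read_spf_header(path: str, max_bytes: int = 1 << 20) -> str:
    chunks, total = [], 0
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            chunks.append(line)
            total += len(line)
            if line.strip().upper().startswith("ENDSEC"):
                break  # the first ENDSEC closes HEADER; DATA is never read
            if total > max_bytes:
                break  # safety stop for malformed files without ENDSEC
    return "".join(chunks)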

memory plotter.py

#!/usr/bin/env python3
"""
Monitor memory usage for selected processes and save a PNG on interrupt.

- Targets (default): python/python3, behave, celery
- Filters out a few common Ubuntu system jobs
- One line per PID, default Matplotlib color cycle
- X-axis: time.perf_counter() elapsed since the monitor started
- Legend label: "<mapped role> [<pid>]" (see name_mapping below)
- Non-interactive: no window; writes PNG on Ctrl-C

Usage:
    python monitor_procs_mem.py
    python monitor_procs_mem.py -n "python,celery" -i 0.5 -o mem.png
    python monitor_procs_mem.py -x "unattended-upgrade,cloud-init"
"""

import argparse
import json
import time
import re
from dataclasses import dataclass, field
from typing import Dict, List
from datetime import datetime
import os

import psutil

# Use a headless backend so we can save figures without a display
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Default substrings to exclude if they appear anywhere in the command line.
DEFAULT_EXCLUDE_SUBSTRINGS = [
    "unattended-upgrade",
    "apt.systemd.daily",
    "cloud-init",
    "apport",
    "whoopsie",
    "update-notifier",
    "snapd",
    "ubuntu-advantage",
    "landscape",
    "subiquity",
    "networkd-dispat",
    "plotter.py",
    "flower"
]

# Default target process names (informational; the effective default is the
# argparse --names default below).
DEFAULT_TARGETS = ["python", "python3", "behave", "celery"]


@dataclass
class ProcTrack:
    pid: int
    name: str
    label: str
    t0: float
    times: List[float] = field(default_factory=list)
    mem_mb: List[float] = field(default_factory=list)
    alive: bool = True


def parse_args():
    p = argparse.ArgumentParser(description="Plot memory usage for selected processes; save PNG on interrupt.")
    p.add_argument("-n", "--names", default="python,behave,celery",
                   help="Comma-separated target names (default: python,behave,celery). "
                        "Matches process name or any cmdline token.")
    p.add_argument("-x", "--exclude", default=",".join(DEFAULT_EXCLUDE_SUBSTRINGS),
                   help="Comma-separated substrings to exclude if present in cmdline.")
    p.add_argument("-i", "--interval", type=float, default=0.2,
                   help="Sampling interval in seconds (default: 0.2).")
    p.add_argument("--scan-interval", type=float, default=0.2,
                   help="How often to rescan for new processes (default: 0.2s).")
    p.add_argument("-o", "--output", default=None,
                   help="Output PNG path. Default: mem_plot_YYYYmmdd_HHMMSS.png in CWD.")
    p.add_argument("--dpi", type=int, default=120, help="PNG DPI (default: 120).")
    p.add_argument("--figsize", default="12x6",
                   help='Figure size in inches, WxH (default: "12x6").')
    return p.parse_args()


def compile_patterns(targets: List[str]):
    targets_lower = [t.strip().lower() for t in targets if t.strip()]
    # Exact match on process name (case-insensitive)
    name_pattern = re.compile(r"^(%s)$" % "|".join(map(re.escape, targets_lower)), re.IGNORECASE) if targets_lower else None
    return targets_lower, name_pattern


def should_track(proc: psutil.Process,
                 targets_lower: List[str],
                 name_pattern: re.Pattern,
                 exclude_substrings: List[str]) -> bool:
    try:
        name = (proc.name() or "").lower()
        cmd = [c.lower() for c in (proc.cmdline() or [])]
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False

    # Target match: name matches OR any cmd token contains a target term
    match_name = bool(name_pattern and name_pattern.match(name))
    match_cmd = any(any(t in token for t in targets_lower) for token in cmd) if targets_lower else False
    if not (match_name or match_cmd):
        return False

    # Exclusions: if any excluded substring appears anywhere in cmdline, skip
    if cmd:
        joined = " ".join(cmd)
        for sub in exclude_substrings:
            if sub and sub.lower() in joined:
                return False

    return True

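# Map cmdline substrings to human-readable labels for the plot legend;
# the first matching key wins in label_process() below.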
name_mapping = {
    "celery": "celery",
    "runserver": "django dev-server",
    "convert_path_to_rocksdb": "conversion to RocksDB",
    "validate_header.py": "header validation",
    "ifcopenshell.simple_spf": "syntax check",
    "--rule-type": "behave launcher",
    "behave": "behave",
    "check_signatures.py": "digital signatures",
    "ifcopenshell.validate": "schema validation"
}

def label_process(proc: psutil.Process):
    try:
        cmd = [c.lower() for c in (proc.cmdline() or [])]
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return ''
    cmd = ' '.join(cmd)
    for k, v in name_mapping.items():
        if k in cmd: return v
    return ''

def is_alive(proc: psutil.Process) -> bool:
    try:
        return proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False


def rss_mb(proc: psutil.Process) -> float:
    try:
        return proc.memory_info().rss / (1024 * 1024)
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return float("nan")


def parse_figsize(s: str):
    try:
        w_str, h_str = s.lower().split("x")
        return float(w_str), float(h_str)
    except Exception:
        return 12.0, 6.0


def main():
    global_t0 = time.perf_counter()

    args = parse_args()

    targets = [s.strip() for s in args.names.split(",") if s.strip()]
    exclude_substrings = [s.strip() for s in args.exclude.split(",") if s.strip()]
    targets_lower, name_pattern = compile_patterns(targets)

    tracks: Dict[int, ProcTrack] = {}
    last_scan = 0.0

    print("Monitoring… Press Ctrl-C to save PNG and exit.")
    try:
        while True:
            now = time.perf_counter()

            # Rescan for new matching processes periodically
            if now - last_scan >= args.scan_interval:
                last_scan = now
                for proc in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
                    if proc.pid in tracks:
                        continue
                    if should_track(proc, targets_lower, name_pattern, exclude_substrings):
                        desc = f"{proc.info.get('name') or proc.name()} ({proc.pid})"
                        tracks[proc.pid] = ProcTrack(
                            pid=proc.pid,
                            name=proc.info.get("name") or proc.name(),
                            label=f"{label_process(proc)} [{proc.pid}]",
                            t0=time.perf_counter(),
                        )
                        print(f"Tracking: {desc} ({proc.cmdline()}) -> {label_process(proc)}")

            # Sample all tracked processes
            for pid, track in list(tracks.items()):
                try:
                    proc = psutil.Process(pid)
                except psutil.NoSuchProcess:
                    track.alive = False
                    continue

                if not is_alive(proc):
                    track.alive = False
                    continue

                t = time.perf_counter() - global_t0  # elapsed since monitor start (per-PID track.t0 is kept but unused)
                m = rss_mb(proc)
                if m == m:  # not NaN
                    track.times.append(t)
                    track.mem_mb.append(m)

            time.sleep(args.interval)

    except KeyboardInterrupt:
        # Build a final plot and save to PNG
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        out = args.output or f"mem_plot_{ts}.png"
        w, h = parse_figsize(args.figsize)

        fig, ax = plt.subplots(figsize=(w, h))
        ax.set_title("Process RSS Memory Over Time")
        ax.set_xlabel("Time since start (s)")
        ax.set_ylabel("RSS Memory (MB)")
        ax.grid(True)

        # Plot each PID with default color cycle
        # Sort by PID for a stable color assignment order
        plotted = 0
        for pid in sorted(tracks.keys()):
            tr = tracks[pid]
            if tr.times and tr.mem_mb:
                line, = ax.plot(tr.times, tr.mem_mb, label=tr.label)
                plotted += 1

                x_last = tr.times[-1]
                y_last = tr.mem_mb[-1]
                ax.text(
                    x_last, y_last,
                    str(pid),               # text (just the PID)
                    fontsize=8,
                    color=line.get_color(), # match line color
                    ha="left", va="bottom", # offset positioning
                )
        
        js = []
        for pid in sorted(tracks.keys()):
            tr = tracks[pid]
            if tr.times and tr.mem_mb:
                js.append({
                    'pid': pid,
                    'label': tr.label,
                    'times': tr.times,
                    'mem': tr.mem_mb,
                })
        with open(f"mem_plot_{ts}.json", 'w') as J:
            json.dump(js, J)

        if plotted == 0:
            print("No data collected; nothing to plot.")
        else:
            ax.legend(loc="best", fontsize="small")
            os.makedirs(os.path.dirname(out) or ".", exist_ok=True)
            fig.tight_layout()
            fig.savefig(out, dpi=args.dpi)
            print(f"\nSaved plot -> {out}")

        # Do not show any window (non-interactive)
        plt.close(fig)


if __name__ == "__main__":
    main()

aothms requested a review from Ghesselink on September 13, 2025 at 11:07