
Conversation


aothms (Collaborator) commented on Sep 13, 2025

Not intended for direct merge, but shows how it could be integrated.

Requires:

Notes:

  1. I create the RocksDB file in /tmp because I'm running on WSL and my storage is on the Windows side, which is very slow to interact with from WSL (a configurable variant is sketched right after this note).
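
Purely as an illustration (none of this is in the diff below): the scratch location for the converted RocksDB file could be made configurable instead of hard-coding /tmp. The environment variable name used here is made up.

# Hypothetical sketch: pick a scratch directory for the RocksDB conversion.
# ROCKSDB_SCRATCH_DIR is an invented name; the fallback is the platform temp
# dir (usually /tmp on Linux/WSL), i.e. fast local storage rather than a
# Windows-side mount.
import os
import tempfile

def rocksdb_scratch_dir() -> str:
    return os.environ.get("ROCKSDB_SCRATCH_DIR", tempfile.gettempdir())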

backend/apps/ifc_validation_models

diff --git a/models.py b/models.py
index f85ce3d..ba56e7d 100644
--- a/models.py
+++ b/models.py
@@ -609,6 +609,16 @@ class Model(TimestampedBaseModel, IdObfuscator):
         help_text="Status of the Syntax Validation.",
     )

+    status_rocksdb_conversion = models.CharField(
+        max_length=1,
+        choices=Status.choices,
+        default=Status.NOT_VALIDATED,
+        db_index=True,
+        null=False,
+        blank=False,
+        help_text="Status of the SPF to RocksDB conversion.",
+    )
+
     status_header_syntax = models.CharField(
         max_length=1,
         choices=Status.choices,
@@ -965,6 +975,7 @@ class ValidationTask(TimestampedBaseModel, IdObfuscator):
         INDUSTRY_PRACTICES  = 'INDUSTRY', 'Industry Practices'
         INSTANCE_COMPLETION = 'INST_COMPLETION', 'Instance Completion'
         DIGITAL_SIGNATURES  = 'DIGITAL_SIGNATURES', 'Digital Signatures'
+        ROCKSDB_CONVERSION  = 'ROCKSDB_CONVERSION', 'Conversion from SPF to RocksDB (for large models only)'

     class Status(models.TextChoices):
         """

migrations/0017_model_status_rocksdb_conversion_and_more.py

# Generated by Django 5.1.1 on 2025-09-12 08:35

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('ifc_validation_models', '0016_alter_validationrequest_channel'),
    ]

    operations = [
        migrations.AddField(
            model_name='model',
            name='status_rocksdb_conversion',
            field=models.CharField(choices=[('v', 'Valid'), ('i', 'Invalid'), ('n', 'Not Validated'), ('w', 'Warning'), ('-', 'Not Applicable')], db_index=True, default='n', help_text='Status of the SPF to RocksDB conversion.', max_length=1),
        ),
        migrations.AlterField(
            model_name='validationtask',
            name='type',
            field=models.CharField(choices=[('SYNTAX', 'STEP Physical File Syntax'), ('HEADER_SYNTAX', 'STEP Physical File Syntax (HEADER section)'), ('SCHEMA', 'Schema (EXPRESS language)'), ('MVD', 'Model View Definitions'), ('BSDD', 'bSDD Compliance'), ('INFO', 'Parse Info'), ('PREREQ', 'Prerequisites'), ('HEADER', 'Header Validation'), ('NORMATIVE_IA', 'Implementer Agreements (IA)'), ('NORMATIVE_IP', 'Informal Propositions (IP)'), ('INDUSTRY', 'Industry Practices'), ('INST_COMPLETION', 'Instance Completion'), ('DIGITAL_SIGNATURES', 'Digital Signatures'), ('ROCKSDB_CONVERSION', 'Conversion from SPF to RocksDB (for large models only)')], db_index=True, help_text='Type of the Validation Task.', max_length=25),
        ),
    ]
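
After applying the migration (e.g. python manage.py migrate ifc_validation_models), the new column behaves like the other status fields. A hedged query example, reusing the same naming assumptions as the sketch above:

# Models whose RocksDB conversion has not run yet; NOT_VALIDATED corresponds
# to the 'n' default added by this migration (the member name is an assumption).
from ifc_validation_models.models import Model  # import path assumed, as above

pending = Model.objects.filter(status_rocksdb_conversion=Model.Status.NOT_VALIDATED)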

Measuring memory usage

[image: mem_plot_20250912_205852.png — process RSS memory over time for the tracked processes]

The first behave invocation on prerequisites creates a huge spike btw. This is probably because it runs with --purepythonparser. We should really parse only the header for IFC101.
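
A minimal sketch of what header-only parsing could look like for the IFC101 checks: stream the SPF file only up to the ENDSEC that closes the HEADER section instead of running the full pure-Python parser. This is illustrative, not the existing implementation.

# Illustrative only: read just the HEADER section of an SPF file.
def read_spf_header(path: str, max_bytes: int = 1 << 20) -> str:
    chunks, total = [], 0
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            chunks.append(line)
            total += len(line)
            if line.strip().upper().startswith("ENDSEC"):
                break  # the first ENDSEC closes HEADER; DATA is never read
            if total > max_bytes:
                break  # safety stop for malformed files without ENDSEC
    return "".join(chunks)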

memory plotter.py

#!/usr/bin/env python3
"""
Monitor memory usage for selected processes and save a PNG on interrupt.

- Targets (default): python/python3, behave, celery
- Filters out a few common Ubuntu system jobs
- One line per PID, default Matplotlib color cycle
- X-axis: time.perf_counter() elapsed since the monitor started
- Legend label: "<mapped role> [<pid>]" (see name_mapping below)
- Non-interactive: no window; writes PNG on Ctrl-C

Usage:
    python monitor_procs_mem.py
    python monitor_procs_mem.py -n "python,celery" -i 0.5 -o mem.png
    python monitor_procs_mem.py -x "unattended-upgrade,cloud-init"
"""

import argparse
import json
import time
import re
from dataclasses import dataclass, field
from typing import Dict, List
from datetime import datetime
import os

import psutil

# Use a headless backend so we can save figures without a display
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Default substrings to exclude if they appear anywhere in the command line.
DEFAULT_EXCLUDE_SUBSTRINGS = [
    "unattended-upgrade",
    "apt.systemd.daily",
    "cloud-init",
    "apport",
    "whoopsie",
    "update-notifier",
    "snapd",
    "ubuntu-advantage",
    "landscape",
    "subiquity",
    "networkd-dispat",
    "plotter.py",
    "flower"
]

# Default target process names (informational; the effective default is the
# argparse --names default below).
DEFAULT_TARGETS = ["python", "python3", "behave", "celery"]


@dataclass
class ProcTrack:
    pid: int
    name: str
    label: str
    t0: float
    times: List[float] = field(default_factory=list)
    mem_mb: List[float] = field(default_factory=list)
    alive: bool = True


def parse_args():
    p = argparse.ArgumentParser(description="Plot memory usage for selected processes; save PNG on interrupt.")
    p.add_argument("-n", "--names", default="python,behave,celery",
                   help="Comma-separated target names (default: python,behave,celery). "
                        "Matches process name or any cmdline token.")
    p.add_argument("-x", "--exclude", default=",".join(DEFAULT_EXCLUDE_SUBSTRINGS),
                   help="Comma-separated substrings to exclude if present in cmdline.")
    p.add_argument("-i", "--interval", type=float, default=0.2,
                   help="Sampling interval in seconds (default: 0.2).")
    p.add_argument("--scan-interval", type=float, default=0.2,
                   help="How often to rescan for new processes (default: 0.2s).")
    p.add_argument("-o", "--output", default=None,
                   help="Output PNG path. Default: mem_plot_YYYYmmdd_HHMMSS.png in CWD.")
    p.add_argument("--dpi", type=int, default=120, help="PNG DPI (default: 120).")
    p.add_argument("--figsize", default="12x6",
                   help='Figure size in inches, WxH (default: "12x6").')
    return p.parse_args()


def compile_patterns(targets: List[str]):
    targets_lower = [t.strip().lower() for t in targets if t.strip()]
    # Exact match on process name (case-insensitive)
    name_pattern = re.compile(r"^(%s)$" % "|".join(map(re.escape, targets_lower)), re.IGNORECASE) if targets_lower else None
    return targets_lower, name_pattern


def should_track(proc: psutil.Process,
                 targets_lower: List[str],
                 name_pattern: re.Pattern,
                 exclude_substrings: List[str]) -> bool:
    try:
        name = (proc.name() or "").lower()
        cmd = [c.lower() for c in (proc.cmdline() or [])]
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False

    # Target match: name matches OR any cmd token contains a target term
    match_name = bool(name_pattern and name_pattern.match(name))
    match_cmd = any(any(t in token for t in targets_lower) for token in cmd) if targets_lower else False
    if not (match_name or match_cmd):
        return False

    # Exclusions: if any excluded substring appears anywhere in cmdline, skip
    if cmd:
        joined = " ".join(cmd)
        for sub in exclude_substrings:
            if sub and sub.lower() in joined:
                return False

    return True

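# Map cmdline substrings to human-readable labels for the plot legend;
# the first matching key wins in label_process() below.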
name_mapping = {
    "celery": "celery",
    "runserver": "django dev-server",
    "convert_path_to_rocksdb": "conversion to RocksDB",
    "validate_header.py": "header validation",
    "ifcopenshell.simple_spf": "syntax check",
    "--rule-type": "behave launcher",
    "behave": "behave",
    "check_signatures.py": "digital signatures",
    "ifcopenshell.validate": "schema validation"
}

def label_process(proc: psutil.Process):
    try:
        cmd = [c.lower() for c in (proc.cmdline() or [])]
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return ''
    cmd = ' '.join(cmd)
    for k, v in name_mapping.items():
        if k in cmd: return v
    return ''

def is_alive(proc: psutil.Process) -> bool:
    try:
        return proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False


def rss_mb(proc: psutil.Process) -> float:
    try:
        return proc.memory_info().rss / (1024 * 1024)
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return float("nan")


def parse_figsize(s: str):
    try:
        w_str, h_str = s.lower().split("x")
        return float(w_str), float(h_str)
    except Exception:
        return 12.0, 6.0


def main():
    global_t0 = time.perf_counter()

    args = parse_args()

    targets = [s.strip() for s in args.names.split(",") if s.strip()]
    exclude_substrings = [s.strip() for s in args.exclude.split(",") if s.strip()]
    targets_lower, name_pattern = compile_patterns(targets)

    tracks: Dict[int, ProcTrack] = {}
    last_scan = 0.0

    print("Monitoring… Press Ctrl-C to save PNG and exit.")
    try:
        while True:
            now = time.perf_counter()

            # Rescan for new matching processes periodically
            if now - last_scan >= args.scan_interval:
                last_scan = now
                for proc in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
                    if proc.pid in tracks:
                        continue
                    if should_track(proc, targets_lower, name_pattern, exclude_substrings):
                        desc = f"{proc.info.get('name') or proc.name()} ({proc.pid})"
                        tracks[proc.pid] = ProcTrack(
                            pid=proc.pid,
                            name=proc.info.get("name") or proc.name(),
                            label=f"{label_process(proc)} [{proc.pid}]",
                            t0=time.perf_counter(),
                        )
                        print(f"Tracking: {desc} ({proc.cmdline()}) -> {label_process(proc)}")

            # Sample all tracked processes
            for pid, track in list(tracks.items()):
                try:
                    proc = psutil.Process(pid)
                except psutil.NoSuchProcess:
                    track.alive = False
                    continue

                if not is_alive(proc):
                    track.alive = False
                    continue

                t = time.perf_counter() - global_t0  # elapsed since monitor start (per-PID track.t0 is kept but unused)
                m = rss_mb(proc)
                if m == m:  # not NaN
                    track.times.append(t)
                    track.mem_mb.append(m)

            time.sleep(args.interval)

    except KeyboardInterrupt:
        # Build a final plot and save to PNG
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        out = args.output or f"mem_plot_{ts}.png"
        w, h = parse_figsize(args.figsize)

        fig, ax = plt.subplots(figsize=(w, h))
        ax.set_title("Process RSS Memory Over Time")
        ax.set_xlabel("Time since start (s)")
        ax.set_ylabel("RSS Memory (MB)")
        ax.grid(True)

        # Plot each PID with default color cycle
        # Sort by PID for a stable color assignment order
        plotted = 0
        for pid in sorted(tracks.keys()):
            tr = tracks[pid]
            if tr.times and tr.mem_mb:
                line, = ax.plot(tr.times, tr.mem_mb, label=tr.label)
                plotted += 1

                x_last = tr.times[-1]
                y_last = tr.mem_mb[-1]
                ax.text(
                    x_last, y_last,
                    str(pid),               # text (just the PID)
                    fontsize=8,
                    color=line.get_color(), # match line color
                    ha="left", va="bottom", # offset positioning
                )
        
        js = []
        for pid in sorted(tracks.keys()):
            tr = tracks[pid]
            if tr.times and tr.mem_mb:
                js.append({
                    'pid': pid,
                    'label': tr.label,
                    'times': tr.times,
                    'mem': tr.mem_mb,
                })
        with open(f"mem_plot_{ts}.json", 'w') as J:
            json.dump(js, J)

        if plotted == 0:
            print("No data collected; nothing to plot.")
        else:
            ax.legend(loc="best", fontsize="small")
            os.makedirs(os.path.dirname(out) or ".", exist_ok=True)
            fig.tight_layout()
            fig.savefig(out, dpi=args.dpi)
            print(f"\nSaved plot -> {out}")

        # Do not show any window (non-interactive)
        plt.close(fig)


if __name__ == "__main__":
    main()

aothms requested a review from Ghesselink on September 13, 2025 at 11:07