3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -149,12 +149,13 @@ add_library(llmq-common SHARED
src/training/dataloader.cpp
src/training/logging.cpp
src/training/checkpoint.cpp
src/training/model.cpp
src/training/transformer_config.cpp

src/models/llama_run_state.cpp
src/models/llama_model.cpp
src/models/llama_weights.cpp
src/models/llama_gradients.cpp
src/models/llama_config.cpp
src/models/llama_optimizer.cpp

src/kernels/kernels.cpp
4 changes: 2 additions & 2 deletions export-checkpoint.cpp
@@ -56,7 +56,7 @@ int main(int argc, const char** argv) {


std::string config_path = ModelRootPath + "/config.json";
LLamaConfig config = load_llama_config(config_path.c_str(), ModelDType);
TransformerConfig config = load_transformer_config(config_path.c_str(), ModelDType);

int latest_step = find_latest_checkpoint(CkptDir);
if (latest_step < 0) {
@@ -76,7 +76,7 @@ int main(int argc, const char** argv) {

std::filesystem::path p(OutDir);
std::filesystem::create_directories(p);
save_llama_config(config, (p / "config.json").c_str());
save_transformer_config(config, (p / "config.json").c_str());
model.export_weights((p / "model.safetensors").c_str(), comm);
});
}
2 changes: 1 addition & 1 deletion scripts/demo.py
@@ -14,7 +14,7 @@

def main():
# configure the model architecture
config = pyllmq.LLamaConfig(
config = pyllmq.Config(
architecture="qwen2",
hidden_size=896,
intermediate_size=4864,
5 changes: 4 additions & 1 deletion scripts/export_wandb.py
@@ -62,8 +62,11 @@ def log_line(run: "wandb.Run", entry: dict):
elif kind == "dataset":
pass
# run.config["dataset"] = entry
elif kind in ["option", "info"]:
elif kind in ["option", "info", "message"]:
pass
elif kind == "abs-maxes":
for stats in entry["abs_maxes"]:
run.log({f"abs_maxes/{stats['name']}": stats['value']}, step=step)
elif kind == "sol":
if entry["rank"] != 0:
return
8 changes: 4 additions & 4 deletions scripts/modal_test_app.py
@@ -123,13 +123,13 @@ def run_fixed_result_test(dtype: str = "bf16"):
}
elif dtype == "e4m3":
expected = {
"losses": [3.6896581649780273, 3.3729734420776367, 3.4653379917144775, 3.442567825317383, 3.209272861480713, 3.598208427429199, 3.3247671127319336, 3.16266131401062, 3.340975284576416, 3.1603269577026367, 2.9368526935577393],
"norms": [7.225314140319824, 7.687681198120117, 5.967431545257568, 6.447912693023682, 6.524477958679199, 6.501705169677734, 5.946033477783203, 5.431783199310303, 5.570157527923584, 4.84951639175415]
"losses": [3.6896581649780273, 3.350105047225952, 3.4565865993499756, 3.4244627952575684, 3.1961753368377686, 3.5905539989471436, 3.314474582672119, 3.1519875526428223, 3.328141689300537, 3.1484572887420654, 2.935676097869873],
"norms": [7.225314140319824, 6.242846965789795, 6.185355186462402, 5.921780586242676, 6.2074384689331055, 6.655130863189697, 5.875917911529541, 5.7703142166137695, 5.688600063323975, 4.981103897094727],
}
elif dtype == "e5m2":
expected = {
"losses": [3.6896581649780273, 3.374328136444092, 3.4656238555908203, 3.442816734313965, 3.212712287902832, 3.600202798843384, 3.3219680786132812, 3.165616035461426, 3.3386964797973633, 3.1590282917022705, 2.9364452362060547],
"norms": [7.1560797691345215, 7.673510551452637, 5.905216693878174, 6.354756832122803, 6.572442531585693, 6.477637767791748, 5.919188022613525, 5.463287353515625, 5.5615410804748535, 4.890221118927002],
"losses": [3.6896581649780273, 3.352154016494751, 3.4541091918945312, 3.4255144596099854, 3.194918394088745, 3.591022491455078, 3.3119752407073975, 3.1553726196289062, 3.326538562774658, 3.150322675704956, 2.9374852180480957],
"norms": [7.1560797691345215, 6.176127910614014, 6.131178855895996, 5.944278240203857, 6.135629653930664, 6.733249187469482, 5.816003322601318, 5.759800910949707, 5.704397201538086, 4.93273401260376],
}
else:
raise ValueError(f"Unknown dtype: {dtype}")
4 changes: 2 additions & 2 deletions scripts/train.py
@@ -134,7 +134,7 @@ def main():
if config.continue_from_checkpoint:
latest_step = pyllmq.find_latest_checkpoint(ckpt_dir)
if latest_step >= 0:
trainer = pyllmq.LLMQTrainer(ngpu=config.gpus, config=pyllmq.LLamaConfig.from_pretrained(config.model, config.model_dtype),
trainer = pyllmq.LLMQTrainer(ngpu=config.gpus, config=pyllmq.Config.from_pretrained(config.model, config.model_dtype),
options=options, batch_size=config.batch_size, seq_len=config.seq_len, grad_accum=config.grad_accumulation,
memcpy_all_gather=config.memcpy_all_gather, memcpy_send_recv=config.memcpy_send_recv)
print(f"Loading checkpoint from step {latest_step}...")
@@ -144,7 +144,7 @@
exit(1)
elif config.from_scratch:
print(f"Creating {config.model} from scratch...")
trainer = pyllmq.LLMQTrainer(ngpu=config.gpus, config=pyllmq.LLamaConfig.from_name(config.model, config.model_dtype),
trainer = pyllmq.LLMQTrainer(ngpu=config.gpus, config=pyllmq.Config.from_name(config.model, config.model_dtype),
options=options, batch_size=config.batch_size, seq_len=config.seq_len, grad_accum=config.grad_accumulation,
memcpy_all_gather=config.memcpy_all_gather, memcpy_send_recv=config.memcpy_send_recv)
trainer.init_weights()
131 changes: 131 additions & 0 deletions scripts/watch_wandb.py
@@ -0,0 +1,131 @@
#!/usr/bin/env -S uv run --script
#
# /// script
# requires-python = ">=3.12"
# dependencies = ["wandb", "plotly[express]", "pandas"]
# ///


import argparse
import datetime
import json
import time
from pathlib import Path
from typing import Optional

import wandb


def log_line(run: "wandb.Run", entry: dict):
kind = entry["log"]
del entry["log"]
step = entry["step"]
del entry["step"]
del entry["time"] # TODO can we associate a datetime with step?
if kind == "step":
tps = entry["step_tokens"] / (entry["duration_ms"] / 1000)
del entry["step_tokens"]
run.log({f"train/{k}": v for k, v in entry.items()}, step=step)
run.log({"train/tokens_per_second": tps}, step=step)
elif kind == "eval":
tps = entry["eval_tokens"] / (entry["duration_ms"] / 1000)
del entry["eval_tokens"]
run.log({f"eval/{k}": v for k, v in entry.items()}, step=step)
run.log({"eval/tokens_per_second": tps}, step=step)
elif kind == "gpu":
del entry["throttle"] # can't log this nicely?
del entry["id"] # not useful?
if entry["fan"] == 0: # indicates not recorded
del entry["fan"]
entry["dram_free"] /= 1024**2 # MiB
entry["pcie_rx"] /= 1024**2 # MiB/s
entry["pcie_tx"] /= 1024**2 # MiB/s
run.log({f"gpu/{k}": v for k, v in entry.items()}, step=step)
elif kind == "cmd":
# TODO figure out if we can actually put this in the _wandb config object
# where it belongs
run.config["cmd"] = entry["cmd"]
elif kind == "gpu-model":
if entry["rank"] == 0:
run.config["gpu"] = entry
else:
run.config[f"gpu-{entry['rank']}"] = entry
elif kind == "allocator":
import plotly.express as px
names = [alloc["name"] for alloc in entry["stats"]]
amounts = [round(alloc["device"] / 1024 / 1024, 1) for alloc in entry["stats"]]

fig = px.pie(
names=names,
values=amounts,
title=f"GPU Allocations",
)
run.log({"allocations": fig}, step=step)
elif kind == "dataset":
pass
# run.config["dataset"] = entry
elif kind in ["option", "info"]:
pass
elif kind == "message":
print(entry["message"])
elif kind == "abs-maxes":
for stats in entry["abs_maxes"]:
run.log({f"abs_maxes/{stats['name']}": stats['value']}, step=step)
elif kind == "sol":
if entry["rank"] != 0:
return
import plotly.express as px
names = ["Blocks", "LM-Head", "Attention"]
amounts = [entry["blocks"], entry["lm_head"], entry["attention"]]

fig = px.pie(
names=names,
values=amounts,
title=f"FLOPs",
)
run.log({"ops": fig}, step=step)
else:
raise RuntimeError(f"Unknown kind {kind}")

def convert_log(file_name: str, *, name: Optional[str], project: str, notes: str = "", tags: Optional[list[str]] = None):
while name is None:
log_data = json.loads(Path(file_name).read_text())
for entry in log_data:
if entry["log"] == "option":
opt_name = entry["name"]
opt_value = entry["value"]
if opt_name == "name":
name = opt_value
time.sleep(10)

entry_count = 0
running = True
with wandb.init(
project=project,
name=name,
notes=notes,
tags=tags,
) as run:
while running:
content = Path(file_name).read_text()
log_data = json.loads(content)
for i, entry in enumerate(log_data):
if i < entry_count:
continue
if entry["log"] == "message" and entry["message"].startswith("Done. validation loss"):
running = False
log_line(run, entry)
entry_count += 1
time.sleep(10)

def main():
parser = argparse.ArgumentParser(description="Plot training run")
parser.add_argument("--log-file", type=str, help="Log file", default="log.json")
parser.add_argument("--project", help="WandB project name")
parser.add_argument("--name", help="Name for the run", default=None)
args = parser.parse_args()
convert_log(args.log_file, project=args.project, name=args.name)


if __name__ == "__main__":
main()
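For reference, a minimal sketch of how the new watcher might be driven (illustrative only, not part of this diff; the project name is a placeholder, the import assumes scripts/ is on sys.path, and a wandb login is required):

# e.g. ./scripts/watch_wandb.py --log-file log.json --project <project>  (via the shebang above)
# or, driving convert_log() directly from Python:
from watch_wandb import convert_log  # assumes scripts/ is on sys.path

# Blocks, re-reading log.json every 10 seconds until the "Done. validation loss" message appears.
convert_log("log.json", project="my-llmq-project", name=None)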
56 changes: 28 additions & 28 deletions src/binding/binding.cpp
@@ -99,15 +99,15 @@ NB_MODULE(_pyllmq, m) {
})
;

nb::class_<LLamaConfig> (m, "LLamaConfig")
.def("__init__", [](LLamaConfig *t,
nb::class_<TransformerConfig> (m, "Config")
.def("__init__", [](TransformerConfig *t,
const std::string& arch, std::optional<int> bos_token_id, std::optional<int> eos_token_id,
int hidden_size, int intermediate_size, std::optional<int> vocab_size, int num_attention_heads, int num_key_value_heads,
int num_hidden_layers, std::optional<int> max_position_embeddings, std::optional<float> rope_theta, float rms_norm_eps, bool tie_word_embeddings, std::optional<bool> use_qkv_bias, std::string dtype) {
// default values depend on selected architecture
LLamaConfig::LLamaBasedModels architecture;
TransformerConfig::EArchitecture architecture;
if(arch == "qwen2" || arch == "Qwen2" || arch == "Qwen2ForCausalLM") {
architecture = LLamaConfig::QWEN2;
architecture = TransformerConfig::QWEN2;
eos_token_id = eos_token_id.value_or(151643);
bos_token_id = bos_token_id.value_or(151643);
vocab_size = vocab_size.value_or(151936);
@@ -118,7 +118,7 @@ NB_MODULE(_pyllmq, m) {
throw std::runtime_error("At this point, only qwen2 architecture is supported.");
}

new (t) LLamaConfig{
new (t) TransformerConfig{
.Architecture = architecture,
.BosTokenId = bos_token_id.value(),
.EosTokenId = eos_token_id.value(),
Expand All @@ -139,38 +139,38 @@ NB_MODULE(_pyllmq, m) {
nb::arg("architecture"), nb::arg("bos_token_id") = nb::none(), nb::arg("eos_token_id") = nb::none(), nb::arg("hidden_size"), nb::arg("intermediate_size"),
nb::arg("vocab_size") = nb::none(), nb::arg("num_attention_heads"), nb::arg("num_key_value_heads"), nb::arg("num_hidden_layers"), nb::arg("max_position_embeddings") = nb::none(),
nb::arg("rope_theta") = nb::none(), nb::arg("rms_norm_eps"), nb::arg("tie_word_embeddings"), nb::arg("use_qkv_bias") = nb::none(), nb::arg("dtype") = "bf16")
.def_rw("architecture", &LLamaConfig::Architecture)
.def_rw("bos_token_id", &LLamaConfig::BosTokenId)
.def_rw("eos_token_id", &LLamaConfig::EosTokenId)
.def_rw("hidden_size", &LLamaConfig::HiddenSize)
.def_rw("intermediate_size", &LLamaConfig::IntermediateSize)
.def_rw("vocab_size", &LLamaConfig::VocabSize)
.def_rw("num_attention_heads", &LLamaConfig::NumQueryHeads)
.def_rw("num_key_value_heads", &LLamaConfig::NumKeyValHeads)
.def_rw("num_hidden_layers", &LLamaConfig::NumLayers)
.def_rw("max_position_embeddings", &LLamaConfig::MaxPositionEmbeddings)
.def_rw("rope_theta", &LLamaConfig::RopeTheta)
.def_rw("rms_norm_eps", &LLamaConfig::RopeTheta)
.def_rw("tie_word_embeddings", &LLamaConfig::TiedWordEmbeddings)
.def_rw("use_qkv_bias", &LLamaConfig::UseQKVBias)
.def_rw("architecture", &TransformerConfig::Architecture)
.def_rw("bos_token_id", &TransformerConfig::BosTokenId)
.def_rw("eos_token_id", &TransformerConfig::EosTokenId)
.def_rw("hidden_size", &TransformerConfig::HiddenSize)
.def_rw("intermediate_size", &TransformerConfig::IntermediateSize)
.def_rw("vocab_size", &TransformerConfig::VocabSize)
.def_rw("num_attention_heads", &TransformerConfig::NumQueryHeads)
.def_rw("num_key_value_heads", &TransformerConfig::NumKeyValHeads)
.def_rw("num_hidden_layers", &TransformerConfig::NumLayers)
.def_rw("max_position_embeddings", &TransformerConfig::MaxPositionEmbeddings)
.def_rw("rope_theta", &TransformerConfig::RopeTheta)
.def_rw("rms_norm_eps", &TransformerConfig::RopeTheta)
.def_rw("tie_word_embeddings", &TransformerConfig::TiedWordEmbeddings)
.def_rw("use_qkv_bias", &TransformerConfig::UseQKVBias)
.def_prop_rw("dtype",
[](const LLamaConfig* cfg){ return dtype_to_str(cfg->DType); },
[](LLamaConfig* cfg, const std::string& dtype_str){ cfg->DType = dtype_from_str(dtype_str); })
.def_prop_ro("head_size", &LLamaConfig::head_size)
.def_prop_ro("qkv_channels", &LLamaConfig::qkv_channels)
.def_prop_ro("model_name", &LLamaConfig::model_name)
[](const TransformerConfig* cfg){ return dtype_to_str(cfg->DType); },
[](TransformerConfig* cfg, const std::string& dtype_str){ cfg->DType = dtype_from_str(dtype_str); })
.def_prop_ro("head_size", &TransformerConfig::head_size)
.def_prop_ro("qkv_channels", &TransformerConfig::qkv_channels)
.def_prop_ro("model_name", &TransformerConfig::model_name)
.def_static("from_pretrained", [](const std::string& name, const std::string& dtype_str)
{
std::string hf_path = get_hf_model_files(name);
if (hf_path.empty()) {
throw std::runtime_error("Could not find model files for " + name);
}
std::string config_path = hf_path + "/config.json";
return new LLamaConfig(load_llama_config(config_path.c_str(), dtype_from_str(dtype_str)));
return new TransformerConfig(load_transformer_config(config_path.c_str(), dtype_from_str(dtype_str)));
}, nb::arg("name"), nb::arg("dtype"), "Load the config file from an existing hf model")
.def_static("from_name", [](const std::string& name, const std::string& dtype_str)
{
return new LLamaConfig(create_config_from_name(name, dtype_from_str(dtype_str)));
return new TransformerConfig(create_config_from_name(name, dtype_from_str(dtype_str)));
}, nb::arg("name"), nb::arg("dtype"), "Create a config based on the model name.")
;

@@ -263,7 +263,7 @@ NB_MODULE(_pyllmq, m) {
;

nb::class_<MultiGPUPyTrainer>(m, "LLMQTrainer")
.def("__init__", [](MultiGPUPyTrainer *t, int ngpu, LLamaConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv) {
.def("__init__", [](MultiGPUPyTrainer *t, int ngpu, TransformerConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv) {
options.ModelType = config.DType;
new (t) MultiGPUPyTrainer(ngpu, config, options, batch_size, seq_len, grad_accum, memcpy_all_gather, memcpy_send_recv);
}, nb::arg("ngpu"), nb::arg("config"), nb::arg("options"), nb::arg("batch_size"), nb::arg("seq_len"), nb::arg("grad_accum"),
@@ -280,7 +280,7 @@
if (!std::filesystem::exists(model_path)) {
model_path = hf_path + "/model.safetensors.index.json";
}
LLamaConfig config = load_llama_config(config_path.c_str(), dtype_from_str(dtype));
TransformerConfig config = load_transformer_config(config_path.c_str(), dtype_from_str(dtype));
options.ModelType = config.DType;
auto trainer = new MultiGPUPyTrainer(ngpu, config, options, batch_size, seq_len, grad_accum, memcpy_all_gather, memcpy_send_recv);
trainer->import_weights(model_path);
4 changes: 2 additions & 2 deletions src/binding/py_train.cpp
@@ -16,7 +16,7 @@
#include "models/llama_gradients.h"
#include "models/llama_run_state.h"

MultiGPUPyTrainer::MultiGPUPyTrainer(int ngpus, LLamaConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv) :
MultiGPUPyTrainer::MultiGPUPyTrainer(int ngpus, TransformerConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv) :
mConfig(config), mOptions(options), B(batch_size), T(seq_len), mGradAccumulation(grad_accum)
{
int gpus_available = 0;
@@ -71,7 +71,7 @@ void MultiGPUPyTrainer::export_model(std::string path) {
run_work([path](sThreadContext& ctx) {
std::filesystem::path p(path);
std::filesystem::create_directories(p);
save_llama_config(ctx.Model->config(), (p / "config.json").c_str());
save_transformer_config(ctx.Model->config(), (p / "config.json").c_str());
ctx.Model->export_weights((p / "model.safetensors").c_str(), *ctx.Communicator);
});
}
8 changes: 4 additions & 4 deletions src/binding/py_train.h
@@ -10,7 +10,7 @@
#include <thread>
#include <functional>

#include "models/llama_config.h"
#include "../training/transformer_config.h"
#include "models/llama_model.h"

class DataLoader;
@@ -41,7 +41,7 @@ class CommunicatorThreadsPack;
class MultiGPUPyTrainer
{
public:
MultiGPUPyTrainer(int ngpus, LLamaConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv);
MultiGPUPyTrainer(int ngpus, TransformerConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv);
~MultiGPUPyTrainer();

void import_weights(std::string path);
@@ -59,15 +59,15 @@ class MultiGPUPyTrainer
int world_size() const;
int batch_size() const { return B; }
int seq_length() const { return T; }
const LLamaConfig& config() const { return mConfig; }
const TransformerConfig& config() const { return mConfig; }
const LLamaOptions& options() const { return mOptions; }

std::vector<std::pair<std::string, sSegmentMemory>> get_allocations(int gpu_id);
std::vector<std::pair<std::string, long>> get_stack_info(int gpu_id);
std::vector<std::pair<std::string, Tensor>> get_gradients(int gpu_id);

private:
LLamaConfig mConfig;
TransformerConfig mConfig;
LLamaOptions mOptions;
int B;
int T;