3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -149,12 +149,13 @@ add_library(llmq-common SHARED
src/training/dataloader.cpp
src/training/logging.cpp
src/training/checkpoint.cpp
src/training/model.cpp
src/training/transformer_config.cpp

src/models/llama_run_state.cpp
src/models/llama_model.cpp
src/models/llama_weights.cpp
src/models/llama_gradients.cpp
src/models/llama_config.cpp
src/models/llama_optimizer.cpp

src/kernels/kernels.cpp
4 changes: 2 additions & 2 deletions export-checkpoint.cpp
@@ -56,7 +56,7 @@ int main(int argc, const char** argv) {


std::string config_path = ModelRootPath + "/config.json";
LLamaConfig config = load_llama_config(config_path.c_str(), ModelDType);
TransformerConfig config = load_transformer_config(config_path.c_str(), ModelDType);

int latest_step = find_latest_checkpoint(CkptDir);
if (latest_step < 0) {
@@ -76,7 +76,7 @@ int main(int argc, const char** argv) {

std::filesystem::path p(OutDir);
std::filesystem::create_directories(p);
save_llama_config(config, (p / "config.json").c_str());
save_transformer_config(config, (p / "config.json").c_str());
model.export_weights((p / "model.safetensors").c_str(), comm);
});
}
2 changes: 1 addition & 1 deletion scripts/demo.py
@@ -14,7 +14,7 @@

def main():
# configure the model architecture
config = pyllmq.LLamaConfig(
config = pyllmq.Config(
architecture="qwen2",
hidden_size=896,
intermediate_size=4864,
5 changes: 4 additions & 1 deletion scripts/export_wandb.py
@@ -62,8 +62,11 @@ def log_line(run: "wandb.Run", entry: dict):
elif kind == "dataset":
pass
# run.config["dataset"] = entry
elif kind in ["option", "info"]:
elif kind in ["option", "info", "message"]:
pass
elif kind == "abs-maxes":
for stats in entry["abs_maxes"]:
run.log({f"abs_maxes/{stats['name']}": stats['value']}, step=step)
elif kind == "sol":
if entry["rank"] != 0:
return
8 changes: 4 additions & 4 deletions scripts/modal_test_app.py
@@ -123,13 +123,13 @@ def run_fixed_result_test(dtype: str = "bf16"):
}
elif dtype == "e4m3":
expected = {
"losses": [3.6896581649780273, 3.3729734420776367, 3.4653379917144775, 3.442567825317383, 3.209272861480713, 3.598208427429199, 3.3247671127319336, 3.16266131401062, 3.340975284576416, 3.1603269577026367, 2.9368526935577393],
"norms": [7.225314140319824, 7.687681198120117, 5.967431545257568, 6.447912693023682, 6.524477958679199, 6.501705169677734, 5.946033477783203, 5.431783199310303, 5.570157527923584, 4.84951639175415]
"losses": [3.6896581649780273, 3.350105047225952, 3.4565865993499756, 3.4244627952575684, 3.1961753368377686, 3.5905539989471436, 3.314474582672119, 3.1519875526428223, 3.328141689300537, 3.1484572887420654, 2.935676097869873],
"norms": [7.225314140319824, 6.242846965789795, 6.185355186462402, 5.921780586242676, 6.2074384689331055, 6.655130863189697, 5.875917911529541, 5.7703142166137695, 5.688600063323975, 4.981103897094727],
}
elif dtype == "e5m2":
expected = {
"losses": [3.6896581649780273, 3.374328136444092, 3.4656238555908203, 3.442816734313965, 3.212712287902832, 3.600202798843384, 3.3219680786132812, 3.165616035461426, 3.3386964797973633, 3.1590282917022705, 2.9364452362060547],
"norms": [7.1560797691345215, 7.673510551452637, 5.905216693878174, 6.354756832122803, 6.572442531585693, 6.477637767791748, 5.919188022613525, 5.463287353515625, 5.5615410804748535, 4.890221118927002],
"losses": [3.6896581649780273, 3.352154016494751, 3.4541091918945312, 3.4255144596099854, 3.194918394088745, 3.591022491455078, 3.3119752407073975, 3.1553726196289062, 3.326538562774658, 3.150322675704956, 2.9374852180480957],
"norms": [7.1560797691345215, 6.176127910614014, 6.131178855895996, 5.944278240203857, 6.135629653930664, 6.733249187469482, 5.816003322601318, 5.759800910949707, 5.704397201538086, 4.93273401260376],
}
else:
raise ValueError(f"Unknown dtype: {dtype}")
4 changes: 2 additions & 2 deletions scripts/train.py
@@ -134,7 +134,7 @@ def main():
if config.continue_from_checkpoint:
latest_step = pyllmq.find_latest_checkpoint(ckpt_dir)
if latest_step >= 0:
trainer = pyllmq.LLMQTrainer(ngpu=config.gpus, config=pyllmq.LLamaConfig.from_pretrained(config.model, config.model_dtype),
trainer = pyllmq.LLMQTrainer(ngpu=config.gpus, config=pyllmq.Config.from_pretrained(config.model, config.model_dtype),
options=options, batch_size=config.batch_size, seq_len=config.seq_len, grad_accum=config.grad_accumulation,
memcpy_all_gather=config.memcpy_all_gather, memcpy_send_recv=config.memcpy_send_recv)
print(f"Loading checkpoint from step {latest_step}...")
@@ -144,7 +144,7 @@
exit(1)
elif config.from_scratch:
print(f"Creating {config.model} from scratch...")
trainer = pyllmq.LLMQTrainer(ngpu=config.gpus, config=pyllmq.LLamaConfig.from_name(config.model, config.model_dtype),
trainer = pyllmq.LLMQTrainer(ngpu=config.gpus, config=pyllmq.Config.from_name(config.model, config.model_dtype),
options=options, batch_size=config.batch_size, seq_len=config.seq_len, grad_accum=config.grad_accumulation,
memcpy_all_gather=config.memcpy_all_gather, memcpy_send_recv=config.memcpy_send_recv)
trainer.init_weights()
131 changes: 131 additions & 0 deletions scripts/watch_wandb.py
@@ -0,0 +1,131 @@
#!/usr/bin/env -S uv run --script
#
# /// script
# requires-python = ">=3.12"
# dependencies = ["wandb", "plotly[express]", "pandas"]
# ///


import argparse
import datetime
import json
import time
from pathlib import Path
from typing import Optional

import wandb


def log_line(run: "wandb.Run", entry: dict):
kind = entry["log"]
del entry["log"]
step = entry["step"]
del entry["step"]
del entry["time"] # TODO can we associate a datetime with step?
if kind == "step":
tps = entry["step_tokens"] / (entry["duration_ms"] / 1000)
del entry["step_tokens"]
run.log({f"train/{k}": v for k, v in entry.items()}, step=step)
run.log({"train/tokens_per_second": tps}, step=step)
elif kind == "eval":
tps = entry["eval_tokens"] / (entry["duration_ms"] / 1000)
del entry["eval_tokens"]
run.log({f"eval/{k}": v for k, v in entry.items()}, step=step)
run.log({"eval/tokens_per_second": tps}, step=step)
elif kind == "gpu":
del entry["throttle"] # can't log this nicely?
del entry["id"] # not useful?
if entry["fan"] == 0: # indicates not recorded
del entry["fan"]
entry["dram_free"] /= 1024**2 # MiB
entry["pcie_rx"] /= 1024**2 # MiB/s
entry["pcie_tx"] /= 1024**2 # MiB/s
run.log({f"gpu/{k}": v for k, v in entry.items()}, step=step)
elif kind == "cmd":
# TODO figure out if we can actually put this in the _wandb config object
# where it belongs
run.config["cmd"] = entry["cmd"]
elif kind == "gpu-model":
if entry["rank"] == 0:
run.config["gpu"] = entry
else:
run.config[f"gpu-{entry['rank']}"] = entry
elif kind == "allocator":
import plotly.express as px
names = [alloc["name"] for alloc in entry["stats"]]
amounts = [round(alloc["device"] / 1024 / 1024, 1) for alloc in entry["stats"]]

fig = px.pie(
names=names,
values=amounts,
title=f"GPU Allocations",
)
run.log({"allocations": fig}, step=step)
elif kind == "dataset":
pass
# run.config["dataset"] = entry
elif kind in ["option", "info"]:
pass
elif kind == "message":
print(entry["message"])
elif kind == "abs-maxes":
for stats in entry["abs_maxes"]:
run.log({f"abs_maxes/{stats['name']}": stats['value']}, step=step)
elif kind == "sol":
if entry["rank"] != 0:
return
import plotly.express as px
names = ["Blocks", "LM-Head", "Attention"]
amounts = [entry["blocks"], entry["lm_head"], entry["attention"]]

fig = px.pie(
names=names,
values=amounts,
title=f"FLOPs",
)
run.log({"ops": fig}, step=step)
else:
raise RuntimeError(f"Unknown kind {kind}")

def convert_log(file_name: str, *, name: Optional[str], project: str, notes: str = "", tags: Optional[list[str]] = None):
while name is None:
log_data = json.loads(Path(file_name).read_text())
for entry in log_data:
if entry["log"] == "option":
opt_name = entry["name"]
opt_value = entry["value"]
if opt_name == "name":
name = opt_value
time.sleep(10)

entry_count = 0
running = True
with wandb.init(
project=project,
name=name,
notes=notes,
tags=tags,
) as run:
while running:
content = Path(file_name).read_text()
log_data = json.loads(content)
for i, entry in enumerate(log_data):
if i < entry_count:
continue
if entry["log"] == "message" and entry["message"].startswith("Done. validation loss"):
running = False
log_line(run, entry)
entry_count += 1
time.sleep(10)

def main():
parser = argparse.ArgumentParser(description="Plot training run")
parser.add_argument("--log-file", type=str, help="Log file", default="log.json")
parser.add_argument("--project", help="WandB project name")
parser.add_argument("--name", help="Name for the run", default=None)
args = parser.parse_args()
convert_log(args.log_file, project=args.project, name=args.name)


if __name__ == "__main__":
main()
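For reference, a minimal sketch of how the new watcher might be driven (illustrative only, not part of this diff; the project name is a placeholder, the import assumes scripts/ is on sys.path, and a wandb login is required):

# e.g. ./scripts/watch_wandb.py --log-file log.json --project <project>  (via the shebang above)
# or, driving convert_log() directly from Python:
from watch_wandb import convert_log  # assumes scripts/ is on sys.path

# Blocks, re-reading log.json every 10 seconds until the "Done. validation loss" message appears.
convert_log("log.json", project="my-llmq-project", name=None)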
56 changes: 28 additions & 28 deletions src/binding/binding.cpp
@@ -99,15 +99,15 @@ NB_MODULE(_pyllmq, m) {
})
;

nb::class_<LLamaConfig> (m, "LLamaConfig")
.def("__init__", [](LLamaConfig *t,
nb::class_<TransformerConfig> (m, "Config")
.def("__init__", [](TransformerConfig *t,
const std::string& arch, std::optional<int> bos_token_id, std::optional<int> eos_token_id,
int hidden_size, int intermediate_size, std::optional<int> vocab_size, int num_attention_heads, int num_key_value_heads,
int num_hidden_layers, std::optional<int> max_position_embeddings, std::optional<float> rope_theta, float rms_norm_eps, bool tie_word_embeddings, std::optional<bool> use_qkv_bias, std::string dtype) {
// default values depend on selected architecture
LLamaConfig::LLamaBasedModels architecture;
TransformerConfig::EArchitecture architecture;
if(arch == "qwen2" || arch == "Qwen2" || arch == "Qwen2ForCausalLM") {
architecture = LLamaConfig::QWEN2;
architecture = TransformerConfig::QWEN2;
eos_token_id = eos_token_id.value_or(151643);
bos_token_id = bos_token_id.value_or(151643);
vocab_size = vocab_size.value_or(151936);
@@ -118,7 +118,7 @@ NB_MODULE(_pyllmq, m) {
throw std::runtime_error("At this point, only qwen2 architecture is supported.");
}

new (t) LLamaConfig{
new (t) TransformerConfig{
.Architecture = architecture,
.BosTokenId = bos_token_id.value(),
.EosTokenId = eos_token_id.value(),
Expand All @@ -139,38 +139,38 @@ NB_MODULE(_pyllmq, m) {
nb::arg("architecture"), nb::arg("bos_token_id") = nb::none(), nb::arg("eos_token_id") = nb::none(), nb::arg("hidden_size"), nb::arg("intermediate_size"),
nb::arg("vocab_size") = nb::none(), nb::arg("num_attention_heads"), nb::arg("num_key_value_heads"), nb::arg("num_hidden_layers"), nb::arg("max_position_embeddings") = nb::none(),
nb::arg("rope_theta") = nb::none(), nb::arg("rms_norm_eps"), nb::arg("tie_word_embeddings"), nb::arg("use_qkv_bias") = nb::none(), nb::arg("dtype") = "bf16")
.def_rw("architecture", &LLamaConfig::Architecture)
.def_rw("bos_token_id", &LLamaConfig::BosTokenId)
.def_rw("eos_token_id", &LLamaConfig::EosTokenId)
.def_rw("hidden_size", &LLamaConfig::HiddenSize)
.def_rw("intermediate_size", &LLamaConfig::IntermediateSize)
.def_rw("vocab_size", &LLamaConfig::VocabSize)
.def_rw("num_attention_heads", &LLamaConfig::NumQueryHeads)
.def_rw("num_key_value_heads", &LLamaConfig::NumKeyValHeads)
.def_rw("num_hidden_layers", &LLamaConfig::NumLayers)
.def_rw("max_position_embeddings", &LLamaConfig::MaxPositionEmbeddings)
.def_rw("rope_theta", &LLamaConfig::RopeTheta)
.def_rw("rms_norm_eps", &LLamaConfig::RopeTheta)
.def_rw("tie_word_embeddings", &LLamaConfig::TiedWordEmbeddings)
.def_rw("use_qkv_bias", &LLamaConfig::UseQKVBias)
.def_rw("architecture", &TransformerConfig::Architecture)
.def_rw("bos_token_id", &TransformerConfig::BosTokenId)
.def_rw("eos_token_id", &TransformerConfig::EosTokenId)
.def_rw("hidden_size", &TransformerConfig::HiddenSize)
.def_rw("intermediate_size", &TransformerConfig::IntermediateSize)
.def_rw("vocab_size", &TransformerConfig::VocabSize)
.def_rw("num_attention_heads", &TransformerConfig::NumQueryHeads)
.def_rw("num_key_value_heads", &TransformerConfig::NumKeyValHeads)
.def_rw("num_hidden_layers", &TransformerConfig::NumLayers)
.def_rw("max_position_embeddings", &TransformerConfig::MaxPositionEmbeddings)
.def_rw("rope_theta", &TransformerConfig::RopeTheta)
.def_rw("rms_norm_eps", &TransformerConfig::RopeTheta)
.def_rw("tie_word_embeddings", &TransformerConfig::TiedWordEmbeddings)
.def_rw("use_qkv_bias", &TransformerConfig::UseQKVBias)
.def_prop_rw("dtype",
[](const LLamaConfig* cfg){ return dtype_to_str(cfg->DType); },
[](LLamaConfig* cfg, const std::string& dtype_str){ cfg->DType = dtype_from_str(dtype_str); })
.def_prop_ro("head_size", &LLamaConfig::head_size)
.def_prop_ro("qkv_channels", &LLamaConfig::qkv_channels)
.def_prop_ro("model_name", &LLamaConfig::model_name)
[](const TransformerConfig* cfg){ return dtype_to_str(cfg->DType); },
[](TransformerConfig* cfg, const std::string& dtype_str){ cfg->DType = dtype_from_str(dtype_str); })
.def_prop_ro("head_size", &TransformerConfig::head_size)
.def_prop_ro("qkv_channels", &TransformerConfig::qkv_channels)
.def_prop_ro("model_name", &TransformerConfig::model_name)
.def_static("from_pretrained", [](const std::string& name, const std::string& dtype_str)
{
std::string hf_path = get_hf_model_files(name);
if (hf_path.empty()) {
throw std::runtime_error("Could not find model files for " + name);
}
std::string config_path = hf_path + "/config.json";
return new LLamaConfig(load_llama_config(config_path.c_str(), dtype_from_str(dtype_str)));
return new TransformerConfig(load_transformer_config(config_path.c_str(), dtype_from_str(dtype_str)));
}, nb::arg("name"), nb::arg("dtype"), "Load the config file from an existing hf model")
.def_static("from_name", [](const std::string& name, const std::string& dtype_str)
{
return new LLamaConfig(create_config_from_name(name, dtype_from_str(dtype_str)));
return new TransformerConfig(create_config_from_name(name, dtype_from_str(dtype_str)));
}, nb::arg("name"), nb::arg("dtype"), "Create a config based on the model name.")
;

@@ -263,7 +263,7 @@ NB_MODULE(_pyllmq, m) {
;

nb::class_<MultiGPUPyTrainer>(m, "LLMQTrainer")
.def("__init__", [](MultiGPUPyTrainer *t, int ngpu, LLamaConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv) {
.def("__init__", [](MultiGPUPyTrainer *t, int ngpu, TransformerConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv) {
options.ModelType = config.DType;
new (t) MultiGPUPyTrainer(ngpu, config, options, batch_size, seq_len, grad_accum, memcpy_all_gather, memcpy_send_recv);
}, nb::arg("ngpu"), nb::arg("config"), nb::arg("options"), nb::arg("batch_size"), nb::arg("seq_len"), nb::arg("grad_accum"),
@@ -280,7 +280,7 @@
if (!std::filesystem::exists(model_path)) {
model_path = hf_path + "/model.safetensors.index.json";
}
LLamaConfig config = load_llama_config(config_path.c_str(), dtype_from_str(dtype));
TransformerConfig config = load_transformer_config(config_path.c_str(), dtype_from_str(dtype));
options.ModelType = config.DType;
auto trainer = new MultiGPUPyTrainer(ngpu, config, options, batch_size, seq_len, grad_accum, memcpy_all_gather, memcpy_send_recv);
trainer->import_weights(model_path);
4 changes: 2 additions & 2 deletions src/binding/py_train.cpp
@@ -16,7 +16,7 @@
#include "models/llama_gradients.h"
#include "models/llama_run_state.h"

MultiGPUPyTrainer::MultiGPUPyTrainer(int ngpus, LLamaConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv) :
MultiGPUPyTrainer::MultiGPUPyTrainer(int ngpus, TransformerConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv) :
mConfig(config), mOptions(options), B(batch_size), T(seq_len), mGradAccumulation(grad_accum)
{
int gpus_available = 0;
@@ -71,7 +71,7 @@ void MultiGPUPyTrainer::export_model(std::string path) {
run_work([path](sThreadContext& ctx) {
std::filesystem::path p(path);
std::filesystem::create_directories(p);
save_llama_config(ctx.Model->config(), (p / "config.json").c_str());
save_transformer_config(ctx.Model->config(), (p / "config.json").c_str());
ctx.Model->export_weights((p / "model.safetensors").c_str(), *ctx.Communicator);
});
}
8 changes: 4 additions & 4 deletions src/binding/py_train.h
@@ -10,7 +10,7 @@
#include <thread>
#include <functional>

#include "models/llama_config.h"
#include "../training/transformer_config.h"
#include "models/llama_model.h"

class DataLoader;
@@ -41,7 +41,7 @@ class CommunicatorThreadsPack;
class MultiGPUPyTrainer
{
public:
MultiGPUPyTrainer(int ngpus, LLamaConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv);
MultiGPUPyTrainer(int ngpus, TransformerConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv);
~MultiGPUPyTrainer();

void import_weights(std::string path);
@@ -59,15 +59,15 @@ class MultiGPUPyTrainer
int world_size() const;
int batch_size() const { return B; }
int seq_length() const { return T; }
const LLamaConfig& config() const { return mConfig; }
const TransformerConfig& config() const { return mConfig; }
const LLamaOptions& options() const { return mOptions; }

std::vector<std::pair<std::string, sSegmentMemory>> get_allocations(int gpu_id);
std::vector<std::pair<std::string, long>> get_stack_info(int gpu_id);
std::vector<std::pair<std::string, Tensor>> get_gradients(int gpu_id);

private:
LLamaConfig mConfig;
TransformerConfig mConfig;
LLamaOptions mOptions;
int B;
int T;