Refined qwen colab

mydatascience · mydatascience · commit e1cb7e4d43e5 · 2025-09-19T22:20:42.000+04:00
Signed-off-by: Vladimir Suvorov &lt;suvorovv@google.com&gt;
diff --git a/src/MaxText/examples/sft_qwen3_demo.ipynb b/src/MaxText/examples/sft_qwen3_demo.ipynb
@@ -0,0 +1,257 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Run SFT on the Qwen3-0.6B model\n",
+        "\n",
+        "This Colab notebook can run on the public TPU v5e-1 runtime.\n",
+        "\n",
+        "This notebook demonstrates how to perform Supervised Fine-Tuning (SFT) on Qwen3-0.6B using the Hugging Face ultrachat_200k dataset with Tunix integration for efficient training.\n",
+        "\n",
+        "## Dataset Overview\n",
+        "\n",
+        "Dataset page: https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n",
+        "\n",
+        "**Dataset information:**\n",
+        "\n",
+        "- Name: `HuggingFaceH4/ultrachat_200k`\n",
+        "- Type: Supervised Fine-Tuning dataset\n",
+        "- Size: ~200k conversations\n",
+        "- Format: Chat conversations with human-AI pairs\n",
+        "- Splits: `train_sft`, `test_sft`\n",
+        "- Data columns: `['messages']`\n",
+        "\n",
+        "**Dataset structure:** each example contains a `messages` field, a list of messages that each have:\n",
+        "\n",
+        "- `role`: `'user'` or `'assistant'`\n",
+        "- `content`: the actual message text\n",
+        "\n",
+        "Example data format:\n",
+        "\n",
+        "```json\n",
+        "{\n",
+        "  \"messages\": [\n",
+        "    {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
+        "    {\"role\": \"assistant\", \"content\": \"The capital of France is Paris.\"}\n",
+        "  ]\n",
+        "}\n",
+        "```\n",
+        "\n",
+        "## Prerequisites\n",
+        "\n",
+        "- A Hugging Face access token for the dataset download\n",
+        "- Sufficient compute resources (TPU/GPU)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Wr4OOETu8elP"
+      },
+      "outputs": [],
+      "source": [
+        "### (Optional) Run this if you just have this file and nothing else\n",
+        "\n",
+        "# 1. Clone the MaxText repository (from AI-Hypercomputer).\n",
+        "#    The clone is skipped when ./maxtext already exists, so re-running this\n",
+        "#    cell from the same starting directory does not fail on an existing checkout.\n",
+        "!test -d maxtext || git clone https://github.com/AI-Hypercomputer/maxtext.git\n",
+        "\n",
+        "# 2. Navigate into the cloned directory\n",
+        "%cd maxtext"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "5KPyOE8e9WbO"
+      },
+      "outputs": [],
+      "source": [
+        "### (Optional) Do not run this if you already installed the dependencies\n",
+        "\n",
+        "# 3. Ensure setup.sh is executable\n",
+        "!chmod +x setup.sh\n",
+        "\n",
+        "# 4. Execute the setup script\n",
+        "!./setup.sh\n",
+        "\n",
+        "# 5. Force the numpy version. %pip (rather than !pip) installs into the\n",
+        "#    environment of the running kernel.\n",
+        "%pip install --force-reinstall numpy==2.1.2\n",
+        "\n",
+        "# 6. Install nest_asyncio and apply it to fix the\n",
+        "#    \"This event loop is already running\" error in Colab.\n",
+        "%pip install nest_asyncio\n",
+        "\n",
+        "import nest_asyncio\n",
+        "nest_asyncio.apply()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CJnhPxUq_G6a"
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import sys\n",
+        "\n",
+        "# Directory where the maxtext repository is cloned.\n",
+        "# Defaults to /content/maxtext; set the MAXTEXT_HOME environment variable\n",
+        "# (or edit the default below) to point at a different checkout.\n",
+        "MAXTEXT_HOME = os.environ.get(\"MAXTEXT_HOME\") or os.path.join(\"/content\", \"maxtext\")\n",
+        "print(f\"Home directory (from Python): {MAXTEXT_HOME}\")\n",
+        "#MODEL_CHECKPOINT_PATH = \"path/to/scanned/checkpoint\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CxzKMBQd_U5-"
+      },
+      "outputs": [],
+      "source": [
+        "from pathlib import Path\n",
+        "from typing import Optional, Dict, Any\n",
+        "\n",
+        "# Pick the directory to work from, based on where the kernel was started.\n",
+        "current_dir = Path.cwd()\n",
+        "if current_dir.name != 'examples':\n",
+        "    # Not inside an examples folder: use <MAXTEXT_HOME>/src/MaxText.\n",
+        "    maxtext_path = Path(MAXTEXT_HOME, 'src', 'MaxText')\n",
+        "else:\n",
+        "    # Inside an examples folder: use the directory two levels above it.\n",
+        "    # NOTE(review): for src/MaxText/examples this resolves to src/, while the\n",
+        "    # other branch resolves to src/MaxText - confirm which one is intended.\n",
+        "    maxtext_path = current_dir.parent.parent\n",
+        "\n",
+        "# Make that directory the working directory and put it first on sys.path.\n",
+        "sys.path.insert(0, str(maxtext_path))\n",
+        "os.chdir(maxtext_path)\n",
+        "\n",
+        "print(f\"✓ Changed working directory to: {os.getcwd()}\")\n",
+        "print(f\"✓ MaxText project root: {maxtext_path}\")\n",
+        "print(f\"✓ Added to Python path: {maxtext_path}\")\n",
+        "\n",
+        "import jax\n",
+        "\n",
+        "# Initialize jax.distributed only when it has not been initialized yet,\n",
+        "# so this cell can be executed more than once.\n",
+        "if not jax.distributed.is_initialized():\n",
+        "    jax.distributed.initialize()\n",
+        "print(f\"JAX version: {jax.__version__}\")\n",
+        "print(f\"JAX devices: {jax.devices()}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rKS8nVYgAbwE"
+      },
+      "outputs": [],
+      "source": [
+        "# Hugging Face Authentication Setup\n",
+        "import os\n",
+        "from getpass import getpass\n",
+        "\n",
+        "from huggingface_hub import login\n",
+        "\n",
+        "# Do not paste the token into the notebook: a saved or shared copy would leak it.\n",
+        "# The token is read from the HF_TOKEN environment variable when that is set;\n",
+        "# otherwise it is requested interactively without echoing the input.\n",
+        "HF_TOKEN = os.environ.get(\"HF_TOKEN\") or getpass(\"Hugging Face access token: \")\n",
+        "login(token=HF_TOKEN)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "aR0zTWkxAs4t"
+      },
+      "outputs": [],
+      "source": [
+        "# Import the MaxText entry points used by the cells below and record in\n",
+        "# MAXTEXT_AVAILABLE whether the import worked.\n",
+        "try:\n",
+        "    from MaxText import pyconfig\n",
+        "    from MaxText.sft.sft_trainer import train as sft_train\n",
+        "except ImportError as e:\n",
+        "    MAXTEXT_AVAILABLE = False\n",
+        "    print(f\"⚠️ MaxText not available: {e}\")\n",
+        "else:\n",
+        "    MAXTEXT_AVAILABLE = True\n",
+        "    print(\"✓ MaxText imports successful\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "In-jdp1AAwrL"
+      },
+      "outputs": [],
+      "source": [
+        "# Configuration for a short Qwen3-0.6B SFT run on a small TPU\n",
+        "if MAXTEXT_AVAILABLE:\n",
+        "    # argv-style list: an empty first entry, then the base config file,\n",
+        "    # then key=value overrides.\n",
+        "    config_argv = [\n",
+        "        \"\",\n",
+        "        f\"{MAXTEXT_HOME}/src/MaxText/configs/sft.yml\",   # base SFT config\n",
+        "        \"model_name=qwen3-0.6b\",\n",
+        "        # `steps` used to be listed twice (20 and 100). It is now given once,\n",
+        "        # with the value that was listed last, so the run length is unambiguous.\n",
+        "        \"steps=100\",                                    # short demo run\n",
+        "        \"per_device_batch_size=1\",                      # minimal to avoid OOM\n",
+        "        \"max_target_length=512\",                        # shorter context to fit memory\n",
+        "        \"learning_rate=2.0e-5\",                         # safe small LR\n",
+        "        \"eval_interval=10\",\n",
+        "        \"eval_steps=5\",\n",
+        "        \"weight_dtype=bfloat16\",\n",
+        "        \"dtype=bfloat16\",\n",
+        "        \"hf_path=HuggingFaceH4/ultrachat_200k\",          # Hugging Face dataset used for SFT\n",
+        "        f\"hf_access_token={HF_TOKEN}\",                  # token from the authentication cell\n",
+        "        \"base_output_directory=/tmp/maxtext_qwen06\",\n",
+        "        \"run_name=sft_qwen0.6b_test\",\n",
+        "        \"tokenizer_path=Qwen/Qwen3-0.6B\",                # Qwen tokenizer\n",
+        "        \"profiler=xplane\",\n",
+        "    ]\n",
+        "\n",
+        "    # Initialize configuration using MaxText's pyconfig\n",
+        "    config = pyconfig.initialize(config_argv)\n",
+        "\n",
+        "    # Echo a few values read back from the loaded config (the access token is\n",
+        "    # deliberately not printed).\n",
+        "    print(\"✓ Fixed configuration loaded:\")\n",
+        "    print(f\"  - Model: {config.model_name}\")\n",
+        "    print(f\"  - Dataset: {config.hf_path}\")\n",
+        "    print(f\"  - Steps: {config.steps}\")\n",
+        "    print(f\"  - Use SFT: {config.use_sft}\")\n",
+        "    print(f\"  - Learning Rate: {config.learning_rate}\")\n",
+        "else:\n",
+        "    print(\"MaxText not available - cannot load configuration\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "EJE1ookSAzz-"
+      },
+      "source": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "mgwpNgQYCJEd"
+      },
+      "outputs": [],
+      "source": [
+        "# Run the fine-tuning job through the MaxText SFT trainer's train() function.\n",
+        "# Nothing happens when the MaxText imports failed earlier.\n",
+        "if MAXTEXT_AVAILABLE:\n",
+        "    # Banner: a 60-character rule above and below the title line.\n",
+        "    print(\"=\" * 60, \"EXECUTING ACTUAL TRAINING\", \"=\" * 60, sep=\"\\n\")\n",
+        "\n",
+        "    sft_train(config)"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "TPU",
+    "colab": {
+      "gpuType": "V5E1",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}