import logging

import torch
import torch.nn as nn
import torchvision.transforms as T
from timm.models import create_model

_logger = logging.getLogger(__name__)

class build_kd_model(nn.Module):
    """Builds and holds a pretrained teacher model for knowledge distillation."""

    def __init__(self, args):
        super().__init__()

        _logger.info(f"Creating KD teacher model '{args.kd_model_name}'")
        in_chans = args.in_chans if args.in_chans is not None else 3
        model_kd = create_model(
            model_name=args.kd_model_name,
            num_classes=args.num_classes,
            pretrained=True,
            in_chans=in_chans)

        # cache the teacher's normalization stats before torch.compile wraps the model
        self.mean_model_kd = model_kd.default_cfg['mean']
        self.std_model_kd = model_kd.default_cfg['std']

        # compile the teacher for faster inference; fall back gracefully if compilation fails
        model_kd.cpu().eval()
        try:
            model_kd = torch.compile(model_kd)
            _logger.info("torch.compile applied successfully to KD model")
        except Exception as e:
            _logger.warning(f"torch.compile failed with error {e}, "
                            f"continuing KD model without torch compilation")

        self.model = model_kd.cuda()

    # handle the different input normalizations of teacher and student:
    # the student sees x_s = (x - mean_s) / std_s while the teacher expects
    # x_t = (x - mean_t) / std_t, which gives
    # x_t = (x_s - (mean_t - mean_s) / std_s) / (std_t / std_s)
    def normalize_input(self, input, student_model):
        # unwrap DataParallel / DistributedDataParallel wrappers
        if hasattr(student_model, 'module'):
            model_s = student_model.module
        else:
            model_s = student_model

        mean_student = model_s.default_cfg['mean']
        std_student = model_s.default_cfg['std']

        input_kd = input
        if mean_student != self.mean_model_kd or std_student != self.std_model_kd:
            # a single per-channel affine re-normalization implements the identity above
            mean = tuple((m_t - m_s) / s_s for m_t, m_s, s_s in
                         zip(self.mean_model_kd, mean_student, std_student))
            std = tuple(s_t / s_s for s_t, s_s in
                        zip(self.std_model_kd, std_student))
            input_kd = T.Normalize(mean=mean, std=std)(input)

        return input_kd
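# Quick numeric check of the re-normalization identity above (illustrative
# values, not from the diff): for a student with mean=0, std=1 and a teacher
# with mean=0.5, std=0.25, it yields T.Normalize(mean=0.5, std=0.25), mapping
# x_s to (x_s - 0.5) / 0.25 = 4 * x_s - 2, i.e. (x - mean_t) / std_t with x = x_s.
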
def add_kd_loss(_loss, output, input, model, model_kd, args):
    # student log-probabilities (kl_div expects log-probs as its first argument)
    prob_s = torch.nn.functional.log_softmax(output, dim=-1)

    # teacher probabilities, computed without tracking gradients
    with torch.no_grad():
        input_kd = model_kd.normalize_input(input, model)
        out_t = model_kd.model(input_kd)
        prob_t = torch.nn.functional.softmax(out_t, dim=-1)

    # KL-divergence distillation term
    kd_loss = args.alpha_kd * torch.nn.functional.kl_div(prob_s, prob_t, reduction='batchmean')
    if args.use_kd_only_loss:  # distillation loss replaces the base loss
        _loss = kd_loss
    else:  # add the distillation term to the base loss
        _loss = _loss + kd_loss

    return _loss
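# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the diff): wiring the teacher and
# the KD loss into a bare-bones training step. `args.model` and `args.lr` are
# hypothetical fields; the loader and optimizer are placeholders, and the
# student must be a timm model so that `default_cfg` is available to
# normalize_input().
# ---------------------------------------------------------------------------
def train_kd_sketch(loader, args):
    teacher = build_kd_model(args)
    student = create_model(args.model, num_classes=args.num_classes,
                           pretrained=False).cuda()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(student.parameters(), lr=args.lr)

    student.train()
    for input, target in loader:
        input, target = input.cuda(), target.cuda()
        output = student(input)
        loss = criterion(output, target)            # base cross-entropy loss
        loss = add_kd_loss(loss, output, input,     # add the distillation term
                           student, teacher, args)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()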