Commit d2e1f39 (parent 89c1f77)

fix: update ner_crf & span_classifier params in place in post_init to avoid optimizer issues
File tree: 5 files changed (+13 −7 lines)

changelog.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -23,6 +23,7 @@
 - We now support `words[-10:10]` syntax in trainable span classifier `context_getter` parameter
 - :ambulance: Until now, `post_init` was applied **after** the instantiation of the optimizer: if the model discovered new labels, and therefore changed its parameter tensors to reflect that, these new tensors were not taken into account by the optimizer, which could lead to subpar performance. Now, `post_init` is applied **before** the optimizer is instantiated, so that the optimizer can correctly handle the new tensors.
 - Added missing entry points for readers and writers in the registry, including `write_parquet` and support for `polars` in `pyproject.toml`. Now all implemented readers and writers are correctly registered as entry points.
+- Parameters are now updated *in place* when `post_init` is run in `eds.ner_crf` and `eds.span_classifier`, and are therefore correctly taken into account by the optimizer.

 ### Changed
```
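To make the failure mode concrete, here is a minimal PyTorch sketch (illustrative only, not code from this repository) of why rebinding a parameter attribute breaks a previously built optimizer, while mutating `.data` does not:

```python
import torch

linear = torch.nn.Linear(4, 2)
optim = torch.optim.AdamW(linear.parameters(), lr=1e-3)

# The optimizer stores references to the Parameter *objects* that existed
# when it was built. Rebinding the attribute, e.g.
#     linear.weight = torch.nn.Parameter(torch.zeros(3, 4))
# creates a new object the optimizer never sees, so the new weights would
# silently stop receiving updates.

# Mutating `.data` instead swaps the underlying storage while keeping the
# same Parameter object, so the optimizer's reference stays valid:
resized = torch.nn.Linear(4, 3)  # e.g. one new label was discovered
linear.weight.data = resized.weight.data
linear.bias.data = resized.bias.data
linear.out_features = resized.out_features

assert linear.weight is optim.param_groups[0]["params"][0]  # same object
```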

edsnlp/pipes/trainable/ner_crf/ner_crf.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -362,9 +362,9 @@ def update_labels(self, labels: Sequence[str]):
         new_linear = torch.nn.Linear(self.embedding.output_size, len(labels) * 5)
         new_linear.weight.data[new_index] = self.linear.weight.data[old_index]
         new_linear.bias.data[new_index] = self.linear.bias.data[old_index]
-        self.linear.weight = new_linear.weight
+        self.linear.weight.data = new_linear.weight.data
         self.linear.out_features = new_linear.out_features
-        self.linear.bias = new_linear.bias
+        self.linear.bias.data = new_linear.bias.data

         # Update initialization arguments
         self.labels = labels
```
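For context, the surrounding `update_labels` logic copies the learned rows into a freshly sized layer before swapping storages in place. A self-contained sketch of the pattern (simplified to one output row per label; the helper and its names are illustrative, not the pipe's actual API):

```python
import torch

def resize_label_projection(linear: torch.nn.Linear, old_labels, new_labels):
    # Build a projection sized for the new label set.
    new_linear = torch.nn.Linear(linear.in_features, len(new_labels))
    # Copy learned rows for labels present in both sets.
    common = [lab for lab in old_labels if lab in new_labels]
    old_index = [old_labels.index(lab) for lab in common]
    new_index = [new_labels.index(lab) for lab in common]
    new_linear.weight.data[new_index] = linear.weight.data[old_index]
    new_linear.bias.data[new_index] = linear.bias.data[old_index]
    # Swap storages in place: the Parameter objects, and hence any
    # optimizer references to them, are preserved.
    linear.weight.data = new_linear.weight.data
    linear.bias.data = new_linear.bias.data
    linear.out_features = new_linear.out_features
```

In the actual pipe, each label owns five output rows (the CRF tag space), hence the `len(labels) * 5` sizing in the diff above; the sketch collapses this to one row per label for brevity.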

edsnlp/pipes/trainable/span_classifier/span_classifier.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -401,7 +401,7 @@ def update_bindings(self, bindings: List[Tuple[str, SpanFilter, List[Any]]]):
         new_index = [new_bindings_to_idx[label] for label in common]
         new_linear = torch.nn.Linear(self.classifier.in_features, len(new_bindings))
         new_linear.weight.data[new_index] = self.classifier.weight.data[old_index]
-        self.classifier.weight = new_linear.weight
+        self.classifier.weight.data = new_linear.weight.data
         self.classifier.out_features = new_bindings
         missing_bindings = set(new_bindings) - set(old_bindings)
         if missing_bindings and len(old_bindings) > 0:
@@ -412,7 +412,7 @@ def update_bindings(self, bindings: List[Tuple[str, SpanFilter, List[Any]]]):

         if hasattr(self.classifier, "bias"):
             new_linear.bias.data[new_index] = self.classifier.bias.data[old_index]
-            self.classifier.bias = new_linear.bias
+            self.classifier.bias.data = new_linear.bias.data

         def simplify_indexer(indexer):
             return (
```
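Same fix, same rationale as in `eds.ner_crf`: only the `.data` storages are swapped so the optimizer keeps valid references, and the bias update stays behind the existing `hasattr` guard since the classifier layer may have been built without a bias term.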

edsnlp/training/trainer.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -728,7 +728,8 @@ def train(
                 )
             )
         accelerator.print(
-            f"Keeping frozen {len(all_params - grad_params):} weight tensors "
+            ("! WARNING ! " if (len(all_params - grad_params) > 0) else "")
+            + f"Keeping frozen {len(all_params - grad_params):} weight tensors "
            f"({sum(p.numel() for p in all_params - grad_params):,} parameters)"
         )

```
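This new warning complements the fix above: a parameter tensor (re)created after the optimizer was built is exactly the kind of weight that would show up here as unexpectedly frozen, so a non-empty frozen set is now flagged loudly rather than reported in passing.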

tests/training/ner_qlf_same_bert_config.yml

Lines changed: 6 additions & 2 deletions
```diff
@@ -56,13 +56,17 @@ scorer:

 # 🎛️ OPTIMIZER
 optimizer:
-  "@core": optimizer
+  "@core": optimizer !draft
   optim: AdamW
   module: ${ nlp }
   groups:
     # Transformer
     - selector: "transformer"
-      exclude: true
+      lr:
+        "@schedules": linear
+        start_value: 1e-5
+        max_value: 2e-5
+        warmup_rate: 0.5
     - selector: ".*"
       lr: 1e-3

```
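The test config now actually trains the transformer group (it was previously excluded) under a linear warmup schedule, and the optimizer is declared as a `!draft`, which appears to mark the config as deliberately incomplete, to be finalized by the trainer (an assumption about the confit config system, not verified). For intuition, a rough sketch of what such a schedule could compute, assuming a ramp from `start_value` to `max_value` over the `warmup_rate` fraction of steps followed by a linear decay to zero (an assumption about `"@schedules": linear`, not verified against edsnlp):

```python
def linear_lr(step: int, total_steps: int,
              start_value: float = 1e-5, max_value: float = 2e-5,
              warmup_rate: float = 0.5) -> float:
    # Assumed behavior: ramp up during warmup, then decay linearly to 0.
    warmup_steps = max(1, int(warmup_rate * total_steps))
    if step < warmup_steps:
        return start_value + (max_value - start_value) * step / warmup_steps
    return max_value * (total_steps - step) / max(1, total_steps - warmup_steps)
```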
