feat: add class_weight handling

LucasDedieu · LucasDedieu · commit a9383179420e · 2025-09-02T10:15:27.000Z
diff --git a/edsnlp/pipes/trainable/doc_classifier/doc_classifier.py b/edsnlp/pipes/trainable/doc_classifier/doc_classifier.py
@@ -48,17 +48,17 @@ def __init__(
         id2label: Optional[Dict[int, str]] = None,
         loss_fn=None,
         labels: Optional[Sequence[str]] = None,
-        class_weights: Optional[Union[Dict[str, float], str]] = None, 
+        class_weights: Optional[Union[Dict[str, float], str]] = None,
     ):
         self.label_attr: Attributes = label_attr
         self.label2id = label2id or {}
         self.id2label = id2label or {}
         self.labels = labels
-        self.class_weights = class_weights  
-        
+        self.class_weights = class_weights
+
         super().__init__(nlp, name)
         self.embedding = embedding
-        
+
         self._loss_fn = loss_fn
         self.loss_fn = None
 
@@ -76,19 +76,19 @@ def _compute_class_weights(self, freq_dict: Dict[str, int]) -> torch.Tensor:
         Uses inverse frequency weighting: weight = 1 / frequency
         """
         total_samples = sum(freq_dict.values())
-        
+
         weights = torch.zeros(len(self.label2id))
-        
+
         for label, freq in freq_dict.items():
             if label in self.label2id:
                 weight = total_samples / (len(self.label2id) * freq)
                 weights[self.label2id[label]] = weight
-        
+
         return weights
 
     def _load_class_weights_from_file(self, filepath: str) -> Dict[str, int]:
         """Load class weights from pickle file."""
-        with open(filepath, 'rb') as f:
+        with open(filepath, "rb") as f:
             return pickle.load(f)
 
     def set_extensions(self) -> None:
@@ -116,22 +116,22 @@ def post_init(self, gold_data: Iterable[Doc], exclude: Set[str]):
                 self.classifier = torch.nn.Linear(
                     self.embedding.output_size, len(self.label2id)
                 )
-        
+
         weight_tensor = None
         if self.class_weights is not None:
             if isinstance(self.class_weights, str):
                 freq_dict = self._load_class_weights_from_file(self.class_weights)
                 weight_tensor = self._compute_class_weights(freq_dict)
             elif isinstance(self.class_weights, dict):
                 weight_tensor = self._compute_class_weights(self.class_weights)
-            
+
             print(f"Using class weights: {weight_tensor}")
-        
+
         if self._loss_fn is not None:
             self.loss_fn = self._loss_fn
         else:
             self.loss_fn = torch.nn.CrossEntropyLoss(weight=weight_tensor)
-        
+
         super().post_init(gold_data, exclude=exclude)
 
     def preprocess(self, doc: Doc) -> Dict[str, Any]:
@@ -161,6 +161,10 @@ def collate(self, batch: Dict[str, Sequence[Any]]) -> DocClassifierBatchInput:
         return batch_input
 
     def forward(self, batch: DocClassifierBatchInput) -> DocClassifierBatchOutput:
+        """
+        Forward pass: compute embeddings, classify, and calculate loss
+        if targets provided.
+        """
         pooled = self.embedding(batch["embedding"])
         embeddings = pooled["embeddings"]
 
@@ -187,6 +191,7 @@ def postprocess(self, docs, results, input):
         return docs
 
     def to_disk(self, path, *, exclude=set()):
+        """Save classifier state to disk."""
         repr_id = object.__repr__(self)
         if repr_id in exclude:
             return
@@ -206,11 +211,12 @@ def to_disk(self, path, *, exclude=set()):
 
     @classmethod
     def from_disk(cls, path, **kwargs):
+        """Load classifier from disk."""
         data_path = path / "label_attr.pkl"
         with open(data_path, "rb") as f:
             data = pickle.load(f)
         obj = super().from_disk(path, **kwargs)
         obj.label_attr = data.get("label_attr", "label")
         obj.label2id = data.get("label2id", {})
         obj.id2label = data.get("id2label", {})
-        return obj
+        return obj