import torch
from torch import nn
import json
from dataclasses import dataclass
from typing import List, Optional

from transformers import BertModel, PretrainedConfig
from transformers.modeling_outputs import ModelOutput

from ..base_model import BaseModel
from ..rnn.harnn import HAM

__all__ = ["BertForPropertyPrediction", "BertForKnowledgePrediction"]


@dataclass
class BertForPPOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None

class BertForPropertyPrediction(BaseModel):
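    """Regress a scalar item property (e.g. difficulty) with a BERT encoder.

    A linear head on the [CLS] embedding is squashed by a sigmoid and trained
    with MSE, so the target property is expected to lie in [0, 1].
    """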
    def __init__(self, pretrained_model_dir=None, head_dropout=0.5):
        super(BertForPropertyPrediction, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_dir)
        self.hidden_size = self.bert.config.hidden_size
        self.head_dropout = head_dropout
        self.dropout = nn.Dropout(head_dropout)
        self.classifier = nn.Linear(self.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.MSELoss()

        # record the constructor arguments so the model can be re-created later
        self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
        self.config['architecture'] = 'BertForPropertyPrediction'
        self.config = PretrainedConfig.from_dict(self.config)

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # the [CLS] token embedding serves as the item representation
        item_embeds = outputs.last_hidden_state[:, 0, :]
        item_embeds = self.dropout(item_embeds)

        # (batch_size,) scores in [0, 1]
        logits = self.sigmoid(self.classifier(item_embeds)).squeeze(1)
        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)
        return BertForPPOutput(
            loss=loss,
            logits=logits,
        )

    @classmethod
    def from_config(cls, config_path, **kwargs):
        with open(config_path, "r", encoding="utf-8") as rf:
            model_config = json.load(rf)
            model_config.update(kwargs)
            return cls(
                pretrained_model_dir=model_config['pretrained_model_dir'],
                head_dropout=model_config.get("head_dropout", 0.5)
            )
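
    # An illustrative config.json consumed by from_config
    # (the directory is a placeholder):
    # {"pretrained_model_dir": "path/to/bert", "head_dropout": 0.5}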

    # @classmethod
    # def from_pretrained(cls):
    #     raise NotImplementedError
    #     # TODO: verify compatibility with huggingface models


class BertForKnowledgePrediction(BaseModel):
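    """Multi-label knowledge prediction with a BERT encoder.

    Mixes a flat linear classifier on the [CLS] embedding with a hierarchical
    attention head (HAM) over the token sequence; the two predictions are
    combined according to ``flat_cls_weight``.
    """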
    def __init__(self,
                 num_classes_list: List[int] = None,
                 num_total_classes: int = None,
                 pretrained_model_dir=None,
                 head_dropout=0.5,
                 flat_cls_weight=0.5,
                 attention_unit_size=256,
                 fc_hidden_size=512,
                 beta=0.5,
                 ):
        super(BertForKnowledgePrediction, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_dir)
        self.hidden_size = self.bert.config.hidden_size
        self.head_dropout = head_dropout
        self.dropout = nn.Dropout(head_dropout)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.MSELoss()
        # flat head over the [CLS] embedding
        self.flat_classifier = nn.Linear(self.hidden_size, num_total_classes)
        # hierarchical attention head over the full token sequence
        self.ham_classifier = HAM(
            num_classes_list=num_classes_list,
            num_total_classes=num_total_classes,
            sequence_model_hidden_size=self.bert.config.hidden_size,
            attention_unit_size=attention_unit_size,
            fc_hidden_size=fc_hidden_size,
            beta=beta,
            dropout_rate=head_dropout
        )
        self.flat_cls_weight = flat_cls_weight
        self.num_classes_list = num_classes_list
        self.num_total_classes = num_total_classes

        # record the constructor arguments so the model can be re-created later
        self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
        self.config['architecture'] = 'BertForKnowledgePrediction'
        self.config = PretrainedConfig.from_dict(self.config)

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # [CLS] embedding for the flat classifier, all token embeddings for HAM
        item_embeds = outputs.last_hidden_state[:, 0, :]
        item_embeds = self.dropout(item_embeds)
        tokens_embeds = outputs.last_hidden_state
        tokens_embeds = self.dropout(tokens_embeds)
        flat_logits = self.sigmoid(self.flat_classifier(item_embeds))
        ham_outputs = self.ham_classifier(tokens_embeds)
        ham_logits = self.sigmoid(ham_outputs.scores)
        # weighted mix of the flat and hierarchical predictions
        logits = self.flat_cls_weight * flat_logits + (1 - self.flat_cls_weight) * ham_logits
        loss = None
        if labels is not None:
            # labels arrive as per-item lists of knowledge indices;
            # convert them to multi-hot targets over all classes
            labels = torch.sum(torch.nn.functional.one_hot(labels, num_classes=self.num_total_classes), dim=1)
            labels = labels.float()
            loss = self.criterion(logits, labels)
        return BertForPPOutput(
            loss=loss,
            logits=logits,
        )

    @classmethod
    def from_config(cls, config_path, **kwargs):
        with open(config_path, "r", encoding="utf-8") as rf:
            model_config = json.load(rf)
            model_config.update(kwargs)
            return cls(
                pretrained_model_dir=model_config['pretrained_model_dir'],
                head_dropout=model_config.get("head_dropout", 0.5),
                num_classes_list=model_config.get('num_classes_list'),
                num_total_classes=model_config.get('num_total_classes'),
                flat_cls_weight=model_config.get('flat_cls_weight', 0.5),
                attention_unit_size=model_config.get('attention_unit_size', 256),
                fc_hidden_size=model_config.get('fc_hidden_size', 512),
                beta=model_config.get('beta', 0.5),
            )
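
    # An illustrative config.json consumed by from_config (all values are
    # placeholders; in HARNN-style setups num_total_classes is typically
    # sum(num_classes_list)):
    # {"pretrained_model_dir": "path/to/bert", "num_classes_list": [10, 90, 900],
    #  "num_total_classes": 1000, "head_dropout": 0.5, "flat_cls_weight": 0.5,
    #  "attention_unit_size": 256, "fc_hidden_size": 512, "beta": 0.5}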

    # @classmethod
    # def from_pretrained(cls):
    #     raise NotImplementedError
    #     # TODO: verify compatibility with huggingface models
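

# A minimal usage sketch (illustrative only; "path/to/bert" is a placeholder
# for a local BERT checkpoint directory, and 0.7 is an arbitrary property
# score in [0, 1]):
#
#     from transformers import BertTokenizer
#     tokenizer = BertTokenizer.from_pretrained("path/to/bert")
#     model = BertForPropertyPrediction(pretrained_model_dir="path/to/bert")
#     batch = tokenizer(["an example item"], return_tensors="pt")
#     output = model(**batch, labels=torch.tensor([0.7]))
#     output.logits  # shape: (batch_size,), values in [0, 1]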