 from dataclasses import InitVar, dataclass
 from itertools import islice
 from typing import Any, Iterable, Literal
+from warnings import filterwarnings
 
 import torch
 from datasets import (
+    Array2D,
     Array3D,
-    ClassLabel,
     DatasetDict,
+    DownloadMode,
     Features,
     Sequence,
     SplitDict,
     SplitInfo,
     Value,
     get_dataset_config_info,
 )
 from simple_parsing import Serializable, field
 from torch import Tensor
-from transformers import AutoConfig, AutoTokenizer, GPT2TokenizerFast
+from transformers import AutoConfig, PreTrainedModel
 from transformers.modeling_outputs import Seq2SeqLMOutput
 
 from ..promptsource import DatasetTemplates
 from ..utils import (
     assert_type,
-    convert_span,
     float32_to_int16,
+    infer_label_column,
+    infer_num_classes,
     instantiate_model,
+    instantiate_tokenizer,
     is_autoregressive,
     select_train_val_splits,
     select_usable_devices,
 )
-from .balanced_sampler import BalancedSampler
 from .generator import _GeneratorBuilder
 from .prompt_loading import PromptConfig, load_prompts
-from ..rwkv_lm.rwkv_hf import RWKVConfig
 
 
 @dataclass
@@ -58,6 +60,7 @@ class Extract(Serializable):
     layers: tuple[int, ...] = ()
     layer_stride: InitVar[int] = 1
     token_loc: Literal["first", "last", "mean"] = "last"
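+    # If True, run only the encoder of an encoder-decoder model and take the
+    # hidden states from it (see the get_encoder() swap in extract_hiddens)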
+    use_encoder_states: bool = False
 
     def __post_init__(self, layer_stride: int):
         if self.layers and layer_stride > 1:
@@ -85,7 +88,7 @@ def explode(self) -> list["Extract"]:
         return copies
 
 
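+# inference_mode() is a stricter no_grad(): tensors created inside can never be
+# used by autograd later, letting PyTorch skip version-counter bookkeeping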
-@torch.no_grad()
+@torch.inference_mode()
 def extract_hiddens(
     cfg: "Extract",
     *,
@@ -99,135 +102,135 @@ def extract_hiddens(
 
     # Silence datasets logging messages from all but the first process
     if rank != 0:
+        filterwarnings("ignore")
         logging.disable(logging.CRITICAL)
 
-    ds_names = cfg.prompts.datasets
+    p_cfg = cfg.prompts
+    ds_names = p_cfg.datasets
     assert len(ds_names) == 1, "Can only extract hiddens from one dataset at a time."
 
-    prompt_ds = load_prompts(
-        ds_names[0],
-        split_type=split_type,
-        stream=cfg.prompts.stream,
-        rank=rank,
-        world_size=world_size,
-    )  # this dataset is already sharded, but hasn't been truncated to max_examples
-
     model = instantiate_model(
         cfg.model, torch_dtype="auto" if device != "cpu" else torch.float32
     ).to(device)
-    tokenizer = None
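+    # truncation_side="left" drops the beginning of over-long prompts, so the
+    # end of the sequence (where the answer goes) is preserved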
+    tokenizer = instantiate_tokenizer(
+        cfg.model, truncation_side="left", verbose=rank == 0
+    )
 
-    if cfg.model.startswith("RWKV"):
-        tokenizer = GPT2TokenizerFast(tokenizer_file='/home/kyle/repos/elk/elk/rwkv_lm/20B_tokenizer.json')
-    else:
-        tokenizer = AutoTokenizer.from_pretrained(
-            cfg.model, truncation_side="left", verbose=False
-        )
+    is_enc_dec = model.config.is_encoder_decoder
+    if is_enc_dec and cfg.use_encoder_states:
+        assert hasattr(model, "get_encoder") and callable(model.get_encoder)
+        model = assert_type(PreTrainedModel, model.get_encoder())
+        is_enc_dec = False
 
-    has_lm_preds = is_autoregressive(model.config)
+    has_lm_preds = is_autoregressive(model.config, not cfg.use_encoder_states)
     if has_lm_preds and rank == 0:
         print("Model has language model head, will store predictions.")
 
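+    # load_prompts returns a dataset already sharded across processes; it is
+    # truncated to max_examples by the islice below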
+    prompt_ds = load_prompts(
+        ds_names[0],
+        label_column=p_cfg.label_columns[0] if p_cfg.label_columns else None,
+        num_classes=p_cfg.num_classes,
+        split_type=split_type,
+        stream=p_cfg.stream,
+        rank=rank,
+        world_size=world_size,
+    )
+
     # Iterating over questions
     layer_indices = cfg.layers or tuple(range(model.config.num_hidden_layers))
 
-    global_max_examples = cfg.prompts.max_examples[0 if split_type == "train" else 1]
+    global_max_examples = p_cfg.max_examples[0 if split_type == "train" else 1]
     # break `max_examples` among the processes roughly equally
     max_examples = global_max_examples // world_size
     # the last process gets the remainder (which is usually small)
     if rank == world_size - 1:
         max_examples += global_max_examples % world_size
 
-    for example in islice(BalancedSampler(prompt_ds), max_examples):
+    for example in islice(prompt_ds, max_examples):
         num_variants = len(example["prompts"])
+        num_choices = len(example["prompts"][0])
+
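+        # Pre-allocate a (num_variants, num_choices, hidden_size) buffer per layer,
+        # stored as int16 to halve the memory footprint of the hiddens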
         hidden_dict = {
             f"hidden_{layer_idx}": torch.empty(
                 num_variants,
-                2,  # contrast pair
+                num_choices,
                 model.config.hidden_size,
                 device=device,
                 dtype=torch.int16,
             )
             for layer_idx in layer_indices
         }
-        lm_preds = torch.empty(
+        lm_logits = torch.empty(
             num_variants,
-            2,  # contrast pair
+            num_choices,
             device=device,
             dtype=torch.float32,
         )
-        text_inputs = []
+        text_questions = []
 
         # Iterate over variants
         for i, record in enumerate(example["prompts"]):
-            variant_inputs = []
+            variant_questions = []
 
             # Iterate over answers
             for j, choice in enumerate(record):
-                text = choice["text"]
-
-                # TODO: Do something smarter than "rindex" here. Really we want to
-                # get the span of the answer directly from Jinja, but that doesn't
-                # seem possible. This approach may fail for complex templates.
-                answer_start = text.rindex(choice["answer"])
+                text = choice["question"]
 
                 # Only feed question, not the answer, to the encoder for enc-dec models
-                if model.config.is_encoder_decoder:
-                    # TODO: Maybe make this more generic for complex templates?
-                    text = text[:answer_start].rstrip()
-                    target = choice["answer"]
-                else:
-                    target = None
-
-                # Record the EXACT string we fed to the model
-                variant_inputs.append(text)
-                # inputs = None
-                # if cfg.model.startswith("RWKV"):
-                #     inputs = tokenizer(
-                #         text,
-                #         return_offsets_mapping=True,
-                #         text_target=target,  # type: ignore[arg-type]
-                #         truncation=True,
-                #     )
-                # else:
-                inputs = tokenizer(
+                target = choice["answer"] if is_enc_dec else None
+
+                # Record the EXACT question we fed to the model
+                variant_questions.append(text)
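+                # add_special_tokens=False keeps special tokens out of both encodings
+                # so the question and answer token IDs can be concatenated directly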
+                encoding = tokenizer(
                     text,
-                    return_offsets_mapping=True,
+                    add_special_tokens=False,
                     return_tensors="pt",
                     text_target=target,  # type: ignore[arg-type]
                     truncation=True,
-                )
+                ).to(device)
+                input_ids = assert_type(Tensor, encoding.input_ids)
+
+                if is_enc_dec:
+                    answer = assert_type(Tensor, encoding.labels)
+                else:
+                    encoding2 = tokenizer(
+                        choice["answer"],
+                        add_special_tokens=False,
+                        return_tensors="pt",
+                    ).to(device)
+                    answer = assert_type(Tensor, encoding2.input_ids)
 
-                # The offset_mapping is a sorted list of (start, end) tuples. We locate
-                # the start of the answer in the tokenized sequence with binary search.
-                offsets = inputs.pop("offset_mapping") if cfg.model.startswith("RWKV") else inputs.pop("offset_mapping").squeeze().tolist()
-                inputs = inputs if cfg.model.startswith("RWKV") else inputs.to(device)
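+                # Decoder-only path: append the answer tokens to the question and
+                # truncate from the left so the answer always survives truncation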
+                input_ids = torch.cat([input_ids, answer], dim=-1)
+                if max_len := tokenizer.model_max_length:
+                    input_ids = input_ids[..., -max_len:]
 
-                # Run the forward pass
-                outputs = model(**inputs) if cfg.model.startswith("RWKV") else model(**inputs, output_hidden_states=True)
+                # Make sure we only pass the arguments that the model expects
+                inputs = dict(input_ids=input_ids)
+                if is_enc_dec:
+                    inputs["labels"] = answer
+
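+                # Run the forward pass in mixed precision when a GPU is available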
+                with torch.autocast("cuda", enabled=torch.cuda.is_available()):
+                    outputs = model(**inputs, output_hidden_states=True)
 
                 # Compute the log probability of the answer tokens if available
                 if has_lm_preds:
-                    start, end = convert_span(
-                        offsets, (answer_start, answer_start + len(choice["answer"]))
-                    )
-                    log_p = outputs.logits[..., start - 1 : end - 1, :].log_softmax(
-                        dim=-1
-                    )
-                    tokens = inputs.input_ids[..., start:end, None]
-                    lm_preds[i, j] = log_p.gather(-1, tokens).sum()
+                    answer_len = answer.shape[-1]
+
+                    log_p = outputs.logits[..., -answer_len:, :].log_softmax(dim=-1)
+                    tokens = answer[..., None]
+                    lm_logits[i, j] = log_p.gather(-1, tokens).sum()
 
                 elif isinstance(outputs, Seq2SeqLMOutput):
                     # The cross entropy loss is averaged over tokens, so we need to
                     # multiply by the length to get the total log probability.
-                    length = inputs.labels.shape[-1]
-                    lm_preds[i, j] = -assert_type(Tensor, outputs.loss) * length
+                    length = encoding.labels.shape[-1]
+                    lm_logits[i, j] = -assert_type(Tensor, outputs.loss) * length
 
-                hiddens = outputs if cfg.model.startswith("RWKV") else (
+                hiddens = (
                     outputs.get("decoder_hidden_states") or outputs["hidden_states"]
                 )
                 # First element of list is the input embeddings
-                hiddens = hiddens if cfg.model.startswith("RWKV") else hiddens[1:]
+                hiddens = hiddens[1:]
 
                 # Throw out layers we don't care about
                 hiddens = [hiddens[i] for i in layer_indices]
@@ -245,17 +248,16 @@ def extract_hiddens(
             for layer_idx, hidden in zip(layer_indices, hiddens):
                 hidden_dict[f"hidden_{layer_idx}"][i, j] = float32_to_int16(hidden)
 
-            text_inputs.append(variant_inputs)
+            text_questions.append(variant_questions)
 
         out_record: dict[str, Any] = dict(
             label=example["label"],
             variant_ids=example["template_names"],
-            text_inputs=text_inputs,
+            text_questions=text_questions,
             **hidden_dict,
         )
         if has_lm_preds:
-            # We only need the probability of the positive example since this is binary
-            out_record["model_preds"] = lm_preds.softmax(dim=-1)[..., 1]
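+            # Store the raw per-choice log-prob sums; downstream consumers can
+            # softmax over the choice dimension if they want probabilities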
+            out_record["model_logits"] = lm_logits
 
         yield out_record
 
@@ -266,7 +268,11 @@ def _extraction_worker(**kwargs):
 
 
 def extract(
-    cfg: "Extract", num_gpus: int = -1, min_gpu_mem: int | None = None
+    cfg: "Extract",
+    *,
+    disable_cache: bool = False,
+    num_gpus: int = -1,
+    min_gpu_mem: int | None = None,
 ) -> DatasetDict:
     """Extract hidden states from a model and return a `DatasetDict` containing them."""
 
@@ -292,15 +298,18 @@ def get_splits() -> SplitDict:
             dataset_name=available_splits.dataset_name,
         )
 
-    model_cfg = None
-    if cfg.model.startswith("RWKV"):
-        model_cfg = RWKVConfig()
-    else:
-        model_cfg = AutoConfig.from_pretrained(cfg.model)
+    model_cfg = AutoConfig.from_pretrained(cfg.model)
 
     ds_name, _, config_name = cfg.prompts.datasets[0].partition(" ")
     info = get_dataset_config_info(ds_name, config_name or None)
 
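+    # Infer the label column and class count up front so the feature schema below
+    # matches what the extraction workers will emit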
+    ds_features = assert_type(Features, info.features)
+    label_col = (
+        cfg.prompts.label_columns[0]
+        if cfg.prompts.label_columns
+        else infer_label_column(ds_features)
+    )
+    num_classes = cfg.prompts.num_classes or infer_num_classes(ds_features[label_col])
     num_variants = cfg.prompts.num_variants
     if num_variants < 0:
         prompter = DatasetTemplates(ds_name, config_name)
@@ -309,7 +318,7 @@ def get_splits() -> SplitDict:
     layer_cols = {
         f"hidden_{layer}": Array3D(
             dtype="int16",
-            shape=(num_variants, 2, model_cfg.hidden_size),
+            shape=(num_variants, num_classes, model_cfg.hidden_size),
         )
         for layer in cfg.layers or range(model_cfg.num_hidden_layers)
     }
@@ -318,21 +327,20 @@ def get_splits() -> SplitDict:
             Value(dtype="string"),
             length=num_variants,
         ),
-        "label": ClassLabel(names=["neg", "pos"]),
-        "text_inputs": Sequence(
+        "label": Value(dtype="int64"),
+        "text_questions": Sequence(
             Sequence(
                 Value(dtype="string"),
-                length=2,
             ),
             length=num_variants,
         ),
     }
 
-    # Only add model_preds if the model is an autoregressive model
-    if is_autoregressive(model_cfg):
-        other_cols["model_preds"] = Sequence(
-            Value(dtype="float32"),
-            length=num_variants,
+    # Only add model_logits if the model is an autoregressive model
+    if is_autoregressive(model_cfg, not cfg.use_encoder_states):
+        other_cols["model_logits"] = Array2D(
+            shape=(num_variants, num_classes),
+            dtype="float32",
         )
 
     devices = select_usable_devices(num_gpus, min_memory=min_gpu_mem)
@@ -361,7 +369,10 @@ def get_splits() -> SplitDict:
 
     ds = dict()
     for split, builder in builders.items():
-        builder.download_and_prepare(num_proc=len(devices))
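+        # FORCE_REDOWNLOAD tells `datasets` to ignore any cached copy when the
+        # caller passes disable_cache; None keeps the default reuse-if-cached mode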
+        builder.download_and_prepare(
+            download_mode=DownloadMode.FORCE_REDOWNLOAD if disable_cache else None,
+            num_proc=len(devices),
+        )
         ds[split] = builder.as_dataset(split=split)
 
-    return DatasetDict(ds)
+    return DatasetDict(ds)