Labels: bug, Cache, Core: Pipeline, WIP
Description
System Info
Transformers v4.45.2 (patch release)
PyTorch 1.10.1
Python 3.8.0
CUDA 11.1
NVIDIA V100
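In case it helps with triage, the environment above can be double-checked with a quick snippet like the one below; transformers-cli env prints the same information in the issue-template format.
import sys
import torch
import transformers

# Print the versions listed in the System Info section
print("transformers:", transformers.__version__)  # expected 4.45.2
print("torch       :", torch.__version__)         # expected 1.10.1
print("python      :", sys.version.split()[0])    # expected 3.8.0
print("cuda        :", torch.version.cuda)        # expected 11.1
print("gpu         :", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no GPU visible")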
Who can help?
@gante @zucchini-nlp @Rocketknight1
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
Stack trace:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[19], line 1
----> 1 outputs = pipe(
2 messages,
3 max_new_tokens=3000,
4 eos_token_id=terminators,
5 do_sample=True,
6 temperature=0.6,
7 top_p=0.9,
8 # cache_implementation="static",
9 cache_implementation="offloaded_static",
10 )
11 assistant_response = outputs[0]["generated_text"][-1]["content"]
12 print(assistant_response)
File python3.8/site-packages/transformers/pipelines/text_generation.py:267, in TextGenerationPipeline.__call__(self, text_inputs, **kwargs)
262 if isinstance(
263 text_inputs, (list, tuple, KeyDataset) if is_torch_available() else (list, tuple)
264 ) and isinstance(text_inputs[0], (list, tuple, dict)):
265 # We have one or more prompts in list-of-dicts format, so this is chat mode
266 if isinstance(text_inputs[0], dict):
--> 267 return super().__call__(Chat(text_inputs), **kwargs)
268 else:
269 chats = [Chat(chat) for chat in text_inputs] # 🐈 🐈 🐈
File python3.8/site-packages/transformers/pipelines/base.py:1268, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1260 return next(
1261 iter(
1262 self.get_iterator(
(...)
1265 )
1266 )
1267 else:
-> 1268 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
File python3.8/site-packages/transformers/pipelines/base.py:1275, in Pipeline.run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
1273 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
1274 model_inputs = self.preprocess(inputs, **preprocess_params)
-> 1275 model_outputs = self.forward(model_inputs, **forward_params)
1276 outputs = self.postprocess(model_outputs, **postprocess_params)
1277 return outputs
File python3.8/site-packages/transformers/pipelines/base.py:1175, in Pipeline.forward(self, model_inputs, **forward_params)
1173 with inference_context():
1174 model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
-> 1175 model_outputs = self._forward(model_inputs, **forward_params)
1176 model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
1177 else:
File python3.8/site-packages/transformers/pipelines/text_generation.py:370, in TextGenerationPipeline._forward(self, model_inputs, **generate_kwargs)
367 if "generation_config" not in generate_kwargs:
368 generate_kwargs["generation_config"] = self.generation_config
--> 370 generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
371 out_b = generated_sequence.shape[0]
372 if self.framework == "pt":
File python3.8/site-packages/torch/autograd/grad_mode.py:28, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
25 @functools.wraps(func)
26 def decorate_context(*args, **kwargs):
27 with self.__class__():
---> 28 return func(*args, **kwargs)
File python3.8/site-packages/transformers/generation/utils.py:1921, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1915 if (
1916 inputs_tensor.shape[1] != input_ids_length
1917 and model_input_name == "inputs_embeds"
1918 and not self.config.is_encoder_decoder
1919 ):
1920 max_cache_length += inputs_tensor.shape[1]
-> 1921 self._prepare_cache_for_generation(
1922 generation_config, model_kwargs, assistant_model, batch_size, max_cache_length, device
1923 )
1925 # 8. determine generation mode
1926 generation_mode = generation_config.get_generation_mode(assistant_model)
File python3.8/site-packages/transformers/generation/utils.py:1566, in GenerationMixin._prepare_cache_for_generation(self, generation_config, model_kwargs, assistant_model, batch_size, max_cache_length, device)
1561 if generation_config.cache_implementation == "static" and not self._supports_static_cache:
1562 raise ValueError(
1563 "This model does not support `cache_implementation='static'`. Please check the following "
1564 "issue: https://github.com/huggingface/transformers/issues/28981"
1565 )
-> 1566 model_kwargs[cache_name] = self._get_cache(
1567 cache_implementation=generation_config.cache_implementation,
1568 batch_size=max(generation_config.num_beams, generation_config.num_return_sequences) * batch_size,
1569 max_cache_len=max_cache_length,
1570 device=device,
1571 model_kwargs=model_kwargs,
1572 )
1573 elif generation_config.cache_implementation == "quantized":
1574 if not self._supports_quantized_cache:
File python3.8/site-packages/transformers/generation/utils.py:1476, in GenerationMixin._get_cache(self, cache_implementation, batch_size, max_cache_len, device, model_kwargs)
1466 layer_device_map = get_layer_device_map(execution_device_map)
1468 cache_kwargs = {
1469 "config": self.config.get_text_config(),
1470 "max_batch_size": batch_size,
(...)
1474 "layer_device_map": layer_device_map,
1475 }
-> 1476 self._cache = cache_cls(**cache_kwargs)
1477 if requires_cross_attention_cache:
1478 encoder_kwargs = cache_kwargs.copy()
TypeError: __init__() got an unexpected keyword argument 'layer_device_map'
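Reading the last frame: _get_cache in v4.45.2 builds cache_kwargs that include a layer_device_map entry and forwards them all to the cache class selected for cache_implementation="offloaded_static" (presumably OffloadedStaticCache), whose __init__ apparently does not accept that keyword. A minimal, self-contained sketch of that kind of mismatch (the class below is illustrative only, not the real transformers cache class):
class OffloadedStaticCacheLike:
    # Stand-in whose signature lacks layer_device_map, mirroring the mismatch above
    def __init__(self, config=None, max_batch_size=None, max_cache_len=None, device=None, dtype=None):
        pass

cache_kwargs = {
    "config": None,
    "max_batch_size": 1,
    "max_cache_len": 16,
    "device": "cuda:2",
    "dtype": None,
    "layer_device_map": None,  # the extra kwarg added by _get_cache
}

try:
    OffloadedStaticCacheLike(**cache_kwargs)
except TypeError as e:
    print(e)  # __init__() got an unexpected keyword argument 'layer_device_map'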
Code:
from transformers import pipeline
import torch

cuda_dev_id = 2
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipe = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.float16},  # bfloat16 breaks on torch 1.10.1
    device=f"cuda:{cuda_dev_id}",
)

role = """
You are an AI assistant REDACTED.
"""

# example_id, example_cid, example_s and example_c are placeholders for the redacted task inputs
prompt = (
    f"Here is the id: \n{example_id}\n\n"
    f"Here is the cid: \n{example_cid}\n\n"
    f"Here is the s: \n{example_s}\n\n"
    f"Here is the c: \n{example_c}"
)

messages = [
    {"role": "system", "content": role},
    {"role": "user", "content": prompt},
]

terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

outputs = pipe(
    messages,
    max_new_tokens=3000,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    cache_implementation="offloaded_static",
)
assistant_response = outputs[0]["generated_text"][-1]["content"]
print(assistant_response)
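For what it's worth, the error only appears when the offloaded static cache is requested, so the sketch below (an assumption on my side, not a confirmed fix) simply falls back to the default dynamic cache by dropping the cache_implementation argument.
# Temporary workaround sketch: without cache_implementation the default dynamic
# cache is used and the layer_device_map keyword is never passed to a cache class.
outputs = pipe(
    messages,
    max_new_tokens=3000,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
assistant_response = outputs[0]["generated_text"][-1]["content"]
print(assistant_response)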
Expected behavior
assistant_response should hold the generated response from the Llama 3 model; instead, the pipeline raises the TypeError shown above.