Commit a78436a
LLM BKC: Always set config.use_cache to true (#3657)
Co-authored-by: Chunyuan WU <[email protected]>
Parent: c2ccab2

2 files changed (+4, -2 lines)

examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py (+2, -2)

@@ -405,6 +405,8 @@ def get_checkpoint_files(model_name_or_path):
     kv_cache_dtype = torch.float8_e5m2
 config.kv_cache_dtype = kv_cache_dtype
 
+config.use_cache = True  # For inference, it should always be True
+
 # For DeepSeek models
 if not args.ipex_weight_only_quantization and args.ipex and args.dtype == "bfloat16":
     config.use_fused_moe = True
@@ -419,8 +421,6 @@ def get_checkpoint_files(model_name_or_path):
     config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens)
 if model_type == "whisper":
     config.text_max_length = config.max_source_positions + config.max_target_positions
-if model_type == "llava":
-    config.use_cache = True
 if model_type == "jamba":
     config.use_mamba_kernels = False
 if not hasattr(config, "lm_head_generation"):
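
Context for the change above: with config.use_cache = True, each generated token's key/value tensors are cached and reused, so decoding attends over the cached prefix instead of recomputing it at every step. Previously use_cache was forced on only for llava; this commit sets it unconditionally before model creation. A minimal sketch of the pattern (not from this repo, assuming a standard Hugging Face AutoConfig workflow and using gpt2 as a hypothetical stand-in model):

    # Sketch: force the KV cache on via the config before building the model,
    # mirroring the one-line guard added in this commit.
    import torch
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    model_id = "gpt2"  # hypothetical stand-in for the benchmarked model
    config = AutoConfig.from_pretrained(model_id)
    config.use_cache = True  # For inference, it should always be True

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, config=config).eval()

    inputs = tokenizer("The KV cache makes decoding", return_tensors="pt")
    with torch.inference_mode():
        out = model.generate(**inputs, max_new_tokens=16)
    print(tokenizer.decode(out[0], skip_special_tokens=True))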

examples/cpu/llm/inference/single_instance/run_quantization.py (+2, -0)

@@ -344,6 +344,8 @@
     args.config_file, torchscript=True, trust_remote_code=True
 )
 
+config.use_cache = True  # For inference, it should always be True
+
 # For DeepSeek models
 if args.ipex_weight_only_quantization and args.weight_dtype == "INT8":
     config.use_fused_moe = True
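
The same one-line guard, applied in the quantization path. It matters because a checkpoint's config.json can ship use_cache=False, which silently disables the KV cache and makes decoding recompute attention over the full prefix on every step. A rough way to observe the effect (a sketch, again assuming gpt2 as a stand-in; generate() accepts use_cache as a generation kwarg):

    # Rough latency comparison with and without the KV cache. Absolute numbers
    # depend on hardware, but the no-cache path scales much worse with length.
    import time
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2").eval()
    inputs = tok("Benchmarking the key-value cache:", return_tensors="pt")

    for use_cache in (True, False):
        start = time.perf_counter()
        with torch.inference_mode():
            model.generate(**inputs, max_new_tokens=64, use_cache=use_cache)
        print(f"use_cache={use_cache}: {time.perf_counter() - start:.2f}s")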
