Commit a78436a
LLM BKC: Always set config.use_cache to true (#3657)
Co-authored-by: Chunyuan WU <[email protected]>
Parent: c2ccab2

2 files changed (+4, -2 lines)

examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py (+2, -2)

@@ -405,6 +405,8 @@ def get_checkpoint_files(model_name_or_path):
     kv_cache_dtype = torch.float8_e5m2
 config.kv_cache_dtype = kv_cache_dtype
 
+config.use_cache = True  # For inference, it should always be True
+
 # For DeepSeek models
 if not args.ipex_weight_only_quantization and args.ipex and args.dtype == "bfloat16":
     config.use_fused_moe = True
@@ -419,8 +421,6 @@ def get_checkpoint_files(model_name_or_path):
     config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens)
 if model_type == "whisper":
     config.text_max_length = config.max_source_positions + config.max_target_positions
-if model_type == "llava":
-    config.use_cache = True
 if model_type == "jamba":
     config.use_mamba_kernels = False
 if not hasattr(config, "lm_head_generation"):
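
Context for the change above: with config.use_cache = True, each generated token's key/value tensors are cached and reused, so decoding attends over the cached prefix instead of recomputing it at every step. Previously use_cache was forced on only for llava; this commit sets it unconditionally before model creation. A minimal sketch of the pattern (not from this repo, assuming a standard Hugging Face AutoConfig workflow and using gpt2 as a hypothetical stand-in model):

    # Sketch: force the KV cache on via the config before building the model,
    # mirroring the one-line guard added in this commit.
    import torch
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    model_id = "gpt2"  # hypothetical stand-in for the benchmarked model
    config = AutoConfig.from_pretrained(model_id)
    config.use_cache = True  # For inference, it should always be True

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, config=config).eval()

    inputs = tokenizer("The KV cache makes decoding", return_tensors="pt")
    with torch.inference_mode():
        out = model.generate(**inputs, max_new_tokens=16)
    print(tokenizer.decode(out[0], skip_special_tokens=True))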

examples/cpu/llm/inference/single_instance/run_quantization.py (+2, -0)

@@ -344,6 +344,8 @@
     args.config_file, torchscript=True, trust_remote_code=True
 )
 
+config.use_cache = True  # For inference, it should always be True
+
 # For DeepSeek models
 if args.ipex_weight_only_quantization and args.weight_dtype == "INT8":
     config.use_fused_moe = True
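
The same one-line guard, applied in the quantization path. It matters because a checkpoint's config.json can ship use_cache=False, which silently disables the KV cache and makes decoding recompute attention over the full prefix on every step. A rough way to observe the effect (a sketch, again assuming gpt2 as a stand-in; generate() accepts use_cache as a generation kwarg):

    # Rough latency comparison with and without the KV cache. Absolute numbers
    # depend on hardware, but the no-cache path scales much worse with length.
    import time
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2").eval()
    inputs = tok("Benchmarking the key-value cache:", return_tensors="pt")

    for use_cache in (True, False):
        start = time.perf_counter()
        with torch.inference_mode():
            model.generate(**inputs, max_new_tokens=64, use_cache=use_cache)
        print(f"use_cache={use_cache}: {time.perf_counter() - start:.2f}s")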
