Skip to content
This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 346211c

Browse files
zhentaoyu, VincyZhang, and changwangss
authored
[Transformers] Support loading models from HF Hub when using Neural Speed (#1449)
Co-authored-by: Wenxin Zhang <[email protected]> Co-authored-by: changwangss <[email protected]>
1 parent 02a6984 commit 346211c

File tree

12 files changed

+66
-34
lines changed

12 files changed

+66
-34
lines changed

examples/.config/pytorch_optimize.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1580,7 +1580,8 @@
15801580
"params": {
15811581
"topology": "mistral_7b_autoround",
15821582
"task": "generation",
1583-
"output_model": "saved_results"
1583+
"output_model": "saved_results",
1584+
"weight_dtype": "int4_clip"
15841585
}
15851586
},
15861587
"benchmark": {
@@ -1590,11 +1591,10 @@
15901591
"task": "generation",
15911592
"backend": "neuralspeed",
15921593
"mode": "benchmark",
1593-
"batch_size": "112",
1594+
"batch_size": "10",
15941595
"iters": "100",
15951596
"int8": "false",
1596-
"config": "saved_results",
1597-
"weight_dtype": "int4_clip"
1597+
"config": "saved_results"
15981598
}
15991599
}
16001600
},
@@ -1616,7 +1616,7 @@
16161616
"task": "generation",
16171617
"mode": "benchmark",
16181618
"backend": "neuralspeed",
1619-
"batch_size": "112",
1619+
"batch_size": "10",
16201620
"iters": "100",
16211621
"int8": "false",
16221622
"config": "saved_results"
@@ -1642,7 +1642,7 @@
16421642
"task": "generation",
16431643
"backend": "neuralspeed",
16441644
"mode": "benchmark",
1645-
"batch_size": "112",
1645+
"batch_size": "10",
16461646
"iters": "100",
16471647
"int8": "false",
16481648
"config": "saved_results"
@@ -1732,7 +1732,7 @@
17321732
"task": "generation",
17331733
"backend": "neuralspeed",
17341734
"mode": "benchmark",
1735-
"batch_size": "112",
1735+
"batch_size": "10",
17361736
"iters": "100",
17371737
"int8": "false",
17381738
"config": "saved_results",
@@ -1750,7 +1750,7 @@
17501750
"task": "generation",
17511751
"mode": "benchmark",
17521752
"backend": "neuralspeed",
1753-
"batch_size": "112",
1753+
"batch_size": "10",
17541754
"iters": "100",
17551755
"int8": "false",
17561756
"config": "saved_results",

examples/huggingface/neural_speed/perplexity/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@ tiktoken
1313
py-cpuinfo
1414
cmake
1515
gguf
16-
neural-speed==1.0a0
16+
neural-speed

examples/huggingface/neural_speed/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
intel_extension_for_transformers
2-
neural-speed==1.0a0
2+
neural-speed
33
git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
44
sentencepiece
55
gguf

examples/huggingface/neural_speed/run_accuracy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,15 @@
1919
parser = argparse.ArgumentParser(description="Evaluate diff for a model")
2020
parser.add_argument('--model_name', type=str, default="~/Llama-2-7b-chat-hf", help="path to model")
2121
parser.add_argument('--tasks', type=str, default="lambada_openai")
22-
parser.add_argument('--model_format', type=str, default="runtime")
22+
parser.add_argument('--model_format', type=str, default="neural_speed")
2323
parser.add_argument('--use_gptq', action='store_true')
2424
parser.add_argument('--batch_size', type=int, default=1)
2525
args = parser.parse_args()
2626
print(args)
2727
model_args=f'pretrained="{args.model_name}",dtype=float32,trust_remote_code=True'
2828
if args.use_gptq:
2929
model_args += ",use_gptq=True"
30-
if args.model_format == "runtime":
30+
if args.model_format == "neural_speed":
3131
results = evaluate(
3232
model="hf-causal",
3333
model_args=model_args,

examples/huggingface/pytorch/text-generation/quantization/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ bitsandbytes #baichuan
1313
transformers_stream_generator
1414
tiktoken #qwen
1515
einops #qwen
16-
neural-speed
16+
git+https://github.com/intel/neural-speed[email protected]
1717
auto-round
1818
git+https://github.com/intel/neural-compressor.git
1919
git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
20+
huggingface_hub

examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ function run_benchmark {
163163
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
164164
elif [ "${topology}" = "mistral_7b_rtn" ] && [ "$model_source" != "huggingface" ]; then
165165
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
166+
elif [ "${topology}" = "mistral_7b_gptq" ] && [ "$model_source" != "huggingface" ]; then
167+
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
166168
fi
167169

168170
if [[ ${int8} == "true" ]]; then

examples/huggingface/pytorch/text-generation/quantization/run_generation.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,10 @@
250250
args.model = args.peft_model_id if args.peft_model_id is not None else args.model
251251

252252
# Generation
253-
generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
253+
if args.use_neural_speed:
254+
generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=1)
255+
else:
256+
generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
254257

255258
# mp/sq/woq/bitsandbytes config setting
256259
quantization_config = None
@@ -478,10 +481,9 @@
478481

479482
if args.benchmark:
480483
user_model = (
481-
user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model
484+
user_model.eval() if (not (args.int8 or args.int8_bf16_mixed) and hasattr(user_model, "eval")) else user_model
482485
)
483486
prompt = "Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun."
484-
485487
input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
486488
print("---- Prompt size:", input_size)
487489

@@ -521,7 +523,7 @@
521523
toc = time.time()
522524
# please check the gen_ids if include input_ids.
523525
input_tokens_num = input_ids.numel()
524-
output_tokens_num = gen_ids.numel() - input_tokens_num
526+
output_tokens_num = torch.tensor(gen_ids).numel() - input_tokens_num
525527
print(gen_text, flush=True)
526528
if i >= num_warmup:
527529
total_time += toc - tic
@@ -534,18 +536,30 @@
534536
print("Throughput: {} samples/sec".format(throughput))
535537

536538
if args.accuracy:
537-
user_model = (user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model)
539+
user_model = (user_model.eval() if (not (args.int8 or args.int8_bf16_mixed) and hasattr(user_model, "eval")) \
540+
else user_model)
538541
args.model = (peft_config.base_model_name_or_path if args.peft_model_id else args.model)
539542
from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
540-
543+
pretrained = ',pretrained=' + args.model
541544
args._commit_hash = "main" if args._commit_hash is None else args._commit_hash
545+
eval_args = "tokenizer=" + args.model + ",dtype=float32" + ",_commit_hash=" + \
546+
args._commit_hash + ",trust_remote_code=" + str(args.trust_remote_code)
547+
if args.use_neural_speed:
548+
eval_args += pretrained
549+
q_conf = user_model.config.quantization_config
550+
if isinstance(q_conf, dict):
551+
q_algo = q_conf.get("quant_method", None)
552+
else:
553+
q_algo = q_conf.quant_method.value
554+
if q_algo.upper() in ["AWQ", "GPTQ", "AUTOROUND"]:
555+
eval_args += ",use_gptq=True"
542556
results = evaluate(
543557
model="hf-causal",
544-
model_args="pretrained=" + args.model + ",tokenizer=" + args.model + ",dtype=float32" + ",_commit_hash=" + args._commit_hash +
545-
",trust_remote_code=" + str(args.trust_remote_code),
558+
model_args=eval_args,
546559
user_model=user_model,
547560
batch_size=args.batch_size,
548561
tasks=args.tasks,
562+
model_format="neural_speed" if args.use_neural_speed else "torch",
549563
)
550564
dumped = json.dumps(results, indent=2)
551565
if args.save_accuracy_path:

examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@
323323

324324
results = evaluate(
325325
model="hf-causal",
326-
model_args='pretrained=' + args.model + ',tokenizer=' + args.model + \
326+
model_args='tokenizer=' + args.model + \
327327
',dtype=float32,trust_remote_code=' + str(args.trust_remote_code),
328328
user_model=user_model,
329329
batch_size=args.batch_size,

examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ function run_tuning {
220220
extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}"
221221
elif [ "${topology}" = "mistral_7b_rtn" ]; then
222222
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
223-
extra_cmd=$extra_cmd" --woq --bits 4 -compute_dtype fp32 --scheme asym "
223+
extra_cmd=$extra_cmd" --woq --bits 4 --compute_dtype fp32 --scheme asym "
224224
extra_cmd=$extra_cmd" --woq_algo "Rtn" --desc_act --blocksize 128 --max_input_length 2048 "
225225
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
226226
extra_cmd=$extra_cmd" --trust_remote_code"

intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ def evaluate(model,
124124
}
125125
if user_model:
126126
kwargs["init_empty_weights"] = True
127+
if "pretrained" not in model_args:
128+
model_args = "pretrained='Muennighoff/tiny-random-bert'," + model_args
127129

128130
if device == "hpu":
129131
# if hpu, set user_model

0 commit comments

Comments
 (0)