This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 6eceb08

Use transformers tokenizer and streamer for python api (#388)

1 parent 9807821 · commit 6eceb08

10 files changed: +137 additions, −77 deletions

README.md
Lines changed: 22 additions & 6 deletions

@@ -56,20 +56,36 @@ Below are the sample code to enable weight-only low precision inference. See mor
 
 ### INT4 Inference
 ```python
+from transformers import AutoTokenizer
 from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig
+
+model_name = "EleutherAI/gpt-j-6B"
+config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
 prompt = "Once upon a time, a little girl"
-config = WeightOnlyQuantConfig(compute_dtype="int8")
-model = AutoModel.from_pretrained("Intel/neural-chat-7b-v1-1", quantization_config=config)
-print(model.generate(prompt, max_new_tokens=30))
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+
+model = AutoModel.from_pretrained(model_name, quantization_config=config)
+gen_tokens = model.generate(inputs, max_new_tokens=300)
+gen_text = tokenizer.batch_decode(gen_tokens)
 ```
 
 ### INT8 Inference
 ```python
+from transformers import AutoTokenizer
 from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig
-prompt = "Once upon a time, a little girl"
+
+model_name = "EleutherAI/gpt-j-6B"
 config = WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8")
-model = AutoModel.from_pretrained("Intel/neural-chat-7b-v1-1", quantization_config=config)
-print(model.generate(prompt, max_new_tokens=30))
+prompt = "Once upon a time, a little girl"
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+
+model = AutoModel.from_pretrained(model_name, quantization_config=config)
+gen_tokens = model.generate(inputs, max_new_tokens=300)
+gen_text = tokenizer.batch_decode(gen_tokens)
 ```
 
 ## 🎯Validated Models
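The updated README snippets stop at `gen_text = tokenizer.batch_decode(gen_tokens)`. Because `generate` now returns token ids (prompt plus newly generated tokens) rather than text, a short continuation showing how the decoded output is typically printed may help; `skip_special_tokens` is standard `transformers` usage and is not part of this commit:

```python
# Continuation of the README snippet above; assumes `gen_tokens` and `tokenizer` are defined.
# batch_decode returns one string per sequence, so index 0 is the single prompt we passed in.
gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
print(gen_text[0])
```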

intel_extension_for_transformers/llm/runtime/graph/README.md
Lines changed: 9 additions & 2 deletions

@@ -65,12 +65,19 @@ cmake --build . -j
 
 You can use Python API to run Hugging Face model simply. Here is the sample code:
 ```python
+from transformers import AutoTokenizer, TextStreamer
 from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig
 model_name = "Intel/neural-chat-7b-v1-1"  # Hugging Face model_id or local model
 woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
-model = AutoModel.from_pretrained(model_name, quantization_config=woq_config)
 prompt = "Once upon a time, a little girl"
-output = model.generate(prompt, max_new_tokens=30)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+streamer = TextStreamer(tokenizer)
+
+model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
+outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
+
 ```
 
 ### 3. Run LLM with Python Script
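In this snippet the `TextStreamer` prints decoded text to stdout while tokens are being produced; the value returned by `generate` still holds the full token-id sequences, so the text can also be recovered afterwards. A small follow-up sketch, assuming the variables defined in the README code above; note that the returned ids include the prompt, so it is sliced off before decoding:

```python
# `outputs[0]` holds prompt ids followed by generated ids; drop the prompt portion and decode
# only the newly generated tokens (the streamer has already printed the text incrementally).
new_tokens = outputs[0][inputs.shape[-1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))
```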

intel_extension_for_transformers/llm/runtime/graph/__init__.py
Lines changed: 17 additions & 6 deletions

@@ -17,7 +17,7 @@
 import os
 from transformers import AutoConfig
 from intel_extension_for_transformers.llm.runtime.graph.scripts.convert import convert_model
-
+import torch
 model_maps = {"gpt_neox": "gptneox", "RefinedWebModel": "falcon"}
 
 class Model:
@@ -66,10 +66,12 @@ def init(self, model_name, **kwargs):
         # 1. convert model
         fp32_bin = "ne_{}_f32.bin".format(model_type)
         convert_model(model_name, fp32_bin, "f32")
+        assert(os.path.exists(fp32_bin), "Fail to convert pytorch model")
 
         # 2. quant model
         quant_bin = "ne_{}_q.bin".format(model_type)
         self.module.Model.quant_model(model_path = fp32_bin, out_path = quant_bin, **kwargs)
+        assert(os.path.exists(quant_bin), "Fail to quantize model")
 
         self.model_type = model_type
         self.bin_file = quant_bin
@@ -88,13 +90,22 @@ def quant_model(self, model_name, model_path, out_path, **kwargs):
         self.module.Model.quant_model(model_path = model_path,
                                       out_path = out_path, **kwargs)
 
-    def generate(self, prompt, streamer = None, sentence_mode = True, **kwargs):
-        # TODO support streamer
+    def generate(self, input_ids, streamer = None, **kwargs):
         if self.model is None:
             self.init_from_bin(self.model_type, self.bin_file, **kwargs)
-
-        out = self.model.generate(prompt = prompt, sentence_mode = sentence_mode)
-        return out
+        # TODO support multi batch
+        assert(input_ids.shape[0] == 1, "Unsupport multi-batch input ids.")
+        if streamer:
+            ret = input_ids.tolist()
+            while not self.is_token_end():
+                out = self.model.generate(input_ids = input_ids.tolist()[0])
+                streamer.put(torch.tensor([out]))
+                ret[0].extend(out)
+            return ret
+        else:
+            ret = input_ids.tolist()
+            ret[0].extend(self.model.generate_tokens(input_ids = input_ids.tolist()[0]))
+            return ret
 
     def is_token_end(self):
         return self.model.is_token_end()
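The streamer branch above relies on the standard `transformers` streamer protocol: each `put()` call receives a tensor of token ids to decode and display, and `end()` flushes the remaining text. A minimal illustration of that protocol in isolation (the model id and the token id below are arbitrary examples, not taken from this commit):

```python
import torch
from transformers import AutoTokenizer, TextStreamer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
streamer = TextStreamer(tokenizer, skip_prompt=True)

prompt_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
streamer.put(prompt_ids)             # first call carries the prompt; skipped because skip_prompt=True
streamer.put(torch.tensor([[262]]))  # later calls carry newly generated ids; decoded and printed
streamer.end()                       # flush whatever text is still buffered
```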

intel_extension_for_transformers/llm/runtime/graph/application/main_pybind.cpp
Lines changed: 18 additions & 32 deletions

@@ -29,6 +29,7 @@
 #include <unordered_map>
 #include <utility>
 #include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
 #include "common.h"
 #include "models/model_utils/model_types.h"
 #include "models/model_utils/model_config.h"
@@ -57,7 +58,8 @@ class Model {
   void init_model(const std::string& model_path, int n_predict, int batch_size, int ctx_size, int seed, int threads,
                   float repeat_penalty, const std::string& post_process);
   void reinit();
-  std::string generate(const std::string& prompt, bool sentence_mode = true);
+  std::vector<int> generate(const std::vector<int>& input_ids);
+  std::vector<int> generate_tokens(const std::vector<int>& input_ids);
   bool is_token_end() { return token_eos; }
   static int quant_model(const std::string& model_path, const std::string& out_path, const std::string& weight_dtype,
                          const std::string& alg, int group_size, const std::string& scale_dtype,
@@ -73,8 +75,6 @@ class Model {
   std::vector<model_token> last_n_tokens;
   bool token_eos = false;
 
-  std::string generate_one_token(const std::string& prompt);
-  std::string generate_tokens(const std::string& prompt);
   int post_process(float* logits);
 };
 
@@ -108,10 +108,9 @@ void Model::reinit() {
   curr_input_ids.clear();
 }
 
-std::string Model::generate_one_token(const std::string& prompt) {
+std::vector<int> Model::generate(const std::vector<int>& input_ids) {
   if (curr_input_ids.empty()) {
-    auto embd_inp = ::model_tokenize(ctx, prompt, false);
-    curr_input_ids = embd_inp;
+    curr_input_ids = input_ids;
   }
   for (auto item : curr_input_ids) {
     last_n_tokens.erase(last_n_tokens.begin());
@@ -124,7 +123,7 @@ std::string Model::generate_one_token(const std::string& prompt) {
   int next_token_id = post_process(logits);
   curr_input_ids = {next_token_id};
 
-  if (next_token_id == ctx->vocab.eos_token_id || n_past - prompt.size() == params.n_predict) {
+  if (next_token_id == ctx->vocab.eos_token_id || n_past - input_ids.size() == params.n_predict) {
     token_eos = true;
   }
 
@@ -133,20 +132,17 @@ std::string Model::generate_one_token(const std::string& prompt) {
     token_eos = true;
   }
 
-  return next_token;
+  return {next_token_id};
 }
 
-std::string Model::generate_tokens(const std::string& prompt) {
-  int n_past = 0;
+std::vector<int> Model::generate_tokens(const std::vector<int>& input_ids) {
   int n_remain = params.n_predict;
-  int max_length = 512;
-  auto embd_inp = ::model_tokenize(ctx, prompt, false);
-  int n_eval = embd_inp.size();
-  std::vector<int> curr_input_ids(embd_inp);
   std::vector<int> output_ids;
-  output_ids.reserve(max_length);
-  std::string ret;
-  ret += prompt;
+
+  if (curr_input_ids.empty()) {
+    curr_input_ids = input_ids;
+  }
+
   while (output_ids.size() < n_remain) {
     for (auto item : curr_input_ids) {
       last_n_tokens.erase(last_n_tokens.begin());
@@ -158,24 +154,14 @@ std::string Model::generate_tokens(const std::string& prompt) {
     float* logits = model_get_logits(ctx);
     int next_token_id = post_process(logits);
    curr_input_ids = {next_token_id};
-
     output_ids.push_back(next_token_id);
-    ret += model_token_to_str(ctx, next_token_id);
-
-    if (next_token_id == model_token_eos()) {
+    if (next_token_id == ctx->vocab.eos_token_id || n_past - input_ids.size() == params.n_predict) {
+      token_eos = true;
      break;
    }
   }
 
-  return ret;
-}
-
-std::string Model::generate(const std::string& prompt, bool sentence_mode) {
-  if (sentence_mode) {
-    return generate_tokens(prompt);
-  }
-
-  return generate_one_token(prompt);
+  return output_ids;
 }
 
 int Model::post_process(float* logits) {
@@ -300,8 +286,8 @@ PYBIND11_MODULE(chatglm_cpp, m)
       .def("init_model", &Model::init_model, "initial model with model path and parameters", py::arg("model_path"),
           py::arg("max_new_tokens") = -1, py::arg("batch_size") = 512, py::arg("ctx_size") = 512, py::arg("seed") = -1,
           py::arg("threads") = 8, py::arg("repeat_penalty") = 1.1f, py::arg("post_process") = "topk")
-      .def("generate", &Model::generate, "Generate tokens with prompt", py::arg("prompt"),
-           py::arg("sentence_mode") = true)
+      .def("generate", &Model::generate, "Generate token with input ids", py::arg("input_ids"))
+      .def("generate_tokens", &Model::generate_tokens, "Generate tokens with input ids", py::arg("input_ids"))
       .def_static("quant_model", &Model::quant_model, "Quantize model", py::arg("model_path"), py::arg("out_path"),
                   py::arg("weight_dtype") = "int4", py::arg("alg") = "sym", py::arg("group_size") = 32,
                   py::arg("scale_dtype") = "fp32", py::arg("compute_dtype") = "ggml", py::arg("use_ggml") = false)

intel_extension_for_transformers/llm/runtime/graph/scripts/python_api_example.py
Lines changed: 11 additions & 5 deletions

@@ -15,11 +15,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from transformers import AutoTokenizer, TextStreamer
 from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig
-model_name = "mosaicml/mpt-7b"
-woq_config = WeightOnlyQuantConfig(compute_dtype="int8")
-
-model = AutoModel.from_pretrained(model_name, quantization_config=woq_config)
 
+model_name = "THUDM/chatglm2-6b"  # or local path to model
+woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
 prompt = "Once upon a time, a little girl"
-print(model.generate(prompt, max_new_tokens=30))
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+streamer = TextStreamer(tokenizer)
+
+model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
+outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
+
intel_extension_for_transformers/neural_chat/examples/talkingbot_pc/build_talkingbot_on_pc.ipynb
Lines changed: 20 additions & 8 deletions

@@ -99,13 +99,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from transformers import AutoTokenizer, TextStreamer\n",
     "from intel_extension_for_transformers.llm.runtime.graph import Model\n",
-    "model = Model()\n",
-    "model.bin_file = r\"mpt_q4_0.bin\"\n",
-    "model.init_from_bin(\"mpt\", model.bin_file, max_new_tokens=32, seed=12)\n",
+    "\n",
     "prompt = text\n",
-    "output = model.generate(prompt)\n",
-    "print(output)"
+    "tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-j-6b\", trust_remote_code=True)\n",
+    "inputs = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
+    "streamer = TextStreamer(tokenizer)\n",
+    "\n",
+    "model = Model()\n",
+    "model.init_from_bin(\"gptj\", \"ne_gptj_q.bin\", max_new_tokens=320, seed=12)\n",
+    "\n",
+    "outputs = model.generate(inputs, streamer=streamer)\n"
    ]
   },
   {
@@ -121,12 +126,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from transformers import AutoTokenizer, TextStreamer\n",
     "from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig\n",
-    "model_name = r\"THUDM/ChatGLM2-6B\"\n",
+    "\n",
+    "model_name = \"EleutherAI/gpt-j-6b\"  # or local path to model\n",
     "woq_config = WeightOnlyQuantConfig(compute_dtype=\"int8\", weight_dtype=\"int4\")\n",
-    "model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, use_llm_runtime=True, trust_remote_code=True)\n",
     "prompt = text\n",
-    "output = model.generate(prompt, max_new_tokens=32)"
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
+    "inputs = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
+    "streamer = TextStreamer(tokenizer)\n",
+    "\n",
+    "model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)\n",
+    "outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)\n"
    ]
   },
  {
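The notebook now loads `ne_gptj_q.bin` directly; that file is the quantized binary the runtime writes out during conversion (named `ne_{model_type}_q.bin` in `__init__.py` above). A sketch of producing it ahead of time with the runtime wrapper; the keyword arguments are assumptions based on the `quant_model` binding, not taken verbatim from the notebook:

```python
from intel_extension_for_transformers.llm.runtime.graph import Model

model = Model()
# init() converts the HF checkpoint to ne_gptj_f32.bin, then quantizes it to ne_gptj_q.bin;
# extra keyword arguments are forwarded to the native quant_model call.
model.init("EleutherAI/gpt-j-6b", weight_dtype="int4", compute_dtype="int8")
```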

intel_extension_for_transformers/transformers/modeling/modeling_auto.py
Lines changed: 2 additions & 1 deletion

@@ -143,7 +143,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 alg=quantization_config.scheme,
                 group_size=quantization_config.group_size,
                 scale_dtype=quantization_config.scale_dtype,
-                compute_dtype=quantization_config.compute_dtype
+                compute_dtype=quantization_config.compute_dtype,
+                use_ggml=quantization_config.use_ggml,
             )
             return model
         else:

intel_extension_for_transformers/transformers/utils/quantization_config.py
Lines changed: 4 additions & 2 deletions

@@ -39,6 +39,7 @@ def __init__(
         group_size=32,
         scheme="sym",
         algorithm="RTN",
+        use_ggml=False,
         **kwargs,
     ):
         from intel_extension_for_transformers.llm.quantization.utils import convert_dtype_2_str
@@ -57,6 +58,7 @@
         self.calib_dataset = kwargs.pop("calib_dataset", "NeelNanda/pile-10k")
         self.calib_dataloader = kwargs.pop("calib_dataloader", None)
         self.calib_iters = kwargs.pop("calib_iters", 100)
+        self.use_ggml = use_ggml
 
         if compute_dtype is None:
             self.compute_dtype = "fp32"
@@ -116,8 +118,8 @@ def post_init_runtime(self):
 
         if self.compute_dtype is None:
             self.compute_dtype = "int8"
-        elif self.compute_dtype not in ['int8', 'fp32']:
-            raise ValueError("compute_dtype must be 'int8', 'fp32'.")
+        elif self.compute_dtype not in ['int8', 'bf16', 'fp32']:
+            raise ValueError("compute_dtype must be 'int8', 'bf16', 'fp32'.")
 
         if self.weight_dtype is None:
             self.weight_dtype = "int4"
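Together with the `modeling_auto.py` change above, the new `use_ggml` flag flows from the user-facing config straight into the runtime quantization call, and `bf16` becomes an accepted runtime compute dtype. A minimal usage sketch (model name is illustrative; whether a given dtype combination is accepted still depends on the config's own validation):

```python
from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig

# Ask the runtime to emit GGML-format quantized weights and use bf16 compute.
config = WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8", use_ggml=True)
model = AutoModel.from_pretrained("EleutherAI/gpt-j-6B", quantization_config=config)
```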

setup.py
Lines changed: 3 additions & 15 deletions

@@ -57,12 +57,11 @@ def check_env_flag(name: str, default: bool = False) -> bool:
 class CMakeExtension(Extension):
     """CMakeExtension class."""
 
-    def __init__(self, name, sourcedir="", lib_only=False, compile=True):
+    def __init__(self, name, sourcedir="", lib_only=False):
         """Init a CMakeExtension object."""
         Extension.__init__(self, name, sources=[])
         self.sourcedir = os.path.abspath(sourcedir)
         self.optional = lib_only  # we only deliver shared object but not as a python extension module
-        self.compile = compile
 
 
 class CMakeBuild(build_ext):
@@ -107,8 +106,6 @@ def get_source_files(self):
         return files
 
     def build_extension(self, ext: CMakeExtension) -> None:
-        if not ext.compile:
-            return
         # Must be in this form due to bug in .resolve() only fixed in Python 3.10+
         ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
         extdir = ext_fullpath.parent.resolve()
@@ -248,18 +245,9 @@ def check_submodules():
 check_submodules()
 ext_modules.extend([
     CMakeExtension("intel_extension_for_transformers.neural_engine_py", "intel_extension_for_transformers/llm/runtime/deprecated/"),
-    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.gptj_cpp", "intel_extension_for_transformers/llm/runtime/graph/"),
-    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.falcon_cpp", "intel_extension_for_transformers/llm/runtime/graph/", compile=False),
-    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.gptneox_cpp", "intel_extension_for_transformers/llm/runtime/graph/", compile=False),
-    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.dolly_cpp", "intel_extension_for_transformers/llm/runtime/graph/", compile=False),
-    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.llama_cpp", "intel_extension_for_transformers/llm/runtime/graph/", compile=False),
-    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.mpt_cpp", "intel_extension_for_transformers/llm/runtime/graph/", compile=False),
-    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.starcoder_cpp", "intel_extension_for_transformers/llm/runtime/graph/", compile=False),
-    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.opt_cpp", "intel_extension_for_transformers/llm/runtime/graph/", compile=False),
-    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.bloom_cpp", "intel_extension_for_transformers/llm/runtime/graph/", compile=False),
-    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.chatglm2_cpp", "intel_extension_for_transformers/llm/runtime/graph/", compile=False)
+    CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.Model", "intel_extension_for_transformers/llm/runtime/graph/"),
 ])
-cmdclass={'build_ext': CMakeBuild}
+cmdclass={'build_ext': CMakeBuild}
 
 setup(
     name="intel-extension-for-transformers",
