@@ -133,69 +133,29 @@ def make_generate_function(self):
 
         self.generate_function = self.generate_step
         if keras.config.backend() == "openvino":
-            import os
-            from multiprocessing import Pipe
-            from multiprocessing import Process
-
             import openvino as ov
             import openvino.runtime.opset14 as ov_opset
-            import psutil
 
             from keras_hub.src.utils.keras_utils import print_msg
 
             def ov_infer(inputs, stop_token_ids, fn):
-                def isolated_infer(pipe, compiled_model, flat_inputs):
-                    outputs = compiled_model(flat_inputs)
-                    outputs = outputs.to_tuple()
-                    pipe.send(outputs)
-                    pipe.close()
-
                 def get_outputs(inputs, struct_outputs, compiled_ov_model):
                     flatten_inputs = tree.flatten(inputs)
-                    free_mem = psutil.virtual_memory().available / (1024**3)
-                    # On average OpenVINO needs about 2 GB to run
-                    # an inference, also it is wrapped by an env var,
-                    # to be tuned.
-                    threshold = float(
-                        os.getenv("OV_INFER_FREE_MEM_THRESHOLD", 2)
-                    )
-                    if free_mem > threshold:
-                        """Run inference in a separate process only if
-                        free memory usage is above a certain threshold.
-                        This threshold is calculated to ensure that
-                        swap memory won't be triggered. When swap is
-                        likely to be used, fallback to normal inference
-                        to avoid severe performance degradation.
-                        Running inference in a subprocess prevents OpenVINO from
-                        allocating extra memory in the main process during its
-                        internal infer request creation. This can reduce memory
-                        usage by 0.5–2 GB depending on the model size.
-                        However, using a subprocess introduces an extra
-                        overhead, increasing latency by around 1–2 seconds
-                        per inference.
-                        """
-                        parent_conn, child_conn = Pipe()
-                        p = Process(
-                            target=isolated_infer,
-                            args=(
-                                child_conn,
-                                compiled_ov_model,
-                                flatten_inputs,
-                            ),
-                        )
-                        p.start()
-                        outputs = parent_conn.recv()
-                        p.join()
-                    else:
-                        outputs = compiled_ov_model(flatten_inputs)
-                    outputs = outputs.to_tuple()
+                    outputs = compiled_ov_model(flatten_inputs).to_tuple()
                     outputs = self._unpack_singleton(
                         tree.pack_sequence_as(struct_outputs, outputs)
                     )
                     return outputs
 
+                core = ov.Core()
+                device = "GPU" if "GPU" in core.available_devices else "CPU"
+
                 # Try using the existing compiled model
-                if self.ov_compiled_model is not None:
+                if (
+                    self.ov_compiled_model is not None
+                    and getattr(self, "ov_device", None) is not None
+                    and device == self.ov_device
+                ):
                     try:
                         return get_outputs(
                             inputs, self.struct_outputs, self.ov_compiled_model
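Note on the hunk above: the new code queries `ov.Core().available_devices`, prefers a GPU plugin when OpenVINO exposes one, and otherwise falls back to CPU; the cached compiled model is reused only when the remembered `self.ov_device` matches the device chosen now. A minimal standalone sketch of that selection logic, assuming only that the `openvino` package is installed (this is illustrative, not the PR's code):

```python
import openvino as ov

# List the plugins OpenVINO can see and prefer GPU when present,
# mirroring the device selection added in the hunk above.
core = ov.Core()
device = "GPU" if "GPU" in core.available_devices else "CPU"
print(f"OpenVINO will compile for: {device}")
```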
@@ -228,10 +188,17 @@ def get_outputs(inputs, struct_outputs, compiled_ov_model):
                         ov.PartialShape([-1] * rank)
                     )
                 ov_model.validate_nodes_and_infer_types()
-                core = ov.Core()
-                device = "CPU"
-                # OpenVINO supports only compiling with 'CPU' devices.
-                self.ov_compiled_model = core.compile_model(ov_model, device)
+
+                self.ov_device = device
+                model_dtype = (
+                    "f16"
+                    if self.dtype == "float16" or self.dtype == "bfloat16"
+                    else "f32"
+                )
+                config = {"INFERENCE_PRECISION_HINT": model_dtype}
+                self.ov_compiled_model = core.compile_model(
+                    ov_model, device, config
+                )
                 return get_outputs(
                     inputs, self.struct_outputs, self.ov_compiled_model
                 )
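This second hunk replaces the CPU-only `compile_model` call with one that records the selected device and passes an `INFERENCE_PRECISION_HINT` derived from the Keras dtype (`"f16"` for float16/bfloat16, `"f32"` otherwise). A minimal sketch of compiling with that hint, assuming `openvino` is installed; the one-op toy model here is only a stand-in for the traced Keras graph:

```python
import openvino as ov
import openvino.runtime.opset14 as ov_opset

# Build a trivial model so the example is self-contained.
param = ov_opset.parameter([1, 4], dtype=ov.Type.f32, name="x")
model = ov.Model([ov_opset.relu(param)], [param], "toy")

core = ov.Core()
device = "GPU" if "GPU" in core.available_devices else "CPU"
# Ask the plugin to run in half precision, as the diff does for
# float16/bfloat16 Keras dtypes; use "f32" for full precision.
config = {"INFERENCE_PRECISION_HINT": "f16"}
compiled = core.compile_model(model, device, config)
```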