17 | 17 | from transformers import AutoProcessor |
18 | 18 | from optimum.intel.openvino import OVModelForVisualCausalLM |
19 | 19 | from transformers import LlavaNextVideoProcessor |
20 | | -from huggingface_hub import login |
21 | 20 | from optimum.intel import OVWeightQuantizationConfig, OVPipelineQuantizationConfig |
22 | 21 | from PIL import Image |
23 | 22 | import tempfile |
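For context, a minimal sketch of how these `optimum-intel` imports are typically combined to export a vision-language checkpoint to OpenVINO IR with weight compression; the model ID and 4-bit settings below are illustrative assumptions, not taken from this PR:

```python
# Illustrative sketch only: export a HF vision-language model to OpenVINO
# IR with 4-bit weight compression. Model ID and settings are assumptions.
from optimum.intel import OVWeightQuantizationConfig
from optimum.intel.openvino import OVModelForVisualCausalLM
from transformers import AutoProcessor

model_id = "llava-hf/llava-v1.6-mistral-7b-hf"      # assumed example checkpoint
quant_config = OVWeightQuantizationConfig(bits=4)   # INT4 weight compression

model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=True,                      # convert from PyTorch on first load
    quantization_config=quant_config,
    device="AUTO",                    # let OpenVINO choose the device
)
processor = AutoProcessor.from_pretrained(model_id)
```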
@@ -309,8 +308,16 @@ def run(video_path: str, model_name: str, flip: bool = True, video_input: bool = |
309 | 308 | device_mapping = utils.available_devices(exclude=["NPU"]) |
310 | 309 | device_type = "AUTO" |
311 | 310 |
| 311 | + # Download and convert the image and video models |
312 | 312 | vision_model, text_decoder, processor = load_models(model_name, device_type) |
313 | 313 |
| 314 | + # For video captioning |
| 315 | + model_name_video = "llava-hf/LLaVA-NeXT-Video-7B-hf" |
| 316 | + device_type_video = "AUTO" |
| 317 | + |
| 318 | + # Load video input model and processor |
| 319 | + model_video, processor_video = load_llava_video_models(model_name_video, device_type_video) |
| 320 | + |
314 | 321 | # initialize video player to deliver frames |
315 | 322 | if isinstance(video_path, str) and video_path.isnumeric(): |
316 | 323 | video_path = int(video_path) |
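`load_llava_video_models` itself is not shown in this hunk; a plausible sketch of what it could look like, assuming it mirrors `load_models` by pairing `OVModelForVisualCausalLM` with `LlavaNextVideoProcessor`:

```python
# Hypothetical sketch of load_llava_video_models; the real helper in this
# PR may differ. Assumes an OpenVINO export of the LLaVA-NeXT-Video model.
from optimum.intel.openvino import OVModelForVisualCausalLM
from transformers import LlavaNextVideoProcessor

def load_llava_video_models(model_name: str, device_type: str):
    """Load the video-captioning model and its processor (sketch)."""
    model = OVModelForVisualCausalLM.from_pretrained(
        model_name,
        export=True,         # convert to OpenVINO IR if no cache exists
        device=device_type,  # e.g. "AUTO"
    )
    processor = LlavaNextVideoProcessor.from_pretrained(model_name)
    return model, processor
```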
@@ -461,24 +468,14 @@ def run(video_path: str, model_name: str, flip: bool = True, video_input: bool = |
461 | 468 | caption = "Switching to image_input mode..." |
462 | 469 | else: |
463 | 470 | print("Switching to video_input mode...") |
464 | | - model_name = "llava-hf/LLaVA-NeXT-Video-7B-hf" |
465 | | - device_type = "AUTO" |
466 | | - |
467 | | - # Stop current worker |
468 | | - global_stop_event.set() |
469 | | - worker.join(timeout=1) |
470 | | - global_stop_event.clear() |
471 | | - |
472 | | - # Load video input model and processor |
473 | | - model, processor = load_llava_video_models(model_name, device_type) |
474 | 471 |
475 | 472 | # Start new inference worker with video_input=True |
476 | 473 | worker = threading.Thread( |
477 | 474 | target=inference_worker, |
478 | 475 | kwargs={ |
479 | 476 | "video_input": True, |
480 | | - "model": model, |
481 | | - "processor": processor, |
| 477 | + "model": model_video, |
| 478 | + "processor": processor_video, |
482 | 479 | "vision_model": None, |
483 | 480 | "text_decoder": None |
484 | 481 | }, |
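The net effect of this hunk: mode switching no longer stops the worker to download and convert weights on the fly; both model pairs are loaded once at startup, and the switch simply starts a worker bound to the preloaded video pair. A minimal sketch of that pattern, with a stub standing in for the real `inference_worker` defined elsewhere in this file:

```python
# Sketch of the preload-then-switch pattern after this change. The worker
# kwargs mirror this diff; the stub below stands in for the real worker.
import threading

def inference_worker(**kwargs):
    """Stub; the actual inference loop lives elsewhere in main.py."""

def start_video_worker(model_video, processor_video):
    worker = threading.Thread(
        target=inference_worker,
        kwargs={
            "video_input": True,
            "model": model_video,          # preloaded at startup
            "processor": processor_video,
            "vision_model": None,          # image-path models unused here
            "text_decoder": None,
        },
        daemon=True,
    )
    worker.start()
    return worker
```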