
Commit 650bc67

zucchini-nlp authored and Cyrilvallez committed
[smolvlm] fix video inference (#39147)
* fix smolvlm
* better do as before, set sampling params in overwritten `apply_chat_template`
* style
* update with `setdefault`
1 parent 3c5f910 · commit 650bc67

File tree

src/transformers/models/smolvlm/processing_smolvlm.py
tests/models/smolvlm/test_modeling_smolvlm.py

2 files changed: +40 −12 lines

src/transformers/models/smolvlm/processing_smolvlm.py

Lines changed: 4 additions & 0 deletions
@@ -434,6 +434,10 @@ def apply_chat_template(
         if chat_template is None and has_video:
             # re-assign to the correct default template for BC, if user is not requesting their own template
             chat_template = DEFAULT_CHAT_TEMPLATE
+
+        kwargs.setdefault("num_frames", self.video_processor.num_frames)
+        kwargs.setdefault("fps", self.video_processor.fps)
+
         return super().apply_chat_template(conversation, chat_template, **kwargs)
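The fix relies on `dict.setdefault`, which fills a key only when the caller has not already supplied it, so a user-passed `num_frames` or `fps` still overrides the video processor's defaults. A minimal self-contained sketch of that pattern (the default values here are illustrative, not SmolVLM's real ones):

# Minimal sketch of the setdefault pattern above; defaults are illustrative.
def fill_video_sampling_defaults(default_num_frames=64, default_fps=1, **kwargs):
    kwargs.setdefault("num_frames", default_num_frames)  # set only if the caller omitted it
    kwargs.setdefault("fps", default_fps)                # an explicit caller value wins
    return kwargs

print(fill_video_sampling_defaults())                   # {'num_frames': 64, 'fps': 1}
print(fill_video_sampling_defaults(num_frames=16))      # {'num_frames': 16, 'fps': 1}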
tests/models/smolvlm/test_modeling_smolvlm.py

Lines changed: 36 additions & 12 deletions
@@ -556,23 +556,24 @@ def setUp(self):
                 ).content
             )
         )
-        self.image2 = Image.open(
-            BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
-        )
-        self.image3 = Image.open(
-            BytesIO(
-                requests.get(
-                    "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
-                ).content
-            )
-        )
+
+        self.video_messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "path": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_1_1080p.mov",
+                    },
+                    {"type": "text", "text": "Describe this video in detail"},
+                ],
+            },
+        ]

     def tearDown(self):
         cleanup(torch_device, gc_collect=True)

     @slow
-    # TODO (Orr?) this is a dummy test to check if the model generates things that make sense.
-    # Needs to be expanded to a tiny video
     def test_integration_test(self):
         model = SmolVLMForConditionalGeneration.from_pretrained(
             "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",

@@ -591,3 +592,26 @@ def test_integration_test(self):

         expected_generated_text = "\n\n\n\nIn this image, we see a view of the Statue of Liberty and the"
         self.assertEqual(generated_texts[0], expected_generated_text)
+
+    @slow
+    def test_integration_test_video(self):
+        model = SmolVLMForConditionalGeneration.from_pretrained(
+            "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+        )
+
+        # Create inputs
+        inputs = self.processor.apply_chat_template(
+            self.video_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(device=torch_device, dtype=torch.bfloat16)
+
+        generated_ids = model.generate(**inputs, max_new_tokens=20)
+        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        expected_generated_text = 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature'  # fmt: skip
+        self.assertEqual(generated_texts[0], expected_generated_text)
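Outside the test suite, the new test's flow doubles as a standalone usage example. The sketch below reuses the checkpoint, message format, and generation settings from the diff above; it assumes network access for the video URL and an accelerator that supports bfloat16:

import torch
from transformers import AutoProcessor, SmolVLMForConditionalGeneration

model_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = SmolVLMForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "path": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_1_1080p.mov",
            },
            {"type": "text", "text": "Describe this video in detail"},
        ],
    },
]

# After this commit, num_frames and fps no longer have to be passed here:
# the overridden apply_chat_template fills them from the video processor.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=20)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])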
