|
1 | | -# Copyright (c) Meta Platforms, Inc. and affiliates. |
2 | | -# All rights reserved. |
3 | | -# |
4 | | -# This source code is licensed under the BSD-style license found in the |
5 | | -# LICENSE file in the root directory of this source tree. |
| 1 | +from pathlib import Path |
| 2 | +from time import perf_counter_ns |
6 | 3 |
|
7 | | -import abc |
8 | | -import argparse |
9 | | -import importlib |
10 | | -import os |
11 | | - |
12 | | -import decord |
13 | | -import numpy as np |
14 | 4 | import torch |
15 | | - |
16 | | -import torch.utils.benchmark as benchmark |
17 | | -from torchcodec.samplers import ( |
18 | | - IndexBasedSamplerArgs, |
19 | | - TimeBasedSamplerArgs, |
20 | | - VideoArgs, |
21 | | - VideoClipSampler, |
22 | | -) |
23 | | -from torchmultimodal.fb.utils.video_utils import ( |
24 | | - ClipSamplerType, |
25 | | - VideoClipSampler as tmm_vcs, |
26 | | -) |
27 | | -from torchvision.datasets.video_clip_sampler import ( # @manual=//pytorch/vision:internal_datasets |
28 | | - TVVideoClipDecoder, |
29 | | - UniformClipSamplingStrategy, |
30 | | - VideoClipSampler as ta_vcs, |
31 | | -) |
32 | | - |
33 | | - |
34 | | -class AbstractSampler: |
35 | | - def __init__(self): |
36 | | - pass |
37 | | - |
38 | | - @abc.abstractmethod |
39 | | - def sample_frames_uniformly(self, video_file, clips_per_video): |
40 | | - pass |
41 | | - |
42 | | - |
43 | | -class TorchCodecTimeBasedSampler(AbstractSampler): |
44 | | - def __init__(self): |
45 | | - pass |
46 | | - |
47 | | - def sample_frames_uniformly(self, video_file, clips_per_video): |
48 | | - arr = np.fromfile(video_file, dtype=np.uint8) |
49 | | - video_tensor = torch.from_numpy(arr) |
50 | | - video_input = VideoArgs() |
51 | | - sampler_input = TimeBasedSamplerArgs( |
52 | | - sampler_type="uniform", clips_per_video=clips_per_video, frames_per_clip=1 |
53 | | - ) |
54 | | - sampler = VideoClipSampler(video_input, sampler_input) |
55 | | - return sampler(video_tensor) |
56 | | - |
57 | | - |
58 | | -class TorchCodecIndexBasedSampler(AbstractSampler): |
59 | | - def __init__(self): |
60 | | - pass |
61 | | - |
62 | | - def sample_frames_uniformly(self, video_file, clips_per_video): |
63 | | - arr = np.fromfile(video_file, dtype=np.uint8) |
64 | | - video_tensor = torch.from_numpy(arr) |
65 | | - video_input = VideoArgs() |
66 | | - sampler_input = IndexBasedSamplerArgs( |
67 | | - sampler_type="uniform", clips_per_video=clips_per_video, frames_per_clip=1 |
68 | | - ) |
69 | | - sampler = VideoClipSampler(video_input, sampler_input) |
70 | | - return sampler(video_tensor) |
71 | | - |
72 | | - |
73 | | -class TorchCodecIndexBasedSamplerWithStackedOutput(AbstractSampler): |
74 | | - """ |
75 | | - On large batch, torch stack has impact on performance, but it's not obvious locally. |
76 | | - """ |
77 | | - |
78 | | - def __init__(self): |
79 | | - pass |
80 | | - |
81 | | - def sample_frames_uniformly(self, video_file, clips_per_video): |
82 | | - arr = np.fromfile(video_file, dtype=np.uint8) |
83 | | - video_tensor = torch.from_numpy(arr) |
84 | | - video_input = VideoArgs() |
85 | | - sampler_input = IndexBasedSamplerArgs( |
86 | | - sampler_type="uniform", clips_per_video=clips_per_video, frames_per_clip=1 |
87 | | - ) |
88 | | - sampler = VideoClipSampler(video_input, sampler_input) |
89 | | - clips = sampler(video_tensor) |
90 | | - return torch.stack([clip[0] for clip in clips]) |
91 | | - |
92 | | - |
93 | | -class DecordSampler(AbstractSampler): |
94 | | - def __init__(self): |
95 | | - pass |
96 | | - |
97 | | - def sample_frames_uniformly(self, video_file, clips_per_video): |
98 | | - decord.bridge.set_bridge("torch") |
99 | | - av_reader = decord.VideoReader(video_file) |
100 | | - num_frames = len(av_reader) |
101 | | - frame_indices = np.linspace(0, num_frames - 1, clips_per_video, dtype=int) |
102 | | - frames = av_reader.get_batch(frame_indices) |
103 | | - return frames |
104 | | - |
105 | | - |
106 | | -class TorchMMSamplerWithTorchVisionBackend(AbstractSampler): |
107 | | - """ |
108 | | - Here we use TorchMultimodal sampler as it's updated version on top of torchvision decoder. |
109 | | - """ |
110 | | - |
111 | | - def __init__(self): |
112 | | - pass |
113 | | - |
114 | | - def sample_frames_uniformly(self, video_file, clips_per_video): |
115 | | - arr = np.fromfile(video_file, dtype=np.uint8) |
116 | | - video_tensor = torch.from_numpy(arr) |
117 | | - sampler = tmm_vcs( |
118 | | - clip_sampler_type=ClipSamplerType("UNIFORM"), |
119 | | - clips_per_video=clips_per_video, |
120 | | - frames_per_clip=1, |
121 | | - frame_dilation=1, |
122 | | - ) |
123 | | - return sampler(video_tensor) |
124 | | - |
125 | | - |
126 | | -class TorchVisionNewSamplerWithTorchVisionBackend(AbstractSampler): |
127 | | - def __init__(self): |
128 | | - pass |
129 | | - |
130 | | - def sample_frames_uniformly(self, video_file, clips_per_video): |
131 | | - clip_sampling_strategy = UniformClipSamplingStrategy( |
132 | | - clips_per_video=clips_per_video |
133 | | - ) |
134 | | - decoder = TVVideoClipDecoder(clip_length_in_frames=1, read_audio_stream=False) |
135 | | - sampler = ta_vcs(clip_sampling_strategy, decoder) |
136 | | - return sampler(str(video_file)) |
137 | | - |
138 | | - |
139 | | -def main(): |
140 | | - """Benchmarks the performance of different samplers""" |
141 | | - |
142 | | - parser = argparse.ArgumentParser() |
143 | | - parser.add_argument( |
144 | | - "--bm_small_video_speed", |
145 | | - help="Benchmark small video decoding speed", |
146 | | - default=True, |
147 | | - action=argparse.BooleanOptionalAction, |
148 | | - ) |
149 | | - parser.add_argument( |
150 | | - "--bm_large_video_speed", |
151 | | - help="Benchmark large video decoding speed", |
152 | | - default=True, |
153 | | - action=argparse.BooleanOptionalAction, |
| 5 | +from torchcodec.decoders import VideoDecoder |
| 6 | +from torchcodec.samplers import clips_at_random_indices |
| 7 | + |
| 8 | + |
def bench(f, *args, num_exp=100, warmup=0, **kwargs):
    """Time repeated calls of ``f(*args, **kwargs)``.

    Args:
        f: Callable to benchmark.
        *args: Positional arguments forwarded to ``f`` on every call.
        num_exp: Number of timed calls.
        warmup: Number of untimed calls made before measuring starts.
        **kwargs: Keyword arguments forwarded to ``f`` on every call.

    Returns:
        A 1-D float32 tensor of length ``num_exp`` holding the duration of
        each timed call in nanoseconds.
    """
    # Warm-up calls are executed but deliberately not measured.
    for _ in range(warmup):
        f(*args, **kwargs)

    times = []
    for _ in range(num_exp):
        start = perf_counter_ns()
        f(*args, **kwargs)
        end = perf_counter_ns()
        times.append(end - start)
    # perf_counter_ns yields ints; convert to float so that callers can take
    # std()/median() and rescale the values.
    return torch.tensor(times).float()
| 21 | + |
| 22 | + |
def report_stats(times, unit="ms"):
    """Print and return the median of nanosecond timings in ``unit``.

    Args:
        times: Tensor of durations in nanoseconds (as returned by a
            ``perf_counter_ns``-based timer).
        unit: Target unit; one of ``"ns"``, ``"µs"`` (or its ASCII alias
            ``"us"``), ``"ms"``, ``"s"``.

    Returns:
        The median duration, converted to ``unit``, as a Python float.

    Raises:
        KeyError: If ``unit`` is not one of the supported units.
    """
    # Multipliers that convert nanoseconds into the requested unit.
    mul = {
        "ns": 1,
        "µs": 1e-3,
        "us": 1e-3,  # ASCII spelling, easier to type than the micro sign
        "ms": 1e-6,
        "s": 1e-9,
    }[unit]
    times = times * mul
    std = times.std().item()
    med = times.median().item()
    # The "{med = ...}" form prints the literal "med = " prefix before the value.
    print(f"{med = :.2f}{unit} +- {std:.2f}")
    return med
| 35 | + |
| 36 | + |
def sample(num_clips):
    """Open the benchmark video and sample ``num_clips`` random clips from it.

    A new ``VideoDecoder`` is built from the module-level ``VIDEO_PATH`` on
    every call, so decoder construction is part of whatever a caller times.
    The sampled clips are discarded; the function returns ``None``.

    Args:
        num_clips: Number of clips requested from ``clips_at_random_indices``.
    """
    decoder = VideoDecoder(VIDEO_PATH)
    clips_at_random_indices(
        decoder,
        num_clips=num_clips,
        num_frames_per_clip=10,
        num_indices_between_frames=2,
    )
155 | | - parser.add_argument( |
156 | | - "--bm_video_speed_min_run_seconds", |
157 | | - help="Benchmark minimum run time, in seconds, to wait per datapoint", |
158 | | - type=float, |
159 | | - default=5.0, |
160 | | - ) |
161 | | - args = parser.parse_args() |
162 | | - |
163 | | - small_video_path = importlib.resources.path(__package__, "nasa_13013.mp4") |
164 | | - small_video_path = os.fspath(str(small_video_path)) |
165 | | - |
166 | | - large_video_path = importlib.resources.path(__package__, "853.mp4") |
167 | | - large_video_path = os.fspath(str(large_video_path)) |
168 | | - |
169 | | - clips_per_video = 8 |
170 | | - |
171 | | - sampler_dict = {} |
172 | | - sampler_dict["TorchCodecTimeBasedSampler"] = TorchCodecTimeBasedSampler() |
173 | | - sampler_dict["TorchCodecIndexBasedSampler"] = TorchCodecIndexBasedSampler() |
174 | | - sampler_dict["TorchCodecIndexBasedSamplerWithStackedOutput"] = ( |
175 | | - TorchCodecIndexBasedSamplerWithStackedOutput() |
176 | | - ) |
177 | | - sampler_dict["DecordSampler"] = DecordSampler() |
178 | | - sampler_dict["TorchMMSamplerWithTorchVisionBackend"] = ( |
179 | | - TorchMMSamplerWithTorchVisionBackend() |
180 | | - ) |
181 | | - sampler_dict["TorchVisionNewSamplerWithTorchVisionBackend"] = ( |
182 | | - TorchVisionNewSamplerWithTorchVisionBackend() |
183 | | - ) |
184 | | - |
185 | | - results = [] |
186 | 45 |
|
187 | | - for sampler_name, sampler in sampler_dict.items(): |
188 | | - if args.bm_small_video_speed: |
189 | | - sampler_result = benchmark.Timer( |
190 | | - stmt="sampler.sample_frames_uniformly(video_file, clips_per_video)", |
191 | | - globals={ |
192 | | - "video_file": small_video_path, |
193 | | - "clips_per_video": clips_per_video, |
194 | | - "sampler": sampler, |
195 | | - }, |
196 | | - label="uniform sampling latency for 700KB video", |
197 | | - sub_label=sampler_name, |
198 | | - description=f"uniform sampling {clips_per_video} frames", |
199 | | - ) |
200 | | - results.append( |
201 | | - sampler_result.blocked_autorange( |
202 | | - min_run_time=args.bm_video_speed_min_run_seconds |
203 | | - ) |
204 | | - ) |
205 | 46 |
|
206 | | - if args.bm_large_video_speed: |
207 | | - if sampler_name == "TorchMMSamplerWithTorchVisionBackend": |
208 | | - continue |
209 | | - sampler_result = benchmark.Timer( |
210 | | - stmt="sampler.sample_frames_uniformly(video_file, clips_per_video)", |
211 | | - globals={ |
212 | | - "video_file": large_video_path, |
213 | | - "clips_per_video": clips_per_video, |
214 | | - "sampler": sampler, |
215 | | - }, |
216 | | - label="uniform sampling latency for 50MB video", |
217 | | - sub_label=sampler_name, |
218 | | - description=f"uniform sampling {clips_per_video} frames", |
219 | | - ) |
220 | | - results.append( |
221 | | - sampler_result.blocked_autorange( |
222 | | - min_run_time=args.bm_video_speed_min_run_seconds |
223 | | - ) |
224 | | - ) |
# Path of the benchmark video, expressed relative to this file's directory so
# that the script does not depend on the current working directory.
VIDEO_PATH = Path(__file__).parent / "../../test/resources/nasa_13013.mp4"
225 | 48 |
|
226 | | - compare = benchmark.Compare(results) |
227 | | - compare.print() |
# Benchmark sampling a single clip: 2 untimed warm-up calls, then 30 timed runs.
times = bench(sample, num_clips=1, num_exp=30, warmup=2)
report_stats(times, unit="ms")
# Same measurement with 50 clips per call.
times = bench(sample, num_clips=50, num_exp=30, warmup=2)
report_stats(times, unit="ms")
0 commit comments