@@ -84,8 +84,7 @@ def get_event(self):
8484
8585class SyclTimer :
8686 """
87- Context to measure device time and host wall-time of execution
88- of commands submitted to :class:`dpctl.SyclQueue`.
87+ Context to time execution of tasks submitted to :class:`dpctl.SyclQueue`.
8988
9089 :Example:
9190 .. code-block:: python
@@ -99,13 +98,18 @@ class SyclTimer:
9998 milliseconds_sc = 1e3
10099 timer = dpctl.SyclTimer(time_scale = milliseconds_sc)
101100
101+ untimed_code_block_1
102102 # use the timer
103103 with timer(queue=q):
104- code_block1
104+ timed_code_block1
105+
106+ untimed_code_block_2
105107
106108 # use the timer
107109 with timer(queue=q):
108- code_block2
110+ timed_code_block2
111+
112+ untimed_code_block_3
109113
110114 # retrieve elapsed times in milliseconds
111115 wall_dt, device_dt = timer.dt
@@ -116,16 +120,41 @@ class SyclTimer:
116120 associated with these submissions to perform the timing. Thus
117121 :class:`dpctl.SyclTimer` requires the queue with ``"enable_profiling"``
118122 property. In order to be able to collect the profiling information,
119- the ``dt`` property ensures that both submitted barriers complete their
120- execution and thus effectively synchronizes the queue.
121-
122- `device_timer` keyword argument controls the type of tasks submitted.
123- With `device_timer="queue_barrier"`, queue barrier tasks are used. With
124- `device_timer="order_manager"`, a single empty body task is inserted
125- instead relying on order manager (used by `dpctl.tensor` operations) to
123+ the ``dt`` property ensures that both tasks submitted by the timer
124+ complete their execution and thus effectively synchronizes the queue.
125+
126+ Execution of the above example results in the following task graph,
127+ where each group of tasks is ordered after the one preceding it,
128+ ``[tasks_of_untimed_block1]``, ``[timer_fence_start_task]``,
129+ ``[tasks_of_timed_block1]``, ``[timer_fence_finish_task]``,
130+ ``[tasks_of_untimed_block2]``, ``[timer_fence_start_task]``,
131+ ``[tasks_of_timed_block2]``, ``[timer_fence_finish_task]``,
132+ ``[tasks_of_untimed_block3]``.
133+
134+ ``device_timer`` keyword argument controls the type of tasks submitted.
135+ With ``"queue_barrier"`` value, queue barrier tasks are used. With
136+ ``"order_manager"`` value, a single empty body task is inserted
137+ and the order manager (used by all ``dpctl.tensor`` operations) is used to
126138 order these tasks so that they fence operations performed within
127139 timer's context.
128140
141+ Timing offloading operations that do not use the order manager with
142+ the timer that uses ``"order_manager"`` as ``device_timer`` value
143+ will be misleading because the tasks submitted by the timer will not
144+ be ordered with respect to tasks we intend to time.
145+
146+ Note that the host timer effectively measures the time of task
147+ submissions. To measure host wall-time that includes execution
148+ of submitted tasks, make sure to include a synchronization point in
149+ the timed block.
150+
151+ :Example:
152+ .. code-block:: python
153+
154+ with timer(q):
155+ timed_block
156+ q.wait()
157+
129158 Args:
130159 host_timer (callable, optional):
131160 A callable such that host_timer() returns current
@@ -134,7 +163,7 @@ class SyclTimer:
134163 device_timer (Literal["queue_barrier", "order_manager"], optional):
135164 Device timing method. Default: "queue_barrier".
136165 time_scale (Union[int, float], optional):
137- Ratio of the unit of time of interest and one second .
166+ Ratio of one second to the unit of time of interest.
138167 Default: ``1``.
139168 """
140169
0 commit comments