From d3369a82dc4adb85cfad0fd85a3c54a3d2271d4d Mon Sep 17 00:00:00 2001 From: "Matias N. Goldberg" Date: Mon, 19 May 2025 14:50:59 -0300 Subject: [PATCH] Add CPU/GPU sync mode (renderer low latency) - Add CPU_GPU_SYNC_AUTO - Remove redundant calls to get_ticks_usec() - Add waitable swapchain - Add rendering/rendering_device/vsync/latency_mode which supports 4 options: low_extreme (only available through the GDScript API and command line interface. Cannot be set by default) low (default) medium high_throughput - Add PacingMethod which describes which method is being used (None, AUTO, Waitable Swapchains, Android Swappy). - Add CLI parameter --latency-mode to override the latency mode (low_extreme, low, etc). - Add debug CLI parameter --pacing-mode-mask (mask is a hex number, where valid combinations are bits OR'ed from PacingMethod enum). This prevents Godot from using certain pacing modes, which is useful for debugging (or troubleshooting a bug in pacing methods). - Add Monitors to debug and understand why AUTO decides to use SEQ or PAR. Even if Waitable Swapchains are being used; these monitor values are very useful for detecting jitter and stutter. Fixes an unrelated bug when NAVIGATION_2D_DISABLED or NAVIGATION_3D_DISABLED are defined Co-authored-by: Danni <34800072+KeyboardDanni@users.noreply.github.com> Co-authored-by: Matias N. 
Goldberg --- core/config/project_settings.cpp | 1 + doc/classes/Performance.xml | 29 ++++- doc/classes/ProjectSettings.xml | 5 + doc/classes/RenderingDevice.xml | 29 +++++ doc/classes/RenderingServer.xml | 16 +++ .../d3d12/rendering_device_driver_d3d12.cpp | 55 ++++++++ drivers/d3d12/rendering_device_driver_d3d12.h | 4 + drivers/gles3/storage/utilities.cpp | 4 + drivers/gles3/storage/utilities.h | 1 + drivers/metal/rendering_device_driver_metal.h | 3 + .../metal/rendering_device_driver_metal.mm | 9 ++ .../vulkan/rendering_device_driver_vulkan.cpp | 117 +++++++++++++++++ .../vulkan/rendering_device_driver_vulkan.h | 7 ++ editor/debugger/editor_visual_profiler.cpp | 6 +- editor/editor_node.cpp | 6 + main/main.cpp | 76 +++++++++-- main/performance.cpp | 119 ++++++++++++------ main/performance.h | 6 + misc/dist/shell/_godot.zsh-completion | 1 + misc/dist/shell/godot.bash-completion | 5 + misc/dist/shell/godot.fish | 7 ++ platform/windows/display_server_windows.cpp | 72 +++++++++++ platform/windows/display_server_windows.h | 1 + scene/main/scene_tree.cpp | 4 + servers/display_server.h | 1 + servers/rendering/dummy/storage/utilities.h | 1 + .../renderer_rd/storage_rd/utilities.cpp | 6 + .../renderer_rd/storage_rd/utilities.h | 1 + servers/rendering/renderer_viewport.cpp | 16 +++ servers/rendering/rendering_device.cpp | 112 +++++++++++++++++ servers/rendering/rendering_device.h | 39 ++++++ servers/rendering/rendering_device_commons.h | 25 ++++ servers/rendering/rendering_device_driver.h | 4 + .../rendering/rendering_server_default.cpp | 31 ++++- servers/rendering/storage/utilities.h | 9 +- servers/rendering_server.cpp | 111 ++++++++++++++++ servers/rendering_server.h | 38 ++++++ 37 files changed, 920 insertions(+), 57 deletions(-) diff --git a/core/config/project_settings.cpp b/core/config/project_settings.cpp index bd4762bd1443..4b7494006e8a 100644 --- a/core/config/project_settings.cpp +++ b/core/config/project_settings.cpp @@ -1641,6 +1641,7 @@ 
ProjectSettings::ProjectSettings() { GLOBAL_DEF_RST(PropertyInfo(Variant::INT, "rendering/rendering_device/vsync/frame_queue_size", PROPERTY_HINT_RANGE, "2,3,1"), 2); GLOBAL_DEF_RST(PropertyInfo(Variant::INT, "rendering/rendering_device/vsync/swapchain_image_count", PROPERTY_HINT_RANGE, "2,4,1"), 3); + GLOBAL_DEF(PropertyInfo(Variant::INT, "rendering/rendering_device/vsync/latency_mode", PROPERTY_HINT_ENUM, "low,medium,high_throughput"), 0); GLOBAL_DEF(PropertyInfo(Variant::INT, "rendering/rendering_device/staging_buffer/block_size_kb", PROPERTY_HINT_RANGE, "4,2048,1,or_greater"), 256); GLOBAL_DEF(PropertyInfo(Variant::INT, "rendering/rendering_device/staging_buffer/max_size_mb", PROPERTY_HINT_RANGE, "1,1024,1,or_greater"), 128); GLOBAL_DEF(PropertyInfo(Variant::INT, "rendering/rendering_device/staging_buffer/texture_upload_region_size_px", PROPERTY_HINT_RANGE, "1,256,1,or_greater"), 64); diff --git a/doc/classes/Performance.xml b/doc/classes/Performance.xml index e882aa8e1d95..847af09d9c13 100644 --- a/doc/classes/Performance.xml +++ b/doc/classes/Performance.xml @@ -299,7 +299,34 @@ Number of active navigation obstacles in the [NavigationServer3D]. - + + Value used by Godot, when PACING_METHOD_SEQUENTIAL_SYNC is available and no better latency-reduction method is available, to determine whether we should be in [constant RenderingServer.CPU_GPU_SYNC_PARALLEL] or in [constant RenderingServer.CPU_GPU_SYNC_SEQUENTIAL] mode. It is the sum of CPU Time + GPU Time. If the value is consistently high enough, Godot will decide to use PARALLEL, otherwise it will prefer SEQUENTIAL. + [b]Note:[/b] This value attempts to exclude any additional time caused by waiting for V-Sync, therefore it will not match any other timing value (e.g. actual FPS, time taken by physics, etc). It is an estimation of how long the system would take if CPU and GPU were to be processing a frame serially, without the added delay of waiting for V-Sync. 
+ [b]Note:[/b] When using these monitors, it's best to set the Editor to a simple view like the Script tab to avoid the 2D/3D view from consuming system resources that could interfere with readings. Or better yet, run the Editor profiler in another machine. + + + How long CPU took to process the frame, bereft of waiting delays caused by V-Sync. This value is an approximation and might not match any other timing value. If this value is added to GPU Time, you get Total Time. Useful to know where to focus optimization efforts. + + + How long GPU took to process the frame, bereft of waiting delays caused by V-Sync. This value is an approximation and will not match any other timing value. If this value is added to CPU Time, you get Total Time. Useful to know where to focus optimization efforts. + + + The mode decided by Godot that we should be in for each frame based on Total Time. "1" means we should be in [constant RenderingServer.CPU_GPU_SYNC_PARALLEL], "2" means we should be in [constant RenderingServer.CPU_GPU_SYNC_SEQUENTIAL]. + [b]Note:[/b] This value is not the actual mode Godot is in, because the decision is averaged over time to prevent Godot from constantly switching back and forth between PARALLEL and SEQUENTIAL (which would cause visible stutters). Ideally this should be a perfect flat line of either 1s or 2s. If you see the game going back and forth between 1 and 2, then the system is not fast enough for a smooth low-latency experience; or the game should be optimized further until it is. The higher the monitor refresh rate, the higher the system requirements are for the line to be flat. + + + The [b]actual[/b] mode the game currently is. "1" means we are in [constant RenderingServer.CPU_GPU_SYNC_PARALLEL], "2" means we are in [constant RenderingServer.CPU_GPU_SYNC_SEQUENTIAL]. + [b]Note:[/b] This value should be as flat as possible. Every time it switches between "1" and "2", the game may suffer a small stutter. 
+ [b]Note:[/b] This value is ignored if PACING_METHOD_WAITABLE_SWAPCHAIN is available; or if [method RenderingDevice.get_latency_mode] is equal or higher than [constant RenderingDevice.LATENCY_MODE_MEDIUM] + + + The number of frames where the "Total Time" has exceeded the monitor's refresh rate or the max FPS (whichever is lower). This does not necessarily mean the game has missed a V-Blank (if the game is running in [constant RenderingServer.CPU_GPU_SYNC_PARALLEL], then total frame time should be lower than the sum of CPU Time + GPU Time; thus in practice the app may not have missed any V-Blank) but it indicates V-Blanks would've been missed if executing in [constant RenderingServer.CPU_GPU_SYNC_SEQUENTIAL]. The value is expressed in thousands. + For example one missed Hard Target will be shown as 1000. Two missed Hard Targets will be shown as 2000. This value decreases quickly over time. Missed Hard Targets weight heavily on Godot deciding to switch to PARALLEL to avoid degrading the experience further. + [b]Note:[/b] While in PARALLEL mode, this counter is always reset to 0 each new frame, thus while [constant FRAME_PACING_ACTUAL_SYNC_MODE] is 1, this value will be either 0 or 1000, where a flat 1000 line means the game is always failing to reach the target framerate. + [b]Note:[/b] Spikes in missed hard targets almost always means very visible stutter and thus should be avoided at all costs during gameplay. This value should be kept at 0 at all times. If the system isn't fast enough to keep the target framerate, this value should always be 1000 to keep pacing consistent. + [b]Note:[/b] Periodically failing this metric means you should optimize your content to run faster, avoid spikes, or increase [member ProjectSettings.rendering/rendering_device/vsync/latency_mode] to a higher latency mode. + + Represents the size of the [enum Monitor] enum. 
diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml index cef1253b6ddd..a56e1e0c0a19 100644 --- a/doc/classes/ProjectSettings.xml +++ b/doc/classes/ProjectSettings.xml @@ -3211,6 +3211,11 @@ Try the [url=https://darksylinc.github.io/vsync_simulator/]V-Sync Simulator[/url], an interactive interface that simulates presentation to better understand how it is affected by different variables under various conditions. [b]Note:[/b] This property is only read when the project starts. There is currently no way to change this value at run-time. + + Sets the default latency mode. Lower is better for input-to-display latency, but it will sacrifice FPS (frames per second) in return. This setting can be changed at runtime via [method RenderingDevice.set_latency_mode] on [method RenderingServer.get_rendering_device]. See documentation for [constant RenderingDevice.LATENCY_MODE_LOW], [constant RenderingDevice.LATENCY_MODE_MEDIUM], and [constant RenderingDevice.LATENCY_MODE_HIGH_THROUGHPUT] for what each individual setting entails. + [b]Note:[/b] The setting [constant RenderingDevice.LATENCY_MODE_LOW_EXTREME] is not available through this property as it is strongly ill-advised to ship with this value as the default. + [b]Note:[/b] This property may be overridden with the [code]--latency-mode[/code] command-line argument. When this argument is used, this project setting is ignored. + The number of images the swapchain will consist of (back buffers + front buffer). [code]2[/code] corresponds to double-buffering and [code]3[/code] to triple-buffering. diff --git a/doc/classes/RenderingDevice.xml b/doc/classes/RenderingDevice.xml index 54edd2cada14..7ba1c1733573 100644 --- a/doc/classes/RenderingDevice.xml +++ b/doc/classes/RenderingDevice.xml @@ -646,6 +646,12 @@ Returns the frame count kept by the graphics API. Higher values result in higher input lag, but with more consistent throughput. 
For the main [RenderingDevice], frames are cycled (usually 3 with triple-buffered V-Sync enabled). However, local [RenderingDevice]s only have 1 frame. + + + + Returns the current latency mode used by this [RenderingDevice]. See [member ProjectSettings.rendering/rendering_device/vsync/latency_mode] for details. + + @@ -782,6 +788,13 @@ [b]Note:[/b] Only the main [RenderingDevice] returned by [method RenderingServer.get_rendering_device] has a width. If called on a local [RenderingDevice], this method prints an error and returns [constant INVALID_ID]. + + + + + Sets the current latency mode for this [RenderingDevice]. See [member ProjectSettings.rendering/rendering_device/vsync/latency_mode] for details. + + @@ -2730,5 +2743,21 @@ Ignore the previous contents of all attachments. + + The engine is willing to sacrifice a considerable amount of FPS (frames per second) to achieve the lowest possible latency. It's generally recommended to use [constant LATENCY_MODE_LOW] instead, as the FPS cost tends to be too high. It is strongly recommended that this setting only be set by the end user in user settings, and not be shipped by default. + [b]Note:[/b] Actually receiving low latency is not guaranteed, as it depends on various factors such as system speed, scene complexity and driver support. + [b]Note:[/b] Consider using [constant DisplayServer.VSYNC_ADAPTIVE] to reduce jitter and stutter while in this mode. + + + The engine is willing to sacrifice some amount of FPS (frames per second) to achieve a generally enjoyable and acceptable low latency experience. This is the recommended setting. + [b]Note:[/b] Actually receiving low latency is not guaranteed, as it depends on various factors such as system speed, scene complexity and driver support. + [b]Note:[/b] Consider using [constant DisplayServer.VSYNC_ADAPTIVE] to reduce jitter and stutter while in this mode. 
+ + + The engine is not willing to sacrifice much FPS (frames per second), but still maintaining a decent amount of latency. This setting is best for slow systems, or scenes that are too complex to run at decent FPS in lower latency modes. It's also useful as a workaround if the user is experiencing pacing (jitter, stutter) problems with lower latency settings. + + + The engine will prefer maximizing FPS (frames per second), with no consideration for latency. This setting is ideal for apps that have no user interaction, like servers or headless processes. + diff --git a/doc/classes/RenderingServer.xml b/doc/classes/RenderingServer.xml index d87d2a48e6e2..eb5e97e6eb9f 100644 --- a/doc/classes/RenderingServer.xml +++ b/doc/classes/RenderingServer.xml @@ -1592,6 +1592,12 @@ Tries to free an object in the RenderingServer. To avoid memory leaks, this should be called after using an object as memory management does not occur automatically when using RenderingServer directly. + + + + See [constant Performance.FRAME_PACING_ACTUAL_SYNC_MODE]. + + @@ -5870,6 +5876,16 @@ Represents the size of the [enum GlobalShaderParameterType] enum. + + Indicates the renderer is prioritizing higher framerate by allowing the CPU to queue up additional frames before they're rendered by the GPU. This allows the CPU and GPU to work in tandem, improving the framerate and framepacing in complex scenes at the expense of input latency. This default setting is suitable for most 3D applications, especially on mobile and lower-performance desktop hardware. + [b]Note:[/b] This is part of a fallback mechanism to reduce latency when PACING_METHOD_WAITABLE_SWAPCHAIN is not available. + + + Indicates the renderer is prioritizing lower display latency by severely limiting how far the CPU is allowed to get ahead of the GPU when queuing frames. This can greatly help with input lag, at the cost of significantly reduced framerate in most scenes. 
This setting is useful for games and applications with simple graphics where responsive input is important. Your results may vary based on platform, drivers, and scene contents. + [b]Note:[/b] This is part of a fallback mechanism to reduce latency when PACING_METHOD_WAITABLE_SWAPCHAIN is not available. + [b]Note:[/b] Important FPS drops are expected while in this mode. It prioritizes low latency over framerate. + [b]Note:[/b] Stutter can be reduced if using [constant DisplayServer.VSYNC_ADAPTIVE]. But it risks always degenerating to [constant DisplayServer.VSYNC_DISABLED] if the system is too slow. + Number of objects rendered in the current 3D scene. This varies depending on camera position and rotation. diff --git a/drivers/d3d12/rendering_device_driver_d3d12.cpp b/drivers/d3d12/rendering_device_driver_d3d12.cpp index a7208c5af9ea..9f77bb9ed8da 100644 --- a/drivers/d3d12/rendering_device_driver_d3d12.cpp +++ b/drivers/d3d12/rendering_device_driver_d3d12.cpp @@ -2510,6 +2510,11 @@ void RenderingDeviceDriverD3D12::_swap_chain_release_buffers(SwapChain *p_swap_c p_swap_chain->render_targets.clear(); p_swap_chain->render_targets_info.clear(); + if (p_swap_chain->waitable_object) { + CloseHandle(p_swap_chain->waitable_object); + p_swap_chain->waitable_object = nullptr; + } + for (RDD::FramebufferID framebuffer : p_swap_chain->framebuffers) { framebuffer_free(framebuffer); } @@ -2567,6 +2572,7 @@ Error RenderingDeviceDriverD3D12::swap_chain_resize(CommandQueueID p_cmd_queue, case DisplayServer::VSYNC_ENABLED: { sync_interval = 1; present_flags = 0; + creation_flags = DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT; } break; case DisplayServer::VSYNC_DISABLED: { sync_interval = 0; @@ -2577,6 +2583,7 @@ Error RenderingDeviceDriverD3D12::swap_chain_resize(CommandQueueID p_cmd_queue, default: sync_interval = 1; present_flags = 0; + creation_flags = DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT; break; } @@ -2628,6 +2635,11 @@ Error 
RenderingDeviceDriverD3D12::swap_chain_resize(CommandQueueID p_cmd_queue, ERR_FAIL_COND_V(!SUCCEEDED(res), ERR_CANT_CREATE); } + if (creation_flags & DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT) { + swap_chain->d3d_swap_chain->SetMaximumFrameLatency(UINT(frames.size())); + swap_chain->waitable_object = swap_chain->d3d_swap_chain->GetFrameLatencyWaitableObject(); + } + #ifdef DCOMP_ENABLED if (surface->composition_device.Get() == nullptr) { using PFN_DCompositionCreateDevice = HRESULT(WINAPI *)(IDXGIDevice *, REFIID, void **); @@ -2746,6 +2758,49 @@ void RenderingDeviceDriverD3D12::swap_chain_free(SwapChainID p_swap_chain) { memdelete(swap_chain); } +Error RenderingDeviceDriverD3D12::swap_chain_wait_for_present(DisplayServer::WindowID p_window, SwapChainID p_swap_chain, uint32_t p_max_frame_delay) { + SwapChain *swap_chain = (SwapChain *)(p_swap_chain.id); + if (swap_chain->waitable_object != NULL) { + UINT timeout = 1000u; + + HRESULT res; + + { + UINT current_frame_latency = 0u; + res = swap_chain->d3d_swap_chain->GetMaximumFrameLatency(¤t_frame_latency); + + ERR_FAIL_COND_V_MSG(!SUCCEEDED(res), FAILED, "GetMaximumFrameLatency failed with error " + vformat("0x%08ux", (uint64_t)res) + "."); + + if (p_max_frame_delay != current_frame_latency) { + swap_chain->d3d_swap_chain->SetMaximumFrameLatency(UINT(p_max_frame_delay)); + } + } + + do { + res = WaitForSingleObjectEx(swap_chain->waitable_object, timeout, FALSE); + } while (res == WAIT_IO_COMPLETION); + + if (res == WAIT_TIMEOUT) { + ERR_FAIL_COND_V_MSG(!SUCCEEDED(res), ERR_TIMEOUT, "swap_chain_wait_for_present timeout exceeded."); + } else if (res == (HRESULT)WAIT_FAILED) { + DWORD error = GetLastError(); + ERR_FAIL_COND_V_MSG(!SUCCEEDED(res), FAILED, "WaitForSingleObjectEx failed with error " + vformat("0x%08ux", (uint64_t)error) + "."); + } else if (res != WAIT_OBJECT_0) { + ERR_FAIL_COND_V_MSG(!SUCCEEDED(res), FAILED, "WaitForSingleObjectEx returned " + vformat("0x%08ux", (uint64_t)res) + "."); + } + 
return OK; + } else { + return ERR_UNAVAILABLE; + } +} + +BitField RenderingDeviceDriverD3D12::get_available_pacing_methods() const { + BitField methods = 0; + methods.set_flag(PACING_METHOD_SEQUENTIAL_SYNC); + methods.set_flag(PACING_METHOD_WAITABLE_SWAPCHAIN); + return methods; +} + /*********************/ /**** FRAMEBUFFER ****/ /*********************/ diff --git a/drivers/d3d12/rendering_device_driver_d3d12.h b/drivers/d3d12/rendering_device_driver_d3d12.h index 54b57cbc2f57..9412b2ef0121 100644 --- a/drivers/d3d12/rendering_device_driver_d3d12.h +++ b/drivers/d3d12/rendering_device_driver_d3d12.h @@ -468,6 +468,7 @@ class RenderingDeviceDriverD3D12 : public RenderingDeviceDriver { struct SwapChain { ComPtr d3d_swap_chain; + HANDLE waitable_object; RenderingContextDriver::SurfaceID surface = RenderingContextDriver::SurfaceID(); UINT present_flags = 0; UINT sync_interval = 1; @@ -489,6 +490,9 @@ class RenderingDeviceDriverD3D12 : public RenderingDeviceDriver { virtual RenderPassID swap_chain_get_render_pass(SwapChainID p_swap_chain) override; virtual DataFormat swap_chain_get_format(SwapChainID p_swap_chain) override; virtual void swap_chain_free(SwapChainID p_swap_chain) override; + virtual Error swap_chain_wait_for_present(DisplayServer::WindowID p_window, SwapChainID p_swap_chain, uint32_t p_max_frame_delay) override final; + + virtual BitField get_available_pacing_methods() const override final; /*********************/ /**** FRAMEBUFFER ****/ diff --git a/drivers/gles3/storage/utilities.cpp b/drivers/gles3/storage/utilities.cpp index b096a8a7f302..b5a479dfa62e 100644 --- a/drivers/gles3/storage/utilities.cpp +++ b/drivers/gles3/storage/utilities.cpp @@ -335,6 +335,10 @@ void Utilities::capture_timestamp(const String &p_name) { frames[frame].timestamp_count++; } +void Utilities::capture_timestamps_sync_mode_auto_end() { + // Not implemented for OpenGL. 
+} + void Utilities::_capture_timestamps_begin() { // frame is incremented at the end of the frame so this gives us the queries for frame - 2. By then they should be ready. if (frames[frame].timestamp_count) { diff --git a/drivers/gles3/storage/utilities.h b/drivers/gles3/storage/utilities.h index 9e40d596df06..92040b3d7c7a 100644 --- a/drivers/gles3/storage/utilities.h +++ b/drivers/gles3/storage/utilities.h @@ -201,6 +201,7 @@ class Utilities : public RendererUtilities { virtual void capture_timestamps_begin() override; virtual void capture_timestamp(const String &p_name) override; + virtual void capture_timestamps_sync_mode_auto_end() override; virtual uint32_t get_captured_timestamps_count() const override; virtual uint64_t get_captured_timestamps_frame() const override; virtual uint64_t get_captured_timestamp_gpu_time(uint32_t p_index) const override; diff --git a/drivers/metal/rendering_device_driver_metal.h b/drivers/metal/rendering_device_driver_metal.h index 0fed3aacb64e..c4e40787d66d 100644 --- a/drivers/metal/rendering_device_driver_metal.h +++ b/drivers/metal/rendering_device_driver_metal.h @@ -224,6 +224,9 @@ class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) RenderingDeviceDriverMet virtual DataFormat swap_chain_get_format(SwapChainID p_swap_chain) override final; virtual void swap_chain_set_max_fps(SwapChainID p_swap_chain, int p_max_fps) override final; virtual void swap_chain_free(SwapChainID p_swap_chain) override final; + virtual Error swap_chain_wait_for_present(DisplayServer::WindowID p_window, SwapChainID p_swap_chain, uint32_t p_max_frame_delay) override final; + + virtual BitField get_available_pacing_methods() const override final; #pragma mark - Frame Buffer diff --git a/drivers/metal/rendering_device_driver_metal.mm b/drivers/metal/rendering_device_driver_metal.mm index 53af8e49c5ee..c7495217393f 100644 --- a/drivers/metal/rendering_device_driver_metal.mm +++ b/drivers/metal/rendering_device_driver_metal.mm @@ -1052,6 +1052,15 @@ 
static const API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MTLSamplerBorderC memdelete(swap_chain); } +Error RenderingDeviceDriverMetal::swap_chain_wait_for_present(DisplayServer::WindowID p_window, SwapChainID p_swap_chain, uint32_t p_max_frame_delay) { + return ERR_UNAVAILABLE; +} + +BitField RenderingDeviceDriverMetal::get_available_pacing_methods() const { + BitField methods = 0; + return methods; +} + #pragma mark - Frame buffer RDD::FramebufferID RenderingDeviceDriverMetal::framebuffer_create(RenderPassID p_render_pass, VectorView p_attachments, uint32_t p_width, uint32_t p_height) { diff --git a/drivers/vulkan/rendering_device_driver_vulkan.cpp b/drivers/vulkan/rendering_device_driver_vulkan.cpp index 72b1881aea88..40776ac9359d 100644 --- a/drivers/vulkan/rendering_device_driver_vulkan.cpp +++ b/drivers/vulkan/rendering_device_driver_vulkan.cpp @@ -531,6 +531,8 @@ Error RenderingDeviceDriverVulkan::_initialize_device_extensions() { _register_requested_device_extension(VK_EXT_ASTC_DECODE_MODE_EXTENSION_NAME, false); _register_requested_device_extension(VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME, false); _register_requested_device_extension(VK_EXT_TEXTURE_COMPRESSION_ASTC_HDR_EXTENSION_NAME, false); + _register_requested_device_extension(VK_KHR_PRESENT_ID_EXTENSION_NAME, false); + _register_requested_device_extension(VK_KHR_PRESENT_WAIT_EXTENSION_NAME, false); // We don't actually use this extension, but some runtime components on some platforms // can and will fill the validation layers with useless info otherwise if not enabled. 
@@ -758,6 +760,8 @@ Error RenderingDeviceDriverVulkan::_check_device_capabilities() { VkPhysicalDevice16BitStorageFeaturesKHR storage_feature = {}; VkPhysicalDeviceMultiviewFeatures multiview_features = {}; VkPhysicalDevicePipelineCreationCacheControlFeatures pipeline_cache_control_features = {}; + VkPhysicalDevicePresentIdFeaturesKHR present_id_features = {}; + VkPhysicalDevicePresentWaitFeaturesKHR present_wait_features = {}; const bool use_1_2_features = physical_device_properties.apiVersion >= VK_API_VERSION_1_2; if (use_1_2_features) { @@ -807,6 +811,18 @@ Error RenderingDeviceDriverVulkan::_check_device_capabilities() { next_features = &pipeline_cache_control_features; } + if (enabled_device_extension_names.has(VK_KHR_PRESENT_ID_EXTENSION_NAME)) { + present_id_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_ID_FEATURES_KHR; + present_id_features.pNext = next_features; + next_features = &present_id_features; + } + + if (enabled_device_extension_names.has(VK_KHR_PRESENT_WAIT_EXTENSION_NAME)) { + present_wait_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_WAIT_FEATURES_KHR; + present_wait_features.pNext = next_features; + next_features = &present_wait_features; + } + VkPhysicalDeviceFeatures2 device_features_2 = {}; device_features_2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; device_features_2.pNext = next_features; @@ -866,6 +882,10 @@ Error RenderingDeviceDriverVulkan::_check_device_capabilities() { pipeline_cache_control_support = pipeline_cache_control_features.pipelineCreationCacheControl; } + if (enabled_device_extension_names.has(VK_KHR_PRESENT_ID_EXTENSION_NAME) && enabled_device_extension_names.has(VK_KHR_PRESENT_WAIT_EXTENSION_NAME)) { + waitable_swapchain_support = present_id_features.presentId && present_wait_features.presentWait; + } + if (enabled_device_extension_names.has(VK_EXT_DEVICE_FAULT_EXTENSION_NAME)) { device_fault_support = true; } @@ -1103,6 +1123,19 @@ Error 
RenderingDeviceDriverVulkan::_initialize_device(const LocalVector swapchains; thread_local LocalVector image_indices; thread_local LocalVector results; +#if !defined(SWAPPY_FRAME_PACING_ENABLED) + thread_local LocalVector present_ids; +#endif swapchains.clear(); image_indices.clear(); @@ -2775,6 +2814,18 @@ Error RenderingDeviceDriverVulkan::command_queue_execute_and_present(CommandQueu err = device_functions.QueuePresentKHR(device_queue.queue, &present_info); } #else + VkPresentIdKHR present_id = {}; + if (waitable_swapchain_support) { + present_ids.resize(swapchains.size()); + ++current_present_id; + for (uint64_t &id : present_ids) { + id = current_present_id; + } + present_id.sType = VK_STRUCTURE_TYPE_PRESENT_ID_KHR; + present_id.pPresentIds = present_ids.ptr(); + present_id.swapchainCount = present_ids.size(); + present_info.pNext = &present_id; + } err = device_functions.QueuePresentKHR(device_queue.queue, &present_info); #endif @@ -3486,6 +3537,72 @@ void RenderingDeviceDriverVulkan::swap_chain_free(SwapChainID p_swap_chain) { memdelete(swap_chain); } +Error RenderingDeviceDriverVulkan::swap_chain_wait_for_present(DisplayServer::WindowID p_window, SwapChainID p_swap_chain, uint32_t p_max_frame_delay) { + if (!waitable_swapchain_support) { + return ERR_UNAVAILABLE; + } + + if (current_present_id <= p_max_frame_delay) { + return OK; + } + + if (last_swapchain_occluded) { + // Last time we checked, we were occluded. Check again and early out. + // Otherwise we'll always timeout and that makes Alt+Tab back into the Editor very laggy. + if (DisplayServer::get_singleton()->_window_presentation_occluded(p_window)) { + return OK; // Still occluded. + } else { + last_swapchain_occluded = false; + } + } + + SwapChain *swap_chain = (SwapChain *)(p_swap_chain.id); + + // IMPORTANT: Low timeouts (e.g. 100ms) may be hit by the Godot Editor when creating new windows + // (e.g. opening the menu) or when resizing. 
+ // + // Additionally, if the window is fully hidden, a low timeout like 100ms may be hit permanently, + // causing us to spam the warning but the game will run at 10 FPS, which may be confused with the + // scene being heavy. Getting stuck at 1 FPS signals much more strongly that we hit a bug. + constexpr uint64_t wait_timeout = 1'000'000'000; // Nanoseconds (1 second), the unit vkWaitForPresentKHR expects. + VkResult err = device_functions.WaitForPresentKHR(vk_device, swap_chain->vk_swapchain, current_present_id - p_max_frame_delay, wait_timeout); + + if (err == VK_TIMEOUT) { + if (DisplayServer::get_singleton()->_window_presentation_occluded(p_window)) { + last_swapchain_occluded = true; + // The window is not presenting, thus it's normal to timeout. We swallow the warning + // in this case since it can be pretty common. A side effect of this is that Godot's + // framerate will be locked down to the timeout's framerate (e.g. 1 FPS). + // If this is an issue, user can either launch Godot with --pacing-mode-mask 0x1 + // or set the latency mode to LATENCY_MODE_HIGH_THROUGHPUT. + // + // Unfortunately Vulkan cannot signal us when the window is occluded or it's + // impossible to present at the time. 
+ // See https://github.com/KhronosGroup/Vulkan-Docs/issues/2530 + print_verbose("vkWaitForPresentKHR timeout exceeded for expected reasons."); return ERR_TIMEOUT; + } else { + ERR_FAIL_COND_V_MSG(err, ERR_TIMEOUT, "vkWaitForPresentKHR timeout exceeded."); + } + } else if (err != VK_SUCCESS && err != VK_SUBOPTIMAL_KHR) { + ERR_FAIL_COND_V_MSG(err, FAILED, "vkWaitForPresentKHR failed with error " + itos(err) + "."); + } + return OK; +} + +BitField RenderingDeviceDriverVulkan::get_available_pacing_methods() const { + BitField methods = 0; + methods.set_flag(PACING_METHOD_SEQUENTIAL_SYNC); + if (waitable_swapchain_support) { + methods.set_flag(PACING_METHOD_WAITABLE_SWAPCHAIN); + } +#if defined(SWAPPY_FRAME_PACING_ENABLED) + if (swappy_frame_pacer_enable) { + methods.set_flag(PACING_METHOD_ANDROID_SWAPPY); + } +#endif + return methods; +} + /*********************/ /**** FRAMEBUFFER ****/ /*********************/ diff --git a/drivers/vulkan/rendering_device_driver_vulkan.h b/drivers/vulkan/rendering_device_driver_vulkan.h index 07c1b2a0a9ba..841de6602d61 100644 --- a/drivers/vulkan/rendering_device_driver_vulkan.h +++ b/drivers/vulkan/rendering_device_driver_vulkan.h @@ -98,6 +98,7 @@ class RenderingDeviceDriverVulkan : public RenderingDeviceDriver { PFN_vkQueuePresentKHR QueuePresentKHR = nullptr; PFN_vkCreateRenderPass2KHR CreateRenderPass2KHR = nullptr; PFN_vkCmdEndRenderPass2KHR EndRenderPass2KHR = nullptr; + PFN_vkWaitForPresentKHR WaitForPresentKHR = nullptr; // Debug marker extensions. 
PFN_vkCmdDebugMarkerBeginEXT CmdDebugMarkerBeginEXT = nullptr; @@ -115,6 +116,7 @@ class RenderingDeviceDriverVulkan : public RenderingDeviceDriver { RenderingContextDriverVulkan *context_driver = nullptr; RenderingContextDriver::Device context_device = {}; uint32_t frame_count = 1; + uint64_t current_present_id = 0ul; VkPhysicalDevice physical_device = VK_NULL_HANDLE; VkPhysicalDeviceProperties physical_device_properties = {}; VkPhysicalDeviceFeatures physical_device_features = {}; @@ -141,6 +143,8 @@ class RenderingDeviceDriverVulkan : public RenderingDeviceDriver { bool swappy_frame_pacer_enable = false; uint8_t swappy_mode = 2; // See default value for display/window/frame_pacing/android/swappy_mode. #endif + bool waitable_swapchain_support = false; + bool last_swapchain_occluded = false; DeviceFunctions device_functions; void _register_requested_device_extension(const CharString &p_extension_name, bool p_required); @@ -382,6 +386,9 @@ class RenderingDeviceDriverVulkan : public RenderingDeviceDriver { virtual DataFormat swap_chain_get_format(SwapChainID p_swap_chain) override final; virtual void swap_chain_set_max_fps(SwapChainID p_swap_chain, int p_max_fps) override final; virtual void swap_chain_free(SwapChainID p_swap_chain) override final; + virtual Error swap_chain_wait_for_present(DisplayServer::WindowID p_window, SwapChainID p_swap_chain, uint32_t p_max_frame_delay) override final; + + virtual BitField get_available_pacing_methods() const override final; private: /*********************/ diff --git a/editor/debugger/editor_visual_profiler.cpp b/editor/debugger/editor_visual_profiler.cpp index cbd97f0e616e..a03eb8cda321 100644 --- a/editor/debugger/editor_visual_profiler.cpp +++ b/editor/debugger/editor_visual_profiler.cpp @@ -351,10 +351,11 @@ void EditorVisualProfiler::_update_frame(bool p_focus_selected) { float cpu_time = m.areas[i].cpu_time; float gpu_time = m.areas[i].gpu_time; - if (i < m.areas.size() - 1) { - cpu_time = m.areas[i + 1].cpu_time - 
cpu_time; - gpu_time = m.areas[i + 1].gpu_time - gpu_time; - } + if (i + 1 < m.areas.size()) { + // Only diff against the next area when one exists; the last area keeps its own value instead of indexing past the end. + cpu_time = m.areas[i + 1].cpu_time - cpu_time; + gpu_time = m.areas[i + 1].gpu_time - gpu_time; + } if (name.begins_with(">")) { TreeItem *category = variables->create_item(parent); diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp index dca54eb0283c..b3247f259cd4 100644 --- a/editor/editor_node.cpp +++ b/editor/editor_node.cpp @@ -426,6 +426,12 @@ void EditorNode::_update_from_settings() { } _update_title(); + RD *device = RD::get_singleton(); + if (device) { + const int latency_mode = int(GLOBAL_GET("rendering/rendering_device/vsync/latency_mode")) + 1; + device->set_latency_mode((RD::LatencyMode)latency_mode); + } + int current_filter = GLOBAL_GET("rendering/textures/canvas_textures/default_texture_filter"); if (current_filter != scene_root->get_default_canvas_item_texture_filter()) { Viewport::DefaultCanvasItemTextureFilter tf = (Viewport::DefaultCanvasItemTextureFilter)current_filter; diff --git a/main/main.cpp b/main/main.cpp index 103e85ad11a7..01fd44c435f2 100644 --- a/main/main.cpp +++ b/main/main.cpp @@ -199,6 +199,8 @@ String rendering_driver = ""; String rendering_method = ""; static int text_driver_idx = -1; static int audio_driver_idx = -1; +static int latency_mode = -1; +static uint8_t pacing_mode_mask = UINT8_MAX; // Engine config/tools @@ -597,6 +599,7 @@ void Main::print_help(const char *p_binary) { print_help_option("--rendering-method ", "Renderer name. 
Requires driver support.\n"); print_help_option("--rendering-driver ", "Rendering driver (depends on display driver).\n"); print_help_option("--gpu-index ", "Use a specific GPU (run with --verbose to get a list of available devices).\n"); + print_help_option("--latency-mode ", "Override latency mode [\"low_extreme\", \"low\", \"medium\", \"high_throughput\"].\n"); print_help_option("--text-driver ", "Text driver (used for font rendering, bidirectional support and shaping).\n"); print_help_option("--tablet-driver ", "Pen tablet input driver.\n"); print_help_option("--headless", "Enable headless mode (--display-driver headless --audio-driver Dummy). Useful for servers and with --script.\n"); @@ -639,6 +642,7 @@ void Main::print_help(const char *p_binary) { #endif print_help_option("--remote-debug ", "Remote debug (://[:], e.g. tcp://127.0.0.1:6007).\n"); print_help_option("--single-threaded-scene", "Force scene tree to run in single-threaded mode. Sub-thread groups are disabled and run on the main thread.\n"); + print_help_option("--pacing-mode-mask ", "8-bit hexadecimal mask to prevent Godot from using certain pacing modes. 
Use '1' as mask to only use the most basic fallback pacing method.\n"); #if defined(DEBUG_ENABLED) print_help_option("--debug-collisions", "Show collision shapes when running the scene.\n", CLI_OPTION_AVAILABILITY_TEMPLATE_DEBUG); print_help_option("--debug-paths", "Show path lines when running the scene.\n", CLI_OPTION_AVAILABILITY_TEMPLATE_DEBUG); @@ -1238,6 +1242,35 @@ Error Main::setup(const char *execpath, int argc, char *argv[], bool p_second_ph OS::get_singleton()->print("Missing rendering driver argument, aborting.\n"); goto error; } + } else if (arg == "--latency-mode") { + if (N) { + if (N->get() == "low_extreme") { + latency_mode = RenderingDevice::LATENCY_MODE_LOW_EXTREME; + } else if (N->get() == "low") { + latency_mode = RenderingDevice::LATENCY_MODE_LOW; + } else if (N->get() == "medium") { + latency_mode = RenderingDevice::LATENCY_MODE_MEDIUM; + } else if (N->get() == "high_throughput") { + latency_mode = RenderingDevice::LATENCY_MODE_HIGH_THROUGHPUT; + } else { + OS::get_singleton()->print("Unknown latency mode, aborting.\nValid options are 'low_extreme', 'low', 'medium' and 'high_throughput'.\n"); + goto error; + } + + N = N->next(); + } else { + OS::get_singleton()->print("Missing latency mode argument, aborting.\n"); + goto error; + } + } else if (arg == "--pacing-mode-mask") { + if (N) { + pacing_mode_mask = static_cast<uint8_t>(N->get().hex_to_int()); // Narrowed on purpose: the mask is documented as 8-bit. + N = N->next(); + } else { + OS::get_singleton()->print("Missing pacing mode mask argument, aborting.\n"); + goto error; + } + } else if (arg == "-f" || arg == "--fullscreen") { // force fullscreen init_fullscreen = true; window_mode = DisplayServer::WINDOW_MODE_FULLSCREEN; @@ -3256,6 +3289,15 @@ Error Main::setup2(bool p_show_boot_logo) { RenderingDevice *rd = RenderingDevice::get_singleton(); if (rd) { rd->_set_max_fps(engine->get_max_fps()); + + if (pacing_mode_mask != UINT8_MAX) { + rd->_restrict_available_pacing_methods(pacing_mode_mask); + } + if (latency_mode >= 0) { + 
rd->set_latency_mode(RenderingDevice::LatencyMode(latency_mode)); + } else { + rd->set_latency_mode(RenderingDevice::LatencyMode((int)GLOBAL_GET("rendering/rendering_device/vsync/latency_mode") + 1)); + } } #ifdef TOOLS_ENABLED @@ -3358,6 +3400,8 @@ Error Main::setup2(bool p_show_boot_logo) { rendering_server->set_print_gpu_profile(true); } + rendering_server->update_cached_refresh_rate(); + OS::get_singleton()->benchmark_end_measure("Servers", "Rendering"); } @@ -4622,6 +4666,14 @@ static uint64_t navigation_process_max = 0; bool Main::iteration() { iterating++; + if (RD::get_singleton()) { + const bool sequential_sync = RenderingServer::get_singleton()->get_actual_cpu_gpu_sync_mode() == RenderingServer::CPU_GPU_SYNC_SEQUENTIAL; + + // We must do this right before input polling (i.e. DisplayServer**::process_events()). + // But we also must do this outside of timing measurements, so this is the 2nd best place. + RD::get_singleton()->_wait_for_present(sequential_sync); + } + const uint64_t ticks = OS::get_singleton()->get_ticks_usec(); Engine::get_singleton()->_frame_ticks = ticks; main_timer_sync.set_cpu_ticks_usec(ticks); @@ -4703,7 +4755,7 @@ bool Main::iteration() { } #if !defined(NAVIGATION_2D_DISABLED) || !defined(NAVIGATION_3D_DISABLED) - uint64_t navigation_begin = OS::get_singleton()->get_ticks_usec(); + const uint64_t navigation_begin = OS::get_singleton()->get_ticks_usec(); #ifndef NAVIGATION_2D_DISABLED NavigationServer2D::get_singleton()->physics_process(physics_step * time_scale); @@ -4712,8 +4764,9 @@ bool Main::iteration() { NavigationServer3D::get_singleton()->physics_process(physics_step * time_scale); #endif // NAVIGATION_3D_DISABLED - navigation_process_ticks = MAX(navigation_process_ticks, OS::get_singleton()->get_ticks_usec() - navigation_begin); // keep the largest one for reference - navigation_process_max = MAX(OS::get_singleton()->get_ticks_usec() - navigation_begin, navigation_process_max); + uint64_t tmp_tick = 
OS::get_singleton()->get_ticks_usec(); + navigation_process_ticks = MAX(navigation_process_ticks, tmp_tick - navigation_begin); // keep the largest one for reference + navigation_process_max = MAX(tmp_tick - navigation_begin, navigation_process_max); message_queue->flush(); #endif // !defined(NAVIGATION_2D_DISABLED) || !defined(NAVIGATION_3D_DISABLED) @@ -4732,8 +4785,9 @@ bool Main::iteration() { OS::get_singleton()->get_main_loop()->iteration_end(); - physics_process_ticks = MAX(physics_process_ticks, OS::get_singleton()->get_ticks_usec() - physics_begin); // keep the largest one for reference - physics_process_max = MAX(OS::get_singleton()->get_ticks_usec() - physics_begin, physics_process_max); + const uint64_t physics_end_tick = OS::get_singleton()->get_ticks_usec(); // Own local: tmp_tick only exists when a navigation module is compiled in. + physics_process_ticks = MAX(physics_process_ticks, physics_end_tick - physics_begin); // keep the largest one for reference + physics_process_max = MAX(physics_end_tick - physics_begin, physics_process_max); Engine::get_singleton()->_in_physics = false; } @@ -4758,6 +4812,8 @@ bool Main::iteration() { RenderingServer::get_singleton()->sync(); //sync if still drawing from previous frames. 
+ const uint64_t ticks_before_draw = OS::get_singleton()->get_ticks_usec(); + const bool has_pending_resources_for_processing = RD::get_singleton() && RD::get_singleton()->has_pending_resources_for_processing(); bool wants_present = (DisplayServer::get_singleton()->can_any_window_draw() || DisplayServer::get_singleton()->has_additional_outputs()) && @@ -4777,9 +4833,11 @@ bool Main::iteration() { } } - process_ticks = OS::get_singleton()->get_ticks_usec() - process_begin; + uint64_t tmp_tick = OS::get_singleton()->get_ticks_usec(); + const uint64_t ticks_after_draw = tmp_tick; + process_ticks = tmp_tick - process_begin; process_max = MAX(process_ticks, process_max); - uint64_t frame_time = OS::get_singleton()->get_ticks_usec() - ticks; + uint64_t frame_time = tmp_tick - ticks; for (int i = 0; i < ScriptServer::get_language_count(); i++) { ScriptServer::get_language(i)->frame(); @@ -4820,6 +4878,10 @@ bool Main::iteration() { frames = 0; } + tmp_tick = OS::get_singleton()->get_ticks_usec(); + const uint64_t cpu_time = (tmp_tick - ticks_after_draw) + (ticks_before_draw - ticks) + RenderingServer::draw_cpu_time; + RenderingServer::get_singleton()->notify_cpu_gpu_sync_timings(cpu_time, RenderingServer::draw_gpu_time); + iterating--; if (movie_writer) { diff --git a/main/performance.cpp b/main/performance.cpp index f679d7dcdba5..190ee56ec725 100644 --- a/main/performance.cpp +++ b/main/performance.cpp @@ -129,6 +129,12 @@ void Performance::_bind_methods() { BIND_ENUM_CONSTANT(NAVIGATION_3D_EDGE_FREE_COUNT); BIND_ENUM_CONSTANT(NAVIGATION_3D_OBSTACLE_COUNT); #endif // NAVIGATION_3D_DISABLED + BIND_ENUM_CONSTANT(FRAME_PACING_TOTAL_TIME); + BIND_ENUM_CONSTANT(FRAME_PACING_CPU_TIME); + BIND_ENUM_CONSTANT(FRAME_PACING_GPU_TIME); + BIND_ENUM_CONSTANT(FRAME_PACING_EVALUATED_SYNC_MODE); + BIND_ENUM_CONSTANT(FRAME_PACING_ACTUAL_SYNC_MODE); + BIND_ENUM_CONSTANT(FRAME_PACING_MISSED_HARD_TARGET); BIND_ENUM_CONSTANT(MONITOR_MAX); } @@ -141,9 +147,27 @@ int 
Performance::_get_node_count() const { return sml->get_node_count(); } +#if defined(NAVIGATION_2D_DISABLED) && defined(NAVIGATION_3D_DISABLED) +#define NAVIGATION_PREFIX "Unavailable" +#else +#define NAVIGATION_PREFIX "" +#endif + +#ifdef NAVIGATION_2D_DISABLED +#define NAVIGATION_2D_PREFIX "Unavailable" +#else +#define NAVIGATION_2D_PREFIX "" +#endif + +#ifdef NAVIGATION_3D_DISABLED +#define NAVIGATION_3D_PREFIX "Unavailable" +#else +#define NAVIGATION_3D_PREFIX "" +#endif + String Performance::get_monitor_name(Monitor p_monitor) const { ERR_FAIL_INDEX_V(p_monitor, MONITOR_MAX, String()); - static const char *names[MONITOR_MAX] = { + static const char *names[] = { PNAME("time/fps"), PNAME("time/process"), PNAME("time/physics_process"), @@ -168,47 +192,47 @@ String Performance::get_monitor_name(Monitor p_monitor) const { PNAME("physics_3d/collision_pairs"), PNAME("physics_3d/islands"), PNAME("audio/driver/output_latency"), -#if !defined(NAVIGATION_2D_DISABLED) || !defined(NAVIGATION_3D_DISABLED) - PNAME("navigation/active_maps"), - PNAME("navigation/regions"), - PNAME("navigation/agents"), - PNAME("navigation/links"), - PNAME("navigation/polygons"), - PNAME("navigation/edges"), - PNAME("navigation/edges_merged"), - PNAME("navigation/edges_connected"), - PNAME("navigation/edges_free"), - PNAME("navigation/obstacles"), -#endif // !defined(NAVIGATION_2D_DISABLED) || !defined(NAVIGATION_3D_DISABLED) + PNAME(NAVIGATION_PREFIX "navigation/active_maps"), + PNAME(NAVIGATION_PREFIX "navigation/regions"), + PNAME(NAVIGATION_PREFIX "navigation/agents"), + PNAME(NAVIGATION_PREFIX "navigation/links"), + PNAME(NAVIGATION_PREFIX "navigation/polygons"), + PNAME(NAVIGATION_PREFIX "navigation/edges"), + PNAME(NAVIGATION_PREFIX "navigation/edges_merged"), + PNAME(NAVIGATION_PREFIX "navigation/edges_connected"), + PNAME(NAVIGATION_PREFIX "navigation/edges_free"), + PNAME(NAVIGATION_PREFIX "navigation/obstacles"), PNAME("pipeline/compilations_canvas"), 
PNAME("pipeline/compilations_mesh"), PNAME("pipeline/compilations_surface"), PNAME("pipeline/compilations_draw"), PNAME("pipeline/compilations_specialization"), -#ifndef NAVIGATION_2D_DISABLED - PNAME("navigation_2d/active_maps"), - PNAME("navigation_2d/regions"), - PNAME("navigation_2d/agents"), - PNAME("navigation_2d/links"), - PNAME("navigation_2d/polygons"), - PNAME("navigation_2d/edges"), - PNAME("navigation_2d/edges_merged"), - PNAME("navigation_2d/edges_connected"), - PNAME("navigation_2d/edges_free"), - PNAME("navigation_2d/obstacles"), -#endif // NAVIGATION_2D_DISABLED -#ifndef NAVIGATION_3D_DISABLED - PNAME("navigation_3d/active_maps"), - PNAME("navigation_3d/regions"), - PNAME("navigation_3d/agents"), - PNAME("navigation_3d/links"), - PNAME("navigation_3d/polygons"), - PNAME("navigation_3d/edges"), - PNAME("navigation_3d/edges_merged"), - PNAME("navigation_3d/edges_connected"), - PNAME("navigation_3d/edges_free"), - PNAME("navigation_3d/obstacles"), -#endif // NAVIGATION_3D_DISABLED + PNAME(NAVIGATION_2D_PREFIX "navigation_2d/active_maps"), + PNAME(NAVIGATION_2D_PREFIX "navigation_2d/regions"), + PNAME(NAVIGATION_2D_PREFIX "navigation_2d/agents"), + PNAME(NAVIGATION_2D_PREFIX "navigation_2d/links"), + PNAME(NAVIGATION_2D_PREFIX "navigation_2d/polygons"), + PNAME(NAVIGATION_2D_PREFIX "navigation_2d/edges"), + PNAME(NAVIGATION_2D_PREFIX "navigation_2d/edges_merged"), + PNAME(NAVIGATION_2D_PREFIX "navigation_2d/edges_connected"), + PNAME(NAVIGATION_2D_PREFIX "navigation_2d/edges_free"), + PNAME(NAVIGATION_2D_PREFIX "navigation_2d/obstacles"), + PNAME(NAVIGATION_3D_PREFIX "navigation_3d/active_maps"), + PNAME(NAVIGATION_3D_PREFIX "navigation_3d/regions"), + PNAME(NAVIGATION_3D_PREFIX "navigation_3d/agents"), + PNAME(NAVIGATION_3D_PREFIX "navigation_3d/links"), + PNAME(NAVIGATION_3D_PREFIX "navigation_3d/polygons"), + PNAME(NAVIGATION_3D_PREFIX "navigation_3d/edges"), + PNAME(NAVIGATION_3D_PREFIX "navigation_3d/edges_merged"), + PNAME(NAVIGATION_3D_PREFIX 
"navigation_3d/edges_connected"), + PNAME(NAVIGATION_3D_PREFIX "navigation_3d/edges_free"), + PNAME(NAVIGATION_3D_PREFIX "navigation_3d/obstacles"), + PNAME("frame_pacing/total_time"), + PNAME("frame_pacing/cpu_time"), + PNAME("frame_pacing/gpu_time"), + PNAME("frame_pacing/evaluated_sync_mode"), + PNAME("frame_pacing/actual_sync_mode"), + PNAME("frame_pacing/missed_hard_target"), }; static_assert(std::size(names) == MONITOR_MAX); @@ -433,6 +457,19 @@ double Performance::get_monitor(Monitor p_monitor) const { return NavigationServer3D::get_singleton()->get_process_info(NavigationServer3D::INFO_OBSTACLE_COUNT); #endif // NAVIGATION_3D_DISABLED + case FRAME_PACING_TOTAL_TIME: + return double(RenderingServer::get_singleton()->get_last_cpu_time() + RenderingServer::get_singleton()->get_last_gpu_time()) / (1000.0 * 1000.0); + case FRAME_PACING_CPU_TIME: + return double(RenderingServer::get_singleton()->get_last_cpu_time()) / (1000.0 * 1000.0); + case FRAME_PACING_GPU_TIME: + return double(RenderingServer::get_singleton()->get_last_gpu_time()) / (1000.0 * 1000.0); + case FRAME_PACING_EVALUATED_SYNC_MODE: + return double(RenderingServer::get_singleton()->get_evaluated_cpu_gpu_sync_mode()); + case FRAME_PACING_ACTUAL_SYNC_MODE: + return double(RenderingServer::get_singleton()->get_actual_cpu_gpu_sync_mode()); + case FRAME_PACING_MISSED_HARD_TARGET: + return double(RenderingServer::get_singleton()->get_missed_hard_target()); + default: { } } @@ -443,7 +480,7 @@ double Performance::get_monitor(Monitor p_monitor) const { Performance::MonitorType Performance::get_monitor_type(Monitor p_monitor) const { ERR_FAIL_INDEX_V(p_monitor, MONITOR_MAX, MONITOR_TYPE_QUANTITY); // ugly - static const MonitorType types[MONITOR_MAX] = { + static const MonitorType types[] = { MONITOR_TYPE_QUANTITY, MONITOR_TYPE_TIME, MONITOR_TYPE_TIME, @@ -503,6 +540,12 @@ Performance::MonitorType Performance::get_monitor_type(Monitor p_monitor) const MONITOR_TYPE_QUANTITY, MONITOR_TYPE_QUANTITY, 
MONITOR_TYPE_QUANTITY, + MONITOR_TYPE_TIME, + MONITOR_TYPE_TIME, + MONITOR_TYPE_TIME, + MONITOR_TYPE_QUANTITY, + MONITOR_TYPE_QUANTITY, + MONITOR_TYPE_QUANTITY, }; static_assert((sizeof(types) / sizeof(MonitorType)) == MONITOR_MAX); diff --git a/main/performance.h b/main/performance.h index 0fdac285cd7d..a325c50d74f8 100644 --- a/main/performance.h +++ b/main/performance.h @@ -125,6 +125,12 @@ class Performance : public Object { NAVIGATION_3D_EDGE_CONNECTION_COUNT, NAVIGATION_3D_EDGE_FREE_COUNT, NAVIGATION_3D_OBSTACLE_COUNT, + FRAME_PACING_TOTAL_TIME, + FRAME_PACING_CPU_TIME, + FRAME_PACING_GPU_TIME, + FRAME_PACING_EVALUATED_SYNC_MODE, + FRAME_PACING_ACTUAL_SYNC_MODE, + FRAME_PACING_MISSED_HARD_TARGET, MONITOR_MAX }; diff --git a/misc/dist/shell/_godot.zsh-completion b/misc/dist/shell/_godot.zsh-completion index f65cf3787050..79fe139db855 100644 --- a/misc/dist/shell/_godot.zsh-completion +++ b/misc/dist/shell/_godot.zsh-completion @@ -48,6 +48,7 @@ _arguments \ "--rendering-method[set the renderer]:renderer name:((forward_plus\:'Desktop renderer' mobile\:'Desktop and mobile renderer' gl_compatibility\:'Desktop, mobile and web renderer'))" \ "--rendering-driver[set the rendering driver]:rendering driver name:((vulkan\:'Vulkan renderer' opengl3\:'OpenGL ES 3.0 renderer' dummy\:'Dummy renderer'))" \ "--gpu-index[use a specific GPU (run with --verbose to get available device list)]:device index" \ + "--latency-mode[set the latency mode]:latency mode:(low_extreme low medium high_throughput)" \ '--text-driver[set the text driver]:text driver name' \ '--tablet-driver[set the pen tablet input driver]:tablet driver name' \ '--headless[enable headless mode (--display-driver headless --audio-driver Dummy), useful for servers and with --script]' \ diff --git a/misc/dist/shell/godot.bash-completion b/misc/dist/shell/godot.bash-completion index 63efa95c10d3..3fd1aefb72fc 100644 --- a/misc/dist/shell/godot.bash-completion +++ b/misc/dist/shell/godot.bash-completion @@ -51,6 
+51,7 @@ _complete_godot_options() { --rendering-method --rendering-driver --gpu-index +--latency-mode --text-driver --tablet-driver --headless @@ -129,6 +130,10 @@ _complete_godot_bash() { local IFS=$' \n\t' # shellcheck disable=SC2207 COMPREPLY=($(compgen -W "vulkan opengl3 dummy" -- "$cur")) + elif [[ $prev == "--latency-mode" ]]; then + local IFS=$' \n\t' + # shellcheck disable=SC2207 + COMPREPLY=($(compgen -W "low_extreme low medium high_throughput" -- "$cur")) elif [[ $prev == "--xr-mode" ]]; then local IFS=$' \n\t' # shellcheck disable=SC2207 diff --git a/misc/dist/shell/godot.fish b/misc/dist/shell/godot.fish index 3f0675fcb2bc..f8059111df60 100644 --- a/misc/dist/shell/godot.fish +++ b/misc/dist/shell/godot.fish @@ -37,6 +37,14 @@ function godot_rendering_driver_args echo -e "dummy\tDummy renderer" end +function godot_latency_mode_args + # Use a function instead of a fixed string to customize the argument descriptions. + echo -e "low_extreme\tExtremely low latency" + echo -e "low\tLow latency (default)" + echo -e "medium\tMedium latency" + echo -e "high_throughput\tHigh throughput" +end + # Erase existing completions for Godot. complete -c godot -e @@ -64,6 +72,7 @@ complete -c godot -l display-driver -d "Set the display driver" -x complete -c godot -l rendering-method -d "Set the renderer" -x -a "(godot_rendering_method_args)" complete -c godot -l rendering-driver -d "Set the rendering driver" -x -a "(godot_rendering_driver_args)" complete -c godot -l gpu-index -d "Use a specific GPU (run with --verbose to get available device list)" -x +complete -c godot -l latency-mode -d "Set the latency mode" -x -a "(godot_latency_mode_args)" complete -c godot -l text-driver -d "Set the text driver" -x complete -c godot -l tablet-driver -d "Set the pen tablet input driver" -x complete -c godot -l headless -d "Enable headless mode (--display-driver headless --audio-driver Dummy). 
Useful for servers and with --script" diff --git a/platform/windows/display_server_windows.cpp b/platform/windows/display_server_windows.cpp index 026ce2f0927a..2e1dde63cec9 100644 --- a/platform/windows/display_server_windows.cpp +++ b/platform/windows/display_server_windows.cpp @@ -2760,6 +2760,78 @@ DisplayServerWindows::WindowID DisplayServerWindows::get_focused_window() const return last_focused_window; } +static BOOL is_window_cloaked(HWND hwnd) { + BOOL is_cloaked = FALSE; + HRESULT hr = DwmGetWindowAttribute(hwnd, DWMWA_CLOAKED, &is_cloaked, sizeof(is_cloaked)); + ERR_FAIL_COND_V_MSG(!SUCCEEDED(hr), FALSE, "DwmGetWindowAttribute DWMWA_CLOAKED failed!"); + return is_cloaked; +} + +static BOOL is_rect_fully_inside(const RECT &rect1, const RECT &rect2) { + return rect1.left >= rect2.left && + rect1.right <= rect2.right && + rect1.top >= rect2.top && + rect1.bottom <= rect2.bottom; +} + +struct ObscureTestEnumData { + DWORD process_id; + RECT rect; + bool fully_occluded; +}; + +static BOOL CALLBACK _enum_proc_is_window_fully_occluded(HWND p_hwnd_enum, LPARAM p_lparam) { + ObscureTestEnumData &data = *(ObscureTestEnumData *)p_lparam; + + // Get the current window's rectangle. + RECT other_rect; + GetWindowRect(p_hwnd_enum, &other_rect); + + // Check if we're inside it. + if (is_rect_fully_inside(data.rect, other_rect)) { + DWORD process_id = 0u; + GetWindowThreadProcessId(p_hwnd_enum, &process_id); + + // Ignore windows that belong to us. + if (data.process_id != process_id) { + data.fully_occluded = true; + return FALSE; // Stop enumeration. + } + } + + return TRUE; // Continue enumerating. +} + +bool DisplayServerWindows::_window_presentation_occluded(WindowID p_window) { + _THREAD_SAFE_METHOD_ + + ERR_FAIL_COND_V(!windows.has(p_window), false); + WindowData &wd = windows[p_window]; + + if (IsIconic(wd.hWnd)) { + return true; // Minimized. + } + + if (!IsWindowVisible(wd.hWnd)) { + return true; // Not visible (according to very old pre-Vista rules). 
+ } + + if (is_window_cloaked(wd.hWnd)) { + return true; // Not visible (according to newer Windows 7+ rules). + } + + RECT target_rect{}; + DWORD process_id = 0u; + GetWindowRect(wd.hWnd, &target_rect); + GetWindowThreadProcessId(wd.hWnd, &process_id); + + ObscureTestEnumData data{ process_id, target_rect, false }; + + EnumWindows(_enum_proc_is_window_fully_occluded, (LPARAM)&data); + + return data.fully_occluded; +} + bool DisplayServerWindows::window_can_draw(WindowID p_window) const { _THREAD_SAFE_METHOD_ diff --git a/platform/windows/display_server_windows.h b/platform/windows/display_server_windows.h index 94f7dcc6bf74..8e0fb831ab5b 100644 --- a/platform/windows/display_server_windows.h +++ b/platform/windows/display_server_windows.h @@ -820,6 +820,7 @@ class DisplayServerWindows : public DisplayServer { virtual WindowID get_focused_window() const override; + virtual bool _window_presentation_occluded(WindowID p_window = MAIN_WINDOW_ID) override; virtual bool window_can_draw(WindowID p_window = MAIN_WINDOW_ID) const override; virtual bool can_any_window_draw() const override; diff --git a/scene/main/scene_tree.cpp b/scene/main/scene_tree.cpp index ad62b72e54ff..aa17e9af15e4 100644 --- a/scene/main/scene_tree.cpp +++ b/scene/main/scene_tree.cpp @@ -901,6 +901,10 @@ void SceneTree::_notification(int p_notification) { case NOTIFICATION_APPLICATION_FOCUS_OUT: { // Pass these to nodes, since they are mirrored. get_root()->propagate_notification(p_notification); + if (p_notification == NOTIFICATION_APPLICATION_FOCUS_IN) { + // TODO: Is this the right place? 
+ RenderingServer::get_singleton()->update_cached_refresh_rate(); + } } break; } } diff --git a/servers/display_server.h b/servers/display_server.h index a6a4d4eb85e2..c05ddba0fc8f 100644 --- a/servers/display_server.h +++ b/servers/display_server.h @@ -512,6 +512,7 @@ class DisplayServer : public Object { virtual void window_set_window_buttons_offset(const Vector2i &p_offset, WindowID p_window = MAIN_WINDOW_ID) {} virtual Vector3i window_get_safe_title_margins(WindowID p_window = MAIN_WINDOW_ID) const { return Vector3i(); } + virtual bool _window_presentation_occluded(WindowID p_window = MAIN_WINDOW_ID) { return false; } virtual bool window_can_draw(WindowID p_window = MAIN_WINDOW_ID) const = 0; virtual bool can_any_window_draw() const = 0; diff --git a/servers/rendering/dummy/storage/utilities.h b/servers/rendering/dummy/storage/utilities.h index d8862cdba2ea..0bc0c63297b4 100644 --- a/servers/rendering/dummy/storage/utilities.h +++ b/servers/rendering/dummy/storage/utilities.h @@ -69,6 +69,7 @@ class Utilities : public RendererUtilities { virtual void capture_timestamps_begin() override {} virtual void capture_timestamp(const String &p_name) override {} + virtual void capture_timestamps_sync_mode_auto_end() override {} virtual uint32_t get_captured_timestamps_count() const override { return 0; } virtual uint64_t get_captured_timestamps_frame() const override { return 0; } virtual uint64_t get_captured_timestamp_gpu_time(uint32_t p_index) const override { return 0; } diff --git a/servers/rendering/renderer_rd/storage_rd/utilities.cpp b/servers/rendering/renderer_rd/storage_rd/utilities.cpp index 9e0c9476500e..e8b266b7584a 100644 --- a/servers/rendering/renderer_rd/storage_rd/utilities.cpp +++ b/servers/rendering/renderer_rd/storage_rd/utilities.cpp @@ -226,6 +226,12 @@ void Utilities::capture_timestamp(const String &p_name) { RD::get_singleton()->capture_timestamp(p_name); } +void Utilities::capture_timestamps_sync_mode_auto_end() { + if 
(RenderingDevice::should_capture_frame_pacing_timings()) { + RD::get_singleton()->capture_timestamp("_Sync Mode Auto"); + } +} + uint32_t Utilities::get_captured_timestamps_count() const { return RD::get_singleton()->get_captured_timestamps_count(); } diff --git a/servers/rendering/renderer_rd/storage_rd/utilities.h b/servers/rendering/renderer_rd/storage_rd/utilities.h index dc21b2f731bb..e751e10fafd1 100644 --- a/servers/rendering/renderer_rd/storage_rd/utilities.h +++ b/servers/rendering/renderer_rd/storage_rd/utilities.h @@ -93,6 +93,7 @@ class Utilities : public RendererUtilities { virtual void capture_timestamps_begin() override; virtual void capture_timestamp(const String &p_name) override; + virtual void capture_timestamps_sync_mode_auto_end() override; virtual uint32_t get_captured_timestamps_count() const override; virtual uint64_t get_captured_timestamps_frame() const override; virtual uint64_t get_captured_timestamp_gpu_time(uint32_t p_index) const override; diff --git a/servers/rendering/renderer_viewport.cpp b/servers/rendering/renderer_viewport.cpp index d84dced9e1df..325784efe97e 100644 --- a/servers/rendering/renderer_viewport.cpp +++ b/servers/rendering/renderer_viewport.cpp @@ -815,6 +815,8 @@ void RendererViewport::draw_viewports(bool p_swap_buffers) { int objects_drawn = 0; int draw_calls_used = 0; + bool sync_mode_captured = false; + for (int i = 0; i < sorted_active_viewports.size(); i++) { Viewport *vp = sorted_active_viewports[i]; @@ -859,6 +861,11 @@ void RendererViewport::draw_viewports(bool p_swap_buffers) { if (vp->viewport_to_screen != DisplayServer::INVALID_WINDOW_ID) { if (RSG::rasterizer->is_opengl()) { if (blits.size() > 0) { + if (!sync_mode_captured) { + RSG::utilities->capture_timestamps_sync_mode_auto_end(); + sync_mode_captured = true; + } + RSG::rasterizer->blit_render_targets_to_screen(vp->viewport_to_screen, blits.ptr(), blits.size()); RSG::rasterizer->gl_end_frame(p_swap_buffers); } @@ -893,6 +900,11 @@ void 
RendererViewport::draw_viewports(bool p_swap_buffers) { } if (RSG::rasterizer->is_opengl()) { + if (!sync_mode_captured) { + RSG::utilities->capture_timestamps_sync_mode_auto_end(); + sync_mode_captured = true; + } + RSG::rasterizer->blit_render_targets_to_screen(vp->viewport_to_screen, &blit, 1); RSG::rasterizer->gl_end_frame(p_swap_buffers); } else { @@ -928,6 +940,10 @@ void RendererViewport::draw_viewports(bool p_swap_buffers) { RENDER_TIMESTAMP("< Render Viewports"); + if (!sync_mode_captured) { + RSG::utilities->capture_timestamps_sync_mode_auto_end(); + } + if (p_swap_buffers && !blit_to_screen_list.is_empty()) { for (const KeyValue> &E : blit_to_screen_list) { RSG::rasterizer->blit_render_targets_to_screen(E.key, E.value.ptr(), E.value.size()); diff --git a/servers/rendering/rendering_device.cpp b/servers/rendering/rendering_device.cpp index eb6a32633809..b0daa693c048 100644 --- a/servers/rendering/rendering_device.cpp +++ b/servers/rendering/rendering_device.cpp @@ -6363,6 +6363,80 @@ void RenderingDevice::_free_pending_resources(int p_frame) { } } +void RenderingDevice::set_latency_mode(LatencyMode p_latency_mode) { + latency_mode = p_latency_mode; +} + +RenderingDevice::LatencyMode RenderingDevice::get_latency_mode() const { + return latency_mode; +} + +void RenderingDevice::_wait_for_present(bool p_sequential_sync) { + const PacingMethod pacing_method = get_current_pacing_method(p_sequential_sync); + + if (pacing_method == PACING_METHOD_WAITABLE_SWAPCHAIN) { + HashMap::ConstIterator it = screen_swap_chains.find(DisplayServer::MAIN_WINDOW_ID); + if (it != screen_swap_chains.end()) { + uint32_t max_frame_delay = frames.size(); + switch (latency_mode) { + case LATENCY_MODE_LOW_EXTREME: + max_frame_delay = 0u; + break; + case LATENCY_MODE_LOW: + max_frame_delay = 1u; + break; + case LATENCY_MODE_MEDIUM: + max_frame_delay = std::min(max_frame_delay, 2u); + break; + case LATENCY_MODE_HIGH_THROUGHPUT: + DEV_ASSERT(false && "This path should be unreachable!"); 
+ max_frame_delay = 16u; + break; + } + driver->swap_chain_wait_for_present(it->key, it->value, max_frame_delay); + } + } else if (pacing_method == PACING_METHOD_SEQUENTIAL_SYNC) { + _stall_for_previous_frames(); + } +} + +void RenderingDevice::_restrict_available_pacing_methods(uint8_t mask) { + available_pacing_methods = mask; + if (driver) { + available_pacing_methods = available_pacing_methods.get_shared(driver->get_available_pacing_methods()); + } +} + +RDD::PacingMethod RenderingDevice::get_current_pacing_method(bool p_sequential_sync) const { + if (available_pacing_methods.has_flag(PACING_METHOD_ANDROID_SWAPPY)) { + return PACING_METHOD_ANDROID_SWAPPY; + } + if (available_pacing_methods.has_flag(PACING_METHOD_WAITABLE_SWAPCHAIN) && latency_mode != LATENCY_MODE_HIGH_THROUGHPUT) { + return PACING_METHOD_WAITABLE_SWAPCHAIN; + } + if (available_pacing_methods.has_flag(PACING_METHOD_SEQUENTIAL_SYNC) && p_sequential_sync && latency_mode <= LATENCY_MODE_LOW) { + return PACING_METHOD_SEQUENTIAL_SYNC; + } + return PACING_METHOD_NONE; +} + +bool RenderingDevice::should_capture_frame_pacing_timings() { + RenderingDevice *device = RenderingDevice::get_singleton(); + if (device) { + return device->_should_capture_frame_pacing_timings(); + } + return false; +} + +bool RenderingDevice::_should_capture_frame_pacing_timings() const { +#ifdef DEBUG_ENABLED + // Debug builds always measure this value for more performance metrics. 
+ return true; +#else + return get_current_pacing_method(true) == RDD::PACING_METHOD_SEQUENTIAL_SYNC; +#endif +} + uint32_t RenderingDevice::get_frame_delay() const { return frames.size(); } @@ -6702,6 +6776,8 @@ Error RenderingDevice::initialize(RenderingContextDriver *p_context, DisplayServ Engine::get_singleton()->print_header(vformat("%s %s - %s - Using Device #%d: %s - %s", get_device_api_name(), get_device_api_version(), rendering_method, device_index, _get_device_vendor_name(device), device.name)); } + latency_mode = LatencyMode(int(GLOBAL_GET("rendering/rendering_device/vsync/latency_mode")) + 1); // +1 as in EditorNode and Main: the setting cannot select LATENCY_MODE_LOW_EXTREME, so its index 0 is LATENCY_MODE_LOW. + // Pick the main queue family. It is worth noting we explicitly do not request the transfer bit, as apparently the specification defines // that the existence of either the graphics or compute bit implies that the queue can also do transfer operations, but it is optional // to indicate whether it supports them or not with the dedicated transfer bit if either is set. @@ -6864,6 +6940,35 @@ Error RenderingDevice::initialize(RenderingContextDriver *p_context, DisplayServ } } + const BitField supported_pacing_methods = driver->get_available_pacing_methods(); + print_verbose(vformat("Supported Pacing Methods (mask 0x%02x):", (uint64_t)supported_pacing_methods)); + if (supported_pacing_methods.has_flag(PACING_METHOD_SEQUENTIAL_SYNC)) { + print_verbose("\t SEQUENTIAL_SYNC"); + } + if (supported_pacing_methods.has_flag(PACING_METHOD_WAITABLE_SWAPCHAIN)) { + print_verbose("\t WAITABLE_SWAPCHAIN"); + } + if (supported_pacing_methods.has_flag(PACING_METHOD_ANDROID_SWAPPY)) { + print_verbose("\t ANDROID_SWAPPY"); + } + + available_pacing_methods = available_pacing_methods.get_shared(supported_pacing_methods); + + switch (get_current_pacing_method(true)) { + case RenderingDeviceCommons::PACING_METHOD_NONE: + print_verbose("Current Pacing Method (may change later): NONE"); + break; + case RenderingDeviceCommons::PACING_METHOD_SEQUENTIAL_SYNC: + print_verbose("Current Pacing Method 
(may change later): SEQUENTIAL_SYNC"); + break; + case RenderingDeviceCommons::PACING_METHOD_WAITABLE_SWAPCHAIN: + print_verbose("Current Pacing Method (may change later): WAITABLE_SWAPCHAIN"); + break; + case RenderingDeviceCommons::PACING_METHOD_ANDROID_SWAPPY: + print_verbose("Current Pacing Method (may change later): ANDROID_SWAPPY"); + break; + } + // Find the best method available for VRS on the current hardware. _vrs_detect_method(); @@ -7437,6 +7542,8 @@ void RenderingDevice::_bind_methods() { ClassDB::bind_method(D_METHOD("has_feature", "feature"), &RenderingDevice::has_feature); ClassDB::bind_method(D_METHOD("limit_get", "limit"), &RenderingDevice::limit_get); + ClassDB::bind_method(D_METHOD("set_latency_mode", "p_latency_mode"), &RenderingDevice::set_latency_mode); + ClassDB::bind_method(D_METHOD("get_latency_mode"), &RenderingDevice::get_latency_mode); ClassDB::bind_method(D_METHOD("get_frame_delay"), &RenderingDevice::get_frame_delay); ClassDB::bind_method(D_METHOD("submit"), &RenderingDevice::submit); ClassDB::bind_method(D_METHOD("sync"), &RenderingDevice::sync); @@ -8063,6 +8170,11 @@ void RenderingDevice::_bind_methods() { BIND_BITFIELD_FLAG(DRAW_IGNORE_STENCIL); BIND_BITFIELD_FLAG(DRAW_CLEAR_ALL); BIND_BITFIELD_FLAG(DRAW_IGNORE_ALL); + + BIND_ENUM_CONSTANT(LATENCY_MODE_LOW_EXTREME); + BIND_ENUM_CONSTANT(LATENCY_MODE_LOW); + BIND_ENUM_CONSTANT(LATENCY_MODE_MEDIUM); + BIND_ENUM_CONSTANT(LATENCY_MODE_HIGH_THROUGHPUT); } void RenderingDevice::make_current() { diff --git a/servers/rendering/rendering_device.h b/servers/rendering/rendering_device.h index 2c936b0100c0..3246f5f95574 100644 --- a/servers/rendering/rendering_device.h +++ b/servers/rendering/rendering_device.h @@ -195,6 +195,18 @@ class RenderingDevice : public RenderingDeviceCommons { Error _buffer_initialize(Buffer *p_buffer, const uint8_t *p_data, size_t p_data_size, uint32_t p_required_align = 32); void update_perf_report(); + +public: + enum LatencyMode { + LATENCY_MODE_LOW_EXTREME, + 
LATENCY_MODE_LOW, + LATENCY_MODE_MEDIUM, + LATENCY_MODE_HIGH_THROUGHPUT, + }; + +private: + LatencyMode latency_mode = LATENCY_MODE_LOW; + // Flag for batching descriptor sets. bool descriptor_set_batching = true; // When true, the final draw call that copies our offscreen result into the Swapchain is put into its @@ -1562,6 +1574,8 @@ class RenderingDevice : public RenderingDeviceCommons { TightLocalVector frames; uint64_t frames_drawn = 0; + BitField available_pacing_methods = 0xFF; + // Whenever logic/physics request a graphics operation (not just deleting a resource) that requires // us to flush all graphics commands, we must set frames_pending_resources_for_processing = frames.size(). // This is important for when the user requested for the logic loop to still be updated while @@ -1626,6 +1640,30 @@ class RenderingDevice : public RenderingDeviceCommons { void swap_buffers(bool p_present); + void set_latency_mode(LatencyMode p_latency_mode); + LatencyMode get_latency_mode() const; + + /// Uses Waitable Swapchains to wait for a swapchain to be released. This reduces latency. + /// + /// If waitable swapchains are not supported, p_sequential_sync controls a fallback method + /// where we stall for previous frames to prevent the CPU from going too far ahead. + /// This fallback improves latency, but not as good as waitable swapchains. + void _wait_for_present(bool p_sequential_sync); + + /// Caller can provide a mask that gets AND'ed against RenderingDeviceDriver::get_available_pacing_methods() + /// This prevents Godot from using certain pacing methods; which is useful for debugging malfunctioning + /// pacing; or seeing how pacing would behave in other systems that do not support better methods. + /// See PacingMethod for the relevant bit combinations. + void _restrict_available_pacing_methods(uint8_t mask); + + /// Returns the current PacingMethod in use. 
Note that PACING_METHOD_SEQUENTIAL_SYNC requires + /// higher level integration, and that higher level may decide to not use SEQUENTIAL_SYNC. + PacingMethod get_current_pacing_method(bool p_sequential_sync) const; + + bool _should_capture_frame_pacing_timings() const; + + static bool should_capture_frame_pacing_timings(); + uint32_t get_frame_delay() const; void submit(); @@ -1747,6 +1785,7 @@ VARIANT_ENUM_CAST(RenderingDevice::MemoryType) VARIANT_ENUM_CAST(RenderingDevice::Features) VARIANT_ENUM_CAST(RenderingDevice::BreadcrumbMarker) VARIANT_BITFIELD_CAST(RenderingDevice::DrawFlags); +VARIANT_ENUM_CAST(RenderingDevice::LatencyMode) #ifndef DISABLE_DEPRECATED VARIANT_BITFIELD_CAST(RenderingDevice::BarrierMask); diff --git a/servers/rendering/rendering_device_commons.h b/servers/rendering/rendering_device_commons.h index 63f2b8792b4f..49f5bd2bca75 100644 --- a/servers/rendering/rendering_device_commons.h +++ b/servers/rendering/rendering_device_commons.h @@ -910,6 +910,31 @@ class RenderingDeviceCommons : public Object { SUBGROUP_QUAD_BIT = 128, }; + enum PacingMethod { + // There are no methods reducing latency and/or reducing jitter and stutter. + PACING_METHOD_NONE = 0u, + + // Forcing the CPU to wait for the GPU is used to reduce latency if GPU & CPU are fast enough. + // It's an "OK" solution. It requires higher level integration (i.e. RenderingServer). + PACING_METHOD_SEQUENTIAL_SYNC = 1u << 0u, + + // Waitable Swapchain extension is used to reduce latency and control frame pacing (very good + // solution). Requires driver support. + // + // Normally Godot allows the CPU to get frame_queue_size frames ahead of the GPU. Assuming + // frame_queue_size = 2, Godot normally starts working on frame N+2 on the CPU when the GPU is + // done working on frame N. Waitable swapchains make Godot start frame N+2 when + // frame N is *done presenting*. + // + // This gap between when the GPU finishes its work and when that work actually appears on screen + // can increase latency.
Thus waitable swapchains can reduce latency at the cost of some + // framerate (framerate may decrease because the CPU has to wait more time doing nothing). + PACING_METHOD_WAITABLE_SWAPCHAIN = 1u << 1u, + + // Android Swappy is being used to control frame pacing. It's a 3rd Party solution. + PACING_METHOD_ANDROID_SWAPPY = 1u << 2u, + }; + //////////////////////////////////////////// // PROTECTED STUFF // Not exposed by RenderingDevice, but shared diff --git a/servers/rendering/rendering_device_driver.h b/servers/rendering/rendering_device_driver.h index 0ac91df3884f..25c242afe66c 100644 --- a/servers/rendering/rendering_device_driver.h +++ b/servers/rendering/rendering_device_driver.h @@ -484,6 +484,10 @@ class RenderingDeviceDriver : public RenderingDeviceCommons { // Wait until all rendering associated to the swap chain is finished before deleting it. virtual void swap_chain_free(SwapChainID p_swap_chain) = 0; + virtual Error swap_chain_wait_for_present(DisplayServer::WindowID p_window, SwapChainID p_swap_chain, uint32_t p_max_frame_delay) = 0; + + virtual BitField get_available_pacing_methods() const = 0; + /*********************/ /**** FRAMEBUFFER ****/ /*********************/ diff --git a/servers/rendering/rendering_server_default.cpp b/servers/rendering/rendering_server_default.cpp index 3a381401a231..334f496a1525 100644 --- a/servers/rendering/rendering_server_default.cpp +++ b/servers/rendering/rendering_server_default.cpp @@ -70,7 +70,7 @@ void RenderingServerDefault::_draw(bool p_swap_buffers, double frame_step) { TIMESTAMP_BEGIN() - uint64_t time_usec = OS::get_singleton()->get_ticks_usec(); + const uint64_t time_usec = OS::get_singleton()->get_ticks_usec(); RENDER_TIMESTAMP("Prepare Render Frame"); RSG::scene->update(); //update scenes stuff before updating instances @@ -82,10 +82,22 @@ void RenderingServerDefault::_draw(bool p_swap_buffers, double frame_step) { RSG::scene->render_probes(); + // IMPORTANT BEGIN: In order for 
RenderingServer::CPUGPUSyncMode::CPU_GPU_SYNC_AUTO to work + // correctly, we need timing measures to be as accurate as possible. We need to measure all CPU & GPU + // work as if V-Sync were off (which is hard when V-Sync is on). + // GPU drivers have 2 opportunities to stall us: + // 1. In vkAcquireNextImageKHR (where it should be, by spec). This happens at the end of + // RSG::viewport->draw_viewports() (inside blit_render_targets_to_screen). + // 2. In vkQueuePresentKHR (where it often happens. Vulkan spec allows this). This happens in + // RSG::rasterizer->end_frame(). + // + // We're fortunate that both vkAcquireNextImageKHR & vkQueuePresentKHR happen extremely close + // together. But if that ever changes in the future and significant CPU/GPU work is done between + // them, we must split the timers to avoid measuring time spent waiting for V-Sync and include + // that extra time. RSG::viewport->draw_viewports(p_swap_buffers); - RSG::canvas_render->update(); - RSG::rasterizer->end_frame(p_swap_buffers); + // IMPORTANT ENDS. 
#ifndef XR_DISABLED XRServer *xr_server = XRServer::get_singleton(); @@ -98,12 +110,17 @@ void RenderingServerDefault::_draw(bool p_swap_buffers, double frame_step) { RSG::canvas->update_visibility_notifiers(); RSG::scene->update_visibility_notifiers(); + const uint64_t tick_pre_post_draw_steps = OS::get_singleton()->get_ticks_usec(); + if (create_thread) { callable_mp(this, &RenderingServerDefault::_run_post_draw_steps).call_deferred(); } else { _run_post_draw_steps(); } + RenderingServer::draw_cpu_time = 0ul; + RenderingServer::draw_gpu_time = 0ul; + if (RSG::utilities->get_captured_timestamps_count()) { Vector new_profile; if (RSG::utilities->capturing_timestamps) { @@ -122,6 +139,11 @@ void RenderingServerDefault::_draw(bool p_swap_buffers, double frame_step) { RSG::viewport->handle_timestamp(name, time_cpu, time_gpu); } + if (name == "_Sync Mode Auto") { + RenderingServer::draw_cpu_time = (time_cpu - base_cpu) / 1000ul; + RenderingServer::draw_gpu_time = (time_gpu - base_gpu) / 1000ul; + } + if (RSG::utilities->capturing_timestamps) { new_profile.write[i].gpu_msec = double((time_gpu - base_gpu) / 1000) / 1000.0; new_profile.write[i].cpu_msec = double(time_cpu - base_cpu) / 1000.0; @@ -178,6 +200,9 @@ void RenderingServerDefault::_draw(bool p_swap_buffers, double frame_step) { } RSG::utilities->update_memory_info(); + + const uint64_t tmp_tick = OS::get_singleton()->get_ticks_usec(); + RenderingServer::draw_cpu_time += (tmp_tick - tick_pre_post_draw_steps) / 1000ul; } void RenderingServerDefault::_run_post_draw_steps() { diff --git a/servers/rendering/storage/utilities.h b/servers/rendering/storage/utilities.h index 74d1cb66da40..e059a85dd5cb 100644 --- a/servers/rendering/storage/utilities.h +++ b/servers/rendering/storage/utilities.h @@ -148,10 +148,10 @@ class RendererUtilities { bool capturing_timestamps = false; -#define TIMESTAMP_BEGIN() \ - { \ - if (RSG::utilities->capturing_timestamps) \ - RSG::utilities->capture_timestamps_begin(); \ +#define 
TIMESTAMP_BEGIN() \ + { \ + if (RSG::utilities->capturing_timestamps || RenderingDevice::should_capture_frame_pacing_timings()) \ + RSG::utilities->capture_timestamps_begin(); \ } #define RENDER_TIMESTAMP(m_text) \ @@ -162,6 +162,7 @@ class RendererUtilities { virtual void capture_timestamps_begin() = 0; virtual void capture_timestamp(const String &p_name) = 0; + virtual void capture_timestamps_sync_mode_auto_end() = 0; virtual uint32_t get_captured_timestamps_count() const = 0; virtual uint64_t get_captured_timestamps_frame() const = 0; virtual uint64_t get_captured_timestamp_gpu_time(uint32_t p_index) const = 0; diff --git a/servers/rendering_server.cpp b/servers/rendering_server.cpp index 37a9d0947a3e..d48deca7d621 100644 --- a/servers/rendering_server.cpp +++ b/servers/rendering_server.cpp @@ -33,12 +33,16 @@ #include "core/config/project_settings.h" #include "core/variant/typed_array.h" +#include "servers/rendering/rendering_server_globals.h" #include "servers/rendering/shader_language.h" #include "servers/rendering/shader_warnings.h" RenderingServer *RenderingServer::singleton = nullptr; RenderingServer *(*RenderingServer::create_func)() = nullptr; +uint64_t RenderingServer::draw_gpu_time = 0ul; +uint64_t RenderingServer::draw_cpu_time = 0ul; + RenderingServer *RenderingServer::get_singleton() { return singleton; } @@ -3453,6 +3457,11 @@ void RenderingServer::_bind_methods() { /* Misc */ + ClassDB::bind_method(D_METHOD("get_actual_cpu_gpu_sync_mode"), &RenderingServer::get_actual_cpu_gpu_sync_mode); + + BIND_ENUM_CONSTANT(CPU_GPU_SYNC_PARALLEL); + BIND_ENUM_CONSTANT(CPU_GPU_SYNC_SEQUENTIAL); + ClassDB::bind_method(D_METHOD("request_frame_drawn_callback", "callable"), &RenderingServer::request_frame_drawn_callback); ClassDB::bind_method(D_METHOD("has_changed"), &RenderingServer::has_changed); ClassDB::bind_method(D_METHOD("get_rendering_info", "info"), &RenderingServer::get_rendering_info); @@ -3582,6 +3591,108 @@ TypedArray 
RenderingServer::_global_shader_parameter_get_list() cons return gsp; } +void RenderingServer::notify_cpu_gpu_sync_timings(uint64_t cpu_time, uint64_t gpu_time) { + // Clamp total time to prevent anomalous readings from sending the average out of sync + // for a long time until it converges again. Anomalous readings can be something as simple + // as Windows Update, Alt + Tab, etc. + const uint64_t total_time = MIN(cpu_time + gpu_time, 1000ul * 1000ul); + + last_cpu_time = cpu_time; + last_gpu_time = gpu_time; + + // Calculate rolling average & variance. + const int64_t delta = int64_t(total_time - avg_total_time); + variance_total_time = (variance_total_time + uint64_t(delta * delta)) >> 1ul; + avg_total_time = (avg_total_time + total_time) >> 1ul; + + const DisplayServer::VSyncMode vsync_mode = DisplayServer::get_singleton()->window_get_vsync_mode(DisplayServer::MAIN_WINDOW_ID); + if (vsync_mode == DisplayServer::VSYNC_DISABLED || vsync_mode == DisplayServer::VSYNC_MAILBOX || RSG::rasterizer->is_opengl()) { + // We're processing & presenting as fast as possible. Or using OpenGL (GL has a lot of variance, unpredictability + // and each time it switches between SEQ and PARALLEL there's a very visible stutter. Disabled until this is fixed). + same_cpu_sync_mode_count = 0u; + missed_hard_target = 0u; + last_frame_cpu_gpu_sync_mode = CPU_GPU_SYNC_PARALLEL; + actual_cpu_gpu_sync_mode = CPU_GPU_SYNC_PARALLEL; + return; + } + + const uint64_t stdev_total_time = uint64_t(Math::sqrt(double(variance_total_time))); + // We estimate the next frame will take "estimated_time". If that's below the monitor's + // refresh rate, switch to CPU_GPU_SYNC_SEQUENTIAL. Otherwise switch to CPU_GPU_SYNC_PARALLEL. 
+ const uint64_t estimated_time = avg_total_time + stdev_total_time; + + const uint64_t max_mfps = uint64_t(Engine::get_singleton()->get_max_fps() * 1000.0); + uint64_t soft_target = cached_refresh_rate_us; + uint64_t curr_mhz = cached_refresh_rate_millihertz; + if (max_mfps != 0ul) { + soft_target = MAX((1000000ul * 1000ul) / max_mfps, cached_refresh_rate_us); + curr_mhz = MIN(cached_refresh_rate_millihertz, max_mfps); + } + const uint64_t hard_target = soft_target; + soft_target = soft_target / 2ul; + + const bool target_is_between_avg_and_std_dev = + Math::abs(int64_t(soft_target - avg_total_time)) <= int64_t(stdev_total_time); + + // Without threshold_same_mode_count + same_cpu_sync_mode_count + last_frame_cpu_gpu_sync_mode, + // we could end up in an unstable situation where the system performance is right there on the + // edge between parallel & sequential and thus keeps switching modes back and forth. + // That's a lot of stutter. Thus only switch if results have been consistent for + // threshold_same_mode_count frames in a row. + const uint32_t threshold_same_mode_count = target_is_between_avg_and_std_dev ? 1200u : 60u; + + if (actual_cpu_gpu_sync_mode == CPU_GPU_SYNC_PARALLEL) { + // If we don't reset it while in PARALLEL, missed_hard_target may have temporarily grown + // very large and take forever to get below threshold. + missed_hard_target = 0u; + } + + // Missing the hard target is an extreme offense. We only tolerate 2.0% of frames missing a VBLANK. + // If that happens, switch to PARALLEL. + const uint32_t threshold_missed_hard_target = 2000u; + if (estimated_time >= hard_target) { + missed_hard_target += 1000u; + } else { + // If we had a single missed frame, over the course of a second missed_hard_target should be + // 0 again (excluding fixed-point inaccuracies). If we can't attain the target FPS, then + // missed_hard_target will decrease at a slower rate; but we don't care because that + // means we should be in PARALLEL anyway.
+ missed_hard_target -= MIN((1000u * 1000u) / curr_mhz, missed_hard_target); + } + + CPUGPUSyncMode new_cpu_gpu_sync_mode; + if ((estimated_time >= soft_target || missed_hard_target >= threshold_missed_hard_target) && + RenderingDevice::get_singleton()->get_latency_mode() != RD::LATENCY_MODE_LOW_EXTREME) { + new_cpu_gpu_sync_mode = CPU_GPU_SYNC_PARALLEL; + } else { + new_cpu_gpu_sync_mode = CPU_GPU_SYNC_SEQUENTIAL; + } + + if (last_frame_cpu_gpu_sync_mode != new_cpu_gpu_sync_mode) { + same_cpu_sync_mode_count = 0u; + } else { + ++same_cpu_sync_mode_count; + } + last_frame_cpu_gpu_sync_mode = new_cpu_gpu_sync_mode; + if (same_cpu_sync_mode_count >= threshold_same_mode_count) { + actual_cpu_gpu_sync_mode = new_cpu_gpu_sync_mode; + } +} + +void RenderingServer::update_cached_refresh_rate() { + double refresh_rate_hz = double(DisplayServer::get_singleton()->screen_get_refresh_rate()); + cached_refresh_rate_millihertz = uint32_t(MAX(refresh_rate_hz, 0.0) * 1000.0); + if (cached_refresh_rate_millihertz == 0u) { + // By setting cached_refresh_rate_us to 0, 2 things can happen: + // 1. If max FPS is set, MAX(1000000ul / max_fps, cached_refresh_rate_us) will always obey FPS. + // 2. If max FPS isn't set, the target will always be CPU_GPU_SYNC_PARALLEL. + cached_refresh_rate_millihertz = UINT32_MAX; + cached_refresh_rate_us = 0ul; + } else { + cached_refresh_rate_us = uint64_t((1000.0 * 1000.0) / refresh_rate_hz); + } +} + void RenderingServer::init() { // These are overrides, even if they are false Godot will still // import the texture formats that the host platform needs. 
diff --git a/servers/rendering_server.h b/servers/rendering_server.h index c83309275aec..e009292df0fd 100644 --- a/servers/rendering_server.h +++ b/servers/rendering_server.h @@ -58,6 +58,13 @@ class RenderingServer : public Object { GDCLASS(RenderingServer, Object); +public: + enum CPUGPUSyncMode { + CPU_GPU_SYNC_PARALLEL, + CPU_GPU_SYNC_SEQUENTIAL, + }; + +private: static RenderingServer *singleton; int mm_policy = 0; @@ -77,6 +84,21 @@ class RenderingServer : public Object { RID white_texture; RID test_material; + uint64_t variance_total_time = 0ul; + uint64_t avg_total_time = 0ul; + // Calling screen_get_refresh_rate() is expensive enough to appear in profilers. + // Plus it can cause internal mutexes in the OS that cause stutter. So it's cached. + uint64_t cached_refresh_rate_us = (1000ul * 1000ul) / 60ul; // Refresh rate in microseconds. + uint32_t cached_refresh_rate_millihertz = 1000u * 60u; // Refresh rate in millihertz (mHz). + + CPUGPUSyncMode actual_cpu_gpu_sync_mode = CPU_GPU_SYNC_PARALLEL; + CPUGPUSyncMode last_frame_cpu_gpu_sync_mode = CPU_GPU_SYNC_PARALLEL; + uint32_t same_cpu_sync_mode_count = 0u; + uint32_t missed_hard_target = 0u; + + uint64_t last_cpu_time = 0ul; // For performance counters. + uint64_t last_gpu_time = 0ul; // For performance counters. + Error _surface_set_data(Array p_arrays, uint64_t p_format, uint32_t *p_offsets, uint32_t p_vertex_stride, uint32_t p_normal_stride, uint32_t p_attrib_stride, uint32_t p_skin_stride, Vector &r_vertex_array, Vector &r_attrib_array, Vector &r_skin_array, int p_vertex_array_len, Vector &r_index_array, int p_index_array_len, AABB &r_aabb, Vector &r_bone_aabb, Vector4 &r_uv_scale); static RenderingServer *(*create_func)(); @@ -1768,6 +1790,21 @@ class RenderingServer : public Object { virtual void set_physics_interpolation_enabled(bool p_enabled) = 0; + /* SYNCHRONIZATION */ + + // Used by RenderingServer::CPUGPUSyncMode::CPU_GPU_SYNC_AUTO to track partial timings. 
+ // final timings are set via notify_cpu_gpu_sync_timings(). + static uint64_t draw_gpu_time; + static uint64_t draw_cpu_time; + + CPUGPUSyncMode get_actual_cpu_gpu_sync_mode() const { return actual_cpu_gpu_sync_mode; } + void notify_cpu_gpu_sync_timings(uint64_t cpu_time, uint64_t gpu_time); + uint32_t get_missed_hard_target() const { return missed_hard_target; } + uint64_t get_last_cpu_time() const { return last_cpu_time; } + uint64_t get_last_gpu_time() const { return last_gpu_time; } + CPUGPUSyncMode get_evaluated_cpu_gpu_sync_mode() const { return last_frame_cpu_gpu_sync_mode; } + void update_cached_refresh_rate(); + /* EVENT QUEUING */ virtual void request_frame_drawn_callback(const Callable &p_callable) = 0; @@ -1987,6 +2024,7 @@ VARIANT_ENUM_CAST(RenderingServer::GlobalShaderParameterType); VARIANT_ENUM_CAST(RenderingServer::RenderingInfo); VARIANT_ENUM_CAST(RenderingServer::CanvasTextureChannel); VARIANT_ENUM_CAST(RenderingServer::BakeChannels); +VARIANT_ENUM_CAST(RenderingServer::CPUGPUSyncMode); #ifndef DISABLE_DEPRECATED VARIANT_ENUM_CAST(RenderingServer::Features);