pygfx · Korijn · Sep 18, 2024 · Sep 13, 2024 · Sep 14, 2024 · Sep 14, 2024
@@ -59,6 +59,32 @@ The wgpu_native backend provides a few extra functionalities:
     :return: Device
     :rtype: wgpu.GPUDevice
 
+There are two functions that allow you to perform multiple draw calls at once.
+Both require that you enable the feature "multi-draw-indirect".
+
+Typically, these calls do not reduce work or increase parallelism on the GPU. Rather,
+they reduce driver overhead on the CPU.
+
+.. py:function:: wgpu.backends.wgpu_native.multi_draw_indirect(render_pass_encoder, buffer, *, offset=0, count)
+
+    This is equivalent to::
+
+        for i in range(count):
+            render_pass_encoder.draw_indirect(buffer, offset + i * 16)
+
+    :param render_pass_encoder: The current render pass encoder.
+    :param buffer: The indirect buffer containing the arguments.
+    :param offset: The byte offset in the indirect buffer containing the first argument.
+    :param count: The number of draw operations to perform.
+
+.. py:function:: wgpu.backends.wgpu_native.multi_draw_indexed_indirect(render_pass_encoder, buffer, *, offset=0, count)
+
+    This is equivalent to::
+
+        for i in range(count):
+            render_pass_encoder.draw_indexed_indirect(buffer, offset + i * 20)
+
+    :param render_pass_encoder: The current render pass encoder.
+    :param buffer: The indirect buffer containing the arguments.
+    :param offset: The byte offset in the indirect buffer containing the first argument.
+    :param count: The number of draw operations to perform.
+
 
 The js_webgpu backend
 ---------------------

@@ -0,0 +1,254 @@
+import itertools
+
+import numpy as np
+import pytest
+import wgpu.utils
+from tests.testutils import can_use_wgpu_lib, run_tests
+from wgpu import TextureFormat
+from wgpu.backends.wgpu_native.extras import (
+    multi_draw_indexed_indirect,
+    multi_draw_indirect,
+)
+
+# Capacity (in entries) of the shader's `data` array. It is interpolated into the
+# WGSL source below and also used by Runner to size the output storage buffer.
+MAX_INFO = 100
+
+# Skip the entire module when the wgpu lib cannot be used.
+if not can_use_wgpu_lib:
+    pytest.skip("Skipping tests that need the wgpu lib", allow_module_level=True)
+
+
+# WGSL program shared by all tests in this module.
+# * The vertex stage forwards (vertex_index, instance_index) to the fragment stage.
+# * The fragment stage atomically increments `counter` and stores the forwarded
+#   pair in `data` at the previous counter value modulo MAX_INFO.
+# The source is built from two adjacent literals: an f-string that injects
+# MAX_INFO, followed by a plain string so the WGSL braces need no escaping.
+SHADER_SOURCE = (
+    f"""
+    const MAX_INFO: u32 = {MAX_INFO}u;
+    """
+    """
+    @group(0) @binding(0) var<storage, read_write> data: array<vec2u, MAX_INFO>;
+    @group(0) @binding(1) var<storage, read_write> counter: atomic<u32>;
+
+    struct VertexOutput {
+        @builtin(position) position: vec4f,
+        @location(0) info: vec2u
+    }
+
+    const POSITION: vec4f = vec4f(0, 0, 0, 1);
+
+    @vertex
+    fn vertexMain(
+        @builtin(vertex_index) vertexIndex: u32,
+        @builtin(instance_index) instanceIndex: u32
+    ) -> VertexOutput {
+        let info = vec2u(vertexIndex, instanceIndex);
+        return VertexOutput(POSITION, info);
+    }
+
+    @fragment
+    fn fragmentMain(@location(0) info: vec2u) -> @location(0) vec4f {
+        let index = atomicAdd(&counter, 1u);
+        data[index % MAX_INFO] = info;
+        return vec4f();
+    }
+"""
+)
+
+# Layout entries for the two storage buffers declared in the shader
+# (binding 0: `data`, binding 1: `counter`), both visible to the fragment stage.
+BIND_GROUP_ENTRIES = [
+    {"binding": 0, "visibility": "FRAGMENT", "buffer": {"type": "storage"}},
+    {"binding": 1, "visibility": "FRAGMENT", "buffer": {"type": "storage"}},
+]
+
+
+class Runner:
+    """GPU fixture that records which (vertex_index, instance_index) pairs get drawn.
+
+    The constructor builds a device, a point-list render pipeline using
+    SHADER_SOURCE, the two storage buffers the shader writes to, and a render
+    pass descriptor targeting an off-screen texture.  ``run_function`` then
+    executes an arbitrary sequence of draw calls and compares the recorded
+    pairs against an expected set.
+    """
+
+    def __init__(self, use_multidraw_if_available: bool = True):
+        """Create the device and all GPU objects shared by the tests.
+
+        :param use_multidraw_if_available: When true, request the
+            "multi-draw-indirect" feature if the adapter advertises it.
+        """
+        adapter = wgpu.gpu.request_adapter(power_preference="high-performance")
+        features = []
+        if use_multidraw_if_available and "multi-draw-indirect" in adapter.features:
+            features.append("multi-draw-indirect")
+        self.device = adapter.request_device(required_features=features)
+        self.output_texture = self.device.create_texture(
+            # Actual size is immaterial.  Could just be 1x1
+            size=[128, 128],
+            format=TextureFormat.rgba8unorm,
+            usage="RENDER_ATTACHMENT|COPY_SRC",
+        )
+        shader = self.device.create_shader_module(code=SHADER_SOURCE)
+        bind_group_layout = self.device.create_bind_group_layout(
+            entries=BIND_GROUP_ENTRIES
+        )
+        render_pipeline_layout = self.device.create_pipeline_layout(
+            bind_group_layouts=[bind_group_layout]
+        )
+        self.pipeline = self.device.create_render_pipeline(
+            layout=render_pipeline_layout,
+            vertex={
+                "module": shader,
+                "entry_point": "vertexMain",
+            },
+            fragment={
+                "module": shader,
+                "entry_point": "fragmentMain",
+                "targets": [{"format": self.output_texture.format}],
+            },
+            primitive={
+                "topology": "point-list",
+            },
+        )
+
+        # Backing store for the shader's `data` array: MAX_INFO entries of
+        # two uint32 values (4 bytes each).
+        self.vertex_call_buffer = self.device.create_buffer(
+            size=MAX_INFO * 2 * 4, usage="STORAGE|COPY_SRC"
+        )
+        # Backing store for the shader's single atomic<u32> `counter`.
+        self.counter_buffer = self.device.create_buffer(
+            size=4, usage="STORAGE|COPY_SRC|COPY_DST"
+        )
+        self.bind_group = self.device.create_bind_group(
+            layout=self.pipeline.get_bind_group_layout(0),
+            entries=[
+                {"binding": 0, "resource": {"buffer": self.vertex_call_buffer}},
+                {"binding": 1, "resource": {"buffer": self.counter_buffer}},
+            ],
+        )
+        self.render_pass_descriptor = {
+            "color_attachments": [
+                {
+                    "clear_value": (0, 0, 0, 0),  # only first value matters
+                    "load_op": "clear",
+                    "store_op": "store",
+                    "view": self.output_texture.create_view(),
+                }
+            ],
+        }
+
+    def create_render_bundle_encoder(self, draw_function):
+        """Record ``draw_function`` into a render bundle and return the bundle.
+
+        Despite the name, the return value is the result of ``finish()`` on the
+        render bundle encoder, i.e. what ``execute_bundles`` is later given.
+
+        :param draw_function: Callable taking the encoder and issuing draws on it.
+        """
+        render_bundle_encoder = self.device.create_render_bundle_encoder(
+            color_formats=[self.output_texture.format]
+        )
+        render_bundle_encoder.set_pipeline(self.pipeline)
+        render_bundle_encoder.set_bind_group(0, self.bind_group)
+        draw_function(render_bundle_encoder)
+        return render_bundle_encoder.finish()
+
+    def run_function(self, expected_result, draw_function):
+        """Run ``draw_function`` in a fresh render pass and verify what was drawn.
+
+        :param expected_result: Set of (vertex_index, instance_index) tuples that
+            must be recorded, each exactly once.
+        :param draw_function: Callable taking the render pass encoder and
+            issuing draws on it.
+        """
+        encoder = self.device.create_command_encoder()
+        # Reset the counter so each run starts writing at slot 0.
+        encoder.clear_buffer(self.counter_buffer)
+        this_pass = encoder.begin_render_pass(**self.render_pass_descriptor)
+        this_pass.set_pipeline(self.pipeline)
+        this_pass.set_bind_group(0, self.bind_group)
+        draw_function(this_pass)
+        this_pass.end()
+        self.device.queue.submit([encoder.finish()])
+        counter_buffer_view = self.device.queue.read_buffer(self.counter_buffer)
+        # Convert to a plain int so the byte-size arithmetic below does not
+        # carry a NumPy scalar into the wgpu API.
+        count = int(np.frombuffer(counter_buffer_view, dtype=np.uint32)[0])
+        if count > MAX_INFO:
+            # The shader wraps its index modulo MAX_INFO, so earlier entries
+            # would have been overwritten and the data cannot be trusted.
+            pytest.fail("Too many data points written to output buffer")
+        info_view = self.device.queue.read_buffer(
+            self.vertex_call_buffer, size=count * 2 * 4
+        )
+        info = np.frombuffer(info_view, dtype=np.uint32).reshape(-1, 2)
+        # tolist() yields plain ints, which keeps assertion diffs readable.
+        info = [tuple(row) for row in info.tolist()]
+        info_set = set(info)
+        # No pair may be recorded twice ...
+        assert len(info) == len(info_set)
+        # ... and the recorded pairs must match the expectation exactly.
+        assert info_set == expected_result
+
+    def run_functions(self, expected_result, functions):
+        """Call ``run_function`` for each draw function, with the same expectation."""
+        for function in functions:
+            self.run_function(expected_result, function)
+
+
+def test_draw_no_index():
+    # Checks that direct, indirect, mixed, multi-indirect (when the feature is
+    # available) and render-bundle draws all record the same set of
+    # (vertex_index, instance_index) pairs.
+    runner = Runner()
+
+    # vertex_count, instance_count, first_vertex, first_instance
+    draw_args1 = [2, 3, 100, 10]
+    draw_args2 = [1, 1, 30, 50]
+    expected_result = set(itertools.product((100, 101), (10, 11, 12))) | {(30, 50)}
+
+    # Two leading uint32 zeros (8 bytes) precede the argument records, so the
+    # first record starts at byte offset 8; each record is 4 uint32 = 16 bytes.
+    draw_data_info = np.uint32([0, 0] + draw_args1 + draw_args2)
+    draw_data_buffer = runner.device.create_buffer_with_data(
+        data=draw_data_info, usage="INDIRECT"
+    )
+
+    def draw_direct(encoder):
+        encoder.draw(*draw_args1)
+        encoder.draw(*draw_args2)
+
+    def draw_indirect(encoder):
+        encoder.draw_indirect(draw_data_buffer, 8)
+        encoder.draw_indirect(draw_data_buffer, 8 + 16)
+
+    def draw_mixed(encoder):
+        encoder.draw(*draw_args1)
+        encoder.draw_indirect(draw_data_buffer, 8 + 16)
+
+    def draw_indirect_multi(encoder):
+        multi_draw_indirect(encoder, draw_data_buffer, offset=8, count=2)
+
+    # Despite the name, this holds the finished render bundle.
+    render_bundle_encoder = runner.create_render_bundle_encoder(draw_mixed)
+
+    has_multi_draw_indirect = "multi-draw-indirect" in runner.device.features
+    runner.run_functions(
+        expected_result,
+        [
+            draw_direct,
+            draw_indirect,
+            draw_mixed,
+            *([draw_indirect_multi] if has_multi_draw_indirect else []),
+            # NOTE(review): the bundle is executed in two separate runs,
+            # presumably to check that a bundle can be reused - confirm intent.
+            lambda encoder: encoder.execute_bundles([render_bundle_encoder]),
+            lambda encoder: encoder.execute_bundles([render_bundle_encoder]),
+        ],
+    )
+
+
+def test_draw_indexed():
+    # Checks that direct, indirect, mixed, multi-indirect (when the feature is
+    # available) and render-bundle indexed draws all record the same set of
+    # (vertex_index, instance_index) pairs.
+    runner = Runner()
+
+    # index_count, instance_count, first_index, base_vertex, first_instance
+    draw_args1 = (4, 2, 1, 100, 1000)
+    draw_args2 = (1, 1, 7, 200, 2000)
+    index_buffer_data = (2, 3, 5, 7, 11, 13, 17, 19)
+    # First draw: indices [1:5] = 3, 5, 7, 11, each offset by base_vertex 100.
+    expected_result = set(itertools.product((103, 105, 107, 111), (1000, 1001)))
+    # Second draw: index [7] = 19, offset by base_vertex 200.
+    expected_result.add((219, 2000))
+
+    index_buffer_data = np.uint32(index_buffer_data)
+    index_buffer = runner.device.create_buffer_with_data(
+        data=index_buffer_data, usage="INDEX"
+    )
+
+    # Two leading uint32 zeros (8 bytes) precede the argument records, so the
+    # first record starts at byte offset 8; each record is 5 uint32 = 20 bytes.
+    draw_data = np.uint32([0, 0] + list(draw_args1) + list(draw_args2))
+    draw_data_buffer = runner.device.create_buffer_with_data(
+        data=draw_data, usage="INDIRECT"
+    )
+
+    def draw_direct(encoder):
+        encoder.set_index_buffer(index_buffer, "uint32")
+        encoder.draw_indexed(*draw_args1)
+        encoder.draw_indexed(*draw_args2)
+
+    def draw_indirect(encoder):
+        encoder.set_index_buffer(index_buffer, "uint32")
+        encoder.draw_indexed_indirect(draw_data_buffer, 8)
+        encoder.draw_indexed_indirect(draw_data_buffer, 8 + 20)
+
+    def draw_mixed(encoder):
+        encoder.set_index_buffer(index_buffer, "uint32")
+        encoder.draw_indexed(*draw_args1)
+        encoder.draw_indexed_indirect(draw_data_buffer, 8 + 20)
+
+    def draw_indirect_multi(encoder):
+        encoder.set_index_buffer(index_buffer, "uint32")
+        multi_draw_indexed_indirect(encoder, draw_data_buffer, offset=8, count=2)
+
+    # Runner has no create_command_encoder(); the bundle must be built with
+    # create_render_bundle_encoder(), which returns the finished render bundle.
+    render_bundle_encoder = runner.create_render_bundle_encoder(draw_mixed)
+
+    has_multi_draw_indirect = "multi-draw-indirect" in runner.device.features
+    runner.run_functions(
+        expected_result,
+        [
+            draw_direct,
+            draw_indirect,
+            draw_mixed,
+            *([draw_indirect_multi] if has_multi_draw_indirect else []),
+            lambda encoder: encoder.execute_bundles([render_bundle_encoder]),
+            lambda encoder: encoder.execute_bundles([render_bundle_encoder]),
+        ],
+    )
+
+
+# Allow running this module directly as a script, outside of pytest.
+if __name__ == "__main__":
+    run_tests(globals())
@@ -2912,6 +2912,18 @@ def end_occlusion_query(self):
         # H: void f(WGPURenderPassEncoder renderPassEncoder)
         libf.wgpuRenderPassEncoderEndOcclusionQuery(self._internal)
 
+    def _multi_draw_indirect(self, buffer, offset, count):
+        # Private entry point used by wgpu_native.extras.multi_draw_indirect.
+        # Forwards this encoder's handle and the buffer's handle to the native
+        # call, with offset and count coerced to int.
+        # H: void f(WGPURenderPassEncoder encoder, WGPUBuffer buffer, uint64_t offset, uint32_t count)
+        libf.wgpuRenderPassEncoderMultiDrawIndirect(
+            self._internal, buffer._internal, int(offset), int(count)
+        )
+
+    def _multi_draw_indexed_indirect(self, buffer, offset, count):
+        # Private entry point used by wgpu_native.extras.multi_draw_indexed_indirect.
+        # Forwards this encoder's handle and the buffer's handle to the native
+        # call, with offset and count coerced to int.
+        # H: void f(WGPURenderPassEncoder encoder, WGPUBuffer buffer, uint64_t offset, uint32_t count)
+        libf.wgpuRenderPassEncoderMultiDrawIndexedIndirect(
+            self._internal, buffer._internal, int(offset), int(count)
+        )
+
     def _release(self):
         if self._internal is not None and libf is not None:
             self._internal, internal = None, self._internal

@@ -33,3 +33,26 @@ def request_device_tracing(
     return adapter._request_device(
         label, required_features, required_limits, default_queue, trace_path
     )
+
+
+def multi_draw_indirect(render_pass_encoder, buffer, *, offset=0, count):
+    """
+    This is equivalent to
+
+    for i in range(count):
+        render_pass_encoder.draw_indirect(buffer, offset + i * 16)
+
+    You must enable the feature "multi-draw-indirect" to use this function.
+
+    :param render_pass_encoder: The current render pass encoder.
+    :param buffer: The indirect buffer containing the arguments.
+    :param offset: The byte offset in the indirect buffer containing the first argument.
+    :param count: The number of draw operations to perform.
+    """
+    render_pass_encoder._multi_draw_indirect(buffer, offset, count)
+
+
+def multi_draw_indexed_indirect(render_pass_encoder, buffer, *, offset=0, count):
+    """
+    This is equivalent to
+
+    for i in range(count):
+        render_pass_encoder.draw_indexed_indirect(buffer, offset + i * 20)
+
+    You must enable the feature "multi-draw-indirect" to use this function.
+
+    :param render_pass_encoder: The current render pass encoder.
+    :param buffer: The indirect buffer containing the arguments.
+    :param offset: The byte offset in the indirect buffer containing the first argument.
+    :param count: The number of draw operations to perform.
+    """
+    render_pass_encoder._multi_draw_indexed_indirect(buffer, offset, count)
@@ -20,7 +20,7 @@
 * Diffs for GPUQueue: add read_buffer, add read_texture, hide copy_external_image_to_texture
 * Validated 37 classes, 112 methods, 45 properties
 ### Patching API for backends/wgpu_native/_api.py
-* Validated 37 classes, 112 methods, 0 properties
+* Validated 37 classes, 114 methods, 0 properties
 ## Validating backends/wgpu_native/_api.py
 * Enum field FeatureName.texture-compression-bc-sliced-3d missing in wgpu.h
 * Enum field FeatureName.clip-distances missing in wgpu.h
@@ -35,6 +35,6 @@
 * Enum CanvasAlphaMode missing in wgpu.h
 * Enum CanvasToneMappingMode missing in wgpu.h
 * Wrote 236 enum mappings and 47 struct-field mappings to wgpu_native/_mappings.py
-* Validated 132 C function calls
-* Not using 73 C functions
+* Validated 134 C function calls
+* Not using 71 C functions
 * Validated 78 C structs