appendices/VK_HUAWEI_cluster_culling_shader.adoc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275

// Copyright (c) 2020-2023 Huawei Technologies Co. Ltd.
//
// SPDX-License-Identifier: CC-BY-4.0

include::{generated}/meta/{refprefix}VK_HUAWEI_cluster_culling_shader.adoc[]

=== Other Extension Metadata

*Last Modified Date*::
    2022-11-17
*Interactions and External Dependencies*::
  - This extension requires
    {spirv}/HUAWEI/SPV_HUAWEI_cluster_culling_shader.html[`SPV_HUAWEI_cluster_culling_shader`].
  - This extension provides API support for
    {GLSLregistry}/huawei/GLSL_HUAWEI_cluster_culling_shader.txt[`GL_HUAWEI_cluster_culling_shader`].
*Contributors*::
  - Yuchang Wang, Huawei
  - Juntao Li, Huawei
  - Pan Gao, Huawei
  - Jie Cao, Huawei
  - Yunjin Zhang, Huawei
  - Shujie Zhou, Huawei
  - Chaojun Wang, Huawei
  - Jiajun Hu, Huawei
  - Cong Zhang, Huawei

=== Description

Cluster Culling Shaders (CCS) are similar to the existing compute shaders.
Their main purpose is to provide an execution environment in order to
perform coarse-level geometry culling and LOD selection more efficiently on
the GPU.

The traditional 2-pass GPU culling solution using a compute shader sometimes
needs a pipeline barrier between compute and graphics pipeline to optimize
performance.
An additional compaction process may also be required.
This extension addresses these shortcomings, allowing compute shaders to
directly emit visible clusters to the following graphics pipeline.

A set of new built-in output variables are used to express a visible
cluster.
In addition, a new built-in function is used to emit these variables from
CCS to the IA stage.
The IA stage can use these variables to fetch the vertices of a visible
cluster and drive vertex shaders to shade these vertices.

Note that CCS do not work with geometry or tessellation shaders, but both IA
and vertex shaders are preserved.
Vertex shaders are still used for vertex position shading, instead of
directly outputting transformed vertices from the compute shader.
This makes CCS more suitable for mobile GPUs.

include::{generated}/interfaces/VK_HUAWEI_cluster_culling_shader.adoc[]

=== New Built-In Variables

  * <<interfaces-builtin-variables-indexcounthuawei,IndexCountHUAWEI>>
  * <<interfaces-builtin-variables-vertexcounthuawei,VertexCountHUAWEI>>
  * <<interfaces-builtin-variables-instancecounthuawei,InstanceCountHUAWEI>>
  * <<interfaces-builtin-variables-firstindexhuawei,FirstIndexHUAWEI>>
  * <<interfaces-builtin-variables-firstvertexhuawei,FirstVertexHUAWEI>>
  * <<interfaces-builtin-variables-vertexoffsethuawei,VertexOffsetHUAWEI>>
  * <<interfaces-builtin-variables-firstinstancehuawei,FirstInstanceHUAWEI>>
  * <<interfaces-builtin-variables-clusteridhuawei,ClusterIDHUAWEI>>

=== New SPIR-V Capability

  * <<spirvenv-capabilities-table-ClusterCullingShadingHUAWEI,
    code:ClusterCullingShadingHUAWEI>>

=== Sample Code

Example of cluster culling in a GLSL shader

[source,c]
----
#extension GL_HUAWEI_cluster_culling_shader: enable

// Workgroup sizing: a workgroup is one warp wide.
#define GPU_WARP_SIZE                   32
#define GPU_GROUP_SIZE                  GPU_WARP_SIZE

// Clusters handled per invocation and, derived from it, per workgroup.
#define GPU_CLUSTER_PER_INVOCATION      1
#define GPU_CLUSTER_PER_WORKGROUP       (GPU_GROUP_SIZE * GPU_CLUSTER_PER_INVOCATION)

// Number of threads per workgroup
// - 1D only
// - warpsize = 32
layout(local_size_x=GPU_GROUP_SIZE, local_size_y=1, local_size_z=1) in;


// Binding indices used by the storage buffer declarations in this shader.
#define GPU_CLUSTER_DESCRIPTOR_BINDING      0
#define GPU_DRAW_BUFFER_BINDING             1
#define GPU_INSTANCE_DESCRIPTOR_BINDING     2

// pi / 2 at single precision; upper clamp for the cone angle in
// isFrontFaceVisible().
const float pi_half = 1.5707963;
// Index into instance_descriptors[] of the instance that owns the cluster
// handled by this invocation. Written in main(), read by the culling helpers.
uint instance_id;

// Bounding sphere; used both per cluster (ClusterDescriptor) and per
// instance (InstanceDescriptor) as input to isSphereOutsideFrustum().
struct BoundingSphere
{
  vec3 center;
  float radius;
};

// Cone attached to a cluster; main() passes both fields to
// isFrontFaceVisible() for the facing test.
struct BoundingCone
{
  vec3 normal;  // compared against the view direction with dot()
  float angle;  // clamped to pi_half and passed to sin(), i.e. radians
};

// Per-cluster culling data, one element of cluster_descriptors[].
struct ClusterDescriptor
{
  BoundingSphere sphere;  // tested against the frustum planes
  BoundingCone cone;      // used for the facing test
  uint instance_idx;      // index into instance_descriptors[]
};

// Per-instance data embedded in InstanceDescriptor. This shader reads only
// frustum_planes and view_origin.
struct InstanceData
{
  mat4 mvp_matrix;                      // mvp matrix.
  vec4 frustum_planes[6];               // six frustum planes
  mat4 model_matrix_transpose_inverse;  // inverse transpose of model matrix.
  vec3 view_origin;                     // view origin
};

// Per-instance record, one element of instance_descriptors[].
// begin, end, cluster_count and debug are not read by this sample shader.
struct InstanceDescriptor
{
  uint begin;
  uint end;
  uint cluster_count;
  uint debug;
  BoundingSphere sphere;       // instance-level bounds, culled first in main()
  InstanceData instance_data;  // frustum planes and view origin for culling
};

// Per-cluster indexed draw parameters, one element of draw_commands[].
// main() copies each field to the matching gl_PerClusterHUAWEI output.
struct DrawElementsCommand{
  uint indexcount;
  uint instanceCount;
  uint firstIndex;
  int  vertexoffset;
  uint firstInstance;
  uint cluster_id;
};

// indexed mode
// Built-in per-cluster output block. main() fills these members for a
// visible cluster before calling dispatchClusterHUAWEI().
out gl_PerClusterHUAWEI{
  uint gl_IndexCountHUAWEI;
  uint gl_InstanceCountHUAWEI;
  uint gl_FirstIndexHUAWEI;
  int  gl_VertexOffsetHUAWEI;
  uint gl_FirstInstanceHUAWEI;
  uint gl_ClusterIDHUAWEI;
};


// Read-only storage buffer of cluster descriptors, indexed in main() by
// gl_GlobalInvocationID.x.
layout(binding = GPU_CLUSTER_DESCRIPTOR_BINDING, std430) readonly buffer cluster_descriptor_ssbo
{
        ClusterDescriptor cluster_descriptors[];
};


// Storage buffer of per-cluster draw parameters, indexed in main() by the
// same cluster id as cluster_descriptors[]. Only read by this shader.
layout(binding = GPU_DRAW_BUFFER_BINDING, std430) buffer draw_indirect_ssbo
{
        DrawElementsCommand draw_commands[];
};

// Storage buffer of instance descriptors, indexed by the global instance_id
// (taken from ClusterDescriptor.instance_idx). Only read by this shader.
layout(binding = GPU_INSTANCE_DESCRIPTOR_BINDING, std430) buffer instance_descriptor_ssbo
{
        InstanceDescriptor instance_descriptors[];
};


// Cone-based facing test for a cluster.
// Returns true when dot(cone_normal, d) is below sin(min(cone_angle, pi_half)),
// where d is the normalized direction from the current instance's view origin
// to sphere_center. sphere_radius is accepted but not used.
bool isFrontFaceVisible( vec3 sphere_center, float sphere_radius, vec3 cone_normal, float cone_angle )
{
  vec3 eye = instance_descriptors[instance_id].instance_data.view_origin;
  vec3 eye_to_center = normalize(sphere_center - eye);

  // The cone angle is clamped to a quarter turn before taking its sine.
  float clamped_angle = min(cone_angle, pi_half);
  float threshold = sin(clamped_angle);

  float facing = dot(cone_normal, eye_to_center);
  return facing < threshold;
}

// Frustum test for a sphere against the six planes of the current instance
// (selected by the global instance_id).
// Returns true as soon as any plane satisfies
//   dot(plane.xyz, sphere_center) + plane.w < sphere_radius
// and false when no plane does.
bool isSphereOutsideFrustum( vec3 sphere_center, float sphere_radius )
{
  for (int plane_idx = 0; plane_idx < 6; ++plane_idx)
  {
      vec4 plane = instance_descriptors[instance_id].instance_data.frustum_planes[plane_idx];
      float plane_value = dot(plane.xyz, sphere_center) + plane.w;

      if (plane_value < sphere_radius)
      {
          return true;
      }
  }
  return false;
}


// Entry point: each invocation handles the cluster selected by
// gl_GlobalInvocationID.x. The owning instance is frustum-culled first; if it
// survives, the cluster itself is frustum- and cone-culled, and a visible
// cluster is emitted as a draw to the following graphics pipeline.
void main()
{
    // NOTE(review): cluster_id is not checked against the length of
    // cluster_descriptors[] - confirm the dispatch size matches the
    // cluster count.
    uint cluster_id = gl_GlobalInvocationID.x;
    ClusterDescriptor desc = cluster_descriptors[cluster_id];

    // get instance description
    // instance_id is a global that the culling helpers read.
    instance_id = desc.instance_idx;
    InstanceDescriptor inst_desc = instance_descriptors[instance_id];

    //instance based culling
    bool instance_render = !isSphereOutsideFrustum(inst_desc.sphere.center, inst_desc.sphere.radius);

    if( instance_render)
    {
        // cluster based culling
        bool render = (!isSphereOutsideFrustum(desc.sphere.center, desc.sphere.radius) &&
                       isFrontFaceVisible(desc.sphere.center, desc.sphere.radius,
                                          desc.cone.normal, desc.cone.angle));

        if (render)
        {
            // this cluster passed coarse-level culling, update built-in output variable.
            // in case of indexed mode:
            gl_IndexCountHUAWEI     = draw_commands[cluster_id].indexcount;
            gl_InstanceCountHUAWEI  = draw_commands[cluster_id].instanceCount;
            gl_FirstIndexHUAWEI     = draw_commands[cluster_id].firstIndex;
            gl_VertexOffsetHUAWEI   = draw_commands[cluster_id].vertexoffset;
            gl_FirstInstanceHUAWEI  = draw_commands[cluster_id].firstInstance;
            gl_ClusterIDHUAWEI      = draw_commands[cluster_id].cluster_id;

            // emit built-in output variables as a drawing command to subsequent
            // rendering pipeline.
            dispatchClusterHUAWEI();
        }
    }
}
----

Example of graphics pipeline creation with cluster culling shader

[source,c++]
----
// create a cluster culling shader stage info structure.
VkPipelineShaderStageCreateInfo ccsStageInfo{};
ccsStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
ccsStageInfo.stage = VK_SHADER_STAGE_CLUSTER_CULLING_BIT_HUAWEI;
ccsStageInfo.module = clustercullingshaderModule;
ccsStageInfo.pName =  "main";

// pipeline shader stage creation
// the cluster culling stage is listed together with the vertex and fragment stages.
VkPipelineShaderStageCreateInfo shaderStages[] = { ccsStageInfo, vertexShaderStageInfo, fragmentShaderStageInfo };

// create graphics pipeline
VkGraphicsPipelineCreateInfo pipelineInfo{};
pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO;
// stageCount must match the number of entries in shaderStages.
pipelineInfo.stageCount = 3;
pipelineInfo.pStages = shaderStages;
pipelineInfo.pVertexInputState = &vertexInputInfo;
// ...
VkPipeline graphicsPipeline;
vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &graphicsPipeline);
----


Example of launching the execution of cluster culling shader

[source,c]
----
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphicsPipeline);
vkCmdDrawClusterHUAWEI(commandBuffer, groupCountX, 1, 1);
vkCmdEndRenderPass(commandBuffer);
----

=== Version History

  * Revision 1, 2022-11-18 (YuChang Wang)
  ** Internal revisions
  * Revision 2, 2023-04-02 (Jon Leech)
  ** Grammar edits.