如何解决使用较大的工作组时,我的OpenGL计算着色器编译速度非常慢超过10分钟,我可以做些什么来加快速度吗?
因此,当我使用glGetShaderiv(m_shaderID,GL_COMPILE_STATUS,&status)
编译我的计算着色器时,我遇到了一个非常奇怪的现象(至少对我来说,它是一个计算着色器新手)。莫名其妙的是,当我增加工作组的大小时,我的计算着色器的编译时间会更长!当我有一个一维工作组时,它的编译时间不到一秒钟,但是当我将工作组的大小增加到4x1x6时,计算着色器的编译时间将超过10分钟!真奇怪。
对于背景,我正在尝试实现一种光聚类算法(本质上是这里显示的一种算法:http://www.aortiz.me/2018/12/21/CG.html#tiled-shading--forward),而我的计算着色器就是这个怪物:
// TODO: Figure out optimal tile size,currently using a 16x9x24 subdivision
#define FLT_MAX 3.402823466e+38
#define FLT_MIN 1.175494351e-38
#define DBL_MAX 1.7976931348623158e+308
#define DBL_MIN 2.2250738585072014e-308
layout(local_size_x = 4,local_size_y = 9,local_size_z = 4) in;
// TODO: Change to reflect my light structure
// struct PointLight{
// vec4 position;
// vec4 color;
// uint enabled;
// float intensity;
// float range;
// };
// TODO: Pack this more efficiently
struct Light {
vec4 position;
vec4 direction;
vec4 ambientColor;
vec4 diffuseColor;
vec4 specularColor;
vec4 attributes;
vec4 intensity;
ivec4 typeIndexAndFlags;
// uint flags;
};
// Array containing offset and number of lights in a cluster
struct LightGrid{
uint offset;
uint count;
};
struct VolumeTileAABB{
vec4 minPoint;
vec4 maxPoint;
};
layout(std430,binding = 0) readonly buffer LightBuffer {
Light data[];
} lightBuffer;
layout (std430,binding = 1) buffer clusterAABB{
VolumeTileAABB cluster[ ];
};
layout (std430,binding = 2) buffer screenToView{
mat4 inverseProjection;
uvec4 tileSizes;
uvec2 screenDimensions;
};
// layout (std430,binding = 3) buffer lightSSBO{
// PointLight pointLight[];
// };
// SSBO of active light indices
layout (std430,binding = 4) buffer lightIndexSSBO{
uint globalLightIndexList[];
};
layout (std430,binding = 5) buffer lightGridSSBO{
LightGrid lightGrid[];
};
layout (std430,binding = 6) buffer globalIndexCountSSBO{
uint globalIndexCount;
};
// Shared variables,shared between all invocations WITHIN A WORK GROUP
// TODO: See if I can use gl_WorkGroupSize for this,gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z
// A grouped-shared array which contains all the lights being evaluated
shared Light sharedLights[4*9*4]; // A grouped-shared array which contains all the lights being evaluated,size is thread-count
uniform mat4 viewMatrix;
bool testSphereAABB(uint light,uint tile);
float sqDistPointAABB(vec3 point,uint tile);
bool testConeAABB(uint light,uint tile);
float getLightRange(uint lightIndex);
bool isEnabled(uint lightIndex);
// Runs in batches of multiple Z slices at once
// In this implementation,6 batches,since each thread group contains four z slices (24/4=6)
// We begin by each thread representing a cluster
// Then in the light traversal loop they change to representing lights
// Then change again near the end to represent clusters
// NOTE: Tiles actually mean clusters,it's just a legacy name from tiled shading
void main(){
// Reset every frame
globalIndexCount = 0; // How many lights are active in t his scene
uint threadCount = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; // Number of threads in a group,same as local_size_x,local_size_y,local_size_z
uint lightCount = lightBuffer.data.length(); // Number of total lights in the scene
uint numBatches = uint((lightCount + threadCount -1) / threadCount); // Number of groups of lights that will be completed,i.e.,number of passes
uint tileIndex = gl_LocalInvocationIndex + gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * gl_WorkGroupID.z;
// uint tileIndex = gl_GlobalInvocationID; // doesn't wortk,is uvec3
// Local thread variables
uint visibleLightCount = 0;
uint visibleLightIndices[100]; // local light index list,to be transferred to global list
// Every light is being checked against every cluster in the view frustum
// TODO: Perform active cluster determination
// Each individual thread will be responsible for loading a light and writing it to shared memory so other threads can read it
for( uint batch = 0; batch < numBatches; ++batch){
uint lightIndex = batch * threadCount + gl_LocalInvocationIndex;
//Prevent overflow by clamping to last light which is always null
lightIndex = min(lightIndex,lightCount);
//Populating shared light array
// NOTE: It is VERY important that lightBuffer.data not be referenced after this point,// since that is not thread-safe
sharedLights[gl_LocalInvocationIndex] = lightBuffer.data[lightIndex];
barrier(); // Synchronize read/writes between invocations within a work group
//Iterating within the current batch of lights
for( uint light = 0; light < threadCount; ++light){
if( isEnabled(light)){
uint lightType = uint(sharedLights[light].typeIndexAndFlags[0]);
if(lightType == 0){
// Point light
if( testSphereAABB(light,tileIndex) ){
visibleLightIndices[visibleLightCount] = batch * threadCount + light;
visibleLightCount += 1;
}
}
else if(lightType == 1){
// Directional light
visibleLightIndices[visibleLightCount] = batch * threadCount + light;
visibleLightCount += 1;
}
else if(lightType == 2){
// Spot light
if( testConeAABB(light,tileIndex) ){
visibleLightIndices[visibleLightCount] = batch * threadCount + light;
visibleLightCount += 1;
}
}
}
}
}
// We want all thread groups to have completed the light tests before continuing
barrier();
// Back to every thread representing a cluster
// Adding the light indices to the cluster light index list
uint offset = atomicAdd(globalIndexCount,visibleLightCount);
for(uint i = 0; i < visibleLightCount; ++i){
globalLightIndexList[offset + i] = visibleLightIndices[i];
}
// Updating the light grid for each cluster
lightGrid[tileIndex].offset = offset;
lightGrid[tileIndex].count = visibleLightCount;
}
// Return whether or not the specified light intersects with the specified tile (cluster)
bool testSphereAABB(uint light,uint tile){
float radius = getLightRange(light);
vec3 center = vec3(viewMatrix * sharedLights[light].position);
float squaredDistance = sqDistPointAABB(center,tile);
return squaredDistance <= (radius * radius);
}
// TODO: Different test for spot-lights
// Has been done by using several AABBs for spot-light cone,this could be a good approach,or even just use one to start.
bool testConeAABB(uint light,uint tile){
// Light light = lightBuffer.data[lightIndex];
// float innerAngleCos = light.attributes[0];
// float outerAngleCos = light.attributes[1];
// float innerAngle = acos(innerAngleCos);
// float outerAngle = acos(outerAngleCos);
// FIXME: Actually do something clever here
return true;
}
// Get range of light given the specified light index
float getLightRange(uint lightIndex){
int lightType = sharedLights[lightIndex].typeIndexAndFlags[0];
float range;
if(lightType == 0){
// Point light
float brightness = 0.01; // cutoff for end of range
float c = sharedLights[lightIndex].attributes.x;
float lin = sharedLights[lightIndex].attributes.y;
float quad = sharedLights[lightIndex].attributes.z;
range = (-lin + sqrt(lin*lin - 4.0 * c * quad + (4.0/brightness)* quad)) / (2.0 * quad);
}
else if(lightType == 1){
// Directional light
range = FLT_MAX;
}
else{
// Spot light
range = FLT_MAX;
}
return range;
}
// Whether the light at the specified index is enabled
bool isEnabled(uint lightIndex){
uint flags = sharedLights[lightIndex].typeIndexAndFlags[2];
return (flags | 1) != 0;
}
// Get squared distance from a point to the AABB of the specified tile (cluster)
float sqDistPointAABB(vec3 point,uint tile){
float sqDist = 0.0;
VolumeTileAABB currentCell = cluster[tile];
cluster[tile].maxPoint[3] = tile;
for(int i = 0; i < 3; ++i){
float v = point[i];
if(v < currentCell.minPoint[i]){
sqDist += (currentCell.minPoint[i] - v) * (currentCell.minPoint[i] - v);
}
if(v > currentCell.maxPoint[i]){
sqDist += (v - currentCell.maxPoint[i]) * (v - currentCell.maxPoint[i]);
}
}
return sqDist;
}
编辑:哎呀,丢掉了它的底部!
我不明白的是,为什么更改工作组的大小会完全影响编译时间?如果我的工作组大小太小而无法使计算着色器有效运行,那么这将使算法失去意义,所以我希望有一些我想念的东西。
最后一点,我想避免使用glGetProgramBinary
作为解决方案。不仅因为它只是绕过了问题而不是没有解决问题,而且还因为预编译着色器无法很好地与引擎的当前架构配合使用。
解决方法
因此,我认为这一定是编译器中的错误,因为我已将sqDistPointAABB
函数中的循环替换为:
vec3 minPoint = currentCell.minPoint.xyz;
vec3 maxPoint = currentCell.maxPoint.xyz;
vec3 t1 = vec3(lessThan(point,minPoint));
vec3 t2 = vec3(greaterThan(point,maxPoint));
vec3 sqDist = t1 * (minPoint - point) * (minPoint - point) + t2 * (maxPoint - point) * (maxPoint - point);
return sqDist.x + sqDist.y + sqDist.z;
它现在可以在不到一秒钟的时间内完成编译!太奇怪了
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。