使用较大的工作组时，我的OpenGL计算着色器编译速度非常慢超过10分钟，我可以做些什么来加快速度吗？

如何解决使用较大的工作组时，我的OpenGL计算着色器编译速度非常慢超过10分钟，我可以做些什么来加快速度吗？

因此，当我使用glGetShaderiv(m_shaderID,GL_COMPILE_STATUS,&status)编译我的计算着色器时，我遇到了一个非常奇怪的现象（至少对我来说，它是一个计算着色器新手）。莫名其妙的是，当我增加工作组的大小时，我的计算着色器的编译时间会更长！当我有一个一维工作组时，它的编译时间不到一秒钟，但是当我将工作组的大小增加到4x1x6时，计算着色器的编译时间将超过10分钟！真奇怪。

对于背景，我正在尝试实现一种光聚类算法（本质上是这里显示的一种算法：http://www.aortiz.me/2018/12/21/CG.html#tiled-shading--forward），而我的计算着色器就是这个怪物：

// TODO: Figure out optimal tile size,currently using a 16x9x24 subdivision

#define FLT_MAX 3.402823466e+38
#define FLT_MIN 1.175494351e-38
#define DBL_MAX 1.7976931348623158e+308
#define DBL_MIN 2.2250738585072014e-308

layout(local_size_x = 4,local_size_y = 9,local_size_z = 4) in;

// TODO: Change to reflect my light structure
// struct PointLight{
    // vec4 position;
    // vec4 color;
    // uint enabled;
    // float intensity;
    // float range;
// };

// TODO: Pack this more efficiently
struct Light {
    vec4 position;
    vec4 direction;
    vec4 ambientColor;
    vec4 diffuseColor;
    vec4 specularColor;
    vec4 attributes;
    vec4 intensity;
    ivec4 typeIndexAndFlags;
    // uint flags;
};

// Array containing offset and number of lights in a cluster
struct LightGrid{
    uint offset;
    uint count;
};

struct VolumeTileAABB{
    vec4 minPoint;
    vec4 maxPoint;
};

layout(std430,binding = 0) readonly buffer LightBuffer {
    Light data[];
} lightBuffer;


layout (std430,binding = 1) buffer clusterAABB{
    VolumeTileAABB cluster[ ];
};

layout (std430,binding = 2) buffer screenToView{
    mat4 inverseProjection;
    uvec4 tileSizes;
    uvec2 screenDimensions;
};

// layout (std430,binding = 3) buffer lightSSBO{
    // PointLight pointLight[];
// };

// SSBO of active light indices
layout (std430,binding = 4) buffer lightIndexSSBO{
    uint globalLightIndexList[];
};

layout (std430,binding = 5) buffer lightGridSSBO{
    LightGrid lightGrid[];
};

layout (std430,binding = 6) buffer globalIndexCountSSBO{
    uint globalIndexCount;
};

// Shared variables,shared between all invocations WITHIN A WORK GROUP
// TODO: See if I can use gl_WorkGroupSize for this,gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z
// A grouped-shared array which contains all the lights being evaluated
shared Light sharedLights[4*9*4]; // A grouped-shared array which contains all the lights being evaluated,size is thread-count

uniform mat4 viewMatrix;

bool testSphereAABB(uint light,uint tile);
float sqDistPointAABB(vec3 point,uint tile);
bool testConeAABB(uint light,uint tile);
float getLightRange(uint lightIndex);
bool isEnabled(uint lightIndex);

// Runs in batches of multiple Z slices at once
// In this implementation,6 batches,since each thread group contains four z slices (24/4=6)
// We begin by each thread representing a cluster
// Then in the light traversal loop they change to representing lights
// Then change again near the end to represent clusters
// NOTE: Tiles actually mean clusters,it's just a legacy name from tiled shading
void main(){
    // Reset every frame
    globalIndexCount = 0; // How many lights are active in t  his scene
    uint threadCount = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; // Number of threads in a group,same as local_size_x,local_size_y,local_size_z
    uint lightCount  = lightBuffer.data.length(); // Number of total lights in the scene
    uint numBatches = uint((lightCount + threadCount -1) / threadCount); // Number of groups of lights that will be completed,i.e.,number of passes


    uint tileIndex = gl_LocalInvocationIndex + gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * gl_WorkGroupID.z;
    // uint tileIndex = gl_GlobalInvocationID; // doesn't wortk,is uvec3
    
    // Local thread variables
    uint visibleLightCount = 0;
    uint visibleLightIndices[100]; // local light index list,to be transferred to global list

    // Every light is being checked against every cluster in the view frustum
    // TODO: Perform active cluster determination
    // Each individual thread will be responsible for loading a light and writing it to shared memory so other threads can read it
    for( uint batch = 0; batch < numBatches; ++batch){
        uint lightIndex = batch * threadCount + gl_LocalInvocationIndex;

        //Prevent overflow by clamping to last light which is always null
        lightIndex = min(lightIndex,lightCount);

        //Populating shared light array
        // NOTE: It is VERY important that lightBuffer.data not be referenced after this point,// since that is not thread-safe
        sharedLights[gl_LocalInvocationIndex] = lightBuffer.data[lightIndex];
        barrier(); // Synchronize read/writes between invocations within a work group

        //Iterating within the current batch of lights
        for( uint light = 0; light < threadCount; ++light){
            if( isEnabled(light)){
                uint lightType = uint(sharedLights[light].typeIndexAndFlags[0]);
                if(lightType == 0){
                    // Point light
                    if( testSphereAABB(light,tileIndex) ){
                        visibleLightIndices[visibleLightCount] = batch * threadCount + light;
                        visibleLightCount += 1;
                    }
                }
                else if(lightType == 1){
                    // Directional light
                    visibleLightIndices[visibleLightCount] = batch * threadCount + light;
                    visibleLightCount += 1;
                }
                else if(lightType == 2){
                    // Spot light
                    if( testConeAABB(light,tileIndex) ){
                        visibleLightIndices[visibleLightCount] = batch * threadCount + light;
                        visibleLightCount += 1;
                    }
                }
            }
        }
    }

    // We want all thread groups to have completed the light tests before continuing
    barrier();
    
    // Back to every thread representing a cluster

    // Adding the light indices to the cluster light index list
    uint offset = atomicAdd(globalIndexCount,visibleLightCount);
    for(uint i = 0; i < visibleLightCount; ++i){
        globalLightIndexList[offset + i] = visibleLightIndices[i];
    }

    // Updating the light grid for each cluster
    lightGrid[tileIndex].offset = offset;
    lightGrid[tileIndex].count = visibleLightCount;
}

// Return whether or not the specified light intersects with the specified tile (cluster)
bool testSphereAABB(uint light,uint tile){
    float radius = getLightRange(light);
    vec3 center  = vec3(viewMatrix * sharedLights[light].position);
    float squaredDistance = sqDistPointAABB(center,tile);

    return squaredDistance <= (radius * radius);
}

// TODO: Different test for spot-lights
// Has been done by using several AABBs for spot-light cone,this could be a good approach,or even just use one to start.
bool testConeAABB(uint light,uint tile){
    // Light light = lightBuffer.data[lightIndex];
    // float innerAngleCos = light.attributes[0];
    // float outerAngleCos = light.attributes[1];
    // float innerAngle = acos(innerAngleCos);
    // float outerAngle = acos(outerAngleCos);
    // FIXME: Actually do something clever here
    return true;
}


// Get range of light given the specified light index
float getLightRange(uint lightIndex){
    int lightType = sharedLights[lightIndex].typeIndexAndFlags[0];
    float range;
    if(lightType == 0){
        // Point light
        float brightness = 0.01; // cutoff for end of range
        float c = sharedLights[lightIndex].attributes.x;
        float lin = sharedLights[lightIndex].attributes.y;
        float quad = sharedLights[lightIndex].attributes.z;
        
        range = (-lin + sqrt(lin*lin - 4.0 * c * quad + (4.0/brightness)* quad)) / (2.0 * quad);
    }
    else if(lightType == 1){
        // Directional light
        range = FLT_MAX;
    }
    else{
        // Spot light
        range = FLT_MAX;
    }
    return range;
}

// Whether the light at the specified index is enabled
bool isEnabled(uint lightIndex){
    uint flags = sharedLights[lightIndex].typeIndexAndFlags[2];
    return (flags | 1) != 0;
}

// Get squared distance from a point to the AABB of the specified tile (cluster)
float sqDistPointAABB(vec3 point,uint tile){
    float sqDist = 0.0;
    VolumeTileAABB currentCell = cluster[tile];
    cluster[tile].maxPoint[3] = tile;
    for(int i = 0; i < 3; ++i){
        float v = point[i];
        if(v < currentCell.minPoint[i]){
            sqDist += (currentCell.minPoint[i] - v) * (currentCell.minPoint[i] - v);
        }
        if(v > currentCell.maxPoint[i]){
            sqDist += (v - currentCell.maxPoint[i]) * (v - currentCell.maxPoint[i]);
        }
    }

    return sqDist;
}

编辑：哎呀，丢掉了它的底部！

我不明白的是，为什么更改工作组的大小会完全影响编译时间？如果我的工作组大小太小而无法使计算着色器有效运行，那么这将使算法失去意义，所以我希望有一些我想念的东西。

最后一点，我想避免使用glGetProgramBinary作为解决方案。不仅因为它只是绕过了问题而不是没有解决问题，而且还因为预编译着色器无法很好地与引擎的当前架构配合使用。

解决方法

因此，我认为这一定是编译器中的错误，因为我已将sqDistPointAABB函数中的循环替换为：

vec3 minPoint = currentCell.minPoint.xyz;
vec3 maxPoint = currentCell.maxPoint.xyz;
vec3 t1 = vec3(lessThan(point,minPoint));
vec3 t2 = vec3(greaterThan(point,maxPoint));
vec3 sqDist = t1 * (minPoint - point) * (minPoint - point) + t2 * (maxPoint - point) * (maxPoint - point);
return sqDist.x + sqDist.y + sqDist.z;

它现在可以在不到一秒钟的时间内完成编译！太奇怪了

使用较大的工作组时，我的OpenGL计算着色器编译速度非常慢超过10分钟，我可以做些什么来加快速度吗？

如何解决使用较大的工作组时，我的OpenGL计算着色器编译速度非常慢超过10分钟，我可以做些什么来加快速度吗？

解决方法

相关推荐