如何解决如何在NVIDIA GPU上处理来自CPU的RGB数据,以及如何使用OpenGL纹理可视化数据
我希望在C ++ / CUDA C ++中创建一个简单的计算机视觉库,使我能够执行以下操作:
- 从主机内存中获取一些RGB数据。这些数据将以BGR字节数组的形式出现,每个通道每像素8位。
- 在CUDA内核中处理该数据。
- 将该内核的输出写回到某些主机内存中。
- 以OpenGL纹理渲染输出以方便查看。
这些函数将放在这样的类中:
class Processor{
public:
setInput(const byte* data,int imageWidth,int imageHeight);
void processData();
GLuint getInputTexture();
GLuint getOutputTexture();
void writeOutputTo(byte* destination);
}
setInput()
将在视频的每一帧(数百或数千张相同尺寸的图像)中调用。
如何编写Processor类,以便setInput()
可以有效地更新实例的内部CUDA数组,而processData()
可以将CUDA数组与OpenGL纹理同步?
下面是我尝试实现这样的类的尝试,该类包含在一个CUDA C ++文件中以及一个简单的测试。 (需要GLFW和GLAD。)通过此实现,我可以提供一些输入图像数据,运行CUDA内核以生成输出图像,并使用OpenGL纹理将它们可视化。但这效率极低,因为每次调用setInput()
时,都需要创建两个OpenGL纹理和两个CUDA表面对象。而且,如果要处理多个图像,则还必须销毁两个OpenGL纹理和两个CUDA曲面对象。
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cudaGL.h>
#include <cuda_gl_interop.h>
#include <iostream>
/** Macro for checking if CUDA has problems */
#define cudaCheckError() { \
cudaError_t err = cudaGetLastError(); \
if(err != cudaSuccess) { \
printf("Cuda error: %s:%d: %s\n",__FILE__,__LINE__,cudaGetErrorString(err)); \
exit(1); \
} \
}
/*Window dimensions*/
const int windowWidth = 1280,windowHeight = 720;
/*Window address*/
GLFWwindow* currentGLFWWindow = 0;
/**
* A simple image processing kernel that copies the inverted data from the input surface to the output surface.
*/
__global__ void kernel(cudaSurfaceObject_t input,cudaSurfaceObject_t output,int width,int height) {
//Get the pixel index
unsigned int xPx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int yPx = threadIdx.y + blockIdx.y * blockDim.y;
//Don't do any computation if this thread is outside of the surface bounds.
if (xPx >= width || yPx >= height) return;
//Copy the contents of input to output.
uchar4 pixel = { 255,128,255 };
//Read a pixel from the input. Disable to default to the flat orange color above
surf2Dread<uchar4>(&pixel,input,xPx * sizeof(uchar4),yPx,cudaBoundaryModeClamp);
//Invert the color
pixel.x = ~pixel.x;
pixel.y = ~pixel.y;
pixel.z = ~pixel.z;
//Write the new pixel color to the
surf2Dwrite(pixel,output,yPx);
}
class Processor {
public:
void setInput( uint8_t* const data,int imageHeight);
void processData();
GLuint getInputTexture();
GLuint getOutputTexture();
void writeOutputTo(uint8_t* destination);
private:
/**
* @brief True if the textures and surfaces are initialized.
*
* Prevents memory leaks
*/
bool surfacesInitialized = false;
/**
* @brief The width and height of a texture/surface pair.
*
*/
struct ImgDim { int width,height; };
/**
* @brief Creates a CUDA surface object,CUDA resource,and OpenGL texture from some data.
*/
void createTextureSurfacePair(const ImgDim& dimensions,uint8_t* const data,GLuint& textureOut,cudaGraphicsResource_t& graphicsResourceOut,cudaSurfaceObject_t& surfaceOut);
/**
* @brief Destroys every CUDA surface object,and OpenGL texture created by this instance.
*/
void destroyEverything();
/**
* @brief The dimensions of an image and its corresponding texture.
*
*/
ImgDim imageInputDimensions,imageOutputDimensions;
/**
* @brief A CUDA surface that can be read to,written from,or synchronized with a Mat or
* OpenGL texture
*
*/
cudaSurfaceObject_t d_imageInputTexture = 0,d_imageOutputTexture = 0;
/**
* @brief A CUDA resource that's bound to an array in CUDA memory
*/
cudaGraphicsResource_t d_imageInputGraphicsResource,d_imageOutputGraphicsResource;
/**
* @brief A renderable OpenGL texture that is synchronized with the CUDA data
* @see d_imageInputTexture,d_imageOutputTexture
*/
GLuint imageInputTexture = 0,imageOutputTexture = 0;
/** Returns true if nothing can be rendered */
bool empty() { return imageInputTexture == 0; }
};
void Processor::setInput(uint8_t* const data,int imageHeight)
{
//Same-size images don't need texture regeneration,so skip that.
if (imageHeight == imageInputDimensions.height && imageWidth == imageInputDimensions.width) {
/*
Possible shortcut: we know the input is the same size as the texture and CUDA surface object.
So instead of destroying the surface and texture,why not just overwrite them?
That's what I try to do in the following block,but because "data" is BGR and the texture
is RGBA,the channels get all messed up.
*/
/*
//Use the input surface's CUDAResourceDesc to gain access to the surface data array
struct cudaResourceDesc resDesc;
memset(&resDesc,sizeof(resDesc));
cudaGetSurfaceObjectResourceDesc(&resDesc,d_imageInputTexture);
cudaCheckError();
//Copy the data from the input array to the surface
cudaMemcpyToArray(resDesc.res.array.array,input.data,imageInputDimensions.width * imageInputDimensions.height * 3,cudaMemcpyHostToDevice);
cudaCheckError();
//Set status flags
surfacesInitialized = true;
return;
*/
}
//Clear everything that originally existed in the texture/surface
destroyEverything();
//Get the size of the image and place it here.
imageInputDimensions.width = imageWidth;
imageInputDimensions.height = imageHeight;
imageOutputDimensions.width = imageWidth;
imageOutputDimensions.height = imageHeight;
//Create the input surface/texture pair
createTextureSurfacePair(imageInputDimensions,data,imageInputTexture,d_imageInputGraphicsResource,d_imageInputTexture);
//Create the output surface/texture pair
uint8_t* outData = new uint8_t[imageOutputDimensions.width * imageOutputDimensions.height * 3];
createTextureSurfacePair(imageOutputDimensions,outData,imageOutputTexture,d_imageOutputGraphicsResource,d_imageOutputTexture);
delete outData;
//Set status flags
surfacesInitialized = true;
}
void Processor::processData()
{
const int threadsPerBlock = 128;
//Call the algorithm
//Set the number of blocks to call the kernel with.
dim3 blocks((unsigned int)ceil((float)imageInputDimensions.width / threadsPerBlock),imageInputDimensions.height);
kernel <<<blocks,threadsPerBlock >>> (d_imageInputTexture,d_imageOutputTexture,imageInputDimensions.width,imageInputDimensions.height);
//Sync the surface with the texture
cudaDeviceSynchronize();
cudaCheckError();
}
GLuint Processor::getInputTexture()
{
return imageInputTexture;
}
GLuint Processor::getOutputTexture()
{
return imageOutputTexture;
}
void Processor::writeOutputTo(uint8_t* destination)
{
//Haven't figured this out yet
}
void Processor::createTextureSurfacePair(const Processor::ImgDim& dimensions,cudaSurfaceObject_t& surfaceOut) {
// Create the OpenGL texture that will be displayed with GLAD and GLFW
glGenTextures(1,&textureOut);
// Bind to our texture handle
glBindTexture(GL_TEXTURE_2D,textureOut);
// Set texture interpolation methods for minification and magnification
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_NEAREST);
// Set texture clamping method
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_CLAMP);
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_CLAMP);
// Create the texture and its attributes
glTexImage2D(GL_TEXTURE_2D,// Type of texture
0,// Pyramid level (for mip-mapping) - 0 is the top level
GL_RGBA,// Internal color format to convert to
dimensions.width,// Image width i.e. 640 for Kinect in standard mode
dimensions.height,// Image height i.e. 480 for Kinect in standard mode
0,// Border width in pixels (can either be 1 or 0)
GL_BGR,// Input image format (i.e. GL_RGB,GL_RGBA,GL_BGR etc.)
GL_UNSIGNED_BYTE,// Image data type.
data); // The actual image data itself
//Note that the type of this texture is an RGBA UNSIGNED_BYTE type. When CUDA surfaces
//are synchronized with OpenGL textures,the surfaces will be of the same type.
//They won't know or care about their data types though,for they are all just byte arrays
//at heart. So be careful to ensure that any CUDA kernel that handles a CUDA surface
//uses it as an appropriate type. You will see that the update_surface kernel (defined
//above) treats each pixel as four unsigned bytes along the X-axis: one for red,green,blue,//and alpha respectively.
//Create the CUDA array and texture reference
cudaArray* bitmap_d;
//Register the GL texture with the CUDA graphics library. A new cudaGraphicsResource is created,and its address is placed in cudaTextureID.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OPENGL.html#group__CUDART__OPENGL_1g80d12187ae7590807c7676697d9fe03d
cudaGraphicsGLRegisterImage(&graphicsResourceOut,textureOut,GL_TEXTURE_2D,cudaGraphicsRegisterFlagsNone);
cudaCheckError();
//Map graphics resources for access by CUDA.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1gad8fbe74d02adefb8e7efb4971ee6322
cudaGraphicsMapResources(1,&graphicsResourceOut,0);
cudaCheckError();
//Get the location of the array of pixels that was mapped by the previous function and place that address in bitmap_d
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1g0dd6b5f024dfdcff5c28a08ef9958031
cudaGraphicsSubResourceGetMappedArray(&bitmap_d,graphicsResourceOut,0);
cudaCheckError();
//Create a CUDA resource descriptor. This is used to get and set attributes of CUDA resources.
//This one will tell CUDA how we want the bitmap_surface to be configured.
//Documentation for the struct: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaResourceDesc.html#structcudaResourceDesc
struct cudaResourceDesc resDesc;
//Clear it with 0s so that some flags aren't arbitrarily left at 1s
memset(&resDesc,sizeof(resDesc));
//Set the resource type to be an array for convenient processing in the CUDA kernel.
//List of resTypes: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g067b774c0e639817a00a972c8e2c203c
resDesc.resType = cudaResourceTypeArray;
//Bind the new descriptor with the bitmap created earlier.
resDesc.res.array.array = bitmap_d;
//Create a new CUDA surface ID reference.
//This is really just an unsigned long long.
//Docuentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1gbe57cf2ccbe7f9d696f18808dd634c0a
surfaceOut = 0;
//Create the surface with the given description. That surface ID is placed in bitmap_surface.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__SURFACE__OBJECT.html#group__CUDART__SURFACE__OBJECT_1g958899474ab2c5f40d233b524d6c5a01
cudaCreateSurfaceObject(&surfaceOut,&resDesc);
cudaCheckError();
}
void Processor::destroyEverything()
{
if (surfacesInitialized) {
//Input image CUDA surface
cudaDestroySurfaceObject(d_imageInputTexture);
cudaGraphicsUnmapResources(1,&d_imageInputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageInputGraphicsResource);
d_imageInputTexture = 0;
//Output image CUDA surface
cudaDestroySurfaceObject(d_imageOutputTexture);
cudaGraphicsUnmapResources(1,&d_imageOutputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageOutputGraphicsResource);
d_imageOutputTexture = 0;
//Input image GL texture
glDeleteTextures(1,&imageInputTexture);
imageInputTexture = 0;
//Output image GL texture
glDeleteTextures(1,&imageOutputTexture);
imageOutputTexture = 0;
surfacesInitialized = false;
}
}
/** A way to initialize OpenGL with GLFW and GLAD */
void initGL() {
// Setup window
if (!glfwInit())
return;
// Decide GL+GLSL versions
#if __APPLE__
// GL 3.2 + GLSL 150
const char* glsl_version = "#version 150";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR,3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR,2);
glfwWindowHint(GLFW_OPENGL_PROFILE,GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT,GL_TRUE); // Required on Mac
#else
// GL 3.0 + GLSL 130
const char* glsl_version = "#version 130";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR,0);
//glfwWindowHint(GLFW_OPENGL_PROFILE,GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
//glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT,GL_TRUE); // 3.0+ only
#endif
// Create window with graphics context
currentGLFWWindow = glfwCreateWindow(windowWidth,windowHeight,"Output image (OpenGL + GLFW)",NULL,NULL);
if (currentGLFWWindow == NULL)
return;
glfwMakeContextCurrent(currentGLFWWindow);
glfwSwapInterval(3); // Enable vsync
if (!gladLoadGL()) {
// GLAD failed
printf( "GLAD failed to initialize :(" );
return;
}
//Change GL settings
glViewport(0,windowWidth,windowHeight); // use a screen size of WIDTH x HEIGHT
glMatrixMode(GL_PROJECTION); // Make a simple 2D projection on the entire window
glLoadIdentity();
glOrtho(0.0,0.0,100.0);
glMatrixMode(GL_MODELVIEW); // Set the matrix mode to object modeling
glClearColor(0.0f,0.0f,0.0f);
glClearDepth(0.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Clear the window
}
/** Renders the textures on the GLFW window and requests GLFW to update */
void showTextures(GLuint top,GLuint bottom) {
// Clear color and depth buffers
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glMatrixMode(GL_MODELVIEW); // Operate on model-view matrix
glBindTexture(GL_TEXTURE_2D,top);
/* Draw top quad */
glEnable(GL_TEXTURE_2D);
glBegin(GL_QUADS);
glTexCoord2i(0,0); glVertex2i(0,0);
glTexCoord2i(0,1); glVertex2i(0,windowHeight/2);
glTexCoord2i(1,1); glVertex2i(windowWidth,windowHeight / 2);
glTexCoord2i(1,0); glVertex2i(windowWidth,0);
glEnd();
glDisable(GL_TEXTURE_2D);
/* Draw top quad */
glBindTexture(GL_TEXTURE_2D,bottom);
glEnable(GL_TEXTURE_2D);
glBegin(GL_QUADS);
glTexCoord2i(0,windowHeight / 2);
glTexCoord2i(0,windowHeight);
glTexCoord2i(1,windowHeight / 2);
glEnd();
glDisable(GL_TEXTURE_2D);
glfwSwapBuffers(currentGLFWWindow);
glfwPollEvents();
}
int main() {
initGL();
int imageWidth = windowWidth;
int imageHeight = windowHeight / 2;
uint8_t* imageData = new uint8_t[imageWidth * imageHeight * 3];
Processor p;
while (!glfwWindowShouldClose(currentGLFWWindow))
{
//Process the image here
p.setInput(imageData,imageWidth,imageHeight);
p.processData();
showTextures(p.getInputTexture(),p.getOutputTexture());
}
}
解决方法
TL; DR:我在这里至少可以看到2种方法,要么将您的数据转换为4字节像素(以某种方式)并使用cudaMemcpy2DToArray
,要么允许CUDA内核接收原始数据(而不是使用表面作为输入)。我将尽力演示这两种方法,尽管我不想花很多精力来完善这一点,所以实际上只是演示思想。
此答案正在解决您在an edit中提供的代码(不是最新代码)。但是,在随后的编辑中,主要是您似乎只是淘汰了OpenCV,我通常对此表示赞赏。但是,由于我已经完成了包含OpenCV的编辑工作,因此我选择使用自己的OpenCV“测试用例”。
-
使用每像素4个字节的数据和
cudaMemcpy2DToArray
:尽管已注释掉,但这似乎与您所展示的内容最为接近。想法是,我们将直接将输入数据复制到CUDA数组(从互操作机制获取),以访问输入数据。正如您之前所指出的,cudaMemcpyToArray
是deprecated,因此我们不会使用它。此外,我们的数据格式(每像素字节)必须匹配数组中的内容。我认为有多种方法可以解决此问题,具体取决于您的总体渠道,但是我在这里展示的方法效率不高,只是为了证明该方法是“可行的”。但是,如果有一种方法可以在管道中使用每个像素4字节的数据,则可以摆脱这里的“低效率”。要使用此方法,请使用-DUSE_1
开关编译代码。 -
通过内核输入数据。我们可以通过仅允许内核动态进行3字节到4字节的数据转换来跳过第一种情况的低效率。无论哪种方式,都存在从主机到设备的数据副本,但是此方法不需要每像素4字节的输入数据。
以下代码演示了这两个选项:
//nvcc -arch=sm_35 -o t19 glad/src/glad.c t19.cu -lGL -lGLU -I./glad/include -lglfw -std=c++11 -lopencv_core -lopencv_highgui -lopencv_imgcodecs -Wno-deprecated-gpu-targets
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cudaGL.h>
#include <cuda_gl_interop.h>
#include <iostream>
#include <opencv2/highgui.hpp>
/** Macro for checking if CUDA has problems */
#define cudaCheckError() { \
cudaError_t err = cudaGetLastError(); \
if(err != cudaSuccess) { \
printf("Cuda error: %s:%d: %s\n",__FILE__,__LINE__,cudaGetErrorString(err)); \
exit(1); \
} \
}
/*Window dimensions*/
//const int windowWidth = 1280,windowHeight = 720;
/*Window address*/
GLFWwindow* currentGLFWWindow = 0;
/**
* A simple image processing kernel that copies the inverted data from the input surface to the output surface.
*/
__global__ void kernel(cudaSurfaceObject_t input,cudaSurfaceObject_t output,int width,int height,uint8_t *data) {
//Get the pixel index
unsigned int xPx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int yPx = threadIdx.y + blockIdx.y * blockDim.y;
//Don't do any computation if this thread is outside of the surface bounds.
if (xPx >= width || yPx >= height) return;
//Copy the contents of input to output.
#ifdef USE_1
uchar4 pixel = { 255,128,255 };
//Read a pixel from the input. Disable to default to the flat orange color above
surf2Dread<uchar4>(&pixel,input,xPx * sizeof(uchar4),yPx,cudaBoundaryModeClamp);
#else
uchar4 pixel;
pixel.x = data[(xPx+yPx*width)*3 + 0];
pixel.y = data[(xPx+yPx*width)*3 + 1];
pixel.z = data[(xPx+yPx*width)*3 + 2];
pixel.w = 255;
surf2Dwrite(pixel,yPx);
#endif
//Invert the color
pixel.x = ~pixel.x;
pixel.y = ~pixel.y;
pixel.z = ~pixel.z;
//Write the new pixel color to the
surf2Dwrite(pixel,output,yPx);
}
class Processor {
public:
void setInput( uint8_t* const data,int imageWidth,int imageHeight);
void processData(uint8_t *data,uint8_t *d_data);
GLuint getInputTexture();
GLuint getOutputTexture();
void writeOutputTo(uint8_t* destination);
private:
/**
* @brief True if the textures and surfaces are initialized.
*
* Prevents memory leaks
*/
bool surfacesInitialized = false;
/**
* @brief The width and height of a texture/surface pair.
*
*/
struct ImgDim { int width,height; };
/**
* @brief Creates a CUDA surface object,CUDA resource,and OpenGL texture from some data.
*/
void createTextureSurfacePair(const ImgDim& dimensions,uint8_t* const data,GLuint& textureOut,cudaGraphicsResource_t& graphicsResourceOut,cudaSurfaceObject_t& surfaceOut);
/**
* @brief Destroys every CUDA surface object,and OpenGL texture created by this instance.
*/
void destroyEverything();
/**
* @brief The dimensions of an image and its corresponding texture.
*
*/
ImgDim imageInputDimensions,imageOutputDimensions;
/**
* @brief A CUDA surface that can be read to,written from,or synchronized with a Mat or
* OpenGL texture
*
*/
cudaSurfaceObject_t d_imageInputTexture = 0,d_imageOutputTexture = 0;
/**
* @brief A CUDA resource that's bound to an array in CUDA memory
*/
cudaGraphicsResource_t d_imageInputGraphicsResource,d_imageOutputGraphicsResource;
/**
* @brief A renderable OpenGL texture that is synchronized with the CUDA data
* @see d_imageInputTexture,d_imageOutputTexture
*/
GLuint imageInputTexture = 0,imageOutputTexture = 0;
/** Returns true if nothing can be rendered */
bool empty() { return imageInputTexture == 0; }
};
void Processor::setInput(uint8_t* const data,int imageHeight)
{
//Same-size images don't need texture regeneration,so skip that.
if (imageHeight == imageInputDimensions.height && imageWidth == imageInputDimensions.width) {
/*
Possible shortcut: we know the input is the same size as the texture and CUDA surface object.
So instead of destroying the surface and texture,why not just overwrite them?
That's what I try to do in the following block,but because "data" is BGR and the texture
is RGBA,the channels get all messed up.
*/
//Use the input surface's CUDAResourceDesc to gain access to the surface data array
#ifdef USE_1
struct cudaResourceDesc resDesc;
memset(&resDesc,sizeof(resDesc));
cudaGetSurfaceObjectResourceDesc(&resDesc,d_imageInputTexture);
cudaCheckError();
uint8_t *data4 = new uint8_t[imageInputDimensions.width*imageInputDimensions.height*4];
for (int i = 0; i < imageInputDimensions.width*imageInputDimensions.height; i++){
data4[i*4+0] = data[i*3+0];
data4[i*4+1] = data[i*3+1];
data4[i*4+2] = data[i*3+2];
data4[i*4+3] = 255;}
//Copy the data from the input array to the surface
// cudaMemcpyToArray(resDesc.res.array.array,data,imageInputDimensions.width * imageInputDimensions.height * 3,cudaMemcpyHostToDevice);
cudaMemcpy2DToArray(resDesc.res.array.array,data4,imageInputDimensions.width*4,imageInputDimensions.height,cudaMemcpyHostToDevice);
cudaCheckError();
delete[] data4;
#endif
//Set status flags
surfacesInitialized = true;
return;
}
//Clear everything that originally existed in the texture/surface
destroyEverything();
//Get the size of the image and place it here.
imageInputDimensions.width = imageWidth;
imageInputDimensions.height = imageHeight;
imageOutputDimensions.width = imageWidth;
imageOutputDimensions.height = imageHeight;
//Create the input surface/texture pair
createTextureSurfacePair(imageInputDimensions,imageInputTexture,d_imageInputGraphicsResource,d_imageInputTexture);
//Create the output surface/texture pair
uint8_t* outData = new uint8_t[imageOutputDimensions.width * imageOutputDimensions.height * 3];
createTextureSurfacePair(imageOutputDimensions,outData,imageOutputTexture,d_imageOutputGraphicsResource,d_imageOutputTexture);
delete outData;
//Set status flags
surfacesInitialized = true;
}
void Processor::processData(uint8_t *data,uint8_t *d_data)
{
const int threadsPerBlock = 128;
//Call the algorithm
//Set the number of blocks to call the kernel with.
dim3 blocks((unsigned int)ceil((float)imageInputDimensions.width / threadsPerBlock),imageInputDimensions.height);
#ifndef USE_1
cudaMemcpy(d_data,imageInputDimensions.width*imageInputDimensions.height*3,cudaMemcpyHostToDevice);
#endif
kernel <<<blocks,threadsPerBlock >>> (d_imageInputTexture,d_imageOutputTexture,imageInputDimensions.width,d_data);
//Sync the surface with the texture
cudaDeviceSynchronize();
cudaCheckError();
}
GLuint Processor::getInputTexture()
{
return imageInputTexture;
}
GLuint Processor::getOutputTexture()
{
return imageOutputTexture;
}
void Processor::writeOutputTo(uint8_t* destination)
{
//Haven't figured this out yet
}
void Processor::createTextureSurfacePair(const Processor::ImgDim& dimensions,cudaSurfaceObject_t& surfaceOut) {
// Create the OpenGL texture that will be displayed with GLAD and GLFW
glGenTextures(1,&textureOut);
// Bind to our texture handle
glBindTexture(GL_TEXTURE_2D,textureOut);
// Set texture interpolation methods for minification and magnification
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_NEAREST);
// Set texture clamping method
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_CLAMP);
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_CLAMP);
// Create the texture and its attributes
glTexImage2D(GL_TEXTURE_2D,// Type of texture
0,// Pyramid level (for mip-mapping) - 0 is the top level
GL_RGBA,// Internal color format to convert to
dimensions.width,// Image width i.e. 640 for Kinect in standard mode
dimensions.height,// Image height i.e. 480 for Kinect in standard mode
0,// Border width in pixels (can either be 1 or 0)
GL_BGR,// Input image format (i.e. GL_RGB,GL_RGBA,GL_BGR etc.)
GL_UNSIGNED_BYTE,// Image data type.
data); // The actual image data itself
//Note that the type of this texture is an RGBA UNSIGNED_BYTE type. When CUDA surfaces
//are synchronized with OpenGL textures,the surfaces will be of the same type.
//They won't know or care about their data types though,for they are all just byte arrays
//at heart. So be careful to ensure that any CUDA kernel that handles a CUDA surface
//uses it as an appropriate type. You will see that the update_surface kernel (defined
//above) treats each pixel as four unsigned bytes along the X-axis: one for red,green,blue,//and alpha respectively.
//Create the CUDA array and texture reference
cudaArray* bitmap_d;
//Register the GL texture with the CUDA graphics library. A new cudaGraphicsResource is created,and its address is placed in cudaTextureID.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OPENGL.html#group__CUDART__OPENGL_1g80d12187ae7590807c7676697d9fe03d
cudaGraphicsGLRegisterImage(&graphicsResourceOut,textureOut,GL_TEXTURE_2D,cudaGraphicsRegisterFlagsNone);
cudaCheckError();
//Map graphics resources for access by CUDA.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1gad8fbe74d02adefb8e7efb4971ee6322
cudaGraphicsMapResources(1,&graphicsResourceOut,0);
cudaCheckError();
//Get the location of the array of pixels that was mapped by the previous function and place that address in bitmap_d
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1g0dd6b5f024dfdcff5c28a08ef9958031
cudaGraphicsSubResourceGetMappedArray(&bitmap_d,graphicsResourceOut,0);
cudaCheckError();
//Create a CUDA resource descriptor. This is used to get and set attributes of CUDA resources.
//This one will tell CUDA how we want the bitmap_surface to be configured.
//Documentation for the struct: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaResourceDesc.html#structcudaResourceDesc
struct cudaResourceDesc resDesc;
//Clear it with 0s so that some flags aren't arbitrarily left at 1s
memset(&resDesc,sizeof(resDesc));
//Set the resource type to be an array for convenient processing in the CUDA kernel.
//List of resTypes: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g067b774c0e639817a00a972c8e2c203c
resDesc.resType = cudaResourceTypeArray;
//Bind the new descriptor with the bitmap created earlier.
resDesc.res.array.array = bitmap_d;
//Create a new CUDA surface ID reference.
//This is really just an unsigned long long.
//Docuentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1gbe57cf2ccbe7f9d696f18808dd634c0a
surfaceOut = 0;
//Create the surface with the given description. That surface ID is placed in bitmap_surface.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__SURFACE__OBJECT.html#group__CUDART__SURFACE__OBJECT_1g958899474ab2c5f40d233b524d6c5a01
cudaCreateSurfaceObject(&surfaceOut,&resDesc);
cudaCheckError();
}
void Processor::destroyEverything()
{
if (surfacesInitialized) {
//Input image CUDA surface
cudaDestroySurfaceObject(d_imageInputTexture);
cudaGraphicsUnmapResources(1,&d_imageInputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageInputGraphicsResource);
d_imageInputTexture = 0;
//Output image CUDA surface
cudaDestroySurfaceObject(d_imageOutputTexture);
cudaGraphicsUnmapResources(1,&d_imageOutputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageOutputGraphicsResource);
d_imageOutputTexture = 0;
//Input image GL texture
glDeleteTextures(1,&imageInputTexture);
imageInputTexture = 0;
//Output image GL texture
glDeleteTextures(1,&imageOutputTexture);
imageOutputTexture = 0;
surfacesInitialized = false;
}
}
/** A way to initialize OpenGL with GLFW and GLAD */
void initGL(int windowWidth,int windowHeight) {
// Setup window
if (!glfwInit())
return;
// Decide GL+GLSL versions
#if __APPLE__
// GL 3.2 + GLSL 150
const char* glsl_version = "#version 150";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR,3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR,2);
glfwWindowHint(GLFW_OPENGL_PROFILE,GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT,GL_TRUE); // Required on Mac
#else
// GL 3.0 + GLSL 130
//const char* glsl_version = "#version 130";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR,0);
//glfwWindowHint(GLFW_OPENGL_PROFILE,GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
//glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT,GL_TRUE); // 3.0+ only
#endif
// Create window with graphics context
currentGLFWWindow = glfwCreateWindow(windowWidth,windowHeight,"Output image (OpenGL + GLFW)",NULL,NULL);
if (currentGLFWWindow == NULL)
return;
glfwMakeContextCurrent(currentGLFWWindow);
glfwSwapInterval(3); // Enable vsync
if (!gladLoadGL()) {
// GLAD failed
printf( "GLAD failed to initialize :(" );
return;
}
//Change GL settings
glViewport(0,windowWidth,windowHeight); // use a screen size of WIDTH x HEIGHT
glMatrixMode(GL_PROJECTION); // Make a simple 2D projection on the entire window
glLoadIdentity();
glOrtho(0.0,0.0,100.0);
glMatrixMode(GL_MODELVIEW); // Set the matrix mode to object modeling
glClearColor(0.0f,0.0f,0.0f);
glClearDepth(0.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Clear the window
}
/** Renders the textures on the GLFW window and requests GLFW to update */
void showTextures(GLuint top,GLuint bottom,int windowWidth,int windowHeight) {
// Clear color and depth buffers
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glMatrixMode(GL_MODELVIEW); // Operate on model-view matrix
glBindTexture(GL_TEXTURE_2D,top);
/* Draw top quad */
glEnable(GL_TEXTURE_2D);
glBegin(GL_QUADS);
glTexCoord2i(0,0); glVertex2i(0,0);
glTexCoord2i(0,1); glVertex2i(0,windowHeight/2);
glTexCoord2i(1,1); glVertex2i(windowWidth,windowHeight / 2);
glTexCoord2i(1,0); glVertex2i(windowWidth,0);
glEnd();
glDisable(GL_TEXTURE_2D);
/* Draw bottom quad */
glBindTexture(GL_TEXTURE_2D,bottom);
glEnable(GL_TEXTURE_2D);
glBegin(GL_QUADS);
glTexCoord2i(0,windowHeight / 2);
glTexCoord2i(0,windowHeight);
glTexCoord2i(1,windowHeight / 2);
glEnd();
glDisable(GL_TEXTURE_2D);
glfwSwapBuffers(currentGLFWWindow);
glfwPollEvents();
}
int main() {
using namespace cv;
using namespace std;
// initGL();
std::string filename = "./lena.pgm";
Mat image;
image = imread(filename,CV_LOAD_IMAGE_COLOR); // Read the file
if(! image.data ) // Check for invalid input
{
cout << "Could not open or find the image" << std::endl ;
return -1;
}
int windoww = 1280;
int windowh = 720;
initGL(windoww,windowh);
uint8_t *d_data;
cudaMalloc(&d_data,image.cols*image.rows*3);
Processor p;
for (int i = 0; i < image.cols; i++)
{
image.data[i*3+0] = 0;
image.data[i*3+1] = 0;
image.data[i*3+2] = 0;
//Process the image here
p.setInput(image.data,image.cols,image.rows);
p.processData(image.data,d_data);
showTextures(p.getInputTexture(),p.getOutputTexture(),windoww,windowh);
}
}
注意:
- 第一行的注释中给出了编译命令
- 我使用单个图像创建了一个“视频”。 “视频”将在图像的顶部像素行中以黑线或白线从左向右水平移动的方式显示图像。输入图像为
lena.pgm
,可以在CUDA样本中找到该图像(例如,在/usr/local/cuda-10.1/samples/3_Imaging/SobelFilter/data/lena.pgm
处)。 - 在我看来,您好像在OpenGL和CUDA之间“共享”资源。在我看来,这似乎不适合正确的映射/取消映射序列,但它似乎正在起作用,而且似乎也不是您提出问题的重点。我没有花任何时间进行调查。我可能错过了一些东西。
- 我并不是说此代码没有缺陷,也不适合任何特定目的。它主要是您的代码。我对其进行了一些修改,以演示文本中描述的一些想法。
- 无论您是否使用
-DUSE_1
进行编译,输出中都不应出现视觉差异。
这是(https://www.3dgep.com/opengl-interoperability-with-cuda/)中第一个遇到的有用功能,我对此进行了改进,以使用最新的CUDA API和流程。您可以在cudammf中引用这两个函数。
https://github.com/prabindh/cudammf/blob/5f93358784fcbaae7eea0850424c59d2ed057dab/decoder3.cpp#L507
基本工作如下:
- 创建常规的GL纹理(GLTextureId)。通过
cudaGraphicsGLRegisterImage
将其映射为CUDA访问
- 进行一些CUDA处理,结果在CUDA缓冲区中
- 使用
cudaMemcpyToArray
在以上2个设备存储器之间进行传输
如果您的输出来自Nvidia编解码器输出,则还应该参考Nvidia Video SDK(https://developer.nvidia.com/nvidia-video-codec-sdk)中的AppDecGL
示例。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。