直方图内核在cuda gpu上执行时卡住了

问题描述 投票:-1回答:1

第一步,我正在转换成功完成的彩色图像到灰度。在第二步中,我尝试应用灰度图像上的直方图,但是在调用直方图内核时,代码卡住了,我使用了print语句进行调试,但无法找出导致内核卡住的原因。下面的代码卡在print语句,“在调用内核之前”,这可能表明内核根本没有运行。直方图内核代码和主要代码如下。下面提供所有四个步骤代码1-rgb2grayscale 2-histogram1D 3-contrast1D 4-smoothImage。

#include <iomanip>
#include <iostream>
#include <cstring>
#include "CImg.h"
#include <stdio.h>

#define CHECKCUDAERROR(err)     {if (cudaSuccess != err) {fprintf(stderr, "CUDA ERROR: %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(err), err,  __LINE__, __FILE__); exit(EXIT_FAILURE); }}

using std::cout;
using std::cerr;
using std::endl;
using std::fixed;
using std::setprecision;
using cimg_library::CImg;

// Constants
const bool displayImages = false;
const bool saveAllImages = false;
const unsigned int HISTOGRAM_SIZE = 256;
const unsigned int BAR_WIDTH = 4;
const unsigned int CONTRAST_THRESHOLD = 80;
const float filter[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 2.0f, 3.0f, 2.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};

unsigned char *d1, *d2, *d3, *d4;
unsigned int *d5;


double realtime()
{
    struct timeval tp;
    struct timezone tzp;
    gettimeofday(&tp, &tzp);
    return tp.tv_sec + tp.tv_usec * 1e-6;
}
__global__ void rgb2gray(unsigned char *inputImage, unsigned char *grayImage, const int width, 
const int height) 
{

    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;

    if (col < width && row < height)
    {

            float grayPix = 0.0f;
            float r = static_cast< float >(inputImage[(row * width) + col]);
            float g = static_cast< float >(inputImage[(width * height) + (row * width) + col]);
            float b = static_cast< float >(inputImage[(2 * width * height) + (row * width) + col]);

            grayPix = (0.3f * r) + (0.59f * g) + (0.11f * b);

            grayImage[(row * width) + col] = static_cast< unsigned char >(grayPix);
        }
    }
void rgb2gray_pl(unsigned char *inputImage, unsigned char *grayImage, const int width, const int height) {

    // Initialize device pointers.
    size_t size = width * height * sizeof(unsigned char);

    double cudamalloc_time = realtime();

    // Allocate device memory.
    CHECKCUDAERROR(cudaMalloc(&d1, 3*size));
    CHECKCUDAERROR(cudaMalloc(&d2, size));

    cout << fixed << setprecision(6);
    cout << "cudaMalloc: \t\t" << realtime() - cudamalloc_time << " seconds." << endl;

    double cudamemcpy_d1 = realtime();

    // Transfer from host to device.
    CHECKCUDAERROR(cudaMemcpy(d1, inputImage, 3*size, cudaMemcpyHostToDevice));

    cout << fixed << setprecision(6);
    cout << "cudaMemcpy_host_to_device: \t\t" << realtime() - cudamemcpy_d1 << " seconds." << endl;

    double kernel_time = realtime();

    //define block and grid dimensions
    const dim3 dimGrid((int)ceil(((width +16) /16)), (int)ceil(((height + 16) /16)));
    const dim3 dimBlock(16, 16);

    //execute cuda kernel
    rgb2gray<<<dimGrid, dimBlock>>>(d1, d2, width, height);
    CHECKCUDAERROR(cudaPeekAtLastError());
    cout << fixed << setprecision(6);
    cout << "kernel: \t\t" << realtime() - kernel_time << " seconds." << endl;

    double cudamemcpy_d2 = realtime();

    //copy computed gray data array from device to host
    CHECKCUDAERROR(cudaMemcpy(grayImage, d2, size, cudaMemcpyDeviceToHost));

    cout << fixed << setprecision(6);
    cout << "cudaMemcpy_device_to_host: \t\t" << realtime() - cudamemcpy_d2 << " seconds." << endl;

}
__global__ void histogram1D(unsigned char *grayImage, unsigned char *histogramImage, const int width, const int height, 
                unsigned int *histogram, const unsigned int HISTOGRAM_SIZE, const unsigned int BAR_WIDTH) 
{
    unsigned int max = 0;
    printf("hello\n");  
    memset(reinterpret_cast< void * >(histogram), 0, HISTOGRAM_SIZE * sizeof(unsigned int));

    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    printf("hello1\n"); 

    if (col < width && row < height) {
            histogram[static_cast< unsigned int >(grayImage[(row * width) + col])] += 1;
        }
    printf("hello2\n"); 
    for ( unsigned int i = 0; i < HISTOGRAM_SIZE; i++ ) {
        if ( histogram[i] > max ) {
            max = histogram[i];
        }
    }
    printf("hello3\n"); 

    for ( int x = 0; x < HISTOGRAM_SIZE * BAR_WIDTH; x += BAR_WIDTH ) {
        unsigned int value = HISTOGRAM_SIZE - ((histogram[x / BAR_WIDTH] * HISTOGRAM_SIZE) / max);
        printf("hello4\n"); 


        for ( unsigned int y = 0; y < value; y++ ) {
            for ( unsigned int i = 0; i < BAR_WIDTH; i++ ) {
                if ((y * HISTOGRAM_SIZE * BAR_WIDTH) < (HISTOGRAM_SIZE * HISTOGRAM_SIZE * BAR_WIDTH))
                {
                printf("hello5\n"); 
                histogramImage[(y * HISTOGRAM_SIZE * BAR_WIDTH) + x + i] = 0;
                }
            }
        }
        for ( unsigned int y = value; y < HISTOGRAM_SIZE; y++ ) {
            for ( unsigned int i = 0; i < BAR_WIDTH; i++ ) {
                if ((y * HISTOGRAM_SIZE * BAR_WIDTH) < (HISTOGRAM_SIZE * HISTOGRAM_SIZE * BAR_WIDTH))
                {
                printf("hello6\n"); 

                histogramImage[(y * HISTOGRAM_SIZE * BAR_WIDTH) + x + i] = 255;
                }
            }
        }
    }

printf("hello7\n"); 

}

void histogram_pl(unsigned char *grayImage, unsigned char *histogramImage, const int width, const int height, 
                unsigned int *histogram, const unsigned int HISTOGRAM_SIZE, const unsigned int BAR_WIDTH) {

    size_t size = width * height * sizeof(unsigned char);
    printf("cudamalloc");
    double malloc_time = realtime();
    CHECKCUDAERROR(cudaMalloc(&d4, BAR_WIDTH * HISTOGRAM_SIZE * HISTOGRAM_SIZE * sizeof(unsigned char)));
    CHECKCUDAERROR(cudaMalloc(&d5, HISTOGRAM_SIZE * sizeof(unsigned int)));


     cout << fixed << setprecision(6);
    cout << "histogram_malloc: \t\t" << realtime() - malloc_time << " seconds." << endl;

    double filter_time = realtime();

    printf("cudamemcpyHostToDevice");

    CHECKCUDAERROR(cudaMemcpy(d4, histogramImage, BAR_WIDTH * HISTOGRAM_SIZE * HISTOGRAM_SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice));
    CHECKCUDAERROR(cudaMemcpy(d5, histogram, HISTOGRAM_SIZE * sizeof(unsigned int), cudaMemcpyHostToDevice));

    cout << fixed << setprecision(6);
    cout << "histogram_h_d: \t\t" << realtime() - filter_time << " seconds." << endl;

    double histogram_kernel_time = realtime();

    //execute cuda kernel
    const dim3 dimGrid((int)ceil(((width +16) /16)), (int)ceil(((height + 16) /16)));
    const dim3 dimBlock(16, 16);
    printf("before calling kernel\n");
    histogram1D<<<dimGrid, dimBlock>>>(d2, d4, width, height, d5,HISTOGRAM_SIZE,BAR_WIDTH);
    CHECKCUDAERROR(cudaPeekAtLastError());
    CHECKCUDAERROR(cudaDeviceSynchronize());
    cout << fixed << setprecision(6);
    cout << "histogram_kernel: \t\t" << realtime() - histogram_kernel_time << " seconds." << endl;
    printf("hello8\n"); 
    double d_h_time = realtime();

    //copy computed histogram data array from device to host
    CHECKCUDAERROR(cudaMemcpy(histogram, d5, HISTOGRAM_SIZE * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    printf("hello9\n"); 

    cout << fixed << setprecision(6);
    cout << "histogram_d_to_h: \t\t" << realtime() - d_h_time << " seconds." << endl;
    }
__global__ void contrast1D(unsigned char *grayImage, const int width, const int height, unsigned int *histogram, 
                const unsigned int HISTOGRAM_SIZE, const unsigned int CONTRAST_THRESHOLD) 
{
    unsigned int i = 0;
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    while ( (i < HISTOGRAM_SIZE) && (histogram[i] < CONTRAST_THRESHOLD) ) {
        i++;
    }
    unsigned int min = i;

    i = HISTOGRAM_SIZE - 1;
    while ( (i > min) && (histogram[i] < CONTRAST_THRESHOLD) ) {
        i--;
    }
    unsigned int max = i;
    float diff = max - min;

    if (col < width && row < height) {
            unsigned char pixel = grayImage[(row * width) + col];

            if ( pixel < min ) {
                pixel = 0;
            }
            else if ( pixel > max ) {
                pixel = 255;
            }
            else {
                pixel = static_cast< unsigned char >(255.0f * (pixel - min) / diff);
            }

            grayImage[(row * width) + col] = pixel;
        }

}
void contrast1D_pl(unsigned char *grayImage, const int width, const int height, unsigned int *histogram, const unsigned int HISTOGRAM_SIZE, const unsigned int CONTRAST_THRESHOLD) {



    double kernel_time = realtime();

    //define block and grid dimensions
    const dim3 dimGrid((int)ceil(((width +16) /16)), (int)ceil(((height + 16) /16)));
    const dim3 dimBlock(16, 16);

    //execute cuda kernel
    contrast1D<<<dimGrid, dimBlock>>>(d2,width, height,d5,HISTOGRAM_SIZE,CONTRAST_THRESHOLD);
    CHECKCUDAERROR(cudaPeekAtLastError());
    cout << fixed << setprecision(6);
    cout << "kernel: \t\t" << realtime() - kernel_time << " seconds." << endl;


}
__global__ void triangularSmooth(unsigned char *grayImage, unsigned char *smoothImage, 
                      const int width, const int height, const float *filter) 
{

    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;

    if (col < (width-1) && row < (height-1))
        {
            unsigned int filterItem = 0;
            float filterSum = 0.0f;
            float smoothPix = 0.0f;

            for ( int fy = row - 2; fy < row + 3; fy++ ) {
                for ( int fx = col - 2; fx < col + 3; fx++ ) {
                    if ( ((fy < 0) || (fy >= height)) || ((fx < 0) || (fx >= width)) ) {
                        filterItem++;
                        continue;
                    }

                    smoothPix += grayImage[(fy * width) + fx] * filter[filterItem];
                    filterSum += filter[filterItem];
                    filterItem++;
                }
            }

            smoothPix /= filterSum;
            smoothImage[(row * width) + col] = static_cast< unsigned char >(smoothPix);

        }

}

void smooth_pl(unsigned char *grayImage, unsigned char *smoothImage, const int width, const int height) {

    size_t size = width * height * sizeof(unsigned char);

    double malloc_time = realtime();
    float *filterGpu;
    CHECKCUDAERROR(cudaMalloc(&d3, size));
    CHECKCUDAERROR(cudaMalloc(&filterGpu, 25 * sizeof(float)));


     cout << fixed << setprecision(6);
    cout << "triangular_smooth_malloc: \t\t" << realtime() - malloc_time << " seconds." << endl;

    double filter_time = realtime();

    CHECKCUDAERROR(cudaMemcpy(filterGpu, filter, 25 * sizeof(float), cudaMemcpyHostToDevice));

    cout << fixed << setprecision(6);
    cout << "triangular_smooth_filter: \t\t" << realtime() - filter_time << " seconds." << endl;


    double t_s_kernel_time = realtime();

    //execute cuda kernel
    const dim3 dimGrid((int)ceil(((width +16) /16)), (int)ceil(((height + 16) /16)));
    const dim3 dimBlock(16, 16);

    triangularSmooth<<<dimGrid, dimBlock>>>(d2, d3, width, height, filterGpu);
    CHECKCUDAERROR(cudaPeekAtLastError());
    cout << fixed << setprecision(6);
    cout << "triangular_smooth_kernel: \t\t" << realtime() - t_s_kernel_time << " seconds." << endl;


    double d_h_time = realtime();

    //copy computed smooth data array from device to host
    CHECKCUDAERROR(cudaMemcpy(smoothImage, d3, size, cudaMemcpyDeviceToHost));

    cout << fixed << setprecision(6);
    cout << "triangular_smooth_d_to_h: \t\t" << realtime() - d_h_time << " seconds." << endl;


    double cuda_free = realtime();

    CHECKCUDAERROR(cudaFree(d1));
    CHECKCUDAERROR(cudaFree(d2));
    CHECKCUDAERROR(cudaFree(d3));
    CHECKCUDAERROR(cudaFree(d4));
    CHECKCUDAERROR(cudaFree(d5));


    cout << fixed << setprecision(6);
    cout << "cudaFree: \t\t" << realtime() - cuda_free << " seconds." << endl;
}

int main(int argc, char *argv[]) 
{
    //NSTimer total = NSTimer("total", false, false);
        //double prev_time;

    if ( argc != 2 ) {
        cerr << "Usage: " << argv[0] << " <filename>" << endl;        
        cout << fixed << setprecision(6);
        return 1;
    }

    // Load the input image
    CImg< unsigned char > inputImage = CImg< unsigned char >(argv[1]);
    if ( displayImages ) {
        inputImage.display("Input Image");
    }
    if ( inputImage.spectrum() != 3 ) {
        //cerr << "The input must be a color image." << endl;
        //return 1;
    }
    double total_rgb2gray__time = realtime();


    CImg<unsigned char> grayImage = CImg<unsigned char>(inputImage.width(), inputImage.height(), 1, 1);


    rgb2gray_pl(inputImage.data(), grayImage.data(), inputImage.width(), inputImage.height());

    cout << fixed << setprecision(6);
    cout << "total_time: \t\t" << realtime() - total_rgb2gray__time << " seconds." << endl;

    //grayImage.save("./grayscale.bmp");

    CImg< unsigned char > histogramImage = CImg< unsigned char >(BAR_WIDTH * HISTOGRAM_SIZE, HISTOGRAM_SIZE, 1, 1);


    unsigned int *histogram = new unsigned int [HISTOGRAM_SIZE];

    histogram_pl(grayImage.data(),histogramImage.data(),grayImage.width(), grayImage.height(), histogram, HISTOGRAM_SIZE, BAR_WIDTH);

    contrast1D_pl(grayImage.data(), grayImage.width(), grayImage.height(), histogram, HISTOGRAM_SIZE, CONTRAST_THRESHOLD);

    delete [] histogram;

    double total_smooth__time = realtime();

    CImg< unsigned char > smoothImage = CImg< unsigned char >(grayImage.width(), grayImage.height(), 1, 1);


    smooth_pl(grayImage.data(),smoothImage.data(), grayImage.width(), grayImage.height());

    cout << fixed << setprecision(6);
    cout << "total_time: \t\t" << realtime() - total_smooth__time << " seconds." << endl;


    smoothImage.save("./smooth.bmp");


    //allocate and initialize memory on device

    return 0;
}

c++ cuda
1个回答
0
投票

我正在使用google colab来使用gpu。经过进一步调试后,我知道代码在不同的gpu上显示了不同的行为,现在连接的gpu是P100。代码进入histogram1D内核并停留在printf(“ hello5 \”)]上,连续打印hello5

© www.soinside.com 2019 - 2024. All rights reserved.