2013-06-08 3 views
0

CUDA 상수 메모리 쓰기: 커널 안에서 세 값을 출력하면 모두 0이 나옵니다. 호스트 코드는 다음과 같습니다.

/*
 * Host driver for the colour-lookup-table (CLUT) quantisation.
 *
 * Usage: <program> <input> <output> <nColors>
 *
 * Steps:
 *   1. GPU: give every pixel an initial group (clut_distributePixels).
 *   2. CPU: set each group's representative to the average colour of its pixels.
 *   3. GPU: move every pixel to the group with the closest representative
 *      (clut_checkDistances); steps 2 and 3 repeat while any pixel changed group.
 *   4. GPU: build the output image from the representatives (clut_createImage).
 *
 * Returns 0 on success and 1 on any failure (conventional process exit status).
 *
 * NOTE(review): c_rows, c_cols, c_numColors and d_change are written here with
 * cudaMemcpyToSymbol. The kernels only see those values if they are compiled
 * into the same translation unit as this function (or the project is built
 * with relocatable device code) -- confirm against the build setup.
 */
int main(int argc, char **argv){

    if (argc < 4) {
        std::cout << "Usage: \n " << argv[0] << " <input> <output> <nColors>" << std::endl;
        return 1;
    }

    // Reject a non-numeric or non-positive colour count up front: it sizes
    // h_groupRep and is used as a divisor/loop bound on the device.
    const int h_numColors = atoi(argv[3]);
    if (h_numColors <= 0) {
        std::cout << "Error: <nColors> must be a positive integer." << std::endl;
        return 1;
    }

    // Timers must exist before their first use.
    CpuTimer timer1;
    GpuTimer timer2;
    float timeStep2 = 0, timeStep3 = 0;

    Color *h_input = 0;
    int h_rows = 0, h_cols = 0;

    timer1.Start();
    int exit_code = readText2RGB(argv[1], &h_input, &h_rows, &h_cols);
    timer1.Stop();
    std::cout << "Reading: " << timer1.Elapsed() << std::endl;

    if (exit_code != SUCCESS){
        std::cout << "Error trying to read file." << std::endl;
        return 1;
    }
    if (h_rows <= 0 || h_cols <= 0) {
        // An empty image would produce zero-sized allocations and an invalid grid.
        std::cout << "Error: image has no pixels." << std::endl;
        delete[] h_input;
        return 1;
    }

    const size_t numPixels = (size_t)h_rows * (size_t)h_cols;

    int h_change = 0;
    int *h_pixelGroup = new int[numPixels];
    Color *h_groupRep = new Color[h_numColors];
    Color *h_output = new Color[numPixels];

    // Per-group accumulators for step 2, laid out as [sumR, sumG, sumB, count].
    // 64-bit sums cannot overflow even for very large images.
    long long *h_groupAcc = new long long[4 * (size_t)h_numColors];

    Color *d_input;
    int *d_pixelGroup;
    Color *d_groupRep;
    Color *d_output;

    dim3 block(B_WIDTH, B_HEIGHT);
    dim3 grid((h_cols+B_WIDTH-1)/B_WIDTH, (h_rows+B_HEIGHT-1)/B_HEIGHT);

    checkCudaError(cudaMalloc((void**)&d_input, sizeof(Color)*numPixels));
    checkCudaError(cudaMalloc((void**)&d_pixelGroup, sizeof(int)*numPixels));
    checkCudaError(cudaMalloc((void**)&d_groupRep, sizeof(Color)*h_numColors));
    checkCudaError(cudaMalloc((void**)&d_output, sizeof(Color)*numPixels));

    //  STEP 1
    // Evenly distribute all pixels of the image onto the color set
    timer2.Start();
    checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_cols, &h_cols, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_numColors, &h_numColors, sizeof(int)));
    checkCudaError(cudaMemcpy(d_input, h_input, sizeof(Color)*numPixels, cudaMemcpyHostToDevice));

    clut_distributePixels<<<grid, block>>>(d_pixelGroup);
    checkCudaError(cudaGetLastError());   // a launch reports configuration errors only this way
    checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*numPixels, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 1: " << timer2.Elapsed() << std::endl;

    // Debug output of a few initial assignments; skipped for images too small to index.
    if (numPixels > 8) {
        std::cout << h_pixelGroup[0] << ","
                  << h_pixelGroup[3] << ","
                  << h_pixelGroup[4] << ","
                  << h_pixelGroup[7] << ","
                  << h_pixelGroup[8] << std::endl;
    }

    // Do the STEP 2 and STEP 3 as long as there is at least one change of representative in a group
    do {
        //  STEP 2
        // Set the representative value to the average colour of all pixels in the same set.
        // One pass over the image accumulates every group at once.
        timer1.Start();
        for (size_t k = 0; k < 4 * (size_t)h_numColors; k++) {
            h_groupAcc[k] = 0;
        }
        for (size_t p = 0; p < numPixels; p++) {
            const int group = h_pixelGroup[p];
            if (group < 0 || group >= h_numColors) {
                continue;   // pixel is not in any tracked group; it contributes to no average
            }
            long long *acc = h_groupAcc + 4 * (size_t)group;
            acc[0] += h_input[p].r;
            acc[1] += h_input[p].g;
            acc[2] += h_input[p].b;
            acc[3] += 1;
        }
        for (int ng = 0; ng < h_numColors; ng++) {
            const long long *acc = h_groupAcc + 4 * (size_t)ng;
            if (acc[3] == 0) {
                // Empty group: fall back to white.
                h_groupRep[ng].r = 255;
                h_groupRep[ng].g = 255;
                h_groupRep[ng].b = 255;
            } else {
                h_groupRep[ng].r = (int)(acc[0] / acc[3]);
                h_groupRep[ng].g = (int)(acc[1] / acc[3]);
                h_groupRep[ng].b = (int)(acc[2] / acc[3]);
            }
        }
        timer1.Stop();
        timeStep2 += timer1.Elapsed();

        //  STEP 3
        // For each pixel in the image, compute Euclidean's distance to each representative
        // and assign it to the set which is closest
        h_change = 0;

        timer2.Start();
        checkCudaError(cudaMemcpyToSymbol(d_change, &h_change, sizeof(int)));
        checkCudaError(cudaMemcpy(d_groupRep, h_groupRep, sizeof(Color)*h_numColors, cudaMemcpyHostToDevice));

        clut_checkDistances<<<grid, block>>>(d_input, d_pixelGroup, d_groupRep);
        checkCudaError(cudaGetLastError());
        checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*numPixels, cudaMemcpyDeviceToHost));
        checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));
        timer2.Stop();
        timeStep3 += timer2.Elapsed();

        std::cout << "Chunche" << std::endl;

    } while (h_change == 1);

    std::cout << "Phase 2: " << timeStep2 << std::endl;
    std::cout << "Phase 3: " << timeStep3 << std::endl;

    //  STEP 4
    // Create the new image with the resulting color lookup table
    timer2.Start();
    clut_createImage<<<grid, block>>>(d_output, d_pixelGroup, d_groupRep);
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaMemcpy(h_output, d_output, sizeof(Color)*numPixels, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 4: " << timer2.Elapsed() << std::endl;

    checkCudaError(cudaFree(d_input));
    checkCudaError(cudaFree(d_pixelGroup));
    checkCudaError(cudaFree(d_groupRep));
    checkCudaError(cudaFree(d_output));

    // Write the quantised image (h_output), not the untouched input.
    timer1.Start();
    exit_code = writeRGB2Text(argv[2], h_output, h_rows, h_cols);
    timer1.Stop();
    std::cout << "Writing: " << timer1.Elapsed() << std::endl;

    if (exit_code != SUCCESS) {
        std::cout << "Error trying to write file." << std::endl;
    }

    delete[] h_input;       // allocated with new[] inside readText2RGB
    delete[] h_pixelGroup;
    delete[] h_groupRep;
    delete[] h_groupAcc;
    delete[] h_output;

    return (exit_code == SUCCESS) ? 0 : 1;
}

호스트 변수를 __constant__ 변수에 복사한 뒤, 다음 커널 코드에서 그 값을 출력합니다.

/*
 * Step 1 kernel: gives every pixel an initial group id in [0, c_numColors).
 *
 * Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
 * Each thread walks the image with a grid-stride loop in both dimensions, so
 * any grid size covers the whole c_rows x c_cols image.
 *
 * pixelGroup: device array of c_rows * c_cols ints, row-major; written only.
 *
 * The image is cut into c_numColors contiguous, equally sized runs of pixels
 * and run k becomes group k. (Dividing the linear index by c_numColors, as
 * before, produced ids far beyond c_numColors - 1, leaving most pixels outside
 * every valid group.)
 */
__global__ 
void clut_distributePixels(int *pixelGroup){
    const int rowStart = blockDim.y * blockIdx.y + threadIdx.y;
    const int colStart = blockDim.x * blockIdx.x + threadIdx.x;
    const int rowStride = gridDim.y * blockDim.y;
    const int colStride = gridDim.x * blockDim.x;

    // Debug output: a single thread prints the constants as seen on the device.
    if(rowStart == 0 && colStart == 0){
        printf("a: %d\n", c_rows);
        printf("b: %d\n", c_cols);
        printf("c: %d\n", c_numColors);
    }

    // 64-bit arithmetic: idx * c_numColors can exceed the int range on large images.
    const long long totalPixels = (long long)c_rows * c_cols;

    for (int i = rowStart; i < c_rows; i += rowStride) {
        for (int j = colStart; j < c_cols; j += colStride) {
            const long long idx = (long long)i * c_cols + j;
            pixelGroup[idx] = (int)(idx * c_numColors / totalPixels);
        }
    }
}

상수 메모리에 올바르게 복사하지 못하고 있거나 ... 무엇이 잘못되었는지 모르겠습니다. 조언 부탁드립니다! 호스트 코드의 다른 부분이 상수 복사를 망치고 있을지도 몰라 전체 호스트 코드를 게시했습니다.

UPDATE

Main.cu

#include "Imageproc.cuh" 
/*
 * Minimal reproduction: copy 512 into the constant c_rows, run chunche (which
 * stores c_rows + 1 into d_change) and print the value read back.
 *
 * NOTE(review): c_rows and d_change are defined in Imageproc.cuh, which both
 * this file and Imageproc.cu include. Unless the project is built with
 * relocatable device code, each .cu file presumably gets its own copy of those
 * symbols, so the copy made here is not the one the kernel reads -- which is
 * the symptom being reported. Confirm against the build setup.
 */
int main(){
    int h_change = 0;
    int h_rows = 512;

    checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));

    chunche<<<1,1>>>();
    checkCudaError(cudaGetLastError());   // launch errors are only reported this way

    // The blocking copy also waits for the kernel to finish.
    checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));

    std::cout << "H = " << h_change << std::endl;
    return 0;
}

Imageproc.cuh

// Declarations shared by the host driver and the image-processing kernels.
#ifndef _IMAGEPROC_CUH_ 
#define _IMAGEPROC_CUH_ 

#include "Utilities.cuh" 

// Thread-block dimensions used by the host when launching the kernels.
#define B_WIDTH  16 
#define B_HEIGHT 16 

// Image size and palette size, written by the host with cudaMemcpyToSymbol
// and read by the kernels.
// NOTE(review): these are definitions, not declarations. Every .cu file that
// includes this header presumably gets its own copy unless the project is
// built with relocatable device code, so a value copied from one .cu file may
// not be visible to a kernel compiled in another -- confirm against the build.
__constant__ int c_rows; 
__constant__ int c_cols; 
__constant__ int c_numColors; 

// Device-side flag/value written by kernels and read back by the host with
// cudaMemcpyFromSymbol. The same per-file-copy caveat as above applies.
__device__ int d_change; 

    // Kernel prototypes, given C linkage when compiled as C++.
    #ifdef __cplusplus 
     extern "C" 
     { 
    #endif 
     // Stores c_rows + 1 into d_change (test kernel).
     __global__ 
     void chunche(); 
     // Writes an initial group id for every pixel into pixelGroup.
     __global__ 
     void clut_distributePixels(int *pixelGroup); 
     // Reassigns every pixel to its closest representative; sets d_change to 1 on any change.
     __global__ 
     void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep); 
     // Fills clutImage with the representative colour of each pixel's group.
     __global__ 
     void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep); 
    #ifdef __cplusplus 
     } 
    #endif 

#endif 

Imageproc.cu

,617 451,515,
#include "Imageproc.cuh" 

/*
 * Test kernel: reads the constant c_rows and stores that value plus one in
 * the device variable d_change, so the host can check what the device sees.
 */
__global__ 
void chunche(){
    const int rowsSeenOnDevice = c_rows;
    d_change = rowsSeenOnDevice + 1;
}

/*
 * Step 1 kernel: gives every pixel an initial group id in [0, c_numColors).
 *
 * Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
 * Each thread walks the image with a grid-stride loop in both dimensions, so
 * any grid size covers the whole c_rows x c_cols image.
 *
 * pixelGroup: device array of c_rows * c_cols ints, row-major; written only.
 *
 * The image is cut into c_numColors contiguous, equally sized runs of pixels
 * and run k becomes group k. (Dividing the linear index by c_numColors, as
 * before, produced ids far beyond c_numColors - 1, leaving most pixels outside
 * every valid group.)
 */
__global__ 
void clut_distributePixels(int *pixelGroup){
    const int rowStart = blockDim.y * blockIdx.y + threadIdx.y;
    const int colStart = blockDim.x * blockIdx.x + threadIdx.x;
    const int rowStride = gridDim.y * blockDim.y;
    const int colStride = gridDim.x * blockDim.x;

    // 64-bit arithmetic: idx * c_numColors can exceed the int range on large images.
    const long long totalPixels = (long long)c_rows * c_cols;

    for (int i = rowStart; i < c_rows; i += rowStride) {
        for (int j = colStart; j < c_cols; j += colStride) {
            const long long idx = (long long)i * c_cols + j;
            pixelGroup[idx] = (int)(idx * c_numColors / totalPixels);
        }
    }
}

/*
 * Step 3 kernel: moves every pixel to the group whose representative colour is
 * closest (squared Euclidean distance over r, g, b).
 *
 * Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
 * Threads stride over the image by the full grid size in both dimensions.
 *
 * input      : c_rows * c_cols pixel colours, row-major (read only here).
 * pixelGroup : current group of each pixel; updated in place.
 * groupRep   : c_numColors representative colours (read only here).
 *
 * Ties keep the lowest group index because a candidate must be strictly
 * closer to replace the current best. d_change is set to 1 when any pixel
 * changes group; every writer stores the same value.
 */
__global__ 
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;
    const int rowStep = gridDim.y * blockDim.y;
    const int colStep = gridDim.x * blockDim.x;

    for (int row = firstRow; row < c_rows; row += rowStep) {
        for (int col = firstCol; col < c_cols; col += colStep) {
            const int idx = row * c_cols + col;
            const Color pixel = input[idx];

            int closest = 0;
            for (int candidate = 1; candidate < c_numColors; candidate++) {
                // Squared distance from the pixel to the candidate representative.
                const int dr = groupRep[candidate].r - pixel.r;
                const int dg = groupRep[candidate].g - pixel.g;
                const int db = groupRep[candidate].b - pixel.b;
                const int candidateDist = dr*dr + dg*dg + db*db;

                // Squared distance from the pixel to the best representative so far.
                const int br = groupRep[closest].r - pixel.r;
                const int bg = groupRep[closest].g - pixel.g;
                const int bb = groupRep[closest].b - pixel.b;
                const int closestDist = br*br + bg*bg + bb*bb;

                if (candidateDist < closestDist) {
                    closest = candidate;
                }
            }

            if (pixelGroup[idx] != closest) {
                pixelGroup[idx] = closest;
                d_change = 1;
            }
        }
    }
}

/*
 * Step 4 kernel: builds the quantised image by giving every pixel the
 * representative colour of the group it belongs to.
 *
 * Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
 * Threads stride over the image by the full grid size in both dimensions.
 *
 * clutImage  : output, c_rows * c_cols colours, row-major.
 * pixelGroup : group index of each pixel; used to index groupRep.
 * groupRep   : representative colour of each group.
 */
__global__ 
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;
    const int rowStep = gridDim.y * blockDim.y;
    const int colStep = gridDim.x * blockDim.x;

    for (int row = firstRow; row < c_rows; row += rowStep) {
        for (int col = firstCol; col < c_cols; col += colStep) {
            const int idx = row * c_cols + col;
            const Color rep = groupRep[pixelGroup[idx]];
            clutImage[idx].r = rep.r;
            clutImage[idx].g = rep.g;
            clutImage[idx].b = rep.b;
        }
    }
}

Utilities.cuh

#ifndef _UTILITIES_CUH_ 
#define _UTILITIES_CUH_ 

#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>

#define SUCCESS  1 
#define FAILURE  0 

#define checkCudaError(val) check((val), #val, __FILE__, __LINE__) 

// Three-component colour. Despite the "u" in the name, the channels are
// signed ints. Presumably each channel holds 0..255 (the host uses 255 for
// white) -- TODO confirm against the image file format.
typedef struct { 
    int r; 
    int g; 
    int b; 
} vec3u; 

// Pixel types used by the I/O helpers and kernels.
typedef vec3u Color; 
typedef unsigned char uchar; 
typedef uchar Grayscale; 

/*
 * Timer for GPU work, built on a pair of CUDA events recorded on stream 0.
 *
 * Usage: Start(); <enqueue GPU work>; Stop(); Elapsed();
 *
 * NOTE(review): the struct owns its two events and destroys them in the
 * destructor, but it is still copyable; copying one would destroy the same
 * events twice. Do not copy instances.
 */
struct GpuTimer{
    cudaEvent_t start;
    cudaEvent_t stop;

    // Creates the two events.
    GpuTimer(){
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    // Releases the two events.
    ~GpuTimer(){
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    // Records the start event on stream 0.
    void Start(){
        cudaEventRecord(start, 0);
    }

    // Records the stop event on stream 0.
    void Stop(){
        cudaEventRecord(stop, 0);
    }

    // Waits for the stop event and returns the time between the two events
    // (milliseconds, as reported by cudaEventElapsedTime). Returns 0 when the
    // events cannot be queried, e.g. when Start()/Stop() were never called,
    // instead of an uninitialised value.
    float Elapsed(){
        float elapsed = 0.0f;
        if (cudaEventSynchronize(stop) != cudaSuccess) {
            return 0.0f;
        }
        if (cudaEventElapsedTime(&elapsed, start, stop) != cudaSuccess) {
            return 0.0f;
        }
        return elapsed;
    }
};

/*
 * Backend of the checkCudaError macro: when err is not cudaSuccess, reports
 * the failing call and its location on stderr and terminates the process
 * with exit status 1.
 *
 * err  : status returned by a CUDA runtime call.
 * func : source text of the checked expression.
 * file : source file of the call site.
 * line : source line of the call site.
 */
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    if (err == cudaSuccess) {
        return;   // nothing to report
    }

    std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
    exit(1);
}

int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols); 
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols); 

int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols); 
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols); 

/*
 * Timer for host work based on clock() (processor time).
 *
 * Usage: Start(); <work>; Stop(); Elapsed();
 * start and stop are left uninitialised until Start()/Stop() are called.
 */
struct CpuTimer{
    clock_t start;
    clock_t stop;

    // Remembers the current clock() value as the start of the interval.
    void Start(){
        start = clock();
    }

    // Remembers the current clock() value as the end of the interval.
    void Stop(){
        stop = clock();
    }

    // Returns the interval between Start() and Stop() in milliseconds.
    // The subtraction is done in double: converting the raw tick counts to
    // float first drops low-order ticks once clock() exceeds 2^24, which made
    // short intervals read as 0 in long-running processes.
    float Elapsed(){
        const double ticks = static_cast<double>(stop) - static_cast<double>(start);
        return static_cast<float>(ticks * 1000.0 / CLOCKS_PER_SEC);
    }
};

#endif 

Utilities.cu

#include "Utilities.cuh" 

/*
 * Writes a grayscale image as text: the row count, the column count, then one
 * pixel value per line in row-major order.
 *
 * filename : path of the file to create or overwrite.
 * image    : rows * cols pixel values, row-major.
 * rows/cols: image dimensions.
 *
 * Returns SUCCESS when everything was written, FAILURE when the file cannot
 * be opened or a write fails (e.g. the disk is full).
 */
int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols){
    std::ofstream fileWriter(filename.c_str());
    if (!fileWriter.is_open()) {
        std::cerr << "** writeGrayscale2Text() ** : Unable to open file." << std::endl;
        return FAILURE;
    }

    fileWriter << rows << "\n";
    fileWriter << cols << "\n";
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            // size_t index: i*cols can overflow int on very large images.
            const std::size_t idx = static_cast<std::size_t>(i) * cols + j;
            // Cast so the value is printed as a number, not as a character.
            fileWriter << (int)image[idx] << "\n";
        }
    }

    // close() flushes; any earlier write error is still recorded in the stream state.
    fileWriter.close();
    if (fileWriter.fail()) {
        std::cerr << "** writeGrayscale2Text() ** : Error while writing file." << std::endl;
        return FAILURE;
    }
    return SUCCESS;
}

/*
 * Reads a grayscale image stored as text: the row count, the column count,
 * then one pixel value per entry in row-major order.
 *
 * filename : path of the file to read.
 * image    : receives a new[]-allocated array of rows * cols pixels; the
 *            caller releases it with delete[].
 * rows/cols: receive the image dimensions.
 *
 * Returns SUCCESS on success. Returns FAILURE when the file cannot be opened,
 * the dimensions are missing or negative, or the pixel data is truncated or
 * malformed; in that case *image, *rows and *cols are left untouched and
 * nothing is leaked.
 */
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols){
    std::ifstream fileReader(filename.c_str());
    if (!fileReader.is_open()) {
        std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
        return FAILURE;
    }

    // Validate the header before it is used as an allocation size.
    int nRows = 0, nCols = 0;
    if (!(fileReader >> nRows >> nCols) || nRows < 0 || nCols < 0) {
        std::cerr << "** readText2Grayscale() ** : Invalid image dimensions." << std::endl;
        return FAILURE;
    }

    const std::size_t total = static_cast<std::size_t>(nRows) * static_cast<std::size_t>(nCols);
    Grayscale *pixels = new Grayscale[total];

    for (std::size_t p = 0; p < total; p++) {
        int value = 0;
        if (!(fileReader >> value)) {
            std::cerr << "** readText2Grayscale() ** : Unexpected end of pixel data." << std::endl;
            delete[] pixels;
            return FAILURE;
        }
        pixels[p] = (Grayscale)value;
    }

    *rows = nRows;
    *cols = nCols;
    *image = pixels;
    return SUCCESS;
}

/*
 * Writes an RGB image as text: the row count, the column count, then three
 * full planes (all r values, then all g values, then all b values), each in
 * row-major order with one value per line.
 *
 * filename : path of the file to create or overwrite.
 * image    : rows * cols colours, row-major.
 * rows/cols: image dimensions.
 *
 * Returns SUCCESS when everything was written, FAILURE when the file cannot
 * be opened or a write fails (e.g. the disk is full).
 */
int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols){
    std::ofstream fileWriter(filename.c_str());
    if (!fileWriter.is_open()) {
        std::cerr << "** writeRGB2Text() ** : Unable to open file." << std::endl;
        return FAILURE;
    }

    fileWriter << rows << "\n";
    fileWriter << cols << "\n";

    // k selects the plane: 0 = r, 1 = g, 2 = b.
    for (int k = 0; k < 3; k++) {
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                // size_t index: i*cols can overflow int on very large images.
                const Color &px = image[static_cast<std::size_t>(i) * cols + j];
                const int value = (k == 0) ? px.r : (k == 1) ? px.g : px.b;
                fileWriter << value << "\n";
            }
        }
    }

    // close() flushes; any earlier write error is still recorded in the stream state.
    fileWriter.close();
    if (fileWriter.fail()) {
        std::cerr << "** writeRGB2Text() ** : Error while writing file." << std::endl;
        return FAILURE;
    }
    return SUCCESS;
}

/*
 * Reads an RGB image stored as text: the row count, the column count, then
 * three full planes (all r values, then all g values, then all b values),
 * each in row-major order.
 *
 * filename : path of the file to read.
 * image    : receives a new[]-allocated array of rows * cols colours; the
 *            caller releases it with delete[].
 * rows/cols: receive the image dimensions.
 *
 * Returns SUCCESS on success. Returns FAILURE when the file cannot be opened,
 * the dimensions are missing or negative, or the pixel data is truncated or
 * malformed; in that case *image, *rows and *cols are left untouched and
 * nothing is leaked.
 */
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols){
    std::ifstream fileReader(filename.c_str());
    if (!fileReader.is_open()) {
        // The message now names this function (it used to name readText2Grayscale).
        std::cerr << "** readText2RGB() ** : Unable to open file." << std::endl;
        return FAILURE;
    }

    // Validate the header before it is used as an allocation size.
    int nRows = 0, nCols = 0;
    if (!(fileReader >> nRows >> nCols) || nRows < 0 || nCols < 0) {
        std::cerr << "** readText2RGB() ** : Invalid image dimensions." << std::endl;
        return FAILURE;
    }

    const std::size_t total = static_cast<std::size_t>(nRows) * static_cast<std::size_t>(nCols);
    Color *pixels = new Color[total];

    // k selects the plane being read: 0 = r, 1 = g, 2 = b.
    for (int k = 0; k < 3; k++) {
        for (std::size_t p = 0; p < total; p++) {
            Color &px = pixels[p];
            int &channel = (k == 0) ? px.r : (k == 1) ? px.g : px.b;
            if (!(fileReader >> channel)) {
                std::cerr << "** readText2RGB() ** : Unexpected end of pixel data." << std::endl;
                delete[] pixels;
                return FAILURE;
            }
        }
    }

    *rows = nRows;
    *cols = nCols;
    *image = pixels;
    return SUCCESS;
}
+0

'c_rows','c_cols','c_numColors','d_change'는 어디에서 어떻게 선언 되었습니까? – sgarizvi

+0

주 기능이 포함 된 헤더 파일에 있습니다. 선언은'__constant__ int c_rows''__constant__ int c_cols''__constant__ int c_numColors'입니다. 사실, 문제는 컴파일 타임이 아니라 런타임에 발생합니다. – BRabbit27

+0

'cudaMemcpyToSymbol'에 문제가없는 것 같습니다. 이'__constant__' 정수는 다른 커널에서 사용됩니까? – sgarizvi

답변

4

상수 메모리는 암시적으로 로컬(파일) 범위 링크를 갖습니다 (Stack Overflow의 관련 답변 참고). 즉, cudaMemcpyToSymbol 호출은 해당 상수를 사용하는 커널과 동일한 .obj 파일(동일한 번역 단위)에 있어야 합니다. Main.cu에서 복사를 수행하지만 상수 메모리를 사용하는 커널은 Imageproc.cu에 있습니다. 따라서 커널 chunche는 복사된 상수 값을 볼 수 없습니다.

이 문제를 해결하는 한 가지 방법은 래퍼 함수를 구현하는 것입니다. Imageproc.cu에 cudaMemcpyToSymbol을 수행하는 함수를 추가하고, Main.cu에서 그 래퍼를 호출하면서 상수 메모리에 넣을 값을 전달하십시오.