CUDA에서 1D 레이어드(layered) 텍스처를 만들고 사용하는 방법

CUDA를 처음 사용해 봅니다. CUDA에서 1D 및 2D 텍스처를 사용하는 방법은 알아냈지만, 1D 레이어드(layered) 텍스처를 사용하는 데 어려움을 겪고 있습니다. 텍스처를 사용하는 커널의 출력이 모두 0으로 나오는데, 이는 분명히 잘못된 결과입니다. 그런데 무엇을 잘못하고 있는지 모르겠습니다. 텍스처를 올바르게 설정했는지 확신이 서지 않지만, 모든 곳에서 CUDA 오류를 확인했는데도 아무 문제도 발견되지 않았습니다. 1D 레이어드 텍스처를 올바르게 설정하고 사용하는 방법을 보여 주실 수 있는 분 계신가요? 제 코드는 다음과 같습니다. 미리 감사드립니다:

// To Compile: nvcc backproj.cu -o backproj.out 
// To Run: ./backproj.out 

// Includes, system 
#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 

// Includes CUDA 
#include <cuda_runtime.h> 
#include <cuda_profiler_api.h> 

// pi expands to the expression acos(-1) at every use site. The expansion is
// not parenthesized, so casts and operators written next to `pi` apply
// directly to the acos(-1) call.
#define pi acos(-1) 

// File-scope texture reference declared as cudaTextureType1DLayered with float
// elements and cudaReadModeElementType. main() binds a CUDA array to it and
// interp1Kernel samples it through tex1DLayered().
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef; 

// Samples one layer of the 1D layered texture `texRef` at a list of positions
// (intended to behave like 1D interpolation in MATLAB).
//   d_output     - receives one sample per entry of d_locations
//   d_locations  - positions to sample
//   numlocations - number of entries in d_locations and d_output
//   layer        - layer of texRef to read from
// Expects a 1D launch; threads whose global index is past numlocations do
// nothing.
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int layer) {
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard the grid tail: the launch may contain more threads than locations.
    if (tid >= numlocations) {
        return;
    }

    // The +0.5f offset shifts the requested position by half a texel before
    // the fetch (presumably so integer positions hit texel centres under
    // linear filtering - confirm against the CUDA texture-fetching rules).
    const float coord = d_locations[tid] + 0.5f;

    // Fetch from the texture and store the sample in global memory.
    d_output[tid] = tex1DLayered(texRef, coord, layer);
}

// Host code

// Abort with file/line context as soon as a CUDA runtime call fails. Without
// this, a failed allocation or bind goes unnoticed and the kernel simply
// produces zeros.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

// Builds a small multi-layer data set, uploads it into a 1D layered CUDA
// array, binds that array to texRef, resamples layer 0 at fractional positions
// with interp1Kernel and prints the input next to the interpolated output.
// Returns 0 on success; exits with EXIT_FAILURE on the first CUDA error.
int main()
{
    // Setup h_data and locations to interpolate from
    const int len = 10;
    const int numlayers = 3;
    const int upsamp = 3;
    const int loclen = 1 + (len - 1) * upsamp;
    const float idx_spacing = 1.0f / (float)upsamp;

    // Layer-major layout: every layer is one contiguous row of `len` samples.
    // This is the layout the cudaMemcpy3D call below describes (row pitch =
    // len floats, one row per layer).
    float h_data[numlayers][len], h_loc[loclen], h_output[loclen];
    for (int j = 0; j < numlayers; j++)
        for (int i = 0; i < len; i++)
            h_data[j][i] = 1.0f + cosf((float)pi * i / (j + 1.0f));
    for (int i = 0; i < loclen; i++)
        h_loc[i] = i * idx_spacing;

    // Upload the sample locations
    float* d_loc = NULL;
    CUDA_CHECK(cudaMalloc(&d_loc, loclen * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_loc, h_loc, loclen * sizeof(float), cudaMemcpyHostToDevice));

    // Allocate a *layered* CUDA array. A layered array must come from
    // cudaMalloc3DArray with the cudaArrayLayered flag; for the 1D layered
    // case the extent is (width, 0, layers). cudaMallocArray(len, numlayers)
    // would create an ordinary 2D array instead.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray = NULL;
    CUDA_CHECK(cudaMalloc3DArray(&cuArray, &channelDesc,
                                 make_cudaExtent(len, 0, numlayers), cudaArrayLayered));

    // Copy the host data into the layered array. The copy extent needs a
    // height of 1 (not 0), otherwise nothing is transferred.
    cudaMemcpy3DParms copyParams = { 0 };
    copyParams.srcPtr = make_cudaPitchedPtr(h_data, len * sizeof(float), len, 1);
    copyParams.dstArray = cuArray;
    copyParams.extent = make_cudaExtent(len, 1, numlayers);
    copyParams.kind = cudaMemcpyHostToDevice;
    CUDA_CHECK(cudaMemcpy3D(&copyParams));

    // Set texture reference parameters
    texRef.addressMode[0] = cudaAddressModeBorder;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = false;

    // Bind the array to the texture reference
    CUDA_CHECK(cudaBindTextureToArray(texRef, cuArray, channelDesc));

    // Allocate result of transformation in device memory
    float* d_output = NULL;
    CUDA_CHECK(cudaMalloc(&d_output, loclen * sizeof(float)));

    // Invoke kernel on layer 0 (ceil-div so the grid just covers loclen)
    const int thdsPerBlk = 256;
    const int blksPerGrid = (loclen + thdsPerBlk - 1) / thdsPerBlk;
    printf("Threads Per Block: %d, Blocks Per Grid: %d\n", thdsPerBlk, blksPerGrid);
    interp1Kernel<<<blksPerGrid, thdsPerBlk>>>(d_output, d_loc, loclen, 0);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // Fetch the interpolated samples
    CUDA_CHECK(cudaMemcpy(h_output, d_output, loclen * sizeof(float), cudaMemcpyDeviceToHost));

    // Print Results
    printf("\n Original Indices \n");
    for (int i = 0; i < len; i++) printf(" %d ", i);
    printf("\n Original array \n");
    for (int i = 0; i < len; i++) printf("%5.3f ", h_data[0][i]);
    printf("\n Output Indices \n");
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n Output Array \n");
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_output[i]);
    printf("\n");

    // Release the texture binding and all device memory
    CUDA_CHECK(cudaUnbindTexture(texRef));
    CUDA_CHECK(cudaFreeArray(cuArray));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_loc));

    return 0;
}

출처

2016-08-29 Rehman Ali

레이어드 텍스처용 메모리를 할당하려면 cudaArrayLayered 플래그를 설정한 cudaMalloc3DArray를 사용해야 합니다. 툴킷 샘플에 레이어드 텍스처 사용법을 보여 주는 완전한 예제가 있으니, 이를 살펴보면 어떻게 동작하는지 확인할 수 있습니다.

출처

2016-08-29 05:36:55 talonmies

안타깝게도 CUDA SDK는 2D 레이어드 텍스처를 사용하는 방법만 보여 줍니다. 1D 레이어드 텍스처의 경우에는 좀 더 까다로운 부분이 있습니다. 다음과 같이 extentDesc를 만들 때 make_cudaExtent의 두 번째 인수에 0을 넣어야 한다는 것을 알게 되었습니다:

cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered

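하지만 cudaMemcpy3D에 쓰이는 mParams.extent를 make_cudaExtent로 만들 때는 두 번째 인수에 여전히 1을 넣어야 합니다:

mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything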

또한 make_cudaPitchedPtr의 피치(pitch)처럼 명확하지 않은 세부 사항이 몇 가지 더 있습니다. 그래서 1D 레이어드 텍스처에 대해 완전하게 동작하는 코드를 첨부했습니다. 이런 예제를 어디에서도 찾을 수 없었기 때문에, 같은 처지에 있는 다른 분들에게 도움이 되기를 바랍니다:

// To Compile: nvcc layeredTexture1D.cu -o layeredTexture1D.out 
// To Run: ./layeredTexture1D.out 

// Includes, system 
#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 

// Includes CUDA 
#include <cuda_runtime.h> 
#include <cuda_profiler_api.h> 

// pi expands to the expression acos(-1) at every use site. The expansion is
// not parenthesized, so casts and operators written next to `pi` apply
// directly to the acos(-1) call.
#define pi acos(-1) 

// File-scope texture reference declared as cudaTextureType1DLayered with float
// elements and cudaReadModeElementType. main() binds a layered CUDA array to
// it and interp1Kernel samples it through tex1DLayered(), one layer per
// y-index of the launch.
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef; 

// Samples every layer of the 1D layered texture `texRef` at a list of
// positions (intended to behave like 1D interpolation in MATLAB).
//   d_output     - receives numlayers * numlocations samples, stored layer
//                  after layer: element (layer, k) is at layer*numlocations + k
//   d_locations  - positions to sample; shared by all layers
//   numlocations - number of entries in d_locations
//   numlayers    - number of layers to sample
// Expects a 2D launch: x indexes the location, y indexes the layer. Threads
// outside either range do nothing.
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int numlayers) {
    const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int lyr = blockDim.y * blockIdx.y + threadIdx.y;

    // Guard the grid tail in both dimensions.
    if (col >= numlocations || lyr >= numlayers) {
        return;
    }

    // The +0.5f offset shifts the requested position by half a texel before
    // the fetch (presumably so integer positions hit texel centres under
    // linear filtering - confirm against the CUDA texture-fetching rules).
    const float coord = d_locations[col] + 0.5f;
    const float sample = tex1DLayered(texRef, coord, lyr);

    // Store the sample in the row of d_output that belongs to this layer.
    d_output[lyr * numlocations + col] = sample;
}

// Host code

// Abort with file/line context as soon as a CUDA runtime call fails, instead
// of silently continuing with an unusable array or texture binding.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

// Builds a small multi-layer data set, uploads it into a 1D layered CUDA
// array, binds that array to texRef, resamples every layer at fractional
// positions with interp1Kernel and prints the input next to the interpolated
// output. Returns 0 on success; exits with EXIT_FAILURE on the first CUDA
// error.
int main()
{
    // Setup h_data and locations to interpolate from
    const int len = 7;
    const int numlayers = 3;
    const int upsamp = 4;
    const int loclen = 1 + (len - 1) * upsamp;
    const float idx_spacing = 1.0f / (float)upsamp;

    // h_data is layer-major: layer j occupies h_data[j*len .. j*len + len - 1].
    float h_data[numlayers * len], h_loc[loclen];
    float h_output[loclen * numlayers];
    for (int j = 0; j < numlayers; j++)
        for (int i = 0; i < len; i++)
            h_data[len * j + i] = 1.0f + cosf((float)pi * i / (j + 1.0f));
    for (int i = 0; i < loclen; i++)
        h_loc[i] = i * idx_spacing;

    // Upload the sample locations
    float* d_loc = NULL;
    CUDA_CHECK(cudaMalloc(&d_loc, loclen * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_loc, h_loc, loclen * sizeof(float), cudaMemcpyHostToDevice));

    // Allocate the layered CUDA array. For a 1D layered array the extent is
    // (width, 0, layers): the height must be 0.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers);
    cudaArray* cuArray = NULL;
    CUDA_CHECK(cudaMalloc3DArray(&cuArray, &channelDesc, extentDesc, cudaArrayLayered));

    // Copy the host data into the array. The source is described as rows of
    // `len` floats (pitch = len * sizeof(float)), one row per layer, and the
    // copy extent needs a non-zero height or nothing is transferred.
    cudaMemcpy3DParms mParams = { 0 };
    mParams.srcPtr = make_cudaPitchedPtr(h_data, len * sizeof(float), len, 1);
    mParams.dstArray = cuArray;
    mParams.extent = make_cudaExtent(len, 1, numlayers);
    mParams.kind = cudaMemcpyHostToDevice;
    CUDA_CHECK(cudaMemcpy3D(&mParams));

    // Set texture reference parameters
    texRef.addressMode[0] = cudaAddressModeBorder;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = false;

    // Bind the array to the texture reference
    CUDA_CHECK(cudaBindTextureToArray(texRef, cuArray, channelDesc));

    // Allocate result of transformation in device memory
    float* d_output = NULL;
    CUDA_CHECK(cudaMalloc(&d_output, loclen * numlayers * sizeof(float)));

    // Invoke kernel: x covers the locations, y covers the layers
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid((loclen + dimBlock.x - 1) / dimBlock.x,
                 (numlayers + dimBlock.y - 1) / dimBlock.y, 1);
    interp1Kernel<<<dimGrid, dimBlock>>>(d_output, d_loc, loclen, numlayers);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // Fetch the interpolated samples
    CUDA_CHECK(cudaMemcpy(h_output, d_output, loclen * numlayers * sizeof(float),
                          cudaMemcpyDeviceToHost));

    // Print Results
    printf("\n Original Indices \n");
    for (int i = 0; i < len; i++) printf(" %d ", i);
    printf("\n Original array \n");
    for (int j = 0; j < numlayers; j++) {
        for (int i = 0; i < len; i++) {
            printf("%5.3f ", h_data[i + j * len]);
        }
        printf("\n");
    }
    printf("\n Output Indices \n");
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n Output Array \n");
    for (int j = 0; j < numlayers; j++) {
        for (int i = 0; i < loclen; i++) {
            printf("%5.3f ", h_output[i + j * loclen]);
        }
        printf("\n");
    }
    printf("\n");

    // Release the texture binding and all device memory
    CUDA_CHECK(cudaUnbindTexture(texRef));
    CUDA_CHECK(cudaFreeArray(cuArray));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_loc));

    return 0;
}

출처

2016-08-30 01:08:33

CUDA에서 1D 레이어드(layered) 텍스처를 만들고 사용하는 방법

답변

관련 문제