CUDA 추력 라이브러리 해결 방법 - for_each 동기화 오류?

CUDA의 추력 라이브러리를 사용하여 CUDA에서 간단한 동적 벡터를 수정하려고합니다. 그러나 오류가 일부 동기화 프로세스와 관련되어 있음을 나타내는 화면에서 "launch_closure_by_value"오류가 표시됩니다.CUDA 추력 라이브러리 해결 방법 - for_each 동기화 오류?

이 오류로 인해 간단한 1D 동적 배열 수정이 불가능합니다.

오류의 원인이되는 코드 세그먼트는 다음과 같습니다.

void 
setIndexedGridInfo(float* a, float*b) 
{ 

    thrust::device_ptr<float> d_oldData(a); 
    thrust::device_ptr<float> d_newData(b); 

    float c = 0.0; 

    thrust::for_each(
     thrust::make_zip_iterator(thrust::make_tuple(d_oldData,d_newData)), 
     thrust::make_zip_iterator(thrust::make_tuple(d_oldData+8,d_newData+8)), 
     grid_functor(c)); 
}

grid_functor이 _kernel.cu

에 정의되어 System.cu

float* a= (float*)(malloc(8*sizeof(float))); 
a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7; 
float* b = (float*)(malloc(8*sizeof(float))); 
setIndexedGridInfo(a,b);

System.cu에있는 코드 세그먼트에 정의되어 내가 setIndexedGrid을 부르는 .cpp 파일에서

struct grid_functor 
{ 
    float a; 

    __host__ __device__ 
    grid_functor(float grid_Info) : a(grid_Info) {} 

    template <typename Tuple> 
    __device__ 
    void operator()(Tuple t) 
    { 
     volatile float data = thrust::get<0>(t); 
     float pos = data + 0.1; 
     thrust::get<1>(t) = pos; 
    } 

};

또한 출력 창 (Visual Studio 사용)에서도 볼 수 있습니다.

Particles.exe에서 0x000007fefdc7cacd에서

첫 번째 예외 : 마이크로 소프트 C++ 예외 : 메모리 위치에서 cudaError_enum 0x0029eb60 .. 첫 번째 예외에서 0x000007fefdc7cacd 에서 smokeParticles.exe : 마이크로 소프트 C++ 예외 : 추력 :: 시스템 :: 메모리 위치 0x0029ecf0 .. Particles.exe에서 0x000007fefdc7cacd에서 처리되지 않은 예외에 SYSTEM_ERROR : 마이크로 소프트 C++는 예외 을 : 메모리 위치 0x0029ecf0에서 추력 :: 시스템 :: SYSTEM_ERROR ..

는 무엇을 일으키는 문제?

출처

2012-03-10 Emre Turkoz

장치 메모리에 포인터가 있어야하는 함수에서 호스트 메모리 포인터를 사용하려고합니다.

float* a= (float*)(malloc(8*sizeof(float))); 
a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7; 
float* b = (float*)(malloc(8*sizeof(float))); 
setIndexedGridInfo(a,b); 

..... 

thrust::device_ptr<float> d_oldData(a); 
thrust::device_ptr<float> d_newData(b);

thrust::device_ptr는 "포장"추력이 사용할 수 있도록 CUDA API를 사용하여 할당 된 장치 메모리 포인터위한 것입니다 :이 코드는 문제입니다. 호스트 포인터를 장치 포인터로 직접 처리하려고합니다. 그것은 불법입니다. 이처럼 setIndexedGridInfo 기능을 수정할 수 :

void setIndexedGridInfo(float* a, float*b, const int n) 
{ 

    thrust::device_vector<float> d_oldData(a,a+n); 
    thrust::device_vector<float> d_newData(b,b+n); 

    float c = 0.0; 

    thrust::for_each(
     thrust::make_zip_iterator(thrust::make_tuple(d_oldData.begin(),d_newData.begin())), 
     thrust::make_zip_iterator(thrust::make_tuple(d_oldData.end(),d_newData.end())), 
     grid_functor(c)); 
}

device_vector 생성자는 장치 메모리를 할당 한 다음 장치에 호스트 메모리의 내용을 복사합니다. 당신이보고있는 오류를 고쳐야합니다. 비록 당신이 for_each 이터레이터로 무엇을하려고하는지, 그리고 당신이 wrttien을 가지고있는 펑터가 정확한지는 확실하지 않습니다.

편집 : 여기

는 코드의 완전한, 컴파일 가능한, 실행 가능한 버전입니다 :

#include <cstdlib> 
#include <cstdio> 
#include <thrust/device_vector.h> 
#include <thrust/for_each.h> 
#include <thrust/copy.h> 

struct grid_functor 
{ 
    float a; 

    __host__ __device__ 
    grid_functor(float grid_Info) : a(grid_Info) {} 

    template <typename Tuple> 
    __device__ 
    void operator()(Tuple t) 
    { 
     volatile float data = thrust::get<0>(t); 
     float pos = data + 0.1f; 
     thrust::get<1>(t) = pos; 
    } 

}; 

void setIndexedGridInfo(float* a, float*b, const int n) 
{ 

    thrust::device_vector<float> d_oldData(a,a+n); 
    thrust::device_vector<float> d_newData(b,b+n); 

    float c = 0.0; 

    thrust::for_each(
     thrust::make_zip_iterator(thrust::make_tuple(d_oldData.begin(),d_newData.begin())), 
     thrust::make_zip_iterator(thrust::make_tuple(d_oldData.end(),d_newData.end())), 
     grid_functor(c)); 

    thrust::copy(d_newData.begin(), d_newData.end(), b); 
} 

int main(void) 
{ 
    const int n = 8; 
    float* a= (float*)(malloc(n*sizeof(float))); 
    a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7; 
    float* b = (float*)(malloc(n*sizeof(float))); 
    setIndexedGridInfo(a,b,n); 

    for(int i=0; i<n; i++) { 
     fprintf(stdout, "%d (%f,%f)\n", i, a[i], b[i]); 
    } 

    return 0; 
}

내가 컴파일과 OS 10.6.8 호스트에서이 코드를 실행할 수 있습니다 CUDA 4.1 like this :

$ nvcc -Xptxas="-v" -arch=sm_12 -g -G thrustforeach.cu 
./thrustforeach.cu(18): Warning: Cannot tell what pointer points to, assuming global memory space 
./thrustforeach.cu(20): Warning: Cannot tell what pointer points to, assuming global memory space 
./thrustforeach.cu(18): Warning: Cannot tell what pointer points to, assuming global memory space 
./thrustforeach.cu(20): Warning: Cannot tell what pointer points to, assuming global memory space 
ptxas info : Compiling entry function '_ZN6thrust6detail7backend4cuda6detail23launch_closure_by_valueINS2_18for_each_n_closureINS_12zip_iteratorINS_5tupleINS0_15normal_iteratorINS_10device_ptrIfEEEESB_NS_9null_typeESC_SC_SC_SC_SC_SC_SC_EEEEi12grid_functorEEEEvT_' for 'sm_12' 
ptxas info : Used 14 registers, 160+0 bytes lmem, 16+16 bytes smem, 4 bytes cmem[1] 
ptxas info : Compiling entry function '_ZN6thrust6detail7backend4cuda6detail23launch_closure_by_valueINS2_18for_each_n_closureINS_12zip_iteratorINS_5tupleINS0_15normal_iteratorINS_10device_ptrIfEEEESB_NS_9null_typeESC_SC_SC_SC_SC_SC_SC_EEEEj12grid_functorEEEEvT_' for 'sm_12' 
ptxas info : Used 14 registers, 160+0 bytes lmem, 16+16 bytes smem, 4 bytes cmem[1] 

$ ./a.out 
0 (0.000000,0.100000) 
1 (1.000000,1.100000) 
2 (2.000000,2.100000) 
3 (3.000000,3.100000) 
4 (4.000000,4.100000) 
5 (5.000000,5.100000) 
6 (6.000000,6.100000) 
7 (7.000000,7.100000)

출처

2012-03-10 09:05:33 talonmies

저는 완전히 추력의 개념을 오해했습니다. 호스트 배열을 통과 할 수도 있다고 생각했습니다. 나는 모든 요소를 0.1 씩 증가 시키려고했습니다. 운동 용. 도와 줘서 고마워. –

하지만이 device_vector 초기화가 작동하지 않습니다. device_vector 이 충분하지 않습니다. device_vector도 typename Alloc를 필요로합니다. device_vector –

나를 신뢰하십시오. 'for_each' 호출에 대한 변경 내에서 작은 구문 오류가 발생했습니다. 새로운 버전을보십시오. 필자는 이제 컴파일러를 확인했고 compute 1.2 장치에서 CUDA 4.1을 실행합니다. – talonmies

CUDA 추력 라이브러리 해결 방법 - for_each 동기화 오류?

답변

관련 문제