CUDA를 사용한 잔여 계산

두 개의 벡터(oldvector와 newvector)가 있습니다. 다음 의사코드로 정의되는 잔차(residual) 값을 계산해야 합니다.

residual = 0; 
forall i : residual += (oldvector[i] - newvector[i])^2

현재는 두 개의 CUDA Thrust 연산으로 이를 계산하고 있으며, 기본적으로 먼저 다음을 수행합니다.

forall i : oldvector[i] = oldvector[i] - newvector[i]

그다음 제곱을 단항 연산자로 사용하는 thrust::transform_reduce를 호출하는데, 이는 다음을 수행합니다.

residual = 0; 
forall i : residual += oldvector[i]^2;

이 방식의 문제는 분명히 transform_reduce 이전에 글로벌 메모리에 중간 결과를 저장한다는 점입니다. 이 두 단계를 하나로 합치는, 이 문제에 대한 더 효율적인 접근 방법이 있습니까? 직접 CUDA 커널을 작성하는 것 외에 다른 옵션이 있습니까?

제가 생각했던 한 가지 접근법은 zip 반복자를 사용하는 thrust::reduce를 작성하는 것이었습니다. 이 방법의 문제는 연산자의 반환 타입이 입력과 같은 타입이어야 한다는 점입니다. 제가 이해하기로는, 리덕션 연산자가 튜플을 반환해야 하므로 불필요한 덧셈이 추가로 발생한다는 뜻입니다.

리덕션 CUDA 커널을 직접 작성한다면, 리덕션 커널에 대한 CUDA 1.1 예제 이후로 개선된 점이 있습니까?

출처

2014-05-11 aatish

[thrust::inner_product](http://thrust.github.io/doc/group__transformed__reductions.html#gad9df36f7648745ca572037727b66b48d)가 원하는 작업을 수행합니다. 차이의 제곱을 계산하는 `binary_op2`를 제공하십시오. `binary_op1`은 그냥 `thrust::plus`를 사용하면 됩니다. –

thrust::inner_product를 사용하면 단일 함수 호출로 처리할 수 있습니다.

#include <iostream> 

#include <thrust/tuple.h> 
#include <thrust/iterator/zip_iterator.h> 
#include <thrust/transform.h> 
#include <thrust/device_vector.h> 
#include <thrust/inner_product.h> 
#include <thrust/functional.h> 

#define N 2 

// Unary functor meant to be applied to a two-element tuple (for example the
// value produced by dereferencing a zip iterator over two sequences).
// Returns the square of (element 0 - element 1) as a float.
struct zdiffsq{ 
template <typename Tuple> 
    __host__ __device__ float operator()(Tuple a) 
    { 
        // Subtract first, then narrow to float, then square.
        const float delta = thrust::get<0>(a) - thrust::get<1>(a); 
        return delta * delta; 
    } 
}; 

// Binary functor returning the squared difference of its two float
// arguments, i.e. (b - a)^2. Callable from both host and device code.
struct diffsq{ 
    __host__ __device__ float operator()(float a, float b) 
    { 
        const float delta = b - a; 
        return delta * delta; 
    } 
}; 

// Demo driver: computes the sum of squared differences of two small device
// vectors in two ways and prints both results.
//   1. thrust::inner_product with diffsq as the element-wise operator and
//      thrust::plus as the reduction operator.
//   2. thrust::transform_reduce over a zip of both vectors, using zdiffsq.
int main(){ 

    typedef thrust::device_vector<float>::iterator FloatIter; 
    typedef thrust::zip_iterator<thrust::tuple<FloatIter, FloatIter> > ZipIter; 

    thrust::device_vector<float> oldvector(N); 
    thrust::device_vector<float> newvector(N); 
    oldvector[0] = 1.0f; 
    oldvector[1] = 2.0f; 
    newvector[0] = 2.0f; 
    newvector[1] = 5.0f; 

    // Method 1: fused element-wise op + reduction in a single call.
    float viaInnerProduct = thrust::inner_product(
        oldvector.begin(), oldvector.end(), 
        newvector.begin(), 
        0.0f, 
        thrust::plus<float>(), 
        diffsq()); 
    std::cout << "Result: " << viaInnerProduct << std::endl; 

    // Method 2: pair up the elements with a zip iterator, then transform and reduce.
    ZipIter zipFirst = thrust::make_zip_iterator(thrust::make_tuple(oldvector.begin(), newvector.begin())); 
    ZipIter zipLast  = thrust::make_zip_iterator(thrust::make_tuple(oldvector.end(), newvector.end())); 
    float viaTransformReduce = thrust::transform_reduce(zipFirst, zipLast, zdiffsq(), 0.0f, thrust::plus<float>()); 
    std::cout << "Result2: " << viaTransformReduce << std::endl; 
}

원래 아이디어(두 벡터를 zip으로 묶어 thrust::transform_reduce를 사용하는 방법)도 동작하게 만들 수 있으며, 위 코드는 두 가지 방법을 모두 보여줍니다.

또한 thrust placeholders를 사용하면 inner_product 예제에 쓰인 펑터 정의를 없앨 수도 있으니 살펴보시기 바랍니다.

직접 CUDA 코드를 작성하려는 경우에도, 병렬 리덕션이나 정렬처럼 자주 사용되는 알고리즘에 대해서는 현재 CUB를 사용하는 것이 표준적인 권장 사항입니다.

그리고 네, CUDA parallel reduction 샘플과 함께 제공되는 발표 자료는 여전히 빠른 병렬 리덕션에 대한 좋은 기초 입문 자료입니다.

출처

2014-05-11 23:17:48

Robert Crovella는 이미이 질문에 답변했으며 CUB을 사용하도록 제안했습니다.

Thrust와 달리 CUB는 사용할 리덕션 알고리즘의 선택이나 동시성 정도와 같이 성능에 중요한 매개변수를 고정하지 않고 사용자가 선택할 수 있도록 남겨 둡니다. 이러한 매개변수는 특정 아키텍처와 응용 프로그램에서 성능을 극대화하도록 조정할 수 있습니다. 컴파일 타임에 매개변수를 지정할 수 있으므로 런타임 성능 저하를 피할 수 있습니다.

아래에는 잔여 계산에 CUB를 사용하는 방법에 대한 전체 예제가 나와 있습니다.

#include <cub/cub.cuh> 
#include <cuda.h> 

#include "Utilities.cuh" 

#include <iostream> 

#define BLOCKSIZE   256 
#define ITEMS_PER_THREAD 8 

const int N = 4096; 

/******************************/ 
/* TRANSFORM REDUCTION KERNEL */ 
/******************************/ 
// Accumulates sum_i (indata1[i] - indata2[i])^2 over the first N elements
// into *outdata.
//
// Launch layout: 1D grid of 1D blocks with blockDim.x == BLOCKSIZE and at
// least ceil(N / BLOCKSIZE) blocks; each thread handles one element.
// Shared memory: one statically allocated cub::BlockReduce::TempStorage.
// Precondition: *outdata must be set to 0 before the launch, because every
// block adds its partial sum to it with atomicAdd.
//
// indata1, indata2: device pointers to at least N floats each (read-only).
// outdata:          device pointer to a single float accumulator.
__global__ void TransformSumKernel(const float * __restrict__ indata1, const float * __restrict__ indata2, float * __restrict__ outdata) { 

    // Global element index. The block offset must be scaled by blockDim.x
    // (threads per block), not gridDim.x (number of blocks).
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; 

    // --- Specialize BlockReduce for type float. The second template argument
    //     is the number of threads in the block, which is BLOCKSIZE here since
    //     each thread contributes exactly one value.
    typedef cub::BlockReduce<float, BLOCKSIZE> BlockReduceT; 

    __shared__ typename BlockReduceT::TempStorage temp_storage; 

    // Out-of-range threads contribute the additive identity so that every
    // thread of the block can take part in the collective reduction below.
    float squaredDiff = 0.0f; 
    if (tid < N) { 
        const float diff = indata1[tid] - indata2[tid]; 
        squaredDiff = diff * diff; 
    } 

    // Block-wide collective: must be called by all threads of the block,
    // so it is deliberately outside the bounds check. The returned sum is
    // only meaningful in thread 0.
    const float blockSum = BlockReduceT(temp_storage).Sum(squaredDiff); 

    // One atomic per block folds the partial sum into the global result.
    if (threadIdx.x == 0) atomicAdd(outdata, blockSum); 
} 

/********/ 
/* MAIN */ 
/********/ 
// Demo driver for TransformSumKernel: fills two length-N host vectors with
// 1 and 3, copies them to the device, launches the kernel and prints the
// accumulated value. For this input the exact residual is N * (1 - 3)^2.
// Returns 0 on success, 1 if a host allocation fails.
int main() { 

    // --- Allocate host side space for the two inputs and the scalar result
    float *h_data1  = (float *)malloc(N * sizeof(float)); 
    float *h_data2  = (float *)malloc(N * sizeof(float)); 
    float *h_result = (float *)malloc(sizeof(float)); 
    if (h_data1 == NULL || h_data2 == NULL || h_result == NULL) { 
        std::cerr << "host allocation failed" << std::endl; 
        free(h_data1);   // free(NULL) is a no-op
        free(h_data2); 
        free(h_result); 
        return 1; 
    } 

    // --- Allocate device side space
    float *d_data1;  gpuErrchk(cudaMalloc(&d_data1, N * sizeof(float))); 
    float *d_data2;  gpuErrchk(cudaMalloc(&d_data2, N * sizeof(float))); 
    float *d_result; gpuErrchk(cudaMalloc(&d_result, sizeof(float))); 

    for (int i = 0; i < N; i++) { 
        h_data1[i] = 1.f; 
        h_data2[i] = 3.f; 
    } 

    gpuErrchk(cudaMemcpy(d_data1, h_data1, N * sizeof(float), cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(d_data2, h_data2, N * sizeof(float), cudaMemcpyHostToDevice)); 

    // The kernel accumulates into *d_result with atomicAdd, so the
    // accumulator must start at zero (all-zero bytes represent 0.0f).
    gpuErrchk(cudaMemset(d_result, 0, sizeof(float))); 

    TransformSumKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_data1, d_data2, d_result); 
    gpuErrchk(cudaPeekAtLastError());     // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());   // errors raised while the kernel ran

    gpuErrchk(cudaMemcpy(h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost)); 

    std::cout << "output: "; 
    std::cout << h_result[0]; 
    std::cout << std::endl; 

    gpuErrchk(cudaFree(d_data1)); 
    gpuErrchk(cudaFree(d_data2)); 
    gpuErrchk(cudaFree(d_result)); 

    // Release the host buffers as well.
    free(h_data1); 
    free(h_data2); 
    free(h_result); 

    return 0; 
}

출처

2015-08-04 07:51:12 JackOLantern

답변

관련 문제