CUDA와 CUB를 이용한 다중 top-k 선택

여러 개의 top-k 선택을 병렬로 구현하려고 합니다. 각 선택은 n개의 요소로 이루어진 목록에서 k개의 요소를 고르며, 이러한 작업 여러 개를 병렬로 실행합니다. 이를 위해 CUB를 사용하고 있습니다. 그런데 이상한 오류가 발생하는데 어디가 잘못되었는지 모르겠습니다. 제가 무언가를 명백히 잘못 이해하고 있는 것 같습니다. 누가 확인해 주실 수 있을까요?

편집 :

free()가 들어 있는 두 코드 구간 각각의 앞에 cudaDeviceSynchronize() 호출을 추가하여 동작하게 만들었습니다. 그래서 이제 제 질문은 이렇습니다. 제가 이전에 올린 다른 질문(Does cudaFree after asynchronous call work?)의 경우와 달리, free는 비동기 호출 바로 뒤에 이어서 호출하는 것이 허용되지 않는다는 점에서 cudaFree와 다르게 동작하는 것입니까?

// Host-side entry point for a per-column top-k selection (forward
// declaration; the definition appears after the kernel below).
//
// Assume dtop has size k x m and dmat has size n x m, where k < n 
// Each column of dtop is supposed to obtain the top-k indices of 
// elements from the corresponding column in dmat. 
//
// ValueType is the element type of dmat; IndexType is the element type
// of dtop.
template<typename ValueType, typename IndexType> 
void TopKPerColumn_cub_test(DenseMatrix<IndexType, MemDev> dtop, 
    DenseMatrix<ValueType, MemDev, Const> dmat); 

/// Unary selection predicate: accepts every value strictly greater than a
/// fixed threshold.
///
/// NOTE: despite the "LE" suffix in the name, the comparison performed is
/// `a > x_`; values equal to the threshold are rejected.
template<typename T>
struct SelectLE {
    T x_;  ///< Threshold, copied in at construction.

    /// Stores a copy of the threshold @p x.
    __device__ SelectLE(const T& x) : x_(x) {}

    /// @return true when @p a is strictly greater than the stored threshold.
    /// Const-qualified because the call does not modify the predicate, which
    /// also lets it be invoked through a const object or reference.
    __device__ bool operator() (const T& a) const {
        return a > x_;
    }
};

/// Device-side worker for TopKPerColumn_cub_test. Each block processes the
/// column selected by `blockIdx.x` of both dmat and dtop.
///
/// Per column:
///   1. Copy the n = dmat.num_rows() values into a scratch
///      cub::DoubleBuffer and sort them in descending order with
///      cub::DeviceRadixSort::SortKeysDescending.
///   2. Read the k-th largest value, k = dtop.num_rows().
///   3. Run cub::DeviceSelect::If over the original column with
///      SelectLE(kth) as predicate, writing into the same column of dtop.
///
/// The body never consults threadIdx, so every thread that runs this kernel
/// repeats the whole procedure; it is written to be launched with a single
/// thread per block.
///
/// Fix over the previous version: the CUB calls are issued from device code
/// and their work was not guaranteed to be finished when the following
/// statements ran. The sorted result was read, and the scratch buffers were
/// freed, while that work could still be in flight. A
/// cudaDeviceSynchronize() now precedes each of those two sections.
///
/// NOTE(review): device-side cudaDeviceSynchronize() is reported as
/// deprecated/removed in recent CUDA toolkits -- confirm against the toolkit
/// in use before building this with a current compiler.
/// NOTE(review): the malloc() results are not checked; a failed device-heap
/// allocation would be dereferenced. Confirm the heap limit is large enough
/// for all blocks.
/// NOTE(review): the predicate is strict (`a > kth`), so values equal to the
/// k-th largest are not selected, and DeviceSelect::If appears to copy the
/// selected input *values* rather than their indices -- confirm that this
/// matches what dtop is expected to contain.
template<typename ValueType, typename IndexType>
__global__ void k_TopKPerColumn_cub_test(DenseMatrix<IndexType, MemDev> dtop,
    DenseMatrix<ValueType, MemDev, Const> dmat) {
    int n = dmat.num_rows();
    int k = dtop.num_rows();

    // Scratch copy of the column: the sort works on the double buffer, so
    // the input matrix itself is left untouched.
    cub::DoubleBuffer<ValueType> keys;
    keys.d_buffers[0] = reinterpret_cast<ValueType*>(
     malloc(sizeof(ValueType) * n));
    keys.d_buffers[1] = reinterpret_cast<ValueType*>(
     malloc(sizeof(ValueType) * n));
    memcpy(keys.d_buffers[keys.selector], dmat.get_col(blockIdx.x).data(),
     sizeof(ValueType) * n);

    // First call with a null workspace only reports the required size;
    // the second call performs the sort.
    void* temp_storage = 0;
    size_t temp_storage_size = 0;
    cub::DeviceRadixSort::SortKeysDescending(
     temp_storage, temp_storage_size, keys, n);
    temp_storage = malloc(temp_storage_size);
    cub::DeviceRadixSort::SortKeysDescending(
     temp_storage, temp_storage_size, keys, n);

    // Wait for the sort to finish before reading its output or releasing
    // the memory it works on.
    cudaDeviceSynchronize();
    ValueType kth = keys.Current()[k-1];

    free(temp_storage);
    free(keys.d_buffers[0]);
    free(keys.d_buffers[1]);

    // Same two-phase pattern (size query, then real call) for the selection.
    temp_storage = 0;
    temp_storage_size = 0;
    int* nb_selected = reinterpret_cast<int*>(malloc(sizeof(int)));
    SelectLE<ValueType> selector(kth);

    cub::DeviceSelect::If(temp_storage, temp_storage_size,
     const_cast<ValueType*>(dmat.get_col(blockIdx.x).data()),
     dtop.get_col(blockIdx.x).data(),
     nb_selected, n, selector);
    temp_storage = malloc(temp_storage_size);
    cub::DeviceSelect::If(temp_storage, temp_storage_size,
     const_cast<ValueType*>(dmat.get_col(blockIdx.x).data()),
     dtop.get_col(blockIdx.x).data(),
     nb_selected, n, selector);

    // Wait for the selection to finish before releasing the counter and
    // workspace it writes to.
    cudaDeviceSynchronize();

    free(nb_selected);
    free(temp_storage);
}

/// Host entry point: launches k_TopKPerColumn_cub_test with one block per
/// column of dtop and a single thread per block.
///
/// @param dtop  Output matrix; its row count is used as k by the kernel.
/// @param dmat  Input matrix; each column is processed independently.
///
/// This function neither synchronizes with the device nor checks for launch
/// errors after the kernel call; callers that need the result or the error
/// status must do so themselves.
template<typename ValueType, typename IndexType>
void TopKPerColumn_cub_test(DenseMatrix<IndexType, MemDev> dtop,
    DenseMatrix<ValueType, MemDev, Const> dmat) {
    // A grid of zero blocks is an invalid launch configuration, and with no
    // columns there is nothing to compute anyway.
    if (dtop.num_cols() == 0) {
        return;
    }
    k_TopKPerColumn_cub_test<<<dtop.num_cols(), 1>>>(dtop, dmat);
}

출처

2014-01-18 shaoyl85

동작하게 만들 수는 있었지만, 이 구현은 단일 스레드 CPU 코드보다도 느리게 수행됩니다. 결국 이것을 힙 정렬(heap sort)로 구현하고 힙을 공유 메모리에 두었습니다. 성능이 좋습니다.

출처

2014-01-19 04:43:26 shaoyl85

답변

관련 문제