CUDA scheduling issue or kernel launch bug?

My program keeps a work deque for each block of the kernel launch. Each block loops, popping work from its own deque, processing it, and pushing dynamically generated work back. An array of deque flags is maintained to indicate which deques are active, i.e., still have work. When a deque is empty, the kernel enters another loop that tries to steal work from other blocks' deques. The stop condition is reached when no deque is active any more.
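In outline, the per-block control flow is the following (a condensed sketch with placeholder helpers, not the real code, which follows below):

// Condensed sketch of the scheme; the helpers are trivial placeholders so the
// sketch compiles. The real popBottom/popTop/pushBottom logic is further down.
__device__ bool tryPopOwnDeque()     { return false; }  // placeholder "popBottom"
__device__ bool tryStealFromOther()  { return false; }  // placeholder "popTop"
__device__ bool anyDequeActive()     { return false; }  // placeholder flag scan

__global__ void workLoopSketch()
{
    while (true) {
        bool gotWork = tryPopOwnDeque()      // pop from this block's deque first
                    || tryStealFromOther();  // otherwise try to steal a batch
        if (!gotWork) {
            if (!anyDequeActive()) return;   // stop condition: no deque active
            continue;                        // keep polling the active deques
        }
        // ... process the batch and push dynamically generated work ...
    }
}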
For the test, I set up every deque to start with one work item. My problem is that some blocks seem never to execute at all; because they never run, they stay flagged active and my program enters an infinite loop. Let's look at the code, starting with the launch wrapper and the test:
#define NDEQUES 256
#define DEQUE_SIZE 20000
void workStealingWrap(bool *abDequeFlags , int *auiDequesBottoms , unsigned int *auiDequesAges , int2 *aDeques ,
int2 *aOutput , unsigned int *puiOutputCounter)
{
workKernel<<<NDEQUES , WORK_STEALING_THREADS>>>(abDequeFlags , auiDequesBottoms , auiDequesAges , aDeques , aOutput ,
puiOutputCounter);
CUT_CHECK_ERROR("workKernel");
}
//----------------------------------------------------------------------------------------------------------
// This entry point is for work stealing testing.
//----------------------------------------------------------------------------------------------------------
int main(int argc, char* argv[])
{
//Test 0: All deques start with 1 work item.
bool h_abDequeFlags[NDEQUES];
int h_aiDequesBottoms[NDEQUES];
unsigned int h_auiDequesAges[NDEQUES];
int2 *h_aDeques = (int2*) malloc(sizeof(int2)*NDEQUES*DEQUE_SIZE);
unsigned int h_uiOutputCounter;
int2 *h_aOutput = (int2*) malloc(sizeof(int2)*NDEQUES*DEQUE_SIZE);
for(int i = 0 ; i < NDEQUES ; ++i)
{
h_abDequeFlags[i] = true;
h_aiDequesBottoms[i] = 1;
h_auiDequesAges[i] = 0;
int2 work; work.x = i ; work.y = i;
h_aDeques[DEQUE_SIZE*i] = work;
}
bool *d_abDequeFlags;
int *d_auiDequesBottoms;
unsigned int *d_auiDequesAges;
int2 *d_aDeques;
GPUMALLOC((void**)&d_abDequeFlags , sizeof(bool)*NDEQUES);
GPUMALLOC((void**)&d_auiDequesBottoms , sizeof(int)*NDEQUES);
GPUMALLOC((void**)&d_auiDequesAges , sizeof(unsigned int)*NDEQUES);
GPUMALLOC((void**)&d_aDeques , sizeof(int2)*NDEQUES*DEQUE_SIZE);
TOGPU(d_abDequeFlags , h_abDequeFlags , sizeof(bool)*NDEQUES);
TOGPU(d_auiDequesBottoms , h_aiDequesBottoms , sizeof(int)*NDEQUES);
TOGPU(d_auiDequesAges , h_auiDequesAges , sizeof(unsigned int)*NDEQUES);
TOGPU(d_aDeques , h_aDeques , sizeof(int2)*NDEQUES*DEQUE_SIZE);
int2 *d_aOutput;
unsigned int *d_puiOutputCounter;
GPUMALLOC((void**)&d_aOutput , sizeof(int2)*NDEQUES*DEQUE_SIZE);
GPUMALLOC((void**)&d_puiOutputCounter , sizeof(unsigned int));
GPUMEMSET(d_aOutput , -1 , sizeof(int2)*NDEQUES*DEQUE_SIZE);
GPUMEMSET(d_puiOutputCounter , 0 , sizeof(unsigned int));
workStealingWrap(d_abDequeFlags , d_auiDequesBottoms , d_auiDequesAges , d_aDeques , d_aOutput , d_puiOutputCounter);
FROMGPU(h_aOutput , d_aOutput , sizeof(int2)*NDEQUES*DEQUE_SIZE);
FROMGPU(&h_uiOutputCounter , d_puiOutputCounter , sizeof(unsigned int));
assert(h_uiOutputCounter == NDEQUES);
for(int i = 0 ; i < NDEQUES*DEQUE_SIZE ; ++i)
{
int2 work = h_aOutput[i];
if(i < NDEQUES)
{
assert(work.x >= 1 && work.x < NDEQUES*5 && work.y >= 1 && work.y < NDEQUES*5);
}
else
{
assert(work.x == -1 && work.y == -1);
}
}
GPUFREE(d_abDequeFlags);
GPUFREE(d_auiDequesBottoms);
GPUFREE(d_auiDequesAges);
GPUFREE(d_aDeques);
GPUFREE(d_aOutput);
GPUFREE(d_puiOutputCounter);
safeFree(h_aDeques);
safeFree(h_aOutput);
}
I debugged this code using NSight and verified that only the first 8 blocks are executing. Here are the kernel and the auxiliary pop and push functions:
bool __inline__ __device__ pushBottom(int *aiDequeBottoms , const int &iBid , const unsigned int &uiSize ,
unsigned int &uiPushStartIdx)
{
int iOldBot = aiDequeBottoms[iBid];
uiPushStartIdx = iOldBot;
iOldBot += uiSize;
if(iOldBot < DEQUE_SIZE)
{
aiDequeBottoms[iBid] = iOldBot;
return true;
}
else
{
return false;
}
}
bool __inline__ __device__ popTop(int *aiDequesBottoms , unsigned int *auiDequesAges , const int &iBid ,
int2 &popStartIdxAndSize)
{
int index;
unsigned int oldAge = auiDequesAges[iBid];
int localBot = aiDequesBottoms[iBid];
index = oldAge >> WORK_STEALING_TAG_NBITS;
if(localBot < index + 2*WORK_STEALING_BATCH_SIZE)
{
return false;
}
int localTag = oldAge & WORK_STEALING_TAG_MASK;
int size = min(WORK_STEALING_BATCH_SIZE , localBot - index);
unsigned int newAge = (index+size << WORK_STEALING_TAG_NBITS)| localTag;
if(oldAge == atomicCAS(&auiDequesAges[iBid] , oldAge , newAge))
{
popStartIdxAndSize.x = index;
popStartIdxAndSize.y = size;
return true;
}
else
{
return false;
}
}
bool __inline__ __device__ popBottom(int *aiDequesBottoms , unsigned int *auiDequesAges , const int &iBid ,
int2 &popStartIdxAndSize)
{
int localBot = aiDequesBottoms[iBid];
if(localBot == 0)
{
return false;
}
int index = localBot;
localBot = localBot - WORK_STEALING_BATCH_SIZE;
aiDequesBottoms[iBid] = localBot;
unsigned int oldAge = auiDequesAges[iBid];
int oldAgeTop = int(oldAge >> WORK_STEALING_TAG_NBITS);
if(localBot > oldAgeTop)
{
popStartIdxAndSize.y = WORK_STEALING_BATCH_SIZE;
popStartIdxAndSize.x = index - WORK_STEALING_BATCH_SIZE;
return true;
}
aiDequesBottoms[iBid] = 0;
unsigned int newAge = ((oldAge & WORK_STEALING_TAG_MASK) + 1) % (WORK_STEALING_TAG_MASK + 1);
if(index > oldAgeTop)
{
if(oldAge == atomicCAS(&auiDequesAges[iBid] , oldAge , newAge))
{
popStartIdxAndSize.y = index - oldAgeTop;
popStartIdxAndSize.x = index - popStartIdxAndSize.y;
return true;
}
}
auiDequesAges[iBid] = newAge;
return false;
}
//----------------------------------------------------------------------------------------------------------------------------
// Function to pop work from deques. Each block tries to pop from its own deque. If no work is available, it tries to
// steal from other deques.
//----------------------------------------------------------------------------------------------------------------------------
template <typename Work>
bool __inline__ __device__ popWork(bool *abDequeFlags , int *aiDequesBottoms , unsigned int *auiDequesAges ,
const Work *aDeques , const int &iTid , const int &iBid , unsigned int &uiPopDequeIdx , int2 &popStartIdxAndSize ,
int &iLocalDequeCounter , bool &bPopFlag , unsigned int *uiActiveDeques , unsigned int &uiActiveDequesIdx , Work &work)
{
if(iTid == 0)
{ //Try to pop from block deque
iLocalDequeCounter = 0;
bPopFlag = popBottom(aiDequesBottoms , auiDequesAges , iBid , popStartIdxAndSize);
if(bPopFlag)
{
uiPopDequeIdx = iBid;
}
else
{
abDequeFlags[iBid] = false;
}
}
__syncthreads();
while(!bPopFlag)
{ //No more work, try to steal some!
if(iTid == 0)
{
uiActiveDequesIdx = 0;
}
__syncthreads();
if(iTid < NDEQUES)
{
if(abDequeFlags[iTid] == true) //assuming blockDim.x >= NDEQUES, so every deque is checked
{ //Set this deque for a work stealing attempt.
unsigned int uiIdx = atomicAdd(&uiActiveDequesIdx,1);
uiActiveDeques[uiIdx] = iTid;
}
}
__syncthreads();
if(iTid == 0)
{ //Try to steal until it succeeds or there are no deques left to search
bPopFlag = false;
for(uiPopDequeIdx = 0 ; uiPopDequeIdx < uiActiveDequesIdx && bPopFlag == false ; ++uiPopDequeIdx)
{
bPopFlag = popTop(aiDequesBottoms , auiDequesAges , uiPopDequeIdx , popStartIdxAndSize);
}
}
__syncthreads();
if(uiActiveDequesIdx == 0)
{ //No more work to steal. End.
return false;
}
}
//Get popped data
if(iTid < popStartIdxAndSize.y) //assuming number of threads >= WORK_SIZE
{
work = aDeques[uiPopDequeIdx*DEQUE_SIZE + popStartIdxAndSize.x + iTid];
}
return true;
}
//----------------------------------------------------------------------------------------------------------------------------
// Function to push work onto deques. To achieve better coalesced global memory accesses, the input data is assumed to
// be tightly packed in shared mem.
//----------------------------------------------------------------------------------------------------------------------------
template <typename Work>
bool __inline__ __device__ pushWork(int *aiDequesBottoms , Work *aDeques , const int &iTid , const int &iBid ,
const unsigned int &uiDequeOutputCounter , Work *aOutputLocalWork)
{
//Transfer to global mem.
unsigned int uiWorkLeft = uiDequeOutputCounter;
unsigned int uiThreadOffset = iTid;
while(uiWorkLeft > 0)
{
unsigned int uiWorkTransfered = min(WORK_STEALING_BATCH_SIZE , uiWorkLeft);
unsigned int uiPushStartIdx;
bool bPushFlag;
if(iTid == 0)
{
bPushFlag = pushBottom(aiDequesBottoms , iBid , uiWorkTransfered , uiPushStartIdx);
}
__syncthreads();
if(!bPushFlag)
{
return false;
}
if(iTid < uiWorkTransfered)
{
aDeques[DEQUE_SIZE*iBid + uiPushStartIdx + uiThreadOffset] = aOutputLocalWork[uiThreadOffset];
}
uiThreadOffset += WORK_STEALING_BATCH_SIZE;
uiWorkLeft -= uiWorkTransfered;
}
return true;
}
void __global__ workKernel(bool *abDequeFlags , int *aiDequesBottoms , unsigned int *auiDequesAges , int2 *aDeques ,
int2 *aOutput , unsigned int *puiOutputCounter)
{
int iTid = threadIdx.x;
int iBid = blockIdx.x;
__shared__ int2 aOutputLocalWork[DEQUE_SHARED_SIZE];
__shared__ unsigned int uiPopDequeIdx;
__shared__ int2 popStartIdxAndSize;
__shared__ int iLocalDequeCounter;
__shared__ bool bPopFlag;
__shared__ unsigned int uiActiveDeques[NDEQUES]; //Contains indices for deques with useful work that can be stolen
__shared__ unsigned int uiActiveDequesIdx;
__shared__ unsigned int uiLastOutputCounter;
int2 work;
int iRun = 0;
while(true) //Work loop continues until the block can neither pop from its bottom nor steal work from other deques
{
if(!popWork<int2>(abDequeFlags , aiDequesBottoms , auiDequesAges , aDeques , iTid , iBid , uiPopDequeIdx ,
popStartIdxAndSize , iLocalDequeCounter , bPopFlag , uiActiveDeques , uiActiveDequesIdx , work))
{ //No more work
return;
}
//Useful work comes here. For now, just some dummy code for testing.
if(iRun < 5)
{ //Just 5 iterations that generate more work
if(iTid < popStartIdxAndSize.y)
{
unsigned int uiNewWorkCounter = 1;
int iDequeOutputCounter = atomicAdd(&iLocalDequeCounter , uiNewWorkCounter);
work.x++; work.y++;
aOutputLocalWork[iDequeOutputCounter] = work;
__syncthreads();
if(iTid == 0)
{
uiLastOutputCounter = atomicAdd(puiOutputCounter , iLocalDequeCounter);
}
__syncthreads();
if(iTid < iLocalDequeCounter) //assuming iLocalDequeCounter <= blockDim.x
{
aOutput[uiLastOutputCounter + iTid] = aOutputLocalWork[iTid];
}
}
}
//Push back to global mem
if(!pushWork<int2>(aiDequesBottoms , aDeques , iTid , iBid , iLocalDequeCounter , aOutputLocalWork))
{ //overflow
return;
}
++iRun;
}
}
That's all the code and the test. I'm wondering whether this is a scheduling issue, with the popWork polling consuming all the resources, or just a bug of mine. Any help would be appreciated.
Can the GPU actually run that many blocks at once? You can schedule 1000 blocks on any GPU, but that doesn't mean they all execute concurrently. The GPU runs as many as it can, and later blocks execute only once earlier ones terminate. – CygnusX1
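To illustrate that point, here is a minimal sketch of my own (hypothetical names, not code from the question): if the grid launches more blocks than can be resident at once, a resident block that spin-waits on a block that has not been scheduled yet never makes progress, and the kernel hangs.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical repro: every block except the last spins on a flag that only
// the last block sets. If the last block cannot become resident until an
// earlier block exits, this kernel deadlocks, since resident blocks are never
// swapped out to make room.
__global__ void spinOnLastBlock(volatile int *flag)
{
    if (blockIdx.x == gridDim.x - 1) {
        *flag = 1;                         // releaser
    } else {
        while (*flag == 0) { /* spin */ }  // may never observe the store
    }
}

int main()
{
    int *d_flag;
    cudaMalloc(&d_flag, sizeof(int));
    cudaMemset(d_flag, 0, sizeof(int));
    spinOnLastBlock<<<4096, 256>>>(d_flag);     // heavily oversubscribed grid
    cudaError_t err = cudaDeviceSynchronize();  // likely hangs here
    printf("%s\n", cudaGetErrorString(err));
    cudaFree(d_flag);
    return 0;
}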
I know the blocks don't all run concurrently. But I expected the scheduler to be at least smart enough not to let kernel blocks starve this way. Is there anything stating that later blocks run only after earlier blocks finish? If that's true, I'll reduce the number of blocks issued. I don't remember reading how warp scheduling is done in CUDA. –
Blocks are never preempted. If there aren't enough resources to run them all at once, new blocks can execute only when old blocks terminate. I believe the Programming Guide mentions this, but I also ran my own tests (on my Fermi GPU, though) and confirmed it. – CygnusX1
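For persistent-kernel schemes like this, where every block must be co-resident for the inter-block polling to make progress, one option (assuming a CUDA version with the occupancy API, 6.5 or later, so newer than this thread) is to cap the grid at the number of blocks that can actually be resident at once. The stub kernel and the block size of 64 below are placeholders for workKernel and WORK_STEALING_THREADS:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void workKernelStub() { }  // stand-in for the real workKernel

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // Blocks of the given size that can be resident on one SM at once.
    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &blocksPerSM, workKernelStub, /*blockSize=*/64, /*dynSmemSize=*/0);

    int maxResidentBlocks = blocksPerSM * prop.multiProcessorCount;
    printf("launch at most %d blocks for co-residency\n", maxResidentBlocks);
    return 0;
}

Launching at most that many blocks guarantees every block is scheduled from the start, so each block's polling loop can actually observe progress from every other block.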