CUDA scheduling issue or kernel launch bug?

My program keeps a work deque for each block of the kernel launch. Each block loops, popping work from its own deque, processing it, and pushing dynamically generated work back. An array of deque flags is maintained to indicate which deques are active, i.e., still have work. When a deque is empty, the kernel enters another loop that tries to steal work from other blocks' deques. The stop condition is reached when no deque is active any more.
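In outline, the per-block control flow is the following (a condensed sketch with placeholder helpers, not the real code, which follows below):

// Condensed sketch of the scheme; the helpers are trivial placeholders so the
// sketch compiles. The real popBottom/popTop/pushBottom logic is further down.
__device__ bool tryPopOwnDeque()     { return false; }  // placeholder "popBottom"
__device__ bool tryStealFromOther()  { return false; }  // placeholder "popTop"
__device__ bool anyDequeActive()     { return false; }  // placeholder flag scan

__global__ void workLoopSketch()
{
    while (true) {
        bool gotWork = tryPopOwnDeque()      // pop from this block's deque first
                    || tryStealFromOther();  // otherwise try to steal a batch
        if (!gotWork) {
            if (!anyDequeActive()) return;   // stop condition: no deque active
            continue;                        // keep polling the active deques
        }
        // ... process the batch and push dynamically generated work ...
    }
}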
For the test, I set up every deque to start with one work item. My problem is that some blocks seem never to execute at all; because they never run, they stay flagged active and my program enters an infinite loop. Let's look at the code, starting with the launch wrapper and the test:
#define NDEQUES 256
#define DEQUE_SIZE 20000
void workStealingWrap(bool *abDequeFlags , int *auiDequesBottoms , unsigned int *auiDequesAges , int2 *aDeques ,
int2 *aOutput , unsigned int *puiOutputCounter)
{
workKernel<<<NDEQUES , WORK_STEALING_THREADS>>>(abDequeFlags , auiDequesBottoms , auiDequesAges , aDeques , aOutput ,
puiOutputCounter);
CUT_CHECK_ERROR("workKernel");
}
//----------------------------------------------------------------------------------------------------------
// This entry point is for work stealing testing.
//----------------------------------------------------------------------------------------------------------
int main(int argc, char* argv[])
{
//Test 0: All deques start with 1 work item.
bool h_abDequeFlags[NDEQUES];
int h_aiDequesBottoms[NDEQUES];
unsigned int h_auiDequesAges[NDEQUES];
int2 *h_aDeques = (int2*) malloc(sizeof(int2)*NDEQUES*DEQUE_SIZE);
unsigned int h_uiOutputCounter;
int2 *h_aOutput = (int2*) malloc(sizeof(int2)*NDEQUES*DEQUE_SIZE);
for(int i = 0 ; i < NDEQUES ; ++i)
{
h_abDequeFlags[i] = true;
h_aiDequesBottoms[i] = 1;
h_auiDequesAges[i] = 0;
int2 work; work.x = i ; work.y = i;
h_aDeques[DEQUE_SIZE*i] = work;
}
bool *d_abDequeFlags;
int *d_auiDequesBottoms;
unsigned int *d_auiDequesAges;
int2 *d_aDeques;
GPUMALLOC((void**)&d_abDequeFlags , sizeof(bool)*NDEQUES);
GPUMALLOC((void**)&d_auiDequesBottoms , sizeof(int)*NDEQUES);
GPUMALLOC((void**)&d_auiDequesAges , sizeof(unsigned int)*NDEQUES);
GPUMALLOC((void**)&d_aDeques , sizeof(int2)*NDEQUES*DEQUE_SIZE);
TOGPU(d_abDequeFlags , h_abDequeFlags , sizeof(bool)*NDEQUES);
TOGPU(d_auiDequesBottoms , h_aiDequesBottoms , sizeof(int)*NDEQUES);
TOGPU(d_auiDequesAges , h_auiDequesAges , sizeof(unsigned int)*NDEQUES);
TOGPU(d_aDeques , h_aDeques , sizeof(int2)*NDEQUES*DEQUE_SIZE);
int2 *d_aOutput;
unsigned int *d_puiOutputCounter;
GPUMALLOC((void**)&d_aOutput , sizeof(int2)*NDEQUES*DEQUE_SIZE);
GPUMALLOC((void**)&d_puiOutputCounter , sizeof(unsigned int));
GPUMEMSET(d_aOutput , -1 , sizeof(int2)*NDEQUES*DEQUE_SIZE);
GPUMEMSET(d_puiOutputCounter , 0 , sizeof(unsigned int));
workStealingWrap(d_abDequeFlags , d_auiDequesBottoms , d_auiDequesAges , d_aDeques , d_aOutput , d_puiOutputCounter);
FROMGPU(h_aOutput , d_aOutput , sizeof(int2)*NDEQUES*DEQUE_SIZE);
FROMGPU(&h_uiOutputCounter , d_puiOutputCounter , sizeof(unsigned int));
assert(h_uiOutputCounter == NDEQUES);
for(int i = 0 ; i < NDEQUES*DEQUE_SIZE ; ++i)
{
int2 work = h_aOutput[i];
if(i < NDEQUES)
{
assert(work.x >= 1 && work.x < NDEQUES*5 && work.y >= 1 && work.y < NDEQUES*5);
}
else
{
assert(work.x == -1 && work.y == -1);
}
}
GPUFREE(d_abDequeFlags);
GPUFREE(d_auiDequesBottoms);
GPUFREE(d_auiDequesAges);
GPUFREE(d_aDeques);
GPUFREE(d_aOutput);
GPUFREE(d_puiOutputCounter);
safeFree(h_aDeques);
safeFree(h_aOutput);
}
I debugged this code using NSight and verified that only the first 8 blocks are executing. Here are the kernel and the auxiliary pop and push functions:
bool __inline__ __device__ pushBottom(int *aiDequeBottoms , const int &iBid , const unsigned int &uiSize ,
unsigned int &uiPushStartIdx)
{
int iOldBot = aiDequeBottoms[iBid];
uiPushStartIdx = iOldBot;
iOldBot += uiSize;
if(iOldBot < DEQUE_SIZE)
{
aiDequeBottoms[iBid] = iOldBot;
return true;
}
else
{
return false;
}
}
bool __inline__ __device__ popTop(int *aiDequesBottoms , unsigned int *auiDequesAges , const int &iBid ,
int2 &popStartIdxAndSize)
{
int index;
unsigned int oldAge = auiDequesAges[iBid];
int localBot = aiDequesBottoms[iBid];
index = oldAge >> WORK_STEALING_TAG_NBITS;
if(localBot < index + 2*WORK_STEALING_BATCH_SIZE)
{
return false;
}
int localTag = oldAge & WORK_STEALING_TAG_MASK;
int size = min(WORK_STEALING_BATCH_SIZE , localBot - index);
unsigned int newAge = (index+size << WORK_STEALING_TAG_NBITS)| localTag;
if(oldAge == atomicCAS(&auiDequesAges[iBid] , oldAge , newAge))
{
popStartIdxAndSize.x = index;
popStartIdxAndSize.y = size;
return true;
}
else
{
return false;
}
}
bool __inline__ __device__ popBottom(int *aiDequesBottoms , unsigned int *auiDequesAges , const int &iBid ,
int2 &popStartIdxAndSize)
{
int localBot = aiDequesBottoms[iBid];
if(localBot == 0)
{
return false;
}
int index = localBot;
localBot = localBot - WORK_STEALING_BATCH_SIZE;
aiDequesBottoms[iBid] = localBot;
unsigned int oldAge = auiDequesAges[iBid];
int oldAgeTop = int(oldAge >> WORK_STEALING_TAG_NBITS);
if(localBot > oldAgeTop)
{
popStartIdxAndSize.y = WORK_STEALING_BATCH_SIZE;
popStartIdxAndSize.x = index - WORK_STEALING_BATCH_SIZE;
return true;
}
aiDequesBottoms[iBid] = 0;
unsigned int newAge = ((oldAge & WORK_STEALING_TAG_MASK) + 1) % (WORK_STEALING_TAG_MASK + 1);
if(index > oldAgeTop)
{
if(oldAge == atomicCAS(&auiDequesAges[iBid] , oldAge , newAge))
{
popStartIdxAndSize.y = index - oldAgeTop;
popStartIdxAndSize.x = index - popStartIdxAndSize.y;
return true;
}
}
auiDequesAges[iBid] = newAge;
return false;
}
//----------------------------------------------------------------------------------------------------------------------------
// Function to pop work from deques. Each block tries to pop from its own deque. If no work is available, it tries to
// steal from other deques.
//----------------------------------------------------------------------------------------------------------------------------
template <typename Work>
bool __inline__ __device__ popWork(bool *abDequeFlags , int *aiDequesBottoms , unsigned int *auiDequesAges ,
const Work *aDeques , const int &iTid , const int &iBid , unsigned int &uiPopDequeIdx , int2 &popStartIdxAndSize ,
int &iLocalDequeCounter , bool &bPopFlag , unsigned int *uiActiveDeques , unsigned int &uiActiveDequesIdx , Work &work)
{
if(iTid == 0)
{ //Try to pop from block deque
iLocalDequeCounter = 0;
bPopFlag = popBottom(aiDequesBottoms , auiDequesAges , iBid , popStartIdxAndSize);
if(bPopFlag)
{
uiPopDequeIdx = iBid;
}
else
{
abDequeFlags[iBid] = false;
}
}
__syncthreads();
while(!bPopFlag)
{ //No more work, try to steal some!
if(iTid == 0)
{
uiActiveDequesIdx = 0;
}
__syncthreads();
if(iTid < NDEQUES)
{
if(abDequeFlags[iTid] == true) //assuming blockDim.x >= NDEQUES, so every deque is checked
{ //Set this deque for a work stealing attempt.
unsigned int uiIdx = atomicAdd(&uiActiveDequesIdx,1);
uiActiveDeques[uiIdx] = iTid;
}
}
__syncthreads();
if(iTid == 0)
{ //Try to steal until it succeeds or there are no deques left to search
bPopFlag = false;
for(uiPopDequeIdx = 0 ; uiPopDequeIdx < uiActiveDequesIdx && bPopFlag == false ; ++uiPopDequeIdx)
{
bPopFlag = popTop(aiDequesBottoms , auiDequesAges , uiPopDequeIdx , popStartIdxAndSize);
}
}
__syncthreads();
if(uiActiveDequesIdx == 0)
{ //No more work to steal. End.
return false;
}
}
//Get popped data
if(iTid < popStartIdxAndSize.y) //assuming number of threads >= WORK_SIZE
{
work = aDeques[uiPopDequeIdx*DEQUE_SIZE + popStartIdxAndSize.x + iTid];
}
return true;
}
//----------------------------------------------------------------------------------------------------------------------------
// Function to push work onto deques. To achieve better coalesced global memory accesses, the input data is assumed to
// be tightly packed in shared mem.
//----------------------------------------------------------------------------------------------------------------------------
template <typename Work>
bool __inline__ __device__ pushWork(int *aiDequesBottoms , Work *aDeques , const int &iTid , const int &iBid ,
const unsigned int &uiDequeOutputCounter , Work *aOutputLocalWork)
{
//Transfer to global mem.
unsigned int uiWorkLeft = uiDequeOutputCounter;
unsigned int uiThreadOffset = iTid;
while(uiWorkLeft > 0)
{
unsigned int uiWorkTransfered = min(WORK_STEALING_BATCH_SIZE , uiWorkLeft);
unsigned int uiPushStartIdx;
bool bPushFlag;
if(iTid == 0)
{
bPushFlag = pushBottom(aiDequesBottoms , iBid , uiWorkTransfered , uiPushStartIdx);
}
__syncthreads();
if(!bPushFlag)
{
return false;
}
if(iTid < uiWorkTransfered)
{
aDeques[DEQUE_SIZE*iBid + uiPushStartIdx + uiThreadOffset] = aOutputLocalWork[uiThreadOffset];
}
uiThreadOffset += WORK_STEALING_BATCH_SIZE;
uiWorkLeft -= uiWorkTransfered;
}
return true;
}
void __global__ workKernel(bool *abDequeFlags , int *aiDequesBottoms , unsigned int *auiDequesAges , int2 *aDeques ,
int2 *aOutput , unsigned int *puiOutputCounter)
{
int iTid = threadIdx.x;
int iBid = blockIdx.x;
__shared__ int2 aOutputLocalWork[DEQUE_SHARED_SIZE];
__shared__ unsigned int uiPopDequeIdx;
__shared__ int2 popStartIdxAndSize;
__shared__ int iLocalDequeCounter;
__shared__ bool bPopFlag;
__shared__ unsigned int uiActiveDeques[NDEQUES]; //Contains indices for deques with useful work that can be stolen
__shared__ unsigned int uiActiveDequesIdx;
__shared__ unsigned int uiLastOutputCounter;
int2 work;
int iRun = 0;
while(true) //Work loop continues until the block can neither pop from its bottom nor steal work from other deques
{
if(!popWork<int2>(abDequeFlags , aiDequesBottoms , auiDequesAges , aDeques , iTid , iBid , uiPopDequeIdx ,
popStartIdxAndSize , iLocalDequeCounter , bPopFlag , uiActiveDeques , uiActiveDequesIdx , work))
{ //No more work
return;
}
//Useful work comes here. For now, just some dummy code for testing.
if(iRun < 5)
{ //Just 5 iterations that generate more work
if(iTid < popStartIdxAndSize.y)
{
unsigned int uiNewWorkCounter = 1;
int iDequeOutputCounter = atomicAdd(&iLocalDequeCounter , uiNewWorkCounter);
work.x++; work.y++;
aOutputLocalWork[iDequeOutputCounter] = work;
__syncthreads();
if(iTid == 0)
{
uiLastOutputCounter = atomicAdd(puiOutputCounter , iLocalDequeCounter);
}
__syncthreads();
if(iTid < iLocalDequeCounter) //assuming iLocalDequeCounter <= blockDim.x
{
aOutput[uiLastOutputCounter + iTid] = aOutputLocalWork[iTid];
}
}
}
//Push back to global mem
if(!pushWork<int2>(aiDequesBottoms , aDeques , iTid , iBid , iLocalDequeCounter , aOutputLocalWork))
{ //overflow
return;
}
++iRun;
}
}
That's all the code and the test. I'm wondering whether this is a scheduling issue, with the popWork polling consuming all the resources, or just a bug of mine. Any help would be appreciated.
Can the GPU actually run that many blocks at once? You can schedule 1000 blocks on any GPU, but that doesn't mean they all execute concurrently. The GPU runs as many as it can, and later blocks execute only once earlier ones terminate. – CygnusX1
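To illustrate that point, here is a minimal sketch of my own (hypothetical names, not code from the question): if the grid launches more blocks than can be resident at once, a resident block that spin-waits on a block that has not been scheduled yet never makes progress, and the kernel hangs.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical repro: every block except the last spins on a flag that only
// the last block sets. If the last block cannot become resident until an
// earlier block exits, this kernel deadlocks, since resident blocks are never
// swapped out to make room.
__global__ void spinOnLastBlock(volatile int *flag)
{
    if (blockIdx.x == gridDim.x - 1) {
        *flag = 1;                         // releaser
    } else {
        while (*flag == 0) { /* spin */ }  // may never observe the store
    }
}

int main()
{
    int *d_flag;
    cudaMalloc(&d_flag, sizeof(int));
    cudaMemset(d_flag, 0, sizeof(int));
    spinOnLastBlock<<<4096, 256>>>(d_flag);     // heavily oversubscribed grid
    cudaError_t err = cudaDeviceSynchronize();  // likely hangs here
    printf("%s\n", cudaGetErrorString(err));
    cudaFree(d_flag);
    return 0;
}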
I know the blocks don't all run concurrently. But I expected the scheduler to be at least smart enough not to let kernel blocks starve this way. Is there anything stating that later blocks run only after earlier blocks finish? If that's true, I'll reduce the number of blocks issued. I don't remember reading how warp scheduling is done in CUDA. –
Blocks are never preempted. If there aren't enough resources to run them all at once, new blocks can execute only when old blocks terminate. I believe the Programming Guide mentions this, but I also ran my own tests (on my Fermi GPU, though) and confirmed it. – CygnusX1
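For persistent-kernel schemes like this, where every block must be co-resident for the inter-block polling to make progress, one option (assuming a CUDA version with the occupancy API, 6.5 or later, so newer than this thread) is to cap the grid at the number of blocks that can actually be resident at once. The stub kernel and the block size of 64 below are placeholders for workKernel and WORK_STEALING_THREADS:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void workKernelStub() { }  // stand-in for the real workKernel

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // Blocks of the given size that can be resident on one SM at once.
    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &blocksPerSM, workKernelStub, /*blockSize=*/64, /*dynSmemSize=*/0);

    int maxResidentBlocks = blocksPerSM * prop.multiProcessorCount;
    printf("launch at most %d blocks for co-residency\n", maxResidentBlocks);
    return 0;
}

Launching at most that many blocks guarantees every block is scheduled from the start, so each block's polling loop can actually observe progress from every other block.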