다음 프로그램은 NVIDIA GeForce 310 GPU, CUDA v4.2에서 Gauss-Jordan 제거를 사용하여 GF (2^8)의 행렬을 역으로 변환합니다 (XOR과 유사하고 곱셈은 테이블 루프 업 방식을 사용함).왜 "-g -G"없이 "nvcc"를 사용하여 매트릭스 역전승을 수행하면 오류가 발생합니까?
typedef unsigned char BYTE;
#define BLOCK_SIZE 16
// addition
__inline __device__ BYTE
add_GF(BYTE a,BYTE b)
{
return a^b;
}
// subtraction
__inline __device__ BYTE
sub_GF(BYTE a,BYTE b)
{
return a^b;
}
// multiplication
__inline __device__ BYTE
mul_GF(BYTE a,BYTE b,BYTE *d_numOf, BYTE *d_indexOf)
{
if(a==0 || b == 0) return 0;
return d_numOf[(d_indexOf[a] + d_indexOf[b])%255];
}
// divison
__inline __device__ BYTE
div_GF(BYTE a,BYTE b, BYTE *d_numOf,BYTE *d_indexOf, BYTE *d_inv)
{
if(b == 0) return 0;
return mul_GF(a,d_inv[b],d_numOf,d_indexOf);
}
// swap two line
__global__ void
LineSwap(BYTE *M, int *n,int *a, int *b)
{
BYTE temp;
const unsigned int tid = blockIdx.x*blockDim.x+threadIdx.x;
temp = M[(*a)*(*n+*n)+tid];
M[*a*(*n+*n)+tid] = M[(*b)*(*n+*n)+tid];
M[*b*(*n+*n)+tid] = temp;
}
// multiply a line by a factor
__global__ void
LineMul(BYTE *M, int *n,int *a, BYTE *d_numOf, BYTE *d_indexOf, BYTE *d_inv)
{
BYTE k = div_GF(128, M[*a*(*n+*n)+*a], d_numOf, d_indexOf, d_inv);
const unsigned int tid = blockIdx.x*blockDim.x+threadIdx.x;
M[*a*(*n+*n)+tid] = mul_GF(k , M[*a*(*n+*n)+tid], d_numOf, d_indexOf);
}
// multiply a line by a factor then subtract another line
__global__ void
LineMulSub(BYTE *M, int *n,int *a, BYTE *k, int *b, BYTE *d_numOf, BYTE *d_indexOf)
{
const unsigned int tid = blockIdx.x*blockDim.x+threadIdx.x;
M[*b*(*n+*n)+tid] = sub_GF(M[*b*(*n+*n)+tid] , mul_GF(*k ,M[*a*(*n+*n)+tid], d_numOf, d_indexOf));
}
// compute the inverse matrix
bool InvMatGF(BYTE* h_A, BYTE* &h_Inv, int n)
{
//h_M[n*(n+n)] is a augmented matrix.
BYTE *h_M = new BYTE [n*(n+n)];
for(int i=0; i < n*(n+n); i++)
{
h_M[i] = 0;
}
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
{
h_M[i*(n+n)+j] = h_A[i*n+j];
h_M[i*(n+n)+(n+j)] = 0;
}
}
for(int i=0; i<n; i++)
{
h_M[i*(n+n)+(n+i)] = 128;
}
BYTE *d_A = NULL;
BYTE *d_M = NULL;
int *d_n = NULL;
int *d_i = NULL;
int *d_j = NULL;
BYTE *d_numOf = NULL;
BYTE *d_indexOf = NULL;
BYTE *d_inv = NULL;
int size_A = n*n*sizeof(BYTE);
int size_M = n*(n+n)*sizeof(BYTE);
int size_Lookup_Table = TABLE_SIZE*sizeof(BYTE);
int size_INTEGER = sizeof(int);
checkCudaErrors(cudaMalloc((void**) &d_A, size_A));
checkCudaErrors(cudaMalloc((void**) &d_M, size_M));
checkCudaErrors(cudaMalloc((void**) &d_n, size_INTEGER));
checkCudaErrors(cudaMalloc((void**) &d_i, size_INTEGER));
checkCudaErrors(cudaMalloc((void**) &d_j, size_INTEGER));
checkCudaErrors(cudaMalloc((void**) &d_numOf, size_Lookup_Table));
checkCudaErrors(cudaMalloc((void**) &d_indexOf, size_Lookup_Table));
checkCudaErrors(cudaMalloc((void**) &d_inv, size_Lookup_Table));
checkCudaErrors(cudaMemcpy(d_A,h_A,size_A,cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_n,&n,size_INTEGER,cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_numOf,&numOf,size_Lookup_Table,cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_indexOf,&indexOf,size_Lookup_Table,cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_inv,&inv,size_Lookup_Table,cudaMemcpyHostToDevice));
dim3 blockDim(BLOCK_SIZE,BLOCK_SIZE,1);
dim3 gridDim(((n+n)+blockDim.x-1)/blockDim.x,1,1);
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for(int i = 0; i < n; i++)
{
if(h_M[i*(n+n)+i] != 0)
{
checkCudaErrors(cudaMemcpy(d_i, &i, sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_M,h_M,size_M,cudaMemcpyHostToDevice));
LineMul<<<gridDim,blockDim,0>>>(d_M,d_n,d_i,d_numOf,d_indexOf,d_inv); // on GPU
checkCudaErrors(cudaMemcpy(h_M,d_M,size_M,cudaMemcpyDeviceToHost));
for(int j = 0; j < n; j++)
{
if(j != i)
{
BYTE *d_MElem = 0;
checkCudaErrors(cudaMalloc((void**) &d_MElem,sizeof(BYTE)));
checkCudaErrors(cudaMemcpy(d_j, &j, sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_MElem,&h_M[j*(n+n)+i],sizeof(BYTE),cudaMemcpyHostToDevice));
LineMulSub<<<gridDim,blockDim,0>>>(d_M,d_n,d_i,d_MElem,d_j,d_numOf,d_indexOf);// on GPU
checkCudaErrors(cudaMemcpy(h_M,d_M,size_M,cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(d_MElem));
}
}
}
else
{
for(int j = i+1; j < n; j++)
{
if(h_M[j*(n+n)+i] != 0)
{
checkCudaErrors(cudaMemcpy(d_i, &i, sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_j, &j, sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_M,h_M,size_M,cudaMemcpyHostToDevice));
LineSwap<<<gridDim,blockDim,0>>>(d_M,d_n,d_i,d_j);//on GPU
checkCudaErrors(cudaMemcpy(h_M,d_M,size_M,cudaMemcpyDeviceToHost));
i--;
break;
}
if(j == n-1)
{
printf("(1)No inverse matrix!\n");
return false;
}
}
}
}
for (int i = 0; i < n; i++)
{
if(h_M[i*(n+n)+i] != 128)
{
printf("(2)No inverse matrix: not full rank!\n");
return false;
}
}
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
h_Inv[i*n+j] = h_M[i*(n+n)+n+j];
}
}
cudaEventRecord(stop, 0);// united on "ms"
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
float throughputInverse = (float) n/(elapsedTime*0.001) *0.000001;
printf("%d\t%f\t%f\t",n,elapsedTime*0.001,throughputInverse);
checkCudaErrors(cudaFree(d_i));
checkCudaErrors(cudaFree(d_j));
checkCudaErrors(cudaFree(d_A));
checkCudaErrors(cudaFree(d_M));
checkCudaErrors(cudaFree(d_n));
checkCudaErrors(cudaFree(d_numOf));
checkCudaErrors(cudaFree(d_indexOf));
checkCudaErrors(cudaFree(d_inv));
delete[] h_M;
return true;
}
하지만
nvcc -g -G INVonGPUv1.1.cu -o INVonGPUv1.1 -I../../NVIDIA_GPU_Computing_SDK/C/common/inc -I../../NVIDIA_GPU_Computing_SDK/shared/inc -arch=compute_12
하여 컴파일 할 때 질문은은, 정상적인 출력은 바로 다음과 같다.
################### Inversing start ####################
#n timeInverse(s) throughputInverse(MB/s) errorRate(0~1) isInverse
#=================== INVERSE on GPU v1.0 ====================
128 1.565791 0.000082 1
256 14.190008 0.000018 1
512 154.687016 0.000003 1
################ Inversing stop ####################
하지만
nvcc INVonGPUv1.1.cu -o INVonGPUv1.1 -I../../NVIDIA_GPU_Computing_SDK/C/common/inc -I../../NVIDIA_GPU_Computing_SDK/shared/inc -arch=compute_12
에 "-g -G"와 complie을 제거 할 때, 나는 역 matrix.Why과를 얻을 수 있는지의 작동 원리 "-g -G" ?
################### Inversing start ####################
#n timeInverse(s) throughputInverse(MB/s) errorRate(0~1) isInverse
#=================== INVERSE on GPU v1.0 ====================
(1)No inverse matrix!
0
(1)No inverse matrix!
0
(1)No inverse matrix!
0
################ Inversing stop ####################
미리 감사드립니다.
커널 (장치) 코드를 게시하지 않았 음을 지적하는 것이 적절합니다. –
첫 번째 단계는 cuda-memcheck를 사용하여 코드를 실행하는 것입니다. 커널 호출 중 하나 (바깥에 보이지는 않음)에서 메모리 접근 범위를 벗어난 것이 거의 확실하며 최적화로 컴파일 할 때 실패하고 디버깅을 위해 컴파일 할 때 실패하면 – talonmies