OpenCL traversal kernel - 추가 최적화

현재, 다음과 같은 탐색을위한 OpenCL 커널이 있습니다. 누군가가이 아주 큰 커널의 최적화에 관해 어떤 점을 가지고 있다면 나는 기쁠 것입니다.OpenCL traversal kernel - 추가 최적화

저는이 코드를 SAH BVH로 실행하고 있습니다. 물론 그의 논문 (GPU에서의 Ray Traversal의 효율성 이해)에서 Timo Aila와 비슷한 성능을 얻고 싶습니다. 물론 코드 SplitBVH를 사용합니다 (SAH BVH 대신에 사용하는 것이 좋습니다. 제 의견으로는 빌드 시간이 매우 느립니다). 그러나 나는 BVH가 아닌 traversal에 대해서 묻고 있습니다. (SplitBVH가 SAH BVH보다 많은 이점을주지는 않습니다.

우선, 내가 지금까지 가지고있는 것이있다 (표준 while-while 탐색 커널).

__constant sampler_t sampler = CLK_FILTER_NEAREST; 

// Inline definition of horizontal max 
inline float max4(float a, float b, float c, float d) 
{ 
    return max(max(max(a, b), c), d); 
} 

// Inline definition of horizontal min 
inline float min4(float a, float b, float c, float d) 
{ 
    return min(min(min(a, b), c), d); 
} 

// Traversal kernel 
__kernel void traverse(__read_only image2d_t nodes, 
         __global const float4* triangles, 
         __global const float4* rays, 
         __global float4* result, 
         const int num, 
         const int w, 
         const int h) 
{ 
    // Ray index 
    int idx = get_global_id(0); 

    if(idx < num) 
    { 
     // Stack 
     int todo[32]; 
     int todoOffset = 0; 

     // Current node 
     int nodeNum = 0; 

     float tmin = 0.0f; 
     float depth = 2e30f; 

     // Fetch ray origin, direction and compute invdirection 
     float4 origin = rays[2 * idx + 0]; 
     float4 direction = rays[2 * idx + 1]; 
     float4 invdir = native_recip(direction); 

     float4 temp = (float4)(0.0f, 0.0f, 0.0f, 1.0f); 

     // Traversal loop 
     while(true) 
     { 
      // Fetch node information 
      int2 nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2)/w); 
      int4 specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0)); 

      // While node isn't leaf 
      while(specs.z == 0) 
      { 
       // Fetch child bounding boxes 
       float4 n0xy = read_imagef(nodes, sampler, nodeCoord); 
       float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0)); 
       float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0)); 

       // Test ray against child bounding boxes 
       float oodx = origin.x * invdir.x; 
       float oody = origin.y * invdir.y; 
       float oodz = origin.z * invdir.z; 
       float c0lox = n0xy.x * invdir.x - oodx; 
       float c0hix = n0xy.y * invdir.x - oodx; 
       float c0loy = n0xy.z * invdir.y - oody; 
       float c0hiy = n0xy.w * invdir.y - oody; 
       float c0loz = nz.x * invdir.z - oodz; 
       float c0hiz = nz.y * invdir.z - oodz; 
       float c1loz = nz.z * invdir.z - oodz; 
       float c1hiz = nz.w * invdir.z - oodz; 
       float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), tmin); 
       float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), depth); 
       float c1lox = n1xy.x * invdir.x - oodx; 
       float c1hix = n1xy.y * invdir.x - oodx; 
       float c1loy = n1xy.z * invdir.y - oody; 
       float c1hiy = n1xy.w * invdir.y - oody; 
       float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), tmin); 
       float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), depth); 

       bool traverseChild0 = (c0max >= c0min); 
       bool traverseChild1 = (c1max >= c1min); 

       nodeNum = specs.x; 
       int nodeAbove = specs.y; 

       // We hit just one out of 2 childs 
       if(traverseChild0 != traverseChild1) 
       { 
        if(traverseChild1) 
        { 
         nodeNum = nodeAbove; 
        } 
       } 
       // We hit either both or none 
       else 
       { 
        // If we hit none, pop node from stack (or exit traversal, if stack is empty) 
        if (!traverseChild0) 
        { 
         if(todoOffset == 0) 
         { 
          break; 
         } 
         nodeNum = todo[--todoOffset]; 
        } 
        // If we hit both 
        else 
        { 
         // Sort them (so nearest goes 1st, further 2nd) 
         if(c1min < c0min) 
         { 
          unsigned int tmp = nodeNum; 
          nodeNum = nodeAbove; 
          nodeAbove = tmp; 
         } 

         // Push further on stack 
         todo[todoOffset++] = nodeAbove; 
        } 
       } 

       // Fetch next node information 
       nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2)/w); 
       specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0)); 
      } 

      // If node is leaf & has some primitives 
      if(specs.z > 0) 
      { 
       // Loop through primitives & perform intersection with them (Woop triangles) 
       for(int i = specs.x; i < specs.y; i++) 
       { 
        // Fetch first point from global memory 
        float4 v0 = triangles[i * 4 + 0]; 

        float o_z = v0.w - origin.x * v0.x - origin.y * v0.y - origin.z * v0.z; 
        float i_z = 1.0f/(direction.x * v0.x + direction.y * v0.y + direction.z * v0.z); 
        float t = o_z * i_z; 

        if(t > 0.0f && t < depth) 
        { 
         // Fetch second point from global memory 
         float4 v1 = triangles[i * 4 + 1]; 

         float o_x = v1.w + origin.x * v1.x + origin.y * v1.y + origin.z * v1.z; 
         float d_x = direction.x * v1.x + direction.y * v1.y + direction.z * v1.z; 
         float u = o_x + t * d_x; 

         if(u >= 0.0f && u <= 1.0f) 
         { 
          // Fetch third point from global memory 
          float4 v2 = triangles[i * 4 + 2]; 

          float o_y = v2.w + origin.x * v2.x + origin.y * v2.y + origin.z * v2.z; 
          float d_y = direction.x * v2.x + direction.y * v2.y + direction.z * v2.z; 
          float v = o_y + t * d_y; 

          if(v >= 0.0f && u + v <= 1.0f) 
          { 
           // We got successful hit, store the information 
           depth = t; 
           temp.x = u; 
           temp.y = v; 
           temp.z = t; 
           temp.w = as_float(i); 
          } 
         } 
        } 
       } 
      } 

      // Pop node from stack (if empty, finish traversal) 
      if(todoOffset == 0) 
      { 
       break; 
      } 

      nodeNum = todo[--todoOffset]; 
     } 

     // Store the ray traversal result in global memory 
     result[idx] = temp; 
    } 
}

첫 번째 질문은 어떻게 OpenCL에서 지속적으로 while-while and speculative while-while 커널을 작성할 수 있습니까?

광고 지속성, 내가 실제로 실제로 로컬 작업 크기와 동일한 글로벌 작업 크기로 커널을 시작한다는 사실을 알 수 있습니까?이 두 숫자는 GPU의 워프/파면 크기와 같아야합니까? 나는 CUDA와 영구 스레드 구현은 다음과 같습니다 것을 얻을 :

do 
    { 
     volatile int& jobIndexBase = nextJobArray[threadIndex.y]; 

     if(threadIndex.x == 0) 
     { 
       jobIndexBase = atomicAdd(&warpCounter, WARP_SIZE); 
     } 

     index = jobIndexBase + threadIndex.x; 

     if(index >= totalJobs) 
       return; 

     /* Perform work for task numbered 'index' */ 
    } 
    while(true);

어떻게 할 수 OpenCL을 모양에 해당하는 같은, 내가 하나가 후에해야한다고 알고, 내가 거기에 몇 가지 장벽을해야 할 것이다 알고 warpCounter에 원자 적으로 WARP_SIZE를 추가하는 점수.

광고 투기 탐색 - 글쎄,이 아이디어를 OpenCL에서 어떻게 구현해야할지 모르니까 힌트를 환영합니다. 나는 또한 장벽을 놓을 생각이 없다. (왜냐하면 시뮬레이션 된 __any를 주위에두면 운전자가 추락 할 것이기 때문이다).

여기에 작성하면 &을 읽어 주셔서 감사합니다. 힌트, 답변 등을 환영합니다!

출처

2013-04-10 ZarakiKenpachi

당신이 할 수있는 최적화 작업은 벡터 변수와 융합 된 곱하기 덧셈 함수를 사용하여 설정 한 계산 속도를 높이는 것입니다. 커널의 나머지 부분은 branchy이기 때문에 느리다. 신호 데이터를 가정 할 수 있으면 코드 분기를 줄여 실행 시간을 줄일 수 있습니다. float4 swizles (.xxyy 및 .x .y .z .w float 4 변수 다음에 있음)를 검사하지 않았으므로이를 확인하십시오.

  float4 n0xy = read_imagef(nodes, sampler, nodeCoord); 
      float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0)); 
      float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0)); 

      float4 oodf4 = -origin * invdir; 

      float4 c0xyf4 = fma(n0xy,invdir.xxyy,oodf4); 

      float4 c0zc1z = fma(nz,(float4)(invdir.z),oodf4); 

      float c0min = max4(min(c0xyf4.x, c0xyf4.y), min(c0xyf4.z, c0xyf4.w), min(c0zc1z.z, c0zc1z.w), tmin); 
      float c0max = min4(max(c0xyf4.x, c0xyf4.y), max(c0xyf4.z, c0xyf4.w), max(c0zc1z.z, c0zc1z.w), depth); 

      float4 c1xy = fma(n1xy,invdir.xxyy,oodf4); 

      float c1min = max4(min(c1xy.x, c1xy.y), min(c1xy.z, c1xy.w), min(c0zc1z.z, c0zc1z.w), tmin); 
      float c1max = min4(max(c1xy.x, c1xy.y), max(c1xy.z, c1xy.w), max(c0zc1z.z, c0zc1z.w), depth);

출처

2013-07-17 10:50:44

OpenCL traversal kernel - 추가 최적화

답변

관련 문제