OpenCV의 Mat 행렬과 SSE를 위한 16바이트 정렬

OpenCV의 Mat 처리에서 SSE/SSE2로 얻을 수 있는 성능 향상을 테스트하고 싶습니다. SSE의 성능 향상은 16바이트로 정렬된 데이터에서만 뚜렷하게 나타나므로, (1) Mat 행렬을 SSE 레지스터와 함께 사용하려면 무엇을 수정해야 합니까? (2) 제가 한 작업은 아래와 같은데, 이것이 올바른 방법입니까?

// SSE experiment: for each group of values gathered from flowxy, computes
// r = sqrt(x*x + y*y) and theta = taninverse(y * rcp(x)) in __m128 registers.
// The computed r and theta are never stored, and flowresult is never touched.
// NOTE(review): presumably flowxy is a 2-channel float Mat with interleaved
// x/y values (even offsets -> x, odd offsets -> y) — confirm against the caller.
void test(Mat flowxy, Mat flowresult) 
    { 
     __m128 x, y, xsquare, ysquare, ybyx, xRecip , sum, r, theta ;// SSE working registers 
     // Raw buffer viewed as floats; every element access below goes through this pointer.
     float *input = (float*)(flowxy.data); 
     // i counts rows; j advances by SSE_INCREMENT (defined outside this block).
     for(int i = 0; i < flowxy.rows; i++) 
      { 
       // The strict '<' stops the loop while j + SSE_INCREMENT is still below
       // flowxy.cols, so trailing columns are not visited.
       for(int j = 0; j + SSE_INCREMENT < flowxy.cols; j = j + SSE_INCREMENT) 
       { 

        // _mm_set_ps places its LAST argument in the lowest lane, hence the
        // descending offsets (j+6, j+4, j+2, j) for x and (j+7 ... j+1) for y.
        // NOTE(review): flowxy.step is multiplied by the column counter j and the
        // row counter i is added as the offset. cv::Mat::step is documented as a
        // per-row stride in bytes, so this looks like a wrong (and out-of-range)
        // index for a float* — verify; flowxy.ptr<float>(i) is the usual row access.
        x = _mm_set_ps(input[flowxy.step * (j+6) + i ], input[flowxy.step * (j+4) + i ], input[flowxy.step * (j+2) + i ], input[flowxy.step * (j) + i ]); 
        y = _mm_set_ps(input[flowxy.step * (j+7) + i ], input[flowxy.step * (j+5) + i ], input[flowxy.step * (j+3) + i ], input[flowxy.step * (j+1) + i ]); 
        // _mm_rcp_ps is an approximate reciprocal, so y/x below is reduced precision.
        xRecip = _mm_rcp_ps(x); 
        xsquare = _mm_mul_ps(x, x); 
        ysquare = _mm_mul_ps(y, y);    
        ybyx = _mm_mul_ps(xRecip , y); 
        sum = _mm_add_ps(xsquare, ysquare); 
        // r = sqrt(x^2 + y^2) for the four lanes.
        r = _mm_sqrt_ps(sum); 
        // taninverse is defined outside this block; it receives the four y/x ratios.
        theta = taninverse(ybyx); 
        // r and theta go out of use here without being written anywhere.
       } 


      } 

    }

여기(here)의 토론에 따라 _mm_set_ps 인자의 순서를 반대로 했습니다.

편집 1 :

/**
 * Converts a two-plane flow field into per-pixel magnitude and angle planes.
 *
 * flowxy is split into its x and y planes and, for every pixel,
 *   flowrtheta[0] = sqrt(x*x + y*y)
 *   flowrtheta[1] = atan(y / x)      (scalar path)
 *                 = taninverse(y * rcp(x))  (SSE path, four pixels at a time)
 * The SSE path uses _mm_rcp_ps, an approximate reciprocal, so its angles can
 * differ slightly from the scalar path. Columns that do not fill a complete
 * group of four are finished with the scalar formula.
 *
 * @param flowxy     Input flow field. NOTE(review): its planes are read through
 *                   ptr<float>, so CV_32FC2 is assumed — confirm at call sites.
 * @param flowrtheta Resized to two entries; each is (re)allocated as a
 *                   flowxy.rows x flowxy.cols CV_32FC1 matrix and overwritten.
 *                   If the split yields fewer than two planes (e.g. empty
 *                   input) the function returns without filling them.
 */
void CObjectDetection_TrackingDlg::flow_XY_RTHETA(Mat flowxy, vector<Mat> &flowrtheta) 
{
    const clock_t start = clock();

    flowrtheta.resize(2);
    // Mat::create(rows, cols, type) is a no-op when shape and type already
    // match, so it is called unconditionally. The previous
    // cvSize(rows, cols) form passed (width, height) swapped, which produced
    // transposed outputs and out-of-range row access for non-square input.
    flowrtheta[0].create(flowxy.rows, flowxy.cols, CV_32FC1);
    flowrtheta[1].create(flowxy.rows, flowxy.cols, CV_32FC1);

    vector<Mat> flowxy_S;
    split(flowxy, flowxy_S);
    // Both the x and the y plane are indexed below; bail out instead of
    // reading past the end of the vector when they are not both present.
    if (flowxy_S.size() < 2)
    {
        return;
    }
    // Debug dumps of the two planes (note: they run inside the timed region).
    printMatGrayDatainfloat(flowxy_S[0]);
    printMatGrayDatainfloat(flowxy_S[1]);

    // Only SSE-level intrinsics are used below, so CV_CPU_SSE is the matching check.
    const bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
    const int cols = flowxy.cols;

    for (int i = 0; i < flowxy.rows; i++)
    {
        const float *input_x = flowxy_S[0].ptr<float>(i);
        const float *input_y = flowxy_S[1].ptr<float>(i);
        float *output_r = flowrtheta[0].ptr<float>(i);
        float *output_t = flowrtheta[1].ptr<float>(i);

        int j = 0;
        if (useSIMD)
        {
            // '<=' so that a final complete group of four is also processed;
            // unaligned loads/stores because row starts are not guaranteed
            // to be 16-byte aligned.
            for (; j + 4 <= cols; j += 4)
            {
                const __m128 x = _mm_loadu_ps(&input_x[j]);
                const __m128 y = _mm_loadu_ps(&input_y[j]);
                const __m128 xRecip = _mm_rcp_ps(x);
                const __m128 sum = _mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
                const __m128 ybyx = _mm_mul_ps(xRecip, y);
                _mm_storeu_ps(&output_r[j], _mm_sqrt_ps(sum));
                _mm_storeu_ps(&output_t[j], taninverse(ybyx));
            }
        }

        // Scalar path: the whole row when SSE is unavailable, otherwise just
        // the 0-3 leftover columns that the vector loop could not cover.
        for (; j < cols; j++)
        {
            double x_sq = input_x[j] * input_x[j];
            double y_sq = input_y[j] * input_y[j];
            double y_by_x = input_y[j] / input_x[j];
            output_r[j] = sqrt(x_sq + y_sq);
            output_t[j] = atan(y_by_x);
        }
    }

    // The split planes are released when flowxy_S goes out of scope.
    const clock_t finish = clock() - start;
    // Elapsed seconds; not reported anywhere — presumably inspected in a debugger.
    const double interval = finish / static_cast<double>(CLOCKS_PER_SEC);
    (void)interval;
    //printMatGrayDatainfloat(flowrtheta[0]);
    //printMatGrayDatainfloat(flowrtheta[1]);
}

출처

2014-06-06 batuman

16바이트 정렬이 필요한 로드나 스토어를 수행하지 않으므로, 이 코드에서는 정렬 문제가 사실상 의미가 없습니다. 그러나 '_mm_set_ps'를 사용하는 것은 매우 비효율적입니다. '_mm_loadu_ps'로 연속된 비정렬 데이터를 로드한 다음, 예를 들어 '_mm_shuffle_ps'를 사용하여 요소를 필요한 순서로 셔플해야 합니다. –

좀 더 살펴본 뒤 다시 말씀드리겠습니다. 제 r과 theta는 2채널 Mat flowresult에 다시 써야 하는 결과입니다. 가장 좋은 방법은 무엇일까요? 감사합니다. – batuman

'_mm_storeu_ps'를 사용하여 결과를 메모리에 다시 쓸 수 있습니다. –

컴파일러가 어차피 이 코드를 벡터화하고 있어서, 명시적 벡터화로는 얻는 것이 없을 수도 있습니다. 스칼라 분기에 대해 생성된 코드를 살펴보고 SSE 명령이 포함되어 있는지 확인하십시오. 또한 오래된 CPU에서는 정렬되지 않은 로드/스토어의 비용이 매우 큽니다 (예를 들어 Core i7이라면 괜찮을 것입니다).

출처

2014-06-09 10:08:36

OpenCV의 Mat 행렬과 SSE를 위한 16바이트 정렬

답변

관련 문제