ARM에서 최적화된 FAST 계산

ARM NEON 라이브러리를 사용하여 ARM Cortex-A8에서 ORB 특징을 약 5ms 만에 계산했다는 논문을 구현하고 싶습니다. 하지만 FAST 특징 검출 단계에서 이미 어려움을 겪고 있습니다. 제가 구현하려는 논문은 here에서 볼 수 있습니다. 먼저 Bright 제약 조건과 Dark 제약 조건을 제대로 이해했는지 확신이 없습니다. 제가 이해하기로는 중심 픽셀 주위에 중심보다 어두운 픽셀이 최소 9개 있거나, 중심보다 밝은 픽셀이 최소 9개 있는지 확인해야 합니다. 그래서 두 조건을 모두 검사합니다. 그런데 제 구현은 마지막 시프트 연산을 빼고도 OpenCV가 전체 과정을 처리하는 평균 시간보다 평균 3배 더 오래 걸리는 문제가 있습니다. 아래는 지금까지 작성한 코드입니다. 어떤 최적화를 할 수 있을지 알려 주시면 감사하겠습니다.

 // Reference run: detect keypoints with OpenCV and time the call.
     Clock::time_point t0 = Clock::now();
     detectors[y]->detect(img, ocv_kps);
     Clock::time_point t1 = Clock::now();

     // Keypoints of the hand-written detector. Nothing is appended yet because
     // the final contiguous-arc test below is still a placeholder.
     vector<Point2f> my_kps;
     // Intensity threshold of the FAST segment test.
     const uchar th = 8;

     // Number of pending candidates stored in ib_arr / id_arr.
     int b_cnt = 0;
     int d_cnt = 0;
     // Ring masks of up to four candidate corners, to be tested in parallel.
     uint32_t id_arr[4] = {};
     uint32_t ib_arr[4] = {};

     Clock::time_point t01 = Clock::now();
     // NOTE(review): img.ptr<uchar> is used throughout, so this assumes img is a
     // single-channel 8-bit image — confirm at the call site.
     for (int i = 3; i < img.rows - 3; i++) {
      // Pointers to the seven rows touched by the radius-3 ring around row i.
      const uchar* Mt3 = img.ptr<uchar>(i - 3);
      const uchar* Mt2 = img.ptr<uchar>(i - 2);
      const uchar* Mt1 = img.ptr<uchar>(i - 1);
      const uchar* Mc = img.ptr<uchar>(i);
      const uchar* Mb1 = img.ptr<uchar>(i + 1);
      const uchar* Mb2 = img.ptr<uchar>(i + 2);
      const uchar* Mb3 = img.ptr<uchar>(i + 3);
      for (int j = 3; j < img.cols - 3; j++) {
       // Column offsets are kept as int expressions (j + 3, j - 3, ...).
       // Storing them in uchar wrapped for any image wider than 255 columns
       // and made the loads below read the wrong pixels.

       // Centre pixel and the four compass points of the ring.
       const uchar c = Mc[j];
       const uchar l = Mc[j - 3];
       const uchar r = Mc[j + 3];
       const uchar t = Mt3[j];
       const uchar b = Mb3[j];

       // Thresholds are int on purpose: as uchar, c + th wrapped for c > 247
       // and c - th wrapped for c < 8, so almost every pixel passed the test.
       const int thb = c + th;
       const int thd = c - th;

       // Compass-point results for the bright and the dark constraint.
       const bool cbt = t > thb;
       const bool cbb = b > thb;
       const bool cbl = l > thb;
       const bool cbr = r > thb;
       const bool cdt = t < thd;
       const bool cdb = b < thd;
       const bool cdl = l < thd;
       const bool cdr = r < thd;

       // Pre-test: at least two neighbouring compass points must satisfy the
       // constraint, otherwise the pixel is not examined further.
       const bool bright_candidate = (cbl && cbt) || (cbt && cbr)
         || (cbr && cbb) || (cbb && cbl);
       const bool dark_candidate = (cdl && cdt) || (cdt && cdr)
         || (cdr && cdb) || (cdb && cdl);
       if (!bright_candidate && !dark_candidate) {
        continue;
       }

       // All 16 ring pixels, clockwise starting at the top. They are loaded
       // once and shared by both constraints. The index equals the bit
       // position in the masks built below.
       const uchar ring[16] = {
         t,           Mt3[j + 1], Mt2[j + 2], Mt1[j + 3],
         r,           Mb1[j + 3], Mb2[j + 2], Mb3[j + 1],
         b,           Mb3[j - 1], Mb2[j - 2], Mb1[j - 3],
         l,           Mt1[j - 3], Mt2[j - 2], Mt3[j - 1] };

       if (bright_candidate) {
        uint32_t mask = 0;
        for (int k = 0; k < 16; ++k) {
         mask |= static_cast<uint32_t>(ring[k] > thb) << k;
        }
        // Bits 16..23 repeat bits 0..7 so that an arc wrapping around the
        // start of the ring shows up as one contiguous run of bits.
        ib_arr[b_cnt] = mask | ((mask & 0xFFu) << 16);
        b_cnt++;
        // Four candidates collected: hand them to the NEON test.
        if (b_cnt == 4) {
         // vld1q_u32 loads exactly the four stored masks. The previous
         // vld4_u32 reads eight uint32 values and overran the array.
         const uint32x4_t IB = vld1q_u32(ib_arr);
         static_cast<void>(IB);
         /*
         * here the actual shift operation would take place
         */
         b_cnt = 0;
        }
       }

       if (dark_candidate) {
        uint32_t mask = 0;
        for (int k = 0; k < 16; ++k) {
         mask |= static_cast<uint32_t>(ring[k] < thd) << k;
        }
        // Same wrap-around duplication as for the bright mask.
        id_arr[d_cnt] = mask | ((mask & 0xFFu) << 16);
        d_cnt++;
        // Four candidates collected: hand them to the NEON test.
        if (d_cnt == 4) {
         const uint32x4_t IA = vld1q_u32(id_arr);
         static_cast<void>(IA);
         /*
         * here the actual shift operation would take place
         */
         d_cnt = 0;
        }
       }
      }
     }
     // TODO(review): up to three candidates may still be pending in ib_arr and
     // id_arr here; they must be flushed once the arc test is implemented.
     Clock::time_point t11 = Clock::now();
     cout << "my algorithm found " << my_kps.size()
       << " and ocv found " << ocv_kps.size() << endl;

     // Elapsed times: ms1 = OpenCV detector, ms2 = the loop above.
     microseconds ms1 = std::chrono::duration_cast<microseconds>(t1 - t0);
     microseconds ms2 = std::chrono::duration_cast<microseconds>(t11 - t01);

     rs.Push(static_cast<double>(ms2.count()));
     cout << "my algorithm duration " << ms2.count()
       << " and ocv duration is " << ms1.count() << endl;

출처

2016-10-20 Felix Yah Batta Man

ARM 어셈블러를 조금 파고든 끝에, ARM에서 OpenCV의 FAST9 구현보다 최소 2배 빠르게 실행되는 코드를 작성했습니다. 코드는 GitHub에서 확인할 수 있습니다. 최적화에 대한 조언은 무엇이든 환영합니다. 320x240 그레이스케일 이미지에 대해 제 Raspberry Pi 3에서는 대략 다음과 같은 시간이 걸립니다.

OpenCV는 2000ms, 제 알고리즘은 1000ms입니다.

출처

2016-11-17 08:09:00

저는 Raspberry Pi에서 30fps로 실행되는 ORB 추출기를 가지고 있습니다.

https://github.com/0xfaded/pislam

최적화는 정말 흑마술 같은 영역이며, 설상가상으로 ARM은 A53에 대한 최적화 가이드를 발표한 적이 없습니다. 우리가 가진 최선의 자료는 비슷한 NEON 유닛을 가졌을 것으로 보이는 A57용 가이드입니다.

여기서 완전한 답변을 드릴 수는 없지만, 제가 거친 과정에 대해 조금 설명하겠습니다.

제 FAST 추출기의 첫 번째 부분은 테스트 픽셀의 링을 로드하여 질문의 코드와 마찬가지로 16비트 벡터로 변환합니다. asm을 직접 작성하지는 않았고, 대신 gcc 내장 함수(intrinsic)를 사용했습니다.

그럼에도 gcc가 다음을 지키는지는 확실히 확인했습니다.

- 레지스터를 스택으로 스필(spill)하지 않는다
- 각 비교에 대해 최소한의 명령어만 생성한다

첫 번째 비교는 마스크로 비트를 분리하지 않는다는 것을 알 수 있습니다. 그 마스크는 0x80이었을 것입니다. 이렇게 해서 상수를 담고 있었을 레지스터 하나를 확보했고, gcc가 레지스터를 스필하지 않을 만큼의 여유를 주었습니다. 또한 꽤 보기 흉한 내장 함수 사용도 눈에 띌 것입니다:

// vbslq_u8(mask, a, b) takes each result bit from a where the mask bit is 1
// and from b where it is 0. With the constant mask 0x40, only bit 0x40 of
// d0 / l0 is overwritten by the per-lane comparison result (all-ones or zero);
// every other bit keeps its previous value.
d0 = vbslq_u8(vdupq_n_u8(0x40u), vcgeq_u8(test, dark), d0); 
    l0 = vbslq_u8(vdupq_n_u8(0x40u), vcleq_u8(test, light), l0);

이는 다음 코드와 동등합니다:

// Operator form of the same update (presumably GCC vector extensions, where a
// comparison yields all-ones or zero per lane). Relational operators bind
// tighter than &, so each line parses as acc |= ((test >= x) & 0x40).
// NOTE(review): the second line uses >= whereas the intrinsic version in the
// answer uses vcleq_u8 (<=) for l0 — confirm which comparison is intended.
d0 |= test >= dark & 0x40; 
    l0 |= test >= light & 0x40;

GCC는 후자(연산자로 작성한 형태)도 문제없이 컴파일하지만, 명령어를 1.5배 더 많이 생성합니다.

두 번째 부분은 16비트 벡터에 대한 FAST-9 테스트입니다. 아래 코드는 16개의 명령어로 컴파일되지만, 이 방법을 떠올리기까지 틈틈이 고민하며 거의 한 달이 걸렸습니다.

// Segment test over 16 lanes at once, using the ring bits accumulated in
// d0/d1 and l0/l1 (presumably the low and high byte of each 16-bit ring for
// the dark and light constraint — confirm against Fast.h).
// vtstq_u8 sets a lane to all-ones when the two inputs share at least one bit.
uint8x16_t t0 = vtstq_u8(d0, d1); 
    // NOTE(review): same operands as t0, so t1 equals t0 here — confirm intended.
    uint8x16_t t1 = vtstq_u8(d0, d1); 

    // Bitwise select: where the mask is set take the l0/l1 bits, otherwise keep
    // the d0/d1 bits. The mask lanes are all-ones or zero, so this picks whole lanes.
    t0 = vbslq_u8(t0, l0, d0); 
    t1 = vbslq_u8(t1, l1, d1); 

    // Leading-zero count of each t0 lane, then a per-lane left shift of t1 by
    // (count - 1) written with vector operators.
    uint8x16_t cntLo = vclzq_u8(t0); 
    uint8x16_t testLo = t1 << (cntLo - 1); 
    // Hand-written compare against constant zero: testLo is replaced in place
    // by the per-lane result of (testLo == 0).
    asm("vceq.u8 %q0, %q0, #0" : [val] "+w" (testLo)); 

    // Same steps with the roles of t0 and t1 swapped.
    uint8x16_t cntHi = vclzq_u8(t1); 
    uint8x16_t testHi = t0 << (cntHi - 1); 
    asm("vceq.u8 %q0, %q0, #0" : [val] "+w" (testHi)); 

    // Combine both halves, then turn every non-zero lane into all-ones.
    uint8x16_t result = (cntLo & testLo) | (cntHi & testHi); 
    result = vtstq_u8(result, result);

성가시게도 GCC는 testLo == 0을, 상수 0과의 비교를 위한 전용 명령어인 vceq.u8 %q0, %q0, #0으로 컴파일해 주지 않습니다. 그래서 이 명령어를 직접 삽입하여 명령어를 몇 개 더 줄였습니다.

이 글이 약간의 통찰을 제공했기를 바랍니다. Fast.h

출처

2017-10-31 00:02:51 user364952

답변

관련 문제