OpenMP 구현이 직렬 구현보다 느립니다.

현재 OpenMP에 익숙해지려고 하고 있습니다. 연습 삼아 OpenMP로 탐욕적(greedy) "학습" 알고리즘을 구현한 뒤 직렬 구현과 비교해 보았는데, 반복 횟수를 얼마로 하든 OpenMP 버전이 항상 훨씬 더 느립니다. 시간은 다음 명령으로 측정했습니다.

time ./a.out

제 집 PC에서 아래 구현은 12초가 걸렸습니다.

#include <omp.h> 
#include <iostream> 
#include <vector> 
#include <cstdlib> 
#include <cmath> 
#include <stdio.h> 
#include <ctime> 

// Size of the OpenMP thread team requested by iterate(); also the length of
// the per-thread result array used there.
#define THREADS 4 

// Makes every name in std visible unqualified for the rest of this file.
using namespace std; 

// One training sample: an input value and the output recorded for it.
struct TrainData { 
    double input;  // x value the polynomial is evaluated at 
    double output; // polynomial value stored for that input (filled in by generateTrainData) 
}; 

// Long Term Memory: one candidate parameter set (a, b, c) for the polynomial
// a*x^2 + b*x + c, together with the score that candidate achieved.
struct LTM {
    double a;     // quadratic coefficient
    double b;     // linear coefficient
    double c;     // constant term
    double score; // error of this candidate; the search minimizes it

    // All-zero candidate with score 0.
    LTM() : a(0), b(0), c(0), score(0) {}

    // Random candidate whose parameters are whole numbers drawn from
    // [low, high], both ends included. If high <= low every parameter is low.
    // score starts at 0; the caller is expected to fill it in.
    //
    // NOTE: the draws come from rand(), which keeps a single process-wide
    // state. Calling this constructor from several threads at once makes them
    // contend on that state, so a parallel loop built on it will not scale.
    LTM(int low, int high) : a(0), b(0), c(0), score(0)
    {
        // Drawn in a, b, c order so the consumption of rand() stays predictable.
        a = draw(low, high);
        b = draw(low, high);
        c = draw(low, high);
    }

    // Candidate with explicitly given parameters; score starts at 0.
    LTM(double _a, double _b, double _c) : a(_a), b(_b), c(_c), score(0) {}

    // Writes the score and the three parameters to std::cout on two lines.
    void print() const
    {
        std::cout << "Score: " << score << '\n';
        // One flush at the end keeps the output ordered with later printf calls.
        std::cout << "a: " << a << " b: " << b << " c: " << c << std::endl;
    }

private:
    // Returns a whole number in [low, high]; returns low when the range is
    // empty or has a single value (this also avoids a modulo by zero).
    static int draw(int low, int high)
    {
        if (high <= low)
            return low;

        const int span = high - low + 1; // number of distinct values
        return low + std::rand() % span;
    }
};

//the acutal polynom function evaluating with passed LTM 
inline double evaluate(LTM &ltm, const double &x) 
{ 
    double ret; 
    ret = ltm.a*x*x + ltm.b*x + ltm.c; 

    return ret; 
} 


//scoring function calculates the Root Mean Square error (RMS) 
inline double score_function(LTM &ltmnew, vector<TrainData> &td) 
{ 
    double score; 
    double val; 
    int tdsize=td.size(); 
    score=0; 

    for(int i=0; i< tdsize; i++) 
    { 
     val = (td.at(i)).output - evaluate(ltmnew, (td.at(i)).input); 
     val *= val; 
     score += val; 
    } 

    score /= (double)tdsize; 

    score = sqrt(score); 

    return score; 
} 

// Random search: draws `iterations` random candidates with LTM(low, high),
// scores each against td and returns the best-scoring one (or the initial
// random candidate if nothing beats it). The initial candidate is printed.
//
// The candidates are distributed over a team of up to THREADS OpenMP threads.
// Every thread works on its own copy of fav and td (firstprivate), keeps its
// own best candidate, and the per-thread winners are merged serially afterwards.
//
// NOTE: LTM(low, high) draws from rand(), which uses global state; concurrent
// calls contend on it, so this loop does not get faster with more threads
// until the candidates come from a per-thread generator.
LTM iterate(int iterations, std::vector<TrainData> td, int low, int high)
{
    LTM fav = LTM(low, high);
    fav.score = score_function(fav, td);
    fav.print();

    // Per-thread winners. Every slot starts as the initial favourite: the
    // runtime may provide fewer than THREADS threads, and a slot left as a
    // default LTM (score 0) would otherwise beat every real candidate in the
    // merge below.
    LTM favs[THREADS];
    for (int t = 0; t < THREADS; t++)
        favs[t] = fav;

    #pragma omp parallel num_threads(THREADS) firstprivate(fav, low, high, td)
    {
        #pragma omp master
        printf("Threads: %d\n", omp_get_num_threads());

        #pragma omp for
        for (int i = 0; i < iterations; i++)
        {
            // Declared inside the loop body, so each thread has its own.
            LTM cand(low, high);
            cand.score = score_function(cand, td);

            if (cand.score < fav.score)
                fav = cand;
        }

        // Each thread writes only its own slot, so no lock is needed; the
        // barrier at the end of the parallel region publishes the writes.
        favs[omp_get_thread_num()] = fav;
    }

    // Merge: outside the region `fav` is still the initial candidate.
    for (int t = 0; t < THREADS; t++)
    {
        if (favs[t].score < fav.score)
            fav = favs[t];
    }

    return fav;
}

//generate training data from -50 up to 50 with the train LTM 
void generateTrainData(vector<TrainData> *td, LTM train) 
{ 
    #pragma omp parallel for schedule(dynamic, 25) 
    for(int i=-50; i< 50; i++) 
    { 
     struct TrainData d; 
     d.input = i; 
     d.output = evaluate(train, (double)i); 
     #pragma omp critical 
     td->push_back(d); 

     //cout<<"input: "<<d.input<<" -> "<<d.output<<endl; 
    } 

} 

int main(int argc, char *argv[]) 
{ 

    int its= 10000000; //number of iterations 
    int a=2; 
    int b=4; 
    int c=6; 

    srand(time(NULL)); 
    LTM pol = LTM(a,b,c); //original polynom parameters 
    vector<TrainData> td; 

    //first genarte some training data and save it to td 
    generateTrainData(&td, pol); 

    //try to find the best solution 
    LTM fav = iterate(its, td, 1, 6); 


    printf("Final: a=%f b=%f c=%f score: %f\n", fav.a, fav.b, fav.c, fav.score); 

    return 0; 
}

위가 제 코드입니다. 주석이 모든 것을 설명해 주기를 바랍니다.

직렬 버전은 6초밖에 걸리지 않았습니다. 반복 횟수를 10배로 늘리면 약 2분/1분(OpenMP/직렬)이 됩니다.

누가 도와주실 수 있을까요?

출처

2017-05-19 Talto

`rand()`는 전역 상태를 사용하므로 여러 스레드가 동시에 호출하는 환경에는 적합하지 않습니다. 대신 `drand48_r()`을 사용하십시오. –

질문에 달린 댓글 덕분에 성능 문제를 해결할 수 있었습니다.

댓글에서 지적한 대로 문제는 제가 사용하던 rand() 함수였습니다. 이를 스레드 안전한 drand48_r()로 교체했습니다.

다음과 같이 바꿨습니다:

... 
// Random candidate: each parameter is low + u * (high - low), where u is the
// value drand48_r() writes for the caller-supplied generator state `buff`.
// Parameters are drawn in a, b, c order; score starts at 0.
LTM(double low, double high, struct drand48_data *buff)
{
    const double width = high - low; // same for all three parameters
    double u;

    drand48_r(buff, &u);
    a = low + u * width;

    drand48_r(buff, &u);
    b = low + u * width;

    drand48_r(buff, &u);
    c = low + u * width;

    score = 0;
}
...

이제 실행 시간이 1초 미만으로 줄었습니다! 감사합니다. :)

출처

2017-05-19 12:08:45 Talto

[C++11 난수 생성기](http://en.cppreference.com/w/cpp/numeric/random)를 사용할 수도 있습니다. API가 더 깔끔하고, PRNG 알고리즘 선택의 폭이 넓으며, 출력을 다양한 통계 분포에 맞출 수 있습니다. 난수 엔진 변수를 `#pragma omp parallel` 블록 안에 선언하기만 하면 각 스레드에 로컬이 됩니다. – Wyzard

OpenMP 구현이 직렬 구현보다 느립니다.

답변

관련 문제