for this post lets try and take an “embarrassingly” parallel problem, and solve it in parallel using OpenMP. One such example is matrix-vector multiplication.
For this problem I will use the Boost uBLAS library.
Let’s first write our own matrix vector product which also runs in serial:
//STL
#include<iostream>
//Boost
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
//Types
using namespace boost::numeric::ublas;
typedef matrix<double>; matrix_t;
typedef vector<double>; vector_t;
double random_double(){
return (double)rand()/(double)RAND_MAX;
}
template<typename Matrix, typename Vector>;
void init_matrix_vector(Matrix & m, Vector& v){
srand((unsigned)time(0));
for (int i = 0; i < m.size1(); ++i){
v(i) = 10*random_double();
for (int j = 0; j < m.size2(); ++j)
m(i,j) = 10*random_double()
}
}
template<typename Matrix, typename Vector>;
void product(Matrix & m, Vector & v, Vector & r){
for(int i = 0; i < m.size1(); ++i){
r(i) = 0.0;
for(int j = 0; j < m.size2(); ++j){
r(i) += m(i,j)*v(j);
}
}
}
int main(int argc,char * argv[]){
if (argc != 2){
std::cout << "usage: "
<< argv[ 0]
<< " N"
<< std::endl;
std::exit( -1);
}
int N = atoi(argv[ 1]);
//create matrix/vector
matrix_t m (N, N);
vector_t v(N);
init_matrix_vector(m,v);
vector_t r(N);
product(m,v,r);
std::cout << "matrix: " << m << std::endl;
std::cout << "vector: " << v << std::endl;
std::cout << "boost product: " << prod(m,v) << std::endl;
std::cout << "our (serial) product: " << r << std::endl;
}
which produces:
./a.out 2 matrix: [2,2]((3.69343,5.94287),(7.27436,3.44084)) vector: [2](0.72093,0.580349) boost product: [2](6.11165,7.24119) our (serial) product: [2](6.11165,7.24119)
Great, we now have a properly working product()
function to which we can add parallelism.
Note that this boils down to threading the outer for loop in our
product() function.
Luckily OpenMP provides a compiler directive just for this. the ‘parallel for’
construct. Consider the following modified product:
template<typename Matrix, typename Vector>;
void parallel_product(Matrix & m, Vector & v, Vector & r){
#pragma omp parallel for num_threads(2)
for(int i = 0; i < m.size1(); ++i){
r(i) = 0.0;
for(int j = 0; j < m.size2(); ++j){
r(i) += m(i,j)*v(j);
}
}
}
Statements which begin with #pragma are pre-processor statements. The compilers
pre-processor uses this to modify our source code before compilation.
This statement in particular informs the compiler to create chunks of work from
the following for loop to run in a given set of threads threads.
As one would expect, specifying the num_threads() clause informs the compiler
how many threads to create.
Also, the for() loop could have iterated over a range, as long as the iterator type is a random access iterator.
Putting this all together we have:
//STL
#include<iostream>;
//OMP
#include<omp.h>;
//Boost
#include <boost/numeric/ublas/matrix.hpp>;
#include <boost/numeric/ublas/io.hpp>;
//Types
using namespace boost::numeric::ublas;
typedef matrix<double>; matrix_t;
typedef vector<double>; vector_t;
double random_double(){
return (double)rand()/(double)RAND_MAX;
}
template<typename Matrix, typename Vector>;
void init_matrix_vector(Matrix & m, Vector& v){
srand((unsigned)time(0));
for (int i = 0; i < m.size1(); ++i){
v(i) = 10*random_double();
for (int j = 0; j < m.size2(); ++j)
m(i,j) = 10*random_double();
}
}
template<typename Matrix, typename Vector>;
void product(Matrix & m, Vector & v, Vector & r){
for(int i = 0; i < m.size1(); ++i){
r(i) = 0.0;
for(int j = 0; j < m.size2(); ++j){
r(i) += m(i,j)*v(j);
}
}
}
template<typename Matrix, typename Vector>;
void parallel_product(Matrix & m, Vector & v, Vector & r){
#pragma omp parallel for num_threads(2)
for(int i = 0; i < m.size1(); ++i){
r(i) = 0.0;
for(int j = 0; j < m.size2(); ++j){
r(i) += m(i,j)*v(j);
}
}
}
int main(int argc,char * argv[]){
if (argc != 2){
std::cout << "usage: "
<< argv[ 0]
<< " N"
<< std::endl;
}
int N = atoi(argv[ 1]);
matrix_t m (N, N);
vector_t v(N);
init_matrix_vector(m,v);
vector_t r(N);
vector_t r1(N);
double serial_time = omp_get_wtime();
product(m,v,r);
serial_time = omp_get_wtime() - serial_time;
double parallel_time = omp_get_wtime();
parallel_product(m,v,r1);
parallel_time = omp_get_wtime() - parallel_time;
for(int i = 0; i < N; ++i){
if (r(i) != r1(i)){
std::cerr << "results differ @"
<< i
<< std::endl;
std::cerr << r(i)
<< " "
<< r1(i)
<< std::endl;
return -1;
}
}
std::cout << "the result is the same"
<< std::endl;
std::cout << "serial time: "
<< serial_time << std::endl;
std::cout << "parallel time: "
<< parallel_time << std::endl;
return 0;
}
Which when I run on my dual-core laptop produces:
$ ./a.out 1000 the result is the same serial time: 0.094412 parallel time: 0.048241 $ ./a.out 2000 the result is the same serial time: 0.376123 parallel time: 0.191999 $ ./a.out 5000 the result is the same serial time: 2.3542 parallel time: 1.21085 $ ./a.out 10000 the result is the same serial time: 9.41049 parallel time: 4.84689
We can see that with two threads we nicely get ~2x speedup.
Notice the use of omp_get_wtime() as the timer for this code.
What happens if we time the code with std::clock() ?
Sorry, the comment form is closed at this time.