for this post lets try and take an “embarrassingly” parallel problem, and solve it in parallel using OpenMP. One such example is matrix-vector multiplication.

For this problem I will use the Boost uBLAS library.

Let’s first write our own matrix vector product which also runs in serial:

//STL
#include<iostream>
//Boost
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
//Types
using namespace boost::numeric::ublas;
typedef matrix<double>; matrix_t;
typedef vector<double>; vector_t;

double random_double(){
      return (double)rand()/(double)RAND_MAX;
}
template<typename Matrix, typename Vector>;
void init_matrix_vector(Matrix & m, Vector& v){
	srand((unsigned)time(0));
	for (int i = 0; i < m.size1(); ++i){
                v(i) = 10*random_double();
		for (int j = 0; j < m.size2(); ++j)
			m(i,j) = 10*random_double()
	}
}

template<typename Matrix, typename Vector>;
void product(Matrix & m, Vector & v, Vector & r){
	for(int i = 0; i < m.size1(); ++i){
		r(i) = 0.0;
		for(int j = 0; j < m.size2(); ++j){
			r(i) += m(i,j)*v(j);
		}
	}
}

int main(int argc,char * argv[]){
	if (argc != 2){
		std::cout << "usage: "
                          << argv[ 0]
                          << " N"
                          << std::endl;
		std::exit( -1);
	}
	int N = atoi(argv[ 1]);
        //create matrix/vector
	matrix_t m (N, N);
	vector_t v(N);
	init_matrix_vector(m,v);
	vector_t r(N);
	product(m,v,r);
	std::cout << "matrix: " << m << std::endl;
	std::cout << "vector: " << v << std::endl;
	std::cout << "boost product: " << prod(m,v) << std::endl;
	std::cout << "our (serial) product: " << r << std::endl;
}

which produces:

./a.out 2
matrix: [2,2]((3.69343,5.94287),(7.27436,3.44084))
vector: [2](0.72093,0.580349)
boost product: [2](6.11165,7.24119)
our (serial) product: [2](6.11165,7.24119)

Great, we now have a properly working product()
function to which we can add parallelism.

Note that this boils down to threading the outer for loop in our
product() function.

Luckily OpenMP provides a compiler directive just for this. the ‘parallel for’
construct. Consider the following modified product:

template<typename Matrix, typename Vector>;
void parallel_product(Matrix & m, Vector & v, Vector & r){
	#pragma omp parallel for num_threads(2)
        for(int i = 0; i < m.size1(); ++i){
                r(i) = 0.0;
                for(int j = 0; j < m.size2(); ++j){
                        r(i) += m(i,j)*v(j);
                }
        }
}

Statements which begin with #pragma are pre-processor statements. The compilers
pre-processor uses this to modify our source code before compilation.

This statement in particular informs the compiler to create chunks of work from
the following for loop to run in a given set of threads threads.
As one would expect, specifying the num_threads() clause informs the compiler
how many threads to create.

Also, the for() loop could have iterated over a range, as long as the iterator type is a random access iterator.

Putting this all together we have:

//STL
#include<iostream>;

//OMP
#include<omp.h>;

//Boost
#include <boost/numeric/ublas/matrix.hpp>;
#include <boost/numeric/ublas/io.hpp>;

//Types
using namespace boost::numeric::ublas;
typedef matrix<double>; matrix_t;
typedef vector<double>; vector_t;

double random_double(){
	return (double)rand()/(double)RAND_MAX;
}

template<typename Matrix, typename Vector>;
void init_matrix_vector(Matrix & m, Vector& v){
	srand((unsigned)time(0));
	for (int i = 0; i < m.size1(); ++i){
                v(i) = 10*random_double();
		for (int j = 0; j < m.size2(); ++j)
			m(i,j) = 10*random_double();
	}
}

template<typename Matrix, typename Vector>;
void product(Matrix & m, Vector & v, Vector & r){
	for(int i = 0; i < m.size1(); ++i){
		r(i) = 0.0;
		for(int j = 0; j < m.size2(); ++j){
			r(i) += m(i,j)*v(j);
		}
	}
}

template<typename Matrix, typename Vector>;
void parallel_product(Matrix & m, Vector & v, Vector & r){
	#pragma omp parallel for num_threads(2)
        for(int i = 0; i < m.size1(); ++i){
                r(i) = 0.0;
                for(int j = 0; j < m.size2(); ++j){
                        r(i) += m(i,j)*v(j);
                }
        }
}

int main(int argc,char * argv[]){
	if (argc != 2){
		std::cout << "usage: "
		          << argv[ 0]
			  << " N"
			  << std::endl;
	}
	int N = atoi(argv[ 1]);
       	matrix_t m (N, N);
	vector_t v(N);
	init_matrix_vector(m,v);
	vector_t r(N);
	vector_t r1(N);
	double serial_time = omp_get_wtime();
	product(m,v,r);
	serial_time = omp_get_wtime() - serial_time;
	double parallel_time = omp_get_wtime();
	parallel_product(m,v,r1);
	parallel_time = omp_get_wtime() - parallel_time;
	for(int i = 0; i < N; ++i){
		if (r(i) != r1(i)){
			std::cerr << "results differ @"
				  << i
				  << std::endl;
			std::cerr << r(i)
				  << " "
				  << r1(i)
				  << std::endl;
			return -1;
		}
	}
	std::cout << "the result is the same"
		  << std::endl;
	std::cout << "serial time: "
		  << serial_time << std::endl;
	std::cout << "parallel time: "
		  << parallel_time << std::endl;
	return 0;
}

Which when I run on my dual-core laptop produces:

$ ./a.out 1000
the result is the same
serial time: 0.094412
parallel time: 0.048241

$ ./a.out 2000
the result is the same
serial time: 0.376123
parallel time: 0.191999

$ ./a.out 5000
the result is the same
serial time: 2.3542
parallel time: 1.21085

$ ./a.out 10000
the result is the same
serial time: 9.41049
parallel time: 4.84689

We can see that with two threads we nicely get ~2x speedup.

Notice the use of omp_get_wtime() as the timer for this code.

What happens if we time the code with std::clock() ?

Sorry, the comment form is closed at this time.