Shashin Error:

Invalid data property __get for fancyboxLoadScript
rhl

for this post lets try and take an “embarrassingly” parallel problem, and solve it in parallel using OpenMP. One such example is matrix-vector multiplication.

For this problem I will use the Boost uBLAS library.

Let’s first write our own matrix vector product which also runs in serial:

//STL
#include<iostream>
//Boost
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
//Types
using namespace boost::numeric::ublas;
typedef matrix<double>; matrix_t;
typedef vector<double>; vector_t;

double random_double(){
      return (double)rand()/(double)RAND_MAX;
}
template<typename Matrix, typename Vector>;
void init_matrix_vector(Matrix & m, Vector& v){
	srand((unsigned)time(0));
	for (int i = 0; i < m.size1(); ++i){
                v(i) = 10*random_double();
		for (int j = 0; j < m.size2(); ++j)
			m(i,j) = 10*random_double()
	}
}

template<typename Matrix, typename Vector>;
void product(Matrix & m, Vector & v, Vector & r){
	for(int i = 0; i < m.size1(); ++i){
		r(i) = 0.0;
		for(int j = 0; j < m.size2(); ++j){
			r(i) += m(i,j)*v(j);
		}
	}
}

int main(int argc,char * argv[]){
	if (argc != 2){
		std::cout << "usage: "
                          << argv[ 0]
                          << " N"
                          << std::endl;
		std::exit( -1);
	}
	int N = atoi(argv[ 1]);
        //create matrix/vector
	matrix_t m (N, N);
	vector_t v(N);
	init_matrix_vector(m,v);
	vector_t r(N);
	product(m,v,r);
	std::cout << "matrix: " << m << std::endl;
	std::cout << "vector: " << v << std::endl;
	std::cout << "boost product: " << prod(m,v) << std::endl;
	std::cout << "our (serial) product: " << r << std::endl;
}

which produces:

./a.out 2
matrix: [2,2]((3.69343,5.94287),(7.27436,3.44084))
vector: [2](0.72093,0.580349)
boost product: [2](6.11165,7.24119)
our (serial) product: [2](6.11165,7.24119)

Great, we now have a properly working product()
function to which we can add parallelism.

Note that this boils down to threading the outer for loop in our
product() function.

Luckily OpenMP provides a compiler directive just for this. the ‘parallel for’
construct. Consider the following modified product:

template<typename Matrix, typename Vector>;
void parallel_product(Matrix & m, Vector & v, Vector & r){
	#pragma omp parallel for num_threads(2)
        for(int i = 0; i < m.size1(); ++i){
                r(i) = 0.0;
                for(int j = 0; j < m.size2(); ++j){
                        r(i) += m(i,j)*v(j);
                }
        }
}

Statements which begin with #pragma are pre-processor statements. The compilers
pre-processor uses this to modify our source code before compilation.

This statement in particular informs the compiler to create chunks of work from
the following for loop to run in a given set of threads threads.
As one would expect, specifying the num_threads() clause informs the compiler
how many threads to create.

Also, the for() loop could have iterated over a range, as long as the iterator type is a random access iterator.

Putting this all together we have:

//STL
#include<iostream>;

//OMP
#include<omp.h>;

//Boost
#include <boost/numeric/ublas/matrix.hpp>;
#include <boost/numeric/ublas/io.hpp>;

//Types
using namespace boost::numeric::ublas;
typedef matrix<double>; matrix_t;
typedef vector<double>; vector_t;

double random_double(){
	return (double)rand()/(double)RAND_MAX;
}

template<typename Matrix, typename Vector>;
void init_matrix_vector(Matrix & m, Vector& v){
	srand((unsigned)time(0));
	for (int i = 0; i < m.size1(); ++i){
                v(i) = 10*random_double();
		for (int j = 0; j < m.size2(); ++j)
			m(i,j) = 10*random_double();
	}
}

template<typename Matrix, typename Vector>;
void product(Matrix & m, Vector & v, Vector & r){
	for(int i = 0; i < m.size1(); ++i){
		r(i) = 0.0;
		for(int j = 0; j < m.size2(); ++j){
			r(i) += m(i,j)*v(j);
		}
	}
}

template<typename Matrix, typename Vector>;
void parallel_product(Matrix & m, Vector & v, Vector & r){
	#pragma omp parallel for num_threads(2)
        for(int i = 0; i < m.size1(); ++i){
                r(i) = 0.0;
                for(int j = 0; j < m.size2(); ++j){
                        r(i) += m(i,j)*v(j);
                }
        }
}

int main(int argc,char * argv[]){
	if (argc != 2){
		std::cout << "usage: "
		          << argv[ 0]
			  << " N"
			  << std::endl;
	}
	int N = atoi(argv[ 1]);
       	matrix_t m (N, N);
	vector_t v(N);
	init_matrix_vector(m,v);
	vector_t r(N);
	vector_t r1(N);
	double serial_time = omp_get_wtime();
	product(m,v,r);
	serial_time = omp_get_wtime() - serial_time;
	double parallel_time = omp_get_wtime();
	parallel_product(m,v,r1);
	parallel_time = omp_get_wtime() - parallel_time;
	for(int i = 0; i < N; ++i){
		if (r(i) != r1(i)){
			std::cerr << "results differ @"
				  << i
				  << std::endl;
			std::cerr << r(i)
				  << " "
				  << r1(i)
				  << std::endl;
			return -1;
		}
	}
	std::cout << "the result is the same"
		  << std::endl;
	std::cout << "serial time: "
		  << serial_time << std::endl;
	std::cout << "parallel time: "
		  << parallel_time << std::endl;
	return 0;
}

Which when I run on my dual-core laptop produces:

$ ./a.out 1000
the result is the same
serial time: 0.094412
parallel time: 0.048241

$ ./a.out 2000
the result is the same
serial time: 0.376123
parallel time: 0.191999

$ ./a.out 5000
the result is the same
serial time: 2.3542
parallel time: 1.21085

$ ./a.out 10000
the result is the same
serial time: 9.41049
parallel time: 4.84689

We can see that with two threads we nicely get ~2x speedup.

Notice the use of omp_get_wtime() as the timer for this code.

What happens if we time the code with std::clock() ?

moore’s law is dead. at least that is what we keep hearing. personally, i think that while it might take time there will be talented engineers who will figure out a way to keep moore’s law going. in the meantime we programmers need something to speed up our codes. that something is parallel programming.

most likely the machine you are using to read this blog post has a multi-core processor in it. the idea is that your machine is capable of executing more than one instruction per clock tick. i.e if you have a total of four cores, then in theory, the programmer could make some computer programs run four times faster.

one way a programmer can take advantage of a multi-core machine is a language known as OpenMP or ‘open multi-processing.’

let’s start with a simple familiar example: `hello world.`

$ cat main.cpp

#include<iostream>
int main(int argc, char** argv)
{
std::cout << "Hello, World!" << std::endl;
}

$ gcc main.cpp -lstdc++

$ ./a.out
Hello, World!

The OpenMP API provides a number of preprocessor directives for creating threads. A directive is an action and the greek word for action is ‘pragma.’ thus to give a directive to the preprocessor we write:

#pragma omp [directive content here]

The first program we will demonstrate is the `parallel` directive. This directive creates a number of threads which execute any instruction in the parallel region:

#include<iostream>
int main(int argc, char** argv)
{
	#pragma omp parallel
	std::cout << "Hello, World!" << std::endl;
}

Now in order to have the pre-processor actually process OpenMP directives, we need to enable the openmp flag in gcc: `-fopenmp`

$ gcc main.cpp -fopenmp -lstdc++

And now when we run our binary we see something like:

$ ./a.out
Hello, World!
Hello, World!
Hello, World!

And now we have written out first OpenMP program!
Although if you run the program a few more times you will see:

$ ./a.out
Hello, World!Hello, World!

$ ./a.out
Hello, World!Hello, World!

$ ./a.out
Hello, World!
Hello, World!
$ ./a.out
Hello, World!Hello, World!

Notice how each line of our print statement executes in an arbitrary order! This is because each thread executes it’s print statement at a different time, and since there is no control which thread may execute this code, we sometimes will get unexpected results.

To correct this we need another directive, the ‘critical section.’ This directive tells the pre-processor than any code which follows may only be executed by one processor at a time.

#include<iostream>
int main(int argc, char** argv)
{
  #pragma omp parallel
  {
     //All code in this block is in the parallel region
     #pragma omp critical
     {
        //only one thread is able to execute code
        //in a 'critical' section at a time.
        std::cout << "Hello, World!" << std::endl;
     }
   }
}

And now when we compile and run we will always get a similar result:

$ ./a.out
Hello, World!
Hello, World!
$ ./a.out
Hello, World!
Hello, World!
$ ./a.out
Hello, World!
Hello, World!
$ ./a.out
Hello, World!
Hello, World!

In addition to pre-compiler directives. The OpenMP API also provides a number of functions for parallel flow control.omp_get_thread_num() for example, gives us an integer 0,..,N-1 corresponding to which thread is executing the function.
Consider:

#include<iostream>
#include <omp.h>
int main(int argc, char** argv)
{
  #pragma omp parallel
  {
     //All code in this block is in the parallel region
     #pragma omp critical
     {
        //only one thread is able to execute code
        //in a 'critical' section at a time.
        std::cout << "Hello, World: From Thread "
                  << omp_get_thread_num()
                  << std::endl;
     }
   }
}

which executes:

$ ./a.out
Hello, World: From Thread 0
Hello, World: From Thread 1