How can I parallelize this code with OpenMP?. The result I get is not correct.
I try to use temporary variables p1aux, p2aux, and psumaux because the Reduction clause cannot be used with pointers or intrinsic functions. But as I said the result is incorrect.
When I say "result is not correct", I would say: The calculations of array1 + array2 are stored in the matrix sumsse. The calculations are correct until the component sumsse[50] [50] [50], more or less, but the other components of the matrix are always 0.
Do You have any idea what might happen?.
If anyone can help me, thank you very much.
I try to use temporary variables p1aux, p2aux, and psumaux because the Reduction clause cannot be used with pointers or intrinsic functions. But as I said the result is incorrect.
When I say "result is not correct", I would say: The calculations of array1 + array2 are stored in the matrix sumsse. The calculations are correct until the component sumsse[50] [50] [50], more or less, but the other components of the matrix are always 0.
Do You have any idea what might happen?.
If anyone can help me, thank you very much.
Code:
#include <stdio.h>
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#include <omp.h>
#include <stdlib.h>
#include <math.h>
double __attribute__((aligned(16))) array1[256][256][256],array2[256][256][256];
double __attribute__((aligned(16))) sumsse[256][256][256];
int main ()
{
int k,j,i,dim;
dim=256;
__m128d *p1= (__m128d*) &array1[0][0][0];
__m128d *p2= (__m128d*) &array2[0][0][0];
__m128d *psum= (__m128d*) &sumsse[0][0][0];
__m128d *p1aux= (__m128d*) &array1[0][0][0];
__m128d *p2aux= (__m128d*) &array2[0][0][0];
__m128d *psumaux= (__m128d*) &sumsse[0][0][0];
int nthreads =(8);
omp_set_num_threads(nthreads);
//initializa array2
for(k = 0; k < dim; ++k){
for(j = 0; j < dim; ++j){
for(i = 0; i < dim; ++i){
array2[k][j][i] = 1.0;
}
}
}
//initialize array1
for(k = 0; k < dim; ++k){
for(j = 0; j < dim; ++j){
for(i = 0; i < dim; ++i){
array1[k][j][i] = i + j + k;
}
}
}
//initialize array sumsse
for(k = 0; k < dim; ++k){
for(j = 0; j < dim; ++j){
for(i = 0; i < dim; ++i){
sumsse[k][j][i] = 0.0;
}
}
}
//add array1 + array2 with sse
#pragma omp parallel firstprivate(p1,p2,p1aux,p2aux)
{
for(k = 0; k < dim; ++k){
for(j = 0; j < dim; ++j){
#pragma omp for private(i)
for(i = 0; i < dim; i += 2){
*psum = _mm_add_pd(*p1,*p2);
psum = psumaux + 1;
p1 = p1aux+1;
p2 = p2aux+1;
psumaux = psum;
p1aux = p1;
p2aux = p2;
}
}
}
}// end parallel
//show some datas
printf("Elementosse=10 sumsse=%f",sumsse[10][10][10]);
printf("\n");
printf("Elementosse=50 sumsse=%f",sumsse[50][50][50]);
printf("Elementosse=100 sumsse=%f",sumsse[100][100][100]);
printf("\n");
printf("Elementosse=120 sumsse=%f",sumsse[120][120][120]);
printf("\n");
printf("Elementosse=200 sumsse=%f",sumsse[200][200][200]);
printf("\n");
return 0;
}