From 894f35c9be404bc2c13f90f4a6184a545029181a Mon Sep 17 00:00:00 2001 From: Gemma Fardell <47746591+gfardell@users.noreply.github.com> Date: Thu, 23 Jan 2020 11:52:46 +0000 Subject: =?UTF-8?q?Allows=20user=20to=20set=20number=20of=20threads=20used?= =?UTF-8?q?=20by=20openMP=20in=20C=20library=20grad=E2=80=A6=20(#476)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Allows user to set number of threads used by openMP in C library gradient operator. Changed to release build flags * closes #477 * added test function for c lib thread deployment * improved thread scaling for neumann algoritims * removed unnecessary thread sync * reverts omp number of threads at the end of the c function call Co-authored-by: Edoardo Pasca --- src/Core/CMakeLists.txt | 1 + src/Core/FiniteDifferenceLibrary.c | 281 +++++++++++-------------------------- 2 files changed, 82 insertions(+), 200 deletions(-) (limited to 'src') diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 741b985..e828fe5 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -1,5 +1,6 @@ set (CMAKE_C_STANDARD 11) +set (CMAKE_BUILD_TYPE Release) if(APPLE) if(NOT DEFINED OPENMP_INCLUDES OR NOT DEFINED OPENMP_LIBRARIES) diff --git a/src/Core/FiniteDifferenceLibrary.c b/src/Core/FiniteDifferenceLibrary.c index f5736e2..fbf2646 100644 --- a/src/Core/FiniteDifferenceLibrary.c +++ b/src/Core/FiniteDifferenceLibrary.c @@ -2,161 +2,66 @@ #include "FiniteDifferenceLibrary.h" -DLL_EXPORT int openMPtest(void) +DLL_EXPORT int openMPtest(int nThreads) { - int nThreads; + omp_set_num_threads(nThreads); + + int nThreads_running; #pragma omp parallel { if (omp_get_thread_num() == 0) { - nThreads = omp_get_num_threads(); + nThreads_running = omp_get_num_threads(); } } - return nThreads; + return nThreads_running; } -int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc) +void threads_setup(int nThreads_requested, int *nThreads_current) { - size_t volume = nx * ny * nz; - - const float * inimage = inimagefull; - float * outimageX = outimageXfull; - float * outimageY = outimageYfull; - float * outimageZ = outimageZfull; - - int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice - int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row - - long c; - for (c = 0; c < nc; c++) - { #pragma omp parallel - { - long ind, k, j, i; - float pix0; - //run over all and then fix boundaries -#pragma omp for - for (ind = 0; ind < nx * ny * (nz - 1); ind++) - { - pix0 = -inimage[ind]; - - outimageX[ind] = pix0 + inimage[ind + 1]; - outimageY[ind] = pix0 + inimage[ind + nx]; - outimageZ[ind] = pix0 + inimage[ind + nx * ny]; - } - -#pragma omp for - for (ind = 0; ind < nx * (ny - 1); ind++) - { - pix0 = -inimage[ind + offset1]; - - outimageX[ind + offset1] = pix0 + inimage[ind + offset1 + 1]; - outimageY[ind + offset1] = pix0 + inimage[ind + offset1 + nx]; - } - -#pragma omp for - for (ind = 0; ind < nx - 1; ind++) - { - pix0 = -inimage[ind + offset2]; - - outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1]; - } - - //boundaries -#pragma omp for - for (k = 0; k < nz; k++) - { - for (i = 0; i < nx; i++) - { - outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0; - } - } - -#pragma omp for - for (k = 0; k < nz; k++) - { - for (j = 0; j < ny; j++) - { - outimageX[k * ny * nx + j * nx + nx - 1] = 0; - } - } - - if (nz > 1) - { -#pragma omp for - for (ind = 0; ind < ny * nx; ind++) - { - outimageZ[nx * ny * (nz - 1) + ind] = 0; - } - } - } - - inimage += volume; - outimageX += volume; - outimageY += volume; - outimageZ += volume; - } - - - //now the rest of the channels - if (nc > 1) { - long ind; - - for (c = 0; c < nc - 1; c++) - { - float * outimageC = outimageCfull + c * volume; - const float * inimage = inimagefull + c * volume; - -#pragma omp parallel for - for (ind = 0; ind < volume; ind++) - { - outimageC[ind] = -inimage[ind] + inimage[ind + volume]; - } - } - -#pragma omp parallel for - for (ind = 0; ind < volume; ind++) + if (omp_get_thread_num() == 0) { - outimageCfull[(nc - 1) * volume + ind] = 0; + *nThreads_current = omp_get_num_threads(); } } - - return 0; + omp_set_num_threads(nThreads_requested); } -int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normfull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc) + +int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc) { size_t volume = nx * ny * nz; - int z_dim = nz - 1; - - const float * inimage = inimagefull; - float * outimageX = outimageXfull; - float * outimageY = outimageYfull; - float * outimageZ = outimageZfull; - float * outimageL21norm = outimageL21normfull; + const float *inimage = inimagefull; + float *outimageX = outimageXfull; + float *outimageY = outimageYfull; + float *outimageZ = outimageZfull; - int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice + int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row long c; + + int z_dim = nz > 1 ? 1: 0; + for (c = 0; c < nc; c++) { #pragma omp parallel { - long ind, k; + long ind, k, j, i; float pix0; //run over all and then fix boundaries -#pragma omp for + +#pragma omp for nowait for (ind = 0; ind < nx * ny * (nz - 1); ind++) { pix0 = -inimage[ind]; - outimageX[ind] = pix0 + inimage[ind + 1]; outimageY[ind] = pix0 + inimage[ind + nx]; outimageZ[ind] = pix0 + inimage[ind + nx * ny]; } -#pragma omp for +#pragma omp for nowait for (ind = 0; ind < nx * (ny - 1); ind++) { pix0 = -inimage[ind + offset1]; @@ -169,29 +74,26 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful for (ind = 0; ind < nx - 1; ind++) { pix0 = -inimage[ind + offset2]; - outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1]; } //boundaries -#pragma omp for +#pragma omp for nowait for (k = 0; k < nz; k++) { - for (int i = 0; i < nx; i++) + for (i = 0; i < nx; i++) { outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0; } } - -#pragma omp for +#pragma omp for nowait for (k = 0; k < nz; k++) { - for (int j = 0; j < ny; j++) + for (j = 0; j < ny; j++) { outimageX[k * ny * nx + j * nx + nx - 1] = 0; } } - if (z_dim) { #pragma omp for @@ -199,34 +101,15 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful { outimageZ[nx * ny * (nz - 1) + ind] = 0; } - -#pragma omp for - for (ind = 0; ind < volume; ind++) - { - outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind] + outimageZ[ind] * outimageZ[ind]; - } - - } - else - { - -#pragma omp for - for (ind = 0; ind < volume; ind++) - { - outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind]; - } } } - inimage += volume; outimageX += volume; outimageY += volume; outimageZ += volume; - outimageL21norm += volume; } - //now the rest of the channels if (nc > 1) { @@ -234,18 +117,13 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful for (c = 0; c < nc - 1; c++) { - float * outimageC = outimageCfull + c * volume; - float * outimageL21norm = outimageL21normfull + c * volume; - const float * inimage = inimagefull + c * volume; + float *outimageC = outimageCfull + c * volume; + const float *inimage = inimagefull + c * volume; #pragma omp parallel for for (ind = 0; ind < volume; ind++) { outimageC[ind] = -inimage[ind] + inimage[ind + volume]; - outimageL21norm[ind] += outimageC[ind] * outimageC[ind]; - - //sqrt - outimageL21norm[ind] = (float)sqrt(outimageL21norm[ind]); } } @@ -262,15 +140,14 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float { size_t volume = nx * ny * nz; - const float * inimage = inimagefull; - float * outimageX = outimageXfull; - float * outimageY = outimageYfull; - float * outimageZ = outimageZfull; + const float *inimage = inimagefull; + float *outimageX = outimageXfull; + float *outimageY = outimageYfull; + float *outimageZ = outimageZfull; - int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice + int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row - long c; for (c = 0; c < nc; c++) { @@ -280,7 +157,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float long ind, k; float pix0; //run over all and then fix boundaries -#pragma omp for +#pragma omp for nowait for (ind = 0; ind < nx * ny * (nz - 1); ind++) { pix0 = -inimage[ind]; @@ -290,7 +167,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float outimageZ[ind] = pix0 + inimage[ind + nx * ny]; } -#pragma omp for +#pragma omp for nowait for (ind = 0; ind < nx * (ny - 1); ind++) { pix0 = -inimage[ind + offset1]; @@ -308,7 +185,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float } //boundaries -#pragma omp for +#pragma omp for nowait for (k = 0; k < nz; k++) { for (int i = 0; i < nx; i++) @@ -320,7 +197,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float } } -#pragma omp for +#pragma omp for nowait for (k = 0; k < nz; k++) { for (int j = 0; j < ny; j++) @@ -332,10 +209,9 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float } } - if (nz > 1) { -#pragma omp for +#pragma omp for nowait for (ind = 0; ind < ny * nx; ind++) { outimageZ[nx * ny * (nz - 1) + ind] = -inimage[nx * ny * (nz - 1) + ind] + inimage[ind]; @@ -356,8 +232,8 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float for (c = 0; c < nc - 1; c++) { - float * outimageC = outimageCfull + c * volume; - const float * inimage = inimagefull + c * volume; + float *outimageC = outimageCfull + c * volume; + const float *inimage = inimagefull + c * volume; #pragma omp parallel for for (ind = 0; ind < volume; ind++) @@ -383,18 +259,18 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const //assumes nx and ny > 1 int z_dim = nz - 1; - float * outimage = outimagefull; - const float * inimageX = inimageXfull; - const float * inimageY = inimageYfull; - const float * inimageZ = inimageZfull; + float *outimage = outimagefull; + const float *inimageX = inimageXfull; + const float *inimageY = inimageYfull; + const float *inimageZ = inimageZfull; - float * tempX = (float*)malloc(volume * sizeof(float)); - float * tempY = (float*)malloc(volume * sizeof(float)); - float * tempZ; - - if(z_dim) + float *tempX = (float *)malloc(volume * sizeof(float)); + float *tempY = (float *)malloc(volume * sizeof(float)); + float *tempZ; + + if (z_dim) { - tempZ = (float*)malloc(volume * sizeof(float)); + tempZ = (float *)malloc(volume * sizeof(float)); } long c; @@ -413,7 +289,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const for (ind = nx; ind < nx * ny * nz; ind++) { tempY[ind] = -inimageY[ind] + inimageY[ind - nx]; - } //boundaries @@ -442,7 +317,7 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const for (ind = nx * ny; ind < nx * ny * nz; ind++) { tempZ[ind] = -inimageZ[ind] + inimageZ[ind - nx * ny]; - } + } #pragma omp for for (ind = 0; ind < ny * nx; ind++) { @@ -454,7 +329,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const { outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind]; } - } else { @@ -464,7 +338,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const outimage[ind] = tempX[ind] + tempY[ind]; } } - } outimage += volume; @@ -475,10 +348,9 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const free(tempX); free(tempY); - if(z_dim) + if (z_dim) free(tempZ); - // //now the rest of the channels if (nc > 1) { @@ -500,7 +372,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const outimagefull[ind] += -inimageCfull[ind]; outimagefull[(nc - 1) * volume + ind] += inimageCfull[(nc - 2) * volume + ind]; } - } return 0; @@ -513,18 +384,18 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const //assumes nx and ny > 1 int z_dim = nz - 1; - float * outimage = outimagefull; - const float * inimageX = inimageXfull; - const float * inimageY = inimageYfull; - const float * inimageZ = inimageZfull; + float *outimage = outimagefull; + const float *inimageX = inimageXfull; + const float *inimageY = inimageYfull; + const float *inimageZ = inimageZfull; - float * tempX = (float*)malloc(volume * sizeof(float)); - float * tempY = (float*)malloc(volume * sizeof(float)); - float * tempZ; + float *tempX = (float *)malloc(volume * sizeof(float)); + float *tempY = (float *)malloc(volume * sizeof(float)); + float *tempZ; if (z_dim) { - tempZ = (float*)malloc(volume * sizeof(float)); + tempZ = (float *)malloc(volume * sizeof(float)); } long c; @@ -583,7 +454,6 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const { outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind]; } - } else { @@ -593,7 +463,6 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const outimage[ind] = tempX[ind] + tempY[ind]; } } - } outimage += volume; @@ -604,7 +473,7 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const free(tempX); free(tempY); - if(z_dim) + if (z_dim) free(tempZ); //now the rest of the channels @@ -632,9 +501,11 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const return 0; } - -DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, float *gradYfull, float *gradXfull, long nc, long nz, long ny, long nx, int boundary, int direction) +DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, float *gradYfull, float *gradXfull, long nc, long nz, long ny, long nx, int boundary, int direction, int nThreads) { + int nThreads_initial; + threads_setup(nThreads, &nThreads_initial); + if (boundary) { if (direction) @@ -643,17 +514,21 @@ DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, flo fdiff_adjoint_periodic(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc); } else - { + { if (direction) fdiff_direct_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc); else fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc); } - + + omp_set_num_threads(nThreads_initial); return 0; } -DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, float *gradXfull, long nz, long ny, long nx, int boundary, int direction) +DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, float *gradXfull, long nz, long ny, long nx, int boundary, int direction, int nThreads) { + int nThreads_initial; + threads_setup(nThreads, &nThreads_initial); + if (boundary) { if (direction) @@ -669,10 +544,14 @@ DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, flo fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1); } + omp_set_num_threads(nThreads_initial); return 0; } -DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, long ny, long nx, int boundary, int direction) +DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, long ny, long nx, int boundary, int direction, int nThreads) { + int nThreads_initial; + threads_setup(nThreads, &nThreads_initial); + if (boundary) { if (direction) @@ -688,5 +567,7 @@ DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, lon fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1); } + omp_set_num_threads(nThreads_initial); return 0; } + -- cgit v1.2.3