From 894f35c9be404bc2c13f90f4a6184a545029181a Mon Sep 17 00:00:00 2001 From: Gemma Fardell <47746591+gfardell@users.noreply.github.com> Date: Thu, 23 Jan 2020 11:52:46 +0000 Subject: =?UTF-8?q?Allows=20user=20to=20set=20number=20of=20threads=20used?= =?UTF-8?q?=20by=20openMP=20in=20C=20library=20grad=E2=80=A6=20(#476)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Allows user to set number of threads used by openMP in C library gradient operator. Changed to release build flags * closes #477 * added test function for c lib thread deployment * improved thread scaling for neumann algoritims * removed unnecessary thread sync * reverts omp number of threads at the end of the c function call Co-authored-by: Edoardo Pasca --- Wrappers/Python/CMakeLists.txt | 2 + .../optimisation/operators/GradientOperator.py | 41 ++- src/Core/CMakeLists.txt | 1 + src/Core/FiniteDifferenceLibrary.c | 281 ++++++--------------- 4 files changed, 111 insertions(+), 214 deletions(-) diff --git a/Wrappers/Python/CMakeLists.txt b/Wrappers/Python/CMakeLists.txt index 9104afd..0c24540 100644 --- a/Wrappers/Python/CMakeLists.txt +++ b/Wrappers/Python/CMakeLists.txt @@ -76,7 +76,9 @@ if (BUILD_PYTHON_WRAPPER) ) endif() #set (PYTHON_DEST ${CMAKE_INSTALL_PREFIX}/python/) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build/lib/ccpi + DESTINATION ${PYTHON_DEST}) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/data/ DESTINATION ${CMAKE_INSTALL_PREFIX}/share/ccpi) #file(TOUCH ${PYTHON_DEST}/edo/__init__.py) diff --git a/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py b/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py index 6391cf7..a45c3d2 100644 --- a/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py +++ b/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py @@ -25,6 +25,11 @@ from ccpi.framework import ImageData, ImageGeometry, BlockGeometry, BlockDataCon import numpy import warnings +#default nThreads +import multiprocessing +cpus = multiprocessing.cpu_count() +NUM_THREADS = max(int(cpus/2),1) + NEUMANN = 'Neumann' PERIODIC = 'Periodic' C = 'c' @@ -51,10 +56,11 @@ class Gradient(LinearOperator): 'Space' or 'SpaceChannels', defaults to 'Space' * *backend* (``str``) -- 'c' or 'numpy', defaults to 'c' if correlation is 'SpaceChannels' or channels = 1 - - + * *num_threads* (``int``) -- + If backend is 'c' specify the number of threads to use. Default is number of cpus/2 + + Example (2D): - .. math:: \nabla : X -> Y \\ @@ -85,7 +91,7 @@ class Gradient(LinearOperator): if backend == NUMPY: self.operator = Gradient_numpy(gm_domain, bnd_cond=bnd_cond, **kwargs) else: - self.operator = Gradient_C(gm_domain, bnd_cond=bnd_cond) + self.operator = Gradient_C(gm_domain, bnd_cond=bnd_cond, **kwargs) def direct(self, x, out=None): @@ -183,7 +189,7 @@ class Gradient_numpy(LinearOperator): # Call FiniteDiff operator self.FD = FiniteDiff(self.gm_domain, direction = 0, bnd_cond = self.bnd_cond) - + print("Initialised GradientOperator with numpy backend") def direct(self, x, out=None): @@ -275,7 +281,8 @@ class Gradient_numpy(LinearOperator): spMat = SparseFiniteDiff(self.gm_domain, direction=self.ind[i], bnd_cond=self.bnd_cond) res.append(spMat.sum_abs_col()) return BlockDataContainer(*res) - + + import ctypes, platform # check for the extension @@ -288,14 +295,13 @@ elif platform.system() == 'Darwin': else: raise ValueError('Not supported platform, ', platform.system()) -#print ("dll location", dll) -cilacc = ctypes.cdll.LoadLibrary(dll) -#FD = ctypes.CDLL(dll) +cilacc = ctypes.cdll.LoadLibrary(dll) c_float_p = ctypes.POINTER(ctypes.c_float) cilacc.openMPtest.restypes = ctypes.c_int32 +cilacc.openMPtest.argtypes = [ctypes.c_int32] cilacc.fdiff4D.argtypes = [ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_float), @@ -307,6 +313,7 @@ cilacc.fdiff4D.argtypes = [ctypes.POINTER(ctypes.c_float), ctypes.c_long, ctypes.c_long, ctypes.c_int32, + ctypes.c_int32, ctypes.c_int32] cilacc.fdiff3D.argtypes = [ctypes.POINTER(ctypes.c_float), @@ -317,6 +324,7 @@ cilacc.fdiff3D.argtypes = [ctypes.POINTER(ctypes.c_float), ctypes.c_long, ctypes.c_long, ctypes.c_int32, + ctypes.c_int32, ctypes.c_int32] cilacc.fdiff2D.argtypes = [ctypes.POINTER(ctypes.c_float), @@ -325,6 +333,7 @@ cilacc.fdiff2D.argtypes = [ctypes.POINTER(ctypes.c_float), ctypes.c_long, ctypes.c_long, ctypes.c_int32, + ctypes.c_int32, ctypes.c_int32] @@ -336,10 +345,12 @@ class Gradient_C(LinearOperator): on 2D, 3D, 4D ImageData under Neumann/Periodic boundary conditions''' - def __init__(self, gm_domain, gm_range=None, bnd_cond = NEUMANN): + def __init__(self, gm_domain, gm_range=None, bnd_cond = NEUMANN, **kwargs): super(Gradient_C, self).__init__() + self.num_threads = kwargs.get('num_threads',NUM_THREADS) + self.gm_domain = gm_domain self.gm_range = gm_range @@ -362,8 +373,9 @@ class Gradient_C(LinearOperator): self.fd = cilacc.fdiff2D else: raise ValueError('Number of dimensions not supported, expected 2, 3 or 4, got {}'.format(len(gm_domain.shape))) - - + #self.num_threads + print("Initialised GradientOperator with C backend running with ", cilacc.openMPtest(self.num_threads)," threads") + @staticmethod def datacontainer_as_c_pointer(x): ndx = x.as_array() @@ -377,9 +389,10 @@ class Gradient_C(LinearOperator): out = self.gm_range.allocate(None) return_val = True + #pass list of all arguments arg1 = [Gradient_C.datacontainer_as_c_pointer(out.get_item(i))[1] for i in range(self.gm_range.shape[0])] arg2 = [el for el in x.shape] - args = arg1 + arg2 + [self.bnd_cond, 1] + args = arg1 + arg2 + [self.bnd_cond, 1, self.num_threads] self.fd(x_p, *args) if return_val is True: @@ -397,7 +410,7 @@ class Gradient_C(LinearOperator): arg1 = [Gradient_C.datacontainer_as_c_pointer(x.get_item(i))[1] for i in range(self.gm_range.shape[0])] arg2 = [el for el in out.shape] - args = arg1 + arg2 + [self.bnd_cond, 0] + args = arg1 + arg2 + [self.bnd_cond, 0, self.num_threads] self.fd(out_p, *args) diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 741b985..e828fe5 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -1,5 +1,6 @@ set (CMAKE_C_STANDARD 11) +set (CMAKE_BUILD_TYPE Release) if(APPLE) if(NOT DEFINED OPENMP_INCLUDES OR NOT DEFINED OPENMP_LIBRARIES) diff --git a/src/Core/FiniteDifferenceLibrary.c b/src/Core/FiniteDifferenceLibrary.c index f5736e2..fbf2646 100644 --- a/src/Core/FiniteDifferenceLibrary.c +++ b/src/Core/FiniteDifferenceLibrary.c @@ -2,161 +2,66 @@ #include "FiniteDifferenceLibrary.h" -DLL_EXPORT int openMPtest(void) +DLL_EXPORT int openMPtest(int nThreads) { - int nThreads; + omp_set_num_threads(nThreads); + + int nThreads_running; #pragma omp parallel { if (omp_get_thread_num() == 0) { - nThreads = omp_get_num_threads(); + nThreads_running = omp_get_num_threads(); } } - return nThreads; + return nThreads_running; } -int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc) +void threads_setup(int nThreads_requested, int *nThreads_current) { - size_t volume = nx * ny * nz; - - const float * inimage = inimagefull; - float * outimageX = outimageXfull; - float * outimageY = outimageYfull; - float * outimageZ = outimageZfull; - - int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice - int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row - - long c; - for (c = 0; c < nc; c++) - { #pragma omp parallel - { - long ind, k, j, i; - float pix0; - //run over all and then fix boundaries -#pragma omp for - for (ind = 0; ind < nx * ny * (nz - 1); ind++) - { - pix0 = -inimage[ind]; - - outimageX[ind] = pix0 + inimage[ind + 1]; - outimageY[ind] = pix0 + inimage[ind + nx]; - outimageZ[ind] = pix0 + inimage[ind + nx * ny]; - } - -#pragma omp for - for (ind = 0; ind < nx * (ny - 1); ind++) - { - pix0 = -inimage[ind + offset1]; - - outimageX[ind + offset1] = pix0 + inimage[ind + offset1 + 1]; - outimageY[ind + offset1] = pix0 + inimage[ind + offset1 + nx]; - } - -#pragma omp for - for (ind = 0; ind < nx - 1; ind++) - { - pix0 = -inimage[ind + offset2]; - - outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1]; - } - - //boundaries -#pragma omp for - for (k = 0; k < nz; k++) - { - for (i = 0; i < nx; i++) - { - outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0; - } - } - -#pragma omp for - for (k = 0; k < nz; k++) - { - for (j = 0; j < ny; j++) - { - outimageX[k * ny * nx + j * nx + nx - 1] = 0; - } - } - - if (nz > 1) - { -#pragma omp for - for (ind = 0; ind < ny * nx; ind++) - { - outimageZ[nx * ny * (nz - 1) + ind] = 0; - } - } - } - - inimage += volume; - outimageX += volume; - outimageY += volume; - outimageZ += volume; - } - - - //now the rest of the channels - if (nc > 1) { - long ind; - - for (c = 0; c < nc - 1; c++) - { - float * outimageC = outimageCfull + c * volume; - const float * inimage = inimagefull + c * volume; - -#pragma omp parallel for - for (ind = 0; ind < volume; ind++) - { - outimageC[ind] = -inimage[ind] + inimage[ind + volume]; - } - } - -#pragma omp parallel for - for (ind = 0; ind < volume; ind++) + if (omp_get_thread_num() == 0) { - outimageCfull[(nc - 1) * volume + ind] = 0; + *nThreads_current = omp_get_num_threads(); } } - - return 0; + omp_set_num_threads(nThreads_requested); } -int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normfull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc) + +int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc) { size_t volume = nx * ny * nz; - int z_dim = nz - 1; - - const float * inimage = inimagefull; - float * outimageX = outimageXfull; - float * outimageY = outimageYfull; - float * outimageZ = outimageZfull; - float * outimageL21norm = outimageL21normfull; + const float *inimage = inimagefull; + float *outimageX = outimageXfull; + float *outimageY = outimageYfull; + float *outimageZ = outimageZfull; - int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice + int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row long c; + + int z_dim = nz > 1 ? 1: 0; + for (c = 0; c < nc; c++) { #pragma omp parallel { - long ind, k; + long ind, k, j, i; float pix0; //run over all and then fix boundaries -#pragma omp for + +#pragma omp for nowait for (ind = 0; ind < nx * ny * (nz - 1); ind++) { pix0 = -inimage[ind]; - outimageX[ind] = pix0 + inimage[ind + 1]; outimageY[ind] = pix0 + inimage[ind + nx]; outimageZ[ind] = pix0 + inimage[ind + nx * ny]; } -#pragma omp for +#pragma omp for nowait for (ind = 0; ind < nx * (ny - 1); ind++) { pix0 = -inimage[ind + offset1]; @@ -169,29 +74,26 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful for (ind = 0; ind < nx - 1; ind++) { pix0 = -inimage[ind + offset2]; - outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1]; } //boundaries -#pragma omp for +#pragma omp for nowait for (k = 0; k < nz; k++) { - for (int i = 0; i < nx; i++) + for (i = 0; i < nx; i++) { outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0; } } - -#pragma omp for +#pragma omp for nowait for (k = 0; k < nz; k++) { - for (int j = 0; j < ny; j++) + for (j = 0; j < ny; j++) { outimageX[k * ny * nx + j * nx + nx - 1] = 0; } } - if (z_dim) { #pragma omp for @@ -199,34 +101,15 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful { outimageZ[nx * ny * (nz - 1) + ind] = 0; } - -#pragma omp for - for (ind = 0; ind < volume; ind++) - { - outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind] + outimageZ[ind] * outimageZ[ind]; - } - - } - else - { - -#pragma omp for - for (ind = 0; ind < volume; ind++) - { - outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind]; - } } } - inimage += volume; outimageX += volume; outimageY += volume; outimageZ += volume; - outimageL21norm += volume; } - //now the rest of the channels if (nc > 1) { @@ -234,18 +117,13 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful for (c = 0; c < nc - 1; c++) { - float * outimageC = outimageCfull + c * volume; - float * outimageL21norm = outimageL21normfull + c * volume; - const float * inimage = inimagefull + c * volume; + float *outimageC = outimageCfull + c * volume; + const float *inimage = inimagefull + c * volume; #pragma omp parallel for for (ind = 0; ind < volume; ind++) { outimageC[ind] = -inimage[ind] + inimage[ind + volume]; - outimageL21norm[ind] += outimageC[ind] * outimageC[ind]; - - //sqrt - outimageL21norm[ind] = (float)sqrt(outimageL21norm[ind]); } } @@ -262,15 +140,14 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float { size_t volume = nx * ny * nz; - const float * inimage = inimagefull; - float * outimageX = outimageXfull; - float * outimageY = outimageYfull; - float * outimageZ = outimageZfull; + const float *inimage = inimagefull; + float *outimageX = outimageXfull; + float *outimageY = outimageYfull; + float *outimageZ = outimageZfull; - int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice + int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row - long c; for (c = 0; c < nc; c++) { @@ -280,7 +157,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float long ind, k; float pix0; //run over all and then fix boundaries -#pragma omp for +#pragma omp for nowait for (ind = 0; ind < nx * ny * (nz - 1); ind++) { pix0 = -inimage[ind]; @@ -290,7 +167,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float outimageZ[ind] = pix0 + inimage[ind + nx * ny]; } -#pragma omp for +#pragma omp for nowait for (ind = 0; ind < nx * (ny - 1); ind++) { pix0 = -inimage[ind + offset1]; @@ -308,7 +185,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float } //boundaries -#pragma omp for +#pragma omp for nowait for (k = 0; k < nz; k++) { for (int i = 0; i < nx; i++) @@ -320,7 +197,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float } } -#pragma omp for +#pragma omp for nowait for (k = 0; k < nz; k++) { for (int j = 0; j < ny; j++) @@ -332,10 +209,9 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float } } - if (nz > 1) { -#pragma omp for +#pragma omp for nowait for (ind = 0; ind < ny * nx; ind++) { outimageZ[nx * ny * (nz - 1) + ind] = -inimage[nx * ny * (nz - 1) + ind] + inimage[ind]; @@ -356,8 +232,8 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float for (c = 0; c < nc - 1; c++) { - float * outimageC = outimageCfull + c * volume; - const float * inimage = inimagefull + c * volume; + float *outimageC = outimageCfull + c * volume; + const float *inimage = inimagefull + c * volume; #pragma omp parallel for for (ind = 0; ind < volume; ind++) @@ -383,18 +259,18 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const //assumes nx and ny > 1 int z_dim = nz - 1; - float * outimage = outimagefull; - const float * inimageX = inimageXfull; - const float * inimageY = inimageYfull; - const float * inimageZ = inimageZfull; + float *outimage = outimagefull; + const float *inimageX = inimageXfull; + const float *inimageY = inimageYfull; + const float *inimageZ = inimageZfull; - float * tempX = (float*)malloc(volume * sizeof(float)); - float * tempY = (float*)malloc(volume * sizeof(float)); - float * tempZ; - - if(z_dim) + float *tempX = (float *)malloc(volume * sizeof(float)); + float *tempY = (float *)malloc(volume * sizeof(float)); + float *tempZ; + + if (z_dim) { - tempZ = (float*)malloc(volume * sizeof(float)); + tempZ = (float *)malloc(volume * sizeof(float)); } long c; @@ -413,7 +289,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const for (ind = nx; ind < nx * ny * nz; ind++) { tempY[ind] = -inimageY[ind] + inimageY[ind - nx]; - } //boundaries @@ -442,7 +317,7 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const for (ind = nx * ny; ind < nx * ny * nz; ind++) { tempZ[ind] = -inimageZ[ind] + inimageZ[ind - nx * ny]; - } + } #pragma omp for for (ind = 0; ind < ny * nx; ind++) { @@ -454,7 +329,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const { outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind]; } - } else { @@ -464,7 +338,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const outimage[ind] = tempX[ind] + tempY[ind]; } } - } outimage += volume; @@ -475,10 +348,9 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const free(tempX); free(tempY); - if(z_dim) + if (z_dim) free(tempZ); - // //now the rest of the channels if (nc > 1) { @@ -500,7 +372,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const outimagefull[ind] += -inimageCfull[ind]; outimagefull[(nc - 1) * volume + ind] += inimageCfull[(nc - 2) * volume + ind]; } - } return 0; @@ -513,18 +384,18 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const //assumes nx and ny > 1 int z_dim = nz - 1; - float * outimage = outimagefull; - const float * inimageX = inimageXfull; - const float * inimageY = inimageYfull; - const float * inimageZ = inimageZfull; + float *outimage = outimagefull; + const float *inimageX = inimageXfull; + const float *inimageY = inimageYfull; + const float *inimageZ = inimageZfull; - float * tempX = (float*)malloc(volume * sizeof(float)); - float * tempY = (float*)malloc(volume * sizeof(float)); - float * tempZ; + float *tempX = (float *)malloc(volume * sizeof(float)); + float *tempY = (float *)malloc(volume * sizeof(float)); + float *tempZ; if (z_dim) { - tempZ = (float*)malloc(volume * sizeof(float)); + tempZ = (float *)malloc(volume * sizeof(float)); } long c; @@ -583,7 +454,6 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const { outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind]; } - } else { @@ -593,7 +463,6 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const outimage[ind] = tempX[ind] + tempY[ind]; } } - } outimage += volume; @@ -604,7 +473,7 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const free(tempX); free(tempY); - if(z_dim) + if (z_dim) free(tempZ); //now the rest of the channels @@ -632,9 +501,11 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const return 0; } - -DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, float *gradYfull, float *gradXfull, long nc, long nz, long ny, long nx, int boundary, int direction) +DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, float *gradYfull, float *gradXfull, long nc, long nz, long ny, long nx, int boundary, int direction, int nThreads) { + int nThreads_initial; + threads_setup(nThreads, &nThreads_initial); + if (boundary) { if (direction) @@ -643,17 +514,21 @@ DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, flo fdiff_adjoint_periodic(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc); } else - { + { if (direction) fdiff_direct_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc); else fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc); } - + + omp_set_num_threads(nThreads_initial); return 0; } -DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, float *gradXfull, long nz, long ny, long nx, int boundary, int direction) +DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, float *gradXfull, long nz, long ny, long nx, int boundary, int direction, int nThreads) { + int nThreads_initial; + threads_setup(nThreads, &nThreads_initial); + if (boundary) { if (direction) @@ -669,10 +544,14 @@ DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, flo fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1); } + omp_set_num_threads(nThreads_initial); return 0; } -DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, long ny, long nx, int boundary, int direction) +DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, long ny, long nx, int boundary, int direction, int nThreads) { + int nThreads_initial; + threads_setup(nThreads, &nThreads_initial); + if (boundary) { if (direction) @@ -688,5 +567,7 @@ DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, lon fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1); } + omp_set_num_threads(nThreads_initial); return 0; } + -- cgit v1.2.3