Allows user to set number of threads used by openMP in C library grad… (#476)

* Allows user to set number of threads used by openMP in C library gradient operator. Changed to release build flags * closes #477 * added test function for c lib thread deployment * improved thread scaling for neumann algoritims * removed unnecessary thread sync * reverts omp number of threads at the end of the c function call Co-authored-by: Edoardo Pasca <edo.paskino@gmail.com>
author: Gemma Fardell <47746591+gfardell@users.noreply.github.com> 2020-01-23 11:52:46 +0000
committer: Edoardo Pasca <edo.paskino@gmail.com> 2020-01-23 11:52:46 +0000
commit: 894f35c9be404bc2c13f90f4a6184a545029181a (patch)
tree: 17938025fe60afac7006b8fb6c4e7016bf687ccd /src
parent: fecff8ded735d309aba43d30226c0bb51386c905 (diff)
download: framework-894f35c9be404bc2c13f90f4a6184a545029181a.tar.gz
framework-894f35c9be404bc2c13f90f4a6184a545029181a.tar.bz2
framework-894f35c9be404bc2c13f90f4a6184a545029181a.tar.xz
framework-894f35c9be404bc2c13f90f4a6184a545029181a.zip
2 files changed, 82 insertions, 200 deletions
diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
index 741b985..e828fe5 100644
--- a/src/Core/CMakeLists.txt
+++ b/src/Core/CMakeLists.txt
@@ -1,5 +1,6 @@
 
 set (CMAKE_C_STANDARD 11)
+set (CMAKE_BUILD_TYPE Release)
 
 if(APPLE)
     if(NOT DEFINED OPENMP_INCLUDES OR NOT DEFINED OPENMP_LIBRARIES)
diff --git a/src/Core/FiniteDifferenceLibrary.c b/src/Core/FiniteDifferenceLibrary.c
index f5736e2..fbf2646 100644
--- a/src/Core/FiniteDifferenceLibrary.c
+++ b/src/Core/FiniteDifferenceLibrary.c
@@ -2,161 +2,66 @@
 
 #include "FiniteDifferenceLibrary.h"
 
-DLL_EXPORT int openMPtest(void)
+DLL_EXPORT int openMPtest(int nThreads)
 {
-	int nThreads;
+	omp_set_num_threads(nThreads);
+
+	int nThreads_running;
 #pragma omp parallel
 	{
 		if (omp_get_thread_num() == 0)
 		{
-			nThreads = omp_get_num_threads();
+			nThreads_running = omp_get_num_threads();
 		}
 	}
-	return nThreads;
+	return nThreads_running;
 }
-int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
+void threads_setup(int nThreads_requested, int *nThreads_current)
 {
-	size_t volume = nx * ny * nz;
-
-	const float * inimage = inimagefull;
-	float * outimageX = outimageXfull;
-	float * outimageY = outimageYfull;
-	float * outimageZ = outimageZfull;
-
-	int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
-	int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
-
-	long c;
-	for (c = 0; c < nc; c++)
-	{
 #pragma omp parallel
-		{
-			long ind, k, j, i;
-			float pix0;
-			//run over all and then fix boundaries
-#pragma omp for
-			for (ind = 0; ind < nx * ny * (nz - 1); ind++)
-			{
-				pix0 = -inimage[ind];
-
-				outimageX[ind] = pix0 + inimage[ind + 1];
-				outimageY[ind] = pix0 + inimage[ind + nx];
-				outimageZ[ind] = pix0 + inimage[ind + nx * ny];
-			}
-
-#pragma omp for
-			for (ind = 0; ind < nx * (ny - 1); ind++)
-			{
-				pix0 = -inimage[ind + offset1];
-
-				outimageX[ind + offset1] = pix0 + inimage[ind + offset1 + 1];
-				outimageY[ind + offset1] = pix0 + inimage[ind + offset1 + nx];
-			}
-
-#pragma omp for
-			for (ind = 0; ind < nx - 1; ind++)
-			{
-				pix0 = -inimage[ind + offset2];
-
-				outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1];
-			}
-
-			//boundaries
-#pragma omp for
-			for (k = 0; k < nz; k++)
-			{
-				for (i = 0; i < nx; i++)
-				{
-					outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0;
-				}
-			}
-
-#pragma omp for
-			for (k = 0; k < nz; k++)
-			{
-				for (j = 0; j < ny; j++)
-				{
-					outimageX[k * ny * nx + j * nx + nx - 1] = 0;
-				}
-			}
-
-			if (nz > 1)
-			{
-#pragma omp for
-				for (ind = 0; ind < ny * nx; ind++)
-				{
-					outimageZ[nx * ny * (nz - 1) + ind] = 0;
-				}
-			}
-		}
-
-		inimage += volume;
-		outimageX += volume;
-		outimageY += volume;
-		outimageZ += volume;
-	}
-
-
-	//now the rest of the channels
-	if (nc > 1)
 	{
-		long ind;
-
-		for (c = 0; c < nc - 1; c++)
-		{
-			float * outimageC = outimageCfull + c * volume;
-			const float * inimage = inimagefull + c * volume;
-
-#pragma omp parallel for
-			for (ind = 0; ind < volume; ind++)
-			{
-				outimageC[ind] = -inimage[ind] + inimage[ind + volume];
-			}
-		}
-
-#pragma omp parallel for
-		for (ind = 0; ind < volume; ind++)
+		if (omp_get_thread_num() == 0)
 		{
-			outimageCfull[(nc - 1) * volume + ind] = 0;
+			*nThreads_current = omp_get_num_threads();
 		}
 	}
-
-	return 0;
+	omp_set_num_threads(nThreads_requested);
 }
-int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normfull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
+
+int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
 {
 	size_t volume = nx * ny * nz;
-	int z_dim = nz - 1;
-
-	const float * inimage = inimagefull;
-	float * outimageX = outimageXfull;
-	float * outimageY = outimageYfull;
-	float * outimageZ = outimageZfull;
-	float * outimageL21norm = outimageL21normfull;
 
+	const float *inimage = inimagefull;
+	float *outimageX = outimageXfull;
+	float *outimageY = outimageYfull;
+	float *outimageZ = outimageZfull;
 
-	int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
+	int offset1 = (nz - 1) * nx * ny;	  //ind to beginning of last slice
 	int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
 
 	long c;
+	
+	int z_dim = nz > 1 ? 1: 0;
+
 	for (c = 0; c < nc; c++)
 	{
 #pragma omp parallel
 		{
-			long ind, k;
+			long ind, k, j, i;
 			float pix0;
 			//run over all and then fix boundaries
-#pragma omp for
+
+#pragma omp for nowait
 			for (ind = 0; ind < nx * ny * (nz - 1); ind++)
 			{
 				pix0 = -inimage[ind];
-
 				outimageX[ind] = pix0 + inimage[ind + 1];
 				outimageY[ind] = pix0 + inimage[ind + nx];
 				outimageZ[ind] = pix0 + inimage[ind + nx * ny];
 			}
 
-#pragma omp for
+#pragma omp for nowait
 			for (ind = 0; ind < nx * (ny - 1); ind++)
 			{
 				pix0 = -inimage[ind + offset1];
@@ -169,29 +74,26 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful
 			for (ind = 0; ind < nx - 1; ind++)
 			{
 				pix0 = -inimage[ind + offset2];
-
 				outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1];
 			}
 
 			//boundaries
-#pragma omp for
+#pragma omp for nowait
 			for (k = 0; k < nz; k++)
 			{
-				for (int i = 0; i < nx; i++)
+				for (i = 0; i < nx; i++)
 				{
 					outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0;
 				}
 			}
-
-#pragma omp for
+#pragma omp for nowait
 			for (k = 0; k < nz; k++)
 			{
-				for (int j = 0; j < ny; j++)
+				for (j = 0; j < ny; j++)
 				{
 					outimageX[k * ny * nx + j * nx + nx - 1] = 0;
 				}
 			}
-
 			if (z_dim)
 			{
 #pragma omp for
@@ -199,34 +101,15 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful
 				{
 					outimageZ[nx * ny * (nz - 1) + ind] = 0;
 				}
-
-#pragma omp for
-				for (ind = 0; ind < volume; ind++)
-				{
-					outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind] + outimageZ[ind] * outimageZ[ind];
-				}
-
-			}
-			else
-			{
-
-#pragma omp for
-				for (ind = 0; ind < volume; ind++)
-				{
-					outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind];
-				}
 			}
 		}
 
-	
 		inimage += volume;
 		outimageX += volume;
 		outimageY += volume;
 		outimageZ += volume;
-		outimageL21norm += volume;
 	}
 
-
 	//now the rest of the channels
 	if (nc > 1)
 	{
@@ -234,18 +117,13 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful
 
 		for (c = 0; c < nc - 1; c++)
 		{
-			float * outimageC = outimageCfull + c * volume;
-			float * outimageL21norm = outimageL21normfull + c * volume;
-			const float * inimage = inimagefull + c * volume;
+			float *outimageC = outimageCfull + c * volume;
+			const float *inimage = inimagefull + c * volume;
 
 #pragma omp parallel for
 			for (ind = 0; ind < volume; ind++)
 			{
 				outimageC[ind] = -inimage[ind] + inimage[ind + volume];
-				outimageL21norm[ind] += outimageC[ind] * outimageC[ind];
-
-				//sqrt
-				outimageL21norm[ind] = (float)sqrt(outimageL21norm[ind]);
 			}
 		}
 
@@ -262,15 +140,14 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 {
 	size_t volume = nx * ny * nz;
 
-	const float * inimage = inimagefull;
-	float * outimageX = outimageXfull;
-	float * outimageY = outimageYfull;
-	float * outimageZ = outimageZfull;
+	const float *inimage = inimagefull;
+	float *outimageX = outimageXfull;
+	float *outimageY = outimageYfull;
+	float *outimageZ = outimageZfull;
 
-	int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
+	int offset1 = (nz - 1) * nx * ny;	  //ind to beginning of last slice
 	int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
 
-
 	long c;
 	for (c = 0; c < nc; c++)
 	{
@@ -280,7 +157,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 			long ind, k;
 			float pix0;
 			//run over all and then fix boundaries
-#pragma omp for
+#pragma omp for nowait
 			for (ind = 0; ind < nx * ny * (nz - 1); ind++)
 			{
 				pix0 = -inimage[ind];
@@ -290,7 +167,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 				outimageZ[ind] = pix0 + inimage[ind + nx * ny];
 			}
 
-#pragma omp for
+#pragma omp for nowait
 			for (ind = 0; ind < nx * (ny - 1); ind++)
 			{
 				pix0 = -inimage[ind + offset1];
@@ -308,7 +185,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 			}
 
 			//boundaries
-#pragma omp for
+#pragma omp for nowait
 			for (k = 0; k < nz; k++)
 			{
 				for (int i = 0; i < nx; i++)
@@ -320,7 +197,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 				}
 			}
 
-#pragma omp for
+#pragma omp for nowait
 			for (k = 0; k < nz; k++)
 			{
 				for (int j = 0; j < ny; j++)
@@ -332,10 +209,9 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 				}
 			}
 
-
 			if (nz > 1)
 			{
-#pragma omp for
+#pragma omp for nowait
 				for (ind = 0; ind < ny * nx; ind++)
 				{
 					outimageZ[nx * ny * (nz - 1) + ind] = -inimage[nx * ny * (nz - 1) + ind] + inimage[ind];
@@ -356,8 +232,8 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 
 		for (c = 0; c < nc - 1; c++)
 		{
-			float * outimageC = outimageCfull + c * volume;
-			const float * inimage = inimagefull + c * volume;
+			float *outimageC = outimageCfull + c * volume;
+			const float *inimage = inimagefull + c * volume;
 
 #pragma omp parallel for
 			for (ind = 0; ind < volume; ind++)
@@ -383,18 +259,18 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 	//assumes nx and ny > 1
 	int z_dim = nz - 1;
 
-	float * outimage = outimagefull;
-	const float * inimageX = inimageXfull;
-	const float * inimageY = inimageYfull;
-	const float * inimageZ = inimageZfull;
+	float *outimage = outimagefull;
+	const float *inimageX = inimageXfull;
+	const float *inimageY = inimageYfull;
+	const float *inimageZ = inimageZfull;
 
-	float * tempX = (float*)malloc(volume * sizeof(float));
-	float * tempY = (float*)malloc(volume * sizeof(float));
-	float * tempZ;
-	
-	if(z_dim)
+	float *tempX = (float *)malloc(volume * sizeof(float));
+	float *tempY = (float *)malloc(volume * sizeof(float));
+	float *tempZ;
+
+	if (z_dim)
 	{
-		tempZ = (float*)malloc(volume * sizeof(float));
+		tempZ = (float *)malloc(volume * sizeof(float));
 	}
 
 	long c;
@@ -413,7 +289,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 			for (ind = nx; ind < nx * ny * nz; ind++)
 			{
 				tempY[ind] = -inimageY[ind] + inimageY[ind - nx];
-
 			}
 
 			//boundaries
@@ -442,7 +317,7 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 				for (ind = nx * ny; ind < nx * ny * nz; ind++)
 				{
 					tempZ[ind] = -inimageZ[ind] + inimageZ[ind - nx * ny];
-				}				
+				}
 #pragma omp for
 				for (ind = 0; ind < ny * nx; ind++)
 				{
@@ -454,7 +329,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 				{
 					outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind];
 				}
-
 			}
 			else
 			{
@@ -464,7 +338,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 					outimage[ind] = tempX[ind] + tempY[ind];
 				}
 			}
-
 		}
 
 		outimage += volume;
@@ -475,10 +348,9 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 	free(tempX);
 	free(tempY);
 
-	if(z_dim)
+	if (z_dim)
 		free(tempZ);
 
-
 	//	//now the rest of the channels
 	if (nc > 1)
 	{
@@ -500,7 +372,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 			outimagefull[ind] += -inimageCfull[ind];
 			outimagefull[(nc - 1) * volume + ind] += inimageCfull[(nc - 2) * volume + ind];
 		}
-
 	}
 
 	return 0;
@@ -513,18 +384,18 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const
 	//assumes nx and ny > 1
 	int z_dim = nz - 1;
 
-	float * outimage = outimagefull;
-	const float * inimageX = inimageXfull;
-	const float * inimageY = inimageYfull;
-	const float * inimageZ = inimageZfull;
+	float *outimage = outimagefull;
+	const float *inimageX = inimageXfull;
+	const float *inimageY = inimageYfull;
+	const float *inimageZ = inimageZfull;
 
-	float * tempX = (float*)malloc(volume * sizeof(float));
-	float * tempY = (float*)malloc(volume * sizeof(float));
-	float * tempZ;
+	float *tempX = (float *)malloc(volume * sizeof(float));
+	float *tempY = (float *)malloc(volume * sizeof(float));
+	float *tempZ;
 
 	if (z_dim)
 	{
-		tempZ = (float*)malloc(volume * sizeof(float));
+		tempZ = (float *)malloc(volume * sizeof(float));
 	}
 
 	long c;
@@ -583,7 +454,6 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const
 				{
 					outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind];
 				}
-
 			}
 			else
 			{
@@ -593,7 +463,6 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const
 					outimage[ind] = tempX[ind] + tempY[ind];
 				}
 			}
-
 		}
 
 		outimage += volume;
@@ -604,7 +473,7 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const
 	free(tempX);
 	free(tempY);
 
-	if(z_dim)
+	if (z_dim)
 		free(tempZ);
 
 	//now the rest of the channels
@@ -632,9 +501,11 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const
 	return 0;
 }
 
-
-DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, float *gradYfull, float *gradXfull, long nc, long nz, long ny, long nx, int boundary, int direction)
+DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, float *gradYfull, float *gradXfull, long nc, long nz, long ny, long nx, int boundary, int direction, int nThreads)
 {
+	int nThreads_initial;
+	threads_setup(nThreads, &nThreads_initial);
+
 	if (boundary)
 	{
 		if (direction)
@@ -643,17 +514,21 @@ DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, flo
 			fdiff_adjoint_periodic(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
 	}
 	else
-		{
+	{
 		if (direction)
 			fdiff_direct_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
 		else
 			fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
 	}
-	
+
+	omp_set_num_threads(nThreads_initial);
 	return 0;
 }
-DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, float *gradXfull, long nz, long ny, long nx, int boundary, int direction)
+DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, float *gradXfull, long nz, long ny, long nx, int boundary, int direction, int nThreads)
 {
+	int nThreads_initial;
+	threads_setup(nThreads, &nThreads_initial);
+
 	if (boundary)
 	{
 		if (direction)
@@ -669,10 +544,14 @@ DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, flo
 			fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1);
 	}
 
+	omp_set_num_threads(nThreads_initial);
 	return 0;
 }
-DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, long ny, long nx, int boundary, int direction)
+DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, long ny, long nx, int boundary, int direction, int nThreads)
 {
+	int nThreads_initial;
+	threads_setup(nThreads, &nThreads_initial);
+
 	if (boundary)
 	{
 		if (direction)
@@ -688,5 +567,7 @@ DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, lon
 			fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1);
 	}
 
+	omp_set_num_threads(nThreads_initial);
 	return 0;
 }
+
author	Gemma Fardell <47746591+gfardell@users.noreply.github.com>	2020-01-23 11:52:46 +0000
committer	Edoardo Pasca <edo.paskino@gmail.com>	2020-01-23 11:52:46 +0000
commit	894f35c9be404bc2c13f90f4a6184a545029181a (patch)
tree	17938025fe60afac7006b8fb6c4e7016bf687ccd /src
parent	fecff8ded735d309aba43d30226c0bb51386c905 (diff)
download	framework-894f35c9be404bc2c13f90f4a6184a545029181a.tar.gz framework-894f35c9be404bc2c13f90f4a6184a545029181a.tar.bz2 framework-894f35c9be404bc2c13f90f4a6184a545029181a.tar.xz framework-894f35c9be404bc2c13f90f4a6184a545029181a.zip