6 files changed, 968 insertions, 0 deletions
diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
new file mode 100644
index 0000000..a527289
--- /dev/null
+++ b/src/Core/CMakeLists.txt
@@ -0,0 +1,124 @@
+
+set (CMAKE_C_STANDARD 11)
+
+if(APPLE)
+    if(NOT DEFINED OPENMP_INCLUDES OR NOT DEFINED OPENMP_LIBRARIES)
+        set(OPENMP_INCLUDES "OPENMP_INCLUDES-NOTFOUND" CACHE PATH "Need to set OpenMP include dir on OSX")
+        set(OPENMP_LIBRARIES "OPENMP_LIBRARIES-NOTFOUND" CACHE PATH "Need to set OpenMP lib dir on OSX")
+    endif()
+    if(NOT EXISTS ${OPENMP_INCLUDES} OR NOT EXISTS ${OPENMP_LIBRARIES})
+        message(FATAL_ERROR "Need to set OPENMP_INCLUDES and OPENMP_LIBRARIES on OSX.")
+    endif()
+    if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+        set(OpenMP_C "${CMAKE_C_COMPILER}")
+        set(OpenMP_C_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
+        set(OpenMP_C_LIB_NAMES "libomp" "libgomp" "libiomp5")
+        set(OpenMP_libomp_LIBRARY ${OpenMP_C_LIB_NAMES})
+        set(OpenMP_libgomp_LIBRARY ${OpenMP_C_LIB_NAMES})
+        set(OpenMP_libiomp5_LIBRARY ${OpenMP_C_LIB_NAMES})
+    endif()
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
+        set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
+        set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
+        set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+        set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+        set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+    endif()
+endif()
+
+
+find_package(OpenMP REQUIRED)
+
+add_definitions(${OpenMP_CXX_FLAGS})
+add_definitions(${OpenMP_C_FLAGS})
+  
+  if (APPLE)
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+    set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
+    set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS}")   
+    include_directories("${OPENMP_INCLUDES}")
+    link_directories("${OPENMP_LIBRARIES}")
+  else()
+    if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.9.0") 
+      set (OpenMP_EXE_LINKER_FLAGS OpenMP::OpenMP_CXX)
+      set (OpenMP_EXE_LINKER_FLAGS OpenMP::OpenMP_C)
+    else()
+      message(WARNING "Your CMake version is old. OpenMP linking flags  might be incorrect.")
+      # need to explicitly set this. Definitely for gcc, hopefully also for other systems.
+      # See https://gitlab.kitware.com/cmake/cmake/issues/15392
+      set (OpenMP_EXE_LINKER_FLAGS ${OpenMP_CXX_FLAGS})
+      set (OpenMP_EXE_LINKER_FLAGS ${OpenMP_C_FLAGS})
+    endif()
+  endif()
+
+
+#if (OPENMP_FOUND)
+#    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+#    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+#    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+#   set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+#   set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+#   if (UNIX)
+#    set (FLAGS "-O3 -funsigned-char -Wall  -Wl,--no-undefined -march=native")  
+#    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+#    set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+#  
+#    set (EXTRA_LIBRARIES 
+#      "gomp"
+#      "m"
+#      )
+#   endif()
+# endif()
+
+if (WIN32)
+	set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ddll_EXPORTS")
+endif()
+
+
+message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")
+message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS}")
+message("CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}")
+message("CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}")
+message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}")
+
+
+
+add_library(cilacc SHARED ${CMAKE_CURRENT_SOURCE_DIR}/axpby.c
+                          ${CMAKE_CURRENT_SOURCE_DIR}/FiniteDifferenceLibrary.c )
+
+target_link_libraries(cilacc ${OpenMP_C_LIB_NAMES} )
+include_directories(cilacc PUBLIC 
+				  ${CMAKE_CURRENT_SOURCE_DIR}/include
+		          )
+
+## Install
+#include(GNUInstallDirs)
+#install(TARGETS cilacc
+#	RUNTIME DESTINATION bin
+#    LIBRARY DESTINATION lib
+#    ARCHIVE DESTINATION lib
+#	CONFIGURATIONS ${CMAKE_BUILD_TYPE} 
+#	)
+
+if (UNIX)
+message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
+install(TARGETS cilacc
+	LIBRARY DESTINATION lib
+	CONFIGURATIONS ${CMAKE_BUILD_TYPE} 
+	)
+elseif(WIN32)
+message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin")
+  install(TARGETS cilacc 
+	RUNTIME DESTINATION bin
+	ARCHIVE DESTINATION lib
+	CONFIGURATIONS ${CMAKE_BUILD_TYPE} 
+	)
+endif()
+
+install(DIRECTORY ${PROJECT_SOURCE_DIR}/src/Core/include/
+        DESTINATION ${CMAKE_INSTALL_PREFIX}/include/cil) 
+
+
diff --git a/src/Core/FiniteDifferenceLibrary.c b/src/Core/FiniteDifferenceLibrary.c
new file mode 100644
index 0000000..f5736e2
--- /dev/null
+++ b/src/Core/FiniteDifferenceLibrary.c
@@ -0,0 +1,692 @@
+#define dll_EXPORTS = 1
+
+#include "FiniteDifferenceLibrary.h"
+
+DLL_EXPORT int openMPtest(void)
+{
+	int nThreads;
+#pragma omp parallel
+	{
+		if (omp_get_thread_num() == 0)
+		{
+			nThreads = omp_get_num_threads();
+		}
+	}
+	return nThreads;
+}
+int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
+{
+	size_t volume = nx * ny * nz;
+
+	const float * inimage = inimagefull;
+	float * outimageX = outimageXfull;
+	float * outimageY = outimageYfull;
+	float * outimageZ = outimageZfull;
+
+	int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
+	int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
+
+	long c;
+	for (c = 0; c < nc; c++)
+	{
+#pragma omp parallel
+		{
+			long ind, k, j, i;
+			float pix0;
+			//run over all and then fix boundaries
+#pragma omp for
+			for (ind = 0; ind < nx * ny * (nz - 1); ind++)
+			{
+				pix0 = -inimage[ind];
+
+				outimageX[ind] = pix0 + inimage[ind + 1];
+				outimageY[ind] = pix0 + inimage[ind + nx];
+				outimageZ[ind] = pix0 + inimage[ind + nx * ny];
+			}
+
+#pragma omp for
+			for (ind = 0; ind < nx * (ny - 1); ind++)
+			{
+				pix0 = -inimage[ind + offset1];
+
+				outimageX[ind + offset1] = pix0 + inimage[ind + offset1 + 1];
+				outimageY[ind + offset1] = pix0 + inimage[ind + offset1 + nx];
+			}
+
+#pragma omp for
+			for (ind = 0; ind < nx - 1; ind++)
+			{
+				pix0 = -inimage[ind + offset2];
+
+				outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1];
+			}
+
+			//boundaries
+#pragma omp for
+			for (k = 0; k < nz; k++)
+			{
+				for (i = 0; i < nx; i++)
+				{
+					outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0;
+				}
+			}
+
+#pragma omp for
+			for (k = 0; k < nz; k++)
+			{
+				for (j = 0; j < ny; j++)
+				{
+					outimageX[k * ny * nx + j * nx + nx - 1] = 0;
+				}
+			}
+
+			if (nz > 1)
+			{
+#pragma omp for
+				for (ind = 0; ind < ny * nx; ind++)
+				{
+					outimageZ[nx * ny * (nz - 1) + ind] = 0;
+				}
+			}
+		}
+
+		inimage += volume;
+		outimageX += volume;
+		outimageY += volume;
+		outimageZ += volume;
+	}
+
+
+	//now the rest of the channels
+	if (nc > 1)
+	{
+		long ind;
+
+		for (c = 0; c < nc - 1; c++)
+		{
+			float * outimageC = outimageCfull + c * volume;
+			const float * inimage = inimagefull + c * volume;
+
+#pragma omp parallel for
+			for (ind = 0; ind < volume; ind++)
+			{
+				outimageC[ind] = -inimage[ind] + inimage[ind + volume];
+			}
+		}
+
+#pragma omp parallel for
+		for (ind = 0; ind < volume; ind++)
+		{
+			outimageCfull[(nc - 1) * volume + ind] = 0;
+		}
+	}
+
+	return 0;
+}
+int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normfull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
+{
+	size_t volume = nx * ny * nz;
+	int z_dim = nz - 1;
+
+	const float * inimage = inimagefull;
+	float * outimageX = outimageXfull;
+	float * outimageY = outimageYfull;
+	float * outimageZ = outimageZfull;
+	float * outimageL21norm = outimageL21normfull;
+
+
+	int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
+	int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
+
+	long c;
+	for (c = 0; c < nc; c++)
+	{
+#pragma omp parallel
+		{
+			long ind, k;
+			float pix0;
+			//run over all and then fix boundaries
+#pragma omp for
+			for (ind = 0; ind < nx * ny * (nz - 1); ind++)
+			{
+				pix0 = -inimage[ind];
+
+				outimageX[ind] = pix0 + inimage[ind + 1];
+				outimageY[ind] = pix0 + inimage[ind + nx];
+				outimageZ[ind] = pix0 + inimage[ind + nx * ny];
+			}
+
+#pragma omp for
+			for (ind = 0; ind < nx * (ny - 1); ind++)
+			{
+				pix0 = -inimage[ind + offset1];
+
+				outimageX[ind + offset1] = pix0 + inimage[ind + offset1 + 1];
+				outimageY[ind + offset1] = pix0 + inimage[ind + offset1 + nx];
+			}
+
+#pragma omp for
+			for (ind = 0; ind < nx - 1; ind++)
+			{
+				pix0 = -inimage[ind + offset2];
+
+				outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1];
+			}
+
+			//boundaries
+#pragma omp for
+			for (k = 0; k < nz; k++)
+			{
+				for (int i = 0; i < nx; i++)
+				{
+					outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0;
+				}
+			}
+
+#pragma omp for
+			for (k = 0; k < nz; k++)
+			{
+				for (int j = 0; j < ny; j++)
+				{
+					outimageX[k * ny * nx + j * nx + nx - 1] = 0;
+				}
+			}
+
+			if (z_dim)
+			{
+#pragma omp for
+				for (ind = 0; ind < ny * nx; ind++)
+				{
+					outimageZ[nx * ny * (nz - 1) + ind] = 0;
+				}
+
+#pragma omp for
+				for (ind = 0; ind < volume; ind++)
+				{
+					outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind] + outimageZ[ind] * outimageZ[ind];
+				}
+
+			}
+			else
+			{
+
+#pragma omp for
+				for (ind = 0; ind < volume; ind++)
+				{
+					outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind];
+				}
+			}
+		}
+
+	
+		inimage += volume;
+		outimageX += volume;
+		outimageY += volume;
+		outimageZ += volume;
+		outimageL21norm += volume;
+	}
+
+
+	//now the rest of the channels
+	if (nc > 1)
+	{
+		long ind;
+
+		for (c = 0; c < nc - 1; c++)
+		{
+			float * outimageC = outimageCfull + c * volume;
+			float * outimageL21norm = outimageL21normfull + c * volume;
+			const float * inimage = inimagefull + c * volume;
+
+#pragma omp parallel for
+			for (ind = 0; ind < volume; ind++)
+			{
+				outimageC[ind] = -inimage[ind] + inimage[ind + volume];
+				outimageL21norm[ind] += outimageC[ind] * outimageC[ind];
+
+				//sqrt
+				outimageL21norm[ind] = (float)sqrt(outimageL21norm[ind]);
+			}
+		}
+
+#pragma omp parallel for
+		for (ind = 0; ind < volume; ind++)
+		{
+			outimageCfull[(nc - 1) * volume + ind] = 0;
+		}
+	}
+
+	return 0;
+}
+int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
+{
+	size_t volume = nx * ny * nz;
+
+	const float * inimage = inimagefull;
+	float * outimageX = outimageXfull;
+	float * outimageY = outimageYfull;
+	float * outimageZ = outimageZfull;
+
+	int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
+	int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
+
+
+	long c;
+	for (c = 0; c < nc; c++)
+	{
+
+#pragma omp parallel
+		{
+			long ind, k;
+			float pix0;
+			//run over all and then fix boundaries
+#pragma omp for
+			for (ind = 0; ind < nx * ny * (nz - 1); ind++)
+			{
+				pix0 = -inimage[ind];
+
+				outimageX[ind] = pix0 + inimage[ind + 1];
+				outimageY[ind] = pix0 + inimage[ind + nx];
+				outimageZ[ind] = pix0 + inimage[ind + nx * ny];
+			}
+
+#pragma omp for
+			for (ind = 0; ind < nx * (ny - 1); ind++)
+			{
+				pix0 = -inimage[ind + offset1];
+
+				outimageX[ind + offset1] = pix0 + inimage[ind + offset1 + 1];
+				outimageY[ind + offset1] = pix0 + inimage[ind + offset1 + nx];
+			}
+
+#pragma omp for
+			for (ind = 0; ind < nx - 1; ind++)
+			{
+				pix0 = -inimage[ind + offset2];
+
+				outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1];
+			}
+
+			//boundaries
+#pragma omp for
+			for (k = 0; k < nz; k++)
+			{
+				for (int i = 0; i < nx; i++)
+				{
+					int ind1 = (k * ny * nx);
+					int ind2 = ind1 + (ny - 1) * nx;
+
+					outimageY[ind2 + i] = -inimage[ind2 + i] + inimage[ind1 + i];
+				}
+			}
+
+#pragma omp for
+			for (k = 0; k < nz; k++)
+			{
+				for (int j = 0; j < ny; j++)
+				{
+					int ind1 = k * ny * nx + j * nx;
+					int ind2 = ind1 + nx - 1;
+
+					outimageX[ind2] = -inimage[ind2] + inimage[ind1];
+				}
+			}
+
+
+			if (nz > 1)
+			{
+#pragma omp for
+				for (ind = 0; ind < ny * nx; ind++)
+				{
+					outimageZ[nx * ny * (nz - 1) + ind] = -inimage[nx * ny * (nz - 1) + ind] + inimage[ind];
+				}
+			}
+		}
+
+		inimage += volume;
+		outimageX += volume;
+		outimageY += volume;
+		outimageZ += volume;
+	}
+
+	//now the rest of the channels
+	if (nc > 1)
+	{
+		long ind;
+
+		for (c = 0; c < nc - 1; c++)
+		{
+			float * outimageC = outimageCfull + c * volume;
+			const float * inimage = inimagefull + c * volume;
+
+#pragma omp parallel for
+			for (ind = 0; ind < volume; ind++)
+			{
+				outimageC[ind] = -inimage[ind] + inimage[ind + volume];
+			}
+		}
+
+#pragma omp parallel for
+		for (ind = 0; ind < volume; ind++)
+		{
+			outimageCfull[(nc - 1) * volume + ind] = -inimagefull[(nc - 1) * volume + ind] + inimagefull[ind];
+		}
+	}
+
+	return 0;
+}
+int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const float *inimageYfull, const float *inimageZfull, const float *inimageCfull, long nx, long ny, long nz, long nc)
+{
+	//runs over full data in x, y, z. then corrects elements for bounday conditions and sums
+	size_t volume = nx * ny * nz;
+
+	//assumes nx and ny > 1
+	int z_dim = nz - 1;
+
+	float * outimage = outimagefull;
+	const float * inimageX = inimageXfull;
+	const float * inimageY = inimageYfull;
+	const float * inimageZ = inimageZfull;
+
+	float * tempX = (float*)malloc(volume * sizeof(float));
+	float * tempY = (float*)malloc(volume * sizeof(float));
+	float * tempZ;
+	
+	if(z_dim)
+	{
+		tempZ = (float*)malloc(volume * sizeof(float));
+	}
+
+	long c;
+	for (c = 0; c < nc; c++) //just calculating x, y and z in each channel here
+	{
+#pragma omp parallel
+		{
+			long ind, k;
+
+#pragma omp for
+			for (ind = 1; ind < nx * ny * nz; ind++)
+			{
+				tempX[ind] = -inimageX[ind] + inimageX[ind - 1];
+			}
+#pragma omp for
+			for (ind = nx; ind < nx * ny * nz; ind++)
+			{
+				tempY[ind] = -inimageY[ind] + inimageY[ind - nx];
+
+			}
+
+			//boundaries
+#pragma omp for
+			for (k = 0; k < nz; k++)
+			{
+				for (int j = 0; j < ny; j++)
+				{
+					tempX[k * ny * nx + j * nx] = -inimageX[k * ny * nx + j * nx];
+					tempX[k * ny * nx + j * nx + nx - 1] = inimageX[k * ny * nx + j * nx + nx - 2];
+				}
+			}
+#pragma omp for
+			for (k = 0; k < nz; k++)
+			{
+				for (int i = 0; i < nx; i++)
+				{
+					tempY[(k * ny * nx) + i] = -inimageY[(k * ny * nx) + i];
+					tempY[(k * ny * nx) + nx * (ny - 1) + i] = inimageY[(k * ny * nx) + nx * (ny - 2) + i];
+				}
+			}
+
+			if (z_dim)
+			{
+#pragma omp for
+				for (ind = nx * ny; ind < nx * ny * nz; ind++)
+				{
+					tempZ[ind] = -inimageZ[ind] + inimageZ[ind - nx * ny];
+				}				
+#pragma omp for
+				for (ind = 0; ind < ny * nx; ind++)
+				{
+					tempZ[ind] = -inimageZ[ind];
+					tempZ[nx * ny * (nz - 1) + ind] = inimageZ[nx * ny * (nz - 2) + ind];
+				}
+#pragma omp for
+				for (ind = 0; ind < volume; ind++)
+				{
+					outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind];
+				}
+
+			}
+			else
+			{
+#pragma omp for
+				for (ind = 0; ind < volume; ind++)
+				{
+					outimage[ind] = tempX[ind] + tempY[ind];
+				}
+			}
+
+		}
+
+		outimage += volume;
+		inimageX += volume;
+		inimageY += volume;
+		inimageZ += volume;
+	}
+	free(tempX);
+	free(tempY);
+
+	if(z_dim)
+		free(tempZ);
+
+
+	//	//now the rest of the channels
+	if (nc > 1)
+	{
+		long ind;
+
+		for (c = 1; c < nc - 1; c++)
+		{
+
+#pragma omp parallel for
+			for (ind = 0; ind < volume; ind++)
+			{
+				outimagefull[ind + c * volume] += -inimageCfull[ind + c * volume] + inimageCfull[ind + (c - 1) * volume];
+			}
+		}
+
+#pragma omp parallel for
+		for (ind = 0; ind < volume; ind++)
+		{
+			outimagefull[ind] += -inimageCfull[ind];
+			outimagefull[(nc - 1) * volume + ind] += inimageCfull[(nc - 2) * volume + ind];
+		}
+
+	}
+
+	return 0;
+}
+int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const float *inimageYfull, const float *inimageZfull, const float *inimageCfull, long nx, long ny, long nz, long nc)
+{
+	//runs over full data in x, y, z. then correctects elements for bounday conditions and sums
+	size_t volume = nx * ny * nz;
+
+	//assumes nx and ny > 1
+	int z_dim = nz - 1;
+
+	float * outimage = outimagefull;
+	const float * inimageX = inimageXfull;
+	const float * inimageY = inimageYfull;
+	const float * inimageZ = inimageZfull;
+
+	float * tempX = (float*)malloc(volume * sizeof(float));
+	float * tempY = (float*)malloc(volume * sizeof(float));
+	float * tempZ;
+
+	if (z_dim)
+	{
+		tempZ = (float*)malloc(volume * sizeof(float));
+	}
+
+	long c;
+	for (c = 0; c < nc; c++) //just calculating x, y and z in each channel here
+	{
+#pragma omp parallel
+		{
+			long ind, k;
+
+			//run over all and then fix boundaries
+#pragma omp for
+			for (ind = 1; ind < volume; ind++)
+			{
+				tempX[ind] = -inimageX[ind] + inimageX[ind - 1];
+			}
+#pragma omp for
+			for (ind = nx; ind < volume; ind++)
+			{
+				tempY[ind] = -inimageY[ind] + inimageY[ind - nx];
+			}
+
+			//boundaries
+#pragma omp for
+			for (k = 0; k < nz; k++)
+			{
+				for (int i = 0; i < nx; i++)
+				{
+					tempY[(k * ny * nx) + i] = -inimageY[(k * ny * nx) + i] + inimageY[(k * ny * nx) + nx * (ny - 1) + i];
+				}
+			}
+#pragma omp for
+			for (k = 0; k < nz; k++)
+			{
+				for (int j = 0; j < ny; j++)
+				{
+					tempX[k * ny * nx + j * nx] = -inimageX[k * ny * nx + j * nx] + inimageX[k * ny * nx + j * nx + nx - 1];
+				}
+			}
+
+			if (z_dim)
+			{
+
+#pragma omp for
+				for (ind = nx * ny; ind < nx * ny * nz; ind++)
+				{
+					tempZ[ind] = -inimageZ[ind] + inimageZ[ind - nx * ny];
+				}
+#pragma omp for
+				for (ind = 0; ind < ny * nx; ind++)
+				{
+					tempZ[ind] = -inimageZ[ind] + inimageZ[nx * ny * (nz - 1) + ind];
+				}
+
+#pragma omp for
+				for (ind = 0; ind < volume; ind++)
+				{
+					outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind];
+				}
+
+			}
+			else
+			{
+#pragma omp for
+				for (ind = 0; ind < volume; ind++)
+				{
+					outimage[ind] = tempX[ind] + tempY[ind];
+				}
+			}
+
+		}
+
+		outimage += volume;
+		inimageX += volume;
+		inimageY += volume;
+		inimageZ += volume;
+	}
+	free(tempX);
+	free(tempY);
+
+	if(z_dim)
+		free(tempZ);
+
+	//now the rest of the channels
+	if (nc > 1)
+	{
+		long ind;
+
+		for (c = 1; c < nc; c++)
+		{
+
+#pragma omp parallel for
+			for (ind = 0; ind < volume; ind++)
+			{
+				outimagefull[ind + c * volume] += -inimageCfull[ind + c * volume] + inimageCfull[ind + (c - 1) * volume];
+			}
+		}
+
+#pragma omp parallel for
+		for (ind = 0; ind < volume; ind++)
+		{
+			outimagefull[ind] += -inimageCfull[ind] + inimageCfull[(nc - 1) * volume + ind];
+		}
+	}
+
+	return 0;
+}
+
+
+DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, float *gradYfull, float *gradXfull, long nc, long nz, long ny, long nx, int boundary, int direction)
+{
+	if (boundary)
+	{
+		if (direction)
+			fdiff_direct_periodic(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
+		else
+			fdiff_adjoint_periodic(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
+	}
+	else
+		{
+		if (direction)
+			fdiff_direct_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
+		else
+			fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
+	}
+	
+	return 0;
+}
+DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, float *gradXfull, long nz, long ny, long nx, int boundary, int direction)
+{
+	if (boundary)
+	{
+		if (direction)
+			fdiff_direct_periodic(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1);
+		else
+			fdiff_adjoint_periodic(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1);
+	}
+	else
+	{
+		if (direction)
+			fdiff_direct_neumann(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1);
+		else
+			fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1);
+	}
+
+	return 0;
+}
+DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, long ny, long nx, int boundary, int direction)
+{
+	if (boundary)
+	{
+		if (direction)
+			fdiff_direct_periodic(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1);
+		else
+			fdiff_adjoint_periodic(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1);
+	}
+	else
+	{
+		if (direction)
+			fdiff_direct_neumann(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1);
+		else
+			fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1);
+	}
+
+	return 0;
+}
diff --git a/src/Core/axpby.c b/src/Core/axpby.c
new file mode 100755
index 0000000..c4d162d
--- /dev/null
+++ b/src/Core/axpby.c
@@ -0,0 +1,109 @@
+#include "axpby.h"
+
+
+DLL_EXPORT int padd(float * x, float * y, float * out, long size){
+    long i = 0;
+#pragma omp parallel for
+    for (i=0; i < size; i++)
+    {
+        *(out + i ) = *(x + i) + *(y+i); 
+    }
+    return 0;
+}
+
+DLL_EXPORT int psubtract(float * x, float * y, float * out, long size){
+    long i = 0;
+#pragma omp parallel
+{
+//#pragma omp single
+//{
+//		printf("current number of threads %d\n", omp_get_num_threads());
+//}
+#pragma omp for
+    for (i=0; i < size; i++)
+    {
+        *(out + i ) = *(x + i) - *(y+i); 
+    }
+}
+    return 0;
+    
+}
+
+DLL_EXPORT int pmultiply(float * x, float * y, float * out, long size){
+    long i = 0;
+#pragma omp parallel for
+    for (i=0; i < size; i++)
+    {
+        *(out + i ) = *(x + i) * *(y+i); 
+    }
+    return 0;
+}
+
+DLL_EXPORT int pdivide(float * x, float * y, float * out, long size, float default_value)
+{
+    long i = 0;
+#pragma omp parallel for
+    for (i=0; i < size; i++)
+    {
+        *(out + i ) = *(y+i) ? *(x + i) / *(y+i) : default_value;
+    }
+    return 0;
+}
+DLL_EXPORT int ppower(float * x, float * y, float * out, long size){
+    long i = 0;
+#pragma omp parallel for
+    for (i=0; i < size; i++)
+    {
+        *(out + i ) = (float)pow(*(x + i) , *(y+i)) ; 
+    }
+    return 0;
+}
+
+DLL_EXPORT int pminimum(float * x, float * y, float * out, long size){
+    long i = 0;
+#pragma omp parallel for
+    for (i=0; i < size; i++)
+    {
+        *(out + i ) = *(y+i) > (*x+i) ? *(x + i) :  *(y+i); 
+    }
+    return 0;
+}
+
+DLL_EXPORT int pmaximum(float * x, float * y, float * out, long size) {
+	long i = 0;
+#pragma omp parallel for
+	for (i = 0; i < size; i++)
+	{
+		*(out + i) = *(y + i) < (*x + i) ? *(x + i) : *(y + i);
+	}
+	return 0;
+}
+
+
+DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size){
+    long i = 0;
+#pragma omp parallel
+{
+#pragma omp for
+    for (i=0; i < size; i++)
+    {
+        *(out + i ) = a * ( *(x + i) ) + b * ( *(y + i) ); 
+    }
+}
+    return 0;
+    
+}
+
+DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size) {
+	long i = 0;
+#pragma omp parallel
+	{
+#pragma omp for
+		for (i = 0; i < size; i++)
+		{
+			*(out + i) = a * (*(x + i)) + b * (*(y + i));
+		}
+	}
+	return 0;
+
+}
+\ No newline at end of file
diff --git a/src/Core/include/FiniteDifferenceLibrary.h b/src/Core/include/FiniteDifferenceLibrary.h
new file mode 100644
index 0000000..6e426af
--- /dev/null
+++ b/src/Core/include/FiniteDifferenceLibrary.h
@@ -0,0 +1,6 @@
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "omp.h"
+//#include "ipp.h"
+#include "dll_export.h"
+\ No newline at end of file
diff --git a/src/Core/include/axpby.h b/src/Core/include/axpby.h
new file mode 100644
index 0000000..2849547
--- /dev/null
+++ b/src/Core/include/axpby.h
@@ -0,0 +1,17 @@
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "omp.h"
+#include "dll_export.h"
+
+
+DLL_EXPORT int padd(float * x, float * y, float * out, long size);
+DLL_EXPORT int psubtract(float * x, float * y, float * out, long size);
+DLL_EXPORT int pmultiply(float * x, float * y, float * out, long size);
+DLL_EXPORT int pdivide(float * x, float * y, float * out, long size, float default_value);
+DLL_EXPORT int ppower(float * x, float * y, float * out, long size);
+DLL_EXPORT int pminimum(float * x, float * y, float * out, long size);
+DLL_EXPORT int pmaximum(float * x, float * y, float * out, long size);
+
+DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size);
+DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size);
diff --git a/src/Core/include/dll_export.h b/src/Core/include/dll_export.h
new file mode 100755
index 0000000..6b816c3
--- /dev/null
+++ b/src/Core/include/dll_export.h
@@ -0,0 +1,20 @@
+#pragma once
+#ifndef DLLEXPORT_H
+#define DLLEXPORT_H
+
+#if defined(_WIN32) || defined(__WIN32__)
+#if defined(dll_EXPORTS)  // add by CMake 
+#define  DLL_EXPORT __declspec(dllexport)
+#define EXPIMP_TEMPLATE
+#else
+#define  DLL_EXPORT __declspec(dllimport)
+#define EXPIMP_TEMPLATE extern
+#endif 
+#elif defined(linux) || defined(__linux) || defined(__APPLE__)
+#define DLL_EXPORT
+#ifndef __cdecl
+#define __cdecl
+#endif
+#endif
+
+#endif