summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Core/CMakeLists.txt124
-rw-r--r--src/Core/FiniteDifferenceLibrary.c692
-rwxr-xr-xsrc/Core/axpby.c109
-rw-r--r--src/Core/include/FiniteDifferenceLibrary.h6
-rw-r--r--src/Core/include/axpby.h17
-rwxr-xr-xsrc/Core/include/dll_export.h20
6 files changed, 968 insertions, 0 deletions
diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
new file mode 100644
index 0000000..a527289
--- /dev/null
+++ b/src/Core/CMakeLists.txt
@@ -0,0 +1,124 @@
+
+set (CMAKE_C_STANDARD 11)
+
+if(APPLE)
+ if(NOT DEFINED OPENMP_INCLUDES OR NOT DEFINED OPENMP_LIBRARIES)
+ set(OPENMP_INCLUDES "OPENMP_INCLUDES-NOTFOUND" CACHE PATH "Need to set OpenMP include dir on OSX")
+ set(OPENMP_LIBRARIES "OPENMP_LIBRARIES-NOTFOUND" CACHE PATH "Need to set OpenMP lib dir on OSX")
+ endif()
+ if(NOT EXISTS ${OPENMP_INCLUDES} OR NOT EXISTS ${OPENMP_LIBRARIES})
+ message(FATAL_ERROR "Need to set OPENMP_INCLUDES and OPENMP_LIBRARIES on OSX.")
+ endif()
+ if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(OpenMP_C "${CMAKE_C_COMPILER}")
+ set(OpenMP_C_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
+ set(OpenMP_C_LIB_NAMES "libomp" "libgomp" "libiomp5")
+ set(OpenMP_libomp_LIBRARY ${OpenMP_C_LIB_NAMES})
+ set(OpenMP_libgomp_LIBRARY ${OpenMP_C_LIB_NAMES})
+ set(OpenMP_libiomp5_LIBRARY ${OpenMP_C_LIB_NAMES})
+ endif()
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
+ set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
+ set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
+ set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+ set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+ set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+ endif()
+endif()
+
+
+find_package(OpenMP REQUIRED)
+
+add_definitions(${OpenMP_CXX_FLAGS})
+add_definitions(${OpenMP_C_FLAGS})
+
+ if (APPLE)
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+ set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
+ set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS}")
+ include_directories("${OPENMP_INCLUDES}")
+ link_directories("${OPENMP_LIBRARIES}")
+ else()
+ if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.9.0")
+ set (OpenMP_EXE_LINKER_FLAGS OpenMP::OpenMP_CXX)
+ set (OpenMP_EXE_LINKER_FLAGS OpenMP::OpenMP_C)
+ else()
+ message(WARNING "Your CMake version is old. OpenMP linking flags might be incorrect.")
+ # need to explicitly set this. Definitely for gcc, hopefully also for other systems.
+ # See https://gitlab.kitware.com/cmake/cmake/issues/15392
+ set (OpenMP_EXE_LINKER_FLAGS ${OpenMP_CXX_FLAGS})
+ set (OpenMP_EXE_LINKER_FLAGS ${OpenMP_C_FLAGS})
+ endif()
+ endif()
+
+
+#if (OPENMP_FOUND)
+# set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+# set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+# set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+# set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+# if (UNIX)
+# set (FLAGS "-O3 -funsigned-char -Wall -Wl,--no-undefined -march=native")
+# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+# set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+#
+# set (EXTRA_LIBRARIES
+# "gomp"
+# "m"
+# )
+# endif()
+# endif()
+
+if (WIN32)
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ddll_EXPORTS")
+endif()
+
+
+message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")
+message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS}")
+message("CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}")
+message("CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}")
+message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}")
+
+
+
+add_library(cilacc SHARED ${CMAKE_CURRENT_SOURCE_DIR}/axpby.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/FiniteDifferenceLibrary.c )
+
+target_link_libraries(cilacc ${OpenMP_C_LIB_NAMES} )
+include_directories(cilacc PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ )
+
+## Install
+#include(GNUInstallDirs)
+#install(TARGETS cilacc
+# RUNTIME DESTINATION bin
+# LIBRARY DESTINATION lib
+# ARCHIVE DESTINATION lib
+# CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+# )
+
+if (UNIX)
+message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
+install(TARGETS cilacc
+ LIBRARY DESTINATION lib
+ CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+ )
+elseif(WIN32)
+message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin")
+ install(TARGETS cilacc
+ RUNTIME DESTINATION bin
+ ARCHIVE DESTINATION lib
+ CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+ )
+endif()
+
+install(DIRECTORY ${PROJECT_SOURCE_DIR}/src/Core/include/
+ DESTINATION ${CMAKE_INSTALL_PREFIX}/include/cil)
+
+
diff --git a/src/Core/FiniteDifferenceLibrary.c b/src/Core/FiniteDifferenceLibrary.c
new file mode 100644
index 0000000..f5736e2
--- /dev/null
+++ b/src/Core/FiniteDifferenceLibrary.c
@@ -0,0 +1,692 @@
+#define dll_EXPORTS = 1
+
+#include "FiniteDifferenceLibrary.h"
+
+DLL_EXPORT int openMPtest(void)
+{
+ int nThreads;
+#pragma omp parallel
+ {
+ if (omp_get_thread_num() == 0)
+ {
+ nThreads = omp_get_num_threads();
+ }
+ }
+ return nThreads;
+}
+int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
+{
+ size_t volume = nx * ny * nz;
+
+ const float * inimage = inimagefull;
+ float * outimageX = outimageXfull;
+ float * outimageY = outimageYfull;
+ float * outimageZ = outimageZfull;
+
+ int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
+ int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
+
+ long c;
+ for (c = 0; c < nc; c++)
+ {
+#pragma omp parallel
+ {
+ long ind, k, j, i;
+ float pix0;
+ //run over all and then fix boundaries
+#pragma omp for
+ for (ind = 0; ind < nx * ny * (nz - 1); ind++)
+ {
+ pix0 = -inimage[ind];
+
+ outimageX[ind] = pix0 + inimage[ind + 1];
+ outimageY[ind] = pix0 + inimage[ind + nx];
+ outimageZ[ind] = pix0 + inimage[ind + nx * ny];
+ }
+
+#pragma omp for
+ for (ind = 0; ind < nx * (ny - 1); ind++)
+ {
+ pix0 = -inimage[ind + offset1];
+
+ outimageX[ind + offset1] = pix0 + inimage[ind + offset1 + 1];
+ outimageY[ind + offset1] = pix0 + inimage[ind + offset1 + nx];
+ }
+
+#pragma omp for
+ for (ind = 0; ind < nx - 1; ind++)
+ {
+ pix0 = -inimage[ind + offset2];
+
+ outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1];
+ }
+
+ //boundaries
+#pragma omp for
+ for (k = 0; k < nz; k++)
+ {
+ for (i = 0; i < nx; i++)
+ {
+ outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0;
+ }
+ }
+
+#pragma omp for
+ for (k = 0; k < nz; k++)
+ {
+ for (j = 0; j < ny; j++)
+ {
+ outimageX[k * ny * nx + j * nx + nx - 1] = 0;
+ }
+ }
+
+ if (nz > 1)
+ {
+#pragma omp for
+ for (ind = 0; ind < ny * nx; ind++)
+ {
+ outimageZ[nx * ny * (nz - 1) + ind] = 0;
+ }
+ }
+ }
+
+ inimage += volume;
+ outimageX += volume;
+ outimageY += volume;
+ outimageZ += volume;
+ }
+
+
+ //now the rest of the channels
+ if (nc > 1)
+ {
+ long ind;
+
+ for (c = 0; c < nc - 1; c++)
+ {
+ float * outimageC = outimageCfull + c * volume;
+ const float * inimage = inimagefull + c * volume;
+
+#pragma omp parallel for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimageC[ind] = -inimage[ind] + inimage[ind + volume];
+ }
+ }
+
+#pragma omp parallel for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimageCfull[(nc - 1) * volume + ind] = 0;
+ }
+ }
+
+ return 0;
+}
+int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normfull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
+{
+ size_t volume = nx * ny * nz;
+ int z_dim = nz - 1;
+
+ const float * inimage = inimagefull;
+ float * outimageX = outimageXfull;
+ float * outimageY = outimageYfull;
+ float * outimageZ = outimageZfull;
+ float * outimageL21norm = outimageL21normfull;
+
+
+ int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
+ int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
+
+ long c;
+ for (c = 0; c < nc; c++)
+ {
+#pragma omp parallel
+ {
+ long ind, k;
+ float pix0;
+ //run over all and then fix boundaries
+#pragma omp for
+ for (ind = 0; ind < nx * ny * (nz - 1); ind++)
+ {
+ pix0 = -inimage[ind];
+
+ outimageX[ind] = pix0 + inimage[ind + 1];
+ outimageY[ind] = pix0 + inimage[ind + nx];
+ outimageZ[ind] = pix0 + inimage[ind + nx * ny];
+ }
+
+#pragma omp for
+ for (ind = 0; ind < nx * (ny - 1); ind++)
+ {
+ pix0 = -inimage[ind + offset1];
+
+ outimageX[ind + offset1] = pix0 + inimage[ind + offset1 + 1];
+ outimageY[ind + offset1] = pix0 + inimage[ind + offset1 + nx];
+ }
+
+#pragma omp for
+ for (ind = 0; ind < nx - 1; ind++)
+ {
+ pix0 = -inimage[ind + offset2];
+
+ outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1];
+ }
+
+ //boundaries
+#pragma omp for
+ for (k = 0; k < nz; k++)
+ {
+ for (int i = 0; i < nx; i++)
+ {
+ outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0;
+ }
+ }
+
+#pragma omp for
+ for (k = 0; k < nz; k++)
+ {
+ for (int j = 0; j < ny; j++)
+ {
+ outimageX[k * ny * nx + j * nx + nx - 1] = 0;
+ }
+ }
+
+ if (z_dim)
+ {
+#pragma omp for
+ for (ind = 0; ind < ny * nx; ind++)
+ {
+ outimageZ[nx * ny * (nz - 1) + ind] = 0;
+ }
+
+#pragma omp for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind] + outimageZ[ind] * outimageZ[ind];
+ }
+
+ }
+ else
+ {
+
+#pragma omp for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind];
+ }
+ }
+ }
+
+
+ inimage += volume;
+ outimageX += volume;
+ outimageY += volume;
+ outimageZ += volume;
+ outimageL21norm += volume;
+ }
+
+
+ //now the rest of the channels
+ if (nc > 1)
+ {
+ long ind;
+
+ for (c = 0; c < nc - 1; c++)
+ {
+ float * outimageC = outimageCfull + c * volume;
+ float * outimageL21norm = outimageL21normfull + c * volume;
+ const float * inimage = inimagefull + c * volume;
+
+#pragma omp parallel for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimageC[ind] = -inimage[ind] + inimage[ind + volume];
+ outimageL21norm[ind] += outimageC[ind] * outimageC[ind];
+
+ //sqrt
+ outimageL21norm[ind] = (float)sqrt(outimageL21norm[ind]);
+ }
+ }
+
+#pragma omp parallel for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimageCfull[(nc - 1) * volume + ind] = 0;
+ }
+ }
+
+ return 0;
+}
+int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
+{
+ size_t volume = nx * ny * nz;
+
+ const float * inimage = inimagefull;
+ float * outimageX = outimageXfull;
+ float * outimageY = outimageYfull;
+ float * outimageZ = outimageZfull;
+
+ int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
+ int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
+
+
+ long c;
+ for (c = 0; c < nc; c++)
+ {
+
+#pragma omp parallel
+ {
+ long ind, k;
+ float pix0;
+ //run over all and then fix boundaries
+#pragma omp for
+ for (ind = 0; ind < nx * ny * (nz - 1); ind++)
+ {
+ pix0 = -inimage[ind];
+
+ outimageX[ind] = pix0 + inimage[ind + 1];
+ outimageY[ind] = pix0 + inimage[ind + nx];
+ outimageZ[ind] = pix0 + inimage[ind + nx * ny];
+ }
+
+#pragma omp for
+ for (ind = 0; ind < nx * (ny - 1); ind++)
+ {
+ pix0 = -inimage[ind + offset1];
+
+ outimageX[ind + offset1] = pix0 + inimage[ind + offset1 + 1];
+ outimageY[ind + offset1] = pix0 + inimage[ind + offset1 + nx];
+ }
+
+#pragma omp for
+ for (ind = 0; ind < nx - 1; ind++)
+ {
+ pix0 = -inimage[ind + offset2];
+
+ outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1];
+ }
+
+ //boundaries
+#pragma omp for
+ for (k = 0; k < nz; k++)
+ {
+ for (int i = 0; i < nx; i++)
+ {
+ int ind1 = (k * ny * nx);
+ int ind2 = ind1 + (ny - 1) * nx;
+
+ outimageY[ind2 + i] = -inimage[ind2 + i] + inimage[ind1 + i];
+ }
+ }
+
+#pragma omp for
+ for (k = 0; k < nz; k++)
+ {
+ for (int j = 0; j < ny; j++)
+ {
+ int ind1 = k * ny * nx + j * nx;
+ int ind2 = ind1 + nx - 1;
+
+ outimageX[ind2] = -inimage[ind2] + inimage[ind1];
+ }
+ }
+
+
+ if (nz > 1)
+ {
+#pragma omp for
+ for (ind = 0; ind < ny * nx; ind++)
+ {
+ outimageZ[nx * ny * (nz - 1) + ind] = -inimage[nx * ny * (nz - 1) + ind] + inimage[ind];
+ }
+ }
+ }
+
+ inimage += volume;
+ outimageX += volume;
+ outimageY += volume;
+ outimageZ += volume;
+ }
+
+ //now the rest of the channels
+ if (nc > 1)
+ {
+ long ind;
+
+ for (c = 0; c < nc - 1; c++)
+ {
+ float * outimageC = outimageCfull + c * volume;
+ const float * inimage = inimagefull + c * volume;
+
+#pragma omp parallel for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimageC[ind] = -inimage[ind] + inimage[ind + volume];
+ }
+ }
+
+#pragma omp parallel for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimageCfull[(nc - 1) * volume + ind] = -inimagefull[(nc - 1) * volume + ind] + inimagefull[ind];
+ }
+ }
+
+ return 0;
+}
+int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const float *inimageYfull, const float *inimageZfull, const float *inimageCfull, long nx, long ny, long nz, long nc)
+{
+ //runs over full data in x, y, z. then corrects elements for bounday conditions and sums
+ size_t volume = nx * ny * nz;
+
+ //assumes nx and ny > 1
+ int z_dim = nz - 1;
+
+ float * outimage = outimagefull;
+ const float * inimageX = inimageXfull;
+ const float * inimageY = inimageYfull;
+ const float * inimageZ = inimageZfull;
+
+ float * tempX = (float*)malloc(volume * sizeof(float));
+ float * tempY = (float*)malloc(volume * sizeof(float));
+ float * tempZ;
+
+ if(z_dim)
+ {
+ tempZ = (float*)malloc(volume * sizeof(float));
+ }
+
+ long c;
+ for (c = 0; c < nc; c++) //just calculating x, y and z in each channel here
+ {
+#pragma omp parallel
+ {
+ long ind, k;
+
+#pragma omp for
+ for (ind = 1; ind < nx * ny * nz; ind++)
+ {
+ tempX[ind] = -inimageX[ind] + inimageX[ind - 1];
+ }
+#pragma omp for
+ for (ind = nx; ind < nx * ny * nz; ind++)
+ {
+ tempY[ind] = -inimageY[ind] + inimageY[ind - nx];
+
+ }
+
+ //boundaries
+#pragma omp for
+ for (k = 0; k < nz; k++)
+ {
+ for (int j = 0; j < ny; j++)
+ {
+ tempX[k * ny * nx + j * nx] = -inimageX[k * ny * nx + j * nx];
+ tempX[k * ny * nx + j * nx + nx - 1] = inimageX[k * ny * nx + j * nx + nx - 2];
+ }
+ }
+#pragma omp for
+ for (k = 0; k < nz; k++)
+ {
+ for (int i = 0; i < nx; i++)
+ {
+ tempY[(k * ny * nx) + i] = -inimageY[(k * ny * nx) + i];
+ tempY[(k * ny * nx) + nx * (ny - 1) + i] = inimageY[(k * ny * nx) + nx * (ny - 2) + i];
+ }
+ }
+
+ if (z_dim)
+ {
+#pragma omp for
+ for (ind = nx * ny; ind < nx * ny * nz; ind++)
+ {
+ tempZ[ind] = -inimageZ[ind] + inimageZ[ind - nx * ny];
+ }
+#pragma omp for
+ for (ind = 0; ind < ny * nx; ind++)
+ {
+ tempZ[ind] = -inimageZ[ind];
+ tempZ[nx * ny * (nz - 1) + ind] = inimageZ[nx * ny * (nz - 2) + ind];
+ }
+#pragma omp for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind];
+ }
+
+ }
+ else
+ {
+#pragma omp for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimage[ind] = tempX[ind] + tempY[ind];
+ }
+ }
+
+ }
+
+ outimage += volume;
+ inimageX += volume;
+ inimageY += volume;
+ inimageZ += volume;
+ }
+ free(tempX);
+ free(tempY);
+
+ if(z_dim)
+ free(tempZ);
+
+
+ // //now the rest of the channels
+ if (nc > 1)
+ {
+ long ind;
+
+ for (c = 1; c < nc - 1; c++)
+ {
+
+#pragma omp parallel for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimagefull[ind + c * volume] += -inimageCfull[ind + c * volume] + inimageCfull[ind + (c - 1) * volume];
+ }
+ }
+
+#pragma omp parallel for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimagefull[ind] += -inimageCfull[ind];
+ outimagefull[(nc - 1) * volume + ind] += inimageCfull[(nc - 2) * volume + ind];
+ }
+
+ }
+
+ return 0;
+}
+int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const float *inimageYfull, const float *inimageZfull, const float *inimageCfull, long nx, long ny, long nz, long nc)
+{
+ //runs over full data in x, y, z. then correctects elements for bounday conditions and sums
+ size_t volume = nx * ny * nz;
+
+ //assumes nx and ny > 1
+ int z_dim = nz - 1;
+
+ float * outimage = outimagefull;
+ const float * inimageX = inimageXfull;
+ const float * inimageY = inimageYfull;
+ const float * inimageZ = inimageZfull;
+
+ float * tempX = (float*)malloc(volume * sizeof(float));
+ float * tempY = (float*)malloc(volume * sizeof(float));
+ float * tempZ;
+
+ if (z_dim)
+ {
+ tempZ = (float*)malloc(volume * sizeof(float));
+ }
+
+ long c;
+ for (c = 0; c < nc; c++) //just calculating x, y and z in each channel here
+ {
+#pragma omp parallel
+ {
+ long ind, k;
+
+ //run over all and then fix boundaries
+#pragma omp for
+ for (ind = 1; ind < volume; ind++)
+ {
+ tempX[ind] = -inimageX[ind] + inimageX[ind - 1];
+ }
+#pragma omp for
+ for (ind = nx; ind < volume; ind++)
+ {
+ tempY[ind] = -inimageY[ind] + inimageY[ind - nx];
+ }
+
+ //boundaries
+#pragma omp for
+ for (k = 0; k < nz; k++)
+ {
+ for (int i = 0; i < nx; i++)
+ {
+ tempY[(k * ny * nx) + i] = -inimageY[(k * ny * nx) + i] + inimageY[(k * ny * nx) + nx * (ny - 1) + i];
+ }
+ }
+#pragma omp for
+ for (k = 0; k < nz; k++)
+ {
+ for (int j = 0; j < ny; j++)
+ {
+ tempX[k * ny * nx + j * nx] = -inimageX[k * ny * nx + j * nx] + inimageX[k * ny * nx + j * nx + nx - 1];
+ }
+ }
+
+ if (z_dim)
+ {
+
+#pragma omp for
+ for (ind = nx * ny; ind < nx * ny * nz; ind++)
+ {
+ tempZ[ind] = -inimageZ[ind] + inimageZ[ind - nx * ny];
+ }
+#pragma omp for
+ for (ind = 0; ind < ny * nx; ind++)
+ {
+ tempZ[ind] = -inimageZ[ind] + inimageZ[nx * ny * (nz - 1) + ind];
+ }
+
+#pragma omp for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind];
+ }
+
+ }
+ else
+ {
+#pragma omp for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimage[ind] = tempX[ind] + tempY[ind];
+ }
+ }
+
+ }
+
+ outimage += volume;
+ inimageX += volume;
+ inimageY += volume;
+ inimageZ += volume;
+ }
+ free(tempX);
+ free(tempY);
+
+ if(z_dim)
+ free(tempZ);
+
+ //now the rest of the channels
+ if (nc > 1)
+ {
+ long ind;
+
+ for (c = 1; c < nc; c++)
+ {
+
+#pragma omp parallel for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimagefull[ind + c * volume] += -inimageCfull[ind + c * volume] + inimageCfull[ind + (c - 1) * volume];
+ }
+ }
+
+#pragma omp parallel for
+ for (ind = 0; ind < volume; ind++)
+ {
+ outimagefull[ind] += -inimageCfull[ind] + inimageCfull[(nc - 1) * volume + ind];
+ }
+ }
+
+ return 0;
+}
+
+
+DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, float *gradYfull, float *gradXfull, long nc, long nz, long ny, long nx, int boundary, int direction)
+{
+ if (boundary)
+ {
+ if (direction)
+ fdiff_direct_periodic(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
+ else
+ fdiff_adjoint_periodic(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
+ }
+ else
+ {
+ if (direction)
+ fdiff_direct_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
+ else
+ fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
+ }
+
+ return 0;
+}
+DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, float *gradXfull, long nz, long ny, long nx, int boundary, int direction)
+{
+ if (boundary)
+ {
+ if (direction)
+ fdiff_direct_periodic(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1);
+ else
+ fdiff_adjoint_periodic(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1);
+ }
+ else
+ {
+ if (direction)
+ fdiff_direct_neumann(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1);
+ else
+ fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1);
+ }
+
+ return 0;
+}
+DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, long ny, long nx, int boundary, int direction)
+{
+ if (boundary)
+ {
+ if (direction)
+ fdiff_direct_periodic(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1);
+ else
+ fdiff_adjoint_periodic(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1);
+ }
+ else
+ {
+ if (direction)
+ fdiff_direct_neumann(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1);
+ else
+ fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1);
+ }
+
+ return 0;
+}
diff --git a/src/Core/axpby.c b/src/Core/axpby.c
new file mode 100755
index 0000000..c4d162d
--- /dev/null
+++ b/src/Core/axpby.c
@@ -0,0 +1,109 @@
+#include "axpby.h"
+
+
+DLL_EXPORT int padd(float * x, float * y, float * out, long size){
+ long i = 0;
+#pragma omp parallel for
+ for (i=0; i < size; i++)
+ {
+ *(out + i ) = *(x + i) + *(y+i);
+ }
+ return 0;
+}
+
+DLL_EXPORT int psubtract(float * x, float * y, float * out, long size){
+ long i = 0;
+#pragma omp parallel
+{
+//#pragma omp single
+//{
+// printf("current number of threads %d\n", omp_get_num_threads());
+//}
+#pragma omp for
+ for (i=0; i < size; i++)
+ {
+ *(out + i ) = *(x + i) - *(y+i);
+ }
+}
+ return 0;
+
+}
+
+DLL_EXPORT int pmultiply(float * x, float * y, float * out, long size){
+ long i = 0;
+#pragma omp parallel for
+ for (i=0; i < size; i++)
+ {
+ *(out + i ) = *(x + i) * *(y+i);
+ }
+ return 0;
+}
+
+DLL_EXPORT int pdivide(float * x, float * y, float * out, long size, float default_value)
+{
+ long i = 0;
+#pragma omp parallel for
+ for (i=0; i < size; i++)
+ {
+ *(out + i ) = *(y+i) ? *(x + i) / *(y+i) : default_value;
+ }
+ return 0;
+}
+DLL_EXPORT int ppower(float * x, float * y, float * out, long size){
+ long i = 0;
+#pragma omp parallel for
+ for (i=0; i < size; i++)
+ {
+ *(out + i ) = (float)pow(*(x + i) , *(y+i)) ;
+ }
+ return 0;
+}
+
+DLL_EXPORT int pminimum(float * x, float * y, float * out, long size){
+ long i = 0;
+#pragma omp parallel for
+ for (i=0; i < size; i++)
+ {
+ *(out + i ) = *(y+i) > (*x+i) ? *(x + i) : *(y+i);
+ }
+ return 0;
+}
+
+DLL_EXPORT int pmaximum(float * x, float * y, float * out, long size) {
+ long i = 0;
+#pragma omp parallel for
+ for (i = 0; i < size; i++)
+ {
+ *(out + i) = *(y + i) < (*x + i) ? *(x + i) : *(y + i);
+ }
+ return 0;
+}
+
+
+DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size){
+ long i = 0;
+#pragma omp parallel
+{
+#pragma omp for
+ for (i=0; i < size; i++)
+ {
+ *(out + i ) = a * ( *(x + i) ) + b * ( *(y + i) );
+ }
+}
+ return 0;
+
+}
+
+DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size) {
+ long i = 0;
+#pragma omp parallel
+ {
+#pragma omp for
+ for (i = 0; i < size; i++)
+ {
+ *(out + i) = a * (*(x + i)) + b * (*(y + i));
+ }
+ }
+ return 0;
+
+} \ No newline at end of file
diff --git a/src/Core/include/FiniteDifferenceLibrary.h b/src/Core/include/FiniteDifferenceLibrary.h
new file mode 100644
index 0000000..6e426af
--- /dev/null
+++ b/src/Core/include/FiniteDifferenceLibrary.h
@@ -0,0 +1,6 @@
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "omp.h"
+//#include "ipp.h"
+#include "dll_export.h" \ No newline at end of file
diff --git a/src/Core/include/axpby.h b/src/Core/include/axpby.h
new file mode 100644
index 0000000..2849547
--- /dev/null
+++ b/src/Core/include/axpby.h
@@ -0,0 +1,17 @@
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "omp.h"
+#include "dll_export.h"
+
+
+DLL_EXPORT int padd(float * x, float * y, float * out, long size);
+DLL_EXPORT int psubtract(float * x, float * y, float * out, long size);
+DLL_EXPORT int pmultiply(float * x, float * y, float * out, long size);
+DLL_EXPORT int pdivide(float * x, float * y, float * out, long size, float default_value);
+DLL_EXPORT int ppower(float * x, float * y, float * out, long size);
+DLL_EXPORT int pminimum(float * x, float * y, float * out, long size);
+DLL_EXPORT int pmaximum(float * x, float * y, float * out, long size);
+
+DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size);
+DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size);
diff --git a/src/Core/include/dll_export.h b/src/Core/include/dll_export.h
new file mode 100755
index 0000000..6b816c3
--- /dev/null
+++ b/src/Core/include/dll_export.h
@@ -0,0 +1,20 @@
+#pragma once
+#ifndef DLLEXPORT_H
+#define DLLEXPORT_H
+
+#if defined(_WIN32) || defined(__WIN32__)
+#if defined(dll_EXPORTS) // add by CMake
+#define DLL_EXPORT __declspec(dllexport)
+#define EXPIMP_TEMPLATE
+#else
+#define DLL_EXPORT __declspec(dllimport)
+#define EXPIMP_TEMPLATE extern
+#endif
+#elif defined(linux) || defined(__linux) || defined(__APPLE__)
+#define DLL_EXPORT
+#ifndef __cdecl
+#define __cdecl
+#endif
+#endif
+
+#endif