authorSuren A. Chilingaryan <csa@suren.me>2020-03-27 20:15:30 +0100
committerSuren A. Chilingaryan <csa@suren.me>2020-03-27 20:15:30 +0100
commit324411a3e033c7f05b77e3bff829b55d674438da (patch)
tree1988ded680ccdecc2dd4d4a4b6ee6923dfb3dfd6
parent17727d5938b0c82483c0375b33171645533b4fa5 (diff)
Optimized TNV routine (10x performance, 1/3 memory consumption)
-rw-r--r--CMakeLists.txt1
-rw-r--r--cmake/FindGLIB2.cmake54
-rw-r--r--src/Core/CMakeLists.txt19
-rwxr-xr-xsrc/Core/regularisers_CPU/TNV_core.c948
-rwxr-xr-xsrc/Core/regularisers_CPU/TNV_core_backtrack.c695
-rw-r--r--src/Core/regularisers_CPU/hw_sched.c396
-rw-r--r--src/Core/regularisers_CPU/hw_sched.h151
-rw-r--r--src/Core/regularisers_CPU/hw_thread.c141
-rw-r--r--src/Core/regularisers_CPU/hw_thread.h76
9 files changed, 2097 insertions, 384 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 043f13c..afbff2d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,7 @@
# limitations under the License.
cmake_minimum_required (VERSION 3.0)
+set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
project(RGL)
#https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py
diff --git a/cmake/FindGLIB2.cmake b/cmake/FindGLIB2.cmake
new file mode 100644
index 0000000..2940d9f
--- /dev/null
+++ b/cmake/FindGLIB2.cmake
@@ -0,0 +1,54 @@
+# - Try to find the GLIB2 libraries
+# Once done this will define
+#
+# GLIB2_FOUND - system has glib2
+# GLIB2_DIR - path to the glib2 base directory
+# GLIB2_INCLUDE_DIR - the glib2 include directory
+# GLIB2_LIBRARIES - glib2 library
+
+set(GLIB2_DIR GLIB2_DIR-NOTFOUND CACHE PATH "Location of GLIB2 package")
+
+if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES)
+ # Already in cache, be silent
+ set(GLIB2_FIND_QUIETLY TRUE)
+endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES)
+
+if (GLIB2_DIR)
+ set(PKG_GLIB_LIBRARY_DIRS ${GLIB2_DIR}/lib${CMAKE_BUILD_ARCH} ${GLIB2_DIR}/lib)
+ set(PKG_GLIB_INCLUDE_DIRS ${GLIB2_DIR}/include/)
+else (GLIB2_DIR)
+ if (NOT WIN32)
+ find_package(PkgConfig REQUIRED)
+ pkg_check_modules(PKG_GLIB REQUIRED glib-2.0)
+ endif(NOT WIN32)
+endif (GLIB2_DIR)
+
+find_path(GLIB2_MAIN_INCLUDE_DIR glib.h
+ PATH_SUFFIXES glib-2.0
+ PATHS ${PKG_GLIB_INCLUDE_DIRS} )
+
+find_library(GLIB2_LIBRARIES
+ NAMES glib-2.0
+ PATHS ${PKG_GLIB_LIBRARY_DIRS} )
+
+find_library(GTHREAD2_LIBRARIES
+ NAMES gthread-2.0
+ PATHS ${PKG_GLIB_LIBRARY_DIRS} )
+
+# search the glibconfig.h include dir under the same root where the library is found
+find_path(GLIB2_INTERNAL_INCLUDE_DIR glibconfig.h
+ PATH_SUFFIXES glib-2.0/include
+ PATHS ${PKG_GLIB_INCLUDE_DIRS} ${PKG_GLIB_LIBRARY_DIRS} ${CMAKE_SYSTEM_LIBRARY_PATH})
+
+set(GLIB2_INCLUDE_DIR ${GLIB2_MAIN_INCLUDE_DIR})
+
+# not sure if this include dir is optional or required
+# for now it is optional
+if(GLIB2_INTERNAL_INCLUDE_DIR)
+ set(GLIB2_INCLUDE_DIR ${GLIB2_INCLUDE_DIR} ${GLIB2_INTERNAL_INCLUDE_DIR})
+endif(GLIB2_INTERNAL_INCLUDE_DIR)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(GLIB2 DEFAULT_MSG GLIB2_LIBRARIES GTHREAD2_LIBRARIES GLIB2_MAIN_INCLUDE_DIR)
+
+mark_as_advanced(GLIB2_INCLUDE_DIR GLIB2_LIBRARIES GTHREAD2_LIBRARIES GLIB2_INTERNAL_INCLUDE_DIR GLIB2_MAIN_INCLUDE_DIR)
diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
index 4e43b5e..10b16f3 100644
--- a/src/Core/CMakeLists.txt
+++ b/src/Core/CMakeLists.txt
@@ -15,6 +15,7 @@ message("CIL_VERSION ${CIL_VERSION}")
## Build the regularisers package as a library
message("Creating Regularisers as a shared library")
+find_package(GLIB2 REQUIRED)
find_package(OpenMP REQUIRED)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
@@ -24,8 +25,8 @@ if (OPENMP_FOUND)
set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
-message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")
-message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS}")
+message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
+message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS} -fno-omit-frame-pointer")
message("CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}")
message("CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}")
message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}")
@@ -58,10 +59,17 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")
## Build the regularisers package as a library
message("Adding regularisers as a shared library")
-#set(CMAKE_C_COMPILER /apps/pgi/linux86-64/17.4/bin/pgcc)
+#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc)
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=2 -qopt-report-phase=vec -qopenmp")
+set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp")
+#set(CMAKE_C_FLAGS "-march=native -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -mavx -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
+#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer")
+
#set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp")
#set(CMAKE_C_FLAGS "-acc -Minfo -ta=multicore -openmp -fPIC")
add_library(cilreg SHARED
+ ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_sched.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_thread.c
${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c
${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c
${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c
@@ -78,8 +86,9 @@ add_library(cilreg SHARED
${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/Diffusion_Inpaint_core.c
${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/NonlocalMarching_Inpaint_core.c
)
-target_link_libraries(cilreg ${EXTRA_LIBRARIES} )
+target_link_libraries(cilreg ${EXTRA_LIBRARIES} ${GLIB2_LIBRARIES} ${GTHREAD2_LIBRARIES} )
include_directories(cilreg PUBLIC
+ ${GLIB2_INCLUDE_DIR}
${LIBRARY_INC}/include
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/
@@ -105,7 +114,7 @@ endif()
# GPU Regularisers
-if (BUILD_CUDA)
+if (BUILD_CUDA1)
find_package(CUDA)
if (CUDA_FOUND)
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES")
diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c
index 753cc5f..be7fdef 100755
--- a/src/Core/regularisers_CPU/TNV_core.c
+++ b/src/Core/regularisers_CPU/TNV_core.c
@@ -5,6 +5,12 @@
*
* Copyright 2017 Daniil Kazantsev
* Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyright 2020 Suren A. Chilingaryan
+ * Optimized version with 1/3 of the memory consumption and ~10x the performance.
+ * This version cannot perform back-tracking except during the first iterations;
+ * a warning is printed if back-tracking is required later, in which case the slower
+ * version (TNV_core_backtrack.c) can be run instead. That version is still better
+ * than the original, with 1/2 of the memory consumption and a 4x performance gain.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,6 +25,467 @@
#include "TNV_core.h"
+#define min(a,b) (((a)<(b))?(a):(b))
+
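+/* coefF computes the per-pixel coefficients of the proximal mapping of the
+ * (S^p, l^1) matrix norm. Given the entries M1, M2, M3 of the symmetric 2x2
+ * Gram matrix M = [M1 M2; M2 M3], it uses the closed-form eigendecomposition
+ *   eig1,2 = T/2 +- sqrt(T^2/4 - D), with T = M1 + M3 and D = M1*M3 - M2*M2,
+ * applies prox_p to the singular values sqrt(eig1) and sqrt(eig2), and returns
+ * the three distinct entries of the reassembled symmetric matrix in t[0..2]. */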
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+ int ii, num;
+ float divsigma = 1.0f / sigma;
+ float sum, shrinkfactor;
+ float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+ float proj[2] = {0};
+
+ // Compute eigenvalues of M
+ T = M1 + M3;
+ D = M1 * M3 - M2 * M2;
+ det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f));
+ eig1 = MAX((T / 2.0f) + det, 0.0f);
+ eig2 = MAX((T / 2.0f) - det, 0.0f);
+ sig1 = sqrtf(eig1);
+ sig2 = sqrtf(eig2);
+
+ // Compute normalized eigenvectors
+ V1 = V2 = V3 = V4 = 0.0f;
+
+ if(M2 != 0.0f)
+ {
+ v0 = M2;
+ v1 = eig1 - M3;
+ v2 = eig2 - M3;
+
+ mu1 = sqrtf(v0 * v0 + v1 * v1);
+ mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+ if(mu1 > fTiny)
+ {
+ V1 = v1 / mu1;
+ V3 = v0 / mu1;
+ }
+
+ if(mu2 > fTiny)
+ {
+ V2 = v2 / mu2;
+ V4 = v0 / mu2;
+ }
+
+ } else
+ {
+ if(M1 > M3)
+ {
+ V1 = V4 = 1.0f;
+ V2 = V3 = 0.0f;
+
+ } else
+ {
+ V1 = V4 = 0.0f;
+ V2 = V3 = 1.0f;
+ }
+ }
+
+ // Compute prox_p of the diagonal entries
+ sig1_upd = sig2_upd = 0.0f;
+
+ if(p == 1)
+ {
+ sig1_upd = MAX(sig1 - divsigma, 0.0f);
+ sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+ } else if(p == INFNORM)
+ {
+ proj[0] = sigma * fabs(sig1);
+ proj[1] = sigma * fabs(sig2);
+
+ /*l1 projection part */
+ sum = fLarge;
+ num = 0l;
+ shrinkfactor = 0.0f;
+ while(sum > 1.0f)
+ {
+ sum = 0.0f;
+ num = 0;
+
+ for(ii = 0; ii < 2; ii++)
+ {
+ proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+ sum += fabs(proj[ii]);
+ if(proj[ii]!= 0.0f)
+ num++;
+ }
+
+ if(num > 0)
+ shrinkfactor = (sum - 1.0f) / num;
+ else
+ break;
+ }
+ /*l1 proj ends*/
+
+ sig1_upd = sig1 - divsigma * proj[0];
+ sig2_upd = sig2 - divsigma * proj[1];
+ }
+
+ // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+ if(sig1 > fTiny)
+ sig1_upd /= sig1;
+
+ if(sig2 > fTiny)
+ sig2_upd /= sig2;
+
+ // Compute solution
+ t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+ t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+ t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+#include "hw_sched.h"
+typedef struct {
+ int offY, stepY, copY;
+ float *Input, *u, *qx, *qy, *gradx, *grady, *div;
+ float *div0, *udiff0, *udiff;
+ float resprimal, resdual;
+ float unorm, qnorm, product;
+} tnv_thread_t;
+
+typedef struct {
+ int threads;
+ tnv_thread_t *thr_ctx;
+ float *InputT, *uT;
+ int dimX, dimY, dimZ, padZ;
+ float lambda, sigma, tau, theta;
+} tnv_context_t;
+
+HWSched sched = NULL;
+tnv_context_t tnv_ctx;
+
+
+static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ free(ctx->Input);
+ free(ctx->u);
+ free(ctx->qx);
+ free(ctx->qy);
+ free(ctx->gradx);
+ free(ctx->grady);
+ free(ctx->div);
+
+ free(ctx->div0);
+ free(ctx->udiff0);
+ free(ctx->udiff);
+
+ return 0;
+}
+
+static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+
+// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+ long DimRow = (long)(dimX * padZ);
+
+ // Auxiliary vectors
+ ctx->Input = malloc(Dim1Total * sizeof(float));
+ ctx->u = malloc(Dim1Total * sizeof(float));
+ ctx->qx = malloc(DimTotal * sizeof(float));
+ ctx->qy = malloc(DimTotal * sizeof(float));
+ ctx->gradx = malloc(DimTotal * sizeof(float));
+ ctx->grady = malloc(DimTotal * sizeof(float));
+ ctx->div = malloc(Dim1Total * sizeof(float));
+
+ ctx->div0 = malloc(DimRow * sizeof(float));
+ ctx->udiff0 = malloc(DimRow * sizeof(float));
+ ctx->udiff = malloc(DimRow * sizeof(float));
+
+ if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) {
+ fprintf(stderr, "Error allocating memory\n");
+ exit(-1);
+ }
+
+ return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(float));
+
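+ // Repack the channel-major arrays (z-major: [z][y][x]) into this thread's
+ // pixel-major slab ([y][x][z], channels padded to padZ) so that all dimZ
+ // channels of a pixel are contiguous in memory.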
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<copY; j++) {
+ for(i=0; i<dimX; i++) {
+ ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+ ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
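+ // Scatter this thread's stepY rows from the pixel-major slab back into the
+ // channel-major output array uT.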
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<stepY; j++) {
+ for(i=0; i<dimX; i++) {
+ tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+ long i, j, k, l, m;
+
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ float *Input = ctx->Input;
+ float *u = ctx->u;
+ float *qx = ctx->qx;
+ float *qy = ctx->qy;
+ float *gradx = ctx->gradx;
+ float *grady = ctx->grady;
+ float *div = ctx->div;
+
+ long p = 1l;
+ long q = 1l;
+ long r = 0l;
+
+ float lambda = tnv_ctx->lambda;
+ float sigma = tnv_ctx->sigma;
+ float tau = tnv_ctx->tau;
+ float theta = tnv_ctx->theta;
+
+ float taulambda = tau * lambda;
+ float divtau = 1.0f / tau;
+ float divsigma = 1.0f / sigma;
+ float theta1 = 1.0f + theta;
+ float constant = 1.0f + taulambda;
+
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float qxdiff;
+ float qydiff;
+ float divdiff;
+ float gradxdiff[dimZ];
+ float gradydiff[dimZ];
+ float ubarx[dimZ];
+ float ubary[dimZ];
+ float udiff_next[dimZ];
+
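+ // Prime row 0 of the slab: update u there and record its udiff and old div
+ // (udiff0/div0) for the later halo fix-up; the main loop below always updates
+ // row j+1 while consuming the previous row's udiff through ctx->udiff.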
+ for(i=0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+ float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant;
+ float udiff = u[l] - u_upd;
+ ctx->udiff[l] = udiff;
+ ctx->udiff0[l] = udiff;
+ ctx->div0[l] = div[l];
+ u[l] = u_upd;
+ }
+ }
+
+ for(j = 0; j < stepY; j++) {
+ for(i = 0; i < dimX; i++) {
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+ l = (j * dimX + i) * padZ;
+ m = dimX * padZ;
+
+//#pragma unroll 64
+ for(k = 0; k < dimZ; k++) {
+ float u_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant;
+ udiff_next[k] = u[l + k + m] - u_upd;
+ u[l + k + m] = u_upd;
+
+ float gradx_upd = (i == (dimX - 1))?0:(u[l + k + padZ] - u[l + k]);
+ float grady_upd = (j == (copY - 1))?0:(u[l + k + m] - u[l + k]);
+ gradxdiff[k] = gradx[l + k] - gradx_upd;
+ gradydiff[k] = grady[l + k] - grady_upd;
+ gradx[l + k] = gradx_upd;
+ grady[l + k] = grady_upd;
+
+ ubarx[k] = gradx_upd - theta * gradxdiff[k];
+ ubary[k] = grady_upd - theta * gradydiff[k];
+
+ float vx = ubarx[k] + divsigma * qx[l + k];
+ float vy = ubary[k] + divsigma * qy[l + k];
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+//#pragma unroll 64
+ for(k = 0; k < dimZ; k++) {
+ float vx = ubarx[k] + divsigma * qx[l + k];
+ float vy = ubary[k] + divsigma * qy[l + k];
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qxdiff = sigma * (ubarx[k] - gx_upd);
+ qydiff = sigma * (ubary[k] - gy_upd);
+
+ qx[l + k] += qxdiff;
+ qy[l + k] += qydiff;
+
+ float udiff = ctx->udiff[i * padZ + k];
+ ctx->udiff[i * padZ + k] = udiff_next[k];
+ unorm += (udiff * udiff);
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+ float div_upd = 0;
+ div_upd -= (i > 0)?qx[l + k - padZ]:0;
+ div_upd -= (j > 0)?qy[l + k - m]:0;
+ div_upd += (i < (dimX-1))?qx[l + k]:0;
+ div_upd += (j < (copY-1))?qy[l + k]:0;
+ divdiff = div[l + k] - div_upd;
+ div[l + k] = div_upd;
+
+ resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff + divdiff):0;
+ resdual += fabs(divsigma * qxdiff + gradxdiff[k]);
+ resdual += fabs(divsigma * qydiff + gradydiff[k]);
+ product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+ }
+
+ } // i
+ } // j
+
+
+ ctx->resprimal = resprimal;
+ ctx->resdual = resdual;
+ ctx->product = product;
+ ctx->unorm = unorm;
+ ctx->qnorm = qnorm;
+
+ return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+ int i, off, size, err;
+
+ if (sched) return;
+
+ tnv_ctx.dimX = dimX;
+ tnv_ctx.dimY = dimY;
+ tnv_ctx.dimZ = dimZ;
+ // Padding actually seems to be slower
+// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0));
+ tnv_ctx.padZ = dimZ;
+
+ hw_sched_init();
+
+ int threads = hw_sched_get_cpu_count();
+ if (threads > dimY) threads = dimY/2;
+
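+ // Partition the dimY image rows across threads; the first 'extra' threads
+ // take one extra row each.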
+ int step = dimY / threads;
+ int extra = dimY % threads;
+
+ tnv_ctx.threads = threads;
+ tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+ for (i = 0, off = 0; i < threads; i++, off += size) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+ size = step + ((i < extra)?1:0);
+
+ ctx->offY = off;
+ ctx->stepY = size;
+
+ if (i == (threads-1)) ctx->copY = ctx->stepY;
+ else ctx->copY = ctx->stepY + 1;
+ }
+
+ sched = hw_sched_create(threads);
+ if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
/*
* C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
* The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see
@@ -32,50 +499,25 @@
* 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
*
* Output:
- * 1. Filtered/regularized image
+ * 1. Filtered/regularized image (u)
*
* [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
*/
-float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
{
- long k, p, q, r, DimTotal;
- float taulambda;
- float *u_upd, *gx, *gy, *gx_upd, *gy_upd, *qx, *qy, *qx_upd, *qy_upd, *v, *vx, *vy, *gradx, *grady, *gradx_upd, *grady_upd, *gradx_ubar, *grady_ubar, *div, *div_upd;
-
- p = 1l;
- q = 1l;
- r = 0l;
-
+ int err;
+ int iter;
+ int i,j,k,l,m;
+
lambda = 1.0f/(2.0f*lambda);
- DimTotal = (long)(dimX*dimY*dimZ);
- /* PDHG algorithm parameters*/
+ tnv_ctx.lambda = lambda;
+
+ // PDHG algorithm parameters
float tau = 0.5f;
float sigma = 0.5f;
float theta = 1.0f;
-
- // Auxiliar vectors
- u_upd = calloc(DimTotal, sizeof(float));
- gx = calloc(DimTotal, sizeof(float));
- gy = calloc(DimTotal, sizeof(float));
- gx_upd = calloc(DimTotal, sizeof(float));
- gy_upd = calloc(DimTotal, sizeof(float));
- qx = calloc(DimTotal, sizeof(float));
- qy = calloc(DimTotal, sizeof(float));
- qx_upd = calloc(DimTotal, sizeof(float));
- qy_upd = calloc(DimTotal, sizeof(float));
- v = calloc(DimTotal, sizeof(float));
- vx = calloc(DimTotal, sizeof(float));
- vy = calloc(DimTotal, sizeof(float));
- gradx = calloc(DimTotal, sizeof(float));
- grady = calloc(DimTotal, sizeof(float));
- gradx_upd = calloc(DimTotal, sizeof(float));
- grady_upd = calloc(DimTotal, sizeof(float));
- gradx_ubar = calloc(DimTotal, sizeof(float));
- grady_ubar = calloc(DimTotal, sizeof(float));
- div = calloc(DimTotal, sizeof(float));
- div_upd = calloc(DimTotal, sizeof(float));
-
+
// Backtracking parameters
float s = 1.0f;
float gamma = 0.75f;
@@ -84,369 +526,117 @@ float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol,
float alpha = alpha0;
float delta = 1.5f;
float eta = 0.95f;
+
+ TNV_CPU_init(InputT, uT, dimX, dimY, dimZ);
+
+ tnv_ctx.InputT = InputT;
+ tnv_ctx.uT = uT;
- // PDHG algorithm parameters
- taulambda = tau * lambda;
- float divtau = 1.0f / tau;
- float divsigma = 1.0f / sigma;
- float theta1 = 1.0f + theta;
-
- /*allocate memory for taulambda */
- //taulambda = (float*) calloc(dimZ, sizeof(float));
- //for(k=0; k < dimZ; k++) {taulambda[k] = tau*lambda[k];}
-
+ int padZ = tnv_ctx.padZ;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); }
+
+
// Apply Primal-Dual Hybrid Gradient scheme
- int iter = 0;
float residual = fLarge;
- float ubarx, ubary;
-
+ int started = 0;
for(iter = 0; iter < maxIter; iter++) {
- // Argument of proximal mapping of fidelity term
-#pragma omp parallel for shared(v, u) private(k)
- for(k=0; k<dimX*dimY*dimZ; k++) {v[k] = u[k] + tau*div[k];}
-
-// Proximal solution of fidelity term
-proxG(u_upd, v, Input, taulambda, (long)(dimX), (long)(dimY), (long)(dimZ));
-
-// Gradient of updated primal variable
-gradient(u_upd, gradx_upd, grady_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
-
-// Argument of proximal mapping of regularization term
-#pragma omp parallel for shared(gradx_upd, grady_upd, gradx, grady) private(k, ubarx, ubary)
-for(k=0; k<dimX*dimY*dimZ; k++) {
- ubarx = theta1 * gradx_upd[k] - theta * gradx[k];
- ubary = theta1 * grady_upd[k] - theta * grady[k];
- vx[k] = ubarx + divsigma * qx[k];
- vy[k] = ubary + divsigma * qy[k];
- gradx_ubar[k] = ubarx;
- grady_ubar[k] = ubary;
-}
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
-proxF(gx_upd, gy_upd, vx, vy, sigma, p, q, r, (long)(dimX), (long)(dimY), (long)(dimZ));
+ float divtau = 1.0f / tau;
-// Update dual variable
-#pragma omp parallel for shared(qx_upd, qy_upd) private(k)
-for(k=0; k<dimX*dimY*dimZ; k++) {
- qx_upd[k] = qx[k] + sigma * (gradx_ubar[k] - gx_upd[k]);
- qy_upd[k] = qy[k] + sigma * (grady_ubar[k] - gy_upd[k]);
-}
+ tnv_ctx.sigma = sigma;
+ tnv_ctx.tau = tau;
+ tnv_ctx.theta = theta;
-// Divergence of updated dual variable
-#pragma omp parallel for shared(div_upd) private(k)
-for(k=0; k<dimX*dimY*dimZ; k++) {div_upd[k] = 0.0f;}
-divergence(qx_upd, qy_upd, div_upd, dimX, dimY, dimZ);
-
-// Compute primal residual, dual residual, and backtracking condition
-float resprimal = 0.0f;
-float resdual = 0.0f;
-float product = 0.0f;
-float unorm = 0.0f;
-float qnorm = 0.0f;
-
-for(k=0; k<dimX*dimY*dimZ; k++) {
- float udiff = u[k] - u_upd[k];
- float qxdiff = qx[k] - qx_upd[k];
- float qydiff = qy[k] - qy_upd[k];
- float divdiff = div[k] - div_upd[k];
- float gradxdiff = gradx[k] - gradx_upd[k];
- float gradydiff = grady[k] - grady_upd[k];
-
- resprimal += fabs(divtau*udiff + divdiff);
- resdual += fabs(divsigma*qxdiff - gradxdiff);
- resdual += fabs(divsigma*qydiff - gradydiff);
-
- unorm += (udiff * udiff);
- qnorm += (qxdiff * qxdiff + qydiff * qydiff);
- product += (gradxdiff * qxdiff + gradydiff * qydiff);
-}
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); }
-float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm +
- gamma * tau * qnorm);
+ // border regions
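+ // Each slab's first row is missing the -qy contribution from the previous
+ // slab's last row: fold it in here, mirror the corrected divergence into the
+ // neighbour's halo row, and accumulate the primal residual of the border rows.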
+ for (j = 1; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1);
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
-// Adapt step-size parameters
-float dual_dot_delta = resdual * s * delta;
-float dual_div_delta = (resdual * s) / delta;
+ m = (ctx0->stepY - 1) * dimX * padZ;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ float divdiff = ctx->div0[l] - ctx->div[l];
+ float udiff = ctx->udiff0[l];
-if(b > 1)
-{
- // Decrease step-sizes to fit balancing principle
- tau = (beta * tau) / b;
- sigma = (beta * sigma) / b;
- alpha = alpha0;
-
- copyIm(u, u_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
- copyIm(gx, gx_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
- copyIm(gy, gy_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
- copyIm(qx, qx_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
- copyIm(qy, qy_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
- copyIm(gradx, gradx_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
- copyIm(grady, grady_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
- copyIm(div, div_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
-
-} else if(resprimal > dual_dot_delta)
-{
- // Increase primal step-size and decrease dual step-size
- tau = tau / (1.0f - alpha);
- sigma = sigma * (1.0f - alpha);
- alpha = alpha * eta;
-
-} else if(resprimal < dual_div_delta)
-{
- // Decrease primal step-size and increase dual step-size
- tau = tau * (1.0f - alpha);
- sigma = sigma / (1.0f - alpha);
- alpha = alpha * eta;
-}
+ ctx->div[l] -= ctx0->qy[l + m];
+ ctx0->div[m + l + dimX*padZ] = ctx->div[l];
+
+ divdiff += ctx0->qy[l + m];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
-// Update variables
-taulambda = tau * lambda;
-//for(k=0; k < dimZ; k++) taulambda[k] = tau*lambda[k];
+ for (j = 0; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+ resprimal += ctx->resprimal;
+ resdual += ctx->resdual;
+ product += ctx->product;
+ unorm += ctx->unorm;
+ qnorm += ctx->qnorm;
+ }
-divsigma = 1.0f / sigma;
-divtau = 1.0f / tau;
+ residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+ float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+ float dual_dot_delta = resdual * s * delta;
+ float dual_div_delta = (resdual * s) / delta;
+ printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
-copyIm(u_upd, u, (long)(dimX), (long)(dimY), (long)(dimZ));
-copyIm(gx_upd, gx, (long)(dimX), (long)(dimY), (long)(dimZ));
-copyIm(gy_upd, gy, (long)(dimX), (long)(dimY), (long)(dimZ));
-copyIm(qx_upd, qx, (long)(dimX), (long)(dimY), (long)(dimZ));
-copyIm(qy_upd, qy, (long)(dimX), (long)(dimY), (long)(dimZ));
-copyIm(gradx_upd, gradx, (long)(dimX), (long)(dimY), (long)(dimZ));
-copyIm(grady_upd, grady, (long)(dimX), (long)(dimY), (long)(dimZ));
-copyIm(div_upd, div, (long)(dimX), (long)(dimY), (long)(dimZ));
-// Compute residual at current iteration
-residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
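+ // Balancing principle (adaptive PDHG): b = 2*tau*sigma*product /
+ // (gamma*(sigma*unorm + tau*qnorm)); b > 1 means the proximal step violated
+ // the backtracking condition, so both step sizes are shrunk by beta/b.
+ // Otherwise tau and sigma are rebalanced from the primal/dual residual ratio.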
+ if(b > 1) {
+
+ // Decrease step-sizes to fit balancing principle
+ tau = (beta * tau) / b;
+ sigma = (beta * sigma) / b;
+ alpha = alpha0;
-// printf("%f \n", residual);
-if (residual < tol) {
- printf("Iterations stopped at %i with the residual %f \n", iter, residual);
- break; }
+ if (started) {
+ fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of the iterative optimization, but this fast version cannot perform it. Use the standard TNV implementation (TNV_core_backtrack.c) instead.\n\n\n");
+ } else {
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+ }
+ } else {
+ started = 1;
+ if(resprimal > dual_dot_delta) {
+ // Increase primal step-size and decrease dual step-size
+ tau = tau / (1.0f - alpha);
+ sigma = sigma * (1.0f - alpha);
+ alpha = alpha * eta;
+ } else if(resprimal < dual_div_delta) {
+ // Decrease primal step-size and increase dual step-size
+ tau = tau * (1.0f - alpha);
+ sigma = sigma / (1.0f - alpha);
+ alpha = alpha * eta;
+ }
+ }
+ if (residual < tol) break;
}
- printf("Iterations stopped at %i with the residual %f \n", iter, residual);
- free (u_upd); free(gx); free(gy); free(gx_upd); free(gy_upd);
- free(qx); free(qy); free(qx_upd); free(qy_upd); free(v); free(vx); free(vy);
- free(gradx); free(grady); free(gradx_upd); free(grady_upd); free(gradx_ubar);
- free(grady_ubar); free(div); free(div_upd);
- return *u;
-}
-float proxG(float *u_upd, float *v, float *f, float taulambda, long dimX, long dimY, long dimZ)
-{
- float constant;
- long k;
- constant = 1.0f + taulambda;
-#pragma omp parallel for shared(v, f, u_upd) private(k)
- for(k=0; k<dimZ*dimX*dimY; k++) {
- u_upd[k] = (v[k] + taulambda * f[k])/constant;
- //u_upd[(dimX*dimY)*k + l] = (v[(dimX*dimY)*k + l] + taulambda * f[(dimX*dimY)*k + l])/constant;
- }
- return *u_upd;
-}
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
-float gradient(float *u_upd, float *gradx_upd, float *grady_upd, long dimX, long dimY, long dimZ)
-{
- long i, j, k, l;
- // Compute discrete gradient using forward differences
-#pragma omp parallel for shared(gradx_upd,grady_upd,u_upd) private(i, j, k, l)
- for(k = 0; k < dimZ; k++) {
- for(j = 0; j < dimY; j++) {
- l = j * dimX;
- for(i = 0; i < dimX; i++) {
- // Derivatives in the x-direction
- if(i != dimX-1)
- gradx_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+1+l] - u_upd[(dimX*dimY)*k + i+l];
- else
- gradx_upd[(dimX*dimY)*k + i+l] = 0.0f;
-
- // Derivatives in the y-direction
- if(j != dimY-1)
- //grady_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+dimY+l] -u_upd[(dimX*dimY)*k + i+l];
- grady_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+(j+1)*dimX] -u_upd[(dimX*dimY)*k + i+l];
- else
- grady_upd[(dimX*dimY)*k + i+l] = 0.0f;
- }}}
- return 1;
-}
-float proxF(float *gx, float *gy, float *vx, float *vy, float sigma, int p, int q, int r, long dimX, long dimY, long dimZ)
-{
- // (S^p, \ell^1) norm decouples at each pixel
-// Spl1(gx, gy, vx, vy, sigma, p, num_channels, dim);
- float divsigma = 1.0f / sigma;
-
- // $\ell^{1,1,1}$-TV regularization
-// int i,j,k;
-// #pragma omp parallel for shared (gx,gy,vx,vy) private(i,j,k)
-// for(k = 0; k < dimZ; k++) {
-// for(i=0; i<dimX; i++) {
-// for(j=0; j<dimY; j++) {
-// gx[(dimX*dimY)*k + (i)*dimY + (j)] = SIGN(vx[(dimX*dimY)*k + (i)*dimY + (j)]) * MAX(fabs(vx[(dimX*dimY)*k + (i)*dimY + (j)]) - divsigma, 0.0f);
-// gy[(dimX*dimY)*k + (i)*dimY + (j)] = SIGN(vy[(dimX*dimY)*k + (i)*dimY + (j)]) * MAX(fabs(vy[(dimX*dimY)*k + (i)*dimY + (j)]) - divsigma, 0.0f);
-// }}}
-
- // Auxiliar vector
- float *proj, sum, shrinkfactor ;
- float M1,M2,M3,valuex,valuey,T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd,t1,t2,t3;
- long i,j,k, ii, num;
-#pragma omp parallel for shared (gx,gy,vx,vy,p) private(i,ii,j,k,proj,num, sum, shrinkfactor, M1,M2,M3,valuex,valuey,T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4,v0,v1,v2,mu1,mu2,sig1_upd,sig2_upd,t1,t2,t3)
- for(i=0; i<dimX; i++) {
- for(j=0; j<dimY; j++) {
-
- proj = (float*) calloc (2,sizeof(float));
- // Compute matrix $M\in\R^{2\times 2}$
- M1 = 0.0f;
- M2 = 0.0f;
- M3 = 0.0f;
-
- for(k = 0; k < dimZ; k++)
- {
- valuex = vx[(dimX*dimY)*k + (j)*dimX + (i)];
- valuey = vy[(dimX*dimY)*k + (j)*dimX + (i)];
-
- M1 += (valuex * valuex);
- M2 += (valuex * valuey);
- M3 += (valuey * valuey);
- }
-
- // Compute eigenvalues of M
- T = M1 + M3;
- D = M1 * M3 - M2 * M2;
- det = sqrt(MAX((T * T / 4.0f) - D, 0.0f));
- eig1 = MAX((T / 2.0f) + det, 0.0f);
- eig2 = MAX((T / 2.0f) - det, 0.0f);
- sig1 = sqrt(eig1);
- sig2 = sqrt(eig2);
-
- // Compute normalized eigenvectors
- V1 = V2 = V3 = V4 = 0.0f;
-
- if(M2 != 0.0f)
- {
- v0 = M2;
- v1 = eig1 - M3;
- v2 = eig2 - M3;
-
- mu1 = sqrtf(v0 * v0 + v1 * v1);
- mu2 = sqrtf(v0 * v0 + v2 * v2);
-
- if(mu1 > fTiny)
- {
- V1 = v1 / mu1;
- V3 = v0 / mu1;
- }
-
- if(mu2 > fTiny)
- {
- V2 = v2 / mu2;
- V4 = v0 / mu2;
- }
-
- } else
- {
- if(M1 > M3)
- {
- V1 = V4 = 1.0f;
- V2 = V3 = 0.0f;
-
- } else
- {
- V1 = V4 = 0.0f;
- V2 = V3 = 1.0f;
- }
- }
-
- // Compute prox_p of the diagonal entries
- sig1_upd = sig2_upd = 0.0f;
-
- if(p == 1)
- {
- sig1_upd = MAX(sig1 - divsigma, 0.0f);
- sig2_upd = MAX(sig2 - divsigma, 0.0f);
-
- } else if(p == INFNORM)
- {
- proj[0] = sigma * fabs(sig1);
- proj[1] = sigma * fabs(sig2);
-
- /*l1 projection part */
- sum = fLarge;
- num = 0l;
- shrinkfactor = 0.0f;
- while(sum > 1.0f)
- {
- sum = 0.0f;
- num = 0;
-
- for(ii = 0; ii < 2; ii++)
- {
- proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
-
- sum += fabs(proj[ii]);
- if(proj[ii]!= 0.0f)
- num++;
- }
-
- if(num > 0)
- shrinkfactor = (sum - 1.0f) / num;
- else
- break;
- }
- /*l1 proj ends*/
-
- sig1_upd = sig1 - divsigma * proj[0];
- sig2_upd = sig2 - divsigma * proj[1];
- }
-
- // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
- if(sig1 > fTiny)
- sig1_upd /= sig1;
-
- if(sig2 > fTiny)
- sig2_upd /= sig2;
-
- // Compute solution
- t1 = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
- t2 = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
- t3 = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
-
- for(k = 0; k < dimZ; k++)
- {
- gx[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t1 + vy[(dimX*dimY)*k + j*dimX + i] * t2;
- gy[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t2 + vy[(dimX*dimY)*k + j*dimX + i] * t3;
- }
-
- // Delete allocated memory
- free(proj);
- }}
-
- return 1;
-}
+ printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+ printf("Return: %f\n", *uT);
-float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ)
-{
- long i, j, k, l;
-#pragma omp parallel for shared(qx_upd,qy_upd,div_upd) private(i, j, k, l)
- for(k = 0; k < dimZ; k++) {
- for(j = 0; j < dimY; j++) {
- l = j * dimX;
- for(i = 0; i < dimX; i++) {
- if(i != dimX-1)
- {
- // ux[k][i+l] = u[k][i+1+l] - u[k][i+l]
- div_upd[(dimX*dimY)*k + i+1+l] -= qx_upd[(dimX*dimY)*k + i+l];
- div_upd[(dimX*dimY)*k + i+l] += qx_upd[(dimX*dimY)*k + i+l];
- }
-
- if(j != dimY-1)
- {
- // uy[k][i+l] = u[k][i+width+l] - u[k][i+l]
- //div_upd[(dimX*dimY)*k + i+dimY+l] -= qy_upd[(dimX*dimY)*k + i+l];
- div_upd[(dimX*dimY)*k + i+(j+1)*dimX] -= qy_upd[(dimX*dimY)*k + i+l];
- div_upd[(dimX*dimY)*k + i+l] += qy_upd[(dimX*dimY)*k + i+l];
- }
- }
- }
- }
- return *div_upd;
+// exit(-1);
+ return *uT;
}
diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c
new file mode 100755
index 0000000..d771db5
--- /dev/null
+++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c
@@ -0,0 +1,695 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyright 2020 Suren A. Chilingaryan
+ * Optimized version with 1/2 of the memory consumption and ~4x the performance.
+ * This version is algorithmically compatible with the original, but a slight
+ * change in results is expected due to the different order of floating-point
+ * operations.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TNV_core.h"
+
+#define min(a,b) (((a)<(b))?(a):(b))
+
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+ int ii, num;
+ float divsigma = 1.0f / sigma;
+ float sum, shrinkfactor;
+ float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+ float proj[2] = {0};
+
+ // Compute eigenvalues of M
+ T = M1 + M3;
+ D = M1 * M3 - M2 * M2;
+ det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f));
+ eig1 = MAX((T / 2.0f) + det, 0.0f);
+ eig2 = MAX((T / 2.0f) - det, 0.0f);
+ sig1 = sqrtf(eig1);
+ sig2 = sqrtf(eig2);
+
+ // Compute normalized eigenvectors
+ V1 = V2 = V3 = V4 = 0.0f;
+
+ if(M2 != 0.0f)
+ {
+ v0 = M2;
+ v1 = eig1 - M3;
+ v2 = eig2 - M3;
+
+ mu1 = sqrtf(v0 * v0 + v1 * v1);
+ mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+ if(mu1 > fTiny)
+ {
+ V1 = v1 / mu1;
+ V3 = v0 / mu1;
+ }
+
+ if(mu2 > fTiny)
+ {
+ V2 = v2 / mu2;
+ V4 = v0 / mu2;
+ }
+
+ } else
+ {
+ if(M1 > M3)
+ {
+ V1 = V4 = 1.0f;
+ V2 = V3 = 0.0f;
+
+ } else
+ {
+ V1 = V4 = 0.0f;
+ V2 = V3 = 1.0f;
+ }
+ }
+
+ // Compute prox_p of the diagonal entries
+ sig1_upd = sig2_upd = 0.0f;
+
+ if(p == 1)
+ {
+ sig1_upd = MAX(sig1 - divsigma, 0.0f);
+ sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+ } else if(p == INFNORM)
+ {
+ proj[0] = sigma * fabs(sig1);
+ proj[1] = sigma * fabs(sig2);
+
+ /*l1 projection part */
+ sum = fLarge;
+ num = 0l;
+ shrinkfactor = 0.0f;
+ while(sum > 1.0f)
+ {
+ sum = 0.0f;
+ num = 0;
+
+ for(ii = 0; ii < 2; ii++)
+ {
+ proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+ sum += fabs(proj[ii]);
+ if(proj[ii]!= 0.0f)
+ num++;
+ }
+
+ if(num > 0)
+ shrinkfactor = (sum - 1.0f) / num;
+ else
+ break;
+ }
+ /*l1 proj ends*/
+
+ sig1_upd = sig1 - divsigma * proj[0];
+ sig2_upd = sig2 - divsigma * proj[1];
+ }
+
+ // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+ if(sig1 > fTiny)
+ sig1_upd /= sig1;
+
+ if(sig2 > fTiny)
+ sig2_upd /= sig2;
+
+ // Compute solution
+ t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+ t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+ t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+#include "hw_sched.h"
+typedef struct {
+ int offY, stepY, copY;
+ float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd;
+ float *div, *div_upd;
+ float resprimal, resdual;
+ float unorm, qnorm, product;
+} tnv_thread_t;
+
+typedef struct {
+ int threads;
+ tnv_thread_t *thr_ctx;
+ float *InputT, *uT;
+ int dimX, dimY, dimZ, padZ;
+ float lambda, sigma, tau, theta;
+} tnv_context_t;
+
+HWSched sched = NULL;
+tnv_context_t tnv_ctx;
+
+
+static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ free(ctx->Input);
+ free(ctx->u);
+ free(ctx->u_upd);
+ free(ctx->qx);
+ free(ctx->qy);
+ free(ctx->qx_upd);
+ free(ctx->qy_upd);
+ free(ctx->gradx);
+ free(ctx->grady);
+ free(ctx->gradx_upd);
+ free(ctx->grady_upd);
+ free(ctx->div);
+ free(ctx->div_upd);
+
+ return 0;
+}
+
+static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+
+// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+
+ // Auxiliary vectors
+ ctx->Input = malloc(Dim1Total * sizeof(float));
+ ctx->u = malloc(Dim1Total * sizeof(float));
+ ctx->u_upd = malloc(Dim1Total * sizeof(float));
+ ctx->qx = malloc(DimTotal * sizeof(float));
+ ctx->qy = malloc(DimTotal * sizeof(float));
+ ctx->qx_upd = malloc(DimTotal * sizeof(float));
+ ctx->qy_upd = malloc(DimTotal * sizeof(float));
+ ctx->gradx = malloc(DimTotal * sizeof(float));
+ ctx->grady = malloc(DimTotal * sizeof(float));
+ ctx->gradx_upd = malloc(DimTotal * sizeof(float));
+ ctx->grady_upd = malloc(DimTotal * sizeof(float));
+ ctx->div = malloc(Dim1Total * sizeof(float));
+ ctx->div_upd = malloc(Dim1Total * sizeof(float));
+
+ if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+ fprintf(stderr, "Error allocating memory\n");
+ exit(-1);
+ }
+
+ return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<copY; j++) {
+ for(i=0; i<dimX; i++) {
+ ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+ ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<stepY; j++) {
+ for(i=0; i<dimX; i++) {
+ tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ // Auxiliary vectors
+ memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+ memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float));
+ memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float));
+ memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float));
+ memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float));
+ memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ // Auxiliary vectors
+ memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float));
+ memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float));
+ memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float));
+ memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float));
+ memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float));
+ memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+ long i, j, k, l, m;
+
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ float *Input = ctx->Input;
+ float *u = ctx->u;
+ float *u_upd = ctx->u_upd;
+ float *qx = ctx->qx;
+ float *qy = ctx->qy;
+ float *qx_upd = ctx->qx_upd;
+ float *qy_upd = ctx->qy_upd;
+ float *gradx = ctx->gradx;
+ float *grady = ctx->grady;
+ float *gradx_upd = ctx->gradx_upd;
+ float *grady_upd = ctx->grady_upd;
+ float *div = ctx->div;
+ float *div_upd = ctx->div_upd;
+
+ long p = 1l;
+ long q = 1l;
+ long r = 0l;
+
+ float lambda = tnv_ctx->lambda;
+ float sigma = tnv_ctx->sigma;
+ float tau = tnv_ctx->tau;
+ float theta = tnv_ctx->theta;
+
+ float taulambda = tau * lambda;
+ float divtau = 1.0f / tau;
+ float divsigma = 1.0f / sigma;
+ float theta1 = 1.0f + theta;
+ float constant = 1.0f + taulambda;
+
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float udiff[dimZ];
+ float qxdiff;
+ float qydiff;
+ float divdiff;
+ float gradxdiff[dimZ];
+ float gradydiff[dimZ];
+
+ for(i=0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+ u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant;
+ div_upd[l] = 0;
+ }
+ }
+
+ for(j = 0; j < stepY; j++) {
+/* m = j * dimX * dimZ + (dimX - 1) * dimZ;
+ for(k = 0; k < dimZ; k++) {
+ u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant;
+ }*/
+
+ for(i = 0; i < dimX/* - 1*/; i++) {
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+ l = (j * dimX + i) * padZ;
+ m = dimX * padZ;
+
+//#pragma unroll 64
+ for(k = 0; k < dimZ; k++) {
+ u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant;
+
+ gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]);
+ grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]); // We need div from the next thread on last iter
+
+ udiff[k] = u[l + k] - u_upd[l + k];
+ unorm += (udiff[k] * udiff[k]);
+// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm);
+
+ gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
+ gradydiff[k] = grady[l + k] - grady_upd[l + k];
+
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+//#define TNV_NEW_STYLE
+#ifdef TNV_NEW_STYLE
+ qx_upd[l + k] = qx[l + k] + sigma * ubarx;
+ qy_upd[l + k] = qy[l + k] + sigma * ubary;
+
+ float vx = divsigma * qx_upd[l + k]; //+ ubarx
+ float vy = divsigma * qy_upd[l + k]; //+ ubary
+#else
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+#endif
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+//#pragma unroll 64
+ for(k = 0; k < dimZ; k++) {
+#ifdef TNV_NEW_STYLE
+ float vx = divsigma * qx_upd[l + k];
+ float vy = divsigma * qy_upd[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] -= sigma * gx_upd;
+ qy_upd[l + k] -= sigma * gy_upd;
+#else
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
+ qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
+#endif
+
+if(i != (dimX-1)) {
+ div_upd[l + k] += qx_upd[l + k];
+ div_upd[l + k + padZ] -= qx_upd[l + k];
+}
+if(j != (copY-1)) {
+ div_upd[l + k] += qy_upd[l + k];
+ div_upd[l + k + m] = -qy_upd[l + k]; // We need to update div in the next thread on last iter
+}
+
+ qxdiff = qx[l + k] - qx_upd[l + k];
+ qydiff = qy[l + k] - qy_upd[l + k];
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+ resdual += fabs(divsigma * qxdiff - gradxdiff[k]);
+ resdual += fabs(divsigma * qydiff - gradydiff[k]);
+ product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+ if ((offY == 0)||(j > 0)) {
+ divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history?
+ resprimal += fabs(divtau * udiff[k] + divdiff);
+ }
+ }
+
+ } // i
+ }
+
+
+ ctx->resprimal = resprimal;
+ ctx->resdual = resdual;
+ ctx->product = product;
+ ctx->unorm = unorm;
+ ctx->qnorm = qnorm;
+
+ return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+ int i, off, size, err;
+
+ if (sched) return;
+
+ tnv_ctx.dimX = dimX;
+ tnv_ctx.dimY = dimY;
+ tnv_ctx.dimZ = dimZ;
+ // Padding actually seems to be slower
+// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0));
+ tnv_ctx.padZ = dimZ;
+
+ hw_sched_init();
+
+ int threads = hw_sched_get_cpu_count();
+ if (threads > dimY) threads = dimY/2;
+
+ int step = dimY / threads;
+ int extra = dimY % threads;
+
+ tnv_ctx.threads = threads;
+ tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+ for (i = 0, off = 0; i < threads; i++, off += size) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+ size = step + ((i < extra)?1:0);
+
+ ctx->offY = off;
+ ctx->stepY = size;
+
+ if (i == (threads-1)) ctx->copY = ctx->stepY;
+ else ctx->copY = ctx->stepY + 1;
+ }
+
+ sched = hw_sched_create(threads);
+ if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see
+ * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. epsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+ int err;
+ int iter;
+ int i,j,k,l,m;
+
+ lambda = 1.0f/(2.0f*lambda);
+ tnv_ctx.lambda = lambda;
+
+ // PDHG algorithm parameters
+ float tau = 0.5f;
+ float sigma = 0.5f;
+ float theta = 1.0f;
+
+ // Backtracking parameters
+ float s = 1.0f;
+ float gamma = 0.75f;
+ float beta = 0.95f;
+ float alpha0 = 0.2f;
+ float alpha = alpha0;
+ float delta = 1.5f;
+ float eta = 0.95f;
+
+ TNV_CPU_init(InputT, uT, dimX, dimY, dimZ);
+
+ tnv_ctx.InputT = InputT;
+ tnv_ctx.uT = uT;
+
+ int padZ = tnv_ctx.padZ;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); }
+
+
+ // Apply Primal-Dual Hybrid Gradient scheme
+ float residual = fLarge;
+ for(iter = 0; iter < maxIter; iter++) {
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float divtau = 1.0f / tau;
+
+ tnv_ctx.sigma = sigma;
+ tnv_ctx.tau = tau;
+ tnv_ctx.theta = theta;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); }
+
+ // border regions
+ for (j = 1; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1);
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+
+ m = ctx0->stepY * dimX * padZ;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ float div_upd_add = ctx0->div_upd[m + l];
+ ctx->div_upd[l] += div_upd_add;
+ ctx0->div_upd[m + l] = ctx->div_upd[l];
+ //ctx0->u_upd[m + l] = ctx->u_upd[l];
+
+ float divdiff = ctx->div[l] - ctx->div_upd[l]; // Multiple steps... How we compute without history?
+ float udiff = ctx->u[l] - ctx->u_upd[l];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
+
+ for (j = 0; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+ resprimal += ctx->resprimal;
+ resdual += ctx->resdual;
+ product += ctx->product;
+ unorm += ctx->unorm;
+ qnorm += ctx->qnorm;
+ }
+
+ residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+ float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+ float dual_dot_delta = resdual * s * delta;
+ float dual_div_delta = (resdual * s) / delta;
+ printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+ if(b > 1) {
+ // Decrease step-sizes to fit balancing principle
+ tau = (beta * tau) / b;
+ sigma = (beta * sigma) / b;
+ alpha = alpha0;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+ } else {
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+ if(resprimal > dual_dot_delta) {
+ // Increase primal step-size and decrease dual step-size
+ tau = tau / (1.0f - alpha);
+ sigma = sigma * (1.0f - alpha);
+ alpha = alpha * eta;
+ } else if(resprimal < dual_div_delta) {
+ // Decrease primal step-size and increase dual step-size
+ tau = tau * (1.0f - alpha);
+ sigma = sigma / (1.0f - alpha);
+ alpha = alpha * eta;
+ }
+ }
+
+ if (residual < tol) break;
+ }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+ printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+ printf("Return: %f\n", *uT);
+
+ return *uT;
+}
diff --git a/src/Core/regularisers_CPU/hw_sched.c b/src/Core/regularisers_CPU/hw_sched.c
new file mode 100644
index 0000000..be895cc
--- /dev/null
+++ b/src/Core/regularisers_CPU/hw_sched.c
@@ -0,0 +1,396 @@
+/*
+ * The PyHST program is Copyright (C) 2002-2011 of the
+ * European Synchrotron Radiation Facility (ESRF) and
+ * Karlsruhe Institute of Technology (KIT).
+ *
+ * PyHST is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * hst is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef HW_HAVE_SCHED_HEADERS
+# include <sys/types.h>
+# include <unistd.h>
+# include <sched.h>
+#endif /* HW_HAVE_SCHED_HEADERS */
+
+#include "hw_sched.h"
+
+
+#ifdef HW_USE_THREADS
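+/* GLib-based locking helpers: MUTEX_INIT(ctx, name) allocates ctx->name##_mutex,
+ * and COND_INIT(ctx, name) allocates ctx->name##_cond together with its guarding
+ * ctx->name##_cond_mutex; each sets err on failure so the caller can unwind. */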
+# define MUTEX_INIT(ctx, name) \
+ if (!err) { \
+ ctx->name##_mutex = g_mutex_new(); \
+ if (!ctx->name##_mutex) err = 1; \
+ }
+
+# define MUTEX_FREE(ctx, name) \
+ if (ctx->name##_mutex) g_mutex_free(ctx->name##_mutex);
+
+# define COND_INIT(ctx, name) \
+ MUTEX_INIT(ctx, name##_cond) \
+ if (!err) { \
+ ctx->name##_cond = g_cond_new(); \
+ if (!ctx->name##_cond) { \
+ err = 1; \
+ MUTEX_FREE(ctx, name##_cond) \
+ } \
+ }
+
+# define COND_FREE(ctx, name) \
+ if (ctx->name##_cond) g_cond_free(ctx->name##_cond); \
+ MUTEX_FREE(ctx, name##_cond)
+#else /* HW_USE_THREADS */
+# define MUTEX_INIT(ctx, name)
+# define MUTEX_FREE(ctx, name)
+# define COND_INIT(ctx, name)
+# define COND_FREE(ctx, name)
+#endif /* HW_USE_THREADS */
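+/* For illustration, with HW_USE_THREADS defined MUTEX_INIT(ctx, sync) expands to
+ *
+ *   if (!err) {
+ *       ctx->sync_mutex = g_mutex_new();
+ *       if (!ctx->sync_mutex) err = 1;
+ *   }
+ *
+ * and COND_INIT(ctx, job) first creates ctx->job_cond_mutex the same way, then
+ * allocates ctx->job_cond with g_cond_new(), releasing the mutex on failure. */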
+
+
+HWRunFunction ppu_run[] = {
+ (HWRunFunction)NULL
+};
+
+static int hw_sched_initialized = 0;
+
+int hw_sched_init(void) {
+ if (!hw_sched_initialized) {
+#ifdef HW_USE_THREADS
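+ /* g_thread_init() is deprecated and a no-op since GLib 2.32; kept for older GLib versions. */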
+ g_thread_init(NULL);
+#endif /* HW_USE_THREADS */
+ hw_sched_initialized = 1;
+ }
+
+ return 0;
+}
+
+
+int hw_sched_get_cpu_count(void) {
+#ifdef HW_HAVE_SCHED_HEADERS
+ int err;
+
+ int cpu_count;
+ cpu_set_t mask;
+
+ err = sched_getaffinity(getpid(), sizeof(mask), &mask);
+ if (err) return 1;
+
+# ifdef CPU_COUNT
+ cpu_count = CPU_COUNT(&mask);
+# else
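+ /* Fallback without CPU_COUNT(): counts only CPUs set contiguously from 0, which matches a default (unrestricted) affinity mask. */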
+ for (cpu_count = 0; cpu_count < CPU_SETSIZE; cpu_count++) {
+ if (!CPU_ISSET(cpu_count, &mask)) break;
+ }
+# endif
+
+ if (!cpu_count) cpu_count = 1;
+ return cpu_count;
+#else /* HW_HAVE_SCHED_HEADERS */
+ return 1;
+#endif /* HW_HAVE_SCHED_HEADERS */
+}
+
+
+HWSched hw_sched_create(int cpu_count) {
+ int i;
+ int err = 0;
+
+ HWSched ctx;
+
+ //hw_sched_init();
+
+ ctx = (HWSched)malloc(sizeof(HWSchedS));
+ if (!ctx) return NULL;
+
+ memset(ctx, 0, sizeof(HWSchedS));
+
+ ctx->status = 1;
+
+ MUTEX_INIT(ctx, sync);
+ MUTEX_INIT(ctx, data);
+ COND_INIT(ctx, compl);
+ COND_INIT(ctx, job);
+
+ if (err) {
+ hw_sched_destroy(ctx);
+ return NULL;
+ }
+
+ if (!cpu_count) cpu_count = hw_sched_get_cpu_count();
+ if (cpu_count > HW_MAX_THREADS) cpu_count = HW_MAX_THREADS;
+
+ ctx->n_threads = 0;
+ for (i = 0; i < cpu_count; i++) {
+ ctx->thread[ctx->n_threads] = hw_thread_create(ctx, ctx->n_threads, NULL, ppu_run, NULL);
+ if (ctx->thread[ctx->n_threads]) {
+#ifndef HW_USE_THREADS
+ ctx->thread[ctx->n_threads]->status = HW_THREAD_STATUS_STARTING;
+#endif /* HW_USE_THREADS */
+ ++ctx->n_threads;
+ }
+ }
+
+ if (!ctx->n_threads) {
+ hw_sched_destroy(ctx);
+ return NULL;
+ }
+
+ return ctx;
+}
+
+static int hw_sched_wait_threads(HWSched ctx) {
+#ifdef HW_USE_THREADS
+ int i = 0;
+
+ hw_sched_lock(ctx, compl_cond);
+ while (i < ctx->n_threads) {
+ for (; i < ctx->n_threads; i++) {
+ if (ctx->thread[i]->status == HW_THREAD_STATUS_INIT) {
+ hw_sched_wait(ctx, compl);
+ break;
+ }
+ }
+
+ }
+ hw_sched_unlock(ctx, compl_cond);
+#endif /* HW_USE_THREADS */
+
+ ctx->started = 1;
+
+ return 0;
+}
+
+void hw_sched_destroy(HWSched ctx) {
+ int i;
+
+ if (ctx->n_threads > 0) {
+ if (!ctx->started) {
+ hw_sched_wait_threads(ctx);
+ }
+
+ ctx->status = 0;
+ hw_sched_lock(ctx, job_cond);
+ hw_sched_broadcast(ctx, job);
+ hw_sched_unlock(ctx, job_cond);
+
+ for (i = 0; i < ctx->n_threads; i++) {
+ hw_thread_destroy(ctx->thread[i]);
+ }
+ }
+
+ COND_FREE(ctx, job);
+ COND_FREE(ctx, compl);
+ MUTEX_FREE(ctx, data);
+ MUTEX_FREE(ctx, sync);
+
+ free(ctx);
+}
+
+int hw_sched_set_sequential_mode(HWSched ctx, int *n_blocks, int *cur_block, HWSchedFlags flags) {
+ ctx->mode = HW_SCHED_MODE_SEQUENTIAL;
+ ctx->n_blocks = n_blocks;
+ ctx->cur_block = cur_block;
+ ctx->flags = flags;
+
+ return 0;
+}
+
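+/* Returns the next chunk for the calling thread, or one of the negative
+ * HWSchedChunk codes: HW_SCHED_CHUNK_INIT / HW_SCHED_CHUNK_FREE are per-thread
+ * pre/post calls (enabled via HW_SCHED_FLAG_INIT_CALL / HW_SCHED_FLAG_FREE_CALL),
+ * HW_SCHED_CHUNK_TERMINATOR goes to exactly one thread once all others are
+ * done, and HW_SCHED_CHUNK_INVALID means no work is left. */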
+int hw_sched_get_chunk(HWSched ctx, int thread_id) {
+ int block;
+
+ switch (ctx->mode) {
+ case HW_SCHED_MODE_PREALLOCATED:
+ if (ctx->thread[thread_id]->status == HW_THREAD_STATUS_STARTING) {
+#ifndef HW_USE_THREADS
+ ctx->thread[thread_id]->status = HW_THREAD_STATUS_DONE;
+#endif /* HW_USE_THREADS */
+ return thread_id;
+ } else {
+ return HW_SCHED_CHUNK_INVALID;
+ }
+ case HW_SCHED_MODE_SEQUENTIAL:
+ if ((ctx->flags&HW_SCHED_FLAG_INIT_CALL)&&(ctx->thread[thread_id]->status == HW_THREAD_STATUS_STARTING)) {
+ return HW_SCHED_CHUNK_INIT;
+ }
+ hw_sched_lock(ctx, data);
+ block = *ctx->cur_block;
+ if (block < *ctx->n_blocks) {
+ *ctx->cur_block = *ctx->cur_block + 1;
+ } else {
+ block = HW_SCHED_CHUNK_INVALID;
+ }
+ hw_sched_unlock(ctx, data);
+ if (block == HW_SCHED_CHUNK_INVALID) {
+ if (((ctx->flags&HW_SCHED_FLAG_FREE_CALL)&&(ctx->thread[thread_id]->status == HW_THREAD_STATUS_RUNNING))) {
+ ctx->thread[thread_id]->status = HW_THREAD_STATUS_FINISHING;
+ return HW_SCHED_CHUNK_FREE;
+ }
+ if ((ctx->flags&HW_SCHED_FLAG_TERMINATOR_CALL)&&((ctx->thread[thread_id]->status == HW_THREAD_STATUS_RUNNING)||(ctx->thread[thread_id]->status == HW_THREAD_STATUS_FINISHING))) {
+ int i;
+ hw_sched_lock(ctx, data);
+ for (i = 0; i < ctx->n_threads; i++) {
+ if (thread_id == i) continue;
+ if ((ctx->thread[i]->status != HW_THREAD_STATUS_DONE)&&(ctx->thread[i]->status != HW_THREAD_STATUS_FINISHING2)&&(ctx->thread[i]->status != HW_THREAD_STATUS_IDLE)) {
+ break;
+ }
+ }
+ ctx->thread[thread_id]->status = HW_THREAD_STATUS_FINISHING2;
+ hw_sched_unlock(ctx, data);
+ if (i == ctx->n_threads) {
+ return HW_SCHED_CHUNK_TERMINATOR;
+ }
+ }
+ }
+ return block;
+ default:
+ return HW_SCHED_CHUNK_INVALID;
+ }
+
+ return -1;
+}
+
+
+int hw_sched_schedule_task(HWSched ctx, void *appctx, HWEntry entry) {
+#ifdef HW_USE_THREADS
+ if (!ctx->started) {
+ hw_sched_wait_threads(ctx);
+ }
+#else /* HW_USE_THREADS */
+ int err;
+ int i, chunk_id, n_threads;
+ HWRunFunction run;
+ HWThread thrctx;
+#endif /* HW_USE_THREADS */
+
+ ctx->ctx = appctx;
+ ctx->entry = entry;
+
+ switch (ctx->mode) {
+ case HW_SCHED_MODE_SEQUENTIAL:
+ *ctx->cur_block = 0;
+ break;
+ default:
+ ;
+ }
+
+#ifdef HW_USE_THREADS
+ hw_sched_lock(ctx, compl_cond);
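+ /* Intentionally left locked: hw_sched_wait_task() releases compl_cond after all threads report HW_THREAD_STATUS_DONE. */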
+
+ hw_sched_lock(ctx, job_cond);
+ hw_sched_broadcast(ctx, job);
+ hw_sched_unlock(ctx, job_cond);
+#else /* HW_USE_THREADS */
+ n_threads = ctx->n_threads;
+
+ for (i = 0; i < n_threads; i++) {
+ thrctx = ctx->thread[i];
+ thrctx->err = 0;
+ }
+
+ i = 0;
+ thrctx = ctx->thread[i];
+ chunk_id = hw_sched_get_chunk(ctx, thrctx->thread_id);
+
+ while (chunk_id >= 0) {
+ run = hw_run_entry(thrctx->runs, entry);
+ err = run(thrctx, thrctx->hwctx, chunk_id, appctx);
+ if (err) {
+ thrctx->err = err;
+ break;
+ }
+
+ if ((++i) == n_threads) i = 0;
+ thrctx = ctx->thread[i];
+ chunk_id = hw_sched_get_chunk(ctx, thrctx->thread_id);
+ }
+#endif /* HW_USE_THREADS */
+
+ return 0;
+}
+
+int hw_sched_wait_task(HWSched ctx) {
+ int err = 0;
+ int i = 0, n_threads = ctx->n_threads;
+
+#ifdef HW_USE_THREADS
+ while (i < ctx->n_threads) {
+ for (; i < ctx->n_threads; i++) {
+ if (ctx->thread[i]->status == HW_THREAD_STATUS_DONE) {
+ ctx->thread[i]->status = HW_THREAD_STATUS_IDLE;
+ } else {
+ hw_sched_wait(ctx, compl);
+ break;
+ }
+ }
+
+ }
+
+ hw_sched_unlock(ctx, compl_cond);
+#endif /* HW_USE_THREADS */
+
+ for (i = 0; i < n_threads; i++) {
+ HWThread thrctx = ctx->thread[i];
+ if (thrctx->err) return err = thrctx->err;
+
+#ifndef HW_USE_THREADS
+ thrctx->status = HW_THREAD_STATUS_IDLE;
+#endif /* HW_USE_THREADS */
+ }
+
+ return err;
+}
+
+int hw_sched_execute_task(HWSched ctx, void *appctx, HWEntry entry) {
+ int err;
+
+ err = hw_sched_schedule_task(ctx, appctx, entry);
+ if (err) return err;
+
+ return hw_sched_wait_task(ctx);
+}
+
+int hw_sched_schedule_thread_task(HWSched ctx, void *appctx, HWEntry entry) {
+ int err;
+
+ ctx->saved_mode = ctx->mode;
+ ctx->mode = HW_SCHED_MODE_PREALLOCATED;
+ err = hw_sched_schedule_task(ctx, appctx, entry);
+
+ return err;
+}
+
+
+int hw_sched_wait_thread_task(HWSched ctx) {
+ int err;
+
+ err = hw_sched_wait_task(ctx);
+ ctx->mode = ctx->saved_mode;
+
+ return err;
+}
+
+int hw_sched_execute_thread_task(HWSched ctx, void *appctx, HWEntry entry) {
+ int err;
+ int saved_mode = ctx->mode;
+
+ ctx->mode = HW_SCHED_MODE_PREALLOCATED;
+ err = hw_sched_execute_task(ctx, appctx, entry);
+ ctx->mode = saved_mode;
+
+ return err;
+}
diff --git a/src/Core/regularisers_CPU/hw_sched.h b/src/Core/regularisers_CPU/hw_sched.h
new file mode 100644
index 0000000..c5d1285
--- /dev/null
+++ b/src/Core/regularisers_CPU/hw_sched.h
@@ -0,0 +1,151 @@
+/*
+ * The PyHST program is Copyright (C) 2002-2011 of the
+ * European Synchrotron Radiation Facility (ESRF) and
+ * Karlsruhe Institute of Technology (KIT).
+ *
+ * PyHST is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * hst is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _HW_SCHED_H
+#define _HW_SCHED_H
+
+#include <glib.h>
+
+// Enable CPU-affinity detection (sched.h) and GLib-based threading
+#define HW_HAVE_SCHED_HEADERS
+#define HW_USE_THREADS
+
+
+typedef struct HWSchedT *HWSched;
+#ifdef HW_USE_THREADS
+typedef GMutex *HWMutex;
+#else /* HW_USE_THREADS */
+typedef void *HWMutex;
+#endif /* HW_USE_THREADS */
+
+
+#include "hw_thread.h"
+
+enum HWSchedModeT {
+ HW_SCHED_MODE_PREALLOCATED = 0,
+ HW_SCHED_MODE_SEQUENTIAL
+};
+typedef enum HWSchedModeT HWSchedMode;
+
+enum HWSchedChunkT {
+ HW_SCHED_CHUNK_INVALID = -1,
+ HW_SCHED_CHUNK_INIT = -2,
+ HW_SCHED_CHUNK_FREE = -3,
+ HW_SCHED_CHUNK_TERMINATOR = -4
+};
+typedef enum HWSchedChunkT HWSchedChunk;
+
+enum HWSchedFlagsT {
+ HW_SCHED_FLAG_INIT_CALL = 1, //! Executed in each thread before real chunks
+ HW_SCHED_FLAG_FREE_CALL = 2, //! Executed in each thread after real chunks
+ HW_SCHED_FLAG_TERMINATOR_CALL = 4 //! Executed in one of the threads after all threads are done
+};
+typedef enum HWSchedFlagsT HWSchedFlags;
+
+
+#define HW_SINGLE_MODE
+//#define HW_DETECT_CPU_CORES
+#define HW_MAX_THREADS 128
+
+#ifdef HW_SINGLE_MODE
+ typedef HWRunFunction HWEntry;
+# define hw_run_entry(runs, entry) entry
+#else /* HW_SINGLE_MODE */
+ typedef int HWEntry;
+# define hw_run_entry(runs, entry) runs[entry]
+#endif /* HW_SINGLE_MODE */
+
+#ifndef HW_HIDE_DETAILS
+struct HWSchedT {
+ int status;
+ int started;
+
+ int n_threads;
+ HWThread thread[HW_MAX_THREADS];
+
+#ifdef HW_USE_THREADS
+ GCond *job_cond, *compl_cond;
+ GMutex *job_cond_mutex, *compl_cond_mutex, *data_mutex;
+ GMutex *sync_mutex;
+#endif /* HW_USE_THREADS */
+
+ HWSchedMode mode;
+ HWSchedMode saved_mode;
+ HWSchedFlags flags;
+ int *n_blocks;
+ int *cur_block;
+
+ HWEntry entry;
+ void *ctx;
+};
+typedef struct HWSchedT HWSchedS;
+#endif /* HW_HIDE_DETAILS */
+
+# ifdef __cplusplus
+extern "C" {
+# endif
+
+HWSched hw_sched_create(int ppu_count);
+int hw_sched_init(void);
+void hw_sched_destroy(HWSched ctx);
+int hw_sched_get_cpu_count(void);
+
+int hw_sched_set_sequential_mode(HWSched ctx, int *n_blocks, int *cur_block, HWSchedFlags flags);
+int hw_sched_get_chunk(HWSched ctx, int thread_id);
+int hw_sched_schedule_task(HWSched ctx, void *appctx, HWEntry entry);
+int hw_sched_wait_task(HWSched ctx);
+int hw_sched_execute_task(HWSched ctx, void *appctx, HWEntry entry);
+
+int hw_sched_schedule_thread_task(HWSched ctx, void *appctx, HWEntry entry);
+int hw_sched_wait_thread_task(HWSched ctx);
+int hw_sched_execute_thread_task(HWSched ctx, void *appctx, HWEntry entry);
+
+HWMutex hw_sched_create_mutex(void);
+void hw_sched_destroy_mutex(HWMutex ctx);
+
+#ifdef HW_USE_THREADS
+# define hw_sched_lock(ctx, type) g_mutex_lock(ctx->type##_mutex)
+# define hw_sched_unlock(ctx, type) g_mutex_unlock(ctx->type##_mutex)
+# define hw_sched_broadcast(ctx, type) g_cond_broadcast(ctx->type##_cond)
+# define hw_sched_signal(ctx, type) g_cond_signal(ctx->type##_cond)
+# define hw_sched_wait(ctx, type) g_cond_wait(ctx->type##_cond, ctx->type##_cond_mutex)
+
+#define hw_sched_create_mutex() g_mutex_new()
+#define hw_sched_destroy_mutex(ctx) g_mutex_free(ctx)
+#define hw_sched_lock_mutex(ctx) g_mutex_lock(ctx)
+#define hw_sched_unlock_mutex(ctx) g_mutex_unlock(ctx)
+#else /* HW_USE_THREADS */
+# define hw_sched_lock(ctx, type)
+# define hw_sched_unlock(ctx, type)
+# define hw_sched_broadcast(ctx, type)
+# define hw_sched_signal(ctx, type)
+# define hw_sched_wait(ctx, type)
+
+#define hw_sched_create_mutex() NULL
+#define hw_sched_destroy_mutex(ctx)
+#define hw_sched_lock_mutex(ctx)
+#define hw_sched_unlock_mutex(ctx)
+#endif /* HW_USE_THREADS */
+
+# ifdef __cplusplus
+}
+# endif
+
+#endif /* _HW_SCHED_H */
+
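Taken together, hw_sched.h exposes two dispatch modes: HW_SCHED_MODE_PREALLOCATED hands every thread exactly one chunk (its own thread id), while HW_SCHED_MODE_SEQUENTIAL drains a shared block counter across all threads. A minimal usage sketch follows; my_run, my_ctx_t, and the toy data are hypothetical illustrations, not part of this patch:

    #include <stdio.h>
    #include "hw_sched.h"

    typedef struct {          /* hypothetical application context */
        float *data;
        int n_blocks;
    } my_ctx_t;

    /* Signature matches HWRunFunction from hw_thread.h; 'attr' receives the
       appctx pointer given to the schedule call. */
    static int my_run(HWThread thr, void *hwctx, int block, void *attr) {
        my_ctx_t *app = (my_ctx_t*)attr;
        if (block < 0 || block >= app->n_blocks) return 0; /* skip special/out-of-range chunks */
        app->data[block] += 1.0f;  /* stand-in for real per-block work */
        return 0;                  /* a non-zero return aborts the task */
    }

    int main(void) {
        float data[16] = {0};
        my_ctx_t app = { data, 16 };
        int cur_block = 0;

        HWSched sched = hw_sched_create(0);   /* 0 = autodetect CPU count */
        if (!sched) return 1;

        /* Preallocated mode: each thread runs once, chunk id == thread id */
        hw_sched_execute_thread_task(sched, &app, my_run);

        /* Sequential mode: all threads drain the shared block counter */
        hw_sched_set_sequential_mode(sched, &app.n_blocks, &cur_block, 0);
        hw_sched_execute_task(sched, &app, my_run);

        hw_sched_destroy(sched);
        printf("data[0] = %f\n", data[0]);
        return 0;
    }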
diff --git a/src/Core/regularisers_CPU/hw_thread.c b/src/Core/regularisers_CPU/hw_thread.c
new file mode 100644
index 0000000..41b1800
--- /dev/null
+++ b/src/Core/regularisers_CPU/hw_thread.c
@@ -0,0 +1,141 @@
+/*
+ * The PyHST program is Copyright (C) 2002-2011 of the
+ * European Synchrotron Radiation Facility (ESRF) and
+ * Karlsruhe Institute of Technology (KIT).
+ *
+ * PyHST is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * hst is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "hw_sched.h"
+#include "hw_thread.h"
+
+#ifdef HW_USE_THREADS
+static void *hw_thread_function(HWThread ctx) {
+ int err;
+ int chunk_id;
+
+ HWRunFunction *runs;
+ HWRunFunction run;
+ HWSched sched;
+ void *hwctx;
+
+ sched = ctx->sched;
+ runs = ctx->run;
+ hwctx = ctx->hwctx;
+
+ hw_sched_lock(sched, job_cond);
+
+ hw_sched_lock(sched, compl_cond);
+ ctx->status = HW_THREAD_STATUS_IDLE;
+ hw_sched_broadcast(sched, compl);
+ hw_sched_unlock(sched, compl_cond);
+
+ while (sched->status) {
+ hw_sched_wait(sched, job);
+ if (!sched->status) break;
+
+ ctx->err = 0;
+ ctx->status = HW_THREAD_STATUS_STARTING;
+ hw_sched_unlock(sched, job_cond);
+
+ run = hw_run_entry(runs, sched->entry);
+#if 0
+ // Offset to interleave transfers if the GPUBox is used
+ // Just check the togpu time with CUDA_LAUNCH_BLOCKED and put it here.
+ // It should still be significantly less than the BP time.
+ // We could do this calibration during initialization in the future.
+
+ usleep(12000 * ctx->thread_id);
+#endif
+ chunk_id = hw_sched_get_chunk(sched, ctx->thread_id);
+
+ /* Should be after get_chunk, since we can check if it's first time */
+ ctx->status = HW_THREAD_STATUS_RUNNING;
+ while (chunk_id != HW_SCHED_CHUNK_INVALID) {
+ //printf("Thread %i processing slice %i\n", ctx->thread_id, chunk_id);
+ err = run(ctx, hwctx, chunk_id, sched->ctx);
+ if (err) {
+ ctx->err = err;
+ break;
+ }
+ chunk_id = hw_sched_get_chunk(sched, ctx->thread_id);
+ }
+
+ hw_sched_lock(sched, job_cond);
+
+ hw_sched_lock(sched, compl_cond);
+ ctx->status = HW_THREAD_STATUS_DONE;
+ hw_sched_broadcast(sched, compl);
+ hw_sched_unlock(sched, compl_cond);
+ }
+
+ hw_sched_unlock(sched, job_cond);
+
+ g_thread_exit(NULL);
+ return NULL; /* TODO: check this */
+}
+#endif /* HW_USE_THREADS */
+
+
+HWThread hw_thread_create(HWSched sched, int thread_id, void *hwctx, HWRunFunction *run_func, HWFreeFunction free_func) {
+ GError *err = NULL; /* GLib requires the error location to start out NULL */
+
+ HWThread ctx;
+
+ ctx = (HWThread)malloc(sizeof(HWThreadS));
+ if (!ctx) return ctx;
+
+ memset(ctx, 0, sizeof(HWThreadS));
+
+ ctx->sched = sched;
+ ctx->hwctx = hwctx;
+ ctx->run = run_func;
+ ctx->free = free_func;
+ ctx->thread_id = thread_id;
+ ctx->status = HW_THREAD_STATUS_INIT;
+
+#ifdef HW_USE_THREADS
+ ctx->thread = g_thread_create((GThreadFunc)hw_thread_function, ctx, 1, &err);
+ if (!ctx->thread) {
+ g_error_free(err);
+
+ hw_thread_destroy(ctx);
+ return NULL;
+ }
+#endif /* HW_USE_THREADS */
+
+ return ctx;
+}
+
+void hw_thread_destroy(HWThread ctx) {
+#ifdef HW_USE_THREADS
+ if (ctx->thread) {
+ g_thread_join(ctx->thread);
+ }
+#endif /* HW_USE_THREADS */
+
+ if (ctx->data) {
+ free(ctx->data);
+ }
+
+ if (ctx->free) {
+ ctx->free(ctx->hwctx);
+ }
+
+ free(ctx);
+}
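Traced from hw_thread_function above, each worker moves through the following states:

    INIT -> IDLE         thread is up; the compl broadcast unblocks hw_sched_wait_threads
    IDLE -> STARTING     a job broadcast was received
    STARTING -> RUNNING  after the first hw_sched_get_chunk call
    RUNNING -> DONE      no chunks left; the compl broadcast wakes hw_sched_wait_task
    DONE -> IDLE         hw_sched_wait_task acknowledges completion

with HW_THREAD_STATUS_FINISHING / FINISHING2 interposed before DONE when the FREE/TERMINATOR flags are active.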
diff --git a/src/Core/regularisers_CPU/hw_thread.h b/src/Core/regularisers_CPU/hw_thread.h
new file mode 100644
index 0000000..de7f60f
--- /dev/null
+++ b/src/Core/regularisers_CPU/hw_thread.h
@@ -0,0 +1,76 @@
+/*
+ * The PyHST program is Copyright (C) 2002-2011 of the
+ * European Synchrotron Radiation Facility (ESRF) and
+ * Karlsruhe Institute of Technology (KIT).
+ *
+ * PyHST is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * hst is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _HW_THREAD_H
+#define _HW_THREAD_H
+
+#include <glib.h>
+
+typedef struct HWThreadT *HWThread;
+typedef int (*HWRunFunction)(HWThread thread, void *ctx, int block, void *attr);
+typedef int (*HWFreeFunction)(void *ctx);
+
+#include "hw_sched.h"
+
+enum HWThreadStatusT {
+ HW_THREAD_STATUS_IDLE = 0,
+ HW_THREAD_STATUS_STARTING = 1,
+ HW_THREAD_STATUS_RUNNING = 2,
+ HW_THREAD_STATUS_FINISHING = 3,
+ HW_THREAD_STATUS_FINISHING2 = 4,
+ HW_THREAD_STATUS_DONE = 5,
+ HW_THREAD_STATUS_INIT = 6
+};
+typedef enum HWThreadStatusT HWThreadStatus;
+
+
+#ifndef HW_HIDE_DETAILS
+struct HWThreadT {
+ int thread_id;
+ HWSched sched;
+
+#ifdef HW_USE_THREADS
+ GThread *thread;
+#endif /* HW_USE_THREADS */
+
+ void *hwctx;
+ HWRunFunction *run;
+ HWFreeFunction free;
+
+ int err;
+ HWThreadStatus status;
+
+ void *data; /**< Per-thread data storage, will be free'd if set */
+};
+typedef struct HWThreadT HWThreadS;
+#endif /* HW_HIDE_DETAILS */
+
+# ifdef __cplusplus
+extern "C" {
+# endif
+
+HWThread hw_thread_create(HWSched sched, int thread_id, void *hwctx, HWRunFunction *run_func, HWFreeFunction free_func);
+void hw_thread_destroy(HWThread ctx);
+
+# ifdef __cplusplus
+}
+# endif
+
+
+#endif /* _HW_THREAD_H */