diff options
| author | Suren A. Chilingaryan <csa@suren.me> | 2020-03-27 20:15:30 +0100 | 
|---|---|---|
| committer | Suren A. Chilingaryan <csa@suren.me> | 2020-03-27 20:15:30 +0100 | 
| commit | 324411a3e033c7f05b77e3bff829b55d674438da (patch) | |
| tree | 1988ded680ccdecc2dd4d4a4b6ee6923dfb3dfd6 | |
| parent | 17727d5938b0c82483c0375b33171645533b4fa5 (diff) | |
| download | regularization-324411a3e033c7f05b77e3bff829b55d674438da.tar.gz regularization-324411a3e033c7f05b77e3bff829b55d674438da.tar.bz2 regularization-324411a3e033c7f05b77e3bff829b55d674438da.tar.xz regularization-324411a3e033c7f05b77e3bff829b55d674438da.zip | |
Optimized TNV routine (10x performance, 1/3 memcory consumption)
| -rw-r--r-- | CMakeLists.txt | 1 | ||||
| -rw-r--r-- | cmake/FindGLIB2.cmake | 54 | ||||
| -rw-r--r-- | src/Core/CMakeLists.txt | 19 | ||||
| -rwxr-xr-x | src/Core/regularisers_CPU/TNV_core.c | 948 | ||||
| -rwxr-xr-x | src/Core/regularisers_CPU/TNV_core_backtrack.c | 695 | ||||
| -rw-r--r-- | src/Core/regularisers_CPU/hw_sched.c | 396 | ||||
| -rw-r--r-- | src/Core/regularisers_CPU/hw_sched.h | 151 | ||||
| -rw-r--r-- | src/Core/regularisers_CPU/hw_thread.c | 141 | ||||
| -rw-r--r-- | src/Core/regularisers_CPU/hw_thread.h | 76 | 
9 files changed, 2097 insertions, 384 deletions
| diff --git a/CMakeLists.txt b/CMakeLists.txt index 043f13c..afbff2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@  #   limitations under the License.  cmake_minimum_required (VERSION 3.0) +set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})  project(RGL)  #https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py diff --git a/cmake/FindGLIB2.cmake b/cmake/FindGLIB2.cmake new file mode 100644 index 0000000..2940d9f --- /dev/null +++ b/cmake/FindGLIB2.cmake @@ -0,0 +1,54 @@ +# - Try to find the GLIB2 libraries +# Once done this will define +# +#  GLIB2_FOUND - system has glib2 +#  GLIB2_DIR - path to the glib2 base directory +#  GLIB2_INCLUDE_DIR - the glib2 include directory +#  GLIB2_LIBRARIES - glib2 library + +set(GLIB2_DIR GLIB2_DIR-NOTFOUND CACHE PATH "Location of GLIB2 package") + +if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES) +    # Already in cache, be silent +    set(GLIB2_FIND_QUIETLY TRUE) +endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES) + +if (GLIB2_DIR) +    set(PKG_GLIB_LIBRARY_DIRS ${GLIB2_DIR}/lib${CMAKE_BUILD_ARCH} ${GLIB2_DIR}/lib) +    set(PKG_GLIB_INCLUDE_DIRS ${GLIB2_DIR}/include/) +else (GLIB2_DIR) +    if (NOT WIN32) +	find_package(PkgConfig REQUIRED) +	pkg_check_modules(PKG_GLIB REQUIRED glib-2.0) +    endif(NOT WIN32) +endif (GLIB2_DIR) + +find_path(GLIB2_MAIN_INCLUDE_DIR glib.h +         PATH_SUFFIXES glib-2.0 +         PATHS ${PKG_GLIB_INCLUDE_DIRS} ) + +# search the glibconfig.h include dir under the same root where the library is found +find_library(GLIB2_LIBRARIES +             NAMES glib-2.0 +             PATHS ${PKG_GLIB_LIBRARY_DIRS} ) + +find_library(GTHREAD2_LIBRARIES +             NAMES gthread-2.0 +             PATHS ${PKG_GLIB_LIBRARY_DIRS} ) + +find_path(GLIB2_INTERNAL_INCLUDE_DIR glibconfig.h +          PATH_SUFFIXES glib-2.0/include +          PATHS ${PKG_GLIB_INCLUDE_DIRS} ${PKG_GLIB_LIBRARY_DIRS} ${CMAKE_SYSTEM_LIBRARY_PATH}) + +set(GLIB2_INCLUDE_DIR ${GLIB2_MAIN_INCLUDE_DIR}) + +# not sure if this include dir is optional or required +# for now it is optional +if(GLIB2_INTERNAL_INCLUDE_DIR) +  set(GLIB2_INCLUDE_DIR ${GLIB2_INCLUDE_DIR} ${GLIB2_INTERNAL_INCLUDE_DIR}) +endif(GLIB2_INTERNAL_INCLUDE_DIR) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GLIB2  DEFAULT_MSG  GLIB2_LIBRARIES GTHREAD2_LIBRARIES GLIB2_MAIN_INCLUDE_DIR) + +mark_as_advanced(GLIB2_INCLUDE_DIR GLIB2_LIBRARIES GTHREAD2_LIBRARIES GLIB2_INTERNAL_INCLUDE_DIR GLIB2_MAIN_INCLUDE_DIR) diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 4e43b5e..10b16f3 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -15,6 +15,7 @@ message("CIL_VERSION ${CIL_VERSION}")  ## Build the regularisers package as a library  message("Creating Regularisers as a shared library") +find_package(GLIB2 REQUIRED)  find_package(OpenMP REQUIRED)  if (OPENMP_FOUND)      set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") @@ -24,8 +25,8 @@ if (OPENMP_FOUND)     set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")  endif() -message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") -message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS}") +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") +message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS} -fno-omit-frame-pointer")  message("CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}")  message("CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}")  message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}") @@ -58,10 +59,17 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")  ## Build the regularisers package as a library  message("Adding regularisers as a shared library") -#set(CMAKE_C_COMPILER /apps/pgi/linux86-64/17.4/bin/pgcc) +#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=2 -qopt-report-phase=vec -qopenmp") +set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -mavx -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") +  #set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp")  #set(CMAKE_C_FLAGS "-acc -Minfo -ta=multicore -openmp -fPIC")  add_library(cilreg SHARED +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_sched.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_thread.c  	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c  	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c              ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c @@ -78,8 +86,9 @@ add_library(cilreg SHARED  	    ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/Diffusion_Inpaint_core.c  	    ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/NonlocalMarching_Inpaint_core.c  	    ) -target_link_libraries(cilreg ${EXTRA_LIBRARIES} ) +target_link_libraries(cilreg ${EXTRA_LIBRARIES} ${GLIB2_LIBRARIES} ${GTHREAD2_LIBRARIES} )  include_directories(cilreg PUBLIC +                      ${GLIB2_INCLUDE_DIR}                        ${LIBRARY_INC}/include  					  ${CMAKE_CURRENT_SOURCE_DIR}  		              ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ @@ -105,7 +114,7 @@ endif()  # GPU Regularisers -if (BUILD_CUDA) +if (BUILD_CUDA1)      find_package(CUDA)      if (CUDA_FOUND)        set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES") diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c index 753cc5f..be7fdef 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -5,6 +5,12 @@   *   * Copyright 2017 Daniil Kazantsev   * Copyright 2017 Srikanth Nagella, Edoardo Pasca + *  + * Copyriht 2020 Suren A. Chlingaryan + * Optimized version with 1/3 of memory consumption and ~10x performance + * This version is not able to perform back-track except during first iterations + * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c)  + * could be executed instead. It still better than original with 1/2 of memory consumption and 4x performance gain   *   * Licensed under the Apache License, Version 2.0 (the "License");   * you may not use this file except in compliance with the License. @@ -19,6 +25,467 @@  #include "TNV_core.h" +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { +    int ii, num; +    float divsigma = 1.0f / sigma; +    float sum, shrinkfactor; +    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; +    float proj[2] = {0}; + +    // Compute eigenvalues of M +    T = M1 + M3; +    D = M1 * M3 - M2 * M2; +    det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); +    eig1 = MAX((T / 2.0f) + det, 0.0f); +    eig2 = MAX((T / 2.0f) - det, 0.0f); +    sig1 = sqrtf(eig1); +    sig2 = sqrtf(eig2); + +    // Compute normalized eigenvectors +    V1 = V2 = V3 = V4 = 0.0f; + +    if(M2 != 0.0f) +    { +        v0 = M2; +        v1 = eig1 - M3; +        v2 = eig2 - M3; + +        mu1 = sqrtf(v0 * v0 + v1 * v1); +        mu2 = sqrtf(v0 * v0 + v2 * v2); + +        if(mu1 > fTiny) +        { +            V1 = v1 / mu1; +            V3 = v0 / mu1; +        } + +        if(mu2 > fTiny) +        { +            V2 = v2 / mu2; +            V4 = v0 / mu2; +        } + +    } else +    { +        if(M1 > M3) +        { +            V1 = V4 = 1.0f; +            V2 = V3 = 0.0f; + +        } else +        { +            V1 = V4 = 0.0f; +            V2 = V3 = 1.0f; +        } +    } + +    // Compute prox_p of the diagonal entries +    sig1_upd = sig2_upd = 0.0f; + +    if(p == 1) +    { +        sig1_upd = MAX(sig1 - divsigma, 0.0f); +        sig2_upd = MAX(sig2 - divsigma, 0.0f); + +    } else if(p == INFNORM) +    { +        proj[0] = sigma * fabs(sig1); +        proj[1] = sigma * fabs(sig2); + +        /*l1 projection part */ +        sum = fLarge; +        num = 0l; +        shrinkfactor = 0.0f; +        while(sum > 1.0f) +        { +            sum = 0.0f; +            num = 0; + +            for(ii = 0; ii < 2; ii++) +            { +                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                sum += fabs(proj[ii]); +                if(proj[ii]!= 0.0f) +                    num++; +            } + +            if(num > 0) +                shrinkfactor = (sum - 1.0f) / num; +            else +                break; +        } +        /*l1 proj ends*/ + +        sig1_upd = sig1 - divsigma * proj[0]; +        sig2_upd = sig2 - divsigma * proj[1]; +    } + +    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +    if(sig1 > fTiny) +        sig1_upd /= sig1; + +    if(sig2 > fTiny) +        sig2_upd /= sig2; + +    // Compute solution +    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { +    int offY, stepY, copY; +    float *Input, *u, *qx, *qy, *gradx, *grady, *div; +    float *div0, *udiff0, *udiff; +    float resprimal, resdual; +    float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { +    int threads; +    tnv_thread_t *thr_ctx; +    float *InputT, *uT; +    int dimX, dimY, dimZ, padZ; +    float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    free(ctx->Input); +    free(ctx->u); +    free(ctx->qx); +    free(ctx->qy); +    free(ctx->gradx); +    free(ctx->grady); +    free(ctx->div); +     +    free(ctx->div0); +    free(ctx->udiff0); +    free(ctx->udiff); +     +    return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +     +//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*(stepY+1)*padZ); +    long DimRow = (long)(dimX * padZ); + +    // Auxiliar vectors +    ctx->Input = malloc(Dim1Total * sizeof(float)); +    ctx->u = malloc(Dim1Total * sizeof(float)); +    ctx->qx = malloc(DimTotal * sizeof(float)); +    ctx->qy = malloc(DimTotal * sizeof(float)); +    ctx->gradx = malloc(DimTotal * sizeof(float)); +    ctx->grady = malloc(DimTotal * sizeof(float)); +    ctx->div = malloc(Dim1Total * sizeof(float)); + +    ctx->div0 = malloc(DimRow * sizeof(float)); +    ctx->udiff0 = malloc(DimRow * sizeof(float)); +    ctx->udiff = malloc(DimRow * sizeof(float)); + +    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { +        fprintf(stderr, "Error allocating memory\n"); +        exit(-1); +    } + +    return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +     +//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(float)); +     +    for(k=0; k<dimZ; k++) { +        for(j=0; j<copY; j++) { +            for(i=0; i<dimX; i++) { +                ctx->Input[j * dimX * padZ + i * padZ + k] =  tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; +                ctx->u[j * dimX * padZ + i * padZ + k] =  tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; +            } +        } +    } + +    return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<stepY; j++) { +            for(i=0; i<dimX; i++) { +                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; +            } +        } +    } +     +    return 0; +} + + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(float)); + +    return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { +    long i, j, k, l, m; + +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    float *Input = ctx->Input; +    float *u = ctx->u; +    float *qx = ctx->qx; +    float *qy = ctx->qy; +    float *gradx = ctx->gradx; +    float *grady = ctx->grady; +    float *div = ctx->div; + +    long p = 1l; +    long q = 1l; +    long r = 0l; + +    float lambda = tnv_ctx->lambda; +    float sigma = tnv_ctx->sigma; +    float tau = tnv_ctx->tau; +    float theta = tnv_ctx->theta; +     +    float taulambda = tau * lambda; +    float divtau = 1.0f / tau; +    float divsigma = 1.0f / sigma; +    float theta1 = 1.0f + theta; +    float constant = 1.0f + taulambda; + +    float resprimal = 0.0f; +    float resdual = 0.0f; +    float product = 0.0f; +    float unorm = 0.0f; +    float qnorm = 0.0f; + +    float qxdiff; +    float qydiff; +    float divdiff; +    float gradxdiff[dimZ]; +    float gradydiff[dimZ]; +    float ubarx[dimZ]; +    float ubary[dimZ]; +    float udiff_next[dimZ]; + +    for(i=0; i < dimX; i++) { +        for(k = 0; k < dimZ; k++) {  +            int l = i * padZ + k; +            float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; +            float udiff = u[l] - u_upd; +            ctx->udiff[l] = udiff; +            ctx->udiff0[l] = udiff; +            ctx->div0[l] = div[l]; +            u[l] = u_upd; +        } +    } + +    for(j = 0; j < stepY; j++) { +        for(i = 0; i < dimX; i++) { +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; +            l = (j * dimX  + i) * padZ; +            m = dimX * padZ; +         +//#pragma unroll 64 +            for(k = 0; k < dimZ; k++) { +                float u_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; +                udiff_next[k] = u[l + k + m] - u_upd; +                u[l + k + m] = u_upd; + +                float gradx_upd = (i == (dimX - 1))?0:(u[l + k + padZ] - u[l + k]); +                float grady_upd = (j == (copY - 1))?0:(u[l + k + m] - u[l + k]); +                gradxdiff[k] = gradx[l + k] - gradx_upd; +                gradydiff[k] = grady[l + k] - grady_upd; +                gradx[l + k] = gradx_upd; +                grady[l + k] = grady_upd; +                 +                ubarx[k] = gradx_upd - theta * gradxdiff[k]; +                ubary[k] = grady_upd - theta * gradydiff[k]; + +                float vx = ubarx[k] + divsigma * qx[l + k]; +                float vy = ubary[k] + divsigma * qy[l + k]; + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +//#pragma unroll 64 +            for(k = 0; k < dimZ; k++) { +                float vx = ubarx[k] + divsigma * qx[l + k]; +                float vy = ubary[k] + divsigma * qy[l + k]; +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qxdiff = sigma * (ubarx[k] - gx_upd); +                qydiff = sigma * (ubary[k] - gy_upd); +                 +                qx[l + k] += qxdiff; +                qy[l + k] += qydiff; + +                float udiff = ctx->udiff[i * padZ + k]; +                ctx->udiff[i * padZ + k] = udiff_next[k]; +                unorm += (udiff * udiff); +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                float div_upd = 0; +                div_upd -= (i > 0)?qx[l + k - padZ]:0; +                div_upd -= (j > 0)?qy[l + k - m]:0; +                div_upd += (i < (dimX-1))?qx[l + k]:0; +                div_upd += (j < (copY-1))?qy[l + k]:0; +                divdiff = div[l + k] - div_upd;   +                div[l + k] = div_upd; + +                resprimal +=  ((offY == 0)||(j > 0))?fabs(divtau * udiff + divdiff):0;  +                resdual += fabs(divsigma * qxdiff + gradxdiff[k]); +                resdual += fabs(divsigma * qydiff + gradydiff[k]); +                product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); +            } +             +        } // i +    } // j + + +    ctx->resprimal = resprimal; +    ctx->resdual = resdual; +    ctx->product = product; +    ctx->unorm = unorm; +    ctx->qnorm = qnorm; + +    return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { +    int i, off, size, err; + +    if (sched) return; + +    tnv_ctx.dimX = dimX; +    tnv_ctx.dimY = dimY; +    tnv_ctx.dimZ = dimZ; +        // Padding seems actually slower +//    tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); +    tnv_ctx.padZ = dimZ; +     +    hw_sched_init(); + +    int threads = hw_sched_get_cpu_count(); +    if (threads > dimY) threads = dimY/2; + +    int step = dimY / threads; +    int extra = dimY % threads; + +    tnv_ctx.threads = threads; +    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); +    for (i = 0, off = 0; i < threads; i++, off += size) { +        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; +        size = step + ((i < extra)?1:0); + +        ctx->offY = off; +        ctx->stepY = size; + +        if (i == (threads-1)) ctx->copY = ctx->stepY; +        else ctx->copY = ctx->stepY + 1; +    } + +    sched = hw_sched_create(threads); +    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + +  /*   * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]   * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see @@ -32,50 +499,25 @@   * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter]   *   * Output: - * 1. Filtered/regularized image + * 1. Filtered/regularized image (u)   *   * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.   */ -float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)  { -    long k, p, q, r, DimTotal; -    float taulambda; -    float *u_upd, *gx, *gy, *gx_upd, *gy_upd, *qx, *qy, *qx_upd, *qy_upd, *v, *vx, *vy, *gradx, *grady, *gradx_upd, *grady_upd, *gradx_ubar, *grady_ubar, *div, *div_upd; -     -    p = 1l; -    q = 1l; -    r = 0l; -     +    int err; +    int iter; +    int i,j,k,l,m; +      lambda = 1.0f/(2.0f*lambda); -    DimTotal = (long)(dimX*dimY*dimZ); -    /* PDHG algorithm parameters*/ +    tnv_ctx.lambda = lambda; + +    // PDHG algorithm parameters      float tau = 0.5f;      float sigma = 0.5f;      float theta = 1.0f; -     -    // Auxiliar vectors -    u_upd = calloc(DimTotal, sizeof(float)); -    gx = calloc(DimTotal, sizeof(float)); -    gy = calloc(DimTotal, sizeof(float)); -    gx_upd = calloc(DimTotal, sizeof(float)); -    gy_upd = calloc(DimTotal, sizeof(float)); -    qx = calloc(DimTotal, sizeof(float)); -    qy = calloc(DimTotal, sizeof(float)); -    qx_upd = calloc(DimTotal, sizeof(float)); -    qy_upd = calloc(DimTotal, sizeof(float)); -    v = calloc(DimTotal, sizeof(float)); -    vx = calloc(DimTotal, sizeof(float)); -    vy = calloc(DimTotal, sizeof(float)); -    gradx = calloc(DimTotal, sizeof(float)); -    grady = calloc(DimTotal, sizeof(float)); -    gradx_upd = calloc(DimTotal, sizeof(float)); -    grady_upd = calloc(DimTotal, sizeof(float)); -    gradx_ubar = calloc(DimTotal, sizeof(float)); -    grady_ubar = calloc(DimTotal, sizeof(float)); -    div = calloc(DimTotal, sizeof(float)); -    div_upd = calloc(DimTotal, sizeof(float)); -     +      // Backtracking parameters      float s = 1.0f;      float gamma = 0.75f; @@ -84,369 +526,117 @@ float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol,      float alpha = alpha0;      float delta = 1.5f;      float eta = 0.95f; + +    TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + +    tnv_ctx.InputT = InputT; +    tnv_ctx.uT = uT; -    // PDHG algorithm parameters -    taulambda = tau * lambda; -    float divtau = 1.0f / tau; -    float divsigma = 1.0f / sigma; -    float theta1 = 1.0f + theta; -     -    /*allocate memory for  taulambda */ -    //taulambda = (float*) calloc(dimZ, sizeof(float)); -    //for(k=0; k < dimZ; k++)  {taulambda[k] = tau*lambda[k];} -     +    int padZ = tnv_ctx.padZ; + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + +      // Apply Primal-Dual Hybrid Gradient scheme -    int iter = 0;      float residual = fLarge; -    float ubarx, ubary; -     +    int started = 0;      for(iter = 0; iter < maxIter; iter++)   { -        // Argument of proximal mapping of fidelity term -#pragma omp parallel for shared(v, u) private(k) -        for(k=0; k<dimX*dimY*dimZ; k++)  {v[k] = u[k] + tau*div[k];} - -// Proximal solution of fidelity term -proxG(u_upd, v, Input, taulambda, (long)(dimX), (long)(dimY), (long)(dimZ)); - -// Gradient of updated primal variable -gradient(u_upd, gradx_upd, grady_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - -// Argument of proximal mapping of regularization term -#pragma omp parallel for shared(gradx_upd, grady_upd, gradx, grady) private(k, ubarx, ubary) -for(k=0; k<dimX*dimY*dimZ; k++) { -    ubarx = theta1 * gradx_upd[k] - theta * gradx[k]; -    ubary = theta1 * grady_upd[k] - theta * grady[k]; -    vx[k] = ubarx + divsigma * qx[k]; -    vy[k] = ubary + divsigma * qy[k]; -    gradx_ubar[k] = ubarx; -    grady_ubar[k] = ubary; -} +        float resprimal = 0.0f; +        float resdual = 0.0f; +        float product = 0.0f; +        float unorm = 0.0f; +        float qnorm = 0.0f; -proxF(gx_upd, gy_upd, vx, vy, sigma, p, q, r, (long)(dimX), (long)(dimY), (long)(dimZ)); +        float divtau = 1.0f / tau; -// Update dual variable -#pragma omp parallel for shared(qx_upd, qy_upd) private(k) -for(k=0; k<dimX*dimY*dimZ; k++) { -    qx_upd[k] = qx[k] + sigma * (gradx_ubar[k] - gx_upd[k]); -    qy_upd[k] = qy[k] + sigma * (grady_ubar[k] - gy_upd[k]); -} +        tnv_ctx.sigma = sigma; +        tnv_ctx.tau = tau; +        tnv_ctx.theta = theta; -// Divergence of updated dual variable -#pragma omp parallel for shared(div_upd) private(k) -for(k=0; k<dimX*dimY*dimZ; k++)  {div_upd[k] = 0.0f;} -divergence(qx_upd, qy_upd, div_upd, dimX, dimY, dimZ); - -// Compute primal residual, dual residual, and backtracking condition -float resprimal = 0.0f; -float resdual = 0.0f; -float product = 0.0f; -float unorm = 0.0f; -float qnorm = 0.0f; - -for(k=0; k<dimX*dimY*dimZ; k++) { -    float udiff = u[k] - u_upd[k]; -    float qxdiff = qx[k] - qx_upd[k]; -    float qydiff = qy[k] - qy_upd[k]; -    float divdiff = div[k] - div_upd[k]; -    float gradxdiff = gradx[k] - gradx_upd[k]; -    float gradydiff = grady[k] - grady_upd[k]; -     -    resprimal += fabs(divtau*udiff + divdiff); -    resdual += fabs(divsigma*qxdiff - gradxdiff); -    resdual += fabs(divsigma*qydiff - gradydiff); -     -    unorm += (udiff * udiff); -    qnorm += (qxdiff * qxdiff + qydiff * qydiff); -    product += (gradxdiff * qxdiff + gradydiff * qydiff); -} +        err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); +        if (!err) err = hw_sched_wait_task(sched); +        if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } -float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + -        gamma * tau * qnorm); +            // border regions +        for (j = 1; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; -// Adapt step-size parameters -float dual_dot_delta = resdual * s * delta; -float dual_div_delta = (resdual * s) / delta; +            m = (ctx0->stepY - 1) * dimX * padZ; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    float divdiff = ctx->div0[l] - ctx->div[l]; +                    float udiff = ctx->udiff0[l]; -if(b > 1) -{ -    // Decrease step-sizes to fit balancing principle -    tau = (beta * tau) / b; -    sigma = (beta * sigma) / b; -    alpha = alpha0; -     -    copyIm(u, u_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); -    copyIm(gx, gx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); -    copyIm(gy, gy_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); -    copyIm(qx, qx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); -    copyIm(qy, qy_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); -    copyIm(gradx, gradx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); -    copyIm(grady, grady_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); -    copyIm(div, div_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); -     -} else if(resprimal > dual_dot_delta) -{ -    // Increase primal step-size and decrease dual step-size -    tau = tau / (1.0f - alpha); -    sigma = sigma * (1.0f - alpha); -    alpha = alpha * eta; -     -} else if(resprimal < dual_div_delta) -{ -    // Decrease primal step-size and increase dual step-size -    tau = tau * (1.0f - alpha); -    sigma = sigma / (1.0f - alpha); -    alpha = alpha * eta; -} +                    ctx->div[l] -= ctx0->qy[l + m]; +                    ctx0->div[m + l + dimX*padZ] = ctx->div[l]; +                     +                    divdiff += ctx0->qy[l + m]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } -// Update variables -taulambda = tau * lambda; -//for(k=0; k < dimZ; k++) taulambda[k] = tau*lambda[k]; +        for (j = 0; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; +            resprimal += ctx->resprimal; +            resdual += ctx->resdual; +            product += ctx->product; +            unorm += ctx->unorm; +            qnorm += ctx->qnorm; +        }  -divsigma = 1.0f / sigma; -divtau = 1.0f / tau; +        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); +        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); +        float dual_dot_delta = resdual * s * delta; +        float dual_div_delta = (resdual * s) / delta; +        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); -copyIm(u_upd, u, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(gx_upd, gx, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(gy_upd, gy, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(qx_upd, qx, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(qy_upd, qy, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(gradx_upd, gradx, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(grady_upd, grady, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(div_upd, div, (long)(dimX), (long)(dimY), (long)(dimZ)); -// Compute residual at current iteration -residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); +        if(b > 1) { +             +            // Decrease step-sizes to fit balancing principle +            tau = (beta * tau) / b; +            sigma = (beta * sigma) / b; +            alpha = alpha0; -//       printf("%f \n", residual); -if (residual < tol) { -    printf("Iterations stopped at %i with the residual %f \n", iter, residual); -    break; } +            if (started) { +                fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n"); +            } else { +                err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); +                if (!err) err = hw_sched_wait_task(sched); +                if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } +            } +        } else { +            started = 1; +            if(resprimal > dual_dot_delta) { +                    // Increase primal step-size and decrease dual step-size +                tau = tau / (1.0f - alpha); +                sigma = sigma * (1.0f - alpha); +                alpha = alpha * eta; +            } else if(resprimal < dual_div_delta) { +                    // Decrease primal step-size and increase dual step-size +                tau = tau * (1.0f - alpha); +                sigma = sigma / (1.0f - alpha); +                alpha = alpha * eta; +            } +        } +        if (residual < tol) break;      } -    printf("Iterations stopped at %i with the residual %f \n", iter, residual); -    free (u_upd); free(gx); free(gy); free(gx_upd); free(gy_upd); -    free(qx); free(qy); free(qx_upd); free(qy_upd); free(v); free(vx); free(vy); -    free(gradx); free(grady); free(gradx_upd); free(grady_upd); free(gradx_ubar); -    free(grady_ubar); free(div); free(div_upd);     -    return *u; -} -float proxG(float *u_upd, float *v, float *f, float taulambda, long dimX, long dimY, long dimZ) -{ -    float constant; -    long k; -    constant = 1.0f + taulambda; -#pragma omp parallel for shared(v, f, u_upd) private(k) -    for(k=0; k<dimZ*dimX*dimY; k++) { -        u_upd[k] = (v[k] + taulambda * f[k])/constant; -        //u_upd[(dimX*dimY)*k + l] = (v[(dimX*dimY)*k + l] + taulambda * f[(dimX*dimY)*k + l])/constant; -    } -    return *u_upd; -} +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } -float gradient(float *u_upd, float *gradx_upd, float *grady_upd, long dimX, long dimY, long dimZ) -{ -    long i, j, k, l; -    // Compute discrete gradient using forward differences -#pragma omp parallel for shared(gradx_upd,grady_upd,u_upd) private(i, j, k, l) -    for(k = 0; k < dimZ; k++)   { -        for(j = 0; j < dimY; j++)   { -            l = j * dimX;            -            for(i = 0; i < dimX; i++)   { -                // Derivatives in the x-direction -                if(i != dimX-1) -                    gradx_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+1+l] - u_upd[(dimX*dimY)*k + i+l]; -                else -                    gradx_upd[(dimX*dimY)*k + i+l] = 0.0f; -                 -                // Derivatives in the y-direction -                if(j != dimY-1) -                    //grady_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+dimY+l] -u_upd[(dimX*dimY)*k + i+l]; -                    grady_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+(j+1)*dimX] -u_upd[(dimX*dimY)*k + i+l]; -                else -                    grady_upd[(dimX*dimY)*k + i+l] = 0.0f; -            }}} -    return 1; -} -float proxF(float *gx, float *gy, float *vx, float *vy, float sigma, int p, int q, int r, long dimX, long dimY, long dimZ) -{ -    // (S^p, \ell^1) norm decouples at each pixel -//   Spl1(gx, gy, vx, vy, sigma, p, num_channels, dim); -    float divsigma = 1.0f / sigma; -     -    // $\ell^{1,1,1}$-TV regularization -//       int i,j,k; -//     #pragma omp parallel for shared (gx,gy,vx,vy) private(i,j,k) -//      for(k = 0; k < dimZ; k++)  { -//         for(i=0; i<dimX; i++) { -//              for(j=0; j<dimY; j++) { -//                 gx[(dimX*dimY)*k + (i)*dimY + (j)] = SIGN(vx[(dimX*dimY)*k + (i)*dimY + (j)]) * MAX(fabs(vx[(dimX*dimY)*k + (i)*dimY + (j)]) - divsigma,  0.0f); -//                 gy[(dimX*dimY)*k + (i)*dimY + (j)] = SIGN(vy[(dimX*dimY)*k + (i)*dimY + (j)]) * MAX(fabs(vy[(dimX*dimY)*k + (i)*dimY + (j)]) - divsigma,  0.0f); -//             }}} -     -    // Auxiliar vector -    float *proj, sum, shrinkfactor ; -    float M1,M2,M3,valuex,valuey,T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd,t1,t2,t3; -    long i,j,k, ii, num; -#pragma omp parallel for shared (gx,gy,vx,vy,p) private(i,ii,j,k,proj,num, sum, shrinkfactor, M1,M2,M3,valuex,valuey,T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4,v0,v1,v2,mu1,mu2,sig1_upd,sig2_upd,t1,t2,t3) -    for(i=0; i<dimX; i++) { -        for(j=0; j<dimY; j++) { -             -            proj = (float*) calloc (2,sizeof(float)); -            // Compute matrix $M\in\R^{2\times 2}$ -            M1 = 0.0f; -            M2 = 0.0f; -            M3 = 0.0f; -             -            for(k = 0; k < dimZ; k++) -            { -                valuex = vx[(dimX*dimY)*k + (j)*dimX + (i)]; -                valuey = vy[(dimX*dimY)*k + (j)*dimX + (i)]; -                 -                M1 += (valuex * valuex); -                M2 += (valuex * valuey); -                M3 += (valuey * valuey); -            } -             -            // Compute eigenvalues of M -            T = M1 + M3; -            D = M1 * M3 - M2 * M2; -            det = sqrt(MAX((T * T / 4.0f) - D, 0.0f)); -            eig1 = MAX((T / 2.0f) + det, 0.0f); -            eig2 = MAX((T / 2.0f) - det, 0.0f); -            sig1 = sqrt(eig1); -            sig2 = sqrt(eig2); -             -            // Compute normalized eigenvectors -            V1 = V2 = V3 = V4 = 0.0f; -             -            if(M2 != 0.0f) -            { -                v0 = M2; -                v1 = eig1 - M3; -                v2 = eig2 - M3; -                 -                mu1 = sqrtf(v0 * v0 + v1 * v1); -                mu2 = sqrtf(v0 * v0 + v2 * v2); -                 -                if(mu1 > fTiny) -                { -                    V1 = v1 / mu1; -                    V3 = v0 / mu1; -                } -                 -                if(mu2 > fTiny) -                { -                    V2 = v2 / mu2; -                    V4 = v0 / mu2; -                } -                 -            } else -            { -                if(M1 > M3) -                { -                    V1 = V4 = 1.0f; -                    V2 = V3 = 0.0f; -                     -                } else -                { -                    V1 = V4 = 0.0f; -                    V2 = V3 = 1.0f; -                } -            } -             -            // Compute prox_p of the diagonal entries -            sig1_upd = sig2_upd = 0.0f; -             -            if(p == 1) -            { -                sig1_upd = MAX(sig1 - divsigma, 0.0f); -                sig2_upd = MAX(sig2 - divsigma, 0.0f); -                 -            } else if(p == INFNORM) -            { -                proj[0] = sigma * fabs(sig1); -                proj[1] = sigma * fabs(sig2); -                 -                /*l1 projection part */ -                sum = fLarge; -                num = 0l; -                shrinkfactor = 0.0f; -                while(sum > 1.0f) -                { -                    sum = 0.0f; -                    num = 0; -                     -                    for(ii = 0; ii < 2; ii++) -                    { -                        proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); -                         -                        sum += fabs(proj[ii]); -                        if(proj[ii]!= 0.0f) -                            num++; -                    } -                     -                    if(num > 0) -                        shrinkfactor = (sum - 1.0f) / num; -                    else -                        break; -                } -                /*l1 proj ends*/ -                 -                sig1_upd = sig1 - divsigma * proj[0]; -                sig2_upd = sig2 - divsigma * proj[1]; -            } -             -            // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ -            if(sig1 > fTiny) -                sig1_upd /= sig1; -             -            if(sig2 > fTiny) -                sig2_upd /= sig2; -             -            // Compute solution -            t1 = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; -            t2 = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; -            t3 = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; -             -            for(k = 0; k < dimZ; k++) -            { -                gx[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t1 + vy[(dimX*dimY)*k + j*dimX + i] * t2; -                gy[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t2 + vy[(dimX*dimY)*k + j*dimX + i] * t3; -            }            -             -            // Delete allocated memory -            free(proj); -        }} -     -    return 1; -} +    printf("Iterations stopped at %i with the residual %f \n", iter, residual); +    printf("Return: %f\n", *uT); -float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ) -{ -    long i, j, k, l; -#pragma omp parallel for shared(qx_upd,qy_upd,div_upd) private(i, j, k, l) -    for(k = 0; k < dimZ; k++)   { -        for(j = 0; j < dimY; j++)   { -            l = j * dimX;             -            for(i = 0; i < dimX; i++)   { -                if(i != dimX-1) -                { -                    // ux[k][i+l] = u[k][i+1+l] - u[k][i+l] -                    div_upd[(dimX*dimY)*k + i+1+l] -= qx_upd[(dimX*dimY)*k + i+l]; -                    div_upd[(dimX*dimY)*k + i+l] += qx_upd[(dimX*dimY)*k + i+l]; -                } -                 -                if(j != dimY-1) -                { -                    // uy[k][i+l] = u[k][i+width+l] - u[k][i+l] -                    //div_upd[(dimX*dimY)*k + i+dimY+l] -= qy_upd[(dimX*dimY)*k + i+l]; -                    div_upd[(dimX*dimY)*k + i+(j+1)*dimX] -= qy_upd[(dimX*dimY)*k + i+l];                     -                    div_upd[(dimX*dimY)*k + i+l] += qy_upd[(dimX*dimY)*k + i+l]; -                } -            } -        } -    } -    return *div_upd; +//    exit(-1); +    return *uT;  } diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c new file mode 100755 index 0000000..d771db5 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c @@ -0,0 +1,695 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Copyriht 2020 Suren A. Chlingaryan + * Optimized version with 1/2 of memory consumption and ~4x performance + * This version is algorithmicly comptable with the original, but slight change in results  + * is expected due to different order of floating-point operations. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { +    int ii, num; +    float divsigma = 1.0f / sigma; +    float sum, shrinkfactor; +    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; +    float proj[2] = {0}; + +    // Compute eigenvalues of M +    T = M1 + M3; +    D = M1 * M3 - M2 * M2; +    det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); +    eig1 = MAX((T / 2.0f) + det, 0.0f); +    eig2 = MAX((T / 2.0f) - det, 0.0f); +    sig1 = sqrtf(eig1); +    sig2 = sqrtf(eig2); + +    // Compute normalized eigenvectors +    V1 = V2 = V3 = V4 = 0.0f; + +    if(M2 != 0.0f) +    { +        v0 = M2; +        v1 = eig1 - M3; +        v2 = eig2 - M3; + +        mu1 = sqrtf(v0 * v0 + v1 * v1); +        mu2 = sqrtf(v0 * v0 + v2 * v2); + +        if(mu1 > fTiny) +        { +            V1 = v1 / mu1; +            V3 = v0 / mu1; +        } + +        if(mu2 > fTiny) +        { +            V2 = v2 / mu2; +            V4 = v0 / mu2; +        } + +    } else +    { +        if(M1 > M3) +        { +            V1 = V4 = 1.0f; +            V2 = V3 = 0.0f; + +        } else +        { +            V1 = V4 = 0.0f; +            V2 = V3 = 1.0f; +        } +    } + +    // Compute prox_p of the diagonal entries +    sig1_upd = sig2_upd = 0.0f; + +    if(p == 1) +    { +        sig1_upd = MAX(sig1 - divsigma, 0.0f); +        sig2_upd = MAX(sig2 - divsigma, 0.0f); + +    } else if(p == INFNORM) +    { +        proj[0] = sigma * fabs(sig1); +        proj[1] = sigma * fabs(sig2); + +        /*l1 projection part */ +        sum = fLarge; +        num = 0l; +        shrinkfactor = 0.0f; +        while(sum > 1.0f) +        { +            sum = 0.0f; +            num = 0; + +            for(ii = 0; ii < 2; ii++) +            { +                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                sum += fabs(proj[ii]); +                if(proj[ii]!= 0.0f) +                    num++; +            } + +            if(num > 0) +                shrinkfactor = (sum - 1.0f) / num; +            else +                break; +        } +        /*l1 proj ends*/ + +        sig1_upd = sig1 - divsigma * proj[0]; +        sig2_upd = sig2 - divsigma * proj[1]; +    } + +    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +    if(sig1 > fTiny) +        sig1_upd /= sig1; + +    if(sig2 > fTiny) +        sig2_upd /= sig2; + +    // Compute solution +    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { +    int offY, stepY, copY; +    float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; +    float *div, *div_upd; +    float resprimal, resdual; +    float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { +    int threads; +    tnv_thread_t *thr_ctx; +    float *InputT, *uT; +    int dimX, dimY, dimZ, padZ; +    float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    free(ctx->Input); +    free(ctx->u); +    free(ctx->u_upd); +    free(ctx->qx); +    free(ctx->qy); +    free(ctx->qx_upd); +    free(ctx->qy_upd); +    free(ctx->gradx); +    free(ctx->grady); +    free(ctx->gradx_upd); +    free(ctx->grady_upd); +    free(ctx->div); +    free(ctx->div_upd); +     +    return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +     +//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*(stepY+1)*padZ); + +    // Auxiliar vectors +    ctx->Input = malloc(Dim1Total * sizeof(float)); +    ctx->u = malloc(Dim1Total * sizeof(float)); +    ctx->u_upd = malloc(Dim1Total * sizeof(float)); +    ctx->qx = malloc(DimTotal * sizeof(float)); +    ctx->qy = malloc(DimTotal * sizeof(float)); +    ctx->qx_upd = malloc(DimTotal * sizeof(float)); +    ctx->qy_upd = malloc(DimTotal * sizeof(float)); +    ctx->gradx = malloc(DimTotal * sizeof(float)); +    ctx->grady = malloc(DimTotal * sizeof(float)); +    ctx->gradx_upd = malloc(DimTotal * sizeof(float)); +    ctx->grady_upd = malloc(DimTotal * sizeof(float)); +    ctx->div = malloc(Dim1Total * sizeof(float)); +    ctx->div_upd = malloc(Dim1Total * sizeof(float)); + +    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) { +        fprintf(stderr, "Error allocating memory\n"); +        exit(-1); +    } + +    return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +     +//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(float)); +     +    for(k=0; k<dimZ; k++) { +        for(j=0; j<copY; j++) { +            for(i=0; i<dimX; i++) { +                ctx->Input[j * dimX * padZ + i * padZ + k] =  tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; +                ctx->u[j * dimX * padZ + i * padZ + k] =  tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; +            } +        } +    } + +    return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<stepY; j++) { +            for(i=0; i<dimX; i++) { +                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; +            } +        } +    } +     +    return 0; +} + + +static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    // Auxiliar vectors +    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float)); +    memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float)); +    memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); +    memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); +    memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); +    memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float)); + +    return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    // Auxiliar vectors +    memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); +    memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); +    memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); +    memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); +    memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); +    memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + +    return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { +    long i, j, k, l, m; + +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    float *Input = ctx->Input; +    float *u = ctx->u; +    float *u_upd = ctx->u_upd; +    float *qx = ctx->qx; +    float *qy = ctx->qy; +    float *qx_upd = ctx->qx_upd; +    float *qy_upd = ctx->qy_upd; +    float *gradx = ctx->gradx; +    float *grady = ctx->grady; +    float *gradx_upd = ctx->gradx_upd; +    float *grady_upd = ctx->grady_upd; +    float *div = ctx->div; +    float *div_upd = ctx->div_upd; + +    long p = 1l; +    long q = 1l; +    long r = 0l; + +    float lambda = tnv_ctx->lambda; +    float sigma = tnv_ctx->sigma; +    float tau = tnv_ctx->tau; +    float theta = tnv_ctx->theta; +     +    float taulambda = tau * lambda; +    float divtau = 1.0f / tau; +    float divsigma = 1.0f / sigma; +    float theta1 = 1.0f + theta; +    float constant = 1.0f + taulambda; + +    float resprimal = 0.0f; +    float resdual = 0.0f; +    float product = 0.0f; +    float unorm = 0.0f; +    float qnorm = 0.0f; + +    float udiff[dimZ]; +    float qxdiff; +    float qydiff; +    float divdiff; +    float gradxdiff[dimZ]; +    float gradydiff[dimZ]; + +    for(i=0; i < dimX; i++) { +        for(k = 0; k < dimZ; k++) {  +            int l = i * padZ + k; +            u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant; +            div_upd[l] = 0; +        } +    } + +    for(j = 0; j < stepY; j++) { +/*        m = j * dimX * dimZ + (dimX - 1) * dimZ; +        for(k = 0; k < dimZ; k++) {  +            u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant;  +        }*/ +         +        for(i = 0; i < dimX/* - 1*/; i++) { +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; +            l = (j * dimX  + i) * padZ; +            m = dimX * padZ; +         +//#pragma unroll 64 +            for(k = 0; k < dimZ; k++) { +                u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + +                gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]); +                grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]);   // We need div from the next thread on last iter + +                udiff[k] = u[l + k] - u_upd[l + k]; +                unorm += (udiff[k] * udiff[k]); +//                if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm); + +                gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; +                gradydiff[k] = grady[l + k] - grady_upd[l + k]; + +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE                 +#ifdef TNV_NEW_STYLE                 +                qx_upd[l + k] = qx[l + k] + sigma * ubarx; +                qy_upd[l + k] = qy[l + k] + sigma * ubary; + +                float vx = divsigma * qx_upd[l + k]; //+ ubarx +                float vy = divsigma * qy_upd[l + k]; //+ ubary +#else +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; +#endif + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +//#pragma unroll 64 +            for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE     +                float vx = divsigma * qx_upd[l + k]; +                float vy = divsigma * qy_upd[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] -= sigma * gx_upd; +                qy_upd[l + k] -= sigma * gy_upd; +#else +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); +                qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +if(i != (dimX-1)) { +                div_upd[l + k] += qx_upd[l + k]; +                div_upd[l + k + padZ] -= qx_upd[l + k]; +} +if(j != (copY-1)) { +                div_upd[l + k] += qy_upd[l + k]; +                div_upd[l + k + m] = -qy_upd[l + k];  // We need to update div in the next thread on last iter +} + +                qxdiff = qx[l + k] - qx_upd[l + k]; +                qydiff = qy[l + k] - qy_upd[l + k]; +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                resdual += fabs(divsigma * qxdiff - gradxdiff[k]); +                resdual += fabs(divsigma * qydiff - gradydiff[k]); +                product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + +                if ((offY == 0)||(j > 0)) { +                    divdiff = div[l + k] - div_upd[l + k];  // Multiple steps... How we compute without history? +                    resprimal += fabs(divtau * udiff[k] + divdiff);  +                } +            } +             +        } // i +    } +     +     +    ctx->resprimal = resprimal; +    ctx->resdual = resdual; +    ctx->product = product; +    ctx->unorm = unorm; +    ctx->qnorm = qnorm; + +    return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { +    int i, off, size, err; + +    if (sched) return; + +    tnv_ctx.dimX = dimX; +    tnv_ctx.dimY = dimY; +    tnv_ctx.dimZ = dimZ; +        // Padding seems actually slower +//    tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); +    tnv_ctx.padZ = dimZ; +     +    hw_sched_init(); + +    int threads = hw_sched_get_cpu_count(); +    if (threads > dimY) threads = dimY/2; + +    int step = dimY / threads; +    int extra = dimY % threads; + +    tnv_ctx.threads = threads; +    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); +    for (i = 0, off = 0; i < threads; i++, off += size) { +        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; +        size = step + ((i < extra)?1:0); + +        ctx->offY = off; +        ctx->stepY = size; + +        if (i == (threads-1)) ctx->copY = ctx->stepY; +        else ctx->copY = ctx->stepY + 1; +    } + +    sched = hw_sched_create(threads); +    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ +    int err; +    int iter; +    int i,j,k,l,m; + +    lambda = 1.0f/(2.0f*lambda); +    tnv_ctx.lambda = lambda; + +    // PDHG algorithm parameters +    float tau = 0.5f; +    float sigma = 0.5f; +    float theta = 1.0f; + +    // Backtracking parameters +    float s = 1.0f; +    float gamma = 0.75f; +    float beta = 0.95f; +    float alpha0 = 0.2f; +    float alpha = alpha0; +    float delta = 1.5f; +    float eta = 0.95f; + +    TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + +    tnv_ctx.InputT = InputT; +    tnv_ctx.uT = uT; +     +    int padZ = tnv_ctx.padZ; + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + +    // Apply Primal-Dual Hybrid Gradient scheme +    float residual = fLarge; +    for(iter = 0; iter < maxIter; iter++)   { +        float resprimal = 0.0f; +        float resdual = 0.0f; +        float product = 0.0f; +        float unorm = 0.0f; +        float qnorm = 0.0f; + +        float divtau = 1.0f / tau; + +        tnv_ctx.sigma = sigma; +        tnv_ctx.tau = tau; +        tnv_ctx.theta = theta; + +        err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); +        if (!err) err = hw_sched_wait_task(sched); +        if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + +            // border regions +        for (j = 1; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + +            m = ctx0->stepY * dimX * padZ; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    float div_upd_add = ctx0->div_upd[m + l]; +                    ctx->div_upd[l] += div_upd_add; +                    ctx0->div_upd[m + l] = ctx->div_upd[l]; +                    //ctx0->u_upd[m + l] = ctx->u_upd[l]; + +                    float divdiff = ctx->div[l] - ctx->div_upd[l];  // Multiple steps... How we compute without history? +                    float udiff = ctx->u[l] - ctx->u_upd[l]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } + +        for (j = 0; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; +            resprimal += ctx->resprimal; +            resdual += ctx->resdual; +            product += ctx->product; +            unorm += ctx->unorm; +            qnorm += ctx->qnorm; +        }  + +        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); +        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); +        float dual_dot_delta = resdual * s * delta; +        float dual_div_delta = (resdual * s) / delta; +        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + +        if(b > 1) { +            // Decrease step-sizes to fit balancing principle +            tau = (beta * tau) / b; +            sigma = (beta * sigma) / b; +            alpha = alpha0; + +            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); +            if (!err) err = hw_sched_wait_task(sched); +            if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } +        } else { +            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); +            if (!err) err = hw_sched_wait_task(sched); +            if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); } +         +            if(resprimal > dual_dot_delta) { +                    // Increase primal step-size and decrease dual step-size +                tau = tau / (1.0f - alpha); +                sigma = sigma * (1.0f - alpha); +                alpha = alpha * eta; +            } else if(resprimal < dual_div_delta) { +                    // Decrease primal step-size and increase dual step-size +                tau = tau * (1.0f - alpha); +                sigma = sigma / (1.0f - alpha); +                alpha = alpha * eta; +            } +        } + +        if (residual < tol) break; +    } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + +    printf("Iterations stopped at %i with the residual %f \n", iter, residual); +    printf("Return: %f\n", *uT); + +    return *uT; +} diff --git a/src/Core/regularisers_CPU/hw_sched.c b/src/Core/regularisers_CPU/hw_sched.c new file mode 100644 index 0000000..be895cc --- /dev/null +++ b/src/Core/regularisers_CPU/hw_sched.c @@ -0,0 +1,396 @@ +/* + * The PyHST program is Copyright (C) 2002-2011 of the + * European Synchrotron Radiation Facility (ESRF) and + * Karlsruhe Institute of Technology (KIT). + * + * PyHST is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + *  + * hst is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + *  + * You should have received a copy of the GNU General Public License along + * with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef HW_HAVE_SCHED_HEADERS +# include <sys/types.h> +# include <unistd.h> +# include <sched.h> +#endif /* HW_HAVE_SCHED_HEADERS */ + +#include "hw_sched.h" + + +#ifdef HW_USE_THREADS +# define MUTEX_INIT(ctx, name) \ +    if (!err) { \ +	ctx->name##_mutex = g_mutex_new(); \ +	if (!ctx->name##_mutex) err = 1; \ +    } +     +# define MUTEX_FREE(ctx, name) \ +    if (ctx->name##_mutex) g_mutex_free(ctx->name##_mutex); + +# define COND_INIT(ctx, name) \ +    MUTEX_INIT(ctx, name##_cond) \ +    if (!err) { \ +	ctx->name##_cond = g_cond_new(); \ +	if (!ctx->name##_cond) { \ +	    err = 1; \ +	    MUTEX_FREE(ctx, name##_cond) \ +	} \ +    } + +# define COND_FREE(ctx, name) \ +    if (ctx->name##_cond) g_cond_free(ctx->name##_cond); \ +    MUTEX_FREE(ctx, name##_cond) +#else /* HW_USE_THREADS */ +# define MUTEX_INIT(ctx, name) +# define MUTEX_FREE(ctx, name) +# define COND_INIT(ctx, name) +# define COND_FREE(ctx, name) +#endif /* HW_USE_THREADS */ + + +HWRunFunction ppu_run[] = { +    (HWRunFunction)NULL +}; + +static int hw_sched_initialized = 0; + +int hw_sched_init(void) { +    if (!hw_sched_initialized) { +#ifdef HW_USE_THREADS +	g_thread_init(NULL); +#endif /* HW_USE_THREADS */ +	hw_sched_initialized = 1; +    } + +    return 0; +} + + +int hw_sched_get_cpu_count(void) { +#ifdef HW_HAVE_SCHED_HEADERS +    int err; + +    int cpu_count; +    cpu_set_t mask; + +    err = sched_getaffinity(getpid(), sizeof(mask), &mask); +    if (err) return 1; + +# ifdef CPU_COUNT +    cpu_count = CPU_COUNT(&mask); +# else +    for (cpu_count = 0; cpu_count < CPU_SETSIZE; cpu_count++) { +	if (!CPU_ISSET(cpu_count, &mask)) break; +    } +# endif + +    if (!cpu_count) cpu_count = 1; +    return cpu_count;     +#else /* HW_HAVE_SCHED_HEADERS */ +    return 1; +#endif /* HW_HAVE_SCHED_HEADERS */ +} + + +HWSched hw_sched_create(int cpu_count) { +    int i; +    int err = 0; + +    HWSched ctx; + +    //hw_sched_init(); +     +    ctx = (HWSched)malloc(sizeof(HWSchedS)); +    if (!ctx) return NULL; + +    memset(ctx, 0, sizeof(HWSchedS)); + +    ctx->status = 1; + +    MUTEX_INIT(ctx, sync); +    MUTEX_INIT(ctx, data); +    COND_INIT(ctx, compl); +    COND_INIT(ctx, job); +     +    if (err) { +	hw_sched_destroy(ctx); +	return NULL; +    } +     +    if (!cpu_count) cpu_count = hw_sched_get_cpu_count(); +    if (cpu_count > HW_MAX_THREADS) cpu_count = HW_MAX_THREADS; + +    ctx->n_threads = 0; +    for (i = 0; i < cpu_count; i++) { +	ctx->thread[ctx->n_threads] = hw_thread_create(ctx, ctx->n_threads, NULL, ppu_run, NULL); +	if (ctx->thread[ctx->n_threads]) { +#ifndef HW_USE_THREADS +	    ctx->thread[ctx->n_threads]->status = HW_THREAD_STATUS_STARTING; +#endif /* HW_USE_THREADS */ +	    ++ctx->n_threads; +	} +    } +     +    if (!ctx->n_threads) { +	hw_sched_destroy(ctx); +	return NULL; +    } +     +    return ctx; +} + +static int hw_sched_wait_threads(HWSched ctx) { +#ifdef HW_USE_THREADS +    int i = 0; +     +    hw_sched_lock(ctx, compl_cond); +    while (i < ctx->n_threads) { +        for (; i < ctx->n_threads; i++) { +	    if (ctx->thread[i]->status == HW_THREAD_STATUS_INIT) { +		hw_sched_wait(ctx, compl); +		break; +	    } +	} +	 +    } +    hw_sched_unlock(ctx, compl_cond); +#endif /* HW_USE_THREADS */ +     +    ctx->started = 1; + +    return 0; +} + +void hw_sched_destroy(HWSched ctx) { +    int i; + +    if (ctx->n_threads > 0) { +	if (!ctx->started) { +	    hw_sched_wait_threads(ctx); +	} + +	ctx->status = 0; +	hw_sched_lock(ctx, job_cond); +	hw_sched_broadcast(ctx, job); +	hw_sched_unlock(ctx, job_cond); +     +	for (i = 0; i < ctx->n_threads; i++) { +	    hw_thread_destroy(ctx->thread[i]); +	} +    } + +    COND_FREE(ctx, job); +    COND_FREE(ctx, compl); +    MUTEX_FREE(ctx, data); +    MUTEX_FREE(ctx, sync); + +    free(ctx); +} + +int hw_sched_set_sequential_mode(HWSched ctx, int *n_blocks, int *cur_block, HWSchedFlags flags) { +    ctx->mode = HW_SCHED_MODE_SEQUENTIAL; +    ctx->n_blocks = n_blocks; +    ctx->cur_block = cur_block; +    ctx->flags = flags; +     +    return 0; +} + +int hw_sched_get_chunk(HWSched ctx, int thread_id) { +    int block; + +    switch (ctx->mode) { +	case HW_SCHED_MODE_PREALLOCATED: +	    if (ctx->thread[thread_id]->status == HW_THREAD_STATUS_STARTING) { +#ifndef HW_USE_THREADS +	        ctx->thread[thread_id]->status = HW_THREAD_STATUS_DONE; +#endif /* HW_USE_THREADS */ +                return thread_id; +	    } else { +		return HW_SCHED_CHUNK_INVALID; +	    } +	case HW_SCHED_MODE_SEQUENTIAL: +	    if ((ctx->flags&HW_SCHED_FLAG_INIT_CALL)&&(ctx->thread[thread_id]->status == HW_THREAD_STATUS_STARTING)) { +	        return HW_SCHED_CHUNK_INIT; +	    } +	    hw_sched_lock(ctx, data); +	    block = *ctx->cur_block; +	    if (block < *ctx->n_blocks) { +		*ctx->cur_block = *ctx->cur_block + 1; +	    } else { +		block = HW_SCHED_CHUNK_INVALID; +	    } +	    hw_sched_unlock(ctx, data); +	    if (block == HW_SCHED_CHUNK_INVALID) { +	        if (((ctx->flags&HW_SCHED_FLAG_FREE_CALL)&&(ctx->thread[thread_id]->status == HW_THREAD_STATUS_RUNNING))) { +	            ctx->thread[thread_id]->status = HW_THREAD_STATUS_FINISHING; +	            return HW_SCHED_CHUNK_FREE; +	        } +	        if ((ctx->flags&HW_SCHED_FLAG_TERMINATOR_CALL)&&((ctx->thread[thread_id]->status == HW_THREAD_STATUS_RUNNING)||(ctx->thread[thread_id]->status == HW_THREAD_STATUS_FINISHING))) { +	            int i; +	            hw_sched_lock(ctx, data); +	            for (i = 0; i < ctx->n_threads; i++) { +	                if (thread_id == i) continue; +	                if ((ctx->thread[i]->status != HW_THREAD_STATUS_DONE)&&(ctx->thread[i]->status != HW_THREAD_STATUS_FINISHING2)&&(ctx->thread[i]->status != HW_THREAD_STATUS_IDLE)) { +	            	    break; +	            	} +	            } +	            ctx->thread[thread_id]->status  = HW_THREAD_STATUS_FINISHING2; +	            hw_sched_unlock(ctx, data); +	            if (i == ctx->n_threads) { +	                return HW_SCHED_CHUNK_TERMINATOR; +	            }  +	        } +	    } +	    return block; +	default: +	    return HW_SCHED_CHUNK_INVALID; +    } + +    return -1; +} + +     +int hw_sched_schedule_task(HWSched ctx, void *appctx, HWEntry entry) { +#ifdef HW_USE_THREADS +    if (!ctx->started) { +	hw_sched_wait_threads(ctx); +    } +#else /* HW_USE_THREADS */ +    int err; +    int i, chunk_id, n_threads; +    HWRunFunction run; +    HWThread thrctx; +#endif /* HW_USE_THREADS */ +     +    ctx->ctx = appctx; +    ctx->entry = entry; + +    switch (ctx->mode) { +	case HW_SCHED_MODE_SEQUENTIAL: +	    *ctx->cur_block = 0; +	break; +	default: +	    ; +    } +         +#ifdef HW_USE_THREADS +    hw_sched_lock(ctx, compl_cond); + +    hw_sched_lock(ctx, job_cond); +    hw_sched_broadcast(ctx, job); +    hw_sched_unlock(ctx, job_cond); +#else  /* HW_USE_THREADS */ +    n_threads = ctx->n_threads; +     +    for (i = 0; i < n_threads; i++) { +	thrctx = ctx->thread[i]; +	thrctx->err = 0; +    } + +    i = 0; +    thrctx = ctx->thread[i]; +    chunk_id = hw_sched_get_chunk(ctx, thrctx->thread_id); + +    while (chunk_id >= 0) { +	run = hw_run_entry(thrctx->runs, entry); +        err = run(thrctx, thrctx->hwctx, chunk_id, appctx); +	if (err) { +	    thrctx->err = err; +	    break; +	} +	 +	if ((++i) == n_threads) i = 0; +	thrctx = ctx->thread[i]; +        chunk_id = hw_sched_get_chunk(ctx, thrctx->thread_id); +    } +#endif /* HW_USE_THREADS */ + +    return 0; +} + +int hw_sched_wait_task(HWSched ctx) { +    int err = 0; +    int i = 0, n_threads = ctx->n_threads; + +#ifdef HW_USE_THREADS +    while (i < ctx->n_threads) { +        for (; i < ctx->n_threads; i++) { +	    if (ctx->thread[i]->status == HW_THREAD_STATUS_DONE) { +		ctx->thread[i]->status = HW_THREAD_STATUS_IDLE; +	    } else { +		hw_sched_wait(ctx, compl); +		break; +	    } +	} +	 +    } + +    hw_sched_unlock(ctx, compl_cond); +#endif /* HW_USE_THREADS */ + +    for (i = 0; i < n_threads; i++) { +	HWThread thrctx = ctx->thread[i]; +	if (thrctx->err) return err = thrctx->err; + +#ifndef HW_USE_THREADS +        thrctx->status = HW_THREAD_STATUS_IDLE; +#endif /* HW_USE_THREADS */ +    } + +    return err; +} + +int hw_sched_execute_task(HWSched ctx, void *appctx, HWEntry entry) { +    int err; +     +    err = hw_sched_schedule_task(ctx, appctx, entry); +    if (err) return err; +     +    return hw_sched_wait_task(ctx); +} + +int hw_sched_schedule_thread_task(HWSched ctx, void *appctx, HWEntry entry) { +    int err; +     +    ctx->saved_mode = ctx->mode; +    ctx->mode = HW_SCHED_MODE_PREALLOCATED; +    err = hw_sched_schedule_task(ctx, appctx, entry); +     +    return err; +} + + +int hw_sched_wait_thread_task(HWSched ctx) { +    int err; + +    err = hw_sched_wait_task(ctx); +    ctx->mode = ctx->saved_mode; + +    return err; +} + +int hw_sched_execute_thread_task(HWSched ctx, void *appctx, HWEntry entry) { +    int err; +    int saved_mode = ctx->mode; + +    ctx->mode = HW_SCHED_MODE_PREALLOCATED; +    err = hw_sched_execute_task(ctx, appctx, entry); +    ctx->mode = saved_mode; +     +    return err; +} diff --git a/src/Core/regularisers_CPU/hw_sched.h b/src/Core/regularisers_CPU/hw_sched.h new file mode 100644 index 0000000..c5d1285 --- /dev/null +++ b/src/Core/regularisers_CPU/hw_sched.h @@ -0,0 +1,151 @@ +/* + * The PyHST program is Copyright (C) 2002-2011 of the + * European Synchrotron Radiation Facility (ESRF) and + * Karlsruhe Institute of Technology (KIT). + * + * PyHST is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + *  + * hst is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + *  + * You should have received a copy of the GNU General Public License along + * with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _HW_SCHED_H +#define _HW_SCHED_H + +#include <glib.h> + +    // enable threading +#define HW_HAVE_SCHED_HEADERS +#define HW_USE_THREADS + + +typedef struct HWSchedT *HWSched; +#ifdef HW_USE_THREADS +typedef GMutex *HWMutex; +#else /* HW_USE_THREADS */ +typedef void *HWMutex; +#endif /* HW_USE_THREADS */ + + +#include "hw_thread.h" + +enum HWSchedModeT { +    HW_SCHED_MODE_PREALLOCATED = 0, +    HW_SCHED_MODE_SEQUENTIAL +}; +typedef enum HWSchedModeT HWSchedMode; + +enum HWSchedChunkT { +    HW_SCHED_CHUNK_INVALID = -1, +    HW_SCHED_CHUNK_INIT = -2, +    HW_SCHED_CHUNK_FREE = -3, +    HW_SCHED_CHUNK_TERMINATOR = -4 +}; +typedef enum HWSchedChunkT HWSchedChunk; + +enum HWSchedFlagsT { +    HW_SCHED_FLAG_INIT_CALL = 1,        //! Executed in each thread before real chunks +    HW_SCHED_FLAG_FREE_CALL = 2,        //! Executed in each thread after real chunks +    HW_SCHED_FLAG_TERMINATOR_CALL = 4   //! Executed in one of the threads after all threads are done +}; +typedef enum HWSchedFlagsT HWSchedFlags; + + +#define HW_SINGLE_MODE +//#define HW_DETECT_CPU_CORES +#define HW_MAX_THREADS 128 + +#ifdef HW_SINGLE_MODE +    typedef HWRunFunction HWEntry; +# define hw_run_entry(runs, entry) entry +#else /* HW_SINGLE_MODE */ +    typedef int HWEntry; +# define hw_run_entry(runs, entry) runs[entry] +#endif /* HW_SINGLE_MODE */ + +#ifndef HW_HIDE_DETAILS +struct HWSchedT { +    int status; +    int started; +     +    int n_threads; +    HWThread thread[HW_MAX_THREADS]; +     +#ifdef HW_USE_THREADS +    GCond *job_cond, *compl_cond; +    GMutex *job_cond_mutex, *compl_cond_mutex, *data_mutex; +    GMutex *sync_mutex; +#endif /* HW_USE_THREADS */ +     +    HWSchedMode mode; +    HWSchedMode saved_mode; +    HWSchedFlags flags; +    int *n_blocks; +    int *cur_block; + +    HWEntry entry; +    void *ctx; +}; +typedef struct HWSchedT HWSchedS; +#endif /* HW_HIDE_DETAILS */ + +# ifdef __cplusplus +extern "C" { +# endif + +HWSched hw_sched_create(int ppu_count); +int hw_sched_init(void); +void hw_sched_destroy(HWSched ctx); +int hw_sched_get_cpu_count(void); + +int hw_sched_set_sequential_mode(HWSched ctx, int *n_blocks, int *cur_block, HWSchedFlags flags); +int hw_sched_get_chunk(HWSched ctx, int thread_id); +int hw_sched_schedule_task(HWSched ctx, void *appctx, HWEntry entry); +int hw_sched_wait_task(HWSched ctx); +int hw_sched_execute_task(HWSched ctx, void *appctx, HWEntry entry); + +int hw_sched_schedule_thread_task(HWSched ctx, void *appctx, HWEntry entry); +int hw_sched_wait_thread_task(HWSched ctx); +int hw_sched_execute_thread_task(HWSched ctx, void *appctx, HWEntry entry); + +HWMutex hw_sched_create_mutex(void); +void hw_sched_destroy_mutex(HWMutex ctx); + +#ifdef HW_USE_THREADS +# define hw_sched_lock(ctx, type) g_mutex_lock(ctx->type##_mutex) +# define hw_sched_unlock(ctx, type) g_mutex_unlock(ctx->type##_mutex) +# define hw_sched_broadcast(ctx, type) g_cond_broadcast(ctx->type##_cond) +# define hw_sched_signal(ctx, type) g_cond_signal(ctx->type##_cond) +# define hw_sched_wait(ctx, type) g_cond_wait(ctx->type##_cond, ctx->type##_cond_mutex) + +#define hw_sched_create_mutex(void) g_mutex_new() +#define hw_sched_destroy_mutex(ctx) g_mutex_free(ctx) +#define hw_sched_lock_mutex(ctx) g_mutex_lock(ctx) +#define hw_sched_unlock_mutex(ctx) g_mutex_unlock(ctx) +#else /* HW_USE_THREADS */ +# define hw_sched_lock(ctx, type) +# define hw_sched_unlock(ctx, type) +# define hw_sched_broadcast(ctx, type) +# define hw_sched_signal(ctx, type) +# define hw_sched_wait(ctx, type) + +#define hw_sched_create_mutex(void) NULL +#define hw_sched_destroy_mutex(ctx) +#define hw_sched_lock_mutex(ctx) +#define hw_sched_unlock_mutex(ctx) +#endif /* HW_USE_THREADS */ + +# ifdef __cplusplus +} +# endif + +#endif /* _HW_SCHED_H */ + diff --git a/src/Core/regularisers_CPU/hw_thread.c b/src/Core/regularisers_CPU/hw_thread.c new file mode 100644 index 0000000..41b1800 --- /dev/null +++ b/src/Core/regularisers_CPU/hw_thread.c @@ -0,0 +1,141 @@ +/* + * The PyHST program is Copyright (C) 2002-2011 of the + * European Synchrotron Radiation Facility (ESRF) and + * Karlsruhe Institute of Technology (KIT). + * + * PyHST is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + *  + * hst is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + *  + * You should have received a copy of the GNU General Public License along + * with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "hw_sched.h" +#include "hw_thread.h" + +#ifdef HW_USE_THREADS +static void *hw_thread_function(HWThread ctx) { +    int err; +    int chunk_id; +     +    HWRunFunction *runs; +    HWRunFunction run; +    HWSched sched; +    void *hwctx; +     +    sched = ctx->sched; +    runs = ctx->run; +    hwctx = ctx->hwctx; +     +    hw_sched_lock(sched, job_cond); + +    hw_sched_lock(sched, compl_cond); +    ctx->status = HW_THREAD_STATUS_IDLE;     +    hw_sched_broadcast(sched, compl); +    hw_sched_unlock(sched, compl_cond); +     +    while (sched->status) { +	hw_sched_wait(sched, job); +	if (!sched->status) break; + +	ctx->err = 0; +	ctx->status = HW_THREAD_STATUS_STARTING; +	hw_sched_unlock(sched, job_cond); +	 +	run = hw_run_entry(runs, sched->entry); +#if 0 +	// Offset to interleave transfers if the GPUBox is used +	// Just check with CUDA_LAUNCH_BLOCKED the togpu time and put it here +	// It should be still significantly less than BP time +	// We can do callibration during initilization in future +	 +	usleep(12000 * ctx->thread_id); +#endif +	chunk_id = hw_sched_get_chunk(sched, ctx->thread_id); + +	    /* Should be after get_chunk, since we can check if it's first time */ +	ctx->status = HW_THREAD_STATUS_RUNNING;  +	while (chunk_id != HW_SCHED_CHUNK_INVALID) { +	    //printf("Thread %i processing slice %i\n", ctx->thread_id, chunk_id); +	    err = run(ctx, hwctx, chunk_id, sched->ctx); +	    if (err) { +		ctx->err = err; +		break; +	    } +	    chunk_id = hw_sched_get_chunk(sched, ctx->thread_id); +	} + +	hw_sched_lock(sched, job_cond); +	 +	hw_sched_lock(sched, compl_cond); +	ctx->status = HW_THREAD_STATUS_DONE; +	hw_sched_broadcast(sched, compl); +	hw_sched_unlock(sched, compl_cond); +    } + +    hw_sched_unlock(sched, job_cond); + +    g_thread_exit(NULL); +    return NULL; /* TODO: check this */ +} +#endif /* HW_USE_THREADS */ + + +HWThread hw_thread_create(HWSched sched, int thread_id, void *hwctx, HWRunFunction *run_func, HWFreeFunction free_func) { +    GError *err; +     +    HWThread ctx; +     +    ctx = (HWThread)malloc(sizeof(HWThreadS)); +    if (!ctx) return ctx; +     +    memset(ctx, 0, sizeof(HWThreadS)); + +    ctx->sched = sched; +    ctx->hwctx = hwctx; +    ctx->run = run_func; +    ctx->free = free_func; +    ctx->thread_id = thread_id; +    ctx->status = HW_THREAD_STATUS_INIT; +     +#ifdef HW_USE_THREADS +    ctx->thread = g_thread_create((GThreadFunc)hw_thread_function, ctx, 1, &err); +    if (!ctx->thread) { +	g_error_free(err); + +	hw_thread_destroy(ctx); +	return NULL; +    } +#endif /* HW_USE_THREADS */ +     +    return ctx; +} + +void hw_thread_destroy(HWThread ctx) { +#ifdef HW_USE_THREADS +    if (ctx->thread) { +	g_thread_join(ctx->thread); +    } +#endif /* HW_USE_THREADS */ +     +    if (ctx->data) { +	free(ctx->data); +    } +     +    if (ctx->free) { +	ctx->free(ctx->hwctx); +    } +     +    free(ctx); +} diff --git a/src/Core/regularisers_CPU/hw_thread.h b/src/Core/regularisers_CPU/hw_thread.h new file mode 100644 index 0000000..de7f60f --- /dev/null +++ b/src/Core/regularisers_CPU/hw_thread.h @@ -0,0 +1,76 @@ +/* + * The PyHST program is Copyright (C) 2002-2011 of the + * European Synchrotron Radiation Facility (ESRF) and + * Karlsruhe Institute of Technology (KIT). + * + * PyHST is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + *  + * hst is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + *  + * You should have received a copy of the GNU General Public License along + * with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _HW_THREAD_H +#define _HW_THREAD_H + +#include <glib.h> + +typedef struct HWThreadT *HWThread; +typedef int (*HWRunFunction)(HWThread thread, void *ctx, int block, void *attr); +typedef int (*HWFreeFunction)(void *ctx); + +#include "hw_sched.h" + +enum HWThreadStatusT { +    HW_THREAD_STATUS_IDLE = 0, +    HW_THREAD_STATUS_STARTING = 1, +    HW_THREAD_STATUS_RUNNING = 2, +    HW_THREAD_STATUS_FINISHING = 3, +    HW_THREAD_STATUS_FINISHING2 = 4, +    HW_THREAD_STATUS_DONE = 5, +    HW_THREAD_STATUS_INIT = 6 +}; +typedef enum HWThreadStatusT HWThreadStatus; + + +#ifndef HW_HIDE_DETAILS +struct HWThreadT { +    int thread_id; +    HWSched sched; + +#ifdef HW_USE_THREADS     +    GThread *thread; +#endif /* HW_USE_THREADS */ +     +    void *hwctx; +    HWRunFunction *run; +    HWFreeFunction free; +     +    int err; +    HWThreadStatus status; + +    void *data;			/**< Per-thread data storage, will be free'd if set */ +}; +typedef struct HWThreadT HWThreadS; +#endif /* HW_HIDE_DETAILS */ + +# ifdef __cplusplus +extern "C" { +# endif + +HWThread hw_thread_create(HWSched sched, int thread_id, void *hwctx, HWRunFunction *run_func, HWFreeFunction free_func); +void hw_thread_destroy(HWThread ctx); + +# ifdef __cplusplus +} +# endif + + +#endif /* _HW_THREAD_H */ | 
