From c65291e6b987283e4767a8ad2bd2d2433ca3782e Mon Sep 17 00:00:00 2001 From: Daniil Kazantsev Date: Thu, 28 Nov 2019 23:01:03 +0000 Subject: all work completed on gpu version of pdtv --- src/CMakeLists.txt | 10 +- src/Core/CMakeLists.txt | 12 +- src/Core/regularisers_CPU/PD_TV_core.c | 1 + src/Core/regularisers_GPU/TV_PD_GPU_core.cu | 522 +++++++++++++++++++++ src/Core/regularisers_GPU/TV_PD_GPU_core.h | 9 + src/Matlab/mex_compile/compileGPU_mex.m | 6 + src/Matlab/mex_compile/regularisers_CPU/PD_TV.c | 22 +- .../mex_compile/regularisers_GPU/PD_TV_GPU.cpp | 101 ++++ src/Python/ccpi/filters/regularisers.py | 4 +- src/Python/setup-regularisers.py.in | 1 + src/Python/src/gpu_regularisers.pyx | 72 ++- 11 files changed, 735 insertions(+), 25 deletions(-) create mode 100644 src/Core/regularisers_GPU/TV_PD_GPU_core.cu create mode 100644 src/Core/regularisers_GPU/TV_PD_GPU_core.h create mode 100644 src/Matlab/mex_compile/regularisers_GPU/PD_TV_GPU.cpp (limited to 'src') diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5fe1a57..f93c19a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,4 +17,12 @@ if (BUILD_MATLAB_WRAPPER) endif() if (BUILD_PYTHON_WRAPPER) add_subdirectory(Python) -endif() \ No newline at end of file +endif() +find_package(OpenMP REQUIRED) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index c1bd7bb..07aae3c 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -12,17 +12,6 @@ set (CIL_VERSION $ENV{CIL_VERSION} CACHE INTERNAL "Core Imaging Library version" message("CIL_VERSION ${CIL_VERSION}") #include (GenerateExportHeader) - -find_package(OpenMP) -if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") - -endif() - ## Build the regularisers package as a library message("Creating Regularisers as a shared library") @@ -115,6 +104,7 @@ if (BUILD_CUDA) CUDA_ADD_LIBRARY(cilregcuda SHARED ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_ROF_GPU_core.cu ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_FGP_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_PD_GPU_core.cu ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_SB_GPU_core.cu ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/LLT_ROF_GPU_core.cu ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TGV_GPU_core.cu diff --git a/src/Core/regularisers_CPU/PD_TV_core.c b/src/Core/regularisers_CPU/PD_TV_core.c index 65b8711..534091b 100644 --- a/src/Core/regularisers_CPU/PD_TV_core.c +++ b/src/Core/regularisers_CPU/PD_TV_core.c @@ -81,6 +81,7 @@ float PDTV_CPU_main(float *Input, float *U, float *infovector, float lambdaPar, /* copy U to U_old */ copyIm(U, U_old, (long)(dimX), (long)(dimY), 1l); + /* calculate divergence */ DivProj2D(U, Input, P1, P2,(long)(dimX), (long)(dimY), lt, tau); /* check early stopping criteria */ diff --git a/src/Core/regularisers_GPU/TV_PD_GPU_core.cu b/src/Core/regularisers_GPU/TV_PD_GPU_core.cu new file mode 100644 index 0000000..59fda3b --- /dev/null +++ b/src/Core/regularisers_GPU/TV_PD_GPU_core.cu @@ -0,0 +1,522 @@ +/* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "TV_PD_GPU_core.h" +#include "shared.h" +#include +#include +#include + +/* CUDA implementation of Primal-Dual TV [1] by Chambolle Pock denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambdaPar - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. lipschitz_const: convergence related parameter + * 6. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 7. nonneg: 'nonnegativity (0 is OFF by default, 1 is ON) + * 8. tau: time marching parameter + + * Output: + * [1] TV - Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * + * [1] Antonin Chambolle, Thomas Pock. "A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging", 2010 + */ + +#define BLKXSIZE2D 16 +#define BLKYSIZE2D 16 + +#define BLKXSIZE 8 +#define BLKYSIZE 8 +#define BLKZSIZE 8 + +#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) +// struct square { __host__ __device__ float operator()(float x) { return x * x; } }; + +/************************************************/ +/*****************2D modules*********************/ +/************************************************/ + +__global__ void dualPD_kernel(float *U, float *P1, float *P2, float sigma, int N, int M) +{ + + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + if (xIndex == N-1) P1[index] += sigma*(U[(xIndex-1) + N*yIndex] - U[index]); + else P1[index] += sigma*(U[(xIndex+1) + N*yIndex] - U[index]); + if (yIndex == M-1) P2[index] += sigma*(U[xIndex + N*(yIndex-1)] - U[index]); + else P2[index] += sigma*(U[xIndex + N*(yIndex+1)] - U[index]); + } + return; +} +__global__ void Proj_funcPD2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize) +{ + + float denom; + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + denom = pow(P1[index],2) + pow(P2[index],2); + if (denom > 1.0f) { + P1[index] = P1[index]/sqrt(denom); + P2[index] = P2[index]/sqrt(denom); + } + } + return; +} +__global__ void Proj_funcPD2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize) +{ + + float val1, val2; + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + val1 = abs(P1[index]); + val2 = abs(P2[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + P1[index] = P1[index]/val1; + P2[index] = P2[index]/val2; + } + return; +} +__global__ void DivProj2D_kernel(float *U, float *Input, float *P1, float *P2, float lt, float tau, int N, int M) +{ + float P_v1, P_v2, div_var; + + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + if (xIndex == 0) P_v1 = -P1[index]; + else P_v1 = -(P1[index] - P1[(xIndex-1) + N*yIndex]); + if (yIndex == 0) P_v2 = -P2[index]; + else P_v2 = -(P2[index] - P2[xIndex + N*(yIndex-1)]); + div_var = P_v1 + P_v2; + U[index] = (U[index] - tau*div_var + lt*Input[index])/(1.0 + lt); + } + return; +} +__global__ void PDnonneg2D_kernel(float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + if (Output[index] < 0.0f) Output[index] = 0.0f; + } +} +/************************************************/ +/*****************3D modules*********************/ +/************************************************/ +__global__ void dualPD3D_kernel(float *U, float *P1, float *P2, float *P3, float sigma, int N, int M, int Z) +{ + + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + if (i == N-1) P1[index] += sigma*(U[(N*M)*k + (i-1) + N*j] - U[index]); + else P1[index] += sigma*(U[(N*M)*k + (i+1) + N*j] - U[index]); + if (j == M-1) P2[index] += sigma*(U[(N*M)*k + i + N*(j-1)] - U[index]); + else P2[index] += sigma*(U[(N*M)*k + i + N*(j+1)] - U[index]); + if (k == Z-1) P3[index] += sigma*(U[(N*M)*(k-1) + i + N*j] - U[index]); + else P3[index] += sigma*(U[(N*M)*(k+1) + i + N*j] - U[index]); + } + return; +} +__global__ void Proj_funcPD3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) +{ + + float denom,sq_denom; + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2); + if (denom > 1.0f) { + sq_denom = 1.0f/sqrt(denom); + P1[index] *= sq_denom; + P2[index] *= sq_denom; + P3[index] *= sq_denom; + } + } + return; +} +__global__ void Proj_funcPD3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) +{ + + float val1, val2, val3; + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + val1 = abs(P1[index]); + val2 = abs(P2[index]); + val3 = abs(P3[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + if (val3 < 1.0f) {val3 = 1.0f;} + P1[index] /= val1; + P2[index] /= val2; + P3[index] /= val3; + } + return; +} +__global__ void DivProj3D_kernel(float *U, float *Input, float *P1, float *P2, float *P3, float lt, float tau, int N, int M, int Z) +{ + float P_v1, P_v2, P_v3, div_var; + + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + if (i == 0) P_v1 = -P1[index]; + else P_v1 = -(P1[index] - P1[(N*M)*k + (i-1) + N*j]); + if (j == 0) P_v2 = -P2[index]; + else P_v2 = -(P2[index] - P2[(N*M)*k + i + N*(j-1)]); + if (k == 0) P_v3 = -P3[index]; + else P_v3 = -(P3[index] - P3[(N*M)*(k-1) + i + N*j]); + div_var = P_v1 + P_v2 + P_v3; + U[index] = (U[index] - tau*div_var + lt*Input[index])/(1.0 + lt); + } + return; +} + +__global__ void PDnonneg3D_kernel(float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + if (Output[index] < 0.0f) Output[index] = 0.0f; + } +} +__global__ void PDcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void PDcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void getU2D_kernel(float *Input, float *Input_old, float theta, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Input[index] += theta*(Input[index] - Input_old[index]); + } +} + +__global__ void getU3D_kernel(float *Input, float *Input_old, float theta, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Input[index] += theta*(Input[index] - Input_old[index]); + } +} + +__global__ void PDResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + +__global__ void PDResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + + +/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ + +////////////MAIN HOST FUNCTION /////////////// +extern "C" int TV_PD_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, float lipschitz_const, int methodTV, int nonneg, float tau, int dimX, int dimY, int dimZ) +{ + int deviceCount = -1; // number of devices + cudaGetDeviceCount(&deviceCount); + if (deviceCount == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return -1; + } + int count = 0, i; + float re, sigma, theta, lt; + re = 0.0f; + + sigma = 1.0/(lipschitz_const*tau); + theta = 1.0f; + lt = tau/lambdaPar; + + if (dimZ <= 1) { + /*2D verson*/ + int ImSize = dimX*dimY; + float *d_input, *d_update, *d_old=NULL, *P1=NULL, *P2=NULL; + + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); + dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_old,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + checkCudaErrors( cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(P1, 0, ImSize*sizeof(float)); + cudaMemset(P2, 0, ImSize*sizeof(float)); + + /********************** Run CUDA 2D kernel here ********************/ + /* The main kernel */ + for (i = 0; i < iter; i++) { + + /* computing the the dual P variable */ + dualPD_kernel<<>>(d_update, P1, P2, sigma, dimX, dimY); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (nonneg != 0) { + PDnonneg2D_kernel<<>>(d_update, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } + + /* projection step */ + if (methodTV == 0) Proj_funcPD2D_iso_kernel<<>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/ + else Proj_funcPD2D_aniso_kernel<<>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* copy U to U_old */ + PDcopy_kernel2D<<>>(d_update, d_old, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* calculate divergence */ + DivProj2D_kernel<<>>(d_update, d_input, P1, P2, lt, tau, dimX, dimY); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + PDResidCalc2D_kernel<<>>(d_update, d_old, P1, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square unary_op; + thrust::plus binary_op; + thrust::device_vector d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + + getU2D_kernel<<>>(d_update, d_old, theta, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + cudaFree(d_old); + cudaFree(P1); + cudaFree(P2); + + } + else { + /*3D verson*/ + int ImSize = dimX*dimY*dimZ; + float *d_input, *d_update, *d_old=NULL, *P1=NULL, *P2=NULL, *P3=NULL; + + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); + dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_old,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + checkCudaErrors( cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(P1, 0, ImSize*sizeof(float)); + cudaMemset(P2, 0, ImSize*sizeof(float)); + cudaMemset(P3, 0, ImSize*sizeof(float)); + /********************** Run CUDA 3D kernel here ********************/ + for (i = 0; i < iter; i++) { + + /* computing the the dual P variable */ + dualPD3D_kernel<<>>(d_update, P1, P2, P3, sigma, dimX, dimY, dimZ); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (nonneg != 0) { + PDnonneg3D_kernel<<>>(d_update, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } + + /* projection step */ + if (methodTV == 0) Proj_funcPD3D_iso_kernel<<>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /*isotropic TV*/ + else Proj_funcPD3D_aniso_kernel<<>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /*anisotropic TV*/ + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* copy U to U_old */ + PDcopy_kernel3D<<>>(d_update, d_old, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* calculate divergence */ + DivProj3D_kernel<<>>(d_update, d_input, P1, P2, P3, lt, tau, dimX, dimY, dimZ); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + PDResidCalc3D_kernel<<>>(d_update, d_old, P1, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square unary_op; + thrust::plus binary_op; + thrust::device_vector d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + + /* get U*/ + getU3D_kernel<<>>(d_update, d_old, theta, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + /***************************************************************/ + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + cudaFree(d_old); + cudaFree(P1); + cudaFree(P2); + cudaFree(P3); + + } + //cudaDeviceReset(); + /*adding info into info_vector */ + infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; +} diff --git a/src/Core/regularisers_GPU/TV_PD_GPU_core.h b/src/Core/regularisers_GPU/TV_PD_GPU_core.h new file mode 100644 index 0000000..2b123d9 --- /dev/null +++ b/src/Core/regularisers_GPU/TV_PD_GPU_core.h @@ -0,0 +1,9 @@ +#ifndef _TV_PD_GPU_ +#define _TV_PD_GPU_ + +#include "CCPiDefines.h" +#include + +extern "C" CCPI_EXPORT int TV_PD_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, float lipschitz_const, int methodTV, int nonneg, float tau, int dimX, int dimY, int dimZ); + +#endif diff --git a/src/Matlab/mex_compile/compileGPU_mex.m b/src/Matlab/mex_compile/compileGPU_mex.m index 56fcd38..577630f 100644 --- a/src/Matlab/mex_compile/compileGPU_mex.m +++ b/src/Matlab/mex_compile/compileGPU_mex.m @@ -41,6 +41,11 @@ fprintf('%s \n', 'Compiling SB-TV...'); mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu SB_TV_GPU.cpp TV_SB_GPU_core.o movefile('SB_TV_GPU.mex*',Pathmove); +fprintf('%s \n', 'Compiling PD-TV...'); +!/usr/local/cuda/bin/nvcc -O0 -c TV_PD_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/ +mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu PD_TV_GPU.cpp TV_PD_GPU_core.o +movefile('PD_TV_GPU.mex*',Pathmove); + fprintf('%s \n', 'Compiling TGV...'); !/usr/local/cuda/bin/nvcc -O0 -c TGV_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/ mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu TGV_GPU.cpp TGV_GPU_core.o @@ -72,6 +77,7 @@ mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcu movefile('PatchSelect_GPU.mex*',Pathmove); delete TV_ROF_GPU_core* TV_FGP_GPU_core* TV_SB_GPU_core* dTV_FGP_GPU_core* NonlDiff_GPU_core* Diffus_4thO_GPU_core* TGV_GPU_core* LLT_ROF_GPU_core* CCPiDefines.h +delete TV_PD_GPU_core* delete PatchSelect_GPU_core* Nonlocal_TV_core* shared.h fprintf('%s \n', 'All successfully compiled!'); diff --git a/src/Matlab/mex_compile/regularisers_CPU/PD_TV.c b/src/Matlab/mex_compile/regularisers_CPU/PD_TV.c index eac2d18..e5ab1e4 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/PD_TV.c +++ b/src/Matlab/mex_compile/regularisers_CPU/PD_TV.c @@ -30,6 +30,7 @@ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) * 6. nonneg: 'nonnegativity (0 is OFF by default, 1 is ON) * 7. lipschitz_const: convergence related parameter + * 8. tau: convergence related parameter * Output: * [1] TV - Filtered/regularized image/volume @@ -45,7 +46,7 @@ void mexFunction( int number_of_dims, iter, methTV, nonneg; mwSize dimX, dimY, dimZ; const mwSize *dim_array; - float *Input, *infovec=NULL, *Output=NULL, lambda, epsil, lipschitz_const; + float *Input, *infovec=NULL, *Output=NULL, lambda, epsil, lipschitz_const, tau; number_of_dims = mxGetNumberOfDimensions(prhs[0]); dim_array = mxGetDimensions(prhs[0]); @@ -55,29 +56,30 @@ void mexFunction( Input = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */ lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ - iter = 400; /* default iterations number */ + iter = 500; /* default iterations number */ epsil = 1.0e-06; /* default tolerance constant */ methTV = 0; /* default isotropic TV penalty */ nonneg = 0; /* default nonnegativity switch, off - 0 */ - lipschitz_const = 12.0; /* lipschitz_const */ + lipschitz_const = 8.0; /* lipschitz_const */ + tau = 0.0025; /* tau convergence const */ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ - if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7)) { + if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ + if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) { char *penalty_type; penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ mxFree(penalty_type); } - if ((nrhs == 6) || (nrhs == 7)) { + if ((nrhs == 6) || (nrhs == 7) || (nrhs == 8)) { nonneg = (int) mxGetScalar(prhs[5]); if ((nonneg != 0) && (nonneg != 1)) mexErrMsgTxt("Nonnegativity constraint can be enabled by choosing 1 or off - 0"); } - if (nrhs == 7) lipschitz_const = (float) mxGetScalar(prhs[6]); - + if ((nrhs == 7) || (nrhs == 8)) lipschitz_const = (float) mxGetScalar(prhs[6]); + if (nrhs == 8) tau = (float) mxGetScalar(prhs[7]); /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -94,5 +96,5 @@ void mexFunction( infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); /* running the function */ - PDTV_CPU_main(Input, Output, infovec, lambda, iter, epsil, lipschitz_const, methTV, nonneg, dimX, dimY, dimZ); + PDTV_CPU_main(Input, Output, infovec, lambda, iter, epsil, lipschitz_const, methTV, nonneg, tau, dimX, dimY, dimZ); } diff --git a/src/Matlab/mex_compile/regularisers_GPU/PD_TV_GPU.cpp b/src/Matlab/mex_compile/regularisers_GPU/PD_TV_GPU.cpp new file mode 100644 index 0000000..e853dd3 --- /dev/null +++ b/src/Matlab/mex_compile/regularisers_GPU/PD_TV_GPU.cpp @@ -0,0 +1,101 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2019 Daniil Kazantsev + * Copyright 2019 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "matrix.h" +#include "mex.h" +#include "TV_PD_GPU_core.h" + +/* GPU implementation of Primal-Dual TV [1] by Chambolle Pock denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambdaPar - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 6. nonneg: 'nonnegativity (0 is OFF by default, 1 is ON) + * 7. lipschitz_const: convergence related parameter + * 8. tau: convergence related parameter + + * Output: + * [1] TV - Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * + * [1] Antonin Chambolle, Thomas Pock. "A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging", 2010 + */ + +void mexFunction( + int nlhs, mxArray *plhs[], + int nrhs, const mxArray *prhs[]) + +{ + int number_of_dims, iter, methTV, nonneg; + mwSize dimX, dimY, dimZ; + const mwSize *dim_array; + float *Input, *infovec=NULL, *Output=NULL, lambda, epsil, lipschitz_const, tau; + + number_of_dims = mxGetNumberOfDimensions(prhs[0]); + dim_array = mxGetDimensions(prhs[0]); + + /*Handling Matlab input data*/ + if ((nrhs < 2) || (nrhs > 7)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1'), nonnegativity switch, lipschitz_const"); + + Input = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */ + lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ + iter = 500; /* default iterations number */ + epsil = 1.0e-06; /* default tolerance constant */ + methTV = 0; /* default isotropic TV penalty */ + nonneg = 0; /* default nonnegativity switch, off - 0 */ + lipschitz_const = 8.0; /* lipschitz_const */ + tau = 0.0025; /* tau convergence const */ + + if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } + + if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ + if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) { + char *penalty_type; + penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ + if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); + if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ + mxFree(penalty_type); + } + if ((nrhs == 6) || (nrhs == 7) || (nrhs == 8)) { + nonneg = (int) mxGetScalar(prhs[5]); + if ((nonneg != 0) && (nonneg != 1)) mexErrMsgTxt("Nonnegativity constraint can be enabled by choosing 1 or off - 0"); + } + if ((nrhs == 7) || (nrhs == 8)) lipschitz_const = (float) mxGetScalar(prhs[6]); + if (nrhs == 8) tau = (float) mxGetScalar(prhs[7]); + + /*Handling Matlab output data*/ + dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; + + if (number_of_dims == 2) { + dimZ = 1; /*2D case*/ + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + } + if (number_of_dims == 3) { + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + } + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + + /* running the function */ + TV_PD_GPU_main(Input, Output, infovec, lambda, iter, epsil, lipschitz_const, methTV, nonneg, tau, dimX, dimY, dimZ); +} diff --git a/src/Python/ccpi/filters/regularisers.py b/src/Python/ccpi/filters/regularisers.py index bc745fe..5f4001a 100644 --- a/src/Python/ccpi/filters/regularisers.py +++ b/src/Python/ccpi/filters/regularisers.py @@ -4,7 +4,7 @@ script which assigns a proper device core function based on a flag ('cpu' or 'gp from ccpi.filters.cpu_regularisers import TV_ROF_CPU, TV_FGP_CPU, TV_PD_CPU, TV_SB_CPU, dTV_FGP_CPU, TNV_CPU, NDF_CPU, Diff4th_CPU, TGV_CPU, LLT_ROF_CPU, PATCHSEL_CPU, NLTV_CPU try: - from ccpi.filters.gpu_regularisers import TV_ROF_GPU, TV_FGP_GPU, TV_SB_GPU, dTV_FGP_GPU, NDF_GPU, Diff4th_GPU, TGV_GPU, LLT_ROF_GPU, PATCHSEL_GPU + from ccpi.filters.gpu_regularisers import TV_ROF_GPU, TV_FGP_GPU, TV_PD_GPU, TV_SB_GPU, dTV_FGP_GPU, NDF_GPU, Diff4th_GPU, TGV_GPU, LLT_ROF_GPU, PATCHSEL_GPU gpu_enabled = True except ImportError: gpu_enabled = False @@ -64,7 +64,7 @@ def PD_TV(inputData, regularisation_parameter, iterations, lipschitz_const, tau) elif device == 'gpu' and gpu_enabled: - return TV_PD_CPU(inputData, + return TV_PD_GPU(inputData, regularisation_parameter, iterations, tolerance_param, diff --git a/src/Python/setup-regularisers.py.in b/src/Python/setup-regularisers.py.in index 9a5b693..c4ad143 100644 --- a/src/Python/setup-regularisers.py.in +++ b/src/Python/setup-regularisers.py.in @@ -39,6 +39,7 @@ extra_include_dirs += [os.path.join(".." , "Core"), os.path.join(".." , "Core", "inpainters_CPU"), os.path.join(".." , "Core", "regularisers_GPU" , "TV_FGP" ) , os.path.join(".." , "Core", "regularisers_GPU" , "TV_ROF" ) , + os.path.join(".." , "Core", "regularisers_GPU" , "TV_PD" ) , os.path.join(".." , "Core", "regularisers_GPU" , "TV_SB" ) , os.path.join(".." , "Core", "regularisers_GPU" , "TGV" ) , os.path.join(".." , "Core", "regularisers_GPU" , "LLTROF" ) , diff --git a/src/Python/src/gpu_regularisers.pyx b/src/Python/src/gpu_regularisers.pyx index 8cd8c93..b22d15e 100644 --- a/src/Python/src/gpu_regularisers.pyx +++ b/src/Python/src/gpu_regularisers.pyx @@ -22,6 +22,7 @@ CUDAErrorMessage = 'CUDA error' cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float *lambdaPar, int lambda_is_arr, int iter, float tau, float epsil, int N, int M, int Z); cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int N, int M, int Z); +cdef extern int TV_PD_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, float lipschitz_const, int methodTV, int nonneg, float tau, int dimX, int dimY, int dimZ); cdef extern int TV_SB_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int N, int M, int Z); cdef extern int LLT_ROF_GPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int N, int M, int Z); cdef extern int TGV_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, float epsil, int dimX, int dimY, int dimZ); @@ -70,6 +71,75 @@ def TV_FGP_GPU(inputData, tolerance_param, methodTV, nonneg) +# Total-variation Primal-Dual (PD) +def TV_PD_GPU(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, lipschitz_const, tau): + if inputData.ndim == 2: + return TVPD2D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, lipschitz_const, tau) + elif inputData.ndim == 3: + return TVPD3D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, lipschitz_const, tau) + +def TVPD2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, + float regularisation_parameter, + int iterationsNumb, + float tolerance_param, + int methodTV, + int nonneg, + float lipschitz_const, + float tau): + + cdef long dims[2] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ + np.zeros([dims[0],dims[1]], dtype='float32') + + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + if (TV_PD_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], regularisation_parameter, + iterationsNumb, + tolerance_param, + lipschitz_const, + methodTV, + nonneg, + tau, + dims[1],dims[0], 1) ==0): + return (outputData,infovec) + else: + raise ValueError(CUDAErrorMessage); + +def TVPD3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + float regularisation_parameter, + int iterationsNumb, + float tolerance_param, + int methodTV, + int nonneg, + float lipschitz_const, + float tau): + + cdef long dims[3] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + dims[2] = inputData.shape[2] + + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ + np.zeros([dims[0], dims[1], dims[2]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + + if (TV_PD_GPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameter, + iterationsNumb, + tolerance_param, + lipschitz_const, + methodTV, + nonneg, + tau, + dims[2], dims[1], dims[0]) ==0): + return (outputData,infovec) + else: + raise ValueError(CUDAErrorMessage); + # Total-variation Split Bregman (SB) def TV_SB_GPU(inputData, regularisation_parameter, @@ -195,7 +265,7 @@ def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, if isinstance (regularisation_parameter, np.ndarray): reg = regularisation_parameter.copy() # Running CUDA code here - if (TV_ROF_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], + if (TV_ROF_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], ®[0,0], 1, iterations, time_marching_parameter, -- cgit v1.2.3