summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorSuren A. Chilingaryan <csa@suren.me>2020-03-29 20:22:23 +0200
committerSuren A. Chilingaryan <csa@suren.me>2020-03-29 20:22:23 +0200
commit9b2ffcb1eb7e87b2b08670bbbd2fa8db6e5b2e1e (patch)
tree2cd53efd043ba96e38b547a8d24d8a1609a069cd /src
parentec31378b47fdf0eae06732e4e91c3b1bb02c454a (diff)
downloadregularization-9b2ffcb1eb7e87b2b08670bbbd2fa8db6e5b2e1e.tar.gz
regularization-9b2ffcb1eb7e87b2b08670bbbd2fa8db6e5b2e1e.tar.bz2
regularization-9b2ffcb1eb7e87b2b08670bbbd2fa8db6e5b2e1e.tar.xz
regularization-9b2ffcb1eb7e87b2b08670bbbd2fa8db6e5b2e1e.zip
Optimize cache usage also for the version with full back-tracking
Diffstat (limited to 'src')
-rwxr-xr-xsrc/Core/regularisers_CPU/TNV_core_backtrack.c123
1 files changed, 58 insertions, 65 deletions
diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c
index d771db5..7192475 100755
--- a/src/Core/regularisers_CPU/TNV_core_backtrack.c
+++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c
@@ -22,8 +22,10 @@
* limitations under the License.
*/
+#include <malloc.h>
#include "TNV_core.h"
+#define BLOCK 32
#define min(a,b) (((a)<(b))?(a):(b))
inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
@@ -196,18 +198,18 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
long Dim1Total = (long)(dimX*(stepY+1)*padZ);
// Auxiliar vectors
- ctx->Input = malloc(Dim1Total * sizeof(float));
- ctx->u = malloc(Dim1Total * sizeof(float));
- ctx->u_upd = malloc(Dim1Total * sizeof(float));
- ctx->qx = malloc(DimTotal * sizeof(float));
- ctx->qy = malloc(DimTotal * sizeof(float));
- ctx->qx_upd = malloc(DimTotal * sizeof(float));
- ctx->qy_upd = malloc(DimTotal * sizeof(float));
- ctx->gradx = malloc(DimTotal * sizeof(float));
- ctx->grady = malloc(DimTotal * sizeof(float));
- ctx->gradx_upd = malloc(DimTotal * sizeof(float));
- ctx->grady_upd = malloc(DimTotal * sizeof(float));
- ctx->div = malloc(Dim1Total * sizeof(float));
+ ctx->Input = memalign(64, Dim1Total * sizeof(float));
+ ctx->u = memalign(64, Dim1Total * sizeof(float));
+ ctx->u_upd = memalign(64, Dim1Total * sizeof(float));
+ ctx->qx = memalign(64, DimTotal * sizeof(float));
+ ctx->qy = memalign(64, DimTotal * sizeof(float));
+ ctx->qx_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->qy_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->gradx = memalign(64, DimTotal * sizeof(float));
+ ctx->grady = memalign(64, DimTotal * sizeof(float));
+ ctx->gradx_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->grady_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->div = memalign(64, Dim1Total * sizeof(float));
ctx->div_upd = malloc(Dim1Total * sizeof(float));
if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
@@ -332,7 +334,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
- long i, j, k, l, m;
+ long i, j, k, l;
tnv_context_t *tnv_ctx = (tnv_context_t*)data;
tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
@@ -374,50 +376,41 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
float theta1 = 1.0f + theta;
float constant = 1.0f + taulambda;
- float resprimal = 0.0f;
- float resdual = 0.0f;
- float product = 0.0f;
- float unorm = 0.0f;
- float qnorm = 0.0f;
-
- float udiff[dimZ];
- float qxdiff;
- float qydiff;
- float divdiff;
- float gradxdiff[dimZ];
- float gradydiff[dimZ];
-
- for(i=0; i < dimX; i++) {
- for(k = 0; k < dimZ; k++) {
- int l = i * padZ + k;
- u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant;
- div_upd[l] = 0;
- }
- }
-
- for(j = 0; j < stepY; j++) {
-/* m = j * dimX * dimZ + (dimX - 1) * dimZ;
- for(k = 0; k < dimZ; k++) {
- u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant;
- }*/
-
- for(i = 0; i < dimX/* - 1*/; i++) {
+ float resprimal = 0.0;
+ float resdual = 0.0;
+ float product = 0.0;
+ float unorm = 0.0;
+ float qnorm = 0.0;
+
+ float udiff[dimZ] __attribute__((aligned(64)));
+ float qxdiff __attribute__((aligned(64)));
+ float qydiff __attribute__((aligned(64)));
+ float divdiff __attribute__((aligned(64)));
+ float gradxdiff[dimZ] __attribute__((aligned(64)));
+ float gradydiff[dimZ] __attribute__((aligned(64)));
+
+ for(int j1 = 0; j1 < stepY; j1 += BLOCK) {
+ for(int i1 = 0; i1 < dimX; i1 += BLOCK) {
+ for(int j2 = 0; j2 < BLOCK; j2++) {
+ j = j1 + j2;
+ for(int i2 = 0; i2 < BLOCK; i2++) {
float t[3];
float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+
+ i = i1 + i2;
+ if (i == dimX) break;
+ if (j == stepY) { j2 = BLOCK; break; }
l = (j * dimX + i) * padZ;
- m = dimX * padZ;
-//#pragma unroll 64
+//#pragma vector aligned
+#pragma GCC ivdep
for(k = 0; k < dimZ; k++) {
- u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant;
-
- gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]);
- grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]); // We need div from the next thread on last iter
-
+ u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
udiff[k] = u[l + k] - u_upd[l + k];
unorm += (udiff[k] * udiff[k]);
-// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm);
+ gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]);
+ grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]);
gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
gradydiff[k] = grady[l + k] - grady_upd[l + k];
@@ -440,7 +433,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
coefF(t, M1, M2, M3, sigma, p, q, r);
-//#pragma unroll 64
+//#pragma vector aligned
+#pragma GCC ivdep
for(k = 0; k < dimZ; k++) {
#ifdef TNV_NEW_STYLE
float vx = divsigma * qx_upd[l + k];
@@ -464,14 +458,12 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
#endif
-if(i != (dimX-1)) {
- div_upd[l + k] += qx_upd[l + k];
- div_upd[l + k + padZ] -= qx_upd[l + k];
-}
-if(j != (copY-1)) {
- div_upd[l + k] += qy_upd[l + k];
- div_upd[l + k + m] = -qy_upd[l + k]; // We need to update div in the next thread on last iter
-}
+ float div_upd_val = 0;
+ div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0;
+ div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0;
+ div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0;
+ div_upd_val += (j < (copY-1))?qy_upd[l + k]:0;
+ div_upd[l + k] = div_upd_val;
qxdiff = qx[l + k] - qx_upd[l + k];
qydiff = qy[l + k] - qy_upd[l + k];
@@ -488,7 +480,9 @@ if(j != (copY-1)) {
}
} // i
- }
+ } // j
+ } // i
+ } // j
ctx->resprimal = resprimal;
@@ -509,8 +503,8 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ)
tnv_ctx.dimY = dimY;
tnv_ctx.dimZ = dimZ;
// Padding seems actually slower
-// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0));
tnv_ctx.padZ = dimZ;
+// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
hw_sched_init();
@@ -620,17 +614,16 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1);
tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
- m = ctx0->stepY * dimX * padZ;
+ m = ctx0->stepY * dimX * padZ;
for(i = 0; i < dimX; i++) {
for(k = 0; k < dimZ; k++) {
int l = i * padZ + k;
- float div_upd_add = ctx0->div_upd[m + l];
- ctx->div_upd[l] += div_upd_add;
+ ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l];
ctx0->div_upd[m + l] = ctx->div_upd[l];
- //ctx0->u_upd[m + l] = ctx->u_upd[l];
+ ctx0->u_upd[m + l] = ctx->u_upd[l];
- float divdiff = ctx->div[l] - ctx->div_upd[l]; // Multiple steps... How we compute without history?
+ float divdiff = ctx->div[l] - ctx->div_upd[l];
float udiff = ctx->u[l] - ctx->u_upd[l];
resprimal += fabs(divtau * udiff + divdiff);
}