1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
|
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <pcilib.h>
#include <pcilib/mem.h>
#include <pcilib/bar.h>
#include <pcilib/kmem.h>
#include "CL/cl.h"
#include "CL/cl_ext.h"
#define KERNEL_CONTROL
//#define OPENCL_TIMINGS
#define CPU_WAIT
//#define CPU_MARKER
#define BAR PCILIB_BAR0
#define TLP_SIZE 64
#define PAGE_SIZE 4096
#define NUM_PAGES 16
#define ITERATIONS 100
#define KMEM_DEFAULT_FLAGS (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
#define REG_RESET_DMA 0x00
#define REG_DMA 0x04
#define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10
#define REG_PERF_COUNTER 0x20
//#define REG_PERF_COUNTER 0x28
#define REG_PACKET_LENGTH 0x0C
#define REG_DESCRIPTOR_ADDRESS 0x50
#define REG_UPDATE_ADDRESS 0x58
#define REG_UPDATE_THRESHOLD 0x60
#define REG_UPDATE_COUNTER 0x70
#define REG_COUNTER 0x9000
#define WR32(addr, value) *(uint32_t *) (bar + (addr)) = (value);
#define RD32(addr) (*(uint32_t *) (bar + (addr)))
#define WR32_sleep(addr, value) *(uint32_t *) (bar + (addr)) = (value); usleep (100);
#define WR64(addr, value) *(uint64_t *) (bar + (addr)) = (value);
#define RD64(addr) (*(uint64_t *) (bar + (addr)))
#define WR64_sleep(addr, value) *(uint64_t *) (bar + (addr)) = (value); usleep (100);
#define DATA_SIZE NUM_PAGES * PAGE_SIZE
#define CL_CHECK_STATUS(error) { \
if ((error) != CL_SUCCESS) fprintf (stderr, "OpenCL error <%s:%i>: %i\n", __FILE__, __LINE__, (error)); }
static void check_data(cl_command_queue queue, cl_mem mem, size_t size) {
uint32_t *data;
data = malloc (size);
memset (data, 0, size);
CL_CHECK_STATUS (clEnqueueReadBuffer (queue, mem, CL_TRUE, 0, size, data, 0, NULL, NULL));
printf("%lx\n", data[0]);
free (data);
}
int main(void)
{
int i;
cl_uint j = 0;
cl_context context;
cl_command_queue command_queue;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;//, host;
cl_bus_address_amd bus_address;
cl_event event, event1, event2;
cl_int status;
cl_command_type type;
size_t res_size;
clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD;
clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD;
CL_CHECK_STATUS(clGetPlatformIDs(1, &platform_id, &num_of_platforms));
clEnqueueMakeBuffersResidentAMD = (clEnqueueMakeBuffersResidentAMD_fn)clGetExtensionFunctionAddressForPlatform(platform_id, "clEnqueueMakeBuffersResidentAMD");
clEnqueueWaitSignalAMD = clGetExtensionFunctionAddressForPlatform (platform_id, "clEnqueueWaitSignalAMD");
CL_CHECK_STATUS(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&num_of_devices));
cl_context_properties properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform_id, 0 };
context = clCreateContext(properties, 1, &device_id, NULL,NULL, &err);
CL_CHECK_STATUS(err);
cl_queue_properties props[] = {
CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE/*|CL_QUEUE_ON_DEVICE|CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE_DEFAULT,
CL_QUEUE_SIZE, CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE*/,
0};
command_queue = clCreateCommandQueueWithProperties(context, device_id, props, &err);
CL_CHECK_STATUS(err);
input = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_BUS_ADDRESSABLE_AMD, DATA_SIZE, NULL, &err);
CL_CHECK_STATUS(err);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, 4096, NULL, &err);
CL_CHECK_STATUS(err);
memset(&bus_address, 0, sizeof(cl_bus_address_amd));
// CL_CHECK_STATUS(clEnqueueMakeBuffersResidentAMD(command_queue, 1, &input, CL_TRUE, &bus_address, 0, 0, NULL));
CL_CHECK_STATUS(clEnqueueMakeBuffersResidentAMD(command_queue, 1, &input, CL_FALSE, &bus_address, 0, 0, &event));
CL_CHECK_STATUS (clWaitForEvents (1, &event));
CL_CHECK_STATUS (clReleaseEvent (event));
printf("bus adress : surface : 0x%lx, marker : 0x%lx\n", bus_address.surface_bus_address, bus_address.marker_bus_address);
pcilib_t *pci = pcilib_open("/dev/fpga0", PCILIB_MODEL_DETECT);
if (!pci) {
printf("pcilib_open failed\n");
exit(1);
}
volatile void *bar = pcilib_resolve_bar_address(pci, BAR, 0);
if (!bar) {
pcilib_close(pci);
printf("map bar\n");
exit(1);
}
const pcilib_bar_info_t *bar_info = pcilib_get_bar_info(pci, BAR);
if (!bar_info) {
pcilib_close(pci);
printf("get bar info\n");
exit(1);
}
cl_bus_address_amd amd_addr = {
.surface_bus_address = bar_info->phys_addr,
.marker_bus_address = bar_info->phys_addr
};
cl_mem bar_cl = clCreateBuffer (context, CL_MEM_EXTERNAL_PHYSICAL_AMD, bar_info->size, &amd_addr, &err);
if (err) {
pcilib_close(pci);
printf("Error (%i) mapping BAR to GPU\n", err);
exit(1);
}
FILE *f = fopen("kernel.cl", "rb");
fseek(f, 0, SEEK_END);
long fsize = ftell(f);
fseek(f, 0, SEEK_SET); //same as rewind(f);
char *cl_string = malloc(fsize + 1);
fread(cl_string, fsize, 1, f);
cl_string[fsize] = 0;
fclose(f);
cl_program program = clCreateProgramWithSource (context, 1, (const char **) &cl_string, NULL, &err);
CL_CHECK_STATUS (err);
CL_CHECK_STATUS(clBuildProgram (program, 1, &device_id, "-cl-std=CL2.0 -D CL_VERSION_2_0", NULL, NULL));
size_t work_size = 1;
cl_kernel process_kernel = clCreateKernel (program, "process", &err);
CL_CHECK_STATUS (err);
CL_CHECK_STATUS (clSetKernelArg (process_kernel, 0, sizeof (uint), &j));
CL_CHECK_STATUS (clSetKernelArg (process_kernel, 1, sizeof (cl_mem), &input));
CL_CHECK_STATUS (clSetKernelArg (process_kernel, 2, sizeof (cl_mem), &output));
CL_CHECK_STATUS (clSetKernelArg (process_kernel, 3, sizeof (cl_mem), &bar_cl));
CL_CHECK_STATUS (clSetKernelArg (process_kernel, 4, sizeof (ulong), &bus_address.surface_bus_address));
cl_kernel measure_kernel = clCreateKernel (program, "measure", &err);
CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 0, sizeof (uint), &j));
CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 1, sizeof (cl_mem), &input));
CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 2, sizeof (cl_mem), &output));
CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 3, sizeof (cl_mem), &bar_cl));
CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 4, sizeof (ulong), &bus_address.surface_bus_address));
cl_kernel nil_kernel = clCreateKernel (program, "nil", &err);
pcilib_kmem_handle_t *desc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
uintptr_t desc_bus = pcilib_kmem_get_block_ba (pci, desc_kmem, 0);
volatile void *desc = (uint32_t *) pcilib_kmem_get_block_ua (pci, desc_kmem, 0);
memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t));
volatile uint64_t *hwaddr = (uint64_t*)(desc + 2 * sizeof(uint32_t));
pcilib_kmem_handle_t *kbuf_kmem = pcilib_alloc_kernel_memory(pci, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, 4096, 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
uintptr_t kbuf_bus = pcilib_kmem_get_block_ba (pci, kbuf_kmem, 0);
volatile uint32_t *kbuf = (uint32_t *) pcilib_kmem_get_block_ua (pci, kbuf_kmem, 0);
memset ((uint32_t *)kbuf, 0, 4096);
void *marker = pcilib_map_area(pci, bus_address.marker_bus_address, 4096);
if (!marker) {
printf("pcilib_map_area failed\n");
exit(1);
}
*(uint32_t*)marker = 0;
void *gpubuf = pcilib_map_area(pci, bus_address.surface_bus_address, 4096);
*(uint32_t*)gpubuf = 0x1;
check_data(command_queue, input, 4);
WR32 (REG_RESET_DMA, 1); usleep (100000);
WR32 (REG_RESET_DMA, 0); usleep (100000);
WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));
WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
WR32_sleep (REG_UPDATE_THRESHOLD, 1);
WR64_sleep (REG_UPDATE_ADDRESS, desc_bus);
WR64_sleep (REG_UPDATE_COUNTER, bus_address.marker_bus_address);
// WR32_sleep (REG_PERF_COUNTER, 0);
// WR32 (REG_DMA, 1);
WR32_sleep (0x9048, 0x232);
WR32_sleep (REG_COUNTER, 1);
usleep(1000000);
struct timespec tss, tse, tsk;
#ifdef KERNEL_CONTROL
cl_kernel kernel = measure_kernel;
#else
cl_kernel kernel = process_kernel;
#endif /* KERNEL_CONTROL */
for (i = 0; i < ITERATIONS; i++) {
cl_event wevent[NUM_PAGES], kevent[NUM_PAGES];
printf("Iteration %i\n", i);
WR32_sleep (REG_DMA, 0);
WR32_sleep (REG_PERF_COUNTER, 0);
*(volatile uint32_t*)marker = 0;
*hwaddr = 0;
clock_gettime(CLOCK_REALTIME, &tss);
// we rather need to trigger it every few milliseconds and see what happens.
// CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, 0, 0, NULL, &event));
for (j = 0; j < NUM_PAGES; j++) {
#ifndef KERNEL_CONTROL
WR64_sleep (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address + j * PAGE_SIZE);
#endif
#ifdef CPU_WAIT
wevent[j] = clCreateUserEvent(context, &err); CL_CHECK_STATUS(err);
#else
CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, j + 1, 0, NULL, &wevent[j]));
#endif
CL_CHECK_STATUS (clSetKernelArg (kernel, 0, sizeof (uint), &j));
CL_CHECK_STATUS (clEnqueueNDRangeKernel (command_queue, kernel, 1, NULL, &work_size, NULL, 1, &wevent[j], &kevent[j]));
CL_CHECK_STATUS (clFlush(command_queue));
}
#ifdef KERNEL_CONTROL
// we write one extra in the end.
# ifndef CPU_WAIT
CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, NUM_PAGES + 1, 0, NULL, NULL));
# endif
CL_CHECK_STATUS (clFlush(command_queue));
#endif
clock_gettime(CLOCK_REALTIME, &tse);
double lat_sched = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
// usleep(10000);
clock_gettime(CLOCK_REALTIME, &tss);
double lat_flush = (tss.tv_sec - tse.tv_sec)*1000000 + 1. * (tss.tv_nsec - tse.tv_nsec) / 1000.;
#ifdef KERNEL_CONTROL
WR64 (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address);
#endif
clock_gettime(CLOCK_REALTIME, &tss);
WR32 (REG_DMA, 1);
printf(" Markers (us):");
double lastlat = 0;
#ifdef CPU_WAIT
for (j = 0; j < NUM_PAGES; j++) {
# ifdef CPU_MARKER
while ((*hwaddr) == 0) {
}
*hwaddr = 0;
# else
while ((*(volatile uint32_t*)marker) < (j + 1)) {
}
# endif
clock_gettime(CLOCK_REALTIME, &tse);
CL_CHECK_STATUS (clSetUserEventStatus(wevent[j], CL_COMPLETE));
double latm = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
printf(" %8.3lf", latm - lastlat);
lastlat = latm;
}
#else
int cur, curupd;
for (cur = 0; (*(volatile uint32_t*)marker) < NUM_PAGES;) {
if (cur != (*(volatile uint32_t*)marker)) {
clock_gettime(CLOCK_REALTIME, &tse);
curupd = (*(volatile uint32_t*)marker);
double latm = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
for (cur++; cur < curupd; cur++) printf(" -");
printf(" %8.3lf", latm - lastlat);
cur = curupd;
lastlat = latm;
}
}
#endif
printf("\n");
CL_CHECK_STATUS (clWaitForEvents (1, &kevent[NUM_PAGES - 1]));
CL_CHECK_STATUS (clFinish(command_queue));
clock_gettime(CLOCK_REALTIME, &tse);
double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; lat /= NUM_PAGES;
printf(" Markers: 0x%lx %u\n", *hwaddr, *(volatile uint32_t*)marker);
printf(" GPU latencies: ");
for (j = 1; j < NUM_PAGES; j++) {
/*
cl_ulong start, submit, end;
CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &submit, NULL));
CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
printf(" Page %i start-end: %6.3lf us, submit-end: %6.3lf us\n", j, 1. * (end - start) / 1000, 1. * (start - submit) / 1000);
CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &submit, NULL));
CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
printf(" Kernel %i start-end: %6.3lf us, submit-end: %6.3lf us\n", j, 1. * (end - start) / 1000, 1. * (start - submit) / 1000);*/
cl_ulong end, endk, endw, startw, startk;
CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j - 1], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endk, NULL));
CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startk, NULL));
CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
#ifdef CPU_WAIT
startw = endk;
endw = startk;
#else
CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startw, NULL));
CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endw, NULL));
#endif
printf("k-%.3lf-w-%.3lf-w-%.3lf-k-%.3lf ", 1. * (startw - endk) / 1000, 1. * (endw - startw) / 1000, 1. * (startk - endw) / 1000, 1. * (end - startk) / 1000);
}
printf("\n");
double lath = 4. * RD32 (0x20) / 1000; lath /= NUM_PAGES;
printf(" fpga: %8.3lf us, software: %8.3lf us, sched: %8.3lf us, flush: %8.3lf us\n", lath, lat, lat_sched, lat_flush);
/*
CL_CHECK_STATUS(clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, &res_size));
printf(" Event return: %i (CL_COMPLETE: %i)\n", status, CL_COMPLETE);
CL_CHECK_STATUS(clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(type), &type, &res_size));
printf(" Event type: 0x%x (CL_COMMAND_WAIT_SIGNAL_AMD: 0x%x)\n", type, CL_COMMAND_WAIT_SIGNAL_AMD);
*/
for (j = 0; j < NUM_PAGES; j++) {
// CL_CHECK_STATUS (clReleaseEvent (wevent[j]));
CL_CHECK_STATUS (clReleaseEvent (kevent[j]));
}
// CL_CHECK_STATUS (clReleaseEvent (event));
#ifdef KERNEL_CONTROL
uint data[1024];
CL_CHECK_STATUS (clEnqueueReadBuffer (command_queue, output, CL_TRUE, 0, 4096, data, 0, NULL, NULL));
printf(" FPGA Latencies: ");
for (j = 0; j < NUM_PAGES + 1; j++) {
/* if (j)
printf("%6.3lf ", 4. * (data[j] - data[j - 1]) / 1000);
else
printf("%6.3lf ", 4. * data[j] / 1000);*/
printf("%u ", data[j]);
}
printf("\n");
#endif
}
WR32 (REG_COUNTER, 0);
WR32 (REG_DMA, 0);
usleep(10000);
WR32 (REG_RESET_DMA, 1); usleep (100000);
WR32 (REG_RESET_DMA, 0); usleep (100000);
pcilib_unmap_area(pci, marker, 4096);
pcilib_unmap_area(pci, gpubuf, 4096);
pcilib_free_kernel_memory(pci, kbuf_kmem, KMEM_DEFAULT_FLAGS);
pcilib_free_kernel_memory(pci, desc_kmem, KMEM_DEFAULT_FLAGS);
clReleaseMemObject(bar_cl);
pcilib_close(pci);
clReleaseKernel (nil_kernel);
clReleaseKernel (process_kernel);
clReleaseKernel (measure_kernel);
clReleaseProgram (program);
clReleaseMemObject(output);
clReleaseMemObject(input);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
}
|