diff options
| -rw-r--r-- | .gitignore | 8 | ||||
| -rw-r--r-- | CMakeLists.txt | 41 | ||||
| -rw-r--r-- | config.h | 9 | ||||
| -rw-r--r-- | gdr_test.cu | 295 | ||||
| -rw-r--r-- | gdrcopy.diff | 57 | ||||
| -rwxr-xr-x | insmod.sh | 42 | ||||
| -rw-r--r-- | ipedma.h | 20 | ||||
| -rw-r--r-- | kernels.cu | 48 | ||||
| -rw-r--r-- | kernels.h | 5 | 
9 files changed, 525 insertions, 0 deletions
| diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b527926 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +CMakeCache.txt +CMakeFiles +CMakeScripts +Makefile +cmake_install.cmake +install_manifest.txt +CTestTestfile.cmake +gdr_test diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..f3369d9 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,41 @@ +project(ipecamera) + +set(IPECAMERA_VERSION "0.0.1") +set(IPECAMERA_ABI_VERSION "0") + +cmake_minimum_required(VERSION 2.6) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") + +add_definitions("-fPIC --std=gnu99 -Wall -O2 -gdwarf-2 -g3  -fno-omit-frame-pointer") + +find_package(CUDA REQUIRED) +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=sm_35;-rdc=true) +set(CUDA_SEPARABLE_COMPILATION ON) + + +find_package(PkgConfig REQUIRED) + +pkg_check_modules(PCILIB pcitool>=0.2 REQUIRED) + + +include_directories( +    ${CMAKE_SOURCE_DIR} +    ${PCILIB_INCLUDE_DIRS} +    ${CUDA_INCLUDE_DIRS} +) + +link_directories( +    ${PCILIB_LIBRARY_DIRS} +    ${CUDA_LIBRARY_DIRS} +) + + +set(CUDA_KERNELS kernels.cu) + +#cuda_compile_ptx(cuda_ptx_files kernels.cu) +#add_custom_target(ptx ALL DEPENDS ${cuda_ptx_files} ${CUDA_KERNELS} SOURCES ${CUDA_KERNELS}) + + +cuda_add_executable(gdr_test gdr_test.cu kernels.cu) +target_link_libraries(gdr_test pcilib rt cuda gdrapi /usr/local/cuda/lib64/libcudadevrt.a) + diff --git a/config.h b/config.h new file mode 100644 index 0000000..acefc3b --- /dev/null +++ b/config.h @@ -0,0 +1,9 @@ +#define ITERS 100 +#define GPU_ITERS 1000 + +#define TLP_SIZE 64 +#define GPU_PAGE 65536 +#define PAGE_SIZE 4096 + +#define VERBOSE +#define GPU_DESC diff --git a/gdr_test.cu b/gdr_test.cu new file mode 100644 index 0000000..13af482 --- /dev/null +++ b/gdr_test.cu @@ -0,0 +1,295 @@ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdarg.h> +#include <time.h> +#include <sched.h> +#include <sys/time.h> + +#include <cuda.h> +#include <gdrapi.h> + + +#include <pcilib.h> +#include <pcilib/bar.h> +#include <pcilib/kmem.h> + +#include "config.h" +#include "ipedma.h" +#include "kernels.h" + +#define DEVICE "/dev/fpga0" + +#define BAR PCILIB_BAR0 + + +#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE) + +#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1) +#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2) + +#define gdrAssert(ans) { gdrError((ans), __FILE__, __LINE__); } +inline int gdrError(int code, const char *file, int line) +{ +    if (code != 0)  +    { +        fprintf(stderr,"GDRassert: %i %s %d\n", +                code, file, line); +        return code; +    } else { +        return 0; +    } +} + + +#define initAssert(ans) { initError((ans), __FILE__, __LINE__); } +inline int initError(CUresult code, const char *file, int line) +{ +    if (code != CUDA_SUCCESS)  +    { +        const char *error = NULL; +        cuGetErrorString (code, &error); +        fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n", +                error, code, file, line); +        return code; +    } else { +        return 0; +    } +} + + +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline int gpuAssert(cudaError_t code, const char *file, int line) +{ +    if (code != cudaSuccess)  +    { +        fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n", +                cudaGetErrorString(code), code, file, line); +        return code; +    } else { +        return 0; +    } +} + + + +int main(int argc, char *argv[]) { +    int err; + +        //CUDA initialization +    initAssert (cuInit(0)); + +    int num_gpus; +    initAssert (cuDeviceGetCount (&num_gpus)); +    printf ("Found %i GPUs on the system\n", num_gpus); + + +    CUdevice gpu; //will be used to find the correct GPU +    for (num_gpus--; num_gpus >= 0; num_gpus--) { + +        CUdevice current_gpu; +        initAssert (cuDeviceGet (¤t_gpu, num_gpus)); + +        char gpu_name[30] = {0}; +        initAssert (cuDeviceGetName (gpu_name, 30, current_gpu)); + +        printf("GPU %i: %s\n", num_gpus, gpu_name); + + +        if (strncmp (gpu_name, "Tesla K40", 9) == 0) { +            printf ("Found a Tesla GPU! I'll use that one.\n"); +            gpu = current_gpu; +            break; +        } +    } + +        //The CU_CTX_MAP_HOST is what we are interested in!  +    CUcontext context; +    initAssert (cuCtxCreate (&context, CU_CTX_MAP_HOST | CU_CTX_SCHED_AUTO, gpu)); +    initAssert (cuCtxSetCurrent (context)); + +        //NOTE: API Version 3010 is problematic +        //(see https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DRIVER.html) +    unsigned int api_version; +    initAssert (cuCtxGetApiVersion (context, &api_version)); +    printf ("CUDA API Version: %u\n", api_version); +    printf ("CUDA init done\n\n"); + +    CUdevprop gpu_props; +    initAssert(cuDeviceGetProperties(&gpu_props, gpu)); +    printf ("Clock %lu KHz\n", gpu_props.clockRate); + +    CUdeviceptr d_A, d_D; +    initAssert(cuMemAlloc(&d_D, GPU_PAGE));	// Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module +    initAssert(cuMemAlloc(&d_A, PAGE_SIZE)); +    unsigned int flag = 1; +    initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_D)); +    initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_A)); + +    gdr_mh_t A_mh, D_mh; +    gdr_info_t A_info, D_info; +    void *A_bar_ptr = NULL;  +    void *D_bar_ptr = NULL; + + +    gdr_t g = gdr_open(); +    gdrAssert(g == NULL); + +    gdrAssert(gdr_pin_buffer(g, d_D, GPU_PAGE, 0, 0, &D_mh)); +    gdrAssert(gdr_map(g, D_mh, &D_bar_ptr, GPU_PAGE)); +    gdrAssert(gdr_get_info(g, D_mh, &D_info)); + +    gdrAssert(gdr_pin_buffer(g, d_A, PAGE_SIZE, 0, 0, &A_mh)); +    gdrAssert(gdr_map(g, A_mh, &A_bar_ptr, PAGE_SIZE)); +    gdrAssert(gdr_get_info(g, A_mh, &A_info)); + +    int D_bar_off = D_info.va - d_D; +    volatile uint32_t *D = (uint32_t *)((char *)D_bar_ptr + D_bar_off); + +    int A_bar_off = A_info.va - d_A; +    volatile uint32_t *A = (uint32_t *)((char *)A_bar_ptr + A_bar_off); + +    printf("DevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu, Page: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size, A_info.page_size); + +    pcilib_t *pci; +    volatile void *bar; +    const pcilib_bar_info_t *bar_info; + +    pci = pcilib_open(DEVICE, PCILIB_MODEL_DETECT); +    if (!pci) { +        printf("pcilib_open\n"); +        exit(1); +    } +    bar = pcilib_resolve_bar_address(pci, BAR, 0); +    if (!bar) { +        pcilib_close(pci); +        printf("map bar\n"); +        exit(1); +    } +    printf("BAR mapped to: %p\n", bar); + +    CUdeviceptr dBAR; +//    initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP)); +    initAssert (cuMemHostRegister ((void*)bar, 4096, CU_MEMHOSTREGISTER_IOMEMORY)); +    initAssert (cuMemHostGetDevicePointer(&dBAR, (void*)bar, 0)); + +    bar_info = pcilib_get_bar_info(pci, BAR); +    printf("%p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size); + +    pcilib_kmem_handle_t *kdesc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS); +    uintptr_t kdesc_bus = pcilib_kmem_get_block_ba (pci, kdesc_kmem, 0); +    volatile void *kdesc = (uint32_t *) pcilib_kmem_get_block_ua (pci, kdesc_kmem, 0); + + +    pcilib_kmem_handle_t *kbuf_kmem = pcilib_alloc_kernel_memory(pci, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((PAGE_SIZE%4096)?(4096 * (1 + PAGE_SIZE/4096)):PAGE_SIZE), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS); +    uintptr_t kbuf_bus = pcilib_kmem_get_block_ba (pci, kbuf_kmem, 0); +    volatile uint32_t *kbuf = (uint32_t *) pcilib_kmem_get_block_ua (pci, kbuf_kmem, 0); +    memset ((uint32_t *)kbuf, 0, PAGE_SIZE); + +#ifdef GPU_DESC +    volatile void *desc = D; +    uintptr_t desc_bus = D_info.bus_addr; +#else +    volatile void *desc = kdesc; +    uintptr_t desc_bus = kdesc_bus; +#endif + + +    memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t)); +    volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t)); + +    WR32 (REG_RESET_DMA, 1); +    usleep (100000); +    WR32 (REG_RESET_DMA, 0); +    usleep (100000); + +    WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE)); +    WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE); +    WR32 (REG_UPDATE_THRESHOLD, 0); +    WR64 (REG_UPDATE_ADDRESS, desc_bus); +    WR32 (REG_DMA, 1); +    WR32 (REG_COUNTER, 1); + +#ifdef VERBOSE +    struct timespec tss, tse, tsk; +#else +    struct timeval tvs, tve; +#endif /* VERBOSE */ + +    for (int i = 0; i < ITERS; i++) { +	clock_gettime(CLOCK_REALTIME, &tss); + +#ifdef GPU_DESC +	ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint64_t*)d_D, (uint32_t*)d_A); +#else +	WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); +//    WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus); + +	do { +	} while (*hwaddr == 0); +	clock_gettime(CLOCK_REALTIME, &tse); + +	null<<<1, 1>>>((uint32_t*)d_A); +#endif +	cudaDeviceSynchronize(); + +	clock_gettime(CLOCK_REALTIME, &tsk); + +	*hwaddr = 0; +	 +#ifdef VERBOSE +        initAssert(cuMemcpyDtoH((void*)kbuf, d_A, PAGE_SIZE)); + +# ifdef GPU_DESC +	double lat = 1000. * kbuf[0] / gpu_props.clockRate; +	double latk = 1000. * kbuf[1] / gpu_props.clockRate; +	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; +#else +	double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; +	double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.; +	double latc = 0; +#endif + +	printf("Latency: %.3lf us / %.3lf us (%.3lf us) %x %x %x %x\n", lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); +#else +	if (!i)  gettimeofday(&tvs, NULL); +#endif /* VERBOSE */ +	 +    } + +#ifndef VERBOSE +    gettimeofday(&tve, NULL); +    size_t avglat = (tve.tv_sec - tvs.tv_sec)*1000000 + (tve.tv_usec - tvs.tv_usec); +    printf("Latency: %.3lf us (average for %i iterations)\n", 1. * avglat / ITERS, ITERS); +#endif /* VERBOSE */ + +    usleep(1000000); + + + +    WR32 (REG_COUNTER, 0); +    WR32 (REG_DMA, 0); + +    WR32 (REG_RESET_DMA, 1); +    usleep (100000); +    WR32 (REG_RESET_DMA, 0); +    usleep (100000); + +    pcilib_free_kernel_memory(pci, kbuf_kmem,  KMEM_DEFAULT_FLAGS); +    pcilib_free_kernel_memory(pci, kdesc_kmem,  KMEM_DEFAULT_FLAGS); + +    pcilib_close(pci); +    printf("PCI closed\n"); + + +    gdr_unmap(g, A_mh, A_bar_ptr, PAGE_SIZE); +    gdr_unpin_buffer(g, A_mh); + +    gdr_unmap(g, D_mh, D_bar_ptr, GPU_PAGE); +    gdr_unpin_buffer(g, D_mh); + +    gdr_close(g); +    cuMemFree(d_A); +    cuMemFree(d_D); +    printf("GDR closed\n"); +} diff --git a/gdrcopy.diff b/gdrcopy.diff new file mode 100644 index 0000000..c7043c7 --- /dev/null +++ b/gdrcopy.diff @@ -0,0 +1,57 @@ +diff --git a/gdrapi.c b/gdrapi.c +index e38fb8a..c9faeb5 100644 +--- a/gdrapi.c ++++ b/gdrapi.c +@@ -218,6 +218,7 @@ int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) +         info->page_size   = params.page_size; +         info->tm_cycles   = params.tm_cycles; +         info->cycles_per_ms = params.tsc_khz; ++        info->bus_addr = params.bus_addr; +     } +     return ret; + } +diff --git a/gdrapi.h b/gdrapi.h +index da02719..006f7f0 100644 +--- a/gdrapi.h ++++ b/gdrapi.h +@@ -89,6 +89,7 @@ struct gdr_info { +     uint32_t page_size; +     uint64_t tm_cycles; +     uint32_t cycles_per_ms; ++    uint64_t bus_addr; + }; + typedef struct gdr_info gdr_info_t; + int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info); +diff --git a/gdrdrv/gdrdrv.c b/gdrdrv/gdrdrv.c +index 8363051..8e78441 100644 +--- a/gdrdrv/gdrdrv.c ++++ b/gdrdrv/gdrdrv.c +@@ -443,11 +443,16 @@ static int gdrdrv_get_info(gdr_info_t *info, void __user *_params) +         return -EINVAL; +     } +  ++    struct nvidia_p2p_page *page = mr->page_table->pages[0]; ++    unsigned long page_paddr = page->physical_address; ++    unsigned long paddr = page_paddr + mr->offset; ++ +     params.va          = mr->va; +     params.mapped_size = mr->mapped_size; +     params.page_size   = mr->page_size; +     params.tm_cycles   = mr->tm_cycles; +     params.tsc_khz     = mr->tsc_khz; ++    params.bus_addr    = paddr; +  +     if (copy_to_user(_params, ¶ms, sizeof(params))) { +         gdr_err("copy_to_user failed on user pointer %p\n", _params); +diff --git a/gdrdrv/gdrdrv.h b/gdrdrv/gdrdrv.h +index 672a203..e1fd2a5 100644 +--- a/gdrdrv/gdrdrv.h ++++ b/gdrdrv/gdrdrv.h +@@ -77,6 +77,7 @@ struct GDRDRV_IOC_GET_INFO_PARAMS +     __u32 page_size; +     __u32 tsc_khz; +     __u64 tm_cycles; ++    __u64 bus_addr; + }; +  + #define GDRDRV_IOC_GET_INFO _IOWR(GDRDRV_IOCTL, 4, struct GDRDRV_IOC_GET_INFO_PARAMS *) diff --git a/insmod.sh b/insmod.sh new file mode 100755 index 0000000..de5eb3c --- /dev/null +++ b/insmod.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in  +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +THIS_DIR=$(dirname $0) + +# remove driver +grep gdrdrv /proc/devices >/dev/null && sudo /sbin/rmmod gdrdrv + +# insert driver +#sudo /sbin/insmod gdrdrv/gdrdrv.ko +modprobe gdrdrv + +# create device inodes +major=`fgrep gdrdrv /proc/devices | cut -b 1-4` +echo "INFO: driver major is $major" + +# remove old inodes just in case +if [ -e /dev/gdrdrv ]; then +    sudo rm /dev/gdrdrv +fi + +echo "INFO: creating /dev/gdrdrv inode" +sudo mknod /dev/gdrdrv c $major 0 +sudo chmod a+w+r /dev/gdrdrv diff --git a/ipedma.h b/ipedma.h new file mode 100644 index 0000000..284b058 --- /dev/null +++ b/ipedma.h @@ -0,0 +1,20 @@ +#define REG_RESET_DMA                   0x00 +#define REG_DMA                         0x04 +#define REG_NUM_PACKETS_PER_DESCRIPTOR  0x10 +#define REG_PERF_COUNTER                0x28 +#define REG_PACKET_LENGTH               0x0C +#define REG_DESCRIPTOR_ADDRESS          0x50 +#define REG_UPDATE_ADDRESS              0x58 +#define REG_UPDATE_THRESHOLD            0x60 + + +#define REG_COUNTER                     0x9000 + +#define WR32(addr, value) *(uint32_t *) (((char*)(bar)) + (addr)) = (value); +#define RD32(addr) (*(uint32_t *) (((char*)(bar)) + (addr))) +#define WR32_sleep(addr, value) *(uint32_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); + +#define WR64(addr, value) *(uint64_t *) (((char*)(bar)) + (addr)) = (value); +#define RD64(addr) (*(uint64_t *) (((char*)(bar)) + (addr))) +#define WR64_sleep(addr, value) *(uint64_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); + diff --git a/kernels.cu b/kernels.cu new file mode 100644 index 0000000..341bb59 --- /dev/null +++ b/kernels.cu @@ -0,0 +1,48 @@ +#include <cuda.h> +#include <stdint.h> + +#include "config.h" +#include "ipedma.h" + +__global__ void null(uint32_t *data) { +} + + +__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data) { +    int i; +    clock_t sum = 0, sumk = 0, t1, t2, t3; + +    for (i = 0; i < GPU_ITERS; i++) { +	long wait = 0; + +	desc[1] = 0; + +        t1 = clock64(); +	WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); + +        do { +	    if (++wait > 0x10000) break; +	} while (desc[1] == 0); + +	t2 = clock64(); + +	null<<<1,1>>>(data); +	cudaDeviceSynchronize(); +	t3 = clock64(); + +	sum += t2 - t1; +	sumk += t3 - t1; +    } + +    data[0] = sum / GPU_ITERS; +    data[1] = sumk / GPU_ITERS; +} + + + + +/* +__global__ void do_leet (int *rin) { +    *rin = 0x1337; +} +*/ diff --git a/kernels.h b/kernels.h new file mode 100644 index 0000000..74a0b44 --- /dev/null +++ b/kernels.h @@ -0,0 +1,5 @@ +__global__ void null(uint32_t *data); + +__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data); + + | 
