Comparing Code for Different APIs

To illustrate the difference in code between the Runtime and Driver APIs, compare the two listings below, "Host code for adding two vectors using the CUDA Runtime" and "Host code for adding two vectors using the CUDA Driver API". Both implement the same vector addition, in which two arrays are added.

Host code for adding two vectors using the CUDA Runtime

// Host-side vector addition using the CUDA Runtime API.
// Problem size: the launch below uses cnBlocks blocks of cnBlockSize threads,
// so the total thread count (3 * 512 = 1536) equals cnDimension.
const unsigned int cnBlockSize = 512;
const unsigned int cnBlocks    = 3;
const unsigned int cnDimension = cnBlocks * cnBlockSize;
// select the device; no explicit device or context handle is kept in this listing
cudaSetDevice( 0 );	// pick first device
// allocate host vectors (pA and pB are the inputs, pC receives the result)
float * pA = new float[cnDimension];
float * pB = new float[cnDimension];
float * pC = new float[cnDimension];
// initialize host memory
// NOTE(review): randomInit is defined outside this listing; presumably it fills
// the given array with random values - confirm against its definition.
randomInit(pA, cnDimension);
randomInit(pB, cnDimension);
// allocate device memory: one buffer of cnDimension floats per vector
float *pDeviceMemA, *pDeviceMemB, *pDeviceMemC;
cudaMalloc(&pDeviceMemA, cnDimension * sizeof(float));
cudaMalloc(&pDeviceMemB, cnDimension * sizeof(float));
cudaMalloc(&pDeviceMemC, cnDimension * sizeof(float));
// copy host vectors to device
cudaMemcpy(pDeviceMemA, pA, cnDimension * sizeof(float), 
  cudaMemcpyHostToDevice);
cudaMemcpy(pDeviceMemB, pB, cnDimension * sizeof(float),
  cudaMemcpyHostToDevice);
// execute kernel: cnBlocks blocks of cnBlockSize threads each
// NOTE(review): vectorAdd is defined outside this listing and receives no element
// count here; presumably it relies on the grid exactly covering cnDimension - confirm.
vectorAdd<<<cnBlocks, cnBlockSize>>> (pDeviceMemA, pDeviceMemB,
                                   pDeviceMemC);
// copy result from device to host
cudaMemcpy ((void *) pC, pDeviceMemC, cnDimension * sizeof(float),
cudaMemcpyDeviceToHost);
// release host memory
delete[] pA;
delete[] pB;
delete[] pC;
// release device memory
cudaFree(pDeviceMemA);
cudaFree(pDeviceMemB);
cudaFree(pDeviceMemC);

The listing "Host code for adding two vectors using the CUDA Runtime" consists of 27 lines of code. The listing "Host code for adding two vectors using the CUDA Driver API" shows the same functionality implemented using the CUDA Driver API.

Host code for adding two vectors using the CUDA Driver API

// Host-side vector addition using the CUDA Driver API.
// Problem size: the launch below uses cnBlocks blocks of cnBlockSize threads,
// so the total thread count (3 * 512 = 1536) equals cnDimension.
const unsigned int cnBlockSize = 512;
const unsigned int cnBlocks    = 3;
const unsigned int cnDimension = cnBlocks * cnBlockSize;
// Driver API handles: device, context, loaded module, and kernel entry point
CUdevice    hDevice;
CUcontext   hContext;
CUmodule    hModule;
CUfunction  hFunction;

// create CUDA device & context
cuInit(0);
cuDeviceGet(&hDevice, 0); // pick first device
cuCtxCreate(&hContext, 0, hDevice);
// load the compiled module and look up the kernel by name
cuModuleLoad(&hModule, "vectorAdd.cubin");
cuModuleGetFunction(&hFunction, hModule, "vectorAdd");
// allocate host vectors (pA and pB are the inputs, pC receives the result)
float * pA = new float[cnDimension];
float * pB = new float[cnDimension];
float * pC = new float[cnDimension];
// initialize host memory
// NOTE(review): randomInit is defined outside this listing; presumably it fills
// the given array with random values - confirm against its definition.
randomInit(pA, cnDimension);
randomInit(pB, cnDimension);
// allocate memory on the device
CUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC;
cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float));
// copy host vectors to device
cuMemcpyHtoD(pDeviceMemA, pA, cnDimension * sizeof(float));
cuMemcpyHtoD(pDeviceMemB, pB, cnDimension * sizeof(float));
// set up parameter values
// NOTE: cuFuncSetBlockShape / cuParamSetv / cuParamSetSize / cuLaunchGrid form the
// legacy launch interface, which the Driver API reference marks as deprecated in
// favor of cuLaunchKernel; it is kept here to show the manual argument marshalling.
cuFuncSetBlockShape(hFunction, cnBlockSize, 1, 1);
// ALIGN_UP rounds offset up to the next multiple of alignment, in place.
// The mask arithmetic is only valid when alignment is a power of two.
#define ALIGN_UP(offset, alignment) \
 (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
// Pack the three device pointers into the kernel's parameter buffer one after
// another, aligning each to its own alignment requirement before writing it.
int offset = 0;
ALIGN_UP(offset, __alignof(pDeviceMemA));
cuParamSetv(hFunction, offset, &pDeviceMemA, sizeof(pDeviceMemA));
offset += sizeof(pDeviceMemA);
ALIGN_UP(offset, __alignof(pDeviceMemB));
cuParamSetv(hFunction, offset, &pDeviceMemB, sizeof(pDeviceMemB));
offset += sizeof(pDeviceMemB);
ALIGN_UP(offset, __alignof(pDeviceMemC));
cuParamSetv(hFunction, offset, &pDeviceMemC, sizeof(pDeviceMemC));
offset += sizeof(pDeviceMemC);
// total size of the packed parameter buffer
cuParamSetSize(hFunction, offset);

// execute kernel: a cnBlocks x 1 grid of the block shape set above
cuLaunchGrid(hFunction, cnBlocks, 1);
// copy the result from device back to host
cuMemcpyDtoH((void *) pC, pDeviceMemC,
          cnDimension * sizeof(float));
// release host memory
delete[] pA;
delete[] pB;
delete[] pC;
// release device memory
cuMemFree(pDeviceMemA);
cuMemFree(pDeviceMemB);
cuMemFree(pDeviceMemC);

The listing "Host code for adding two vectors using the CUDA Driver API" contains 50 lines of code and performs several more lower-level operations than the Runtime API version. These additional calls are evident in several places, especially in the setup that the Driver API requires prior to the kernel launch.