1
cuFile Basics
Driver Initialization
C
// Initialize GDS driver (once per process) CUfileError_t status = cuFileDriverOpen(); if (status.err != CU_FILE_SUCCESS) { fprintf(stderr, "cuFile driver init failed: %d\n", status.err); return -1; } // Get driver properties CUfileDrvProps_t props; cuFileDriverGetProperties(&props); printf("Max direct I/O size: %zu\n", props.max_direct_io_size); printf("Required alignment: %zu\n", props.max_device_cache_size); // Cleanup (at process exit) cuFileDriverClose();
File Handle Registration
C
// Open file with O_DIRECT (required for GDS) int fd = open(filename, O_RDWR | O_DIRECT); if (fd < 0) { perror("open failed"); return -1; } // Register file with cuFile CUfileDescr_t descr; CUfileHandle_t handle; memset(&descr, 0, sizeof(descr)); descr.handle.fd = fd; descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; status = cuFileHandleRegister(&handle, &descr); if (status.err != CU_FILE_SUCCESS) { fprintf(stderr, "Handle registration failed: %d\n", status.err); close(fd); return -1; } // Use handle for I/O... // Deregister when done cuFileHandleDeregister(handle); close(fd);
Key Requirements
- File must be opened with the `O_DIRECT` flag
- I/O offset and size must be aligned (typically 4KB)
- GPU buffer must be registered before use
- Driver must be initialized before any API calls
2
cuFileRead/Write API
Synchronous Read
C
// Read directly into GPU memory ssize_t bytes_read = cuFileRead( handle, // CUfileHandle_t from registration gpu_buffer, // Device pointer (GPU memory) size, // Number of bytes to read file_offset, // Offset in file (must be aligned) buffer_offset // Offset in GPU buffer ); if (bytes_read < 0) { fprintf(stderr, "cuFileRead error: %zd\n", bytes_read); } else { printf("Read %zd bytes to GPU\n", bytes_read); }
Synchronous Write
C
// Write directly from GPU memory ssize_t bytes_written = cuFileWrite( handle, // CUfileHandle_t gpu_buffer, // Device pointer (source) size, // Number of bytes to write file_offset, // Offset in file (must be aligned) buffer_offset // Offset in GPU buffer ); if (bytes_written < 0) { fprintf(stderr, "cuFileWrite error: %zd\n", bytes_written); }
Alignment Requirements
| Parameter | Alignment | Notes |
|---|---|---|
| `file_offset` | 4KB (4096 bytes) | Must be multiple of filesystem block size |
| `size` | 4KB (4096 bytes) | Except for last read at EOF |
| `gpu_buffer` | 256 bytes | cudaMalloc provides 256-byte alignment |
| `buffer_offset` | 4 bytes | Must be 4-byte aligned |
3
Batch I/O Operations
When to Use Batch I/O
Batch operations amortize API overhead and enable the driver to optimize I/O scheduling. Use for multiple small reads or writes.
Batch Read Example
C
#define BATCH_SIZE 16 // Prepare batch I/O entries CUfileIOParams_t io_params[BATCH_SIZE]; CUfileIOEvents_t io_events[BATCH_SIZE]; for (int i = 0; i < BATCH_SIZE; i++) { io_params[i].mode = CUFILE_BATCH; io_params[i].fh = handle; io_params[i].u.batch.devPtr_base = gpu_buffer; io_params[i].u.batch.devPtr_offset = i * CHUNK_SIZE; io_params[i].u.batch.file_offset = i * CHUNK_SIZE; io_params[i].u.batch.size = CHUNK_SIZE; io_params[i].opcode = CUFILE_READ; } // Submit batch CUfileBatchHandle_t batch_handle; cuFileBatchIOSetUp(&batch_handle, BATCH_SIZE); cuFileBatchIOSubmit(batch_handle, BATCH_SIZE, io_params, 0); // Wait for completion unsigned nr_completed = 0; while (nr_completed < BATCH_SIZE) { int n = cuFileBatchIOGetStatus(batch_handle, BATCH_SIZE, &nr_completed, io_events, NULL); } // Check results for (int i = 0; i < BATCH_SIZE; i++) { if (io_events[i].status != CUFILE_COMPLETE) { fprintf(stderr, "Batch entry %d failed\n", i); } } cuFileBatchIODestroy(batch_handle);
Batch Performance Guidelines
| Batch Size | Overhead | Best For |
|---|---|---|
| 1-4 | High per-I/O | Not recommended for batch |
| 8-32 | Moderate | General purpose, random I/O |
| 64-128 | Low | High-throughput streaming |
| >256 | Diminishing returns | May increase latency |
4
Buffer Registration
GPU Buffer Registration
C
// Allocate GPU memory void* gpu_buffer; cudaMalloc(&gpu_buffer, buffer_size); // Register buffer with cuFile for optimal DMA status = cuFileBufRegister(gpu_buffer, buffer_size, 0); if (status.err != CU_FILE_SUCCESS) { fprintf(stderr, "Buffer registration failed: %d\n", status.err); } // Use buffer for I/O operations... // Deregister before freeing cuFileBufDeregister(gpu_buffer); cudaFree(gpu_buffer);
⚠️ Important
Buffer registration pins GPU memory and creates DMA mappings. This has overhead, so:
- Register buffers once, reuse for many I/O operations
- Don't register/deregister in hot loops
- Registration limit varies by GPU (check `max_device_cache_size`)
Compatibility Mode (Unregistered Buffers)
C
// cuFile can work with unregistered buffers (slower) void* gpu_buffer; cudaMalloc(&gpu_buffer, buffer_size); // This works but uses bounce buffer internally ssize_t bytes = cuFileRead(handle, gpu_buffer, size, offset, 0); // Performance: ~50% of registered buffer speed
5
Error Handling
Error Codes
| Error Code | Meaning | Recovery |
|---|---|---|
| `CU_FILE_SUCCESS` | Operation successful | N/A |
| `CU_FILE_DRIVER_NOT_INITIALIZED` | Driver not opened | Call cuFileDriverOpen() |
| `CU_FILE_INVALID_HANDLE` | Bad file handle | Re-register file handle |
| `CU_FILE_INVALID_OFFSET` | Misaligned offset | Align to 4KB boundary |
| `CU_FILE_CUDA_DRIVER_ERROR` | CUDA error | Check GPU state, cudaGetLastError() |
| `CU_FILE_IO_ERROR` | Storage I/O failed | Check filesystem, NVMe health |
Robust Error Handling Pattern
C
/*
 * Read `size` bytes at file offset `offset` into GPU memory, preferring
 * the GDS path and falling back to pread + cudaMemcpy when cuFileRead
 * fails (e.g. unsupported filesystem or misconfigured driver).
 *
 * Returns bytes read (>= 0) on success, or a negative value on failure.
 */
ssize_t gds_read_with_fallback(CUfileHandle_t handle, void* gpu_buf,
                               size_t size, off_t offset, int fd) {
    // Try the GDS path first.
    ssize_t ret = cuFileRead(handle, gpu_buf, size, offset, 0);
    if (ret >= 0) {
        return ret;  // Success via GDS
    }

    // GDS failed -- fall back to the CPU bounce path.
    fprintf(stderr, "GDS read failed (%zd), falling back to pread\n", ret);

    // Allocate a pinned host staging buffer.
    // FIX: the original ignored this return value; on allocation failure
    // pread would have been handed an uninitialized/invalid pointer.
    void* host_buf = NULL;
    if (cudaMallocHost(&host_buf, size) != cudaSuccess) {
        return -1;
    }

    // Read via standard I/O, then copy up to the device.
    ret = pread(fd, host_buf, size, offset);
    if (ret > 0) {
        cudaMemcpy(gpu_buf, host_buf, ret, cudaMemcpyHostToDevice);
    }
    cudaFreeHost(host_buf);
    return ret;
}
6
kvikIO Python API
kvikIO is RAPIDS' Python library for high-performance file I/O with GDS support. It provides a Pythonic interface with NumPy/CuPy integration.
Installation
Bash
# conda (recommended)
conda install -c rapidsai -c conda-forge kvikio

# pip
pip install kvikio-cu12 # for CUDA 12.x
Basic Usage
Python
import kvikio
import cupy as cp

# Check GDS availability.
# FIX: compat_mode() is True when kvikIO is *falling back* to POSIX I/O,
# i.e. GDS is NOT in use. The original printed the raw flag under the
# label "GDS available", inverting its meaning.
print(f"GDS available: {not kvikio.defaults.compat_mode()}")

# Read a file directly into GPU memory.
gpu_array = cp.empty(1024 * 1024, dtype=cp.float32)  # 1M float32 = 4MB
with kvikio.CuFile("/data/model.bin", "r") as f:
    bytes_read = f.read(gpu_array)
print(f"Read {bytes_read} bytes")

# Write GPU data back out to a file.
with kvikio.CuFile("/data/output.bin", "w") as f:
    bytes_written = f.write(gpu_array)
Async I/O with kvikIO
Python
import kvikio
import cupy as cp
import asyncio

# Async read for overlapping I/O with compute.
# NOTE(review): kvikio.CuFile.pread returns an IOFuture completed via
# .get(), not an awaitable, and CuFile may not implement the async
# context-manager protocol -- confirm against the installed kvikIO version.
async def load_batch(filename, gpu_buffer):
    async with kvikio.CuFile(filename, "r") as f:
        await f.pread(gpu_buffer, file_offset=0)
    return gpu_buffer

# Multiple concurrent reads.
# NOTE(review): BATCH_SIZE is assumed to be defined elsewhere -- TODO confirm.
async def load_all_batches(file_list):
    # One float32 device buffer per input file.
    buffers = [cp.empty(BATCH_SIZE, dtype=cp.float32) for _ in file_list]
    tasks = [load_batch(f, buf) for f, buf in zip(file_list, buffers)]
    return await asyncio.gather(*tasks)
Integration with PyTorch
Python
import torch
import kvikio
import os
import io

def load_checkpoint_gds(path, device="cuda:0"):
    """Load a torch checkpoint by staging the raw file bytes on the GPU via GDS.

    Args:
        path: Checkpoint file path.
        device: CUDA device used for the staging byte buffer.

    Returns:
        Whatever torch.load produces from the file's contents.
    """
    # Size the staging buffer to the whole file.
    size = os.path.getsize(path)

    # Read the file directly into GPU memory (the GDS fast path).
    buffer = torch.empty(size, dtype=torch.uint8, device=device)
    with kvikio.CuFile(path, "r") as f:
        f.read(buffer)

    # FIX: the original comment claimed deserialization "happens on GPU".
    # torch.load parses on the host, so the staged bytes are copied back
    # to the CPU here; only the initial file read benefits from GDS.
    return torch.load(io.BytesIO(buffer.cpu().numpy()))
7
Optimization Strategies
Prefetching & Double Buffering
Strategy: Hide storage latency by overlapping I/O with computation.
C
// Double-buffering pattern buffer[0] = async_read(batch_0); // Start first read for (batch = 1; batch < num_batches; batch++) { buffer[batch % 2] = async_read(batch); // Prefetch next sync(buffer[(batch-1) % 2]); // Wait for previous process_on_gpu(buffer[(batch-1) % 2]); // Compute }
I/O Size Optimization
| I/O Size | Overhead | Recommendation |
|---|---|---|
| < 4KB | Very High (50%+) | Batch multiple requests |
| 4KB - 64KB | Moderate (10-30%) | OK for random access |
| 64KB - 1MB | Low (3-10%) | Optimal for most workloads |
| > 1MB | Minimal (<3%) | Best for streaming |
Queue Depth Tuning
Finding Optimal Queue Depth
- Too shallow (QD=1-4): SSD parallelism underutilized
- Optimal (QD=32-128): Saturates SSD with acceptable latency
- Too deep (QD>256): Latency increases, diminishing returns
Rule: QD = (Target_BW × Latency) / IO_Size
Example: (14 GB/s × 100μs) / 128KB = 11 → use QD=16-32
Multi-SSD Scaling
1×
Single SSD
~14 GB/s
~14 GB/s
4×
4-way RAID0
~50 GB/s
~50 GB/s
8×
8-way RAID0
~100 GB/s
~100 GB/s
8
Complete Examples
Checkpoint Save with GDS
C
#include <cufile.h>
#include <cuda_runtime.h>

/*
 * Write `size` bytes of GPU memory to `path` via GDS, then trim the file
 * back to the exact size.
 *
 * Precondition: O_DIRECT requires the transfer length to be a multiple of
 * 4 KB, so this routine writes ceil(size/4KB)*4KB bytes starting at
 * gpu_data. The caller MUST allocate gpu_data rounded up to a 4 KB
 * multiple; otherwise cuFileWrite reads past the end of the allocation
 * (the original version had this out-of-bounds read with no warning).
 *
 * Returns 0 on success, -1 on any failure.
 */
int save_checkpoint(const char* path, void* gpu_data, size_t size) {
    /* Open for writing with O_DIRECT (required for GDS). */
    int fd = open(path, O_CREAT | O_WRONLY | O_DIRECT, 0644);
    if (fd < 0) return -1;

    /* Register the fd with cuFile. */
    CUfileDescr_t descr = {0};
    CUfileHandle_t handle;
    descr.handle.fd = fd;
    descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    if (cuFileHandleRegister(&handle, &descr).err != CU_FILE_SUCCESS) {
        close(fd);
        return -1;
    }

    /* Round the transfer length up to the 4 KB O_DIRECT granularity. */
    size_t aligned_size = (size + 4095) & ~4095UL;

    /* Write from GPU memory. */
    ssize_t written = cuFileWrite(handle, gpu_data, aligned_size, 0, 0);

    /* Cleanup: deregister, drop the alignment padding, close. */
    cuFileHandleDeregister(handle);
    ftruncate(fd, size);
    close(fd);

    /* FIX: compare as signed first; the original `written >= size`
     * promoted a negative error code to a huge unsigned value, which
     * would have reported success on failure. */
    return (written >= 0 && (size_t)written >= size) ? 0 : -1;
}
PyTorch DataLoader with GDS
Python
import torch
from torch.utils.data import Dataset, DataLoader
import kvikio
import cupy as cp

class GDSDataset(Dataset):
    """Dataset that reads each sample file straight into GPU memory via GDS."""

    def __init__(self, file_list, sample_size):
        # file_list: one file per sample.
        # sample_size: element count per sample (float32), assumed uniform
        # across all files -- TODO confirm with the data layout.
        self.files = file_list
        self.sample_size = sample_size

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        # Allocate GPU buffer
        buffer = cp.empty(self.sample_size, dtype=cp.float32)
        # Read directly to GPU via GDS
        with kvikio.CuFile(self.files[idx], "r") as f:
            f.read(buffer)
        # Convert to PyTorch tensor (zero-copy via DLPack)
        # NOTE(review): torch.as_tensor on a CuPy array shares device memory
        # via the CUDA array interface -- confirm no copy on this torch version.
        tensor = torch.as_tensor(buffer, device="cuda")
        return tensor

# Usage
dataset = GDSDataset(file_list, sample_size=1024*1024)
loader = DataLoader(dataset, batch_size=32, num_workers=0)  # num_workers=0 for GDS