1
cuFile Basics
Driver Initialization
C
// Initialize GDS driver (once per process) CUfileError_t status = cuFileDriverOpen(); if (status.err != CU_FILE_SUCCESS) { fprintf(stderr, "cuFile driver init failed: %d\n", status.err); return -1; } // Get driver properties CUfileDrvProps_t props; cuFileDriverGetProperties(&props); printf("Max direct I/O size: %zu\n", props.max_direct_io_size); printf("Required alignment: %zu\n", props.max_device_cache_size); // Cleanup (at process exit) cuFileDriverClose();
File Handle Registration
C
// Open file with O_DIRECT (required for GDS) int fd = open(filename, O_RDWR | O_DIRECT); if (fd < 0) { perror("open failed"); return -1; } // Register file with cuFile CUfileDescr_t descr; CUfileHandle_t handle; memset(&descr, 0, sizeof(descr)); descr.handle.fd = fd; descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; status = cuFileHandleRegister(&handle, &descr); if (status.err != CU_FILE_SUCCESS) { fprintf(stderr, "Handle registration failed: %d\n", status.err); close(fd); return -1; } // Use handle for I/O... // Deregister when done cuFileHandleDeregister(handle); close(fd);
Key Requirements
- File must be opened with the `O_DIRECT` flag
- I/O offset and size must be aligned (typically 4KB)
- GPU buffer must be registered before use
- Driver must be initialized before any API calls
2
cuFileRead/Write API
Synchronous Read
C
// Read directly into GPU memory ssize_t bytes_read = cuFileRead( handle, // CUfileHandle_t from registration gpu_buffer, // Device pointer (GPU memory) size, // Number of bytes to read file_offset, // Offset in file (must be aligned) buffer_offset // Offset in GPU buffer ); if (bytes_read < 0) { fprintf(stderr, "cuFileRead error: %zd\n", bytes_read); } else { printf("Read %zd bytes to GPU\n", bytes_read); }
Synchronous Write
C
// Write directly from GPU memory ssize_t bytes_written = cuFileWrite( handle, // CUfileHandle_t gpu_buffer, // Device pointer (source) size, // Number of bytes to write file_offset, // Offset in file (must be aligned) buffer_offset // Offset in GPU buffer ); if (bytes_written < 0) { fprintf(stderr, "cuFileWrite error: %zd\n", bytes_written); }
Alignment Requirements
| Parameter | Alignment | Notes |
|---|---|---|
| `file_offset` | 4KB (4096 bytes) | Must be multiple of filesystem block size |
| `size` | 4KB (4096 bytes) | Except for last read at EOF |
| `gpu_buffer` | 256 bytes | cudaMalloc provides 256-byte alignment |
| `buffer_offset` | 4 bytes | Must be 4-byte aligned |
3
Batch I/O Operations
When to Use Batch I/O
Batch operations amortize API overhead and enable the driver to optimize I/O scheduling. Use for multiple small reads or writes.
Batch Read Example
C
#define BATCH_SIZE 16 // Prepare batch I/O entries CUfileIOParams_t io_params[BATCH_SIZE]; CUfileIOEvents_t io_events[BATCH_SIZE]; for (int i = 0; i < BATCH_SIZE; i++) { io_params[i].mode = CUFILE_BATCH; io_params[i].fh = handle; io_params[i].u.batch.devPtr_base = gpu_buffer; io_params[i].u.batch.devPtr_offset = i * CHUNK_SIZE; io_params[i].u.batch.file_offset = i * CHUNK_SIZE; io_params[i].u.batch.size = CHUNK_SIZE; io_params[i].opcode = CUFILE_READ; } // Submit batch CUfileBatchHandle_t batch_handle; cuFileBatchIOSetUp(&batch_handle, BATCH_SIZE); cuFileBatchIOSubmit(batch_handle, BATCH_SIZE, io_params, 0); // Wait for completion unsigned nr_completed = 0; while (nr_completed < BATCH_SIZE) { int n = cuFileBatchIOGetStatus(batch_handle, BATCH_SIZE, &nr_completed, io_events, NULL); } // Check results for (int i = 0; i < BATCH_SIZE; i++) { if (io_events[i].status != CUFILE_COMPLETE) { fprintf(stderr, "Batch entry %d failed\n", i); } } cuFileBatchIODestroy(batch_handle);
Batch Performance Guidelines
| Batch Size | Overhead | Best For |
|---|---|---|
| 1-4 | High per-I/O | Not recommended for batch |
| 8-32 | Moderate | General purpose, random I/O |
| 64-128 | Low | High-throughput streaming |
| >256 | Diminishing returns | May increase latency |
4
Buffer Registration
GPU Buffer Registration
C
// Allocate GPU memory void* gpu_buffer; cudaMalloc(&gpu_buffer, buffer_size); // Register buffer with cuFile for optimal DMA status = cuFileBufRegister(gpu_buffer, buffer_size, 0); if (status.err != CU_FILE_SUCCESS) { fprintf(stderr, "Buffer registration failed: %d\n", status.err); } // Use buffer for I/O operations... // Deregister before freeing cuFileBufDeregister(gpu_buffer); cudaFree(gpu_buffer);
⚠️ Important
Buffer registration pins GPU memory and creates DMA mappings. This has overhead, so:
- Register buffers once, reuse for many I/O operations
- Don't register/deregister in hot loops
- Registration limit varies by GPU (check `max_device_cache_size`)
Compatibility Mode (Unregistered Buffers)
C
// cuFile can work with unregistered buffers (slower) void* gpu_buffer; cudaMalloc(&gpu_buffer, buffer_size); // This works but uses bounce buffer internally ssize_t bytes = cuFileRead(handle, gpu_buffer, size, offset, 0); // Performance: ~50% of registered buffer speed
5
Error Handling
Error Codes
| Error Code | Meaning | Recovery |
|---|---|---|
| `CU_FILE_SUCCESS` | Operation successful | N/A |
| `CU_FILE_DRIVER_NOT_INITIALIZED` | Driver not opened | Call cuFileDriverOpen() |
| `CU_FILE_INVALID_HANDLE` | Bad file handle | Re-register file handle |
| `CU_FILE_INVALID_OFFSET` | Misaligned offset | Align to 4KB boundary |
| `CU_FILE_CUDA_DRIVER_ERROR` | CUDA error | Check GPU state, cudaGetLastError() |
| `CU_FILE_IO_ERROR` | Storage I/O failed | Check filesystem, NVMe health |
Robust Error Handling Pattern
C
/*
 * Read `size` bytes at file offset `offset` into GPU memory, preferring
 * the GDS path and falling back to pread + cudaMemcpy when cuFileRead
 * fails (e.g. unsupported filesystem or misconfigured driver).
 *
 * Returns bytes read (>= 0) on success, or a negative value on failure.
 */
ssize_t gds_read_with_fallback(CUfileHandle_t handle, void* gpu_buf,
                               size_t size, off_t offset, int fd) {
    // Try the GDS path first.
    ssize_t ret = cuFileRead(handle, gpu_buf, size, offset, 0);
    if (ret >= 0) {
        return ret;  // Success via GDS
    }

    // GDS failed -- fall back to the CPU bounce path.
    fprintf(stderr, "GDS read failed (%zd), falling back to pread\n", ret);

    // Allocate a pinned host staging buffer.
    // FIX: the original ignored this return value; on allocation failure
    // pread would have been handed an uninitialized/invalid pointer.
    void* host_buf = NULL;
    if (cudaMallocHost(&host_buf, size) != cudaSuccess) {
        return -1;
    }

    // Read via standard I/O, then copy up to the device.
    ret = pread(fd, host_buf, size, offset);
    if (ret > 0) {
        cudaMemcpy(gpu_buf, host_buf, ret, cudaMemcpyHostToDevice);
    }
    cudaFreeHost(host_buf);
    return ret;
}
6
kvikIO Python API
kvikIO is RAPIDS' Python library for high-performance file I/O with GDS support. It provides a Pythonic interface with NumPy/CuPy integration.
Installation
Bash
# conda (recommended)
conda install -c rapidsai -c conda-forge kvikio

# pip
pip install kvikio-cu12 # for CUDA 12.x
Basic Usage
Python
import kvikio
import cupy as cp

# Check GDS availability.
# FIX: compat_mode() is True when kvikIO is *falling back* to POSIX I/O,
# i.e. GDS is NOT in use. The original printed the raw flag under the
# label "GDS available", inverting its meaning.
print(f"GDS available: {not kvikio.defaults.compat_mode()}")

# Read a file directly into GPU memory.
gpu_array = cp.empty(1024 * 1024, dtype=cp.float32)  # 1M float32 = 4MB
with kvikio.CuFile("/data/model.bin", "r") as f:
    bytes_read = f.read(gpu_array)
print(f"Read {bytes_read} bytes")

# Write GPU data back out to a file.
with kvikio.CuFile("/data/output.bin", "w") as f:
    bytes_written = f.write(gpu_array)
Async I/O with kvikIO
Python
import kvikio
import cupy as cp
import asyncio

# Async read for overlapping I/O with compute.
# NOTE(review): kvikio.CuFile.pread returns an IOFuture completed via
# .get(), not an awaitable, and CuFile may not implement the async
# context-manager protocol -- confirm against the installed kvikIO version.
async def load_batch(filename, gpu_buffer):
    async with kvikio.CuFile(filename, "r") as f:
        await f.pread(gpu_buffer, file_offset=0)
    return gpu_buffer

# Multiple concurrent reads.
# NOTE(review): BATCH_SIZE is assumed to be defined elsewhere -- TODO confirm.
async def load_all_batches(file_list):
    # One float32 device buffer per input file.
    buffers = [cp.empty(BATCH_SIZE, dtype=cp.float32) for _ in file_list]
    tasks = [load_batch(f, buf) for f, buf in zip(file_list, buffers)]
    return await asyncio.gather(*tasks)
Integration with PyTorch
Python
import torch
import kvikio
import os
import io

def load_checkpoint_gds(path, device="cuda:0"):
    """Load a torch checkpoint by staging the raw file bytes on the GPU via GDS.

    Args:
        path: Checkpoint file path.
        device: CUDA device used for the staging byte buffer.

    Returns:
        Whatever torch.load produces from the file's contents.
    """
    # Size the staging buffer to the whole file.
    size = os.path.getsize(path)

    # Read the file directly into GPU memory (the GDS fast path).
    buffer = torch.empty(size, dtype=torch.uint8, device=device)
    with kvikio.CuFile(path, "r") as f:
        f.read(buffer)

    # FIX: the original comment claimed deserialization "happens on GPU".
    # torch.load parses on the host, so the staged bytes are copied back
    # to the CPU here; only the initial file read benefits from GDS.
    return torch.load(io.BytesIO(buffer.cpu().numpy()))
7
Optimization Strategies
Prefetching & Double Buffering
Strategy: Hide storage latency by overlapping I/O with computation.
C
// Double-buffering pattern buffer[0] = async_read(batch_0); // Start first read for (batch = 1; batch < num_batches; batch++) { buffer[batch % 2] = async_read(batch); // Prefetch next sync(buffer[(batch-1) % 2]); // Wait for previous process_on_gpu(buffer[(batch-1) % 2]); // Compute }
I/O Size Optimization
| I/O Size | Overhead | Recommendation |
|---|---|---|
| < 4KB | Very High (50%+) | Batch multiple requests |
| 4KB - 64KB | Moderate (10-30%) | OK for random access |
| 64KB - 1MB | Low (3-10%) | Optimal for most workloads |
| > 1MB | Minimal (<3%) | Best for streaming |
Queue Depth Tuning
Finding Optimal Queue Depth
- Too shallow (QD=1-4): SSD parallelism underutilized
- Optimal (QD=32-128): Saturates SSD with acceptable latency
- Too deep (QD>256): Latency increases, diminishing returns
Rule: QD = (Target_BW × Latency) / IO_Size
Example: (14 GB/s × 100μs) / 128KB = 11 → use QD=16-32
Multi-SSD Scaling
1×
Single SSD
~14 GB/s
~14 GB/s
4×
4-way RAID0
~50 GB/s
~50 GB/s
8×
8-way RAID0
~100 GB/s
~100 GB/s
8
Complete Examples
Checkpoint Save with GDS
C
#include <cufile.h>
#include <cuda_runtime.h>

/*
 * Write `size` bytes of GPU memory to `path` via GDS, then trim the file
 * back to the exact size.
 *
 * Precondition: O_DIRECT requires the transfer length to be a multiple of
 * 4 KB, so this routine writes ceil(size/4KB)*4KB bytes starting at
 * gpu_data. The caller MUST allocate gpu_data rounded up to a 4 KB
 * multiple; otherwise cuFileWrite reads past the end of the allocation
 * (the original version had this out-of-bounds read with no warning).
 *
 * Returns 0 on success, -1 on any failure.
 */
int save_checkpoint(const char* path, void* gpu_data, size_t size) {
    /* Open for writing with O_DIRECT (required for GDS). */
    int fd = open(path, O_CREAT | O_WRONLY | O_DIRECT, 0644);
    if (fd < 0) return -1;

    /* Register the fd with cuFile. */
    CUfileDescr_t descr = {0};
    CUfileHandle_t handle;
    descr.handle.fd = fd;
    descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    if (cuFileHandleRegister(&handle, &descr).err != CU_FILE_SUCCESS) {
        close(fd);
        return -1;
    }

    /* Round the transfer length up to the 4 KB O_DIRECT granularity. */
    size_t aligned_size = (size + 4095) & ~4095UL;

    /* Write from GPU memory. */
    ssize_t written = cuFileWrite(handle, gpu_data, aligned_size, 0, 0);

    /* Cleanup: deregister, drop the alignment padding, close. */
    cuFileHandleDeregister(handle);
    ftruncate(fd, size);
    close(fd);

    /* FIX: compare as signed first; the original `written >= size`
     * promoted a negative error code to a huge unsigned value, which
     * would have reported success on failure. */
    return (written >= 0 && (size_t)written >= size) ? 0 : -1;
}
PyTorch DataLoader with GDS
Python
import torch
from torch.utils.data import Dataset, DataLoader
import kvikio
import cupy as cp

class GDSDataset(Dataset):
    """Dataset that reads each sample file straight into GPU memory via GDS."""

    def __init__(self, file_list, sample_size):
        # file_list: one file per sample.
        # sample_size: element count per sample (float32), assumed uniform
        # across all files -- TODO confirm with the data layout.
        self.files = file_list
        self.sample_size = sample_size

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        # Allocate GPU buffer
        buffer = cp.empty(self.sample_size, dtype=cp.float32)
        # Read directly to GPU via GDS
        with kvikio.CuFile(self.files[idx], "r") as f:
            f.read(buffer)
        # Convert to PyTorch tensor (zero-copy via DLPack)
        # NOTE(review): torch.as_tensor on a CuPy array shares device memory
        # via the CUDA array interface -- confirm no copy on this torch version.
        tensor = torch.as_tensor(buffer, device="cuda")
        return tensor

# Usage
dataset = GDSDataset(file_list, sample_size=1024*1024)
loader = DataLoader(dataset, batch_size=32, num_workers=0)  # num_workers=0 for GDS