ggmlR supports multi-GPU inference through the backend scheduler API. This allows you to distribute computations across multiple GPUs for improved performance with large models.
First, check how many GPUs are available:
library(ggmlR)

# Enumerate the Vulkan devices visible to ggmlR and print each one's
# description and memory size. Nothing is printed when Vulkan is unavailable.
if (ggml_vulkan_available()) {
  device_total <- ggml_vulkan_device_count()
  cat("Available GPUs:", device_total, "\n\n")

  # The ggml_vulkan_* calls below are given zero-based device indices.
  for (device_id in seq_len(device_total) - 1) {
    cat("GPU", device_id, ":", ggml_vulkan_device_description(device_id), "\n")
    # NOTE(review): assumes ggml_vulkan_device_memory() returns a single byte
    # count that can be divided directly -- confirm against the package docs.
    device_gib <- ggml_vulkan_device_memory(device_id) / 1024^3
    cat(" Memory:", round(device_gib, 2), "GB\n")
  }
}

The scheduler automatically distributes work across backends:
# Build a scheduler over two Vulkan devices and list the backends it holds.
# The whole block is skipped unless at least two Vulkan devices are reported.
if (ggml_vulkan_available() && ggml_vulkan_device_count() >= 2) {
  # Initialize multiple GPU backends
  gpu0 <- ggml_vulkan_init(0)
  gpu1 <- ggml_vulkan_init(1)

  # Create scheduler with multiple backends
  # Order matters: first backend is preferred for supported operations
  sched <- ggml_backend_sched_new(list(gpu0, gpu1))

  # Query the backend count once and reuse it for the message and the loop.
  n_backends <- ggml_backend_sched_get_n_backends(sched)
  cat("Scheduler created with", n_backends,
      "backends\n")

  # Check backends (the scheduler is queried with zero-based indices)
  for (i in seq_len(n_backends)) {
    backend <- ggml_backend_sched_get_backend(sched, i - 1)
    cat("Backend", i - 1, ":", ggml_backend_name(backend), "\n")
  }

  # Cleanup: release the scheduler first, then the backends it was built on,
  # in the same order the other examples in this document use. Without this
  # the handles are dropped when later examples reuse the same variable names.
  ggml_backend_sched_free(sched)
  ggml_vulkan_free(gpu0)
  # invisible() keeps the block from auto-printing the last call's value.
  invisible(ggml_vulkan_free(gpu1))
}

A common pattern is to use GPU with CPU as fallback for unsupported operations:
# Pair one Vulkan device with a CPU backend and report which backend the
# scheduler picked for each tensor of a matrix multiplication.
if (ggml_vulkan_available()) {
  # Initialize backends
  gpu <- ggml_vulkan_init(0)
  cpu <- ggml_backend_cpu_init()
  ggml_backend_cpu_set_n_threads(cpu, 4)

  # GPU first, CPU as fallback
  sched <- ggml_backend_sched_new(list(gpu, cpu))
  ctx <- ggml_init(64 * 1024^2)

  # Create computation: c = mul_mat(a, b) on two square F32 matrices
  side <- 1000
  a <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, side, side)
  b <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, side, side)
  c <- ggml_mul_mat(ctx, a, b)
  graph <- ggml_build_forward_expand(ctx, c)

  ggml_backend_sched_reserve(sched, graph)
  ggml_backend_sched_alloc_graph(sched, graph)

  # Check which backend handles each tensor
  cat("\nTensor backend assignment:\n")
  tensors <- list(a = a, b = b, c = c)
  for (label in names(tensors)) {
    assigned <- ggml_backend_sched_get_tensor_backend(sched, tensors[[label]])
    cat(paste0(" ", label, ":"), ggml_backend_name(assigned),
        "\n")
  }

  # Cleanup
  ggml_backend_sched_free(sched)
  ggml_vulkan_free(gpu)
  ggml_backend_free(cpu)
  ggml_free(ctx)
}

You can explicitly assign tensors to specific backends:
# Run two independent matrix products, pinning each one to its own Vulkan
# device, then add the two results together. Skipped with fewer than two
# devices.
if (ggml_vulkan_available() && ggml_vulkan_device_count() >= 2) {
  gpu0 <- ggml_vulkan_init(0)
  gpu1 <- ggml_vulkan_init(1)
  sched <- ggml_backend_sched_new(list(gpu0, gpu1))
  ctx <- ggml_init(128 * 1024^2)

  # Create tensors for two parallel computations
  side <- 512
  a1 <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, side, side)
  b1 <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, side, side)
  c1 <- ggml_mul_mat(ctx, a1, b1)

  a2 <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, side, side)
  b2 <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, side, side)
  c2 <- ggml_mul_mat(ctx, a2, b2)

  # Combine results
  result <- ggml_add(ctx, c1, c2)
  graph <- ggml_build_forward_expand(ctx, result)

  # Manually assign tensors to different GPUs:
  # first product on gpu0, second product on gpu1
  for (tensor in list(a1, b1, c1)) {
    ggml_backend_sched_set_tensor_backend(sched, tensor, gpu0)
  }
  for (tensor in list(a2, b2, c2)) {
    ggml_backend_sched_set_tensor_backend(sched, tensor, gpu1)
  }

  ggml_backend_sched_reserve(sched, graph)
  ggml_backend_sched_alloc_graph(sched, graph)

  # Set data and compute; inputs are filled in the order a1, b1, a2, b2
  for (input in list(a1, b1, a2, b2)) {
    ggml_set_f32(input, rnorm(side * side))
  }
  ggml_backend_sched_graph_compute(sched, graph)

  cat("Multi-GPU computation completed\n")
  cat("Result shape:", ggml_tensor_shape(result), "\n")

  # Cleanup
  ggml_backend_sched_free(sched)
  ggml_vulkan_free(gpu0)
  ggml_vulkan_free(gpu1)
  ggml_free(ctx)
}

For maximum performance, use asynchronous operations:
# Launch a small ReLU + sum graph with the asynchronous compute call, then
# synchronize before reading the result. Skipped with fewer than two devices.
if (ggml_vulkan_available() && ggml_vulkan_device_count() >= 2) {
  gpu0 <- ggml_vulkan_init(0)
  gpu1 <- ggml_vulkan_init(1)
  sched <- ggml_backend_sched_new(list(gpu0, gpu1))
  ctx <- ggml_init(64 * 1024^2)

  # Build graph: total = sum(relu(input))
  n_elements <- 100000
  input <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements)
  activated <- ggml_relu(ctx, input)
  total <- ggml_sum(ctx, activated)
  graph <- ggml_build_forward_expand(ctx, total)

  ggml_backend_sched_reserve(sched, graph)
  ggml_backend_sched_alloc_graph(sched, graph)
  ggml_set_f32(input, rnorm(n_elements))

  # Async compute - returns immediately
  ggml_backend_sched_graph_compute_async(sched, graph)

  # Do other work here while GPU computes...
  cat("Computing asynchronously...\n")

  # Wait for completion before reading the output tensor
  ggml_backend_sched_synchronize(sched)
  cat("Result:", ggml_get_f32(total), "\n")

  # Cleanup
  ggml_backend_sched_free(sched)
  ggml_vulkan_free(gpu0)
  ggml_vulkan_free(gpu1)
  ggml_free(ctx)
}

ggml_backend_sched_reserve() before allocation to
optimize memory layout
Check available device memory with ggml_vulkan_device_memory()
Check the number of scheduler splits with ggml_backend_sched_get_n_splits()
See vignette("vulkan-backend") for single-GPU usage
See vignette("quantization") for memory-efficient
models