Zero-Copy Techniques and Atomics
Copying data is the enemy of performance. Every memcpy wastes CPU cycles and
pollutes the cache. This chapter covers zero-copy I/O on Linux and atomic
operations for lock-free data structures -- two techniques that separate
fast systems code from everything else.
Zero-Copy I/O: Why Copies Hurt
A naive file-to-socket transfer using read+write moves the data four times between disk, kernel, and user space:
Traditional copy path:
Disk --> Kernel Buffer --> User Buffer --> Kernel Buffer --> NIC
copy #1 copy #2 copy #3
(+ context switch) (+ context switch)
With sendfile, the kernel does it in zero or one copy:
sendfile path:
Disk --> Kernel Buffer ---------> NIC
(DMA) (DMA or single copy)
No user-space involvement
sendfile
/* sendfile_demo.c */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
/* Serve one file over a loopback TCP connection using sendfile(2).
 * Usage: sendfile_demo <file>; connect with e.g. `nc localhost 9000`. */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <file>\n", argv[0]);
        return 1;
    }
    int filefd = open(argv[1], O_RDONLY);
    if (filefd < 0) { perror("open"); return 1; }
    struct stat st;
    if (fstat(filefd, &st) < 0) { perror("fstat"); return 1; }
    /* Create a TCP server socket */
    int srv = socket(AF_INET, SOCK_STREAM, 0);
    if (srv < 0) { perror("socket"); return 1; }
    int opt = 1;
    setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
    struct sockaddr_in addr = {
        .sin_family = AF_INET,
        .sin_port = htons(9000),
        .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
    };
    if (bind(srv, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
        return 1;
    }
    if (listen(srv, 1) < 0) { perror("listen"); return 1; }
    printf("Listening on :9000, waiting for connection...\n");
    int client = accept(srv, NULL, NULL);
    if (client < 0) { perror("accept"); return 1; }
    /* Zero-copy transfer. sendfile may send fewer bytes than requested
     * (e.g. when the socket buffer fills), so loop until the whole file
     * has gone out or an error occurs. The kernel advances `offset`. */
    off_t offset = 0;
    ssize_t total = 0;
    while (offset < st.st_size) {
        ssize_t sent = sendfile(client, filefd, &offset,
                                (size_t)(st.st_size - offset));
        if (sent < 0) { perror("sendfile"); break; }
        if (sent == 0) break;  /* nothing more to send */
        total += sent;
    }
    printf("Sent %zd bytes via sendfile\n", total);
    close(client);
    close(srv);
    close(filefd);
    return 0;
}
$ gcc -O2 -o sendfile_demo sendfile_demo.c
$ ./sendfile_demo /etc/passwd &
$ nc localhost 9000 | wc -c
splice
splice moves data between two file descriptors via a kernel pipe buffer.
No user-space copy.
/* splice_demo.c */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
/* Copy /etc/hosts to /tmp/hosts_copy entirely in kernel space by
 * splicing through a pipe: file -> pipe -> file, no user-space buffer. */
int main(void) {
    int infd = open("/etc/hosts", O_RDONLY);
    int outfd = open("/tmp/hosts_copy", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (infd < 0 || outfd < 0) { perror("open"); return 1; }
    int pipefd[2];
    if (pipe(pipefd) < 0) { perror("pipe"); return 1; }
    /* Move data: file -> pipe -> file, no user-space buffer */
    ssize_t total = 0;
    ssize_t n;
    while ((n = splice(infd, NULL, pipefd[1], NULL, 65536,
                       SPLICE_F_MOVE)) > 0) {
        /* The outbound splice may move fewer than n bytes per call,
         * so drain the pipe completely before refilling it. */
        ssize_t left = n;
        while (left > 0) {
            ssize_t w = splice(pipefd[0], NULL, outfd, NULL, (size_t)left,
                               SPLICE_F_MOVE);
            if (w < 0) { perror("splice out"); return 1; }
            if (w == 0) break;
            left -= w;
            total += w;
        }
    }
    if (n < 0) { perror("splice in"); return 1; }
    printf("Copied %zd bytes via splice\n", total);
    close(infd); close(outfd);
    close(pipefd[0]); close(pipefd[1]);
    return 0;
}
$ gcc -O2 -o splice_demo splice_demo.c && ./splice_demo
Copied 221 bytes via splice
Parse In-Place: Avoiding memcpy in Protocols
Instead of copying fields out of a packet buffer, point into the buffer directly:
/* parse_inplace.c */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
/* A simple fixed-format wire message. All multi-byte fields are in
 * network byte order (big-endian); the payload follows the header. */
struct __attribute__((packed)) Message {
    uint16_t type;     /* message type */
    uint16_t length;   /* payload length in bytes */
    uint32_t sequence; /* sequence number */
    char payload[];    /* C99 flexible array member ([0] is a GNU extension) */
};
/* Parse a Message directly out of `buf` without copying and print it.
 * NOTE(review): the cast assumes `buf` is suitably aligned for struct
 * Message; with the packed attribute compilers emit byte-safe accesses,
 * but confirm on strict-alignment targets. */
void process_buffer(const uint8_t *buf, size_t len) {
    /* Guard: without this, `len - sizeof(struct Message)` wraps around
     * (size_t is unsigned) for truncated buffers and prints garbage. */
    if (buf == NULL || len < sizeof(struct Message)) {
        fprintf(stderr, "process_buffer: short buffer (%zu bytes)\n", len);
        return;
    }
    /* Parse in-place: cast, don't copy */
    const struct Message *msg = (const struct Message *)buf;
    printf("Type: %u\n", ntohs(msg->type));
    printf("Length: %u\n", ntohs(msg->length));
    printf("Sequence: %u\n", ntohl(msg->sequence));
    printf("Payload: %.*s\n",
           (int)(len - sizeof(struct Message)),
           msg->payload);
}
int main(void) {
    /* Build a fake "received" packet in a local buffer. */
    uint8_t buf[64];
    struct Message *hdr = (struct Message *)buf;
    const char text[] = "Hello World!";   /* 13 bytes incl. the NUL */
    hdr->type = htons(1);
    hdr->length = htons(13);
    hdr->sequence = htonl(42);
    memcpy(hdr->payload, text, sizeof text);
    process_buffer(buf, sizeof(struct Message) + sizeof text);
    return 0;
}
Caution: In-place parsing requires careful attention to alignment and endianness. Casting a buffer to a struct pointer at a misaligned address is undefined behavior in C; on architectures that enforce alignment (classic ARM, SPARC) it can also fault at runtime, whereas x86 merely tolerates it.
Rust: Zero-Copy Patterns
// zero_copy_parse.rs
use std::convert::TryInto;

/// Read a big-endian u16 from `buf` at `offset` (borrows the slice; no buffer copy).
fn parse_u16_be(buf: &[u8], offset: usize) -> u16 {
    u16::from_be_bytes(buf[offset..offset + 2].try_into().unwrap())
}

/// Read a big-endian u32 from `buf` at `offset`.
fn parse_u32_be(buf: &[u8], offset: usize) -> u32 {
    u32::from_be_bytes(buf[offset..offset + 4].try_into().unwrap())
}

fn main() {
    // Simulate a received buffer
    let buf: Vec<u8> = vec![
        0x00, 0x01, // type = 1
        0x00, 0x0D, // length = 13
        0x00, 0x00, 0x00, 0x2A, // sequence = 42
        b'H', b'e', b'l', b'l', b'o', b' ', b'W', b'o', b'r', b'l', b'd', b'!',
        0x00,
    ];
    let msg_type = parse_u16_be(&buf, 0);
    let length = parse_u16_be(&buf, 2);
    let sequence = parse_u32_be(&buf, 4);
    // `length` counts the trailing NUL; skip it for the UTF-8 view.
    let payload = std::str::from_utf8(&buf[8..8 + length as usize - 1]).unwrap();
    println!("Type: {msg_type}, Length: {length}, Seq: {sequence}");
    println!("Payload: {payload}");
}
Rust Note: Rust doesn't allow arbitrary pointer casts to structs. Instead, you parse fields explicitly with `from_be_bytes`. The `zerocopy` and `bytemuck` crates provide safe zero-copy deserialization for types that meet alignment and validity requirements.
Atomic Operations
When threads share data without locks, you need atomics. An atomic operation completes indivisibly -- no other thread can see a half-written value.
C: C11 Atomics (stdatomic.h)
/* atomics_c.c */
#include <stdio.h>
#include <pthread.h>
#include <stdatomic.h>
/* Shared counter, updated atomically by all worker threads. */
static atomic_long counter = 0;

/* Thread body: add one to the shared counter one million times.
 * atomic_fetch_add is an indivisible read-modify-write, so no other
 * thread can ever observe (or clobber) a half-done update. */
void *increment(void *arg) {
    (void)arg;
    int remaining = 1000000;
    while (remaining-- > 0) {
        atomic_fetch_add(&counter, 1);
    }
    return NULL;
}
int main(void) {
    enum { NTHREADS = 4 };
    pthread_t workers[NTHREADS];
    /* Spawn all workers, then wait for each to finish. */
    for (int t = 0; t < NTHREADS; t++)
        pthread_create(&workers[t], NULL, increment, NULL);
    for (int t = 0; t < NTHREADS; t++)
        pthread_join(workers[t], NULL);
    printf("Counter: %ld (expected 4000000)\n",
           atomic_load(&counter));
    return 0;
}
$ gcc -O2 -o atomics_c atomics_c.c -lpthread && ./atomics_c
Counter: 4000000 (expected 4000000)
Without the atomic operations (e.g., a plain `counter++` on a non-atomic `long`), the result would typically be less than 4000000 due to data races.
Rust: std::sync::atomic
// atomics_rust.rs
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;
use std::thread;

fn main() {
    // Shared counter: Arc for shared ownership, AtomicI64 for lock-free updates.
    let counter = Arc::new(AtomicI64::new(0));
    let handles: Vec<_> = (0..4)
        .map(|_| {
            let c = Arc::clone(&counter);
            thread::spawn(move || {
                for _ in 0..1_000_000 {
                    c.fetch_add(1, Ordering::Relaxed);
                }
            })
        })
        .collect();
    for h in handles {
        h.join().unwrap();
    }
    println!("Counter: {} (expected 4000000)", counter.load(Ordering::Relaxed));
}
Memory Ordering
Atomics alone aren't enough. You must specify how operations are ordered relative to other memory accesses.
Ordering Strength (weakest to strongest):
Relaxed No ordering guarantees. Only atomicity.
Acquire Reads after this acquire see writes before the matching release.
Release Writes before this release are visible after the matching acquire.
AcqRel Both acquire and release.
SeqCst Total global order. All threads agree on the sequence.
Acquire-Release Pattern
Thread A (producer): Thread B (consumer):
data = 42; while (!ready.load(Acquire)) { }
ready.store(true, Release); assert(data == 42); // guaranteed!
| ^
+--- Release syncs with Acquire--+
Without proper ordering, Thread B might see ready == true but data == 0
because the CPU reordered the stores.
/* acquire_release.c */
#include <stdio.h>
#include <pthread.h>
#include <stdatomic.h>
#include <assert.h>
/* Plain (non-atomic) payload; it is published to the consumer by the
 * release store on `ready` below. */
static int data = 0;
/* Synchronization flag: 0 = data not yet written, 1 = data published. */
static atomic_int ready = 0;
/* Producer thread: write the payload, then publish it.
 * The release store guarantees that the write to `data` is visible to
 * any thread that observes ready == 1 via an acquire load. */
void *producer(void *arg) {
(void)arg;
data = 42; /* non-atomic write */
/* Release: all writes sequenced before this store (data = 42)
 * happen-before any acquire load that reads the stored value. */
atomic_store_explicit(&ready, 1, memory_order_release);
return NULL;
}
/* Consumer thread: spin until the flag is set, then read the payload.
 * The acquire load pairs with the producer's release store; once it
 * observes ready == 1, the write data = 42 is guaranteed visible. */
void *consumer(void *arg) {
(void)arg;
while (atomic_load_explicit(&ready, memory_order_acquire) == 0) {
/* spin */
}
assert(data == 42); /* guaranteed by acquire-release */
printf("data = %d (correct!)\n", data);
return NULL;
}
int main(void) {
    /* Thread start order is irrelevant: the acquire/release pair on
     * `ready` provides all the synchronization. */
    pthread_t prod_tid, cons_tid;
    pthread_create(&prod_tid, NULL, producer, NULL);
    pthread_create(&cons_tid, NULL, consumer, NULL);
    pthread_join(prod_tid, NULL);
    pthread_join(cons_tid, NULL);
    return 0;
}
$ gcc -O2 -o acqrel acquire_release.c -lpthread && ./acqrel
data = 42 (correct!)
Rust Acquire-Release
// acquire_release.rs
use std::cell::UnsafeCell;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;

/// Flag plus non-atomic payload, mirroring the C example.
/// (The original `Arc<(AtomicBool, UnsafeCell<i32>)>` does not compile:
/// `UnsafeCell` is `!Sync`, so the `Arc` is not `Send` across threads.
/// A wrapper type lets us opt in explicitly.)
struct Shared {
    ready: AtomicBool,
    data: UnsafeCell<i32>,
}

// SAFETY: `data` is written only before the Release store of `ready`
// and read only after an Acquire load observes `ready == true`, so
// there is never an unsynchronized concurrent access.
unsafe impl Sync for Shared {}

fn main() {
    let shared = Arc::new(Shared {
        ready: AtomicBool::new(false),
        data: UnsafeCell::new(0),
    });

    let s1 = Arc::clone(&shared);
    let producer = thread::spawn(move || {
        // Non-atomic write, published by the Release store below.
        unsafe { *s1.data.get() = 42; }
        s1.ready.store(true, Ordering::Release);
    });

    let s2 = Arc::clone(&shared);
    let consumer = thread::spawn(move || {
        while !s2.ready.load(Ordering::Acquire) {
            std::hint::spin_loop();
        }
        // Visibility guaranteed by the acquire-release pairing.
        let val = unsafe { *s2.data.get() };
        assert_eq!(val, 42);
        println!("data = {val} (correct!)");
    });

    producer.join().unwrap();
    consumer.join().unwrap();
}
Rust Note: Rust requires `unsafe` to share non-atomic data between threads without a `Mutex`. The language forces you to acknowledge the danger explicitly. In practice, prefer `Mutex` or channels unless you've proven atomics are necessary.
Compare-and-Swap (CAS)
CAS is the fundamental lock-free primitive: "If the value is X, change it to Y. Otherwise, tell me the current value."
/* cas_demo.c */
#include <stdio.h>
#include <stdatomic.h>
#include <stdbool.h>
/* Shared value manipulated via compare-and-swap. */
static atomic_int value = 0;

/* Atomically replace `value` with `desired` iff it currently equals
 * `expected`. Returns true on success, false if another value was seen. */
bool try_increment(int expected, int desired) {
    /* compare_exchange rewrites its "expected" argument with the current
     * value on failure; keep a local so the caller's view is untouched. */
    int snapshot = expected;
    return atomic_compare_exchange_strong(&value, &snapshot, desired);
}
int main(void) {
    atomic_store(&value, 10);
    /* First CAS: value is 10, so 10 -> 20 succeeds. */
    if (try_increment(10, 20)) {
        printf("CAS succeeded: %d -> 20\n", 10);
    } else {
        printf("CAS failed\n");
    }
    /* Second CAS: value is now 20, so expecting 10 must fail. */
    if (try_increment(10, 30)) {
        printf("CAS succeeded: 10 -> 30\n");
    } else {
        printf("CAS failed: value is actually %d\n",
               atomic_load(&value));
    }
    return 0;
}
$ gcc -O2 -o cas_demo cas_demo.c && ./cas_demo
CAS succeeded: 10 -> 20
CAS failed: value is actually 20
Rust CAS
use std::sync::atomic::{AtomicI32, Ordering}; fn main() { let value = AtomicI32::new(10); // Try to change 10 -> 20 match value.compare_exchange(10, 20, Ordering::SeqCst, Ordering::SeqCst) { Ok(old) => println!("CAS succeeded: {old} -> 20"), Err(cur) => println!("CAS failed: value is {cur}"), } // Try to change 10 -> 30 (will fail) match value.compare_exchange(10, 30, Ordering::SeqCst, Ordering::SeqCst) { Ok(old) => println!("CAS succeeded: {old} -> 30"), Err(cur) => println!("CAS failed: value is {cur}"), } }
Lock-Free Stack (Sketch)
A lock-free stack using CAS:
/* lockfree_stack.c */
#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>
/* Singly-linked node for the lock-free stack. */
typedef struct Node {
    int value;
    struct Node *next;
} Node;

/* Lock-free stack: the only shared state is the atomic top pointer. */
typedef struct {
    _Atomic(Node *) top;
} Stack;

/* Initialize an empty stack. Call before sharing between threads. */
void stack_init(Stack *s) {
    atomic_store(&s->top, NULL);
}

/* Push `value`. Lock-free: the CAS loop retries whenever another
 * thread changed `top` between our load and our swap. */
void stack_push(Stack *s, int value) {
    Node *node = malloc(sizeof *node);
    if (!node) {
        /* Was unchecked: dereferencing a NULL malloc result is UB. */
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    node->value = value;
    Node *old_top;
    do {
        old_top = atomic_load(&s->top);
        node->next = old_top;
    } while (!atomic_compare_exchange_weak(&s->top, &old_top, node));
}

/* Pop the top value into *out. Returns 1 on success, 0 if empty. */
int stack_pop(Stack *s, int *out) {
    Node *old_top;
    Node *new_top;
    do {
        old_top = atomic_load(&s->top);
        if (!old_top) return 0; /* empty */
        new_top = old_top->next;
    } while (!atomic_compare_exchange_weak(&s->top, &old_top, new_top));
    *out = old_top->value;
    free(old_top); /* caution: ABA problem in real code */
    return 1;
}
int main(void) {
    Stack stk;
    stack_init(&stk);
    /* Push three values, then drain: they come back in LIFO order. */
    int inputs[] = { 10, 20, 30 };
    for (size_t i = 0; i < sizeof inputs / sizeof inputs[0]; i++)
        stack_push(&stk, inputs[i]);
    int val;
    while (stack_pop(&stk, &val))
        printf("popped: %d\n", val);
    return 0;
}
$ gcc -O2 -o lockfree lockfree_stack.c && ./lockfree
popped: 30
popped: 20
popped: 10
Caution: This stack has the ABA problem: if thread A pops node X, thread B pops X and Y then pushes X back, thread A's CAS succeeds but the stack is corrupted. Real lock-free structures use tagged pointers or hazard pointers.
When to Use Atomics vs Mutex
+-------------------+----------------------------------+
| Use Atomics | Use Mutex |
+-------------------+----------------------------------+
| Single counter | Multiple fields updated together |
| Single flag | Complex invariants |
| Hot path, simple | Readability matters |
| Statistics | Anything non-trivial |
+-------------------+----------------------------------+
Rule of thumb: if you can't explain why your lock-free code is correct in one paragraph, use a mutex.
Driver Prep: The Linux kernel uses atomic operations extensively: `atomic_t`, `atomic_inc()`, `atomic_dec_and_test()`. Memory barriers (`smp_mb()`, `smp_wmb()`, `smp_rmb()`) map to the orderings above. DMA descriptor rings are often lock-free structures using these primitives.
Try It: Modify `atomics_c.c` to use `memory_order_relaxed` (via `atomic_fetch_add_explicit`) instead of the default `memory_order_seq_cst`. Does it still produce the correct count? Why?
Quick Knowledge Check
- How many copies does `sendfile` eliminate compared to `read`+`write`?
- What does `memory_order_acquire` guarantee that `memory_order_relaxed` does not?
- What is the ABA problem in lock-free programming?
Common Pitfalls
- Using `memcpy` where a pointer would do. Profile first, but default to referencing data in-place.
- Relaxed ordering everywhere. It works for counters but breaks for publish/subscribe patterns.
- Forgetting `volatile` doesn't mean atomic. `volatile` only stops the compiler from caching or eliding accesses; it prevents neither CPU reordering nor torn read-modify-write sequences. It's not a substitute for atomics.
- Lock-free code without formal reasoning. Lock-free is harder to get right than locks. Only use it when profiling proves the lock is the bottleneck.
- `sendfile` on non-regular files. The source descriptor must support mmap-like operations (i.e., be a regular file) -- use `splice` when the source is a pipe or socket.
- CAS loops without backoff. Under high contention, spinning CAS wastes CPU. Add exponential backoff or yield.