Zero-Copy Techniques and Atomics

Copying data is the enemy of performance. Every memcpy wastes CPU cycles and pollutes the cache. This chapter covers zero-copy I/O on Linux and atomic operations for lock-free data structures -- two techniques that separate fast systems code from everything else.

Zero-Copy I/O: Why Copies Hurt

A naive file-to-socket transfer using read() and write() moves the data four times -- the three copies labeled below, plus the final DMA transfer from the socket buffer to the NIC:

Traditional copy path:

Disk --> Kernel Buffer --> User Buffer --> Kernel Buffer --> NIC
           copy #1          copy #2          copy #3
         (+ context switch)              (+ context switch)

With sendfile, the data never visits user space: with scatter-gather DMA the CPU copies nothing at all, and otherwise the kernel makes at most one copy:

sendfile path:

Disk --> Kernel Buffer ---------> NIC
           (DMA)       (DMA or single copy)
           No user-space involvement

sendfile

/* sendfile_demo.c */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

/*
 * Serve one file to one TCP client on 127.0.0.1:9000 using sendfile(2),
 * so the bytes never pass through user space.
 *
 * Fixes over the naive version: every syscall result is checked, and
 * sendfile is called in a loop -- the kernel may transfer fewer bytes
 * than requested (it updates `offset` with how far it got).
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <file>\n", argv[0]);
        return 1;
    }

    int filefd = open(argv[1], O_RDONLY);
    if (filefd < 0) { perror("open"); return 1; }

    struct stat st;
    if (fstat(filefd, &st) < 0) { perror("fstat"); return 1; }

    /* Create a TCP server socket */
    int srv = socket(AF_INET, SOCK_STREAM, 0);
    if (srv < 0) { perror("socket"); return 1; }
    int opt = 1;
    setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));

    struct sockaddr_in addr = {
        .sin_family = AF_INET,
        .sin_port   = htons(9000),
        .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
    };
    if (bind(srv, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind"); return 1;
    }
    if (listen(srv, 1) < 0) { perror("listen"); return 1; }
    printf("Listening on :9000, waiting for connection...\n");

    int client = accept(srv, NULL, NULL);
    if (client < 0) { perror("accept"); return 1; }

    /* Zero-copy transfer: loop because sendfile may do a partial send. */
    off_t offset = 0;
    while (offset < st.st_size) {
        ssize_t sent = sendfile(client, filefd, &offset,
                                st.st_size - offset);
        if (sent < 0) { perror("sendfile"); return 1; }
        if (sent == 0) break; /* nothing more to send */
    }
    printf("Sent %lld bytes via sendfile\n", (long long)offset);

    close(client);
    close(srv);
    close(filefd);
    return 0;
}
$ gcc -O2 -o sendfile_demo sendfile_demo.c
$ ./sendfile_demo /etc/passwd &
$ nc localhost 9000 | wc -c

splice

splice moves data between two file descriptors via a kernel pipe buffer. No user-space copy.

/* splice_demo.c */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Copy /etc/hosts to /tmp/hosts_copy with splice(2): the data moves
 * file -> pipe -> file entirely inside the kernel.
 *
 * Fixes over the naive version: the outbound splice is looped until all
 * `n` bytes just placed in the pipe are drained (a single splice may
 * write fewer than requested, silently losing data otherwise), and an
 * error from the inbound splice (n < 0) is reported instead of being
 * treated like EOF.
 */
int main(void) {
    int infd  = open("/etc/hosts", O_RDONLY);
    int outfd = open("/tmp/hosts_copy", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (infd < 0 || outfd < 0) { perror("open"); return 1; }

    int pipefd[2];
    if (pipe(pipefd) < 0) { perror("pipe"); return 1; }

    /* Move data: file -> pipe -> file, no user-space buffer */
    ssize_t total = 0;
    ssize_t n;
    while ((n = splice(infd, NULL, pipefd[1], NULL, 65536,
                       SPLICE_F_MOVE)) > 0) {
        /* Drain everything we just put in the pipe. */
        ssize_t left = n;
        while (left > 0) {
            ssize_t w = splice(pipefd[0], NULL, outfd, NULL, left,
                               SPLICE_F_MOVE);
            if (w <= 0) { perror("splice out"); return 1; }
            left  -= w;
            total += w;
        }
    }
    if (n < 0) { perror("splice in"); return 1; }

    printf("Copied %zd bytes via splice\n", total);

    close(infd); close(outfd);
    close(pipefd[0]); close(pipefd[1]);
    return 0;
}
$ gcc -O2 -o splice_demo splice_demo.c && ./splice_demo
Copied 221 bytes via splice

Parse In-Place: Avoiding memcpy in Protocols

Instead of copying fields out of a packet buffer, point into the buffer directly:

/* parse_inplace.c */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* A simple fixed-format message */
/* Fixed 8-byte wire header followed by a variable-length payload.
 * All multi-byte fields are big-endian (network order).
 * `packed` prevents the compiler from inserting padding, so the struct
 * layout matches the wire format exactly. */
struct __attribute__((packed)) Message {
    uint16_t type;      /* message kind, network order */
    uint16_t length;    /* payload length in bytes, network order */
    uint32_t sequence;  /* sequence number, network order */
    char     payload[]; /* C99 flexible array member (was the GNU
                           zero-length-array extension `payload[0]`) */
};

/* Parse a Message in-place: reinterpret the raw bytes rather than
 * copying fields into separate variables.
 *
 * Fix: reject buffers shorter than the fixed header -- previously
 * `len - sizeof(struct Message)` could underflow and the header fields
 * would be read past the end of the buffer.
 *
 * NOTE(review): assumes `buf` is suitably aligned for struct Message;
 * on strict-alignment targets the cast below is undefined behavior
 * (the chapter's Caution paragraph covers this). */
void process_buffer(const uint8_t *buf, size_t len) {
    if (len < sizeof(struct Message)) {
        fprintf(stderr, "buffer too short: %zu bytes\n", len);
        return;
    }

    /* Parse in-place: cast, don't copy */
    const struct Message *msg = (const struct Message *)buf;

    printf("Type:     %u\n", ntohs(msg->type));
    printf("Length:   %u\n", ntohs(msg->length));
    printf("Sequence: %u\n", ntohl(msg->sequence));
    printf("Payload:  %.*s\n",
           (int)(len - sizeof(struct Message)),
           msg->payload);
}

/* Demo driver: fabricate a "received" message in a stack buffer and
 * hand it to the in-place parser. */
int main(void) {
    uint8_t raw[64];

    /* Write the header through the packed struct view of the buffer. */
    struct Message *hdr = (struct Message *)raw;
    hdr->type     = htons(1);
    hdr->length   = htons(13);
    hdr->sequence = htonl(42);
    memcpy(hdr->payload, "Hello World!", 13);  /* 12 chars + NUL */

    size_t total = sizeof(struct Message) + 13;
    process_buffer(raw, total);
    return 0;
}

Caution: In-place parsing requires careful attention to alignment and endianness. Casting a buffer to a struct pointer at a misaligned address is undefined behavior in standard C on every architecture; x86 usually tolerates it silently, while strict-alignment architectures (ARM variants without unaligned-access support, SPARC) fault at runtime.

Rust: Zero-Copy Patterns

// zero_copy_parse.rs
use std::convert::TryInto;

/// Read a big-endian `u16` from `buf` starting at `offset`.
/// Panics if fewer than 2 bytes are available at that position.
fn parse_u16_be(buf: &[u8], offset: usize) -> u16 {
    let bytes: [u8; 2] = buf[offset..offset + 2]
        .try_into()
        .unwrap();
    u16::from_be_bytes(bytes)
}

/// Read a big-endian `u32` from `buf` starting at `offset`.
/// Panics if fewer than 4 bytes are available at that position.
fn parse_u32_be(buf: &[u8], offset: usize) -> u32 {
    let bytes: [u8; 4] = buf[offset..offset + 4]
        .try_into()
        .unwrap();
    u32::from_be_bytes(bytes)
}

fn main() {
    // Simulated wire data: 8-byte header followed by a NUL-terminated payload.
    let buf = [
        0x00, 0x01,             // type = 1
        0x00, 0x0D,             // length = 13
        0x00, 0x00, 0x00, 0x2A, // sequence = 42
        b'H', b'e', b'l', b'l', b'o', b' ',
        b'W', b'o', b'r', b'l', b'd', b'!', 0x00,
    ];

    let msg_type = parse_u16_be(&buf, 0);
    let length = parse_u16_be(&buf, 2);
    let sequence = parse_u32_be(&buf, 4);

    // Payload starts right after the 8-byte header; `length` counts the
    // trailing NUL, which we drop before the UTF-8 check.
    let payload_end = 8 + length as usize - 1;
    let payload = std::str::from_utf8(&buf[8..payload_end]).unwrap();

    println!("Type: {msg_type}, Length: {length}, Seq: {sequence}");
    println!("Payload: {payload}");
}

Rust Note: Rust doesn't allow arbitrary pointer casts to structs. Instead, you parse fields explicitly with from_be_bytes. The zerocopy and bytemuck crates provide safe zero-copy deserialization for types that meet alignment and validity requirements.

Atomic Operations

When threads share data without locks, you need atomics. An atomic operation completes indivisibly -- no other thread can see a half-written value.

C: __atomic Builtins

/* atomics_c.c */
#include <stdio.h>
#include <pthread.h>
#include <stdatomic.h>

/* Shared counter, bumped atomically by every worker thread. */
static atomic_long counter = 0;

/* Thread body: add one to `counter` a million times.
 * `atomic_fetch_add` makes each increment an indivisible
 * read-modify-write, so concurrent workers never lose updates. */
void *increment(void *arg) {
    (void)arg;
    long done = 0;
    while (done < 1000000) {
        atomic_fetch_add(&counter, 1);
        done++;
    }
    return NULL;
}

/* Launch four workers, wait for all of them, and print the total.
 * With atomic increments the result is exactly 4,000,000. */
int main(void) {
    enum { NTHREADS = 4 };
    pthread_t tids[NTHREADS];

    for (int i = 0; i < NTHREADS; i++)
        pthread_create(&tids[i], NULL, increment, NULL);

    for (int i = 0; i < NTHREADS; i++)
        pthread_join(tids[i], NULL);

    printf("Counter: %ld (expected 4000000)\n",
           atomic_load(&counter));
    return 0;
}
$ gcc -O2 -o atomics_c atomics_c.c -lpthread && ./atomics_c
Counter: 4000000 (expected 4000000)

Without the atomic operations, each increment would be a plain read-modify-write; the threads would race, lost updates would make the result almost certainly less than 4000000, and the data race itself is undefined behavior.

Rust: std::sync::atomic

// atomics_rust.rs
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;
use std::thread;

/// Four threads each add one to a shared atomic counter a million times;
/// `fetch_add` guarantees no increment is lost, so the total is exactly
/// 4,000,000. `Relaxed` is enough here: we only need atomicity, not
/// ordering with respect to other memory.
fn main() {
    let counter = Arc::new(AtomicI64::new(0));
    let mut handles = Vec::with_capacity(4);

    for _ in 0..4 {
        let c = Arc::clone(&counter);
        handles.push(thread::spawn(move || {
            for _ in 0..1_000_000 {
                c.fetch_add(1, Ordering::Relaxed);
            }
        }));
    }

    for handle in handles {
        handle.join().unwrap();
    }

    println!("Counter: {} (expected 4000000)",
             counter.load(Ordering::Relaxed));
}

Memory Ordering

Atomics alone aren't enough. You must specify how operations are ordered relative to other memory accesses.

Ordering Strength (weakest to strongest):

Relaxed    No ordering guarantees. Only atomicity.
Acquire    Reads after this acquire see writes before the matching release.
Release    Writes before this release are visible after the matching acquire.
AcqRel     Both acquire and release.
SeqCst     Total global order. All threads agree on the sequence.

Acquire-Release Pattern

Thread A (producer):             Thread B (consumer):

data = 42;                      while (!ready.load(Acquire)) { }
ready.store(true, Release);     assert(data == 42);  // guaranteed!
           |                                ^
           +--- Release syncs with Acquire--+

Without proper ordering, Thread B might see ready == true but data == 0 because the CPU reordered the stores.

/* acquire_release.c */
#include <stdio.h>
#include <pthread.h>
#include <stdatomic.h>
#include <assert.h>

static int data = 0;          /* payload: written by producer, read by consumer */
static atomic_int ready = 0;  /* publication flag guarding `data` */

/* Producer: fill in the payload, then publish it.
 * The release store guarantees that the plain write to `data` is
 * visible to any thread whose acquire load of `ready` observes 1 --
 * the write cannot be reordered after the store. */
void *producer(void *arg) {
    (void)arg;
    data = 42;                                    /* non-atomic write */
    atomic_store_explicit(&ready, 1, memory_order_release);
    return NULL;
}

/* Consumer: spin until `ready` is published, then read the payload.
 * The acquire load synchronizes-with the producer's release store, so
 * once ready == 1 is observed, `data == 42` is guaranteed visible. */
void *consumer(void *arg) {
    (void)arg;
    while (atomic_load_explicit(&ready, memory_order_acquire) == 0) {
        /* spin until the producer's release store becomes visible */
    }
    assert(data == 42); /* guaranteed by acquire-release */
    printf("data = %d (correct!)\n", data);
    return NULL;
}

/* Start the producer and consumer and wait for both to finish. */
int main(void) {
    pthread_t threads[2];
    pthread_create(&threads[0], NULL, producer, NULL);
    pthread_create(&threads[1], NULL, consumer, NULL);
    for (int i = 0; i < 2; i++)
        pthread_join(threads[i], NULL);
    return 0;
}
$ gcc -O2 -o acqrel acquire_release.c -lpthread && ./acqrel
data = 42 (correct!)

Rust Acquire-Release

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;

/// Non-atomic payload published through an atomic flag, mirroring the C
/// acquire-release example.
///
/// Fixes over the original: (1) the unused `data` Mutex and `ready`
/// AtomicBool bindings are removed; (2) the original shared an
/// `Arc<(AtomicBool, UnsafeCell<i32>)>` across threads, which does not
/// compile -- `UnsafeCell` is `!Sync`, so the `Arc` is not `Send` and
/// `thread::spawn` rejects the closure. A small wrapper type with an
/// explicit `unsafe impl Sync` is required to opt in.
struct Shared {
    ready: AtomicBool,
    data: std::cell::UnsafeCell<i32>,
}

// SAFETY: `data` is written only before the Release store of `ready`
// and read only after an Acquire load observes `ready == true`, so the
// two threads never access the cell concurrently.
unsafe impl Sync for Shared {}

fn main() {
    let shared = Arc::new(Shared {
        ready: AtomicBool::new(false),
        data: std::cell::UnsafeCell::new(0),
    });

    let s1 = Arc::clone(&shared);
    let producer = thread::spawn(move || {
        // Non-atomic write, then publish with Release.
        unsafe { *s1.data.get() = 42; }
        s1.ready.store(true, Ordering::Release);
    });

    let s2 = Arc::clone(&shared);
    let consumer = thread::spawn(move || {
        while !s2.ready.load(Ordering::Acquire) {
            std::hint::spin_loop();
        }
        // SAFETY: the Acquire load above synchronizes with the
        // producer's Release store, making the write of 42 visible.
        let val = unsafe { *s2.data.get() };
        assert_eq!(val, 42);
        println!("data = {val} (correct!)");
    });

    producer.join().unwrap();
    consumer.join().unwrap();
}

Rust Note: Rust requires unsafe to share non-atomic data between threads without a Mutex. The language forces you to acknowledge the danger explicitly. In practice, prefer Mutex or channels unless you've proven atomics are necessary.

Compare-and-Swap (CAS)

CAS is the fundamental lock-free primitive: "If the value is X, change it to Y. Otherwise, tell me the current value."

/* cas_demo.c */
#include <stdio.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Shared value manipulated only through CAS. */
static atomic_int value = 0;

/* Atomically install `new_val` iff `value` still equals `expected_old`.
 * Returns true when the swap happened. (Despite the name, this is a
 * plain compare-and-swap, not an increment.) */
bool try_increment(int expected_old, int new_val) {
    /* CAS overwrites its `expected` argument with the current value on
     * failure, so operate on a local copy. */
    int expected = expected_old;
    return atomic_compare_exchange_strong(&value, &expected, new_val);
}

/* Demonstrate one CAS that succeeds and one that fails. */
int main(void) {
    atomic_store(&value, 10);

    /* First attempt: expect 10, install 20 -- value is 10, so it works. */
    if (try_increment(10, 20)) {
        printf("CAS succeeded: %d -> 20\n", 10);
    } else {
        printf("CAS failed\n");
    }

    /* Second attempt: still expects 10, but value is now 20 -- fails. */
    if (try_increment(10, 30)) {
        printf("CAS succeeded: 10 -> 30\n");
    } else {
        printf("CAS failed: value is actually %d\n",
               atomic_load(&value));
    }

    return 0;
}
$ gcc -O2 -o cas_demo cas_demo.c && ./cas_demo
CAS succeeded: 10 -> 20
CAS failed: value is actually 20

Rust CAS

use std::sync::atomic::{AtomicI32, Ordering};

/// One CAS that succeeds (10 -> 20) and one that fails (value is no
/// longer 10). `compare_exchange` returns `Ok(previous)` on success and
/// `Err(current)` on failure.
fn main() {
    let value = AtomicI32::new(10);

    // Both attempts share the same report logic, so factor it out.
    let attempt = |expected: i32, new: i32| {
        match value.compare_exchange(expected, new,
                                     Ordering::SeqCst,
                                     Ordering::SeqCst) {
            Ok(old)  => println!("CAS succeeded: {old} -> {new}"),
            Err(cur) => println!("CAS failed: value is {cur}"),
        }
    };

    attempt(10, 20); // value is 10: succeeds
    attempt(10, 30); // value is now 20: fails
}

Lock-Free Stack (Sketch)

A lock-free stack using CAS:

/* lockfree_stack.c */
#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

/* A Treiber stack: a singly linked list whose head (`top`) is the only
 * shared state, updated exclusively through compare-and-swap. */
typedef struct Node {
    int value;
    struct Node *next;
} Node;

typedef struct {
    _Atomic(Node *) top;  /* most recently pushed node, or NULL */
} Stack;

/* Prepare an empty stack. Call before any push/pop. */
void stack_init(Stack *s) {
    atomic_store(&s->top, NULL);
}

/* Push `value`. Lock-free: the CAS retries whenever another thread
 * changed `top` between our snapshot and the swap.
 * Fix: the original never checked malloc, dereferencing NULL on
 * allocation failure. */
void stack_push(Stack *s, int value) {
    Node *node = malloc(sizeof(Node));
    if (!node) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    node->value = value;

    Node *old_top;
    do {
        old_top = atomic_load(&s->top);
        node->next = old_top;
    } while (!atomic_compare_exchange_weak(&s->top, &old_top, node));
}

/* Pop the most recent value into *out.
 * Returns 1 on success, 0 if the stack is empty. */
int stack_pop(Stack *s, int *out) {
    Node *old_top;
    Node *new_top;
    do {
        old_top = atomic_load(&s->top);
        if (!old_top) return 0; /* empty */
        new_top = old_top->next;
    } while (!atomic_compare_exchange_weak(&s->top, &old_top, new_top));

    *out = old_top->value;
    free(old_top); /* caution: ABA problem in real code */
    return 1;
}

/* Push three values, then drain the stack: LIFO order prints 30, 20, 10. */
int main(void) {
    Stack stack;
    stack_init(&stack);

    const int inputs[] = { 10, 20, 30 };
    for (size_t i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++)
        stack_push(&stack, inputs[i]);

    int val;
    while (stack_pop(&stack, &val))
        printf("popped: %d\n", val);

    return 0;
}
$ gcc -O2 -o lockfree lockfree_stack.c && ./lockfree
popped: 30
popped: 20
popped: 10

Caution: This stack has the ABA problem: if thread A pops node X, thread B pops X and Y then pushes X back, thread A's CAS succeeds but the stack is corrupted. Real lock-free structures use tagged pointers or hazard pointers.

When to Use Atomics vs Mutex

+-------------------+----------------------------------+
| Use Atomics       | Use Mutex                        |
+-------------------+----------------------------------+
| Single counter    | Multiple fields updated together |
| Single flag       | Complex invariants               |
| Hot path, simple  | Readability matters              |
| Statistics        | Anything non-trivial             |
+-------------------+----------------------------------+

Rule of thumb: if you can't explain why your lock-free code is correct in one paragraph, use a mutex.

Driver Prep: The Linux kernel uses atomic operations extensively: atomic_t, atomic_inc(), atomic_dec_and_test(). Memory barriers (smp_mb(), smp_wmb(), smp_rmb()) map to the orderings above. DMA descriptor rings are often lock-free structures using these primitives.

Try It: Modify atomics_c.c to use memory_order_relaxed instead of the default memory_order_seq_cst. Does it still produce the correct count? Why?

Quick Knowledge Check

  1. How many copies does sendfile eliminate compared to read+write?
  2. What does memory_order_acquire guarantee that memory_order_relaxed does not?
  3. What is the ABA problem in lock-free programming?

Common Pitfalls

  • Using memcpy where a pointer would do. Profile first, but default to referencing data in-place.
  • Relaxed ordering everywhere. It works for counters but breaks for publish/subscribe patterns.
  • Forgetting volatile doesn't mean atomic. volatile only forces the compiler to perform every access and keep volatile accesses in order relative to each other; it guarantees neither atomicity nor CPU-level ordering. It's not a substitute for atomics.
  • Lock-free code without formal reasoning. Lock-free is harder to get right than locks. Only use it when profiling proves the lock is the bottleneck.
  • sendfile on non-regular files. It doesn't work with pipes or sockets as the source -- use splice for those.
  • CAS loops without backoff. Under high contention, spinning CAS wastes CPU. Add exponential backoff or yield.