#include <sys/param.h>
#include <sys/reboot.h>
#include <sys/exec.h>
#include <elf.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <unistd.h>
#include <err.h>
#include <dev/vmm/vmm.h>
#include <machine/biosvar.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/pte.h>
#include "loadfile.h"
#include "vmd.h"
/*
 * LOADADDR(a): add the variable named 'offset' (which must be in scope
 * wherever the macro is expanded) to 'a' and keep the low 28 bits of the
 * sum.
 */
#define LOADADDR(a) ((((u_long)(a)) + offset)&0xfffffff)
/* ELF header storage, viewable as either the 32-bit or the 64-bit layout. */
union {
Elf32_Ehdr elf32;
Elf64_Ehdr elf64;
} hdr;
/* Prototypes for the file-local helpers defined further down. */
static void setsegment(struct mem_segment_descriptor *, uint32_t,
size_t, int, int, int, int);
static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int);
static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int);
static size_t create_bios_memmap(struct vmop_create_params *, bios_memmap_t *);
static uint32_t push_bootargs(bios_memmap_t *, size_t, bios_bootmac_t *);
static size_t push_stack(uint32_t, uint32_t);
static void push_gdt(void);
static void marc4random_buf(paddr_t, int);
static void mbzero(paddr_t, int);
static void mbcopy(void *, paddr_t, int);
/* Declared here only; the definition is not part of this file's visible code. */
extern int vm_id;
/*
 * setsegment
 *
 * Populate the memory segment descriptor 'sd'.
 *
 * Parameters:
 *  sd: descriptor to fill in
 *  base: segment base; stored split across sd_lobase / sd_hibase
 *  limit: segment limit; stored split across sd_lolimit / sd_hilimit
 *  type: value for sd_type
 *  dpl: value for sd_dpl
 *  def32: value for sd_def32
 *  gran: value for sd_gran
 *
 * sd_p is always set to 1; sd_avl and sd_long are always cleared.
 */
static void
setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
    int type, int dpl, int def32, int gran)
{
	const int ibase = (int)base;
	const int ilimit = (int)limit;

	/* Base: low part as-is, high part is everything from bit 24 up. */
	sd->sd_lobase = ibase;
	sd->sd_hibase = ibase >> 24;

	/* Limit: low part as-is, high part is everything from bit 16 up. */
	sd->sd_lolimit = ilimit;
	sd->sd_hilimit = ilimit >> 16;

	/* Caller-selected attributes. */
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_def32 = def32;
	sd->sd_gran = gran;

	/* Fixed attributes. */
	sd->sd_p = 1;
	sd->sd_avl = 0;
	sd->sd_long = 0;
}
/*
 * push_gdt
 *
 * Build a one-page descriptor table in a local buffer and copy it to
 * GDT_PAGE with write_mem(), then pass the same range to
 * sev_register_encryption().
 *
 * Slot 0 is left zeroed.  Slots 1 and 2 both use base 0 and limit
 * 0xffffffff at SEL_KPL with def32 and gran set; slot 1 has type
 * SDT_MEMERA and slot 2 has type SDT_MEMRWA.
 */
static void
push_gdt(void)
{
	uint8_t page[PAGE_SIZE];
	struct mem_segment_descriptor *gdt;

	memset(page, 0, sizeof(page));
	gdt = (struct mem_segment_descriptor *)page;

	setsegment(gdt + 1, 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1);
	setsegment(gdt + 2, 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1);

	write_mem(GDT_PAGE, page, PAGE_SIZE);
	sev_register_encryption(GDT_PAGE, PAGE_SIZE);
}
/*
 * loadfile_elf
 *
 * Read an ELF header from 'fp' into the global 'hdr', load the image with
 * elf32_exec() or elf64_exec() depending on EI_CLASS, then set up the GDT
 * page, the boot arguments page, the initial stack page and the initial
 * register state in 'vrs'.
 *
 * Parameters:
 *  fp: kernel image to load
 *  vm: VM whose vm_params supply the memory ranges and, for network boot,
 *      the first MAC address
 *  vrs: register state to initialise (CR0/CR3/CR4, EFER, RIP, RSP, GDTR base)
 *  bootdevice: when equal to VMBOOTDEV_NET, a boot MAC record is added to
 *      the boot arguments
 *
 * Return values:
 *  0 on success
 *  non-zero on failure; errno is set to ENOEXEC when the header is not a
 *  recognised 32- or 64-bit ELF header
 */
int
loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs,
    unsigned int bootdevice)
{
	int r;
	uint32_t bootargsz;
	size_t n, stacksize;
	u_long marks[MARK_MAX];
	bios_memmap_t memmap[VMM_MAX_MEM_RANGES + 1];
	bios_bootmac_t bm, *bootmac = NULL;

	r = gzread(fp, &hdr, sizeof(hdr));
	if (r < 0 || (size_t)r != sizeof(hdr))
		return 1;

	memset(&marks, 0, sizeof(marks));
	if (memcmp(hdr.elf32.e_ident, ELFMAG, SELFMAG) == 0 &&
	    hdr.elf32.e_ident[EI_CLASS] == ELFCLASS32) {
		r = elf32_exec(fp, &hdr.elf32, marks, LOAD_ALL);
	} else if (memcmp(hdr.elf64.e_ident, ELFMAG, SELFMAG) == 0 &&
	    hdr.elf64.e_ident[EI_CLASS] == ELFCLASS64) {
		r = elf64_exec(fp, &hdr.elf64, marks, LOAD_ALL);
	} else {
		/*
		 * Not an ELF image.  Set the status explicitly instead of
		 * relying on 'r' still holding gzread()'s byte count.
		 */
		errno = ENOEXEC;
		r = 1;
	}

	if (r)
		return (r);

	push_gdt();

	/* CR0 gets only ET and PE; CR3, CR4 and EFER start out cleared. */
	vrs->vrs_crs[VCPU_REGS_CR3] = 0ULL;
	vrs->vrs_crs[VCPU_REGS_CR4] = 0ULL;
	vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL;
	vrs->vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE;

	/* Network boot: hand the first configured MAC to the boot arguments. */
	if (bootdevice == VMBOOTDEV_NET) {
		bootmac = &bm;
		memcpy(bootmac, vm->vm_params.vmc_macs[0], ETHER_ADDR_LEN);
	}

	n = create_bios_memmap(&vm->vm_params, memmap);
	bootargsz = push_bootargs(memmap, n, bootmac);
	stacksize = push_stack(bootargsz, marks[MARK_END]);

	/* Start at the ELF entry point with RSP just below the pushed words. */
	vrs->vrs_gprs[VCPU_REGS_RIP] = (uint64_t)marks[MARK_ENTRY];
	vrs->vrs_gprs[VCPU_REGS_RSP] = (uint64_t)(STACK_PAGE + PAGE_SIZE) - stacksize;
	vrs->vrs_gdtr.vsi_base = GDT_PAGE;

	log_debug("%s: loaded ELF kernel", __func__);

	return (0);
}
/*
 * create_bios_memmap
 *
 * Translate the memory ranges in 'vmc' into BIOS memory map entries.
 *
 * Each range becomes one entry carrying the range's guest physical
 * address and size; VM_MEM_RAM ranges are typed BIOS_MAP_FREE, every
 * other range type is typed BIOS_MAP_RES.  A final zero-address,
 * zero-size BIOS_MAP_END entry terminates the list.
 *
 * Parameters:
 *  vmc: create parameters holding vmc_nmemranges entries in vmc_memranges
 *  memmap: output array; must have room for vmc_nmemranges + 1 entries
 *
 * Return values:
 *  number of entries written, including the terminator
 */
static size_t
create_bios_memmap(struct vmop_create_params *vmc, bios_memmap_t *memmap)
{
	size_t idx;
	struct vm_mem_range *range;
	bios_memmap_t *entry;

	for (idx = 0; idx < vmc->vmc_nmemranges; idx++) {
		range = &vmc->vmc_memranges[idx];
		entry = &memmap[idx];

		entry->addr = range->vmr_gpa;
		entry->size = range->vmr_size;
		entry->type = (range->vmr_type == VM_MEM_RAM) ?
		    BIOS_MAP_FREE : BIOS_MAP_RES;
	}

	/* Terminating entry. */
	entry = &memmap[idx];
	entry->addr = 0x0;
	entry->size = 0x0;
	entry->type = BIOS_MAP_END;

	return (idx + 1);
}
/*
 * push_bootargs
 *
 * Assemble the boot arguments in a page-sized local buffer and copy that
 * page to BOOTARGS_PAGE with write_mem().
 *
 * The buffer holds a sequence of records.  Each record starts with three
 * 32-bit words (a BOOTARG_* tag followed by the record size written
 * twice) and is followed by its payload.  Records emitted, in order:
 *  - BOOTARG_MEMMAP with the 'n' entries of 'memmap'
 *  - BOOTARG_CONSDEV with device makedev(8, 0), speed 115200,
 *    address 0x3f8
 *  - BOOTARG_BOOTMAC with '*bootmac' (only when 'bootmac' is non-NULL)
 * A single 0xFFFFFFFF word ends the list.
 *
 * Parameters:
 *  memmap: BIOS memory map entries
 *  n: number of entries in 'memmap'
 *  bootmac: optional boot MAC record, or NULL
 *
 * Return values:
 *  number of bytes of boot arguments, including the end marker
 */
static uint32_t
push_bootargs(bios_memmap_t *memmap, size_t n, bios_bootmac_t *bootmac)
{
	uint32_t memmap_sz, consdev_sz, bootmac_sz, i;
	bios_consdev_t consdev;
	uint32_t ba[1024];

	/*
	 * The whole page is copied into the guest below, so clear it first;
	 * otherwise the unused tail would carry leftover stack contents.
	 */
	memset(ba, 0, sizeof(ba));

	memmap_sz = 3 * sizeof(uint32_t) + n * sizeof(bios_memmap_t);
	ba[0] = BOOTARG_MEMMAP;
	ba[1] = memmap_sz;
	ba[2] = memmap_sz;
	memcpy(&ba[3], memmap, n * sizeof(bios_memmap_t));
	i = memmap_sz / sizeof(uint32_t);

	memset(&consdev, 0, sizeof(consdev));
	consdev.consdev = makedev(8, 0);
	consdev.conspeed = 115200;
	consdev.consaddr = 0x3f8;

	consdev_sz = 3 * sizeof(uint32_t) + sizeof(bios_consdev_t);
	ba[i] = BOOTARG_CONSDEV;
	ba[i + 1] = consdev_sz;
	ba[i + 2] = consdev_sz;
	memcpy(&ba[i + 3], &consdev, sizeof(bios_consdev_t));
	i += consdev_sz / sizeof(uint32_t);

	if (bootmac) {
		/*
		 * Header plus the payload rounded up to a multiple of four
		 * bytes.  The rounding is parenthesised explicitly: '+' binds
		 * tighter than '&'.
		 */
		bootmac_sz = 3 * sizeof(uint32_t) +
		    ((sizeof(bios_bootmac_t) + 3) & ~(size_t)3);
		ba[i] = BOOTARG_BOOTMAC;
		ba[i + 1] = bootmac_sz;
		ba[i + 2] = bootmac_sz;
		memcpy(&ba[i + 3], bootmac, sizeof(bios_bootmac_t));
		i += bootmac_sz / sizeof(uint32_t);
	}

	/* End-of-list marker. */
	ba[i++] = 0xFFFFFFFF;

	write_mem(BOOTARGS_PAGE, ba, PAGE_SIZE);
	sev_register_encryption(BOOTARGS_PAGE, PAGE_SIZE);

	return (i * sizeof(uint32_t));
}
/*
 * push_stack
 *
 * Build the initial stack in a zeroed page-sized local buffer, filling
 * 32-bit words downward from the end of the page, and copy the page to
 * STACK_PAGE with write_mem().
 *
 * Words stored, from the highest slot downward:
 *  BOOTARGS_PAGE, bootargsz, 0, 0, end, 0x0e,
 *  MAKEBOOTDEV(0x4, 0, 0, 0, 0), 0
 * NOTE(review): presumably this is the argument layout the guest kernel's
 * entry point expects - confirm against the guest side.
 *
 * Parameters:
 *  bootargsz: size of the boot arguments, as returned by push_bootargs()
 *  end: value stored in the fifth slot from the top
 *
 * Return values:
 *  size in bytes of the used part of the stack.  This counts one more
 *  32-bit slot than was explicitly stored; that slot is zero from the
 *  initial memset.
 */
static size_t
push_stack(uint32_t bootargsz, uint32_t end)
{
	uint32_t stack[1024];
	uint32_t *const top = stack + 1024;
	uint32_t *sp = top;

	memset(stack, 0, sizeof(stack));

	*--sp = BOOTARGS_PAGE;
	*--sp = bootargsz;
	*--sp = 0;
	*--sp = 0;
	*--sp = end;
	*--sp = 0x0e;
	*--sp = MAKEBOOTDEV(0x4, 0, 0, 0, 0);
	*--sp = 0;

	write_mem(STACK_PAGE, stack, PAGE_SIZE);
	sev_register_encryption(STACK_PAGE, PAGE_SIZE);

	/* Stored words plus the one extra slot just below them. */
	return ((size_t)(top - sp) + 1) * sizeof(uint32_t);
}
/*
 * mread_chunk
 *
 * Read 'len' bytes (at most PAGE_SIZE) from 'fp' into a zeroed bounce
 * buffer and copy them to guest address 'addr' with write_mem().  A short
 * or failed read is logged with log_warnx().
 *
 * Return values:
 *  0 on success
 *  -1 if the read or the write failed
 */
static int
mread_chunk(gzFile fp, paddr_t addr, size_t len)
{
	char page[PAGE_SIZE];
	const char *msg = NULL;
	int code = 0;

	memset(page, 0, sizeof(page));

	if ((size_t)gzread(fp, page, len) != len) {
		msg = gzerror(fp, &code);
		if (code == Z_ERRNO)
			code = errno;
		log_warnx("error %d in mread, %s", code, msg);
		return (-1);
	}

	if (write_mem(addr, page, len))
		return (-1);

	return (0);
}

/*
 * mread
 *
 * Copy 'sz' bytes from 'fp' to guest address 'addr', moving at most one
 * page per write_mem() call.  The range is passed to
 * sev_register_encryption() before any data is copied.
 *
 * When 'addr' is not page aligned, a first chunk is transferred
 * separately: up to the next page boundary if 'sz' exceeds PAGE_SIZE,
 * otherwise all 'sz' bytes at once.  The rest follows in PAGE_SIZE steps.
 *
 * Parameters:
 *  fp: file to read from
 *  addr: guest address to write to
 *  sz: number of bytes to transfer
 *
 * Return values:
 *  'sz' if everything was transferred
 *  0 if a read or write failed
 */
size_t
mread(gzFile fp, paddr_t addr, size_t sz)
{
	const size_t total = sz;
	size_t head = 0;
	size_t done, len;

	sev_register_encryption(addr, sz);

	/* Unaligned start: transfer the leading chunk on its own. */
	if ((addr & PAGE_MASK) != 0) {
		head = (sz > PAGE_SIZE) ? PAGE_SIZE - (addr & PAGE_MASK) : sz;
		if (mread_chunk(fp, addr, head) != 0)
			return (0);
		addr += head;
	}

	sz -= head;
	if (sz == 0)
		return (total);

	done = 0;
	while (done < sz) {
		len = (done + PAGE_SIZE > sz) ? sz - done : PAGE_SIZE;
		if (mread_chunk(fp, addr, len) != 0)
			return (0);
		done += PAGE_SIZE;
		addr += PAGE_SIZE;
	}

	return (total);
}
/*
 * marc4random_buf
 *
 * Fill 'sz' bytes of guest memory starting at 'addr' with output from
 * arc4random_buf(), moving at most one page per write_mem() call.  The
 * range is passed to sev_register_encryption() first.
 *
 * When 'addr' is not page aligned, the leading chunk runs to the next
 * page boundary or to the end of the range, whichever comes first, and
 * is deducted from the remaining length so that exactly 'sz' bytes are
 * written in total.
 *
 * Parameters:
 *  addr: guest address of the first byte to fill
 *  sz: number of bytes to fill; nothing is done when it is not positive
 *
 * Return values:
 *  none; the function stops early if write_mem() reports a failure
 */
static void
marc4random_buf(paddr_t addr, int sz)
{
	int i, ct;
	char buf[PAGE_SIZE];

	if (sz <= 0)
		return;

	sev_register_encryption(addr, sz);

	if (addr % PAGE_SIZE != 0) {
		memset(buf, 0, sizeof(buf));
		ct = PAGE_SIZE - (addr % PAGE_SIZE);
		/* Never run past the end of the requested range. */
		if (ct > sz)
			ct = sz;

		arc4random_buf(buf, ct);

		if (write_mem(addr, buf, ct))
			return;

		addr += ct;
		/* The leading chunk counts towards the total. */
		sz -= ct;
	}

	for (i = 0; i < sz; i += PAGE_SIZE, addr += PAGE_SIZE) {
		memset(buf, 0, sizeof(buf));
		if (i + PAGE_SIZE > sz)
			ct = sz - i;
		else
			ct = PAGE_SIZE;

		arc4random_buf(buf, ct);

		if (write_mem(addr, buf, ct))
			return;
	}
}
/*
 * mbzero
 *
 * Call write_mem() on 'sz' bytes at guest address 'addr' with a NULL
 * source buffer and, if that succeeds, pass the range to
 * sev_register_encryption().
 * NOTE(review): presumably write_mem() zero-fills when the source is
 * NULL - confirm against its definition.
 *
 * Parameters:
 *  addr: guest address of the range
 *  sz: length of the range in bytes
 */
static void
mbzero(paddr_t addr, int sz)
{
	if (write_mem(addr, NULL, sz) == 0)
		sev_register_encryption(addr, sz);
}
/*
 * mbcopy
 *
 * Copy 'sz' bytes from host buffer 'src' to guest address 'dst' with
 * write_mem(), then pass the destination range to
 * sev_register_encryption().  The result of write_mem() is not checked.
 *
 * Parameters:
 *  src: host buffer to copy from
 *  dst: guest address to copy to
 *  sz: number of bytes to copy
 */
static void
mbcopy(void *src, paddr_t dst, int sz)
{
	(void)write_mem(dst, src, sz);
	sev_register_encryption(dst, sz);
}
/*
 * elf64_exec
 *
 * Load the 64-bit ELF image read from 'fp' into guest memory at the
 * physical addresses given by its program headers, optionally followed
 * by a copy of the ELF header, the section headers and the symbol-related
 * sections.
 *
 * Parameters:
 *  fp: image file to load
 *  elf: ELF header of the image; modified in place when LOAD_HDR is set
 *  marks: on entry marks[MARK_START] supplies the load offset; on return
 *      MARK_START, MARK_ENTRY, MARK_NSYM, MARK_SYM and MARK_END are
 *      filled in (and MARK_RANDOM / MARK_ERANDOM when a randomize
 *      segment is processed)
 *  flags: LOAD_* / COUNT_* bits selecting what to load and what to count
 *
 * Return values:
 *  0 if successful
 *  1 on allocation, seek, read or write failure
 */
static int
elf64_exec(gzFile fp, Elf64_Ehdr *elf, u_long *marks, int flags)
{
	Elf64_Shdr *shp;
	Elf64_Phdr *phdr;
	Elf64_Off off;
	int i;
	size_t sz;
	int havesyms;
	paddr_t minp = ~0, maxp = 0, pos = 0;
	/* 'offset' is picked up by name inside LOADADDR(). */
	paddr_t offset = marks[MARK_START], shpp, elfp;

	sz = elf->e_phnum * sizeof(Elf64_Phdr);
	phdr = malloc(sz);
	if (phdr == NULL)
		return 1;

	if (gzseek(fp, (off_t)elf->e_phoff, SEEK_SET) == -1) {
		free(phdr);
		return 1;
	}

	if ((size_t)gzread(fp, phdr, sz) != sz) {
		free(phdr);
		return 1;
	}

	for (i = 0; i < elf->e_phnum; i++) {
		if (phdr[i].p_type == PT_OPENBSD_RANDOMIZE) {
			int m;

			/* Fill the segment with random data if asked to. */
			if (flags & LOAD_RANDOM) {
				for (pos = 0; pos < phdr[i].p_filesz;
				    pos += m) {
					/*
					 * Cap the chunk so the conversion to
					 * int can never yield zero or a
					 * negative step.
					 */
					if (phdr[i].p_filesz - pos > 0x40000000)
						m = 0x40000000;
					else
						m = phdr[i].p_filesz - pos;
					marc4random_buf(phdr[i].p_paddr + pos,
					    m);
				}
			}
			if (flags & (LOAD_RANDOM | COUNT_RANDOM)) {
				marks[MARK_RANDOM] = LOADADDR(phdr[i].p_paddr);
				marks[MARK_ERANDOM] =
				    marks[MARK_RANDOM] + phdr[i].p_filesz;
			}
			continue;
		}

		/* Only loadable segments with some permission bit matter. */
		if (phdr[i].p_type != PT_LOAD ||
		    (phdr[i].p_flags & (PF_W|PF_R|PF_X)) == 0)
			continue;

#define IS_TEXT(p) (p.p_flags & PF_X)
#define IS_DATA(p) ((p.p_flags & PF_X) == 0)
#define IS_BSS(p) (p.p_filesz < p.p_memsz)
		/* Copy the file-backed part of the segment into the guest. */
		if ((IS_TEXT(phdr[i]) && (flags & LOAD_TEXT)) ||
		    (IS_DATA(phdr[i]) && (flags & LOAD_DATA))) {
			if (gzseek(fp, (off_t)phdr[i].p_offset,
			    SEEK_SET) == -1) {
				free(phdr);
				return 1;
			}
			if (mread(fp, phdr[i].p_paddr, phdr[i].p_filesz) !=
			    phdr[i].p_filesz) {
				free(phdr);
				return 1;
			}
		}

		/*
		 * Track the lowest and highest addresses covered.
		 * NOTE(review): the data case tests COUNT_TEXT, not a
		 * data-specific count flag; kept as found - confirm intent.
		 */
		if ((IS_TEXT(phdr[i]) && (flags & (LOAD_TEXT | COUNT_TEXT))) ||
		    (IS_DATA(phdr[i]) && (flags & (LOAD_DATA | COUNT_TEXT)))) {
			pos = phdr[i].p_paddr;
			if (minp > pos)
				minp = pos;
			pos += phdr[i].p_filesz;
			if (maxp < pos)
				maxp = pos;
		}

		/* Clear the part of the segment that has no file backing. */
		if (IS_BSS(phdr[i]) && (flags & LOAD_BSS)) {
			mbzero((phdr[i].p_paddr + phdr[i].p_filesz),
			    phdr[i].p_memsz - phdr[i].p_filesz);
		}
		if (IS_BSS(phdr[i]) && (flags & (LOAD_BSS|COUNT_BSS))) {
			pos += phdr[i].p_memsz - phdr[i].p_filesz;
			if (maxp < pos)
				maxp = pos;
		}
	}
	free(phdr);

	/* The ELF header copy goes right after the loaded image. */
	elfp = maxp = roundup(maxp, sizeof(Elf64_Addr));
	if (flags & (LOAD_HDR | COUNT_HDR))
		maxp += sizeof(Elf64_Ehdr);

	if (flags & (LOAD_SYM | COUNT_SYM)) {
		char *shstr = NULL;
		size_t shstrsz = 0;

		if (gzseek(fp, (off_t)elf->e_shoff, SEEK_SET) == -1) {
			warn("gzseek section headers");
			return 1;
		}

		sz = elf->e_shnum * sizeof(Elf64_Shdr);
		shp = malloc(sz);
		if (shp == NULL)
			return 1;

		if ((size_t)gzread(fp, shp, sz) != sz) {
			free(shp);
			return 1;
		}

		shpp = maxp;
		maxp += roundup(sz, sizeof(Elf64_Addr));

		/*
		 * Read the section name string table, but only when
		 * e_shstrndx really indexes a section header.  One extra
		 * byte keeps the table NUL-terminated for strcmp().
		 */
		if (elf->e_shstrndx < elf->e_shnum) {
			shstrsz = shp[elf->e_shstrndx].sh_size;
			if (shstrsz + 1 == 0) {
				free(shp);
				return 1;
			}
			shstr = malloc(shstrsz + 1);
			if (shstr == NULL) {
				free(shp);
				return 1;
			}
			if (gzseek(fp, (off_t)shp[elf->e_shstrndx].sh_offset,
			    SEEK_SET) == -1) {
				free(shstr);
				free(shp);
				return 1;
			}
			if ((size_t)gzread(fp, shstr, shstrsz) != shstrsz) {
				free(shstr);
				free(shp);
				return 1;
			}
			shstr[shstrsz] = '\0';
		}

		/*
		 * Section data is laid out after the ELF header and the
		 * section header table; 'off' is the new file-relative
		 * offset recorded in each copied section header.
		 */
		off = roundup((sizeof(Elf64_Ehdr) + sz), sizeof(Elf64_Addr));

		for (havesyms = i = 0; i < elf->e_shnum; i++)
			if (shp[i].sh_type == SHT_SYMTAB)
				havesyms = 1;

		for (i = 0; i < elf->e_shnum; i++) {
			/* Names outside the string table match nothing. */
			const char *name = "";

			if (shstr != NULL && shp[i].sh_name < shstrsz)
				name = shstr + shp[i].sh_name;

			if (shp[i].sh_type == SHT_SYMTAB ||
			    shp[i].sh_type == SHT_STRTAB ||
			    !strcmp(name, ".debug_line") ||
			    !strcmp(name, ELF_CTF)) {
				if (havesyms && (flags & LOAD_SYM)) {
					if (gzseek(fp, (off_t)shp[i].sh_offset,
					    SEEK_SET) == -1) {
						free(shstr);
						free(shp);
						return 1;
					}
					if (mread(fp, maxp,
					    shp[i].sh_size) != shp[i].sh_size) {
						free(shstr);
						free(shp);
						return 1;
					}
				}
				maxp += roundup(shp[i].sh_size,
				    sizeof(Elf64_Addr));
				shp[i].sh_offset = off;
				shp[i].sh_flags |= SHF_ALLOC;
				off += roundup(shp[i].sh_size,
				    sizeof(Elf64_Addr));
			}
		}
		if (flags & LOAD_SYM) {
			mbcopy(shp, shpp, sz);
		}
		free(shstr);
		free(shp);
	}

	/*
	 * Copy the ELF header last, adjusted so that it has no program
	 * headers and its section headers follow it directly.
	 */
	if (flags & LOAD_HDR) {
		elf->e_phoff = 0;
		elf->e_shoff = sizeof(Elf64_Ehdr);
		elf->e_phentsize = 0;
		elf->e_phnum = 0;
		mbcopy(elf, elfp, sizeof(*elf));
	}

	marks[MARK_START] = LOADADDR(minp);
	marks[MARK_ENTRY] = LOADADDR(elf->e_entry);
	marks[MARK_NSYM] = 1;
	marks[MARK_SYM] = LOADADDR(elfp);
	marks[MARK_END] = LOADADDR(maxp);

	return 0;
}
/*
 * elf32_exec
 *
 * Load the 32-bit ELF image read from 'fp' into guest memory at the
 * physical addresses given by its program headers, optionally followed
 * by a copy of the ELF header, the section headers and the symbol-related
 * sections.
 *
 * Parameters:
 *  fp: image file to load
 *  elf: ELF header of the image; modified in place when LOAD_HDR is set
 *  marks: on entry marks[MARK_START] supplies the load offset; on return
 *      MARK_START, MARK_ENTRY, MARK_NSYM, MARK_SYM and MARK_END are
 *      filled in (and MARK_RANDOM / MARK_ERANDOM when a randomize
 *      segment is processed)
 *  flags: LOAD_* / COUNT_* bits selecting what to load and what to count
 *
 * Return values:
 *  0 if successful
 *  1 on allocation, seek, read or write failure
 */
static int
elf32_exec(gzFile fp, Elf32_Ehdr *elf, u_long *marks, int flags)
{
	Elf32_Shdr *shp;
	Elf32_Phdr *phdr;
	Elf32_Off off;
	int i;
	size_t sz;
	int havesyms;
	paddr_t minp = ~0, maxp = 0, pos = 0;
	/* 'offset' is picked up by name inside LOADADDR(). */
	paddr_t offset = marks[MARK_START], shpp, elfp;

	sz = elf->e_phnum * sizeof(Elf32_Phdr);
	phdr = malloc(sz);
	if (phdr == NULL)
		return 1;

	if (gzseek(fp, (off_t)elf->e_phoff, SEEK_SET) == -1) {
		free(phdr);
		return 1;
	}

	if ((size_t)gzread(fp, phdr, sz) != sz) {
		free(phdr);
		return 1;
	}

	for (i = 0; i < elf->e_phnum; i++) {
		if (phdr[i].p_type == PT_OPENBSD_RANDOMIZE) {
			int m;

			/* Fill the segment with random data if asked to. */
			if (flags & LOAD_RANDOM) {
				for (pos = 0; pos < phdr[i].p_filesz;
				    pos += m) {
					/*
					 * Cap the chunk so the conversion to
					 * int can never yield zero or a
					 * negative step.
					 */
					if (phdr[i].p_filesz - pos > 0x40000000)
						m = 0x40000000;
					else
						m = phdr[i].p_filesz - pos;
					marc4random_buf(phdr[i].p_paddr + pos,
					    m);
				}
			}
			if (flags & (LOAD_RANDOM | COUNT_RANDOM)) {
				marks[MARK_RANDOM] = LOADADDR(phdr[i].p_paddr);
				marks[MARK_ERANDOM] =
				    marks[MARK_RANDOM] + phdr[i].p_filesz;
			}
			continue;
		}

		/* Only loadable segments with some permission bit matter. */
		if (phdr[i].p_type != PT_LOAD ||
		    (phdr[i].p_flags & (PF_W|PF_R|PF_X)) == 0)
			continue;

#define IS_TEXT(p) (p.p_flags & PF_X)
#define IS_DATA(p) ((p.p_flags & PF_X) == 0)
#define IS_BSS(p) (p.p_filesz < p.p_memsz)
		/* Copy the file-backed part of the segment into the guest. */
		if ((IS_TEXT(phdr[i]) && (flags & LOAD_TEXT)) ||
		    (IS_DATA(phdr[i]) && (flags & LOAD_DATA))) {
			if (gzseek(fp, (off_t)phdr[i].p_offset,
			    SEEK_SET) == -1) {
				free(phdr);
				return 1;
			}
			if (mread(fp, phdr[i].p_paddr, phdr[i].p_filesz) !=
			    phdr[i].p_filesz) {
				free(phdr);
				return 1;
			}
		}

		/*
		 * Track the lowest and highest addresses covered.
		 * NOTE(review): the data case tests COUNT_TEXT, not a
		 * data-specific count flag; kept as found - confirm intent.
		 */
		if ((IS_TEXT(phdr[i]) && (flags & (LOAD_TEXT | COUNT_TEXT))) ||
		    (IS_DATA(phdr[i]) && (flags & (LOAD_DATA | COUNT_TEXT)))) {
			pos = phdr[i].p_paddr;
			if (minp > pos)
				minp = pos;
			pos += phdr[i].p_filesz;
			if (maxp < pos)
				maxp = pos;
		}

		/* Clear the part of the segment that has no file backing. */
		if (IS_BSS(phdr[i]) && (flags & LOAD_BSS)) {
			mbzero((phdr[i].p_paddr + phdr[i].p_filesz),
			    phdr[i].p_memsz - phdr[i].p_filesz);
		}
		if (IS_BSS(phdr[i]) && (flags & (LOAD_BSS|COUNT_BSS))) {
			pos += phdr[i].p_memsz - phdr[i].p_filesz;
			if (maxp < pos)
				maxp = pos;
		}
	}
	free(phdr);

	/* The ELF header copy goes right after the loaded image. */
	elfp = maxp = roundup(maxp, sizeof(Elf32_Addr));
	if (flags & (LOAD_HDR | COUNT_HDR))
		maxp += sizeof(Elf32_Ehdr);

	if (flags & (LOAD_SYM | COUNT_SYM)) {
		char *shstr = NULL;
		size_t shstrsz = 0;

		if (gzseek(fp, (off_t)elf->e_shoff, SEEK_SET) == -1) {
			/* The failing call is gzseek(), so name it. */
			warn("gzseek section headers");
			return 1;
		}

		sz = elf->e_shnum * sizeof(Elf32_Shdr);
		shp = malloc(sz);
		if (shp == NULL)
			return 1;

		if ((size_t)gzread(fp, shp, sz) != sz) {
			free(shp);
			return 1;
		}

		shpp = maxp;
		maxp += roundup(sz, sizeof(Elf32_Addr));

		/*
		 * Read the section name string table, but only when
		 * e_shstrndx really indexes a section header.  One extra
		 * byte keeps the table NUL-terminated for strcmp().
		 */
		if (elf->e_shstrndx < elf->e_shnum) {
			shstrsz = shp[elf->e_shstrndx].sh_size;
			if (shstrsz + 1 == 0) {
				free(shp);
				return 1;
			}
			shstr = malloc(shstrsz + 1);
			if (shstr == NULL) {
				free(shp);
				return 1;
			}
			if (gzseek(fp, (off_t)shp[elf->e_shstrndx].sh_offset,
			    SEEK_SET) == -1) {
				free(shstr);
				free(shp);
				return 1;
			}
			if ((size_t)gzread(fp, shstr, shstrsz) != shstrsz) {
				free(shstr);
				free(shp);
				return 1;
			}
			shstr[shstrsz] = '\0';
		}

		/*
		 * Section data is laid out after the ELF header and the
		 * section header table; 'off' is the new file-relative
		 * offset recorded in each copied section header.
		 */
		off = roundup((sizeof(Elf32_Ehdr) + sz), sizeof(Elf32_Addr));

		for (havesyms = i = 0; i < elf->e_shnum; i++)
			if (shp[i].sh_type == SHT_SYMTAB)
				havesyms = 1;

		for (i = 0; i < elf->e_shnum; i++) {
			/* Names outside the string table match nothing. */
			const char *name = "";

			if (shstr != NULL && shp[i].sh_name < shstrsz)
				name = shstr + shp[i].sh_name;

			if (shp[i].sh_type == SHT_SYMTAB ||
			    shp[i].sh_type == SHT_STRTAB ||
			    !strcmp(name, ".debug_line")) {
				if (havesyms && (flags & LOAD_SYM)) {
					if (gzseek(fp, (off_t)shp[i].sh_offset,
					    SEEK_SET) == -1) {
						free(shstr);
						free(shp);
						return 1;
					}
					if (mread(fp, maxp,
					    shp[i].sh_size) != shp[i].sh_size) {
						free(shstr);
						free(shp);
						return 1;
					}
				}
				maxp += roundup(shp[i].sh_size,
				    sizeof(Elf32_Addr));
				shp[i].sh_offset = off;
				shp[i].sh_flags |= SHF_ALLOC;
				off += roundup(shp[i].sh_size,
				    sizeof(Elf32_Addr));
			}
		}
		if (flags & LOAD_SYM) {
			mbcopy(shp, shpp, sz);
		}
		free(shstr);
		free(shp);
	}

	/*
	 * Copy the ELF header last, adjusted so that it has no program
	 * headers and its section headers follow it directly.
	 */
	if (flags & LOAD_HDR) {
		elf->e_phoff = 0;
		elf->e_shoff = sizeof(Elf32_Ehdr);
		elf->e_phentsize = 0;
		elf->e_phnum = 0;
		mbcopy(elf, elfp, sizeof(*elf));
	}

	marks[MARK_START] = LOADADDR(minp);
	marks[MARK_ENTRY] = LOADADDR(elf->e_entry);
	marks[MARK_NSYM] = 1;
	marks[MARK_SYM] = LOADADDR(elfp);
	marks[MARK_END] = LOADADDR(maxp);

	return 0;
}