/* root/usr/src/cmd/bhyve/common/mem.c */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2020 Oxide Computer Company
 * Copyright 2026 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Memory ranges are represented with an RB tree. On insertion, the range
 * is checked for overlaps. On lookup, the key has the same base and limit,
 * so a single-address search finds the range that contains it.
 */


#include <sys/types.h>
#include <sys/errno.h>
#include <sys/tree.h>
#include <machine/vmm.h>

#include <assert.h>
#include <err.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vmmapi.h>

#include "mem.h"
#include "debug.h"

/* A registered guest-physical address range, keyed by [mr_base, mr_end]. */
struct mmio_rb_range {
        RB_ENTRY(mmio_rb_range) mr_link;        /* RB tree links */
        struct mem_range        mr_param;       /* caller-supplied range description */
        uint64_t                mr_base;        /* first address covered (inclusive) */
        uint64_t                mr_end;         /* last address covered (inclusive) */
};

struct mmio_rb_tree;
RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);

/* Primary tree of registered ranges, plus a fallback tree searched second. */
static RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback;

/*
 * Per-vCPU cache. Since most accesses from a vCPU will be to
 * consecutive addresses in a range, it makes sense to cache the
 * result of a lookup.
 */
static struct mmio_rb_range     **mmio_hint;
static int mmio_ncpu;           /* number of entries in mmio_hint */

/* Protects both trees; also held around mmio_hint reads and updates. */
static pthread_rwlock_t mmio_rwlock;

/*
 * RB tree comparator.  Two ranges compare equal when their intervals
 * overlap at all, which is what lets a point lookup (base == end)
 * locate the range containing that address.
 */
static int
mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b)
{
        if (a->mr_end < b->mr_base)
                return (-1);
        if (b->mr_end < a->mr_base)
                return (1);
        return (0);
}

/*
 * Find the range in 'rbt' that contains 'addr'.  On success, store it
 * in '*entry' and return 0; otherwise return ENOENT.
 */
static int
mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr,
    struct mmio_rb_range **entry)
{
        struct mmio_rb_range key, *match;

        key.mr_base = addr;
        key.mr_end = addr;

        match = RB_FIND(mmio_rb_tree, rbt, &key);
        if (match == NULL)
                return (ENOENT);

        *entry = match;
        return (0);
}

/*
 * Debug helper: print every range registered in 'rbt'.
 *
 * NOTE(review): the rdlock return value is not checked here, unlike
 * the unlock below.  This function is also invoked from the failed
 * unlock path in register_mem_int(), where the lock state is already
 * suspect — confirm whether the unchecked rdlock is deliberate.
 */
static void
mmio_rb_dump(struct mmio_rb_tree *rbt)
{
        int perror;
        struct mmio_rb_range *np;

        pthread_rwlock_rdlock(&mmio_rwlock);
        RB_FOREACH(np, mmio_rb_tree, rbt) {
                PRINTLN(" %lx:%lx, %s", np->mr_base, np->mr_end,
                       np->mr_param.name);
        }
        perror = pthread_rwlock_unlock(&mmio_rwlock);
        assert(perror == 0);
}

/*
 * Insert 'new' into 'rbt'.  Returns 0 on success or EEXIST when the
 * range overlaps one already in the tree (the tree is unchanged in
 * that case).
 */
static int
mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new)
{
        struct mmio_rb_range *existing;

        existing = RB_INSERT(mmio_rb_tree, rbt, new);
        if (existing == NULL)
                return (0);

        EPRINTLN("overlap detected: new %lx:%lx, tree %lx:%lx, '%s' "
               "claims region already claimed for '%s'",
               new->mr_base, new->mr_end,
               existing->mr_base, existing->mr_end,
               new->mr_param.name, existing->mr_param.name);
        return (EEXIST);
}

/* Instantiate the RB tree implementation for mmio_rb_tree. */
RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);

/*
 * Callback invoked by access_memory() once the range covering 'gpa'
 * has been resolved to 'mr'.
 */
typedef int (mem_cb_t)(struct vcpu *vcpu, uint64_t gpa, struct mem_range *mr,
    void *arg);

/* Dispatch a read at 'gpa' to the range's handler; 'arg' is the mem_range. */
static int
mem_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
{
        struct mem_range *mr = arg;

        return ((*mr->handler)(vcpu, MEM_F_READ, gpa, size, rval, mr->arg1,
            mr->arg2));
}

/* Dispatch a write at 'gpa' to the range's handler; 'arg' is the mem_range. */
static int
mem_write(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
{
        struct mem_range *mr = arg;

        return ((*mr->handler)(vcpu, MEM_F_WRITE, gpa, size, &wval, mr->arg1,
            mr->arg2));
}

/*
 * Resolve the range covering 'paddr' and invoke 'cb' on it.  The MMIO
 * lock is held for reading across the callback unless the range is
 * immutable (see below).  Returns ESRCH when no registered range
 * covers 'paddr'; otherwise the callback's return value.
 */
static int
access_memory(struct vcpu *vcpu, uint64_t paddr, mem_cb_t *cb, void *arg)
{
        struct mmio_rb_range *entry;
        int err, perror, immutable, vcpuid;

        vcpuid = vcpu_id(vcpu);
        pthread_rwlock_rdlock(&mmio_rwlock);
        /*
         * First check the per-vCPU cache
         */
        if (mmio_hint[vcpuid] &&
            paddr >= mmio_hint[vcpuid]->mr_base &&
            paddr <= mmio_hint[vcpuid]->mr_end) {
                entry = mmio_hint[vcpuid];
        } else
                entry = NULL;

        if (entry == NULL) {
                if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) {
                        /* Update the per-vCPU cache */
                        mmio_hint[vcpuid] = entry;
                } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) {
                        /* Not in either tree: no handler registered. */
                        perror = pthread_rwlock_unlock(&mmio_rwlock);
                        assert(perror == 0);
                        return (ESRCH);
                }
                /* Note: fallback-tree hits are not cached in mmio_hint. */
        }

        assert(entry != NULL);

        /*
         * An 'immutable' memory range is guaranteed to be never removed
         * so there is no need to hold 'mmio_rwlock' while calling the
         * handler.
         *
         * XXX writes to the PCIR_COMMAND register can cause register_mem()
         * to be called. If the guest is using PCI extended config space
         * to modify the PCIR_COMMAND register then register_mem() can
         * deadlock on 'mmio_rwlock'. However by registering the extended
         * config space window as 'immutable' the deadlock can be avoided.
         */
        immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE);
        if (immutable) {
                perror = pthread_rwlock_unlock(&mmio_rwlock);
                assert(perror == 0);
        }

        err = cb(vcpu, paddr, &entry->mr_param, arg);

        /* For mutable ranges, the lock was held across the callback. */
        if (!immutable) {
                perror = pthread_rwlock_unlock(&mmio_rwlock);
                assert(perror == 0);
        }

        return (err);
}

/*
 * access_memory() callback for instruction emulation: route the MMIO
 * transaction described by 'arg' (a struct vm_mmio) to the range's
 * read or write handler.
 */
static int
emulate_mem_cb(struct vcpu *vcpu, uint64_t paddr, struct mem_range *mr,
    void *arg)
{
        struct vm_mmio *mmio = arg;

        if (mmio->read != 0)
                return (mem_read(vcpu, paddr, &mmio->data, mmio->bytes, mr));
        return (mem_write(vcpu, paddr, mmio->data, mmio->bytes, mr));
}

/* Emulate the MMIO access described by 'mmio' for 'vcpu'. */
int
emulate_mem(struct vcpu *vcpu, struct vm_mmio *mmio)
{
        int err;

        err = access_memory(vcpu, mmio->gpa, emulate_mem_cb, mmio);
        return (err);
}

/* Arguments threaded through access_memory() to rw_mem_cb(). */
struct rw_mem_args {
        uint64_t *val;          /* value in (write) or out (read) */
        int size;               /* access size, passed to the handler */
        int operation;          /* MEM_F_READ or MEM_F_WRITE */
};

/* access_memory() callback for read_mem()/write_mem(). */
static int
rw_mem_cb(struct vcpu *vcpu, uint64_t paddr, struct mem_range *mr,
    void *arg)
{
        struct rw_mem_args *args = arg;

        return ((*mr->handler)(vcpu, args->operation, paddr, args->size,
            args->val, mr->arg1, mr->arg2));
}

/* Perform an MMIO read of 'size' at 'gpa', storing the result in '*rval'. */
int
read_mem(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size)
{
        struct rw_mem_args rma = {
                .val = rval,
                .size = size,
                .operation = MEM_F_READ,
        };

        return (access_memory(vcpu, gpa, rw_mem_cb, &rma));
}

/* Perform an MMIO write of 'size' at 'gpa' with value 'wval'. */
int
write_mem(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int size)
{
        struct rw_mem_args rma = {
                .val = &wval,
                .size = size,
                .operation = MEM_F_WRITE,
        };

        return (access_memory(vcpu, gpa, rw_mem_cb, &rma));
}

/*
 * Allocate an mmio_rb_range describing 'memp' and insert it into 'rbt'.
 *
 * Returns 0 on success, ENOMEM if allocation fails, or EEXIST if the
 * new range overlaps an existing one.  If a range already covering
 * memp->base is found, the function returns 0 without inserting
 * anything (on FreeBSD the allocation leaks in that case; the illumos
 * build frees it — see the #ifndef below).
 */
static int
register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp)
{
        struct mmio_rb_range *entry, *mrp;
        int err, perror;

        err = 0;

        mrp = malloc(sizeof(struct mmio_rb_range));
        if (mrp == NULL) {
                warn("%s: couldn't allocate memory for mrp\n",
                     __func__);
                err = ENOMEM;
        } else {
                mrp->mr_param = *memp;
                mrp->mr_base = memp->base;
                /* mr_end is inclusive; assumes memp->size > 0 — TODO confirm */
                mrp->mr_end = memp->base + memp->size - 1;
                pthread_rwlock_wrlock(&mmio_rwlock);
                if (mmio_rb_lookup(rbt, memp->base, &entry) != 0)
                        err = mmio_rb_add(rbt, mrp);
#ifndef __FreeBSD__
                else /* smatch warn: possible memory leak of 'mrp' */
                        free(mrp);
#endif
                perror = pthread_rwlock_unlock(&mmio_rwlock);
#ifdef  __FreeBSD__
                assert(perror == 0);
#else
                /* illumos: dump the tree for diagnosis, then bail out. */
                if (perror != 0) {
                        mmio_rb_dump(rbt);
                        exit(4);
                }
#endif
                /* mmio_rb_add() refused the insert (EEXIST): discard mrp. */
                if (err)
                        free(mrp);
        }

        return (err);
}

/* Register 'memp' in the primary MMIO range tree. */
int
register_mem(struct mem_range *memp)
{
        return (register_mem_int(&mmio_rb_root, memp));
}

/* Register 'memp' in the fallback tree, searched when the primary misses. */
int
register_mem_fallback(struct mem_range *memp)
{
        return (register_mem_int(&mmio_rb_fallback, memp));
}

/*
 * Remove a previously registered (primary-tree, mutable) memory range.
 *
 * The caller must pass the same name/base/size used at registration;
 * these are sanity-checked under assert.  Returns 0 on success or
 * ENOENT if no registered range covers memp->base.
 */
int
unregister_mem(struct mem_range *memp)
{
        struct mem_range *mr;
        struct mmio_rb_range *entry = NULL;
        int err, perror, i;

        pthread_rwlock_wrlock(&mmio_rwlock);
        err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry);
        if (err == 0) {
                mr = &entry->mr_param;
                /*
                 * Compare name contents, not pointers: the original
                 * pointer comparison would spuriously fail for a
                 * caller passing an equal but distinct name string.
                 */
                assert(strcmp(mr->name, memp->name) == 0);
                assert(mr->base == memp->base && mr->size == memp->size);
                assert((mr->flags & MEM_F_IMMUTABLE) == 0);
                RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);

                /* flush Per-vCPU cache */
                for (i = 0; i < mmio_ncpu; i++) {
                        if (mmio_hint[i] == entry)
                                mmio_hint[i] = NULL;
                }
        }
        perror = pthread_rwlock_unlock(&mmio_rwlock);
        assert(perror == 0);

        /* free(NULL) is a no-op; entry is NULL when the lookup failed. */
        free(entry);

        return (err);
}

/*
 * One-time initialization of the MMIO machinery: allocate the per-vCPU
 * lookup cache for 'ncpu' vCPUs and initialize both range trees and
 * the lock that protects them.  Aborts the process if the cache cannot
 * be allocated, since access_memory() dereferences it unconditionally.
 */
void
init_mem(int ncpu)
{
        mmio_ncpu = ncpu;
        mmio_hint = calloc(ncpu, sizeof(*mmio_hint));
        if (mmio_hint == NULL)
                err(EXIT_FAILURE, "%s: couldn't allocate per-vCPU cache",
                    __func__);
        RB_INIT(&mmio_rb_root);
        RB_INIT(&mmio_rb_fallback);
        pthread_rwlock_init(&mmio_rwlock, NULL);
}