#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/mman.h>
#include <sys/psw.h>
#include <sys/cpu.h>
#include <sys/sunddi.h>
#include <util/sscanf.h>
#include <vm/hat_i86.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <xen/public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/public/vcpu.h>
extern cpuset_t cpu_ready_set;
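/*
 * Per-CPU barrier phase, driven by mp_enter_barrier() and
 * mp_leave_barrier(): NONE (running normally), WAIT_SAFE (asked to
 * park), SAFE (parked, making no hypervisor interactions), or
 * POWERED_OFF.
 */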
#define CPU_PHASE_NONE 0
#define CPU_PHASE_WAIT_SAFE 1
#define CPU_PHASE_SAFE 2
#define CPU_PHASE_POWERED_OFF 3
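/*
 * While waiting for CPUs to enter the barrier, re-poke stragglers at
 * most 256 times a second.
 */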
#define POKE_TIMEOUT (NANOSEC / 256)
static taskq_t *cpu_config_tq;
static int cpu_phase[NCPU];
static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
void do_cpu_config_watch(int);
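/*
 * Query the hypervisor's runstate info to determine whether the given
 * vcpu is currently running on a physical CPU.  Asking about ourself
 * needs no hypercall: we are clearly running.
 */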
int
vcpu_on_pcpu(processorid_t cpu)
{
struct vcpu_runstate_info runstate;
int ret = VCPU_STATE_UNKNOWN;
ASSERT(cpu < NCPU);
if (cpu == CPU->cpu_id)
return (VCPU_ON_PCPU);
if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
goto out;
switch (runstate.state) {
case RUNSTATE_running:
ret = VCPU_ON_PCPU;
break;
case RUNSTATE_runnable:
case RUNSTATE_offline:
case RUNSTATE_blocked:
ret = VCPU_NOT_ON_PCPU;
break;
default:
break;
}
out:
return (ret);
}
/*
 * Set up the taskq and xenstore callback used to field externally
 * initiated vcpu configuration (online/offline) requests.
 */
int
mach_cpucontext_init(void)
{
	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
	(void) xs_register_xenbus_callback(do_cpu_config_watch);
	return (0);
}
void
do_cpu_config_watch(int state)
{
static struct xenbus_watch cpu_config_watch;
if (state != XENSTORE_UP)
return;
cpu_config_watch.node = "cpu";
cpu_config_watch.callback = vcpu_config_event;
if (register_xenbus_watch(&cpu_config_watch)) {
taskq_destroy(cpu_config_tq);
cmn_err(CE_WARN, "do_cpu_config_watch: "
"failed to set vcpu config watch");
}
}
void
mach_cpucontext_fini(void)
{
}
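/*
 * Construct the initial guest context for a vcpu: kernel segment
 * registers, trap table, GDT, control registers, event callbacks and
 * the kernel %gs base (the cpu_t pointer), then hand it to the
 * hypervisor via xen_vcpu_initialize().
 */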
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
uint_t vec, iopl;
vgc->flags = VGCF_IN_KERNEL;
vgc->user_regs.cs = KCS_SEL | SEL_KPL;
vgc->user_regs.ds = KDS_SEL;
vgc->user_regs.es = KDS_SEL;
vgc->user_regs.ss = KDS_SEL;
vgc->kernel_ss = KDS_SEL;
if (DOMAIN_IS_INITDOMAIN(xen_info))
iopl = (PS_IOPL & 0x1000);
else
iopl = 0;
vgc->user_regs.fs = 0;
vgc->user_regs.gs = 0;
vgc->user_regs.rflags = F_OFF | iopl;
#if !defined(__lint)
ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
for (vec = 0; vec < NIDT; vec++) {
trap_info_t *ti = &vgc->trap_ctxt[vec];
if (xen_idt_to_trap_info(vec,
&cp->cpu_m.mcpu_idt[vec], ti) == 0) {
ti->cs = KCS_SEL;
ti->vector = vec;
}
}
	/*
	 * The hypervisor wants the GDT as a list of machine frame
	 * numbers: convert the GDT's physical address to a pfn, then
	 * to the corresponding mfn.
	 */
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
vgc->gdt_ents = NGDT;
vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());
vgc->ctrlreg[3] =
pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
vgc->ctrlreg[4] = getcr4();
vgc->event_callback_eip = (uintptr_t)xen_callback;
vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
vgc->flags |= VGCF_failsafe_disables_events;
vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
vgc->flags |= VGCF_syscall_disables_events;
ASSERT(vgc->user_regs.gs == 0);
vgc->gs_base_kernel = (uintptr_t)cp;
return (xen_vcpu_initialize(cp->cpu_id, vgc));
}
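/*
 * Prepare a CPU for startup under the hypervisor: wire up its shared
 * vcpu_info, allocate its pending-event tracking, make its GDT
 * read-only as the hypervisor requires, and load an initial context
 * that starts the CPU in its current thread.
 */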
void *
mach_cpucontext_alloc(struct cpu *cp)
{
kthread_t *tp = cp->cpu_thread;
vcpu_guest_context_t vgc;
int err = 1;
cp->cpu_m.mcpu_vcpu_info =
&HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
sizeof (struct xen_evt_data), KM_SLEEP);
cp->cpu_m.mcpu_gdtpa =
mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
goto done;
bzero(&vgc, sizeof (vgc));
vgc.user_regs.rip = tp->t_pc;
vgc.user_regs.rsp = tp->t_sp;
vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
vgc.kernel_sp = (ulong_t)tp->t_stk;
err = mp_set_cpu_context(&vgc, cp);
done:
if (err) {
mach_cpucontext_free(cp, NULL, err);
return (NULL);
}
return (cp);
}
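/*
 * Undo mach_cpucontext_alloc().  How much there is to undo depends on
 * how far the startup attempt got, as indicated by err.
 */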
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
switch (err) {
case 0:
break;
case ETIMEDOUT:
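		/*
		 * The context was handed to the hypervisor but the CPU
		 * never showed up.  It may still start running with
		 * that context at any time, so we must not free
		 * anything it references.
		 */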
break;
default:
(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
kmem_free(cp->cpu_m.mcpu_evt_pend,
sizeof (struct xen_evt_data));
break;
}
}
void
mach_cpucontext_reset(cpu_t *cp)
{
bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
}
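/*
 * Translate a saved kernel setjmp() buffer (label_t) into vcpu
 * register state: the resume point and the callee-saved registers.
 */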
static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
}
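/*
 * Restore a vcpu that was powered off while parked in
 * enter_safe_phase(): rebuild its context from the thread's saved
 * t_pcb and reload it into the hypervisor.
 */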
void
mach_cpucontext_restore(cpu_t *cp)
{
vcpu_guest_context_t vgc;
int err;
ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
cp->cpu_thread == cp->cpu_idle_thread);
bzero(&vgc, sizeof (vgc));
pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
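	/*
	 * Emulate a longjmp() return from the setjmp() in
	 * enter_safe_phase(): make setjmp() appear to return 1, and
	 * account for the pop of the saved return address that a real
	 * longjmp() would perform.
	 */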
vgc.user_regs.rax = 1;
vgc.user_regs.rsp += sizeof (ulong_t);
vgc.kernel_sp = cp->cpu_thread->t_sp;
err = mp_set_cpu_context(&vgc, cp);
ASSERT(err == 0);
}
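/*
 * Park this CPU where it will make no further hypervisor
 * interactions: save state with setjmp() (so mach_cpucontext_restore()
 * can rebuild it if we are powered off here), advertise
 * CPU_PHASE_SAFE, and spin with interrupts disabled until released.
 */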
static void
enter_safe_phase(void)
{
ulong_t flags = intr_clear();
if (setjmp(&curthread->t_pcb) == 0) {
cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
SMT_PAUSE();
}
ASSERT(!interrupts_enabled());
intr_restore(flags);
}
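/*
 * Idle by blocking in the hypervisor until an event arrives; the xpv
 * panic path uses its own halt routine instead.  On waking, enter the
 * safe phase if a barrier is being formed.
 */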
void
mach_cpu_idle(void)
{
if (IN_XPV_PANIC()) {
xpv_panic_halt();
} else {
(void) HYPERVISOR_block();
if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
enter_safe_phase();
}
}
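/*
 * Pause handler: signal that we have stopped, then spin until
 * released, entering the safe phase if a barrier forms meanwhile.
 */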
void
mach_cpu_pause(volatile char *safe)
{
*safe = PAUSE_WAIT;
membar_enter();
while (*safe != PAUSE_IDLE) {
if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
enter_safe_phase();
SMT_PAUSE();
}
}
int
mach_cpu_halt(xc_arg_t arg1, xc_arg_t arg2 __unused, xc_arg_t arg3 __unused)
{
char *msg = (char *)arg1;
if (msg)
prom_printf("%s\n", msg);
(void) xen_vcpu_down(CPU->cpu_id);
return (0);
}
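/*
 * Power-state changes are not supported through this interface;
 * externally initiated vcpu configuration arrives via the xenstore
 * watch instead.
 */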
int
mp_cpu_poweron(struct cpu *cp)
{
return (ENOTSUP);
}
int
mp_cpu_poweroff(struct cpu *cp)
{
return (ENOTSUP);
}
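/*
 * Bring every other CPU to a point where it will make no further
 * hypervisor interactions: pause them, then drive each one from
 * CPU_PHASE_NONE through CPU_PHASE_WAIT_SAFE to CPU_PHASE_SAFE,
 * re-poking stragglers at most once per POKE_TIMEOUT in case a poke
 * raced with the CPU blocking in the hypervisor.
 */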
void
mp_enter_barrier(void)
{
hrtime_t last_poke_time = 0;
int poke_allowed = 0;
int done = 0;
int i;
ASSERT(MUTEX_HELD(&cpu_lock));
pause_cpus(NULL, NULL);
while (!done) {
done = 1;
poke_allowed = 0;
if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
last_poke_time = xpv_gethrtime();
poke_allowed = 1;
}
for (i = 0; i < NCPU; i++) {
cpu_t *cp = cpu_get(i);
if (cp == NULL || cp == CPU)
continue;
switch (cpu_phase[i]) {
case CPU_PHASE_NONE:
cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
poke_cpu(i);
done = 0;
break;
case CPU_PHASE_WAIT_SAFE:
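			/*
			 * Already asked; poke again (rate-limited) in
			 * case the previous poke arrived before the
			 * CPU blocked.
			 */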
if (poke_allowed)
poke_cpu(i);
done = 0;
break;
case CPU_PHASE_SAFE:
case CPU_PHASE_POWERED_OFF:
break;
}
}
SMT_PAUSE();
}
}
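/*
 * Release the CPUs parked by mp_enter_barrier().
 */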
void
mp_leave_barrier(void)
{
int i;
ASSERT(MUTEX_HELD(&cpu_lock));
for (i = 0; i < NCPU; i++) {
cpu_t *cp = cpu_get(i);
if (cp == NULL || cp == CPU)
continue;
switch (cpu_phase[i]) {
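		/*
		 * A CPU still in the first two phases violates the
		 * guarantee mp_enter_barrier() provides.  We can't
		 * usefully panic in this context, so report a crash to
		 * the hypervisor.
		 */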
case CPU_PHASE_NONE:
case CPU_PHASE_WAIT_SAFE:
(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
break;
case CPU_PHASE_POWERED_OFF:
break;
case CPU_PHASE_SAFE:
cpu_phase[i] = CPU_PHASE_NONE;
}
}
start_cpus();
}
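/*
 * Take a quiesced CPU down: form the barrier, ask the hypervisor to
 * stop the vcpu, and mark it powered off.
 */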
static int
poweroff_vcpu(struct cpu *cp)
{
int error;
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(CPU->cpu_id != cp->cpu_id);
ASSERT(cp->cpu_flags & CPU_QUIESCED);
mp_enter_barrier();
if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
CPUSET_DEL(cpu_ready_set, cp->cpu_id);
if (cp->cpu_flags & CPU_ENABLE)
ncpus_intr_enabled--;
cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
cp->cpu_flags &=
~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
cpu_set_state(cp);
}
mp_leave_barrier();
return (error);
}
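/*
 * Offline the CPU through the normal p_online path, then power off
 * its vcpu.  The CPU may pick up work between the two steps, so retry
 * while it reports EBUSY.
 */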
static int
vcpu_config_poweroff(processorid_t id)
{
int oldstate;
int error;
cpu_t *cp;
mutex_enter(&cpu_lock);
if ((cp = cpu_get(id)) == NULL) {
mutex_exit(&cpu_lock);
return (ESRCH);
}
if (cpu_get_state(cp) == P_POWEROFF) {
mutex_exit(&cpu_lock);
return (0);
}
mutex_exit(&cpu_lock);
do {
error = p_online_internal(id, P_OFFLINE,
&oldstate);
if (error != 0)
break;
mutex_enter(&cpu_lock);
if ((cp = cpu_get(id)) == NULL)
error = ESRCH;
else {
if (cp->cpu_flags & CPU_QUIESCED)
error = poweroff_vcpu(cp);
else
error = EBUSY;
}
mutex_exit(&cpu_lock);
} while (error == EBUSY);
return (error);
}
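/*
 * Bring up a vcpu beyond those configured at boot, pinning ourselves
 * to the current CPU while the new one is started.
 */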
static int
vcpu_config_new(processorid_t id)
{
extern int start_cpu(processorid_t);
int error;
if (ncpus == 1) {
printf("cannot (yet) add cpus to a single-cpu domain\n");
return (ENOTSUP);
}
affinity_set(CPU_CURRENT);
error = start_cpu(id);
affinity_clear();
return (error);
}
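/*
 * Bring a powered-off vcpu back up and mark it ready to run.
 */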
static int
poweron_vcpu(struct cpu *cp)
{
int error;
ASSERT(MUTEX_HELD(&cpu_lock));
if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
printf("poweron_vcpu: vcpu%d is not available!\n",
cp->cpu_id);
return (ENXIO);
}
if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
CPUSET_ADD(cpu_ready_set, cp->cpu_id);
cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
cp->cpu_flags &= ~CPU_POWEROFF;
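		/*
		 * There are some nasty races possible around startup;
		 * tell the vcpu it's up one more time to be sure.
		 */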
(void) xen_vcpu_up(cp->cpu_id);
cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
cpu_set_state(cp);
}
return (error);
}
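/*
 * Handle an externally initiated "online" request, adding a new vcpu
 * if the id is beyond those we booted with.
 */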
static int
vcpu_config_poweron(processorid_t id)
{
cpu_t *cp;
int oldstate;
int error;
if (id >= ncpus)
return (vcpu_config_new(id));
mutex_enter(&cpu_lock);
if ((cp = cpu_get(id)) == NULL) {
mutex_exit(&cpu_lock);
return (ESRCH);
}
if (cpu_get_state(cp) != P_POWEROFF) {
mutex_exit(&cpu_lock);
return (0);
}
if ((error = poweron_vcpu(cp)) != 0) {
mutex_exit(&cpu_lock);
return (error);
}
mutex_exit(&cpu_lock);
return (p_online_internal(id, P_ONLINE, &oldstate));
}
#define REPORT_LEN 128
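/*
 * Log the outcome of an externally initiated state change, decoding
 * the more likely error values.
 */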
static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
size_t len;
char *ps;
ps = NULL;
switch (newstate) {
case P_ONLINE:
ps = PS_ONLINE;
break;
case P_POWEROFF:
ps = PS_POWEROFF;
break;
default:
cmn_err(CE_PANIC, "unknown state %u\n", newstate);
break;
}
len = snprintf(report, REPORT_LEN,
"cpu%d: externally initiated %s", id, ps);
if (!error) {
cmn_err(CE_CONT, "!%s\n", report);
kmem_free(report, REPORT_LEN);
return;
}
len += snprintf(report + len, REPORT_LEN - len,
" failed, error %d: ", error);
switch (error) {
case EEXIST:
len += snprintf(report + len, REPORT_LEN - len,
"cpu already %s", ps ? ps : "?");
break;
case ESRCH:
len += snprintf(report + len, REPORT_LEN - len,
"cpu not found");
break;
case EINVAL:
case EALREADY:
break;
case EPERM:
len += snprintf(report + len, REPORT_LEN - len,
"insufficient privilege (0x%x)", id);
break;
case EBUSY:
switch (newstate) {
case P_ONLINE:
len += snprintf(report + len, REPORT_LEN - len,
"already running");
break;
case P_POWEROFF:
len += snprintf(report + len, REPORT_LEN - len,
"bound lwps?");
break;
default:
break;
		}
		break;
	default:
break;
}
cmn_err(CE_CONT, "%s\n", report);
kmem_free(report, REPORT_LEN);
}
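/*
 * Taskq handler for a vcpu configuration request: read the target
 * availability state from xenstore and act on it.
 */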
static void
vcpu_config(void *arg)
{
int id = (int)(uintptr_t)arg;
int error;
char dir[16];
char *state;
if ((uint_t)id >= max_ncpus) {
cmn_err(CE_WARN,
"vcpu_config: cpu%d does not fit in this domain", id);
return;
}
(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
if (strcmp(state, "online") == 0) {
error = vcpu_config_poweron(id);
vcpu_config_report(id, P_ONLINE, error);
} else if (strcmp(state, "offline") == 0) {
error = vcpu_config_poweroff(id);
vcpu_config_report(id, P_POWEROFF, error);
} else {
cmn_err(CE_WARN,
"cpu%d: unknown target state '%s'", id, state);
}
} else
cmn_err(CE_WARN,
"cpu%d: unable to read target state from xenstore", id);
kmem_free(state, MAXPATHLEN);
}
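/*
 * Xenbus watch callback.  Parse the cpu id out of the watch path and
 * hand the actual work to the taskq, keeping this callback quick.
 */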
static void
vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
{
const char *path = vec[XS_WATCH_PATH];
processorid_t id;
char *s;
if ((s = strstr(path, "cpu/")) != NULL &&
sscanf(s, "cpu/%d", &id) == 1) {
(void) taskq_dispatch(cpu_config_tq,
vcpu_config, (void *)(uintptr_t)id, 0);
}
}
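/*
 * Load an initial context into a vcpu, decoding the hypervisor's
 * error values into something readable.
 */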
static int
xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
{
int err;
if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
char *str;
int level = CE_WARN;
switch (err) {
case -X_EINVAL:
str = "something is wrong :(";
break;
case -X_ENOENT:
str = "no such cpu";
break;
case -X_ENOMEM:
str = "no mem to copy ctxt";
break;
case -X_EFAULT:
str = "bad address";
break;
case -X_EEXIST:
level = CE_PANIC;
str = "already initialized";
break;
default:
level = CE_PANIC;
str = "<unexpected>";
break;
}
cmn_err(level, "vcpu%d: failed to init: error %d: %s",
id, -err, str);
}
return (err);
}
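/*
 * Ask the hypervisor to start a vcpu.
 */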
long
xen_vcpu_up(processorid_t id)
{
long err;
if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
char *str;
switch (err) {
case -X_ENOENT:
str = "no such cpu";
break;
case -X_EINVAL:
if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
str = "bad cpuid";
else
str = "not initialized";
break;
default:
str = "<unexpected>";
break;
}
printf("vcpu%d: failed to start: error %d: %s\n",
id, -(int)err, str);
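		/* deliberately silly error value */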
return (EBFONT);
}
return (err);
}
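/*
 * Ask the hypervisor to stop a vcpu; failure here is unrecoverable.
 */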
long
xen_vcpu_down(processorid_t id)
{
long err;
if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
panic("vcpu%d: failed to stop: error %d", id, -(int)err);
}
return (err);
}