#include <sys/cdefs.h>
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/runq.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
/*
 * Select the KTR event class used by the critical_enter()/critical_exit()
 * KBI wrappers below.  Flip the "#if 0" to trace them under KTR_SCHED;
 * with KTR_CRITICAL == 0 the traces compile away.
 */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

/* FULL_PREEMPTION is an extension of PREEMPTION and cannot stand alone. */
#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

/*
 * Read-only sysctl reporting whether this kernel was built with in-kernel
 * preemption (the PREEMPTION option).
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");

#ifdef SCHED_STATS
/*
 * Per-CPU context-switch counters, broken down by switch type (SWT_*),
 * exported under kern.sched.stats.  See sysctl_stats_reset() below for
 * how they are cleared.
 */
SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "switch stats");
DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
SCHED_STAT_DEFINE_VAR(owepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(turnstile,
    &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
SCHED_STAT_DEFINE_VAR(sleepq,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
SCHED_STAT_DEFINE_VAR(relinquish,
    &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
SCHED_STAT_DEFINE_VAR(needresched,
    &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
SCHED_STAT_DEFINE_VAR(idle,
    &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
SCHED_STAT_DEFINE_VAR(iwait,
    &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
SCHED_STAT_DEFINE_VAR(suspend,
    &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
SCHED_STAT_DEFINE_VAR(remotepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(remotewakeidle,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");
SCHED_STAT_DEFINE_VAR(bind,
    &DPCPU_NAME(sched_switch_stats[SWT_BIND]), "");
/*
 * Sysctl handler for kern.sched.stats.reset: writing a non-zero integer
 * zeroes every per-CPU switch counter registered under the same parent
 * node.  Reading returns 0 and has no effect.
 */
static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
{
	struct sysctl_oid *p;
	uintptr_t counter;
	int error;
	int val;
	int i;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	/* Writing zero is a deliberate no-op. */
	if (val == 0)
		return (0);
	/*
	 * Walk our sibling OIDs; each counter OID's arg1 holds the DPCPU
	 * offset of its per-CPU slot (see SCHED_STAT_DEFINE_VAR above).
	 */
	RB_FOREACH(p, sysctl_oid_list, oidp->oid_parent) {
		/* Skip ourselves and any node without a counter attached. */
		if (p == oidp || p->oid_arg1 == NULL)
			continue;
		counter = (uintptr_t)p->oid_arg1;
		CPU_FOREACH(i) {
			*(long *)(dpcpu_off[i] + counter) = 0;
		}
	}
	return (0);
}
SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_stats_reset, "I",
    "Reset scheduler statistics");
#endif
/*
 * Panic-time thread selection: keep discarding candidates until one is
 * found that belongs to a system process or is already marked
 * TDF_INPANIC, so that only panic-related work keeps running.  The chosen
 * thread is marked running before being returned.
 */
static __noinline struct thread *
choosethread_panic(struct thread *td)
{

	while ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0) {
		TD_SET_CAN_RUN(td);
		td = sched_choose();
	}
	TD_SET_RUNNING(td);
	return (td);
}
/*
 * Pick the next thread to run on the current CPU, mark it running and
 * return it.  After a panic, defer to the slow path that filters out
 * non-panic threads.
 */
struct thread *
choosethread(void)
{
	struct thread *td;

	td = sched_choose();
	if (!KERNEL_PANICKED()) {
		TD_SET_RUNNING(td);
		return (td);
	}
	return (choosethread_panic(td));
}
/*
 * Out-of-line entry point for critical_enter(), kept for the kernel
 * binary interface (KBI); also emits a KTR trace when built with KTR.
 */
void
critical_enter_KBI(void)
{
#ifdef KTR
	struct thread *td = curthread;
#endif

	critical_enter();
	/* CTR4() expands to nothing unless the kernel is built with KTR. */
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}
/*
 * Slow path of critical_exit(): perform a preemption that was deferred
 * while the thread was inside a critical section.  The statement order
 * here is deliberate; do not reorder.
 */
void __noinline
critical_exit_preempt(void)
{
	struct thread *td;
	int flags;

	td = curthread;
	/* A nested critical section was re-entered; stay deferred. */
	if (td->td_critnest != 0)
		return;
	/* Never context-switch from under the kernel debugger. */
	if (kdb_active)
		return;
	/*
	 * Raise the nesting count around thread_lock() so that taking the
	 * lock cannot recurse back into this path, then drop it again
	 * before actually switching.
	 */
	td->td_critnest = 1;
	thread_lock(td);
	td->td_critnest--;
	/* Account the involuntary switch as idle or owed-preemption. */
	flags = SW_INVOL | SW_PREEMPT;
	if (TD_IS_IDLETHREAD(td))
		flags |= SWT_IDLE;
	else
		flags |= SWT_OWEPREEMPT;
	mi_switch(flags);
}
/*
 * Out-of-line entry point for critical_exit(), kept for the kernel
 * binary interface (KBI); also emits a KTR trace when built with KTR.
 */
void
critical_exit_KBI(void)
{
#ifdef KTR
	struct thread *td = curthread;
#endif

	critical_exit();
	/* CTR4() expands to nothing unless the kernel is built with KTR. */
	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}
/* Queue indices are kept in 'td_rqindex'; they must fit its type. */
_Static_assert(RQ_NQS <= 256,
    "'td_rqindex' must be turned into a bigger unsigned type");

/*
 * Assert that 'idx' is a valid run-queue index in [0, RQ_NQS).
 * A KASSERT, so this is a no-op on kernels without INVARIANTS.
 */
#define	CHECK_IDX(idx) ({						\
	__typeof(idx) _idx __unused = (idx);				\
	KASSERT(0 <= _idx && _idx < RQ_NQS,				\
	    ("%s: %s out of range: %d", __func__, __STRING(idx), _idx)); \
})

/*
 * Type of the primitive operations on a single word of the run queue's
 * status bitmap ('rq_status'), dispatched through runq_sw_apply().
 */
typedef uintptr_t runq_sw_op(int idx, int sw_idx, rqsw_t sw_bit,
    rqsw_t *swp);

static inline uintptr_t runq_sw_apply(struct runq *rq, int idx,
    runq_sw_op *op);
static inline uintptr_t runq_sw_set_not_empty_op(int idx, int sw_idx,
    rqsw_t sw_bit, rqsw_t *swp);
static inline uintptr_t runq_sw_set_empty_op(int idx, int sw_idx,
    rqsw_t sw_bit, rqsw_t *swp);
static inline uintptr_t runq_sw_is_empty_op(int idx, int sw_idx,
    rqsw_t sw_bit, rqsw_t *swp);
static inline void runq_sw_set_not_empty(struct runq *rq, int idx);
static inline void runq_sw_set_empty(struct runq *rq, int idx);
static inline bool runq_sw_is_empty(struct runq *rq, int idx);
/*
 * Initialize a run queue: clear the status bitmap and set up every
 * per-priority-level tail queue.
 */
void
runq_init(struct runq *rq)
{
	int qi;

	bzero(rq, sizeof(*rq));
	for (qi = 0; qi < RQ_NQS; qi++)
		TAILQ_INIT(&rq->rq_queues[qi]);
}
/*
 * Apply a status-word operation 'op' for queue index 'idx': compute the
 * word index, the bit within that word, and hand 'op' a pointer to the
 * word itself.  Returns whatever 'op' returns.
 */
static inline uintptr_t
runq_sw_apply(struct runq *rq, int idx, runq_sw_op *op)
{
	rqsw_t bit;
	int word;

	CHECK_IDX(idx);
	word = RQSW_IDX(idx);
	bit = RQSW_BIT(idx);
	return (op(idx, word, bit, &rq->rq_status.rq_sw[word]));
}
/* Operation: mark queue 'idx' non-empty by setting its status bit. */
static inline uintptr_t
runq_sw_set_not_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp)
{
	const rqsw_t prev __unused = *swp;

	*swp = prev | sw_bit;
	CTR4(KTR_RUNQ,
	    "runq_sw_set_not_empty: idx=%d sw_idx=%d "
	    "bits=" RQSW_PRI "->" RQSW_PRI,
	    idx, sw_idx, prev, *swp);
	return (0);
}
/* Mark the queue at 'idx' as non-empty in the status bitmap. */
static inline void
runq_sw_set_not_empty(struct runq *rq, int idx)
{

	runq_sw_apply(rq, idx, runq_sw_set_not_empty_op);
}
/* Operation: mark queue 'idx' empty by clearing its status bit. */
static inline uintptr_t
runq_sw_set_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp)
{
	const rqsw_t prev __unused = *swp;

	*swp = prev & ~sw_bit;
	CTR4(KTR_RUNQ,
	    "runq_sw_set_empty: idx=%d sw_idx=%d "
	    "bits=" RQSW_PRI "->" RQSW_PRI,
	    idx, sw_idx, prev, *swp);
	return (0);
}
/* Mark the queue at 'idx' as empty in the status bitmap. */
static inline void
runq_sw_set_empty(struct runq *rq, int idx)
{

	runq_sw_apply(rq, idx, runq_sw_set_empty_op);
}
/* Operation: report whether queue 'idx' has its status bit clear. */
static inline uintptr_t
runq_sw_is_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp)
{
	return ((*swp & sw_bit) == 0);
}
/* True if the status bitmap says the queue at 'idx' is empty. */
static inline bool
runq_sw_is_empty(struct runq *rq, int idx)
{
	return (runq_sw_apply(rq, idx, runq_sw_is_empty_op));
}
/* Public wrapper: is the per-level queue at 'idx' empty? */
bool
runq_is_queue_empty(struct runq *rq, int idx)
{

	return (runq_sw_is_empty(rq, idx));
}
/*
 * Insert 'td' into 'rq' at the queue index derived from its current
 * priority.
 */
void
runq_add(struct runq *rq, struct thread *td, int flags)
{
	const int idx = RQ_PRI_TO_QUEUE_IDX(td->td_priority);

	runq_add_idx(rq, td, idx, flags);
}
/*
 * Insert 'td' into 'rq' at the explicit queue index 'idx', recording the
 * index in 'td_rqindex' and marking the queue non-empty.  Preempted
 * threads (SRQ_PREEMPTED) go to the head of their queue, everything else
 * to the tail.
 */
void
runq_add_idx(struct runq *rq, struct thread *td, int idx, int flags)
{
	struct rq_queue *queue;

	td->td_rqindex = idx;
	runq_sw_set_not_empty(rq, idx);
	queue = &rq->rq_queues[idx];
	CTR4(KTR_RUNQ, "runq_add_idx: td=%p pri=%d idx=%d rqq=%p",
	    td, td->td_priority, idx, queue);
	if ((flags & SRQ_PREEMPTED) != 0)
		TAILQ_INSERT_HEAD(queue, td, td_runq);
	else
		TAILQ_INSERT_TAIL(queue, td, td_runq);
}
/*
 * Remove 'td' from 'rq' using the queue index stored in 'td_rqindex'.
 * Returns true if that queue became empty as a result (its status bit is
 * then cleared), false otherwise.
 */
bool
runq_remove(struct runq *rq, struct thread *td)
{
	struct rq_queue *queue;
	int idx;

	KASSERT(td->td_flags & TDF_INMEM, ("runq_remove: Thread swapped out"));
	idx = td->td_rqindex;
	CHECK_IDX(idx);
	queue = &rq->rq_queues[idx];
	CTR4(KTR_RUNQ, "runq_remove: td=%p pri=%d idx=%d rqq=%p",
	    td, td->td_priority, idx, queue);
	TAILQ_REMOVE(queue, td, td_runq);
	if (!TAILQ_EMPTY(queue))
		return (false);
	runq_sw_set_empty(rq, idx);
	CTR1(KTR_RUNQ, "runq_remove: queue at idx=%d now empty", idx);
	return (true);
}
/*
 * Scan one status word 'w' (at word index 'w_idx'): for every set bit, in
 * ascending order, call 'pred' on the corresponding queue.  Returns the
 * first queue index for which 'pred' returns true, or -1 if none does.
 */
static inline int
runq_findq_status_word(struct runq *const rq, const int w_idx,
    const rqsw_t w, runq_pred_t *const pred, void *const pred_data)
{
	struct rq_queue *q;
	rqsw_t remaining;
	int idx, b_idx;

	for (remaining = w; remaining != 0; remaining &= ~RQSW_BIT(idx)) {
		b_idx = RQSW_BSF(remaining);
		idx = RQSW_TO_QUEUE_IDX(w_idx, b_idx);
		q = &rq->rq_queues[idx];
		KASSERT(!TAILQ_EMPTY(q),
		    ("runq_findq(): No thread on non-empty queue with idx=%d",
		    idx));
		if (pred(idx, q, pred_data))
			return (idx);
	}
	return (-1);
}
/*
 * Search queues with indices in [lvl_min, lvl_max] (inclusive) for the
 * first non-empty one satisfying 'pred', scanning the status bitmap one
 * word at a time.  The first and last words are masked so that only bits
 * for indices inside the requested range are considered.  Returns the
 * matching queue index, or -1 if no queue matched.
 */
int
runq_findq(struct runq *const rq, const int lvl_min, const int lvl_max,
    runq_pred_t *const pred, void *const pred_data)
{
	const rqsw_t (*const rqsw)[RQSW_NB] = &rq->rq_status.rq_sw;
	rqsw_t w;
	int i, last, idx;

	CHECK_IDX(lvl_min);
	CHECK_IDX(lvl_max);
	KASSERT(lvl_min <= lvl_max,
	    ("lvl_min: %d > lvl_max: %d!", lvl_min, lvl_max));
	i = RQSW_IDX(lvl_min);
	last = RQSW_IDX(lvl_max);
	/* First word: mask off bits below 'lvl_min'. */
	w = (*rqsw)[i] & ~(RQSW_BIT(lvl_min) - 1);
	/* Range contained in a single word: also apply the upper mask. */
	if (i == last)
		goto last_mask;
	idx = runq_findq_status_word(rq, i, w, pred, pred_data);
	if (idx != -1)
		goto return_idx;
	/* Middle words are used in full. */
	for (++i; i < last; ++i) {
		w = (*rqsw)[i];
		idx = runq_findq_status_word(rq, i, w, pred, pred_data);
		if (idx != -1)
			goto return_idx;
	}
	MPASS(i == last);
	w = (*rqsw)[i];
last_mask:
	/* Last word: mask off bits above 'lvl_max'. */
	w &= (RQSW_BIT(lvl_max) - 1) | RQSW_BIT(lvl_max);
	idx = runq_findq_status_word(rq, i, w, pred, pred_data);
	if (idx != -1)
		goto return_idx;
	return (-1);
return_idx:
	CTR4(KTR_RUNQ,
	    "runq_findq: bits=" RQSW_PRI "->" RQSW_PRI " i=%d idx=%d",
	    (*rqsw)[i], w, i, idx);
	return (idx);
}
/*
 * runq_findq() predicate: capture the first thread of the first non-empty
 * queue into *data and stop the search.
 */
static bool
runq_first_thread_pred(const int idx, struct rq_queue *const q, void *const data)
{
	struct thread **const out = data;

	*out = TAILQ_FIRST(q);
	return (true);
}
/*
 * Return the first thread of the first non-empty queue with index in
 * [lvl_min, lvl_max], or NULL if all of them are empty.
 * NOTE(review): defined 'extern inline' — presumably paired with an
 * inline declaration in a header; verify before changing linkage.
 */
extern inline struct thread *
runq_first_thread_range(struct runq *const rq, const int lvl_min,
    const int lvl_max)
{
	struct thread *first;

	first = NULL;
	(void)runq_findq(rq, lvl_min, lvl_max, runq_first_thread_pred, &first);
	return (first);
}
/* First thread on the whole run queue, or NULL if it is empty. */
static inline struct thread *
runq_first_thread(struct runq *const rq)
{

	return (runq_first_thread_range(rq, 0, RQ_NQS - 1));
}
/*
 * True if any queue of 'rq' holds a thread, determined purely from the
 * status bitmap (no queue traversal).
 */
bool
runq_not_empty(struct runq *rq)
{
	const rqsw_t (*const rqsw)[RQSW_NB] = &rq->rq_status.rq_sw;
	rqsw_t w;
	int sw_idx;

	for (sw_idx = 0; sw_idx < RQSW_NB; ++sw_idx) {
		w = (*rqsw)[sw_idx];
		if (w == 0)
			continue;
		CTR3(KTR_RUNQ, "runq_not_empty: not empty; "
		    "rq=%p, sw_idx=%d, bits=" RQSW_PRI,
		    rq, sw_idx, w);
		return (true);
	}
	CTR1(KTR_RUNQ, "runq_not_empty: empty; rq=%p", rq);
	return (false);
}
/*
 * Return the highest-priority runnable thread on 'rq', or NULL when the
 * queue is empty (caller falls back to the idle thread).
 */
struct thread *
runq_choose(struct runq *rq)
{
	struct thread *td;

	td = runq_first_thread(rq);
	if (td == NULL) {
		CTR0(KTR_RUNQ, "runq_choose: idlethread");
		return (NULL);
	}
	CTR2(KTR_RUNQ, "runq_choose: idx=%d td=%p", td->td_rqindex, td);
	return (td);
}
/* In/out data for runq_fuzz_pred(). */
struct runq_fuzz_pred_data {
	int fuzz;
	struct thread *td;
};

/*
 * runq_findq() predicate implementing the "fuzz" pick: default to the
 * queue's first thread, but when fuzz > 1, prefer among the first 'fuzz'
 * threads one whose last CPU is the current one.  Always stops the search
 * at the first non-empty queue.
 */
static bool
runq_fuzz_pred(const int idx, struct rq_queue *const q, void *const data)
{
	struct runq_fuzz_pred_data *const d = data;
	struct thread *chosen, *iter;
	int remaining, cpu;

	chosen = TAILQ_FIRST(q);
	if (d->fuzz > 1) {
		cpu = PCPU_GET(cpuid);
		remaining = d->fuzz;
		for (iter = chosen; remaining-- != 0 && iter != NULL;
		    iter = TAILQ_NEXT(iter, td_runq)) {
			if (iter->td_lastcpu == cpu) {
				chosen = iter;
				break;
			}
		}
	}
	d->td = chosen;
	return (true);
}
/*
 * Like runq_choose(), but with last-CPU affinity "fuzz": within the
 * highest-priority non-empty queue, prefer one of the first 'fuzz'
 * threads that last ran on this CPU.  Returns NULL when the run queue is
 * empty.
 */
struct thread *
runq_choose_fuzz(struct runq *rq, int fuzz)
{
	struct runq_fuzz_pred_data data = {
		.fuzz = fuzz,
		.td = NULL
	};
	int idx;

	idx = runq_findq(rq, 0, RQ_NQS - 1, runq_fuzz_pred, &data);
	if (idx == -1) {
		MPASS(data.td == NULL);
		CTR0(KTR_RUNQ, "runq_choose_fuzz: idlethread");
		return (NULL);
	}
	MPASS(data.td != NULL);
	CTR2(KTR_RUNQ, "runq_choose_fuzz: idx=%d td=%p", idx, data.td);
	return (data.td);
}