#include "opt_hwpmc_hooks.h"
#include "opt_hwt_hooks.h"
#include "opt_sched.h"
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/runq.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/turnstile.h>
#include <sys/umtxvar.h>
#include <sys/vmmeter.h>
#include <sys/cpuset.h>
#include <sys/sbuf.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#ifdef HWT_HOOKS
#include <dev/hwt/hwt_hook.h>
#endif
#include <machine/cpu.h>
#include <machine/smp.h>
#define KTR_ULE 0
#define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
#define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU)))
#define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load"))
/*
 * Per-thread scheduler state, obtained from a thread with td_get_sched().
 */
struct td_sched {
/* TSF_* flags (see TSF_BOUND / TSF_XFERABLE). */
short ts_flags;
/* CPU whose run queue this thread is assigned to. */
int ts_cpu;
/* Tick stamp compared against 'ticks' by SCHED_AFFINITY(). */
u_int ts_rltick;
/*
 * NOTE(review): presumably the thread's time-slice accounting; the
 * visible code only resets it to 0 for thread0 -- confirm.
 */
u_int ts_slice;
/* First tick of the %CPU accounting window (sched_pctcpu_update()). */
u_int ts_ftick;
/* Last tick of the %CPU accounting window (sched_pctcpu_update()). */
u_int ts_ltick;
/*
 * Sleep and run time used by the interactivity score.  They are compared
 * against SCHED_SLP_RUN_MAX/SCHED_SLP_RUN_FORK, which are expressed in
 * ticks shifted left by SCHED_TICK_SHIFT.
 */
u_int ts_slptime;
u_int ts_runtime;
/* Run ticks within the %CPU window, shifted by SCHED_TICK_SHIFT. */
u_int ts_ticks;
#ifdef KTR
/* Name buffer, only present in KTR kernels. */
char ts_name[TS_NAME_LEN];
#endif
};
/* Bits for td_sched.ts_flags. */
#define TSF_BOUND 0x0001
/* Set by tdq_runq_add() on queued threads counted in tdq_transferable. */
#define TSF_XFERABLE 0x0002
/* A thread may change CPU only while it is not pinned. */
#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
/* True when 'cpu' is part of the thread's cpuset mask. */
#define THREAD_CAN_SCHED(td, cpu) \
CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
/*
 * struct thread0_storage must be large enough to hold a struct thread
 * followed by the scheduler's struct td_sched.
 */
_Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
sizeof(struct thread0_storage),
"increase struct thread0_storage.t0st_sched size");
/*
 * Split of the timeshare priority range.  The lower half (after reserving
 * SCHED_PRI_NRESV priorities) is the "interactive" range, the rest is the
 * "batch" range.
 */
#define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2)
#define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE)
#define PRI_MIN_INTERACT PRI_MIN_TIMESHARE
#define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1)
#define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE)
#define PRI_MAX_BATCH PRI_MAX_TIMESHARE
/*
 * Within the batch range, SCHED_PRI_CPU_RANGE priorities are driven by CPU
 * usage and SCHED_PRI_NRESV by the nice value (see sched_priority()).
 * SCHED_PRI_NICE() maps a nice value to its priority offset.
 */
#define SCHED_PRI_CPU_RANGE (PRI_BATCH_RANGE - SCHED_PRI_NRESV)
#define SCHED_PRI_NICE(nice) (((nice) - PRIO_MIN) * 5 / 4)
#define SCHED_PRI_NRESV SCHED_PRI_NICE(PRIO_MAX)
/*
 * Runqueue index ranges of the three selection policies: realtime (interrupt
 * threads up to interactive), timeshare (batch) and idle.
 */
#define RQ_RT_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_ITHD))
#define RQ_RT_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_INTERACT))
#define RQ_TS_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_BATCH))
#define RQ_TS_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_BATCH))
#define RQ_ID_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_IDLE))
#define RQ_ID_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_IDLE))
/* Adjacent policies must not share a queue at their common boundary. */
_Static_assert(RQ_RT_POL_MAX != RQ_TS_POL_MIN,
"ULE's realtime and timeshare policies' runqueue ranges overlap");
_Static_assert(RQ_TS_POL_MAX != RQ_ID_POL_MIN,
"ULE's timeshare and idle policies' runqueue ranges overlap");
/* Number of queues in the timeshare policy; modulus of its circular index. */
#define RQ_TS_POL_MODULO (RQ_TS_POL_MAX - RQ_TS_POL_MIN + 1)
/*
 * %CPU accounting window: at most SCHED_TICK_SECS seconds of ticks, with
 * run ticks stored shifted left by SCHED_TICK_SHIFT bits.
 */
#define SCHED_TICK_SECS 11
#define SCHED_TICK_MAX(hz) ((hz) * SCHED_TICK_SECS)
#define SCHED_TICK_SHIFT 10
/* Run ticks accumulated in the window (still shifted). */
#define SCHED_TICK_RUN_SHIFTED(ts) ((ts)->ts_ticks)
/* Length of the window in ticks, never less than 1 so it can be a divisor. */
#define SCHED_TICK_LENGTH(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, 1))
/*
 * Fraction of SCHED_TICK_MAX() used as the target window size in
 * sched_pctcpu_update().
 */
#define SCHED_CPU_DECAY_NUMER 10
#define SCHED_CPU_DECAY_DENOM 11
_Static_assert(SCHED_CPU_DECAY_NUMER >= 0 && SCHED_CPU_DECAY_DENOM > 0 &&
SCHED_CPU_DECAY_NUMER <= SCHED_CPU_DECAY_DENOM,
"Inconsistent values for SCHED_CPU_DECAY_NUMER and/or "
"SCHED_CPU_DECAY_DENOM");
/*
 * Interactivity history limits, in shifted ticks: 5 seconds in general
 * (sched_interact_update()) and half a second after fork
 * (sched_interact_fork()).
 */
#define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT)
#define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT)
/* Interactivity score scale and the default interactive threshold. */
#define SCHED_INTERACT_MAX (100)
#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
#define SCHED_INTERACT_THRESH (30)
/* Divisors used to derive sched_slice and sched_slice_min. */
#define SCHED_SLICE_DEFAULT_DIVISOR 10
#define SCHED_SLICE_MIN_DIVISOR 6
/* Scheduler-private thread flags, mapped onto the generic TDF_SCHEDn bits. */
#define TDF_PICKCPU TDF_SCHED0
#define TDF_SLICEEND TDF_SCHED2
/* Scores below this value are treated as interactive by sched_priority(). */
static u_int __read_mostly sched_interact = SCHED_INTERACT_THRESH;
/*
 * The values below are boot-time defaults; sched_ule_initticks() recomputes
 * tickincr, realstathz, sched_slice and sched_slice_min from hz/stathz.
 */
static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT;
static int __read_mostly realstathz = 127;
static int __read_mostly sched_slice = 10;
static int __read_mostly sched_slice_min = 1;
/*
 * Priority threshold used by sched_shouldpreempt(): 0 disables preemption
 * of non-idle threads, otherwise priorities numerically <= the threshold
 * preempt.
 */
#ifdef PREEMPTION
#ifdef FULL_PREEMPTION
static int __read_mostly preempt_thresh = PRI_MAX_IDLE;
#else
static int __read_mostly preempt_thresh = PRI_MIN_KERN;
#endif
#else
static int __read_mostly preempt_thresh = 0;
#endif
/* NOTE(review): not referenced in the visible part of this file. */
static int __read_mostly static_boost = PRI_MIN_BATCH;
/* NOTE(review): idle-loop spin tunable; its consumer is not visible here. */
static int __read_mostly sched_idlespins = 10000;
/* Negative means "compute a default in sched_ule_initticks()". */
static int __read_mostly sched_idlespinthresh = -1;
/*
 * Per-CPU thread queue: the run queue of one CPU together with its lock and
 * load accounting.
 */
struct tdq {
/* Spin mutex protecting this queue (initialised in tdq_setup()). */
struct mtx_padalign tdq_lock;
/* CPU topology group of this CPU (set in sched_setup_smp()). */
struct cpu_group *tdq_cg;
/* Thread recorded as currently running on this CPU. */
struct thread *tdq_curthread;
/* Number of threads accounted to this queue (tdq_load_add/rem()). */
int tdq_load;
/* Same as tdq_load but excluding TDF_NOLOAD threads. */
int tdq_sysload;
/* Read by tdq_notify() to choose between an idle wakeup and an IPI. */
int tdq_cpu_idle;
/* Number of queued threads flagged TSF_XFERABLE. */
int tdq_transferable;
/* Context switch counters; TDQ_SWITCHCNT() returns their sum. */
short tdq_switchcnt;
short tdq_oldswitchcnt;
/* Numerically lowest priority among current and queued threads. */
u_char tdq_lowpri;
/* Set by tdq_notify() once an IPI_PREEMPT has been sent. */
u_char tdq_owepreempt;
/* Timeshare circular queue: insertion offset. */
u_char tdq_ts_off;
/* Timeshare circular queue: dequeue offset. */
u_char tdq_ts_deq_off;
/* NOTE(review): tick counter for the offsets; consumer not visible here. */
u_char tdq_ts_ticks;
/* CPU id of this queue (returned by TDQ_ID() on SMP). */
int tdq_id;
/* The run queue itself. */
struct runq tdq_runq;
/* Lock name, "sched lock <id>". */
char tdq_name[TDQ_NAME_LEN];
#ifdef KTR
/* KTR counter name, "CPU <id> load". */
char tdq_loadname[TDQ_LOADNAME_LEN];
#endif
};
/* NOTE(review): state constants; their users are not visible in this chunk. */
#define TDQ_RUNNING 1
#define TDQ_IDLE 2
/* Atomic loads of tdq fields, used where the tdq lock is not held. */
#define TDQ_LOAD(tdq) atomic_load_int(&(tdq)->tdq_load)
#define TDQ_TRANSFERABLE(tdq) atomic_load_int(&(tdq)->tdq_transferable)
/* Sum of the current and previous switch counters. */
#define TDQ_SWITCHCNT(tdq) (atomic_load_short(&(tdq)->tdq_switchcnt) + \
atomic_load_short(&(tdq)->tdq_oldswitchcnt))
/* Increment tdq_switchcnt with a separate atomic load and atomic store. */
#define TDQ_SWITCHCNT_INC(tdq) (atomic_store_short(&(tdq)->tdq_switchcnt, \
atomic_load_short(&(tdq)->tdq_switchcnt) + 1))
#ifdef SMP
/* Default value of 'affinity', at least one tick. */
#define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000))
/*
 * True while fewer than 't * affinity' ticks have elapsed since the thread's
 * ts_rltick stamp.
 */
#define SCHED_AFFINITY(ts, t) ((u_int)ticks - (ts)->ts_rltick < (t) * affinity)
/* Load balancing and work stealing tunables. */
static int rebalance = 1;
static int balance_interval = 128;
static int __read_mostly affinity;
static int __read_mostly steal_idle = 1;
static int __read_mostly steal_thresh = 2;
static int __read_mostly always_steal = 0;
static int __read_mostly trysteal_limit = 2;
/* Queue designated for balancing; set to the boot CPU's in sched_setup_smp(). */
static struct tdq __read_mostly *balance_tdq;
/* Recomputed by sched_balance() on each run. */
static int balance_ticks;
/* One tdq and one PRNG state word per CPU. */
DPCPU_DEFINE_STATIC(struct tdq, tdq);
DPCPU_DEFINE_STATIC(uint32_t, randomval);
#define TDQ_SELF() ((struct tdq *)PCPU_GET(sched))
#define TDQ_CPU(x) (DPCPU_ID_PTR((x), tdq))
#define TDQ_ID(x) ((x)->tdq_id)
#else
/* Uniprocessor: a single queue, always id 0. */
static struct tdq tdq_cpu;
#define TDQ_ID(x) (0)
#define TDQ_SELF() (&tdq_cpu)
#define TDQ_CPU(x) (&tdq_cpu)
#endif
/*
 * Wrappers around the spin mutex operations for a tdq's lock.
 * TDQ_LOCKPTR() yields the lock as a plain 'struct mtx *'.
 */
#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type))
#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t)))
#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
#define TDQ_TRYLOCK(t) mtx_trylock_spin(TDQ_LOCKPTR((t)))
#define TDQ_TRYLOCK_FLAGS(t, f) mtx_trylock_spin_flags(TDQ_LOCKPTR((t)), (f))
#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t)))
#define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock))
/* Forward declarations: priority and interactivity computation. */
static void sched_setpreempt(int);
static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
static int sched_interact_score(struct thread *);
static void sched_interact_update(struct thread *);
static void sched_interact_fork(struct thread *);
static void sched_pctcpu_update(struct td_sched *, int);
/* Forward declarations: run queue selection and tdq maintenance. */
static inline struct thread *runq_choose_realtime(struct runq *const rq);
static inline struct thread *runq_choose_timeshare(struct runq *const rq,
int off);
static inline struct thread *runq_choose_idle(struct runq *const rq);
static struct thread *tdq_choose(struct tdq *);
static void tdq_setup(struct tdq *, int i);
static void tdq_load_add(struct tdq *, struct thread *);
static void tdq_load_rem(struct tdq *, struct thread *);
static inline void tdq_runq_add(struct tdq *, struct thread *, int);
static inline void tdq_advance_ts_deq_off(struct tdq *, bool);
static inline void tdq_runq_rem(struct tdq *, struct thread *);
static inline int sched_shouldpreempt(int, int, int);
static void tdq_print(int cpu);
static void runq_print(struct runq *rq);
static int tdq_add(struct tdq *, struct thread *, int);
#ifdef SMP
/* Forward declarations: SMP-only migration, stealing and balancing. */
static int tdq_move(struct tdq *, struct tdq *);
static int tdq_idled(struct tdq *);
static void tdq_notify(struct tdq *, int lowpri);
static bool runq_steal_pred(const int idx, struct rq_queue *const q,
void *const data);
static inline struct thread *runq_steal_range(struct runq *const rq,
const int lvl_min, const int lvl_max, int cpu);
static inline struct thread *runq_steal_realtime(struct runq *const rq,
int cpu);
static inline struct thread *runq_steal_timeshare(struct runq *const rq,
int cpu, int off);
static inline struct thread *runq_steal_idle(struct runq *const rq,
int cpu);
static struct thread *tdq_steal(struct tdq *, int);
static int sched_pickcpu(struct thread *, int);
static void sched_balance(void);
static bool sched_balance_pair(struct tdq *, struct tdq *);
static inline struct tdq *sched_setcpu(struct thread *, int, int);
static inline void thread_unblock_switch(struct thread *, struct mtx *);
#endif
/*
 * Dump a run queue to the console.
 *
 * Every status word of the queue bitmap is printed, followed by the threads
 * linked on each queue whose status bit is set.
 */
static void
runq_print(struct runq *rq)
{
	struct rq_queue *queue;
	struct thread *td;
	int word, bit, qidx;

	for (word = 0; word < RQSW_NB; word++) {
		printf("\t\trunq bits %d %#lx\n",
		    word, rq->rq_status.rq_sw[word]);
		for (bit = 0; bit < RQSW_BPW; bit++) {
			/* Nothing to list for queues whose bit is clear. */
			if ((rq->rq_status.rq_sw[word] & (1ul << bit)) == 0)
				continue;
			qidx = RQSW_TO_QUEUE_IDX(word, bit);
			queue = &rq->rq_queues[qidx];
			TAILQ_FOREACH(td, queue, td_runq) {
				printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
				    td, td->td_name, td->td_priority,
				    td->td_rqindex, qidx);
			}
		}
	}
}
/*
 * Print the state of the thread queue of CPU 'cpu': its lock, load and
 * switch counters, timeshare offsets, and finally the run queue contents.
 */
static void __unused
tdq_print(int cpu)
{
	struct tdq *const q = TDQ_CPU(cpu);

	printf("tdq %d:\n", TDQ_ID(q));
	printf("\tlock %p\n", TDQ_LOCKPTR(q));
	printf("\tLock name: %s\n", q->tdq_name);
	printf("\tload: %d\n", q->tdq_load);
	printf("\tswitch cnt: %d\n", q->tdq_switchcnt);
	printf("\told switch cnt: %d\n", q->tdq_oldswitchcnt);
	printf("\tTS insert offset: %d\n", q->tdq_ts_off);
	printf("\tTS dequeue offset: %d\n", q->tdq_ts_deq_off);
	printf("\tload transferable: %d\n", q->tdq_transferable);
	printf("\tlowest priority: %d\n", q->tdq_lowpri);
	printf("\trunq:\n");
	runq_print(&q->tdq_runq);
}
/*
 * Decide whether a thread of priority 'pri' should preempt one running at
 * priority 'cpri'.  'remote' is non-zero when the decision concerns another
 * CPU.  Lower numeric values are better priorities.
 *
 * Returns 1 to preempt, 0 otherwise.
 */
static inline int
sched_shouldpreempt(int pri, int cpri, int remote)
{

	/* A priority that is not strictly better never preempts. */
	if (pri >= cpri)
		return (0);
	/* Anything better preempts an idle-class thread. */
	if (cpri >= PRI_MIN_IDLE)
		return (1);
	/* A zero threshold disables every other form of preemption. */
	if (preempt_thresh == 0)
		return (0);
	/*
	 * Preempt when within the threshold, or, for remote CPUs only, when
	 * an interactive-or-better thread displaces a worse-than-interactive
	 * one.
	 */
	return (pri <= preempt_thresh ||
	    (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT));
}
/*
 * Insert 'td' into the run queue of 'tdq'.
 *
 * The thread is marked as being on a run queue and, if it can migrate, is
 * counted as transferable.  Threads with a batch priority go to the circular
 * timeshare queues; all others are queued directly by priority.
 * The tdq lock must be held.
 */
static inline void
tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
{
	struct td_sched *ts;
	u_char prio, qidx;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);

	prio = td->td_priority;
	ts = td_get_sched(td);
	TD_SET_RUNQ(td);
	if (THREAD_CAN_MIGRATE(td)) {
		tdq->tdq_transferable++;
		ts->ts_flags |= TSF_XFERABLE;
	}

	/* Outside of the batch range the priority selects the queue. */
	if (prio < PRI_MIN_BATCH || prio > PRI_MAX_BATCH) {
		runq_add(&tdq->tdq_runq, td, flags);
		return;
	}

	/*
	 * Batch range: compute the queue's offset inside the timeshare
	 * policy's circular range first.
	 */
	if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) != 0) {
		/* Use the queue currently being dequeued from. */
		qidx = tdq->tdq_ts_deq_off;
	} else {
		qidx = (RQ_PRI_TO_QUEUE_IDX(prio) - RQ_TS_POL_MIN +
		    tdq->tdq_ts_off) % RQ_TS_POL_MODULO;
		/*
		 * Do not land on the dequeue queue while it lags behind the
		 * insertion offset; step back one queue instead.  Adding
		 * the modulus keeps the dividend non-negative.
		 */
		if (tdq->tdq_ts_deq_off != tdq->tdq_ts_off &&
		    qidx == tdq->tdq_ts_deq_off)
			qidx = (qidx - 1 + RQ_TS_POL_MODULO) %
			    RQ_TS_POL_MODULO;
	}
	/* Convert the offset into an absolute queue index. */
	qidx += RQ_TS_POL_MIN;
	runq_add_idx(&tdq->tdq_runq, td, qidx, flags);
}
/*
 * Advance the timeshare dequeue offset towards the insertion offset, past
 * queues that are empty.
 *
 * When 'deq_queue_known_empty' is true the queue at the current dequeue
 * offset is skipped without being checked.  The walk stops at the first
 * non-empty queue or when both offsets are equal.
 */
static inline void
tdq_advance_ts_deq_off(struct tdq *tdq, bool deq_queue_known_empty)
{

	while (tdq->tdq_ts_deq_off != tdq->tdq_ts_off) {
		if (!deq_queue_known_empty &&
		    !runq_is_queue_empty(&tdq->tdq_runq,
		    tdq->tdq_ts_deq_off + RQ_TS_POL_MIN))
			break;
		/* The caller's knowledge only covers the first queue. */
		deq_queue_known_empty = false;
		tdq->tdq_ts_deq_off = (tdq->tdq_ts_deq_off + 1) %
		    RQ_TS_POL_MODULO;
	}
}
/*
 * Remove 'td' from the run queue of 'tdq'.
 *
 * Undoes the transferable accounting done by tdq_runq_add().  If the thread
 * has a batch priority and its removal emptied the queue at the current
 * timeshare dequeue offset, the dequeue offset is advanced.
 * The tdq lock must be held.
 */
static inline void
tdq_runq_rem(struct tdq *tdq, struct thread *td)
{
	struct td_sched *ts;
	bool emptied;

	ts = td_get_sched(td);
	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);

	if ((ts->ts_flags & TSF_XFERABLE) != 0) {
		tdq->tdq_transferable--;
		ts->ts_flags &= ~TSF_XFERABLE;
	}

	emptied = runq_remove(&tdq->tdq_runq, td);
	if (!emptied)
		return;
	if (td->td_priority < PRI_MIN_BATCH ||
	    td->td_priority > PRI_MAX_BATCH)
		return;
	if (tdq->tdq_ts_deq_off + RQ_TS_POL_MIN == td->td_rqindex)
		tdq_advance_ts_deq_off(tdq, true);
}
/*
 * Account 'td' in the load of 'tdq'.  Threads flagged TDF_NOLOAD are left
 * out of tdq_sysload.  The new load is reported through KTR and SDT.
 * The tdq lock must be held.
 */
static void
tdq_load_add(struct tdq *tdq, struct thread *td)
{

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);

	tdq->tdq_load += 1;
	if (!(td->td_flags & TDF_NOLOAD))
		tdq->tdq_sysload += 1;
	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
	SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load);
}
/*
 * Remove 'td' from the load of 'tdq'; the counterpart of tdq_load_add().
 * Asserts that the queue's load is not already zero.
 * The tdq lock must be held.
 */
static void
tdq_load_rem(struct tdq *tdq, struct thread *td)
{

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
	KASSERT(tdq->tdq_load != 0,
	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));

	tdq->tdq_load -= 1;
	if (!(td->td_flags & TDF_NOLOAD))
		tdq->tdq_sysload -= 1;
	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
	SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load);
}
/*
 * Compute the time slice for threads of 'tdq' from its system load.
 *
 * With 'n' being tdq_sysload minus one: the slice is sched_slice for n <= 1,
 * sched_slice / n for larger n, and sched_slice_min once n reaches
 * SCHED_SLICE_MIN_DIVISOR.
 */
static inline u_int
tdq_slice(struct tdq *tdq)
{
	const int others = tdq->tdq_sysload - 1;

	if (others >= SCHED_SLICE_MIN_DIVISOR)
		return (sched_slice_min);
	return (others <= 1 ? sched_slice : sched_slice / others);
}
/*
 * Recompute tdq_lowpri as the numerically lowest priority among the current
 * thread and the best queued thread.
 *
 * 'ctd' is the thread to consider as current; when NULL, tdq_curthread is
 * used.  The tdq lock must be held.
 */
static void
tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
{
	struct thread *next;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	if (ctd == NULL)
		ctd = tdq->tdq_curthread;
	next = tdq_choose(tdq);
	if (next != NULL && next->td_priority <= ctd->td_priority)
		tdq->tdq_lowpri = next->td_priority;
	else
		tdq->tdq_lowpri = ctd->td_priority;
}
#ifdef SMP
/*
 * Cheap per-CPU pseudo-random number generator.
 *
 * Steps the current CPU's 'randomval' state with a linear congruential
 * update and returns its upper 16 bits.
 */
static uint32_t
sched_random(void)
{
	uint32_t *const state = DPCPU_PTR(randomval);

	*state = *state * 69069 + 5;
	return (*state >> 16);
}
/*
 * Search criteria passed to cpu_search_lowest() and cpu_search_highest().
 */
struct cpu_search {
/* CPUs that may be selected. */
cpuset_t *cs_mask;
/* Preferred CPU; gets a load bonus in cpu_search_lowest(). */
int cs_prefer;
/* Non-zero when the thread being placed is currently running. */
int cs_running;
/* cpu_search_lowest(): CPUs whose tdq_lowpri is <= this are rejected. */
int cs_pri;
/* Load limit: maximum for the lowest search, minimum for the highest. */
int cs_load;
/* cpu_search_highest(): minimum number of transferable threads. */
int cs_trans;
};
/*
 * Result of a CPU search.
 */
struct cpu_search_res {
/* Selected CPU, or -1 when no CPU matched. */
int csr_cpu;
/* Scaled (and randomly jittered) load associated with the selected CPU. */
int csr_load;
};
/*
 * Recursively search the CPU group tree rooted at 'cg' for the least loaded
 * CPU matching the criteria in 's'.
 *
 * On return r->csr_cpu is the chosen CPU (-1 if none matched) and
 * r->csr_load its scaled load.  The return value is the total scaled load
 * of the group (each CPU contributes its load * 256, minus 128 for the
 * preferred CPU).
 */
static int
cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
struct cpu_search_res *r)
{
struct cpu_search_res lr;
struct tdq *tdq;
int c, bload, l, load, p, total;
total = 0;
bload = INT_MAX;
r->csr_cpu = -1;
/* Interior node: recurse into every child group. */
if (cg->cg_children > 0) {
for (c = cg->cg_children - 1; c >= 0; c--) {
load = cpu_search_lowest(&cg->cg_child[c], s, &lr);
total += load;
/*
 * For a running thread, penalise CG_FLAG_THREAD child groups whose
 * total has bit 128 set, by adding 128 to the value being compared.
 */
if (__predict_false(s->cs_running) &&
(cg->cg_child[c].cg_flags & CG_FLAG_THREAD) &&
load >= 128 && (load & 128) != 0)
load += 128;
/*
 * Keep the child with the lowest group load; ties are broken by the
 * lower load of the candidate CPU itself.
 */
if (lr.csr_cpu >= 0 && (load < bload ||
(load == bload && lr.csr_load < r->csr_load))) {
bload = load;
r->csr_cpu = lr.csr_cpu;
r->csr_load = lr.csr_load;
}
}
return (total);
}
/* Leaf group: examine each CPU of the group. */
for (c = cg->cg_last; c >= cg->cg_first; c--) {
if (!CPU_ISSET(c, &cg->cg_mask))
continue;
tdq = TDQ_CPU(c);
l = TDQ_LOAD(tdq);
if (c == s->cs_prefer) {
/*
 * NOTE(review): presumably discounts the running thread itself, which
 * is counted in its own CPU's load -- confirm.
 */
if (__predict_false(s->cs_running))
l--;
p = 128;
} else
p = 0;
load = l * 256;
total += load - p;
/*
 * Reject CPUs that are over the load limit, that run something of
 * priority <= cs_pri (unless this is the running thread's own CPU), or
 * that are outside of the allowed mask.
 */
if (l > s->cs_load ||
(atomic_load_char(&tdq->tdq_lowpri) <= s->cs_pri &&
(!s->cs_running || c != s->cs_prefer)) ||
!CPU_ISSET(c, s->cs_mask))
continue;
/* No preference bonus for a running thread on a loaded CPU. */
if (__predict_false(s->cs_running) && l > 0)
p = 0;
/* Random jitter (0..127) so equally loaded CPUs are not always ordered. */
load -= sched_random() % 128;
if (bload > load - p) {
bload = load - p;
r->csr_cpu = c;
r->csr_load = load;
}
}
return (total);
}
/*
 * Recursively search the CPU group tree rooted at 'cg' for the most loaded
 * CPU matching the criteria in 's'.
 *
 * On return r->csr_cpu is the chosen CPU (-1 if none matched) and
 * r->csr_load its scaled load.  The return value is the total scaled load
 * of the group (each CPU contributes its load * 256).
 */
static int
cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s,
struct cpu_search_res *r)
{
struct cpu_search_res lr;
struct tdq *tdq;
int c, bload, l, load, total;
total = 0;
bload = INT_MIN;
r->csr_cpu = -1;
/* Interior node: recurse into every child group. */
if (cg->cg_children > 0) {
for (c = cg->cg_children - 1; c >= 0; c--) {
load = cpu_search_highest(&cg->cg_child[c], s, &lr);
total += load;
/*
 * Keep the child with the highest group load; ties are broken by the
 * higher load of the candidate CPU itself.
 */
if (lr.csr_cpu >= 0 && (load > bload ||
(load == bload && lr.csr_load > r->csr_load))) {
bload = load;
r->csr_cpu = lr.csr_cpu;
r->csr_load = lr.csr_load;
}
}
return (total);
}
/* Leaf group: examine each CPU of the group. */
for (c = cg->cg_last; c >= cg->cg_first; c--) {
if (!CPU_ISSET(c, &cg->cg_mask))
continue;
tdq = TDQ_CPU(c);
l = TDQ_LOAD(tdq);
load = l * 256;
total += load;
/*
 * Reject CPUs below the minimum load, with too few transferable threads,
 * or outside of the allowed mask.
 */
if (l < s->cs_load || TDQ_TRANSFERABLE(tdq) < s->cs_trans ||
!CPU_ISSET(c, s->cs_mask))
continue;
/* Random jitter (0..255) so equally loaded CPUs are not always ordered. */
load -= sched_random() % 256;
if (load > bload) {
bload = load;
r->csr_cpu = c;
}
}
/* INT_MIN when no CPU of this leaf group matched. */
r->csr_load = bload;
return (total);
}
/*
 * Find the least loaded CPU in group 'cg' that is in 'mask', has a load of
 * at most 'maxload' and is not running anything of priority <= 'pri'.
 * 'prefer' is the favoured CPU and 'running' tells whether the thread being
 * placed is currently running.
 *
 * Returns the CPU id, or -1 if no CPU qualifies.
 */
static inline int
sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload,
    int prefer, int running)
{
	struct cpu_search_res res;
	/* cs_trans is not consulted by cpu_search_lowest(). */
	struct cpu_search crit = {
		.cs_mask = mask,
		.cs_prefer = prefer,
		.cs_running = running,
		.cs_pri = pri,
		.cs_load = maxload,
	};

	cpu_search_lowest(cg, &crit, &res);
	return (res.csr_cpu);
}
/*
 * Find the most loaded CPU in group 'cg' that is in 'mask', has a load of
 * at least 'minload' and at least 'mintrans' transferable threads.
 *
 * Returns the CPU id, or -1 if no CPU qualifies.
 */
static inline int
sched_highest(const struct cpu_group *cg, cpuset_t *mask, int minload,
    int mintrans)
{
	struct cpu_search_res res;
	/* Only these three fields are consulted by cpu_search_highest(). */
	struct cpu_search crit = {
		.cs_mask = mask,
		.cs_load = minload,
		.cs_trans = mintrans,
	};

	cpu_search_highest(cg, &crit, &res);
	return (res.csr_cpu);
}
/*
 * Balance load inside CPU group 'cg' by repeatedly pairing the most loaded
 * CPU with a less loaded one and moving one thread between them.
 *
 * 'hmask' holds the CPUs still eligible as donors, 'lmask' those still
 * eligible as receivers for the current donor.
 */
static void
sched_balance_group(struct cpu_group *cg)
{
struct tdq *tdq;
struct thread *td;
cpuset_t hmask, lmask;
int high, low, anylow;
CPU_FILL(&hmask);
for (;;) {
high = sched_highest(cg, &hmask, 1, 0);
/* No donor with any load is left. */
if (high == -1)
break;
CPU_CLR(high, &hmask);
CPU_COPY(&hmask, &lmask);
/* No CPU is left to receive a thread. */
if (CPU_EMPTY(&lmask))
break;
tdq = TDQ_CPU(high);
if (TDQ_LOAD(tdq) == 1) {
/*
 * The donor's load is a single thread.  Rather than moving it, flag its
 * current thread (if it is a migratable non-idle thread locked by this
 * queue) with TDF_PICKCPU and request an AST, using an IPI when the
 * donor is another CPU.  Balancing stops here.
 */
TDQ_LOCK(tdq);
td = tdq->tdq_curthread;
if (td->td_lock == TDQ_LOCKPTR(tdq) &&
(td->td_flags & TDF_IDLETD) == 0 &&
THREAD_CAN_MIGRATE(td)) {
td->td_flags |= TDF_PICKCPU;
ast_sched_locked(td, TDA_SCHED);
if (high != curcpu)
ipi_cpu(high, IPI_AST);
}
TDQ_UNLOCK(tdq);
break;
}
/* 'anylow' stays 1 until a receiver has been tried for this donor. */
anylow = 1;
nextlow:
/* Nothing can be moved away from this donor; try the next one. */
if (TDQ_TRANSFERABLE(tdq) == 0)
continue;
low = sched_lowest(cg, &lmask, -1, TDQ_LOAD(tdq) - 1, high, 1);
/* First attempt found no less loaded CPU at all: balancing is done. */
if (anylow && low == -1)
break;
/* Receivers exhausted for this donor; try the next donor. */
if (low == -1)
continue;
if (sched_balance_pair(tdq, TDQ_CPU(low))) {
/* A CPU that received a thread is not used as a donor afterwards. */
CPU_CLR(low, &hmask);
} else {
/* The move failed: drop this receiver and look for another one. */
CPU_CLR(low, &lmask);
anylow = 0;
goto nextlow;
}
}
}
/*
 * Run one pass of the load balancer over the whole CPU topology.
 *
 * Re-arms balance_ticks with a randomised delay, then drops the current
 * CPU's tdq lock around the call to sched_balance_group() and takes it
 * again before returning.
 *
 * NOTE(review): balance_interval is used as a modulus below; confirm that
 * it can never be set to 0.
 */
static void
sched_balance(void)
{
	struct tdq *const self = TDQ_SELF();

	balance_ticks = max(balance_interval / 2, 1) +
	    (sched_random() % balance_interval);

	TDQ_UNLOCK(self);
	sched_balance_group(cpu_top);
	TDQ_LOCK(self);
}
/*
 * Lock two thread queues.  The queue at the lower address is always locked
 * first so that every caller takes the two locks in the same order; the
 * second acquisition passes MTX_DUPOK.
 */
static void
tdq_lock_pair(struct tdq *one, struct tdq *two)
{
	struct tdq *first, *second;

	if (one < two) {
		first = one;
		second = two;
	} else {
		first = two;
		second = one;
	}
	TDQ_LOCK(first);
	TDQ_LOCK_FLAGS(second, MTX_DUPOK);
}
/*
 * Release the locks of two thread queues, 'one' first and then 'two'.
 */
static void
tdq_unlock_pair(struct tdq *one, struct tdq *two)
{

	TDQ_UNLOCK(one);
	TDQ_UNLOCK(two);
}
static bool
sched_balance_pair(struct tdq *high, struct tdq *low)
{
int cpu, lowpri;
bool ret;
ret = false;
tdq_lock_pair(high, low);
if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load) {
lowpri = tdq_move(high, low);
if (lowpri != -1) {
cpu = TDQ_ID(low);
if (cpu != PCPU_GET(cpuid))
tdq_notify(low, lowpri);
else
sched_setpreempt(low->tdq_lowpri);
ret = true;
}
}
tdq_unlock_pair(high, low);
return (ret);
}
/*
 * Move one thread from queue 'from' to queue 'to'.  Both tdq locks must be
 * held.
 *
 * Returns -1 if 'from' has no thread that may run on the destination CPU,
 * otherwise the value returned by tdq_add() for the destination queue.
 */
static int
tdq_move(struct tdq *from, struct tdq *to)
{
	struct thread *victim;
	int dst_cpu;

	TDQ_LOCK_ASSERT(from, MA_OWNED);
	TDQ_LOCK_ASSERT(to, MA_OWNED);

	dst_cpu = TDQ_ID(to);
	victim = tdq_steal(from, dst_cpu);
	if (victim == NULL)
		return (-1);

	/*
	 * Wait for the thread's lock to be unblocked before taking the
	 * thread off its queue and re-pointing its lock at the destination.
	 */
	thread_lock_block_wait(victim);
	sched_rem(victim);
	THREAD_LOCKPTR_ASSERT(victim, TDQ_LOCKPTR(from));
	victim->td_lock = TDQ_LOCKPTR(to);
	td_get_sched(victim)->ts_cpu = dst_cpu;
	return (tdq_add(to, victim, SRQ_YIELDING));
}
/*
 * Called for an idle CPU's queue 'tdq': try to steal a thread from a loaded
 * CPU, searching outwards through the CPU topology.
 *
 * Returns 1 when stealing is not possible or found nothing, and 0 when the
 * queue has (or got) work.
 */
static int
tdq_idled(struct tdq *tdq)
{
struct cpu_group *cg, *parent;
struct tdq *steal;
cpuset_t mask;
int cpu, switchcnt, goup;
if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL)
return (1);
/* Candidate victims: every CPU but the current one. */
CPU_FILL(&mask);
CPU_CLR(PCPU_GET(cpuid), &mask);
restart:
/* Snapshot used below to detect context switches during the search. */
switchcnt = TDQ_SWITCHCNT(tdq);
for (cg = tdq->tdq_cg, goup = 0; ; ) {
cpu = sched_highest(cg, &mask, steal_thresh, 1);
/* Work arrived on our own queue in the meantime. */
if (TDQ_LOAD(tdq))
return (0);
if (cpu == -1) {
/*
 * Nothing to steal in this group: widen the search.  When the parent
 * has exactly two children, search the sibling group next and set
 * 'goup' so that a later failure moves up one extra level first.
 */
if (goup) {
cg = cg->cg_parent;
goup = 0;
}
parent = cg->cg_parent;
if (parent == NULL)
return (1);
if (parent->cg_children == 2) {
if (cg == &parent->cg_child[0])
cg = &parent->cg_child[1];
else
cg = &parent->cg_child[0];
goup = 1;
} else
cg = parent;
continue;
}
steal = TDQ_CPU(cpu);
/* Unlocked re-check: the search result may already be stale. */
if (TDQ_LOAD(steal) < steal_thresh ||
TDQ_TRANSFERABLE(steal) == 0)
goto restart;
TDQ_LOCK(tdq);
/*
 * We were given work while acquiring our lock: switch to it.
 * NOTE(review): this path returns without an explicit TDQ_UNLOCK();
 * presumably mi_switch() consumes the lock -- confirm.
 */
if (tdq->tdq_load > 0) {
mi_switch(SW_VOL | SWT_IDLE);
return (0);
}
/* Victim's lock is contended: drop this CPU and keep searching. */
if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) {
TDQ_UNLOCK(tdq);
CPU_CLR(cpu, &mask);
continue;
}
/* Locked re-check, also restarting if we switched since the snapshot. */
if (TDQ_LOAD(steal) < steal_thresh ||
TDQ_TRANSFERABLE(steal) == 0 ||
switchcnt != TDQ_SWITCHCNT(tdq)) {
tdq_unlock_pair(tdq, steal);
goto restart;
}
if (tdq_move(steal, tdq) != -1)
break;
/* Nothing on the victim could be moved here: exclude it from now on. */
CPU_CLR(cpu, &mask);
tdq_unlock_pair(tdq, steal);
}
/* A thread was stolen: release the victim and switch to the new work. */
TDQ_UNLOCK(steal);
mi_switch(SW_VOL | SWT_IDLE);
return (0);
}
/*
 * Notify the CPU owning 'tdq' that its queue changed.  'lowpri' is the
 * priority value that is compared against the queue's tdq_lowpri to decide
 * whether a preemption is warranted.  The tdq lock must be held.
 */
static void
tdq_notify(struct tdq *tdq, int lowpri)
{
int cpu;
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
KASSERT(tdq->tdq_lowpri <= lowpri,
("tdq_notify: lowpri %d > tdq_lowpri %d", lowpri, tdq->tdq_lowpri));
/* A preemption request is already pending for this CPU. */
if (tdq->tdq_owepreempt)
return;
if (!sched_shouldpreempt(tdq->tdq_lowpri, lowpri, 1))
return;
/*
 * Full fence before reading tdq_cpu_idle below.
 * NOTE(review): presumably orders the caller's earlier queue/load update
 * against that read -- confirm against the idle loop.
 */
atomic_thread_fence_seq_cst();
cpu = TDQ_ID(tdq);
/*
 * If the target runs its idle thread, no IPI is needed when tdq_cpu_idle
 * is 0 or when cpu_idle_wakeup() reports success.
 */
if (TD_IS_IDLETHREAD(tdq->tdq_curthread) &&
(atomic_load_int(&tdq->tdq_cpu_idle) == 0 || cpu_idle_wakeup(cpu)))
return;
/* Record the pending request, then interrupt the remote CPU. */
tdq->tdq_owepreempt = 1;
ipi_cpu(cpu, IPI_PREEMPT);
}
/*
 * Argument block for runq_steal_pred().
 */
struct runq_steal_pred_data {
/* Output: the thread found, set when the predicate returns true. */
struct thread *td;
/* Input: CPU on which the stolen thread must be allowed to run. */
int cpu;
};
/*
 * Queue predicate for runq_findq(): look in queue 'q' for a thread that can
 * migrate and is allowed to run on the CPU given in 'data' (a
 * struct runq_steal_pred_data).  'idx' is not used.
 *
 * Returns true and stores the thread in the data block when one is found.
 */
static bool
runq_steal_pred(const int idx, struct rq_queue *const q, void *const data)
{
	struct runq_steal_pred_data *const pd = data;
	struct thread *cand;

	TAILQ_FOREACH(cand, q, td_runq) {
		if (!THREAD_CAN_MIGRATE(cand) ||
		    !THREAD_CAN_SCHED(cand, pd->cpu))
			continue;
		pd->td = cand;
		return (true);
	}

	return (false);
}
static inline struct thread *
runq_steal_range(struct runq *const rq, const int lvl_min, const int lvl_max,
int cpu)
{
struct runq_steal_pred_data data = {
.td = NULL,
.cpu = cpu,
};
int idx;
idx = runq_findq(rq, lvl_min, lvl_max, &runq_steal_pred, &data);
if (idx != -1) {
MPASS(data.td != NULL);
return (data.td);
}
MPASS(data.td == NULL);
return (NULL);
}
/*
 * Steal a thread for 'cpu' from the realtime policy's queue range.
 */
static inline struct thread *
runq_steal_realtime(struct runq *const rq, int cpu)
{
	struct thread *td;

	td = runq_steal_range(rq, RQ_RT_POL_MIN, RQ_RT_POL_MAX, cpu);
	return (td);
}
/*
 * Steal a thread for 'cpu' from the timeshare policy's circular queue range,
 * starting at offset 'off' and wrapping around to the queues before it.
 */
static inline struct thread *
runq_steal_timeshare(struct runq *const rq, int cpu, int off)
{
	struct thread *td;

	MPASS(0 <= off && off < RQ_TS_POL_MODULO);

	/* First from the offset to the end of the range... */
	td = runq_steal_range(rq, RQ_TS_POL_MIN + off, RQ_TS_POL_MAX, cpu);
	/* ...then, if needed, the part of the range before the offset. */
	if (td == NULL && off != 0)
		td = runq_steal_range(rq, RQ_TS_POL_MIN,
		    RQ_TS_POL_MIN + off - 1, cpu);
	return (td);
}
/*
 * Steal a thread for 'cpu' from the idle policy's queue range.
 */
static inline struct thread *
runq_steal_idle(struct runq *const rq, int cpu)
{
	struct thread *td;

	td = runq_steal_range(rq, RQ_ID_POL_MIN, RQ_ID_POL_MAX, cpu);
	return (td);
}
/*
 * Pick a thread of 'tdq' that may be moved to 'cpu', trying the realtime,
 * timeshare and idle policies in that order.  The tdq lock must be held.
 *
 * Returns the thread, or NULL if nothing can be stolen.
 */
static struct thread *
tdq_steal(struct tdq *tdq, int cpu)
{
	struct runq *const rq = &tdq->tdq_runq;
	struct thread *td;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	td = runq_steal_realtime(rq, cpu);
	if (td == NULL)
		td = runq_steal_timeshare(rq, cpu, tdq->tdq_ts_deq_off);
	if (td == NULL)
		td = runq_steal_idle(rq, cpu);
	return (td);
}
/*
 * Assign 'td' to CPU 'cpu' and make the thread's lock that CPU's tdq lock.
 * The thread lock must be held on entry.
 *
 * Returns the destination tdq, whose lock is held on return.  With SRQ_HOLD
 * in 'flags' the thread's previous lock is not released.
 */
static inline struct tdq *
sched_setcpu(struct thread *td, int cpu, int flags)
{
struct tdq *tdq;
struct mtx *mtx;
THREAD_LOCK_ASSERT(td, MA_OWNED);
tdq = TDQ_CPU(cpu);
td_get_sched(td)->ts_cpu = cpu;
/* Easy case: the thread is already locked by the destination queue. */
if (td->td_lock == TDQ_LOCKPTR(tdq)) {
KASSERT((flags & SRQ_HOLD) == 0,
("sched_setcpu: Invalid lock for SRQ_HOLD"));
return (tdq);
}
/*
 * Migration: block the thread lock, release the old lock (unless
 * SRQ_HOLD), take the destination queue's lock and install it as the
 * thread's lock.  The sequence runs inside a spinlock section.
 */
spinlock_enter();
mtx = thread_lock_block(td);
if ((flags & SRQ_HOLD) == 0)
mtx_unlock_spin(mtx);
TDQ_LOCK(tdq);
thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
spinlock_exit();
return (tdq);
}
/*
 * Statistics counters incremented by sched_pickcpu(), one per kind of
 * placement decision.
 */
SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
/*
 * Choose the CPU on which thread 'td' should be queued.
 *
 * The decision goes through these steps, in order:
 *  - before SMP is started, use the current CPU;
 *  - with SRQ_OURSELF or for a pinned thread, keep the thread's CPU;
 *  - interrupt-priority threads scheduled from interrupt context prefer
 *    the current CPU;
 *  - keep the last CPU if it is idle and the affinity has not expired;
 *  - otherwise search the closest suitable CPU group, then the whole
 *    topology, for the least loaded CPU;
 *  - finally prefer the current CPU over a busy target when it is not
 *    noticeably more loaded.
 *
 * Returns the selected CPU id.
 */
static int
sched_pickcpu(struct thread *td, int flags)
{
	struct cpu_group *cg, *ccg;
	struct td_sched *ts;
	struct tdq *tdq;
	cpuset_t *mask;
	int cpu, pri, r, self, intr;

	self = PCPU_GET(cpuid);
	ts = td_get_sched(td);
	KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on "
	    "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name));
	if (smp_started == 0)
		return (self);
	/* The thread must stay where it is. */
	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
		return (ts->ts_cpu);
	/*
	 * An interrupt-priority thread woken from interrupt context is
	 * preferably run on the CPU that took the interrupt.
	 */
	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
	    curthread->td_intr_nesting_level) {
		tdq = TDQ_SELF();
		if (tdq->tdq_lowpri >= PRI_MIN_IDLE) {
			SCHED_STAT_INC(pickcpu_idle_affinity);
			return (self);
		}
		ts->ts_cpu = self;
		intr = 1;
		cg = tdq->tdq_cg;
		goto llc;
	} else {
		intr = 0;
		tdq = TDQ_CPU(ts->ts_cpu);
		cg = tdq->tdq_cg;
	}
	/*
	 * Stay on the last CPU if the thread may run there, that CPU is
	 * idle and the affinity window has not expired.  For an SMT group
	 * all of its CPUs must be idle.
	 */
	if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
	    atomic_load_char(&tdq->tdq_lowpri) >= PRI_MIN_IDLE &&
	    SCHED_AFFINITY(ts, CG_SHARE_L2)) {
		if (cg->cg_flags & CG_FLAG_THREAD) {
			for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) {
				pri =
				    atomic_load_char(&TDQ_CPU(cpu)->tdq_lowpri);
				if (CPU_ISSET(cpu, &cg->cg_mask) &&
				    pri < PRI_MIN_IDLE)
					break;
			}
			if (cpu > cg->cg_last) {
				SCHED_STAT_INC(pickcpu_idle_affinity);
				return (ts->ts_cpu);
			}
		} else {
			SCHED_STAT_INC(pickcpu_idle_affinity);
			return (ts->ts_cpu);
		}
	}
llc:
	/*
	 * Walk up the topology from the thread's group and select the
	 * closest group worth searching.  SMT groups, groups with a single
	 * child or a single CPU, groups sharing nothing, and (except for
	 * interrupt threads) groups whose affinity has expired are skipped.
	 */
	for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
		if (cg->cg_flags & CG_FLAG_THREAD)
			continue;
		if (cg->cg_children == 1 || cg->cg_count == 1)
			continue;
		if (cg->cg_level == CG_SHARE_NONE ||
		    (!intr && !SCHED_AFFINITY(ts, cg->cg_level)))
			continue;
		ccg = cg;
		/*
		 * Stop at the first (closest) match.  Continuing the walk
		 * would replace it with the outermost matching ancestor,
		 * which is frequently cpu_top and would thus disable the
		 * local search below.
		 */
		break;
	}
	/* A group covering every CPU is handled by the global search. */
	if (ccg == cpu_top)
		ccg = NULL;
	cpu = -1;
	mask = &td->td_cpuset->cs_mask;
	pri = td->td_priority;
	r = TD_IS_RUNNING(td);
	if (ccg != NULL && intr) {
		/* Interrupt thread: least loaded CPU we can run on now. */
		cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu, r);
		if (cpu >= 0)
			SCHED_STAT_INC(pickcpu_intrbind);
	} else
	if (ccg != NULL) {
		/* Least loaded CPU of the group that is idle. */
		cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE),
		    INT_MAX, ts->ts_cpu, r);
		if (cpu >= 0)
			SCHED_STAT_INC(pickcpu_affinity);
	}
	/* Global search for the least loaded CPU we can run on now. */
	if (cpu < 0) {
		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu, r);
		if (cpu >= 0)
			SCHED_STAT_INC(pickcpu_lowest);
	}
	/* Last resort: the least loaded CPU regardless of priority. */
	if (cpu < 0) {
		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu, r);
		if (cpu >= 0)
			SCHED_STAT_INC(pickcpu_lowest);
	}
	KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu."));
	KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu));
	/*
	 * Prefer the current CPU when the thread would run here right away,
	 * the chosen CPU is not idle and we are not noticeably more loaded.
	 */
	tdq = TDQ_CPU(cpu);
	if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri &&
	    atomic_load_char(&tdq->tdq_lowpri) < PRI_MIN_IDLE &&
	    TDQ_LOAD(TDQ_SELF()) <= TDQ_LOAD(tdq) + 1) {
		SCHED_STAT_INC(pickcpu_local);
		cpu = self;
	}
	if (cpu != ts->ts_cpu)
		SCHED_STAT_INC(pickcpu_migration);
	return (cpu);
}
#endif
/*
 * Return the first thread of the realtime policy's queue range, or NULL.
 */
static inline struct thread *
runq_choose_realtime(struct runq *const rq)
{
	struct thread *td;

	td = runq_first_thread_range(rq, RQ_RT_POL_MIN, RQ_RT_POL_MAX);
	return (td);
}
/*
 * Return the first thread of the timeshare policy's circular queue range,
 * starting at offset 'off' and wrapping around to the queues before it.
 * Returns NULL if the whole range is empty.
 */
static struct thread *
runq_choose_timeshare(struct runq *const rq, int off)
{
	struct thread *td;

	MPASS(0 <= off && off < RQ_TS_POL_MODULO);

	/* First from the offset to the end of the range... */
	td = runq_first_thread_range(rq, RQ_TS_POL_MIN + off, RQ_TS_POL_MAX);
	/* ...then, if needed, the part of the range before the offset. */
	if (td == NULL && off != 0)
		td = runq_first_thread_range(rq, RQ_TS_POL_MIN,
		    RQ_TS_POL_MIN + off - 1);
	return (td);
}
/*
 * Return the first thread of the idle policy's queue range, or NULL.
 */
static inline struct thread *
runq_choose_idle(struct runq *const rq)
{
	struct thread *td;

	td = runq_first_thread_range(rq, RQ_ID_POL_MIN, RQ_ID_POL_MAX);
	return (td);
}
/*
 * Select the next thread to run from 'tdq', trying the realtime, timeshare
 * and idle policies in that order.  The thread is not removed from its
 * queue.  The tdq lock must be held.
 *
 * Returns the thread, or NULL if the run queue is empty.
 */
static struct thread *
tdq_choose(struct tdq *tdq)
{
	struct runq *const rq = &tdq->tdq_runq;
	struct thread *td;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);

	td = runq_choose_realtime(rq);
	if (td != NULL)
		return (td);

	td = runq_choose_timeshare(rq, tdq->tdq_ts_deq_off);
	if (td != NULL) {
		KASSERT(td->td_priority >= PRI_MIN_BATCH,
		    ("tdq_choose: Invalid priority on timeshare queue %d",
		    td->td_priority));
		return (td);
	}

	td = runq_choose_idle(rq);
	if (td != NULL) {
		KASSERT(td->td_priority >= PRI_MIN_IDLE,
		    ("tdq_choose: Invalid priority on idle queue %d",
		    td->td_priority));
	}
	return (td);
}
/*
 * Initialise the thread queue 'tdq' for CPU 'id': its run queue, id, lock
 * (a spin mutex named "sched lock <id>") and, in KTR kernels, the name of
 * its load counter.
 */
static void
tdq_setup(struct tdq *tdq, int id)
{

	if (bootverbose)
		printf("ULE: setup cpu %d\n", id);

	runq_init(&tdq->tdq_runq);
	/* The id must be set before TDQ_ID() is used for the names below. */
	tdq->tdq_id = id;

	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
	    "sched lock %d", (int)TDQ_ID(tdq));
	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN);
#ifdef KTR
	snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
	    "CPU %d load", (int)TDQ_ID(tdq));
#endif
}
#ifdef SMP
/*
 * SMP part of the scheduler setup: initialise the thread queue of every
 * CPU, attach it to its CPU topology group and seed its random state.
 * The current CPU's queue is then published through the per-CPU 'sched'
 * pointer and recorded as balance_tdq.
 */
static void
sched_setup_smp(void)
{
	struct tdq *q;
	int cpu;

	CPU_FOREACH(cpu) {
		q = DPCPU_ID_PTR(cpu, tdq);
		tdq_setup(q, cpu);
		q->tdq_cg = smp_topo_find(cpu_top, cpu);
		if (q->tdq_cg == NULL)
			panic("Can't find cpu group for %d\n", cpu);
		DPCPU_ID_SET(cpu, randomval, cpu * 69069 + 5);
	}
	PCPU_SET(sched, DPCPU_PTR(tdq));
	balance_tdq = TDQ_SELF();
}
#endif
/*
 * Set up the scheduler's thread queues and account thread0 as the thread
 * running on the current CPU.
 */
static void
sched_ule_setup(void)
{
	struct tdq *self;

#ifdef SMP
	sched_setup_smp();
#else
	tdq_setup(TDQ_SELF(), 0);
#endif
	self = TDQ_SELF();

	TDQ_LOCK(self);
	/* thread0's lock must be the tdq lock before its load is added. */
	thread0.td_lock = TDQ_LOCKPTR(self);
	tdq_load_add(self, &thread0);
	self->tdq_curthread = &thread0;
	self->tdq_lowpri = thread0.td_priority;
	TDQ_UNLOCK(self);
}
/*
 * Derive the tick-dependent scheduler parameters from hz and stathz:
 * realstathz, the time slices, hogticks, tickincr and, on SMP, the balance
 * interval and affinity.  sched_idlespinthresh is only computed when it
 * still holds a negative value.
 */
static void
sched_ule_initticks(void)
{
	int incr;

	realstathz = (stathz != 0) ? stathz : hz;
	sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR;
	sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
	    realstathz);

	/* hz ticks per stathz tick, scaled by SCHED_TICK_SHIFT; never 0. */
	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
	tickincr = (incr != 0) ? incr : 1;
#ifdef SMP
	balance_interval = realstathz;
	balance_ticks = balance_interval;
	affinity = SCHED_AFFINITY_DEFAULT;
#endif
	if (sched_idlespinthresh < 0)
		sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz;
}
static int
sched_interact_score(struct thread *td)
{
struct td_sched *ts;
int div;
ts = td_get_sched(td);
if (sched_interact <= SCHED_INTERACT_HALF &&
ts->ts_runtime >= ts->ts_slptime)
return (SCHED_INTERACT_HALF);
if (ts->ts_runtime > ts->ts_slptime) {
div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
return (SCHED_INTERACT_HALF +
(SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
}
if (ts->ts_slptime > ts->ts_runtime) {
div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
return (ts->ts_runtime / div);
}
if (ts->ts_runtime)
return (SCHED_INTERACT_HALF);
return (0);
}
static void
sched_priority(struct thread *td)
{
u_int pri, score;
int nice;
if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
return;
nice = td->td_proc->p_nice;
score = imax(0, sched_interact_score(td) + nice);
if (score < sched_interact) {
pri = PRI_MIN_INTERACT;
pri += (PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) * score /
sched_interact;
KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT,
("sched_priority: invalid interactive priority %u score %u",
pri, score));
} else {
const struct td_sched *const ts = td_get_sched(td);
const u_int run = SCHED_TICK_RUN_SHIFTED(ts);
const u_int run_unshifted __unused = (run +
(1 << SCHED_TICK_SHIFT) / 2) >> SCHED_TICK_SHIFT;
const u_int len = SCHED_TICK_LENGTH(ts);
const u_int nice_pri_off = SCHED_PRI_NICE(nice);
const u_int cpu_pri_off = (((SCHED_PRI_CPU_RANGE - 1) *
run + len / 2) / len + (1 << SCHED_TICK_SHIFT) / 2) >>
SCHED_TICK_SHIFT;
MPASS(cpu_pri_off < SCHED_PRI_CPU_RANGE);
pri = PRI_MIN_BATCH + cpu_pri_off + nice_pri_off;
KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH,
("sched_priority: Invalid computed priority %u: "
"Should be between %u and %u (PRI_MIN_BATCH: %u; "
"Window size (ticks): %u, runtime (shifted ticks): %u,"
"(unshifted ticks): %u => CPU pri off: %u; "
"Nice: %d => nice pri off: %u)",
pri, PRI_MIN_BATCH, PRI_MAX_BATCH, PRI_MIN_BATCH,
len, run, run_unshifted, cpu_pri_off, nice, nice_pri_off));
}
sched_user_prio(td, pri);
return;
}
static void
sched_interact_update(struct thread *td)
{
struct td_sched *ts;
u_int sum;
ts = td_get_sched(td);
sum = ts->ts_runtime + ts->ts_slptime;
if (sum < SCHED_SLP_RUN_MAX)
return;
if (sum > SCHED_SLP_RUN_MAX * 2) {
if (ts->ts_runtime > ts->ts_slptime) {
ts->ts_runtime = SCHED_SLP_RUN_MAX;
ts->ts_slptime = 1;
} else {
ts->ts_slptime = SCHED_SLP_RUN_MAX;
ts->ts_runtime = 1;
}
return;
}
if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
ts->ts_runtime /= 2;
ts->ts_slptime /= 2;
return;
}
ts->ts_runtime = (ts->ts_runtime / 5) * 4;
ts->ts_slptime = (ts->ts_slptime / 5) * 4;
}
/*
 * Scale down the interactivity history of 'td' so that run time plus sleep
 * time is brought back towards SCHED_SLP_RUN_FORK while keeping the ratio
 * between the two.
 */
static void
sched_interact_fork(struct thread *td)
{
	struct td_sched *ts;
	int ratio;
	int sum;

	ts = td_get_sched(td);
	sum = ts->ts_runtime + ts->ts_slptime;
	if (sum <= SCHED_SLP_RUN_FORK)
		return;

	ratio = sum / SCHED_SLP_RUN_FORK;
	ts->ts_runtime /= ratio;
	ts->ts_slptime /= ratio;
}
static void
sched_ule_init(void)
{
struct td_sched *ts0;
ts0 = td_get_sched(&thread0);
ts0->ts_ftick = (u_int)ticks;
ts0->ts_ltick = ts0->ts_ftick;
ts0->ts_slice = 0;
ts0->ts_cpu = curcpu;
}
static void
sched_ule_init_ap(void)
{
#ifdef SMP
PCPU_SET(sched, DPCPU_PTR(tdq));
#endif
PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(TDQ_SELF());
}
/*
 * Return the round-robin interval: sched_slice converted from stathz ticks
 * to hz ticks (rounded to nearest), and at least 1.
 */
static int
sched_ule_rr_interval(void)
{
	const int interval = (sched_slice * hz + realstathz / 2) / realstathz;

	return (imax(1, interval));
}
/*
 * Bring a thread's %CPU accounting window up to the current tick.
 *
 * ts:  scheduler state holding the window; this function reads and writes
 *      ts_ftick (window start), ts_ltick (last update) and ts_ticks
 *      (accumulated run ticks, scaled by SCHED_TICK_SHIFT).
 * run: non-zero if the ticks elapsed since the last update should be
 *      counted as run time.
 */
static void
sched_pctcpu_update(struct td_sched *ts, int run)
{
const u_int t = (u_int)ticks;
u_int t_max = SCHED_TICK_MAX((u_int)hz);
/*
 * Target window length: t_max scaled by SCHED_CPU_DECAY_NUMER /
 * SCHED_CPU_DECAY_DENOM, computed in SCHED_TICK_SHIFT fixed point and then
 * shifted back down.
 */
u_int t_tgt = ((t_max << SCHED_TICK_SHIFT) * SCHED_CPU_DECAY_NUMER /
SCHED_CPU_DECAY_DENOM) >> SCHED_TICK_SHIFT;
/* Ticks elapsed since the last update (unsigned modular subtraction). */
const u_int lu_span = t - ts->ts_ltick;
/*
 * The last update is at least a whole target window old: discard the old
 * history.  The window becomes exactly t_tgt long and is either fully
 * counted as run time or empty.
 */
if (lu_span >= t_tgt) {
ts->ts_ticks = run ? (t_tgt << SCHED_TICK_SHIFT) : 0;
ts->ts_ftick = t - t_tgt;
ts->ts_ltick = t;
return;
}
/*
 * The window has grown to t_max or more: rescale the existing ticks so that
 * they occupy the part of the target window not covered by lu_span, and
 * move the window start so that the window is t_tgt long.
 */
if (t - ts->ts_ftick >= t_max) {
ts->ts_ticks = SCHED_TICK_RUN_SHIFTED(ts) /
SCHED_TICK_LENGTH(ts) * (t_tgt - lu_span);
ts->ts_ftick = t - t_tgt;
}
/* Account the elapsed span as run time when requested. */
if (run)
ts->ts_ticks += lu_span << SCHED_TICK_SHIFT;
ts->ts_ltick = t;
}
/*
 * Change td's active priority (td_priority) to prio.  The thread lock must
 * be held.
 *
 * Trace points are emitted for every call; additional "lend prio" trace
 * points are emitted when td is not curthread and prio is numerically lower
 * than td's current priority.
 */
static void
sched_thread_priority(struct thread *td, u_char prio)
{
struct tdq *tdq;
int oldpri;
KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
"prio:%d", td->td_priority, "new prio:%d", prio,
KTR_ATTR_LINKED, sched_tdname(curthread));
SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
if (td != curthread && prio < td->td_priority) {
KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
"lend prio", "prio:%d", td->td_priority, "new prio:%d",
prio, KTR_ATTR_LINKED, sched_tdname(td));
SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio,
curthread);
}
THREAD_LOCK_ASSERT(td, MA_OWNED);
/* Nothing to do if the priority is unchanged. */
if (td->td_priority == prio)
return;
/*
 * A queued thread whose priority value is being lowered is removed from
 * its run queue and re-added with the new priority.  SRQ_HOLDTD keeps the
 * thread lock held across sched_add().
 */
if (TD_ON_RUNQ(td) && prio < td->td_priority) {
sched_rem(td);
td->td_priority = prio;
sched_add(td, SRQ_BORROWING | SRQ_HOLDTD);
return;
}
/*
 * For a running thread, keep the tdq_lowpri of its CPU's queue in step:
 * take the new value directly when it is lower, otherwise recompute it via
 * tdq_setlowpri() if this thread's old priority was the recorded value.
 */
if (TD_IS_RUNNING(td)) {
tdq = TDQ_CPU(td_get_sched(td)->ts_cpu);
oldpri = td->td_priority;
td->td_priority = prio;
if (prio < tdq->tdq_lowpri)
tdq->tdq_lowpri = prio;
else if (tdq->tdq_lowpri == oldpri)
tdq_setlowpri(tdq, td);
return;
}
/* Any other case: just record the new priority. */
td->td_priority = prio;
}
/*
 * Lend a priority to td: mark the thread as borrowing (TDF_BORROWING) and
 * apply prio through sched_thread_priority().
 */
static void
sched_ule_lend_prio(struct thread *td, u_char prio)
{
td->td_flags |= TDF_BORROWING;
sched_thread_priority(td, prio);
}
/*
 * Update a lent priority.  The thread's own priority is td_user_pri when
 * its base priority lies in the timeshare range and td_base_pri otherwise.
 * If prio is not numerically below that value the loan ends: TDF_BORROWING
 * is cleared and the thread's own priority is restored.  Otherwise prio is
 * lent via sched_lend_prio().
 */
static void
sched_ule_unlend_prio(struct thread *td, u_char prio)
{
	u_char own_pri;
	bool timeshare;

	timeshare = td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE;
	own_pri = timeshare ? td->td_user_pri : td->td_base_pri;

	if (prio < own_pri) {
		sched_lend_prio(td, prio);
		return;
	}

	td->td_flags &= ~TDF_BORROWING;
	sched_thread_priority(td, own_pri);
}
/*
 * Set the base priority of td and, unless a borrowed priority numerically
 * lower than prio is in effect, its active priority too.  When the thread
 * is blocked on a lock and its priority actually changed, the turnstile is
 * told about the previous value.
 */
static void
sched_ule_prio(struct thread *td, u_char prio)
{
	u_char prev;

	/* The base priority is always updated. */
	td->td_base_pri = prio;

	/* Keep a borrowed priority that is numerically lower than prio. */
	if ((td->td_flags & TDF_BORROWING) != 0 && td->td_priority < prio)
		return;

	prev = td->td_priority;
	sched_thread_priority(td, prio);

	if (TD_ON_LOCK(td) && prev != prio)
		turnstile_adjust(td, prev);
}
/*
 * Set the priority of an interrupt thread.  The thread lock must be held
 * and td must belong to the PRI_ITHD class.  The value is recorded in
 * td_base_ithread_pri and then applied with sched_prio().
 */
static void
sched_ule_ithread_prio(struct thread *td, u_char prio)
{
THREAD_LOCK_ASSERT(td, MA_OWNED);
MPASS(td->td_pri_class == PRI_ITHD);
td->td_base_ithread_pri = prio;
sched_prio(td, prio);
}
/*
 * Record prio as the thread's base user priority.  The effective user
 * priority (td_user_pri) follows only when prio is numerically below the
 * currently lent user priority.
 */
static void
sched_ule_user_prio(struct thread *td, u_char prio)
{
	td->td_base_user_pri = prio;
	if (prio < td->td_lend_user_pri)
		td->td_user_pri = prio;
}
/*
 * Lend a user priority to td.  The thread lock must be held.
 *
 * The effective user priority becomes the numerically smaller of prio and
 * the base user priority.  If the active priority is numerically above the
 * result it is changed immediately via sched_prio(); if it merely differs,
 * a TDA_SCHED AST is requested instead.
 */
static void
sched_ule_lend_user_prio(struct thread *td, u_char prio)
{
	u_char effective;

	THREAD_LOCK_ASSERT(td, MA_OWNED);

	td->td_lend_user_pri = prio;
	effective = min(prio, td->td_base_user_pri);
	td->td_user_pri = effective;

	if (td->td_priority > effective) {
		sched_prio(td, effective);
		return;
	}
	if (td->td_priority != effective)
		ast_sched_locked(td, TDA_SCHED);
}
/*
 * Variant of sched_lend_user_prio() for callers that do not hold the thread
 * lock.  The lent value is compared without the lock first; the lock is
 * taken and the update performed only when the value differs.
 */
static void
sched_ule_lend_user_prio_cond(struct thread *td, u_char prio)
{
	if (td->td_lend_user_pri != prio) {
		thread_lock(td);
		sched_lend_user_prio(td, prio);
		thread_unlock(td);
	}
}
#ifdef SMP
/*
 * Try to move one thread from another CPU's run queue onto tdq.
 *
 * Entered with tdq locked.  The lock is dropped while searching and is held
 * again on every path out of the loop, so tdq is locked on return.  The
 * function returns immediately when SMP has not started, when steal_idle or
 * trysteal_limit is zero, or when tdq has no CPU group.
 */
static void
tdq_trysteal(struct tdq *tdq)
{
struct cpu_group *cg, *parent;
struct tdq *steal;
cpuset_t mask;
int cpu, i, goup;
if (smp_started == 0 || steal_idle == 0 || trysteal_limit == 0 ||
tdq->tdq_cg == NULL)
return;
/* Candidate victims: every CPU except the current one. */
CPU_FILL(&mask);
CPU_CLR(PCPU_GET(cpuid), &mask);
/*
 * spinlock_enter() is taken before the queue lock is released and is only
 * undone by spinlock_exit() at the end of the function.
 * NOTE(review): presumably this keeps the CPU from being preempted while
 * no run-queue lock is held -- confirm.
 */
spinlock_enter();
TDQ_UNLOCK(tdq);
/*
 * i counts the groups searched (bounded by trysteal_limit); cg is the
 * group being searched; goup is set when cg is a sibling group, so the
 * next escalation must first step up to the shared parent.
 */
for (i = 1, cg = tdq->tdq_cg, goup = 0; ; ) {
cpu = sched_highest(cg, &mask, steal_thresh, 1);
/* Work showed up on our own queue meanwhile: stop searching. */
if (TDQ_LOAD(tdq) > 0) {
TDQ_LOCK(tdq);
break;
}
/*
 * No candidate in this group.  Widen the search: when the parent has
 * exactly two children, search the other child next (and remember to go
 * up afterwards); otherwise search the parent itself.  Give up at the
 * top of the tree or once trysteal_limit groups have been tried.
 */
if (cpu == -1) {
if (goup) {
cg = cg->cg_parent;
goup = 0;
}
if (++i > trysteal_limit) {
TDQ_LOCK(tdq);
break;
}
parent = cg->cg_parent;
if (parent == NULL) {
TDQ_LOCK(tdq);
break;
}
if (parent->cg_children == 2) {
if (cg == &parent->cg_child[0])
cg = &parent->cg_child[1];
else
cg = &parent->cg_child[0];
goup = 1;
} else
cg = parent;
continue;
}
steal = TDQ_CPU(cpu);
/*
 * Unlocked re-check of the candidate; if it no longer qualifies, run
 * the search again on the same group.
 */
if (TDQ_LOAD(steal) < steal_thresh ||
TDQ_TRANSFERABLE(steal) == 0)
continue;
/*
 * Re-take our own lock first.  From here on every path leaves the loop
 * with tdq locked.
 */
TDQ_LOCK(tdq);
if (tdq->tdq_load > 0)
break;
/* Only try-lock the victim; give up if the lock is not available. */
if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0)
break;
/* Check the victim once more now that its queue is locked. */
if (TDQ_LOAD(steal) < steal_thresh ||
TDQ_TRANSFERABLE(steal) == 0) {
TDQ_UNLOCK(steal);
break;
}
/* tdq_move() returning -1 means nothing was moved. */
if (tdq_move(steal, tdq) == -1) {
TDQ_UNLOCK(steal);
break;
}
TDQ_UNLOCK(steal);
break;
}
spinlock_exit();
}
#endif
/*
 * Hand a thread that is being switched out over to the run queue of the CPU
 * recorded in its ts_cpu.
 *
 * tdq:   the queue the thread is leaving; locked on entry and on return.
 * td:    the thread to move; it must be migratable or bound (TSF_BOUND).
 * flags: SRQ_* flags forwarded to tdq_add().
 *
 * Returns the lock pointer of the destination queue.  Without SMP no queue
 * manipulation is done and only the lock pointer is returned.
 */
static struct mtx *
sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
{
struct tdq *tdn;
#ifdef SMP
int lowpri;
#endif
KASSERT(THREAD_CAN_MIGRATE(td) ||
(td_get_sched(td)->ts_flags & TSF_BOUND) != 0,
("Thread %p shouldn't migrate", td));
KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: "
"thread %s queued on absent CPU %d.", td->td_name,
td_get_sched(td)->ts_cpu));
tdn = TDQ_CPU(td_get_sched(td)->ts_cpu);
#ifdef SMP
tdq_load_rem(tdq, td);
/*
 * The two queue locks are never held together: the source lock is dropped
 * before the destination lock is taken, and the source lock is re-taken
 * only after the destination lock has been released.
 */
TDQ_UNLOCK(tdq);
TDQ_LOCK(tdn);
lowpri = tdq_add(tdn, td, flags);
/* Let the destination CPU know about the newly queued thread. */
tdq_notify(tdn, lowpri);
TDQ_UNLOCK(tdn);
TDQ_LOCK(tdq);
#endif
return (TDQ_LOCKPTR(tdn));
}
/*
 * Publish mtx as td's thread lock using a store with release semantics.
 */
static inline void
thread_unblock_switch(struct thread *td, struct mtx *mtx)
{
atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
(uintptr_t)mtx);
}
/*
 * Switch the current CPU away from td.
 *
 * td:    the thread being switched out; its thread lock must be held.
 * flags: SW_* flags from the caller; only SW_PREEMPT is examined here.
 *
 * The thread's CPU accounting is updated, td is requeued, migrated or
 * removed from the load depending on its state, the next thread is picked
 * with choosethread(), and cpu_switch() is called if that thread is not td
 * itself.
 */
static void
sched_ule_sswitch(struct thread *td, int flags)
{
struct thread *newtd;
struct tdq *tdq;
struct td_sched *ts;
struct mtx *mtx;
int srqflag;
int cpuid, preempted;
#ifdef SMP
int pickcpu;
#endif
THREAD_LOCK_ASSERT(td, MA_OWNED);
cpuid = PCPU_GET(cpuid);
tdq = TDQ_SELF();
ts = td_get_sched(td);
/* Account the time up to now as run time. */
sched_pctcpu_update(ts, 1);
#ifdef SMP
/*
 * Record when the thread last ran.  With TDF_PICKCPU set the stamp is
 * back-dated by affinity * MAX_CACHE_LEVELS ticks.
 * NOTE(review): presumably so that the affinity for this CPU is treated as
 * expired when a CPU is picked -- confirm against sched_pickcpu().
 */
pickcpu = (td->td_flags & TDF_PICKCPU) != 0;
if (pickcpu)
ts->ts_rltick = (u_int)ticks - affinity * MAX_CACHE_LEVELS;
else
ts->ts_rltick = (u_int)ticks;
#endif
td->td_lastcpu = td->td_oncpu;
/*
 * Treated as a preemption only if the caller passed SW_PREEMPT and the
 * thread's slice had not ended (TDF_SLICEEND clear).
 */
preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
(flags & SW_PREEMPT) != 0;
/* Clear the pending reschedule and preemption requests. */
td->td_flags &= ~(TDF_PICKCPU | TDF_SLICEEND);
ast_unsched_locked(td, TDA_SCHED);
td->td_owepreempt = 0;
atomic_store_char(&tdq->tdq_owepreempt, 0);
/* Switches away from the idle thread are not counted. */
if (!TD_IS_IDLETHREAD(td))
TDQ_SWITCHCNT_INC(tdq);
/*
 * Block the thread lock; mtx is the lock that was in place.  It is handed
 * to cpu_switch() or thread_unblock_switch() below.
 */
mtx = thread_lock_block(td);
spinlock_enter();
if (TD_IS_IDLETHREAD(td)) {
MPASS(mtx == TDQ_LOCKPTR(tdq));
TD_SET_CAN_RUN(td);
} else if (TD_IS_RUNNING(td)) {
/*
 * The thread is still runnable: put it back on a run queue, either this
 * CPU's or, after possibly picking a new CPU, another one's.
 */
MPASS(mtx == TDQ_LOCKPTR(tdq));
srqflag = SRQ_OURSELF | SRQ_YIELDING |
(preempted ? SRQ_PREEMPTED : 0);
#ifdef SMP
if (THREAD_CAN_MIGRATE(td) && (!THREAD_CAN_SCHED(td, ts->ts_cpu)
|| pickcpu))
ts->ts_cpu = sched_pickcpu(td, 0);
#endif
if (ts->ts_cpu == cpuid)
tdq_runq_add(tdq, td, srqflag);
else
mtx = sched_switch_migrate(tdq, td, srqflag);
} else {
/*
 * The thread is neither idle nor running.  Make sure this CPU's queue
 * lock is the one held, drop the thread from the load and, under SMP,
 * try to steal work if the queue became empty.
 */
if (mtx != TDQ_LOCKPTR(tdq)) {
mtx_unlock_spin(mtx);
TDQ_LOCK(tdq);
}
tdq_load_rem(tdq, td);
#ifdef SMP
if (tdq->tdq_load == 0)
tdq_trysteal(tdq);
#endif
}
#if (KTR_COMPILE & KTR_SCHED) != 0
if (TD_IS_IDLETHREAD(td))
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
"prio:%d", td->td_priority);
else
KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
"prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
"lockname:\"%s\"", td->td_lockname);
#endif
/* This CPU's queue lock is held, unrecursed, at this point. */
TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
MPASS(td == tdq->tdq_curthread);
newtd = choosethread();
sched_pctcpu_update(td_get_sched(newtd), 0);
TDQ_UNLOCK(tdq);
/* Only perform a context switch if a different thread was chosen. */
if (td != newtd) {
#ifdef HWPMC_HOOKS
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
#ifdef KDTRACE_HOOKS
if (dtrace_vtime_active)
(*dtrace_vtime_switch_func)(newtd);
#endif
#ifdef HWT_HOOKS
HWT_CALL_HOOK(td, HWT_SWITCH_OUT, NULL);
HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);
#endif
td->td_oncpu = NOCPU;
cpu_switch(td, newtd, mtx);
/*
 * Execution continues here when td is switched back in; the CPU id is
 * re-read at that point.
 */
cpuid = td->td_oncpu = PCPU_GET(cpuid);
SDT_PROBE0(sched, , , on__cpu);
#ifdef HWPMC_HOOKS
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
} else {
/* Same thread keeps running: just restore its thread lock. */
thread_unblock_switch(td, mtx);
SDT_PROBE0(sched, , , remain__cpu);
}
KASSERT(curthread->td_md.md_spinlock_count == 1,
("invalid count %d", curthread->td_md.md_spinlock_count));
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
"prio:%d", td->td_priority);
}
/*
 * Set the nice value of process p and refresh the priority of each of its
 * threads.  The process lock must be held.  For every thread the priority
 * is recomputed with sched_priority() and then applied from
 * td_base_user_pri, all under that thread's lock.
 */
static void
sched_ule_nice(struct proc *p, int nice)
{
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_nice = nice;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
sched_priority(td);
sched_prio(td, td->td_base_user_pri);
thread_unlock(td);
}
}
/*
 * Note that td is going to sleep.  The thread lock must be held.
 *
 * The current tick is stored in td_slptick.  For timeshare threads the
 * priority may additionally be changed according to static_boost: when it
 * is 1 and prio is non-zero, prio is applied; when it has any other
 * non-zero value and the thread's priority is numerically above it,
 * static_boost itself is applied.
 */
static void
sched_ule_sleep(struct thread *td, int prio)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);

	td->td_slptick = ticks;

	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) {
		if (static_boost == 1 && prio != 0)
			sched_prio(td, prio);
		else if (static_boost != 0 && td->td_priority > static_boost)
			sched_prio(td, static_boost);
	}
}
/*
 * Make a sleeping thread runnable again.  The thread lock must be held.
 * srqflags is passed on to sched_add() together with SRQ_BORING.
 */
static void
sched_ule_wakeup(struct thread *td, int srqflags)
{
struct td_sched *ts;
int slptick;
THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td_get_sched(td);
/* Consume the tick recorded when the thread went to sleep. */
slptick = td->td_slptick;
td->td_slptick = 0;
/*
 * If a sleep start was recorded and the tick counter has moved since,
 * credit the elapsed ticks (scaled by SCHED_TICK_SHIFT) as sleep time and
 * refresh the interactivity history and the %CPU window.
 */
if (slptick && slptick != ticks) {
ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT;
sched_interact_update(td);
sched_pctcpu_update(ts, 0);
}
/* An interrupt thread gets its recorded base ithread priority back. */
if (PRI_BASE(td->td_pri_class) == PRI_ITHD &&
td->td_priority != td->td_base_ithread_pri)
sched_prio(td, td->td_base_ithread_pri);
/* Start with a fresh slice. */
ts->ts_slice = 0;
sched_add(td, SRQ_BORING | srqflags);
}
/*
 * Scheduler work for a fork.  The parent's thread lock must be held.
 *
 * The parent's %CPU window is updated, the child is initialised through
 * sched_fork_thread(), the child's inherited interactivity history is
 * scaled down and its priority computed.  The parent is then charged
 * tickincr of run time and has its history and priority recomputed.
 */
static void
sched_ule_fork(struct thread *td, struct thread *child)
{
THREAD_LOCK_ASSERT(td, MA_OWNED);
sched_pctcpu_update(td_get_sched(td), 1);
sched_fork_thread(td, child);
sched_interact_fork(child);
sched_priority(child);
td_get_sched(td)->ts_runtime += tickincr;
sched_interact_update(td);
sched_priority(td);
}
/*
 * Initialise the scheduler state of a new thread from its creator.  The
 * creator's thread lock must be held.
 */
static void
sched_ule_fork_thread(struct thread *td, struct thread *child)
{
struct td_sched *ts;
struct td_sched *ts2;
struct tdq *tdq;
tdq = TDQ_SELF();
THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td_get_sched(td);
ts2 = td_get_sched(child);
/* The child is not on any CPU yet; it uses the current CPU's queue lock. */
child->td_oncpu = NOCPU;
child->td_lastcpu = NOCPU;
child->td_lock = TDQ_LOCKPTR(tdq);
/* Share the creator's cpuset (taking a reference) and domain policy. */
child->td_cpuset = cpuset_ref(td->td_cpuset);
child->td_domain.dr_policy = td->td_cpuset->cs_domain;
ts2->ts_cpu = ts->ts_cpu;
ts2->ts_flags = 0;
/* Copy the creator's %CPU window. */
ts2->ts_ticks = ts->ts_ticks;
ts2->ts_ltick = ts->ts_ltick;
ts2->ts_ftick = ts->ts_ftick;
/* The child starts at its own base priority. */
child->td_priority = child->td_base_pri;
/* Copy the creator's interactivity history. */
ts2->ts_slptime = ts->ts_slptime;
ts2->ts_runtime = ts->ts_runtime;
/* Start with all but sched_slice_min of the slice already consumed. */
ts2->ts_slice = tdq_slice(tdq) - sched_slice_min;
#ifdef KTR
/* Empty the cached trace name. */
bzero(ts2->ts_name, sizeof(ts2->ts_name));
#endif
}
/*
 * Set the priority class of td.  The thread lock must be held.  The field
 * is written only when the class actually changes.
 */
static void
sched_ule_class(struct thread *td, int class)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);

	if (td->td_pri_class != class)
		td->td_pri_class = class;
}
/*
 * Process-exit hook.  The lock of process p must be held.  The exiting
 * thread "child" is accounted against the first thread of p by way of
 * sched_exit_thread().
 */
static void
sched_ule_exit(struct proc *p, struct thread *child)
{
struct thread *td;
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
"prio:%d", child->td_priority);
PROC_LOCK_ASSERT(p, MA_OWNED);
td = FIRST_THREAD_IN_PROC(p);
sched_exit_thread(td, child);
}
/*
 * Thread-exit hook.  The run time accumulated by "child" is added to td's
 * run time (its sleep time is not transferred), after which td's
 * interactivity history and priority are recomputed.  td's thread lock is
 * taken and released here.
 */
static void
sched_ule_exit_thread(struct thread *td, struct thread *child)
{
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
"prio:%d", child->td_priority);
thread_lock(td);
td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime;
sched_interact_update(td);
sched_priority(td);
thread_unlock(td);
}
/*
 * Handle a preemption request for td on the current CPU.
 *
 * With the thread lock held, td's priority is compared with the queue's
 * tdq_lowpri.  If td's priority value is higher, the thread either switches
 * immediately (when td_critnest is 1) or is marked as owing a preemption.
 * Otherwise the queue's tdq_owepreempt flag is cleared.
 */
static void
sched_ule_preempt(struct thread *td)
{
struct tdq *tdq;
int flags;
SDT_PROBE2(sched, , , surrender, td, td->td_proc);
thread_lock(td);
tdq = TDQ_SELF();
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
if (td->td_priority > tdq->tdq_lowpri) {
if (td->td_critnest == 1) {
flags = SW_INVOL | SW_PREEMPT;
flags |= TD_IS_IDLETHREAD(td) ? SWT_REMOTEWAKEIDLE :
SWT_REMOTEPREEMPT;
mi_switch(flags);
/* No thread_unlock() on this path, unlike the paths below. */
return;
}
/* Cannot switch at this nesting level; defer the preemption. */
td->td_owepreempt = 1;
} else {
tdq->tdq_owepreempt = 0;
}
thread_unlock(td);
}
/*
 * Under the thread lock, set both the active and the base priority of td
 * to its user priority, then recompute the current CPU queue's lowpri via
 * tdq_setlowpri().
 */
static void
sched_ule_userret_slowpath(struct thread *td)
{
thread_lock(td);
td->td_priority = td->td_user_pri;
td->td_base_pri = td->td_user_pri;
tdq_setlowpri(TDQ_SELF(), td);
thread_unlock(td);
}
/*
 * Return the slice length that applies to td on queue tdq: sched_slice for
 * interrupt threads, tdq_slice(tdq) for every other thread.
 */
static inline u_int
td_slice(struct thread *td, struct tdq *tdq)
{
	return (PRI_BASE(td->td_pri_class) == PRI_ITHD ?
	    sched_slice : tdq_slice(tdq));
}
/*
 * Per-tick scheduler accounting for the thread running on this CPU.
 *
 * td:  the running thread; its thread lock must be held.
 * cnt: number of ticks to account for.
 */
static void
sched_ule_clock(struct thread *td, int cnt)
{
struct tdq *tdq;
struct td_sched *ts;
THREAD_LOCK_ASSERT(td, MA_OWNED);
tdq = TDQ_SELF();
#ifdef SMP
/*
 * On the queue designated as balance_tdq, count down balance_ticks and run
 * sched_balance() when it reaches zero or below.
 */
if (balance_tdq == tdq && smp_started != 0 && rebalance != 0 &&
balance_ticks != 0) {
balance_ticks -= cnt;
if (balance_ticks <= 0)
sched_balance();
}
#endif
/* Keep the previous switch count and seed the new one with the load. */
tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
tdq->tdq_switchcnt = tdq->tdq_load;
/*
 * When the timeshare insert offset equals the dequeue offset, advance the
 * insert offset by 2 * cnt minus one for every four ticks accumulated in
 * tdq_ts_ticks (modulo RQ_TS_POL_MODULO), keep the remainder of the tick
 * count, and let tdq_advance_ts_deq_off() update the dequeue offset.
 */
if (tdq->tdq_ts_off == tdq->tdq_ts_deq_off) {
tdq->tdq_ts_ticks += cnt;
tdq->tdq_ts_off = (tdq->tdq_ts_off + 2 * cnt -
tdq-> tdq_ts_ticks / 4) % RQ_TS_POL_MODULO;
tdq->tdq_ts_ticks %= 4;
tdq_advance_ts_deq_off(tdq, false);
}
ts = td_get_sched(td);
sched_pctcpu_update(ts, 1);
/* FIFO-class threads and the idle thread have no slice accounting. */
if ((td->td_pri_class & PRI_FIFO_BIT) || TD_IS_IDLETHREAD(td))
return;
/* Timeshare threads are charged run time and get a new priority. */
if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) {
td_get_sched(td)->ts_runtime += tickincr * cnt;
sched_interact_update(td);
sched_priority(td);
}
/* Slice accounting: act once the thread has used up its slice. */
ts->ts_slice += cnt;
if (ts->ts_slice >= td_slice(td, tdq)) {
ts->ts_slice = 0;
/*
 * An interrupt thread is marked for preemption and, while its base
 * priority plus RQ_PPQ stays below PRI_MAX_ITHD, moved to that priority.
 * Other threads get a TDA_SCHED AST and TDF_SLICEEND.
 */
if (PRI_BASE(td->td_pri_class) == PRI_ITHD) {
SCHED_STAT_INC(ithread_preemptions);
td->td_owepreempt = 1;
if (td->td_base_pri + RQ_PPQ < PRI_MAX_ITHD) {
SCHED_STAT_INC(ithread_demotions);
sched_prio(td, td->td_base_pri + RQ_PPQ);
}
} else {
ast_sched_locked(td, TDA_SCHED);
td->td_flags |= TDF_SLICEEND;
}
}
}
/*
 * estcpu hook.  This implementation always returns 0; the thread argument
 * is unused.
 */
static u_int
sched_ule_estcpu(struct thread *td __unused)
{
return (0);
}
/*
 * Report whether the current CPU's queue holds more load than the calling
 * thread accounts for: more than 0 when called from the idle thread, more
 * than 1 otherwise.
 */
static bool
sched_ule_runnable(void)
{
	int own_load;

	own_load = TD_IS_IDLETHREAD(curthread) ? 0 : 1;
	return (TDQ_LOAD(TDQ_SELF()) > own_load);
}
/*
 * Pick the next thread to run on the current CPU.  The CPU's queue lock
 * must be held.
 *
 * If tdq_choose() yields a thread it is taken off the run queue and its
 * priority becomes the queue's lowpri; otherwise the CPU's idle thread is
 * selected and lowpri is set to PRI_MAX_IDLE.  The selection is recorded in
 * tdq_curthread and returned.
 */
static struct thread *
sched_ule_choose(void)
{
	struct tdq *self;
	struct thread *next;

	self = TDQ_SELF();
	TDQ_LOCK_ASSERT(self, MA_OWNED);

	next = tdq_choose(self);
	if (next == NULL) {
		self->tdq_lowpri = PRI_MAX_IDLE;
		next = PCPU_GET(idlethread);
	} else {
		tdq_runq_rem(self, next);
		self->tdq_lowpri = next->td_priority;
	}
	self->tdq_curthread = next;
	return (next);
}
/*
 * Decide whether curthread should give way to a thread of priority pri
 * that has just been queued on this CPU.  curthread's lock must be held.
 *
 * Nothing happens unless pri is numerically below curthread's priority.
 * In that case a TDA_SCHED AST is always requested; td_owepreempt is set
 * as well unless the kernel has panicked, the system is still cold,
 * curthread is inhibited, or sched_shouldpreempt() declines.
 */
static void
sched_setpreempt(int pri)
{
	struct thread *cur;
	int curpri;

	cur = curthread;
	THREAD_LOCK_ASSERT(cur, MA_OWNED);

	curpri = cur->td_priority;
	if (pri >= curpri)
		return;

	ast_sched_locked(cur, TDA_SCHED);

	if (KERNEL_PANICKED() || cold || TD_IS_INHIBITED(cur))
		return;
	if (sched_shouldpreempt(pri, curpri, 0))
		cur->td_owepreempt = 1;
}
/*
 * Add a runnable thread to queue tdq and account for it in the load.
 *
 * The queue lock must be held.  td must not be inhibited, must be in the
 * can-run or running state, and must have TDF_INMEM set.  flags is
 * forwarded to tdq_runq_add().
 *
 * Returns the queue's lowpri value as it was before the thread was added;
 * the stored lowpri is lowered to td's priority when that is smaller.
 */
static int
tdq_add(struct tdq *tdq, struct thread *td, int flags)
{
int lowpri;
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
KASSERT((td->td_inhibitors == 0),
("sched_add: trying to run inhibited thread"));
KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
("sched_add: bad thread state"));
KASSERT(td->td_flags & TDF_INMEM,
("sched_add: thread swapped out"));
lowpri = tdq->tdq_lowpri;
if (td->td_priority < lowpri)
tdq->tdq_lowpri = td->td_priority;
tdq_runq_add(tdq, td, flags);
tdq_load_add(tdq, td);
return (lowpri);
}
/*
 * Make td runnable by placing it on a run queue.
 *
 * td:    the thread to queue; its thread lock must be held.
 * flags: SRQ_* flags.  SRQ_YIELDING suppresses the local preemption check,
 *        SRQ_HOLDTD keeps the thread lock held on return (otherwise it is
 *        released), and SRQ_HOLD is examined in the non-SMP path.
 */
static void
sched_ule_add(struct thread *td, int flags)
{
struct tdq *tdq;
#ifdef SMP
int cpu, lowpri;
#endif
KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
"prio:%d", td->td_priority, KTR_ATTR_LINKED,
sched_tdname(curthread));
KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
KTR_ATTR_LINKED, sched_tdname(td));
SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
flags & SRQ_PREEMPTED);
THREAD_LOCK_ASSERT(td, MA_OWNED);
/* Timeshare threads get their priority recomputed before queueing. */
if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
sched_priority(td);
#ifdef SMP
/*
 * Pick the target CPU and queue the thread there.  A remote CPU is
 * notified; for the local CPU the preemption check is made unless the
 * caller is yielding.
 */
cpu = sched_pickcpu(td, flags);
tdq = sched_setcpu(td, cpu, flags);
lowpri = tdq_add(tdq, td, flags);
if (cpu != PCPU_GET(cpuid))
tdq_notify(tdq, lowpri);
else if (!(flags & SRQ_YIELDING))
sched_setpreempt(td->td_priority);
#else
tdq = TDQ_SELF();
/*
 * Switch the thread over to the queue's lock if it is not using it yet.
 * With SRQ_HOLD the lock pointer is only stored; otherwise
 * thread_lock_set() performs the change.
 */
if (td->td_lock != TDQ_LOCKPTR(tdq)) {
TDQ_LOCK(tdq);
if ((flags & SRQ_HOLD) != 0)
td->td_lock = TDQ_LOCKPTR(tdq);
else
thread_lock_set(td, TDQ_LOCKPTR(tdq));
}
(void)tdq_add(tdq, td, flags);
if (!(flags & SRQ_YIELDING))
sched_setpreempt(td->td_priority);
#endif
if (!(flags & SRQ_HOLDTD))
thread_unlock(td);
}
/*
 * Remove a queued thread from its run queue.
 *
 * The queue is the one of the CPU recorded in td's ts_cpu; its lock must be
 * held and must be td's thread lock, and td must be on a run queue.  The
 * thread is taken off the queue and out of the load and set to the can-run
 * state.  If its priority equalled the queue's lowpri, lowpri is
 * recomputed.
 */
static void
sched_ule_rem(struct thread *td)
{
struct tdq *tdq;
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
"prio:%d", td->td_priority);
SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
tdq = TDQ_CPU(td_get_sched(td)->ts_cpu);
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
KASSERT(TD_ON_RUNQ(td),
("sched_rem: thread not on run queue"));
tdq_runq_rem(tdq, td);
tdq_load_rem(tdq, td);
TD_SET_CAN_RUN(td);
if (td->td_priority == tdq->tdq_lowpri)
tdq_setlowpri(tdq, NULL);
}
/*
 * Return td's recent CPU usage as a fixpt_t value.  The thread lock must be
 * held.
 *
 * The accounting window is first brought up to date.  The run ticks are
 * then converted from SCHED_TICK_SHIFT scaling to FSHIFT scaling (shifting
 * left or right depending on which shift is larger) and divided by the
 * window length, with len / 2 added beforehand to round to nearest.
 */
static fixpt_t
sched_ule_pctcpu(struct thread *td)
{
struct td_sched *ts;
u_int len;
fixpt_t pctcpu;
THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td_get_sched(td);
sched_pctcpu_update(ts, TD_IS_RUNNING(td));
len = SCHED_TICK_LENGTH(ts);
pctcpu = ((FSHIFT >= SCHED_TICK_SHIFT ?
(SCHED_TICK_RUN_SHIFTED(ts) << (FSHIFT - SCHED_TICK_SHIFT)) :
(SCHED_TICK_RUN_SHIFTED(ts) >> (SCHED_TICK_SHIFT - FSHIFT))) +
len / 2) / len;
return (pctcpu);
}
/*
 * React to a change of td's CPU set.  The thread lock must be held.  The
 * function body is empty without SMP.
 *
 * Nothing is done if the thread may still run on the CPU recorded in
 * ts_cpu.  A queued thread is removed and re-added.  For a running thread
 * a TDA_SCHED AST is requested and, when the thread is not curthread,
 * IPI_PREEMPT is sent to its CPU.
 */
static void
sched_ule_affinity(struct thread *td)
{
#ifdef SMP
struct td_sched *ts;
THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td_get_sched(td);
if (THREAD_CAN_SCHED(td, ts->ts_cpu))
return;
if (TD_ON_RUNQ(td)) {
sched_rem(td);
sched_add(td, SRQ_BORING | SRQ_HOLDTD);
return;
}
if (!TD_IS_RUNNING(td))
return;
ast_sched_locked(td, TDA_SCHED);
if (td != curthread)
ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
#endif
}
/*
 * Bind the current thread to the given CPU.
 *
 * td must be curthread and its thread lock must be held, unrecursed.  An
 * existing binding is removed first.  The thread is flagged TSF_BOUND and
 * pinned; if it is not already on the requested CPU, ts_cpu is set and the
 * thread switches away via mi_switch(), re-taking the thread lock
 * afterwards.
 */
static void
sched_ule_bind(struct thread *td, int cpu)
{
struct td_sched *ts;
THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
ts = td_get_sched(td);
if (ts->ts_flags & TSF_BOUND)
sched_unbind(td);
KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td));
ts->ts_flags |= TSF_BOUND;
sched_pin();
if (PCPU_GET(cpuid) == cpu)
return;
ts->ts_cpu = cpu;
mi_switch(SW_VOL | SWT_BIND);
thread_lock(td);
}
/*
 * Remove the CPU binding of the current thread.
 *
 * td must be curthread and its thread lock must be held.  If the thread is
 * not bound (TSF_BOUND clear) nothing is done; otherwise the flag is
 * cleared and the pin taken when the thread was bound is released.
 */
static void
sched_ule_unbind(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	/* The message names the operation actually being asserted. */
	KASSERT(td == curthread, ("sched_unbind: can only unbind curthread"));
	ts = td_get_sched(td);
	if ((ts->ts_flags & TSF_BOUND) == 0)
		return;
	ts->ts_flags &= ~TSF_BOUND;
	sched_unpin();
}
/*
 * Report whether td is bound to a CPU.  The thread lock must be held.
 * Returns the TSF_BOUND bit of the thread's scheduler flags, i.e. non-zero
 * when bound and 0 otherwise.
 */
static int
sched_ule_is_bound(struct thread *td)
{
THREAD_LOCK_ASSERT(td, MA_OWNED);
return (td_get_sched(td)->ts_flags & TSF_BOUND);
}
/*
 * Voluntarily give up the CPU: take td's thread lock and switch away with
 * SW_VOL | SWT_RELINQUISH.  The lock is not released explicitly here.
 */
static void
sched_ule_relinquish(struct thread *td)
{
thread_lock(td);
mi_switch(SW_VOL | SWT_RELINQUISH);
}
/*
 * Return the system load: the sum of tdq_sysload over all CPUs visited by
 * CPU_FOREACH() under SMP, or the current CPU's tdq_sysload otherwise.
 * Each value is read with an atomic load.
 */
static int
sched_ule_load(void)
{
#ifdef SMP
	int cpu, sum;

	sum = 0;
	CPU_FOREACH(cpu) {
		sum += atomic_load_int(&TDQ_CPU(cpu)->tdq_sysload);
	}
	return (sum);
#else
	return (atomic_load_int(&TDQ_SELF()->tdq_sysload));
#endif
}
/*
 * Size in bytes of a process structure for this scheduler: plain
 * struct proc, with nothing added.
 */
static int
sched_ule_sizeof_proc(void)
{
return (sizeof(struct proc));
}
/*
 * Size in bytes of a thread structure for this scheduler: struct thread
 * plus the scheduler's private struct td_sched.
 */
static int
sched_ule_sizeof_thread(void)
{
return (sizeof(struct thread) + sizeof(struct td_sched));
}
/*
 * TDQ_IDLESPIN(tdq): whether the idle loop may busy-wait on this queue.
 * Under SMP it is true only when the queue has a CPU group and that group
 * does not have CG_FLAG_THREAD set; without SMP it is always true.
 */
#ifdef SMP
#define TDQ_IDLESPIN(tdq) \
((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)
#else
#define TDQ_IDLESPIN(tdq) 1
#endif
/*
 * Body of the per-CPU idle thread.  Never returns.
 *
 * Each iteration switches to queued work if there is any, otherwise
 * (under SMP) tries tdq_idled(), optionally spins for a while, and finally
 * enters cpu_idle().  The dummy argument is unused.
 */
static void
sched_ule_idletd(void *dummy)
{
struct thread *td;
struct tdq *tdq;
int oldswitchcnt, switchcnt;
int i;
mtx_assert(&Giant, MA_NOTOWNED);
td = curthread;
tdq = TDQ_SELF();
THREAD_NO_SLEEPING();
oldswitchcnt = -1;
for (;;) {
/* Work is queued: switch away from the idle thread. */
if (TDQ_LOAD(tdq)) {
thread_lock(td);
mi_switch(SW_VOL | SWT_IDLE);
}
switchcnt = TDQ_SWITCHCNT(tdq);
#ifdef SMP
/*
 * Call tdq_idled() when always_steal is set or the switch count changed
 * since the last look; a return of 0 restarts the loop.
 */
if (always_steal || switchcnt != oldswitchcnt) {
oldswitchcnt = switchcnt;
if (tdq_idled(tdq) == 0)
continue;
}
switchcnt = TDQ_SWITCHCNT(tdq);
#else
oldswitchcnt = switchcnt;
#endif
/*
 * With a switch count above sched_idlespinthresh, and where
 * TDQ_IDLESPIN() allows it, poll for load up to sched_idlespins times
 * before considering cpu_idle().
 */
if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
for (i = 0; i < sched_idlespins; i++) {
if (TDQ_LOAD(tdq))
break;
cpu_spinwait();
}
}
/* Load arrived or the switch count moved during the above: restart. */
switchcnt = TDQ_SWITCHCNT(tdq);
if (TDQ_LOAD(tdq) != 0 || switchcnt != oldswitchcnt)
continue;
/*
 * Announce that this CPU is idle, then check the load once more.  The
 * sequentially consistent fence orders the tdq_cpu_idle store before the
 * load check that follows.
 */
atomic_store_int(&tdq->tdq_cpu_idle, 1);
atomic_thread_fence_seq_cst();
if (TDQ_LOAD(tdq) != 0) {
atomic_store_int(&tdq->tdq_cpu_idle, 0);
continue;
}
cpu_idle(switchcnt * 4 > sched_idlespinthresh);
atomic_store_int(&tdq->tdq_cpu_idle, 0);
/*
 * If the switch count did not change while idle, bump it (and the local
 * copy) so that this idle period is counted as well.
 */
switchcnt = TDQ_SWITCHCNT(tdq);
if (switchcnt != oldswitchcnt)
continue;
TDQ_SWITCHCNT_INC(tdq);
oldswitchcnt++;
}
}
/*
 * Shared tail of the throw paths: choose the next thread, enter a spinlock
 * section, release the queue lock and return the chosen thread.  The
 * spinlock nesting count is asserted to be exactly 1 on return.
 */
static struct thread *
sched_throw_grab(struct tdq *tdq)
{
struct thread *newtd;
newtd = choosethread();
spinlock_enter();
TDQ_UNLOCK(tdq);
KASSERT(curthread->td_md.md_spinlock_count == 1,
("invalid count %d", curthread->td_md.md_spinlock_count));
return (newtd);
}
/*
 * Start scheduling on the current CPU.  The current thread's lock pointer
 * is asserted to be this CPU's queue lock.  The queue lock is taken, one
 * spinlock nesting level is dropped, the per-CPU switch time and tick are
 * initialised, and control is handed to the first chosen thread with
 * cpu_throw().
 */
static void
sched_ule_ap_entry(void)
{
struct thread *newtd;
struct tdq *tdq;
tdq = TDQ_SELF();
THREAD_LOCKPTR_ASSERT(curthread, TDQ_LOCKPTR(tdq));
TDQ_LOCK(tdq);
spinlock_exit();
PCPU_SET(switchtime, cpu_ticks());
PCPU_SET(switchticks, ticks);
newtd = sched_throw_grab(tdq);
#ifdef HWT_HOOKS
HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);
#endif
cpu_throw(NULL, newtd);
}
/*
 * Switch away from td for the last time.
 *
 * td must be non-NULL, its thread lock must be held and that lock must be
 * the current CPU's queue lock.  The thread is removed from the load, its
 * CPU fields are updated, its thread lock is blocked, and the CPU switches
 * to the next chosen thread.
 */
static void
sched_ule_throw(struct thread *td)
{
struct thread *newtd;
struct tdq *tdq;
tdq = TDQ_SELF();
MPASS(td != NULL);
THREAD_LOCK_ASSERT(td, MA_OWNED);
THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(tdq));
tdq_load_rem(tdq, td);
td->td_lastcpu = td->td_oncpu;
td->td_oncpu = NOCPU;
thread_lock_block(td);
newtd = sched_throw_grab(tdq);
#ifdef HWT_HOOKS
HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);
#endif
cpu_switch(td, newtd, TDQ_LOCKPTR(tdq));
}
/*
 * Scheduler part of a new thread's first run.  Entered with a spinlock
 * nesting count of exactly 1.  The current CPU's queue lock is taken, the
 * spinlock section is left, td's thread lock is asserted to be that queue
 * lock, and the current CPU is recorded in td_oncpu.
 */
static void
sched_ule_fork_exit(struct thread *td)
{
struct tdq *tdq;
int cpuid;
KASSERT(curthread->td_md.md_spinlock_count == 1,
("invalid count %d", curthread->td_md.md_spinlock_count));
cpuid = PCPU_GET(cpuid);
tdq = TDQ_SELF();
TDQ_LOCK(tdq);
spinlock_exit();
MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
td->td_oncpu = cpuid;
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
"prio:%d", td->td_priority);
SDT_PROBE0(sched, , , on__cpu);
}
/*
 * Return a name for td for use in trace records.
 *
 * With KTR the name is "<td_name> tid <td_tid>", formatted into the
 * thread's ts_name buffer the first time it is needed (while the buffer is
 * empty) and returned from there afterwards; snprintf() bounds it to the
 * buffer size.  Without KTR td_name is returned directly.
 */
static char *
sched_ule_tdname(struct thread *td)
{
#ifdef KTR
struct td_sched *ts;
ts = td_get_sched(td);
if (ts->ts_name[0] == '\0')
snprintf(ts->ts_name, sizeof(ts->ts_name),
"%s tid %d", td->td_name, td->td_tid);
return (ts->ts_name);
#else
return (td->td_name);
#endif
}
/*
 * Drop the cached trace name of td so that sched_ule_tdname() formats it
 * again on next use.  Does nothing without KTR.
 */
static void
sched_ule_clear_tdname(struct thread *td)
{
#ifdef KTR
struct td_sched *ts;
ts = td_get_sched(td);
ts->ts_name[0] = '\0';
#endif
}
/*
 * schedcpu hook.  Intentionally empty in this implementation.
 */
static void
sched_ule_schedcpu(void)
{
}
/*
 * do_timer_accounting hook.  Always returns true for this scheduler.
 */
static bool
sched_ule_do_timer_accounting(void)
{
return (true);
}
#ifdef SMP
/*
 * Find the child group of grp whose CPU mask contains cpu.
 *
 * Returns the index of the first matching child in grp->cg_child, or -1
 * when grp has no children or none of them contains the CPU.
 */
static int
sched_ule_find_child_with_core(int cpu, struct cpu_group *grp)
{
	int idx;

	if (grp->cg_children == 0)
		return (-1);

	MPASS(grp->cg_child);
	idx = 0;
	while (idx < grp->cg_children) {
		if (CPU_ISSET(cpu, &grp->cg_child[idx].cg_mask))
			return (idx);
		idx++;
	}
	return (-1);
}
/*
 * Find another CPU that sits in the same innermost CPU group as cpu,
 * provided that group's sharing level is neither CG_SHARE_NONE nor above
 * CG_SHARE_L2.
 *
 * Returns the lowest-numbered such CPU, or -1 when there is no topology, a
 * group on the way down has at most one member, the sharing level does not
 * qualify, or no other CPU is found.
 */
static int
sched_ule_find_l2_neighbor(int cpu)
{
	struct cpu_group *cg;
	int idx, other;

	cg = cpu_top;
	if (cg == NULL)
		return (-1);

	/* Descend to the innermost group that contains the CPU. */
	for (;;) {
		idx = sched_ule_find_child_with_core(cpu, cg);
		if (idx == -1)
			break;
		/* A group with at most one member cannot hold a neighbor. */
		if (cg->cg_child[idx].cg_count <= 1)
			return (-1);
		cg = &cg->cg_child[idx];
	}

	if (cg->cg_level > CG_SHARE_L2 || cg->cg_level == CG_SHARE_NONE)
		return (-1);

	/* First member of the group other than the reference CPU. */
	for (other = 0; other < CPU_SETSIZE; other++) {
		if (other != cpu && CPU_ISSET(other, &cg->cg_mask))
			return (other);
	}
	return (-1);
}
#else
/*
 * Non-SMP variant: always returns -1, i.e. no neighbor is reported.
 */
static int
sched_ule_find_l2_neighbor(int cpu)
{
return (-1);
}
#endif
/*
 * Method table for the ULE scheduler.  SLOT(name) expands to
 * ".name = sched_ule_name", so every member is filled with the function of
 * the matching sched_ule_ name.  The table is registered under the name
 * "ULE" by DECLARE_SCHEDULER() below.
 */
struct sched_instance sched_ule_instance = {
#define SLOT(name) .name = sched_ule_##name
SLOT(load),
SLOT(rr_interval),
SLOT(runnable),
SLOT(exit),
SLOT(fork),
SLOT(fork_exit),
SLOT(class),
SLOT(nice),
SLOT(ap_entry),
SLOT(exit_thread),
SLOT(estcpu),
SLOT(fork_thread),
SLOT(ithread_prio),
SLOT(lend_prio),
SLOT(lend_user_prio),
SLOT(lend_user_prio_cond),
SLOT(pctcpu),
SLOT(prio),
SLOT(sleep),
SLOT(sswitch),
SLOT(throw),
SLOT(unlend_prio),
SLOT(user_prio),
SLOT(userret_slowpath),
SLOT(add),
SLOT(choose),
SLOT(clock),
SLOT(idletd),
SLOT(preempt),
SLOT(relinquish),
SLOT(rem),
SLOT(wakeup),
SLOT(bind),
SLOT(unbind),
SLOT(is_bound),
SLOT(affinity),
SLOT(sizeof_proc),
SLOT(sizeof_thread),
SLOT(tdname),
SLOT(clear_tdname),
SLOT(do_timer_accounting),
SLOT(find_l2_neighbor),
SLOT(init),
SLOT(init_ap),
SLOT(setup),
SLOT(initticks),
SLOT(schedcpu),
#undef SLOT
};
DECLARE_SCHEDULER(ule_sched_selector, "ULE", &sched_ule_instance);
/*
 * Sysctl handler for the scheduling quantum expressed in microseconds.
 *
 * On read, reports sched_slice converted to microseconds.  On write, the
 * new value must be positive (EINVAL otherwise); it is converted back to
 * stathz ticks with rounding to nearest, never less than 1, and
 * sched_slice_min and hogticks are recomputed from the new slice.
 *
 * Returns 0 on success or an errno value.
 */
static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val, period;

	/* Microseconds per stathz tick. */
	period = 1000000 / realstathz;
	new_val = period * sched_slice;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val <= 0)
		return (EINVAL);
	/*
	 * Round to the nearest tick.  The quotient and the rounded remainder
	 * are computed separately so that a value close to INT_MAX cannot
	 * overflow, as "new_val + period / 2" would.  For every value that
	 * did not overflow the result is identical.
	 */
	sched_slice = imax(1, new_val / period +
	    (new_val % period + period / 2) / period);
	sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
	/*
	 * NOTE(review): 2 * hz * sched_slice is still computed in int and is
	 * not bounded here -- confirm whether very large quanta need a cap.
	 */
	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
	    realstathz);
	return (0);
}
/*
 * Tunables exported under the kern.sched.ule sysctl node.  "quantum" goes
 * through sysctl_kern_quantum(); the other entries expose the named
 * variable directly.  Entries declared with CTLFLAG_RWTUN carry the tunable
 * flag in addition to being read-write; those with CTLFLAG_RW do not.
 */
SYSCTL_NODE(_kern_sched, OID_AUTO, ule, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"ULE Scheduler");
SYSCTL_PROC(_kern_sched_ule, OID_AUTO, quantum,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
sysctl_kern_quantum, "I",
"Quantum for timeshare threads in microseconds");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
"Quantum for timeshare threads in stathz ticks");
SYSCTL_UINT(_kern_sched_ule, OID_AUTO, interact, CTLFLAG_RWTUN, &sched_interact, 0,
"Interactivity score threshold");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, preempt_thresh, CTLFLAG_RWTUN,
&preempt_thresh, 0,
"Maximal (lowest) priority for preemption");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, static_boost, CTLFLAG_RWTUN,
&static_boost, 0,
"Assign static kernel priorities to sleeping threads");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, idlespins, CTLFLAG_RWTUN,
&sched_idlespins, 0,
"Number of times idle thread will spin waiting for new work");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, idlespinthresh, CTLFLAG_RW,
&sched_idlespinthresh, 0,
"Threshold before we will permit idle thread spinning");
/* The remaining knobs control CPU affinity, balancing and stealing (SMP only). */
#ifdef SMP
SYSCTL_INT(_kern_sched_ule, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
"Number of hz ticks to keep thread affinity for");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, balance, CTLFLAG_RWTUN, &rebalance, 0,
"Enables the long-term load balancer");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, balance_interval, CTLFLAG_RW,
&balance_interval, 0,
"Average period in stathz ticks to run the long-term balancer");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, steal_idle, CTLFLAG_RWTUN,
&steal_idle, 0,
"Attempts to steal work from other cores before idling");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, steal_thresh, CTLFLAG_RWTUN,
&steal_thresh, 0,
"Minimum load on remote CPU before we'll steal");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, trysteal_limit, CTLFLAG_RWTUN,
&trysteal_limit, 0,
"Topological distance limit for stealing threads in sched_switch()");
SYSCTL_INT(_kern_sched_ule, OID_AUTO, always_steal, CTLFLAG_RWTUN,
&always_steal, 0,
"Always run the stealer from the idle thread");
#endif