/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/asm_linkage.h>
#include <sys/segments.h>
#include <sys/time_impl.h>
#include <sys/tsc.h>
#include <cp_offsets.h>

#define GETCPU_GDT_OFFSET       SEL_GDT(GDT_CPUID, SEL_UPL)
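
/*
 * The kernel stores the id of the executing CPU in the segment limit of the
 * GDT_CPUID descriptor.  An unprivileged lsl instruction against
 * GETCPU_GDT_OFFSET therefore yields the current CPU id without entering the
 * kernel; this is the mechanism behind the lsl sequences used below.
 */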

        .file   "cp_subr.s"

/*
 * These are cloned from TSC and time-related code in the kernel.  They must
 * be kept in sync if the source values are changed.
 * See: uts/i86pc/os/timestamp.c
 */
#define NSEC_SHIFT      5
#define ADJ_SHIFT       4
#define NANOSEC         0x3b9aca00

/*
 * For __cp_tsc_read calls which incur looping retries due to CPU migration,
 * this represents the maximum number of tries before bailing out.
 */
#define TSC_READ_MAXLOOP        0x4

/*
 * hrtime_t
 * __cp_tsc_read(comm_page_t *cp)
 *
 * Stack usage: 0 bytes
 */
        ENTRY_NP(__cp_tsc_read)
        movl    CP_TSC_TYPE(%rdi), %esi
        movl    CP_TSC_NCPU(%rdi), %r8d

        cmpl    $TSC_TSCP, %esi
        jne     2f
        rdtscp
        /*
         * When the TSC is read, the low 32 bits are placed in %eax while the
         * high 32 bits are placed in %edx.  They are shifted and ORed together
         * to obtain the full 64-bit value.
         */
        shlq    $0x20, %rdx
        orq     %rdx, %rax

        /*
         * A zeroed cp_tsc_ncpu (currently held in r8d) indicates that no
         * per-CPU TSC offsets are required.
         */
        testl   %r8d, %r8d
        jnz     1f
        ret

1:
        /*
         * A non-zero cp_tsc_ncpu indicates the array length of
         * cp_tsc_sync_tick_delta containing per-CPU offsets which are applied
         * to TSC readings.  The CPU ID furnished by the IA32_TSC_AUX register
         * via rdtscp (placed in rcx) is used to look up an offset value in
         * that array and apply it to the TSC value.
         */
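        /*
         * In rough C terms (a sketch; the field name mirrors the
         * CP_TSC_SYNC_TICK_DELTA offset used below):
         *
         * return (tsc + cp->cp_tsc_sync_tick_delta[cpu_id]);
         */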
        leaq    CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
        movq    (%r9, %rcx, 8), %rdx
        addq    %rdx, %rax
        ret

2:
        /*
         * TSC reading without RDTSCP
         *
         * Check if handling for per-CPU TSC offsets is required.  If not,
         * immediately skip to the appropriate steps to perform a plain rdtsc.
         *
         * If per-CPU offsets are present, the TSC reading process is more
         * complicated.  Without rdtscp, there is no way to simultaneously read
         * the TSC and query the current CPU.  In order to "catch" migrations
         * during execution, the CPU ID is queried before and after rdtsc.  The
         * execution is repeated if results differ, subject to a loop limit.
         */
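        /*
         * As a rough C sketch (illustrative only; a reading of 0 is
         * returned if the retry limit is exhausted):
         *
         * uint_t tries = 0;
         * do {
         *         cpu = getcpu();
         *         tsc = rdtsc();
         * } while (getcpu() != cpu && tries++ < TSC_READ_MAXLOOP);
         */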
        xorq    %r9, %r9
        testl   %r8d, %r8d
        jz      3f

        /*
         * Load the address of the per-CPU offset array, since it is needed.
         * The attempted loop count is kept in r8.
         */
        leaq    CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
        xorl    %r8d, %r8d

        /* Query the CPU ID and stash it in r10 for later comparison */
        movl    $GETCPU_GDT_OFFSET, %edx
        lsl     %edx, %edx
        movl    %edx, %r10d

3:
        /*
         * TSC_RDTSC_MFENCE was used in the past for AMD chips, but has been
         * supplanted by TSC_RDTSC_LFENCE, which works on Intel and AMD (when
         * lfence can be confirmed as serializing).
         */

4:
        cmpl    $TSC_RDTSC_LFENCE, %esi
        jne     5f
        lfence
        rdtsc
        jmp     7f

5:
        cmpl    $TSC_RDTSC_CPUID, %esi
        jne     6f
        /*
         * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
         * preserved here.  Its contents will be overwritten when cpuid is used
         * as a serializing instruction.
         */
        movq    %rbx, %r11
        xorl    %eax, %eax
        cpuid
        rdtsc
        movq    %r11, %rbx
        jmp     7f

6:
        /*
         * Other protections should have prevented this function from being
         * called in the first place.  Since callers must handle a failure from
         * CPU migration looping, yield the same result as a bail-out: 0
         */
        xorl    %eax, %eax
        ret

7:
        shlq    $0x20, %rdx
        orq     %rdx, %rax

        /*
         * With the TSC reading in-hand, check if any per-CPU offset handling
         * is required.  The address of the array of deltas (r9) will not have
         * been populated if offset handling is unnecessary.
         */
        testq   %r9, %r9
        jnz     8f
        ret

8:
        movl    $GETCPU_GDT_OFFSET, %edx
        lsl     %edx, %edx
        cmpl    %edx, %r10d
        jne     9f
        movq    (%r9, %rdx, 8), %rdx
        addq    %rdx, %rax
        ret

9:
        /*
         * It appears that a migration has occurred between the first CPU ID
         * query and now.  Check whether the loop limit has been reached, and
         * retry if it has not.
         */
        cmpl    $TSC_READ_MAXLOOP, %r8d
        jge     10f
        incl    %r8d
        movl    %edx, %r10d
        jmp     3b

10:
        /* Loop limit was reached. Return bail-out value of 0. */
        xorl    %eax, %eax
        ret

        SET_SIZE(__cp_tsc_read)


/*
 * uint_t
 * __cp_getcpu(comm_page_t *)
 *
 * Stack usage: 0 bytes
 */
        ENTRY_NP(__cp_getcpu)
        movl    CP_TSC_TYPE(%rdi), %edi
        /*
         * If RDTSCP is available, it is a quick way to grab the cpu_id which
         * is stored in the TSC_AUX MSR by the kernel.
         */
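        /*
         * rdtscp loads IA32_TSC_AUX into %ecx, so the id only needs to be
         * moved into the return register.
         */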
        cmpl    $TSC_TSCP, %edi
        jne     1f
        rdtscp
        movl    %ecx, %eax
        ret
1:
        mov     $GETCPU_GDT_OFFSET, %eax
        lsl     %eax, %eax
        ret
        SET_SIZE(__cp_getcpu)

/*
 * hrtime_t
 * __cp_gethrtime(comm_page_t *cp)
 *
 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
 *
 * %rsp+0x00 - hrtime_t tsc_last
 * %rsp+0x08 - hrtime_t hrtime_base
 * %rsp+0x10 - commpage_t *cp
 * %rsp+0x18 - int hres_lock
 */
        ENTRY_NP(__cp_gethrtime)
        subq    $0x20, %rsp
        movq    %rdi, 0x10(%rsp)
1:
        movl    CP_HRES_LOCK(%rdi), %r9d
        movl    %r9d, 0x18(%rsp)

        movq    CP_TSC_LAST(%rdi), %rax
        movq    CP_TSC_HRTIME_BASE(%rdi), %rdx
        movq    %rax, (%rsp)
        movq    %rdx, 0x8(%rsp)

        call    __cp_tsc_read

        /*
         * Failure is inferred from a TSC reading of 0.  The normal fasttrap
         * mechanism can be used as a fallback in such cases.
         */
        testq   %rax, %rax
        jz      6f

        movq    0x10(%rsp), %rdi
        movl    0x18(%rsp), %r9d
        movl    CP_HRES_LOCK(%rdi), %edx
        andl    $0xfffffffe, %r9d
        cmpl    %r9d, %edx
        jne     1b
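
        /*
         * cp_hres_lock functions as a sequence lock: its low bit is set
         * while the kernel is updating the time values.  Masking the low
         * bit of the pre-read sample and comparing it against the current
         * value detects both an in-progress update and an intervening one,
         * forcing a retry.
         */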

        /*
         * The in-kernel logic for calculating hrtime performs several checks
         * to protect against edge cases.  That logic is summarized as:
         * if (tsc >= tsc_last) {
         *         delta = tsc - tsc_last;
         * } else if (tsc >= tsc_last - 2*tsc_max_delta) {
         *         delta = 0;
         * } else {
         *         delta = MIN(tsc, tsc_resume_cap);
         * }
         *
         * The below implementation achieves the same result, although it is
         * structured for speed and optimized for the fast path:
         *
         * delta = tsc - tsc_last;
         * if (delta < 0) {
         *         delta += (tsc_max_delta << 1);
         *         if (delta >= 0) {
         *                 delta = 0;
         *         } else {
         *                 delta = MIN(tsc, tsc_resume_cap);
         *         }
         * }
         */
        movq    (%rsp), %rdx
        subq    %rdx, %rax              /* delta = tsc - tsc_last */
        jbe     3f                      /* if (delta <= 0) */

2:
        /*
         * Optimized TSC_CONVERT_AND_ADD:
         * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
         *
         * Since the multiply and shift are done in 128-bit, there is no need
         * to worry about overflow.
         */
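        /*
         * mulq leaves the 128-bit product in %rdx:%rax, and shrdq shifts
         * bits from %rdx into the top of %rax, so the result below is the
         * full product shifted right by (32 - NSEC_SHIFT).
         */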
        movl    CP_NSEC_SCALE(%rdi), %ecx
        mulq    %rcx
        shrdq   $_CONST(32 - NSEC_SHIFT), %rdx, %rax
        movq    0x8(%rsp), %r8
        addq    %r8, %rax

        addq    $0x20, %rsp
        ret

3:
        movq    %rax, %r9               /* save (tsc - tsc_last) in r9 */
        movl    CP_TSC_MAX_DELTA(%rdi), %ecx
        sall    $1, %ecx
        addq    %rcx, %rax              /* delta += (tsc_max_delta << 1) */
        jae     4f                      /* no carry: delta still < 0 */
        xorq    %rax, %rax
        jmp     2b

4:
        /*
         * Repopulate %rax with the TSC reading by adding tsc_last to %r9
         * (which holds tsc - tsc_last)
         */
        movq    (%rsp), %rax
        addq    %r9, %rax

        /* delta = MIN(tsc, resume_cap) */
        movq    CP_TSC_RESUME_CAP(%rdi), %rcx
        cmpq    %rcx, %rax
        jbe     5f
        movq    %rcx, %rax
5:
        jmp     2b

6:
        movl    $T_GETHRTIME, %eax
        int     $T_FASTTRAP
        addq    $0x20, %rsp
        ret

        SET_SIZE(__cp_gethrtime)

/*
 * int
 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
 *
 * %rsp+0x00 - timespec_t *tsp
 */
        ENTRY_NP(__cp_clock_gettime_monotonic)
        subq    $0x8, %rsp
        movq    %rsi, (%rsp)

        call    __cp_gethrtime

        /*
         * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
         * This uses the same approach as hrt2ts, although it has been updated
         * to utilize 64-bit math.
         * 1 / 1,000,000,000 =
         * 1000100101110000010111110100000100110110101101001010010110011B-90
         * = 0x112e0be826d694b3 * 2^-90
         *
         * The multiply is performed in 128-bit, with imulq leaving the upper
         * 64 bits of the product in %rdx (an implicit shift right by 64), so
         * only 26 bits of shifting remain:
         *
         * secs = (nsecs * 0x112e0be826d694b3) >> (64 + 26)
         *
         * In order to account for the two's complement representation of
         * negative inputs, a final operation completes the process:
         *
         * secs -= (nsecs >> 63)
         */
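        /*
         * As a C sketch (using a 128-bit intermediate for the multiply):
         *
         * secs = (int64_t)(((__int128)nsecs * 0x112e0be826d694b3) >> 90);
         * secs -= (nsecs >> 63);
         * tsp->tv_sec = secs;
         * tsp->tv_nsec = nsecs - (secs * NANOSEC);
         */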
        movq    %rax, %r11
        movq    $0x112e0be826d694b3, %rdx
        imulq   %rdx
        sarq    $0x1a, %rdx
        movq    %r11, %rax
        sarq    $0x3f, %rax
        subq    %rax, %rdx
        movq    (%rsp), %rsi
        movq    %rdx, (%rsi)
        /*
         * Populating tv_nsec is easier:
         * tv_nsec = nsecs - (secs * NANOSEC)
         */
        imulq   $NANOSEC, %rdx, %rdx
        subq    %rdx, %r11
        movq    %r11, 0x8(%rsi)

        xorl    %eax, %eax
        addq    $0x8, %rsp
        ret
        SET_SIZE(__cp_clock_gettime_monotonic)

/*
 * int
 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
 *
 * %rsp+0x00 - commpage_t *cp
 * %rsp+0x08 - timespec_t *tsp
 * %rsp+0x10 - int hres_lock
 */
        ENTRY_NP(__cp_clock_gettime_realtime)
        subq    $0x18, %rsp
        movq    %rdi, (%rsp)
        movq    %rsi, 0x8(%rsp)

1:
        movl    CP_HRES_LOCK(%rdi), %eax
        movl    %eax, 0x10(%rsp)

        call    __cp_gethrtime
        movq    (%rsp), %rdi
        movq    CP_HRES_LAST_TICK(%rdi), %rdx
        subq    %rdx, %rax                      /* nslt = hrtime - last_tick */
        jb      1b
        movq    CP_HRESTIME(%rdi), %r9
        movq    _CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
        movl    CP_HRESTIME_ADJ(%rdi), %r11d
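        /*
         * The remainder mirrors the kernel's hrestime calculation, roughly:
         *
         * now.tv_nsec += nslt;
         * if (hres_adj > 0)
         *         now.tv_nsec += MIN(nslt >> ADJ_SHIFT, hres_adj);
         * else if (hres_adj < 0)
         *         now.tv_nsec -= MIN(nslt >> ADJ_SHIFT, -hres_adj);
         * while (now.tv_nsec >= NANOSEC) {
         *         now.tv_nsec -= NANOSEC;
         *         now.tv_sec++;
         * }
         */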

        addq    %rax, %r10                      /* now.tv_nsec += nslt */

        cmpl    $0, %r11d
        jg      4f                              /* hres_adj > 0 */
        jl      6f                              /* hres_adj < 0 */

2:
        cmpq    $NANOSEC, %r10
        jae     8f                              /* tv_nsec >= NANOSEC */

3:
        movl    0x10(%rsp), %eax
        movl    CP_HRES_LOCK(%rdi), %edx
        andl    $0xfffffffe, %eax
        cmpl    %eax, %edx
        jne     1b

        movq    0x8(%rsp), %rsi
        movq    %r9, (%rsi)
        movq    %r10, 0x8(%rsi)

        xorl    %eax, %eax
        addq    $0x18, %rsp
        ret


4:                                              /* hres_adj > 0 */
        sarq    $ADJ_SHIFT, %rax
        cmpl    %r11d, %eax
        jbe     5f
        movl    %r11d, %eax
5:
        addq    %rax, %r10
        jmp     2b

6:                                              /* hres_adj < 0 */
        sarq    $ADJ_SHIFT, %rax
        negl    %r11d
        cmpl    %r11d, %eax
        jbe     7f
        movl    %r11d, %eax
7:
        subq    %rax, %r10
        jmp     2b

8:                                              /* tv_nsec >= NANOSEC */
        subq    $NANOSEC, %r10
        incq    %r9
        cmpq    $NANOSEC, %r10
        jae     8b
        jmp     3b

        SET_SIZE(__cp_clock_gettime_realtime)