/*-
 * Copyright (c) 2018-2019 The FreeBSD Foundation
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_ddb.h"

#include <machine/asmacros.h>
#include <machine/specialreg.h>
#include <machine/pmap.h>

#include "assym.inc"

        .text

/* Address: %rdi */
ENTRY(pagezero_std)
        PUSH_FRAME_POINTER
        movl    $PAGE_SIZE/8,%ecx
        xorl    %eax,%eax
        rep
        stosq
        POP_FRAME_POINTER
        ret
END(pagezero_std)

ENTRY(pagezero_erms)
        PUSH_FRAME_POINTER
        movl    $PAGE_SIZE,%ecx
        xorl    %eax,%eax
        rep
        stosb
        POP_FRAME_POINTER
        ret
END(pagezero_erms)
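
/*
 * The _std/_erms pairs here and below are alternative implementations of
 * the same primitive; the selection logic lives elsewhere and is keyed on
 * the CPUID ERMS ("Enhanced REP MOVSB/STOSB") feature bit.  A rough C
 * model of pagezero_std (an illustrative sketch, not the dispatch code):
 *
 *	uint64_t *p = addr;
 *	for (size_t i = 0; i < PAGE_SIZE / 8; i++)
 *		p[i] = 0;
 *
 * pagezero_erms instead issues a single byte-granular "rep stosb" and
 * relies on the microcoded fast-string implementation.
 */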

/*
 * pagecopy(%rdi=from, %rsi=to)
 */
ENTRY(pagecopy)
        PUSH_FRAME_POINTER
        movl    $PAGE_SIZE/8,%ecx
        movq    %rdi,%r9
        movq    %rsi,%rdi
        movq    %r9,%rsi
        rep
        movsq
        POP_FRAME_POINTER
        ret
END(pagecopy)
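
/*
 * The register shuffle above is needed because the C interface passes the
 * source in %rdi, while movsq copies from (%rsi) to (%rdi).
 */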

/*
 * memcmp(b1, b2, len)
 *         rdi,rsi,rdx
 */
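/*
 * The numeric local labels encode the length range a block handles:
 * 100816 covers 8 < len <= 16, 101632 covers 16 < len <= 32, and so on.
 * For example, len = 12 takes the 100816 path, which compares the first
 * 8 and the last 8 bytes; the two reads overlap, which is harmless.
 */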
ENTRY(memcmp)
        PUSH_FRAME_POINTER

        xorl    %eax,%eax
10:
        cmpq    $16,%rdx
        ja      101632f

        cmpb    $8,%dl
        jg      100816f

        cmpb    $4,%dl
        jg      100408f

        cmpb    $2,%dl
        jge     100204f

        cmpb    $1,%dl
        jl      100000f
        movzbl  (%rdi),%eax
        movzbl  (%rsi),%r8d
        subl    %r8d,%eax
100000:
        POP_FRAME_POINTER
        ret

        ALIGN_TEXT
100816:
        movq    (%rdi),%r8
        movq    (%rsi),%r9
        cmpq    %r8,%r9
        jne     80f
        movq    -8(%rdi,%rdx),%r8
        movq    -8(%rsi,%rdx),%r9
        cmpq    %r8,%r9
        jne     10081608f
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
100408:
        movl    (%rdi),%r8d
        movl    (%rsi),%r9d
        cmpl    %r8d,%r9d
        jne     80f
        movl    -4(%rdi,%rdx),%r8d
        movl    -4(%rsi,%rdx),%r9d
        cmpl    %r8d,%r9d
        jne     10040804f
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
100204:
        movzwl  (%rdi),%r8d
        movzwl  (%rsi),%r9d
        cmpl    %r8d,%r9d
        jne     1f
        movzwl  -2(%rdi,%rdx),%r8d
        movzwl  -2(%rsi,%rdx),%r9d
        cmpl    %r8d,%r9d
        jne     1f
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
101632:
        cmpq    $32,%rdx
        ja      103200f
        movq    (%rdi),%r8
        movq    (%rsi),%r9
        cmpq    %r8,%r9
        jne     80f
        movq    8(%rdi),%r8
        movq    8(%rsi),%r9
        cmpq    %r8,%r9
        jne     10163208f
        movq    -16(%rdi,%rdx),%r8
        movq    -16(%rsi,%rdx),%r9
        cmpq    %r8,%r9
        jne     10163216f
        movq    -8(%rdi,%rdx),%r8
        movq    -8(%rsi,%rdx),%r9
        cmpq    %r8,%r9
        jne     10163224f
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
103200:
        movq    (%rdi),%r8
        movq    8(%rdi),%r9
        subq    (%rsi),%r8
        subq    8(%rsi),%r9
        orq     %r8,%r9
        jnz     10320000f

        movq    16(%rdi),%r8
        movq    24(%rdi),%r9
        subq    16(%rsi),%r8
        subq    24(%rsi),%r9
        orq     %r8,%r9
        jnz     10320016f

        leaq    32(%rdi),%rdi
        leaq    32(%rsi),%rsi
        subq    $32,%rdx
        cmpq    $32,%rdx
        jae     103200b
        cmpb    $0,%dl
        jne     10b
        POP_FRAME_POINTER
        ret

/*
 * Mismatch was found.
 *
 * Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
 */
        ALIGN_TEXT
10320016:
        leaq    16(%rdi),%rdi
        leaq    16(%rsi),%rsi
10320000:
        movq    (%rdi),%r8
        movq    (%rsi),%r9
        cmpq    %r8,%r9
        jne     80f
        leaq    8(%rdi),%rdi
        leaq    8(%rsi),%rsi
        jmp     80f
        ALIGN_TEXT
10081608:
10163224:
        leaq    -8(%rdi,%rdx),%rdi
        leaq    -8(%rsi,%rdx),%rsi
        jmp     80f
        ALIGN_TEXT
10163216:
        leaq    -16(%rdi,%rdx),%rdi
        leaq    -16(%rsi,%rdx),%rsi
        jmp     80f
        ALIGN_TEXT
10163208:
        leaq    8(%rdi),%rdi
        leaq    8(%rsi),%rsi
        jmp     80f
        ALIGN_TEXT
10040804:
        leaq    -4(%rdi,%rdx),%rdi
        leaq    -4(%rsi,%rdx),%rsi
        jmp     1f

        ALIGN_TEXT
80:
        movl    (%rdi),%r8d
        movl    (%rsi),%r9d
        cmpl    %r8d,%r9d
        jne     1f
        leaq    4(%rdi),%rdi
        leaq    4(%rsi),%rsi

/*
 * We have up to 4 bytes to inspect.
 */
1:
        movzbl  (%rdi),%eax
        movzbl  (%rsi),%r8d
        cmpb    %r8b,%al
        jne     2f

        movzbl  1(%rdi),%eax
        movzbl  1(%rsi),%r8d
        cmpb    %r8b,%al
        jne     2f

        movzbl  2(%rdi),%eax
        movzbl  2(%rsi),%r8d
        cmpb    %r8b,%al
        jne     2f

        movzbl  3(%rdi),%eax
        movzbl  3(%rsi),%r8d
2:
        subl    %r8d,%eax
        POP_FRAME_POINTER
        ret
END(memcmp)
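
/*
 * Like the C library function, this memcmp does not promise -1/0/1: the
 * result is the difference of the first mismatching bytes taken as
 * unsigned chars, roughly (an illustrative sketch):
 *
 *	return ((int)a[i] - (int)b[i]);
 */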

/*
 * memmove(dst, src, cnt)
 *         rdi, rsi, rdx
 */

/*
 * Register state at entry is supposed to be as follows:
 * rdi - destination
 * rsi - source
 * rdx - count
 *
 * The macro possibly clobbers the above and: rcx, r8, r9, r10
 * It does not clobber rax nor r11.
 */
.macro MEMMOVE erms overlap begin end
        \begin

        /*
         * For sizes 0..32 all data is read before it is written, so there
         * is no correctness issue with direction of copying.
         */
        cmpq    $32,%rcx
        jbe     101632f

.if \overlap == 1
        movq    %rdi,%r8
        subq    %rsi,%r8
        cmpq    %rcx,%r8        /* overlapping && src < dst? */
        jb      2f
.endif
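
	/*
	 * The single unsigned comparison above implements
	 * "(uintptr_t)(dst - src) < cnt", which is true exactly when dst
	 * lies less than cnt bytes above src, i.e. the regions overlap
	 * with src <= dst.  Only then would a forward copy read bytes it
	 * has already overwritten, so we branch to the backward copy at
	 * label 2.
	 */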

        cmpq    $256,%rcx
        ja      1256f

        ALIGN_TEXT
103200:
        movq    (%rsi),%rdx
        movq    %rdx,(%rdi)
        movq    8(%rsi),%rdx
        movq    %rdx,8(%rdi)
        movq    16(%rsi),%rdx
        movq    %rdx,16(%rdi)
        movq    24(%rsi),%rdx
        movq    %rdx,24(%rdi)
        leaq    32(%rsi),%rsi
        leaq    32(%rdi),%rdi
        subq    $32,%rcx
        cmpq    $32,%rcx
        jae     103200b
        cmpb    $0,%cl
        jne     101632f
        \end
        ret
        ALIGN_TEXT
101632:
        cmpb    $16,%cl
        jl      100816f
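	/*
	 * 16..32 bytes: load both the first and the last 16 bytes before
	 * storing anything.  The two ranges may overlap each other and,
	 * since all reads precede all writes, the block is also safe for
	 * overlapping src/dst.
	 */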
        movq    (%rsi),%rdx
        movq    8(%rsi),%r8
        movq    -16(%rsi,%rcx),%r9
        movq    -8(%rsi,%rcx),%r10
        movq    %rdx,(%rdi)
        movq    %r8,8(%rdi)
        movq    %r9,-16(%rdi,%rcx)
        movq    %r10,-8(%rdi,%rcx)
        \end
        ret
        ALIGN_TEXT
100816:
        cmpb    $8,%cl
        jl      100408f
        movq    (%rsi),%rdx
        movq    -8(%rsi,%rcx),%r8
        movq    %rdx,(%rdi)
        movq    %r8,-8(%rdi,%rcx)
        \end
        ret
        ALIGN_TEXT
100408:
        cmpb    $4,%cl
        jl      100204f
        movl    (%rsi),%edx
        movl    -4(%rsi,%rcx),%r8d
        movl    %edx,(%rdi)
        movl    %r8d,-4(%rdi,%rcx)
        \end
        ret
        ALIGN_TEXT
100204:
        cmpb    $2,%cl
        jl      100001f
        movzwl  (%rsi),%edx
        movzwl  -2(%rsi,%rcx),%r8d
        movw    %dx,(%rdi)
        movw    %r8w,-2(%rdi,%rcx)
        \end
        ret
        ALIGN_TEXT
100001:
        cmpb    $1,%cl
        jl      100000f
        movb    (%rsi),%dl
        movb    %dl,(%rdi)
100000:
        \end
        ret

        ALIGN_TEXT
1256:
        testb   $15,%dil
        jnz     100f
.if \erms == 1
        rep
        movsb
.else
        shrq    $3,%rcx                         /* copy by 64-bit words */
        rep
        movsq
        movq    %rdx,%rcx
        andl    $7,%ecx                         /* any bytes left? */
        jne     100408b
.endif
        \end
        ret
100:
        movq    (%rsi),%r8
        movq    8(%rsi),%r9
        movq    %rdi,%r10
        movq    %rdi,%rcx
        andq    $15,%rcx
        leaq    -16(%rdx,%rcx),%rdx
        neg     %rcx
        leaq    16(%rdi,%rcx),%rdi
        leaq    16(%rsi,%rcx),%rsi
        movq    %rdx,%rcx
.if \erms == 1
        rep
        movsb
        movq    %r8,(%r10)
        movq    %r9,8(%r10)
.else
        shrq    $3,%rcx                         /* copy by 64-bit words */
        rep
        movsq
        movq    %r8,(%r10)
        movq    %r9,8(%r10)
        movq    %rdx,%rcx
        andl    $7,%ecx                         /* any bytes left? */
        jne     100408b
.endif
        \end
        ret

.if \overlap == 1
        /*
         * Copy backwards.
         */
        ALIGN_TEXT
2:
        cmpq    $256,%rcx
        ja      2256f

        leaq    -8(%rdi,%rcx),%rdi
        leaq    -8(%rsi,%rcx),%rsi

        cmpq    $32,%rcx
        jb      2016f

        ALIGN_TEXT
2032:
        movq    (%rsi),%rdx
        movq    %rdx,(%rdi)
        movq    -8(%rsi),%rdx
        movq    %rdx,-8(%rdi)
        movq    -16(%rsi),%rdx
        movq    %rdx,-16(%rdi)
        movq    -24(%rsi),%rdx
        movq    %rdx,-24(%rdi)
        leaq    -32(%rsi),%rsi
        leaq    -32(%rdi),%rdi
        subq    $32,%rcx
        cmpq    $32,%rcx
        jae     2032b
        cmpb    $0,%cl
        jne     2016f
        \end
        ret
        ALIGN_TEXT
2016:
        cmpb    $16,%cl
        jl      2008f
        movq    (%rsi),%rdx
        movq    %rdx,(%rdi)
        movq    -8(%rsi),%rdx
        movq    %rdx,-8(%rdi)
        subb    $16,%cl
        jz      2000f
        leaq    -16(%rsi),%rsi
        leaq    -16(%rdi),%rdi
2008:
        cmpb    $8,%cl
        jl      2004f
        movq    (%rsi),%rdx
        movq    %rdx,(%rdi)
        subb    $8,%cl
        jz      2000f
        leaq    -8(%rsi),%rsi
        leaq    -8(%rdi),%rdi
2004:
        cmpb    $4,%cl
        jl      2002f
        movl    4(%rsi),%edx
        movl    %edx,4(%rdi)
        subb    $4,%cl
        jz      2000f
        leaq    -4(%rsi),%rsi
        leaq    -4(%rdi),%rdi
2002:
        cmpb    $2,%cl
        jl      2001f
        movw    6(%rsi),%dx
        movw    %dx,6(%rdi)
        subb    $2,%cl
        jz      2000f
        leaq    -2(%rsi),%rsi
        leaq    -2(%rdi),%rdi
2001:
        cmpb    $1,%cl
        jl      2000f
        movb    7(%rsi),%dl
        movb    %dl,7(%rdi)
2000:
        \end
        ret
        ALIGN_TEXT
2256:
        std
        leaq    -8(%rdi,%rcx),%rdi
        leaq    -8(%rsi,%rcx),%rsi
        shrq    $3,%rcx
        rep
        movsq
        cld
        movq    %rdx,%rcx
        andb    $7,%cl
        jne     2004b
        \end
        ret
.endif
.endm
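
/*
 * A rough C model of the dispatch implemented by MEMMOVE (illustrative
 * sketch only; the helper names are made up, and "overlap" is the
 * compile-time macro flag):
 *
 *	if (cnt <= 32)
 *		copy_small(dst, src, cnt);	// all reads precede writes
 *	else if (overlap && (uintptr_t)(dst - src) < cnt)
 *		copy_backward(dst, src, cnt);
 *	else if (cnt > 256)
 *		copy_bulk(dst, src, cnt);	// rep movsb or rep movsq
 *	else
 *		copy_forward(dst, src, cnt);	// 32 bytes per iteration
 */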

.macro MEMMOVE_BEGIN
        PUSH_FRAME_POINTER
        movq    %rdi,%rax
        movq    %rdx,%rcx
.endm

.macro MEMMOVE_END
        POP_FRAME_POINTER
.endm

ENTRY(memmove_std)
        MEMMOVE erms=0 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memmove_std)

ENTRY(memmove_erms)
        MEMMOVE erms=1 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memmove_erms)

/*
 * memcpy(dst, src, len)
 *        rdi, rsi, rdx
 *
 * Note: memcpy does not support overlapping copies
 */
ENTRY(memcpy_std)
        MEMMOVE erms=0 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memcpy_std)

ENTRY(memcpy_erms)
        MEMMOVE erms=1 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memcpy_erms)

/*
 * memset(dst, c,   len)
 *        rdi, rsi, rdx
 */
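/*
 * The fill byte is broadcast to all eight byte lanes by multiplying with
 * 0x0101010101010101; e.g. c == 0xab yields 0xabababababababab in %r10.
 */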
.macro MEMSET erms
        PUSH_FRAME_POINTER
        movq    %rdi,%rax
        movq    %rdx,%rcx
        movzbq  %sil,%r8
        movabs  $0x0101010101010101,%r10
        imulq   %r8,%r10

        cmpq    $32,%rcx
        jbe     101632f

        cmpq    $256,%rcx
        ja      1256f

        ALIGN_TEXT
103200:
        movq    %r10,(%rdi)
        movq    %r10,8(%rdi)
        movq    %r10,16(%rdi)
        movq    %r10,24(%rdi)
        leaq    32(%rdi),%rdi
        subq    $32,%rcx
        cmpq    $32,%rcx
        ja      103200b
        cmpb    $16,%cl
        ja      201632f
        movq    %r10,-16(%rdi,%rcx)
        movq    %r10,-8(%rdi,%rcx)
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
101632:
        cmpb    $16,%cl
        jl      100816f
201632:
        movq    %r10,(%rdi)
        movq    %r10,8(%rdi)
        movq    %r10,-16(%rdi,%rcx)
        movq    %r10,-8(%rdi,%rcx)
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
100816:
        cmpb    $8,%cl
        jl      100408f
        movq    %r10,(%rdi)
        movq    %r10,-8(%rdi,%rcx)
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
100408:
        cmpb    $4,%cl
        jl      100204f
        movl    %r10d,(%rdi)
        movl    %r10d,-4(%rdi,%rcx)
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
100204:
        cmpb    $2,%cl
        jl      100001f
        movw    %r10w,(%rdi)
        movw    %r10w,-2(%rdi,%rcx)
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
100001:
        cmpb    $0,%cl
        je      100000f
        movb    %r10b,(%rdi)
100000:
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
1256:
        movq    %rdi,%r9
        movq    %r10,%rax
        testl   $15,%edi
        jnz     3f
1:
.if \erms == 1
        rep
        stosb
        movq    %r9,%rax
.else
        movq    %rcx,%rdx
        shrq    $3,%rcx
        rep
        stosq
        movq    %r9,%rax
        andl    $7,%edx
        jnz     2f
        POP_FRAME_POINTER
        ret
2:
        movq    %r10,-8(%rdi,%rdx)
.endif
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
3:
        movq    %r10,(%rdi)
        movq    %r10,8(%rdi)
        movq    %rdi,%r8
        andq    $15,%r8
        leaq    -16(%rcx,%r8),%rcx
        neg     %r8
        leaq    16(%rdi,%r8),%rdi
        jmp     1b
.endm

ENTRY(memset_std)
        MEMSET erms=0
END(memset_std)

ENTRY(memset_erms)
        MEMSET erms=1
END(memset_erms)

/* fillw(pat, base, cnt) */
/*       %rdi,%rsi, %rdx */
ENTRY(fillw)
        PUSH_FRAME_POINTER
        movq    %rdi,%rax
        movq    %rsi,%rdi
        movq    %rdx,%rcx
        rep
        stosw
        POP_FRAME_POINTER
        ret
END(fillw)

/*
 * strlen(string)
 *        %rdi
 *
 * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
 *
 * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added
 * with leaq.
 *
 * For a description see either:
 * - "Hacker's Delight" by Henry S. Warren, Jr.
 * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
 *   by Agner Fog
 *
 * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
 */
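/*
 * An illustrative C version of the per-word test (a sketch):
 *
 *	uint64_t v = *(const uint64_t *)p;
 *	if ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL)
 *		// some byte of v is zero
 *
 * The expression is non-zero iff at least one byte of v is zero; bsfq
 * then locates the lowest such byte.
 */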
ENTRY(strlen)
        PUSH_FRAME_POINTER
        movabsq $0xfefefefefefefeff,%r8
        movabsq $0x8080808080808080,%r9

        movq    %rdi,%r10
        movq    %rdi,%rcx
        testb   $7,%dil
        jz      2f

        /*
         * Handle misaligned reads: align to 8 and fill
         * the spurious bytes.
         */
        andq    $~7,%rdi
        movq    (%rdi),%r11
        shlq    $3,%rcx
        movq    $-1,%rdx
        shlq    %cl,%rdx
        notq    %rdx
        orq     %rdx,%r11

        leaq    (%r11,%r8),%rcx
        notq    %r11
        andq    %r11,%rcx
        andq    %r9,%rcx
        jnz     3f

        /*
         * Main loop.
         */
        ALIGN_TEXT
1:
        leaq    8(%rdi),%rdi
2:
        movq    (%rdi),%r11
        leaq    (%r11,%r8),%rcx
        notq    %r11
        andq    %r11,%rcx
        andq    %r9,%rcx
        jz      1b
3:
        bsfq    %rcx,%rcx
        shrq    $3,%rcx
        leaq    (%rcx,%rdi),%rax
        subq    %r10,%rax
        POP_FRAME_POINTER
        ret
END(strlen)

/*****************************************************************************/
/* copyout and fubyte family                                                 */
/*****************************************************************************/
/*
 * Access user memory from inside the kernel. These routines should be
 * the only places that do this.
 *
 * These routines set curpcb->pcb_onfault for the time they execute. When a
 * protection violation occurs inside the functions, the trap handler
 * returns to *curpcb->pcb_onfault instead of the function.
 */
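
/*
 * In C terms the protocol is roughly (a sketch):
 *
 *	pcb->pcb_onfault = copy_fault;
 *	... access user memory ...
 *	pcb->pcb_onfault = NULL;
 *	return (0);
 *
 * On a fault with pcb_onfault set, the trap handler resumes execution at
 * the recorded label, which returns EFAULT (or -1 for the fusu ops).
 */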

.macro SMAP_DISABLE smap
.if     \smap
        stac
.endif
.endm


.macro SMAP_ENABLE smap
.if     \smap
        clac
.endif
.endm

.macro COPYINOUT_BEGIN
.endm

.macro COPYINOUT_END
        movq    %rax,PCB_ONFAULT(%r11)
        POP_FRAME_POINTER
.endm
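
/*
 * COPYINOUT_END executes with %rax already zeroed by the COPYIN/COPYOUT
 * prologue (MEMMOVE preserves %rax and %r11), so the single store both
 * clears pcb_onfault and leaves the success return value intact.
 */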

.macro COPYINOUT_SMAP_END
        SMAP_ENABLE smap=1
        COPYINOUT_END
.endm

/*
 * copyout(from_kernel, to_user, len)
 *         %rdi,        %rsi,    %rdx
 */
.macro  COPYOUT smap erms
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%r11
        movq    $copy_fault,PCB_ONFAULT(%r11)

        /*
         * Check explicitly for non-user addresses.
         * First, prevent address wrapping.
         */
        movq    %rsi,%rax
        addq    %rdx,%rax
        jc      copy_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
        movq    $VM_MAXUSER_ADDRESS,%rcx
        cmpq    %rcx,%rax
        ja      copy_fault

        /*
         * Set return value to zero. Remaining failure mode goes through
         * copy_fault.
         */
        xorl    %eax,%eax

        /*
         * Set up arguments for MEMMOVE.
         */
        movq    %rdi,%r8
        movq    %rsi,%rdi
        movq    %r8,%rsi
        movq    %rdx,%rcx


        SMAP_DISABLE \smap
.if     \smap == 1
        MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_SMAP_END
.else
        MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_END
.endif
        /* NOTREACHED */
.endm

ENTRY(copyout_nosmap_std)
        COPYOUT smap=0 erms=0
END(copyout_nosmap_std)

ENTRY(copyout_smap_std)
        COPYOUT smap=1 erms=0
END(copyout_smap_std)

ENTRY(copyout_nosmap_erms)
        COPYOUT smap=0 erms=1
END(copyout_nosmap_erms)

ENTRY(copyout_smap_erms)
        COPYOUT smap=1 erms=1
END(copyout_smap_erms)

/*
 * copyin(from_user, to_kernel, len)
 *        %rdi,      %rsi,      %rdx
 */
.macro  COPYIN smap erms
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%r11
        movq    $copy_fault,PCB_ONFAULT(%r11)

        /*
         * make sure address is valid
         */
        movq    %rdi,%rax
        addq    %rdx,%rax
        jc      copy_fault
        movq    $VM_MAXUSER_ADDRESS,%rcx
        cmpq    %rcx,%rax
        ja      copy_fault

        xorl    %eax,%eax

        movq    %rdi,%r8
        movq    %rsi,%rdi
        movq    %r8,%rsi
        movq    %rdx,%rcx

        SMAP_DISABLE \smap
.if     \smap == 1
        MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_SMAP_END
.else
        MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_END
.endif
        /* NOTREACHED */
.endm

ENTRY(copyin_nosmap_std)
        COPYIN smap=0 erms=0
END(copyin_nosmap_std)

ENTRY(copyin_smap_std)
        COPYIN smap=1 erms=0
END(copyin_smap_std)

ENTRY(copyin_nosmap_erms)
        COPYIN smap=0 erms=1
END(copyin_nosmap_erms)

ENTRY(copyin_smap_erms)
        COPYIN smap=1 erms=1
END(copyin_smap_erms)

        ALIGN_TEXT
copy_fault:
        testl   $CPUID_STDEXT_SMAP,cpu_stdext_feature(%rip)
        je      1f
        clac
1:      movq    $0,PCB_ONFAULT(%r11)
        movl    $EFAULT,%eax
        POP_FRAME_POINTER
        ret
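
/*
 * copy_fault is shared by the SMAP and no-SMAP entry points, so it checks
 * at runtime whether the CPU implements SMAP before using clac to close
 * the user access window.
 */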

/*
 * casueword32.  Compare and set user integer.  Returns -1 on fault,
 *        0 if access was successful, and 1 when comparison failed.
 *        Old value is written to *oldp.
 *        dst = %rdi, old = %esi, oldp = %rdx, new = %ecx
 */
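/*
 * The 0/1 result comes from setne on the cmpxchg outcome: %cl becomes 1
 * exactly when the comparison failed, and the movzbl before returning
 * turns it into the return value.  The -1 case is produced by fusufault.
 */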
ENTRY(casueword32_nosmap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%r8
        movq    $fusufault,PCB_ONFAULT(%r8)

        movq    $VM_MAXUSER_ADDRESS-4,%rax
        cmpq    %rax,%rdi                       /* verify address is valid */
        ja      fusufault

        movl    %esi,%eax                       /* old */
        lock cmpxchgl %ecx,(%rdi)               /* new = %ecx */
        setne   %cl

        /*
         * The old value is in %eax.  If the store succeeded it will be the
         * value we expected (old) from before the store, otherwise it will
         * be the current value.  Save %eax into %esi to prepare the return
         * value.
         */
        movl    %eax,%esi
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%r8)

        /*
         * Access the oldp after the pcb_onfault is cleared, to correctly
         * catch a corrupted pointer.
         */
        movl    %esi,(%rdx)                     /* oldp = %rdx */
        POP_FRAME_POINTER
        movzbl  %cl, %eax
        ret
END(casueword32_nosmap)

ENTRY(casueword32_smap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%r8
        movq    $fusufault,PCB_ONFAULT(%r8)

        movq    $VM_MAXUSER_ADDRESS-4,%rax
        cmpq    %rax,%rdi                       /* verify address is valid */
        ja      fusufault

        movl    %esi,%eax                       /* old */
        stac
        lock cmpxchgl %ecx,(%rdi)               /* new = %ecx */
        clac
        setne   %cl

        /*
         * The old value is in %eax.  If the store succeeded it will be the
         * value we expected (old) from before the store, otherwise it will
         * be the current value.  Save %eax into %esi to prepare the return
         * value.
         */
        movl    %eax,%esi
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%r8)

        /*
         * Access the oldp after the pcb_onfault is cleared, to correctly
         * catch a corrupted pointer.
         */
        movl    %esi,(%rdx)                     /* oldp = %rdx */
        POP_FRAME_POINTER
        movzbl  %cl, %eax
        ret
END(casueword32_smap)

/*
 * casueword.  Compare and set user long.  Returns -1 on fault,
 *        0 if access was successful, and 1 when comparison failed.
 *        Old value is written to *oldp.
 *        dst = %rdi, old = %rsi, oldp = %rdx, new = %rcx
 */
ENTRY(casueword_nosmap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%r8
        movq    $fusufault,PCB_ONFAULT(%r8)

        movq    $VM_MAXUSER_ADDRESS-8,%rax
        cmpq    %rax,%rdi                       /* verify address is valid */
        ja      fusufault

        movq    %rsi,%rax                       /* old */
        lock cmpxchgq %rcx,(%rdi)               /* new = %rcx */
        setne   %cl

        /*
         * The old value is in %rax.  If the store succeeded it will be the
         * value we expected (old) from before the store, otherwise it will
         * be the current value.
         */
        movq    %rax,%rsi
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%r8)
        movq    %rsi,(%rdx)
        POP_FRAME_POINTER
        movzbl  %cl, %eax
        ret
END(casueword_nosmap)

ENTRY(casueword_smap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%r8
        movq    $fusufault,PCB_ONFAULT(%r8)

        movq    $VM_MAXUSER_ADDRESS-8,%rax
        cmpq    %rax,%rdi                       /* verify address is valid */
        ja      fusufault

        movq    %rsi,%rax                       /* old */
        stac
        lock cmpxchgq %rcx,(%rdi)               /* new = %rcx */
        clac
        setne   %cl

        /*
         * The old value is in %rax.  If the store succeeded it will be the
         * value we expected (old) from before the store, otherwise it will
         * be the current value.
         */
        movq    %rax,%rsi
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%r8)
        movq    %rsi,(%rdx)
        POP_FRAME_POINTER
        movzbl  %cl, %eax
        ret
END(casueword_smap)

/*
 * Fetch (load) a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit
 * byte from user memory.
 * addr = %rdi, valp = %rsi
 */
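
/*
 * fueword and fueword32 return a status (0 or -1) and pass the datum out
 * through *valp because for 64- and 32-bit loads every value, including
 * -1, is a legitimate result.  fuword16 and fubyte return the datum
 * directly: it is zero-extended, so it can never collide with the -1
 * fault indicator.
 */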

ENTRY(fueword_nosmap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-8,%rax
        cmpq    %rax,%rdi                       /* verify address is valid */
        ja      fusufault

        xorl    %eax,%eax
        movq    (%rdi),%r11
        movq    %rax,PCB_ONFAULT(%rcx)
        movq    %r11,(%rsi)
        POP_FRAME_POINTER
        ret
END(fueword_nosmap)

ENTRY(fueword_smap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-8,%rax
        cmpq    %rax,%rdi                       /* verify address is valid */
        ja      fusufault

        xorl    %eax,%eax
        stac
        movq    (%rdi),%r11
        clac
        movq    %rax,PCB_ONFAULT(%rcx)
        movq    %r11,(%rsi)
        POP_FRAME_POINTER
        ret
END(fueword_smap)

ENTRY(fueword32_nosmap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-4,%rax
        cmpq    %rax,%rdi                       /* verify address is valid */
        ja      fusufault

        xorl    %eax,%eax
        movl    (%rdi),%r11d
        movq    %rax,PCB_ONFAULT(%rcx)
        movl    %r11d,(%rsi)
        POP_FRAME_POINTER
        ret
END(fueword32_nosmap)

ENTRY(fueword32_smap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-4,%rax
        cmpq    %rax,%rdi                       /* verify address is valid */
        ja      fusufault

        xorl    %eax,%eax
        stac
        movl    (%rdi),%r11d
        clac
        movq    %rax,PCB_ONFAULT(%rcx)
        movl    %r11d,(%rsi)
        POP_FRAME_POINTER
        ret
END(fueword32_smap)

ENTRY(fuword16_nosmap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-2,%rax
        cmpq    %rax,%rdi
        ja      fusufault

        movzwl  (%rdi),%eax
        movq    $0,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(fuword16_nosmap)

ENTRY(fuword16_smap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-2,%rax
        cmpq    %rax,%rdi
        ja      fusufault

        stac
        movzwl  (%rdi),%eax
        clac
        movq    $0,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(fuword16_smap)

ENTRY(fubyte_nosmap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-1,%rax
        cmpq    %rax,%rdi
        ja      fusufault

        movzbl  (%rdi),%eax
        movq    $0,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(fubyte_nosmap)

ENTRY(fubyte_smap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-1,%rax
        cmpq    %rax,%rdi
        ja      fusufault

        stac
        movzbl  (%rdi),%eax
        clac
        movq    $0,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(fubyte_smap)

/*
 * Store a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit byte to
 * user memory.
 * addr = %rdi, value = %rsi
 */
ENTRY(suword_nosmap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-8,%rax
        cmpq    %rax,%rdi                       /* verify address validity */
        ja      fusufault

        movq    %rsi,(%rdi)
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(suword_nosmap)

ENTRY(suword_smap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-8,%rax
        cmpq    %rax,%rdi                       /* verify address validity */
        ja      fusufault

        stac
        movq    %rsi,(%rdi)
        clac
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(suword_smap)

ENTRY(suword32_nosmap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-4,%rax
        cmpq    %rax,%rdi                       /* verify address validity */
        ja      fusufault

        movl    %esi,(%rdi)
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(suword32_nosmap)

ENTRY(suword32_smap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-4,%rax
        cmpq    %rax,%rdi                       /* verify address validity */
        ja      fusufault

        stac
        movl    %esi,(%rdi)
        clac
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(suword32_smap)

ENTRY(suword16_nosmap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-2,%rax
        cmpq    %rax,%rdi                       /* verify address validity */
        ja      fusufault

        movw    %si,(%rdi)
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(suword16_nosmap)

ENTRY(suword16_smap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-2,%rax
        cmpq    %rax,%rdi                       /* verify address validity */
        ja      fusufault

        stac
        movw    %si,(%rdi)
        clac
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(suword16_smap)

ENTRY(subyte_nosmap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-1,%rax
        cmpq    %rax,%rdi                       /* verify address validity */
        ja      fusufault

        movl    %esi,%eax
        movb    %al,(%rdi)
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(subyte_nosmap)

ENTRY(subyte_smap)
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)

        movq    $VM_MAXUSER_ADDRESS-1,%rax
        cmpq    %rax,%rdi                       /* verify address validity */
        ja      fusufault

        movl    %esi,%eax
        stac
        movb    %al,(%rdi)
        clac
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%rcx)
        POP_FRAME_POINTER
        ret
END(subyte_smap)

        ALIGN_TEXT
fusufault:
        testl   $CPUID_STDEXT_SMAP,cpu_stdext_feature(%rip)
        je      1f
        clac
1:      movq    PCPU(CURPCB),%rcx
        xorl    %eax,%eax
        movq    %rax,PCB_ONFAULT(%rcx)
        decq    %rax
        POP_FRAME_POINTER
        ret

/*
 * copyinstr(from, to, maxlen, int *lencopied)
 *           %rdi, %rsi, %rdx, %rcx
 *
 *      copy a string from 'from' to 'to', stop when a 0 character is reached.
 *      return ENAMETOOLONG if string is longer than maxlen, and
 *      EFAULT on protection violations. If lencopied is non-zero,
 *      return the actual length in *lencopied.
 */
.macro COPYINSTR smap
        PUSH_FRAME_POINTER
        movq    %rdx,%r8                        /* %r8 = maxlen */
        movq    PCPU(CURPCB),%r9
        movq    $cpystrflt,PCB_ONFAULT(%r9)

        movq    $VM_MAXUSER_ADDRESS,%rax

        /* make sure 'from' is within bounds */
        subq    %rdi,%rax
        jbe     cpystrflt

        SMAP_DISABLE \smap

        /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
        cmpq    %rdx,%rax
        jb      8f
1:
        incq    %rdx
2:
        decq    %rdx
.if \smap == 0
        jz      copyinstr_toolong
.else
        jz      copyinstr_toolong_smap
.endif

        movb    (%rdi),%al
        movb    %al,(%rsi)
        incq    %rsi
        incq    %rdi
        testb   %al,%al
        jnz     2b

        SMAP_ENABLE \smap

        /* Success -- 0 byte reached */
        decq    %rdx
        xorl    %eax,%eax

        /* set *lencopied and return %eax */
        movq    %rax,PCB_ONFAULT(%r9)

        testq   %rcx,%rcx
        jz      3f
        subq    %rdx,%r8
        movq    %r8,(%rcx)
3:
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
8:
        movq    %rax,%rdx
        movq    %rax,%r8
        jmp 1b

.endm
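
/*
 * When maxlen exceeds the room left below VM_MAXUSER_ADDRESS, the loop
 * bound is clamped to that distance (label 8).  If the loop then runs out
 * without finding the terminator, the toolong handler distinguishes the
 * two cases: a cursor that reached VM_MAXUSER_ADDRESS means EFAULT,
 * anything else means ENAMETOOLONG.
 */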

ENTRY(copyinstr_nosmap)
        COPYINSTR smap=0
END(copyinstr_nosmap)

ENTRY(copyinstr_smap)
        COPYINSTR smap=1
END(copyinstr_smap)

cpystrflt:
        testl   $CPUID_STDEXT_SMAP,cpu_stdext_feature(%rip)
        je      1f
        clac
1:      movl    $EFAULT,%eax
cpystrflt_x:
        /* set *lencopied and return %eax */
        movq    $0,PCB_ONFAULT(%r9)

        testq   %rcx,%rcx
        jz      1f
        subq    %rdx,%r8
        movq    %r8,(%rcx)
1:
        POP_FRAME_POINTER
        ret

copyinstr_toolong_smap:
        clac
copyinstr_toolong:
        /* rdx is zero - return ENAMETOOLONG or EFAULT */
        movq    $VM_MAXUSER_ADDRESS,%rax
        cmpq    %rax,%rdi
        jae     cpystrflt
        movl    $ENAMETOOLONG,%eax
        jmp     cpystrflt_x

/*
 * Handling of special amd64 registers and descriptor tables etc
 */
/* void lgdt(struct region_descriptor *rdp); */
ENTRY(lgdt)
        /* reload the descriptor table */
        lgdt    (%rdi)

        /* flush the prefetch queue */
        jmp     1f
        nop
1:
        movl    $KDSEL,%eax
        movl    %eax,%ds
        movl    %eax,%es
        movl    %eax,%fs        /* Beware, use wrmsr to set 64 bit base */
        movl    %eax,%gs
        movl    %eax,%ss

        /* reload code selector by turning return into intersegmental return */
        popq    %rax
        pushq   $KCSEL
        pushq   %rax
        lretq
END(lgdt)

/*****************************************************************************/
/* setjump, longjump                                                         */
/*****************************************************************************/

ENTRY(setjmp)
        movq    %rbx,0(%rdi)                    /* save rbx */
        movq    %rsp,8(%rdi)                    /* save rsp */
        movq    %rbp,16(%rdi)                   /* save rbp */
        movq    %r12,24(%rdi)                   /* save r12 */
        movq    %r13,32(%rdi)                   /* save r13 */
        movq    %r14,40(%rdi)                   /* save r14 */
        movq    %r15,48(%rdi)                   /* save r15 */
        movq    0(%rsp),%rdx                    /* get rta */
        movq    %rdx,56(%rdi)                   /* save rip */
        xorl    %eax,%eax                       /* return(0); */
        ret
END(setjmp)

ENTRY(longjmp)
        movq    0(%rdi),%rbx                    /* restore rbx */
        movq    8(%rdi),%rsp                    /* restore rsp */
        movq    16(%rdi),%rbp                   /* restore rbp */
        movq    24(%rdi),%r12                   /* restore r12 */
        movq    32(%rdi),%r13                   /* restore r13 */
        movq    40(%rdi),%r14                   /* restore r14 */
        movq    48(%rdi),%r15                   /* restore r15 */
        movq    56(%rdi),%rdx                   /* get rta */
        movq    %rdx,0(%rsp)                    /* put in return frame */
        xorl    %eax,%eax                       /* return(1); */
        incl    %eax
        ret
END(longjmp)

/*
 * Support for reading MSRs in a safe manner.  (Instead of panicking
 * on #gp, return an error.)
 */
ENTRY(rdmsr_safe)
/* int rdmsr_safe(u_int msr, uint64_t *data) */
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%r8
        movq    PCB_ONFAULT(%r8),%r9
        movq    $msr_onfault,PCB_ONFAULT(%r8)
        movl    %edi,%ecx
        rdmsr                   /* Read MSR pointed to by %ecx. Returns
                                   high 32 bits in %edx, low in %eax */
        salq    $32,%rdx        /* move the high half into position */
        movl    %eax,%eax       /* zero-extend %eax -> %rax */
        orq     %rdx,%rax
        movq    %rax,(%rsi)
        movq    %r9,PCB_ONFAULT(%r8)
        xorl    %eax,%eax
        POP_FRAME_POINTER
        ret
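
/*
 * Typical use (a sketch; the MSR chosen is arbitrary):
 *
 *	uint64_t v;
 *	if (rdmsr_safe(MSR_EFER, &v) != 0)
 *		// the MSR is not implemented; the #gp was absorbed
 */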

/*
 * Support for writing MSRs in a safe manner.  (Instead of panicking
 * on #gp, return an error.)
 */
ENTRY(wrmsr_safe)
/* int wrmsr_safe(u_int msr, uint64_t data) */
        PUSH_FRAME_POINTER
        movq    PCPU(CURPCB),%r8
        movq    PCB_ONFAULT(%r8),%r9
        movq    $msr_onfault,PCB_ONFAULT(%r8)
        movl    %edi,%ecx
        movl    %esi,%eax
        sarq    $32,%rsi
        movl    %esi,%edx
        wrmsr                   /* Write MSR pointed to by %ecx. Takes
                                   high 32 bits in %edx, low in %eax. */
        movq    %r9,PCB_ONFAULT(%r8)
        xorl    %eax,%eax
        POP_FRAME_POINTER
        ret

/*
 * MSR operations fault handler
 */
        ALIGN_TEXT
msr_onfault:
        movq    %r9,PCB_ONFAULT(%r8)
        movl    $EFAULT,%eax
        POP_FRAME_POINTER
        ret

ENTRY(wrmsr_early_safe)
        movl    %edi,%ecx
        movl    %esi,%eax
        sarq    $32,%rsi
        movl    %esi,%edx
        wrmsr
        xorl    %eax,%eax
wrmsr_early_faulted:
        ret

ENTRY(wrmsr_early_safe_gp_handler)
        addq    $8,%rsp
        movl    $EFAULT,%eax
        movq    $wrmsr_early_faulted,(%rsp)
        iretq
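
/*
 * The #gp handler above discards the error code pushed by the fault and
 * redirects the trap frame's %rip to wrmsr_early_faulted, so the iretq
 * resumes wrmsr_early_safe right after the offending wrmsr, with EFAULT
 * already loaded into %eax.
 */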

/*
 * void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3);
 * Invalidates address space addressed by ucr3, then returns to kcr3.
 * Done in assembler to ensure no other memory accesses happen while
 * on ucr3.
 */
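/*
 * Interrupts must be off for the window spent on ucr3: the PTI user page
 * table maps only a small part of the kernel, so an interrupt taken there
 * could not be handled safely.  pushfq/popfq preserve the caller's
 * interrupt-enable state.
 */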
        ALIGN_TEXT
ENTRY(pmap_pti_pcid_invalidate)
        pushfq
        cli
        movq    %rdi,%cr3       /* to user page table */
        movq    %rsi,%cr3       /* back to kernel */
        popfq
        retq

/*
 * void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
 * Invalidates virtual address va in address space ucr3, then returns to kcr3.
 */
        ALIGN_TEXT
ENTRY(pmap_pti_pcid_invlpg)
        pushfq
        cli
        movq    %rdi,%cr3       /* to user page table */
        invlpg  (%rdx)
        movq    %rsi,%cr3       /* back to kernel */
        popfq
        retq

/*
 * void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
 *     vm_offset_t eva);
 * Invalidates virtual addresses between sva and eva in address space ucr3,
 * then returns to kcr3.
 */
        ALIGN_TEXT
ENTRY(pmap_pti_pcid_invlrng)
        pushfq
        cli
        movq    %rdi,%cr3       /* to user page table */
1:      invlpg  (%rdx)
        addq    $PAGE_SIZE,%rdx
        cmpq    %rdx,%rcx
        ja      1b
        movq    %rsi,%cr3       /* back to kernel */
        popfq
        retq

        .altmacro
        .macro  rsb_seq_label l
rsb_seq_\l:
        .endm
        .macro  rsb_call_label l
        call    rsb_seq_\l
        .endm
        .macro  rsb_seq count
        ll=1
        .rept   \count
        rsb_call_label  %(ll)
        nop
        rsb_seq_label %(ll)
        addq    $8,%rsp
        ll=ll+1
        .endr
        .endm
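
/*
 * Each generated call pushes a return address into the on-chip return
 * stack buffer (RSB); the matching addq discards the architectural return
 * address while the RSB entry stays behind.  Running the sequence 32
 * times refills the RSB with benign entries, preventing speculative
 * returns to attacker-primed targets.  (.altmacro is needed for the
 * %(ll) numeric expansion that generates unique labels.)
 */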

ENTRY(rsb_flush)
        rsb_seq 32
        ret
END(rsb_flush)

/* all callers already saved %rax, %rdx, and %rcx */
ENTRY(handle_ibrs_entry)
        cmpb    $0,hw_ibrs_ibpb_active(%rip)
        je      1f
        movl    $MSR_IA32_SPEC_CTRL,%ecx
        rdmsr
        orl     $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
        orl     $((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx
        wrmsr
        movb    $1,PCPU(IBPB_SET)
        testl   $CPUID_STDEXT_SMEP,cpu_stdext_feature(%rip)
        je      rsb_flush
1:      ret
END(handle_ibrs_entry)

ENTRY(handle_ibrs_exit)
        cmpb    $0,PCPU(IBPB_SET)
        je      1f
        movl    $MSR_IA32_SPEC_CTRL,%ecx
        rdmsr
        andl    $~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
        andl    $~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx
        wrmsr
        movb    $0,PCPU(IBPB_SET)
1:      ret
END(handle_ibrs_exit)

/* registers-neutral version, but needs stack */
ENTRY(handle_ibrs_exit_rs)
        cmpb    $0,PCPU(IBPB_SET)
        je      1f
        pushq   %rax
        pushq   %rdx
        pushq   %rcx
        movl    $MSR_IA32_SPEC_CTRL,%ecx
        rdmsr
        andl    $~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
        andl    $~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx
        wrmsr
        popq    %rcx
        popq    %rdx
        popq    %rax
        movb    $0,PCPU(IBPB_SET)
1:      ret
END(handle_ibrs_exit_rs)

        .noaltmacro

/*
 * Flush L1D cache.  Load enough of the data from the kernel text
 * to flush existing L1D content.
 *
 * N.B. The function does not follow ABI calling conventions; it corrupts %rbx.
 * The vmm.ko caller expects that only %rax, %rdx, %rbx, %rcx, %r9, and %rflags
 * registers are clobbered.  The NMI handler caller only needs %r13 and %r15
 * preserved.
 */
ENTRY(flush_l1d_sw)
#define L1D_FLUSH_SIZE  (64 * 1024)
        movq    $KERNBASE, %r9
        movq    $-L1D_FLUSH_SIZE, %rcx
        /*
         * pass 1: Preload TLB.
         * Kernel text is mapped using superpages.  TLB preload is
         * done for the benefit of older CPUs that split a 2M page
         * into 4k TLB entries.
         */
1:      movb    L1D_FLUSH_SIZE(%r9, %rcx), %al
        addq    $PAGE_SIZE, %rcx
        jne     1b
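        /*
         * cpuid is a serializing instruction: it keeps the read pass
         * below from starting before the TLB preload pass has completed.
         * It is also what clobbers %rbx, per the note above.
         */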
        xorl    %eax, %eax
        cpuid
        movq    $-L1D_FLUSH_SIZE, %rcx
        /* pass 2: Read each cache line. */
2:      movb    L1D_FLUSH_SIZE(%r9, %rcx), %al
        addq    $64, %rcx
        jne     2b
        lfence
        ret
#undef  L1D_FLUSH_SIZE
END(flush_l1d_sw)

ENTRY(flush_l1d_sw_abi)
        pushq   %rbx
        call    flush_l1d_sw
        popq    %rbx
        ret
END(flush_l1d_sw_abi)

ENTRY(mds_handler_void)
        retq
END(mds_handler_void)

ENTRY(mds_handler_verw)
        subq    $8, %rsp
        movw    %ds, (%rsp)
        verw    (%rsp)
        addq    $8, %rsp
        retq
END(mds_handler_verw)
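
/*
 * On CPUs with MD_CLEAR-capable microcode, verw keeps its legacy segment
 * verification semantics but additionally overwrites the affected
 * internal buffers; the handler simply feeds it a known-good selector
 * (%ds) spilled to the stack.
 */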

ENTRY(mds_handler_ivb)
        pushq   %rax
        pushq   %rdx
        pushq   %rcx

        movq    %cr0, %rax
        testb   $CR0_TS, %al
        je      1f
        clts
1:      movq    PCPU(MDS_BUF), %rdx
        movdqa  %xmm0, PCPU(MDS_TMP)
        pxor    %xmm0, %xmm0

        lfence
        orpd    (%rdx), %xmm0
        orpd    (%rdx), %xmm0
        mfence
        movl    $40, %ecx
        addq    $16, %rdx
2:      movntdq %xmm0, (%rdx)
        addq    $16, %rdx
        decl    %ecx
        jnz     2b
        mfence

        movdqa  PCPU(MDS_TMP),%xmm0
        testb   $CR0_TS, %al
        je      3f
        movq    %rax, %cr0
3:      popq    %rcx
        popq    %rdx
        popq    %rax
        retq
END(mds_handler_ivb)
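
/*
 * The mds_handler_* variants follow the software sequences Intel
 * published for the respective microarchitectures: save %xmm0 in the
 * per-CPU MDS_TMP area (the FPU context may belong to userspace),
 * temporarily clear CR0.TS if it was set so the SSE instructions do not
 * trap, then perform the prescribed mix of loads and non-temporal stores
 * over the per-CPU MDS_BUF scratch area.
 */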

ENTRY(mds_handler_bdw)
        pushq   %rax
        pushq   %rbx
        pushq   %rcx
        pushq   %rdi
        pushq   %rsi

        movq    %cr0, %rax
        testb   $CR0_TS, %al
        je      1f
        clts
1:      movq    PCPU(MDS_BUF), %rbx
        movdqa  %xmm0, PCPU(MDS_TMP)
        pxor    %xmm0, %xmm0

        movq    %rbx, %rdi
        movq    %rbx, %rsi
        movl    $40, %ecx
2:      movntdq %xmm0, (%rbx)
        addq    $16, %rbx
        decl    %ecx
        jnz     2b
        mfence
        movl    $1536, %ecx
        rep; movsb
        lfence

        movdqa  PCPU(MDS_TMP),%xmm0
        testb   $CR0_TS, %al
        je      3f
        movq    %rax, %cr0
3:      popq    %rsi
        popq    %rdi
        popq    %rcx
        popq    %rbx
        popq    %rax
        retq
END(mds_handler_bdw)

ENTRY(mds_handler_skl_sse)
        pushq   %rax
        pushq   %rdx
        pushq   %rcx
        pushq   %rdi

        movq    %cr0, %rax
        testb   $CR0_TS, %al
        je      1f
        clts
1:      movq    PCPU(MDS_BUF), %rdi
        movq    PCPU(MDS_BUF64), %rdx
        movdqa  %xmm0, PCPU(MDS_TMP)
        pxor    %xmm0, %xmm0

        lfence
        orpd    (%rdx), %xmm0
        orpd    (%rdx), %xmm0
        xorl    %eax, %eax
2:      clflushopt      5376(%rdi, %rax, 8)
        addl    $8, %eax
        cmpl    $8 * 12, %eax
        jb      2b
        sfence
        movl    $6144, %ecx
        xorl    %eax, %eax
        rep; stosb
        mfence

        movdqa  PCPU(MDS_TMP), %xmm0
        testb   $CR0_TS, %al
        je      3f
        movq    %rax, %cr0
3:      popq    %rdi
        popq    %rcx
        popq    %rdx
        popq    %rax
        retq
END(mds_handler_skl_sse)

ENTRY(mds_handler_skl_avx)
        pushq   %rax
        pushq   %rdx
        pushq   %rcx
        pushq   %rdi

        movq    %cr0, %rax
        testb   $CR0_TS, %al
        je      1f
        clts
1:      movq    PCPU(MDS_BUF), %rdi
        movq    PCPU(MDS_BUF64), %rdx
        vmovdqa %ymm0, PCPU(MDS_TMP)
        vpxor   %ymm0, %ymm0, %ymm0

        lfence
        vorpd   (%rdx), %ymm0, %ymm0
        vorpd   (%rdx), %ymm0, %ymm0
        xorl    %eax, %eax
2:      clflushopt      5376(%rdi, %rax, 8)
        addl    $8, %eax
        cmpl    $8 * 12, %eax
        jb      2b
        sfence
        movl    $6144, %ecx
        xorl    %eax, %eax
        rep; stosb
        mfence

        vmovdqa PCPU(MDS_TMP), %ymm0
        testb   $CR0_TS, %al
        je      3f
        movq    %rax, %cr0
3:      popq    %rdi
        popq    %rcx
        popq    %rdx
        popq    %rax
        retq
END(mds_handler_skl_avx)

ENTRY(mds_handler_skl_avx512)
        pushq   %rax
        pushq   %rdx
        pushq   %rcx
        pushq   %rdi

        movq    %cr0, %rax
        testb   $CR0_TS, %al
        je      1f
        clts
1:      movq    PCPU(MDS_BUF), %rdi
        movq    PCPU(MDS_BUF64), %rdx
        vmovdqa64       %zmm0, PCPU(MDS_TMP)
        vpxord  %zmm0, %zmm0, %zmm0

        lfence
        vorpd   (%rdx), %zmm0, %zmm0
        vorpd   (%rdx), %zmm0, %zmm0
        xorl    %eax, %eax
2:      clflushopt      5376(%rdi, %rax, 8)
        addl    $8, %eax
        cmpl    $8 * 12, %eax
        jb      2b
        sfence
        movl    $6144, %ecx
        xorl    %eax, %eax
        rep; stosb
        mfence

        vmovdqa64       PCPU(MDS_TMP), %zmm0
        testb   $CR0_TS, %al
        je      3f
        movq    %rax, %cr0
3:      popq    %rdi
        popq    %rcx
        popq    %rdx
        popq    %rax
        retq
END(mds_handler_skl_avx512)

ENTRY(mds_handler_silvermont)
        pushq   %rax
        pushq   %rdx
        pushq   %rcx

        movq    %cr0, %rax
        testb   $CR0_TS, %al
        je      1f
        clts
1:      movq    PCPU(MDS_BUF), %rdx
        movdqa  %xmm0, PCPU(MDS_TMP)
        pxor    %xmm0, %xmm0

        movl    $16, %ecx
2:      movntdq %xmm0, (%rdx)
        addq    $16, %rdx
        decl    %ecx
        jnz     2b
        mfence

        movdqa  PCPU(MDS_TMP),%xmm0
        testb   $CR0_TS, %al
        je      3f
        movq    %rax, %cr0
3:      popq    %rcx
        popq    %rdx
        popq    %rax
        retq
END(mds_handler_silvermont)

/*
 * Do the same as Linux and execute IRET explicitly, even though the IPI
 * return path performs one as well.
 */
ENTRY(cpu_sync_core)
/*
 * SERIALIZE can be used here once the instruction moves from the
 * 'future extensions' documents into the SDM.
 */
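	/*
	 * Build an iretq frame that reproduces what a plain ret would do:
	 * SS, the post-return %rsp (the addq of 16 compensates for the
	 * two pushes already done), %rflags, CS and the saved return
	 * %rip.  iretq is serializing, which is the point of the routine.
	 */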
        movq    (%rsp), %rdx
        movl    %ss, %eax
        pushq   %rax
        pushq   %rsp
        addq    $16, (%rsp)
        pushfq
        movl    %cs, %eax
        pushq   %rax
        pushq   %rdx
        iretq
END(cpu_sync_core)