root/usr/src/cmd/sgs/rtld/amd64/boot_elf.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2018 Joyent, Inc. All rights reserved.
 */

/*
 * Welcome to the magic behind the PLT (procedure linkage table). When rtld
 * fills out the PLT entries, it will refer initially to the functions in this
 * file. As such our goal is simple:
 *
 *     The lie of the function call must be preserved at all costs.
 *
 * This means that we need to prepare the system for an arbitrary series of
 * instructions to be called. For example, as a side effect of resolving a
 * symbol we may need to open a shared object which will cause any _init
 * functions to be called. Those functions can use any and all of the ABI state
 * that they desire (for example, the FPU registers). Therefore we must save and
 * restore all the ABI mandated registers here.
 *
 * For the full information about what we need to save and restore and why,
 * please see the System V amd64 PS ABI '3.2.3 Parameter Passing'. For general
 * purpose registers, we need to take care of the following:
 *
 *      %rax    - Used for information about the number of vector arguments
 *      %rdi    - arg0
 *      %rsi    - arg1
 *      %rdx    - arg2
 *      %rcx    - arg3
 *      %r8     - arg4
 *      %r9     - arg5
 *      %r10    - static chain pointer
 *
 * Unfortunately, the world of the FPU is more complicated.
 *
 * The ABI mandates that we must save %xmm0-%xmm7. On newer Intel processors,
 * %xmm0-%xmm7 shadow %ymm0-%ymm7 and %zmm0-%zmm7. Historically, when saving the
 * FPU, we only saved and restored these eight registers. Unfortunately, this
 * process itself ended up having side effects. Because the registers shadow one
 * another, if we saved a full %zmm register when only a %xmm register was
 * valid, we would end up causing the processor to think that the full %zmm
 * register was valid. Once it believed that this was the case, it would then
 * degrade performance of code that only used the %xmm registers.
 *
 * One way to tackle this problem would have been to use xgetbv with ecx=1 to
 * get information about what was actually in use and only save and restore
 * that. You can imagine that this logic roughly ends up as something like:
 *
 *         if (zmm_inuse)
 *              save_zmm()
 *         if (ymm_inuse)
 *              save_ymm()
 *         save_xmm()
 *
 * However, this logic leaves us at the mercy of the branch predictor. This
 * means that all of our efforts can end up still causing the CPU to execute
 * things to make it think that some of these other FPU registers are in use and
 * thus defeat the optimizations that it has.
 *
 * To deal with this problem, Intel has suggested using the xsave family of
 * instructions. The kernel provides information about the size required for the
 * floating point registers as well as which of several methods we need to
 * employ through the aux vector. This gets us out of trying to look at the
 * hardware capabilities and make decisions every time. As part of the
 * amd64-specific portion of rtld, it will process those values and determine
 * the functions on an as-needed basis.
 *
 * There are two different functions that we export. The first is elf_rtbndr().
 * This is basically the glue that gets us into the PLT and to perform
 * relocations. elf_rtbndr() determines the address of the function that we must
 * call and arranges its stack such that when we return from elf_rtbndr() we
 * will instead jump to the actual relocated function which will return to the
 * original caller. Because of this, we must preserve all of the registers that
 * are used for arguments and restore them before returning.
 *
 * The second function we export is elf_plt_trace(). This is used to add support
 * for audit libraries among other things. elf_plt_trace() may or may not call
 * the underlying function as a side effect or merely set up its return to it.
 * This changes how we handle %rax. If we call the function ourselves, then we
 * make sure that %rax holds the return value rather than the initial value. In
 * addition, because we get %r11 from the surrounding PLT code, we opt to
 * preserve it in case some of the relocation logic ever ends up calling back
 * into us again.
 */

#if     defined(lint)

#include        <sys/types.h>
#include        <_rtld.h>
#include        <_audit.h>
#include        <_elf.h>
#include        <sys/regset.h>
#include        <sys/auxv_386.h>

#else

#include        <link.h>
#include        <_audit.h>
#include        <sys/asm_linkage.h>
#include        <sys/auxv_386.h>
#include        <sys/x86_archext.h>

/*
 * XSAVE_HEADER_ZERO(scratch, loc): zero the xsave header.
 *
 * loc is a register holding the base address of the save area; the header is
 * taken to be the 64 bytes at offsets 0x200 through 0x23f from that base, and
 * they are cleared eight bytes at a time. The one user in this file passes the
 * same register that it then hands to xsave.
 *
 * scratch is a 64-bit register that is zeroed with xorq, so its previous
 * contents (and the flags) are destroyed. loc itself is not modified.
 */
#define XSAVE_HEADER_ZERO(scratch, loc) \
        xorq    scratch, scratch;       \
        movq    scratch, 0x200(loc);    \
        movq    scratch, 0x208(loc);    \
        movq    scratch, 0x210(loc);    \
        movq    scratch, 0x218(loc);    \
        movq    scratch, 0x220(loc);    \
        movq    scratch, 0x228(loc);    \
        movq    scratch, 0x230(loc);    \
        movq    scratch, 0x238(loc)


        .file   "boot_elf.s"
        .text

/*
 * This section of the code contains glue functions that are used to take care
 * of saving and restoring the FPU. We deal with this in a few different ways
 * based on the hardware support and what exists. Historically we've only saved
 * and restored the first 8 floating point registers rather than the entire FPU.
 * That implementation still exists here and is kept around mostly as an
 * insurance policy.
 */
        ENTRY(_elf_rtbndr_fp_save_orig)
        /*
         * Store vector registers 0-7 into the buffer addressed by %rdi, one
         * register per 64-byte slot (offsets 0, 64, ..., 448).
         *
         * The register width is picked from the capability set reached
         * through org_scapset: %zmm when AV_386_2_AVX512F is set in sc_hw_2,
         * otherwise %ymm when AV_386_AVX is set in sc_hw_1, otherwise %xmm.
         * All three paths use the aligned move forms, so the buffer has to be
         * aligned for the widest form that may be selected.
         *
         * Clobbers %r11, %rdx and the flags; %rdi is left untouched.
         */
        movq    org_scapset@GOTPCREL(%rip),%r11
        movq    (%r11),%r11             /* Syscapset_t pointer */
        movl    8(%r11),%edx            /* sc_hw_2 */
        testl   $AV_386_2_AVX512F,%edx
        jne     .save_zmm
        movl    (%r11),%edx             /* sc_hw_1 */
        testl   $AV_386_AVX,%edx
        jne     .save_ymm
        /* Neither AVX512F nor AVX: save the 128-bit registers only. */
        movdqa  %xmm0, (%rdi)
        movdqa  %xmm1, 64(%rdi)
        movdqa  %xmm2, 128(%rdi)
        movdqa  %xmm3, 192(%rdi)
        movdqa  %xmm4, 256(%rdi)
        movdqa  %xmm5, 320(%rdi)
        movdqa  %xmm6, 384(%rdi)
        movdqa  %xmm7, 448(%rdi)
        jmp     .save_finish

.save_ymm:
        /* AVX without AVX512F: save the 256-bit registers. */
        vmovdqa %ymm0, (%rdi)
        vmovdqa %ymm1, 64(%rdi)
        vmovdqa %ymm2, 128(%rdi)
        vmovdqa %ymm3, 192(%rdi)
        vmovdqa %ymm4, 256(%rdi)
        vmovdqa %ymm5, 320(%rdi)
        vmovdqa %ymm6, 384(%rdi)
        vmovdqa %ymm7, 448(%rdi)
        jmp     .save_finish

.save_zmm:
        /* AVX512F: save the full 512-bit registers, filling each slot. */
        vmovdqa64       %zmm0, (%rdi)
        vmovdqa64       %zmm1, 64(%rdi)
        vmovdqa64       %zmm2, 128(%rdi)
        vmovdqa64       %zmm3, 192(%rdi)
        vmovdqa64       %zmm4, 256(%rdi)
        vmovdqa64       %zmm5, 320(%rdi)
        vmovdqa64       %zmm6, 384(%rdi)
        vmovdqa64       %zmm7, 448(%rdi)

.save_finish:
        ret
        SET_SIZE(_elf_rtbndr_fp_save_orig)

        ENTRY(_elf_rtbndr_fp_restore_orig)
        /*
         * Reload vector registers 0-7 from the buffer addressed by %rdi,
         * reading one register from each 64-byte slot (offsets 0, 64, ...,
         * 448). This is the mirror image of _elf_rtbndr_fp_save_orig and
         * makes the same width decision from the same capability words, so a
         * buffer written by that routine is read back with matching widths.
         *
         * Clobbers %r11, %rdx and the flags; %rdi is left untouched.
         */
        movq    org_scapset@GOTPCREL(%rip),%r11
        movq    (%r11),%r11             /* Syscapset_t pointer */
        movl    8(%r11),%edx            /* sc_hw_2 */
        testl   $AV_386_2_AVX512F,%edx
        jne     .restore_zmm
        movl    (%r11),%edx             /* sc_hw_1 */
        testl   $AV_386_AVX,%edx
        jne     .restore_ymm

        /* Neither AVX512F nor AVX: reload the 128-bit registers only. */
        movdqa  (%rdi), %xmm0
        movdqa  64(%rdi), %xmm1
        movdqa  128(%rdi), %xmm2
        movdqa  192(%rdi), %xmm3
        movdqa  256(%rdi), %xmm4
        movdqa  320(%rdi), %xmm5
        movdqa  384(%rdi), %xmm6
        movdqa  448(%rdi), %xmm7
        jmp     .restore_finish

.restore_ymm:
        /* AVX without AVX512F: reload the 256-bit registers. */
        vmovdqa (%rdi), %ymm0
        vmovdqa 64(%rdi), %ymm1
        vmovdqa 128(%rdi), %ymm2
        vmovdqa 192(%rdi), %ymm3
        vmovdqa 256(%rdi), %ymm4
        vmovdqa 320(%rdi), %ymm5
        vmovdqa 384(%rdi), %ymm6
        vmovdqa 448(%rdi), %ymm7
        jmp     .restore_finish

.restore_zmm:
        /* AVX512F: reload the full 512-bit registers. */
        vmovdqa64       (%rdi), %zmm0
        vmovdqa64       64(%rdi), %zmm1
        vmovdqa64       128(%rdi), %zmm2
        vmovdqa64       192(%rdi), %zmm3
        vmovdqa64       256(%rdi), %zmm4
        vmovdqa64       320(%rdi), %zmm5
        vmovdqa64       384(%rdi), %zmm6
        vmovdqa64       448(%rdi), %zmm7

.restore_finish:
        ret
        SET_SIZE(_elf_rtbndr_fp_restore_orig)

        ENTRY(_elf_rtbndr_fp_fxsave)
        /*
         * Save the FPU state with a single fxsaveq into the buffer addressed
         * by %rdi. No general purpose registers are modified.
         */
        fxsaveq (%rdi)
        ret
        SET_SIZE(_elf_rtbndr_fp_fxsave)

        ENTRY(_elf_rtbndr_fp_fxrestore)
        /*
         * Reload the FPU state with a single fxrstor from the buffer addressed
         * by %rdi. No general purpose registers are modified.
         *
         * NOTE(review): the matching save routine uses the fxsaveq mnemonic
         * while this one uses fxrstor without the q suffix - confirm that the
         * difference in operand-size form is intended.
         */
        fxrstor (%rdi)
        ret
        SET_SIZE(_elf_rtbndr_fp_fxrestore)

        ENTRY(_elf_rtbndr_fp_xsave)
        /*
         * Save the FPU state with xsave into the area addressed by %rdi.
         *
         * The header at 0x200(%rdi) is zeroed first, then the feature mask
         * XFEATURE_FP_ALL is split into %edx:%eax (high:low 32 bits) for the
         * xsave instruction.
         *
         * Clobbers %rax, %rdx and the flags; %rdi is left untouched.
         */
        XSAVE_HEADER_ZERO(%rdx, %rdi)
        movq    $_CONST(XFEATURE_FP_ALL), %rdx
        movl    %edx, %eax              /* %eax = low 32 bits of the mask */
        shrq    $32, %rdx               /* %edx = high 32 bits of the mask */
        xsave   (%rdi)                  /* save data */
        ret
        SET_SIZE(_elf_rtbndr_fp_xsave)

        ENTRY(_elf_rtbndr_fp_xrestore)
        /*
         * Reload the FPU state with xrstor from the area addressed by %rdi,
         * using the same XFEATURE_FP_ALL mask, split into %edx:%eax, that the
         * xsave routine uses.
         *
         * Clobbers %rax, %rdx and the flags; %rdi is left untouched.
         */
        movq    $_CONST(XFEATURE_FP_ALL), %rdx
        movl    %edx, %eax              /* %eax = low 32 bits of the mask */
        shrq    $32, %rdx               /* %edx = high 32 bits of the mask */
        xrstor  (%rdi)                  /* restore data */
        ret
        SET_SIZE(_elf_rtbndr_fp_xrestore)

#endif

#if     defined(lint)

/*
 * lint-only placeholder for elf_plt_trace(). It is compiled only when lint is
 * defined and simply returns 0; the real routine is the assembly
 * implementation in the non-lint branch below.
 */
/* ARGSUSED0 */
int
elf_plt_trace()
{
        return (0);
}

#else

/*
 * On entry the 'glue code' has already  done the following:
 *
 *      pushq   %rbp
 *      movq    %rsp, %rbp
 *      subq    $0x10, %rsp
 *      leaq    trace_fields(%rip), %r11
 *      movq    %r11, -0x8(%rbp)
 *      movq    $elf_plt_trace, %r11
 *      jmp     *%r11
 *
 * so - -8(%rbp) contains the dyndata ptr
 *
 *      0x0     Addr            *reflmp
 *      0x8     Addr            *deflmp
 *      0x10    Word            symndx
 *      0x14    Word            sb_flags
 *      0x18    Sym             symdef.st_name
 *      0x1c                    symdef.st_info
 *      0x1d                    symdef.st_other
 *      0x1e                    symdef.st_shndx
 *      0x20                    symdef.st_value
 *      0x28                    symdef.st_size
 *
 * Also note - on entry 16 bytes have already been subtracted
 * from the %rsp.  The first 8 bytes is for the dyn_data_ptr,
 * the second 8 bytes are to align the stack and are available
 * for use.
 */
/*
 * Byte offsets into the dyndata block whose address the glue code leaves at
 * -8(%rbp); they follow the layout table above.
 */
#define REFLMP_OFF              0x0     /* Addr *reflmp */
#define DEFLMP_OFF              0x8     /* Addr *deflmp */
#define SYMNDX_OFF              0x10    /* Word symndx (4 bytes) */
#define SBFLAGS_OFF             0x14    /* Word sb_flags (4 bytes) */
#define SYMDEF_OFF              0x18    /* start of the Sym (symdef) */
#define SYMDEF_VALUE_OFF        0x20    /* symdef.st_value */

/*
 * Next, we need to create a bunch of local storage. First, we have to preserve
 * the standard registers per the amd64 ABI. This means we need to deal with:
 *      %rax    - Used for information about the number of vector arguments
 *      %rdi    - arg0
 *      %rsi    - arg1
 *      %rdx    - arg2
 *      %rcx    - arg3
 *      %r8     - arg4
 *      %r9     - arg5
 *      %r10    - static chain pointer
 *      %r11    - PLT Interwork register, our caller is using this, so it's not
 *                a temporary for us.
 *
 * In addition, we need to save the amd64 ABI floating point arguments. Finally,
 * we need to deal with our local storage. We need a La_amd64_regs and a
 * uint64_t for the previous stack size.
 *
 * To deal with this and the potentially variable size of the FPU regs, we have
 * to play a few different games. We refer to all of the standard registers, the
 * previous stack size, and La_amd64_regs structure off of %rbp. These are all
 * values that are below %rbp.
 */
/*
 * Offsets from %rbp used by elf_plt_trace. All of them are negative, i.e. the
 * slots live below the frame pointer set up by the glue code.
 */
#define SPDYNOFF        -8      /* dyndata pointer stored by the glue code */
#define SPDESTOFF       -16     /* address of the destination function */
#define SPPRVSTKOFF     -24     /* size of the duplicated caller stack; */
                                /* reused later to hold audit_pltexit's */
                                /* return value across the FPU restore */
#define SPLAREGOFF      -88     /* La_amd64_regs: 8 quadwords, -88 to -25 */
#define ORIG_RDI        -96     /* entry value of %rdi */
#define ORIG_RSI        -104    /* entry value of %rsi */
#define ORIG_RDX        -112    /* entry value of %rdx */
#define ORIG_RCX        -120    /* entry value of %rcx */
#define ORIG_R8         -128    /* entry value of %r8 */
#define ORIG_R9         -136    /* entry value of %r9 */
#define ORIG_R10        -144    /* entry value of %r10 */
#define ORIG_R11        -152    /* entry value of %r11 */
#define ORIG_RAX        -160    /* entry value of %rax */
#define PLT_SAVE_OFF    168     /* bytes elf_plt_trace subtracts from %rsp, */
                                /* on top of the 16 taken by the glue code, */
                                /* before it carves out the FPU save area */

        ENTRY(elf_plt_trace)
        /*
         * Auditing PLT entry point. We are reached by a jmp from the
         * per-symbol glue code, which has already pushed %rbp, set up the
         * frame pointer, subtracted 16 bytes from %rsp and stored the dyndata
         * pointer at SPDYNOFF(%rbp).
         *
         * Overall flow:
         *   1. Save the argument registers and the FPU state.
         *   2. Unless LA_SYMB_NOPLTENTER is set in sb_flags, call
         *      audit_pltenter() to obtain the destination address; otherwise
         *      use symdef.st_value directly.
         *   3. If AF_PLTEXIT is clear in audit_flags, or LA_SYMB_NOPLTEXIT is
         *      set in sb_flags, restore everything and 'ret' into the
         *      destination so that it returns straight to the original
         *      caller.
         *   4. Otherwise duplicate the caller's stack arguments, call the
         *      destination, hand its %rax to audit_pltexit() and return
         *      audit_pltexit()'s result to the original caller.
         */

        /*
         * Reserve our fixed-size save area before writing to it. On entry
         * %rsp is %rbp - 16, so the lowest slots (ORIG_R11, ORIG_RAX) sit more
         * than 128 bytes below %rsp; storing to them before moving %rsp would
         * leave them outside the red zone, where an asynchronous signal
         * could overwrite them.
         */
        subq    $PLT_SAVE_OFF, %rsp

        /*
         * Save our static registers. After that 64-byte align us and subtract
         * the appropriate amount for the FPU. The frame pointer has already
         * been pushed for us by the glue code.
         */
        movq    %rdi, ORIG_RDI(%rbp)
        movq    %rsi, ORIG_RSI(%rbp)
        movq    %rdx, ORIG_RDX(%rbp)
        movq    %rcx, ORIG_RCX(%rbp)
        movq    %r8, ORIG_R8(%rbp)
        movq    %r9, ORIG_R9(%rbp)
        movq    %r10, ORIG_R10(%rbp)
        movq    %r11, ORIG_R11(%rbp)
        movq    %rax, ORIG_RAX(%rbp)

        movq    _plt_save_size@GOTPCREL(%rip),%r9
        movq    _plt_fp_save@GOTPCREL(%rip),%r10
        subq    (%r9), %rsp
        andq    $-64, %rsp
        movq    %rsp, %rdi                      / %rdi = FPU save area
        call    *(%r10)

        /*
         * Now that we've saved all of our registers, figure out what we need to
         * do next.
         */
        movq    SPDYNOFF(%rbp), %rax                    / %rax = dyndata
        testb   $LA_SYMB_NOPLTENTER, SBFLAGS_OFF(%rax)  / <link.h>
        je      .start_pltenter
        movq    SYMDEF_VALUE_OFF(%rax), %rdi
        movq    %rdi, SPDESTOFF(%rbp)           / save destination address
        jmp     .end_pltenter

.start_pltenter:
        /*
         * save all registers into La_amd64_regs
         */
        leaq    SPLAREGOFF(%rbp), %rsi  / %rsi = &La_amd64_regs
        leaq    8(%rbp), %rdi
        movq    %rdi, 0(%rsi)           / la_rsp
        movq    0(%rbp), %rdi
        movq    %rdi, 8(%rsi)           / la_rbp
        movq    ORIG_RDI(%rbp), %rdi
        movq    %rdi, 16(%rsi)          / la_rdi
        movq    ORIG_RSI(%rbp), %rdi
        movq    %rdi, 24(%rsi)          / la_rsi
        movq    ORIG_RDX(%rbp), %rdi
        movq    %rdi, 32(%rsi)          / la_rdx
        movq    ORIG_RCX(%rbp), %rdi
        movq    %rdi, 40(%rsi)          / la_rcx
        movq    ORIG_R8(%rbp), %rdi
        movq    %rdi, 48(%rsi)          / la_r8
        movq    ORIG_R9(%rbp), %rdi
        movq    %rdi, 56(%rsi)          / la_r9

        /*
         * prepare for call to la_pltenter
         */
        movq    SPDYNOFF(%rbp), %r11            / %r11 = &dyndata
        leaq    SBFLAGS_OFF(%r11), %r9          / arg6 (&sb_flags)
        leaq    SPLAREGOFF(%rbp), %r8           / arg5 (&La_amd64_regs)
        movl    SYMNDX_OFF(%r11), %ecx          / arg4 (symndx)
        leaq    SYMDEF_OFF(%r11), %rdx          / arg3 (&Sym)
        movq    DEFLMP_OFF(%r11), %rsi          / arg2 (dlmp)
        movq    REFLMP_OFF(%r11), %rdi          / arg1 (rlmp)
        call    audit_pltenter@PLT
        movq    %rax, SPDESTOFF(%rbp)           / save calling address
.end_pltenter:

        /*
         * If *no* la_pltexit() routines exist
         * we do not need to keep the stack frame
         * before we call the actual routine.  Instead we
         * jump to it and remove our stack from the stack
         * at the same time.
         */
        movl    audit_flags(%rip), %eax
        andl    $AF_PLTEXIT, %eax               / value of audit.h:AF_PLTEXIT
        cmpl    $0, %eax
        je      .bypass_pltexit
        /*
         * Has the *nopltexit* flag been set for this entry point
         */
        movq    SPDYNOFF(%rbp), %r11            / %r11 = &dyndata
        testb   $LA_SYMB_NOPLTEXIT, SBFLAGS_OFF(%r11)
        je      .start_pltexit

.bypass_pltexit:
        /*
         * No PLTEXIT processing required.
         */
        movq    0(%rbp), %r11
        movq    %r11, -8(%rbp)                  / move prev %rbp
        movq    SPDESTOFF(%rbp), %r11           / r11 == calling destination
        movq    %r11, 0(%rbp)                   / store destination at top

        /* Restore FPU */
        movq    _plt_fp_restore@GOTPCREL(%rip),%r10

        movq    %rsp, %rdi                      / %rdi = FPU save area
        call    *(%r10)

        movq    ORIG_RDI(%rbp), %rdi
        movq    ORIG_RSI(%rbp), %rsi
        movq    ORIG_RDX(%rbp), %rdx
        movq    ORIG_RCX(%rbp), %rcx
        movq    ORIG_R8(%rbp), %r8
        movq    ORIG_R9(%rbp), %r9
        movq    ORIG_R10(%rbp), %r10
        movq    ORIG_R11(%rbp), %r11
        movq    ORIG_RAX(%rbp), %rax

        subq    $8, %rbp                        / adjust %rbp for 'ret'
        movq    %rbp, %rsp                      /
        /*
         * At this point, after a little doctoring, we should
         * have the following on the stack:
         *
         *      16(%rsp):  ret addr
         *      8(%rsp):  dest_addr
         *      0(%rsp):  Previous %rbp
         *
         * So - we pop the previous %rbp, and then
         * ret to our final destination.
         */
        popq    %rbp                            /
        ret                                     / jmp to final destination
                                                / and clean up stack :)

.start_pltexit:
        /*
         * In order to call the destination procedure and then return
         * to audit_pltexit() for post analysis we must first grow
         * our stack frame and then duplicate the original callers
         * stack state.  This duplicates all of the arguments
         * that were to be passed to the destination procedure.
         */
        movq    %rbp, %rdi                      /
        addq    $16, %rdi                       /    %rdi = src
        movq    (%rbp), %rdx                    /
        subq    %rdi, %rdx                      /    %rdx == prev frame sz
        /*
         * If audit_argcnt > 0 then we limit the number of
         * arguments that will be duplicated to audit_argcnt.
         *
         * If (prev_stack_size > (audit_argcnt * 8))
         *      prev_stack_size = audit_argcnt * 8;
         */
        movl    audit_argcnt(%rip),%eax         /   %eax = audit_argcnt
        cmpl    $0, %eax
        jle     .grow_stack
        leaq    (,%rax,8), %rax                 /    %rax = audit_argcnt * 8
        cmpq    %rax,%rdx
        jle     .grow_stack
        movq    %rax, %rdx
        /*
         * Grow the stack and duplicate the arguments of the
         * original caller.
         */
.grow_stack:
        movq    %rsp, %r11                      /    %r11 = FPU save area
        subq    %rdx, %rsp                      /    grow the stack
        movq    %rdx, SPPRVSTKOFF(%rbp)         /    remember dupped size
        movq    %rsp, %rcx                      /    %rcx = dest
        addq    %rcx, %rdx                      /    %rdx == tail of dest
.while_base:
        cmpq    %rdx, %rcx                      /   while (dest < tail) {
        jge     .end_while                      /
        movq    (%rdi), %rsi
        movq    %rsi,(%rcx)                     /        *dest = *src
        addq    $8, %rdi                        /        src++
        addq    $8, %rcx                        /        dest++
        jmp     .while_base                     /    }

        /*
         * The above stack is now an exact duplicate of
         * the stack of the original calling procedure.
         */
.end_while:
        /
        / Restore registers using %r11 which contains our old %rsp value
        / before growing the stack.
        /
        movq    _plt_fp_restore@GOTPCREL(%rip),%r10
        movq    %r11, %rdi
        call    *(%r10)

.trace_r2_finish:
        movq    ORIG_RDI(%rbp), %rdi
        movq    ORIG_RSI(%rbp), %rsi
        movq    ORIG_RDX(%rbp), %rdx
        movq    ORIG_RCX(%rbp), %rcx
        movq    ORIG_R8(%rbp), %r8
        movq    ORIG_R9(%rbp), %r9
        movq    ORIG_R10(%rbp), %r10
        movq    ORIG_RAX(%rbp), %rax
        movq    ORIG_R11(%rbp), %r11

        /*
         * Call to destination function - we'll return here
         * for pltexit monitoring.
         */
        call    *SPDESTOFF(%rbp)

        addq    SPPRVSTKOFF(%rbp), %rsp / cleanup dupped stack

        /
        / prepare for call to audit_pltexit()
        /
        / symndx is a 32-bit Word that is immediately followed by sb_flags, so
        / it is loaded with movl, exactly as on the pltenter path.
        /
        movq    SPDYNOFF(%rbp), %r11            / %r11 = &dyndata
        movl    SYMNDX_OFF(%r11), %r8d          / arg5 (symndx)
        leaq    SYMDEF_OFF(%r11), %rcx          / arg4 (&Sym)
        movq    DEFLMP_OFF(%r11), %rdx          / arg3 (dlmp)
        movq    REFLMP_OFF(%r11), %rsi          / arg2 (rlmp)
        movq    %rax, %rdi                      / arg1 (returnval)
        call    audit_pltexit@PLT

        /*
         * Clean up after ourselves and return to the
         * original calling procedure. Make sure to restore
         * registers.
         *
         * The value returned by audit_pltexit() is parked in the
         * SPPRVSTKOFF slot, which is no longer needed for the stack size, so
         * that it survives the FPU restore call and ends up in %rax.
         *
         * NOTE(review): the FPU restore below reloads the state captured on
         * entry, which replaces whatever vector register contents the
         * destination function left behind - confirm that this is intended
         * for functions that return values in vector registers.
         */

        movq    _plt_fp_restore@GOTPCREL(%rip),%r10
        movq    %rsp, %rdi                      / %rdi = FPU save area
        movq    %rax, SPPRVSTKOFF(%rbp)
        call    *(%r10)

        movq    ORIG_RDI(%rbp), %rdi
        movq    ORIG_RSI(%rbp), %rsi
        movq    ORIG_RDX(%rbp), %rdx
        movq    ORIG_RCX(%rbp), %rcx
        movq    ORIG_R8(%rbp), %r8
        movq    ORIG_R9(%rbp), %r9
        movq    ORIG_R10(%rbp), %r10
        movq    ORIG_R11(%rbp), %r11
        movq    SPPRVSTKOFF(%rbp), %rax

        movq    %rbp, %rsp                      /
        popq    %rbp                            /
        ret                                     / return to caller
        SET_SIZE(elf_plt_trace)
#endif

/*
 * We got here because a call to a function resolved to a procedure
 * linkage table entry.  That entry did a jmp to the first PLT entry (.PLT0),
 * which in turn did an indirect jmp through GOT[2] to elf_rtbndr.
 *
 * the code sequence that got us here was:
 *
 * .PLT0:
 *      pushq   GOT+8(%rip)     #GOT[1]
 *      jmp     *GOT+16(%rip)   #GOT[2]
 *      nop
 *      nop
 *      nop
 *      nop
 *      ...
 * PLT entry for foo:
 *      jmp     *name1@GOTPCREL(%rip)
 *      pushl   $rel.plt.foo
 *      jmp     PLT0
 *
 * At entry, the stack looks like this:
 *
 *      return address                  16(%rsp)
 *      $rel.plt.foo    (plt index)     8(%rsp)
 *      lmp                             0(%rsp)
 *
 */
#if defined(lint)

/*
 * lint-only placeholder for elf_rtbndr(). It is compiled only when lint is
 * defined and just forwards its three arguments to elf_bndr(), discarding the
 * result; the real routine is the assembly implementation in the non-lint
 * branch below.
 */
extern unsigned long    elf_bndr(Rt_map *, unsigned long, caddr_t);

void
elf_rtbndr(Rt_map * lmp, unsigned long reloc, caddr_t pc)
{
        (void) elf_bndr(lmp, reloc, pc);
}

#else

/*
 * The PLT code that landed us here placed 2 arguments on the stack as
 * arguments to elf_rtbndr.
 * Additionally the pc of caller is below these 2 args.
 * Our stack will look like this after we establish a stack frame with
 * push %rbp; movq %rsp, %rbp sequence:
 *
 *      8(%rbp)                 arg1 - *lmp
 *      16(%rbp), %rsi          arg2 - reloc index
 *      24(%rbp), %rdx          arg3 - pc of caller
 */
/* Offsets from %rbp, valid once elf_rtbndr has established its frame. */
#define LBPLMPOFF       8       /* arg1 - *lmp */
#define LBPRELOCOFF     16      /* arg2 - reloc index; later overwritten */
                                /* with the resolved destination address */
#define LBRPCOFF        24      /* arg3 - pc of caller */

/*
 * With the above in place, we must now proceed to preserve all temporary
 * registers that are also used for passing arguments. Specifically this
 * means:
 *
 *      %rax    - Used for information about the number of vector arguments
 *      %rdi    - arg0
 *      %rsi    - arg1
 *      %rdx    - arg2
 *      %rcx    - arg3
 *      %r8     - arg4
 *      %r9     - arg5
 *      %r10    - static chain pointer
 *
 * While we don't have to preserve %r11, we do have to preserve the FPU
 * registers. The FPU logic is delegated to a specific function that we'll call.
 * However, it requires that its stack is 64-byte aligned. We defer the
 * alignment to that point. This will also take care of the fact that a caller
 * may not call us with a correctly aligned stack pointer per the amd64 ABI.
 */

        .extern _plt_save_size
        .extern _plt_fp_save
        .extern _plt_fp_restore

        .weak   _elf_rtbndr
        _elf_rtbndr = elf_rtbndr

        ENTRY(elf_rtbndr)
        /*
         * Lazy binding entry point, reached from .PLT0 with lmp and the
         * relocation index pushed above the caller's return address.
         *
         * We preserve every register that can carry an argument plus the FPU
         * state, call elf_bndr(lmp, reloc index, caller pc) to resolve the
         * symbol, overwrite the relocation index slot with the address it
         * returns, restore everything and then 'ret' through that slot. The
         * resolved function therefore starts with the caller's registers and
         * returns directly to the original caller.
         *
         * %r12 is pushed and then used to remember %rsp from before the FPU
         * save area is carved out, so that the pops below find the saved
         * registers again.
         */
        pushq   %rbp            /* Establish stack frame */
        movq    %rsp, %rbp

        /*
         * Save basic regs.
         */
        pushq   %rax
        pushq   %rdi
        pushq   %rsi
        pushq   %rdx
        pushq   %rcx
        pushq   %r8
        pushq   %r9
        pushq   %r10
        pushq   %r12

        /*
         * Save the amount of space we need for the FPU registers and call that
         * function. Save %rsp before we manipulate it to make restore easier.
         */
        movq    %rsp, %r12
        movq    _plt_save_size@GOTPCREL(%rip),%r9
        movq    _plt_fp_save@GOTPCREL(%rip),%r10
        subq    (%r9), %rsp
        andq    $-64, %rsp

        movq    %rsp, %rdi      /* %rdi = 64-byte aligned FPU save area */
        call    *(%r10)

        /*
         * Perform actual PLT logic. Note that the plt related arguments are
         * located at an offset relative to %rbp.
         */
        movq    LBPLMPOFF(%rbp), %rdi   /* arg1 - *lmp */
        movq    LBPRELOCOFF(%rbp), %rsi /* arg2 - reloc index */
        movq    LBRPCOFF(%rbp), %rdx    /* arg3 - pc of caller */
        call    elf_bndr@PLT            /* call elf_bndr(lmp, relndx, pc) */
        movq    %rax, LBPRELOCOFF(%rbp) /* store final destination */

        /* Restore FPU */
        movq    _plt_fp_restore@GOTPCREL(%rip),%r10

        movq    %rsp, %rdi      /* %rsp still addresses the FPU save area */
        call    *(%r10)

        movq    %r12, %rsp      /* back to the saved general registers */
        popq    %r12
        popq    %r10
        popq    %r9
        popq    %r8
        popq    %rcx
        popq    %rdx
        popq    %rsi
        popq    %rdi
        popq    %rax

        movq    %rbp, %rsp      /* Restore our stack frame */
        popq    %rbp

        addq    $8, %rsp        /* pop 1st plt-pushed args */
                                /* the second argument is used */
                                /* for the 'return' address to our */
                                /* final destination */

        ret                     /* invoke resolved function */

        SET_SIZE(elf_rtbndr)
#endif