/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  linux/arch/arm/boot/compressed/head.S
 *
 *  Copyright (C) 1996-2002 Russell King
 *  Copyright (C) 2004 Hyok S. Choi (MPU support)
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/v7m.h>

#include "efi-header.S"

#ifdef __ARMEB__
#define OF_DT_MAGIC 0xd00dfeed
#else
#define OF_DT_MAGIC 0xedfe0dd0
#endif
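
/*
 * Note (illustrative): the FDT magic 0xd00dfeed is stored big-endian
 * in the DTB header, so a native 32-bit load sees 0xd00dfeed on
 * big-endian CPUs but the byte-swapped 0xedfe0dd0 on little-endian
 * ones; the #ifdef above picks the value a plain word load observes.
 */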

 AR_CLASS(      .arch   armv7-a )
 M_CLASS(       .arch   armv7-m )

/*
 * Debugging stuff
 *
 * Note that these macros must not contain any code which is not
 * 100% relocatable.  Any attempt to do so will result in a crash.
 * Please select one of the following when turning on debugging.
 */
#ifdef DEBUG

#if defined(CONFIG_DEBUG_ICEDCC)

#if defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_V6K) || defined(CONFIG_CPU_V7)
                .macro  loadsp, rb, tmp1, tmp2
                .endm
                .macro  writeb, ch, rb, tmp
                mcr     p14, 0, \ch, c0, c5, 0
                .endm
#elif defined(CONFIG_CPU_XSCALE)
                .macro  loadsp, rb, tmp1, tmp2
                .endm
                .macro  writeb, ch, rb, tmp
                mcr     p14, 0, \ch, c8, c0, 0
                .endm
#else
                .macro  loadsp, rb, tmp1, tmp2
                .endm
                .macro  writeb, ch, rb, tmp
                mcr     p14, 0, \ch, c1, c0, 0
                .endm
#endif

#else

#include CONFIG_DEBUG_LL_INCLUDE

                .macro  writeb, ch, rb, tmp
#ifdef CONFIG_DEBUG_UART_FLOW_CONTROL
                waituartcts \tmp, \rb
#endif
                waituarttxrdy \tmp, \rb
                senduart \ch, \rb
                busyuart \tmp, \rb
                .endm

#if defined(CONFIG_ARCH_SA1100)
                .macro  loadsp, rb, tmp1, tmp2
                mov     \rb, #0x80000000        @ physical base address
                add     \rb, \rb, #0x00010000   @ Ser1
                .endm
#else
                .macro  loadsp, rb, tmp1, tmp2
                addruart \rb, \tmp1, \tmp2
                .endm
#endif
#endif
#endif

                .macro  kputc,val
                mov     r0, \val
                bl      putc
                .endm

                .macro  kphex,val,len
                mov     r0, \val
                mov     r1, #\len
                bl      phex
                .endm

                /*
                 * Debug kernel copy by printing the memory addresses involved
                 */
                .macro dbgkc, begin, end, cbegin, cend
#ifdef DEBUG
                kputc   #'C'
                kputc   #':'
                kputc   #'0'
                kputc   #'x'
                kphex   \begin, 8       /* Start of compressed kernel */
                kputc   #'-'
                kputc   #'0'
                kputc   #'x'
                kphex   \end, 8         /* End of compressed kernel */
                kputc   #'-'
                kputc   #'>'
                kputc   #'0'
                kputc   #'x'
                kphex   \cbegin, 8      /* Start of kernel copy */
                kputc   #'-'
                kputc   #'0'
                kputc   #'x'
                kphex   \cend, 8        /* End of kernel copy */
                kputc   #'\n'
#endif
                .endm
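
                /*
                 * Illustrative output (hypothetical addresses):
                 *   C:0x60A00000-0x60C80000->0x61000000-0x61280000
                 */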

                /*
                 * Debug print of the final appended DTB location
                 */
                .macro dbgadtb, begin, size
#ifdef DEBUG
                kputc   #'D'
                kputc   #'T'
                kputc   #'B'
                kputc   #':'
                kputc   #'0'
                kputc   #'x'
                kphex   \begin, 8       /* Start of appended DTB */
                kputc   #' '
                kputc   #'('
                kputc   #'0'
                kputc   #'x'
                kphex   \size, 8        /* Size of appended DTB */
                kputc   #')'
                kputc   #'\n'
#endif
                .endm

                .macro  enable_cp15_barriers, reg
                mrc     p15, 0, \reg, c1, c0, 0 @ read SCTLR
                tst     \reg, #(1 << 5)         @ CP15BEN bit set?
                bne     .L_\@
                orr     \reg, \reg, #(1 << 5)   @ CP15 barrier instructions
                mcr     p15, 0, \reg, c1, c0, 0 @ write SCTLR
 ARM(           .inst   0xf57ff06f              @ v7+ isb       )
 THUMB(         isb                                             )
.L_\@:
                .endm
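
                /*
                 * Note: SCTLR bit 5 (CP15BEN) gates the legacy CP15
                 * barrier encodings (e.g. "mcr p15, 0, rX, c7, c10, 4"
                 * as DSB) used throughout this file; on cores where
                 * firmware may have cleared it, the macro above turns
                 * it back on before any such barrier is issued.
                 */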

                /*
                 * The kernel build system appends the size of the
                 * decompressed kernel at the end of the compressed data
                 * in little-endian form.
                 */
                .macro  get_inflated_image_size, res:req, tmp1:req, tmp2:req
                adr     \res, .Linflated_image_size_offset
                ldr     \tmp1, [\res]
                add     \tmp1, \tmp1, \res      @ address of inflated image size

                ldrb    \res, [\tmp1]           @ get_unaligned_le32
                ldrb    \tmp2, [\tmp1, #1]
                orr     \res, \res, \tmp2, lsl #8
                ldrb    \tmp2, [\tmp1, #2]
                ldrb    \tmp1, [\tmp1, #3]
                orr     \res, \res, \tmp2, lsl #16
                orr     \res, \res, \tmp1, lsl #24
                .endm
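
                /*
                 * Worked example (illustrative): trailing bytes
                 * 00 40 2e 01 decode above as
                 * 0x00 | 0x40 << 8 | 0x2e << 16 | 0x01 << 24 =
                 * 0x012e4000, i.e. an inflated image of ~19 MB.
                 */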

                .macro  be32tocpu, val, tmp
#ifndef __ARMEB__
                /* convert to little endian */
                rev_l   \val, \tmp
#endif
                .endm

                .section ".start", "ax"
/*
 * sort out different calling conventions
 */
                .align
                /*
                 * Always enter in ARM state for CPUs that support the ARM ISA.
                 * As of today (2014) that's exactly the members of the A and R
                 * classes.
                 */
 AR_CLASS(      .arm    )
start:
                .type   start,#function
                /*
                 * These 7 nops along with the 1 nop immediately below for
                 * !THUMB2 form 8 nops that make the compressed kernel bootable
                 * on legacy ARM systems that assumed the kernel to be in
                 * a.out binary format. The boot loaders on these systems
                 * would jump 32 bytes into the image to skip the a.out
                 * header. With these 8 nops filling exactly 32 bytes, things
                 * still work as expected on these legacy systems. Thumb2
                 * mode keeps 7 of the nops, as it turns out that some boot
                 * loaders were patching the initial instructions of the
                 * kernel, i.e. had started to exploit this "patch area".
                 */
                __initial_nops
                .rept   5
                __nop
                .endr
#ifndef CONFIG_THUMB2_KERNEL
                __nop
#else
 AR_CLASS(      sub     pc, pc, #3      )       @ A/R: switch to Thumb2 mode
  M_CLASS(      nop.w                   )       @ M: already in Thumb2 mode
                .thumb
#endif
                W(b)    1f

                .word   _magic_sig      @ Magic numbers to help the loader
                .word   _magic_start    @ absolute load/run zImage address
                .word   _magic_end      @ zImage end address
                .word   0x04030201      @ endianness flag
                .word   0x45454545      @ another magic number to indicate
                .word   _magic_table    @ additional data table

                __EFI_HEADER
1:
 ARM_BE8(       setend  be              )       @ go BE8 if compiled for BE8
 AR_CLASS(      mrs     r9, cpsr        )
#ifdef CONFIG_ARM_VIRT_EXT
                bl      __hyp_stub_install      @ get into SVC mode, reversibly
#endif
                mov     r7, r1                  @ save architecture ID
                mov     r8, r2                  @ save atags pointer

#ifndef CONFIG_CPU_V7M
                /*
                 * Booting from Angel - need to enter SVC mode and disable
                 * FIQs/IRQs (numeric definitions from angel arm.h source).
                 * We only do this if we were in user mode on entry.
                 */
                mrs     r2, cpsr                @ get current mode
                tst     r2, #3                  @ not user?
                bne     not_angel
                mov     r0, #0x17               @ angel_SWIreason_EnterSVC
 ARM(           swi     0x123456        )       @ angel_SWI_ARM
 THUMB(         svc     0xab            )       @ angel_SWI_THUMB
not_angel:
                safe_svcmode_maskall r0
                msr     spsr_cxsf, r9           @ Save the CPU boot mode in
                                                @ SPSR
#endif
                /*
                 * Note that some cache flushing and other stuff may
                 * be needed here - is there an Angel SWI call for this?
                 */

                /*
                 * some architecture specific code can be inserted
                 * by the linker here, but it should preserve r7, r8, and r9.
                 */

                .text

#ifdef CONFIG_AUTO_ZRELADDR
                /*
                 * Find the start of physical memory.  As we are executing
                 * without the MMU on, we are in the physical address space.
                 * We just need to get rid of any offset by aligning the
                 * address.
                 *
                 * This alignment is a balance between the requirements of
                 * different platforms - we have chosen 128MB to allow
                 * platforms which align the start of their physical memory
                 * to 128MB to use this feature, while allowing the zImage
                 * to be placed within the first 128MB of memory on other
                 * platforms.  Increasing the alignment means we place
                 * stricter alignment requirements on the start of physical
                 * memory, but relaxing it means that we break people who
                 * are already placing their zImage in (eg) the top 64MB
                 * of this range.
                 */
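                /*
                 * For example (illustrative): a zImage executing at
                 * 0x60008040 yields r0 = 0x60008040 & 0xf8000000 =
                 * 0x60000000 below, which is then taken as the start
                 * of physical memory.
                 */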
                mov     r0, pc
                and     r0, r0, #0xf8000000
#ifdef CONFIG_USE_OF
                adr     r1, LC1
#ifdef CONFIG_ARM_APPENDED_DTB
                /*
                 * Look for an appended DTB.  If found, we cannot use it to
                 * validate the calculated start of physical memory, as its
                 * memory nodes may need to be augmented by ATAGS stored at
                 * an offset from the same start of physical memory.
                 */
                ldr     r2, [r1, #4]    @ get &_edata
                add     r2, r2, r1      @ relocate it
                ldr     r2, [r2]        @ get DTB signature
                ldr     r3, =OF_DT_MAGIC
                cmp     r2, r3          @ do we have a DTB there?
                beq     1f              @ if yes, skip validation
#endif /* CONFIG_ARM_APPENDED_DTB */

                /*
                 * Make sure we have some stack before calling C code.
                 * No GOT fixup has occurred yet, but none of the code we're
                 * about to call uses any global variables.
                 */
                ldr     sp, [r1]        @ get stack location
                add     sp, sp, r1      @ apply relocation

                /* Validate calculated start against passed DTB */
                mov     r1, r8
                bl      fdt_check_mem_start
1:
#endif /* CONFIG_USE_OF */
                /* Determine final kernel image address. */
                add     r4, r0, #TEXT_OFFSET
#else
                ldr     r4, =zreladdr
#endif

                /*
                 * Set up a page table only if it won't overwrite ourselves.
                 * That means r4 < pc || r4 - 16k page directory > &_end.
                 * Given that r4 > &_end is quite infrequent, we add a rough
                 * additional 1MB of room for a possible appended DTB.
                 */
                mov     r0, pc
                cmp     r0, r4
                ldrcc   r0, .Lheadroom
                addcc   r0, r0, pc
                cmpcc   r4, r0
                orrcc   r4, r4, #1              @ remember we skipped cache_on
                blcs    cache_on
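
                /*
                 * Summary of the predication above (illustrative):
                 * cache_on is skipped, and the LSB of r4 set, only when
                 * pc < r4 < pc + .Lheadroom, i.e. when building the
                 * page directory 16k below r4 could corrupt this image
                 * (with ~1MB of slack for an appended DTB); in every
                 * other case blcs takes the cache_on path.
                 */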

restart:        adr     r0, LC1
                ldr     sp, [r0]
                ldr     r6, [r0, #4]
                add     sp, sp, r0
                add     r6, r6, r0

                get_inflated_image_size r9, r10, lr

#ifndef CONFIG_ZBOOT_ROM
                /* malloc space is above the relocated stack (64k max) */
                add     r10, sp, #MALLOC_SIZE
#else
                /*
                 * With ZBOOT_ROM the bss/stack is non-relocatable,
                 * but someone could still run this code from RAM,
                 * in which case our reference is _edata.
                 */
                mov     r10, r6
#endif

                mov     r5, #0                  @ init dtb size to 0
#ifdef CONFIG_ARM_APPENDED_DTB
/*
 *   r4  = final kernel address (possibly with LSB set)
 *   r5  = appended dtb size (still unknown)
 *   r6  = _edata
 *   r7  = architecture ID
 *   r8  = atags/device tree pointer
 *   r9  = size of decompressed image
 *   r10 = end of this image, including bss/stack/malloc space if non-XIP
 *   sp  = stack pointer
 *
 * if there are device trees (dtb) appended to zImage, advance r10 so that the
 * dtb data will get relocated along with the kernel if necessary.
 */

                ldr     lr, [r6, #0]
                ldr     r1, =OF_DT_MAGIC
                cmp     lr, r1
                bne     dtb_check_done          @ not found

#ifdef CONFIG_ARM_ATAG_DTB_COMPAT
                /*
                 * OK... Let's do some funky business here.
                 * If we do have a DTB appended to zImage, and we do have
                 * an ATAG list around, we want the latter to be translated
                 * and folded into the former here. No GOT fixup has occurred
                 * yet, but none of the code we're about to call uses any
                 * global variables.
                 */

                /* Get the initial DTB size */
                ldr     r5, [r6, #4]
                be32tocpu r5, r1
                dbgadtb r6, r5
                /* 50% DTB growth should be good enough */
                add     r5, r5, r5, lsr #1
                /* preserve 64-bit alignment */
                add     r5, r5, #7
                bic     r5, r5, #7
                /* clamp to 32KB min and 1MB max */
                cmp     r5, #(1 << 15)
                movlo   r5, #(1 << 15)
                cmp     r5, #(1 << 20)
                movhi   r5, #(1 << 20)
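                /*
                 * Worked example (illustrative): a 12 KB DTB grows by
                 * 50% to 18 KB (0x4800), stays 8-byte aligned, and is
                 * then clamped up to the 32 KB (0x8000) minimum.
                 */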
                /* temporarily relocate the stack past the DTB work space */
                add     sp, sp, r5

                mov     r0, r8
                mov     r1, r6
                mov     r2, r5
                bl      atags_to_fdt

                /*
                 * If the returned value is 1, there is no ATAG at the
                 * location pointed to by r8.  Try the typical 0x100
                 * offset from the start of RAM and hope for the best.
                 */
                cmp     r0, #1
                sub     r0, r4, #TEXT_OFFSET
                bic     r0, r0, #1
                add     r0, r0, #0x100
                mov     r1, r6
                mov     r2, r5
                bleq    atags_to_fdt

                sub     sp, sp, r5
#endif

                mov     r8, r6                  @ use the appended device tree

                /*
                 * Make sure that the DTB doesn't end up in the final
                 * kernel's .bss area. To do so, we adjust the decompressed
                 * kernel size to compensate if that .bss size is larger
                 * than the relocated code.
                 */
                ldr     r5, =_kernel_bss_size
                adr     r1, wont_overwrite
                sub     r1, r6, r1
                subs    r1, r5, r1
                addhi   r9, r9, r1

                /* Get the current DTB size */
                ldr     r5, [r6, #4]
                be32tocpu r5, r1

                /* preserve 64-bit alignment */
                add     r5, r5, #7
                bic     r5, r5, #7

                /* relocate some pointers past the appended dtb */
                add     r6, r6, r5
                add     r10, r10, r5
                add     sp, sp, r5
dtb_check_done:
#endif

/*
 * Check to see if we will overwrite ourselves.
 *   r4  = final kernel address (possibly with LSB set)
 *   r9  = size of decompressed image
 *   r10 = end of this image, including bss/stack/malloc space if non-XIP
 * We basically want:
 *   r4 - 16k page directory >= r10 -> OK
 *   r4 + image length <= address of wont_overwrite -> OK
 * Note: the possible LSB in r4 is harmless here.
 */
                add     r10, r10, #16384
                cmp     r4, r10
                bhs     wont_overwrite
                add     r10, r4, r9
                adr     r9, wont_overwrite
                cmp     r10, r9
                bls     wont_overwrite

/*
 * Relocate ourselves past the end of the decompressed kernel.
 *   r6  = _edata
 *   r10 = end of the decompressed kernel
 * Because we always copy ahead, we need to do it from the end and go
 * backward in case the source and destination overlap.
 */
                /*
                 * Bump to the next 256-byte boundary with the size of
                 * the relocation code added. This avoids overwriting
                 * ourself when the offset is small.
                 */
                add     r10, r10, #((reloc_code_end - restart + 256) & ~255)
                bic     r10, r10, #255

                /* Get start of code we want to copy and align it down. */
                adr     r5, restart
                bic     r5, r5, #31

/* Relocate the hyp vector base if necessary */
#ifdef CONFIG_ARM_VIRT_EXT
                mrs     r0, spsr
                and     r0, r0, #MODE_MASK
                cmp     r0, #HYP_MODE
                bne     1f

                /*
                 * Compute the address of the hyp vectors after relocation.
                 * Call __hyp_set_vectors with the new address so that we
                 * can HVC again after the copy.
                 */
                adr_l   r0, __hyp_stub_vectors
                sub     r0, r0, r5
                add     r0, r0, r10
                bl      __hyp_set_vectors
1:
#endif

                sub     r9, r6, r5              @ size to copy
                add     r9, r9, #31             @ rounded up to a multiple
                bic     r9, r9, #31             @ ... of 32 bytes
                add     r6, r9, r5
                add     r9, r9, r10

#ifdef DEBUG
                sub     r10, r6, r5
                sub     r10, r9, r10
                /*
                 * We are about to copy the kernel to a new memory area.
                 * The boundaries of the new memory area can be found in
                 * r10 and r9, whilst r5 and r6 contain the boundaries
                 * of the memory we are going to copy.
                 * Calling dbgkc will help with the printing of this
                 * information.
                 */
                dbgkc   r5, r6, r10, r9
#endif

1:              ldmdb   r6!, {r0 - r3, r10 - r12, lr}
                cmp     r6, r5
                stmdb   r9!, {r0 - r3, r10 - r12, lr}
                bhi     1b

                /* Preserve offset to relocated code. */
                sub     r6, r9, r6

                mov     r0, r9                  @ start of relocated zImage
                add     r1, sp, r6              @ end of relocated zImage
                bl      cache_clean_flush

                badr    r0, restart
                add     r0, r0, r6
                mov     pc, r0

wont_overwrite:
                adr     r0, LC0
                ldmia   r0, {r1, r2, r3, r11, r12}
                sub     r0, r0, r1              @ calculate the delta offset

/*
 * If delta is zero, we are running at the address we were linked at.
 *   r0  = delta
 *   r2  = BSS start
 *   r3  = BSS end
 *   r4  = kernel execution address (possibly with LSB set)
 *   r5  = appended dtb size (0 if not present)
 *   r7  = architecture ID
 *   r8  = atags pointer
 *   r11 = GOT start
 *   r12 = GOT end
 *   sp  = stack pointer
 */
                orrs    r1, r0, r5
                beq     not_relocated

                add     r11, r11, r0
                add     r12, r12, r0

#ifndef CONFIG_ZBOOT_ROM
                /*
                 * If we're running fully PIC (i.e. CONFIG_ZBOOT_ROM = n),
                 * we need to fix up pointers into the BSS region.
                 * Note that the stack pointer has already been fixed up.
                 */
                add     r2, r2, r0
                add     r3, r3, r0

                /*
                 * Relocate all entries in the GOT table.
                 * Bump bss entries to _edata + dtb size
                 */
1:              ldr     r1, [r11, #0]           @ relocate entries in the GOT
                add     r1, r1, r0              @ This fixes up C references
                cmp     r1, r2                  @ if entry >= bss_start &&
                cmphs   r3, r1                  @       bss_end > entry
                addhi   r1, r1, r5              @    entry += dtb size
                str     r1, [r11], #4           @ next entry
                cmp     r11, r12
                blo     1b

                /* bump our bss pointers too */
                add     r2, r2, r5
                add     r3, r3, r5

#else

                /*
                 * Relocate entries in the GOT table.  We only relocate
                 * the entries that are outside the (relocated) BSS region.
                 */
1:              ldr     r1, [r11, #0]           @ relocate entries in the GOT
                cmp     r1, r2                  @ entry < bss_start ||
                cmphs   r3, r1                  @ _end < entry
                addlo   r1, r1, r0              @ table.  This fixes up the
                str     r1, [r11], #4           @ C references.
                cmp     r11, r12
                blo     1b
#endif

not_relocated:  mov     r0, #0
1:              str     r0, [r2], #4            @ clear bss
                str     r0, [r2], #4
                str     r0, [r2], #4
                str     r0, [r2], #4
                cmp     r2, r3
                blo     1b

                /*
                 * Did we skip the cache setup earlier?
                 * That is indicated by the LSB in r4.
                 * Do it now if so.
                 */
                tst     r4, #1
                bic     r4, r4, #1
                blne    cache_on

/*
 * The C runtime environment should now be set up sufficiently.
 * Set up some pointers, and start decompressing.
 *   r4  = kernel execution address
 *   r7  = architecture ID
 *   r8  = atags pointer
 */
                mov     r0, r4
                mov     r1, sp                  @ malloc space above stack
                add     r2, sp, #MALLOC_SIZE    @ 64k max
                mov     r3, r7
                bl      decompress_kernel

                get_inflated_image_size r1, r2, r3

                mov     r0, r4                  @ start of inflated image
                add     r1, r1, r0              @ end of inflated image
                bl      cache_clean_flush
                bl      cache_off

#ifdef CONFIG_ARM_VIRT_EXT
                mrs     r0, spsr                @ Get saved CPU boot mode
                and     r0, r0, #MODE_MASK
                cmp     r0, #HYP_MODE           @ if not booted in HYP mode...
                bne     __enter_kernel          @ boot kernel directly

                adr_l   r0, __hyp_reentry_vectors
                bl      __hyp_set_vectors
                __HVC(0)                        @ otherwise bounce to hyp mode

                b       .                       @ should never be reached
#else
                b       __enter_kernel
#endif

                .align  2
                .type   LC0, #object
LC0:            .word   LC0                     @ r1
                .word   __bss_start             @ r2
                .word   _end                    @ r3
                .word   _got_start              @ r11
                .word   _got_end                @ ip
                .size   LC0, . - LC0

                .type   LC1, #object
LC1:            .word   .L_user_stack_end - LC1 @ sp
                .word   _edata - LC1            @ r6
                .size   LC1, . - LC1

.Lheadroom:
                .word   _end - restart + 16384 + 1024*1024

.Linflated_image_size_offset:
                .long   (input_data_end - 4) - .
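
/*
 * The word above holds the distance from itself to the last four bytes
 * of the compressed payload, where the build system stores the
 * little-endian inflated image size; get_inflated_image_size adds the
 * word's own run-time address to recover an absolute pointer, keeping
 * this code fully position-independent.
 */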

#ifdef CONFIG_ARCH_RPC
                .globl  params
params:         ldr     r0, =0x10000100         @ params_phys for RPC
                mov     pc, lr
                .ltorg
                .align
#endif

/*
 * dcache_line_size - get the minimum D-cache line size from the CTR register
 * on ARMv7.
 */
                .macro  dcache_line_size, reg, tmp
#ifdef CONFIG_CPU_V7M
                movw    \tmp, #:lower16:BASEADDR_V7M_SCB + V7M_SCB_CTR
                movt    \tmp, #:upper16:BASEADDR_V7M_SCB + V7M_SCB_CTR
                ldr     \tmp, [\tmp]
#else
                mrc     p15, 0, \tmp, c0, c0, 1         @ read ctr
#endif
                lsr     \tmp, \tmp, #16
                and     \tmp, \tmp, #0xf                @ cache line size encoding
                mov     \reg, #4                        @ bytes per word
                mov     \reg, \reg, lsl \tmp            @ actual cache line size
                .endm
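
/*
 * Example (illustrative): a CTR DminLine field of 3 means the smallest
 * D-cache line spans 2^3 words, so the macro yields 4 << 3 = 32 bytes.
 */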

/*
 * Turn on the cache.  We need to set up some page tables so that we
 * can have both the I and D caches on.
 *
 * We place the page tables 16k down from the kernel execution address,
 * and we hope that nothing else is using that memory.  If it is in
 * use, we will go pop!
 *
 * On entry,
 *  r4 = kernel execution address
 *  r7 = architecture number
 *  r8 = atags pointer
 * On exit,
 *  r0, r1, r2, r3, r9, r10, r12 corrupted
 * This routine must preserve:
 *  r4, r7, r8
 */
                .align  5
cache_on:       mov     r3, #8                  @ cache_on function
                b       call_cache_fn

/*
 * Initialize the highest priority protection region, PR7,
 * to cover the whole 32-bit address space, cacheable and bufferable.
 */
__armv4_mpu_cache_on:
                mov     r0, #0x3f               @ 4G, the whole
                mcr     p15, 0, r0, c6, c7, 0   @ PR7 Area Setting
                mcr     p15, 0, r0, c6, c7, 1

                mov     r0, #0x80               @ PR7
                mcr     p15, 0, r0, c2, c0, 0   @ D-cache on
                mcr     p15, 0, r0, c2, c0, 1   @ I-cache on
                mcr     p15, 0, r0, c3, c0, 0   @ write-buffer on

                mov     r0, #0xc000
                mcr     p15, 0, r0, c5, c0, 1   @ I-access permission
                mcr     p15, 0, r0, c5, c0, 0   @ D-access permission

                mov     r0, #0
                mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
                mcr     p15, 0, r0, c7, c5, 0   @ flush(inval) I-Cache
                mcr     p15, 0, r0, c7, c6, 0   @ flush(inval) D-Cache
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                                                @ ...I .... ..D. WC.M
                orr     r0, r0, #0x002d         @ .... .... ..1. 11.1
                orr     r0, r0, #0x1000         @ ...1 .... .... ....

                mcr     p15, 0, r0, c1, c0, 0   @ write control reg

                mov     r0, #0
                mcr     p15, 0, r0, c7, c5, 0   @ flush(inval) I-Cache
                mcr     p15, 0, r0, c7, c6, 0   @ flush(inval) D-Cache
                mov     pc, lr

__armv3_mpu_cache_on:
                mov     r0, #0x3f               @ 4G, the whole
                mcr     p15, 0, r0, c6, c7, 0   @ PR7 Area Setting

                mov     r0, #0x80               @ PR7
                mcr     p15, 0, r0, c2, c0, 0   @ cache on
                mcr     p15, 0, r0, c3, c0, 0   @ write-buffer on

                mov     r0, #0xc000
                mcr     p15, 0, r0, c5, c0, 0   @ access permission

                mov     r0, #0
                mcr     p15, 0, r0, c7, c0, 0   @ invalidate whole cache v3
                /*
                 * ?? ARMv3 MMU does not allow reading the control register,
                 * does this really work on ARMv3 MPU?
                 */
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                                                @ .... .... .... WC.M
                orr     r0, r0, #0x000d         @ .... .... .... 11.1
                /* ?? this overwrites the value constructed above? */
                mov     r0, #0
                mcr     p15, 0, r0, c1, c0, 0   @ write control reg

                /* ?? invalidate for the second time? */
                mcr     p15, 0, r0, c7, c0, 0   @ invalidate whole cache v3
                mov     pc, lr

#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
#define CB_BITS 0x08
#else
#define CB_BITS 0x0c
#endif

__setup_mmu:    sub     r3, r4, #16384          @ Page directory size
                bic     r3, r3, #0xff           @ Align the pointer
                bic     r3, r3, #0x3f00
/*
 * Initialise the page tables, turning on the cacheable and bufferable
 * bits for the RAM area only.
 */
                mov     r0, r3
                mov     r9, r0, lsr #18
                mov     r9, r9, lsl #18         @ start of RAM
                add     r10, r9, #0x10000000    @ a reasonable RAM size
                mov     r1, #0x12               @ XN|U + section mapping
                orr     r1, r1, #3 << 10        @ AP=11
                add     r2, r3, #16384
1:              cmp     r1, r9                  @ if virt > start of RAM
                cmphs   r10, r1                 @   && end of RAM > virt
                bic     r1, r1, #0x1c           @ clear XN|U + C + B
                orrlo   r1, r1, #0x10           @ Set XN|U for non-RAM
                orrhs   r1, r1, r6              @ set RAM section settings
                str     r1, [r0], #4            @ 1:1 mapping
                add     r1, r1, #1048576
                teq     r0, r2
                bne     1b
/*
 * If ever we are running from Flash, then we surely want the cache
 * to be enabled also for our execution instance...  We map 2MB of it
 * so there is no map overlap problem for a compressed kernel of up to 1 MB.
 * If the execution is in RAM then we would only be duplicating the above.
 */
                orr     r1, r6, #0x04           @ ensure B is set for this
                orr     r1, r1, #3 << 10
                mov     r2, pc
                mov     r2, r2, lsr #20
                orr     r1, r1, r2, lsl #20
                add     r0, r3, r2, lsl #2
                str     r1, [r0], #4
                add     r1, r1, #1048576
                str     r1, [r0]
                mov     pc, lr
ENDPROC(__setup_mmu)

@ Enable unaligned access on v6, to allow better code generation
@ for the decompressor C code:
__armv6_mmu_cache_on:
                mrc     p15, 0, r0, c1, c0, 0   @ read SCTLR
                bic     r0, r0, #2              @ A (no unaligned access fault)
                orr     r0, r0, #1 << 22        @ U (v6 unaligned access model)
                mcr     p15, 0, r0, c1, c0, 0   @ write SCTLR
                b       __armv4_mmu_cache_on

__arm926ejs_mmu_cache_on:
#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
                mov     r0, #4                  @ put dcache in WT mode
                mcr     p15, 7, r0, c15, c0, 0
#endif

__armv4_mmu_cache_on:
                mov     r12, lr
#ifdef CONFIG_MMU
                mov     r6, #CB_BITS | 0x12     @ U
                bl      __setup_mmu
                mov     r0, #0
                mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
                mcr     p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                orr     r0, r0, #0x5000         @ I-cache enable, RR cache replacement
                orr     r0, r0, #0x0030
 ARM_BE8(       orr     r0, r0, #1 << 25 )      @ big-endian page tables
                bl      __common_mmu_cache_on
                mov     r0, #0
                mcr     p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
#endif
                mov     pc, r12

__armv7_mmu_cache_on:
                enable_cp15_barriers    r11
                mov     r12, lr
#ifdef CONFIG_MMU
                mrc     p15, 0, r11, c0, c1, 4  @ read ID_MMFR0
                tst     r11, #0xf               @ VMSA
                movne   r6, #CB_BITS | 0x02     @ !XN
                blne    __setup_mmu
                mov     r0, #0
                mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
                tst     r11, #0xf               @ VMSA
                mcrne   p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
#endif
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                bic     r0, r0, #1 << 28        @ clear SCTLR.TRE
                orr     r0, r0, #0x5000         @ I-cache enable, RR cache replacement
                orr     r0, r0, #0x003c         @ write buffer
                bic     r0, r0, #2              @ A (no unaligned access fault)
                orr     r0, r0, #1 << 22        @ U (v6 unaligned access model)
                                                @ (needed for ARM1176)
#ifdef CONFIG_MMU
 ARM_BE8(       orr     r0, r0, #1 << 25 )      @ big-endian page tables
                mrcne   p15, 0, r6, c2, c0, 2   @ read ttb control reg
                orrne   r0, r0, #1              @ MMU enabled
                movne   r1, #0xfffffffd         @ domain 0 = client
                bic     r6, r6, #1 << 31        @ 32-bit translation system
                bic     r6, r6, #(7 << 0) | (1 << 4)    @ use only ttbr0
                mcrne   p15, 0, r3, c2, c0, 0   @ load page table pointer
                mcrne   p15, 0, r1, c3, c0, 0   @ load domain access control
                mcrne   p15, 0, r6, c2, c0, 2   @ load ttb control
#endif
                mcr     p15, 0, r0, c7, c5, 4   @ ISB
                mcr     p15, 0, r0, c1, c0, 0   @ load control register
                mrc     p15, 0, r0, c1, c0, 0   @ and read it back
                mov     r0, #0
                mcr     p15, 0, r0, c7, c5, 4   @ ISB
                mov     pc, r12

__fa526_cache_on:
                mov     r12, lr
                mov     r6, #CB_BITS | 0x12     @ U
                bl      __setup_mmu
                mov     r0, #0
                mcr     p15, 0, r0, c7, c7, 0   @ Invalidate whole cache
                mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
                mcr     p15, 0, r0, c8, c7, 0   @ flush UTLB
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                orr     r0, r0, #0x1000         @ I-cache enable
                bl      __common_mmu_cache_on
                mov     r0, #0
                mcr     p15, 0, r0, c8, c7, 0   @ flush UTLB
                mov     pc, r12

__common_mmu_cache_on:
#ifndef CONFIG_THUMB2_KERNEL
#ifndef DEBUG
                orr     r0, r0, #0x000d         @ Write buffer, mmu
#endif
                mov     r1, #-1
                mcr     p15, 0, r3, c2, c0, 0   @ load page table pointer
                mcr     p15, 0, r1, c3, c0, 0   @ load domain access control
                b       1f
                .align  5                       @ cache line aligned
1:              mcr     p15, 0, r0, c1, c0, 0   @ load control register
                mrc     p15, 0, r0, c1, c0, 0   @ and read it back to
                sub     pc, lr, r0, lsr #32     @ properly flush pipeline
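                /*
                 * Note: "lsr #32" above evaluates to zero, so this is
                 * a plain return to lr, but one whose branch target
                 * depends on the SCTLR value just read back, ensuring
                 * the pipeline does not run ahead of the MMU enable.
                 */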
#endif

#define PROC_ENTRY_SIZE (4*5)
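
/*
 * Each proc_types entry below is five 32-bit words: CPU ID match,
 * CPU ID mask, then one instruction slot each for the 'on', 'off'
 * and 'flush' methods (a W(b) branch, or "mov pc, lr" padded with a
 * Thumb nop when the method is a no-op).
 */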

/*
 * Here follow the relocatable cache support functions for the
 * various processors.  This is a generic hook for locating an
 * entry and jumping to an instruction at the specified offset
 * from the start of the block.  Please note this is all position
 * independent code.
 *
 *  r1  = corrupted
 *  r2  = corrupted
 *  r3  = block offset
 *  r9  = corrupted
 *  r12 = corrupted
 */

call_cache_fn:  adr     r12, proc_types
#ifdef CONFIG_CPU_CP15
                mrc     p15, 0, r9, c0, c0      @ get processor ID
#elif defined(CONFIG_CPU_V7M)
                /*
                 * On v7-M the processor id is located in the V7M_SCB_CPUID
                 * register, but as cache handling is IMPLEMENTATION DEFINED on
                 * v7-M (if existent at all) we just return early here.
                 * If V7M_SCB_CPUID were used, the CPU ID functions (i.e.
                 * __armv7_mmu_cache_{on,off,flush}) would be selected, which
                 * use cp15 registers that are not implemented on v7-M.
                 */
                bx      lr
#else
                ldr     r9, =CONFIG_PROCESSOR_ID
#endif
1:              ldr     r1, [r12, #0]           @ get value
                ldr     r2, [r12, #4]           @ get mask
                eor     r1, r1, r9              @ (real ^ match)
                tst     r1, r2                  @       & mask
 ARM(           addeq   pc, r12, r3             ) @ call cache function
 THUMB(         addeq   r12, r3                 )
 THUMB(         moveq   pc, r12                 ) @ call cache function
                add     r12, r12, #PROC_ENTRY_SIZE
                b       1b

/*
 * Table for cache operations.  This is basically:
 *   - CPU ID match
 *   - CPU ID mask
 *   - 'cache on' method instruction
 *   - 'cache off' method instruction
 *   - 'cache flush' method instruction
 *
 * We match an entry using: ((real_id ^ match) & mask) == 0
 *
 * Writethrough caches generally only need 'on' and 'off'
 * methods.  Writeback caches _must_ have the flush method
 * defined.
 */
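/*
 * Example (illustrative): the ARMv7 entry below uses match 0x000f0000
 * with mask 0x000f0000, so any CPU ID carrying 0xf ("use CPUID
 * scheme") in bits [19:16] satisfies
 * ((real_id ^ 0x000f0000) & 0x000f0000) == 0.
 */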
                .align  2
                .type   proc_types,#object
proc_types:
                .word   0x41000000              @ old ARM ID
                .word   0xff00f000
                mov     pc, lr
 THUMB(         nop                             )
                mov     pc, lr
 THUMB(         nop                             )
                mov     pc, lr
 THUMB(         nop                             )

                .word   0x41007000              @ ARM7/710
                .word   0xfff8fe00
                mov     pc, lr
 THUMB(         nop                             )
                mov     pc, lr
 THUMB(         nop                             )
                mov     pc, lr
 THUMB(         nop                             )

                .word   0x41807200              @ ARM720T (writethrough)
                .word   0xffffff00
                W(b)    __armv4_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                mov     pc, lr
 THUMB(         nop                             )

                .word   0x41007400              @ ARM74x
                .word   0xff00ff00
                W(b)    __armv3_mpu_cache_on
                W(b)    __armv3_mpu_cache_off
                W(b)    __armv3_mpu_cache_flush
                .word   0x41009400              @ ARM94x
                .word   0xff00ff00
                W(b)    __armv4_mpu_cache_on
                W(b)    __armv4_mpu_cache_off
                W(b)    __armv4_mpu_cache_flush

                .word   0x41069260              @ ARM926EJ-S (v5TEJ)
                .word   0xff0ffff0
                W(b)    __arm926ejs_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __armv5tej_mmu_cache_flush

                .word   0x00007000              @ ARM7 IDs
                .word   0x0000f000
                mov     pc, lr
 THUMB(         nop                             )
                mov     pc, lr
 THUMB(         nop                             )
                mov     pc, lr
 THUMB(         nop                             )

                @ Everything from here on will be the new ID system.

                .word   0x4401a100              @ sa110 / sa1100
                .word   0xffffffe0
                W(b)    __armv4_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __armv4_mmu_cache_flush

                .word   0x6901b110              @ sa1110
                .word   0xfffffff0
                W(b)    __armv4_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __armv4_mmu_cache_flush

                .word   0x56056900
                .word   0xffffff00              @ PXA9xx
                W(b)    __armv4_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __armv4_mmu_cache_flush

                .word   0x56158000              @ PXA168
                .word   0xfffff000
                W(b)    __armv4_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __armv5tej_mmu_cache_flush

                .word   0x56050000              @ Feroceon
                .word   0xff0f0000
                W(b)    __armv4_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __armv5tej_mmu_cache_flush

#ifdef CONFIG_CPU_FEROCEON_OLD_ID
                /* this conflicts with the standard ARMv5TE entry */
                .long   0x41009260              @ Old Feroceon
                .long   0xff00fff0
                b       __armv4_mmu_cache_on
                b       __armv4_mmu_cache_off
                b       __armv5tej_mmu_cache_flush
#endif

                .word   0x66015261              @ FA526
                .word   0xff01fff1
                W(b)    __fa526_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __fa526_cache_flush

                @ These match on the architecture ID

                .word   0x00020000              @ ARMv4T
                .word   0x000f0000
                W(b)    __armv4_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __armv4_mmu_cache_flush

                .word   0x00050000              @ ARMv5TE
                .word   0x000f0000
                W(b)    __armv4_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __armv4_mmu_cache_flush

                .word   0x00060000              @ ARMv5TEJ
                .word   0x000f0000
                W(b)    __armv4_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __armv5tej_mmu_cache_flush

                .word   0x0007b000              @ ARMv6
                .word   0x000ff000
                W(b)    __armv6_mmu_cache_on
                W(b)    __armv4_mmu_cache_off
                W(b)    __armv6_mmu_cache_flush

                .word   0x000f0000              @ new CPU Id
                .word   0x000f0000
                W(b)    __armv7_mmu_cache_on
                W(b)    __armv7_mmu_cache_off
                W(b)    __armv7_mmu_cache_flush

                .word   0                       @ unrecognised type
                .word   0
                mov     pc, lr
 THUMB(         nop                             )
                mov     pc, lr
 THUMB(         nop                             )
                mov     pc, lr
 THUMB(         nop                             )

                .size   proc_types, . - proc_types

                /*
                 * If you get a "non-constant expression in ".if" statement"
                 * error from the assembler on this line, check that you have
                 * not accidentally written a "b" instruction where you should
                 * have written W(b).
                 */
                .if (. - proc_types) % PROC_ENTRY_SIZE != 0
                .error "The size of one or more proc_types entries is wrong."
                .endif

/*
 * Turn off the Cache and MMU.  ARMv3 does not support
 * reading the control register, but ARMv4 does.
 *
 * On exit,
 *  r0, r1, r2, r3, r9, r12 corrupted
 * This routine must preserve:
 *  r4, r7, r8
 */
                .align  5
cache_off:      mov     r3, #12                 @ cache_off function
                b       call_cache_fn

__armv4_mpu_cache_off:
                mrc     p15, 0, r0, c1, c0
                bic     r0, r0, #0x000d
                mcr     p15, 0, r0, c1, c0      @ turn MPU and cache off
                mov     r0, #0
                mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
                mcr     p15, 0, r0, c7, c6, 0   @ flush D-Cache
                mcr     p15, 0, r0, c7, c5, 0   @ flush I-Cache
                mov     pc, lr

__armv3_mpu_cache_off:
                mrc     p15, 0, r0, c1, c0
                bic     r0, r0, #0x000d
                mcr     p15, 0, r0, c1, c0, 0   @ turn MPU and cache off
                mov     r0, #0
                mcr     p15, 0, r0, c7, c0, 0   @ invalidate whole cache v3
                mov     pc, lr

__armv4_mmu_cache_off:
#ifdef CONFIG_MMU
                mrc     p15, 0, r0, c1, c0
                bic     r0, r0, #0x000d
                mcr     p15, 0, r0, c1, c0      @ turn MMU and cache off
                mov     r0, #0
                mcr     p15, 0, r0, c7, c7      @ invalidate whole cache v4
                mcr     p15, 0, r0, c8, c7      @ invalidate whole TLB v4
#endif
                mov     pc, lr

__armv7_mmu_cache_off:
                mrc     p15, 0, r0, c1, c0
#ifdef CONFIG_MMU
                bic     r0, r0, #0x0005
#else
                bic     r0, r0, #0x0004
#endif
                mcr     p15, 0, r0, c1, c0      @ turn MMU and cache off
                mov     r0, #0
#ifdef CONFIG_MMU
                mcr     p15, 0, r0, c8, c7, 0   @ invalidate whole TLB
#endif
                mcr     p15, 0, r0, c7, c5, 6   @ invalidate BTC
                mcr     p15, 0, r0, c7, c10, 4  @ DSB
                mcr     p15, 0, r0, c7, c5, 4   @ ISB
                mov     pc, lr

/*
 * Clean and flush the cache to maintain consistency.
 *
 * On entry,
 *  r0 = start address
 *  r1 = end address (exclusive)
 * On exit,
 *  r1, r2, r3, r9, r10, r11, r12 corrupted
 * This routine must preserve:
 *  r4, r6, r7, r8
 */
                .align  5
cache_clean_flush:
                mov     r3, #16
                mov     r11, r1
                b       call_cache_fn

__armv4_mpu_cache_flush:
                tst     r4, #1
                movne   pc, lr
                mov     r2, #1
                mov     r3, #0
                mcr     p15, 0, ip, c7, c6, 0   @ invalidate D cache
                mov     r1, #7 << 5             @ 8 segments
1:              orr     r3, r1, #63 << 26       @ 64 entries
2:              mcr     p15, 0, r3, c7, c14, 2  @ clean & invalidate D index
                subs    r3, r3, #1 << 26
                bcs     2b                      @ entries 63 to 0
                subs    r1, r1, #1 << 5
                bcs     1b                      @ segments 7 to 0

                teq     r2, #0
                mcrne   p15, 0, ip, c7, c5, 0   @ invalidate I cache
                mcr     p15, 0, ip, c7, c10, 4  @ drain WB
                mov     pc, lr
__fa526_cache_flush:
                tst     r4, #1
                movne   pc, lr
                mov     r1, #0
                mcr     p15, 0, r1, c7, c14, 0  @ clean and invalidate D cache
                mcr     p15, 0, r1, c7, c5, 0   @ flush I cache
                mcr     p15, 0, r1, c7, c10, 4  @ drain WB
                mov     pc, lr

__armv6_mmu_cache_flush:
                mov     r1, #0
                tst     r4, #1
                mcreq   p15, 0, r1, c7, c14, 0  @ clean+invalidate D
                mcr     p15, 0, r1, c7, c5, 0   @ invalidate I+BTB
                mcreq   p15, 0, r1, c7, c15, 0  @ clean+invalidate unified
                mcr     p15, 0, r1, c7, c10, 4  @ drain WB
                mov     pc, lr

__armv7_mmu_cache_flush:
                enable_cp15_barriers    r10
                tst     r4, #1
                bne     iflush
                mrc     p15, 0, r10, c0, c1, 5  @ read ID_MMFR1
                tst     r10, #0xf << 16         @ hierarchical cache (ARMv7)
                mov     r10, #0
                beq     hierarchical
                mcr     p15, 0, r10, c7, c14, 0 @ clean+invalidate D
                b       iflush
hierarchical:
                dcache_line_size r1, r2         @ r1 := dcache min line size
                sub     r2, r1, #1              @ r2 := line size mask
                bic     r0, r0, r2              @ round down start to line size
                sub     r11, r11, #1            @ end address is exclusive
                bic     r11, r11, r2            @ round down end to line size
0:              cmp     r0, r11                 @ finished?
                bgt     iflush
                mcr     p15, 0, r0, c7, c14, 1  @ Dcache clean/invalidate by VA
                add     r0, r0, r1
                b       0b
iflush:
                mcr     p15, 0, r10, c7, c10, 4 @ DSB
                mcr     p15, 0, r10, c7, c5, 0  @ invalidate I+BTB
                mcr     p15, 0, r10, c7, c10, 4 @ DSB
                mcr     p15, 0, r10, c7, c5, 4  @ ISB
                mov     pc, lr

__armv5tej_mmu_cache_flush:
                tst     r4, #1
                movne   pc, lr
1:              mrc     p15, 0, APSR_nzcv, c7, c14, 3   @ test,clean,invalidate D cache
                bne     1b
                mcr     p15, 0, r0, c7, c5, 0   @ flush I cache
                mcr     p15, 0, r0, c7, c10, 4  @ drain WB
                mov     pc, lr

__armv4_mmu_cache_flush:
                tst     r4, #1
                movne   pc, lr
                mov     r2, #64*1024            @ default: 32K dcache size (*2)
                mov     r11, #32                @ default: 32 byte line size
                mrc     p15, 0, r3, c0, c0, 1   @ read cache type
                teq     r3, r9                  @ cache ID register present?
                beq     no_cache_id
                mov     r1, r3, lsr #18
                and     r1, r1, #7
                mov     r2, #1024
                mov     r2, r2, lsl r1          @ base dcache size *2
                tst     r3, #1 << 14            @ test M bit
                addne   r2, r2, r2, lsr #1      @ +1/2 size if M == 1
                mov     r3, r3, lsr #12
                and     r3, r3, #3
                mov     r11, #8
                mov     r11, r11, lsl r3        @ cache line size in bytes
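                /*
                 * Illustrative decode: a size field of 5 gives
                 * r2 = 1024 << 5 = 32 KB (twice a 16 KB D-cache), and
                 * a length field of 2 gives r11 = 8 << 2 = 32-byte
                 * lines for the read-back loop below.
                 */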
no_cache_id:
                mov     r1, pc
                bic     r1, r1, #63             @ align to longest cache line
                add     r2, r1, r2
1:
 ARM(           ldr     r3, [r1], r11           ) @ s/w flush D cache
 THUMB(         ldr     r3, [r1]                ) @ s/w flush D cache
 THUMB(         add     r1, r1, r11             )
                teq     r1, r2
                bne     1b

                mcr     p15, 0, r1, c7, c5, 0   @ flush I cache
                mcr     p15, 0, r1, c7, c6, 0   @ flush D cache
                mcr     p15, 0, r1, c7, c10, 4  @ drain WB
                mov     pc, lr

__armv3_mmu_cache_flush:
__armv3_mpu_cache_flush:
                tst     r4, #1
                movne   pc, lr
                mov     r1, #0
                mcr     p15, 0, r1, c7, c0, 0   @ invalidate whole cache v3
                mov     pc, lr

/*
 * Various debugging routines for printing hex characters and
 * memory, which again must be relocatable.
 */
#ifdef DEBUG
                .align  2
                .type   phexbuf,#object
phexbuf:        .space  12
                .size   phexbuf, . - phexbuf

@ phex corrupts {r0, r1, r2, r3}
phex:           adr     r3, phexbuf
                mov     r2, #0
                strb    r2, [r3, r1]
1:              subs    r1, r1, #1
                movmi   r0, r3
                bmi     puts
                and     r2, r0, #15
                mov     r0, r0, lsr #4
                cmp     r2, #10
                addge   r2, r2, #7
                add     r2, r2, #'0'
                strb    r2, [r3, r1]
                b       1b
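
@ Nibble-to-ASCII in the loop above: 0-9 map to '0'-'9'; for 10-15 the
@ extra 7 skips from '9' (0x39) to 'A' (0x41), giving uppercase hex.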

@ puts corrupts {r0, r1, r2, r3}
puts:           loadsp  r3, r2, r1
1:              ldrb    r2, [r0], #1
                teq     r2, #0
                moveq   pc, lr
2:              writeb  r2, r3, r1
                mov     r1, #0x00020000
3:              subs    r1, r1, #1
                bne     3b
                teq     r2, #'\n'
                moveq   r2, #'\r'
                beq     2b
                teq     r0, #0
                bne     1b
                mov     pc, lr
@ putc corrupts {r0, r1, r2, r3}
putc:
                mov     r2, r0
                loadsp  r3, r1, r0
                mov     r0, #0
                b       2b
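
@ putc reuses the body of puts: it enters at label 2 with the character
@ in r2 and r0 = 0, so the end-of-string test falls through to the
@ return after a single character (plus the CR appended to LF).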

@ memdump corrupts {r0, r1, r2, r3, r10, r11, r12, lr}
memdump:        mov     r12, r0
                mov     r10, lr
                mov     r11, #0
2:              mov     r0, r11, lsl #2
                add     r0, r0, r12
                mov     r1, #8
                bl      phex
                mov     r0, #':'
                bl      putc
1:              mov     r0, #' '
                bl      putc
                ldr     r0, [r12, r11, lsl #2]
                mov     r1, #8
                bl      phex
                and     r0, r11, #7
                teq     r0, #3
                moveq   r0, #' '
                bleq    putc
                and     r0, r11, #7
                add     r11, r11, #1
                teq     r0, #7
                bne     1b
                mov     r0, #'\n'
                bl      putc
                cmp     r11, #64
                blt     2b
                mov     pc, r10
#endif

                .ltorg

#ifdef CONFIG_ARM_VIRT_EXT
.align 5
__hyp_reentry_vectors:
                W(b)    .                       @ reset
                W(b)    .                       @ undef
#ifdef CONFIG_EFI_STUB
                W(b)    __enter_kernel_from_hyp @ hvc from HYP
#else
                W(b)    .                       @ svc
#endif
                W(b)    .                       @ pabort
                W(b)    .                       @ dabort
                W(b)    __enter_kernel          @ hyp
                W(b)    .                       @ irq
                W(b)    .                       @ fiq
#endif /* CONFIG_ARM_VIRT_EXT */

__enter_kernel:
                mov     r0, #0                  @ must be 0
                mov     r1, r7                  @ restore architecture number
                mov     r2, r8                  @ restore atags pointer
 ARM(           mov     pc, r4          )       @ call kernel
 M_CLASS(       add     r4, r4, #1      )       @ enter in Thumb mode for M class
 THUMB(         bx      r4              )       @ entry point is always ARM for A/R classes

reloc_code_end:

#ifdef CONFIG_EFI_STUB
__enter_kernel_from_hyp:
                mrc     p15, 4, r0, c1, c0, 0   @ read HSCTLR
                bic     r0, r0, #0x5            @ disable MMU and caches
                mcr     p15, 4, r0, c1, c0, 0   @ write HSCTLR
                isb
                b       __enter_kernel

ENTRY(efi_enter_kernel)
                mov     r4, r0                  @ preserve image base
                mov     r8, r1                  @ preserve DT pointer

                adr_l   r0, call_cache_fn
                adr     r1, 0f                  @ clean the region of code we
                bl      cache_clean_flush       @ may run with the MMU off

#ifdef CONFIG_ARM_VIRT_EXT
                @
                @ The EFI spec does not support booting on ARM in HYP mode,
                @ since it mandates that the MMU and caches are on, with all
                @ 32-bit addressable DRAM mapped 1:1 using short descriptors.
                @
                @ While the EDK2 reference implementation adheres to this,
                @ U-Boot might decide to enter the EFI stub in HYP mode
                @ anyway, with the MMU and caches either on or off.
                @
                mrs     r0, cpsr                @ get the current mode
                msr     spsr_cxsf, r0           @ record boot mode
                and     r0, r0, #MODE_MASK      @ are we running in HYP mode?
                cmp     r0, #HYP_MODE
                bne     .Lefi_svc

                mrc     p15, 4, r1, c1, c0, 0   @ read HSCTLR
                tst     r1, #0x1                @ MMU enabled at HYP?
                beq     1f

                @
                @ When running in HYP mode with the caches on, we're better
                @ off just carrying on using the cached 1:1 mapping that the
                @ firmware provided. Set up the HYP vectors so HVC instructions
                @ issued from HYP mode take us to the correct handler code. We
                @ will disable the MMU before jumping to the kernel proper.
                @
 ARM(           bic     r1, r1, #(1 << 30)      ) @ clear HSCTLR.TE
 THUMB(         orr     r1, r1, #(1 << 30)      ) @ set HSCTLR.TE
                mcr     p15, 4, r1, c1, c0, 0
                adr     r0, __hyp_reentry_vectors
                mcr     p15, 4, r0, c12, c0, 0  @ set HYP vector base (HVBAR)
                isb
                b       .Lefi_hyp

                @
                @ When running in HYP mode with the caches off, we need to drop
                @ into SVC mode now, and let the decompressor set up its cached
                @ 1:1 mapping as usual.
                @
1:              mov     r9, r4                  @ preserve image base
                bl      __hyp_stub_install      @ install HYP stub vectors
                safe_svcmode_maskall    r1      @ drop to SVC mode
                msr     spsr_cxsf, r0           @ record boot mode
                orr     r4, r9, #1              @ restore image base and set LSB
                b       .Lefi_hyp
.Lefi_svc:
#endif
                mrc     p15, 0, r0, c1, c0, 0   @ read SCTLR
                tst     r0, #0x1                @ MMU enabled?
                orreq   r4, r4, #1              @ set LSB if not

.Lefi_hyp:
                mov     r0, r8                  @ DT start
                add     r1, r8, r2              @ DT end
                bl      cache_clean_flush

                adr     r0, 0f                  @ switch to our stack
                ldr     sp, [r0]
                add     sp, sp, r0

                mov     r5, #0                  @ appended DTB size
                mov     r7, #0xFFFFFFFF         @ machine ID
                b       wont_overwrite
ENDPROC(efi_enter_kernel)
0:              .long   .L_user_stack_end - .
#endif

                .align
                .section ".stack", "aw", %nobits
.L_user_stack:  .space  4096
.L_user_stack_end: