root/sys/arm/arm/blockio.S
/*      $NetBSD: blockio.S,v 1.5 2002/08/15 01:38:16 briggs Exp $       */

/*-
 * Copyright (c) 2001 Ben Harris.
 * Copyright (c) 1994 Mark Brinicombe.
 * Copyright (c) 1994 Brini.
 * All rights reserved.
 *
 * This code is derived from software written for Brini by Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Brini.
 * 4. The name of the company nor the name of the author may be used to
 *    endorse or promote products derived from this software without specific
 *    prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * RiscBSD kernel project
 *
 * blockio.S
 *
 * optimised block read/write from/to IO routines.
 *
 * Created      : 08/10/94
 * Modified     : 22/01/99  -- R.Earnshaw
 *                             Faster, and small tweaks for StrongARM
 */

#include <machine/asm.h>
        .syntax unified

/*
 * Read bytes from an I/O address into a block of memory
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(read_multi_1)
/*
 * Register use:
 *   r0  = I/O address; every byte is read from this same address
 *   r1  = memory destination, post-incremented as data is stored
 *   r2  = bytes remaining (held biased by -4 while the word loop runs)
 *   r3  = data in flight
 *   r12 = scratch (same register as ip; free once the frame is built)
 */
        mov     ip, sp
        stmfd   sp!, {fp, ip, lr, pc}   /* frame: saved fp, entry sp, lr, pc */
        sub     fp, ip, #4
        subs    r2, r2, #4              /* r2 = length - 4 */
        blt     .Lrm1_l4                /* less than 4 bytes */
        ands    r12, r1, #3
        beq     .Lrm1_main              /* aligned destination */

        /* Store 1-3 single bytes to bring r1 up to a word boundary. */
        rsb     r12, r12, #4            /* r12 = bytes needed to align */
        cmp     r12, #2
        ldrb    r3, [r0]                /* always at least one */
        strb    r3, [r1], #1
        ldrbge  r3, [r0]                /* a second when r12 >= 2 */
        strbge  r3, [r1], #1
        ldrbgt  r3, [r0]                /* a third when r12 == 3 */
        strbgt  r3, [r1], #1
        subs    r2, r2, r12             /* account for the bytes just moved */
        blt     .Lrm1_l4                /* fewer than 4 bytes remain */

        /*
         * r1 is word aligned here.  Gather four byte reads into one
         * word (first byte read ends up in bits 7:0) and store it with
         * a single word write.
         */
.Lrm1_main:
.Lrm1loop:
        ldrb    r3, [r0]
        ldrb    r12, [r0]
        orr     r3, r3, r12, lsl #8
        ldrb    r12, [r0]
        orr     r3, r3, r12, lsl #16
        ldrb    r12, [r0]
        orr     r3, r3, r12, lsl #24
        str     r3, [r1], #4
        subs    r2, r2, #4
        bge     .Lrm1loop

.Lrm1_l4:
        adds    r2, r2, #4              /* r2 = length again */
        ldmdbeq fp, {fp, sp, pc}        /* nothing left: unwind and return */

        /* Trailing bytes: one, two or three depending on r2. */
        cmp     r2, #2
        ldrb    r3, [r0]
        strb    r3, [r1], #1
        ldrbge  r3, [r0]
        strbge  r3, [r1], #1
        ldrbgt  r3, [r0]
        strbgt  r3, [r1], #1
        ldmdb   fp, {fp, sp, pc}        /* restore fp/sp, return via saved lr */
END(read_multi_1)

/*
 * Write bytes to an I/O address from a block of memory
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(write_multi_1)
        /*
         * r0 = I/O address (never advanced), r1 = memory source,
         * r2 = byte count.  r3 and ip are scratch.
         */
        mov     ip, sp
        stmdb   sp!, {fp, ip, lr, pc}   /* set up a stack frame */
        sub     fp, ip, #4

        subs    r2, r2, #4              /* bias the count by -4 */
        blt     .Lwm1_tail              /* fewer than four bytes in total */
        ands    ip, r1, #3
        beq     .Lwm1_words             /* source already word aligned */

        /* Push out 1-3 bytes so that r1 reaches a word boundary. */
        rsb     ip, ip, #4              /* ip = bytes up to the boundary */
        cmp     ip, #2
        ldrb    r3, [r1], #1
        strb    r3, [r0]
        ldrbge  r3, [r1], #1
        strbge  r3, [r0]
        ldrbgt  r3, [r1], #1
        strbgt  r3, [r0]
        subs    r2, r2, ip
        blt     .Lwm1_tail

        /*
         * Fetch a whole word from memory and emit it as four byte
         * writes, least significant byte first.
         */
.Lwm1_words:
        ldr     r3, [r1], #4
        strb    r3, [r0]
        lsr     r3, r3, #8
        strb    r3, [r0]
        lsr     r3, r3, #8
        strb    r3, [r0]
        lsr     r3, r3, #8
        strb    r3, [r0]
        subs    r2, r2, #4
        bge     .Lwm1_words

.Lwm1_tail:
        adds    r2, r2, #4              /* undo the bias */
        ldmdbeq fp, {fp, sp, pc}        /* all done: unwind and return */

        /* One, two or three bytes are left over. */
        cmp     r2, #2
        ldrb    r3, [r1], #1
        strb    r3, [r0]
        ldrbge  r3, [r1], #1
        strbge  r3, [r0]
        ldrbgt  r3, [r1], #1
        strbgt  r3, [r0]
        ldmdb   fp, {fp, sp, pc}
END(write_multi_1)

/*
 * Reads short ints (16 bits) from an I/O address into a block of memory
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 */

ENTRY(insw)
        /*
         * r0 = I/O address (never advanced), r1 = memory destination,
         * r2 = number of 16-bit items.  r3 and r12 are scratch.
         */

        /* Return at once when the count is zero or negative. */
        cmp     r2, #0
        movle   pc, lr

        /*
         * An even count together with a word aligned destination lets
         * us store two items with each word write.
         */
        tst     r2, #1
        tsteq   r1, #3
        beq     .Linsw_paired

        /* General case: one item per pass, stored as two bytes. */
.Linsw_single:
        ldr     r3, [r0]
        subs    r2, r2, #1              /* flags are consumed by the bgt */
        strb    r3, [r1], #1            /* bits 7:0 */
        lsr     r3, r3, #8
        strb    r3, [r1], #1            /* bits 15:8 */
        bgt     .Linsw_single

        RET

        /* Two items per pass, merged into a single word store. */
.Linsw_paired:
        ldr     r3, [r0, #2]            /* word read at offset 2 ... */
        ldr     r12, [r0]               /* ... and one at offset 0 */
        lsr     r3, r3, #16             /* first read supplies the low half */
        orr     r3, r3, r12, lsl #16    /* second read supplies the high half */
        str     r3, [r1], #4
        subs    r2, r2, #2
        bgt     .Linsw_paired

        RET
END(insw)

/*
 * Writes short ints (16 bits) from a block of memory to an I/O address
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 */

ENTRY(outsw)
        /*
         * r0 = I/O address (never advanced), r1 = memory source,
         * r2 = number of 16-bit items.  r3 and r12 are scratch.
         */

        /* Return at once when the count is zero or negative. */
        cmp     r2, #0
        movle   pc, lr

        /*
         * An even count together with a word aligned source buffer
         * lets us fetch two items with each memory load.
         */
        tst     r2, #1
        tsteq   r1, #3
        beq     .Loutsw_paired

        /* General case: build each item from two byte loads. */
.Loutsw_single:
        ldrb    r3, [r1], #1            /* bits 7:0 */
        ldrb    r12, [r1], #1           /* bits 15:8 */
        subs    r2, r2, #1              /* flags are consumed by the bgt */
        orr     r3, r3, r12, lsl #8
        orr     r3, r3, r3, lsl #16     /* same item in both halves */
        str     r3, [r0]
        bgt     .Loutsw_single

        RET

        /*
         * Two items per pass.  The three exclusive-ors turn the loaded
         * word (H)(L) into (L)(L) and (H)(H) without a further scratch
         * register; the low item is written first.
         */
.Loutsw_paired:
        ldr     r3, [r1], #4            /* r3 = (H)(L) */
        subs    r2, r2, #2              /* flags are consumed by the bgt */

        eor     r12, r3, r3, lsr #16    /* r12 = (H)(H^L) */
        eor     r3, r3, r12, lsl #16    /* r3 = (H^H^L)(L) = (L)(L) */
        eor     r12, r12, r3, lsr #16   /* r12 = (H)(H^L^L) = (H)(H) */

        str     r3, [r0]
        str     r12, [r0]

        bgt     .Loutsw_paired

        RET
END(outsw)

/*
 * reads short ints (16 bits) from an I/O address into a block of memory
 * with a length guaranteed to be a multiple of 16 bytes
 * with a word aligned destination address
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 */

ENTRY(insw16)
        /*
         * r0 = I/O address (never advanced), r1 = memory destination,
         * r2 = number of 16-bit items.  r3-r5, r12 and lr carry data.
         */

        /* Return at once when the count is zero or negative. */
        cmp     r2, #0
        movle   pc, lr

        /*
         * The unrolled loop wants a count that is a multiple of 8 and
         * a word aligned destination; otherwise let insw do the whole
         * transfer.
         */
        tst     r2, #7
        tsteq   r1, #3
        bne     _C_LABEL(insw)

        push    {r4, r5, lr}

        /*
         * Each pass performs eight reads from r0 and assembles them
         * into four words.  Within each pair the first read provides
         * the low halfword and the second the high halfword.
         */
.Linsw16_block:
        ldr     r3, [r0, #2]
        ldr     lr, [r0]
        lsr     r3, r3, #16
        orr     r3, r3, lr, lsl #16     /* word 0 */

        ldr     r4, [r0, #2]
        ldr     lr, [r0]
        lsr     r4, r4, #16
        orr     r4, r4, lr, lsl #16     /* word 1 */

        ldr     r5, [r0, #2]
        ldr     lr, [r0]
        lsr     r5, r5, #16
        orr     r5, r5, lr, lsl #16     /* word 2 */

        ldr     r12, [r0, #2]
        ldr     lr, [r0]
        lsr     r12, r12, #16
        orr     r12, r12, lr, lsl #16   /* word 3 */

        stmia   r1!, {r3-r5, r12}       /* write all four words at once */
        subs    r2, r2, #8
        bgt     .Linsw16_block

        pop     {r4, r5, pc}            /* restore registers and return */
END(insw16)

/*
 * Writes short ints (16 bits) from a block of memory to an I/O address
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 */

ENTRY(outsw16)
/*
 * Register use:
 *   r0 = I/O address; every store below goes to this same address
 *   r1 = memory source, advanced by 16 bytes per loop iteration
 *   r2 = number of 16-bit items left, reduced by 8 per iteration
 *   r3, r4, r5, ip, lr = data (r4, r5 and lr are saved on the stack)
 */
/* Make sure that we have a positive length */
        cmp     r2, #0x00000000
        movle   pc, lr

/*
 * The unrolled loop needs a count that is a multiple of 8 and a word
 * aligned source address (r1).  Anything else is handed over to outsw
 * with r0-r2 and lr untouched.
 */

        tst     r2, #0x00000007
        tsteq   r1, #0x00000003

        bne     _C_LABEL(outsw)

/* Word aligned outsw */

        stmfd   sp!, {r4,r5,lr}

/*
 * Each pass loads four words (eight items) and issues eight word
 * stores.  A loaded word (A)(B) is turned into (B)(B) and (A)(A) with
 * three exclusive-ors, and the low item (B) is written first.
 */
.Loutsw16loop:
        ldmia   r1!, {r4,r5,ip,lr}

        eor     r3, r4, r4, lsl #16     /* r3 = (A^B)(B) */
        eor     r4, r4, r3, lsr #16     /* r4 = (A)(B^A^B) = (A)(A) */
        eor     r3, r3, r4, lsl #16     /* r3 = (A^B^A)(B) = (B)(B) */
        str     r3, [r0]
        str     r4, [r0]

/*
 * Alternative shift/or form of the same duplication, kept for
 * reference only (not assembled):
 *
 *      mov     r3, r4, lsl #16
 *      orr     r3, r3, r3, lsr #16
 *      str     r3, [r0]
 *
 *      mov     r3, r4, lsr #16
 *      orr     r3, r3, r3, lsl #16
 *      str     r3, [r0]
 */

        eor     r3, r5, r5, lsl #16     /* r3 = (A^B)(B) */
        eor     r5, r5, r3, lsr #16     /* r5 = (A)(B^A^B) = (A)(A) */
        eor     r3, r3, r5, lsl #16     /* r3 = (A^B^A)(B) = (B)(B) */
        str     r3, [r0]
        str     r5, [r0]

        eor     r3, ip, ip, lsl #16     /* r3 = (A^B)(B) */
        eor     ip, ip, r3, lsr #16     /* ip = (A)(B^A^B) = (A)(A) */
        eor     r3, r3, ip, lsl #16     /* r3 = (A^B^A)(B) = (B)(B) */
        str     r3, [r0]
        str     ip, [r0]

        eor     r3, lr, lr, lsl #16     /* r3 = (A^B)(B) */
        eor     lr, lr, r3, lsr #16     /* lr = (A)(B^A^B) = (A)(A) */
        eor     r3, r3, lr, lsl #16     /* r3 = (A^B^A)(B) = (B)(B) */
        str     r3, [r0]
        str     lr, [r0]

        subs    r2, r2, #0x00000008
        bgt     .Loutsw16loop

        ldmfd   sp!, {r4,r5,pc}         /* restore r4/r5 and return */
END(outsw16)

/*
 * reads short ints (16 bits) from an I/O address into a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The destination address should be word aligned.
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 */

ENTRY(inswm8)
/*
 * Register use:
 *   r0 = I/O base address; never written back, so every multi-word
 *        load starts at the same address
 *   r1 = memory destination, post-incremented
 *   r2 = number of 16-bit items left
 *   lr = 0xffff0000, used with bic to keep only bits 15:0 of a word
 *   r3-r9, ip = data
 */
/* Make sure that we have a positive length */
        cmp     r2, #0x00000000
        movle   pc, lr

/*
 * Only the destination alignment is tested here.  A destination that
 * is not word aligned is handed over to insw with r0-r2 and lr
 * untouched; any count is accepted and the remainder is dealt with by
 * the 4/2/1 item stages below.
 */

        tst     r1, #0x00000003

        bne     _C_LABEL(insw)

/* Word aligned insw */

        stmfd   sp!, {r4-r9,lr}

        mov     lr, #0xff000000
        orr     lr, lr, #0x00ff0000     /* lr = 0xffff0000 */

/*
 * While at least 8 items remain: load 8 consecutive words starting at
 * r0, take bits 15:0 of each and pack them pairwise (earlier word in
 * the low half) into 4 words of memory.
 */
.Linswm8_loop8:
        cmp     r2, #8
        bcc     .Linswm8_l8             /* fewer than 8 left (unsigned) */

        ldmia   r0, {r3-r9,ip}

        bic     r3, r3, lr
        orr     r3, r3, r4, lsl #16
        bic     r5, r5, lr
        orr     r4, r5, r6, lsl #16
        bic     r7, r7, lr
        orr     r5, r7, r8, lsl #16
        bic     r9, r9, lr
        orr     r6, r9, ip, lsl #16

        stmia   r1!, {r3-r6}

        subs    r2, r2, #0x00000008     /* Next */
        bne     .Linswm8_loop8
        beq     .Linswm8_l1             /* count reached zero: done */

/* Between 4 and 7 items left: move 4 of them (two words). */
.Linswm8_l8:
        cmp     r2, #4
        bcc     .Linswm8_l4

        ldmia   r0, {r3-r6}

        bic     r3, r3, lr
        orr     r3, r3, r4, lsl #16
        bic     r5, r5, lr
        orr     r4, r5, r6, lsl #16

        stmia   r1!, {r3-r4}

        subs    r2, r2, #0x00000004
        beq     .Linswm8_l1

/* 2 or 3 items left: move 2 of them (one word). */
.Linswm8_l4:
        cmp     r2, #2
        bcc     .Linswm8_l2

        ldmia   r0, {r3-r4}

        bic     r3, r3, lr
        orr     r3, r3, r4, lsl #16
        str     r3, [r1], #0x0004

        subs    r2, r2, #0x00000002
        beq     .Linswm8_l1

/* At most one item left: store it as two bytes, low byte first. */
.Linswm8_l2:
        cmp     r2, #1
        bcc     .Linswm8_l1

        ldr     r3, [r0]
        subs    r2, r2, #0x00000001     /* Test in load delay slot */
                                        /* XXX, why don't we use result?  */

        strb    r3, [r1], #0x0001
        mov     r3, r3, lsr #8
        strb    r3, [r1], #0x0001


.Linswm8_l1:
        ldmfd   sp!, {r4-r9,pc}         /* And go home */
END(inswm8)

/*
 * write short ints (16 bits) to an I/O address from a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The source address should be word aligned.
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 */

ENTRY(outswm8)
/*
 * Register use:
 *   r0 = I/O base address; never written back, so every multi-word
 *        store starts at the same address
 *   r1 = memory source, post-incremented
 *   r2 = number of 16-bit items left
 *   r3-r8, ip, lr = data
 *
 * Every word written to the I/O window carries one 16-bit item
 * duplicated in both halves.
 */
/* Make sure that we have a positive length */
        cmp     r2, #0x00000000
        movle   pc, lr

/*
 * Only the source alignment is tested here.  A source that is not
 * word aligned is handed over to outsw with r0-r2 and lr untouched;
 * any count is accepted and the remainder is dealt with by the 4/2/1
 * item stages below.
 */

        tst     r1, #0x00000003

        bne     _C_LABEL(outsw)

/* Word aligned outsw */

        stmfd   sp!, {r4-r8,lr}

/*
 * While at least 8 items remain: load 4 words, split each (A)(B) into
 * (B)(B) and (A)(A), and write the 8 results with one store-multiple,
 * low item of each pair first.
 */
.Loutswm8_loop8:
        cmp     r2, #8
        bcc     .Loutswm8_l8            /* fewer than 8 left (unsigned) */

        ldmia   r1!, {r3,r5,r7,ip}

        eor     r4, r3, r3, lsr #16     /* r4 = (A)(A^B) */
        eor     r3, r3, r4, lsl #16     /* r3 = (A^A^B)(B) = (B)(B) */
        eor     r4, r4, r3, lsr #16     /* r4 = (A)(B^A^B) = (A)(A) */

        eor     r6, r5, r5, lsr #16     /* r6 = (A)(A^B) */
        eor     r5, r5, r6, lsl #16     /* r5 = (A^A^B)(B) = (B)(B) */
        eor     r6, r6, r5, lsr #16     /* r6 = (A)(B^A^B) = (A)(A) */

        eor     r8, r7, r7, lsr #16     /* r8 = (A)(A^B) */
        eor     r7, r7, r8, lsl #16     /* r7 = (A^A^B)(B) = (B)(B) */
        eor     r8, r8, r7, lsr #16     /* r8 = (A)(B^A^B) = (A)(A) */

        eor     lr, ip, ip, lsr #16     /* lr = (A)(A^B) */
        eor     ip, ip, lr, lsl #16     /* ip = (A^A^B)(B) = (B)(B) */
        eor     lr, lr, ip, lsr #16     /* lr = (A)(B^A^B) = (A)(A) */

        stmia   r0, {r3-r8,ip,lr}

        subs    r2, r2, #0x00000008     /* Next */
        bne     .Loutswm8_loop8
        beq     .Loutswm8_l1            /* count reached zero: done */

/* Between 4 and 7 items left: send 4 of them (two source words). */
.Loutswm8_l8:
        cmp     r2, #4
        bcc     .Loutswm8_l4

        ldmia   r1!, {r3-r4}

        eor     r6, r3, r3, lsr #16     /* r6 = (A)(A^B) */
        eor     r5, r3, r6, lsl #16     /* r5 = (A^A^B)(B) = (B)(B) */
        eor     r6, r6, r5, lsr #16     /* r6 = (A)(B^A^B) = (A)(A) */

        eor     r8, r4, r4, lsr #16     /* r8 = (A)(A^B) */
        eor     r7, r4, r8, lsl #16     /* r7 = (A^A^B)(B) = (B)(B) */
        eor     r8, r8, r7, lsr #16     /* r8 = (A)(B^A^B) = (A)(A) */

        stmia   r0, {r5-r8}

        subs    r2, r2, #0x00000004
        beq     .Loutswm8_l1

/* 2 or 3 items left: send 2 of them (one source word). */
.Loutswm8_l4:
        cmp     r2, #2
        bcc     .Loutswm8_l2

        ldr     r3, [r1], #0x0004       /* r3 = (A)(B) */
        subs    r2, r2, #0x00000002     /* Done test in Load delay slot */

        eor     r5, r3, r3, lsr #16     /* r5 = (A)(A^B)*/
        eor     r4, r3, r5, lsl #16     /* r4 = (A^A^B)(B) = (B)(B) */
        eor     r5, r5, r4, lsr #16     /* r5 = (A)(B^A^B) = (A)(A) */

        stmia   r0, {r4, r5}

        beq     .Loutswm8_l1            /* uses the flags from the subs above */

/* At most one item left: build it from two byte loads. */
.Loutswm8_l2:
        cmp     r2, #1
        bcc     .Loutswm8_l1

        ldrb    r3, [r1], #0x0001
        ldrb    r4, [r1], #0x0001
        subs    r2, r2, #0x00000001     /* Done test in load delay slot */
                                        /* XXX This test isn't used?  */
        orr     r3, r3, r4, lsl #8
        orr     r3, r3, r3, lsl #16     /* same item in both halves */
        str     r3, [r0]

.Loutswm8_l1:
        ldmfd   sp!, {r4-r8,pc}         /* And go home */
END(outswm8)
END(outswm8)