/* arch/x86/lib/csum-copy_64.S */
/*
 * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file COPYING in the main directory of this archive
 * for more details. No warranty for anything given at all.
 */
#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/asm.h>

/*
 * Checksum copy with exception handling.
 * On a fault the function returns 0 (see .Lfault).
 *
 * Input
 * rdi  source
 * rsi  destination
 * edx  len (32bit)
 *
 * Output
 * eax  32bit sum (folded from the 64bit accumulation); 0 in case of a fault.
 *
 * Wrappers need to take care of valid exception sum and zeroing.
 * They also should align source or destination to 8 bytes.
 */
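/*
 * For reference, in C terms this roughly corresponds to
 *      __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
 * (prototype assumed from the register usage above; see asm/checksum_64.h
 * for the authoritative declaration).
 */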

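/*
 * The source/dest macros below tag the immediately following memory access
 * with an exception table entry, so that a fault on it branches to .Lfault
 * instead of oopsing.
 */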
        .macro source
10:
        _ASM_EXTABLE_UA(10b, .Lfault)
        .endm

        .macro dest
20:
        _ASM_EXTABLE_UA(20b, .Lfault)
        .endm

SYM_FUNC_START(csum_partial_copy_generic)
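        /* spill the callee-saved registers used as scratch below */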
        subq  $5*8, %rsp
        movq  %rbx, 0*8(%rsp)
        movq  %r12, 1*8(%rsp)
        movq  %r14, 2*8(%rsp)
        movq  %r13, 3*8(%rsp)
        movq  %r15, 4*8(%rsp)

        movl  $-1, %eax         /* seed the sum with all ones; a successful result is never 0 (0 is the fault return) */
        xorl  %r9d, %r9d        /* r9 stays zero, used to fold carries back into the sum */
        movl  %edx, %ecx
        cmpl  $8, %ecx
        jb    .Lshort           /* less than 8 bytes in total */

        testb  $7, %sil
        jne   .Lunaligned       /* destination not 8-byte aligned */
.Laligned:
        movl  %ecx, %r12d

        shrq  $6, %r12
        jz      .Lhandle_tail       /* < 64 */

        clc

        /* main loop: checksum and copy in 64 byte blocks */
        /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
        /* r11: temp3, rdx: temp4, r12: loopcnt */
        /* r10: temp5, r15: temp6, r14: temp7, r13: temp8 */
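        /*
         * Each iteration loads 64 bytes from the source, folds them into
         * the running sum with a chain of adcq, and stores them to the
         * destination.  decl and lea do not touch CF, so the carry from
         * the last adcq survives into the next iteration and is added
         * back in after the loop (adcq %r9, %rax with %r9 == 0).
         */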
        .p2align 4
.Lloop:
        source
        movq  (%rdi), %rbx
        source
        movq  8(%rdi), %r8
        source
        movq  16(%rdi), %r11
        source
        movq  24(%rdi), %rdx

        source
        movq  32(%rdi), %r10
        source
        movq  40(%rdi), %r15
        source
        movq  48(%rdi), %r14
        source
        movq  56(%rdi), %r13

30:
        /*
         * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
         * potentially unmapped kernel address.
         */
        _ASM_EXTABLE(30b, 2f)
        prefetcht0 5*64(%rdi)
2:
        adcq  %rbx, %rax
        adcq  %r8, %rax
        adcq  %r11, %rax
        adcq  %rdx, %rax
        adcq  %r10, %rax
        adcq  %r15, %rax
        adcq  %r14, %rax
        adcq  %r13, %rax

        decl %r12d

        dest
        movq %rbx, (%rsi)
        dest
        movq %r8, 8(%rsi)
        dest
        movq %r11, 16(%rsi)
        dest
        movq %rdx, 24(%rsi)

        dest
        movq %r10, 32(%rsi)
        dest
        movq %r15, 40(%rsi)
        dest
        movq %r14, 48(%rsi)
        dest
        movq %r13, 56(%rsi)

        leaq 64(%rdi), %rdi
        leaq 64(%rsi), %rsi

        jnz     .Lloop

        adcq  %r9, %rax         /* fold in the carry left over from the last adcq in the loop */

        /* do last up to 56 bytes */
.Lhandle_tail:
        /* ecx: count; bit 63 of rcx set: the final sum must be rotated left by 8 (see .Lodd) */
        movq %rcx, %r10
        andl $63, %ecx
        shrl $3, %ecx
        jz      .Lfold
        clc
        .p2align 4
.Lloop_8:
        source
        movq (%rdi), %rbx
        adcq %rbx, %rax
        decl %ecx
        dest
        movq %rbx, (%rsi)
        leaq 8(%rsi), %rsi /* preserve carry */
        leaq 8(%rdi), %rdi
        jnz     .Lloop_8
        adcq %r9, %rax  /* add in carry */

.Lfold:
        /* reduce checksum to 32bits */
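        /*
         * e.g. rax = 0x00000003fffffffd: high + low = 0x3 + 0xfffffffd
         * wraps to 0 with CF set, and the adcl below folds the carry back
         * in, giving 0x00000001.
         */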
        movl %eax, %ebx
        shrq $32, %rax
        addl %ebx, %eax
        adcl %r9d, %eax

        /* do last up to 6 bytes */
.Lhandle_7:
        movl %r10d, %ecx
        andl $7, %ecx
.L1:                            /* .Lshort rejoins the common path here */
        shrl $1, %ecx
        jz   .Lhandle_1
        movl $2, %edx
        xorl %ebx, %ebx
        clc
        .p2align 4
.Lloop_1:
        source
        movw (%rdi), %bx
        adcl %ebx, %eax
        decl %ecx
        dest
        movw %bx, (%rsi)
        leaq 2(%rdi), %rdi
        leaq 2(%rsi), %rsi
        jnz .Lloop_1
        adcl %r9d, %eax /* add in carry */

        /* handle last odd byte */
.Lhandle_1:
        testb $1, %r10b
        jz    .Lende
        xorl  %ebx, %ebx
        source
        movb (%rdi), %bl
        dest
        movb %bl, (%rsi)
        addl %ebx, %eax
        adcl %r9d, %eax         /* carry */

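        /*
         * Bit 63 of r10 (copied from rcx, set in .Lodd) flags that the
         * copy started with an odd leading byte; the sum is then off by a
         * byte rotation, which .Lwas_odd corrects.
         */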
.Lende:
        testq %r10, %r10
        js  .Lwas_odd
.Lout:
        movq 0*8(%rsp), %rbx
        movq 1*8(%rsp), %r12
        movq 2*8(%rsp), %r14
        movq 3*8(%rsp), %r13
        movq 4*8(%rsp), %r15
        addq $5*8, %rsp
        RET
.Lshort:
        movl %ecx, %r10d
        jmp  .L1
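        /*
         * Destination is not 8-byte aligned: copy up to 1+2+4 leading
         * bytes, folding them into the sum and adjusting the count, then
         * continue at .Laligned.
         */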
.Lunaligned:
        xorl %ebx, %ebx
        testb $1, %sil
        jne  .Lodd
1:      testb $2, %sil
        je   2f
        source
        movw (%rdi), %bx
        dest
        movw %bx, (%rsi)
        leaq 2(%rdi), %rdi
        subq $2, %rcx
        leaq 2(%rsi), %rsi
        addq %rbx, %rax
2:      testb $4, %sil
        je .Laligned
        source
        movl (%rdi), %ebx
        dest
        movl %ebx, (%rsi)
        leaq 4(%rdi), %rdi
        subq $4, %rcx
        leaq 4(%rsi), %rsi
        addq %rbx, %rax
        jmp .Laligned

.Lodd:
        source
        movb (%rdi), %bl
        dest
        movb %bl, (%rsi)
        leaq 1(%rdi), %rdi
        leaq 1(%rsi), %rsi
        /*
         * rcx = 2*rcx - 1 is odd, so rotating it right by one both
         * decrements the count and sets bit 63, flagging that the final
         * sum must be rotated by 8 (see .Lende / .Lwas_odd).
         */
        leaq -1(%rcx, %rcx), %rcx
        rorq $1, %rcx
        /* the odd leading byte lands in the high lane of its 16-bit word */
        shll $8, %ebx
        addq %rbx, %rax
        jmp 1b

.Lwas_odd:
        roll $8, %eax           /* the copy started on an odd byte: byte-rotate the sum to compensate */
        jmp .Lout

        /* Exception: just return 0 */
.Lfault:
        xorl %eax, %eax
        jmp  .Lout
SYM_FUNC_END(csum_partial_copy_generic)
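
/*
 * Illustration only, not part of this file: a rough sketch (assumed, not
 * verbatim) of how a user-copy wrapper in the spirit of
 * csum_and_copy_from_user() can drive this routine.  The function name is
 * hypothetical; the key point is that a 0 return from
 * csum_partial_copy_generic signals a fault (see .Lfault).
 *
 *      static __wsum copy_and_csum_from_user_sketch(const void __user *src,
 *                                                   void *dst, int len)
 *      {
 *              __wsum sum;
 *
 *              if (!user_access_begin(src, len))
 *                      return 0;
 *              sum = csum_partial_copy_generic((__force const void *)src, dst, len);
 *              user_access_end();
 *              return sum;     // 0 means the copy faulted
 *      }
 */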