#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/machparam.h>
#include "assym.h"
ENTRY(ip_ocsum)
!
! Add a run of 16-bit words to a running sum and return the sum folded
! down with end-around carries (the fold adds the high part of the
! accumulator back into the low part, 64 -> 33 -> 18 -> ~16 bits).
!
! Entry point for checksum-only.
! %o0 contains buffer address
! %o1 contains count of 16bit words
! %o2 contains sum
!
! Returns the folded sum in %o0.
!
! %o3 temporary
! %o4 temporary
! %o5 temporary (dword base address, then masks)
! %g1 32bit mask
! %g2 temporary (leading byte / bit / halfword count)
! %g4 16bit mask
! %g5 64bit mask (all 1s)
!
! Leaf routine: no register window is taken here. Counts of 32 or more
! halfwords are handed to ip_ocsum_long with %g1, %g4, %g5 and %o5
! already set up as that routine expects.
!
! The dword paths load whole aligned 8-byte words and mask off the bytes
! that are outside the buffer, so bytes just before and just after the
! buffer (within the same aligned dword) are read but not summed.
! Assumes the buffer is halfword aligned; the 2/4/6/8 byte counts below
! depend on it - TODO confirm against callers.
!
not %g0, %g5 ! all 1's
prefetch [%o0], #n_reads ! first hword, dword, cacheline
clruw %g5, %g1 ! 32 1's at low end
srl %g5, 16, %g4 ! 16 1's at low end
cmp %o1, 32 ! at least a cacheline (64 bytes)?
bge,pn %icc, ip_ocsum_long ! yes, do the whole works
andn %o0, 7, %o5 ! delay: base src addr
cmp %o1, 4 ! < 4 halfwords?
bl,pn %icc, .tiny ! < 4 halfwords, just do them
inc 8, %o5 ! delay: next addr (no matter for .tiny)
!
! Leading dword: 1-4 halfwords lie between the buffer start and the next
! dword boundary. Load the whole aligned dword and keep only the
! low-order bits, which hold the bytes at the higher addresses.
!
ldx [%o5-8], %o3 ! NB base addr
sub %o5, %o0, %g2 ! byte count: 2/4/6/8
mov %o5, %o0 ! src now dword aligned
sll %g2, 2, %g2 ! 8/16/24/32 for mask
sllx %g5, %g2, %o5 ! shift in two halves: a single shift
sllx %o5, %g2, %o5 ! mask: 16/32/48/64 0's at low end
srl %g2, 3, %g2 ! hw count
andn %o3, %o5, %o3 ! select hw's from src
srlx %o3, 32, %o4 ! hi32
b 9f ! join the dword loop after its load
sub %o1, %g2, %o1 ! delay: decr count, 1-4 halfwords
!
! Whole dwords. Each dword is split into two 32-bit halves which are
! added separately into the 64-bit accumulator %o2.
!
.short_dw: ! max 7 iters of 4 clocks; 1 mispred of 4
ldx [%o0], %o3 ! tmp64 = *src++ (groups with the branch)
inc 8, %o0 ! (D-cache load-use delay)
dec 4, %o1 ! decrement count, 4 halfwords
srlx %o3, 32, %o4 ! hi32
9: and %o3, %g1, %o3 ! lo32
add %o4, %o2, %o2 ! accumulator
andncc %o1, 3, %g0 ! more than 3 hwords left?
bnz,pt %icc, .short_dw
add %o3, %o2, %o2 ! delay: accumulator (runs on both paths)
!
! Trailing dword: 0-3 halfwords remain. Load the whole dword and keep
! only the high-order bits, which hold the bytes at the lower addresses.
!
.short_hw: ! trailing dw: 0-3 hwords
tst %o1 ! 0 seems fairly common...
bz,a .short_fold
srlx %o2, 32, %o4 ! delay: hi32 (annulled unless taken)
! mispredict 4 + 7 clocks for 1-3
ldx [%o0], %o3
sll %o1, 4, %o1 ! bitcount: 16/32/48
srlx %g5, %o1, %o5 ! mask: 16/32/48 0's at high end
andn %o3, %o5, %o3 ! select hw's from src
srlx %o3, 32, %o4 ! hi32
and %o3, %g1, %o3 ! lo32
add %o4, %o2, %o2 ! accumulator
add %o3, %o2, %o2 ! accumulator
! at this point the 64-bit accumulator
! has the result that needs to be returned in 16-bits
srlx %o2, 32, %o4 ! hi32
!
! Fold. On entry %o2 is the 64-bit accumulator and %o4 its high 32 bits.
! Review note: the last add can carry into bit 16 (for example a 33-bit
! value of 0x10000ffff folds to 0x1ffff and then to 0x10000), so the
! value returned is not strictly limited to 16 bits - confirm that
! callers fold once more.
!
.short_fold:
and %o2, %g1, %o2 ! lo32
add %o4, %o2, %o2 ! 33b
srlx %o2, 16, %o3 ! hi17
and %o2, %g4, %o2 ! lo16
add %o3, %o2, %o2 ! 18b
srlx %o2, 16, %o3 ! hi2
and %o2, %g4, %o2 ! lo16
retl ! return
add %o3, %o2, %o0 ! delay: 16b result in %o0
!
! Fewer than 4 halfwords in total (possibly none): add them one at a
! time with halfword loads, so no dword masking is needed.
!
.tiny: ! almost never: less than 4 halfwords total.
tst %o1
bz,a .short_fold
srlx %o2, 32, %o4 ! delay: hi32 (annulled unless taken)
lduh [%o0], %o3 ! tmp16 = *src++
1:
inc 2, %o0
! stall for D-cache
add %o3, %o2, %o2 ! accumulator
deccc %o1 ! decrement count
bnz,a,pt %icc, 1b
lduh [%o0], %o3 ! delay: tmp16 = *src++ (only when looping)
! at this point the 64-bit accumulator
! has the result that needs to be returned in 16-bits
b .short_fold
srlx %o2, 32, %o4 ! hi32
SET_SIZE(ip_ocsum) ! 64-bit version
ENTRY(ip_ocsum_long) ! 64-bit, large blocks
save %sp, -SA(MINFRAME), %sp ! get another window
!
! Large-block path, reached from ip_ocsum when the count is 32 halfwords
! or more. Sums in four stages: leading partial dword plus whole dwords
! up to a 64-byte boundary, a main loop of 64 bytes per iteration, whole
! trailing dwords, then trailing halfwords. The folded sum is returned
! in the caller's %o0 (written to %i0 before the restore).
!
! This routine never sets %g1, %g4 or %g5 and reads %i5 before writing
! it, so it depends on the setup ip_ocsum performs before branching here.
!
! %i0 contains buffer address
! %i1 contains count of 16bit words
! %i2 contains sum
! %i4 contains the mainloop count
! %i5 comes in with the buffer address rounded down to the first dword
!
! %g1 32bit mask
! %g4 16bit mask
! %g5 64bit mask (all 1s)
! %g6 fetch-ahead offset for Ecache
!     (review note: no instruction in this routine references %g6)
!
! %l0-7,%o0-5,%g2-3 mainloop temporaries
!
! Assumes the buffer is halfword aligned - TODO confirm against callers.
!
! 1 clock overhead
btst 63, %i0 ! src 64-byte aligned?
bz,a,pt %icc, .mainsection ! aligned blocks are fairly common
andncc %i1, 31, %i4 ! delay (annulled unless taken): at least 64 bytes for main loop?
! Leading dword, with 1-4 hwords: 9 clocks
! Assumes ok to read the entire dword with the leading bytes
! The low-order bits of the dword hold the bytes at the higher
! addresses, so the mask keeps the low 16/32/48/64 bits.
ldx [%i5], %l0 ! NB base addr
inc 8, %i5 ! next addr
sub %i5, %i0, %l2 ! byte count: 2/4/6/8
mov %i5, %i0 ! src now dword aligned
sll %l2, 2, %l2 ! 8/16/24/32 for mask
sllx %g5, %l2, %l4 ! shift in two halves to reach 64
sllx %l4, %l2, %l4 ! mask: 16, 32, 48, 64 0's at lsb
srl %l2, 3, %l2 ! 1/2/3/4 for count
andn %l0, %l4, %l0 ! select hw's from src
srlx %l0, 32, %o0 ! hi32
b 9f ! join the dword loop after its load
sub %i1, %l2, %i1 ! delay: decr count, 1-4 halfwords
! Do dwords until source is 64-byte aligned, 0-7 iterations
! (7 when the source sits at 8 mod 64 after the leading dword)
! 4 clocks per + 4 for 1 mispred = 16 clocks avg
.dw: ldx [%i0], %l0 ! tmp64 = *src++ (groups with the branch below)
inc 8, %i0 ! (Dcache load-use delay)
dec 4, %i1 ! decrement count, 4 halfwords
srlx %l0, 32, %o0 ! hi32
9: and %l0, %g1, %l0 ! lo32
add %o0, %i2, %i2 ! accumulator
btst 63, %i0 ! src 64-byte aligned?
bnz,pt %icc, .dw
add %l0, %i2, %i2 ! delay: accumulator (runs on both paths)
! At this point source address is 64 byte aligned
! and we have dealt with 1-31 halfwords, so the count (at least 32
! on entry) has not gone negative.
andncc %i1, 31, %i4 ! at least 64 bytes for main loop?
!
! On arrival the condition codes come from "andncc %i1, 31, %i4" on
! either path: %i4 is the halfword count rounded down to a multiple
! of 32, and Z is set when there is no full 64-byte block.
! The clock total below is the original estimate for the main section.
!
.mainsection: ! total 18n + 21 clocks
bz,pn %icc, .postamble
and %i1, 31, %i1 ! delay: count for postamble
! preload for main loop - 9 clocks assuming D$ hits at 1 per
ldx [%i0+0], %l0
ldx [%i0+8], %l1
ldx [%i0+16], %l2 ! %l0 could be used here if Dcache hit
ldx [%i0+24], %l3 ! but US-II prefetch only loads Ecache
ldx [%i0+32], %l4 ! check on US-III: could mix preloads & splits?
ldx [%i0+40], %l5
ldx [%i0+48], %l6
ldx [%i0+56], %l7
inc 64, %i0
prefetch [%i0], #n_reads
! main loop. Read 64 bytes at a time - 18 clocks per iteration
!
! Software pipelined: %l0-%l7 hold the 8 dwords of the current block.
! Each is split into hi32/lo32, then the loads for the NEXT block are
! interleaved with the adds for the current one. Every %lN is reloaded
! only after the add that consumes it, so the instruction order below
! must not be changed casually.
!
5: ! plus 4 for the exit mispredict
srlx %l0, 32, %o0 ! hi32 to %o0
and %l0, %g1, %l0 ! lo32 to %l0
srlx %l1, 32, %o1 ! hi32 to %o1
and %l1, %g1, %l1 ! lo32 to %l1
srlx %l2, 32, %o2 ! hi32 to %o2
and %l2, %g1, %l2 ! lo32 to %l2
srlx %l3, 32, %o3 ! hi32 to %o3
and %l3, %g1, %l3 ! lo32 to %l3
srlx %l4, 32, %o4 ! hi32 to %o4
and %l4, %g1, %l4 ! lo32 to %l4
srlx %l5, 32, %o5 ! hi32 to %o5
and %l5, %g1, %l5 ! lo32 to %l5
srlx %l6, 32, %g2 ! hi32 to %g2
and %l6, %g1, %l6 ! lo32 to %l6
srlx %l7, 32, %g3 ! hi32 to %g3
and %l7, %g1, %l7 ! lo32 to %l7
! splits gave 16 off 32b vals
deccc 32, %i4 ! mv early,avoid mispredicts? nohelp US-II.
bz,pn %icc, .looptidy ! count now zero?
add %l0, %o0, %o0 ! delay: first pair, done on both paths
ldx [%i0+0], %l0 ! %l0 already consumed in the delay slot
add %l1, %o1, %o1 ! adds and loads
add %l2, %o2, %o2
ldx [%i0+8], %l1
add %l3, %o3, %o3
add %l4, %o4, %o4
ldx [%i0+16], %l2
add %l5, %o5, %o5
add %l6, %g2, %g2
ldx [%i0+24], %l3
add %l7, %g3, %g3 ! now 8 off 33b vals
add %o0, %o1, %o0
ldx [%i0+32], %l4
add %o2, %o3, %o1
add %o4, %o5, %o2
ldx [%i0+40], %l5
add %g2, %g3, %o3 ! now 4 off 34b vals
add %o0, %o1, %o0
ldx [%i0+48], %l6
add %o2, %o3, %o1 ! 2 off 35b
ldx [%i0+56], %l7
add %o0, %o1, %o0 ! 36b
inc 64, %i0 ! increment source address
add %o0, %i2, %i2 ! accumulator
ba 5b
prefetch [%i0], #n_reads ! delay: next cacheline
! end of main loop
!
! Last block: same reduction as in the loop body but without loading a
! following block. %l0 was already added into %o0 in the delay slot of
! the exit branch.
!
.looptidy: ! compute remaining partial sum - 8 clocks
add %l1, %o1, %o1
add %l2, %o2, %o2
add %l3, %o3, %o3
add %l4, %o4, %o4
add %l5, %o5, %o5
add %l6, %g2, %g2
add %l7, %g3, %g3 ! 8 x 33b
add %o0, %o1, %o0
add %o2, %o3, %o1
add %o4, %o5, %o2
add %g2, %g3, %o3 ! 4 x 34b
add %o0, %o1, %o0
add %o2, %o3, %o1 ! 2 x 35b
add %o0, %i2, %i2 ! accumulator
add %o1, %i2, %i2 ! accumulator
.postamble:
! postamble hword count is in %i1 (can be zero)
! while at least 1 dword, do dwords. Max 7 iterations.
andncc %i1, 3, %g0 ! more than 3 hwords?
.dotail_dw:
bz,a,pn %icc, .dotail_hw
tst %i1 ! delay (annulled unless taken): any at all left?
8:
ldx [%i0], %l0 ! tmp64 = *src++
inc 8, %i0
dec 4, %i1 ! decrement count, 4 halfwords
! stall for D-cache
srlx %l0, 32, %o0 ! hi32
and %l0, %g1, %l0 ! lo32
add %o0, %i2, %i2 ! accumulator
andncc %i1, 3, %g0 ! more than 3 hwords?
bnz,pt %icc, 8b
add %l0, %i2, %i2 ! delay: accumulator (runs on both paths)
! while at least 1 hword, do hwords. Max 3 iterations.
tst %i1
! On arrival the condition codes reflect "tst %i1" on either path.
.dotail_hw:
bz,a .fold
srlx %i2, 32, %o0 ! delay: hi32 (annulled unless taken)
lduh [%i0], %l0 ! tmp16 = *src++
1:
inc 2, %i0
! stall for D-cache
add %l0, %i2, %i2 ! accumulator
deccc %i1 ! decrement count
bnz,a,pt %icc, 1b
lduh [%i0], %l0 ! delay: tmp16 = *src++ (only when looping)
! at this point the 64-bit accumulator
! has the result that needs to be returned in 16-bits
srlx %i2, 32, %o0 ! hi32
!
! Fold. On entry %i2 is the 64-bit accumulator and %o0 its high 32 bits.
! Review note: the last add can carry into bit 16 (for example a 33-bit
! value of 0x10000ffff folds to 0x1ffff and then to 0x10000), so the
! value returned is not strictly limited to 16 bits - confirm that
! callers fold once more.
!
.fold:
and %i2, %g1, %o1 ! lo32
add %o0, %o1, %o0 ! 33b
srlx %o0, 16, %o1 ! hi17
and %o0, %g4, %o0 ! lo16
add %o1, %o0, %o0 ! 18b
srlx %o0, 16, %o1 ! hi2
and %o0, %g4, %o0 ! lo16
add %o1, %o0, %i0 ! 16b result in %i0
ret ! return
restore ! delay: %i0 becomes the caller's %o0
SET_SIZE(ip_ocsum_long) ! 64-bit version