|
- //+build !noasm
- //+build !appengine
-
- // Copyright 2015, Klaus Post, see LICENSE for details.
-
- // func crc32sse(a []byte) uint32
- TEXT ·crc32sse(SB), 4, $0
- MOVQ a+0(FP), R10
- XORQ BX, BX
-
- // CRC32 dword (R10), EBX
- BYTE $0xF2; BYTE $0x41; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0x1a
-
- MOVL BX, ret+24(FP)
- RET
-
- // func crc32sseAll(a []byte, dst []uint32)
- TEXT ·crc32sseAll(SB), 4, $0
- MOVQ a+0(FP), R8 // R8: src
- MOVQ a_len+8(FP), R10 // input length
- MOVQ dst+24(FP), R9 // R9: dst
- SUBQ $4, R10
- JS end
- JZ one_crc
- MOVQ R10, R13
- SHRQ $2, R10 // len/4
- ANDQ $3, R13 // len&3
- XORQ BX, BX
- ADDQ $1, R13
- TESTQ R10, R10
- JZ rem_loop
-
- crc_loop:
- MOVQ (R8), R11
- XORQ BX, BX
- XORQ DX, DX
- XORQ DI, DI
- MOVQ R11, R12
- SHRQ $8, R11
- MOVQ R12, AX
- MOVQ R11, CX
- SHRQ $16, R12
- SHRQ $16, R11
- MOVQ R12, SI
-
- // CRC32 EAX, EBX
- BYTE $0xF2; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0xd8
-
- // CRC32 ECX, EDX
- BYTE $0xF2; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0xd1
-
- // CRC32 ESI, EDI
- BYTE $0xF2; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0xfe
- MOVL BX, (R9)
- MOVL DX, 4(R9)
- MOVL DI, 8(R9)
-
- XORQ BX, BX
- MOVL R11, AX
-
- // CRC32 EAX, EBX
- BYTE $0xF2; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0xd8
- MOVL BX, 12(R9)
-
- ADDQ $16, R9
- ADDQ $4, R8
- XORQ BX, BX
- SUBQ $1, R10
- JNZ crc_loop
-
- rem_loop:
- MOVL (R8), AX
-
- // CRC32 EAX, EBX
- BYTE $0xF2; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0xd8
-
- MOVL BX, (R9)
- ADDQ $4, R9
- ADDQ $1, R8
- XORQ BX, BX
- SUBQ $1, R13
- JNZ rem_loop
-
- end:
- RET
-
- one_crc:
- MOVQ $1, R13
- XORQ BX, BX
- JMP rem_loop
-
- // func matchLenSSE4(a, b []byte, max int) int
- TEXT ·matchLenSSE4(SB), 4, $0
- MOVQ a_base+0(FP), SI
- MOVQ b_base+24(FP), DI
- MOVQ DI, DX
- MOVQ max+48(FP), CX
-
- cmp8:
- // As long as we are 8 or more bytes before the end of max, we can load and
- // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
- CMPQ CX, $8
- JLT cmp1
- MOVQ (SI), AX
- MOVQ (DI), BX
- CMPQ AX, BX
- JNE bsf
- ADDQ $8, SI
- ADDQ $8, DI
- SUBQ $8, CX
- JMP cmp8
-
- bsf:
- // If those 8 bytes were not equal, XOR the two 8 byte values, and return
- // the index of the first byte that differs. The BSF instruction finds the
- // least significant 1 bit, the amd64 architecture is little-endian, and
- // the shift by 3 converts a bit index to a byte index.
- XORQ AX, BX
- BSFQ BX, BX
- SHRQ $3, BX
- ADDQ BX, DI
-
- // Subtract off &b[0] to convert from &b[ret] to ret, and return.
- SUBQ DX, DI
- MOVQ DI, ret+56(FP)
- RET
-
- cmp1:
- // In the slices' tail, compare 1 byte at a time.
- CMPQ CX, $0
- JEQ matchLenEnd
- MOVB (SI), AX
- MOVB (DI), BX
- CMPB AX, BX
- JNE matchLenEnd
- ADDQ $1, SI
- ADDQ $1, DI
- SUBQ $1, CX
- JMP cmp1
-
- matchLenEnd:
- // Subtract off &b[0] to convert from &b[ret] to ret, and return.
- SUBQ DX, DI
- MOVQ DI, ret+56(FP)
- RET
-
- // func histogram(b []byte, h []int32)
- TEXT ·histogram(SB), 4, $0
- MOVQ b+0(FP), SI // SI: &b
- MOVQ b_len+8(FP), R9 // R9: len(b)
- MOVQ h+24(FP), DI // DI: Histogram
- MOVQ R9, R8
- SHRQ $3, R8
- JZ hist1
- XORQ R11, R11
-
- loop_hist8:
- MOVQ (SI), R10
-
- MOVB R10, R11
- INCL (DI)(R11*4)
- SHRQ $8, R10
-
- MOVB R10, R11
- INCL (DI)(R11*4)
- SHRQ $8, R10
-
- MOVB R10, R11
- INCL (DI)(R11*4)
- SHRQ $8, R10
-
- MOVB R10, R11
- INCL (DI)(R11*4)
- SHRQ $8, R10
-
- MOVB R10, R11
- INCL (DI)(R11*4)
- SHRQ $8, R10
-
- MOVB R10, R11
- INCL (DI)(R11*4)
- SHRQ $8, R10
-
- MOVB R10, R11
- INCL (DI)(R11*4)
- SHRQ $8, R10
-
- INCL (DI)(R10*4)
-
- ADDQ $8, SI
- DECQ R8
- JNZ loop_hist8
-
- hist1:
- ANDQ $7, R9
- JZ end_hist
- XORQ R10, R10
-
- loop_hist1:
- MOVB (SI), R10
- INCL (DI)(R10*4)
- INCQ SI
- DECQ R9
- JNZ loop_hist1
-
- end_hist:
- RET
|