|
- // Copyright 2016 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
-
- // +build !appengine
- // +build gc
- // +build !noasm
-
- #include "textflag.h"
-
- // The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
- // Go toolchain regression. See https://github.com/golang/go/issues/15426 and
- // https://github.com/golang/snappy/issues/29
- //
- // As a workaround, the package was built with a known good assembler, and
- // those instructions were disassembled by "objdump -d" to yield the
- // 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
- // style comments, in AT&T asm syntax. Note that rsp here is a physical
- // register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
- // The instructions were then encoded as "BYTE $0x.." sequences, which assemble
- // fine on Go 1.6.
-
- // The asm code generally follows the pure Go code in encode_other.go, except
- // where marked with a "!!!".
-
- // ----------------------------------------------------------------------------
-
- // func emitLiteral(dst, lit []byte) int
- //
- // All local variables fit into registers. The register allocation:
- // - AX len(lit)
- // - BX n
- // - DX return value
- // - DI &dst[i]
- // - R10 &lit[0]
- //
- // The 24 bytes of stack space is to call runtime·memmove.
- //
- // The unusual register allocation of local variables, such as R10 for the
- // source pointer, matches the allocation used at the call site in encodeBlock,
- // which makes it easier to manually inline this function.
- TEXT ·emitLiteral(SB), NOSPLIT, $24-56
- MOVQ dst_base+0(FP), DI
- MOVQ lit_base+24(FP), R10
- MOVQ lit_len+32(FP), AX
- MOVQ AX, DX
- MOVL AX, BX
- SUBL $1, BX
-
- CMPL BX, $60
- JLT oneByte
- CMPL BX, $256
- JLT twoBytes
-
- threeBytes:
- MOVB $0xf4, 0(DI)
- MOVW BX, 1(DI)
- ADDQ $3, DI
- ADDQ $3, DX
- JMP memmove
-
- twoBytes:
- MOVB $0xf0, 0(DI)
- MOVB BX, 1(DI)
- ADDQ $2, DI
- ADDQ $2, DX
- JMP memmove
-
- oneByte:
- SHLB $2, BX
- MOVB BX, 0(DI)
- ADDQ $1, DI
- ADDQ $1, DX
-
- memmove:
- MOVQ DX, ret+48(FP)
-
- // copy(dst[i:], lit)
- //
- // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
- // DI, R10 and AX as arguments.
- MOVQ DI, 0(SP)
- MOVQ R10, 8(SP)
- MOVQ AX, 16(SP)
- CALL runtime·memmove(SB)
- RET
-
- // ----------------------------------------------------------------------------
-
- // func emitCopy(dst []byte, offset, length int) int
- //
- // All local variables fit into registers. The register allocation:
- // - AX length
- // - SI &dst[0]
- // - DI &dst[i]
- // - R11 offset
- //
- // The unusual register allocation of local variables, such as R11 for the
- // offset, matches the allocation used at the call site in encodeBlock, which
- // makes it easier to manually inline this function.
- TEXT ·emitCopy(SB), NOSPLIT, $0-48
- MOVQ dst_base+0(FP), DI
- MOVQ DI, SI
- MOVQ offset+24(FP), R11
- MOVQ length+32(FP), AX
-
- loop0:
- // for length >= 68 { etc }
- CMPL AX, $68
- JLT step1
-
- // Emit a length 64 copy, encoded as 3 bytes.
- MOVB $0xfe, 0(DI)
- MOVW R11, 1(DI)
- ADDQ $3, DI
- SUBL $64, AX
- JMP loop0
-
- step1:
- // if length > 64 { etc }
- CMPL AX, $64
- JLE step2
-
- // Emit a length 60 copy, encoded as 3 bytes.
- MOVB $0xee, 0(DI)
- MOVW R11, 1(DI)
- ADDQ $3, DI
- SUBL $60, AX
-
- step2:
- // if length >= 12 || offset >= 2048 { goto step3 }
- CMPL AX, $12
- JGE step3
- CMPL R11, $2048
- JGE step3
-
- // Emit the remaining copy, encoded as 2 bytes.
- MOVB R11, 1(DI)
- SHRL $8, R11
- SHLB $5, R11
- SUBB $4, AX
- SHLB $2, AX
- ORB AX, R11
- ORB $1, R11
- MOVB R11, 0(DI)
- ADDQ $2, DI
-
- // Return the number of bytes written.
- SUBQ SI, DI
- MOVQ DI, ret+40(FP)
- RET
-
- step3:
- // Emit the remaining copy, encoded as 3 bytes.
- SUBL $1, AX
- SHLB $2, AX
- ORB $2, AX
- MOVB AX, 0(DI)
- MOVW R11, 1(DI)
- ADDQ $3, DI
-
- // Return the number of bytes written.
- SUBQ SI, DI
- MOVQ DI, ret+40(FP)
- RET
-
- // ----------------------------------------------------------------------------
-
- // func extendMatch(src []byte, i, j int) int
- //
- // All local variables fit into registers. The register allocation:
- // - DX &src[0]
- // - SI &src[j]
- // - R13 &src[len(src) - 8]
- // - R14 &src[len(src)]
- // - R15 &src[i]
- //
- // The unusual register allocation of local variables, such as R15 for a source
- // pointer, matches the allocation used at the call site in encodeBlock, which
- // makes it easier to manually inline this function.
- TEXT ·extendMatch(SB), NOSPLIT, $0-48
- MOVQ src_base+0(FP), DX
- MOVQ src_len+8(FP), R14
- MOVQ i+24(FP), R15
- MOVQ j+32(FP), SI
- ADDQ DX, R14
- ADDQ DX, R15
- ADDQ DX, SI
- MOVQ R14, R13
- SUBQ $8, R13
-
- cmp8:
- // As long as we are 8 or more bytes before the end of src, we can load and
- // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
- CMPQ SI, R13
- JA cmp1
- MOVQ (R15), AX
- MOVQ (SI), BX
- CMPQ AX, BX
- JNE bsf
- ADDQ $8, R15
- ADDQ $8, SI
- JMP cmp8
-
- bsf:
- // If those 8 bytes were not equal, XOR the two 8 byte values, and return
- // the index of the first byte that differs. The BSF instruction finds the
- // least significant 1 bit, the amd64 architecture is little-endian, and
- // the shift by 3 converts a bit index to a byte index.
- XORQ AX, BX
- BSFQ BX, BX
- SHRQ $3, BX
- ADDQ BX, SI
-
- // Convert from &src[ret] to ret.
- SUBQ DX, SI
- MOVQ SI, ret+40(FP)
- RET
-
- cmp1:
- // In src's tail, compare 1 byte at a time.
- CMPQ SI, R14
- JAE extendMatchEnd
- MOVB (R15), AX
- MOVB (SI), BX
- CMPB AX, BX
- JNE extendMatchEnd
- ADDQ $1, R15
- ADDQ $1, SI
- JMP cmp1
-
- extendMatchEnd:
- // Convert from &src[ret] to ret.
- SUBQ DX, SI
- MOVQ SI, ret+40(FP)
- RET
-
- // ----------------------------------------------------------------------------
-
- // func encodeBlock(dst, src []byte) (d int)
- //
- // All local variables fit into registers, other than "var table". The register
- // allocation:
- // - AX . .
- // - BX . .
- // - CX 56 shift (note that amd64 shifts by non-immediates must use CX).
- // - DX 64 &src[0], tableSize
- // - SI 72 &src[s]
- // - DI 80 &dst[d]
- // - R9 88 sLimit
- // - R10 . &src[nextEmit]
- // - R11 96 prevHash, currHash, nextHash, offset
- // - R12 104 &src[base], skip
- // - R13 . &src[nextS], &src[len(src) - 8]
- // - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x
- // - R15 112 candidate
- //
- // The second column (56, 64, etc) is the stack offset to spill the registers
- // when calling other functions. We could pack this slightly tighter, but it's
- // simpler to have a dedicated spill map independent of the function called.
- //
- // "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
- // extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
- // local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
- TEXT ·encodeBlock(SB), 0, $32888-56
- MOVQ dst_base+0(FP), DI
- MOVQ src_base+24(FP), SI
- MOVQ src_len+32(FP), R14
-
- // shift, tableSize := uint32(32-8), 1<<8
- MOVQ $24, CX
- MOVQ $256, DX
-
- calcShift:
- // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
- // shift--
- // }
- CMPQ DX, $16384
- JGE varTable
- CMPQ DX, R14
- JGE varTable
- SUBQ $1, CX
- SHLQ $1, DX
- JMP calcShift
-
- varTable:
- // var table [maxTableSize]uint16
- //
- // In the asm code, unlike the Go code, we can zero-initialize only the
- // first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
- // writes 16 bytes, so we can do only tableSize/8 writes instead of the
- // 2048 writes that would zero-initialize all of table's 32768 bytes.
- SHRQ $3, DX
- LEAQ table-32768(SP), BX
- PXOR X0, X0
-
- memclr:
- MOVOU X0, 0(BX)
- ADDQ $16, BX
- SUBQ $1, DX
- JNZ memclr
-
- // !!! DX = &src[0]
- MOVQ SI, DX
-
- // sLimit := len(src) - inputMargin
- MOVQ R14, R9
- SUBQ $15, R9
-
- // !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
- // change for the rest of the function.
- MOVQ CX, 56(SP)
- MOVQ DX, 64(SP)
- MOVQ R9, 88(SP)
-
- // nextEmit := 0
- MOVQ DX, R10
-
- // s := 1
- ADDQ $1, SI
-
- // nextHash := hash(load32(src, s), shift)
- MOVL 0(SI), R11
- IMULL $0x1e35a7bd, R11
- SHRL CX, R11
-
- outer:
- // for { etc }
-
- // skip := 32
- MOVQ $32, R12
-
- // nextS := s
- MOVQ SI, R13
-
- // candidate := 0
- MOVQ $0, R15
-
- inner0:
- // for { etc }
-
- // s := nextS
- MOVQ R13, SI
-
- // bytesBetweenHashLookups := skip >> 5
- MOVQ R12, R14
- SHRQ $5, R14
-
- // nextS = s + bytesBetweenHashLookups
- ADDQ R14, R13
-
- // skip += bytesBetweenHashLookups
- ADDQ R14, R12
-
- // if nextS > sLimit { goto emitRemainder }
- MOVQ R13, AX
- SUBQ DX, AX
- CMPQ AX, R9
- JA emitRemainder
-
- // candidate = int(table[nextHash])
- // XXX: MOVWQZX table-32768(SP)(R11*2), R15
- // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
- BYTE $0x4e
- BYTE $0x0f
- BYTE $0xb7
- BYTE $0x7c
- BYTE $0x5c
- BYTE $0x78
-
- // table[nextHash] = uint16(s)
- MOVQ SI, AX
- SUBQ DX, AX
-
- // XXX: MOVW AX, table-32768(SP)(R11*2)
- // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
- BYTE $0x66
- BYTE $0x42
- BYTE $0x89
- BYTE $0x44
- BYTE $0x5c
- BYTE $0x78
-
- // nextHash = hash(load32(src, nextS), shift)
- MOVL 0(R13), R11
- IMULL $0x1e35a7bd, R11
- SHRL CX, R11
-
- // if load32(src, s) != load32(src, candidate) { continue } break
- MOVL 0(SI), AX
- MOVL (DX)(R15*1), BX
- CMPL AX, BX
- JNE inner0
-
- fourByteMatch:
- // As per the encode_other.go code:
- //
- // A 4-byte match has been found. We'll later see etc.
-
- // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
- // on inputMargin in encode.go.
- MOVQ SI, AX
- SUBQ R10, AX
- CMPQ AX, $16
- JLE emitLiteralFastPath
-
- // ----------------------------------------
- // Begin inline of the emitLiteral call.
- //
- // d += emitLiteral(dst[d:], src[nextEmit:s])
-
- MOVL AX, BX
- SUBL $1, BX
-
- CMPL BX, $60
- JLT inlineEmitLiteralOneByte
- CMPL BX, $256
- JLT inlineEmitLiteralTwoBytes
-
- inlineEmitLiteralThreeBytes:
- MOVB $0xf4, 0(DI)
- MOVW BX, 1(DI)
- ADDQ $3, DI
- JMP inlineEmitLiteralMemmove
-
- inlineEmitLiteralTwoBytes:
- MOVB $0xf0, 0(DI)
- MOVB BX, 1(DI)
- ADDQ $2, DI
- JMP inlineEmitLiteralMemmove
-
- inlineEmitLiteralOneByte:
- SHLB $2, BX
- MOVB BX, 0(DI)
- ADDQ $1, DI
-
- inlineEmitLiteralMemmove:
- // Spill local variables (registers) onto the stack; call; unspill.
- //
- // copy(dst[i:], lit)
- //
- // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
- // DI, R10 and AX as arguments.
- MOVQ DI, 0(SP)
- MOVQ R10, 8(SP)
- MOVQ AX, 16(SP)
- ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
- MOVQ SI, 72(SP)
- MOVQ DI, 80(SP)
- MOVQ R15, 112(SP)
- CALL runtime·memmove(SB)
- MOVQ 56(SP), CX
- MOVQ 64(SP), DX
- MOVQ 72(SP), SI
- MOVQ 80(SP), DI
- MOVQ 88(SP), R9
- MOVQ 112(SP), R15
- JMP inner1
-
- inlineEmitLiteralEnd:
- // End inline of the emitLiteral call.
- // ----------------------------------------
-
- emitLiteralFastPath:
- // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
- MOVB AX, BX
- SUBB $1, BX
- SHLB $2, BX
- MOVB BX, (DI)
- ADDQ $1, DI
-
- // !!! Implement the copy from lit to dst as a 16-byte load and store.
- // (Encode's documentation says that dst and src must not overlap.)
- //
- // This always copies 16 bytes, instead of only len(lit) bytes, but that's
- // OK. Subsequent iterations will fix up the overrun.
- //
- // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
- // 16-byte loads and stores. This technique probably wouldn't be as
- // effective on architectures that are fussier about alignment.
- MOVOU 0(R10), X0
- MOVOU X0, 0(DI)
- ADDQ AX, DI
-
- inner1:
- // for { etc }
-
- // base := s
- MOVQ SI, R12
-
- // !!! offset := base - candidate
- MOVQ R12, R11
- SUBQ R15, R11
- SUBQ DX, R11
-
- // ----------------------------------------
- // Begin inline of the extendMatch call.
- //
- // s = extendMatch(src, candidate+4, s+4)
-
- // !!! R14 = &src[len(src)]
- MOVQ src_len+32(FP), R14
- ADDQ DX, R14
-
- // !!! R13 = &src[len(src) - 8]
- MOVQ R14, R13
- SUBQ $8, R13
-
- // !!! R15 = &src[candidate + 4]
- ADDQ $4, R15
- ADDQ DX, R15
-
- // !!! s += 4
- ADDQ $4, SI
-
- inlineExtendMatchCmp8:
- // As long as we are 8 or more bytes before the end of src, we can load and
- // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
- CMPQ SI, R13
- JA inlineExtendMatchCmp1
- MOVQ (R15), AX
- MOVQ (SI), BX
- CMPQ AX, BX
- JNE inlineExtendMatchBSF
- ADDQ $8, R15
- ADDQ $8, SI
- JMP inlineExtendMatchCmp8
-
- inlineExtendMatchBSF:
- // If those 8 bytes were not equal, XOR the two 8 byte values, and return
- // the index of the first byte that differs. The BSF instruction finds the
- // least significant 1 bit, the amd64 architecture is little-endian, and
- // the shift by 3 converts a bit index to a byte index.
- XORQ AX, BX
- BSFQ BX, BX
- SHRQ $3, BX
- ADDQ BX, SI
- JMP inlineExtendMatchEnd
-
- inlineExtendMatchCmp1:
- // In src's tail, compare 1 byte at a time.
- CMPQ SI, R14
- JAE inlineExtendMatchEnd
- MOVB (R15), AX
- MOVB (SI), BX
- CMPB AX, BX
- JNE inlineExtendMatchEnd
- ADDQ $1, R15
- ADDQ $1, SI
- JMP inlineExtendMatchCmp1
-
- inlineExtendMatchEnd:
- // End inline of the extendMatch call.
- // ----------------------------------------
-
- // ----------------------------------------
- // Begin inline of the emitCopy call.
- //
- // d += emitCopy(dst[d:], base-candidate, s-base)
-
- // !!! length := s - base
- MOVQ SI, AX
- SUBQ R12, AX
-
- inlineEmitCopyLoop0:
- // for length >= 68 { etc }
- CMPL AX, $68
- JLT inlineEmitCopyStep1
-
- // Emit a length 64 copy, encoded as 3 bytes.
- MOVB $0xfe, 0(DI)
- MOVW R11, 1(DI)
- ADDQ $3, DI
- SUBL $64, AX
- JMP inlineEmitCopyLoop0
-
- inlineEmitCopyStep1:
- // if length > 64 { etc }
- CMPL AX, $64
- JLE inlineEmitCopyStep2
-
- // Emit a length 60 copy, encoded as 3 bytes.
- MOVB $0xee, 0(DI)
- MOVW R11, 1(DI)
- ADDQ $3, DI
- SUBL $60, AX
-
- inlineEmitCopyStep2:
- // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
- CMPL AX, $12
- JGE inlineEmitCopyStep3
- CMPL R11, $2048
- JGE inlineEmitCopyStep3
-
- // Emit the remaining copy, encoded as 2 bytes.
- MOVB R11, 1(DI)
- SHRL $8, R11
- SHLB $5, R11
- SUBB $4, AX
- SHLB $2, AX
- ORB AX, R11
- ORB $1, R11
- MOVB R11, 0(DI)
- ADDQ $2, DI
- JMP inlineEmitCopyEnd
-
- inlineEmitCopyStep3:
- // Emit the remaining copy, encoded as 3 bytes.
- SUBL $1, AX
- SHLB $2, AX
- ORB $2, AX
- MOVB AX, 0(DI)
- MOVW R11, 1(DI)
- ADDQ $3, DI
-
- inlineEmitCopyEnd:
- // End inline of the emitCopy call.
- // ----------------------------------------
-
- // nextEmit = s
- MOVQ SI, R10
-
- // if s >= sLimit { goto emitRemainder }
- MOVQ SI, AX
- SUBQ DX, AX
- CMPQ AX, R9
- JAE emitRemainder
-
- // As per the encode_other.go code:
- //
- // We could immediately etc.
-
- // x := load64(src, s-1)
- MOVQ -1(SI), R14
-
- // prevHash := hash(uint32(x>>0), shift)
- MOVL R14, R11
- IMULL $0x1e35a7bd, R11
- SHRL CX, R11
-
- // table[prevHash] = uint16(s-1)
- MOVQ SI, AX
- SUBQ DX, AX
- SUBQ $1, AX
-
- // XXX: MOVW AX, table-32768(SP)(R11*2)
- // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
- BYTE $0x66
- BYTE $0x42
- BYTE $0x89
- BYTE $0x44
- BYTE $0x5c
- BYTE $0x78
-
- // currHash := hash(uint32(x>>8), shift)
- SHRQ $8, R14
- MOVL R14, R11
- IMULL $0x1e35a7bd, R11
- SHRL CX, R11
-
- // candidate = int(table[currHash])
- // XXX: MOVWQZX table-32768(SP)(R11*2), R15
- // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
- BYTE $0x4e
- BYTE $0x0f
- BYTE $0xb7
- BYTE $0x7c
- BYTE $0x5c
- BYTE $0x78
-
- // table[currHash] = uint16(s)
- ADDQ $1, AX
-
- // XXX: MOVW AX, table-32768(SP)(R11*2)
- // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
- BYTE $0x66
- BYTE $0x42
- BYTE $0x89
- BYTE $0x44
- BYTE $0x5c
- BYTE $0x78
-
- // if uint32(x>>8) == load32(src, candidate) { continue }
- MOVL (DX)(R15*1), BX
- CMPL R14, BX
- JEQ inner1
-
- // nextHash = hash(uint32(x>>16), shift)
- SHRQ $8, R14
- MOVL R14, R11
- IMULL $0x1e35a7bd, R11
- SHRL CX, R11
-
- // s++
- ADDQ $1, SI
-
- // break out of the inner1 for loop, i.e. continue the outer loop.
- JMP outer
-
- emitRemainder:
- // if nextEmit < len(src) { etc }
- MOVQ src_len+32(FP), AX
- ADDQ DX, AX
- CMPQ R10, AX
- JEQ encodeBlockEnd
-
- // d += emitLiteral(dst[d:], src[nextEmit:])
- //
- // Push args.
- MOVQ DI, 0(SP)
- MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
- MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
- MOVQ R10, 24(SP)
- SUBQ R10, AX
- MOVQ AX, 32(SP)
- MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
-
- // Spill local variables (registers) onto the stack; call; unspill.
- MOVQ DI, 80(SP)
- CALL ·emitLiteral(SB)
- MOVQ 80(SP), DI
-
- // Finish the "d +=" part of "d += emitLiteral(etc)".
- ADDQ 48(SP), DI
-
- encodeBlockEnd:
- MOVQ dst_base+0(FP), AX
- SUBQ AX, DI
- MOVQ DI, d+48(FP)
- RET
|