// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.
//
// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".
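//
// For reference, the load32 helper quoted in the Go comments below is, per
// encode_other.go, an unaligned little-endian load; a rough sketch (the real
// version also helps the compiler eliminate bounds checks):
//
//	func load32(b []byte, i int) uint32 {
//		b = b[i : i+4]
//		return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
//			uint32(b[3])<<24
//	}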

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	CALL runtime·memmove(SB)
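
// For orientation, a sketch of the pure Go emitLiteral that this assembly
// mirrors (as in encode_other.go; tagLiteral is the snappy format's 0x00
// element tag):
//
//	func emitLiteral(dst, lit []byte) int {
//		i, n := 0, uint(len(lit)-1)
//		switch {
//		case n < 60:
//			dst[0] = uint8(n)<<2 | tagLiteral
//			i = 1
//		case n < 1<<8:
//			dst[0] = 60<<2 | tagLiteral
//			dst[1] = uint8(n)
//			i = 2
//		default:
//			dst[0] = 61<<2 | tagLiteral
//			dst[1] = uint8(n)
//			dst[2] = uint8(n >> 8)
//			i = 3
//		}
//		return i + copy(dst[i:], lit)
//	}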

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX
loop0:
	// for length >= 68 { etc }

	// Emit a length 64 copy, encoded as 3 bytes.

step1:
	// if length > 64 { etc }

	// Emit a length 60 copy, encoded as 3 bytes.

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }

	// Emit the remaining copy, encoded as 2 bytes.

	// Return the number of bytes written.

step3:
	// Emit the remaining copy, encoded as 3 bytes.

	// Return the number of bytes written.
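
// A sketch of the pure Go emitCopy that this assembly mirrors (as in
// encode_other.go; tagCopy1 and tagCopy2 are the snappy format's 0x01 and
// 0x02 element tags):
//
//	func emitCopy(dst []byte, offset, length int) int {
//		i := 0
//		for length >= 68 {
//			// Emit a length 64 copy, encoded as 3 bytes.
//			dst[i+0] = 63<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			i += 3
//			length -= 64
//		}
//		if length > 64 {
//			// Emit a length 60 copy, encoded as 3 bytes.
//			dst[i+0] = 59<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			i += 3
//			length -= 60
//		}
//		if length >= 12 || offset >= 2048 {
//			// Emit the remaining copy, encoded as 3 bytes.
//			dst[i+0] = uint8(length-1)<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			return i + 3
//		}
//		// Emit the remaining copy, encoded as 2 bytes.
//		dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//		dst[i+1] = uint8(offset)
//		return i + 2
//	}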

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	// If those 8 bytes were not equal, XOR the two 8-byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
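	//
	// In Go terms, this step computes roughly the following (a sketch, with
	// binary.LittleEndian.Uint64 and bits.TrailingZeros64, from
	// encoding/binary and math/bits, standing in for the unaligned 8-byte
	// loads and the BSF instruction):
	//
	//	x := binary.LittleEndian.Uint64(src[i:]) ^ binary.LittleEndian.Uint64(src[j:])
	//	j += bits.TrailingZeros64(x) >> 3
	//	return j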
	// Convert from &src[ret] to ret.

	// In src's tail, compare 1 byte at a time.

	// Convert from &src[ret] to ret.
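
// For orientation, the pure Go extendMatch (as in encode_other.go) is simply:
//
//	func extendMatch(src []byte, i, j int) int {
//		for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//		}
//		return j
//	}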

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls, gives 32768 + 56 + 64 = 32888.
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14
	// shift, tableSize := uint32(32-8), 1<<8

	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
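	//
	// For example, with len(src) == 4096 and maxTableSize == 1<<14 (as in
	// encode.go), the loop stops at tableSize == 4096 and shift == 32-12,
	// so hash values index a 4096-entry table.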
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
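	//
	// For example, at the smallest tableSize of 1<<8 that is 256*2/16 == 32
	// MOVOU writes; only at the maximum tableSize of 1<<14 would all 2048
	// writes be needed.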
	LEAQ table-32768(SP), BX

	// sLimit := len(src) - inputMargin

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	// nextHash := hash(load32(src, s), shift)
	IMULL $0x1e35a7bd, R11
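	//
	// Together with a right shift by the count in CX, this multiply
	// implements the hash from encode.go; a sketch:
	//
	//	func hash(u, shift uint32) uint32 {
	//		return (u * 0x1e35a7bd) >> shift
	//	}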
	// bytesBetweenHashLookups := skip >> 5

	// nextS = s + bytesBetweenHashLookups

	// skip += bytesBetweenHashLookups

	// if nextS > sLimit { goto emitRemainder }
	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15

	// table[nextHash] = uint16(s)
	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	// nextHash = hash(load32(src, nextS), shift)
	IMULL $0x1e35a7bd, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	JLE emitLiteralFastPath
	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])
	JLT inlineEmitLiteralOneByte
	JLT inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	JMP inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	JMP inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
	CALL runtime·memmove(SB)
inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------
emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
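	//
	// A Go-flavored sketch of the fast path (load16 and store16 are
	// hypothetical unaligned 16-byte load/store helpers standing in for
	// MOVOU; only d advances by the true literal length):
	//
	//	store16(dst[d:], load16(lit)) // may write up to 16 bytes
	//	d += len(lit)                 // but only len(lit) of them count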
	// !!! offset := base - candidate

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVQ src_len+32(FP), R14

	// !!! R13 = &src[len(src) - 8]

	// !!! R15 = &src[candidate + 4]
inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	JA inlineExtendMatchCmp1
	JNE inlineExtendMatchBSF
	JMP inlineExtendMatchCmp8
inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8-byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	JMP inlineExtendMatchEnd
inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	JAE inlineExtendMatchEnd
	JNE inlineExtendMatchEnd
	JMP inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------
	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	JLT inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	JMP inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	JLE inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	JGE inlineEmitCopyStep3
	JGE inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	JMP inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------
	// if s >= sLimit { goto emitRemainder }

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)

	// prevHash := hash(uint32(x>>0), shift)
	IMULL $0x1e35a7bd, R11
	// table[prevHash] = uint16(s-1)
	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)

	// currHash := hash(uint32(x>>8), shift)
	IMULL $0x1e35a7bd, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15

	// table[currHash] = uint16(s)
	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	// if uint32(x>>8) == load32(src, candidate) { continue }

	// nextHash = hash(uint32(x>>16), shift)
	IMULL $0x1e35a7bd, R11

	// break out of the inner1 for loop, i.e. continue the outer loop.
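	//
	// For orientation, the Go shape of this whole inner1 step (roughly as in
	// encode_other.go, whose table is indexed with a tableMask of
	// maxTableSize-1):
	//
	//	x := load64(src, s-1)
	//	prevHash := hash(uint32(x>>0), shift)
	//	table[prevHash&tableMask] = uint16(s - 1)
	//	currHash := hash(uint32(x>>8), shift)
	//	candidate = int(table[currHash&tableMask])
	//	table[currHash&tableMask] = uint16(s)
	//	if uint32(x>>8) != load32(src, candidate) {
	//		nextHash = hash(uint32(x>>16), shift)
	//		s++
	//		break
	//	}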
emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX

	// d += emitLiteral(dst[d:], src[nextEmit:])
	MOVQ $0, 8(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
	MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
	// Spill local variables (registers) onto the stack; call; unspill.
	CALL ·emitLiteral(SB)

	// Finish the "d +=" part of "d += emitLiteral(etc)".

encodeBlockEnd:
	MOVQ dst_base+0(FP), AX