// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.
//
// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".
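//
// For reference, the load32 helper quoted in the Go comments below is, per
// encode_other.go, an unaligned little-endian load; a rough sketch (the real
// version also helps the compiler eliminate bounds checks):
//
//	func load32(b []byte, i int) uint32 {
//		b = b[i : i+4]
//		return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
//			uint32(b[3])<<24
//	}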

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	CALL runtime·memmove(SB)
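
// For orientation, a sketch of the pure Go emitLiteral that this assembly
// mirrors (as in encode_other.go; tagLiteral is the snappy format's 0x00
// element tag):
//
//	func emitLiteral(dst, lit []byte) int {
//		i, n := 0, uint(len(lit)-1)
//		switch {
//		case n < 60:
//			dst[0] = uint8(n)<<2 | tagLiteral
//			i = 1
//		case n < 1<<8:
//			dst[0] = 60<<2 | tagLiteral
//			dst[1] = uint8(n)
//			i = 2
//		default:
//			dst[0] = 61<<2 | tagLiteral
//			dst[1] = uint8(n)
//			dst[2] = uint8(n >> 8)
//			i = 3
//		}
//		return i + copy(dst[i:], lit)
//	}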

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX
loop0:
	// for length >= 68 { etc }

	// Emit a length 64 copy, encoded as 3 bytes.

step1:
	// if length > 64 { etc }

	// Emit a length 60 copy, encoded as 3 bytes.

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }

	// Emit the remaining copy, encoded as 2 bytes.

	// Return the number of bytes written.

step3:
	// Emit the remaining copy, encoded as 3 bytes.

	// Return the number of bytes written.
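
// A sketch of the pure Go emitCopy that this assembly mirrors (as in
// encode_other.go; tagCopy1 and tagCopy2 are the snappy format's 0x01 and
// 0x02 element tags):
//
//	func emitCopy(dst []byte, offset, length int) int {
//		i := 0
//		for length >= 68 {
//			// Emit a length 64 copy, encoded as 3 bytes.
//			dst[i+0] = 63<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			i += 3
//			length -= 64
//		}
//		if length > 64 {
//			// Emit a length 60 copy, encoded as 3 bytes.
//			dst[i+0] = 59<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			i += 3
//			length -= 60
//		}
//		if length >= 12 || offset >= 2048 {
//			// Emit the remaining copy, encoded as 3 bytes.
//			dst[i+0] = uint8(length-1)<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			return i + 3
//		}
//		// Emit the remaining copy, encoded as 2 bytes.
//		dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//		dst[i+1] = uint8(offset)
//		return i + 2
//	}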

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	// If those 8 bytes were not equal, XOR the two 8-byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
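	//
	// In Go terms, this step computes roughly the following (a sketch, with
	// binary.LittleEndian.Uint64 and bits.TrailingZeros64, from
	// encoding/binary and math/bits, standing in for the unaligned 8-byte
	// loads and the BSF instruction):
	//
	//	x := binary.LittleEndian.Uint64(src[i:]) ^ binary.LittleEndian.Uint64(src[j:])
	//	j += bits.TrailingZeros64(x) >> 3
	//	return j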
	// Convert from &src[ret] to ret.

	// In src's tail, compare 1 byte at a time.

	// Convert from &src[ret] to ret.
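
// For orientation, the pure Go extendMatch (as in encode_other.go) is simply:
//
//	func extendMatch(src []byte, i, j int) int {
//		for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//		}
//		return j
//	}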

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls, gives 32768 + 56 + 64 = 32888.
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14
	// shift, tableSize := uint32(32-8), 1<<8

	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
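	//
	// For example, with len(src) == 4096 and maxTableSize == 1<<14 (as in
	// encode.go), the loop stops at tableSize == 4096 and shift == 32-12,
	// so hash values index a 4096-entry table.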
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
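	//
	// For example, at the smallest tableSize of 1<<8 that is 256*2/16 == 32
	// MOVOU writes; only at the maximum tableSize of 1<<14 would all 2048
	// writes be needed.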
	LEAQ table-32768(SP), BX

	// sLimit := len(src) - inputMargin

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	// nextHash := hash(load32(src, s), shift)
	IMULL $0x1e35a7bd, R11
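	//
	// Together with a right shift by the count in CX, this multiply
	// implements the hash from encode.go; a sketch:
	//
	//	func hash(u, shift uint32) uint32 {
	//		return (u * 0x1e35a7bd) >> shift
	//	}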
	// bytesBetweenHashLookups := skip >> 5

	// nextS = s + bytesBetweenHashLookups

	// skip += bytesBetweenHashLookups

	// if nextS > sLimit { goto emitRemainder }
	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15

	// table[nextHash] = uint16(s)
	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	// nextHash = hash(load32(src, nextS), shift)
	IMULL $0x1e35a7bd, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	JLE emitLiteralFastPath
	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])
	JLT inlineEmitLiteralOneByte
	JLT inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	JMP inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	JMP inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
	CALL runtime·memmove(SB)
inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------
emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
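	//
	// A Go-flavored sketch of the fast path (load16 and store16 are
	// hypothetical unaligned 16-byte load/store helpers standing in for
	// MOVOU; only d advances by the true literal length):
	//
	//	store16(dst[d:], load16(lit)) // may write up to 16 bytes
	//	d += len(lit)                 // but only len(lit) of them count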
	// !!! offset := base - candidate

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVQ src_len+32(FP), R14

	// !!! R13 = &src[len(src) - 8]

	// !!! R15 = &src[candidate + 4]
inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	JA inlineExtendMatchCmp1
	JNE inlineExtendMatchBSF
	JMP inlineExtendMatchCmp8
inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8-byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	JMP inlineExtendMatchEnd
inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	JAE inlineExtendMatchEnd
	JNE inlineExtendMatchEnd
	JMP inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------
	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	JLT inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	JMP inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	JLE inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	JGE inlineEmitCopyStep3
	JGE inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	JMP inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------
	// if s >= sLimit { goto emitRemainder }

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)

	// prevHash := hash(uint32(x>>0), shift)
	IMULL $0x1e35a7bd, R11
	// table[prevHash] = uint16(s-1)
	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)

	// currHash := hash(uint32(x>>8), shift)
	IMULL $0x1e35a7bd, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15

	// table[currHash] = uint16(s)
	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	// if uint32(x>>8) == load32(src, candidate) { continue }

	// nextHash = hash(uint32(x>>16), shift)
	IMULL $0x1e35a7bd, R11

	// break out of the inner1 for loop, i.e. continue the outer loop.
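	//
	// For orientation, the Go shape of this whole inner1 step (roughly as in
	// encode_other.go, whose table is indexed with a tableMask of
	// maxTableSize-1):
	//
	//	x := load64(src, s-1)
	//	prevHash := hash(uint32(x>>0), shift)
	//	table[prevHash&tableMask] = uint16(s - 1)
	//	currHash := hash(uint32(x>>8), shift)
	//	candidate = int(table[currHash&tableMask])
	//	table[currHash&tableMask] = uint16(s)
	//	if uint32(x>>8) != load32(src, candidate) {
	//		nextHash = hash(uint32(x>>16), shift)
	//		s++
	//		break
	//	}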
emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX

	// d += emitLiteral(dst[d:], src[nextEmit:])
	MOVQ $0, 8(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
	MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
	// Spill local variables (registers) onto the stack; call; unspill.
	CALL ·emitLiteral(SB)

	// Finish the "d +=" part of "d += emitLiteral(etc)".

encodeBlockEnd:
	MOVQ dst_base+0(FP), AX