vendor/github.com/golang/snappy/decode_amd64.s

   1 // Copyright 2016 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // +build !appengine
   6 // +build gc
   7 // +build !noasm
   8
   9 #include "textflag.h"
  10
  11 // The asm code generally follows the pure Go code in decode_other.go, except
  12 // where marked with a "!!!".
  13
  14 // func decode(dst, src []byte) int
  15 //
  16 // All local variables fit into registers. The non-zero stack size is only to
  17 // spill registers and push args when issuing a CALL. The register allocation:
  18 //      - AX    scratch
  19 //      - BX    scratch
  20 //      - CX    length or x
  21 //      - DX    offset
  22 //      - SI    &src[s]
  23 //      - DI    &dst[d]
  24 //      + R8    dst_base
  25 //      + R9    dst_len
  26 //      + R10   dst_base + dst_len
  27 //      + R11   src_base
  28 //      + R12   src_len
  29 //      + R13   src_base + src_len
  30 //      - R14   used by doCopy
  31 //      - R15   used by doCopy
  32 //
  33 // The registers R8-R13 (marked with a "+") are set at the start of the
  34 // function, and after a CALL returns, and are not otherwise modified.
  35 //
  36 // The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
  37 // The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
  38 TEXT ·decode(SB), NOSPLIT, $48-56
  39         // Initialize SI, DI and R8-R13.
  40         MOVQ dst_base+0(FP), R8
  41         MOVQ dst_len+8(FP), R9
  42         MOVQ R8, DI
  43         MOVQ R8, R10
  44         ADDQ R9, R10
  45         MOVQ src_base+24(FP), R11
  46         MOVQ src_len+32(FP), R12
  47         MOVQ R11, SI
  48         MOVQ R11, R13
  49         ADDQ R12, R13
  50
  51 loop:
  52         // for s < len(src)
  53         CMPQ SI, R13
  54         JEQ  end
  55
  56         // CX = uint32(src[s])
  57         //
  58         // switch src[s] & 0x03
  59         MOVBLZX (SI), CX
  60         MOVL    CX, BX
  61         ANDL    $3, BX
  62         CMPL    BX, $1
  63         JAE     tagCopy
  64
  65         // ----------------------------------------
  66         // The code below handles literal tags.
  67
  68         // case tagLiteral:
  69         // x := uint32(src[s] >> 2)
  70         // switch
  71         SHRL $2, CX
  72         CMPL CX, $60
  73         JAE  tagLit60Plus
  74
  75         // case x < 60:
  76         // s++
  77         INCQ SI
  78
  79 doLit:
  80         // This is the end of the inner "switch", when we have a literal tag.
  81         //
  82         // We assume that CX == x and x fits in a uint32, where x is the variable
  83         // used in the pure Go decode_other.go code.
  84
  85         // length = int(x) + 1
  86         //
  87         // Unlike the pure Go code, we don't need to check if length <= 0 because
  88         // CX can hold 64 bits, so the increment cannot overflow.
  89         INCQ CX
  90
  91         // Prepare to check if copying length bytes will run past the end of dst or
  92         // src.
  93         //
  94         // AX = len(dst) - d
  95         // BX = len(src) - s
  96         MOVQ R10, AX
  97         SUBQ DI, AX
  98         MOVQ R13, BX
  99         SUBQ SI, BX
 100
 101         // !!! Try a faster technique for short (16 or fewer bytes) copies.
 102         //
 103         // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
 104         //   goto callMemmove // Fall back on calling runtime·memmove.
 105         // }
 106         //
 107         // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
 108         // against 21 instead of 16, because it cannot assume that all of its input
 109         // is contiguous in memory and so it needs to leave enough source bytes to
 110         // read the next tag without refilling buffers, but Go's Decode assumes
 111         // contiguousness (the src argument is a []byte).
 112         CMPQ CX, $16
 113         JGT  callMemmove
 114         CMPQ AX, $16
 115         JLT  callMemmove
 116         CMPQ BX, $16
 117         JLT  callMemmove
 118
 119         // !!! Implement the copy from src to dst as a 16-byte load and store.
 120         // (Decode's documentation says that dst and src must not overlap.)
 121         //
 122         // This always copies 16 bytes, instead of only length bytes, but that's
 123         // OK. If the input is a valid Snappy encoding then subsequent iterations
 124         // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
 125         // non-nil error), so the overrun will be ignored.
 126         //
 127         // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
 128         // 16-byte loads and stores. This technique probably wouldn't be as
 129         // effective on architectures that are fussier about alignment.
 130         MOVOU 0(SI), X0
 131         MOVOU X0, 0(DI)
 132
 133         // d += length
 134         // s += length
 135         ADDQ CX, DI
 136         ADDQ CX, SI
 137         JMP  loop
 138
 139 callMemmove:
 140         // if length > len(dst)-d || length > len(src)-s { etc }
 141         CMPQ CX, AX
 142         JGT  errCorrupt
 143         CMPQ CX, BX
 144         JGT  errCorrupt
 145
 146         // copy(dst[d:], src[s:s+length])
 147         //
 148         // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
 149         // DI, SI and CX as arguments. Coincidentally, we also need to spill those
 150         // three registers to the stack, to save local variables across the CALL.
 151         MOVQ DI, 0(SP)
 152         MOVQ SI, 8(SP)
 153         MOVQ CX, 16(SP)
 154         MOVQ DI, 24(SP)
 155         MOVQ SI, 32(SP)
 156         MOVQ CX, 40(SP)
 157         CALL runtime·memmove(SB)
 158
 159         // Restore local variables: unspill registers from the stack and
 160         // re-calculate R8-R13.
 161         MOVQ 24(SP), DI
 162         MOVQ 32(SP), SI
 163         MOVQ 40(SP), CX
 164         MOVQ dst_base+0(FP), R8
 165         MOVQ dst_len+8(FP), R9
 166         MOVQ R8, R10
 167         ADDQ R9, R10
 168         MOVQ src_base+24(FP), R11
 169         MOVQ src_len+32(FP), R12
 170         MOVQ R11, R13
 171         ADDQ R12, R13
 172
 173         // d += length
 174         // s += length
 175         ADDQ CX, DI
 176         ADDQ CX, SI
 177         JMP  loop
 178
 179 tagLit60Plus:
 180         // !!! This fragment does the
 181         //
 182         // s += x - 58; if uint(s) > uint(len(src)) { etc }
 183         //
 184         // checks. In the asm version, we code it once instead of once per switch case.
 185         ADDQ CX, SI
 186         SUBQ $58, SI
 187         MOVQ SI, BX
 188         SUBQ R11, BX
 189         CMPQ BX, R12
 190         JA   errCorrupt
 191
 192         // case x == 60:
 193         CMPL CX, $61
 194         JEQ  tagLit61
 195         JA   tagLit62Plus
 196
 197         // x = uint32(src[s-1])
 198         MOVBLZX -1(SI), CX
 199         JMP     doLit
 200
 201 tagLit61:
 202         // case x == 61:
 203         // x = uint32(src[s-2]) | uint32(src[s-1])<<8
 204         MOVWLZX -2(SI), CX
 205         JMP     doLit
 206
 207 tagLit62Plus:
 208         CMPL CX, $62
 209         JA   tagLit63
 210
 211         // case x == 62:
 212         // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
 213         MOVWLZX -3(SI), CX
 214         MOVBLZX -1(SI), BX
 215         SHLL    $16, BX
 216         ORL     BX, CX
 217         JMP     doLit
 218
 219 tagLit63:
 220         // case x == 63:
 221         // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
 222         MOVL -4(SI), CX
 223         JMP  doLit
 224
 225 // The code above handles literal tags.
 226 // ----------------------------------------
 227 // The code below handles copy tags.
 228
 229 tagCopy4:
 230         // case tagCopy4:
 231         // s += 5
 232         ADDQ $5, SI
 233
 234         // if uint(s) > uint(len(src)) { etc }
 235         MOVQ SI, BX
 236         SUBQ R11, BX
 237         CMPQ BX, R12
 238         JA   errCorrupt
 239
 240         // length = 1 + int(src[s-5])>>2
 241         SHRQ $2, CX
 242         INCQ CX
 243
 244         // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
 245         MOVLQZX -4(SI), DX
 246         JMP     doCopy
 247
 248 tagCopy2:
 249         // case tagCopy2:
 250         // s += 3
 251         ADDQ $3, SI
 252
 253         // if uint(s) > uint(len(src)) { etc }
 254         MOVQ SI, BX
 255         SUBQ R11, BX
 256         CMPQ BX, R12
 257         JA   errCorrupt
 258
 259         // length = 1 + int(src[s-3])>>2
 260         SHRQ $2, CX
 261         INCQ CX
 262
 263         // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
 264         MOVWQZX -2(SI), DX
 265         JMP     doCopy
 266
 267 tagCopy:
 268         // We have a copy tag. We assume that:
 269         //      - BX == src[s] & 0x03
 270         //      - CX == src[s]
 271         CMPQ BX, $2
 272         JEQ  tagCopy2
 273         JA   tagCopy4
 274
 275         // case tagCopy1:
 276         // s += 2
 277         ADDQ $2, SI
 278
 279         // if uint(s) > uint(len(src)) { etc }
 280         MOVQ SI, BX
 281         SUBQ R11, BX
 282         CMPQ BX, R12
 283         JA   errCorrupt
 284
 285         // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
 286         MOVQ    CX, DX
 287         ANDQ    $0xe0, DX
 288         SHLQ    $3, DX
 289         MOVBQZX -1(SI), BX
 290         ORQ     BX, DX
 291
 292         // length = 4 + int(src[s-2])>>2&0x7
 293         SHRQ $2, CX
 294         ANDQ $7, CX
 295         ADDQ $4, CX
 296
 297 doCopy:
 298         // This is the end of the outer "switch", when we have a copy tag.
 299         //
 300         // We assume that:
 301         //      - CX == length && CX > 0
 302         //      - DX == offset
 303
 304         // if offset <= 0 { etc }
 305         CMPQ DX, $0
 306         JLE  errCorrupt
 307
 308         // if d < offset { etc }
 309         MOVQ DI, BX
 310         SUBQ R8, BX
 311         CMPQ BX, DX
 312         JLT  errCorrupt
 313
 314         // if length > len(dst)-d { etc }
 315         MOVQ R10, BX
 316         SUBQ DI, BX
 317         CMPQ CX, BX
 318         JGT  errCorrupt
 319
 320         // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
 321         //
 322         // Set:
 323         //      - R14 = len(dst)-d
 324         //      - R15 = &dst[d-offset]
 325         MOVQ R10, R14
 326         SUBQ DI, R14
 327         MOVQ DI, R15
 328         SUBQ DX, R15
 329
 330         // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
 331         //
 332         // First, try using two 8-byte load/stores, similar to the doLit technique
 333         // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
 334         // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
 335         // and not one 16-byte load/store, and the first store has to be before the
 336         // second load, due to the overlap if offset is in the range [8, 16).
 337         //
 338         // if length > 16 || offset < 8 || len(dst)-d < 16 {
 339         //   goto slowForwardCopy
 340         // }
 341         // copy 16 bytes
 342         // d += length
 343         CMPQ CX, $16
 344         JGT  slowForwardCopy
 345         CMPQ DX, $8
 346         JLT  slowForwardCopy
 347         CMPQ R14, $16
 348         JLT  slowForwardCopy
 349         MOVQ 0(R15), AX
 350         MOVQ AX, 0(DI)
 351         MOVQ 8(R15), BX
 352         MOVQ BX, 8(DI)
 353         ADDQ CX, DI
 354         JMP  loop
 355
 356 slowForwardCopy:
 357         // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
 358         // can still try 8-byte load stores, provided we can overrun up to 10 extra
 359         // bytes. As above, the overrun will be fixed up by subsequent iterations
 360         // of the outermost loop.
 361         //
 362         // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
 363         // commentary says:
 364         //
 365         // ----
 366         //
 367         // The main part of this loop is a simple copy of eight bytes at a time
 368         // until we've copied (at least) the requested amount of bytes.  However,
 369         // if d and d-offset are less than eight bytes apart (indicating a
 370         // repeating pattern of length < 8), we first need to expand the pattern in
 371         // order to get the correct results. For instance, if the buffer looks like
 372         // this, with the eight-byte <d-offset> and <d> patterns marked as
 373         // intervals:
 374         //
 375         //    abxxxxxxxxxxxx
 376         //    [------]           d-offset
 377         //      [------]         d
 378         //
 379         // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
 380         // once, after which we can move <d> two bytes without moving <d-offset>:
 381         //
 382         //    ababxxxxxxxxxx
 383         //    [------]           d-offset
 384         //        [------]       d
 385         //
 386         // and repeat the exercise until the two no longer overlap.
 387         //
 388         // This allows us to do very well in the special case of one single byte
 389         // repeated many times, without taking a big hit for more general cases.
 390         //
 391         // The worst case of extra writing past the end of the match occurs when
 392         // offset == 1 and length == 1; the last copy will read from byte positions
 393         // [0..7] and write to [4..11], whereas it was only supposed to write to
 394         // position 1. Thus, ten excess bytes.
 395         //
 396         // ----
 397         //
 398         // That "10 byte overrun" worst case is confirmed by Go's
 399         // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
 400         // and finishSlowForwardCopy algorithm.
 401         //
 402         // if length > len(dst)-d-10 {
 403         //   goto verySlowForwardCopy
 404         // }
 405         SUBQ $10, R14
 406         CMPQ CX, R14
 407         JGT  verySlowForwardCopy
 408
 409 makeOffsetAtLeast8:
 410         // !!! As above, expand the pattern so that offset >= 8 and we can use
 411         // 8-byte load/stores.
 412         //
 413         // for offset < 8 {
 414         //   copy 8 bytes from dst[d-offset:] to dst[d:]
 415         //   length -= offset
 416         //   d      += offset
 417         //   offset += offset
 418         //   // The two previous lines together means that d-offset, and therefore
 419         //   // R15, is unchanged.
 420         // }
 421         CMPQ DX, $8
 422         JGE  fixUpSlowForwardCopy
 423         MOVQ (R15), BX
 424         MOVQ BX, (DI)
 425         SUBQ DX, CX
 426         ADDQ DX, DI
 427         ADDQ DX, DX
 428         JMP  makeOffsetAtLeast8
 429
 430 fixUpSlowForwardCopy:
 431         // !!! Add length (which might be negative now) to d (implied by DI being
 432         // &dst[d]) so that d ends up at the right place when we jump back to the
 433         // top of the loop. Before we do that, though, we save DI to AX so that, if
 434         // length is positive, copying the remaining length bytes will write to the
 435         // right place.
 436         MOVQ DI, AX
 437         ADDQ CX, DI
 438
 439 finishSlowForwardCopy:
 440         // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
 441         // length means that we overrun, but as above, that will be fixed up by
 442         // subsequent iterations of the outermost loop.
 443         CMPQ CX, $0
 444         JLE  loop
 445         MOVQ (R15), BX
 446         MOVQ BX, (AX)
 447         ADDQ $8, R15
 448         ADDQ $8, AX
 449         SUBQ $8, CX
 450         JMP  finishSlowForwardCopy
 451
 452 verySlowForwardCopy:
 453         // verySlowForwardCopy is a simple implementation of forward copy. In C
 454         // parlance, this is a do/while loop instead of a while loop, since we know
 455         // that length > 0. In Go syntax:
 456         //
 457         // for {
 458         //   dst[d] = dst[d - offset]
 459         //   d++
 460         //   length--
 461         //   if length == 0 {
 462         //     break
 463         //   }
 464         // }
 465         MOVB (R15), BX
 466         MOVB BX, (DI)
 467         INCQ R15
 468         INCQ DI
 469         DECQ CX
 470         JNZ  verySlowForwardCopy
 471         JMP  loop
 472
 473 // The code above handles copy tags.
 474 // ----------------------------------------
 475
 476 end:
 477         // This is the end of the "for s < len(src)".
 478         //
 479         // if d != len(dst) { etc }
 480         CMPQ DI, R10
 481         JNE  errCorrupt
 482
 483         // return 0
 484         MOVQ $0, ret+48(FP)
 485         RET
 486
 487 errCorrupt:
 488         // return decodeErrCodeCorrupt
 489         MOVQ $1, ret+48(FP)
 490         RET