1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
11 // The asm code generally follows the pure Go code in decode_other.go, except
12 // where marked with a "!!!".
14 // func decode(dst, src []byte) int
16 // All local variables fit into registers. The non-zero stack size is only to
17 // spill registers and push args when issuing a CALL. The register allocation:
26 // + R10 dst_base + dst_len
29 // + R13 src_base + src_len
30 // - R14 used by doCopy
31 // - R15 used by doCopy
33 // The registers R8-R13 (marked with a "+") are set at the start of the
34 // function, and after a CALL returns, and are not otherwise modified.
36 // The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI.
37 // The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
38 TEXT ·decode(SB), NOSPLIT, $48-56
39 // Initialize SI, DI and R8-R13.
40 MOVQ dst_base+0(FP), R8
41 MOVQ dst_len+8(FP), R9
45 MOVQ src_base+24(FP), R11
46 MOVQ src_len+32(FP), R12
56 // CX = uint32(src[s])
58 // switch src[s] & 0x03
65 // ----------------------------------------
66 // The code below handles literal tags.
69 // x := uint32(src[s] >> 2)
80 // This is the end of the inner "switch", when we have a literal tag.
82 // We assume that CX == x and x fits in a uint32, where x is the variable
83 // used in the pure Go decode_other.go code.
85 // length = int(x) + 1
87 // Unlike the pure Go code, we don't need to check if length <= 0 because
88 // CX can hold 64 bits, so the increment cannot overflow.
91 // Prepare to check if copying length bytes will run past the end of dst or
101 // !!! Try a faster technique for short (16 or fewer bytes) copies.
103 // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
104 // goto callMemmove // Fall back on calling runtime·memmove.
107 // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
108 // against 21 instead of 16, because it cannot assume that all of its input
109 // is contiguous in memory and so it needs to leave enough source bytes to
110 // read the next tag without refilling buffers, but Go's Decode assumes
111 // contiguousness (the src argument is a []byte).
119 // !!! Implement the copy from src to dst as a 16-byte load and store.
120 // (Decode's documentation says that dst and src must not overlap.)
122 // This always copies 16 bytes, instead of only length bytes, but that's
123 // OK. If the input is a valid Snappy encoding then subsequent iterations
124 // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
125 // non-nil error), so the overrun will be ignored.
127 // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
128 // 16-byte loads and stores. This technique probably wouldn't be as
129 // effective on architectures that are fussier about alignment.
140 // if length > len(dst)-d || length > len(src)-s { etc }
146 // copy(dst[d:], src[s:s+length])
148 // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
149 // DI, SI and CX as arguments. Coincidentally, we also need to spill those
150 // three registers to the stack, to save local variables across the CALL.
157 CALL runtime·memmove(SB)
159 // Restore local variables: unspill registers from the stack and
160 // re-calculate R8-R13.
164 MOVQ dst_base+0(FP), R8
165 MOVQ dst_len+8(FP), R9
168 MOVQ src_base+24(FP), R11
169 MOVQ src_len+32(FP), R12
180 // !!! This fragment does the
182 // s += x - 58; if uint(s) > uint(len(src)) { etc }
184 // checks. In the asm version, we code it once instead of once per switch case.
197 // x = uint32(src[s-1])
203 // x = uint32(src[s-2]) | uint32(src[s-1])<<8
212 // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
221 // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
225 // The code above handles literal tags.
226 // ----------------------------------------
227 // The code below handles copy tags.
234 // if uint(s) > uint(len(src)) { etc }
240 // length = 1 + int(src[s-5])>>2
244 // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
253 // if uint(s) > uint(len(src)) { etc }
259 // length = 1 + int(src[s-3])>>2
263 // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
268 // We have a copy tag. We assume that:
269 // - BX == src[s] & 0x03
279 // if uint(s) > uint(len(src)) { etc }
285 // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
292 // length = 4 + int(src[s-2])>>2&0x7
298 // This is the end of the outer "switch", when we have a copy tag.
301 // - CX == length && CX > 0
304 // if offset <= 0 { etc }
308 // if d < offset { etc }
314 // if length > len(dst)-d { etc }
320 // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
323 // - R14 = len(dst)-d
324 // - R15 = &dst[d-offset]
330 // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
332 // First, try using two 8-byte load/stores, similar to the doLit technique
333 // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
334 // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
335 // and not one 16-byte load/store, and the first store has to be before the
336 // second load, due to the overlap if offset is in the range [8, 16).
338 // if length > 16 || offset < 8 || len(dst)-d < 16 {
339 // goto slowForwardCopy
357 // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
358 // can still try 8-byte load stores, provided we can overrun up to 10 extra
359 // bytes. As above, the overrun will be fixed up by subsequent iterations
360 // of the outermost loop.
362 // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
367 // The main part of this loop is a simple copy of eight bytes at a time
368 // until we've copied (at least) the requested amount of bytes. However,
369 // if d and d-offset are less than eight bytes apart (indicating a
370 // repeating pattern of length < 8), we first need to expand the pattern in
371 // order to get the correct results. For instance, if the buffer looks like
372 // this, with the eight-byte <d-offset> and <d> patterns marked as
379 // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
380 // once, after which we can move <d> two bytes without moving <d-offset>:
386 // and repeat the exercise until the two no longer overlap.
388 // This allows us to do very well in the special case of one single byte
389 // repeated many times, without taking a big hit for more general cases.
391 // The worst case of extra writing past the end of the match occurs when
392 // offset == 1 and length == 1; the last copy will read from byte positions
393 // [0..7] and write to [4..11], whereas it was only supposed to write to
394 // position 1. Thus, ten excess bytes.
398 // That "10 byte overrun" worst case is confirmed by Go's
399 // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
400 // and finishSlowForwardCopy algorithm.
402 // if length > len(dst)-d-10 {
403 // goto verySlowForwardCopy
407 JGT verySlowForwardCopy
410 // !!! As above, expand the pattern so that offset >= 8 and we can use
411 // 8-byte load/stores.
414 // copy 8 bytes from dst[d-offset:] to dst[d:]
418 // // The two previous lines together means that d-offset, and therefore
419 // // R15, is unchanged.
422 JGE fixUpSlowForwardCopy
428 JMP makeOffsetAtLeast8
430 fixUpSlowForwardCopy:
431 // !!! Add length (which might be negative now) to d (implied by DI being
432 // &dst[d]) so that d ends up at the right place when we jump back to the
433 // top of the loop. Before we do that, though, we save DI to AX so that, if
434 // length is positive, copying the remaining length bytes will write to the
439 finishSlowForwardCopy:
440 // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
441 // length means that we overrun, but as above, that will be fixed up by
442 // subsequent iterations of the outermost loop.
450 JMP finishSlowForwardCopy
453 // verySlowForwardCopy is a simple implementation of forward copy. In C
454 // parlance, this is a do/while loop instead of a while loop, since we know
455 // that length > 0. In Go syntax:
458 // dst[d] = dst[d - offset]
470 JNZ verySlowForwardCopy
473 // The code above handles copy tags.
474 // ----------------------------------------
477 // This is the end of the "for s < len(src)".
479 // if d != len(dst) { etc }
488 // return decodeErrCodeCorrupt