Skip to content

Commit

Permalink
Merge pull request #162 from greatroar/arm64-literal-copy
Browse files Browse the repository at this point in the history
internal/lz4block: Short literal copying in arm64 decoder
  • Loading branch information
pierrec committed Feb 5, 2022
2 parents 6bd757c + e99166d commit 5e2de87
Showing 1 changed file with 42 additions and 30 deletions.
72 changes: 42 additions & 30 deletions internal/lz4block/decode_arm64.s
Expand Up @@ -8,23 +8,25 @@
#include "textflag.h"

// Register allocation.
#define dst R0
#define dstorig R1
#define src R2
#define dstend R3
#define srcend R4
#define match R5 // Match address.
#define dict R6
#define dictlen R7
#define dictend R8
#define token R9
#define len R10 // Literal and match lengths.
#define lenRem R11
#define offset R12 // Match offset.
#define tmp1 R13
#define tmp2 R14
#define tmp3 R15
#define tmp4 R16
#define dst R0
#define dstorig R1
#define src R2
#define dstend R3
#define dstend16 R4 // dstend - 16
#define srcend R5
#define srcend16 R6 // srcend - 16
#define match R7 // Match address.
#define dict R8
#define dictlen R9
#define dictend R10
#define token R11
#define len R12 // Literal and match lengths.
#define lenRem R13
#define offset R14 // Match offset.
#define tmp1 R15
#define tmp2 R16
#define tmp3 R17
#define tmp4 R19

// func decodeBlock(dst, src, dict []byte) int
TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80
Expand All @@ -36,6 +38,12 @@ TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80
CBZ srcend, shortSrc
ADD src, srcend

// dstend16 = max(dstend-16, 0) and similarly for srcend16.
SUBS $16, dstend, dstend16
CSEL LO, ZR, dstend16, dstend16
SUBS $16, srcend, srcend16
CSEL LO, ZR, srcend16, srcend16

LDP dict_base+48(FP), (dict, dictlen)
ADD dict, dictlen, dictend

Expand Down Expand Up @@ -71,27 +79,31 @@ readLitlenDone:
// Copy literal.
SUBS $16, len
BLO copyLiteralShort
AND $15, len, lenRem

copyLiteralLoop:
SUBS $16, len
LDP.P 16(src), (tmp1, tmp2)
STP.P (tmp1, tmp2), 16(dst)
SUBS $16, len
BPL copyLiteralLoop

// lenRem = len%16 is the remaining number of bytes we need to copy.
// Since len was >= 16, we can do this in one load and one store,
// overlapping with the last load and store, without worrying about
// writing out of bounds.
ADD lenRem, src
ADD lenRem, dst
LDP -16(src), (tmp1, tmp2)
STP (tmp1, tmp2), -16(dst)
// Copy (final part of) literal of length 0-15.
// If we have >=16 bytes left in src and dst, just copy 16 bytes.
copyLiteralShort:
CMP dstend16, dst
CCMP LO, src, srcend16, $0b0010 // 0010 = preserve carry (LO).
BHS copyLiteralShortEnd

AND $15, len

LDP (src), (tmp1, tmp2)
ADD len, src
STP (tmp1, tmp2), (dst)
ADD len, dst

B copyLiteralDone

// Copy literal of length 0-15.
copyLiteralShort:
// Safe but slow copy near the end of src, dst.
copyLiteralShortEnd:
TBZ $3, len, 3(PC)
MOVD.P 8(src), tmp1
MOVD.P tmp1, 8(dst)
Expand Down Expand Up @@ -159,7 +171,7 @@ copyDict:
CCMP NE, dictend, match, $0b0100 // 0100 sets the Z (EQ) flag.
BNE copyDict

CBZ len, copyMatchDone
CBZ len, copyMatchDone

// If the match extends beyond the dictionary, the rest is at dstorig.
MOVD dstorig, match
Expand Down

0 comments on commit 5e2de87

Please sign in to comment.