Skip to content

Commit

Permalink
internal/lz4block: Short literal copying in arm64 decoder
Browse files Browse the repository at this point in the history
This borrows a trick from the amd64 decoder: copy short literals by
always copying 16 bytes, using the space beyond the literal's length as
scratch space (but only when we're >=16 bytes from the end of src, dst).

Benchmark results on RPi4B:

name                old speed      new speed      delta
UncompressPg1661-4   101MB/s ± 1%   138MB/s ± 0%  +36.46%  (p=0.000 n=10+9)
UncompressDigits-4   453MB/s ± 0%   525MB/s ± 0%  +15.81%  (p=0.000 n=10+10)
UncompressTwain-4    110MB/s ± 0%   156MB/s ± 0%  +41.42%  (p=0.000 n=10+10)
UncompressRand-4    1.14GB/s ± 1%  1.13GB/s ± 1%     ~     (p=0.075 n=10+10)

Also moved a SUBS to just before the associated branch to allow
instruction fusion on ARMs that do that.
  • Loading branch information
greatroar committed Feb 4, 2022
1 parent 257c664 commit e99166d
Showing 1 changed file with 42 additions and 30 deletions.
72 changes: 42 additions & 30 deletions internal/lz4block/decode_arm64.s
Expand Up @@ -8,23 +8,25 @@
#include "textflag.h"

// Register allocation.
#define dst R0
#define dstorig R1
#define src R2
#define dstend R3
#define srcend R4
#define match R5 // Match address.
#define dict R6
#define dictlen R7
#define dictend R8
#define token R9
#define len R10 // Literal and match lengths.
#define lenRem R11
#define offset R12 // Match offset.
#define tmp1 R13
#define tmp2 R14
#define tmp3 R15
#define tmp4 R16
#define dst R0
#define dstorig R1
#define src R2
#define dstend R3
#define dstend16 R4 // dstend - 16
#define srcend R5
#define srcend16 R6 // srcend - 16
#define match R7 // Match address.
#define dict R8
#define dictlen R9
#define dictend R10
#define token R11
#define len R12 // Literal and match lengths.
#define lenRem R13
#define offset R14 // Match offset.
#define tmp1 R15
#define tmp2 R16
#define tmp3 R17
#define tmp4 R19

// func decodeBlock(dst, src, dict []byte) int
TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80
Expand All @@ -36,6 +38,12 @@ TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80
CBZ srcend, shortSrc
ADD src, srcend

// dstend16 = max(dstend-16, 0) and similarly for srcend16.
SUBS $16, dstend, dstend16
CSEL LO, ZR, dstend16, dstend16
SUBS $16, srcend, srcend16
CSEL LO, ZR, srcend16, srcend16

LDP dict_base+48(FP), (dict, dictlen)
ADD dict, dictlen, dictend

Expand Down Expand Up @@ -71,27 +79,31 @@ readLitlenDone:
// Copy literal.
SUBS $16, len
BLO copyLiteralShort
AND $15, len, lenRem

copyLiteralLoop:
SUBS $16, len
LDP.P 16(src), (tmp1, tmp2)
STP.P (tmp1, tmp2), 16(dst)
SUBS $16, len
BPL copyLiteralLoop

// lenRem = len%16 is the remaining number of bytes we need to copy.
// Since len was >= 16, we can do this in one load and one store,
// overlapping with the last load and store, without worrying about
// writing out of bounds.
ADD lenRem, src
ADD lenRem, dst
LDP -16(src), (tmp1, tmp2)
STP (tmp1, tmp2), -16(dst)
// Copy (final part of) literal of length 0-15.
// If we have >=16 bytes left in src and dst, just copy 16 bytes.
copyLiteralShort:
CMP dstend16, dst
CCMP LO, src, srcend16, $0b0010 // 0010 = preserve carry (LO).
BHS copyLiteralShortEnd

AND $15, len

LDP (src), (tmp1, tmp2)
ADD len, src
STP (tmp1, tmp2), (dst)
ADD len, dst

B copyLiteralDone

// Copy literal of length 0-15.
copyLiteralShort:
// Safe but slow copy near the end of src, dst.
copyLiteralShortEnd:
TBZ $3, len, 3(PC)
MOVD.P 8(src), tmp1
MOVD.P tmp1, 8(dst)
Expand Down Expand Up @@ -159,7 +171,7 @@ copyDict:
CCMP NE, dictend, match, $0b0100 // 0100 sets the Z (EQ) flag.
BNE copyDict

CBZ len, copyMatchDone
CBZ len, copyMatchDone

// If the match extends beyond the dictionary, the rest is at dstorig.
MOVD dstorig, match
Expand Down

0 comments on commit e99166d

Please sign in to comment.