Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

s2: Improve amd64 decompression speed. #436

Merged
merged 2 commits into from Sep 6, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
37 changes: 17 additions & 20 deletions s2/decode_amd64.s
Expand Up @@ -227,11 +227,11 @@ tagLit62Plus:

// case x == 62:
// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
MOVWLZX -3(R_SRC), R_LEN
MOVBLZX -1(R_SRC), R_TMP1
SHLL $16, R_TMP1
ORL R_TMP1, R_LEN
JMP doLit
// Load 4 bytes starting one byte further back; that extra byte is just the tag, so the read is safe and it is shifted out below.
// x = binary.LittleEndian.Uint32(src[s-4:]) >> 8
MOVL -4(R_SRC), R_LEN
SHRL $8, R_LEN
JMP doLit

tagLit63:
// case x == 63:
Expand Down Expand Up @@ -294,20 +294,18 @@ tagCopy:
JA errCorrupt

// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
// length = 4 + int(src[s-2])>>2&0x7
MOVBQZX -1(R_SRC), R_TMP1
MOVQ R_LEN, R_TMP0
SHRQ $2, R_LEN
ANDQ $0xe0, R_TMP0
ANDQ $7, R_LEN
SHLQ $3, R_TMP0
MOVBQZX -1(R_SRC), R_TMP1
ADDQ $4, R_LEN
ORQ R_TMP1, R_TMP0

// length = 4 + int(src[s-2])>>2&0x7
SHRQ $2, R_LEN
ANDQ $7, R_LEN
ADDQ $4, R_LEN

// check if repeat code
CMPQ R_TMP0, $0
JE repeatCode
// check if repeat code, ZF set by ORQ.
JZ repeatCode

// This is a regular copy, transfer our temporary value to R_OFF (offset)
MOVQ R_TMP0, R_OFF
Expand Down Expand Up @@ -363,12 +361,11 @@ repeatLen3:
JA errCorrupt

// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
MOVBLZX -1(R_SRC), R_TMP0
MOVWLZX -3(R_SRC), R_LEN
SHLL $16, R_TMP0
ORL R_TMP0, R_LEN
ADDL $65540, R_LEN
JMP doCopyRepeat
// Read one byte further back (just part of the tag, shifted out)
MOVL -4(R_SRC), R_LEN
SHRL $8, R_LEN
ADDL $65540, R_LEN
JMP doCopyRepeat

doCopy:
// This is the end of the outer "switch", when we have a copy tag.
Expand Down