From fce0a6d88340b414103ab41030b7eda5a717238f Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Wed, 13 Jul 2022 14:54:23 +0200 Subject: [PATCH] s2c: Convert directly to target types --- s2/cmd/s2c/main.go | 14 +++++++------- s2/cmd/s2d/main.go | 21 ++++++++++++--------- s2/encode_better.go | 32 +++++++++++++++++++++++++++----- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/s2/cmd/s2c/main.go b/s2/cmd/s2c/main.go index 4292d62783..9e913562ac 100644 --- a/s2/cmd/s2c/main.go +++ b/s2/cmd/s2c/main.go @@ -91,7 +91,7 @@ Options:`) flag.PrintDefaults() os.Exit(0) } - opts := []s2.WriterOption{s2.WriterBlockSize(int(sz)), s2.WriterConcurrency(*cpu), s2.WriterPadding(int(pad))} + opts := []s2.WriterOption{s2.WriterBlockSize(sz), s2.WriterConcurrency(*cpu), s2.WriterPadding(pad)} if *index { opts = append(opts, s2.WriterAddIndex()) } @@ -123,7 +123,7 @@ Options:`) dstFile, err := os.OpenFile(*out, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.ModePerm) exitErr(err) defer dstFile.Close() - bw := bufio.NewWriterSize(dstFile, int(sz)*2) + bw := bufio.NewWriterSize(dstFile, sz*2) defer bw.Flush() wr.Reset(bw) } @@ -612,7 +612,7 @@ func exitErr(err error) { } // toSize converts a size indication to bytes. -func toSize(size string) (uint64, error) { +func toSize(size string) (int, error) { size = strings.ToUpper(strings.TrimSpace(size)) firstLetter := strings.IndexFunc(size, unicode.IsLetter) if firstLetter == -1 { @@ -620,18 +620,18 @@ func toSize(size string) (uint64, error) { } bytesString, multiple := size[:firstLetter], size[firstLetter:] - bytes, err := strconv.ParseUint(bytesString, 10, 64) + sz, err := strconv.Atoi(bytesString) if err != nil { return 0, fmt.Errorf("unable to parse size: %v", err) } switch multiple { case "M", "MB", "MIB": - return bytes * 1 << 20, nil + return sz * 1 << 20, nil case "K", "KB", "KIB": - return bytes * 1 << 10, nil + return sz * 1 << 10, nil case "B", "": - return bytes, nil + return sz, nil default: return 0, fmt.Errorf("unknown size suffix: %v", multiple) } diff --git a/s2/cmd/s2d/main.go b/s2/cmd/s2d/main.go index 5fc0abe692..8f7030183d 100644 --- a/s2/cmd/s2d/main.go +++ b/s2/cmd/s2d/main.go @@ -292,9 +292,9 @@ Options:`) rs, err := r.ReadSeeker(tailBytes > 0, nil) exitErr(err) if tailBytes > 0 { - _, err = rs.Seek(-int64(tailBytes), io.SeekEnd) + _, err = rs.Seek(-tailBytes, io.SeekEnd) } else { - _, err = rs.Seek(int64(offset), io.SeekStart) + _, err = rs.Seek(offset, io.SeekStart) } exitErr(err) } @@ -408,7 +408,7 @@ func (w *rCountSeeker) BytesRead() int64 { } // toSize converts a size indication to bytes. -func toSize(size string) (uint64, error) { +func toSize(size string) (int64, error) { if len(size) == 0 { return 0, nil } @@ -419,22 +419,25 @@ func toSize(size string) (uint64, error) { } bytesString, multiple := size[:firstLetter], size[firstLetter:] - bytes, err := strconv.ParseUint(bytesString, 10, 64) + sz, err := strconv.ParseInt(bytesString, 10, 64) if err != nil { return 0, fmt.Errorf("unable to parse size: %v", err) } + if sz < 0 { + return 0, errors.New("negative size given") + } switch multiple { case "T", "TB", "TIB": - return bytes * 1 << 40, nil + return sz * 1 << 40, nil case "G", "GB", "GIB": - return bytes * 1 << 30, nil + return sz * 1 << 30, nil case "M", "MB", "MIB": - return bytes * 1 << 20, nil + return sz * 1 << 20, nil case "K", "KB", "KIB": - return bytes * 1 << 10, nil + return sz * 1 << 10, nil case "B", "": - return bytes, nil + return sz, nil default: return 0, fmt.Errorf("unknown size suffix: %v", multiple) } diff --git a/s2/encode_better.go b/s2/encode_better.go index 943215b8ae..3c3ca1c7aa 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -97,9 +97,18 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { lTable[hashL] = uint32(s) sTable[hashS] = uint32(s) + if uint32(cv) == load32(src, candidateL) { + break + } + // Check repeat at offset checkRep. const checkRep = 1 - if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + // Minimum length of a repeat. Tested with various values. + // While 4-5 offers improvements in some, 6 reduces + // regressions significantly. + const wantRepeatBytes = 5 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if true && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { base := s + checkRep // Extend back for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { @@ -138,15 +147,28 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { if s >= sLimit { goto emitRemainder } + { + // Index match start+1 (long) and start+2 (short) + index0 := base + 1 + // Index match end-2 (long) and end-1 (short) + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + cv = load64(src, s) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) + lTable[hash7(cv1, lTableBits)] = uint32(index1) + lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + } cv = load64(src, s) continue } - if uint32(cv) == load32(src, candidateL) { - break - } - // Check our short candidate if uint32(cv) == load32(src, candidateS) { // Try a long candidate at s+1