Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use hardware accelerated Adler32 impl #551

Merged
merged 1 commit into from Nov 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
285 changes: 285 additions & 0 deletions src/SharpCompress/Algorithms/Alder32.cs
@@ -0,0 +1,285 @@
// Copyright (c) Six Labors and contributors.
// Licensed under the GNU Affero General Public License, Version 3.

using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if NETCOREAPP3_1
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif

namespace SharpCompress.Algorithms
{
/// <summary>
/// Calculates the 32 bit Adler checksum of a given buffer according to
/// RFC 1950 (ZLIB Compressed Data Format Specification version 3.3).
/// </summary>
internal static class Adler32
{
    /// <summary>
    /// The default initial seed value of an Adler32 checksum calculation.
    /// </summary>
    public const uint SeedValue = 1U;

#if NETCOREAPP3_1
    // Buffers shorter than this many bytes are always routed to the scalar implementation.
    private const int MinBufferSize = 64;
#endif

    // Largest prime smaller than 65536
    private const uint BASE = 65521;

    // NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    private const uint NMAX = 5552;

    /// <summary>
    /// Calculates the Adler32 checksum with the bytes taken from the span,
    /// starting from <see cref="SeedValue"/>.
    /// </summary>
    /// <param name="buffer">The readonly span of bytes.</param>
    /// <returns>The <see cref="uint"/> checksum.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static uint Calculate(ReadOnlySpan<byte> buffer)
    {
        return Calculate(SeedValue, buffer);
    }

    /// <summary>
    /// Calculates the Adler32 checksum with the bytes taken from the span and seed.
    /// </summary>
    /// <param name="adler">
    /// The input Adler32 value; either <see cref="SeedValue"/> or the result of a
    /// previous call when the checksum is computed incrementally.
    /// </param>
    /// <param name="buffer">The readonly span of bytes.</param>
    /// <returns>
    /// The updated checksum. When <paramref name="buffer"/> is empty the value of
    /// <paramref name="adler"/> is returned unchanged.
    /// </returns>
    public static uint Calculate(uint adler, ReadOnlySpan<byte> buffer)
    {
        // No bytes to fold in: the running checksum must be left untouched.
        // Returning the seed here would silently reset an incremental calculation.
        if (buffer.IsEmpty)
        {
            return adler;
        }

#if NETCOREAPP3_1
        // CalculateSse uses Ssse3.MultiplyAddAdjacent, so SSSE3 (not merely SSE3)
        // has to be available before taking the vectorized path.
        if (Ssse3.IsSupported && buffer.Length >= MinBufferSize)
        {
            return CalculateSse(adler, buffer);
        }

        return CalculateScalar(adler, buffer);
#else
        return CalculateScalar(adler, buffer);
#endif
    }

    // Based on https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
#if NETCOREAPP3_1
    /// <summary>
    /// Vectorized implementation: consumes the input in 32 byte blocks using SSE
    /// instructions and finishes the remaining (fewer than 32) bytes with scalar code.
    /// The caller must pass a non-empty buffer.
    /// </summary>
    /// <param name="adler">The input Adler32 value.</param>
    /// <param name="buffer">The readonly span of bytes.</param>
    /// <returns>The <see cref="uint"/> checksum.</returns>
    private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
    {
        uint s1 = adler & 0xFFFF;
        uint s2 = (adler >> 16) & 0xFFFF;

        // Process the data in blocks.
        const int BLOCK_SIZE = 1 << 5;

        uint length = (uint)buffer.Length;
        uint blocks = length / BLOCK_SIZE;

        // From here on 'length' is the number of trailing bytes not covered by whole blocks.
        length -= blocks * BLOCK_SIZE;

        int index = 0;
        fixed (byte* bufferPtr = &buffer[0])
        {
            // Index of the first byte after the vectorized region; used by the scalar tail below.
            index += (int)blocks * BLOCK_SIZE;
            var localBufferPtr = bufferPtr;

            // _mm_setr_epi8 on x86
            var tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
            var tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
            Vector128<byte> zero = Vector128<byte>.Zero;
            var ones = Vector128.Create((short)1);

            while (blocks > 0)
            {
                uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
                if (n > blocks)
                {
                    n = blocks;
                }

                blocks -= n;

                // Process n blocks of data. At most NMAX data bytes can be
                // processed before s2 must be reduced modulo BASE.
                Vector128<int> v_ps = Vector128.CreateScalar(s1 * n).AsInt32();
                Vector128<int> v_s2 = Vector128.CreateScalar(s2).AsInt32();
                Vector128<int> v_s1 = Vector128<int>.Zero;

                do
                {
                    // Load 32 input bytes.
                    Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                    Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 16);

                    // Add previous block byte sum to v_ps.
                    v_ps = Sse2.Add(v_ps, v_s1);

                    // Horizontally add the bytes for s1, multiply-adds the
                    // bytes by [ 32, 31, 30, ... ] for s2.
                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsInt32());
                    Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones));

                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsInt32());
                    Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones));

                    localBufferPtr += BLOCK_SIZE;
                }
                while (--n > 0);

                // Shift left by 5 == multiply by BLOCK_SIZE (32).
                v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                const byte S2301 = 0b1011_0001; // A B C D -> B A D C
                const byte S1032 = 0b0100_1110; // A B C D -> C D A B

                v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
                v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

                s1 += (uint)v_s1.ToScalar();

                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

                s2 = (uint)v_s2.ToScalar();

                // Reduce.
                s1 %= BASE;
                s2 %= BASE;
            }
        }

        ref byte bufferRef = ref MemoryMarshal.GetReference(buffer);

        // Handle the leftover bytes (fewer than BLOCK_SIZE) one at a time.
        if (length > 0)
        {
            if (length >= 16)
            {
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                length -= 16;
            }

            while (length-- > 0)
            {
                s2 += s1 += Unsafe.Add(ref bufferRef, index++);
            }

            // s1 was below BASE before the tail and grew by at most 31 * 255,
            // so a single conditional subtraction is enough to reduce it.
            if (s1 >= BASE)
            {
                s1 -= BASE;
            }

            s2 %= BASE;
        }

        return s1 | (s2 << 16);
    }
#endif

    /// <summary>
    /// Portable implementation: processes the input in chunks of at most NMAX bytes
    /// (unrolled 16 bytes at a time) and reduces both sums modulo BASE after each chunk.
    /// </summary>
    /// <param name="adler">The input Adler32 value.</param>
    /// <param name="buffer">The readonly span of bytes.</param>
    /// <returns>The <see cref="uint"/> checksum.</returns>
    private static uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer)
    {
        uint s1 = adler & 0xFFFF;
        uint s2 = (adler >> 16) & 0xFFFF;
        uint k;

        ref byte bufferRef = ref MemoryMarshal.GetReference<byte>(buffer);
        uint length = (uint)buffer.Length;
        int index = 0;

        while (length > 0)
        {
            // Take at most NMAX bytes so the 32 bit accumulators cannot overflow
            // before the modulo reduction at the end of this iteration.
            k = length < NMAX ? length : NMAX;
            length -= k;

            while (k >= 16)
            {
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                k -= 16;
            }

            if (k != 0)
            {
                do
                {
                    s1 += Unsafe.Add(ref bufferRef, index++);
                    s2 += s1;
                }
                while (--k != 0);
            }

            s1 %= BASE;
            s2 %= BASE;
        }

        return (s2 << 16) | s1;
    }
}
}
24 changes: 13 additions & 11 deletions src/SharpCompress/Compressors/Deflate/DeflateManager.cs
Expand Up @@ -70,6 +70,8 @@

using System;

using SharpCompress.Algorithms;

namespace SharpCompress.Compressors.Deflate
{
internal sealed partial class DeflateManager
Expand Down Expand Up @@ -1685,7 +1687,7 @@ internal void Reset()
Rfc1950BytesEmitted = false;

status = (WantRfc1950HeaderBytes) ? INIT_STATE : BUSY_STATE;
_codec._Adler32 = Adler.Adler32(0, null, 0, 0);
_codec._adler32 = 1;

last_flush = (int)FlushType.None;

Expand Down Expand Up @@ -1763,7 +1765,7 @@ internal int SetDictionary(byte[] dictionary)
throw new ZlibException("Stream error.");
}

_codec._Adler32 = Adler.Adler32(_codec._Adler32, dictionary, 0, dictionary.Length);
_codec._adler32 = Adler32.Calculate(_codec._adler32, dictionary);

if (length < MIN_MATCH)
{
Expand Down Expand Up @@ -1850,12 +1852,12 @@ internal int Deflate(FlushType flush)
////putShortMSB((int)(SharedUtils.URShift(_codec._Adler32, 16)));
//putShortMSB((int)((UInt64)_codec._Adler32 >> 16));
//putShortMSB((int)(_codec._Adler32 & 0xffff));
pending[pendingCount++] = (byte)((_codec._Adler32 & 0xFF000000) >> 24);
pending[pendingCount++] = (byte)((_codec._Adler32 & 0x00FF0000) >> 16);
pending[pendingCount++] = (byte)((_codec._Adler32 & 0x0000FF00) >> 8);
pending[pendingCount++] = (byte)(_codec._Adler32 & 0x000000FF);
pending[pendingCount++] = (byte)((_codec._adler32 & 0xFF000000) >> 24);
pending[pendingCount++] = (byte)((_codec._adler32 & 0x00FF0000) >> 16);
pending[pendingCount++] = (byte)((_codec._adler32 & 0x0000FF00) >> 8);
pending[pendingCount++] = (byte)(_codec._adler32 & 0x000000FF);
}
_codec._Adler32 = Adler.Adler32(0, null, 0, 0);
_codec._adler32 = 1;
}

// Flush as much pending output as possible
Expand Down Expand Up @@ -1968,10 +1970,10 @@ internal int Deflate(FlushType flush)
}

// Write the zlib trailer (adler32)
pending[pendingCount++] = (byte)((_codec._Adler32 & 0xFF000000) >> 24);
pending[pendingCount++] = (byte)((_codec._Adler32 & 0x00FF0000) >> 16);
pending[pendingCount++] = (byte)((_codec._Adler32 & 0x0000FF00) >> 8);
pending[pendingCount++] = (byte)(_codec._Adler32 & 0x000000FF);
pending[pendingCount++] = (byte)((_codec._adler32 & 0xFF000000) >> 24);
pending[pendingCount++] = (byte)((_codec._adler32 & 0x00FF0000) >> 16);
pending[pendingCount++] = (byte)((_codec._adler32 & 0x0000FF00) >> 8);
pending[pendingCount++] = (byte)(_codec._adler32 & 0x000000FF);

//putShortMSB((int)(SharedUtils.URShift(_codec._Adler32, 16)));
//putShortMSB((int)(_codec._Adler32 & 0xffff));
Expand Down
12 changes: 7 additions & 5 deletions src/SharpCompress/Compressors/Deflate/Inflate.cs
Expand Up @@ -65,6 +65,8 @@

using System;

using SharpCompress.Algorithms;

namespace SharpCompress.Compressors.Deflate
{
internal sealed class InflateBlocks
Expand Down Expand Up @@ -118,7 +120,7 @@ internal uint Reset()

if (checkfn != null)
{
_codec._Adler32 = check = Adler.Adler32(0, null, 0, 0);
_codec._adler32 = check = 1;
}
return oldCheck;
}
Expand Down Expand Up @@ -739,7 +741,7 @@ internal int Flush(int r)
// update check information
if (checkfn != null)
{
_codec._Adler32 = check = Adler.Adler32(check, window, readAt, nBytes);
_codec._adler32 = check = Adler32.Calculate(check, window.AsSpan(readAt, nBytes));
}

// copy as far as end of window
Expand Down Expand Up @@ -1764,7 +1766,7 @@ internal int Inflate(FlushType flush)
_codec.AvailableBytesIn--;
_codec.TotalBytesIn++;
expectedCheck += (uint)(_codec.InputBuffer[_codec.NextIn++] & 0x000000ff);
_codec._Adler32 = expectedCheck;
_codec._adler32 = expectedCheck;
mode = InflateManagerMode.DICT0;
return ZlibConstants.Z_NEED_DICT;

Expand Down Expand Up @@ -1879,12 +1881,12 @@ internal int SetDictionary(byte[] dictionary)
throw new ZlibException("Stream error.");
}

if (Adler.Adler32(1, dictionary, 0, dictionary.Length) != _codec._Adler32)
if (Adler32.Calculate(1, dictionary) != _codec._adler32)
{
return ZlibConstants.Z_DATA_ERROR;
}

_codec._Adler32 = Adler.Adler32(0, null, 0, 0);
_codec._adler32 = 1;

if (length >= (1 << wbits))
{
Expand Down