Skip to content

Commit

Permalink
Merge pull request #627 from Cuda-Chen/optimize-CRC-for-targets-lacki…
Browse files Browse the repository at this point in the history
…ng-of-CRC

Optimize CRC intrinsics for targets lacking the CRC extension
  • Loading branch information
jserv committed Jan 30, 2024
2 parents cfaa59f + 66267b5 commit 4a036e6
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 7 deletions.
47 changes: 41 additions & 6 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -8500,12 +8500,47 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
crc = __crc32cb(crc, v);
#else
crc ^= v;
for (int bit = 0; bit < 8; bit++) {
if (crc & 1)
crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
else
crc = (crc >> 1);
}
#if defined(__ARM_FEATURE_CRYPTO)
// Adapted from: https://mary.rs/lab/crc32/
// Barrett reduction
uint64x2_t orig =
vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
uint64x2_t tmp = orig;

// Polynomial P(x) of CRC32C
uint64_t p = 0x105EC76F1;
// Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor
// 2^{64} / P(x) \rfloor = 0x11f91caf6
uint64_t mu = 0x1dea713f1;

// Multiply by mu_{64}
tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
// Divide by 2^{64} (mask away the unnecessary bits)
tmp =
vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
// Multiply by P(x) (shifted left by 1 for alignment reasons)
tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
// Subtract original from result
tmp = veorq_u64(tmp, orig);

// Extract the 'lower' (in bit-reflected sense) 32 bits
crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
#else // Fall back to the generic table lookup approach
// Adapted from: https://create.stephan-brumme.com/crc32/
// Apply the half-byte comparison algorithm for the best ratio between
// performance and lookup table size.

// The lookup table just needs to store every 16th entry
// of the standard look-up table.
static const uint32_t crc32_half_byte_tbl[] = {
0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3,
0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9,
0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75,
};

crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
#endif
#endif
return crc;
}
Expand Down
2 changes: 1 addition & 1 deletion tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ uint32_t canonical_crc32_u8(uint32_t crc, uint8_t v)
crc ^= v;
for (int bit = 0; bit < 8; bit++) {
if (crc & 1)
crc = (crc >> 1) ^ uint32_t(0x82f63b78);
crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
else
crc = (crc >> 1);
}
Expand Down

0 comments on commit 4a036e6

Please sign in to comment.