Can ends_with be optimized further? #222

Open
nazar-pc opened this issue May 2, 2023 · 0 comments

nazar-pc commented May 2, 2023

I have a simple example that has two implementations of the same operation:

use bitvec::prelude::*;

// Implementation using bitvec's `ends_with`.
pub fn last_five_bits_zero(bytes: &[u8; 32]) -> bool {
    let bits = bytes.view_bits::<Msb0>();
    bits.ends_with(bits![u8, Msb0; 0, 0, 0, 0, 0])
}

// Equivalent hand-written version: mask the low five bits of the last byte.
pub fn last_five_bits_zero(bytes: &[u8; 32]) -> bool {
    let last_byte = bytes[31] & 0b00011111;
    last_byte == 0b00000000
}
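
For reference, the two versions agree on every possible value of the last byte (the other 31 bytes don't affect either check). A quick exhaustive test, with the functions hypothetically renamed so they can coexist in one module:

#[cfg(test)]
mod tests {
    use super::*;

    // Assumes the two implementations above are renamed to
    // `last_five_bits_zero_bitvec` and `last_five_bits_zero_manual`
    // so that both can live in the same crate.
    #[test]
    fn implementations_agree() {
        // Only the last byte matters to either implementation.
        for last in 0u8..=u8::MAX {
            let mut bytes = [0u8; 32];
            bytes[31] = last;
            assert_eq!(
                last_five_bits_zero_bitvec(&bytes),
                last_five_bits_zero_manual(&bytes),
            );
        }
    }
}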

The first uses this library and generates quite lengthy assembly:

.section .text.last_five_bits_zero,"ax",@progbits
	.globl	last_five_bits_zero
	.p2align	4, 0x90
	.type	last_five_bits_zero,@function
last_five_bits_zero:

		// lib.rs : 171
		pub fn last_five_bits_zero(bytes: &[u8; 32]) -> bool {
	.cfi_startproc
	push rbx
	.cfi_def_cfa_offset 16
	sub rsp, 16
	.cfi_def_cfa_offset 32
	.cfi_offset rbx, -16
	mov rbx, rdi

		// lib.rs : 173
		bits.ends_with(bits![u8, Msb0; 0, 0, 0, 0, 0])
	xor edi, edi

	call qword ptr [rip + bitvec::mem::BitElement<u8>::new@GOTPCREL]

		// /home/nazar-pc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/bitvec-1.0.1/src/array.rs : 60
		Self { data, ..Self::ZERO }
	mov byte ptr [rsp + 8], al

		// /rustc/0599b6b931816ab46ab79072189075f543931cbd/library/core/src/ptr/mut_ptr.rs : 479
		unsafe { intrinsics::offset(self, count) as *mut T }
	add rbx, 31

		// /home/nazar-pc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/bitvec-1.0.1/src/slice/specialization/msb0.rs : 112
		.all(|(a, b)| a.load_be::<usize>() == b.load_be::<usize>())
	mov esi, 43
	mov rdi, rbx
	call <bitvec::slice::BitSlice<T,bitvec::order::Msb0> as bitvec::field::BitField>::load_be

	mov rbx, rax

	lea rdi, [rsp + 8]
	mov esi, 40
	call <bitvec::slice::BitSlice<T,bitvec::order::Msb0> as bitvec::field::BitField>::load_be
	cmp rbx, rax
	sete al

		// lib.rs : 174
		}
	add rsp, 16
	.cfi_def_cfa_offset 16
	pop rbx
	.cfi_def_cfa_offset 8
	ret

At the same time, the alternative is much shorter:

last_five_bits_zero:

		// lib.rs : 171
		pub fn last_five_bits_zero(bytes: &[u8; 32]) -> bool {
	.cfi_startproc
		// lib.rs : 173
		last_byte == 0b00000000
	test byte ptr [rdi + 31], 31
	sete al

		// lib.rs : 174
		}
	ret

It feels like there should be a way to optimize this much further, especially considering that here the compiler has all the information about the size of the data and the bits we're checking.
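
For comparison, here is a sketch of an alternative formulation that slices off the last five bits and tests them directly, avoiding the construction of the five-bit needle visible in the assembly above. The function name is mine and I haven't checked its codegen; it assumes bitvec 1.x's not_any() on a subslice:

use bitvec::prelude::*;

// Hypothetical alternative (codegen not checked): test the last five bits
// of the Msb0 view directly instead of comparing against a built needle.
pub fn last_five_bits_zero_slice(bytes: &[u8; 32]) -> bool {
    let bits = bytes.view_bits::<Msb0>();
    bits[bits.len() - 5..].not_any()
}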
