Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BooleanArray::true_count and BooleanArray::false_count #2957

Merged
merged 2 commits into from Oct 28, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
73 changes: 73 additions & 0 deletions arrow-array/src/array/boolean_array.rs
Expand Up @@ -103,6 +103,53 @@ impl BooleanArray {
&self.data.buffers()[0]
}

/// Returns the number of true values within this buffer
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/// Returns the number of true values within this buffer
/// Returns the number of non-null, true values within this array

pub fn true_count(&self) -> usize {
match self.data.null_buffer() {
Some(nulls) => {
let null_chunks = nulls.bit_chunks(self.offset(), self.len());
tustvold marked this conversation as resolved.
Show resolved Hide resolved
let value_chunks = self.values().bit_chunks(self.offset(), self.len());
null_chunks
.iter()
.zip(value_chunks.iter())
.chain(std::iter::once((
null_chunks.remainder_bits(),
value_chunks.remainder_bits(),
)))
.map(|(a, b)| (a & b).count_ones() as usize)
.sum()
}
None => self
.values()
.count_set_bits_offset(self.offset(), self.len()),
}
}

/// Returns the number of false values within this buffer
pub fn false_count(&self) -> usize {
match self.data.null_buffer() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe this could be simplified into self.size() - self.null_count() - self.true_count() ? I think that would be basically as fast?

Some(nulls) => {
let null_chunks = nulls.bit_chunks(self.offset(), self.len());
let value_chunks = self.values().bit_chunks(self.offset(), self.len());
null_chunks
.iter()
.zip(value_chunks.iter())
.chain(std::iter::once((
null_chunks.remainder_bits(),
value_chunks.remainder_bits(),
)))
.map(|(a, b)| (a & !b).count_ones() as usize)
.sum()
}
None => {
let true_count = self
.values()
.count_set_bits_offset(self.offset(), self.len());
self.len() - true_count
}
}
}

/// Returns the boolean value at index `i`.
///
/// # Safety
Expand Down Expand Up @@ -285,6 +332,7 @@ impl<Ptr: std::borrow::Borrow<Option<bool>>> FromIterator<Ptr> for BooleanArray
#[cfg(test)]
mod tests {
use super::*;
use rand::{thread_rng, Rng};

#[test]
fn test_boolean_fmt_debug() {
Expand Down Expand Up @@ -431,4 +479,29 @@ mod tests {
fn test_from_array_data_validation() {
    // Hands an empty `Int32`-typed `ArrayData` to the `BooleanArray`
    // conversion and discards the result.
    // NOTE(review): this test's attributes are outside the visible hunk;
    // presumably the conversion is expected to reject the mismatched type -
    // confirm against the full file.
    let int32_data = ArrayData::new_empty(&DataType::Int32);
    let _ = BooleanArray::from(int32_data);
}

#[test]
fn test_true_false_count() {
    let mut rng = thread_rng();

    for _ in 0..10 {
        // Array built from plain `bool`s: every slot is either true or false,
        // so the two counts must add up to the length.
        let plain: Vec<bool> = (0..2000).map(|_| rng.gen_bool(0.5)).collect();
        let plain_len = plain.len();
        let plain_true = plain.iter().filter(|v| **v).count();
        let array = BooleanArray::from(plain);

        assert_eq!(array.true_count(), plain_true);
        assert_eq!(array.false_count(), plain_len - plain_true);

        // Array built from `Option<bool>`s: `None` entries must be counted
        // as neither true nor false.
        let optional: Vec<Option<bool>> = (0..2000)
            .map(|_| {
                if rng.gen_bool(0.5) {
                    Some(rng.gen_bool(0.5))
                } else {
                    None
                }
            })
            .collect();
        let optional_true = optional.iter().filter(|v| **v == Some(true)).count();
        let optional_false = optional.iter().filter(|v| **v == Some(false)).count();
        let array = BooleanArray::from(optional);

        assert_eq!(array.true_count(), optional_true);
        assert_eq!(array.false_count(), optional_false);
    }
}
}