Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support List in compare_op #5572

Open
Weijun-H opened this issue Mar 30, 2024 · 1 comment
Open

Support List in compare_op #5572

Weijun-H opened this issue Mar 30, 2024 · 1 comment
Labels
enhancement Any new improvement worthy of an entry in the changelog

Comments

@Weijun-H
Copy link
Contributor

Weijun-H commented Mar 30, 2024

Is your feature request related to a problem or challenge? Please describe what you are trying to do.

// Build a three-row list array over one flat child array:
//   [[0, 1, 2], [3, 4, 5], [6, 7]]
let item_field = Arc::new(Field::new("item", DataType::Int32, true));
let child_values = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]));
// Row `i` spans child_values[offsets[i]..offsets[i + 1]].
let list_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 6, 8]));
let a = ListArray::new(item_field, list_offsets, child_values, None);
let b = a.clone();

// Comparing the array with its own clone: the first row is expected to be equal,
// but `eq` currently returns an `Err`, so the `unwrap` panics.
let r = eq(&a, &b).unwrap();
assert!(r.value(0));
thread 'cmp::tests::test_list' panicked at arrow-ord/src/cmp.rs:695:28:
called `Result::unwrap()` on an `Err` value: InvalidArgumentError("Invalid comparison operation: List(Field { name: \"item\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) == List(Field { name: \"item\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })")

Describe the solution you'd like

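Support `List` in `compare_op`. The function currently rejects every nested data type through its `l_t.is_nested()` check, which is what produces the error above: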

/// Perform `op` on the provided `Datum`
///
/// Each `Datum` yields an array together with a flag saying whether it is a scalar.
/// When the left side is a scalar the result has the right side's length; otherwise it
/// has the left side's length. Dictionary inputs are compared through their values
/// array, with the dictionary view itself forwarded to `apply`.
///
/// # Errors
///
/// Returns `ArrowError::InvalidArgumentError` when:
/// - the two arrays have different lengths and neither side is a scalar, or
/// - the (dictionary-unwrapped) data types differ, or the data type is nested
///   (`is_nested()`); nested types are rejected before any comparison is attempted.
#[inline(never)]
fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, ArrowError> {
use arrow_schema::DataType::*;
// `get()` returns the underlying array and an "is scalar" flag for each side.
let (l, l_s) = lhs.get();
let (r, r_s) = rhs.get();
let l_len = l.len();
let r_len = r.len();
// A length mismatch is only an error when neither side is a scalar.
if l_len != r_len && !l_s && !r_s {
return Err(ArrowError::InvalidArgumentError(format!(
"Cannot compare arrays of different lengths, got {l_len} vs {r_len}"
)));
}
// Output length: the right side's length if the left is a scalar, else the left's.
let len = match l_s {
true => r_len,
false => l_len,
};
// Capture the logical nulls BEFORE `l` / `r` are re-bound to dictionary values below,
// so the null masks describe the original inputs.
let l_nulls = l.logical_nulls();
let r_nulls = r.logical_nulls();
// If a side is a dictionary, keep the dictionary view (`l_v` / `r_v`) and shadow
// `l` / `r` with its values array; otherwise `l` / `r` are left unchanged.
let l_v = l.as_any_dictionary_opt();
let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l);
let l_t = l.data_type();
let r_v = r.as_any_dictionary_opt();
let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r);
let r_t = r.data_type();
// Both sides must have the same type, and nested types are not supported here.
if l_t != r_t || l_t.is_nested() {
return Err(ArrowError::InvalidArgumentError(format!(
"Invalid comparison operation: {l_t} {op} {r_t}"
)));
}
// Defer computation as may not be necessary
let values = || -> BooleanBuffer {
// Dispatch on the concrete array type and hand typed views to `apply`.
let d = downcast_primitive_array! {
(l, r) => apply(op, l.values().as_ref(), l_s, l_v, r.values().as_ref(), r_s, r_v),
(Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v, r.as_boolean(), r_s, r_v),
(Utf8, Utf8) => apply(op, l.as_string::<i32>(), l_s, l_v, r.as_string::<i32>(), r_s, r_v),
(LargeUtf8, LargeUtf8) => apply(op, l.as_string::<i64>(), l_s, l_v, r.as_string::<i64>(), r_s, r_v),
(Binary, Binary) => apply(op, l.as_binary::<i32>(), l_s, l_v, r.as_binary::<i32>(), r_s, r_v),
(LargeBinary, LargeBinary) => apply(op, l.as_binary::<i64>(), l_s, l_v, r.as_binary::<i64>(), r_s, r_v),
(FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op, l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v),
// Null arrays carry no values to compare; handled by the fallback below.
(Null, Null) => None,
// NOTE(review): relies on the type check above having excluded every other
// type pairing — confirm no non-nested type is missing from this list.
_ => unreachable!(),
};
// A `None` result is replaced by a buffer of `len` unset bits.
d.unwrap_or_else(|| BooleanBuffer::new_unset(len))
};
// Treat a null buffer that contains no nulls the same as having no null buffer.
let l_nulls = l_nulls.filter(|n| n.null_count() > 0);
let r_nulls = r_nulls.filter(|n| n.null_count() > 0);
// The bit formulas below assume a set bit in a null buffer marks a valid (non-null)
// slot — TODO confirm against `NullBuffer`.
Ok(match (l_nulls, l_s, r_nulls, r_s) {
(Some(l), true, Some(r), true) | (Some(l), false, Some(r), false) => {
// Either both sides are scalar or neither side is scalar
match op {
Op::Distinct => {
let values = values();
let l = l.inner().bit_chunks().iter_padded();
let r = r.inner().bit_chunks().iter_padded();
let ne = values.bit_chunks().iter_padded();
// Set when exactly one side is valid, or both are valid and the
// computed comparison bit `n` is set.
let c = |((l, r), n)| ((l ^ r) | (l & r & n));
let buffer = l.zip(r).zip(ne).map(c).collect();
BooleanBuffer::new(buffer, 0, len).into()
}
Op::NotDistinct => {
let values = values();
let l = l.inner().bit_chunks().iter_padded();
let r = r.inner().bit_chunks().iter_padded();
let e = values.bit_chunks().iter_padded();
// Set when neither side is valid, or both are valid and the
// computed comparison bit `e` is set.
let c = |((l, r), e)| u64::not(l | r) | (l & r & e);
let buffer = l.zip(r).zip(e).map(c).collect();
BooleanBuffer::new(buffer, 0, len).into()
}
// Every other operator: computed values, with the union of both null buffers.
_ => BooleanArray::new(values(), NullBuffer::union(Some(&l), Some(&r))),
}
}
(Some(_), true, Some(a), false) | (Some(a), false, Some(_), true) => {
// Scalar is null, other side is non-scalar and nullable
match op {
// The result is taken directly from the non-scalar side's null buffer bits.
Op::Distinct => a.into_inner().into(),
// ...and from the inverse of those bits for `NotDistinct`.
Op::NotDistinct => a.into_inner().not().into(),
_ => BooleanArray::new_null(len),
}
}
// `is_scalar` is the scalar flag of whichever side owns `nulls`.
(Some(nulls), is_scalar, None, _) | (None, _, Some(nulls), is_scalar) => {
// Only one side is nullable
match is_scalar {
true => match op {
// Scalar is null, other side is not nullable
Op::Distinct => BooleanBuffer::new_set(len).into(),
Op::NotDistinct => BooleanBuffer::new_unset(len).into(),
_ => BooleanArray::new_null(len),
},
false => match op {
Op::Distinct => {
let values = values();
let l = nulls.inner().bit_chunks().iter_padded();
let ne = values.bit_chunks().iter_padded();
// Set when the nullable side's bit is unset, or the computed
// comparison bit `n` is set.
let c = |(l, n)| u64::not(l) | n;
let buffer = l.zip(ne).map(c).collect();
BooleanBuffer::new(buffer, 0, len).into()
}
// Set only where the nullable side's bit and the computed bit are both set.
Op::NotDistinct => (nulls.inner() & &values()).into(),
// Every other operator: computed values, carrying that side's nulls.
_ => BooleanArray::new(values(), Some(nulls)),
},
}
}
// Neither side is nullable
(None, _, None, _) => BooleanArray::new(values(), None),
})
}

Describe alternatives you've considered

Additional context

apache/datafusion#9857

@Weijun-H Weijun-H added the enhancement Any new improvement worthy of an entry in the changelog label Mar 30, 2024
@tustvold
Copy link
Contributor

Duplicate of #5426

@tustvold tustvold marked this as a duplicate of #5426 Mar 30, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement Any new improvement worthy of an entry in the changelog
Projects
None yet
Development

No branches or pull requests

2 participants