From fedaf5c49e477cc9b67856b0b063a42f43da57a1 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 6 Mar 2025 14:19:05 -0800 Subject: [PATCH 1/3] Do not default the struct array length to 0 in Struct::try_new if there are no child arrays. --- arrow-array/src/array/struct_array.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index de6d9c699d22..aa2e607c9eb2 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -114,7 +114,8 @@ impl StructArray { arrays.len() ))); } - let len = arrays.first().map(|x| x.len()).unwrap_or_default(); + + let len = arrays.first().map(|x| x.len()).ok_or_else(||ArrowError::InvalidArgumentError("use StructArray::new_empty_fields to create a struct array with no fields so that the length can be set correctly".to_string()))?; if let Some(n) = nulls.as_ref() { if n.len() != len { @@ -189,7 +190,10 @@ impl StructArray { arrays: Vec, nulls: Option, ) -> Self { - let len = arrays.first().map(|x| x.len()).unwrap_or_default(); + let len = arrays + .first() + .map(|x| x.len()) + .expect("cannot use new_unchecked if there are no fields, length is unknown"); Self { len, data_type: DataType::Struct(fields), @@ -729,9 +733,11 @@ mod tests { } #[test] + #[should_panic(expected = "use StructArray::new_empty_fields")] fn test_struct_array_from_empty() { - let sa = StructArray::from(vec![]); - assert!(sa.is_empty()) + // This can't work because we don't know how many rows the array should have. Previously we inferred 0 but + // that often led to bugs. + let _ = StructArray::from(vec![]); } #[test] From 126d8da57e275199af9c8338ff61a89206997bd1 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 6 Mar 2025 14:30:49 -0800 Subject: [PATCH 2/3] Extend testing for new_empty_fields --- arrow-array/src/array/struct_array.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index aa2e607c9eb2..62d6c4745267 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -740,6 +740,21 @@ mod tests { let _ = StructArray::from(vec![]); } + #[test] + fn test_empty_struct_array() { + assert!(StructArray::try_new(Fields::empty(), vec![], None).is_err()); + + let arr = StructArray::new_empty_fields(10, None); + assert_eq!(arr.len(), 10); + assert_eq!(arr.null_count(), 0); + assert_eq!(arr.num_columns(), 0); + + let arr = StructArray::new_empty_fields(10, Some(NullBuffer::new_null(10))); + assert_eq!(arr.len(), 10); + assert_eq!(arr.null_count(), 10); + assert_eq!(arr.num_columns(), 0); + } + #[test] #[should_panic(expected = "Found unmasked nulls for non-nullable StructArray field \\\"c\\\"")] fn test_struct_array_from_mismatched_nullability() { From 5a5f41d387654f2c4d2e1641fb9b5942da01f517 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 6 Mar 2025 14:42:09 -0800 Subject: [PATCH 3/3] Add try_new_with_length --- arrow-array/src/array/struct_array.rs | 71 ++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 62d6c4745267..0c243d17c89b 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -91,6 +91,28 @@ impl StructArray { Self::try_new(fields, arrays, nulls).unwrap() } + /// Create a new [`StructArray`] from the provided parts, returning an error on failure + /// + /// The length will be inferred from the length of the child arrays. Returns an error if + /// there are no child arrays. Consider using [`Self::try_new_with_length`] if the length + /// is known to avoid this. + /// + /// # Errors + /// + /// Errors if + /// + /// * `fields.len() == 0` + /// * Any reason that [`Self::try_new_with_length`] would error + pub fn try_new( + fields: Fields, + arrays: Vec, + nulls: Option, + ) -> Result { + let len = arrays.first().map(|x| x.len()).ok_or_else(||ArrowError::InvalidArgumentError("use StructArray::try_new_with_length or StructArray::new_empty to create a struct array with no fields so that the length can be set correctly".to_string()))?; + + Self::try_new_with_length(fields, arrays, nulls, len) + } + /// Create a new [`StructArray`] from the provided parts, returning an error on failure /// /// # Errors @@ -102,10 +124,11 @@ impl StructArray { /// * `arrays[i].len() != arrays[j].len()` /// * `arrays[i].len() != nulls.len()` /// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())` - pub fn try_new( + pub fn try_new_with_length( fields: Fields, arrays: Vec, nulls: Option, + len: usize, ) -> Result { if fields.len() != arrays.len() { return Err(ArrowError::InvalidArgumentError(format!( @@ -115,8 +138,6 @@ impl StructArray { ))); } - let len = arrays.first().map(|x| x.len()).ok_or_else(||ArrowError::InvalidArgumentError("use StructArray::new_empty_fields to create a struct array with no fields so that the length can be set correctly".to_string()))?; - if let Some(n) = nulls.as_ref() { if n.len() != len { return Err(ArrowError::InvalidArgumentError(format!( @@ -182,6 +203,10 @@ impl StructArray { /// Create a new [`StructArray`] from the provided parts without validation /// + /// The length will be inferred from the length of the child arrays. Panics if there are no + /// child arrays. Consider using [`Self::new_unchecked_with_length`] if the length is known + /// to avoid this. + /// /// # Safety /// /// Safe if [`Self::new`] would not panic with the given arguments @@ -190,10 +215,28 @@ impl StructArray { arrays: Vec, nulls: Option, ) -> Self { - let len = arrays - .first() - .map(|x| x.len()) - .expect("cannot use new_unchecked if there are no fields, length is unknown"); + let len = arrays.first().map(|x| x.len()).expect( + "cannot use StructArray::new_unchecked if there are no fields, length is unknown", + ); + Self { + len, + data_type: DataType::Struct(fields), + nulls, + fields: arrays, + } + } + + /// Create a new [`StructArray`] from the provided parts without validation + /// + /// # Safety + /// + /// Safe if [`Self::new`] would not panic with the given arguments + pub unsafe fn new_unchecked_with_length( + fields: Fields, + arrays: Vec, + nulls: Option, + len: usize, + ) -> Self { Self { len, data_type: DataType::Struct(fields), @@ -733,7 +776,7 @@ mod tests { } #[test] - #[should_panic(expected = "use StructArray::new_empty_fields")] + #[should_panic(expected = "use StructArray::try_new_with_length")] fn test_struct_array_from_empty() { // This can't work because we don't know how many rows the array should have. Previously we inferred 0 but // that often led to bugs. @@ -749,10 +792,22 @@ mod tests { assert_eq!(arr.null_count(), 0); assert_eq!(arr.num_columns(), 0); + let arr2 = StructArray::try_new_with_length(Fields::empty(), vec![], None, 10).unwrap(); + assert_eq!(arr2.len(), 10); + let arr = StructArray::new_empty_fields(10, Some(NullBuffer::new_null(10))); assert_eq!(arr.len(), 10); assert_eq!(arr.null_count(), 10); assert_eq!(arr.num_columns(), 0); + + let arr2 = StructArray::try_new_with_length( + Fields::empty(), + vec![], + Some(NullBuffer::new_null(10)), + 10, + ) + .unwrap(); + assert_eq!(arr2.len(), 10); } #[test]