Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct truncation of AnyValues when using strings or bytes #9269

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 43 additions & 10 deletions rerun_py/rerun_sdk/rerun/any_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ def __init__(self, descriptor: str | ComponentDescriptor, value: Any, drop_untyp
will be dropped, and a warning will be sent to the log.

If you want to inspect how your component will be converted to the
underlying arrow code, the following snippet is what is happening
internally:
underlying arrow code, we first attempt to cast it directly to a pyarrow
array. Failing this, we call

```
np_value = np.atleast_1d(np.array(value, copy=False))
pa_value = pa.array(value)
pa_scalar = pa.scalar(value)
pa_value = pa.array(pa_scalar)
```

Parameters
Expand Down Expand Up @@ -77,19 +77,52 @@ def __init__(self, descriptor: str | ComponentDescriptor, value: Any, drop_untyp
elif hasattr(value, "as_arrow_array"):
self.pa_array = value.as_arrow_array()
else:
if np_type is not None:
if pa_type is not None:
if value is None:
value = []
np_value = np.atleast_1d(np.asarray(value, dtype=np_type))
self.pa_array = pa.array(np_value, type=pa_type)
# Special case: strings are iterables so pyarrow will not
# handle them properly
if not isinstance(value, (str, bytes)):
try:
self.pa_array = pa.array(value, type=pa_type)
except TypeError:
pass
if self.pa_array is None:
try:
pa_scalar = pa.scalar(value, type=pa_type)
self.pa_array = pa.array([pa_scalar], type=pa_type)
except TypeError:
pass
if self.pa_array is None:
# Fall back - use numpy
np_value = np.atleast_1d(np.asarray(value, dtype=np_type))
self.pa_array = pa.array(np_value, type=pa_type)
else:
if value is None:
if not drop_untyped_nones:
raise ValueError("Cannot convert None to arrow array. Type is unknown.")
else:
np_value = np.atleast_1d(np.asarray(value))
self.pa_array = pa.array(np_value)
ANY_VALUE_TYPE_REGISTRY[descriptor] = (np_value.dtype, self.pa_array.type)
# This should handle most non-scalar values, but we have to
# treat str and bytes special because they are iterable
if not isinstance(value, (str, bytes)) and value is not None:
try:
self.pa_array = pa.array(value)
ANY_VALUE_TYPE_REGISTRY[descriptor] = (None, self.pa_array.type)
except TypeError:
pass
if self.pa_array is None:
try:
pa_scalar = pa.scalar(value)
self.pa_array = pa.array([pa_scalar])
ANY_VALUE_TYPE_REGISTRY[descriptor] = (None, self.pa_array.type)
except TypeError:
pass
if self.pa_array is None:
# Fall back - use numpy which handles a wide variety of lists, tuples,
# and mixtures of them and will turn into a well formed array
np_value = np.atleast_1d(np.asarray(value))
self.pa_array = pa.array(np_value)
ANY_VALUE_TYPE_REGISTRY[descriptor] = (np_value.dtype, self.pa_array.type)

def is_valid(self) -> bool:
    """Report whether this instance currently holds an arrow array.

    Returns
    -------
    bool
        `True` when `self.pa_array` is set to anything other than `None`,
        `False` otherwise.

    """
    has_arrow_array = self.pa_array is not None
    return has_arrow_array
Expand Down
32 changes: 32 additions & 0 deletions rerun_py/tests/unit/test_any_values.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import numpy as np
import pyarrow as pa
import pytest
import rerun as rr
from rerun.error_utils import RerunWarning
Expand Down Expand Up @@ -98,3 +99,34 @@ def test_none_any_value() -> None:

assert len(batches) == 1
assert len(warnings) == 1 # no new warnings


def test_iterable_any_value() -> None:
    """Check that str/bytes values are kept whole and that lists of them become arrays."""
    SHORT_TEXT = "short"
    LONG_TEXT = "longer_text"

    SHORT_BYTES = b"ABCD"
    LONG_BYTES = b"ABCDEFGH"

    # Each case is (str input, bytes input, expected str rows, expected bytes rows).
    # The order is significant: the short scalars go first so that the longer
    # scalars cover Issue #8781 (subsequent calls must not truncate data), and
    # the final case ensures iterables of these types are handled as arrays.
    cases = [
        (SHORT_TEXT, SHORT_BYTES, [SHORT_TEXT], [SHORT_BYTES]),
        (LONG_TEXT, LONG_BYTES, [LONG_TEXT], [LONG_BYTES]),
        (
            [SHORT_TEXT, LONG_TEXT],
            [SHORT_BYTES, LONG_BYTES],
            [SHORT_TEXT, LONG_TEXT],
            [SHORT_BYTES, LONG_BYTES],
        ),
    ]

    for str_input, bytes_input, expected_text, expected_bytes in cases:
        values = rr.AnyValues(str_values=str_input, bytes_values=bytes_input)
        batches = list(values.as_component_batches())

        assert len(batches) == 2
        assert batches[0].as_arrow_array() == pa.array(expected_text, type=pa.string())
        assert batches[1].as_arrow_array() == pa.array(expected_bytes, type=pa.binary())
Loading