Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 2ee5a06

Browse files
committedMar 17, 2025·
apacheGH-45821: [C++][Compute] Grouper improvements
1 parent bc0b858 commit 2ee5a06

File tree

4 files changed

+554
-187
lines changed

4 files changed

+554
-187
lines changed
 

‎cpp/src/arrow/compute/key_map_internal.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ namespace compute {
3535
//
3636
// A detailed explanation of this data structure (including concepts such as blocks,
3737
// slots, stamps) and operations provided by this class is given in the document:
38-
// arrow/compute/exec/doc/key_map.md.
38+
// arrow/acero/doc/key_map.md.
3939
//
4040
class ARROW_EXPORT SwissTable {
4141
friend class SwissTableMerge;

‎cpp/src/arrow/compute/row/grouper.cc

+159-49
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "arrow/compute/row/grouper.h"
1919

20+
#include <cstring>
2021
#include <iostream>
2122
#include <memory>
2223
#include <mutex>
@@ -318,7 +319,7 @@ Result<std::unique_ptr<RowSegmenter>> RowSegmenter::Make(
318319

319320
namespace {
320321

321-
Status CheckAndCapLengthForConsume(int64_t batch_length, int64_t& consume_offset,
322+
Status CheckAndCapLengthForConsume(int64_t batch_length, int64_t consume_offset,
322323
int64_t* consume_length) {
323324
if (consume_offset < 0) {
324325
return Status::Invalid("invalid grouper consume offset: ", consume_offset);
@@ -329,6 +330,8 @@ Status CheckAndCapLengthForConsume(int64_t batch_length, int64_t& consume_offset
329330
return Status::OK();
330331
}
331332

333+
enum class GrouperMode { kPopulate, kConsume, kLookup };
334+
332335
struct GrouperImpl : public Grouper {
333336
static Result<std::unique_ptr<GrouperImpl>> Make(
334337
const std::vector<TypeHolder>& key_types, ExecContext* ctx) {
@@ -388,11 +391,24 @@ struct GrouperImpl : public Grouper {
388391
return Status::OK();
389392
}
390393

394+
Status Populate(const ExecSpan& batch, int64_t offset, int64_t length) override {
395+
return ConsumeImpl(batch, offset, length, GrouperMode::kPopulate).status();
396+
}
397+
391398
Result<Datum> Consume(const ExecSpan& batch, int64_t offset, int64_t length) override {
399+
return ConsumeImpl(batch, offset, length, GrouperMode::kConsume);
400+
}
401+
402+
Result<Datum> Lookup(const ExecSpan& batch, int64_t offset, int64_t length) override {
403+
return ConsumeImpl(batch, offset, length, GrouperMode::kLookup);
404+
}
405+
406+
Result<Datum> ConsumeImpl(const ExecSpan& batch, int64_t offset, int64_t length,
407+
GrouperMode mode) {
392408
ARROW_RETURN_NOT_OK(CheckAndCapLengthForConsume(batch.length, offset, &length));
393409
if (offset != 0 || length != batch.length) {
394410
auto batch_slice = batch.ToExecBatch().Slice(offset, length);
395-
return Consume(ExecSpan(batch_slice), 0, -1);
411+
return ConsumeImpl(ExecSpan(batch_slice), 0, -1, mode);
396412
}
397413
std::vector<int32_t> offsets_batch(batch.length + 1);
398414
for (int i = 0; i < batch.num_values(); ++i) {
@@ -417,35 +433,91 @@ struct GrouperImpl : public Grouper {
417433
RETURN_NOT_OK(encoders_[i]->Encode(batch[i], batch.length, key_buf_ptrs.data()));
418434
}
419435

420-
TypedBufferBuilder<uint32_t> group_ids_batch(ctx_->memory_pool());
421-
RETURN_NOT_OK(group_ids_batch.Resize(batch.length));
436+
using MapIterator = typename decltype(map_)::iterator;
422437

423-
for (int64_t i = 0; i < batch.length; ++i) {
424-
int32_t key_length = offsets_batch[i + 1] - offsets_batch[i];
425-
std::string key(
426-
reinterpret_cast<const char*>(key_bytes_batch.data() + offsets_batch[i]),
427-
key_length);
428-
429-
auto it_success = map_.emplace(key, num_groups_);
430-
auto group_id = it_success.first->second;
431-
432-
if (it_success.second) {
433-
// new key; update offsets and key_bytes
434-
++num_groups_;
435-
// Skip if there are no keys
436-
if (key_length > 0) {
437-
auto next_key_offset = static_cast<int32_t>(key_bytes_.size());
438-
key_bytes_.resize(next_key_offset + key_length);
439-
offsets_.push_back(next_key_offset + key_length);
440-
memcpy(key_bytes_.data() + next_key_offset, key.c_str(), key_length);
438+
struct LookupResult {
439+
bool inserted;
440+
bool found;
441+
MapIterator it;
442+
};
443+
444+
auto generate_keys = [&](auto&& lookup_key, auto&& visit_group,
445+
auto&& visit_unknown_group) {
446+
for (int64_t i = 0; i < batch.length; ++i) {
447+
int32_t key_length = offsets_batch[i + 1] - offsets_batch[i];
448+
std::string key(
449+
reinterpret_cast<const char*>(key_bytes_batch.data() + offsets_batch[i]),
450+
key_length);
451+
452+
LookupResult res = lookup_key(std::move(key), num_groups_);
453+
454+
if (res.inserted) {
455+
// new key; update offsets and key_bytes
456+
++num_groups_;
457+
// Skip if there are no keys
458+
if (key_length > 0) {
459+
auto next_key_offset = static_cast<int32_t>(key_bytes_.size());
460+
key_bytes_.resize(next_key_offset + key_length);
461+
offsets_.push_back(next_key_offset + key_length);
462+
memcpy(key_bytes_.data() + next_key_offset, key.c_str(), key_length);
463+
}
464+
}
465+
466+
if (res.found) {
467+
visit_group(res.it->second);
468+
} else {
469+
visit_unknown_group();
441470
}
442471
}
472+
};
473+
474+
auto lookup_or_insert_key = [&](auto&& key, uint32_t new_group_id) -> LookupResult {
475+
auto [it, inserted] = map_.emplace(key, new_group_id);
476+
return {inserted, /*found=*/true, it};
477+
};
478+
auto lookup_key = [&](auto&& key, uint32_t new_group_id) -> LookupResult {
479+
auto it = map_.find(key);
480+
return {/*inserted=*/false, /*found=*/it != map_.end(), it};
481+
};
443482

444-
group_ids_batch.UnsafeAppend(group_id);
483+
if (mode == GrouperMode::kPopulate) {
484+
generate_keys(
485+
lookup_or_insert_key, [](uint32_t group_id) {}, [] {});
486+
return Datum();
445487
}
446488

489+
TypedBufferBuilder<uint32_t> group_ids_batch(ctx_->memory_pool());
490+
RETURN_NOT_OK(group_ids_batch.Resize(batch.length));
491+
std::shared_ptr<Buffer> null_bitmap;
492+
493+
if (mode == GrouperMode::kConsume) {
494+
auto visit_group = [&](uint32_t group_id) {
495+
group_ids_batch.UnsafeAppend(group_id);
496+
};
497+
auto visit_unknown_group = [] {};
498+
499+
generate_keys(lookup_or_insert_key, visit_group, visit_unknown_group);
500+
} else {
501+
DCHECK_EQ(mode, GrouperMode::kLookup);
502+
503+
TypedBufferBuilder<bool> null_bitmap_builder(ctx_->memory_pool());
504+
RETURN_NOT_OK(null_bitmap_builder.Resize(batch.length));
505+
506+
auto visit_group = [&](uint32_t group_id) {
507+
group_ids_batch.UnsafeAppend(group_id);
508+
null_bitmap_builder.UnsafeAppend(true);
509+
};
510+
auto visit_unknown_group = [&] {
511+
group_ids_batch.UnsafeAppend(0); // any defined value really
512+
null_bitmap_builder.UnsafeAppend(false);
513+
};
514+
515+
generate_keys(lookup_key, visit_group, visit_unknown_group);
516+
517+
ARROW_ASSIGN_OR_RAISE(null_bitmap, null_bitmap_builder.Finish());
518+
}
447519
ARROW_ASSIGN_OR_RAISE(auto group_ids, group_ids_batch.Finish());
448-
return Datum(UInt32Array(batch.length, std::move(group_ids)));
520+
return Datum(UInt32Array(batch.length, std::move(group_ids), std::move(null_bitmap)));
449521
}
450522

451523
uint32_t num_groups() const override { return num_groups_; }
@@ -470,6 +542,7 @@ struct GrouperImpl : public Grouper {
470542
}
471543

472544
ExecContext* ctx_;
545+
// TODO We could use std::string_view since the keys are copied in key_bytes_.
473546
std::unordered_map<std::string, uint32_t> map_;
474547
std::vector<int32_t> offsets_ = {0};
475548
std::vector<uint8_t> key_bytes_;
@@ -577,11 +650,24 @@ struct GrouperFastImpl : public Grouper {
577650
return Status::OK();
578651
}
579652

653+
Status Populate(const ExecSpan& batch, int64_t offset, int64_t length) override {
654+
return ConsumeImpl(batch, offset, length, GrouperMode::kPopulate).status();
655+
}
656+
580657
Result<Datum> Consume(const ExecSpan& batch, int64_t offset, int64_t length) override {
658+
return ConsumeImpl(batch, offset, length, GrouperMode::kConsume);
659+
}
660+
661+
Result<Datum> Lookup(const ExecSpan& batch, int64_t offset, int64_t length) override {
662+
return ConsumeImpl(batch, offset, length, GrouperMode::kLookup);
663+
}
664+
665+
Result<Datum> ConsumeImpl(const ExecSpan& batch, int64_t offset, int64_t length,
666+
GrouperMode mode) {
581667
ARROW_RETURN_NOT_OK(CheckAndCapLengthForConsume(batch.length, offset, &length));
582668
if (offset != 0 || length != batch.length) {
583669
auto batch_slice = batch.ToExecBatch().Slice(offset, length);
584-
return Consume(ExecSpan(batch_slice), 0, -1);
670+
return ConsumeImpl(ExecSpan(batch_slice), 0, -1, mode);
585671
}
586672
// ARROW-14027: broadcast scalar arguments for now
587673
for (int i = 0; i < batch.num_values(); i++) {
@@ -595,13 +681,13 @@ struct GrouperFastImpl : public Grouper {
595681
ctx_->memory_pool()));
596682
}
597683
}
598-
return ConsumeImpl(ExecSpan(expanded));
684+
return ConsumeImpl(ExecSpan(expanded), mode);
599685
}
600686
}
601-
return ConsumeImpl(batch);
687+
return ConsumeImpl(batch, mode);
602688
}
603689

604-
Result<Datum> ConsumeImpl(const ExecSpan& batch) {
690+
Result<Datum> ConsumeImpl(const ExecSpan& batch, GrouperMode mode) {
605691
int64_t num_rows = batch.length;
606692
int num_columns = batch.num_values();
607693
// Process dictionaries
@@ -621,10 +707,6 @@ struct GrouperFastImpl : public Grouper {
621707
}
622708
}
623709

624-
std::shared_ptr<arrow::Buffer> group_ids;
625-
ARROW_ASSIGN_OR_RAISE(
626-
group_ids, AllocateBuffer(sizeof(uint32_t) * num_rows, ctx_->memory_pool()));
627-
628710
for (int icol = 0; icol < num_columns; ++icol) {
629711
const uint8_t* non_nulls = NULLPTR;
630712
const uint8_t* fixedlen = NULLPTR;
@@ -649,11 +731,29 @@ struct GrouperFastImpl : public Grouper {
649731
cols_[icol] = col_base.Slice(offset, num_rows);
650732
}
651733

734+
std::shared_ptr<arrow::Buffer> group_ids, null_bitmap;
735+
// If we need to return the group ids, then allocate a buffer of group ids
736+
// for all rows, otherwise each minibatch will reuse the same buffer.
737+
const int64_t groups_ids_size =
738+
(mode == GrouperMode::kPopulate) ? minibatch_size_max_ : num_rows;
739+
ARROW_ASSIGN_OR_RAISE(group_ids, AllocateBuffer(sizeof(uint32_t) * groups_ids_size,
740+
ctx_->memory_pool()));
741+
if (mode == GrouperMode::kLookup) {
742+
ARROW_ASSIGN_OR_RAISE(null_bitmap,
743+
AllocateBitmap(groups_ids_size, ctx_->memory_pool()));
744+
}
745+
652746
// Split into smaller mini-batches
653747
//
654748
for (uint32_t start_row = 0; start_row < num_rows;) {
655749
uint32_t batch_size_next = std::min(static_cast<uint32_t>(minibatch_size_),
656750
static_cast<uint32_t>(num_rows) - start_row);
751+
uint32_t* batch_group_ids = group_ids->mutable_data_as<uint32_t>() +
752+
((mode == GrouperMode::kPopulate) ? 0 : start_row);
753+
if (mode == GrouperMode::kLookup) {
754+
// Zero-initialize
755+
memset(batch_group_ids, 0, batch_size_next * sizeof(uint32_t));
756+
}
657757

658758
// Encode
659759
rows_minibatch_.Clean();
@@ -672,28 +772,38 @@ struct GrouperFastImpl : public Grouper {
672772
match_bitvector.mutable_data(), local_slots.mutable_data());
673773
map_.find(batch_size_next, minibatch_hashes_.data(),
674774
match_bitvector.mutable_data(), local_slots.mutable_data(),
675-
reinterpret_cast<uint32_t*>(group_ids->mutable_data()) + start_row,
676-
&temp_stack_, map_equal_impl_, nullptr);
775+
batch_group_ids, &temp_stack_, map_equal_impl_, nullptr);
776+
}
777+
if (mode == GrouperMode::kLookup) {
778+
// Fill validity bitmap from match_bitvector
779+
::arrow::internal::CopyBitmap(match_bitvector.mutable_data(), /*offset=*/0,
780+
/*length=*/batch_size_next,
781+
null_bitmap->mutable_data(),
782+
/*dest_offset=*/start_row);
783+
} else {
784+
// Insert new keys
785+
auto ids = util::TempVectorHolder<uint16_t>(&temp_stack_, batch_size_next);
786+
int num_ids;
787+
util::bit_util::bits_to_indexes(0, encode_ctx_.hardware_flags, batch_size_next,
788+
match_bitvector.mutable_data(), &num_ids,
789+
ids.mutable_data());
790+
791+
RETURN_NOT_OK(map_.map_new_keys(
792+
num_ids, ids.mutable_data(), minibatch_hashes_.data(), batch_group_ids,
793+
&temp_stack_, map_equal_impl_, map_append_impl_, nullptr));
677794
}
678-
auto ids = util::TempVectorHolder<uint16_t>(&temp_stack_, batch_size_next);
679-
int num_ids;
680-
util::bit_util::bits_to_indexes(0, encode_ctx_.hardware_flags, batch_size_next,
681-
match_bitvector.mutable_data(), &num_ids,
682-
ids.mutable_data());
683-
684-
RETURN_NOT_OK(map_.map_new_keys(
685-
num_ids, ids.mutable_data(), minibatch_hashes_.data(),
686-
reinterpret_cast<uint32_t*>(group_ids->mutable_data()) + start_row,
687-
&temp_stack_, map_equal_impl_, map_append_impl_, nullptr));
688795

689796
start_row += batch_size_next;
690-
691-
if (minibatch_size_ * 2 <= minibatch_size_max_) {
692-
minibatch_size_ *= 2;
693-
}
797+
// XXX why not use minibatch_size_max_ from the start?
798+
minibatch_size_ = std::min(minibatch_size_max_, 2 * minibatch_size_);
694799
}
695800

696-
return Datum(UInt32Array(batch.length, std::move(group_ids)));
801+
if (mode == GrouperMode::kPopulate) {
802+
return Datum{};
803+
} else {
804+
return Datum(
805+
UInt32Array(batch.length, std::move(group_ids), std::move(null_bitmap)));
806+
}
697807
}
698808

699809
uint32_t num_groups() const override { return static_cast<uint32_t>(rows_.length()); }

‎cpp/src/arrow/compute/row/grouper.h

+10
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#pragma once
1919

20+
#include <limits>
2021
#include <memory>
2122
#include <vector>
2223

@@ -120,6 +121,15 @@ class ARROW_EXPORT Grouper {
120121
virtual Result<Datum> Consume(const ExecSpan& batch, int64_t offset = 0,
121122
int64_t length = -1) = 0;
122123

124+
/// Like Consume, but groups not already encountered emit null instead of
125+
/// generating a new group id.
126+
virtual Result<Datum> Lookup(const ExecSpan& batch, int64_t offset = 0,
127+
int64_t length = -1) = 0;
128+
129+
/// Like Consume, but only populates the Grouper without returning the group ids.
130+
virtual Status Populate(const ExecSpan& batch, int64_t offset = 0,
131+
int64_t length = -1) = 0;
132+
123133
/// Get current unique keys. May be called multiple times.
124134
virtual Result<ExecBatch> GetUniques() = 0;
125135

‎cpp/src/arrow/compute/row/grouper_test.cc

+384-137
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
#include <algorithm>
1819
#include <numeric>
1920

2021
#include <gtest/gtest.h>
@@ -30,11 +31,15 @@
3031
#include "arrow/testing/matchers.h"
3132
#include "arrow/testing/random.h"
3233
#include "arrow/type_fwd.h"
34+
#include "arrow/type_traits.h"
35+
#include "arrow/util/bitmap_ops.h"
3336
#include "arrow/util/checked_cast.h"
37+
#include "arrow/util/key_value_metadata.h"
3438
#include "arrow/util/string.h"
3539

3640
namespace arrow::compute {
3741

42+
using ::arrow::internal::checked_cast;
3843
using ::arrow::internal::checked_pointer_cast;
3944
using ::arrow::internal::ToChars;
4045
using ::testing::Eq;
@@ -605,17 +610,54 @@ struct TestGrouper {
605610
}
606611
}
607612

613+
void ExpectLookup(const std::string& key_json, const std::string& expected) {
614+
auto expected_arr = ArrayFromJSON(uint32(), expected);
615+
if (shapes_.size() > 0) {
616+
ExpectLookup(ExecBatchFromJSON(types_, shapes_, key_json), expected_arr);
617+
} else {
618+
ExpectLookup(ExecBatchFromJSON(types_, key_json), expected_arr);
619+
}
620+
}
621+
622+
void ExpectPopulate(const std::string& key_json) {
623+
if (shapes_.size() > 0) {
624+
ExpectPopulate(ExecBatchFromJSON(types_, shapes_, key_json));
625+
} else {
626+
ExpectPopulate(ExecBatchFromJSON(types_, key_json));
627+
}
628+
}
629+
608630
void ExpectConsume(const std::vector<Datum>& key_values, Datum expected) {
609631
ASSERT_OK_AND_ASSIGN(auto key_batch, ExecBatch::Make(key_values));
610632
ExpectConsume(key_batch, expected);
611633
}
612634

635+
void ExpectLookup(const std::vector<Datum>& key_values, Datum expected) {
636+
ASSERT_OK_AND_ASSIGN(auto key_batch, ExecBatch::Make(key_values));
637+
ExpectLookup(key_batch, expected);
638+
}
639+
640+
void ExpectPopulate(const std::vector<Datum>& key_values) {
641+
ASSERT_OK_AND_ASSIGN(auto key_batch, ExecBatch::Make(key_values));
642+
ExpectPopulate(key_batch);
643+
}
644+
613645
void ExpectConsume(const ExecBatch& key_batch, Datum expected) {
614646
Datum ids;
615647
ConsumeAndValidate(key_batch, &ids);
616648
AssertEquivalentIds(expected, ids);
617649
}
618650

651+
void ExpectLookup(const ExecBatch& key_batch, Datum expected) {
652+
Datum ids;
653+
LookupAndValidate(key_batch, &ids);
654+
AssertEquivalentIds(expected, ids);
655+
}
656+
657+
void ExpectPopulate(const ExecBatch& key_batch) {
658+
ASSERT_OK(grouper_->Populate(ExecSpan(key_batch)));
659+
}
660+
619661
void ExpectUniques(const ExecBatch& uniques) {
620662
EXPECT_THAT(grouper_->GetUniques(), ResultWith(Eq(uniques)));
621663
}
@@ -633,27 +675,28 @@ struct TestGrouper {
633675
auto right = actual.make_array();
634676
ASSERT_EQ(left->length(), right->length()) << "#ids unequal";
635677
int64_t num_ids = left->length();
636-
auto left_data = left->data();
637-
auto right_data = right->data();
638-
auto left_ids = reinterpret_cast<const uint32_t*>(left_data->buffers[1]->data());
639-
auto right_ids = reinterpret_cast<const uint32_t*>(right_data->buffers[1]->data());
678+
const auto& left_ids = checked_cast<const UInt32Array&>(*left);
679+
const auto& right_ids = checked_cast<const UInt32Array&>(*right);
640680
uint32_t max_left_id = 0;
641681
uint32_t max_right_id = 0;
642682
for (int64_t i = 0; i < num_ids; ++i) {
643-
if (left_ids[i] > max_left_id) {
644-
max_left_id = left_ids[i];
645-
}
646-
if (right_ids[i] > max_right_id) {
647-
max_right_id = right_ids[i];
683+
ASSERT_EQ(left_ids.IsNull(i), right_ids.IsNull(i)) << " at index " << i;
684+
if (left_ids.IsNull(i)) {
685+
continue;
648686
}
687+
max_left_id = std::max(max_left_id, left_ids.Value(i));
688+
max_right_id = std::max(max_right_id, right_ids.Value(i));
649689
}
650690
std::vector<bool> right_to_left_present(max_right_id + 1, false);
651691
std::vector<bool> left_to_right_present(max_left_id + 1, false);
652692
std::vector<uint32_t> right_to_left(max_right_id + 1);
653693
std::vector<uint32_t> left_to_right(max_left_id + 1);
654694
for (int64_t i = 0; i < num_ids; ++i) {
655-
uint32_t left_id = left_ids[i];
656-
uint32_t right_id = right_ids[i];
695+
if (left_ids.IsNull(i)) {
696+
continue;
697+
}
698+
uint32_t left_id = left_ids.Value(i);
699+
uint32_t right_id = right_ids.Value(i);
657700
if (!left_to_right_present[left_id]) {
658701
left_to_right[left_id] = right_id;
659702
left_to_right_present[left_id] = true;
@@ -662,22 +705,33 @@ struct TestGrouper {
662705
right_to_left[right_id] = left_id;
663706
right_to_left_present[right_id] = true;
664707
}
665-
ASSERT_EQ(left_id, right_to_left[right_id]);
666-
ASSERT_EQ(right_id, left_to_right[left_id]);
708+
ASSERT_EQ(left_id, right_to_left[right_id]) << " at index " << i;
709+
ASSERT_EQ(right_id, left_to_right[left_id]) << " at index " << i;
667710
}
668711
}
669712

670713
void ConsumeAndValidate(const ExecBatch& key_batch, Datum* ids = nullptr) {
671714
ASSERT_OK_AND_ASSIGN(Datum id_batch, grouper_->Consume(ExecSpan(key_batch)));
672715

673-
ValidateConsume(key_batch, id_batch);
716+
ValidateConsume(key_batch, id_batch, /*can_be_null=*/false);
674717

675718
if (ids) {
676719
*ids = std::move(id_batch);
677720
}
678721
}
679722

680-
void ValidateConsume(const ExecBatch& key_batch, const Datum& id_batch) {
723+
void LookupAndValidate(const ExecBatch& key_batch, Datum* ids = nullptr) {
724+
ASSERT_OK_AND_ASSIGN(Datum id_batch, grouper_->Lookup(ExecSpan(key_batch)));
725+
726+
ValidateConsume(key_batch, id_batch, /*can_be_null=*/true);
727+
728+
if (ids) {
729+
*ids = std::move(id_batch);
730+
}
731+
}
732+
733+
void ValidateConsume(const ExecBatch& key_batch, const Datum& id_batch,
734+
bool can_be_null) {
681735
if (uniques_.length == -1) {
682736
ASSERT_OK_AND_ASSIGN(uniques_, grouper_->GetUniques());
683737
} else if (static_cast<int64_t>(grouper_->num_groups()) > uniques_.length) {
@@ -695,18 +749,49 @@ struct TestGrouper {
695749
uniques_ = std::move(new_uniques);
696750
}
697751

698-
// check that the ids encode an equivalent key sequence
699-
auto ids = id_batch.make_array();
700-
ValidateOutput(*ids);
752+
// Check that the group ids encode an equivalent key sequence:
753+
// calling Take(uniques, group_ids) should yield the original data.
754+
auto group_ids = id_batch.make_array();
755+
ValidateOutput(*group_ids);
701756

702757
for (int i = 0; i < key_batch.num_values(); ++i) {
703758
SCOPED_TRACE(ToChars(i) + "th key array");
704759
auto original =
705760
key_batch[i].is_array()
706761
? key_batch[i].make_array()
707762
: *MakeArrayFromScalar(*key_batch[i].scalar(), key_batch.length);
708-
ASSERT_OK_AND_ASSIGN(auto encoded, Take(*uniques_[i].make_array(), *ids));
709-
AssertArraysEqual(*original, *encoded, /*verbose=*/true,
763+
ASSERT_OK_AND_ASSIGN(auto encoded, Take(*uniques_[i].make_array(), *group_ids));
764+
std::shared_ptr<Array> expected = original;
765+
if (can_be_null && original->type_id() != Type::NA) {
766+
// To compute the expected output, mask out the original entries that
767+
// have a null group id.
768+
auto expected_data = original->data()->Copy();
769+
auto original_null_bitmap = original->null_bitmap();
770+
auto group_ids_null_bitmap = group_ids->null_bitmap();
771+
772+
// This could be simplified with `OptionalBitmapAnd` (GH-45819).
773+
std::shared_ptr<Buffer> null_bitmap;
774+
if (original_null_bitmap && group_ids_null_bitmap) {
775+
ASSERT_OK_AND_ASSIGN(null_bitmap,
776+
::arrow::internal::BitmapAnd(
777+
default_memory_pool(), group_ids_null_bitmap->data(),
778+
group_ids->offset(), original_null_bitmap->data(),
779+
original->offset(), original->length(),
780+
/*out_offset=*/original->offset()));
781+
} else if (group_ids_null_bitmap) {
782+
ASSERT_OK_AND_ASSIGN(
783+
null_bitmap, AllocateEmptyBitmap(original->offset() + original->length()));
784+
::arrow::internal::CopyBitmap(group_ids_null_bitmap->data(),
785+
group_ids->offset(), group_ids->length(),
786+
null_bitmap->mutable_data(), original->offset());
787+
} else {
788+
null_bitmap = original_null_bitmap;
789+
}
790+
expected_data->buffers[0] = null_bitmap;
791+
expected_data->null_count = kUnknownNullCount;
792+
expected = MakeArray(expected_data);
793+
}
794+
AssertArraysEqual(*expected, *encoded, /*verbose=*/true,
710795
EqualOptions().nans_equal(true));
711796
}
712797
}
@@ -719,16 +804,27 @@ struct TestGrouper {
719804
};
720805

721806
TEST(Grouper, BooleanKey) {
722-
TestGrouper g({boolean()});
723-
724-
g.ExpectConsume("[[true], [true]]", "[0, 0]");
725-
726-
g.ExpectConsume("[[true], [true]]", "[0, 0]");
727-
728-
g.ExpectConsume("[[false], [null]]", "[1, 2]");
729-
730-
g.ExpectConsume("[[true], [false], [true], [false], [null], [false], [null]]",
731-
"[0, 1, 0, 1, 2, 1, 2]");
807+
{
808+
TestGrouper g({boolean()});
809+
g.ExpectConsume("[[true], [true]]", "[0, 0]");
810+
g.ExpectConsume("[[true], [true]]", "[0, 0]");
811+
g.ExpectConsume("[[false], [null]]", "[1, 2]");
812+
g.ExpectConsume("[[true], [false], [true], [false], [null], [false], [null]]",
813+
"[0, 1, 0, 1, 2, 1, 2]");
814+
}
815+
{
816+
TestGrouper g({boolean()});
817+
g.ExpectPopulate("[[true], [true]]");
818+
g.ExpectPopulate("[[true], [true]]");
819+
g.ExpectConsume("[[false], [null]]", "[1, 2]");
820+
g.ExpectConsume("[[true], [false], [true], [false], [null], [false], [null]]",
821+
"[0, 1, 0, 1, 2, 1, 2]");
822+
}
823+
{
824+
TestGrouper g({boolean()});
825+
g.ExpectPopulate("[[true], [null]]");
826+
g.ExpectLookup("[[null], [false], [true], [null]]", "[1, null, 0, 1]");
827+
}
732828
}
733829

734830
TEST(Grouper, NumericKey) {
@@ -747,20 +843,41 @@ TEST(Grouper, NumericKey) {
747843
}) {
748844
SCOPED_TRACE("key type: " + ty->ToString());
749845

750-
TestGrouper g({ty});
846+
{
847+
TestGrouper g({ty});
848+
g.ExpectConsume("[[3], [3]]", "[0, 0]");
849+
g.ExpectUniques("[[3]]");
850+
851+
g.ExpectConsume("[[3], [3]]", "[0, 0]");
852+
g.ExpectUniques("[[3]]");
751853

752-
g.ExpectConsume("[[3], [3]]", "[0, 0]");
753-
g.ExpectUniques("[[3]]");
854+
g.ExpectConsume("[[27], [81], [81]]", "[1, 2, 2]");
855+
g.ExpectUniques("[[3], [27], [81]]");
754856

755-
g.ExpectConsume("[[3], [3]]", "[0, 0]");
756-
g.ExpectUniques("[[3]]");
857+
g.ExpectConsume("[[3], [27], [3], [27], [null], [81], [27], [81]]",
858+
"[0, 1, 0, 1, 3, 2, 1, 2]");
859+
g.ExpectUniques("[[3], [27], [81], [null]]");
860+
}
861+
{
862+
TestGrouper g({ty});
863+
g.ExpectPopulate("[[3], [3]]");
864+
g.ExpectPopulate("[[3], [3]]");
865+
g.ExpectUniques("[[3]]");
757866

758-
g.ExpectConsume("[[27], [81], [81]]", "[1, 2, 2]");
759-
g.ExpectUniques("[[3], [27], [81]]");
867+
g.ExpectPopulate("[[27], [81], [81]]");
868+
g.ExpectUniques("[[3], [27], [81]]");
760869

761-
g.ExpectConsume("[[3], [27], [3], [27], [null], [81], [27], [81]]",
762-
"[0, 1, 0, 1, 3, 2, 1, 2]");
763-
g.ExpectUniques("[[3], [27], [81], [null]]");
870+
g.ExpectConsume("[[3], [27], [3], [27], [null], [81], [27], [81]]",
871+
"[0, 1, 0, 1, 3, 2, 1, 2]");
872+
g.ExpectUniques("[[3], [27], [81], [null]]");
873+
}
874+
{
875+
TestGrouper g({ty});
876+
g.ExpectPopulate("[[3], [3]]");
877+
g.ExpectPopulate("[[27], [81], [81]]");
878+
g.ExpectLookup("[[3], [27], [6], [27], [null], [81], [27], [6]]",
879+
"[0, 1, null, 1, null, 2, 1, null]");
880+
}
764881
}
765882
}
766883

@@ -780,21 +897,23 @@ TEST(Grouper, FloatingPointKey) {
780897

781898
TEST(Grouper, StringKey) {
782899
for (auto ty : {utf8(), large_utf8(), fixed_size_binary(2)}) {
783-
SCOPED_TRACE("key type: " + ty->ToString());
784-
785-
TestGrouper g({ty});
786-
787-
g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]");
788-
789-
g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]");
790-
791-
g.ExpectConsume(R"([["be"], [null]])", "[1, 2]");
900+
ARROW_SCOPED_TRACE("key type = ", *ty);
901+
{
902+
TestGrouper g({ty});
903+
g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]");
904+
g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]");
905+
g.ExpectConsume(R"([["be"], [null]])", "[1, 2]");
906+
}
907+
{
908+
TestGrouper g({ty});
909+
g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]");
910+
g.ExpectConsume(R"([["be"], [null]])", "[1, 2]");
911+
g.ExpectLookup(R"([["be"], [null], ["da"]])", "[1, 2, null]");
912+
}
792913
}
793914
}
794915

795916
TEST(Grouper, DictKey) {
796-
TestGrouper g({dictionary(int32(), utf8())});
797-
798917
// For dictionary keys, all batches must share a single dictionary.
799918
// Eventually, differing dictionaries will be unified and indices transposed
800919
// during encoding to relieve this restriction.
@@ -804,25 +923,47 @@ TEST(Grouper, DictKey) {
804923
return Datum(*DictionaryArray::FromArrays(ArrayFromJSON(int32(), indices), dict));
805924
};
806925

807-
// NB: null index is not considered equivalent to index=3 (which encodes null in dict)
808-
g.ExpectConsume({WithIndices(" [3, 1, null, 0, 2]")},
809-
ArrayFromJSON(uint32(), "[0, 1, 2, 3, 4]"));
810-
811-
g = TestGrouper({dictionary(int32(), utf8())});
812-
813-
g.ExpectConsume({WithIndices(" [0, 1, 2, 3, null]")},
814-
ArrayFromJSON(uint32(), "[0, 1, 2, 3, 4]"));
815-
816-
g.ExpectConsume({WithIndices(" [3, 1, null, 0, 2]")},
817-
ArrayFromJSON(uint32(), "[3, 1, 4, 0, 2]"));
818-
819-
auto dict_arr = *DictionaryArray::FromArrays(
820-
ArrayFromJSON(int32(), "[0, 1]"),
821-
ArrayFromJSON(utf8(), R"(["different", "dictionary"])"));
822-
ExecSpan dict_span({*dict_arr->data()}, 2);
823-
EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
824-
HasSubstr("Unifying differing dictionaries"),
825-
g.grouper_->Consume(dict_span));
926+
{
927+
TestGrouper g({dictionary(int32(), utf8())});
928+
// NB: null index is not considered equivalent to index=3 (which encodes null in dict)
929+
g.ExpectConsume({WithIndices(" [3, 1, null, 0, 2]")},
930+
ArrayFromJSON(uint32(), "[0, 1, 2, 3, 4]"));
931+
}
932+
{
933+
TestGrouper g({dictionary(int32(), utf8())});
934+
g.ExpectPopulate({WithIndices(" [3, 1, null, 2]")});
935+
g.ExpectConsume({WithIndices(" [1, null, 3, 0, 2]")},
936+
ArrayFromJSON(uint32(), "[1, 2, 0, 4, 3]"));
937+
}
938+
{
939+
TestGrouper g({dictionary(int32(), utf8())});
940+
g.ExpectPopulate({WithIndices(" [3, 1, null, 2]")});
941+
g.ExpectLookup({WithIndices(" [1, null, 3, 0, 2]")},
942+
ArrayFromJSON(uint32(), "[1, 2, 0, null, 3]"));
943+
}
944+
{
945+
TestGrouper g({dictionary(int32(), utf8())});
946+
947+
g.ExpectConsume({WithIndices(" [0, 1, 2, 3, null]")},
948+
ArrayFromJSON(uint32(), "[0, 1, 2, 3, 4]"));
949+
950+
g.ExpectConsume({WithIndices(" [3, 1, null, 0, 2]")},
951+
ArrayFromJSON(uint32(), "[3, 1, 4, 0, 2]"));
952+
953+
auto dict_arr = *DictionaryArray::FromArrays(
954+
ArrayFromJSON(int32(), "[0, 1]"),
955+
ArrayFromJSON(utf8(), R"(["different", "dictionary"])"));
956+
ExecSpan dict_span({*dict_arr->data()}, 2);
957+
EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
958+
HasSubstr("Unifying differing dictionaries"),
959+
g.grouper_->Consume(dict_span));
960+
EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
961+
HasSubstr("Unifying differing dictionaries"),
962+
g.grouper_->Populate(dict_span));
963+
EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
964+
HasSubstr("Unifying differing dictionaries"),
965+
g.grouper_->Lookup(dict_span));
966+
}
826967
}
827968

828969
// GH-45393: Test combinations of numeric type keys of different lengths.
@@ -834,55 +975,80 @@ TEST(Grouper, MultipleIntKeys) {
834975
ARROW_SCOPED_TRACE("t1=", t1->ToString());
835976
for (auto& t2 : types) {
836977
ARROW_SCOPED_TRACE("t2=", t2->ToString());
837-
TestGrouper g({t0, t1, t2});
838-
839-
g.ExpectConsume(R"([[0, 1, 2], [0, 1, 2]])", "[0, 0]");
840-
g.ExpectConsume(R"([[0, 1, 2], [null, 1, 2]])", "[0, 1]");
841-
g.ExpectConsume(R"([[0, 1, 2], [0, null, 2]])", "[0, 2]");
842-
g.ExpectConsume(R"([[0, 1, 2], [0, 1, null]])", "[0, 3]");
843-
844-
g.ExpectUniques("[[0, 1, 2], [null, 1, 2], [0, null, 2], [0, 1, null]]");
978+
{
979+
TestGrouper g({t0, t1, t2});
980+
981+
g.ExpectConsume(R"([[0, 1, 2], [0, 1, 2]])", "[0, 0]");
982+
g.ExpectConsume(R"([[0, 1, 2], [null, 1, 2]])", "[0, 1]");
983+
g.ExpectConsume(R"([[0, 1, 2], [0, null, 2]])", "[0, 2]");
984+
g.ExpectConsume(R"([[0, 1, 2], [0, 1, null]])", "[0, 3]");
985+
986+
g.ExpectUniques("[[0, 1, 2], [null, 1, 2], [0, null, 2], [0, 1, null]]");
987+
}
988+
{
989+
TestGrouper g({t0, t1, t2});
990+
991+
g.ExpectPopulate(R"([[0, 1, 2], [0, 1, 2]])");
992+
g.ExpectPopulate(R"([[0, 1, 2], [0, null, 2]])");
993+
g.ExpectLookup(R"([[0, null, 2], [0, 1, 2], [null, 1, 0], [0, null, 2]])",
994+
"[1, 0, null, 1]");
995+
g.ExpectLookup(R"([[0, null, 2], [0, 1, 2], [null, 1, 0], [0, null, 2]])",
996+
"[1, 0, null, 1]");
997+
998+
g.ExpectUniques("[[0, 1, 2], [0, null, 2]]");
999+
}
8451000
}
8461001
}
8471002
}
8481003
}
8491004

8501005
TEST(Grouper, StringInt64Key) {
851-
TestGrouper g({utf8(), int64()});
852-
853-
g.ExpectConsume(R"([["eh", 0], ["eh", 0]])", "[0, 0]");
854-
855-
g.ExpectConsume(R"([["eh", 0], ["eh", null]])", "[0, 1]");
856-
857-
g.ExpectConsume(R"([["eh", 1], ["bee", 1]])", "[2, 3]");
858-
859-
g.ExpectConsume(R"([["eh", null], ["bee", 1]])", "[1, 3]");
860-
861-
g = TestGrouper({utf8(), int64()});
862-
863-
g.ExpectConsume(R"([
864-
["ex", 0],
865-
["ex", 0],
866-
["why", 0],
867-
["ex", 1],
868-
["why", 0],
869-
["ex", 1],
870-
["ex", 0],
871-
["why", 1]
872-
])",
873-
"[0, 0, 1, 2, 1, 2, 0, 3]");
1006+
for (auto string_type : {utf8(), large_utf8()}) {
1007+
ARROW_SCOPED_TRACE("string_type = ", *string_type);
1008+
{
1009+
TestGrouper g({string_type, int64()});
8741010

875-
g.ExpectConsume(R"([
876-
["ex", 0],
877-
[null, 0],
878-
[null, 0],
879-
["ex", 1],
880-
[null, null],
881-
["ex", 1],
882-
["ex", 0],
883-
["why", null]
884-
])",
885-
"[0, 4, 4, 2, 5, 2, 0, 6]");
1011+
g.ExpectConsume(R"([["eh", 0], ["eh", 0]])", "[0, 0]");
1012+
g.ExpectConsume(R"([["eh", 0], ["eh", null]])", "[0, 1]");
1013+
g.ExpectConsume(R"([["eh", 1], ["bee", 1]])", "[2, 3]");
1014+
g.ExpectConsume(R"([["eh", null], ["bee", 1]])", "[1, 3]");
1015+
}
1016+
{
1017+
TestGrouper g({string_type, int64()});
1018+
1019+
g.ExpectPopulate(R"([["eh", 0], ["eh", 0]])");
1020+
g.ExpectPopulate(R"([["eh", 0], ["eh", null]])");
1021+
g.ExpectConsume(R"([["eh", 1], ["bee", 1]])", "[2, 3]");
1022+
g.ExpectConsume(R"([["eh", null], ["bee", 1]])", "[1, 3]");
1023+
g.ExpectLookup(R"([["da", null], ["bee", 1]])", "[null, 3]");
1024+
g.ExpectLookup(R"([["da", null], ["bee", 1]])", "[null, 3]");
1025+
}
1026+
{
1027+
TestGrouper g({string_type, int64()});
1028+
g.ExpectConsume(R"([
1029+
["ex", 0],
1030+
["ex", 0],
1031+
["why", 0],
1032+
["ex", 1],
1033+
["why", 0],
1034+
["ex", 1],
1035+
["ex", 0],
1036+
["why", 1]
1037+
])",
1038+
"[0, 0, 1, 2, 1, 2, 0, 3]");
1039+
g.ExpectConsume(R"([
1040+
["ex", 0],
1041+
[null, 0],
1042+
[null, 0],
1043+
["ex", 1],
1044+
[null, null],
1045+
["ex", 1],
1046+
["ex", 0],
1047+
["why", null]
1048+
])",
1049+
"[0, 4, 4, 2, 5, 2, 0, 6]");
1050+
}
1051+
}
8861052
}
8871053

8881054
TEST(Grouper, DoubleStringInt64Key) {
@@ -898,42 +1064,88 @@ TEST(Grouper, DoubleStringInt64Key) {
8981064
g.ExpectConsume(R"([[-0.0, "be", 7], [0.0, "be", 7]])", "[3, 4]");
8991065
}
9001066

901-
TEST(Grouper, RandomInt64Keys) {
902-
TestGrouper g({int64()});
1067+
// Attach random-generation metadata to each key field so that generated
// batches contain repeated key values (group ids occurring more than once)
// as well as some nulls.
FieldVector AnnotateForRandomGeneration(FieldVector fields) {
  for (auto& field : fields) {
    const auto& type = *field->type();
    if (is_integer(type)) {
      // Narrow the value range so duplicate integer keys are likely.
      field =
          field->WithMergedMetadata(key_value_metadata({"min", "max"}, {"100", "10000"}));
    } else if (is_binary_like(type)) {
      // Restrict to a small pool of distinct values so duplicates occur.
      // (note this is unsupported for large binary types)
      field = field->WithMergedMetadata(key_value_metadata({"unique"}, {"100"}));
    }
    // Sprinkle some nulls into every key column.
    field = field->WithMergedMetadata(key_value_metadata({"null_probability"}, {"0.1"}));
  }
  return fields;
}
1082+
1083+
// Exercise Grouper::Consume against several randomly generated key batches.
void TestRandomConsume(TestGrouper g) {
  auto fields = AnnotateForRandomGeneration(g.key_schema_->fields());
  for (int i = 0; i < 4; ++i) {
    SCOPED_TRACE(ToChars(i) + "th key batch");

    // A distinct non-zero seed per batch keeps runs reproducible while
    // varying the keys between batches.
    ExecBatch key_batch{*random::GenerateBatch(fields, 1 << 12, /*seed=*/i + 1)};
    g.ConsumeAndValidate(key_batch);
  }
}
9111093

912-
TEST(Grouper, RandomStringInt64Keys) {
913-
TestGrouper g({utf8(), int64()});
1094+
// Exercise Grouper::Populate followed by Grouper::Lookup against several
// randomly generated key batches.
void TestRandomLookup(TestGrouper g) {
  auto fields = AnnotateForRandomGeneration(g.key_schema_->fields());
  // Seed the grouper with an initial batch of keys...
  ExecBatch populate_batch{*random::GenerateBatch(fields, 1 << 12, /*seed=*/1)};
  ASSERT_OK(g.grouper_->Populate(ExecSpan{populate_batch}));
  // ...then probe with batches that partially overlap the populated keys
  // (the first probe batch reuses seed 1 and therefore overlaps fully).
  for (int i = 0; i < 4; ++i) {
    SCOPED_TRACE(ToChars(i) + "th key batch");

    ExecBatch probe_batch{*random::GenerateBatch(fields, 1 << 12, /*seed=*/i + 1)};
    g.LookupAndValidate(probe_batch);
  }
}
9221106

923-
TEST(Grouper, RandomStringInt64DoubleInt32Keys) {
924-
TestGrouper g({utf8(), int64(), float64(), int32()});
925-
for (int i = 0; i < 4; ++i) {
926-
SCOPED_TRACE(ToChars(i) + "th key batch");
1107+
TEST(Grouper, RandomInt64Keys) {
  // Cover both the Consume path and the Populate/Lookup path.
  TestRandomConsume(TestGrouper({int64()}));
  TestRandomLookup(TestGrouper({int64()}));
}
9271111

928-
ExecBatch key_batch{
929-
*random::GenerateBatch(g.key_schema_->fields(), 1 << 12, 0xDEADBEEF)};
930-
g.ConsumeAndValidate(key_batch);
1112+
TEST(Grouper, RandomStringKeys) {
  // Run with both 32-bit and 64-bit offset string types.
  for (auto string_type : {utf8(), large_utf8()}) {
    ARROW_SCOPED_TRACE("string_type = ", *string_type);
    TestRandomConsume(TestGrouper({string_type}));
    TestRandomLookup(TestGrouper({string_type}));
  }
}
1119+
1120+
TEST(Grouper, RandomStringInt64Keys) {
  // Mixed string + integer keys, for both string offset widths.
  for (auto string_type : {utf8(), large_utf8()}) {
    ARROW_SCOPED_TRACE("string_type = ", *string_type);
    TestRandomConsume(TestGrouper({string_type, int64()}));
    TestRandomLookup(TestGrouper({string_type, int64()}));
  }
}
9331127

1128+
TEST(Grouper, RandomStringInt64DoubleInt32Keys) {
  // A wider mix of key types: string, integer and floating point.
  TestRandomConsume(TestGrouper({utf8(), int64(), float64(), int32()}));
  TestRandomLookup(TestGrouper({utf8(), int64(), float64(), int32()}));
}
1132+
9341133
TEST(Grouper, NullKeys) {
  {
    // Consume alone: all-null keys collapse into a single group.
    TestGrouper g({null()});
    g.ExpectConsume("[[null], [null]]", "[0, 0]");
  }
  {
    // Populate then Consume: the group created by Populate is reused.
    TestGrouper g({null()});
    g.ExpectPopulate("[[null], [null]]");
    g.ExpectConsume("[[null], [null]]", "[0, 0]");
  }
  {
    // Lookup before Populate finds no group; after Populate it maps every
    // null key to group 0.
    TestGrouper g({null()});
    g.ExpectLookup("[[null], [null]]", "[null, null]");
    g.ExpectPopulate("[[null], [null]]");
    g.ExpectLookup("[[null], [null], [null]]", "[0, 0, 0]");
  }
}
9381150

9391151
TEST(Grouper, MultipleNullKeys) {
@@ -971,8 +1183,16 @@ TEST(Grouper, DoubleNullStringKey) {
9711183
}
9721184

9731185
TEST(Grouper, EmptyNullKeys) {
  {
    // Consuming an empty batch yields an empty group-id array.
    TestGrouper g({null()});
    g.ExpectConsume("[]", "[]");
  }
  {
    // Populate/Consume/Lookup on empty input all yield empty results.
    TestGrouper g({null()});
    g.ExpectPopulate("[]");
    g.ExpectConsume("[]", "[]");
    g.ExpectLookup("[]", "[]");
  }
}
9771197

9781198
TEST(Grouper, MakeGroupings) {
@@ -1021,22 +1241,49 @@ TEST(Grouper, ScalarValues) {
10211241
ArgShape::SCALAR, ArgShape::SCALAR, ArgShape::ARRAY});
10221242
g.ExpectConsume(
10231243
R"([
1024-
[true, 1, "1.00", "2.00", "ab", "foo", 2],
1025-
[true, 1, "1.00", "2.00", "ab", "foo", 2],
1026-
[true, 1, "1.00", "2.00", "ab", "foo", 3]
1027-
])",
1244+
[true, 1, "1.00", "2.00", "ab", "foo", 2],
1245+
[true, 1, "1.00", "2.00", "ab", "foo", 2],
1246+
[true, 1, "1.00", "2.00", "ab", "foo", 3]
1247+
])",
10281248
"[0, 0, 1]");
10291249
}
1250+
{
1251+
TestGrouper g(
1252+
{boolean(), int32(), decimal128(3, 2), decimal256(3, 2), fixed_size_binary(2),
1253+
str_type, int32()},
1254+
{ArgShape::SCALAR, ArgShape::SCALAR, ArgShape::SCALAR, ArgShape::SCALAR,
1255+
ArgShape::SCALAR, ArgShape::SCALAR, ArgShape::ARRAY});
1256+
g.ExpectPopulate(
1257+
R"([
1258+
[true, 1, "1.00", "2.00", "ab", "foo", 2],
1259+
[true, 1, "1.00", "2.00", "ab", "foo", 2],
1260+
[true, 1, "1.00", "2.00", "ab", "foo", 3]
1261+
])");
1262+
g.ExpectLookup(
1263+
R"([
1264+
[true, 1, "1.00", "2.00", "ab", "foo", 3],
1265+
[true, 1, "1.00", "2.00", "ab", "foo", 4],
1266+
[true, 1, "1.00", "2.00", "ab", "foo", 2],
1267+
[true, 1, "1.00", "2.00", "ab", "foo", 3]
1268+
])",
1269+
"[1, null, 0, 1]");
1270+
}
10301271
{
10311272
auto dict_type = dictionary(int32(), utf8());
10321273
TestGrouper g({dict_type, str_type}, {ArgShape::SCALAR, ArgShape::SCALAR});
1033-
const auto dict = R"(["foo", null])";
1274+
const auto dict = R"(["foo", null, "bar"])";
10341275
g.ExpectConsume(
10351276
{DictScalarFromJSON(dict_type, "0", dict), ScalarFromJSON(str_type, R"("")")},
10361277
ArrayFromJSON(uint32(), "[0]"));
10371278
g.ExpectConsume(
10381279
{DictScalarFromJSON(dict_type, "1", dict), ScalarFromJSON(str_type, R"("")")},
10391280
ArrayFromJSON(uint32(), "[1]"));
1281+
g.ExpectLookup(
1282+
{DictScalarFromJSON(dict_type, "1", dict), ScalarFromJSON(str_type, R"("")")},
1283+
ArrayFromJSON(uint32(), "[1]"));
1284+
g.ExpectLookup(
1285+
{DictScalarFromJSON(dict_type, "2", dict), ScalarFromJSON(str_type, R"("")")},
1286+
ArrayFromJSON(uint32(), "[null]"));
10401287
}
10411288
}
10421289
}

0 commit comments

Comments
 (0)
Please sign in to comment.