Skip to content

Commit 732a205

Browse files
committed
ARROW-261: Refactor String/Binary code paths to reflect unnested (non-list-based) structure
Per discussions on the mailing list. This should in theory match the Java implementation. Author: Wes McKinney <wes.mckinney@twosigma.com> Closes #176 from wesm/ARROW-261 and squashes the following commits: dca39ce [Wes McKinney] Make binary/string constants static to avoid memory-access-related segfaults in third party libraries 1e65b01 [Wes McKinney] Deprecate pyarrow::Status in favor of just arrow::Status. Conform pyarrow use of ArrayBuilder::Finish 9a1f77e [Wes McKinney] Add license header to index.rst bd70cab [Wes McKinney] Complete refactoring, fix up IPC tests for flattened string/binary buffer/metadata layout ae64f2e [Wes McKinney] Refactoring to reflect collaprsed list-like structure of Binary and String types. Not yet complete
1 parent 8e8b17f commit 732a205

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+484
-768
lines changed

cpp/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -681,7 +681,6 @@ set(ARROW_SRCS
681681

682682
src/arrow/types/construct.cc
683683
src/arrow/types/decimal.cc
684-
src/arrow/types/json.cc
685684
src/arrow/types/list.cc
686685
src/arrow/types/primitive.cc
687686
src/arrow/types/string.cc

cpp/src/arrow/builder.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ class ARROW_EXPORT ArrayBuilder {
9393

9494
// Creates new array object to hold the contents of the builder and transfers
9595
// ownership of the data. This resets all variables on the builder.
96-
virtual std::shared_ptr<Array> Finish() = 0;
96+
virtual Status Finish(std::shared_ptr<Array>* out) = 0;
9797

9898
const std::shared_ptr<DataType>& type() const { return type_; }
9999

cpp/src/arrow/ipc/adapter.cc

+23-24
Original file line numberDiff line numberDiff line change
@@ -78,22 +78,6 @@ static bool IsPrimitive(const DataType* type) {
7878
}
7979
}
8080

81-
static bool IsListType(const DataType* type) {
82-
DCHECK(type != nullptr);
83-
switch (type->type) {
84-
// TODO(emkornfield) grouping like this are used in a few places in the
85-
// code consider using pattern like:
86-
// http://stackoverflow.com/questions/26784685/c-macro-for-calling-function-based-on-enum-type
87-
//
88-
case Type::BINARY:
89-
case Type::LIST:
90-
case Type::STRING:
91-
return true;
92-
default:
93-
return false;
94-
}
95-
}
96-
9781
// ----------------------------------------------------------------------
9882
// Record batch write path
9983

@@ -115,7 +99,11 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes
11599
if (IsPrimitive(arr_type)) {
116100
const auto prim_arr = static_cast<const PrimitiveArray*>(arr);
117101
buffers->push_back(prim_arr->data());
118-
} else if (IsListType(arr_type)) {
102+
} else if (arr->type_enum() == Type::STRING || arr->type_enum() == Type::BINARY) {
103+
const auto binary_arr = static_cast<const BinaryArray*>(arr);
104+
buffers->push_back(binary_arr->offsets());
105+
buffers->push_back(binary_arr->data());
106+
} else if (arr->type_enum() == Type::LIST) {
119107
const auto list_arr = static_cast<const ListArray*>(arr);
120108
buffers->push_back(list_arr->offset_buffer());
121109
RETURN_NOT_OK(VisitArray(
@@ -331,9 +319,21 @@ class RecordBatchReader::RecordBatchReaderImpl {
331319
}
332320
return MakePrimitiveArray(
333321
type, field_meta.length, data, field_meta.null_count, null_bitmap, out);
334-
}
322+
} else if (type->type == Type::STRING || type->type == Type::BINARY) {
323+
std::shared_ptr<Buffer> offsets;
324+
std::shared_ptr<Buffer> values;
325+
RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets));
326+
RETURN_NOT_OK(GetBuffer(buffer_index_++, &values));
335327

336-
if (IsListType(type.get())) {
328+
if (type->type == Type::STRING) {
329+
*out = std::make_shared<StringArray>(
330+
field_meta.length, offsets, values, field_meta.null_count, null_bitmap);
331+
} else {
332+
*out = std::make_shared<BinaryArray>(
333+
field_meta.length, offsets, values, field_meta.null_count, null_bitmap);
334+
}
335+
return Status::OK();
336+
} else if (type->type == Type::LIST) {
337337
std::shared_ptr<Buffer> offsets;
338338
RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets));
339339
const int num_children = type->num_children();
@@ -346,11 +346,10 @@ class RecordBatchReader::RecordBatchReaderImpl {
346346
std::shared_ptr<Array> values_array;
347347
RETURN_NOT_OK(
348348
NextArray(type->child(0).get(), max_recursion_depth - 1, &values_array));
349-
return MakeListArray(type, field_meta.length, offsets, values_array,
350-
field_meta.null_count, null_bitmap, out);
351-
}
352-
353-
if (type->type == Type::STRUCT) {
349+
*out = std::make_shared<ListArray>(type, field_meta.length, offsets, values_array,
350+
field_meta.null_count, null_bitmap);
351+
return Status::OK();
352+
} else if (type->type == Type::STRUCT) {
354353
const int num_children = type->num_children();
355354
std::vector<ArrayPtr> fields;
356355
fields.reserve(num_children);

cpp/src/arrow/ipc/test-common.h

+8-11
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ const auto kListInt32 = std::make_shared<ListType>(kInt32);
4242
const auto kListListInt32 = std::make_shared<ListType>(kListInt32);
4343

4444
Status MakeRandomInt32Array(
45-
int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* array) {
45+
int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) {
4646
std::shared_ptr<PoolBuffer> data;
4747
test::MakeRandomInt32PoolBuffer(length, pool, &data);
4848
const auto kInt32 = std::make_shared<Int32Type>();
@@ -52,16 +52,14 @@ Status MakeRandomInt32Array(
5252
test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes);
5353
RETURN_NOT_OK(builder.Append(
5454
reinterpret_cast<const int32_t*>(data->data()), length, valid_bytes->data()));
55-
*array = builder.Finish();
56-
return Status::OK();
55+
return builder.Finish(out);
5756
}
5857
RETURN_NOT_OK(builder.Append(reinterpret_cast<const int32_t*>(data->data()), length));
59-
*array = builder.Finish();
60-
return Status::OK();
58+
return builder.Finish(out);
6159
}
6260

6361
Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_lists,
64-
bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* array) {
62+
bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) {
6563
// Create the null list values
6664
std::vector<uint8_t> valid_lists(num_lists);
6765
const double null_percent = include_nulls ? 0.1 : 0;
@@ -90,8 +88,8 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
9088
}
9189
ListBuilder builder(pool, child_array);
9290
RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data()));
93-
*array = builder.Finish();
94-
return (*array)->Validate();
91+
RETURN_NOT_OK(builder.Finish(out));
92+
return (*out)->Validate();
9593
}
9694

9795
typedef Status MakeRecordBatch(std::shared_ptr<RecordBatch>* out);
@@ -115,7 +113,7 @@ Status MakeIntRecordBatch(std::shared_ptr<RecordBatch>* out) {
115113

116114
template <class Builder, class RawType>
117115
Status MakeRandomBinaryArray(
118-
const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* array) {
116+
const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* out) {
119117
const std::vector<std::string> values = {
120118
"", "", "abc", "123", "efg", "456!@#!@#", "12312"};
121119
Builder builder(pool, type);
@@ -130,8 +128,7 @@ Status MakeRandomBinaryArray(
130128
builder.Append(reinterpret_cast<const RawType*>(value.data()), value.size()));
131129
}
132130
}
133-
*array = builder.Finish();
134-
return Status::OK();
131+
return builder.Finish(out);
135132
}
136133

137134
Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out) {

cpp/src/arrow/type.h

+4-16
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ struct ARROW_EXPORT DoubleType : public PrimitiveType<DoubleType> {
242242
struct ARROW_EXPORT ListType : public DataType {
243243
// List can contain any other logical value type
244244
explicit ListType(const std::shared_ptr<DataType>& value_type)
245-
: ListType(value_type, Type::LIST) {}
245+
: ListType(std::make_shared<Field>("item", value_type)) {}
246246

247247
explicit ListType(const std::shared_ptr<Field>& value_field) : DataType(Type::LIST) {
248248
children_ = {value_field};
@@ -255,26 +255,17 @@ struct ARROW_EXPORT ListType : public DataType {
255255
static char const* name() { return "list"; }
256256

257257
std::string ToString() const override;
258-
259-
protected:
260-
// Constructor for classes that are implemented as List Arrays.
261-
ListType(const std::shared_ptr<DataType>& value_type, Type::type logical_type)
262-
: DataType(logical_type) {
263-
// TODO ARROW-187 this can technically fail, make a constructor method ?
264-
children_ = {std::make_shared<Field>("item", value_type)};
265-
}
266258
};
267259

268260
// BinaryType type is reprsents lists of 1-byte values.
269-
struct ARROW_EXPORT BinaryType : public ListType {
261+
struct ARROW_EXPORT BinaryType : public DataType {
270262
BinaryType() : BinaryType(Type::BINARY) {}
271263
static char const* name() { return "binary"; }
272264
std::string ToString() const override;
273265

274266
protected:
275267
// Allow subclasses to change the logical type.
276-
explicit BinaryType(Type::type logical_type)
277-
: ListType(std::shared_ptr<DataType>(new UInt8Type()), logical_type) {}
268+
explicit BinaryType(Type::type logical_type) : DataType(logical_type) {}
278269
};
279270

280271
// UTF encoded strings
@@ -284,9 +275,6 @@ struct ARROW_EXPORT StringType : public BinaryType {
284275
static char const* name() { return "string"; }
285276

286277
std::string ToString() const override;
287-
288-
protected:
289-
explicit StringType(Type::type logical_type) : BinaryType(logical_type) {}
290278
};
291279

292280
struct ARROW_EXPORT StructType : public DataType {
@@ -300,7 +288,7 @@ struct ARROW_EXPORT StructType : public DataType {
300288

301289
// These will be defined elsewhere
302290
template <typename T>
303-
struct type_traits {};
291+
struct TypeTraits {};
304292

305293
} // namespace arrow
306294

cpp/src/arrow/types/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ install(FILES
2525
construct.h
2626
datetime.h
2727
decimal.h
28-
json.h
2928
list.h
3029
primitive.h
3130
string.h

cpp/src/arrow/types/binary.h

-28
This file was deleted.

cpp/src/arrow/types/construct.cc

+3-28
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
5959
BUILDER_CASE(DOUBLE, DoubleBuilder);
6060

6161
BUILDER_CASE(STRING, StringBuilder);
62+
BUILDER_CASE(BINARY, BinaryBuilder);
6263

6364
case Type::LIST: {
6465
std::shared_ptr<ArrayBuilder> value_builder;
@@ -105,10 +106,10 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length,
105106
MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array);
106107
MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array);
107108
MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array);
108-
MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array);
109-
MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray);
110109
MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray);
111110
MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray);
111+
MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array);
112+
MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray);
112113
MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray);
113114
default:
114115
return Status::NotImplemented(type->ToString());
@@ -120,30 +121,4 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length,
120121
#endif
121122
}
122123

123-
Status MakeListArray(const TypePtr& type, int32_t length,
124-
const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, int32_t null_count,
125-
const std::shared_ptr<Buffer>& null_bitmap, ArrayPtr* out) {
126-
switch (type->type) {
127-
case Type::BINARY:
128-
out->reset(new BinaryArray(type, length, offsets, values, null_count, null_bitmap));
129-
break;
130-
131-
case Type::LIST:
132-
out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap));
133-
break;
134-
135-
case Type::DECIMAL_TEXT:
136-
case Type::STRING:
137-
out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap));
138-
break;
139-
default:
140-
return Status::NotImplemented(type->ToString());
141-
}
142-
#ifdef NDEBUG
143-
return Status::OK();
144-
#else
145-
return (*out)->Validate();
146-
#endif
147-
}
148-
149124
} // namespace arrow

cpp/src/arrow/types/construct.h

-8
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,6 @@ Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr<DataType>& type,
4242
int32_t length, const std::shared_ptr<Buffer>& data, int32_t null_count,
4343
const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out);
4444

45-
// Create new list arrays for logical types that are backed by ListArrays (e.g. list of
46-
// primitives and strings)
47-
// TODO(emkornfield) split up string vs list?
48-
Status ARROW_EXPORT MakeListArray(const std::shared_ptr<DataType>& type, int32_t length,
49-
const std::shared_ptr<Buffer>& offests, const std::shared_ptr<Array>& values,
50-
int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap,
51-
std::shared_ptr<Array>* out);
52-
5345
} // namespace arrow
5446

5547
#endif // ARROW_BUILDER_H_

cpp/src/arrow/types/json.cc

-37
This file was deleted.

cpp/src/arrow/types/json.h

-36
This file was deleted.

0 commit comments

Comments
 (0)