Skip to content

Commit 6a43921

Browse files
authored
Add conv function (apache#114)
* Initial commit * Consider negative input and add more test cases * Use kResultNullInternal and consider more spark test cases * Use unsigned long in the converting * Return null for empty input
1 parent b77c527 commit 6a43921

File tree

4 files changed

+151
-1
lines changed

4 files changed

+151
-1
lines changed

cpp/src/gandiva/function_registry_string.cc

+4
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
325325

326326
NativeFunction("url_decoder", {}, DataTypeVector{utf8()}, utf8(),
327327
kResultNullIfNull, "url_decoder",
328+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
329+
330+
NativeFunction("conv", {}, DataTypeVector{utf8(), int32(), int32()}, utf8(),
331+
kResultNullInternal, "conv",
328332
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)};
329333

330334
return string_fn_registry_;

cpp/src/gandiva/precompiled/string_ops.cc

+77-1
Original file line numberDiff line numberDiff line change
@@ -1568,4 +1568,80 @@ const char* url_decoder(gdv_int64 context, const char* input, gdv_int32 input_le
15681568
return out_str;
15691569
}
15701570

1571-
} // extern "C"
1571+
// Converts the number written in `input` from base `from_base` to base `to_base`
// and returns its textual representation (digits above 9 are 'A'..'Z').
//
// Parameters:
//   context    - execution context used for arena allocation and error reporting.
//   input      - text of the number; a leading '-' marks a negative value.
//   input_len  - length of `input`; an empty input yields a null result.
//   in1_valid, in2_valid, in3_valid - validity flags of input/from_base/to_base.
//   from_base  - source base, must be within [2, 36].
//   to_base    - target base, |to_base| must be within [2, 36]. A negative
//                to_base produces a signed result ("-F"); a positive one renders
//                a negative input as its unsigned two's complement.
//   out_valid  - set to false when the result is null, true otherwise.
//   out_len    - set to the length of the returned string (0 for null).
//
// Returns a pointer to arena-allocated characters (not NUL-terminated), or ""
// with *out_valid == false on null input, out-of-range base or allocation failure.
FORCE_INLINE
const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, bool in1_valid,
                 gdv_int32 from_base, bool in2_valid, gdv_int32 to_base, bool in3_valid,
                 bool* out_valid, gdv_int32* out_len) {
  if (!in1_valid || !in2_valid || !in3_valid || input_len == 0) {
    *out_len = 0;
    *out_valid = false;
    return "";
  }

  // Consistent with spark, only support base belonging to [2, 36].
  const int MIN_BASE = 2;
  const int MAX_BASE = 36;
  // The range of to_base is checked on the signed value so that it is never
  // negated before it is known to be small (negating INT_MIN would overflow).
  if (from_base < MIN_BASE || from_base > MAX_BASE || to_base < -MAX_BASE ||
      to_base > MAX_BASE || (to_base > -MIN_BASE && to_base < MIN_BASE)) {
    *out_len = 0;
    *out_valid = false;
    return "";
  }
  const bool signed_output = to_base < 0;
  const unsigned long radix = static_cast<unsigned long>(signed_output ? -to_base : to_base);

  // NOTE(review): strtoul stops at the first character that is not a digit of
  // from_base and does not honour input_len, so `input` has to be NUL-terminated
  // (or followed by a non-digit). Existing callers/tests rely on it reading past
  // input_len, so this is left unchanged here - confirm and bound the parse.
  const bool is_negative_input = input[0] == '-';
  unsigned long value = strtoul(is_negative_input ? input + 1 : input, nullptr, from_base);

  bool has_negative_mark = false;
  if (is_negative_input && value != 0) {
    if (signed_output) {
      has_negative_mark = true;
    } else {
      // Unsigned arithmetic wraps modulo 2^N, giving the two's complement.
      value = 0UL - value;
    }
  }

  // Worst case: one digit per bit (base 2) plus the sign character.
  char reversed[sizeof(unsigned long) * 8 + 1];
  int count = 0;
  // do/while so that a zero value is rendered as "0" instead of an empty string.
  do {
    const int remainder = static_cast<int>(value % radix);
    reversed[count++] =
        static_cast<char>(remainder < 10 ? '0' + remainder : 'A' + (remainder - 10));
    value /= radix;
  } while (value > 0);
  if (has_negative_mark) {
    reversed[count++] = '-';
  }

  char* out_str = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, count));
  if (out_str == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    *out_valid = false;
    return "";
  }
  // Digits were produced least-significant first; emit them in reading order.
  for (int i = 0; i < count; ++i) {
    out_str[i] = reversed[count - 1 - i];
  }
  *out_len = count;
  *out_valid = true;
  return out_str;
}
1646+
1647+
} // extern "C"

cpp/src/gandiva/precompiled/string_ops_test.cc

+66
Original file line numberDiff line numberDiff line change
@@ -1142,4 +1142,70 @@ TEST(TestStringOps, TestURLDecoder) {
11421142
EXPECT_EQ(std::string(out_str, out_len), exp_str);
11431143
}
11441144

1145+
// Checks conv() for plain base conversions, saturation on overflow, signed
// handling, partially valid input and the null result for empty input.
TEST(TestStringOps, TestConv) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  gdv_int32 out_len = 0;
  bool out_valid;

  // 10-base to 2-base; this case also verifies the validity flag.
  const char* converted =
      conv(ctx_ptr, "4", 1, true, 10, true, 2, true, &out_valid, &out_len);
  EXPECT_EQ(out_len, 3);
  EXPECT_EQ(out_valid, true);
  EXPECT_EQ(std::string(converted, out_len), "100");

  struct ConvCase {
    const char* input;
    gdv_int32 input_len;
    gdv_int32 from_base;
    gdv_int32 to_base;
    std::string expected;
  };

  const std::string large_input = "9223372036854775807";
  const ConvCase cases[] = {
      // 2-base to 10-base
      {"110", 3, 2, 10, "6"},
      // 10-base to 16-base
      {"15", 2, 10, 16, "F"},
      // 36-base to 16-base
      {"big", 3, 36, 16, "3A48"},
      // 36-base input too large for 64 bits: expect the all-ones value.
      {large_input.c_str(), static_cast<gdv_int32>(large_input.length()), 36, 16,
       "FFFFFFFFFFFFFFFF"},
      // Space is contained in input string.
      {" 15 ", 2, 10, 16, "F"},
      // Negative input and negative to_base.
      {"-15", 3, 10, -16, "-F"},
      // Negative input and positive to_base
      {"-15", 3, 10, 16, "FFFFFFFFFFFFFFF1"},
      // Negative input and negative base.
      {"-10", 3, 16, -10, "-16"},
      // If there is an invalid digit in the number, the longest
      // valid prefix should be converted.
      {"11abc", 5, 10, 16, "B"},
  };

  for (const ConvCase& c : cases) {
    converted = conv(ctx_ptr, c.input, c.input_len, true, c.from_base, true, c.to_base,
                     true, &out_valid, &out_len);
    EXPECT_EQ(out_len, static_cast<gdv_int32>(c.expected.length()));
    EXPECT_EQ(std::string(converted, out_len), c.expected);
  }

  // Should return null for Empty input.
  converted = conv(ctx_ptr, "", 0, true, 10, true, 16, true, &out_valid, &out_len);
  EXPECT_EQ(out_valid, false);
}
1210+
11451211
} // namespace gandiva

cpp/src/gandiva/precompiled/types.h

+4
Original file line numberDiff line numberDiff line change
@@ -517,4 +517,8 @@ double castFLOAT8_utf8(int64_t context, const char* data, int32_t len);
517517

518518
const char* url_decoder(gdv_int64 context, const char* input, gdv_int32 input_len, gdv_int32* out_len);
519519

520+
// Converts the number written in `input` from base `from_base` to base `to_base`.
// in1_valid/in2_valid/in3_valid carry the validity of input, from_base and
// to_base; the validity and length of the returned string are written to
// `out_valid` and `out_len`.
const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, bool in1_valid,
521+
gdv_int32 from_base, bool in2_valid, gdv_int32 to_base, bool in3_valid,
522+
bool* out_valid, gdv_int32* out_len);
523+
520524
} // extern "C"

0 commit comments

Comments
 (0)