Skip to content

Commit 368217f

Browse files
rui-mo authored and zhztheplayer committed
Fix data and timestamp functions (apache#29)
* fix out_of_range error in castTIMESTAMP_date32
* support unix_date_seconds
* add castDATE_nullsafe_utf8
* fix castTIMESTAMP_utf8 exception on milliseconds
* make castTIMESTAMP_withCarrying null-safe
1 parent 1fee193 commit 368217f

File tree

3 files changed

+226
-1
lines changed

3 files changed

+226
-1
lines changed

cpp/src/gandiva/function_registry_datetime.cc

+11
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,18 @@ std::vector<NativeFunction> GetDateTimeFunctionRegistry() {
6161
"castDATE_utf8",
6262
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
6363

64+
NativeFunction("castDATE_nullsafe", {}, DataTypeVector{utf8()}, date64(),
65+
kResultNullInternal, "castDATE_nullsafe_utf8",
66+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
67+
6468
NativeFunction("castTIMESTAMP", {}, DataTypeVector{utf8()}, timestamp(),
6569
kResultNullIfNull, "castTIMESTAMP_utf8",
6670
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
6771

72+
NativeFunction("castTIMESTAMP_withCarrying", {}, DataTypeVector{utf8()}, timestamp(),
73+
kResultNullInternal, "castTIMESTAMP_withCarrying_utf8",
74+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
75+
6876
NativeFunction("castVARCHAR", {}, DataTypeVector{date32(), int64()}, utf8(),
6977
kResultNullIfNull, "castVARCHAR_date32_int64",
7078
NativeFunction::kNeedsContext),
@@ -159,6 +167,9 @@ std::vector<NativeFunction> GetDateTimeFunctionRegistry() {
159167
NativeFunction("unix_date", {}, DataTypeVector{date32()}, int32(),
160168
kResultNullIfNull, "unix_date_date32"),
161169

170+
NativeFunction("unix_date_seconds", {}, DataTypeVector{date32()}, int64(),
171+
kResultNullIfNull, "unix_date_seconds_date32"),
172+
162173
NativeFunction("unix_seconds", {}, DataTypeVector{timestampusutc()}, int64(),
163174
kResultNullIfNull, "unix_seconds_timestampusutc"),
164175

cpp/src/gandiva/precompiled/time.cc

+209-1
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,71 @@ gdv_date64 castDATE_utf8(int64_t context, const char* input, gdv_int32 length) {
622622
.count();
623623
}
624624

625+
// This function Will set result to be null if input is invalid, instead of throwing error.
626+
gdv_date64 castDATE_nullsafe_utf8(int64_t context, const char* input, gdv_int32 length,
627+
bool in_valid, bool* out_valid) {
628+
if (!in_valid) {
629+
*out_valid = false;
630+
return 0;
631+
}
632+
using arrow_vendored::date::day;
633+
using arrow_vendored::date::month;
634+
using arrow_vendored::date::sys_days;
635+
using arrow_vendored::date::year;
636+
using arrow_vendored::date::year_month_day;
637+
using gandiva::TimeFields;
638+
// format : 0 is year, 1 is month and 2 is day.
639+
int dateFields[3];
640+
int dateIndex = 0, index = 0, value = 0;
641+
int year_str_len = 0;
642+
while (dateIndex < 3 && index < length) {
643+
if (!isdigit(input[index])) {
644+
dateFields[dateIndex++] = value;
645+
value = 0;
646+
} else {
647+
value = (value * 10) + (input[index] - '0');
648+
if (dateIndex == TimeFields::kYear) {
649+
year_str_len++;
650+
}
651+
}
652+
index++;
653+
}
654+
655+
if (dateIndex < 3) {
656+
// If we reached the end of input, we would have not encountered a separator
657+
// store the last value
658+
dateFields[dateIndex++] = value;
659+
}
660+
const char* msg = "Not a valid date value ";
661+
if (dateIndex != 3) {
662+
*out_valid = false;
663+
return 0;
664+
}
665+
666+
/* Handle two digit years
667+
* If range of two digits is between 70 - 99 then year = 1970 - 1999
668+
* Else if two digits is between 00 - 69 = 2000 - 2069
669+
*/
670+
if (dateFields[TimeFields::kYear] < 100 && year_str_len < 4) {
671+
if (dateFields[TimeFields::kYear] < 70) {
672+
dateFields[TimeFields::kYear] += 2000;
673+
} else {
674+
dateFields[TimeFields::kYear] += 1900;
675+
}
676+
}
677+
year_month_day date = year(dateFields[TimeFields::kYear]) /
678+
month(dateFields[TimeFields::kMonth]) /
679+
day(dateFields[TimeFields::kDay]);
680+
if (!date.ok()) {
681+
*out_valid = false;
682+
return 0;
683+
}
684+
*out_valid = true;
685+
return std::chrono::time_point_cast<std::chrono::milliseconds>(sys_days(date))
686+
.time_since_epoch()
687+
.count();
688+
}
689+
625690
const char* castVARCHAR_date32_int64(gdv_int64 context, gdv_date32 in_day,
626691
gdv_int64 length, gdv_int32* out_len) {
627692
gdv_timestamp in = castDATE_date32(in_day);
@@ -798,6 +863,142 @@ gdv_timestamp castTIMESTAMP_utf8(int64_t context, const char* input, gdv_int32 l
798863
return std::chrono::time_point_cast<milliseconds>(date_time).time_since_epoch().count();
799864
}
800865

866+
/*
867+
* Input consists of mandatory and optional fields.
868+
* Mandatory fields are year, month and day.
869+
* Optional fields are time, displacement and zone.
870+
* Format is <year-month-day>[ hours:minutes:seconds][.millis][ displacement|zone]
871+
* This function will conduct carrying when the length of ms is greater than 3.
872+
*/
873+
gdv_timestamp castTIMESTAMP_withCarrying_utf8(int64_t context, const char* input,
874+
gdv_int32 length, bool in_valid,
875+
bool* out_valid) {
876+
if (!in_valid) {
877+
*out_valid = false;
878+
return 0;
879+
}
880+
*out_valid = true;
881+
using arrow_vendored::date::day;
882+
using arrow_vendored::date::month;
883+
using arrow_vendored::date::sys_days;
884+
using arrow_vendored::date::year;
885+
using arrow_vendored::date::year_month_day;
886+
using gandiva::TimeFields;
887+
using std::chrono::hours;
888+
using std::chrono::milliseconds;
889+
using std::chrono::minutes;
890+
using std::chrono::seconds;
891+
892+
int ts_fields[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
893+
gdv_boolean add_displacement = true;
894+
gdv_boolean encountered_zone = false;
895+
int year_str_len = 0, sub_seconds_len = 0;
896+
int ts_field_index = TimeFields::kYear, index = 0, value = 0;
897+
while (ts_field_index < TimeFields::kMax && index < length) {
898+
if (isdigit(input[index])) {
899+
value = (value * 10) + (input[index] - '0');
900+
if (ts_field_index == TimeFields::kYear) {
901+
year_str_len++;
902+
}
903+
if (ts_field_index == TimeFields::kSubSeconds) {
904+
sub_seconds_len++;
905+
}
906+
} else {
907+
ts_fields[ts_field_index] = value;
908+
value = 0;
909+
910+
switch (input[index]) {
911+
case '.':
912+
case ':':
913+
case ' ':
914+
ts_field_index++;
915+
break;
916+
case '+':
917+
// +08:00, means time zone is 8 hours ahead. Need to subtract.
918+
add_displacement = false;
919+
ts_field_index = TimeFields::kDisplacementHours;
920+
break;
921+
case '-':
922+
// Overloaded as date separator and negative displacement.
923+
ts_field_index = (ts_field_index < 3) ? (ts_field_index + 1)
924+
: TimeFields::kDisplacementHours;
925+
break;
926+
default:
927+
encountered_zone = true;
928+
break;
929+
}
930+
}
931+
if (encountered_zone) {
932+
break;
933+
}
934+
index++;
935+
}
936+
937+
// Store the last value
938+
if (ts_field_index < TimeFields::kMax) {
939+
ts_fields[ts_field_index++] = value;
940+
}
941+
942+
// adjust the year
943+
if (ts_fields[TimeFields::kYear] < 100 && year_str_len < 4) {
944+
if (ts_fields[TimeFields::kYear] < 70) {
945+
ts_fields[TimeFields::kYear] += 2000;
946+
} else {
947+
ts_fields[TimeFields::kYear] += 1900;
948+
}
949+
}
950+
951+
// adjust the milliseconds
952+
if (sub_seconds_len > 0) {
953+
if (ts_fields[TimeFields::kSubSeconds] < 1000) {
954+
while (sub_seconds_len < 3) {
955+
ts_fields[TimeFields::kSubSeconds] *= 10;
956+
sub_seconds_len++;
957+
}
958+
}
959+
}
960+
// handle timezone
961+
if (encountered_zone) {
962+
int err = 0;
963+
gdv_timestamp ret_time = 0;
964+
err = gdv_fn_time_with_zone(&ts_fields[0], (input + index), (length - index),
965+
&ret_time);
966+
if (err) {
967+
const char* msg = "Invalid timestamp or unknown zone for timestamp value ";
968+
set_error_for_date(length, input, msg, context);
969+
return 0;
970+
}
971+
return ret_time;
972+
}
973+
974+
year_month_day date = year(ts_fields[TimeFields::kYear]) /
975+
month(ts_fields[TimeFields::kMonth]) /
976+
day(ts_fields[TimeFields::kDay]);
977+
if (!date.ok()) {
978+
*out_valid = false;
979+
return 0;
980+
}
981+
982+
if (!is_valid_time(ts_fields[TimeFields::kHours], ts_fields[TimeFields::kMinutes],
983+
ts_fields[TimeFields::kSeconds])) {
984+
*out_valid = false;
985+
return 0;
986+
}
987+
988+
auto date_time = sys_days(date) + hours(ts_fields[TimeFields::kHours]) +
989+
minutes(ts_fields[TimeFields::kMinutes]) +
990+
seconds(ts_fields[TimeFields::kSeconds]) +
991+
milliseconds(ts_fields[TimeFields::kSubSeconds]);
992+
if (ts_fields[TimeFields::kDisplacementHours] ||
993+
ts_fields[TimeFields::kDisplacementMinutes]) {
994+
auto displacement_time = hours(ts_fields[TimeFields::kDisplacementHours]) +
995+
minutes(ts_fields[TimeFields::kDisplacementMinutes]);
996+
date_time = (add_displacement) ? (date_time + displacement_time)
997+
: (date_time - displacement_time);
998+
}
999+
return std::chrono::time_point_cast<milliseconds>(date_time).time_since_epoch().count();
1000+
}
1001+
8011002
gdv_timestamp castTIMESTAMP_date64(gdv_date64 date_in_millis) { return date_in_millis; }
8021003

8031004
gdv_timestamp castTIMESTAMP_int64(gdv_int64 in) { return in; }
@@ -917,7 +1118,8 @@ gdv_date32 castDATE32_date64(gdv_date64 date_in_millis) {
9171118
}
9181119

9191120
// Converts a date32 day count into a timestamp by multiplying by MILLIS_IN_DAY.
// The day count is widened to 64 bits before the multiplication: performing it (or
// casting the product) in 32 bits truncates the result for all but the smallest
// day counts.
gdv_timestamp castTIMESTAMP_date32(gdv_date32 in_day) {
  return static_cast<int64_t>(in_day) * MILLIS_IN_DAY;
}
9221124

9231125
gdv_date32 castDATE32_timestamp(gdv_timestamp timestamp_in_millis) {
@@ -1067,6 +1269,12 @@ gdv_int32 unix_date_date32(gdv_date32 in) {
10671269
return in;
10681270
}
10691271

1272+
// Converts a date32 day count into seconds (days * SECONDS_IN_HOUR * 24).
// The day count is widened to 64 bits first so the product is computed in 64 bits.
FORCE_INLINE
gdv_int64 unix_date_seconds_date32(gdv_date32 in) {
  const gdv_int64 days = static_cast<gdv_int64>(in);
  return days * SECONDS_IN_HOUR * 24;
}
1277+
10701278
FORCE_INLINE
10711279
gdv_int64 unix_seconds_timestampusutc(gdv_timestamp in) {
10721280
return in / 1000000;

cpp/src/gandiva/precompiled/types.h

+6
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,9 @@ gdv_int32 utf8_last_char_pos(gdv_int64 context, const char* data, gdv_int32 data
411411

412412
gdv_date64 castDATE_utf8(int64_t execution_context, const char* input, gdv_int32 length);
413413

414+
// Null-safe string-to-date cast: receives the input's validity in `in_valid` and
// reports the validity of the result through `out_valid`.
gdv_date64 castDATE_nullsafe_utf8(int64_t execution_context, const char* input,
415+
gdv_int32 length, bool in_valid, bool* out_valid);
416+
414417
gdv_date64 castDATE_int64(gdv_int64 date);
415418

416419
gdv_date64 castDATE_date32(gdv_date32 date);
@@ -419,6 +422,9 @@ gdv_date32 castDATE_int32(gdv_int32 date);
419422

420423
gdv_timestamp castTIMESTAMP_utf8(int64_t execution_context, const char* input,
421424
gdv_int32 length);
425+
// Null-safe string-to-timestamp cast: receives the input's validity in `in_valid`
// and reports the validity of the result through `out_valid`.
gdv_timestamp castTIMESTAMP_withCarrying_utf8(int64_t context, const char* input,
426+
gdv_int32 length, bool in_valid,
427+
bool* out_valid);
422428
gdv_timestamp castTIMESTAMP_date64(gdv_date64);
423429
gdv_timestamp castTIMESTAMP_int64(gdv_int64);
424430
gdv_date64 castDATE_timestamp(gdv_timestamp);

0 commit comments

Comments
 (0)