Skip to content

Commit b49af84

Browse files
committed
Add method to parse date from null-terminated string
1 parent a262cf1 commit b49af84

File tree

4 files changed

+70
-2
lines changed

4 files changed

+70
-2
lines changed

cpp/src/arrow/util/value_parsing.h

+33
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <chrono>
2424
#include <cstddef>
2525
#include <cstdint>
26+
#include <cstring>
2627
#include <limits>
2728
#include <memory>
2829
#include <string>
@@ -561,6 +562,38 @@ static inline bool ParseTimestampStrptime(const char* buf, size_t length,
561562
return true;
562563
}
563564

565+
/// \brief Returns time since the UNIX epoch in the requested unit. Takes null terminated
566+
/// buffer as argument
567+
static inline bool ParseTimestampStrptime(const char* buf,
568+
const char* format, bool ignore_time_in_day,
569+
bool allow_trailing_chars, TimeUnit::type unit,
570+
int64_t* out) {
571+
// NOTE: strptime() is more than 10x faster than arrow_vendored::date::parse().
572+
struct tm result;
573+
memset(&result, 0, sizeof(struct tm));
574+
#ifdef _WIN32
575+
char* ret = arrow_strptime(buf, format, &result);
576+
#else
577+
char* ret = strptime(buf, format, &result);
578+
#endif
579+
if (ret == NULLPTR) {
580+
return false;
581+
}
582+
if (!allow_trailing_chars && static_cast<size_t>(ret - buf) != strlen(buf)) {
583+
return false;
584+
}
585+
// ignore the time part
586+
arrow_vendored::date::sys_seconds secs =
587+
arrow_vendored::date::sys_days(arrow_vendored::date::year(result.tm_year + 1900) /
588+
(result.tm_mon + 1) / result.tm_mday);
589+
if (!ignore_time_in_day) {
590+
secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) +
591+
std::chrono::seconds(result.tm_sec));
592+
}
593+
*out = detail::ConvertTimePoint(secs, unit);
594+
return true;
595+
}
596+
564597
/// \brief Parsing options for timestamps
565598
struct ParseTimestampContext {
566599
TimeUnit::type unit;

cpp/src/gandiva/tests/generate_data.h

+11
Original file line numberDiff line numberDiff line change
@@ -129,4 +129,15 @@ class FastUtf8DataGenerator : public DataGenerator<std::string> {
129129
char cur_char_;
130130
};
131131

132+
/// \brief Generates date strings of the form "1990-0M-1D", where M and D are meant
/// to be single digits in [1, 9] (months 01-09, days 11-19).
class Utf8DateDataGenerator : public DataGenerator<std::string> {
 public:
  Utf8DateDataGenerator() {}

  std::string GenerateData() {
    // Take the remainder (not the quotient) so that each component is a single
    // digit; dividing by 9 produces multi-digit numbers and therefore strings
    // that are not dates at all.
    // NOTE(review): assumes random_.next() is non-negative -- confirm against Random.
    return "1990-0" + std::to_string(random_.next() % 9 + 1) + "-1" +
           std::to_string(random_.next() % 9 + 1);
  }

 private:
  Random random_;
};
132143
} // namespace gandiva

cpp/src/gandiva/tests/micro_benchmarks.cc

+25-1
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,30 @@ static void TimedTestInExpr(benchmark::State& state) {
277277
ASSERT_OK(status);
278278
}
279279

280+
// Benchmarks projection of the "to_date" function over generated utf8 date strings.
static void TimedTestToDate(benchmark::State& state) {
  // schema for input fields
  auto field_a = field("a", utf8());
  auto schema = arrow::schema({field_a});
  auto pool = arrow::default_memory_pool();

  // output field
  auto field_result = field("res", arrow::date64());

  // build expression: to_date(a, "YYYY-MM-DD", 1)
  // NOTE(review): the third argument is presumably the suppress-errors flag -- confirm
  // against the to_date function signature.
  auto node_a = TreeExprBuilder::MakeField(field_a);
  auto date_pattern = TreeExprBuilder::MakeStringLiteral("YYYY-MM-DD");
  auto suppress_literal = TreeExprBuilder::MakeLiteral(1);
  auto fn = TreeExprBuilder::MakeFunction(
      "to_date", {node_a, date_pattern, suppress_literal}, arrow::date64());
  auto expr = TreeExprBuilder::MakeExpression(fn, field_result);

  std::shared_ptr<Projector> projector;
  ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector));

  Utf8DateDataGenerator data_generator;
  ProjectEvaluator evaluator(projector);

  Status status = TimedEvaluate<arrow::StringType, std::string>(
      schema, evaluator, data_generator, pool, 1 * MILLION, 16 * THOUSAND, state);
  // Same status check as the Projector::Make call above.
  ASSERT_OK(status);
}
303+
280304
static void DoDecimalAdd3(benchmark::State& state, int32_t precision, int32_t scale,
281305
bool large = false) {
282306
// schema for input fields
@@ -398,6 +422,7 @@ BENCHMARK(TimedTestFilterLike)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
398422
BENCHMARK(TimedTestAllocs)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
399423
BENCHMARK(TimedTestMultiOr)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
400424
BENCHMARK(TimedTestInExpr)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
425+
BENCHMARK(TimedTestToDate)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
401426
BENCHMARK(DecimalAdd2Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
402427
BENCHMARK(DecimalAdd2LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
403428
BENCHMARK(DecimalAdd2LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
@@ -406,5 +431,4 @@ BENCHMARK(DecimalAdd3Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
406431
BENCHMARK(DecimalAdd3LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
407432
BENCHMARK(DecimalAdd3LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
408433
BENCHMARK(DecimalAdd3Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
409-
410434
} // namespace gandiva

cpp/src/gandiva/to_date_holder.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ int64_t ToDateHolder::operator()(ExecutionContext* context, const std::string& d
8484
// 2. does not process time in format +08:00 (or) id.
8585
int64_t seconds_since_epoch = 0;
8686
if (!::arrow::internal::ParseTimestampStrptime(
87-
data.c_str(), data.length(), pattern_.c_str(),
87+
data.c_str(), pattern_.c_str(),
8888
/*ignore_time_in_day=*/true, /*allow_trailing_chars=*/true,
8989
::arrow::TimeUnit::SECOND, &seconds_since_epoch)) {
9090
return_error(context, data);

0 commit comments

Comments
 (0)