Skip to content

Commit c39b8a6

Browse files
committed
ARROW-18235: [C++][Gandiva] Fix the like function implementation for escape chars (apache#14579)
The current optimisation for the like function removes every occurrence of the escape char from the pattern. This causes matching errors when the escape char is one of the PCRE special chars or when the escape char is followed by itself. Fix this by removing only the '\\' escape char during optimisation. Authored-by: Siddhant Rao <siddhant.rao@dremio.com> Signed-off-by: Sutou Kouhei <kou@clear-code.com>
1 parent c3c98b8 commit c39b8a6

File tree

3 files changed

+45
-10
lines changed

3 files changed

+45
-10
lines changed

cpp/src/gandiva/regex_functions_holder.cc

+1-8
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,7 @@ RE2 LikeHolder::ends_with_regex_(R"(\.\*([^\.\*])*)");
2727
RE2 LikeHolder::is_substr_regex_(R"(\.\*([^\.\*])*\.\*)");
2828

2929
std::string& RemovePatternEscapeChars(const FunctionNode& node, std::string& pattern) {
30-
if (node.children().size() != 2) {
31-
auto escape_char = dynamic_cast<LiteralNode*>(node.children().at(2).get());
32-
pattern.erase(std::remove(pattern.begin(), pattern.end(),
33-
arrow::util::get<std::string>(escape_char->holder()).at(0)),
34-
pattern.end()); // remove escape chars
35-
} else {
36-
pattern.erase(std::remove(pattern.begin(), pattern.end(), '\\'), pattern.end());
37-
}
30+
pattern.erase(std::remove(pattern.begin(), pattern.end(), '\\'), pattern.end());
3831
return pattern;
3932
}
4033

cpp/src/gandiva/regex_functions_holder_test.cc

+11-2
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class TestLikeHolder : public ::testing::Test {
3939
auto pattern_node =
4040
std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
4141
auto escape_char_node = std::make_shared<LiteralNode>(
42-
arrow::int8(), LiteralHolder((int8_t)escape_char), false);
42+
arrow::utf8(), LiteralHolder(std::string(1, escape_char)), false);
4343
return FunctionNode("like", {field, pattern_node, escape_char_node},
4444
arrow::boolean());
4545
}
@@ -177,7 +177,16 @@ TEST_F(TestLikeHolder, TestOptimise) {
177177
fnode = LikeHolder::TryOptimize(BuildLike("\\%xyz", '\\'));
178178
EXPECT_EQ(fnode.descriptor()->name(), "like");
179179
EXPECT_EQ(fnode.ToString(),
180-
"bool like((string) in, (const string) '\\%xyz', (const int8) \\)");
180+
"bool like((string) in, (const string) '\\%xyz', (const string) '\\')");
181+
182+
// optimised for escape pattern that are pcre special chars.
183+
fnode = LikeHolder::TryOptimize(BuildLike("%ab^_cd^_de%", '^'));
184+
EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
185+
EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) 'ab_cd_de')");
186+
187+
fnode = LikeHolder::TryOptimize(BuildLike("%ab^^cd^^de%", '^'));
188+
EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
189+
EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) 'ab^cd^de')");
181190
}
182191

183192
TEST_F(TestLikeHolder, TestMatchOneEscape) {

cpp/src/gandiva/tests/filter_test.cc

+33
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,39 @@ TEST_F(TestFilter, TestLike) {
414414

415415
// Validate results
416416
EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
417+
418+
auto literal_escape_pattern =
419+
TreeExprBuilder::MakeStringLiteral("%tu^_security^_freeze%");
420+
auto escape_char = TreeExprBuilder::MakeStringLiteral("^");
421+
like_func = TreeExprBuilder::MakeFunction(
422+
"like", {node_f0, literal_escape_pattern, escape_char}, boolean());
423+
424+
condition = TreeExprBuilder::MakeCondition(like_func);
425+
426+
status = Filter::Make(schema, condition, TestConfiguration(), &filter);
427+
EXPECT_TRUE(status.ok());
428+
429+
// Create a row-batch with some sample data
430+
num_records = 5;
431+
array0 = MakeArrowArrayUtf8(
432+
{"AAAtu_security_freezeBBB", "hello", "bye", "abc-x", "AAAtusecurityfreezeBBB"},
433+
{true, true, true, true, true});
434+
435+
// expected output (indices for which condition matches)
436+
exp = MakeArrowArrayUint16({0});
437+
438+
// prepare input record batch
439+
in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
440+
441+
status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
442+
EXPECT_TRUE(status.ok());
443+
444+
// Evaluate expression
445+
status = filter->Evaluate(*in_batch, selection_vector);
446+
EXPECT_TRUE(status.ok());
447+
448+
// Validate results
449+
EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
417450
}
418451

419452
} // namespace gandiva

0 commit comments

Comments
 (0)