From 744c87c2d608189883a3244ce6fa5ccf5035d9e0 Mon Sep 17 00:00:00 2001 From: Quentin Sabah Date: Thu, 27 Feb 2025 12:56:52 +0100 Subject: [PATCH 1/2] fix: regex works incorrectly when using -c option Make sure the c++ code generator emits raw string literals. fixes #2532 --- src/synthesiser/Synthesiser.cpp | 71 ++++++++++++------- tests/evaluation/CMakeLists.txt | 1 + tests/evaluation/issue2532/goterm.csv | 1 + tests/evaluation/issue2532/issue2532.dl | 19 +++++ tests/evaluation/issue2532/issue2532.err | 0 tests/evaluation/issue2532/issue2532.out | 0 tests/evaluation/issue2532/taxon.csv | 3 + .../evaluation/issue2532/taxon_complement.csv | 1 + 8 files changed, 70 insertions(+), 26 deletions(-) create mode 100644 tests/evaluation/issue2532/goterm.csv create mode 100644 tests/evaluation/issue2532/issue2532.dl create mode 100644 tests/evaluation/issue2532/issue2532.err create mode 100644 tests/evaluation/issue2532/issue2532.out create mode 100644 tests/evaluation/issue2532/taxon.csv create mode 100644 tests/evaluation/issue2532/taxon_complement.csv diff --git a/src/synthesiser/Synthesiser.cpp b/src/synthesiser/Synthesiser.cpp index 59e76067174..8133dcb669f 100644 --- a/src/synthesiser/Synthesiser.cpp +++ b/src/synthesiser/Synthesiser.cpp @@ -226,6 +226,25 @@ std::optional Synthesiser::compileRegex(const std::string& pattern) } } +/// Return the C++ string raw literal sequence for the given string. +std::string raw_str(const std::string& str) { + if (str.find(")_\"") == std::string::npos) { + // by default, use the shortest possible delimiter. + return "R\"_(" + str + ")_\""; + } else { + // when the input string contains the shortest possible ending sequence, we + // generate a delimiter based on the string hash value, that is statically + // very unlikely to appear in the string. + std::size_t h = std::hash{}(str); + std::string delim = std::to_string(h); + if (delim.size() > 16) { + delim.resize(16); + } + assert(str.find(")" + delim + "\"") == std::string::npos); + return "R\"" + delim + "(" + str + ")" + delim + "\""; + } +} + void Synthesiser::emitCode(std::ostream& out, const Statement& stmt) { class CodeEmitter : public ram::Visitor { using ram::Visitor::visit_; @@ -343,10 +362,10 @@ void Synthesiser::emitCode(std::ostream& out, const Statement& stmt) { if (cur == registry.end()) { return; } - out << "{{\"" << cur->first << "\",\"" << escape(cur->second) << "\"}"; + out << "{{" << raw_str(cur->first) << "," << raw_str(cur->second) << "}"; ++cur; for (; cur != registry.end(); ++cur) { - out << ",{\"" << cur->first << "\",\"" << escape(cur->second) << "\"}"; + out << ",{" << raw_str(cur->first) << "," << raw_str(cur->second) << "}"; } out << '}'; }; @@ -505,8 +524,8 @@ void Synthesiser::emitCode(std::ostream& out, const Statement& stmt) { void visit_(type_identity, const LogSize& size, std::ostream& out) override { PRINT_BEGIN_COMMENT(out); - out << "ProfileEventSingleton::instance().makeQuantityEvent( R\"("; - out << size.getMessage() << ")\","; + out << "ProfileEventSingleton::instance().makeQuantityEvent("; + out << raw_str(size.getMessage()) << ","; out << synthesiser.getRelationName(synthesiser.lookup(size.getRelation())) << "->size(),iter);"; PRINT_END_COMMENT(out); } @@ -625,7 +644,7 @@ void Synthesiser::emitCode(std::ostream& out, const Statement& stmt) { const auto* rel = synthesiser.lookup(timer.getRelation()); auto relName = synthesiser.getRelationName(rel); - out << "\tLogger logger(R\"_(" << timer.getMessage() << ")_\",iter, [&](){return " << relName + out << "\tLogger logger(" << raw_str(timer.getMessage()) << ",iter, [&](){return " << relName << "->size();});\n"; // insert statement to be measured dispatch(timer.getStatement(), out); @@ -643,7 +662,7 @@ void Synthesiser::emitCode(std::ostream& out, const Statement& stmt) { const std::string ext = fileExtension(glb.config().get("profile")); // create local timer - out << "\tLogger logger(R\"_(" << timer.getMessage() << ")_\",iter);\n"; + out << "\tLogger logger(" << raw_str(timer.getMessage()) << ",iter);\n"; // insert statement to be measured dispatch(timer.getStatement(), out); @@ -654,9 +673,9 @@ void Synthesiser::emitCode(std::ostream& out, const Statement& stmt) { void visit_(type_identity, const DebugInfo& dbg, std::ostream& out) override { PRINT_BEGIN_COMMENT(out); - out << "signalHandler->setMsg(R\"_("; - out << dbg.getMessage(); - out << ")_\");\n"; + out << "signalHandler->setMsg("; + out << raw_str(dbg.getMessage()); + out << ");\n"; // insert statements of the rule dispatch(dbg.getStatement(), out); @@ -2325,7 +2344,7 @@ void Synthesiser::emitCode(std::ostream& out, const Statement& stmt) { } else { out << "symTable.encode("; if (lstr) { - out << "R\"_(" << lstr->getConstant() << ")_\""; + out << raw_str(lstr->getConstant()); } else { out << "symTable.decode("; dispatch(*args[0], out); @@ -2333,7 +2352,7 @@ void Synthesiser::emitCode(std::ostream& out, const Statement& stmt) { } out << " + "; if (rstr) { - out << "R\"_(" << rstr->getConstant() << ")_\""; + out << raw_str(rstr->getConstant()); } else { out << "symTable.decode("; dispatch(*args[1], out); @@ -2795,8 +2814,7 @@ void Synthesiser::generateCode(GenDb& db, const std::string& id, bool& withShare } rst << "{\n"; for (const auto& p : patterns) { - const std::string escaped = escape(p); - rst << "\tstd::regex(\"" << escaped << "\"),\n"; + rst << " std::regex(" << raw_str(p) << "),\n"; } rst << "}"; @@ -2835,7 +2853,7 @@ void Synthesiser::generateCode(GenDb& db, const std::string& id, bool& withShare if (!symbolMap.empty()) { st << "{\n"; for (const auto& x : symbolIndex) { - st << "\tR\"_(" << x << ")_\",\n"; + st << " " << raw_str(x) << ",\n"; } st << "}"; } @@ -3049,10 +3067,10 @@ void Synthesiser::generateCode(GenDb& db, const std::string& id, bool& withShare if (cur == registry.end()) { return; } - o << "{{\"" << cur->first << "\",\"" << escape(cur->second) << "\"}"; + o << "{{" << raw_str(cur->first) << "," << raw_str(cur->second) << "}"; ++cur; for (; cur != registry.end(); ++cur) { - o << ",{\"" << cur->first << "\",\"" << escape(cur->second) << "\"}"; + o << ",{" << raw_str(cur->first) << "," << raw_str(cur->second) << "}"; } o << '}'; }; @@ -3113,7 +3131,7 @@ void Synthesiser::generateCode(GenDb& db, const std::string& id, bool& withShare os << "rwOperation[\"IO\"] = \"stdout\";\n"; os << R"(rwOperation["name"] = ")" << name << "\";\n"; os << "rwOperation[\"types\"] = "; - os << "\"" << escapeJSONstring(types.dump()) << "\""; + os << raw_str(types.dump()); os << ";\n"; os << "IOSystem::getInstance().getWriter("; os << "rwOperation, symTable, recordTable"; @@ -3182,12 +3200,13 @@ void Synthesiser::generateCode(GenDb& db, const std::string& id, bool& withShare dumpFreqs.setRetType("void"); for (auto const& cur : idxMap) { - dumpFreqs.body() << "\tProfileEventSingleton::instance().makeQuantityEvent(R\"_(" << cur.first - << ")_\", freqs[" << cur.second << "],0);\n"; + dumpFreqs.body() << " ProfileEventSingleton::instance().makeQuantityEvent(" << raw_str(cur.first) + << ", freqs[" << cur.second << "],0);\n"; } for (auto const& cur : neIdxMap) { - dumpFreqs.body() << "\tProfileEventSingleton::instance().makeQuantityEvent(R\"_(@relation-reads;" - << cur.first << ")_\", reads[" << cur.second << "],0);\n"; + dumpFreqs.body() << " ProfileEventSingleton::instance().makeQuantityEvent(" + << raw_str("@relation-reads;" + cur.first) << ", reads[" << cur.second + << "],0);\n"; } } @@ -3230,15 +3249,15 @@ void Synthesiser::generateCode(GenDb& db, const std::string& id, bool& withShare // parse arguments hook << "souffle::CmdOptions opt("; - hook << "R\"(" << glb.config().get("") << ")\",\n"; - hook << "R\"()\",\n"; - hook << "R\"()\",\n"; + hook << raw_str(glb.config().get("")) << ",\n"; + hook << raw_str("") << ",\n"; + hook << raw_str("") << ",\n"; if (glb.config().has("profile")) { hook << "true,\n"; - hook << "R\"(" << glb.config().get("profile") << ")\",\n"; + hook << raw_str(glb.config().get("profile")) << ",\n"; } else { hook << "false,\n"; - hook << "R\"()\",\n"; + hook << raw_str("") << ",\n"; } hook << std::stoi(glb.config().get("jobs")); hook << ");\n"; diff --git a/tests/evaluation/CMakeLists.txt b/tests/evaluation/CMakeLists.txt index 4cd6ed5de3c..20aad8b7517 100644 --- a/tests/evaluation/CMakeLists.txt +++ b/tests/evaluation/CMakeLists.txt @@ -171,3 +171,4 @@ positive_test(issue2160) add_subdirectory(issue2508) souffle_positive_functor_test(issue2508 CATEGORY evaluation) +positive_test(issue2532) diff --git a/tests/evaluation/issue2532/goterm.csv b/tests/evaluation/issue2532/goterm.csv new file mode 100644 index 00000000000..bdc4f1ee2e4 --- /dev/null +++ b/tests/evaluation/issue2532/goterm.csv @@ -0,0 +1 @@ + diff --git a/tests/evaluation/issue2532/issue2532.dl b/tests/evaluation/issue2532/issue2532.dl new file mode 100644 index 00000000000..8637a069b85 --- /dev/null +++ b/tests/evaluation/issue2532/issue2532.dl @@ -0,0 +1,19 @@ +.decl term(id: symbol) +.decl goterm(id: symbol) +.decl taxon(id: symbol) +.decl taxon_complement(id: symbol, taxon: symbol) + +term(""). +term(""). +term(""). +term(""). +term(""). +term(")_\""). + +goterm(id) :- term(id), match("", id). +taxon(id) :- term(id), match("", id). +taxon_complement(id, taxon) :- term(id), match("", id), len=strlen(id), taxon=cat(substr(id, 0, len-5), ">"). + +.output goterm +.output taxon +.output taxon_complement diff --git a/tests/evaluation/issue2532/issue2532.err b/tests/evaluation/issue2532/issue2532.err new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/evaluation/issue2532/issue2532.out b/tests/evaluation/issue2532/issue2532.out new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/evaluation/issue2532/taxon.csv b/tests/evaluation/issue2532/taxon.csv new file mode 100644 index 00000000000..32daa65e3bf --- /dev/null +++ b/tests/evaluation/issue2532/taxon.csv @@ -0,0 +1,3 @@ + + + diff --git a/tests/evaluation/issue2532/taxon_complement.csv b/tests/evaluation/issue2532/taxon_complement.csv new file mode 100644 index 00000000000..8f2fcac83ec --- /dev/null +++ b/tests/evaluation/issue2532/taxon_complement.csv @@ -0,0 +1 @@ + From 3a833b0c0561afa6298236c0fc4557cdd5f03b9d Mon Sep 17 00:00:00 2001 From: Quentin Sabah Date: Thu, 27 Feb 2025 13:03:09 +0100 Subject: [PATCH 2/2] fixup! fix: regex works incorrectly when using -c option --- src/synthesiser/Synthesiser.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/synthesiser/Synthesiser.cpp b/src/synthesiser/Synthesiser.cpp index 8133dcb669f..d8f603afa17 100644 --- a/src/synthesiser/Synthesiser.cpp +++ b/src/synthesiser/Synthesiser.cpp @@ -228,21 +228,21 @@ std::optional Synthesiser::compileRegex(const std::string& pattern) /// Return the C++ string raw literal sequence for the given string. std::string raw_str(const std::string& str) { - if (str.find(")_\"") == std::string::npos) { - // by default, use the shortest possible delimiter. - return "R\"_(" + str + ")_\""; - } else { - // when the input string contains the shortest possible ending sequence, we - // generate a delimiter based on the string hash value, that is statically - // very unlikely to appear in the string. - std::size_t h = std::hash{}(str); - std::string delim = std::to_string(h); - if (delim.size() > 16) { - delim.resize(16); + if (str.find(")_\"") == std::string::npos) { + // by default, use the shortest possible delimiter. + return "R\"_(" + str + ")_\""; + } else { + // when the input string contains the shortest possible ending sequence, we + // generate a delimiter based on the string hash value, that is statically + // very unlikely to appear in the string. + std::size_t h = std::hash{}(str); + std::string delim = std::to_string(h); + if (delim.size() > 16) { + delim.resize(16); + } + assert(str.find(")" + delim + "\"") == std::string::npos); + return "R\"" + delim + "(" + str + ")" + delim + "\""; } - assert(str.find(")" + delim + "\"") == std::string::npos); - return "R\"" + delim + "(" + str + ")" + delim + "\""; - } } void Synthesiser::emitCode(std::ostream& out, const Statement& stmt) {