1 | #include "duckdb/function/scalar/string_functions.hpp" |
2 | #include "duckdb/common/exception.hpp" |
3 | #include "duckdb/common/vector_operations/vector_operations.hpp" |
4 | #include "duckdb/execution/expression_executor.hpp" |
5 | #include "duckdb/planner/expression/bound_function_expression.hpp" |
6 | #include "duckdb/common/vector_operations/unary_executor.hpp" |
7 | #include "duckdb/common/vector_operations/binary_executor.hpp" |
8 | #include "duckdb/common/vector_operations/ternary_executor.hpp" |
9 | #include "utf8proc_wrapper.hpp" |
10 | |
11 | #include "re2/re2.h" |
12 | |
13 | using namespace std; |
14 | |
15 | namespace duckdb { |
16 | |
17 | RegexpMatchesBindData::RegexpMatchesBindData(unique_ptr<RE2> constant_pattern, string range_min, string range_max, |
18 | bool range_success) |
19 | : constant_pattern(std::move(constant_pattern)), range_min(range_min), range_max(range_max), |
20 | range_success(range_success) { |
21 | } |
22 | |
23 | RegexpMatchesBindData::~RegexpMatchesBindData() { |
24 | } |
25 | |
26 | unique_ptr<FunctionData> RegexpMatchesBindData::Copy() { |
27 | return make_unique<RegexpMatchesBindData>(move(constant_pattern), range_min, range_max, range_success); |
28 | } |
29 | |
30 | static inline re2::StringPiece CreateStringPiece(string_t &input) { |
31 | return re2::StringPiece(input.GetData(), input.GetSize()); |
32 | } |
33 | |
34 | struct RegexPartialMatch { |
35 | static inline bool Operation(const re2::StringPiece &input, RE2 &re) { |
36 | return RE2::PartialMatch(input, re); |
37 | } |
38 | }; |
39 | |
40 | struct RegexFullMatch { |
41 | static inline bool Operation(const re2::StringPiece &input, RE2 &re) { |
42 | return RE2::FullMatch(input, re); |
43 | } |
44 | }; |
45 | |
46 | template <class OP> static void regexp_matches_function(DataChunk &args, ExpressionState &state, Vector &result) { |
47 | auto &strings = args.data[0]; |
48 | auto &patterns = args.data[1]; |
49 | |
50 | auto &func_expr = (BoundFunctionExpression &)state.expr; |
51 | auto &info = (RegexpMatchesBindData &)*func_expr.bind_info; |
52 | |
53 | RE2::Options options; |
54 | options.set_log_errors(false); |
55 | |
56 | if (info.constant_pattern) { |
57 | // FIXME: this should be a unary loop |
58 | UnaryExecutor::Execute<string_t, bool, true>(strings, result, args.size(), [&](string_t input) { |
59 | return OP::Operation(CreateStringPiece(input), *info.constant_pattern); |
60 | }); |
61 | } else { |
62 | BinaryExecutor::Execute<string_t, string_t, bool, true>(strings, patterns, result, args.size(), |
63 | [&](string_t input, string_t pattern) { |
64 | RE2 re(CreateStringPiece(pattern), options); |
65 | if (!re.ok()) { |
66 | throw Exception(re.error()); |
67 | } |
68 | return OP::Operation(CreateStringPiece(input), re); |
69 | }); |
70 | } |
71 | } |
72 | |
73 | static unique_ptr<FunctionData> regexp_matches_get_bind_function(BoundFunctionExpression &expr, |
74 | ClientContext &context) { |
75 | // pattern is the second argument. If its constant, we can already prepare the pattern and store it for later. |
76 | assert(expr.children.size() == 2); |
77 | if (expr.children[1]->IsScalar()) { |
78 | Value pattern_str = ExpressionExecutor::EvaluateScalar(*expr.children[1]); |
79 | if (!pattern_str.is_null && pattern_str.type == TypeId::VARCHAR) { |
80 | RE2::Options options; |
81 | options.set_log_errors(false); |
82 | auto re = make_unique<RE2>(pattern_str.str_value, options); |
83 | if (!re->ok()) { |
84 | throw Exception(re->error()); |
85 | } |
86 | |
87 | string range_min, range_max; |
88 | auto range_success = re->PossibleMatchRange(&range_min, &range_max, 1000); |
89 | return make_unique<RegexpMatchesBindData>(move(re), range_min, range_max, range_success); |
90 | } |
91 | } |
92 | return make_unique<RegexpMatchesBindData>(nullptr, "" , "" , false); |
93 | } |
94 | |
95 | static void regexp_replace_function(DataChunk &args, ExpressionState &state, Vector &result) { |
96 | auto &strings = args.data[0]; |
97 | auto &patterns = args.data[1]; |
98 | auto &replaces = args.data[2]; |
99 | |
100 | RE2::Options options; |
101 | options.set_log_errors(false); |
102 | |
103 | TernaryExecutor::Execute<string_t, string_t, string_t, string_t>( |
104 | strings, patterns, replaces, result, args.size(), [&](string_t input, string_t pattern, string_t replace) { |
105 | RE2 re(CreateStringPiece(pattern), options); |
106 | std::string sstring(input.GetData(), input.GetSize()); |
107 | RE2::Replace(&sstring, re, CreateStringPiece(replace)); |
108 | return StringVector::AddString(result, sstring); |
109 | }); |
110 | } |
111 | |
112 | void RegexpFun::RegisterFunction(BuiltinFunctions &set) { |
113 | set.AddFunction(ScalarFunction("regexp_full_match" , {SQLType::VARCHAR, SQLType::VARCHAR}, SQLType::BOOLEAN, |
114 | regexp_matches_function<RegexFullMatch>, false, regexp_matches_get_bind_function)); |
115 | set.AddFunction(ScalarFunction("regexp_matches" , {SQLType::VARCHAR, SQLType::VARCHAR}, SQLType::BOOLEAN, |
116 | regexp_matches_function<RegexPartialMatch>, false, |
117 | regexp_matches_get_bind_function)); |
118 | set.AddFunction(ScalarFunction("regexp_replace" , {SQLType::VARCHAR, SQLType::VARCHAR, SQLType::VARCHAR}, |
119 | SQLType::VARCHAR, regexp_replace_function)); |
120 | } |
121 | |
122 | } // namespace duckdb |
123 | |