1 | #include "SensitiveDataMasker.h" |
2 | |
3 | #include <set> |
4 | #include <string> |
5 | #include <atomic> |
6 | |
7 | #include <re2/re2.h> |
8 | #include <re2/stringpiece.h> |
9 | |
10 | #include <Poco/Util/AbstractConfiguration.h> |
11 | |
12 | #include <common/logger_useful.h> |
13 | |
14 | #include <Common/Exception.h> |
15 | #include <Common/StringUtils/StringUtils.h> |
16 | |
17 | #ifndef NDEBUG |
18 | # include <iostream> |
19 | #endif |
20 | |
21 | |
22 | namespace DB |
23 | { |
/// Error codes used by this file. They are only declared here (extern);
/// the definitions live elsewhere in the project.
namespace ErrorCodes
{
    extern const int CANNOT_COMPILE_REGEXP;     /// Thrown when a rule's regexp is rejected by RE2.
    extern const int LOGICAL_ERROR;             /// Thrown when setInstance() receives an empty pointer.
    extern const int NO_ELEMENTS_IN_CONFIG;     /// Thrown when a rule has a missing or empty <regexp>.
    extern const int INVALID_CONFIG_PARAMETER;  /// Thrown when two rules share the same name.
}
31 | |
32 | class SensitiveDataMasker::MaskingRule |
33 | { |
34 | private: |
35 | const std::string name; |
36 | const std::string replacement_string; |
37 | const std::string regexp_string; |
38 | |
39 | const RE2 regexp; |
40 | const re2::StringPiece replacement; |
41 | |
42 | #ifndef NDEBUG |
43 | mutable std::atomic<std::uint64_t> matches_count = 0; |
44 | #endif |
45 | |
46 | public: |
47 | //* TODO: option with hyperscan? https://software.intel.com/en-us/articles/why-and-how-to-replace-pcre-with-hyperscan |
48 | // re2::set should also work quite fast, but it doesn't return the match position, only which regexp was matched |
49 | |
50 | MaskingRule(const std::string & name_, const std::string & regexp_string_, const std::string & replacement_string_) |
51 | : name(name_) |
52 | , replacement_string(replacement_string_) |
53 | , regexp_string(regexp_string_) |
54 | , regexp(regexp_string, RE2::Quiet) |
55 | , replacement(replacement_string) |
56 | { |
57 | if (!regexp.ok()) |
58 | throw DB::Exception( |
59 | "SensitiveDataMasker: cannot compile re2: " + regexp_string_ + ", error: " + regexp.error() |
60 | + ". Look at https://github.com/google/re2/wiki/Syntax for reference." , |
61 | DB::ErrorCodes::CANNOT_COMPILE_REGEXP); |
62 | } |
63 | |
64 | uint64_t apply(std::string & data) const |
65 | { |
66 | auto m = RE2::GlobalReplace(&data, regexp, replacement); |
67 | #ifndef NDEBUG |
68 | matches_count += m; |
69 | #endif |
70 | return m; |
71 | } |
72 | |
73 | const std::string & getName() const { return name; } |
74 | const std::string & getReplacementString() const { return replacement_string; } |
75 | #ifndef NDEBUG |
76 | uint64_t getMatchesCount() const { return matches_count; } |
77 | #endif |
78 | |
79 | }; |
80 | |
/// Storage for the static instance pointer. It starts out empty; setInstance() is the
/// only place in this file that assigns it, and getInstance() exposes it as a raw pointer.
std::unique_ptr<SensitiveDataMasker> SensitiveDataMasker::sensitive_data_masker = nullptr;
82 | |
83 | void SensitiveDataMasker::setInstance(std::unique_ptr<SensitiveDataMasker> sensitive_data_masker_) |
84 | { |
85 | if (!sensitive_data_masker_) |
86 | throw Exception("Logical error: the 'sensitive_data_masker' is not set" , ErrorCodes::LOGICAL_ERROR); |
87 | |
88 | if (sensitive_data_masker_->rulesCount() > 0) |
89 | { |
90 | sensitive_data_masker = std::move(sensitive_data_masker_); |
91 | } |
92 | } |
93 | |
/// Returns a non-owning pointer to the currently installed instance,
/// or nullptr if no instance has been installed. Ownership stays with
/// the static unique_ptr.
SensitiveDataMasker * SensitiveDataMasker::getInstance()
{
    return sensitive_data_masker.get();
}
98 | |
99 | SensitiveDataMasker::SensitiveDataMasker(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) |
100 | { |
101 | Poco::Util::AbstractConfiguration::Keys keys; |
102 | config.keys(config_prefix, keys); |
103 | Logger * logger = &Logger::get("SensitiveDataMaskerConfigRead" ); |
104 | |
105 | std::set<std::string> used_names; |
106 | |
107 | for (const auto & rule : keys) |
108 | { |
109 | if (startsWith(rule, "rule" )) |
110 | { |
111 | auto rule_config_prefix = config_prefix + "." + rule; |
112 | |
113 | auto rule_name = config.getString(rule_config_prefix + ".name" , rule_config_prefix); |
114 | |
115 | if (!used_names.insert(rule_name).second) |
116 | { |
117 | throw Exception( |
118 | "query_masking_rules configuration contains more than one rule named '" + rule_name + "'." , |
119 | ErrorCodes::INVALID_CONFIG_PARAMETER); |
120 | } |
121 | |
122 | auto regexp = config.getString(rule_config_prefix + ".regexp" , "" ); |
123 | |
124 | if (regexp.empty()) |
125 | { |
126 | throw Exception( |
127 | "query_masking_rules configuration, rule '" + rule_name + "' has no <regexp> node or <regexp> is empty." , |
128 | ErrorCodes::NO_ELEMENTS_IN_CONFIG); |
129 | } |
130 | |
131 | auto replace = config.getString(rule_config_prefix + ".replace" , "******" ); |
132 | |
133 | try |
134 | { |
135 | addMaskingRule(rule_name, regexp, replace); |
136 | } |
137 | catch (DB::Exception & e) |
138 | { |
139 | e.addMessage("while adding query masking rule '" + rule_name + "'." ); |
140 | throw; |
141 | } |
142 | } |
143 | else |
144 | { |
145 | LOG_WARNING(logger, "Unused param " << config_prefix << '.' << rule); |
146 | } |
147 | } |
148 | |
149 | auto rules_count = rulesCount(); |
150 | if (rules_count > 0) |
151 | { |
152 | LOG_INFO(logger, rules_count << " query masking rules loaded." ); |
153 | } |
154 | } |
155 | |
156 | SensitiveDataMasker::~SensitiveDataMasker() {} |
157 | |
158 | void SensitiveDataMasker::addMaskingRule( |
159 | const std::string & name, const std::string & regexp_string, const std::string & replacement_string) |
160 | { |
161 | all_masking_rules.push_back(std::make_unique<MaskingRule>(name, regexp_string, replacement_string)); |
162 | } |
163 | |
164 | |
165 | size_t SensitiveDataMasker::wipeSensitiveData(std::string & data) const |
166 | { |
167 | size_t matches = 0; |
168 | for (auto & rule : all_masking_rules) |
169 | matches += rule->apply(data); |
170 | return matches; |
171 | } |
172 | |
173 | #ifndef NDEBUG |
174 | void SensitiveDataMasker::printStats() |
175 | { |
176 | for (auto & rule : all_masking_rules) |
177 | { |
178 | std::cout << rule->getName() << " (replacement to " << rule->getReplacementString() << ") matched " << rule->getMatchesCount() |
179 | << " times" << std::endl; |
180 | } |
181 | } |
182 | #endif |
183 | |
/// Returns the number of masking rules currently held by this masker.
size_t SensitiveDataMasker::rulesCount() const
{
    return all_masking_rules.size();
}
188 | |
189 | } |
190 | |