1#include "SensitiveDataMasker.h"
2
3#include <set>
4#include <string>
5#include <atomic>
6
7#include <re2/re2.h>
8#include <re2/stringpiece.h>
9
10#include <Poco/Util/AbstractConfiguration.h>
11
12#include <common/logger_useful.h>
13
14#include <Common/Exception.h>
15#include <Common/StringUtils/StringUtils.h>
16
17#ifndef NDEBUG
18# include <iostream>
19#endif
20
21
22namespace DB
23{
24namespace ErrorCodes
25{
26 extern const int CANNOT_COMPILE_REGEXP;
27 extern const int LOGICAL_ERROR;
28 extern const int NO_ELEMENTS_IN_CONFIG;
29 extern const int INVALID_CONFIG_PARAMETER;
30}
31
32class SensitiveDataMasker::MaskingRule
33{
34private:
35 const std::string name;
36 const std::string replacement_string;
37 const std::string regexp_string;
38
39 const RE2 regexp;
40 const re2::StringPiece replacement;
41
42#ifndef NDEBUG
43 mutable std::atomic<std::uint64_t> matches_count = 0;
44#endif
45
46public:
47 //* TODO: option with hyperscan? https://software.intel.com/en-us/articles/why-and-how-to-replace-pcre-with-hyperscan
48 // re2::set should also work quite fast, but it doesn't return the match position, only which regexp was matched
49
50 MaskingRule(const std::string & name_, const std::string & regexp_string_, const std::string & replacement_string_)
51 : name(name_)
52 , replacement_string(replacement_string_)
53 , regexp_string(regexp_string_)
54 , regexp(regexp_string, RE2::Quiet)
55 , replacement(replacement_string)
56 {
57 if (!regexp.ok())
58 throw DB::Exception(
59 "SensitiveDataMasker: cannot compile re2: " + regexp_string_ + ", error: " + regexp.error()
60 + ". Look at https://github.com/google/re2/wiki/Syntax for reference.",
61 DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
62 }
63
64 uint64_t apply(std::string & data) const
65 {
66 auto m = RE2::GlobalReplace(&data, regexp, replacement);
67#ifndef NDEBUG
68 matches_count += m;
69#endif
70 return m;
71 }
72
73 const std::string & getName() const { return name; }
74 const std::string & getReplacementString() const { return replacement_string; }
75#ifndef NDEBUG
76 uint64_t getMatchesCount() const { return matches_count; }
77#endif
78
79};
80
81std::unique_ptr<SensitiveDataMasker> SensitiveDataMasker::sensitive_data_masker = nullptr;
82
83void SensitiveDataMasker::setInstance(std::unique_ptr<SensitiveDataMasker> sensitive_data_masker_)
84{
85 if (!sensitive_data_masker_)
86 throw Exception("Logical error: the 'sensitive_data_masker' is not set", ErrorCodes::LOGICAL_ERROR);
87
88 if (sensitive_data_masker_->rulesCount() > 0)
89 {
90 sensitive_data_masker = std::move(sensitive_data_masker_);
91 }
92}
93
94SensitiveDataMasker * SensitiveDataMasker::getInstance()
95{
96 return sensitive_data_masker.get();
97}
98
99SensitiveDataMasker::SensitiveDataMasker(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
100{
101 Poco::Util::AbstractConfiguration::Keys keys;
102 config.keys(config_prefix, keys);
103 Logger * logger = &Logger::get("SensitiveDataMaskerConfigRead");
104
105 std::set<std::string> used_names;
106
107 for (const auto & rule : keys)
108 {
109 if (startsWith(rule, "rule"))
110 {
111 auto rule_config_prefix = config_prefix + "." + rule;
112
113 auto rule_name = config.getString(rule_config_prefix + ".name", rule_config_prefix);
114
115 if (!used_names.insert(rule_name).second)
116 {
117 throw Exception(
118 "query_masking_rules configuration contains more than one rule named '" + rule_name + "'.",
119 ErrorCodes::INVALID_CONFIG_PARAMETER);
120 }
121
122 auto regexp = config.getString(rule_config_prefix + ".regexp", "");
123
124 if (regexp.empty())
125 {
126 throw Exception(
127 "query_masking_rules configuration, rule '" + rule_name + "' has no <regexp> node or <regexp> is empty.",
128 ErrorCodes::NO_ELEMENTS_IN_CONFIG);
129 }
130
131 auto replace = config.getString(rule_config_prefix + ".replace", "******");
132
133 try
134 {
135 addMaskingRule(rule_name, regexp, replace);
136 }
137 catch (DB::Exception & e)
138 {
139 e.addMessage("while adding query masking rule '" + rule_name + "'.");
140 throw;
141 }
142 }
143 else
144 {
145 LOG_WARNING(logger, "Unused param " << config_prefix << '.' << rule);
146 }
147 }
148
149 auto rules_count = rulesCount();
150 if (rules_count > 0)
151 {
152 LOG_INFO(logger, rules_count << " query masking rules loaded.");
153 }
154}
155
156SensitiveDataMasker::~SensitiveDataMasker() {}
157
158void SensitiveDataMasker::addMaskingRule(
159 const std::string & name, const std::string & regexp_string, const std::string & replacement_string)
160{
161 all_masking_rules.push_back(std::make_unique<MaskingRule>(name, regexp_string, replacement_string));
162}
163
164
165size_t SensitiveDataMasker::wipeSensitiveData(std::string & data) const
166{
167 size_t matches = 0;
168 for (auto & rule : all_masking_rules)
169 matches += rule->apply(data);
170 return matches;
171}
172
173#ifndef NDEBUG
174void SensitiveDataMasker::printStats()
175{
176 for (auto & rule : all_masking_rules)
177 {
178 std::cout << rule->getName() << " (replacement to " << rule->getReplacementString() << ") matched " << rule->getMatchesCount()
179 << " times" << std::endl;
180 }
181}
182#endif
183
184size_t SensitiveDataMasker::rulesCount() const
185{
186 return all_masking_rules.size();
187}
188
189}
190