1 | #include <Common/Exception.h> |
2 | #include <Common/SensitiveDataMasker.h> |
3 | #include <Poco/AutoPtr.h> |
4 | #include <Poco/Util/XMLConfiguration.h> |
5 | #include <Poco/XML/XMLException.h> |
6 | |
7 | #pragma GCC diagnostic ignored "-Wsign-compare" |
8 | #ifdef __clang__ |
9 | # pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" |
10 | # pragma clang diagnostic ignored "-Wundef" |
11 | #endif |
12 | |
13 | #include <gtest/gtest.h> |
14 | #include <chrono> |
15 | |
16 | |
namespace DB
{
/// Forward declarations of the error codes this test compares against.
/// They are declared `extern` here, so their definitions (and numeric values)
/// come from elsewhere in the project, not from this file.
namespace ErrorCodes
{
    extern const int CANNOT_COMPILE_REGEXP;
    extern const int NO_ELEMENTS_IN_CONFIG;
    extern const int INVALID_CONFIG_PARAMETER;
}
}
26 | |
27 | |
28 | TEST(Common, SensitiveDataMasker) |
29 | { |
30 | |
31 | Poco::AutoPtr<Poco::Util::XMLConfiguration> empty_xml_config = new Poco::Util::XMLConfiguration(); |
32 | DB::SensitiveDataMasker masker(*empty_xml_config , "" ); |
33 | masker.addMaskingRule("all a letters" , "a+" , "--a--" ); |
34 | masker.addMaskingRule("all b letters" , "b+" , "--b--" ); |
35 | masker.addMaskingRule("all d letters" , "d+" , "--d--" ); |
36 | masker.addMaskingRule("all x letters" , "x+" , "--x--" ); |
37 | masker.addMaskingRule("rule \"d\" result" , "--d--" , "*****" ); // RE2 regexps are applied one-by-one in order |
38 | std::string x = "aaaaaaaaaaaaa bbbbbbbbbb cccc aaaaaaaaaaaa d " ; |
39 | EXPECT_EQ(masker.wipeSensitiveData(x), 5); |
40 | EXPECT_EQ(x, "--a-- --b-- cccc --a-- ***** " ); |
41 | #ifndef NDEBUG |
42 | masker.printStats(); |
43 | #endif |
44 | EXPECT_EQ(masker.wipeSensitiveData(x), 3); |
45 | EXPECT_EQ(x, "----a---- ----b---- cccc ----a---- ***** " ); |
46 | #ifndef NDEBUG |
47 | masker.printStats(); |
48 | #endif |
49 | |
50 | DB::SensitiveDataMasker masker2(*empty_xml_config , "" ); |
51 | masker2.addMaskingRule("hide root password" , "qwerty123" , "******" ); |
52 | masker2.addMaskingRule("hide SSN" , "[0-9]{3}-[0-9]{2}-[0-9]{4}" , "000-00-0000" ); |
53 | masker2.addMaskingRule("hide email" , "[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}" , "hidden@hidden.test" ); |
54 | |
55 | std::string query = "SELECT id FROM mysql('localhost:3308', 'database', 'table', 'root', 'qwerty123') WHERE ssn='123-45-6789' or " |
56 | "email='JonhSmith@secret.domain.test'" ; |
57 | EXPECT_EQ(masker2.wipeSensitiveData(query), 3); |
58 | EXPECT_EQ( |
59 | query, |
60 | "SELECT id FROM mysql('localhost:3308', 'database', 'table', 'root', '******') WHERE " |
61 | "ssn='000-00-0000' or email='hidden@hidden.test'" ); |
62 | |
63 | #ifndef NDEBUG |
64 | // simple benchmark |
65 | auto start = std::chrono::high_resolution_clock::now(); |
66 | constexpr unsigned long int iterations = 200000; |
67 | for (int i = 0; i < iterations; ++i) |
68 | { |
69 | std::string query2 = "SELECT id FROM mysql('localhost:3308', 'database', 'table', 'root', 'qwerty123') WHERE ssn='123-45-6789' or " |
70 | "email='JonhSmith@secret.domain.test'" ; |
71 | masker2.wipeSensitiveData(query2); |
72 | } |
73 | auto finish = std::chrono::high_resolution_clock::now(); |
74 | std::chrono::duration<double> elapsed = finish - start; |
75 | std::cout << "Elapsed time: " << elapsed.count() << "s per " << iterations <<" calls (" << elapsed.count() * 1000000 / iterations << "µs per call)" |
76 | << std::endl; |
77 | // I have: "Elapsed time: 3.44022s per 200000 calls (17.2011µs per call)" |
78 | masker2.printStats(); |
79 | #endif |
80 | |
81 | DB::SensitiveDataMasker maskerbad(*empty_xml_config , "" ); |
82 | |
83 | // gtest has not good way to check exception content, so just do it manually (see https://github.com/google/googletest/issues/952 ) |
84 | try |
85 | { |
86 | maskerbad.addMaskingRule("bad regexp" , "**" , "" ); |
87 | ADD_FAILURE() << "addMaskingRule() should throw an error" << std::endl; |
88 | } |
89 | catch (const DB::Exception & e) |
90 | { |
91 | EXPECT_EQ( |
92 | std::string(e.what()), |
93 | "SensitiveDataMasker: cannot compile re2: **, error: no argument for repetition operator: *. Look at " |
94 | "https://github.com/google/re2/wiki/Syntax for reference." ); |
95 | EXPECT_EQ(e.code(), DB::ErrorCodes::CANNOT_COMPILE_REGEXP); |
96 | } |
97 | /* catch (...) { // not needed, gtest will react unhandled exception |
98 | FAIL() << "ERROR: Unexpected exception thrown: " << std::current_exception << std::endl; // std::current_exception is part of C++11x |
99 | } */ |
100 | |
101 | EXPECT_EQ(maskerbad.rulesCount(), 0); |
102 | EXPECT_EQ(maskerbad.wipeSensitiveData(x), 0); |
103 | |
104 | { |
105 | std::istringstream xml_isteam(R"END(<?xml version="1.0"?> |
106 | <clickhouse> |
107 | <query_masking_rules> |
108 | <rule> |
109 | <name>hide SSN</name><!-- by default: it will use xml path, like query_masking_rules.rule[1] --> |
110 | <regexp>[0-9]{3}-[0-9]{2}-[0-9]{4}</regexp><!-- mandatory --> |
111 | <replace>000-00-0000</replace><!-- by default - six asterisks (******) --> |
112 | </rule> |
113 | <rule> |
114 | <name>hide root password</name> |
115 | <regexp>qwerty123</regexp> |
116 | </rule> |
117 | <rule> |
118 | <regexp>(?i)Ivan</regexp> |
119 | <replace>John</replace> |
120 | </rule> |
121 | <rule> |
122 | <regexp>(?i)Petrov</regexp> |
123 | <replace>Doe</replace> |
124 | </rule> |
125 | <rule> |
126 | <name>hide email</name> |
127 | <regexp>(?i)[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}</regexp> |
128 | <replace>hidden@hidden.test</replace> |
129 | </rule> |
130 | <rule> |
131 | <name>remove selects to bad_words table</name> |
132 | <regexp>^.*bad_words.*$</regexp> |
133 | <replace>[QUERY IS CENSORED]</replace> |
134 | </rule> |
135 | </query_masking_rules> |
136 | </clickhouse>)END" ); |
137 | |
138 | Poco::AutoPtr<Poco::Util::XMLConfiguration> xml_config = new Poco::Util::XMLConfiguration(xml_isteam); |
139 | DB::SensitiveDataMasker masker_xml_based(*xml_config, "query_masking_rules" ); |
140 | std::string top_secret = "The e-mail of IVAN PETROV is kotik1902@sdsdf.test, and the password is qwerty123" ; |
141 | EXPECT_EQ(masker_xml_based.wipeSensitiveData(top_secret), 4); |
142 | EXPECT_EQ(top_secret, "The e-mail of John Doe is hidden@hidden.test, and the password is ******" ); |
143 | |
144 | top_secret = "SELECT * FROM bad_words" ; |
145 | EXPECT_EQ(masker_xml_based.wipeSensitiveData(top_secret), 1); |
146 | EXPECT_EQ(top_secret, "[QUERY IS CENSORED]" ); |
147 | |
148 | #ifndef NDEBUG |
149 | masker_xml_based.printStats(); |
150 | #endif |
151 | } |
152 | |
153 | try |
154 | { |
155 | std::istringstream xml_isteam_bad(R"END(<?xml version="1.0"?> |
156 | <clickhouse> |
157 | <query_masking_rules> |
158 | <rule> |
159 | <name>test</name> |
160 | <regexp>abc</regexp> |
161 | </rule> |
162 | <rule> |
163 | <name>test</name> |
164 | <regexp>abc</regexp> |
165 | </rule> |
166 | </query_masking_rules> |
167 | </clickhouse>)END" ); |
168 | Poco::AutoPtr<Poco::Util::XMLConfiguration> xml_config = new Poco::Util::XMLConfiguration(xml_isteam_bad); |
169 | DB::SensitiveDataMasker masker_xml_based_exception_check(*xml_config, "query_masking_rules" ); |
170 | |
171 | ADD_FAILURE() << "XML should throw an error on bad XML" << std::endl; |
172 | } |
173 | catch (const DB::Exception & e) |
174 | { |
175 | EXPECT_EQ( |
176 | std::string(e.what()), |
177 | "query_masking_rules configuration contains more than one rule named 'test'." ); |
178 | EXPECT_EQ(e.code(), DB::ErrorCodes::INVALID_CONFIG_PARAMETER); |
179 | } |
180 | |
181 | try |
182 | { |
183 | std::istringstream xml_isteam_bad(R"END(<?xml version="1.0"?> |
184 | <clickhouse> |
185 | <query_masking_rules> |
186 | <rule><name>test</name></rule> |
187 | </query_masking_rules> |
188 | </clickhouse>)END" ); |
189 | |
190 | Poco::AutoPtr<Poco::Util::XMLConfiguration> xml_config = new Poco::Util::XMLConfiguration(xml_isteam_bad); |
191 | DB::SensitiveDataMasker masker_xml_based_exception_check(*xml_config, "query_masking_rules" ); |
192 | |
193 | ADD_FAILURE() << "XML should throw an error on bad XML" << std::endl; |
194 | } |
195 | catch (const DB::Exception & e) |
196 | { |
197 | EXPECT_EQ( |
198 | std::string(e.what()), |
199 | "query_masking_rules configuration, rule 'test' has no <regexp> node or <regexp> is empty." ); |
200 | EXPECT_EQ(e.code(), DB::ErrorCodes::NO_ELEMENTS_IN_CONFIG); |
201 | } |
202 | |
203 | try |
204 | { |
205 | std::istringstream xml_isteam_bad(R"END(<?xml version="1.0"?> |
206 | <clickhouse> |
207 | <query_masking_rules> |
208 | <rule><name>test</name><regexp>())(</regexp></rule> |
209 | </query_masking_rules> |
210 | </clickhouse>)END" ); |
211 | |
212 | Poco::AutoPtr<Poco::Util::XMLConfiguration> xml_config = new Poco::Util::XMLConfiguration(xml_isteam_bad); |
213 | DB::SensitiveDataMasker masker_xml_based_exception_check(*xml_config, "query_masking_rules" ); |
214 | |
215 | ADD_FAILURE() << "XML should throw an error on bad XML" << std::endl; |
216 | } |
217 | catch (const DB::Exception & e) |
218 | { |
219 | EXPECT_EQ( |
220 | std::string(e.message()), |
221 | "SensitiveDataMasker: cannot compile re2: ())(, error: missing ): ())(. Look at https://github.com/google/re2/wiki/Syntax for reference.: while adding query masking rule 'test'." |
222 | ); |
223 | EXPECT_EQ(e.code(), DB::ErrorCodes::CANNOT_COMPILE_REGEXP); |
224 | } |
225 | |
226 | } |
227 | |