/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
   Copyright (c) 2017, MariaDB Corporation.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */

/*
  This code needs extra visibility in the lexer structures
*/

#include "mariadb.h"
#include "my_md5.h"
#include "unireg.h"

#include "sql_string.h"
#include "sql_class.h"
#include "sql_lex.h"
#include "sp_pcontext.h"
#include "sql_digest.h"
#include "sql_digest_stream.h"

#include "sql_get_diagnostics.h"

/* Generated code */
#include "sql_yacc.h"
#define LEX_TOKEN_WITH_DEFINITION
#include "lex_token.h"

/* Name pollution from sql/sql_lex.h */
#ifdef LEX_YYSTYPE
#undef LEX_YYSTYPE
#endif

#define LEX_YYSTYPE YYSTYPE*

#define SIZE_OF_A_TOKEN 2
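
/*
  Each token is stored in the digest buffer as SIZE_OF_A_TOKEN (2) bytes,
  least significant byte first. For example (illustrative values), storing
  the token number 0x0123 appends the bytes {0x23, 0x01}, and read_token()
  below reassembles it as src[0] | (src[1] << 8) == 0x0123.
*/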

/**
  Read a single token from token array.
*/
inline uint read_token(const sql_digest_storage *digest_storage,
                       uint index, uint *tok)
{
  uint safe_byte_count= digest_storage->m_byte_count;

  if (index + SIZE_OF_A_TOKEN <= safe_byte_count &&
      safe_byte_count <= digest_storage->m_token_array_length)
  {
    const unsigned char *src= & digest_storage->m_token_array[index];
    *tok= src[0] | (src[1] << 8);
    return index + SIZE_OF_A_TOKEN;
  }

  /* The input byte stream is exhausted. */
  *tok= 0;
  return MAX_DIGEST_STORAGE_SIZE + 1;
}

/**
  Store a single token in token array.
*/
inline void store_token(sql_digest_storage* digest_storage, uint token)
{
  DBUG_ASSERT(digest_storage->m_byte_count <= digest_storage->m_token_array_length);

  if (digest_storage->m_byte_count + SIZE_OF_A_TOKEN <= digest_storage->m_token_array_length)
  {
    unsigned char* dest= & digest_storage->m_token_array[digest_storage->m_byte_count];
    dest[0]= token & 0xff;
    dest[1]= (token >> 8) & 0xff;
    digest_storage->m_byte_count+= SIZE_OF_A_TOKEN;
  }
  else
  {
    digest_storage->m_full= true;
  }
}
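
/*
  Illustrative round trip (not part of the server code): after
  store_token(ds, TOK_GENERIC_VALUE) succeeds, a subsequent
  read_token(ds, old_byte_count, &tok) yields tok == TOK_GENERIC_VALUE
  and returns old_byte_count + SIZE_OF_A_TOKEN, where old_byte_count is
  the value of ds->m_byte_count before the store.
*/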

/**
  Read an identifier from token array.
*/
inline uint read_identifier(const sql_digest_storage* digest_storage,
                            uint index, char ** id_string, int *id_length)
{
  uint new_index;
  uint safe_byte_count= digest_storage->m_byte_count;

  DBUG_ASSERT(index <= safe_byte_count);
  DBUG_ASSERT(safe_byte_count <= digest_storage->m_token_array_length);

  /*
    The token, the length, and the identifier string are written atomically,
    so a length followed by a string is always expected here.
  */

  uint bytes_needed= SIZE_OF_A_TOKEN;
  /* If we can read token and identifier length */
  if ((index + bytes_needed) <= safe_byte_count)
  {
    const unsigned char *src= & digest_storage->m_token_array[index];
    /* Read the length of identifier */
    uint length= src[0] | (src[1] << 8);
    bytes_needed+= length;
    /* If we can read entire identifier from token array */
    if ((index + bytes_needed) <= safe_byte_count)
    {
      *id_string= (char *) (src + 2);
      *id_length= length;

      new_index= index + bytes_needed;
      DBUG_ASSERT(new_index <= safe_byte_count);
      return new_index;
    }
  }

  /* The input byte stream is exhausted. */
  return MAX_DIGEST_STORAGE_SIZE + 1;
}
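
/*
  Illustrative layout: store_token_identifier() below writes an identifier
  as one record of 2*SIZE_OF_A_TOKEN + length bytes. For the identifier
  "t1" stored with token TOK_IDENT (values shown for illustration only):
    dest[0..1] = TOK_IDENT, least significant byte first
    dest[2..3] = 2 (the string length), least significant byte first
    dest[4..5] = 't', '1'
  read_identifier() above consumes the length and string part of such a
  record, after read_token() has already consumed the leading token.
*/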

/**
  Store an identifier in token array.
*/
inline void store_token_identifier(sql_digest_storage* digest_storage,
                                   uint token,
                                   size_t id_length, const char *id_name)
{
  DBUG_ASSERT(digest_storage->m_byte_count <= digest_storage->m_token_array_length);

  size_t bytes_needed= 2 * SIZE_OF_A_TOKEN + id_length;
  if (digest_storage->m_byte_count + bytes_needed <= (unsigned int)digest_storage->m_token_array_length)
  {
    unsigned char* dest= & digest_storage->m_token_array[digest_storage->m_byte_count];
    /* Write the token */
    dest[0]= token & 0xff;
    dest[1]= (token >> 8) & 0xff;
    /* Write the string length */
    dest[2]= id_length & 0xff;
    dest[3]= (id_length >> 8) & 0xff;
    /* Write the string data */
    if (id_length > 0)
      memcpy((char *)(dest + 4), id_name, id_length);
    digest_storage->m_byte_count+= (uint)bytes_needed;
  }
  else
  {
    digest_storage->m_full= true;
  }
}

void compute_digest_md5(const sql_digest_storage *digest_storage, unsigned char *md5)
{
  compute_md5_hash(md5,
                   (const char *) digest_storage->m_token_array,
                   digest_storage->m_byte_count);
}
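
/*
  Because the hash is computed over the normalised token bytes rather than
  the raw query text, statements that differ only in literal values, such
  as "SELECT * FROM t1 WHERE a = 1" and "SELECT * FROM t1 WHERE a = 2"
  (illustrative queries), produce the same MD5 digest.
*/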

/*
  Iterate over the token array and update digest_text.
*/
void compute_digest_text(const sql_digest_storage* digest_storage,
                         String *digest_text)
{
  DBUG_ASSERT(digest_storage != NULL);
  uint byte_count= digest_storage->m_byte_count;
  String *digest_output= digest_text;
  uint tok= 0;
  uint current_byte= 0;
  lex_token_string *tok_data;

  /* Reset existing data */
  digest_output->length(0);

  if (byte_count > digest_storage->m_token_array_length)
  {
    digest_output->append("\0", 1);
    return;
  }

  /* Convert text to utf8 */
  const CHARSET_INFO *from_cs= get_charset(digest_storage->m_charset_number, MYF(0));
  const CHARSET_INFO *to_cs= &my_charset_utf8_bin;

  if (from_cs == NULL)
  {
    /*
      Can happen, as we do dirty reads on digest_storage,
      which can be written to in another thread.
    */
    digest_output->append("\0", 1);
    return;
  }

  char id_buffer[NAME_LEN + 1]= {'\0'};
  char *id_string;
  size_t id_length;
  bool convert_text= !my_charset_same(from_cs, to_cs);

  while (current_byte < byte_count)
  {
    current_byte= read_token(digest_storage, current_byte, &tok);

    if (tok <= 0 || tok >= array_elements(lex_token_array)
        || current_byte > max_digest_length)
      return;

    tok_data= &lex_token_array[tok];

    switch (tok)
    {
    /* All identifiers are printed with their name. */
    case IDENT:
    case IDENT_QUOTED:
    case TOK_IDENT:
      {
        char *id_ptr= NULL;
        int id_len= 0;
        uint err_cs= 0;

        /* Get the next identifier from the storage buffer. */
        current_byte= read_identifier(digest_storage, current_byte,
                                      &id_ptr, &id_len);
        if (current_byte > max_digest_length)
          return;

        if (convert_text)
        {
          /* Verify that the converted text will fit. */
          if (to_cs->mbmaxlen*id_len > NAME_LEN)
          {
            digest_output->append("...", 3);
            break;
          }
          /* Convert identifier string into the storage character set. */
          id_length= my_convert(id_buffer, NAME_LEN, to_cs,
                                id_ptr, id_len, from_cs, &err_cs);
          id_string= id_buffer;
        }
        else
        {
          id_string= id_ptr;
          id_length= id_len;
        }

        if (id_length == 0 || err_cs != 0)
        {
          break;
        }
        /* Copy the converted identifier into the digest string. */
        digest_output->append("`", 1);
        if (id_length > 0)
          digest_output->append(id_string, id_length);
        digest_output->append("` ", 2);
      }
      break;

    /* Everything else is printed as is. */
    default:
      /*
        Append the token string, followed by a space when the token
        definition asks for one. The String output buffer grows as needed.
      */
      int tok_length= tok_data->m_token_length;

      digest_output->append(tok_data->m_token_string, tok_length);
      if (tok_data->m_append_space)
        digest_output->append(" ", 1);
      break;
    }
  }
}
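
/*
  Illustrative end-to-end example: with the reductions performed by
  digest_add_token() below, a statement such as
    SELECT * FROM t1 WHERE a = 1
  is typically rendered by compute_digest_text() as
    SELECT * FROM `t1` WHERE `a` = ?
  since the literal was reduced to TOK_GENERIC_VALUE (printed as "?") and
  identifiers are printed back-quoted. Exact spacing depends on the
  m_append_space flags in lex_token_array.
*/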

static inline uint peek_token(const sql_digest_storage *digest, uint index)
{
  uint token;
  DBUG_ASSERT(index + SIZE_OF_A_TOKEN <= digest->m_byte_count);
  DBUG_ASSERT(digest->m_byte_count <= digest->m_token_array_length);

  token= ((digest->m_token_array[index + 1])<<8) | digest->m_token_array[index];
  return token;
}

/**
  Read the last two tokens from the token array. If an identifier
  is found, do not look at tokens before it.
*/
static inline void peek_last_two_tokens(const sql_digest_storage* digest_storage,
                                        uint last_id_index, uint *t1, uint *t2)
{
  uint byte_count= digest_storage->m_byte_count;
  uint peek_index= byte_count;

  if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
  {
    /* Take last token. */
    peek_index-= SIZE_OF_A_TOKEN;
    *t1= peek_token(digest_storage, peek_index);

    if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
    {
      /* Take 2nd token from last. */
      peek_index-= SIZE_OF_A_TOKEN;
      *t2= peek_token(digest_storage, peek_index);
    }
    else
    {
      *t2= TOK_UNUSED;
    }
  }
  else
  {
    *t1= TOK_UNUSED;
    *t2= TOK_UNUSED;
  }
}

/**
  Read the last three tokens from the token array. If an identifier
  is found, do not look at tokens before it.
*/
static inline void peek_last_three_tokens(const sql_digest_storage* digest_storage,
                                          uint last_id_index, uint *t1, uint *t2, uint *t3)
{
  uint byte_count= digest_storage->m_byte_count;
  uint peek_index= byte_count;

  if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
  {
    /* Take last token. */
    peek_index-= SIZE_OF_A_TOKEN;
    *t1= peek_token(digest_storage, peek_index);

    if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
    {
      /* Take 2nd token from last. */
      peek_index-= SIZE_OF_A_TOKEN;
      *t2= peek_token(digest_storage, peek_index);

      if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
      {
        /* Take 3rd token from last. */
        peek_index-= SIZE_OF_A_TOKEN;
        *t3= peek_token(digest_storage, peek_index);
      }
      else
      {
        *t3= TOK_UNUSED;
      }
    }
    else
    {
      *t2= TOK_UNUSED;
      *t3= TOK_UNUSED;
    }
  }
  else
  {
    *t1= TOK_UNUSED;
    *t2= TOK_UNUSED;
    *t3= TOK_UNUSED;
  }
}

sql_digest_state* digest_add_token(sql_digest_state *state,
                                   uint token,
                                   LEX_YYSTYPE yylval)
{
  sql_digest_storage *digest_storage= NULL;

  digest_storage= &state->m_digest_storage;

  /*
    Stop collecting further tokens if digest storage is full or
    if END token is received.
  */
  if (digest_storage->m_full || token == END_OF_INPUT)
    return NULL;

  /*
    Look at the last two tokens collected so far. They are used by the
    reduce rules below for normalisation. Identifier tokens are never
    considered for a reduce: last_id_index marks that boundary.
  */
  uint last_token;
  uint last_token2;

  switch (token)
  {
    case NUM:
    case LONG_NUM:
    case ULONGLONG_NUM:
    case DECIMAL_NUM:
    case FLOAT_NUM:
    case BIN_NUM:
    case HEX_NUM:
    {
      bool found_unary;
      do
      {
        found_unary= false;
        peek_last_two_tokens(digest_storage, state->m_last_id_index,
                             &last_token, &last_token2);

        if ((last_token == '-') || (last_token == '+'))
        {
          /*
            We need to differentiate:
            - a <unary minus> operator
            - a <unary plus> operator
            from
            - a <binary minus> operator
            - a <binary plus> operator
            to only reduce "a = -1" to "a = ?", and not change "b - 1" to "b ?"

            Binary operators are found inside an expression,
            while unary operators are found at the beginning of an expression, or after operators.

            To achieve this, every token that is followed by an <expr> expression
            in the SQL grammar is flagged.
            See sql/sql_yacc.yy
            See sql/gen_lex_token.cc

            For example,
            "(-1)" is parsed as "(", "-", NUM, ")", and lex_token_array["("].m_start_expr is true,
            so reduction of the "-" NUM is done, the result is "(?)".
            "(a-1)" is parsed as "(", ID, "-", NUM, ")", and lex_token_array[ID].m_start_expr is false,
            so the operator is binary, no reduction is done, and the result is "(a-?)".
          */
          if (lex_token_array[last_token2].m_start_expr)
          {
            /*
              REDUCE:
              TOK_GENERIC_VALUE := (UNARY_PLUS | UNARY_MINUS) (NUM | LONG_NUM | ... | FLOAT_NUM)

              REDUCE:
              TOK_GENERIC_VALUE := (UNARY_PLUS | UNARY_MINUS) TOK_GENERIC_VALUE
            */
            token= TOK_GENERIC_VALUE;
            digest_storage->m_byte_count-= SIZE_OF_A_TOKEN;
            found_unary= true;
          }
        }
      } while (found_unary);
    }
    /* fall through */
    case LEX_HOSTNAME:
    case TEXT_STRING:
    case NCHAR_STRING:
    case PARAM_MARKER:
    {
      /*
        REDUCE:
        TOK_GENERIC_VALUE := BIN_NUM | DECIMAL_NUM | ... | ULONGLONG_NUM
      */
      token= TOK_GENERIC_VALUE;

      peek_last_two_tokens(digest_storage, state->m_last_id_index,
                           &last_token, &last_token2);

      if ((last_token2 == TOK_GENERIC_VALUE ||
           last_token2 == TOK_GENERIC_VALUE_LIST) &&
          (last_token == ','))
      {
        /*
          REDUCE:
          TOK_GENERIC_VALUE_LIST :=
            TOK_GENERIC_VALUE ',' TOK_GENERIC_VALUE

          REDUCE:
          TOK_GENERIC_VALUE_LIST :=
            TOK_GENERIC_VALUE_LIST ',' TOK_GENERIC_VALUE
        */
        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
        token= TOK_GENERIC_VALUE_LIST;
      }
      /*
        Add this token or the resulting reduce to digest storage.
      */
      store_token(digest_storage, token);
      break;
    }
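    /*
      Worked example (illustrative): for "IN (1, 2, 3)", the first two
      literals and the comma reduce to TOK_GENERIC_VALUE_LIST above, and
      each following ", literal" folds into the same list token, so the
      closing parenthesis below sees '(' TOK_GENERIC_VALUE_LIST ')'.
    */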
    case ')':
    {
      peek_last_two_tokens(digest_storage, state->m_last_id_index,
                           &last_token, &last_token2);

      if (last_token == TOK_GENERIC_VALUE &&
          last_token2 == '(')
      {
        /*
          REDUCE:
          TOK_ROW_SINGLE_VALUE :=
            '(' TOK_GENERIC_VALUE ')'
        */
        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
        token= TOK_ROW_SINGLE_VALUE;

        /* Read last two tokens again */
        peek_last_two_tokens(digest_storage, state->m_last_id_index,
                             &last_token, &last_token2);

        if ((last_token2 == TOK_ROW_SINGLE_VALUE ||
             last_token2 == TOK_ROW_SINGLE_VALUE_LIST) &&
            (last_token == ','))
        {
          /*
            REDUCE:
            TOK_ROW_SINGLE_VALUE_LIST :=
              TOK_ROW_SINGLE_VALUE ',' TOK_ROW_SINGLE_VALUE

            REDUCE:
            TOK_ROW_SINGLE_VALUE_LIST :=
              TOK_ROW_SINGLE_VALUE_LIST ',' TOK_ROW_SINGLE_VALUE
          */
          digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
          token= TOK_ROW_SINGLE_VALUE_LIST;
        }
      }
      else if (last_token == TOK_GENERIC_VALUE_LIST &&
               last_token2 == '(')
      {
        /*
          REDUCE:
          TOK_ROW_MULTIPLE_VALUE :=
            '(' TOK_GENERIC_VALUE_LIST ')'
        */
        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
        token= TOK_ROW_MULTIPLE_VALUE;

        /* Read last two tokens again */
        peek_last_two_tokens(digest_storage, state->m_last_id_index,
                             &last_token, &last_token2);

        if ((last_token2 == TOK_ROW_MULTIPLE_VALUE ||
             last_token2 == TOK_ROW_MULTIPLE_VALUE_LIST) &&
            (last_token == ','))
        {
          /*
            REDUCE:
            TOK_ROW_MULTIPLE_VALUE_LIST :=
              TOK_ROW_MULTIPLE_VALUE ',' TOK_ROW_MULTIPLE_VALUE

            REDUCE:
            TOK_ROW_MULTIPLE_VALUE_LIST :=
              TOK_ROW_MULTIPLE_VALUE_LIST ',' TOK_ROW_MULTIPLE_VALUE
          */
          digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
          token= TOK_ROW_MULTIPLE_VALUE_LIST;
        }
      }
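      /*
        Worked example (illustrative): "VALUES (1,2), (3,4)" first reduces
        each "(1,2)" to TOK_ROW_MULTIPLE_VALUE, then folds the
        comma-separated rows into TOK_ROW_MULTIPLE_VALUE_LIST, so
        multi-row inserts of any size share one digest.
      */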
      /*
        Add this token or the resulting reduce to digest storage.
      */
      store_token(digest_storage, token);
      break;
    }
    case IDENT:
    case IDENT_QUOTED:
    {
      YYSTYPE *lex_token= yylval;
      const char *yytext= lex_token->lex_str.str;
      size_t yylen= lex_token->lex_str.length;

      /*
        REDUCE:
        TOK_IDENT := IDENT | IDENT_QUOTED
        The parser gives IDENT or IDENT_QUOTED for the same text,
        depending on the character set used.
        We unify both to always print the same digest text,
        and always have the same digest hash.
      */
      token= TOK_IDENT;
      /* Add this token and identifier string to digest storage. */
      store_token_identifier(digest_storage, token, yylen, yytext);

      /* Update the index of last identifier found. */
      state->m_last_id_index= digest_storage->m_byte_count;
      break;
    }
    default:
    {
      /* Add this token to digest storage. */
      store_token(digest_storage, token);
      break;
    }
  }

  return state;
}
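
/*
  Usage sketch (illustrative only, hypothetical driver loop): the lexer
  feeds every token it produces into digest_add_token(), roughly:

    sql_digest_state *state= ...;   // attached to the current parser
    uint token;
    while ((token= lex_one_token(&yylval, thd)) != END_OF_INPUT)
      if (state != NULL)
        state= digest_add_token(state, token, &yylval);

  The returned state becomes NULL once the storage is full, after which
  the caller stops feeding tokens.
*/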

sql_digest_state* digest_reduce_token(sql_digest_state *state,
                                      uint token_left, uint token_right)
{
  sql_digest_storage *digest_storage= NULL;

  digest_storage= &state->m_digest_storage;

  /*
    Stop collecting further tokens if digest storage is full.
  */
  if (digest_storage->m_full)
    return NULL;

  uint last_token;
  uint last_token2;
  uint last_token3;
  uint token_to_push= TOK_UNUSED;

  peek_last_two_tokens(digest_storage, state->m_last_id_index,
                       &last_token, &last_token2);

  /*
    There is only one caller of digest_reduce_token(),
    see sql/sql_yacc.yy, rule literal := NULL_SYM.
    REDUCE:
      token_left := token_right
    Used for:
      TOK_GENERIC_VALUE := NULL_SYM
  */

  if (last_token == token_right)
  {
    /*
      Current stream is like:
      TOKEN_X TOKEN_RIGHT .
      REDUCE to
      TOKEN_X TOKEN_LEFT .
    */
    digest_storage->m_byte_count-= SIZE_OF_A_TOKEN;
    store_token(digest_storage, token_left);
  }
  else
  {
    /*
      Current stream is like:
      TOKEN_X TOKEN_RIGHT TOKEN_Y .
      Pop TOKEN_Y
      TOKEN_X TOKEN_RIGHT . TOKEN_Y
      REDUCE to
      TOKEN_X TOKEN_LEFT . TOKEN_Y
    */
    DBUG_ASSERT(last_token2 == token_right);
    digest_storage->m_byte_count-= 2 * SIZE_OF_A_TOKEN;
    store_token(digest_storage, token_left);
    token_to_push= last_token;
  }

  peek_last_three_tokens(digest_storage, state->m_last_id_index,
                         &last_token, &last_token2, &last_token3);

  if ((last_token3 == TOK_GENERIC_VALUE ||
       last_token3 == TOK_GENERIC_VALUE_LIST) &&
      (last_token2 == ',') &&
      (last_token == TOK_GENERIC_VALUE))
  {
    /*
      REDUCE:
      TOK_GENERIC_VALUE_LIST :=
        TOK_GENERIC_VALUE ',' TOK_GENERIC_VALUE

      REDUCE:
      TOK_GENERIC_VALUE_LIST :=
        TOK_GENERIC_VALUE_LIST ',' TOK_GENERIC_VALUE
    */
    digest_storage->m_byte_count-= 3*SIZE_OF_A_TOKEN;
    store_token(digest_storage, TOK_GENERIC_VALUE_LIST);
  }

  if (token_to_push != TOK_UNUSED)
  {
    /*
      Push TOKEN_Y
    */
    store_token(digest_storage, token_to_push);
  }

  return state;
}
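
/*
  Worked example (illustrative): while parsing "IN (1, NULL, 3)", the
  grammar reduces literal := NULL_SYM via digest_reduce_token() with
  token_left = TOK_GENERIC_VALUE and token_right = NULL_SYM. The stored
  NULL_SYM token is replaced by TOK_GENERIC_VALUE, and the value-list
  reduce above then folds it into TOK_GENERIC_VALUE_LIST, so NULL behaves
  like any other literal in the digest.
*/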