/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
   Copyright (c) 2017, MariaDB Corporation.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */

/*
  This code needs extra visibility in the lexer structures
*/

#include "mariadb.h"
#include "my_md5.h"
#include "unireg.h"

#include "sql_string.h"
#include "sql_class.h"
#include "sql_lex.h"
#include "sp_pcontext.h"
#include "sql_digest.h"
#include "sql_digest_stream.h"

#include "sql_get_diagnostics.h"

/* Generated code */
#include "sql_yacc.h"
#define LEX_TOKEN_WITH_DEFINITION
#include "lex_token.h"

/* Name pollution from sql/sql_lex.h */
#ifdef LEX_YYSTYPE
#undef LEX_YYSTYPE
#endif

#define LEX_YYSTYPE YYSTYPE*

#define SIZE_OF_A_TOKEN 2
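
/*
  Each token is stored in the digest buffer as SIZE_OF_A_TOKEN (2) bytes,
  least significant byte first. For example (illustrative values), storing
  the token number 0x0123 appends the bytes {0x23, 0x01}, and read_token()
  below reassembles it as src[0] | (src[1] << 8) == 0x0123.
*/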

/**
  Read a single token from token array.
*/
inline uint read_token(const sql_digest_storage *digest_storage,
                       uint index, uint *tok)
{
  uint safe_byte_count= digest_storage->m_byte_count;

  if (index + SIZE_OF_A_TOKEN <= safe_byte_count &&
      safe_byte_count <= digest_storage->m_token_array_length)
  {
    const unsigned char *src= & digest_storage->m_token_array[index];
    *tok= src[0] | (src[1] << 8);
    return index + SIZE_OF_A_TOKEN;
  }

  /* The input byte stream is exhausted. */
  *tok= 0;
  return MAX_DIGEST_STORAGE_SIZE + 1;
}

/**
  Store a single token in token array.
*/
inline void store_token(sql_digest_storage* digest_storage, uint token)
{
  DBUG_ASSERT(digest_storage->m_byte_count <= digest_storage->m_token_array_length);

  if (digest_storage->m_byte_count + SIZE_OF_A_TOKEN <= digest_storage->m_token_array_length)
  {
    unsigned char* dest= & digest_storage->m_token_array[digest_storage->m_byte_count];
    dest[0]= token & 0xff;
    dest[1]= (token >> 8) & 0xff;
    digest_storage->m_byte_count+= SIZE_OF_A_TOKEN;
  }
  else
  {
    digest_storage->m_full= true;
  }
}
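
/*
  Illustrative round trip (not part of the server code): after
  store_token(ds, TOK_GENERIC_VALUE) succeeds, a subsequent
  read_token(ds, old_byte_count, &tok) yields tok == TOK_GENERIC_VALUE
  and returns old_byte_count + SIZE_OF_A_TOKEN, where old_byte_count is
  the value of ds->m_byte_count before the store.
*/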

/**
  Read an identifier from token array.
*/
inline uint read_identifier(const sql_digest_storage* digest_storage,
                            uint index, char ** id_string, int *id_length)
{
  uint new_index;
  uint safe_byte_count= digest_storage->m_byte_count;

  DBUG_ASSERT(index <= safe_byte_count);
  DBUG_ASSERT(safe_byte_count <= digest_storage->m_token_array_length);

  /*
    The token, the length, and the identifier string are written atomically,
    so a length followed by a string is always expected here.
  */

  uint bytes_needed= SIZE_OF_A_TOKEN;
  /* If we can read token and identifier length */
  if ((index + bytes_needed) <= safe_byte_count)
  {
    const unsigned char *src= & digest_storage->m_token_array[index];
    /* Read the length of identifier */
    uint length= src[0] | (src[1] << 8);
    bytes_needed+= length;
    /* If we can read entire identifier from token array */
    if ((index + bytes_needed) <= safe_byte_count)
    {
      *id_string= (char *) (src + 2);
      *id_length= length;

      new_index= index + bytes_needed;
      DBUG_ASSERT(new_index <= safe_byte_count);
      return new_index;
    }
  }

  /* The input byte stream is exhausted. */
  return MAX_DIGEST_STORAGE_SIZE + 1;
}
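
/*
  Illustrative layout: store_token_identifier() below writes an identifier
  as one record of 2*SIZE_OF_A_TOKEN + length bytes. For the identifier
  "t1" stored with token TOK_IDENT (values shown for illustration only):
    dest[0..1] = TOK_IDENT, least significant byte first
    dest[2..3] = 2 (the string length), least significant byte first
    dest[4..5] = 't', '1'
  read_identifier() above consumes the length and string part of such a
  record, after read_token() has already consumed the leading token.
*/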

/**
  Store an identifier in token array.
*/
inline void store_token_identifier(sql_digest_storage* digest_storage,
                                   uint token,
                                   size_t id_length, const char *id_name)
{
  DBUG_ASSERT(digest_storage->m_byte_count <= digest_storage->m_token_array_length);

  size_t bytes_needed= 2 * SIZE_OF_A_TOKEN + id_length;
  if (digest_storage->m_byte_count + bytes_needed <= (unsigned int)digest_storage->m_token_array_length)
  {
    unsigned char* dest= & digest_storage->m_token_array[digest_storage->m_byte_count];
    /* Write the token */
    dest[0]= token & 0xff;
    dest[1]= (token >> 8) & 0xff;
    /* Write the string length */
    dest[2]= id_length & 0xff;
    dest[3]= (id_length >> 8) & 0xff;
    /* Write the string data */
    if (id_length > 0)
      memcpy((char *)(dest + 4), id_name, id_length);
    digest_storage->m_byte_count+= (uint)bytes_needed;
  }
  else
  {
    digest_storage->m_full= true;
  }
}

void compute_digest_md5(const sql_digest_storage *digest_storage, unsigned char *md5)
{
  compute_md5_hash(md5,
                   (const char *) digest_storage->m_token_array,
                   digest_storage->m_byte_count);
}
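
/*
  Because the hash is computed over the normalised token bytes rather than
  the raw query text, statements that differ only in literal values, such
  as "SELECT * FROM t1 WHERE a = 1" and "SELECT * FROM t1 WHERE a = 2"
  (illustrative queries), produce the same MD5 digest.
*/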

/*
  Iterate over the token array and update digest_text.
*/
void compute_digest_text(const sql_digest_storage* digest_storage,
                         String *digest_text)
{
  DBUG_ASSERT(digest_storage != NULL);
  uint byte_count= digest_storage->m_byte_count;
  String *digest_output= digest_text;
  uint tok= 0;
  uint current_byte= 0;
  lex_token_string *tok_data;

  /* Reset existing data */
  digest_output->length(0);

  if (byte_count > digest_storage->m_token_array_length)
  {
    digest_output->append("\0", 1);
    return;
  }

  /* Convert text to utf8 */
  const CHARSET_INFO *from_cs= get_charset(digest_storage->m_charset_number, MYF(0));
  const CHARSET_INFO *to_cs= &my_charset_utf8_bin;

  if (from_cs == NULL)
  {
    /*
      Can happen, as we do dirty reads on digest_storage,
      which can be written to in another thread.
    */
    digest_output->append("\0", 1);
    return;
  }

  char id_buffer[NAME_LEN + 1]= {'\0'};
  char *id_string;
  size_t id_length;
  bool convert_text= !my_charset_same(from_cs, to_cs);

  while (current_byte < byte_count)
  {
    current_byte= read_token(digest_storage, current_byte, &tok);

    if (tok <= 0 || tok >= array_elements(lex_token_array)
        || current_byte > max_digest_length)
      return;

    tok_data= &lex_token_array[tok];

    switch (tok)
    {
    /* All identifiers are printed with their name. */
    case IDENT:
    case IDENT_QUOTED:
    case TOK_IDENT:
      {
        char *id_ptr= NULL;
        int id_len= 0;
        uint err_cs= 0;

        /* Get the next identifier from the storage buffer. */
        current_byte= read_identifier(digest_storage, current_byte,
                                      &id_ptr, &id_len);
        if (current_byte > max_digest_length)
          return;

        if (convert_text)
        {
          /* Verify that the converted text will fit. */
          if (to_cs->mbmaxlen*id_len > NAME_LEN)
          {
            digest_output->append("...", 3);
            break;
          }
          /* Convert identifier string into the storage character set. */
          id_length= my_convert(id_buffer, NAME_LEN, to_cs,
                                id_ptr, id_len, from_cs, &err_cs);
          id_string= id_buffer;
        }
        else
        {
          id_string= id_ptr;
          id_length= id_len;
        }

        if (id_length == 0 || err_cs != 0)
        {
          break;
        }
        /* Copy the converted identifier into the digest string. */
        digest_output->append("`", 1);
        if (id_length > 0)
          digest_output->append(id_string, id_length);
        digest_output->append("` ", 2);
      }
      break;

    /* Everything else is printed as is. */
    default:
      /*
        Append the token string, followed by a space when the token
        definition asks for one. The String output buffer grows as needed.
      */
      int tok_length= tok_data->m_token_length;

      digest_output->append(tok_data->m_token_string, tok_length);
      if (tok_data->m_append_space)
        digest_output->append(" ", 1);
      break;
    }
  }
}
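
/*
  Illustrative end-to-end example: with the reductions performed by
  digest_add_token() below, a statement such as
    SELECT * FROM t1 WHERE a = 1
  is typically rendered by compute_digest_text() as
    SELECT * FROM `t1` WHERE `a` = ?
  since the literal was reduced to TOK_GENERIC_VALUE (printed as "?") and
  identifiers are printed back-quoted. Exact spacing depends on the
  m_append_space flags in lex_token_array.
*/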

static inline uint peek_token(const sql_digest_storage *digest, uint index)
{
  uint token;
  DBUG_ASSERT(index + SIZE_OF_A_TOKEN <= digest->m_byte_count);
  DBUG_ASSERT(digest->m_byte_count <= digest->m_token_array_length);

  token= ((digest->m_token_array[index + 1])<<8) | digest->m_token_array[index];
  return token;
}

/**
  Read the last two tokens from the token array. If an identifier
  is found, do not look at tokens before it.
*/
static inline void peek_last_two_tokens(const sql_digest_storage* digest_storage,
                                        uint last_id_index, uint *t1, uint *t2)
{
  uint byte_count= digest_storage->m_byte_count;
  uint peek_index= byte_count;

  if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
  {
    /* Take last token. */
    peek_index-= SIZE_OF_A_TOKEN;
    *t1= peek_token(digest_storage, peek_index);

    if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
    {
      /* Take 2nd token from last. */
      peek_index-= SIZE_OF_A_TOKEN;
      *t2= peek_token(digest_storage, peek_index);
    }
    else
    {
      *t2= TOK_UNUSED;
    }
  }
  else
  {
    *t1= TOK_UNUSED;
    *t2= TOK_UNUSED;
  }
}

/**
  Read the last three tokens from the token array. If an identifier
  is found, do not look at tokens before it.
*/
static inline void peek_last_three_tokens(const sql_digest_storage* digest_storage,
                                          uint last_id_index, uint *t1, uint *t2, uint *t3)
{
  uint byte_count= digest_storage->m_byte_count;
  uint peek_index= byte_count;

  if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
  {
    /* Take last token. */
    peek_index-= SIZE_OF_A_TOKEN;
    *t1= peek_token(digest_storage, peek_index);

    if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
    {
      /* Take 2nd token from last. */
      peek_index-= SIZE_OF_A_TOKEN;
      *t2= peek_token(digest_storage, peek_index);

      if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
      {
        /* Take 3rd token from last. */
        peek_index-= SIZE_OF_A_TOKEN;
        *t3= peek_token(digest_storage, peek_index);
      }
      else
      {
        *t3= TOK_UNUSED;
      }
    }
    else
    {
      *t2= TOK_UNUSED;
      *t3= TOK_UNUSED;
    }
  }
  else
  {
    *t1= TOK_UNUSED;
    *t2= TOK_UNUSED;
    *t3= TOK_UNUSED;
  }
}

sql_digest_state* digest_add_token(sql_digest_state *state,
                                   uint token,
                                   LEX_YYSTYPE yylval)
{
  sql_digest_storage *digest_storage= NULL;

  digest_storage= &state->m_digest_storage;

  /*
    Stop collecting further tokens if digest storage is full or
    if END token is received.
  */
  if (digest_storage->m_full || token == END_OF_INPUT)
    return NULL;

  /*
    Look at the last two tokens collected so far. They are used by the
    reduce rules below for normalisation. Identifier tokens are never
    considered for a reduce: last_id_index marks that boundary.
  */
  uint last_token;
  uint last_token2;

  switch (token)
  {
    case NUM:
    case LONG_NUM:
    case ULONGLONG_NUM:
    case DECIMAL_NUM:
    case FLOAT_NUM:
    case BIN_NUM:
    case HEX_NUM:
    {
      bool found_unary;
      do
      {
        found_unary= false;
        peek_last_two_tokens(digest_storage, state->m_last_id_index,
                             &last_token, &last_token2);

        if ((last_token == '-') || (last_token == '+'))
        {
          /*
            We need to differentiate:
            - a <unary minus> operator
            - a <unary plus> operator
            from
            - a <binary minus> operator
            - a <binary plus> operator
            to only reduce "a = -1" to "a = ?", and not change "b - 1" to "b ?"

            Binary operators are found inside an expression,
            while unary operators are found at the beginning of an expression, or after operators.

            To achieve this, every token that is followed by an <expr> expression
            in the SQL grammar is flagged.
            See sql/sql_yacc.yy
            See sql/gen_lex_token.cc

            For example,
            "(-1)" is parsed as "(", "-", NUM, ")", and lex_token_array["("].m_start_expr is true,
            so reduction of the "-" NUM is done, the result is "(?)".
            "(a-1)" is parsed as "(", ID, "-", NUM, ")", and lex_token_array[ID].m_start_expr is false,
            so the operator is binary, no reduction is done, and the result is "(a-?)".
          */
          if (lex_token_array[last_token2].m_start_expr)
          {
            /*
              REDUCE:
              TOK_GENERIC_VALUE := (UNARY_PLUS | UNARY_MINUS) (NUM | LONG_NUM | ... | FLOAT_NUM)

              REDUCE:
              TOK_GENERIC_VALUE := (UNARY_PLUS | UNARY_MINUS) TOK_GENERIC_VALUE
            */
            token= TOK_GENERIC_VALUE;
            digest_storage->m_byte_count-= SIZE_OF_A_TOKEN;
            found_unary= true;
          }
        }
      } while (found_unary);
    }
    /* fall through */
    case LEX_HOSTNAME:
    case TEXT_STRING:
    case NCHAR_STRING:
    case PARAM_MARKER:
    {
      /*
        REDUCE:
        TOK_GENERIC_VALUE := BIN_NUM | DECIMAL_NUM | ... | ULONGLONG_NUM
      */
      token= TOK_GENERIC_VALUE;

      peek_last_two_tokens(digest_storage, state->m_last_id_index,
                           &last_token, &last_token2);

      if ((last_token2 == TOK_GENERIC_VALUE ||
           last_token2 == TOK_GENERIC_VALUE_LIST) &&
          (last_token == ','))
      {
        /*
          REDUCE:
          TOK_GENERIC_VALUE_LIST :=
            TOK_GENERIC_VALUE ',' TOK_GENERIC_VALUE

          REDUCE:
          TOK_GENERIC_VALUE_LIST :=
            TOK_GENERIC_VALUE_LIST ',' TOK_GENERIC_VALUE
        */
        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
        token= TOK_GENERIC_VALUE_LIST;
      }
      /*
        Add this token or the resulting reduce to digest storage.
      */
      store_token(digest_storage, token);
      break;
    }
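    /*
      Worked example (illustrative): for "IN (1, 2, 3)", the first two
      literals and the comma reduce to TOK_GENERIC_VALUE_LIST above, and
      each following ", literal" folds into the same list token, so the
      closing parenthesis below sees '(' TOK_GENERIC_VALUE_LIST ')'.
    */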
    case ')':
    {
      peek_last_two_tokens(digest_storage, state->m_last_id_index,
                           &last_token, &last_token2);

      if (last_token == TOK_GENERIC_VALUE &&
          last_token2 == '(')
      {
        /*
          REDUCE:
          TOK_ROW_SINGLE_VALUE :=
            '(' TOK_GENERIC_VALUE ')'
        */
        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
        token= TOK_ROW_SINGLE_VALUE;

        /* Read last two tokens again */
        peek_last_two_tokens(digest_storage, state->m_last_id_index,
                             &last_token, &last_token2);

        if ((last_token2 == TOK_ROW_SINGLE_VALUE ||
             last_token2 == TOK_ROW_SINGLE_VALUE_LIST) &&
            (last_token == ','))
        {
          /*
            REDUCE:
            TOK_ROW_SINGLE_VALUE_LIST :=
              TOK_ROW_SINGLE_VALUE ',' TOK_ROW_SINGLE_VALUE

            REDUCE:
            TOK_ROW_SINGLE_VALUE_LIST :=
              TOK_ROW_SINGLE_VALUE_LIST ',' TOK_ROW_SINGLE_VALUE
          */
          digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
          token= TOK_ROW_SINGLE_VALUE_LIST;
        }
      }
      else if (last_token == TOK_GENERIC_VALUE_LIST &&
               last_token2 == '(')
      {
        /*
          REDUCE:
          TOK_ROW_MULTIPLE_VALUE :=
            '(' TOK_GENERIC_VALUE_LIST ')'
        */
        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
        token= TOK_ROW_MULTIPLE_VALUE;

        /* Read last two tokens again */
        peek_last_two_tokens(digest_storage, state->m_last_id_index,
                             &last_token, &last_token2);

        if ((last_token2 == TOK_ROW_MULTIPLE_VALUE ||
             last_token2 == TOK_ROW_MULTIPLE_VALUE_LIST) &&
            (last_token == ','))
        {
          /*
            REDUCE:
            TOK_ROW_MULTIPLE_VALUE_LIST :=
              TOK_ROW_MULTIPLE_VALUE ',' TOK_ROW_MULTIPLE_VALUE

            REDUCE:
            TOK_ROW_MULTIPLE_VALUE_LIST :=
              TOK_ROW_MULTIPLE_VALUE_LIST ',' TOK_ROW_MULTIPLE_VALUE
          */
          digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
          token= TOK_ROW_MULTIPLE_VALUE_LIST;
        }
      }
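      /*
        Worked example (illustrative): "VALUES (1,2), (3,4)" first reduces
        each "(1,2)" to TOK_ROW_MULTIPLE_VALUE, then folds the
        comma-separated rows into TOK_ROW_MULTIPLE_VALUE_LIST, so
        multi-row inserts of any size share one digest.
      */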
      /*
        Add this token or the resulting reduce to digest storage.
      */
      store_token(digest_storage, token);
      break;
    }
    case IDENT:
    case IDENT_QUOTED:
    {
      YYSTYPE *lex_token= yylval;
      const char *yytext= lex_token->lex_str.str;
      size_t yylen= lex_token->lex_str.length;

      /*
        REDUCE:
        TOK_IDENT := IDENT | IDENT_QUOTED
        The parser gives IDENT or IDENT_QUOTED for the same text,
        depending on the character set used.
        We unify both to always print the same digest text,
        and always have the same digest hash.
      */
      token= TOK_IDENT;
      /* Add this token and identifier string to digest storage. */
      store_token_identifier(digest_storage, token, yylen, yytext);

      /* Update the index of last identifier found. */
      state->m_last_id_index= digest_storage->m_byte_count;
      break;
    }
    default:
    {
      /* Add this token to digest storage. */
      store_token(digest_storage, token);
      break;
    }
  }

  return state;
}
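
/*
  Usage sketch (illustrative only, hypothetical driver loop): the lexer
  feeds every token it produces into digest_add_token(), roughly:

    sql_digest_state *state= ...;   // attached to the current parser
    uint token;
    while ((token= lex_one_token(&yylval, thd)) != END_OF_INPUT)
      if (state != NULL)
        state= digest_add_token(state, token, &yylval);

  The returned state becomes NULL once the storage is full, after which
  the caller stops feeding tokens.
*/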

sql_digest_state* digest_reduce_token(sql_digest_state *state,
                                      uint token_left, uint token_right)
{
  sql_digest_storage *digest_storage= NULL;

  digest_storage= &state->m_digest_storage;

  /*
    Stop collecting further tokens if digest storage is full.
  */
  if (digest_storage->m_full)
    return NULL;

  uint last_token;
  uint last_token2;
  uint last_token3;
  uint token_to_push= TOK_UNUSED;

  peek_last_two_tokens(digest_storage, state->m_last_id_index,
                       &last_token, &last_token2);

  /*
    There is only one caller of digest_reduce_token(),
    see sql/sql_yacc.yy, rule literal := NULL_SYM.
    REDUCE:
      token_left := token_right
    Used for:
      TOK_GENERIC_VALUE := NULL_SYM
  */

  if (last_token == token_right)
  {
    /*
      Current stream is like:
      TOKEN_X TOKEN_RIGHT .
      REDUCE to
      TOKEN_X TOKEN_LEFT .
    */
    digest_storage->m_byte_count-= SIZE_OF_A_TOKEN;
    store_token(digest_storage, token_left);
  }
  else
  {
    /*
      Current stream is like:
      TOKEN_X TOKEN_RIGHT TOKEN_Y .
      Pop TOKEN_Y
      TOKEN_X TOKEN_RIGHT . TOKEN_Y
      REDUCE to
      TOKEN_X TOKEN_LEFT . TOKEN_Y
    */
    DBUG_ASSERT(last_token2 == token_right);
    digest_storage->m_byte_count-= 2 * SIZE_OF_A_TOKEN;
    store_token(digest_storage, token_left);
    token_to_push= last_token;
  }

  peek_last_three_tokens(digest_storage, state->m_last_id_index,
                         &last_token, &last_token2, &last_token3);

  if ((last_token3 == TOK_GENERIC_VALUE ||
       last_token3 == TOK_GENERIC_VALUE_LIST) &&
      (last_token2 == ',') &&
      (last_token == TOK_GENERIC_VALUE))
  {
    /*
      REDUCE:
      TOK_GENERIC_VALUE_LIST :=
        TOK_GENERIC_VALUE ',' TOK_GENERIC_VALUE

      REDUCE:
      TOK_GENERIC_VALUE_LIST :=
        TOK_GENERIC_VALUE_LIST ',' TOK_GENERIC_VALUE
    */
    digest_storage->m_byte_count-= 3*SIZE_OF_A_TOKEN;
    store_token(digest_storage, TOK_GENERIC_VALUE_LIST);
  }

  if (token_to_push != TOK_UNUSED)
  {
    /*
      Push TOKEN_Y
    */
    store_token(digest_storage, token_to_push);
  }

  return state;
}
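
/*
  Worked example (illustrative): while parsing "IN (1, NULL, 3)", the
  grammar reduces literal := NULL_SYM via digest_reduce_token() with
  token_left = TOK_GENERIC_VALUE and token_right = NULL_SYM. The stored
  NULL_SYM token is replaced by TOK_GENERIC_VALUE, and the value-list
  reduce above then folds it into TOK_GENERIC_VALUE_LIST, so NULL behaves
  like any other literal in the digest.
*/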