1 | /* Copyright 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc. |
2 | |
3 | This program is free software; you can redistribute it and/or modify |
4 | it under the terms of the GNU General Public License as published by |
5 | the Free Software Foundation; version 2 of the License. |
6 | |
7 | This program is distributed in the hope that it will be useful, |
8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
10 | GNU General Public License for more details. |
11 | |
12 | You should have received a copy of the GNU General Public License |
13 | along with this program; if not, write to the Free Software |
14 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
15 | |
16 | #ifndef SQL_STATISTICS_H |
17 | #define SQL_STATISTICS_H |
18 | |
19 | typedef |
20 | enum enum_use_stat_tables_mode |
21 | { |
22 | NEVER, |
23 | COMPLEMENTARY, |
24 | PEFERABLY, |
25 | } Use_stat_tables_mode; |
26 | |
27 | typedef |
28 | enum enum_histogram_type |
29 | { |
30 | SINGLE_PREC_HB, |
31 | DOUBLE_PREC_HB |
32 | } Histogram_type; |
33 | |
34 | enum enum_stat_tables |
35 | { |
36 | TABLE_STAT, |
37 | COLUMN_STAT, |
38 | INDEX_STAT, |
39 | }; |
40 | |
41 | |
42 | /* |
43 | These enumeration types comprise the dictionary of three |
44 | statistical tables table_stat, column_stat and index_stat |
45 | as they defined in ../scripts/mysql_system_tables.sql. |
46 | |
47 | It would be nice if the declarations of these types were |
48 | generated automatically by the table definitions. |
49 | */ |
50 | |
51 | enum enum_table_stat_col |
52 | { |
53 | TABLE_STAT_DB_NAME, |
54 | TABLE_STAT_TABLE_NAME, |
55 | TABLE_STAT_CARDINALITY, |
56 | TABLE_STAT_N_FIELDS |
57 | }; |
58 | |
59 | enum enum_column_stat_col |
60 | { |
61 | COLUMN_STAT_DB_NAME, |
62 | COLUMN_STAT_TABLE_NAME, |
63 | COLUMN_STAT_COLUMN_NAME, |
64 | COLUMN_STAT_MIN_VALUE, |
65 | COLUMN_STAT_MAX_VALUE, |
66 | COLUMN_STAT_NULLS_RATIO, |
67 | COLUMN_STAT_AVG_LENGTH, |
68 | COLUMN_STAT_AVG_FREQUENCY, |
69 | COLUMN_STAT_HIST_SIZE, |
70 | COLUMN_STAT_HIST_TYPE, |
71 | COLUMN_STAT_HISTOGRAM, |
72 | COLUMN_STAT_N_FIELDS |
73 | }; |
74 | |
75 | enum enum_index_stat_col |
76 | { |
77 | INDEX_STAT_DB_NAME, |
78 | INDEX_STAT_TABLE_NAME, |
79 | INDEX_STAT_INDEX_NAME, |
80 | INDEX_STAT_PREFIX_ARITY, |
81 | INDEX_STAT_AVG_FREQUENCY, |
82 | INDEX_STAT_N_FIELDS |
83 | }; |
84 | |
85 | inline |
86 | Use_stat_tables_mode get_use_stat_tables_mode(THD *thd) |
87 | { |
88 | return (Use_stat_tables_mode) (thd->variables.use_stat_tables); |
89 | } |
90 | |
91 | int read_statistics_for_tables_if_needed(THD *thd, TABLE_LIST *tables); |
92 | int collect_statistics_for_table(THD *thd, TABLE *table); |
93 | int alloc_statistics_for_table_share(THD* thd, TABLE_SHARE *share, |
94 | bool is_safe); |
95 | int alloc_statistics_for_table(THD *thd, TABLE *table); |
96 | int update_statistics_for_table(THD *thd, TABLE *table); |
97 | int delete_statistics_for_table(THD *thd, LEX_CSTRING *db, LEX_CSTRING *tab); |
98 | int delete_statistics_for_column(THD *thd, TABLE *tab, Field *col); |
99 | int delete_statistics_for_index(THD *thd, TABLE *tab, KEY *key_info, |
100 | bool ext_prefixes_only); |
101 | int rename_table_in_stat_tables(THD *thd, const LEX_CSTRING *db, const LEX_CSTRING *tab, |
102 | const LEX_CSTRING *new_db, const LEX_CSTRING *new_tab); |
103 | int rename_column_in_stat_tables(THD *thd, TABLE *tab, Field *col, |
104 | const char *new_name); |
105 | void set_statistics_for_table(THD *thd, TABLE *table); |
106 | |
107 | double get_column_avg_frequency(Field * field); |
108 | |
109 | double get_column_range_cardinality(Field *field, |
110 | key_range *min_endp, |
111 | key_range *max_endp, |
112 | uint range_flag); |
113 | bool is_stat_table(const LEX_CSTRING *db, LEX_CSTRING *table); |
114 | |
115 | class Histogram |
116 | { |
117 | |
118 | private: |
119 | Histogram_type type; |
120 | uint8 size; /* Size of values array, in bytes */ |
121 | uchar *values; |
122 | |
123 | uint prec_factor() |
124 | { |
125 | switch (type) { |
126 | case SINGLE_PREC_HB: |
127 | return ((uint) (1 << 8) - 1); |
128 | case DOUBLE_PREC_HB: |
129 | return ((uint) (1 << 16) - 1); |
130 | } |
131 | return 1; |
132 | } |
133 | |
134 | public: |
135 | uint get_width() |
136 | { |
137 | switch (type) { |
138 | case SINGLE_PREC_HB: |
139 | return size; |
140 | case DOUBLE_PREC_HB: |
141 | return size / 2; |
142 | } |
143 | return 0; |
144 | } |
145 | |
146 | private: |
147 | uint get_value(uint i) |
148 | { |
149 | DBUG_ASSERT(i < get_width()); |
150 | switch (type) { |
151 | case SINGLE_PREC_HB: |
152 | return (uint) (((uint8 *) values)[i]); |
153 | case DOUBLE_PREC_HB: |
154 | return (uint) uint2korr(values + i * 2); |
155 | } |
156 | return 0; |
157 | } |
158 | |
159 | /* Find the bucket which value 'pos' falls into. */ |
160 | uint find_bucket(double pos, bool first) |
161 | { |
162 | uint val= (uint) (pos * prec_factor()); |
163 | int lp= 0; |
164 | int rp= get_width() - 1; |
165 | int d= get_width() / 2; |
166 | uint i= lp + d; |
167 | for ( ; d; d= (rp - lp) / 2, i= lp + d) |
168 | { |
169 | if (val == get_value(i)) |
170 | break; |
171 | if (val < get_value(i)) |
172 | rp= i; |
173 | else if (val > get_value(i + 1)) |
174 | lp= i + 1; |
175 | else |
176 | break; |
177 | } |
178 | |
179 | if (val > get_value(i) && i < (get_width() - 1)) |
180 | i++; |
181 | |
182 | if (val == get_value(i)) |
183 | { |
184 | if (first) |
185 | { |
186 | while(i && val == get_value(i - 1)) |
187 | i--; |
188 | } |
189 | else |
190 | { |
191 | while(i + 1 < get_width() && val == get_value(i + 1)) |
192 | i++; |
193 | } |
194 | } |
195 | return i; |
196 | } |
197 | |
198 | public: |
199 | |
200 | uint get_size() { return (uint) size; } |
201 | |
202 | Histogram_type get_type() { return type; } |
203 | |
204 | uchar *get_values() { return (uchar *) values; } |
205 | |
206 | void set_size (ulonglong sz) { size= (uint8) sz; } |
207 | |
208 | void set_type (Histogram_type t) { type= t; } |
209 | |
210 | void set_values (uchar *vals) { values= (uchar *) vals; } |
211 | |
212 | bool is_available() { return get_size() > 0 && get_values(); } |
213 | |
214 | void set_value(uint i, double val) |
215 | { |
216 | switch (type) { |
217 | case SINGLE_PREC_HB: |
218 | ((uint8 *) values)[i]= (uint8) (val * prec_factor()); |
219 | return; |
220 | case DOUBLE_PREC_HB: |
221 | int2store(values + i * 2, val * prec_factor()); |
222 | return; |
223 | } |
224 | } |
225 | |
226 | void set_prev_value(uint i) |
227 | { |
228 | switch (type) { |
229 | case SINGLE_PREC_HB: |
230 | ((uint8 *) values)[i]= ((uint8 *) values)[i-1]; |
231 | return; |
232 | case DOUBLE_PREC_HB: |
233 | int2store(values + i * 2, uint2korr(values + i * 2 - 2)); |
234 | return; |
235 | } |
236 | } |
237 | |
238 | double range_selectivity(double min_pos, double max_pos) |
239 | { |
240 | double sel; |
241 | double bucket_sel= 1.0/(get_width() + 1); |
242 | uint min= find_bucket(min_pos, TRUE); |
243 | uint max= find_bucket(max_pos, FALSE); |
244 | sel= bucket_sel * (max - min + 1); |
245 | return sel; |
246 | } |
247 | |
248 | /* |
249 | Estimate selectivity of "col=const" using a histogram |
250 | */ |
251 | double point_selectivity(double pos, double avg_sel); |
252 | }; |
253 | |
254 | |
255 | class Columns_statistics; |
256 | class Index_statistics; |
257 | |
258 | /* Statistical data on a table */ |
259 | |
260 | class Table_statistics |
261 | { |
262 | |
263 | public: |
264 | my_bool cardinality_is_null; /* TRUE if the cardinality is unknown */ |
265 | ha_rows cardinality; /* Number of rows in the table */ |
266 | uchar *min_max_record_buffers; /* Record buffers for min/max values */ |
267 | Column_statistics *column_stats; /* Array of statistical data for columns */ |
268 | Index_statistics *index_stats; /* Array of statistical data for indexes */ |
269 | ulong *idx_avg_frequency; /* Array of records per key for index prefixes */ |
270 | ulong total_hist_size; /* Total size of all histograms */ |
271 | uchar *histograms; /* Sequence of histograms */ |
272 | }; |
273 | |
274 | |
275 | /* |
276 | Statistical data on a column |
277 | |
278 | Note: objects of this class may be "empty", where they have almost all fields |
279 | as zeros, for example, get_avg_frequency() will return 0. |
280 | |
281 | objects are allocated in alloc_statistics_for_table[_share]. |
282 | */ |
283 | |
284 | class Column_statistics |
285 | { |
286 | |
287 | private: |
288 | static const uint Scale_factor_nulls_ratio= 100000; |
289 | static const uint Scale_factor_avg_length= 100000; |
290 | static const uint Scale_factor_avg_frequency= 100000; |
291 | |
292 | public: |
293 | /* |
294 | Bitmap indicating what statistical characteristics |
295 | are available for the column |
296 | */ |
297 | uint32 column_stat_nulls; |
298 | |
299 | /* For the below two, see comments in get_column_range_cardinality() */ |
300 | /* Minimum value for the column */ |
301 | Field *min_value; |
302 | /* Maximum value for the column */ |
303 | Field *max_value; |
304 | |
305 | private: |
306 | |
307 | /* |
308 | The ratio Z/N multiplied by the scale factor Scale_factor_nulls_ratio, |
309 | where |
310 | N is the total number of rows, |
311 | Z is the number of nulls in the column |
312 | */ |
313 | ulong nulls_ratio; |
314 | |
315 | /* |
316 | Average number of bytes occupied by the representation of a |
317 | value of the column in memory buffers such as join buffer |
318 | multiplied by the scale factor Scale_factor_avg_length. |
319 | CHAR values are stripped of trailing spaces. |
320 | Flexible values are stripped of their length prefixes. |
321 | */ |
322 | ulong avg_length; |
323 | |
324 | /* |
325 | The ratio N/D multiplied by the scale factor Scale_factor_avg_frequency, |
326 | where |
327 | N is the number of rows with not null value in the column, |
328 | D the number of distinct values among them |
329 | */ |
330 | ulong avg_frequency; |
331 | |
332 | public: |
333 | |
334 | Histogram histogram; |
335 | |
336 | void set_all_nulls() |
337 | { |
338 | column_stat_nulls= |
339 | ((1 << (COLUMN_STAT_HISTOGRAM-COLUMN_STAT_COLUMN_NAME))-1) << |
340 | (COLUMN_STAT_COLUMN_NAME+1); |
341 | } |
342 | |
343 | void set_not_null(uint stat_field_no) |
344 | { |
345 | column_stat_nulls&= ~(1 << stat_field_no); |
346 | } |
347 | |
348 | bool is_null(uint stat_field_no) |
349 | { |
350 | return MY_TEST(column_stat_nulls & (1 << stat_field_no)); |
351 | } |
352 | |
353 | double get_nulls_ratio() |
354 | { |
355 | return (double) nulls_ratio / Scale_factor_nulls_ratio; |
356 | } |
357 | |
358 | double get_avg_length() |
359 | { |
360 | return (double) avg_length / Scale_factor_avg_length; |
361 | } |
362 | |
363 | double get_avg_frequency() |
364 | { |
365 | return (double) avg_frequency / Scale_factor_avg_frequency; |
366 | } |
367 | |
368 | void set_nulls_ratio (double val) |
369 | { |
370 | nulls_ratio= (ulong) (val * Scale_factor_nulls_ratio); |
371 | } |
372 | |
373 | void set_avg_length (double val) |
374 | { |
375 | avg_length= (ulong) (val * Scale_factor_avg_length); |
376 | } |
377 | |
378 | void set_avg_frequency (double val) |
379 | { |
380 | avg_frequency= (ulong) (val * Scale_factor_avg_frequency); |
381 | } |
382 | |
383 | bool min_max_values_are_provided() |
384 | { |
385 | return !is_null(COLUMN_STAT_MIN_VALUE) && |
386 | !is_null(COLUMN_STAT_MIN_VALUE); |
387 | } |
388 | }; |
389 | |
390 | |
391 | /* Statistical data on an index prefixes */ |
392 | |
393 | class Index_statistics |
394 | { |
395 | |
396 | private: |
397 | static const uint Scale_factor_avg_frequency= 100000; |
398 | /* |
399 | The k-th element of this array contains the ratio N/D |
400 | multiplied by the scale factor Scale_factor_avg_frequency, |
401 | where N is the number of index entries without nulls |
402 | in the first k components, and D is the number of distinct |
403 | k-component prefixes among them |
404 | */ |
405 | ulong *avg_frequency; |
406 | |
407 | public: |
408 | |
409 | void init_avg_frequency(ulong *ptr) { avg_frequency= ptr; } |
410 | |
411 | bool avg_frequency_is_inited() { return avg_frequency != NULL; } |
412 | |
413 | double get_avg_frequency(uint i) |
414 | { |
415 | return (double) avg_frequency[i] / Scale_factor_avg_frequency; |
416 | } |
417 | |
418 | void set_avg_frequency(uint i, double val) |
419 | { |
420 | avg_frequency[i]= (ulong) (val * Scale_factor_avg_frequency); |
421 | } |
422 | |
423 | }; |
424 | |
425 | #endif /* SQL_STATISTICS_H */ |
426 | |