| 1 | /* Copyright 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc. |
| 2 | |
| 3 | This program is free software; you can redistribute it and/or modify |
| 4 | it under the terms of the GNU General Public License as published by |
| 5 | the Free Software Foundation; version 2 of the License. |
| 6 | |
| 7 | This program is distributed in the hope that it will be useful, |
| 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 10 | GNU General Public License for more details. |
| 11 | |
| 12 | You should have received a copy of the GNU General Public License |
| 13 | along with this program; if not, write to the Free Software |
| 14 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
| 15 | |
| 16 | #ifndef SQL_STATISTICS_H |
| 17 | #define SQL_STATISTICS_H |
| 18 | |
| 19 | typedef |
| 20 | enum enum_use_stat_tables_mode |
| 21 | { |
| 22 | NEVER, |
| 23 | COMPLEMENTARY, |
| 24 | PEFERABLY, |
| 25 | } Use_stat_tables_mode; |
| 26 | |
| 27 | typedef |
| 28 | enum enum_histogram_type |
| 29 | { |
| 30 | SINGLE_PREC_HB, |
| 31 | DOUBLE_PREC_HB |
| 32 | } Histogram_type; |
| 33 | |
| 34 | enum enum_stat_tables |
| 35 | { |
| 36 | TABLE_STAT, |
| 37 | COLUMN_STAT, |
| 38 | INDEX_STAT, |
| 39 | }; |
| 40 | |
| 41 | |
| 42 | /* |
| 43 | These enumeration types comprise the dictionary of three |
| 44 | statistical tables table_stat, column_stat and index_stat |
| 45 | as they defined in ../scripts/mysql_system_tables.sql. |
| 46 | |
| 47 | It would be nice if the declarations of these types were |
| 48 | generated automatically by the table definitions. |
| 49 | */ |
| 50 | |
| 51 | enum enum_table_stat_col |
| 52 | { |
| 53 | TABLE_STAT_DB_NAME, |
| 54 | TABLE_STAT_TABLE_NAME, |
| 55 | TABLE_STAT_CARDINALITY, |
| 56 | TABLE_STAT_N_FIELDS |
| 57 | }; |
| 58 | |
| 59 | enum enum_column_stat_col |
| 60 | { |
| 61 | COLUMN_STAT_DB_NAME, |
| 62 | COLUMN_STAT_TABLE_NAME, |
| 63 | COLUMN_STAT_COLUMN_NAME, |
| 64 | COLUMN_STAT_MIN_VALUE, |
| 65 | COLUMN_STAT_MAX_VALUE, |
| 66 | COLUMN_STAT_NULLS_RATIO, |
| 67 | COLUMN_STAT_AVG_LENGTH, |
| 68 | COLUMN_STAT_AVG_FREQUENCY, |
| 69 | COLUMN_STAT_HIST_SIZE, |
| 70 | COLUMN_STAT_HIST_TYPE, |
| 71 | COLUMN_STAT_HISTOGRAM, |
| 72 | COLUMN_STAT_N_FIELDS |
| 73 | }; |
| 74 | |
| 75 | enum enum_index_stat_col |
| 76 | { |
| 77 | INDEX_STAT_DB_NAME, |
| 78 | INDEX_STAT_TABLE_NAME, |
| 79 | INDEX_STAT_INDEX_NAME, |
| 80 | INDEX_STAT_PREFIX_ARITY, |
| 81 | INDEX_STAT_AVG_FREQUENCY, |
| 82 | INDEX_STAT_N_FIELDS |
| 83 | }; |
| 84 | |
| 85 | inline |
| 86 | Use_stat_tables_mode get_use_stat_tables_mode(THD *thd) |
| 87 | { |
| 88 | return (Use_stat_tables_mode) (thd->variables.use_stat_tables); |
| 89 | } |
| 90 | |
| 91 | int read_statistics_for_tables_if_needed(THD *thd, TABLE_LIST *tables); |
| 92 | int collect_statistics_for_table(THD *thd, TABLE *table); |
| 93 | int alloc_statistics_for_table_share(THD* thd, TABLE_SHARE *share, |
| 94 | bool is_safe); |
| 95 | int alloc_statistics_for_table(THD *thd, TABLE *table); |
| 96 | int update_statistics_for_table(THD *thd, TABLE *table); |
| 97 | int delete_statistics_for_table(THD *thd, LEX_CSTRING *db, LEX_CSTRING *tab); |
| 98 | int delete_statistics_for_column(THD *thd, TABLE *tab, Field *col); |
| 99 | int delete_statistics_for_index(THD *thd, TABLE *tab, KEY *key_info, |
| 100 | bool ext_prefixes_only); |
| 101 | int rename_table_in_stat_tables(THD *thd, const LEX_CSTRING *db, const LEX_CSTRING *tab, |
| 102 | const LEX_CSTRING *new_db, const LEX_CSTRING *new_tab); |
| 103 | int rename_column_in_stat_tables(THD *thd, TABLE *tab, Field *col, |
| 104 | const char *new_name); |
| 105 | void set_statistics_for_table(THD *thd, TABLE *table); |
| 106 | |
| 107 | double get_column_avg_frequency(Field * field); |
| 108 | |
| 109 | double get_column_range_cardinality(Field *field, |
| 110 | key_range *min_endp, |
| 111 | key_range *max_endp, |
| 112 | uint range_flag); |
| 113 | bool is_stat_table(const LEX_CSTRING *db, LEX_CSTRING *table); |
| 114 | |
| 115 | class Histogram |
| 116 | { |
| 117 | |
| 118 | private: |
| 119 | Histogram_type type; |
| 120 | uint8 size; /* Size of values array, in bytes */ |
| 121 | uchar *values; |
| 122 | |
| 123 | uint prec_factor() |
| 124 | { |
| 125 | switch (type) { |
| 126 | case SINGLE_PREC_HB: |
| 127 | return ((uint) (1 << 8) - 1); |
| 128 | case DOUBLE_PREC_HB: |
| 129 | return ((uint) (1 << 16) - 1); |
| 130 | } |
| 131 | return 1; |
| 132 | } |
| 133 | |
| 134 | public: |
| 135 | uint get_width() |
| 136 | { |
| 137 | switch (type) { |
| 138 | case SINGLE_PREC_HB: |
| 139 | return size; |
| 140 | case DOUBLE_PREC_HB: |
| 141 | return size / 2; |
| 142 | } |
| 143 | return 0; |
| 144 | } |
| 145 | |
| 146 | private: |
| 147 | uint get_value(uint i) |
| 148 | { |
| 149 | DBUG_ASSERT(i < get_width()); |
| 150 | switch (type) { |
| 151 | case SINGLE_PREC_HB: |
| 152 | return (uint) (((uint8 *) values)[i]); |
| 153 | case DOUBLE_PREC_HB: |
| 154 | return (uint) uint2korr(values + i * 2); |
| 155 | } |
| 156 | return 0; |
| 157 | } |
| 158 | |
| 159 | /* Find the bucket which value 'pos' falls into. */ |
| 160 | uint find_bucket(double pos, bool first) |
| 161 | { |
| 162 | uint val= (uint) (pos * prec_factor()); |
| 163 | int lp= 0; |
| 164 | int rp= get_width() - 1; |
| 165 | int d= get_width() / 2; |
| 166 | uint i= lp + d; |
| 167 | for ( ; d; d= (rp - lp) / 2, i= lp + d) |
| 168 | { |
| 169 | if (val == get_value(i)) |
| 170 | break; |
| 171 | if (val < get_value(i)) |
| 172 | rp= i; |
| 173 | else if (val > get_value(i + 1)) |
| 174 | lp= i + 1; |
| 175 | else |
| 176 | break; |
| 177 | } |
| 178 | |
| 179 | if (val > get_value(i) && i < (get_width() - 1)) |
| 180 | i++; |
| 181 | |
| 182 | if (val == get_value(i)) |
| 183 | { |
| 184 | if (first) |
| 185 | { |
| 186 | while(i && val == get_value(i - 1)) |
| 187 | i--; |
| 188 | } |
| 189 | else |
| 190 | { |
| 191 | while(i + 1 < get_width() && val == get_value(i + 1)) |
| 192 | i++; |
| 193 | } |
| 194 | } |
| 195 | return i; |
| 196 | } |
| 197 | |
| 198 | public: |
| 199 | |
| 200 | uint get_size() { return (uint) size; } |
| 201 | |
| 202 | Histogram_type get_type() { return type; } |
| 203 | |
| 204 | uchar *get_values() { return (uchar *) values; } |
| 205 | |
| 206 | void set_size (ulonglong sz) { size= (uint8) sz; } |
| 207 | |
| 208 | void set_type (Histogram_type t) { type= t; } |
| 209 | |
| 210 | void set_values (uchar *vals) { values= (uchar *) vals; } |
| 211 | |
| 212 | bool is_available() { return get_size() > 0 && get_values(); } |
| 213 | |
| 214 | void set_value(uint i, double val) |
| 215 | { |
| 216 | switch (type) { |
| 217 | case SINGLE_PREC_HB: |
| 218 | ((uint8 *) values)[i]= (uint8) (val * prec_factor()); |
| 219 | return; |
| 220 | case DOUBLE_PREC_HB: |
| 221 | int2store(values + i * 2, val * prec_factor()); |
| 222 | return; |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | void set_prev_value(uint i) |
| 227 | { |
| 228 | switch (type) { |
| 229 | case SINGLE_PREC_HB: |
| 230 | ((uint8 *) values)[i]= ((uint8 *) values)[i-1]; |
| 231 | return; |
| 232 | case DOUBLE_PREC_HB: |
| 233 | int2store(values + i * 2, uint2korr(values + i * 2 - 2)); |
| 234 | return; |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | double range_selectivity(double min_pos, double max_pos) |
| 239 | { |
| 240 | double sel; |
| 241 | double bucket_sel= 1.0/(get_width() + 1); |
| 242 | uint min= find_bucket(min_pos, TRUE); |
| 243 | uint max= find_bucket(max_pos, FALSE); |
| 244 | sel= bucket_sel * (max - min + 1); |
| 245 | return sel; |
| 246 | } |
| 247 | |
| 248 | /* |
| 249 | Estimate selectivity of "col=const" using a histogram |
| 250 | */ |
| 251 | double point_selectivity(double pos, double avg_sel); |
| 252 | }; |
| 253 | |
| 254 | |
| 255 | class Columns_statistics; |
| 256 | class Index_statistics; |
| 257 | |
| 258 | /* Statistical data on a table */ |
| 259 | |
| 260 | class Table_statistics |
| 261 | { |
| 262 | |
| 263 | public: |
| 264 | my_bool cardinality_is_null; /* TRUE if the cardinality is unknown */ |
| 265 | ha_rows cardinality; /* Number of rows in the table */ |
| 266 | uchar *min_max_record_buffers; /* Record buffers for min/max values */ |
| 267 | Column_statistics *column_stats; /* Array of statistical data for columns */ |
| 268 | Index_statistics *index_stats; /* Array of statistical data for indexes */ |
| 269 | ulong *idx_avg_frequency; /* Array of records per key for index prefixes */ |
| 270 | ulong total_hist_size; /* Total size of all histograms */ |
| 271 | uchar *histograms; /* Sequence of histograms */ |
| 272 | }; |
| 273 | |
| 274 | |
| 275 | /* |
| 276 | Statistical data on a column |
| 277 | |
| 278 | Note: objects of this class may be "empty", where they have almost all fields |
| 279 | as zeros, for example, get_avg_frequency() will return 0. |
| 280 | |
| 281 | objects are allocated in alloc_statistics_for_table[_share]. |
| 282 | */ |
| 283 | |
| 284 | class Column_statistics |
| 285 | { |
| 286 | |
| 287 | private: |
| 288 | static const uint Scale_factor_nulls_ratio= 100000; |
| 289 | static const uint Scale_factor_avg_length= 100000; |
| 290 | static const uint Scale_factor_avg_frequency= 100000; |
| 291 | |
| 292 | public: |
| 293 | /* |
| 294 | Bitmap indicating what statistical characteristics |
| 295 | are available for the column |
| 296 | */ |
| 297 | uint32 column_stat_nulls; |
| 298 | |
| 299 | /* For the below two, see comments in get_column_range_cardinality() */ |
| 300 | /* Minimum value for the column */ |
| 301 | Field *min_value; |
| 302 | /* Maximum value for the column */ |
| 303 | Field *max_value; |
| 304 | |
| 305 | private: |
| 306 | |
| 307 | /* |
| 308 | The ratio Z/N multiplied by the scale factor Scale_factor_nulls_ratio, |
| 309 | where |
| 310 | N is the total number of rows, |
| 311 | Z is the number of nulls in the column |
| 312 | */ |
| 313 | ulong nulls_ratio; |
| 314 | |
| 315 | /* |
| 316 | Average number of bytes occupied by the representation of a |
| 317 | value of the column in memory buffers such as join buffer |
| 318 | multiplied by the scale factor Scale_factor_avg_length. |
| 319 | CHAR values are stripped of trailing spaces. |
| 320 | Flexible values are stripped of their length prefixes. |
| 321 | */ |
| 322 | ulong avg_length; |
| 323 | |
| 324 | /* |
| 325 | The ratio N/D multiplied by the scale factor Scale_factor_avg_frequency, |
| 326 | where |
| 327 | N is the number of rows with not null value in the column, |
| 328 | D the number of distinct values among them |
| 329 | */ |
| 330 | ulong avg_frequency; |
| 331 | |
| 332 | public: |
| 333 | |
| 334 | Histogram histogram; |
| 335 | |
| 336 | void set_all_nulls() |
| 337 | { |
| 338 | column_stat_nulls= |
| 339 | ((1 << (COLUMN_STAT_HISTOGRAM-COLUMN_STAT_COLUMN_NAME))-1) << |
| 340 | (COLUMN_STAT_COLUMN_NAME+1); |
| 341 | } |
| 342 | |
| 343 | void set_not_null(uint stat_field_no) |
| 344 | { |
| 345 | column_stat_nulls&= ~(1 << stat_field_no); |
| 346 | } |
| 347 | |
| 348 | bool is_null(uint stat_field_no) |
| 349 | { |
| 350 | return MY_TEST(column_stat_nulls & (1 << stat_field_no)); |
| 351 | } |
| 352 | |
| 353 | double get_nulls_ratio() |
| 354 | { |
| 355 | return (double) nulls_ratio / Scale_factor_nulls_ratio; |
| 356 | } |
| 357 | |
| 358 | double get_avg_length() |
| 359 | { |
| 360 | return (double) avg_length / Scale_factor_avg_length; |
| 361 | } |
| 362 | |
| 363 | double get_avg_frequency() |
| 364 | { |
| 365 | return (double) avg_frequency / Scale_factor_avg_frequency; |
| 366 | } |
| 367 | |
| 368 | void set_nulls_ratio (double val) |
| 369 | { |
| 370 | nulls_ratio= (ulong) (val * Scale_factor_nulls_ratio); |
| 371 | } |
| 372 | |
| 373 | void set_avg_length (double val) |
| 374 | { |
| 375 | avg_length= (ulong) (val * Scale_factor_avg_length); |
| 376 | } |
| 377 | |
| 378 | void set_avg_frequency (double val) |
| 379 | { |
| 380 | avg_frequency= (ulong) (val * Scale_factor_avg_frequency); |
| 381 | } |
| 382 | |
| 383 | bool min_max_values_are_provided() |
| 384 | { |
| 385 | return !is_null(COLUMN_STAT_MIN_VALUE) && |
| 386 | !is_null(COLUMN_STAT_MIN_VALUE); |
| 387 | } |
| 388 | }; |
| 389 | |
| 390 | |
| 391 | /* Statistical data on an index prefixes */ |
| 392 | |
| 393 | class Index_statistics |
| 394 | { |
| 395 | |
| 396 | private: |
| 397 | static const uint Scale_factor_avg_frequency= 100000; |
| 398 | /* |
| 399 | The k-th element of this array contains the ratio N/D |
| 400 | multiplied by the scale factor Scale_factor_avg_frequency, |
| 401 | where N is the number of index entries without nulls |
| 402 | in the first k components, and D is the number of distinct |
| 403 | k-component prefixes among them |
| 404 | */ |
| 405 | ulong *avg_frequency; |
| 406 | |
| 407 | public: |
| 408 | |
| 409 | void init_avg_frequency(ulong *ptr) { avg_frequency= ptr; } |
| 410 | |
| 411 | bool avg_frequency_is_inited() { return avg_frequency != NULL; } |
| 412 | |
| 413 | double get_avg_frequency(uint i) |
| 414 | { |
| 415 | return (double) avg_frequency[i] / Scale_factor_avg_frequency; |
| 416 | } |
| 417 | |
| 418 | void set_avg_frequency(uint i, double val) |
| 419 | { |
| 420 | avg_frequency[i]= (ulong) (val * Scale_factor_avg_frequency); |
| 421 | } |
| 422 | |
| 423 | }; |
| 424 | |
| 425 | #endif /* SQL_STATISTICS_H */ |
| 426 | |