| 1 | /* |
| 2 | * Legal Notice |
| 3 | * |
| 4 | * This document and associated source code (the "Work") is a part of a |
| 5 | * benchmark specification maintained by the TPC. |
| 6 | * |
| 7 | * The TPC reserves all right, title, and interest to the Work as provided |
| 8 | * under U.S. and international laws, including without limitation all patent |
| 9 | * and trademark rights therein. |
| 10 | * |
| 11 | * No Warranty |
| 12 | * |
| 13 | * 1.1 TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THE INFORMATION |
| 14 | * CONTAINED HEREIN IS PROVIDED "AS IS" AND WITH ALL FAULTS, AND THE |
| 15 | * AUTHORS AND DEVELOPERS OF THE WORK HEREBY DISCLAIM ALL OTHER |
| 16 | * WARRANTIES AND CONDITIONS, EITHER EXPRESS, IMPLIED OR STATUTORY, |
| 17 | * INCLUDING, BUT NOT LIMITED TO, ANY (IF ANY) IMPLIED WARRANTIES, |
| 18 | * DUTIES OR CONDITIONS OF MERCHANTABILITY, OF FITNESS FOR A PARTICULAR |
| 19 | * PURPOSE, OF ACCURACY OR COMPLETENESS OF RESPONSES, OF RESULTS, OF |
| 20 | * WORKMANLIKE EFFORT, OF LACK OF VIRUSES, AND OF LACK OF NEGLIGENCE. |
| 21 | * ALSO, THERE IS NO WARRANTY OR CONDITION OF TITLE, QUIET ENJOYMENT, |
| 22 | * QUIET POSSESSION, CORRESPONDENCE TO DESCRIPTION OR NON-INFRINGEMENT |
| 23 | * WITH REGARD TO THE WORK. |
| 24 | * 1.2 IN NO EVENT WILL ANY AUTHOR OR DEVELOPER OF THE WORK BE LIABLE TO |
| 25 | * ANY OTHER PARTY FOR ANY DAMAGES, INCLUDING BUT NOT LIMITED TO THE |
| 26 | * COST OF PROCURING SUBSTITUTE GOODS OR SERVICES, LOST PROFITS, LOSS |
| 27 | * OF USE, LOSS OF DATA, OR ANY INCIDENTAL, CONSEQUENTIAL, DIRECT, |
| 28 | * INDIRECT, OR SPECIAL DAMAGES WHETHER UNDER CONTRACT, TORT, WARRANTY, |
| 29 | * OR OTHERWISE, ARISING IN ANY WAY OUT OF THIS OR ANY OTHER AGREEMENT |
| 30 | * RELATING TO THE WORK, WHETHER OR NOT SUCH AUTHOR OR DEVELOPER HAD |
| 31 | * ADVANCE NOTICE OF THE POSSIBILITY OF SUCH DAMAGES. |
| 32 | * |
| 33 | * Contributors: |
| 34 | * Gradient Systems |
| 35 | */ |
| 36 | #include "config.h" |
| 37 | #include "porting.h" |
| 38 | #include <stdio.h> |
| 39 | #include "date.h" |
| 40 | #include "decimal.h" |
| 41 | #include "dist.h" |
| 42 | #include "constants.h" |
| 43 | #include "columns.h" |
| 44 | #include "genrand.h" |
| 45 | #include "tdefs.h" |
| 46 | #include "tables.h" |
| 47 | #include "build_support.h" |
| 48 | #include "tpcds.idx.h" |
| 49 | #include "scaling.h" |
| 50 | #include "w_web_sales.h" |
| 51 | #include "error_msg.h" |
| 52 | #include "tdefs.h" |
| 53 | #include "scd.h" |
| 54 | #include "r_params.h" |
| 55 | #include "sparse.h" |
| 56 | |
/* Forward declaration: date_join() calls web_join() ahead of its definition further down in this file. */
static ds_key_t web_join(int col, ds_key_t join_key);
| 58 | |
| 59 | /* |
| 60 | * Routine: date_join(int from_tbl, int join_count) |
| 61 | * Purpose: account for the different date-adjusted patterns in the data set |
| 62 | * Data Structures: |
| 63 | * |
| 64 | * Params: |
| 65 | * Returns: |
| 66 | * Called By: |
| 67 | * Calls: |
| 68 | * Assumptions: |
| 69 | * Side Effects: |
| 70 | * TODO: Relies on existing RNG code, which isn't really 64bit; will probably |
| 71 | * requre a rework of the genrand_xx routines |
| 72 | */ |
| 73 | static ds_key_t date_join(int from_tbl, int from_col, ds_key_t join_count, int nYear) { |
| 74 | int nDay, nTemp, nMin = -1, nMax = -1, nResult; |
| 75 | static int bInit = 0, jToday; |
| 76 | date_t TempDate; |
| 77 | |
| 78 | if (bInit == 0) { |
| 79 | strtodt(&TempDate, TODAYS_DATE); |
| 80 | jToday = dttoj(&TempDate); |
| 81 | bInit = 1; |
| 82 | } |
| 83 | |
| 84 | switch (from_tbl) { |
| 85 | case STORE_SALES: |
| 86 | case CATALOG_SALES: |
| 87 | case WEB_SALES: |
| 88 | pick_distribution(&nDay, "calendar" , 1, calendar_sales + is_leap(nYear), from_col); |
| 89 | break; |
| 90 | |
| 91 | /* |
| 92 | * returns are keyed to the sale date, with the lag between sale and return |
| 93 | * selected within a known range, based on sales channel |
| 94 | */ |
| 95 | case STORE_RETURNS: |
| 96 | nMin = SS_MIN_SHIP_DELAY; |
| 97 | nMax = SS_MAX_SHIP_DELAY; |
| 98 | case CATALOG_RETURNS: |
| 99 | if (nMin == -1) { |
| 100 | nMin = CS_MIN_SHIP_DELAY; |
| 101 | nMax = CS_MAX_SHIP_DELAY; |
| 102 | } |
| 103 | case WEB_RETURNS: |
| 104 | if (nMin == -1) { |
| 105 | nMin = WS_MIN_SHIP_DELAY; |
| 106 | nMax = WS_MAX_SHIP_DELAY; |
| 107 | } |
| 108 | genrand_integer(&nTemp, DIST_UNIFORM, nMin * 2, nMax * 2, 0, from_col); |
| 109 | return (join_count + nTemp); |
| 110 | break; |
| 111 | case WEB_SITE: |
| 112 | case WEB_PAGE: |
| 113 | return (web_join(from_col, join_count)); |
| 114 | default: |
| 115 | pick_distribution(&nDay, "calendar" , 1, 1 + is_leap(nYear), from_col); |
| 116 | break; |
| 117 | } |
| 118 | |
| 119 | TempDate.year = nYear; |
| 120 | TempDate.month = 1; |
| 121 | TempDate.day = 1; |
| 122 | |
| 123 | nResult = dttoj(&TempDate) + nDay; |
| 124 | |
| 125 | return ((ds_key_t)(nResult > jToday) ? -1 : nResult); |
| 126 | } |
| 127 | |
| 128 | /* |
| 129 | * Routine: time_join(int from_tbl, int join_count) |
| 130 | * Purpose: create joins that are time-skewed |
| 131 | * Data Structures: |
| 132 | * |
| 133 | * Params: |
| 134 | * Returns: |
| 135 | * Called By: |
| 136 | * Calls: |
| 137 | * Assumptions: |
| 138 | * Side Effects: |
| 139 | * TODO: Relies on existing RNG code, which isn't really 64bit; will probably |
| 140 | * requre a rework of the genrand_xx routines |
| 141 | */ |
| 142 | static ds_key_t time_join(int to_tbl, int to_col, ds_key_t join_count) { |
| 143 | int hour, secs; |
| 144 | |
| 145 | switch (to_tbl) { |
| 146 | case STORE_SALES: |
| 147 | case STORE_RETURNS: |
| 148 | pick_distribution(&hour, "hours" , 1, 2, to_col); |
| 149 | break; |
| 150 | case CATALOG_SALES: |
| 151 | case WEB_SALES: |
| 152 | case CATALOG_RETURNS: |
| 153 | case WEB_RETURNS: |
| 154 | pick_distribution(&hour, "hours" , 1, 3, to_col); |
| 155 | break; |
| 156 | default: |
| 157 | pick_distribution(&hour, "hours" , 1, 1, to_col); |
| 158 | break; |
| 159 | } |
| 160 | genrand_integer(&secs, DIST_UNIFORM, 0, 3599, 0, to_col); |
| 161 | |
| 162 | return ((ds_key_t)(hour * 3600 + secs)); |
| 163 | } |
| 164 | |
| 165 | /* |
| 166 | * Routine: cp_join(int from_tbl, int join_count) |
| 167 | * Purpose: create joins to catalog_page |
| 168 | * Data Structures: |
| 169 | * |
| 170 | * Params: |
| 171 | * Returns: |
| 172 | * Called By: |
| 173 | * Calls: |
| 174 | * Assumptions: |
| 175 | * Side Effects: |
| 176 | * TODO: None |
| 177 | */ |
| 178 | static ds_key_t cp_join(int tbl, int col, ds_key_t jDate) { |
| 179 | ds_key_t res; |
| 180 | static int init = 0, nPagePerCatalog; |
| 181 | int nType, nCount, nOffset, nPage; |
| 182 | static date_t dTemp; |
| 183 | char *szTemp; |
| 184 | |
| 185 | if (!init) { |
| 186 | nPagePerCatalog = ((int)get_rowcount(CATALOG_PAGE) / CP_CATALOGS_PER_YEAR) / (YEAR_MAXIMUM - YEAR_MINIMUM + 2); |
| 187 | strtodt(&dTemp, DATA_START_DATE); |
| 188 | init = 1; |
| 189 | } |
| 190 | |
| 191 | nType = pick_distribution(&szTemp, "catalog_page_type" , 1, 2, col); |
| 192 | genrand_integer(&nPage, DIST_UNIFORM, 1, nPagePerCatalog, 0, col); |
| 193 | nOffset = (int)jDate - dTemp.julian - 1; |
| 194 | nCount = (nOffset / 365) * CP_CATALOGS_PER_YEAR; |
| 195 | nOffset %= 365; |
| 196 | |
| 197 | switch (nType) { |
| 198 | case 1: /* bi-annual */ |
| 199 | if (nOffset > 183) |
| 200 | nCount += 1; |
| 201 | break; |
| 202 | case 2: /* quarterly */ |
| 203 | nCount += (nOffset / 91); |
| 204 | break; |
| 205 | case 3: /* monthly */ |
| 206 | nCount += (nOffset / 31); |
| 207 | break; |
| 208 | } |
| 209 | |
| 210 | res = CP_SK(nCount, nPagePerCatalog, nPage); |
| 211 | |
| 212 | return (res); |
| 213 | } |
| 214 | /* |
| 215 | * Routine: |
| 216 | * Purpose: |
| 217 | * Algorithm: |
| 218 | * Data Structures: |
| 219 | * |
| 220 | * Params: |
| 221 | * Returns: |
| 222 | * Called By: |
| 223 | * Calls: |
| 224 | * Assumptions: |
| 225 | * Side Effects: |
| 226 | * TODO: None |
| 227 | */ |
| 228 | ds_key_t getCatalogNumberFromPage(ds_key_t kPageNumber) { |
| 229 | static int bInit = 0; |
| 230 | static int nPagePerCatalog; |
| 231 | |
| 232 | if (!bInit) { |
| 233 | nPagePerCatalog = ((int)get_rowcount(CATALOG_PAGE) / CP_CATALOGS_PER_YEAR) / (YEAR_MAXIMUM - YEAR_MINIMUM + 2); |
| 234 | bInit = 1; |
| 235 | } |
| 236 | |
| 237 | return (kPageNumber / nPagePerCatalog); |
| 238 | } |
| 239 | |
/*
 * Routine: web_join(int col, ds_key_t join_key)
 * Purpose: create joins to web_site/web_page. These need to be handled
 *	together, since the date of transaction must fit within the lifetime
 *	of a particular page, which must fit within the lifetime of a
 *	particular site
 * Data Structures:
 *
 * Params:
 *	col		column being populated; selects which computation runs
 *	join_key	is one of two things:
 *		1. the xxx_sk for a particular row in the dimension for which we
 *		   need appropriate dates
 *		2. a julian date for which we need to pick a valid xxx_sk value
 * Returns: the computed date or key; -1 when col matches none of the
 *	handled columns
 * Called By: date_join()
 * Calls: getSimpleTdefsByNumber(), get_rowcount(), strtodt(),
 *	genrand_integer()
 * Assumptions:
 * Side Effects: the static site-duration values are computed on the first
 *	call; the static dSiteOpen is overwritten with DATE_MINIMUM by every
 *	dimension-column case
 * TODO: None
 */
static ds_key_t web_join(int col, ds_key_t join_key) {
	ds_key_t res = -1, kSite;
	static int init = 0, nConcurrentSites, nSiteDuration, nOffset;
	static date_t dSiteOpen, /* open/close dates for current web site */
	    dSiteClose;
	int nTemp;
	tdef *pWS = getSimpleTdefsByNumber(WEB_SITE);
	tdef *pWP = getSimpleTdefsByNumber(WEB_PAGE);

	if (!init) {
		/* nSiteDuration = (WEB_END_DATE - WEB_START_DATE, in julian days) * nConcurrentSites */
		strtodt(&dSiteClose, WEB_END_DATE);
		nSiteDuration = dSiteClose.julian;
		nConcurrentSites = (int)get_rowcount(CONCURRENT_WEB_SITES);
		strtodt(&dSiteOpen, WEB_START_DATE);
		nSiteDuration -= dSiteOpen.julian;
		nSiteDuration *= nConcurrentSites;
		/*
		 * NOTE(review): if the julian fields are plain ints, this quotient
		 * truncates to 0 whenever nConcurrentSites >= 1, and the divisor is
		 * 0 when nConcurrentSites is 0 or both dates coincide -- confirm
		 * intent.
		 */
		nOffset = (dSiteClose.julian - dSiteOpen.julian) / (2 * nSiteDuration);
		init = 1;
	}

	switch (col) {
	/**************
	 * join_key is the xxx_sk value for a dimension
	 */
	case WEB_OPEN_DATE:
		/* start from DATE_MINIMUM and move back by a per-key stagger */
		strtodt(&dSiteOpen, DATE_MINIMUM);
		res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
		if (WEB_IS_REPLACED(join_key)) /* this site is completely replaced */
		{
			if (WEB_IS_REPLACEMENT(join_key)) /* this is the second site */
			{
				/* the open date of the second site needs to align on a revision
				 * boundary */
				res += nOffset * nSiteDuration;
			}
		}
		break;
	case WEB_CLOSE_DATE:
		/* staggered open date plus nParam site durations */
		strtodt(&dSiteOpen, DATE_MINIMUM);
		res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
		res += pWS->nParam * nSiteDuration;
		if (WEB_IS_REPLACED(join_key)) /* this site is completely replaced */
		{
			if (!WEB_IS_REPLACEMENT(join_key)) /* this is the first site */
			{
				/* the close date of the first site needs to align on a revision
				 * boundary */
				res -= pWS->nParam * nSiteDuration / 2;
			}
		}
		break;
	case WEB_REC_START_DATE_ID:
		/* the stagger for a record start is based on (join_key - 1) */
		strtodt(&dSiteOpen, DATE_MINIMUM);
		res = dSiteOpen.julian - (((join_key - 1) * WEB_DATE_STAGGER) % nSiteDuration / 2);
		/*
		 * NOTE(review): no "* 5" factor here, unlike WEB_REC_END_DATE_ID and
		 * WP_REC_START_DATE_ID -- confirm this asymmetry is intended before
		 * changing it, since it affects the generated data.
		 */
		res += (join_key % pWS->nParam) * nSiteDuration;
		break;
	case WEB_REC_END_DATE_ID:
		strtodt(&dSiteOpen, DATE_MINIMUM);
		res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
		res += ((join_key + 1) % pWS->nParam) * nSiteDuration * 5 - 1;
		break;
	case WP_REC_START_DATE_ID:
		/* same shape as the web_site record dates, but keyed on web_page's nParam */
		strtodt(&dSiteOpen, DATE_MINIMUM);
		res = dSiteOpen.julian - (((join_key - 1) * WEB_DATE_STAGGER) % nSiteDuration / 2);
		res += (join_key % pWP->nParam) * nSiteDuration * 5;
		break;
	case WP_REC_END_DATE_ID:
		strtodt(&dSiteOpen, DATE_MINIMUM);
		res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
		/*
		 * NOTE(review): no "* 5" factor here, unlike WP_REC_START_DATE_ID
		 * and WEB_REC_END_DATE_ID -- confirm intent.
		 */
		res += ((join_key + 1) % pWP->nParam) * nSiteDuration - 1;
		break;
	case WP_CREATION_DATE_SK:
		/* page creation has to happen outside of the page window, to assure a
		 * constant number of pages, so it occurs in the gap between site
		 * creation and the site's actual activity. For sites that are replaced
		 * in the time span of the data set, this will depend on whether they
		 * are the first version or the second
		 */
		strtodt(&dSiteOpen, DATE_MINIMUM);
		/* derive the owning site from the page key */
		kSite = join_key / WEB_PAGES_PER_SITE + 1;
		res = dSiteOpen.julian - (((int)kSite * WEB_DATE_STAGGER) % nSiteDuration / 2);
		if (((int)kSite % pWP->nParam) == 0) /* this is a site that gets replaced */
		{
			/* pick a day between the staggered date and DATE_MINIMUM */
			genrand_integer(&nTemp, DIST_UNIFORM, (int)res, dSiteOpen.julian, 0, col);
			res = nTemp;
		}
		break;
	/*****************
	 * join key from here on is a date for which a valid site/page must be
	 * found the sk for a web page is a compound value: <site id><page id>
	 * and each component is a combination of the unique site or page and
	 * the active revision to it
	 */
	case WR_WEB_PAGE_SK:
	case WS_WEB_PAGE_SK:
		/*
		 * join_key is not consulted here: the result is the value returned
		 * by genrand_integer() for a uniform draw in 1..WEB_PAGES_PER_SITE
		 */
		res = genrand_integer(NULL, DIST_UNIFORM, 1, WEB_PAGES_PER_SITE, 0, col);
		break;
	}

	/* res is still -1 here if col matched none of the cases above */
	return (res);
}
| 360 | |
| 361 | /* |
| 362 | * Routine: mk_join(int from_tbl, int to_tbl, int join_count) |
| 363 | * Purpose: return a primary key for to_tbl, creating a join between from_tbl |
| 364 | *and to_tbl Algorithm: all joins are currently uniformly distributed. The |
| 365 | *calling convention allows for each join in the schema to be distributed |
| 366 | *differently Data Structures: |
| 367 | * |
| 368 | * Params: |
| 369 | * Returns: |
| 370 | * Called By: |
| 371 | * Calls: |
| 372 | * Assumptions: |
| 373 | * Side Effects: |
| 374 | * TODO: Relies on existing RNG code, which isn't really 64bit; will probably |
| 375 | *requre a rework of the genrand_xx routines |
| 376 | */ |
| 377 | ds_key_t mk_join(int from_col, int to_tbl, ds_key_t join_count) { |
| 378 | ds_key_t res; |
| 379 | int nYear, nFromTable = 0, nTableIndex = to_tbl; |
| 380 | tdef *pTdef; |
| 381 | |
| 382 | nFromTable = getTableFromColumn(from_col); |
| 383 | |
| 384 | /* |
| 385 | * if the table being joined to employs sparse keys, the join gets handled |
| 386 | * in sparse.c |
| 387 | */ |
| 388 | pTdef = getSimpleTdefsByNumber(to_tbl); |
| 389 | if (pTdef->flags & FL_SPARSE) { |
| 390 | if (pTdef->arSparseKeys == NULL) |
| 391 | initSparseKeys(to_tbl); |
| 392 | } |
| 393 | |
| 394 | switch (to_tbl) { |
| 395 | /* some tables require special handling */ |
| 396 | case CATALOG_PAGE: |
| 397 | return (cp_join(nFromTable, from_col, join_count)); |
| 398 | case DATET: |
| 399 | genrand_integer(&nYear, DIST_UNIFORM, YEAR_MINIMUM, YEAR_MAXIMUM, 0, from_col); |
| 400 | return (date_join(nFromTable, from_col, join_count, nYear)); |
| 401 | case TIME: |
| 402 | return (time_join(nFromTable, from_col, join_count)); |
| 403 | /* the rest of the tables use standard, uniform joins */ |
| 404 | default: |
| 405 | /* |
| 406 | * all TYPE2 tables (i.e., history keeping dimensions) need a special |
| 407 | * join algorithm |
| 408 | */ |
| 409 | if (pTdef->flags & FL_TYPE_2) |
| 410 | return (scd_join(nTableIndex, from_col, join_count)); |
| 411 | |
| 412 | if (pTdef->flags & FL_SPARSE) |
| 413 | return (randomSparseKey(nTableIndex, from_col)); |
| 414 | |
| 415 | genrand_key(&res, DIST_UNIFORM, (ds_key_t)1, get_rowcount(nTableIndex), (ds_key_t)0, from_col); |
| 416 | break; |
| 417 | } |
| 418 | |
| 419 | return ((ds_key_t)res); |
| 420 | } |
| 421 | |