1 | /* |
2 | * Legal Notice |
3 | * |
4 | * This document and associated source code (the "Work") is a part of a |
5 | * benchmark specification maintained by the TPC. |
6 | * |
7 | * The TPC reserves all right, title, and interest to the Work as provided |
8 | * under U.S. and international laws, including without limitation all patent |
9 | * and trademark rights therein. |
10 | * |
11 | * No Warranty |
12 | * |
13 | * 1.1 TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THE INFORMATION |
14 | * CONTAINED HEREIN IS PROVIDED "AS IS" AND WITH ALL FAULTS, AND THE |
15 | * AUTHORS AND DEVELOPERS OF THE WORK HEREBY DISCLAIM ALL OTHER |
16 | * WARRANTIES AND CONDITIONS, EITHER EXPRESS, IMPLIED OR STATUTORY, |
17 | * INCLUDING, BUT NOT LIMITED TO, ANY (IF ANY) IMPLIED WARRANTIES, |
18 | * DUTIES OR CONDITIONS OF MERCHANTABILITY, OF FITNESS FOR A PARTICULAR |
19 | * PURPOSE, OF ACCURACY OR COMPLETENESS OF RESPONSES, OF RESULTS, OF |
20 | * WORKMANLIKE EFFORT, OF LACK OF VIRUSES, AND OF LACK OF NEGLIGENCE. |
21 | * ALSO, THERE IS NO WARRANTY OR CONDITION OF TITLE, QUIET ENJOYMENT, |
22 | * QUIET POSSESSION, CORRESPONDENCE TO DESCRIPTION OR NON-INFRINGEMENT |
23 | * WITH REGARD TO THE WORK. |
24 | * 1.2 IN NO EVENT WILL ANY AUTHOR OR DEVELOPER OF THE WORK BE LIABLE TO |
25 | * ANY OTHER PARTY FOR ANY DAMAGES, INCLUDING BUT NOT LIMITED TO THE |
26 | * COST OF PROCURING SUBSTITUTE GOODS OR SERVICES, LOST PROFITS, LOSS |
27 | * OF USE, LOSS OF DATA, OR ANY INCIDENTAL, CONSEQUENTIAL, DIRECT, |
28 | * INDIRECT, OR SPECIAL DAMAGES WHETHER UNDER CONTRACT, TORT, WARRANTY, |
29 | * OR OTHERWISE, ARISING IN ANY WAY OUT OF THIS OR ANY OTHER AGREEMENT |
30 | * RELATING TO THE WORK, WHETHER OR NOT SUCH AUTHOR OR DEVELOPER HAD |
31 | * ADVANCE NOTICE OF THE POSSIBILITY OF SUCH DAMAGES. |
32 | * |
33 | * Contributors: |
34 | * Gradient Systems |
35 | */ |
36 | #include "config.h" |
37 | #include "porting.h" |
38 | #include <stdio.h> |
39 | #include "date.h" |
40 | #include "decimal.h" |
41 | #include "dist.h" |
42 | #include "constants.h" |
43 | #include "columns.h" |
44 | #include "genrand.h" |
45 | #include "tdefs.h" |
46 | #include "tables.h" |
47 | #include "build_support.h" |
48 | #include "tpcds.idx.h" |
49 | #include "scaling.h" |
50 | #include "w_web_sales.h" |
51 | #include "error_msg.h" |
52 | #include "tdefs.h" |
53 | #include "scd.h" |
54 | #include "r_params.h" |
55 | #include "sparse.h" |
56 | |
57 | static ds_key_t web_join(int col, ds_key_t join_key); |
58 | |
59 | /* |
60 | * Routine: date_join(int from_tbl, int join_count) |
61 | * Purpose: account for the different date-adjusted patterns in the data set |
62 | * Data Structures: |
63 | * |
64 | * Params: |
65 | * Returns: |
66 | * Called By: |
67 | * Calls: |
68 | * Assumptions: |
69 | * Side Effects: |
70 | * TODO: Relies on existing RNG code, which isn't really 64bit; will probably |
71 | * requre a rework of the genrand_xx routines |
72 | */ |
73 | static ds_key_t date_join(int from_tbl, int from_col, ds_key_t join_count, int nYear) { |
74 | int nDay, nTemp, nMin = -1, nMax = -1, nResult; |
75 | static int bInit = 0, jToday; |
76 | date_t TempDate; |
77 | |
78 | if (bInit == 0) { |
79 | strtodt(&TempDate, TODAYS_DATE); |
80 | jToday = dttoj(&TempDate); |
81 | bInit = 1; |
82 | } |
83 | |
84 | switch (from_tbl) { |
85 | case STORE_SALES: |
86 | case CATALOG_SALES: |
87 | case WEB_SALES: |
88 | pick_distribution(&nDay, "calendar" , 1, calendar_sales + is_leap(nYear), from_col); |
89 | break; |
90 | |
91 | /* |
92 | * returns are keyed to the sale date, with the lag between sale and return |
93 | * selected within a known range, based on sales channel |
94 | */ |
95 | case STORE_RETURNS: |
96 | nMin = SS_MIN_SHIP_DELAY; |
97 | nMax = SS_MAX_SHIP_DELAY; |
98 | case CATALOG_RETURNS: |
99 | if (nMin == -1) { |
100 | nMin = CS_MIN_SHIP_DELAY; |
101 | nMax = CS_MAX_SHIP_DELAY; |
102 | } |
103 | case WEB_RETURNS: |
104 | if (nMin == -1) { |
105 | nMin = WS_MIN_SHIP_DELAY; |
106 | nMax = WS_MAX_SHIP_DELAY; |
107 | } |
108 | genrand_integer(&nTemp, DIST_UNIFORM, nMin * 2, nMax * 2, 0, from_col); |
109 | return (join_count + nTemp); |
110 | break; |
111 | case WEB_SITE: |
112 | case WEB_PAGE: |
113 | return (web_join(from_col, join_count)); |
114 | default: |
115 | pick_distribution(&nDay, "calendar" , 1, 1 + is_leap(nYear), from_col); |
116 | break; |
117 | } |
118 | |
119 | TempDate.year = nYear; |
120 | TempDate.month = 1; |
121 | TempDate.day = 1; |
122 | |
123 | nResult = dttoj(&TempDate) + nDay; |
124 | |
125 | return ((ds_key_t)(nResult > jToday) ? -1 : nResult); |
126 | } |
127 | |
128 | /* |
129 | * Routine: time_join(int from_tbl, int join_count) |
130 | * Purpose: create joins that are time-skewed |
131 | * Data Structures: |
132 | * |
133 | * Params: |
134 | * Returns: |
135 | * Called By: |
136 | * Calls: |
137 | * Assumptions: |
138 | * Side Effects: |
139 | * TODO: Relies on existing RNG code, which isn't really 64bit; will probably |
140 | * requre a rework of the genrand_xx routines |
141 | */ |
142 | static ds_key_t time_join(int to_tbl, int to_col, ds_key_t join_count) { |
143 | int hour, secs; |
144 | |
145 | switch (to_tbl) { |
146 | case STORE_SALES: |
147 | case STORE_RETURNS: |
148 | pick_distribution(&hour, "hours" , 1, 2, to_col); |
149 | break; |
150 | case CATALOG_SALES: |
151 | case WEB_SALES: |
152 | case CATALOG_RETURNS: |
153 | case WEB_RETURNS: |
154 | pick_distribution(&hour, "hours" , 1, 3, to_col); |
155 | break; |
156 | default: |
157 | pick_distribution(&hour, "hours" , 1, 1, to_col); |
158 | break; |
159 | } |
160 | genrand_integer(&secs, DIST_UNIFORM, 0, 3599, 0, to_col); |
161 | |
162 | return ((ds_key_t)(hour * 3600 + secs)); |
163 | } |
164 | |
165 | /* |
166 | * Routine: cp_join(int from_tbl, int join_count) |
167 | * Purpose: create joins to catalog_page |
168 | * Data Structures: |
169 | * |
170 | * Params: |
171 | * Returns: |
172 | * Called By: |
173 | * Calls: |
174 | * Assumptions: |
175 | * Side Effects: |
176 | * TODO: None |
177 | */ |
178 | static ds_key_t cp_join(int tbl, int col, ds_key_t jDate) { |
179 | ds_key_t res; |
180 | static int init = 0, nPagePerCatalog; |
181 | int nType, nCount, nOffset, nPage; |
182 | static date_t dTemp; |
183 | char *szTemp; |
184 | |
185 | if (!init) { |
186 | nPagePerCatalog = ((int)get_rowcount(CATALOG_PAGE) / CP_CATALOGS_PER_YEAR) / (YEAR_MAXIMUM - YEAR_MINIMUM + 2); |
187 | strtodt(&dTemp, DATA_START_DATE); |
188 | init = 1; |
189 | } |
190 | |
191 | nType = pick_distribution(&szTemp, "catalog_page_type" , 1, 2, col); |
192 | genrand_integer(&nPage, DIST_UNIFORM, 1, nPagePerCatalog, 0, col); |
193 | nOffset = (int)jDate - dTemp.julian - 1; |
194 | nCount = (nOffset / 365) * CP_CATALOGS_PER_YEAR; |
195 | nOffset %= 365; |
196 | |
197 | switch (nType) { |
198 | case 1: /* bi-annual */ |
199 | if (nOffset > 183) |
200 | nCount += 1; |
201 | break; |
202 | case 2: /* quarterly */ |
203 | nCount += (nOffset / 91); |
204 | break; |
205 | case 3: /* monthly */ |
206 | nCount += (nOffset / 31); |
207 | break; |
208 | } |
209 | |
210 | res = CP_SK(nCount, nPagePerCatalog, nPage); |
211 | |
212 | return (res); |
213 | } |
214 | /* |
215 | * Routine: |
216 | * Purpose: |
217 | * Algorithm: |
218 | * Data Structures: |
219 | * |
220 | * Params: |
221 | * Returns: |
222 | * Called By: |
223 | * Calls: |
224 | * Assumptions: |
225 | * Side Effects: |
226 | * TODO: None |
227 | */ |
228 | ds_key_t getCatalogNumberFromPage(ds_key_t kPageNumber) { |
229 | static int bInit = 0; |
230 | static int nPagePerCatalog; |
231 | |
232 | if (!bInit) { |
233 | nPagePerCatalog = ((int)get_rowcount(CATALOG_PAGE) / CP_CATALOGS_PER_YEAR) / (YEAR_MAXIMUM - YEAR_MINIMUM + 2); |
234 | bInit = 1; |
235 | } |
236 | |
237 | return (kPageNumber / nPagePerCatalog); |
238 | } |
239 | |
240 | /* |
241 | * Routine: web_join(int col, ds_key_t join_key) |
242 | * Purpose: create joins to web_site/web_page. These need to be handled |
243 | *together, since the date of transaction must fit within the lifetime of a |
244 | *particular page, which must fit within the lifetime of a particular site Data |
245 | *Structures: |
246 | * |
247 | * Params: |
248 | * join_key is one of two things: |
249 | * 1. the xxx_sk for a particular row in the dimension for which we need |
250 | *appropriate dates |
251 | * 2. a julian date for which we need to pick a valid xxx_sk value |
252 | * Returns: |
253 | * Called By: |
254 | * Calls: |
255 | * Assumptions: |
256 | * Side Effects: |
257 | * TODO: None |
258 | */ |
259 | static ds_key_t web_join(int col, ds_key_t join_key) { |
260 | ds_key_t res = -1, kSite; |
261 | static int init = 0, nConcurrentSites, nSiteDuration, nOffset; |
262 | static date_t dSiteOpen, /* open/close dates for current web site */ |
263 | dSiteClose; |
264 | int nTemp; |
265 | tdef *pWS = getSimpleTdefsByNumber(WEB_SITE); |
266 | tdef *pWP = getSimpleTdefsByNumber(WEB_PAGE); |
267 | |
268 | if (!init) { |
269 | strtodt(&dSiteClose, WEB_END_DATE); |
270 | nSiteDuration = dSiteClose.julian; |
271 | nConcurrentSites = (int)get_rowcount(CONCURRENT_WEB_SITES); |
272 | strtodt(&dSiteOpen, WEB_START_DATE); |
273 | nSiteDuration -= dSiteOpen.julian; |
274 | nSiteDuration *= nConcurrentSites; |
275 | nOffset = (dSiteClose.julian - dSiteOpen.julian) / (2 * nSiteDuration); |
276 | init = 1; |
277 | } |
278 | |
279 | switch (col) { |
280 | /************** |
281 | * join_key is the xxx_sk value for a dimension |
282 | */ |
283 | case WEB_OPEN_DATE: |
284 | strtodt(&dSiteOpen, DATE_MINIMUM); |
285 | res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2); |
286 | if (WEB_IS_REPLACED(join_key)) /* this site is completely replaced */ |
287 | { |
288 | if (WEB_IS_REPLACEMENT(join_key)) /* this is the second site */ |
289 | { |
290 | /* the open date of the second site needs to align on a revision |
291 | * boundary */ |
292 | res += nOffset * nSiteDuration; |
293 | } |
294 | } |
295 | break; |
296 | case WEB_CLOSE_DATE: |
297 | strtodt(&dSiteOpen, DATE_MINIMUM); |
298 | res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2); |
299 | res += pWS->nParam * nSiteDuration; |
300 | if (WEB_IS_REPLACED(join_key)) /* this site is completely replaced */ |
301 | { |
302 | if (!WEB_IS_REPLACEMENT(join_key)) /* this is the first site */ |
303 | { |
304 | /* the close date of the first site needs to align on a revision |
305 | * boundary */ |
306 | res -= pWS->nParam * nSiteDuration / 2; |
307 | } |
308 | } |
309 | break; |
310 | case WEB_REC_START_DATE_ID: |
311 | strtodt(&dSiteOpen, DATE_MINIMUM); |
312 | res = dSiteOpen.julian - (((join_key - 1) * WEB_DATE_STAGGER) % nSiteDuration / 2); |
313 | res += (join_key % pWS->nParam) * nSiteDuration; |
314 | break; |
315 | case WEB_REC_END_DATE_ID: |
316 | strtodt(&dSiteOpen, DATE_MINIMUM); |
317 | res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2); |
318 | res += ((join_key + 1) % pWS->nParam) * nSiteDuration * 5 - 1; |
319 | break; |
320 | case WP_REC_START_DATE_ID: |
321 | strtodt(&dSiteOpen, DATE_MINIMUM); |
322 | res = dSiteOpen.julian - (((join_key - 1) * WEB_DATE_STAGGER) % nSiteDuration / 2); |
323 | res += (join_key % pWP->nParam) * nSiteDuration * 5; |
324 | break; |
325 | case WP_REC_END_DATE_ID: |
326 | strtodt(&dSiteOpen, DATE_MINIMUM); |
327 | res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2); |
328 | res += ((join_key + 1) % pWP->nParam) * nSiteDuration - 1; |
329 | break; |
330 | case WP_CREATION_DATE_SK: |
331 | /* page creation has to happen outside of the page window, to assure a |
332 | * constant number of pages, so it occurs in the gap between site |
333 | * creation and the site's actual activity. For sites that are replaced |
334 | * in the time span of the data set, this will depend on whether they |
335 | * are the first version or the second |
336 | */ |
337 | strtodt(&dSiteOpen, DATE_MINIMUM); |
338 | kSite = join_key / WEB_PAGES_PER_SITE + 1; |
339 | res = dSiteOpen.julian - (((int)kSite * WEB_DATE_STAGGER) % nSiteDuration / 2); |
340 | if (((int)kSite % pWP->nParam) == 0) /* this is a site that gets replaced */ |
341 | { |
342 | genrand_integer(&nTemp, DIST_UNIFORM, (int)res, dSiteOpen.julian, 0, col); |
343 | res = nTemp; |
344 | } |
345 | break; |
346 | /***************** |
347 | * join key from here on is a date for which a valid site/page must be |
348 | * found the sk for a web page is a compound value: <site id><page id> |
349 | * and each component is a combination of the unique site or page and |
350 | * the active revision to it |
351 | */ |
352 | case WR_WEB_PAGE_SK: |
353 | case WS_WEB_PAGE_SK: |
354 | res = genrand_integer(NULL, DIST_UNIFORM, 1, WEB_PAGES_PER_SITE, 0, col); |
355 | break; |
356 | } |
357 | |
358 | return (res); |
359 | } |
360 | |
361 | /* |
362 | * Routine: mk_join(int from_tbl, int to_tbl, int join_count) |
363 | * Purpose: return a primary key for to_tbl, creating a join between from_tbl |
364 | *and to_tbl Algorithm: all joins are currently uniformly distributed. The |
365 | *calling convention allows for each join in the schema to be distributed |
366 | *differently Data Structures: |
367 | * |
368 | * Params: |
369 | * Returns: |
370 | * Called By: |
371 | * Calls: |
372 | * Assumptions: |
373 | * Side Effects: |
374 | * TODO: Relies on existing RNG code, which isn't really 64bit; will probably |
375 | *requre a rework of the genrand_xx routines |
376 | */ |
377 | ds_key_t mk_join(int from_col, int to_tbl, ds_key_t join_count) { |
378 | ds_key_t res; |
379 | int nYear, nFromTable = 0, nTableIndex = to_tbl; |
380 | tdef *pTdef; |
381 | |
382 | nFromTable = getTableFromColumn(from_col); |
383 | |
384 | /* |
385 | * if the table being joined to employs sparse keys, the join gets handled |
386 | * in sparse.c |
387 | */ |
388 | pTdef = getSimpleTdefsByNumber(to_tbl); |
389 | if (pTdef->flags & FL_SPARSE) { |
390 | if (pTdef->arSparseKeys == NULL) |
391 | initSparseKeys(to_tbl); |
392 | } |
393 | |
394 | switch (to_tbl) { |
395 | /* some tables require special handling */ |
396 | case CATALOG_PAGE: |
397 | return (cp_join(nFromTable, from_col, join_count)); |
398 | case DATET: |
399 | genrand_integer(&nYear, DIST_UNIFORM, YEAR_MINIMUM, YEAR_MAXIMUM, 0, from_col); |
400 | return (date_join(nFromTable, from_col, join_count, nYear)); |
401 | case TIME: |
402 | return (time_join(nFromTable, from_col, join_count)); |
403 | /* the rest of the tables use standard, uniform joins */ |
404 | default: |
405 | /* |
406 | * all TYPE2 tables (i.e., history keeping dimensions) need a special |
407 | * join algorithm |
408 | */ |
409 | if (pTdef->flags & FL_TYPE_2) |
410 | return (scd_join(nTableIndex, from_col, join_count)); |
411 | |
412 | if (pTdef->flags & FL_SPARSE) |
413 | return (randomSparseKey(nTableIndex, from_col)); |
414 | |
415 | genrand_key(&res, DIST_UNIFORM, (ds_key_t)1, get_rowcount(nTableIndex), (ds_key_t)0, from_col); |
416 | break; |
417 | } |
418 | |
419 | return ((ds_key_t)res); |
420 | } |
421 | |