text.c source code [DuckDB/third_party/dbgen/text.c]

/*
 * text.c --- pseudo text generator for use in DBGEN 2.0
 *
 * Defined Routines:
 *	dbg_text() -- select and translate a sentence form
 */
7
8	#ifdef TEXT_TEST
9	#define DECLARER
10	#endif /* TEST */
11
12	#include "config.h"
13
14	#include <stdlib.h>
/* <unistd.h> is pulled in everywhere except native (non-POSIX) Windows builds. */
#if (defined(_POSIX_) || !defined(WIN32)) /* Change for Windows NT */
#include <unistd.h>
#endif /* WIN32 */
18	#include <ctype.h>
19	#include <errno.h>
20	#include <limits.h>
21	#include <math.h>
22	#include <signal.h>
23	#include <stdio.h> /* */
24	#include <string.h>
25	#ifdef HP
26	#include <strings.h>
27	#endif
28	#if (defined(WIN32) && !defined(_POSIX_))
29	#include <process.h>
30	#pragma warning(disable : 4201)
31	#pragma warning(disable : 4214)
32	#pragma warning(disable : 4514)
33	#define WIN32_LEAN_AND_MEAN
34	#define NOATOM
35	#define NOGDICAPMASKS
36	#define NOMETAFILE
37	#define NOMINMAX
38	#define NOMSG
39	#define NOOPENFILE
40	#define NORASTEROPS
41	#define NOSCROLL
42	#define NOSOUND
43	#define NOSYSMETRICS
44	#define NOTEXTMETRIC
45	#define NOWH
46	#define NOCOMM
47	#define NOKANJI
48	#define NOMCX
49	#include <windows.h>
50	#pragma warning(default : 4201)
51	#pragma warning(default : 4214)
52	#endif
53
54	#define TEXT_POOL_SIZE (300 * 1024 * 1024) /* 300MiB */
55
56	#include "dss.h"
57	#include "dsstypes.h"
58
59	/*
60	* txt_vp() --
61	* generate a verb phrase by
62	* 1) selecting a verb phrase form
63	* 2) parsing it to select parts of speech
64	* 3) selecting appropriate words
65	* 4) adding punctuation as required
66	*
67	* Returns: length of generated phrase
68	* Called By: txt_sentence()
69	* Calls: pick_str()
70	*/
71	static int txt_vp(char dest, int* sd) {
72	char syntax[MAX_GRAMMAR_LEN + `1`], cptr, parse_target;
73	distribution *src;
74	int i, res = `0`;
75
76	pick_str(&vp, sd, &syntax[`0`]);
77	parse_target = syntax;
78	while ((cptr = strtok(parse_target, " ")) != NULL) {
79	src = NULL;
80	switch (*cptr) {
81	case `'D'`:
82	src = &adverbs;
83	break;
84	case `'V'`:
85	src = &verbs;
86	break;
87	case `'X'`:
88	src = &auxillaries;
89	break;
90	} / end of POS switch statement /
91	i = pick_str(src, sd, dest);
92	i = (int)strlen(DIST_MEMBER(src, i));
93	dest += i;
94	res += i;
95	if ((++cptr)) /* miscelaneous fillagree, like punctuation /
96	{
97	dest += `1`;
98	res += `1`;
99	dest = cptr;
100	}
101	*dest = `' '`;
102	dest++;
103	res++;
104	parse_target = NULL;
105	} / end of while loop /
106
107	return (res);
108	}
109
110	/*
111	* txt_np() --
112	* generate a noun phrase by
113	* 1) selecting a noun phrase form
114	* 2) parsing it to select parts of speech
115	* 3) selecting appropriate words
116	* 4) adding punctuation as required
117	*
118	* Returns: length of generated phrase
119	* Called By: txt_sentence()
120	* Calls: pick_str(),
121	*/
122	static int txt_np(char dest, int* sd) {
123	char syntax[MAX_GRAMMAR_LEN + `1`], cptr, parse_target;
124	distribution *src;
125	int i, res = `0`;
126
127	pick_str(&np, sd, &syntax[`0`]);
128	parse_target = syntax;
129	while ((cptr = strtok(parse_target, " ")) != NULL) {
130	src = NULL;
131	switch (*cptr) {
132	case `'A'`:
133	src = &articles;
134	break;
135	case `'J'`:
136	src = &adjectives;
137	break;
138	case `'D'`:
139	src = &adverbs;
140	break;
141	case `'N'`:
142	src = &nouns;
143	break;
144	} / end of POS switch statement /
145	i = pick_str(src, sd, dest);
146	i = (int)strlen(DIST_MEMBER(src, i));
147	dest += i;
148	res += i;
149	if ((++cptr)) /* miscelaneous fillagree, like punctuation /
150	{
151	dest = cptr;
152	dest += `1`;
153	res += `1`;
154	}
155	*dest = `' '`;
156	dest++;
157	res++;
158	parse_target = NULL;
159	} / end of while loop /
160
161	return (res);
162	}
163
164	/*
165	* txt_sentence() --
166	* generate a sentence by
167	* 1) selecting a sentence form
168	* 2) parsing it to select parts of speech or phrase types
169	* 3) selecting appropriate words
170	* 4) adding punctuation as required
171	*
172	* Returns: length of generated sentence
173	* Called By: dbg_text()
174	* Calls: pick_str(), txt_np(), txt_vp()
175	*/
176	static int txt_sentence(char dest, int* sd) {
177	char syntax[MAX_GRAMMAR_LEN + `1`], *cptr;
178	int i, res = `0`, len = `0`;
179
180	pick_str(&grammar, sd, syntax);
181	cptr = syntax;
182
183	next_token: / I hate goto's, but can't seem to have parent and child use strtok() /
184	while (cptr && cptr == `' '`)
185	cptr++;
186	if (*cptr == `'\0'`)
187	goto done;
188	switch (*cptr) {
189	case `'V'`:
190	len = txt_vp(dest, sd);
191	break;
192	case `'N'`:
193	len = txt_np(dest, sd);
194	break;
195	case `'P'`:
196	i = pick_str(&prepositions, sd, dest);
197	len = (int)strlen(DIST_MEMBER(&prepositions, i));
198	strcpy((dest + len), " the ");
199	len += `5`;
200	len += txt_np(dest + len, sd);
201	break;
202	case `'T'`:
203	i = pick_str(&terminators, sd, --dest); /terminators should abut previous word /
204	len = (int)strlen(DIST_MEMBER(&terminators, i));
205	break;
206	} / end of POS switch statement /
207	dest += len;
208	res += len;
209	cptr++;
210	if (cptr && cptr != `' '`) / miscelaneous fillagree, like punctuation /
211	{
212	dest += `1`;
213	res += `1`;
214	dest = cptr;
215	}
216	goto next_token;
217	done:
218	*dest = `'\0'`;
219	return (--res);
220	}
221
222	static char gen_text(char* dest, int* sd, distribution *s) {
223	long i = `0`;
224	DSS_HUGE j;
225
226	RANDOM(j, `1`, s->list[s->count - `1`].weight, sd);
227	while (s->list[i].weight < j)
228	i++;
229	char *src = s->list[i].text;
230	int ind = `0`;
231	while (src[ind]) {
232	dest[ind] = src[ind];
233	ind++;
234	}
235	dest[ind] = `' '`;
236	return dest + ind + `1`;
237	}
238
/*
 * Capacity of the weight -> text lookup tables below. gen_index() fills
 * slots 0 .. (weight of the distribution's last entry), so each constant
 * must be at least that weight for the matching distribution.
 */
#define NOUN_MAX_WEIGHT         340
#define ADJECTIVES_MAX_WEIGHT   289
#define ADVERBS_MAX_WEIGHT      262
#define AUXILLARIES_MAX_WEIGHT  18
#define VERBS_MAX_WEIGHT        174
#define PREPOSITIONS_MAX_WEIGHT 456

/* One slot per possible random draw, plus slot 0. */
static char *noun_index[NOUN_MAX_WEIGHT + 1];
static char *adjectives_index[ADJECTIVES_MAX_WEIGHT + 1];
static char *adverbs_index[ADVERBS_MAX_WEIGHT + 1];
static char *auxillaries_index[AUXILLARIES_MAX_WEIGHT + 1];
static char *verbs_index[VERBS_MAX_WEIGHT + 1];
static char *prepositions_index[PREPOSITIONS_MAX_WEIGHT + 1];
252
253	// generate a lookup table for weight -> str
254	static void gen_index(char *index, distribution s) {
255	for (size_t w = `0`; w <= s->list[s->count - `1`].weight; w++) {
256	long i = `0`;
257	while (s->list[i].weight < w)
258	i++;
259	index[w] = s->list[i].text;
260	}
261	}
262
263	static char gen_text_index(char* dest, int* sd, char *index, distribution s) {
264	long i = `0`;
265	DSS_HUGE j;
266
267	RANDOM(j, `1`, s->list[s->count - `1`].weight, sd);
268	char *src = index[j];
269	int ind = `0`;
270	while (src[ind]) {
271	dest[ind] = src[ind];
272	ind++;
273	}
274	dest[ind] = `' '`;
275	return dest + ind + `1`;
276	}
277
278	static char gen_vp(char* dest, int* sd) {
279	DSS_HUGE j;
280	RANDOM(j, `1`, vp.list[vp.count - `1`].weight, sd);
281	int index = `0`;
282	index += vp.list[`0`].weight < j;
283	index += vp.list[`1`].weight < j;
284	index += vp.list[`2`].weight < j;
285
286	if (index == `0`) {
287	dest = gen_text_index(dest, sd, verbs_index, &verbs);
288	} else if (index == `1`) {
289	dest = gen_text_index(dest, sd, auxillaries_index, &auxillaries);
290	dest = gen_text_index(dest, sd, verbs_index, &verbs);
291	} else if (index == `2`) {
292	dest = gen_text_index(dest, sd, verbs_index, &verbs);
293	dest = gen_text_index(dest, sd, adverbs_index, &adverbs);
294	} else {
295	dest = gen_text_index(dest, sd, auxillaries_index, &auxillaries);
296	dest = gen_text_index(dest, sd, verbs_index, &verbs);
297	dest = gen_text_index(dest, sd, adverbs_index, &adverbs);
298	}
299	return dest;
300	}
301
302	static char gen_np(char* dest, int* sd) {
303	DSS_HUGE j;
304	RANDOM(j, `1`, np.list[np.count - `1`].weight, sd);
305	int index = `0`;
306	index += np.list[`0`].weight < j;
307	index += np.list[`1`].weight < j;
308	index += np.list[`2`].weight < j;
309
310	if (index == `0`) {
311	dest = gen_text_index(dest, sd, noun_index, &nouns);
312	} else if (index == `1`) {
313	dest = gen_text_index(dest, sd, adjectives_index, &adjectives);
314	dest = gen_text_index(dest, sd, noun_index, &nouns);
315	} else if (index == `2`) {
316	dest = gen_text_index(dest, sd, adjectives_index, &adjectives);
317	dest[-`1`] = `','`;
318	*(dest++) = `' '`;
319	dest = gen_text_index(dest, sd, adjectives_index, &adjectives);
320	dest = gen_text_index(dest, sd, noun_index, &nouns);
321	} else {
322	dest = gen_text_index(dest, sd, adverbs_index, &adverbs);
323	dest = gen_text_index(dest, sd, adjectives_index, &adjectives);
324	dest = gen_text_index(dest, sd, noun_index, &nouns);
325	}
326	return dest;
327	}
328
329	static char gen_preposition(char* dest, int* sd) {
330	dest = gen_text_index(dest, sd, prepositions_index, &prepositions);
331	*(dest++) = `'t'`;
332	*(dest++) = `'h'`;
333	*(dest++) = `'e'`;
334	*(dest++) = `' '`;
335	return gen_np(dest, sd);
336	}
337
338	static char gen_terminator(char* dest, int* sd) {
339	dest = gen_text(--dest, sd, &terminators);
340	return dest - `1`;
341	}
342
343	static char gen_sentence(char* dest, int* sd) {
344	const char *cptr;
345	int i;
346
347	DSS_HUGE j;
348	RANDOM(j, `1`, grammar.list[grammar.count - `1`].weight, sd);
349	int index = `0`;
350	index += grammar.list[`0`].weight < j;
351	index += grammar.list[`1`].weight < j;
352	index += grammar.list[`2`].weight < j;
353	index += grammar.list[`3`].weight < j;
354	cptr = grammar.list[index].text;
355
356	if (index == `0`) {
357	dest = gen_np(dest, sd);
358	dest = gen_vp(dest, sd);
359	dest = gen_terminator(dest, sd);
360	} else if (index == `1`) {
361	dest = gen_np(dest, sd);
362	dest = gen_vp(dest, sd);
363	dest = gen_preposition(dest, sd);
364	dest = gen_terminator(dest, sd);
365	} else if (index == `2`) {
366	dest = gen_np(dest, sd);
367	dest = gen_vp(dest, sd);
368	dest = gen_np(dest, sd);
369	dest = gen_terminator(dest, sd);
370	} else if (index == `3`) {
371	dest = gen_np(dest, sd);
372	dest = gen_preposition(dest, sd);
373	dest = gen_vp(dest, sd);
374	dest = gen_np(dest, sd);
375	dest = gen_terminator(dest, sd);
376	} else {
377	dest = gen_np(dest, sd);
378	dest = gen_preposition(dest, sd);
379	dest = gen_vp(dest, sd);
380	dest = gen_preposition(dest, sd);
381	dest = gen_terminator(dest, sd);
382	}
383	*dest = `' '`;
384	return dest + `1`;
385	}
386
387	/*
388	* dbg_text() --
389	* produce ELIZA-like text of random, bounded length, truncating the last
390	* generated sentence as required
391	*/
392	void dbg_text(char tgt, int* min, int max, int sd) {
393	DSS_HUGE hgLength = `0`, hgOffset, wordlen = `0`, s_len, needed;
394	char sentence[MAX_SENT_LEN + `1`], *cp;
395	static char szTextPool[TEXT_POOL_SIZE + `1` + `100`];
396	static int bInit = `0`;
397
398	if (!bInit) {
399	gen_index(noun_index, &nouns);
400	gen_index(adjectives_index, &adjectives);
401	gen_index(adverbs_index, &adverbs);
402	gen_index(auxillaries_index, &auxillaries);
403	gen_index(verbs_index, &verbs);
404	gen_index(prepositions_index, &prepositions);
405
406	char *ptr = szTextPool;
407	char *endptr = szTextPool + TEXT_POOL_SIZE + `1`;
408	while (ptr < endptr) {
409	ptr = gen_sentence(ptr, `5`);
410	}
411	szTextPool[TEXT_POOL_SIZE] = `'\0'`;
412
413	bInit = `1`;
414	}
415
416	RANDOM(hgOffset, `0`, TEXT_POOL_SIZE - max, sd);
417	RANDOM(hgLength, min, max, sd);
418	strncpy(&tgt[`0`], &szTextPool[hgOffset], (int)hgLength);
419	tgt[hgLength] = `'\0'`;
420
421	return;
422	}
423
#ifdef TEXT_TEST
/*
 * Stand-alone driver, built only when TEXT_TEST is defined: loads the word
 * and grammar distributions, then prints generated text forever.
 */
tdef tdefs[1] = {NULL};
distribution nouns, verbs, adjectives, adverbs, auxillaries, terminators, articles, prepositions, grammar, np, vp;

int main() {
	char prattle[401]; /* 400 characters (the max requested below) plus the NUL */

	verbose = 1;

	read_dist(env_config(DIST_TAG, DIST_DFLT), "nouns", &nouns);
	read_dist(env_config(DIST_TAG, DIST_DFLT), "verbs", &verbs);
	read_dist(env_config(DIST_TAG, DIST_DFLT), "adjectives", &adjectives);
	read_dist(env_config(DIST_TAG, DIST_DFLT), "adverbs", &adverbs);
	read_dist(env_config(DIST_TAG, DIST_DFLT), "auxillaries", &auxillaries);
	read_dist(env_config(DIST_TAG, DIST_DFLT), "terminators", &terminators);
	read_dist(env_config(DIST_TAG, DIST_DFLT), "articles", &articles);
	read_dist(env_config(DIST_TAG, DIST_DFLT), "prepositions", &prepositions);
	read_dist(env_config(DIST_TAG, DIST_DFLT), "grammar", &grammar);
	read_dist(env_config(DIST_TAG, DIST_DFLT), "np", &np);
	read_dist(env_config(DIST_TAG, DIST_DFLT), "vp", &vp);

	/* runs until the process is interrupted */
	while (1) {
		dbg_text(&prattle[0], 300, 400, 0);
		printf("<%s>\n", prattle);
	}

	return (0);
}
#endif /* TEST */
453

Browse the source code of DuckDB/third_party/dbgen/text.c