1 | /* |
2 | * $Id: text.c,v 1.6 2006/07/31 17:23:09 jms Exp $ |
3 | * |
4 | * Revision History |
5 | * =================== |
6 | * $Log: text.c,v $ |
7 | * Revision 1.6 2006/07/31 17:23:09 jms |
8 | * fix to parallelism problem |
9 | * |
10 | * Revision 1.5 2006/05/18 23:50:00 jms |
11 | * commit text generation change with larger buffer |
12 | * |
13 | * Revision 1.4 2006/05/16 16:26:51 jms |
14 | * remove calls to FAKE_V_STR |
15 | * |
16 | * Revision 1.3 2006/05/16 15:55:58 jms |
17 | * first cut to Meikel |
18 | * |
19 | * Revision 1.2 2005/01/03 20:08:59 jms |
20 | * change line terminations |
21 | * |
22 | * Revision 1.1.1.1 2004/11/24 23:31:47 jms |
23 | * re-establish external server |
24 | * |
25 | * Revision 1.1.1.1 2003/08/07 17:58:34 jms |
26 | * recreation after CVS crash |
27 | * |
28 | * Revision 1.2 2003/08/07 17:58:34 jms |
29 | * Convery RNG to 64bit space as preparation for new large scale RNG |
30 | * |
31 | * Revision 1.1.1.1 2003/04/03 18:54:21 jms |
32 | * initial checkin |
33 | * |
34 | * |
35 | */ |
/*
 * text.c --- pseudo text generator for use in DBGEN 2.0
 *
 * Defined Routines:
 * dbg_text() -- select and translate a sentence form
 */
42 | |
43 | #ifdef TEXT_TEST |
44 | #define DECLARER |
45 | #endif /* TEST */ |
46 | |
47 | #include "config.h" |
48 | #include <stdlib.h> |
49 | #if (defined(_POSIX_)||!defined(WIN32)) /* Change for Windows NT */ |
50 | #include <unistd.h> |
51 | #include <sys/wait.h> |
52 | #endif /* WIN32 */ |
53 | #include <stdio.h> /* */ |
54 | #include <limits.h> |
55 | #include <math.h> |
56 | #include <ctype.h> |
57 | #include <signal.h> |
58 | #include <string.h> |
59 | #include <errno.h> |
60 | #ifdef HP |
61 | #include <strings.h> |
62 | #endif |
63 | #if (defined(WIN32)&&!defined(_POSIX_)) |
64 | #include <process.h> |
65 | #pragma warning(disable:4201) |
66 | #pragma warning(disable:4214) |
67 | #pragma warning(disable:4514) |
68 | #define WIN32_LEAN_AND_MEAN |
69 | #define NOATOM |
70 | #define NOGDICAPMASKS |
71 | #define NOMETAFILE |
72 | #define NOMINMAX |
73 | #define NOMSG |
74 | #define NOOPENFILE |
75 | #define NORASTEROPS |
76 | #define NOSCROLL |
77 | #define NOSOUND |
78 | #define NOSYSMETRICS |
79 | #define NOTEXTMETRIC |
80 | #define NOWH |
81 | #define NOCOMM |
82 | #define NOKANJI |
83 | #define NOMCX |
84 | #include <windows.h> |
85 | #pragma warning(default:4201) |
86 | #pragma warning(default:4214) |
87 | #endif |
88 | |
89 | #define TEXT_POOL_SIZE (300 * 1024 * 1024) /* 300MiB */ |
90 | |
91 | #include "dss.h" |
92 | #include "dsstypes.h" |
93 | |
94 | /* |
95 | * txt_vp() -- |
96 | * generate a verb phrase by |
97 | * 1) selecting a verb phrase form |
98 | * 2) parsing it to select parts of speech |
99 | * 3) selecting appropriate words |
100 | * 4) adding punctuation as required |
101 | * |
102 | * Returns: length of generated phrase |
103 | * Called By: txt_sentence() |
104 | * Calls: pick_str() |
105 | */ |
106 | static int |
107 | txt_vp(char *dest, int sd) |
108 | { |
109 | char syntax[MAX_GRAMMAR_LEN + 1], |
110 | *cptr, |
111 | *parse_target; |
112 | distribution *src; |
113 | int i, |
114 | res = 0; |
115 | |
116 | |
117 | pick_str(&vp, sd, &syntax[0]); |
118 | parse_target = syntax; |
119 | while ((cptr = strtok(parse_target, " " )) != NULL) |
120 | { |
121 | src = NULL; |
122 | switch(*cptr) |
123 | { |
124 | case 'D': |
125 | src = &adverbs; |
126 | break; |
127 | case 'V': |
128 | src = &verbs; |
129 | break; |
130 | case 'X': |
131 | src = &auxillaries; |
132 | break; |
133 | } /* end of POS switch statement */ |
134 | i = pick_str(src, sd, dest); |
135 | i = (int)strlen(DIST_MEMBER(src, i)); |
136 | dest += i; |
137 | res += i; |
138 | if (*(++cptr)) /* miscelaneous fillagree, like punctuation */ |
139 | { |
140 | dest += 1; |
141 | res += 1; |
142 | *dest = *cptr; |
143 | } |
144 | *dest = ' '; |
145 | dest++; |
146 | res++; |
147 | parse_target = NULL; |
148 | } /* end of while loop */ |
149 | |
150 | return(res); |
151 | } |
152 | |
153 | /* |
154 | * txt_np() -- |
155 | * generate a noun phrase by |
156 | * 1) selecting a noun phrase form |
157 | * 2) parsing it to select parts of speech |
158 | * 3) selecting appropriate words |
159 | * 4) adding punctuation as required |
160 | * |
161 | * Returns: length of generated phrase |
162 | * Called By: txt_sentence() |
163 | * Calls: pick_str(), |
164 | */ |
165 | static int |
166 | txt_np(char *dest, int sd) |
167 | { |
168 | char syntax[MAX_GRAMMAR_LEN + 1], |
169 | *cptr, |
170 | *parse_target; |
171 | distribution *src; |
172 | int i, |
173 | res = 0; |
174 | |
175 | |
176 | pick_str(&np, sd, &syntax[0]); |
177 | parse_target = syntax; |
178 | while ((cptr = strtok(parse_target, " " )) != NULL) |
179 | { |
180 | src = NULL; |
181 | switch(*cptr) |
182 | { |
183 | case 'A': |
184 | src = &articles; |
185 | break; |
186 | case 'J': |
187 | src = &adjectives; |
188 | break; |
189 | case 'D': |
190 | src = &adverbs; |
191 | break; |
192 | case 'N': |
193 | src = &nouns; |
194 | break; |
195 | } /* end of POS switch statement */ |
196 | i = pick_str(src, sd, dest); |
197 | i = (int)strlen(DIST_MEMBER(src, i)); |
198 | dest += i; |
199 | res += i; |
200 | if (*(++cptr)) /* miscelaneous fillagree, like punctuation */ |
201 | { |
202 | *dest = *cptr; |
203 | dest += 1; |
204 | res += 1; |
205 | } |
206 | *dest = ' '; |
207 | dest++; |
208 | res++; |
209 | parse_target = NULL; |
210 | } /* end of while loop */ |
211 | |
212 | return(res); |
213 | } |
214 | |
215 | /* |
216 | * txt_sentence() -- |
217 | * generate a sentence by |
218 | * 1) selecting a sentence form |
219 | * 2) parsing it to select parts of speech or phrase types |
220 | * 3) selecting appropriate words |
221 | * 4) adding punctuation as required |
222 | * |
223 | * Returns: length of generated sentence |
224 | * Called By: dbg_text() |
225 | * Calls: pick_str(), txt_np(), txt_vp() |
226 | */ |
227 | static int |
228 | txt_sentence(char *dest, int sd) |
229 | { |
230 | char syntax[MAX_GRAMMAR_LEN + 1], |
231 | *cptr; |
232 | int i, |
233 | res = 0, |
234 | len = 0; |
235 | |
236 | |
237 | pick_str(&grammar, sd, syntax); |
238 | cptr = syntax; |
239 | |
240 | next_token: /* I hate goto's, but can't seem to have parent and child use strtok() */ |
241 | while (*cptr && *cptr == ' ') |
242 | cptr++; |
243 | if (*cptr == '\0') |
244 | goto done; |
245 | switch(*cptr) |
246 | { |
247 | case 'V': |
248 | len = txt_vp(dest, sd); |
249 | break; |
250 | case 'N': |
251 | len = txt_np(dest, sd); |
252 | break; |
253 | case 'P': |
254 | i = pick_str(&prepositions, sd, dest); |
255 | len = (int)strlen(DIST_MEMBER(&prepositions, i)); |
256 | strcpy((dest + len), " the " ); |
257 | len += 5; |
258 | len += txt_np(dest + len, sd); |
259 | break; |
260 | case 'T': |
261 | i = pick_str(&terminators, sd, --dest); /*terminators should abut previous word */ |
262 | len = (int)strlen(DIST_MEMBER(&terminators, i)); |
263 | break; |
264 | } /* end of POS switch statement */ |
265 | dest += len; |
266 | res += len; |
267 | cptr++; |
268 | if (*cptr && *cptr != ' ') /* miscelaneous fillagree, like punctuation */ |
269 | { |
270 | dest += 1; |
271 | res += 1; |
272 | *dest = *cptr; |
273 | } |
274 | goto next_token; |
275 | done: |
276 | *dest = '\0'; |
277 | return(--res); |
278 | } |
279 | |
280 | /* |
281 | * dbg_text() -- |
282 | * produce ELIZA-like text of random, bounded length, truncating the last |
283 | * generated sentence as required |
284 | */ |
285 | void |
286 | dbg_text(char *tgt, int min, int max, int sd) |
287 | { |
288 | DSS_HUGE hgLength = 0, |
289 | hgOffset, |
290 | wordlen = 0, |
291 | s_len, |
292 | needed; |
293 | char sentence[MAX_SENT_LEN + 1], |
294 | *cp; |
295 | static char szTextPool[TEXT_POOL_SIZE + 1]; |
296 | static int bInit = 0; |
297 | int nLifeNoise = 0; |
298 | |
299 | if (!bInit) |
300 | { |
301 | cp = &szTextPool[0]; |
302 | if (verbose > 0) |
303 | fprintf(stderr, "\nPreloading text ... " ); |
304 | |
305 | while (wordlen < TEXT_POOL_SIZE) |
306 | { |
307 | if ((verbose > 0) && (wordlen > nLifeNoise)) |
308 | { |
309 | nLifeNoise += 200000; |
310 | fprintf(stderr, "%3.0f%%\b\b\b\b" , (100.0 * wordlen)/TEXT_POOL_SIZE); |
311 | } |
312 | |
313 | s_len = txt_sentence(sentence, 5); |
314 | if ( s_len < 0) |
315 | INTERNAL_ERROR("Bad sentence formation" ); |
316 | needed = TEXT_POOL_SIZE - wordlen; |
317 | if (needed >= (s_len + 1)) /* need the entire sentence */ |
318 | { |
319 | strcpy(cp, sentence); |
320 | cp += s_len; |
321 | wordlen += s_len + 1; |
322 | *(cp++) = ' '; |
323 | } |
324 | else /* chop the new sentence off to match the length target */ |
325 | { |
326 | sentence[needed] = '\0'; |
327 | strcpy(cp, sentence); |
328 | wordlen += needed; |
329 | cp += needed; |
330 | } |
331 | } |
332 | *cp = '\0'; |
333 | bInit = 1; |
334 | if (verbose > 0) |
335 | fprintf(stderr, "\n" ); |
336 | } |
337 | |
338 | RANDOM(hgOffset, 0, TEXT_POOL_SIZE - max, sd); |
339 | RANDOM(hgLength, min, max, sd); |
340 | strncpy(&tgt[0], &szTextPool[hgOffset], (int)hgLength); |
341 | tgt[hgLength] = '\0'; |
342 | |
343 | return; |
344 | } |
345 | |
346 | #ifdef TEXT_TEST |
347 | tdef tdefs[1] = { NULL }; |
348 | distribution nouns, |
349 | verbs, |
350 | adjectives, |
351 | adverbs, |
352 | auxillaries, |
353 | terminators, |
354 | articles, |
355 | prepositions, |
356 | grammar, |
357 | np, |
358 | vp; |
359 | |
360 | main() |
361 | { |
362 | char prattle[401]; |
363 | |
364 | verbose = 1; |
365 | |
366 | read_dist (env_config (DIST_TAG, DIST_DFLT), "nouns" , &nouns); |
367 | read_dist (env_config (DIST_TAG, DIST_DFLT), "verbs" , &verbs); |
368 | read_dist (env_config (DIST_TAG, DIST_DFLT), "adjectives" , &adjectives); |
369 | read_dist (env_config (DIST_TAG, DIST_DFLT), "adverbs" , &adverbs); |
370 | read_dist (env_config (DIST_TAG, DIST_DFLT), "auxillaries" , &auxillaries); |
371 | read_dist (env_config (DIST_TAG, DIST_DFLT), "terminators" , &terminators); |
372 | read_dist (env_config (DIST_TAG, DIST_DFLT), "articles" , &articles); |
373 | read_dist (env_config (DIST_TAG, DIST_DFLT), "prepositions" , &prepositions); |
374 | read_dist (env_config (DIST_TAG, DIST_DFLT), "grammar" , &grammar); |
375 | read_dist (env_config (DIST_TAG, DIST_DFLT), "np" , &np); |
376 | read_dist (env_config (DIST_TAG, DIST_DFLT), "vp" , &vp); |
377 | |
378 | while (1) |
379 | { |
380 | dbg_text(&prattle[0], 300, 400, 0); |
381 | printf("<%s>\n" , prattle); |
382 | } |
383 | |
384 | return(0); |
385 | } |
386 | #endif /* TEST */ |
387 | |