| 1 | /* |
| 2 | * $Id: text.c,v 1.6 2006/07/31 17:23:09 jms Exp $ |
| 3 | * |
| 4 | * Revision History |
| 5 | * =================== |
| 6 | * $Log: text.c,v $ |
| 7 | * Revision 1.6 2006/07/31 17:23:09 jms |
| 8 | * fix to parallelism problem |
| 9 | * |
| 10 | * Revision 1.5 2006/05/18 23:50:00 jms |
| 11 | * commit text generation change with larger buffer |
| 12 | * |
| 13 | * Revision 1.4 2006/05/16 16:26:51 jms |
| 14 | * remove calls to FAKE_V_STR |
| 15 | * |
| 16 | * Revision 1.3 2006/05/16 15:55:58 jms |
| 17 | * first cut to Meikel |
| 18 | * |
| 19 | * Revision 1.2 2005/01/03 20:08:59 jms |
| 20 | * change line terminations |
| 21 | * |
| 22 | * Revision 1.1.1.1 2004/11/24 23:31:47 jms |
| 23 | * re-establish external server |
| 24 | * |
| 25 | * Revision 1.1.1.1 2003/08/07 17:58:34 jms |
| 26 | * recreation after CVS crash |
| 27 | * |
| 28 | * Revision 1.2 2003/08/07 17:58:34 jms |
| 29 | * Convery RNG to 64bit space as preparation for new large scale RNG |
| 30 | * |
| 31 | * Revision 1.1.1.1 2003/04/03 18:54:21 jms |
| 32 | * initial checkin |
| 33 | * |
| 34 | * |
| 35 | */ |
| 36 | /* |
| 37 | * text.c --- pseudo text generator for use in DBGEN 2.0 |
| 38 | * |
| 39 | * Defined Routines: |
| 40 | * dbg_text() -- select and translate a sentence form |
| 41 | */ |
| 42 | |
| 43 | #ifdef TEXT_TEST |
| 44 | #define DECLARER |
| 45 | #endif /* TEST */ |
| 46 | |
| 47 | #include "config.h" |
| 48 | #include <stdlib.h> |
| 49 | #if (defined(_POSIX_)||!defined(WIN32)) /* Change for Windows NT */ |
| 50 | #include <unistd.h> |
| 51 | #include <sys/wait.h> |
| 52 | #endif /* WIN32 */ |
| 53 | #include <stdio.h> /* */ |
| 54 | #include <limits.h> |
| 55 | #include <math.h> |
| 56 | #include <ctype.h> |
| 57 | #include <signal.h> |
| 58 | #include <string.h> |
| 59 | #include <errno.h> |
| 60 | #ifdef HP |
| 61 | #include <strings.h> |
| 62 | #endif |
| 63 | #if (defined(WIN32)&&!defined(_POSIX_)) |
| 64 | #include <process.h> |
| 65 | #pragma warning(disable:4201) |
| 66 | #pragma warning(disable:4214) |
| 67 | #pragma warning(disable:4514) |
| 68 | #define WIN32_LEAN_AND_MEAN |
| 69 | #define NOATOM |
| 70 | #define NOGDICAPMASKS |
| 71 | #define NOMETAFILE |
| 72 | #define NOMINMAX |
| 73 | #define NOMSG |
| 74 | #define NOOPENFILE |
| 75 | #define NORASTEROPS |
| 76 | #define NOSCROLL |
| 77 | #define NOSOUND |
| 78 | #define NOSYSMETRICS |
| 79 | #define NOTEXTMETRIC |
| 80 | #define NOWH |
| 81 | #define NOCOMM |
| 82 | #define NOKANJI |
| 83 | #define NOMCX |
| 84 | #include <windows.h> |
| 85 | #pragma warning(default:4201) |
| 86 | #pragma warning(default:4214) |
| 87 | #endif |
| 88 | |
| 89 | #define TEXT_POOL_SIZE (300 * 1024 * 1024) /* 300MiB */ |
| 90 | |
| 91 | #include "dss.h" |
| 92 | #include "dsstypes.h" |
| 93 | |
| 94 | /* |
| 95 | * txt_vp() -- |
| 96 | * generate a verb phrase by |
| 97 | * 1) selecting a verb phrase form |
| 98 | * 2) parsing it to select parts of speech |
| 99 | * 3) selecting appropriate words |
| 100 | * 4) adding punctuation as required |
| 101 | * |
| 102 | * Returns: length of generated phrase |
| 103 | * Called By: txt_sentence() |
| 104 | * Calls: pick_str() |
| 105 | */ |
| 106 | static int |
| 107 | txt_vp(char *dest, int sd) |
| 108 | { |
| 109 | char syntax[MAX_GRAMMAR_LEN + 1], |
| 110 | *cptr, |
| 111 | *parse_target; |
| 112 | distribution *src; |
| 113 | int i, |
| 114 | res = 0; |
| 115 | |
| 116 | |
| 117 | pick_str(&vp, sd, &syntax[0]); |
| 118 | parse_target = syntax; |
| 119 | while ((cptr = strtok(parse_target, " " )) != NULL) |
| 120 | { |
| 121 | src = NULL; |
| 122 | switch(*cptr) |
| 123 | { |
| 124 | case 'D': |
| 125 | src = &adverbs; |
| 126 | break; |
| 127 | case 'V': |
| 128 | src = &verbs; |
| 129 | break; |
| 130 | case 'X': |
| 131 | src = &auxillaries; |
| 132 | break; |
| 133 | } /* end of POS switch statement */ |
| 134 | i = pick_str(src, sd, dest); |
| 135 | i = (int)strlen(DIST_MEMBER(src, i)); |
| 136 | dest += i; |
| 137 | res += i; |
| 138 | if (*(++cptr)) /* miscelaneous fillagree, like punctuation */ |
| 139 | { |
| 140 | dest += 1; |
| 141 | res += 1; |
| 142 | *dest = *cptr; |
| 143 | } |
| 144 | *dest = ' '; |
| 145 | dest++; |
| 146 | res++; |
| 147 | parse_target = NULL; |
| 148 | } /* end of while loop */ |
| 149 | |
| 150 | return(res); |
| 151 | } |
| 152 | |
| 153 | /* |
| 154 | * txt_np() -- |
| 155 | * generate a noun phrase by |
| 156 | * 1) selecting a noun phrase form |
| 157 | * 2) parsing it to select parts of speech |
| 158 | * 3) selecting appropriate words |
| 159 | * 4) adding punctuation as required |
| 160 | * |
| 161 | * Returns: length of generated phrase |
| 162 | * Called By: txt_sentence() |
| 163 | * Calls: pick_str(), |
| 164 | */ |
| 165 | static int |
| 166 | txt_np(char *dest, int sd) |
| 167 | { |
| 168 | char syntax[MAX_GRAMMAR_LEN + 1], |
| 169 | *cptr, |
| 170 | *parse_target; |
| 171 | distribution *src; |
| 172 | int i, |
| 173 | res = 0; |
| 174 | |
| 175 | |
| 176 | pick_str(&np, sd, &syntax[0]); |
| 177 | parse_target = syntax; |
| 178 | while ((cptr = strtok(parse_target, " " )) != NULL) |
| 179 | { |
| 180 | src = NULL; |
| 181 | switch(*cptr) |
| 182 | { |
| 183 | case 'A': |
| 184 | src = &articles; |
| 185 | break; |
| 186 | case 'J': |
| 187 | src = &adjectives; |
| 188 | break; |
| 189 | case 'D': |
| 190 | src = &adverbs; |
| 191 | break; |
| 192 | case 'N': |
| 193 | src = &nouns; |
| 194 | break; |
| 195 | } /* end of POS switch statement */ |
| 196 | i = pick_str(src, sd, dest); |
| 197 | i = (int)strlen(DIST_MEMBER(src, i)); |
| 198 | dest += i; |
| 199 | res += i; |
| 200 | if (*(++cptr)) /* miscelaneous fillagree, like punctuation */ |
| 201 | { |
| 202 | *dest = *cptr; |
| 203 | dest += 1; |
| 204 | res += 1; |
| 205 | } |
| 206 | *dest = ' '; |
| 207 | dest++; |
| 208 | res++; |
| 209 | parse_target = NULL; |
| 210 | } /* end of while loop */ |
| 211 | |
| 212 | return(res); |
| 213 | } |
| 214 | |
| 215 | /* |
| 216 | * txt_sentence() -- |
| 217 | * generate a sentence by |
| 218 | * 1) selecting a sentence form |
| 219 | * 2) parsing it to select parts of speech or phrase types |
| 220 | * 3) selecting appropriate words |
| 221 | * 4) adding punctuation as required |
| 222 | * |
| 223 | * Returns: length of generated sentence |
| 224 | * Called By: dbg_text() |
| 225 | * Calls: pick_str(), txt_np(), txt_vp() |
| 226 | */ |
| 227 | static int |
| 228 | txt_sentence(char *dest, int sd) |
| 229 | { |
| 230 | char syntax[MAX_GRAMMAR_LEN + 1], |
| 231 | *cptr; |
| 232 | int i, |
| 233 | res = 0, |
| 234 | len = 0; |
| 235 | |
| 236 | |
| 237 | pick_str(&grammar, sd, syntax); |
| 238 | cptr = syntax; |
| 239 | |
| 240 | next_token: /* I hate goto's, but can't seem to have parent and child use strtok() */ |
| 241 | while (*cptr && *cptr == ' ') |
| 242 | cptr++; |
| 243 | if (*cptr == '\0') |
| 244 | goto done; |
| 245 | switch(*cptr) |
| 246 | { |
| 247 | case 'V': |
| 248 | len = txt_vp(dest, sd); |
| 249 | break; |
| 250 | case 'N': |
| 251 | len = txt_np(dest, sd); |
| 252 | break; |
| 253 | case 'P': |
| 254 | i = pick_str(&prepositions, sd, dest); |
| 255 | len = (int)strlen(DIST_MEMBER(&prepositions, i)); |
| 256 | strcpy((dest + len), " the " ); |
| 257 | len += 5; |
| 258 | len += txt_np(dest + len, sd); |
| 259 | break; |
| 260 | case 'T': |
| 261 | i = pick_str(&terminators, sd, --dest); /*terminators should abut previous word */ |
| 262 | len = (int)strlen(DIST_MEMBER(&terminators, i)); |
| 263 | break; |
| 264 | } /* end of POS switch statement */ |
| 265 | dest += len; |
| 266 | res += len; |
| 267 | cptr++; |
| 268 | if (*cptr && *cptr != ' ') /* miscelaneous fillagree, like punctuation */ |
| 269 | { |
| 270 | dest += 1; |
| 271 | res += 1; |
| 272 | *dest = *cptr; |
| 273 | } |
| 274 | goto next_token; |
| 275 | done: |
| 276 | *dest = '\0'; |
| 277 | return(--res); |
| 278 | } |
| 279 | |
| 280 | /* |
| 281 | * dbg_text() -- |
| 282 | * produce ELIZA-like text of random, bounded length, truncating the last |
| 283 | * generated sentence as required |
| 284 | */ |
| 285 | void |
| 286 | dbg_text(char *tgt, int min, int max, int sd) |
| 287 | { |
| 288 | DSS_HUGE hgLength = 0, |
| 289 | hgOffset, |
| 290 | wordlen = 0, |
| 291 | s_len, |
| 292 | needed; |
| 293 | char sentence[MAX_SENT_LEN + 1], |
| 294 | *cp; |
| 295 | static char szTextPool[TEXT_POOL_SIZE + 1]; |
| 296 | static int bInit = 0; |
| 297 | int nLifeNoise = 0; |
| 298 | |
| 299 | if (!bInit) |
| 300 | { |
| 301 | cp = &szTextPool[0]; |
| 302 | if (verbose > 0) |
| 303 | fprintf(stderr, "\nPreloading text ... " ); |
| 304 | |
| 305 | while (wordlen < TEXT_POOL_SIZE) |
| 306 | { |
| 307 | if ((verbose > 0) && (wordlen > nLifeNoise)) |
| 308 | { |
| 309 | nLifeNoise += 200000; |
| 310 | fprintf(stderr, "%3.0f%%\b\b\b\b" , (100.0 * wordlen)/TEXT_POOL_SIZE); |
| 311 | } |
| 312 | |
| 313 | s_len = txt_sentence(sentence, 5); |
| 314 | if ( s_len < 0) |
| 315 | INTERNAL_ERROR("Bad sentence formation" ); |
| 316 | needed = TEXT_POOL_SIZE - wordlen; |
| 317 | if (needed >= (s_len + 1)) /* need the entire sentence */ |
| 318 | { |
| 319 | strcpy(cp, sentence); |
| 320 | cp += s_len; |
| 321 | wordlen += s_len + 1; |
| 322 | *(cp++) = ' '; |
| 323 | } |
| 324 | else /* chop the new sentence off to match the length target */ |
| 325 | { |
| 326 | sentence[needed] = '\0'; |
| 327 | strcpy(cp, sentence); |
| 328 | wordlen += needed; |
| 329 | cp += needed; |
| 330 | } |
| 331 | } |
| 332 | *cp = '\0'; |
| 333 | bInit = 1; |
| 334 | if (verbose > 0) |
| 335 | fprintf(stderr, "\n" ); |
| 336 | } |
| 337 | |
| 338 | RANDOM(hgOffset, 0, TEXT_POOL_SIZE - max, sd); |
| 339 | RANDOM(hgLength, min, max, sd); |
| 340 | strncpy(&tgt[0], &szTextPool[hgOffset], (int)hgLength); |
| 341 | tgt[hgLength] = '\0'; |
| 342 | |
| 343 | return; |
| 344 | } |
| 345 | |
| 346 | #ifdef TEXT_TEST |
| 347 | tdef tdefs[1] = { NULL }; |
| 348 | distribution nouns, |
| 349 | verbs, |
| 350 | adjectives, |
| 351 | adverbs, |
| 352 | auxillaries, |
| 353 | terminators, |
| 354 | articles, |
| 355 | prepositions, |
| 356 | grammar, |
| 357 | np, |
| 358 | vp; |
| 359 | |
| 360 | main() |
| 361 | { |
| 362 | char prattle[401]; |
| 363 | |
| 364 | verbose = 1; |
| 365 | |
| 366 | read_dist (env_config (DIST_TAG, DIST_DFLT), "nouns" , &nouns); |
| 367 | read_dist (env_config (DIST_TAG, DIST_DFLT), "verbs" , &verbs); |
| 368 | read_dist (env_config (DIST_TAG, DIST_DFLT), "adjectives" , &adjectives); |
| 369 | read_dist (env_config (DIST_TAG, DIST_DFLT), "adverbs" , &adverbs); |
| 370 | read_dist (env_config (DIST_TAG, DIST_DFLT), "auxillaries" , &auxillaries); |
| 371 | read_dist (env_config (DIST_TAG, DIST_DFLT), "terminators" , &terminators); |
| 372 | read_dist (env_config (DIST_TAG, DIST_DFLT), "articles" , &articles); |
| 373 | read_dist (env_config (DIST_TAG, DIST_DFLT), "prepositions" , &prepositions); |
| 374 | read_dist (env_config (DIST_TAG, DIST_DFLT), "grammar" , &grammar); |
| 375 | read_dist (env_config (DIST_TAG, DIST_DFLT), "np" , &np); |
| 376 | read_dist (env_config (DIST_TAG, DIST_DFLT), "vp" , &vp); |
| 377 | |
| 378 | while (1) |
| 379 | { |
| 380 | dbg_text(&prattle[0], 300, 400, 0); |
| 381 | printf("<%s>\n" , prattle); |
| 382 | } |
| 383 | |
| 384 | return(0); |
| 385 | } |
| 386 | #endif /* TEST */ |
| 387 | |