| 1 | /* |
| 2 | Copyright (c) 2000, 2014, Oracle and/or its affiliates |
| 3 | |
| 4 | This program is free software; you can redistribute it and/or |
| 5 | modify it under the terms of the GNU General Public License |
| 6 | as published by the Free Software Foundation; version 2 of |
| 7 | the License. |
| 8 | |
| 9 | This program is distributed in the hope that it will be useful, |
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 12 | GNU General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU General Public License |
| 15 | along with this program; if not, write to the Free Software |
| 16 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
| 17 | 02110-1301 USA */ |
| 18 | |
| 19 | /* |
| 20 | Replace strings in textfile |
| 21 | |
| 22 | This program replaces strings in files or from stdin to stdout. |
| 23 | It accepts a list of from-string/to-string pairs and replaces |
| 24 | each occurrence of a from-string with the corresponding to-string. |
| 25 | The first occurrence of a found string is matched. If there is more |
| 26 | than one possibility for the string to replace, longer matches |
| 27 | are preferred before shorter matches. |
| 28 | |
| 29 | Special characters in from string: |
| 30 | \^ Match start of line. |
| 31 | \$ Match end of line. |
| 32 | \b Match space-character, start of line or end of line. |
| 33 | For end \b the next replace starts locking at the end space-character. |
| 34 | An \b alone or in a string matches only a space-character. |
| 35 | \r, \t, \v as in C. |
| 36 | The programs make a DFA-state-machine of the strings and the speed isn't |
| 37 | dependent on the count of replace-strings (only of the number of replaces). |
| 38 | A line is assumed ending with \n or \0. |
| 39 | There are no limit except memory on length of strings. |
| 40 | |
| 41 | Written by Monty. |
| 42 | fill_buffer_retaining() is taken from gnu-grep and modified. |
| 43 | */ |
| 44 | |
| 45 | #include <my_global.h> |
| 46 | #include <m_ctype.h> |
| 47 | #include <my_sys.h> |
| 48 | #include <m_string.h> |
| 49 | #include <errno.h> |
| 50 | |
| 51 | #define PC_MALLOC 256 /* Bytes for pointers */ |
| 52 | #define PS_MALLOC 512 /* Bytes for data */ |
| 53 | |
| 54 | typedef struct st_pointer_array { /* when using array-strings */ |
| 55 | TYPELIB typelib; /* Pointer to strings */ |
| 56 | uchar *str; /* Strings is here */ |
| 57 | uint8 *flag; /* Flag about each var. */ |
| 58 | uint array_allocs,max_count,length,max_length; |
| 59 | } POINTER_ARRAY; |
| 60 | |
| 61 | #define SPACE_CHAR 256 |
| 62 | #define START_OF_LINE 257 |
| 63 | #define END_OF_LINE 258 |
| 64 | #define LAST_CHAR_CODE 259 |
| 65 | |
| 66 | typedef struct st_replace { |
| 67 | my_bool found; |
| 68 | struct st_replace *next[256]; |
| 69 | } REPLACE; |
| 70 | |
| 71 | typedef struct st_replace_found { |
| 72 | my_bool found; |
| 73 | char *replace_string; |
| 74 | uint to_offset; |
| 75 | int from_offset; |
| 76 | } REPLACE_STRING; |
| 77 | |
| 78 | #ifndef WORD_BIT |
| 79 | #define WORD_BIT (8*sizeof(uint)) |
| 80 | #endif |
| 81 | |
| 82 | /* functions defined in this file */ |
| 83 | |
| 84 | static int static_get_options(int *argc,char * * *argv); |
| 85 | static int get_replace_strings(int *argc,char * * *argv, |
| 86 | POINTER_ARRAY *from_array, |
| 87 | POINTER_ARRAY *to_array); |
| 88 | static int insert_pointer_name(POINTER_ARRAY *pa, char * name); |
| 89 | static void free_pointer_array(POINTER_ARRAY *pa); |
| 90 | static int convert_pipe(REPLACE *,FILE *,FILE *); |
| 91 | static int convert_file(REPLACE *, char *); |
| 92 | static REPLACE *init_replace(char * *from, char * *to,uint count, |
| 93 | char * word_end_chars); |
| 94 | static uint replace_strings(REPLACE *rep, char * *start,uint *max_length, |
| 95 | char * from); |
| 96 | static int initialize_buffer(void); |
| 97 | static void reset_buffer(void); |
| 98 | static void free_buffer(void); |
| 99 | |
| 100 | static int silent=0,verbose=0,updated=0; |
| 101 | |
| 102 | /* The main program */ |
| 103 | |
| 104 | int main(int argc, char *argv[]) |
| 105 | { |
| 106 | int i,error; |
| 107 | char word_end_chars[256],*pos; |
| 108 | POINTER_ARRAY from,to; |
| 109 | REPLACE *replace; |
| 110 | MY_INIT(argv[0]); |
| 111 | |
| 112 | if (static_get_options(&argc,&argv)) |
| 113 | exit(1); |
| 114 | if (get_replace_strings(&argc,&argv,&from,&to)) |
| 115 | exit(1); |
| 116 | |
| 117 | for (i=1,pos=word_end_chars ; i < 256 ; i++) |
| 118 | if (my_isspace(&my_charset_latin1,i)) |
| 119 | *pos++= (char) i; |
| 120 | *pos=0; |
| 121 | if (!(replace=init_replace((char**) from.typelib.type_names, |
| 122 | (char**) to.typelib.type_names, |
| 123 | (uint) from.typelib.count,word_end_chars))) |
| 124 | exit(1); |
| 125 | free_pointer_array(&from); |
| 126 | free_pointer_array(&to); |
| 127 | if (initialize_buffer()) |
| 128 | return 1; |
| 129 | |
| 130 | error=0; |
| 131 | if (argc == 0) |
| 132 | error=convert_pipe(replace,stdin,stdout); |
| 133 | else |
| 134 | { |
| 135 | while (argc--) |
| 136 | { |
| 137 | error=convert_file(replace,*(argv++)); |
| 138 | } |
| 139 | } |
| 140 | free_buffer(); |
| 141 | my_free(replace); |
| 142 | my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR); |
| 143 | exit(error ? 2 : 0); |
| 144 | return 0; /* No compiler warning */ |
| 145 | } /* main */ |
| 146 | |
| 147 | |
| 148 | /* reads options */ |
| 149 | /* Initiates DEBUG - but no debugging here ! */ |
| 150 | |
| 151 | static int static_get_options(argc,argv) |
| 152 | register int *argc; |
| 153 | register char **argv[]; |
| 154 | { |
| 155 | int help,version; |
| 156 | char *pos; |
| 157 | |
| 158 | silent=verbose=help=0; |
| 159 | |
| 160 | while (--*argc > 0 && *(pos = *(++*argv)) == '-' && pos[1] != '-') { |
| 161 | while (*++pos) |
| 162 | { |
| 163 | version=0; |
| 164 | switch((*pos)) { |
| 165 | case 's': |
| 166 | silent=1; |
| 167 | break; |
| 168 | case 'v': |
| 169 | verbose=1; |
| 170 | break; |
| 171 | case '#': |
| 172 | DBUG_PUSH (++pos); |
| 173 | pos= (char*) " " ; /* Skip rest of arguments */ |
| 174 | break; |
| 175 | case 'V': |
| 176 | version=1; |
| 177 | /* fall through */ |
| 178 | case 'I': |
| 179 | case '?': |
| 180 | help=1; /* Help text written */ |
| 181 | printf("%s Ver 1.4 for %s at %s\n" ,my_progname,SYSTEM_TYPE, |
| 182 | MACHINE_TYPE); |
| 183 | if (version) |
| 184 | break; |
| 185 | puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,\nand you are welcome to modify and redistribute it under the GPL license\n" ); |
| 186 | puts("This program replaces strings in files or from stdin to stdout.\n" |
| 187 | "It accepts a list of from-string/to-string pairs and replaces\n" |
| 188 | "each occurrence of a from-string with the corresponding to-string.\n" |
| 189 | "The first occurrence of a found string is matched. If there is\n" |
| 190 | "more than one possibility for the string to replace, longer\n" |
| 191 | "matches are preferred before shorter matches.\n\n" |
| 192 | "A from-string can contain these special characters:\n" |
| 193 | " \\^ Match start of line.\n" |
| 194 | " \\$ Match end of line.\n" |
| 195 | " \\b Match space-character, start of line or end of line.\n" |
| 196 | " For a end \\b the next replace starts locking at the end\n" |
| 197 | " space-character. A \\b alone in a string matches only a\n" |
| 198 | " space-character.\n" ); |
| 199 | printf("Usage: %s [-?svIV] from to from to ... -- [files]\n" , my_progname); |
| 200 | puts("or" ); |
| 201 | printf("Usage: %s [-?svIV] from to from to ... < fromfile > tofile\n" , my_progname); |
| 202 | puts("" ); |
| 203 | puts("Options: -? or -I \"Info\" -s \"silent\" -v \"verbose\"" ); |
| 204 | break; |
| 205 | default: |
| 206 | fprintf(stderr,"illegal option: -%c\n" ,*pos); |
| 207 | break; |
| 208 | } |
| 209 | } |
| 210 | } |
| 211 | if (*argc == 0) |
| 212 | { |
| 213 | if (!help) |
| 214 | my_message(0,"No replace options given" ,MYF(ME_BELL)); |
| 215 | exit(0); /* Don't use as pipe */ |
| 216 | } |
| 217 | return(0); |
| 218 | } /* static_get_options */ |
| 219 | |
| 220 | |
| 221 | static int get_replace_strings(argc,argv,from_array,to_array) |
| 222 | register int *argc; |
| 223 | register char **argv[]; |
| 224 | POINTER_ARRAY *from_array,*to_array; |
| 225 | { |
| 226 | char *pos; |
| 227 | |
| 228 | bzero((char*) from_array,sizeof(from_array[0])); |
| 229 | bzero((char*) to_array,sizeof(to_array[0])); |
| 230 | while (*argc > 0 && (*(pos = *(*argv)) != '-' || pos[1] != '-' || pos[2])) |
| 231 | { |
| 232 | insert_pointer_name(from_array,pos); |
| 233 | (*argc)--; |
| 234 | (*argv)++; |
| 235 | if (!*argc || !strcmp(**argv,"--" )) |
| 236 | { |
| 237 | my_message(0,"No to-string for last from-string" ,MYF(ME_BELL)); |
| 238 | return 1; |
| 239 | } |
| 240 | insert_pointer_name(to_array,**argv); |
| 241 | (*argc)--; |
| 242 | (*argv)++; |
| 243 | } |
| 244 | if (*argc) |
| 245 | { /* Skip "--" argument */ |
| 246 | (*argc)--; |
| 247 | (*argv)++; |
| 248 | } |
| 249 | return 0; |
| 250 | } |
| 251 | |
| 252 | static int insert_pointer_name(reg1 POINTER_ARRAY *pa,char * name) |
| 253 | { |
| 254 | uint i,length,old_count; |
| 255 | uchar *new_pos; |
| 256 | const char **new_array; |
| 257 | DBUG_ENTER("insert_pointer_name" ); |
| 258 | |
| 259 | if (! pa->typelib.count) |
| 260 | { |
| 261 | if (!(pa->typelib.type_names=(const char **) |
| 262 | my_malloc(((PC_MALLOC-MALLOC_OVERHEAD)/ |
| 263 | (sizeof(char *)+sizeof(*pa->flag))* |
| 264 | (sizeof(char *)+sizeof(*pa->flag))),MYF(MY_WME)))) |
| 265 | DBUG_RETURN(-1); |
| 266 | if (!(pa->str= (uchar*) my_malloc((uint) (PS_MALLOC-MALLOC_OVERHEAD), |
| 267 | MYF(MY_WME)))) |
| 268 | { |
| 269 | my_free((void*) pa->typelib.type_names); |
| 270 | DBUG_RETURN (-1); |
| 271 | } |
| 272 | pa->max_count=(PC_MALLOC-MALLOC_OVERHEAD)/(sizeof(uchar*)+ |
| 273 | sizeof(*pa->flag)); |
| 274 | pa->flag= (uint8*) (pa->typelib.type_names+pa->max_count); |
| 275 | pa->length=0; |
| 276 | pa->max_length=PS_MALLOC-MALLOC_OVERHEAD; |
| 277 | pa->array_allocs=1; |
| 278 | } |
| 279 | length=(uint) strlen(name)+1; |
| 280 | if (pa->length+length >= pa->max_length) |
| 281 | { |
| 282 | pa->max_length=(pa->length+length+MALLOC_OVERHEAD+PS_MALLOC-1)/PS_MALLOC; |
| 283 | pa->max_length=pa->max_length*PS_MALLOC-MALLOC_OVERHEAD; |
| 284 | if (!(new_pos= (uchar*) my_realloc((uchar*) pa->str, |
| 285 | (uint) pa->max_length, |
| 286 | MYF(MY_WME)))) |
| 287 | DBUG_RETURN(1); |
| 288 | if (new_pos != pa->str) |
| 289 | { |
| 290 | my_ptrdiff_t diff=PTR_BYTE_DIFF(new_pos,pa->str); |
| 291 | for (i=0 ; i < pa->typelib.count ; i++) |
| 292 | pa->typelib.type_names[i]= ADD_TO_PTR(pa->typelib.type_names[i],diff, |
| 293 | char*); |
| 294 | pa->str=new_pos; |
| 295 | } |
| 296 | } |
| 297 | if (pa->typelib.count >= pa->max_count-1) |
| 298 | { |
| 299 | int len; |
| 300 | pa->array_allocs++; |
| 301 | len=(PC_MALLOC*pa->array_allocs - MALLOC_OVERHEAD); |
| 302 | if (!(new_array=(const char **) my_realloc((uchar*) pa->typelib.type_names, |
| 303 | (uint) len/ |
| 304 | (sizeof(uchar*)+sizeof(*pa->flag))* |
| 305 | (sizeof(uchar*)+sizeof(*pa->flag)), |
| 306 | MYF(MY_WME)))) |
| 307 | DBUG_RETURN(1); |
| 308 | pa->typelib.type_names=new_array; |
| 309 | old_count=pa->max_count; |
| 310 | pa->max_count=len/(sizeof(uchar*) + sizeof(*pa->flag)); |
| 311 | pa->flag= (uint8*) (pa->typelib.type_names+pa->max_count); |
| 312 | memcpy((uchar*) pa->flag,(char *) (pa->typelib.type_names+old_count), |
| 313 | old_count*sizeof(*pa->flag)); |
| 314 | } |
| 315 | pa->flag[pa->typelib.count]=0; /* Reset flag */ |
| 316 | pa->typelib.type_names[pa->typelib.count++]= (char*) (pa->str+pa->length); |
| 317 | pa->typelib.type_names[pa->typelib.count]= NullS; /* Put end-mark */ |
| 318 | (void) strmov((char*) pa->str + pa->length, name); |
| 319 | pa->length+=length; |
| 320 | DBUG_RETURN(0); |
| 321 | } /* insert_pointer_name */ |
| 322 | |
| 323 | |
| 324 | /* free pointer array */ |
| 325 | |
| 326 | static void free_pointer_array(reg1 POINTER_ARRAY *pa) |
| 327 | { |
| 328 | if (pa->typelib.count) |
| 329 | { |
| 330 | pa->typelib.count=0; |
| 331 | my_free((void*) pa->typelib.type_names); |
| 332 | pa->typelib.type_names=0; |
| 333 | my_free(pa->str); |
| 334 | } |
| 335 | return; |
| 336 | } /* free_pointer_array */ |
| 337 | |
| 338 | |
| 339 | /* Code for replace rutines */ |
| 340 | |
| 341 | #define SET_MALLOC_HUNC 64 |
| 342 | |
| 343 | typedef struct st_rep_set { |
| 344 | uint *bits; /* Pointer to used sets */ |
| 345 | short next[LAST_CHAR_CODE]; /* Pointer to next sets */ |
| 346 | uint found_len; /* Best match to date */ |
| 347 | int found_offset; |
| 348 | uint table_offset; |
| 349 | uint size_of_bits; /* For convinience */ |
| 350 | } REP_SET; |
| 351 | |
| 352 | typedef struct st_rep_sets { |
| 353 | uint count; /* Number of sets */ |
| 354 | uint ; /* Extra sets in buffer */ |
| 355 | uint invisible; /* Sets not chown */ |
| 356 | uint size_of_bits; |
| 357 | REP_SET *set,*set_buffer; |
| 358 | uint *bit_buffer; |
| 359 | } REP_SETS; |
| 360 | |
| 361 | typedef struct st_found_set { |
| 362 | uint table_offset; |
| 363 | int found_offset; |
| 364 | } FOUND_SET; |
| 365 | |
| 366 | typedef struct st_follow { |
| 367 | int chr; |
| 368 | uint table_offset; |
| 369 | uint len; |
| 370 | } FOLLOWS; |
| 371 | |
| 372 | |
| 373 | static int init_sets(REP_SETS *sets,uint states); |
| 374 | static REP_SET *make_new_set(REP_SETS *sets); |
| 375 | static void make_sets_invisible(REP_SETS *sets); |
| 376 | static void free_last_set(REP_SETS *sets); |
| 377 | static void free_sets(REP_SETS *sets); |
| 378 | static void internal_set_bit(REP_SET *set, uint bit); |
| 379 | static void internal_clear_bit(REP_SET *set, uint bit); |
| 380 | static void or_bits(REP_SET *to,REP_SET *from); |
| 381 | static void copy_bits(REP_SET *to,REP_SET *from); |
| 382 | static int cmp_bits(REP_SET *set1,REP_SET *set2); |
| 383 | static int get_next_bit(REP_SET *set,uint lastpos); |
| 384 | static short find_set(REP_SETS *sets,REP_SET *find); |
| 385 | static short find_found(FOUND_SET *found_set,uint table_offset, |
| 386 | int found_offset); |
| 387 | static uint start_at_word(char * pos); |
| 388 | static uint end_of_word(char * pos); |
| 389 | static uint replace_len(char * pos); |
| 390 | |
| 391 | static uint found_sets=0; |
| 392 | |
| 393 | |
| 394 | /* Init a replace structure for further calls */ |
| 395 | |
| 396 | static REPLACE *init_replace(char * *from, char * *to,uint count, |
| 397 | char * word_end_chars) |
| 398 | { |
| 399 | uint i,j,states,set_nr,len,result_len,max_length,found_end,bits_set,bit_nr; |
| 400 | int used_sets,chr; |
| 401 | short default_state; |
| 402 | char used_chars[LAST_CHAR_CODE],is_word_end[256]; |
| 403 | char * pos, *to_pos, **to_array; |
| 404 | REP_SETS sets; |
| 405 | REP_SET *set,*start_states,*word_states,*new_set; |
| 406 | FOLLOWS *follow,*follow_ptr; |
| 407 | REPLACE *replace; |
| 408 | FOUND_SET *found_set; |
| 409 | REPLACE_STRING *rep_str; |
| 410 | DBUG_ENTER("init_replace" ); |
| 411 | |
| 412 | /* Count number of states */ |
| 413 | for (i=result_len=max_length=0 , states=2 ; i < count ; i++) |
| 414 | { |
| 415 | len=replace_len(from[i]); |
| 416 | if (!len) |
| 417 | { |
| 418 | errno=EINVAL; |
| 419 | my_message(0,"No to-string for last from-string" ,MYF(ME_BELL)); |
| 420 | DBUG_RETURN(0); |
| 421 | } |
| 422 | states+=len+1; |
| 423 | result_len+=(uint) strlen(to[i])+1; |
| 424 | if (len > max_length) |
| 425 | max_length=len; |
| 426 | } |
| 427 | bzero((char*) is_word_end,sizeof(is_word_end)); |
| 428 | for (i=0 ; word_end_chars[i] ; i++) |
| 429 | is_word_end[(uchar) word_end_chars[i]]=1; |
| 430 | |
| 431 | if (init_sets(&sets,states)) |
| 432 | DBUG_RETURN(0); |
| 433 | found_sets=0; |
| 434 | if (!(found_set= (FOUND_SET*) my_malloc(sizeof(FOUND_SET)*max_length*count, |
| 435 | MYF(MY_WME)))) |
| 436 | { |
| 437 | free_sets(&sets); |
| 438 | DBUG_RETURN(0); |
| 439 | } |
| 440 | (void) make_new_set(&sets); /* Set starting set */ |
| 441 | make_sets_invisible(&sets); /* Hide previus sets */ |
| 442 | used_sets=-1; |
| 443 | word_states=make_new_set(&sets); /* Start of new word */ |
| 444 | start_states=make_new_set(&sets); /* This is first state */ |
| 445 | if (!(follow=(FOLLOWS*) my_malloc((states+2)*sizeof(FOLLOWS),MYF(MY_WME)))) |
| 446 | { |
| 447 | free_sets(&sets); |
| 448 | my_free(found_set); |
| 449 | DBUG_RETURN(0); |
| 450 | } |
| 451 | |
| 452 | /* Init follow_ptr[] */ |
| 453 | for (i=0, states=1, follow_ptr=follow+1 ; i < count ; i++) |
| 454 | { |
| 455 | if (from[i][0] == '\\' && from[i][1] == '^') |
| 456 | { |
| 457 | internal_set_bit(start_states,states+1); |
| 458 | if (!from[i][2]) |
| 459 | { |
| 460 | start_states->table_offset=i; |
| 461 | start_states->found_offset=1; |
| 462 | } |
| 463 | } |
| 464 | else if (from[i][0] == '\\' && from[i][1] == '$') |
| 465 | { |
| 466 | internal_set_bit(start_states,states); |
| 467 | internal_set_bit(word_states,states); |
| 468 | if (!from[i][2] && start_states->table_offset == (uint) ~0) |
| 469 | { |
| 470 | start_states->table_offset=i; |
| 471 | start_states->found_offset=0; |
| 472 | } |
| 473 | } |
| 474 | else |
| 475 | { |
| 476 | internal_set_bit(word_states,states); |
| 477 | if (from[i][0] == '\\' && (from[i][1] == 'b' && from[i][2])) |
| 478 | internal_set_bit(start_states,states+1); |
| 479 | else |
| 480 | internal_set_bit(start_states,states); |
| 481 | } |
| 482 | for (pos=from[i], len=0; *pos ; pos++) |
| 483 | { |
| 484 | if (*pos == '\\' && *(pos+1)) |
| 485 | { |
| 486 | pos++; |
| 487 | switch (*pos) { |
| 488 | case 'b': |
| 489 | follow_ptr->chr = SPACE_CHAR; |
| 490 | break; |
| 491 | case '^': |
| 492 | follow_ptr->chr = START_OF_LINE; |
| 493 | break; |
| 494 | case '$': |
| 495 | follow_ptr->chr = END_OF_LINE; |
| 496 | break; |
| 497 | case 'r': |
| 498 | follow_ptr->chr = '\r'; |
| 499 | break; |
| 500 | case 't': |
| 501 | follow_ptr->chr = '\t'; |
| 502 | break; |
| 503 | case 'v': |
| 504 | follow_ptr->chr = '\v'; |
| 505 | break; |
| 506 | default: |
| 507 | follow_ptr->chr = (uchar) *pos; |
| 508 | break; |
| 509 | } |
| 510 | } |
| 511 | else |
| 512 | follow_ptr->chr= (uchar) *pos; |
| 513 | follow_ptr->table_offset=i; |
| 514 | follow_ptr->len= ++len; |
| 515 | follow_ptr++; |
| 516 | } |
| 517 | follow_ptr->chr=0; |
| 518 | follow_ptr->table_offset=i; |
| 519 | follow_ptr->len=len; |
| 520 | follow_ptr++; |
| 521 | states+=(uint) len+1; |
| 522 | } |
| 523 | |
| 524 | |
| 525 | for (set_nr=0,pos=0 ; set_nr < sets.count ; set_nr++) |
| 526 | { |
| 527 | set=sets.set+set_nr; |
| 528 | default_state= 0; /* Start from beginning */ |
| 529 | |
| 530 | /* If end of found-string not found or start-set with current set */ |
| 531 | |
| 532 | for (i= (uint) ~0; (i=get_next_bit(set,i)) ;) |
| 533 | { |
| 534 | if (!follow[i].chr) |
| 535 | { |
| 536 | if (! default_state) |
| 537 | default_state= find_found(found_set,set->table_offset, |
| 538 | set->found_offset+1); |
| 539 | } |
| 540 | } |
| 541 | copy_bits(sets.set+used_sets,set); /* Save set for changes */ |
| 542 | if (!default_state) |
| 543 | or_bits(sets.set+used_sets,sets.set); /* Can restart from start */ |
| 544 | |
| 545 | /* Find all chars that follows current sets */ |
| 546 | bzero((char*) used_chars,sizeof(used_chars)); |
| 547 | for (i= (uint) ~0; (i=get_next_bit(sets.set+used_sets,i)) ;) |
| 548 | { |
| 549 | used_chars[follow[i].chr]=1; |
| 550 | if ((follow[i].chr == SPACE_CHAR && !follow[i+1].chr && |
| 551 | follow[i].len > 1) || follow[i].chr == END_OF_LINE) |
| 552 | used_chars[0]=1; |
| 553 | } |
| 554 | |
| 555 | /* Mark word_chars used if \b is in state */ |
| 556 | if (used_chars[SPACE_CHAR]) |
| 557 | for (pos= word_end_chars ; *pos ; pos++) |
| 558 | used_chars[(int) (uchar) *pos] = 1; |
| 559 | |
| 560 | /* Handle other used characters */ |
| 561 | for (chr= 0 ; chr < 256 ; chr++) |
| 562 | { |
| 563 | if (! used_chars[chr]) |
| 564 | set->next[chr]= (short) (chr ? default_state : -1); |
| 565 | else |
| 566 | { |
| 567 | new_set=make_new_set(&sets); |
| 568 | set=sets.set+set_nr; /* if realloc */ |
| 569 | new_set->table_offset=set->table_offset; |
| 570 | new_set->found_len=set->found_len; |
| 571 | new_set->found_offset=set->found_offset+1; |
| 572 | found_end=0; |
| 573 | |
| 574 | for (i= (uint) ~0 ; (i=get_next_bit(sets.set+used_sets,i)) ; ) |
| 575 | { |
| 576 | if (!follow[i].chr || follow[i].chr == chr || |
| 577 | (follow[i].chr == SPACE_CHAR && |
| 578 | (is_word_end[chr] || |
| 579 | (!chr && follow[i].len > 1 && ! follow[i+1].chr))) || |
| 580 | (follow[i].chr == END_OF_LINE && ! chr)) |
| 581 | { |
| 582 | if ((! chr || (follow[i].chr && !follow[i+1].chr)) && |
| 583 | follow[i].len > found_end) |
| 584 | found_end=follow[i].len; |
| 585 | if (chr && follow[i].chr) |
| 586 | internal_set_bit(new_set,i+1); /* To next set */ |
| 587 | else |
| 588 | internal_set_bit(new_set,i); |
| 589 | } |
| 590 | } |
| 591 | if (found_end) |
| 592 | { |
| 593 | new_set->found_len=0; /* Set for testing if first */ |
| 594 | bits_set=0; |
| 595 | for (i= (uint) ~0; (i=get_next_bit(new_set,i)) ;) |
| 596 | { |
| 597 | if ((follow[i].chr == SPACE_CHAR || |
| 598 | follow[i].chr == END_OF_LINE) && ! chr) |
| 599 | bit_nr=i+1; |
| 600 | else |
| 601 | bit_nr=i; |
| 602 | if (follow[bit_nr-1].len < found_end || |
| 603 | (new_set->found_len && |
| 604 | (chr == 0 || !follow[bit_nr].chr))) |
| 605 | internal_clear_bit(new_set,i); |
| 606 | else |
| 607 | { |
| 608 | if (chr == 0 || !follow[bit_nr].chr) |
| 609 | { /* best match */ |
| 610 | new_set->table_offset=follow[bit_nr].table_offset; |
| 611 | if (chr || (follow[i].chr == SPACE_CHAR || |
| 612 | follow[i].chr == END_OF_LINE)) |
| 613 | new_set->found_offset=found_end; /* New match */ |
| 614 | new_set->found_len=found_end; |
| 615 | } |
| 616 | bits_set++; |
| 617 | } |
| 618 | } |
| 619 | if (bits_set == 1) |
| 620 | { |
| 621 | set->next[chr] = find_found(found_set, |
| 622 | new_set->table_offset, |
| 623 | new_set->found_offset); |
| 624 | free_last_set(&sets); |
| 625 | } |
| 626 | else |
| 627 | set->next[chr] = find_set(&sets,new_set); |
| 628 | } |
| 629 | else |
| 630 | set->next[chr] = find_set(&sets,new_set); |
| 631 | } |
| 632 | } |
| 633 | } |
| 634 | |
| 635 | /* Alloc replace structure for the replace-state-machine */ |
| 636 | |
| 637 | if ((replace=(REPLACE*) my_malloc(sizeof(REPLACE)*(sets.count)+ |
| 638 | sizeof(REPLACE_STRING)*(found_sets+1)+ |
| 639 | sizeof(char *)*count+result_len, |
| 640 | MYF(MY_WME | MY_ZEROFILL)))) |
| 641 | { |
| 642 | rep_str=(REPLACE_STRING*) (replace+sets.count); |
| 643 | to_array=(char **) (rep_str+found_sets+1); |
| 644 | to_pos=(char *) (to_array+count); |
| 645 | for (i=0 ; i < count ; i++) |
| 646 | { |
| 647 | to_array[i]=to_pos; |
| 648 | to_pos=strmov(to_pos,to[i])+1; |
| 649 | } |
| 650 | rep_str[0].found=1; |
| 651 | rep_str[0].replace_string=0; |
| 652 | for (i=1 ; i <= found_sets ; i++) |
| 653 | { |
| 654 | pos=from[found_set[i-1].table_offset]; |
| 655 | rep_str[i].found= (my_bool) (!memcmp(pos,"\\^" ,3) ? 2 : 1); |
| 656 | rep_str[i].replace_string=to_array[found_set[i-1].table_offset]; |
| 657 | rep_str[i].to_offset=found_set[i-1].found_offset-start_at_word(pos); |
| 658 | rep_str[i].from_offset=found_set[i-1].found_offset-replace_len(pos)+ |
| 659 | end_of_word(pos); |
| 660 | } |
| 661 | for (i=0 ; i < sets.count ; i++) |
| 662 | { |
| 663 | for (j=0 ; j < 256 ; j++) |
| 664 | if (sets.set[i].next[j] >= 0) |
| 665 | replace[i].next[j]=replace+sets.set[i].next[j]; |
| 666 | else |
| 667 | replace[i].next[j]=(REPLACE*) (rep_str+(-sets.set[i].next[j]-1)); |
| 668 | } |
| 669 | } |
| 670 | my_free(follow); |
| 671 | free_sets(&sets); |
| 672 | my_free(found_set); |
| 673 | DBUG_PRINT("exit" ,("Replace table has %d states" ,sets.count)); |
| 674 | DBUG_RETURN(replace); |
| 675 | } |
| 676 | |
| 677 | |
| 678 | static int init_sets(REP_SETS *sets,uint states) |
| 679 | { |
| 680 | bzero((char*) sets,sizeof(*sets)); |
| 681 | sets->size_of_bits=((states+7)/8); |
| 682 | if (!(sets->set_buffer=(REP_SET*) my_malloc(sizeof(REP_SET)*SET_MALLOC_HUNC, |
| 683 | MYF(MY_WME)))) |
| 684 | return 1; |
| 685 | if (!(sets->bit_buffer=(uint*) my_malloc(sizeof(uint)*sets->size_of_bits* |
| 686 | SET_MALLOC_HUNC,MYF(MY_WME)))) |
| 687 | { |
| 688 | my_free(sets->set); |
| 689 | return 1; |
| 690 | } |
| 691 | return 0; |
| 692 | } |
| 693 | |
| 694 | /* Make help sets invisible for nicer codeing */ |
| 695 | |
| 696 | static void make_sets_invisible(REP_SETS *sets) |
| 697 | { |
| 698 | sets->invisible=sets->count; |
| 699 | sets->set+=sets->count; |
| 700 | sets->count=0; |
| 701 | } |
| 702 | |
| 703 | static REP_SET *make_new_set(REP_SETS *sets) |
| 704 | { |
| 705 | uint i,count,*bit_buffer; |
| 706 | REP_SET *set; |
| 707 | if (sets->extra) |
| 708 | { |
| 709 | sets->extra--; |
| 710 | set=sets->set+ sets->count++; |
| 711 | bzero((char*) set->bits,sizeof(uint)*sets->size_of_bits); |
| 712 | bzero((char*) &set->next[0],sizeof(set->next[0])*LAST_CHAR_CODE); |
| 713 | set->found_offset=0; |
| 714 | set->found_len=0; |
| 715 | set->table_offset= (uint) ~0; |
| 716 | set->size_of_bits=sets->size_of_bits; |
| 717 | return set; |
| 718 | } |
| 719 | count=sets->count+sets->invisible+SET_MALLOC_HUNC; |
| 720 | if (!(set=(REP_SET*) my_realloc((uchar*) sets->set_buffer, |
| 721 | sizeof(REP_SET)*count, |
| 722 | MYF(MY_WME)))) |
| 723 | return 0; |
| 724 | sets->set_buffer=set; |
| 725 | sets->set=set+sets->invisible; |
| 726 | if (!(bit_buffer=(uint*) my_realloc((uchar*) sets->bit_buffer, |
| 727 | (sizeof(uint)*sets->size_of_bits)*count, |
| 728 | MYF(MY_WME)))) |
| 729 | return 0; |
| 730 | sets->bit_buffer=bit_buffer; |
| 731 | for (i=0 ; i < count ; i++) |
| 732 | { |
| 733 | sets->set_buffer[i].bits=bit_buffer; |
| 734 | bit_buffer+=sets->size_of_bits; |
| 735 | } |
| 736 | sets->extra=SET_MALLOC_HUNC; |
| 737 | return make_new_set(sets); |
| 738 | } |
| 739 | |
| 740 | static void free_last_set(REP_SETS *sets) |
| 741 | { |
| 742 | sets->count--; |
| 743 | sets->extra++; |
| 744 | return; |
| 745 | } |
| 746 | |
| 747 | static void free_sets(REP_SETS *sets) |
| 748 | { |
| 749 | my_free(sets->set_buffer); |
| 750 | my_free(sets->bit_buffer); |
| 751 | return; |
| 752 | } |
| 753 | |
| 754 | static void internal_set_bit(REP_SET *set, uint bit) |
| 755 | { |
| 756 | set->bits[bit / WORD_BIT] |= 1 << (bit % WORD_BIT); |
| 757 | return; |
| 758 | } |
| 759 | |
| 760 | static void internal_clear_bit(REP_SET *set, uint bit) |
| 761 | { |
| 762 | set->bits[bit / WORD_BIT] &= ~ (1 << (bit % WORD_BIT)); |
| 763 | return; |
| 764 | } |
| 765 | |
| 766 | |
| 767 | static void or_bits(REP_SET *to,REP_SET *from) |
| 768 | { |
| 769 | reg1 uint i; |
| 770 | for (i=0 ; i < to->size_of_bits ; i++) |
| 771 | to->bits[i]|=from->bits[i]; |
| 772 | return; |
| 773 | } |
| 774 | |
| 775 | static void copy_bits(REP_SET *to,REP_SET *from) |
| 776 | { |
| 777 | memcpy((uchar*) to->bits,(uchar*) from->bits, |
| 778 | (size_t) (sizeof(uint) * to->size_of_bits)); |
| 779 | } |
| 780 | |
| 781 | static int cmp_bits(REP_SET *set1,REP_SET *set2) |
| 782 | { |
| 783 | return memcmp(set1->bits, set2->bits, |
| 784 | sizeof(uint) * set1->size_of_bits); |
| 785 | } |
| 786 | |
| 787 | |
| 788 | /* Get next set bit from set. */ |
| 789 | |
| 790 | static int get_next_bit(REP_SET *set,uint lastpos) |
| 791 | { |
| 792 | uint pos,*start,*end,bits; |
| 793 | |
| 794 | start=set->bits+ ((lastpos+1) / WORD_BIT); |
| 795 | end=set->bits + set->size_of_bits; |
| 796 | bits=start[0] & ~((1 << ((lastpos+1) % WORD_BIT)) -1); |
| 797 | |
| 798 | while (! bits && ++start < end) |
| 799 | bits=start[0]; |
| 800 | if (!bits) |
| 801 | return 0; |
| 802 | pos=(uint) (start-set->bits)*WORD_BIT; |
| 803 | while (! (bits & 1)) |
| 804 | { |
| 805 | bits>>=1; |
| 806 | pos++; |
| 807 | } |
| 808 | return pos; |
| 809 | } |
| 810 | |
| 811 | /* find if there is a same set in sets. If there is, use it and |
| 812 | free given set, else put in given set in sets and return it's |
| 813 | position */ |
| 814 | |
| 815 | static short find_set(REP_SETS *sets,REP_SET *find) |
| 816 | { |
| 817 | uint i; |
| 818 | for (i=0 ; i < sets->count-1 ; i++) |
| 819 | { |
| 820 | if (!cmp_bits(sets->set+i,find)) |
| 821 | { |
| 822 | free_last_set(sets); |
| 823 | return (short) i; |
| 824 | } |
| 825 | } |
| 826 | return (short) i; /* return new position */ |
| 827 | } |
| 828 | |
| 829 | |
| 830 | /* |
| 831 | find if there is a found_set with same table_offset & found_offset |
| 832 | If there is return offset to it, else add new offset and return pos. |
| 833 | Pos returned is -offset-2 in found_set_structure because it's is |
| 834 | saved in set->next and set->next[] >= 0 points to next set and |
| 835 | set->next[] == -1 is reserved for end without replaces. |
| 836 | */ |
| 837 | |
| 838 | static short find_found(FOUND_SET *found_set,uint table_offset, |
| 839 | int found_offset) |
| 840 | { |
| 841 | int i; |
| 842 | for (i=0 ; (uint) i < found_sets ; i++) |
| 843 | if (found_set[i].table_offset == table_offset && |
| 844 | found_set[i].found_offset == found_offset) |
| 845 | return (short) (-i-2); |
| 846 | found_set[i].table_offset=table_offset; |
| 847 | found_set[i].found_offset=found_offset; |
| 848 | found_sets++; |
| 849 | return (short) (-i-2); /* return new position */ |
| 850 | } |
| 851 | |
| 852 | /* Return 1 if regexp starts with \b or ends with \b*/ |
| 853 | |
| 854 | static uint start_at_word(char * pos) |
| 855 | { |
| 856 | return (((!memcmp(pos,"\\b" ,2) && pos[2]) || !memcmp(pos,"\\^" ,2)) ? 1 : 0); |
| 857 | } |
| 858 | |
| 859 | static uint end_of_word(char * pos) |
| 860 | { |
| 861 | char * end=strend(pos); |
| 862 | return ((end > pos+2 && !memcmp(end-2,"\\b" ,2)) || |
| 863 | (end >= pos+2 && !memcmp(end-2,"\\$" ,2))) ? |
| 864 | 1 : 0; |
| 865 | } |
| 866 | |
| 867 | |
| 868 | static uint replace_len(char * str) |
| 869 | { |
| 870 | uint len=0; |
| 871 | while (*str) |
| 872 | { |
| 873 | if (str[0] == '\\' && str[1]) |
| 874 | str++; |
| 875 | str++; |
| 876 | len++; |
| 877 | } |
| 878 | return len; |
| 879 | } |
| 880 | |
| 881 | |
| 882 | /* The actual loop */ |
| 883 | |
| 884 | static uint replace_strings(REPLACE *rep, char **start, uint *max_length, |
| 885 | char *from) |
| 886 | { |
| 887 | reg1 REPLACE *rep_pos; |
| 888 | reg2 REPLACE_STRING *rep_str; |
| 889 | char *to, *end, *pos, *new; |
| 890 | |
| 891 | end=(to= *start) + *max_length-1; |
| 892 | rep_pos=rep+1; |
| 893 | for(;;) |
| 894 | { |
| 895 | while (!rep_pos->found) |
| 896 | { |
| 897 | rep_pos= rep_pos->next[(uchar) *from]; |
| 898 | if (to == end) |
| 899 | { |
| 900 | (*max_length)+=8192; |
| 901 | if (!(new=my_realloc(*start,*max_length,MYF(MY_WME)))) |
| 902 | return (uint) -1; |
| 903 | to=new+(to - *start); |
| 904 | end=(*start=new)+ *max_length-1; |
| 905 | } |
| 906 | *to++= *from++; |
| 907 | } |
| 908 | if (!(rep_str = ((REPLACE_STRING*) rep_pos))->replace_string) |
| 909 | return (uint) (to - *start)-1; |
| 910 | updated=1; /* Some char * is replaced */ |
| 911 | to-=rep_str->to_offset; |
| 912 | for (pos=rep_str->replace_string; *pos ; pos++) |
| 913 | { |
| 914 | if (to == end) |
| 915 | { |
| 916 | (*max_length)*=2; |
| 917 | if (!(new=my_realloc(*start,*max_length,MYF(MY_WME)))) |
| 918 | return (uint) -1; |
| 919 | to=new+(to - *start); |
| 920 | end=(*start=new)+ *max_length-1; |
| 921 | } |
| 922 | *to++= *pos; |
| 923 | } |
| 924 | if (!*(from-=rep_str->from_offset) && rep_pos->found != 2) |
| 925 | return (uint) (to - *start); |
| 926 | rep_pos=rep; |
| 927 | } |
| 928 | } |
| 929 | |
| 930 | static char *buffer; /* The buffer itself, grown as needed. */ |
| 931 | static int bufbytes; /* Number of bytes in the buffer. */ |
| 932 | static int bufread,my_eof; /* Number of bytes to get with each read(). */ |
| 933 | static uint bufalloc; |
| 934 | static char *out_buff; |
| 935 | static uint out_length; |
| 936 | |
| 937 | static int initialize_buffer() |
| 938 | { |
| 939 | bufread = 8192; |
| 940 | bufalloc = bufread + bufread / 2; |
| 941 | if (!(buffer = my_malloc(bufalloc+1,MYF(MY_WME)))) |
| 942 | return 1; |
| 943 | bufbytes=my_eof=0; |
| 944 | out_length=bufread; |
| 945 | if (!(out_buff=my_malloc(out_length,MYF(MY_WME)))) |
| 946 | return(1); |
| 947 | return 0; |
| 948 | } |
| 949 | |
| 950 | static void reset_buffer() |
| 951 | { |
| 952 | bufbytes=my_eof=0; |
| 953 | } |
| 954 | |
| 955 | static void free_buffer() |
| 956 | { |
| 957 | my_free(buffer); |
| 958 | my_free(out_buff); |
| 959 | } |
| 960 | |
| 961 | |
| 962 | /* |
| 963 | Fill the buffer retaining the last n bytes at the beginning of the |
| 964 | newly filled buffer (for backward context). Returns the number of new |
| 965 | bytes read from disk. |
| 966 | */ |
| 967 | |
| 968 | static int fill_buffer_retaining(fd,n) |
| 969 | File fd; |
| 970 | int n; |
| 971 | { |
| 972 | int i; |
| 973 | |
| 974 | /* See if we need to grow the buffer. */ |
| 975 | if ((int) bufalloc - n <= bufread) |
| 976 | { |
| 977 | while ((int) bufalloc - n <= bufread) |
| 978 | { |
| 979 | bufalloc *= 2; |
| 980 | bufread *= 2; |
| 981 | } |
| 982 | buffer = my_realloc(buffer, bufalloc+1, MYF(MY_WME)); |
| 983 | if (! buffer) |
| 984 | return(-1); |
| 985 | } |
| 986 | |
| 987 | /* Shift stuff down. */ |
| 988 | bmove(buffer,buffer+bufbytes-n,(uint) n); |
| 989 | bufbytes = n; |
| 990 | |
| 991 | if (my_eof) |
| 992 | return 0; |
| 993 | |
| 994 | /* Read in new stuff. */ |
| 995 | if ((i=(int) my_read(fd, (uchar*) buffer + bufbytes, |
| 996 | (size_t) bufread, MYF(MY_WME))) < 0) |
| 997 | return -1; |
| 998 | |
| 999 | /* Kludge to pretend every nonempty file ends with a newline. */ |
| 1000 | if (i == 0 && bufbytes > 0 && buffer[bufbytes - 1] != '\n') |
| 1001 | { |
| 1002 | my_eof = i = 1; |
| 1003 | buffer[bufbytes] = '\n'; |
| 1004 | } |
| 1005 | |
| 1006 | bufbytes += i; |
| 1007 | return i; |
| 1008 | } |
| 1009 | |
| 1010 | /* Return 0 if convert is ok */ |
| 1011 | /* Global variable update is set if something was changed */ |
| 1012 | |
| 1013 | static int convert_pipe(rep,in,out) |
| 1014 | REPLACE *rep; |
| 1015 | FILE *in,*out; |
| 1016 | { |
| 1017 | int retain,error; |
| 1018 | uint length; |
| 1019 | char save_char,*end_of_line,*start_of_line; |
| 1020 | DBUG_ENTER("convert_pipe" ); |
| 1021 | |
| 1022 | updated=retain=0; |
| 1023 | reset_buffer(); |
| 1024 | |
| 1025 | while ((error=fill_buffer_retaining(my_fileno(in),retain)) > 0) |
| 1026 | { |
| 1027 | end_of_line=buffer ; |
| 1028 | buffer[bufbytes]=0; /* Sentinel */ |
| 1029 | for (;;) |
| 1030 | { |
| 1031 | start_of_line=end_of_line; |
| 1032 | while (end_of_line[0] != '\n' && end_of_line[0]) |
| 1033 | end_of_line++; |
| 1034 | if (end_of_line == buffer+bufbytes) |
| 1035 | { |
| 1036 | retain= (int) (end_of_line - start_of_line); |
| 1037 | break; /* No end of line, read more */ |
| 1038 | } |
| 1039 | save_char=end_of_line[0]; |
| 1040 | end_of_line[0]=0; |
| 1041 | end_of_line++; |
| 1042 | if ((length=replace_strings(rep,&out_buff,&out_length,start_of_line)) == |
| 1043 | (uint) -1) |
| 1044 | return 1; |
| 1045 | if (!my_eof) |
| 1046 | out_buff[length++]=save_char; /* Don't write added newline */ |
| 1047 | if (my_fwrite(out, (uchar*) out_buff, length, MYF(MY_WME | MY_NABP))) |
| 1048 | DBUG_RETURN(1); |
| 1049 | } |
| 1050 | } |
| 1051 | DBUG_RETURN(error); |
| 1052 | } |
| 1053 | |
| 1054 | |
| 1055 | static int convert_file(REPLACE *rep, char * name) |
| 1056 | { |
| 1057 | int error; |
| 1058 | FILE *in,*out; |
| 1059 | char dir_buff[FN_REFLEN], tempname[FN_REFLEN], *org_name = name; |
| 1060 | #ifdef HAVE_READLINK |
| 1061 | char link_name[FN_REFLEN]; |
| 1062 | #endif |
| 1063 | File temp_file; |
| 1064 | size_t dir_buff_length; |
| 1065 | DBUG_ENTER("convert_file" ); |
| 1066 | |
| 1067 | /* check if name is a symlink */ |
| 1068 | #ifdef HAVE_READLINK |
| 1069 | org_name= (!my_disable_symlinks && |
| 1070 | !my_readlink(link_name, name, MYF(0))) ? link_name : name; |
| 1071 | #endif |
| 1072 | if (!(in= my_fopen(org_name,O_RDONLY,MYF(MY_WME)))) |
| 1073 | DBUG_RETURN(1); |
| 1074 | dirname_part(dir_buff, org_name, &dir_buff_length); |
| 1075 | if ((temp_file= create_temp_file(tempname, dir_buff, "PR" , 0, |
| 1076 | MYF(MY_WME))) < 0) |
| 1077 | { |
| 1078 | my_fclose(in,MYF(0)); |
| 1079 | DBUG_RETURN(1); |
| 1080 | } |
| 1081 | if (!(out= my_fdopen(temp_file, tempname, O_WRONLY, MYF(MY_WME)))) |
| 1082 | { |
| 1083 | my_fclose(in,MYF(0)); |
| 1084 | DBUG_RETURN(1); |
| 1085 | } |
| 1086 | |
| 1087 | error=convert_pipe(rep,in,out); |
| 1088 | my_fclose(in,MYF(0)); my_fclose(out,MYF(0)); |
| 1089 | |
| 1090 | if (updated && ! error) |
| 1091 | my_redel(org_name, tempname, 0, MYF(MY_WME | MY_LINK_WARNING)); |
| 1092 | else |
| 1093 | my_delete(tempname,MYF(MY_WME)); |
| 1094 | if (!silent && ! error) |
| 1095 | { |
| 1096 | if (updated) |
| 1097 | printf("%s converted\n" ,name); |
| 1098 | else if (verbose) |
| 1099 | printf("%s left unchanged\n" ,name); |
| 1100 | } |
| 1101 | DBUG_RETURN(error); |
| 1102 | } |
| 1103 | |