1 | /* |
2 | Copyright (c) 2000, 2014, Oracle and/or its affiliates |
3 | |
4 | This program is free software; you can redistribute it and/or |
5 | modify it under the terms of the GNU General Public License |
6 | as published by the Free Software Foundation; version 2 of |
7 | the License. |
8 | |
9 | This program is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | GNU General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU General Public License |
15 | along with this program; if not, write to the Free Software |
16 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
17 | 02110-1301 USA */ |
18 | |
19 | /* |
20 | Replace strings in textfile |
21 | |
22 | This program replaces strings in files or from stdin to stdout. |
23 | It accepts a list of from-string/to-string pairs and replaces |
24 | each occurrence of a from-string with the corresponding to-string. |
25 | The first occurrence of a found string is matched. If there is more |
26 | than one possibility for the string to replace, longer matches |
27 | are preferred before shorter matches. |
28 | |
29 | Special characters in from string: |
30 | \^ Match start of line. |
31 | \$ Match end of line. |
32 | \b Match space-character, start of line or end of line. |
33 | For end \b the next replace starts locking at the end space-character. |
34 | An \b alone or in a string matches only a space-character. |
35 | \r, \t, \v as in C. |
36 | The programs make a DFA-state-machine of the strings and the speed isn't |
37 | dependent on the count of replace-strings (only of the number of replaces). |
38 | A line is assumed ending with \n or \0. |
39 | There are no limit except memory on length of strings. |
40 | |
41 | Written by Monty. |
42 | fill_buffer_retaining() is taken from gnu-grep and modified. |
43 | */ |
44 | |
45 | #include <my_global.h> |
46 | #include <m_ctype.h> |
47 | #include <my_sys.h> |
48 | #include <m_string.h> |
49 | #include <errno.h> |
50 | |
51 | #define PC_MALLOC 256 /* Bytes for pointers */ |
52 | #define PS_MALLOC 512 /* Bytes for data */ |
53 | |
54 | typedef struct st_pointer_array { /* when using array-strings */ |
55 | TYPELIB typelib; /* Pointer to strings */ |
56 | uchar *str; /* Strings is here */ |
57 | uint8 *flag; /* Flag about each var. */ |
58 | uint array_allocs,max_count,length,max_length; |
59 | } POINTER_ARRAY; |
60 | |
61 | #define SPACE_CHAR 256 |
62 | #define START_OF_LINE 257 |
63 | #define END_OF_LINE 258 |
64 | #define LAST_CHAR_CODE 259 |
65 | |
66 | typedef struct st_replace { |
67 | my_bool found; |
68 | struct st_replace *next[256]; |
69 | } REPLACE; |
70 | |
71 | typedef struct st_replace_found { |
72 | my_bool found; |
73 | char *replace_string; |
74 | uint to_offset; |
75 | int from_offset; |
76 | } REPLACE_STRING; |
77 | |
78 | #ifndef WORD_BIT |
79 | #define WORD_BIT (8*sizeof(uint)) |
80 | #endif |
81 | |
82 | /* functions defined in this file */ |
83 | |
84 | static int static_get_options(int *argc,char * * *argv); |
85 | static int get_replace_strings(int *argc,char * * *argv, |
86 | POINTER_ARRAY *from_array, |
87 | POINTER_ARRAY *to_array); |
88 | static int insert_pointer_name(POINTER_ARRAY *pa, char * name); |
89 | static void free_pointer_array(POINTER_ARRAY *pa); |
90 | static int convert_pipe(REPLACE *,FILE *,FILE *); |
91 | static int convert_file(REPLACE *, char *); |
92 | static REPLACE *init_replace(char * *from, char * *to,uint count, |
93 | char * word_end_chars); |
94 | static uint replace_strings(REPLACE *rep, char * *start,uint *max_length, |
95 | char * from); |
96 | static int initialize_buffer(void); |
97 | static void reset_buffer(void); |
98 | static void free_buffer(void); |
99 | |
100 | static int silent=0,verbose=0,updated=0; |
101 | |
102 | /* The main program */ |
103 | |
104 | int main(int argc, char *argv[]) |
105 | { |
106 | int i,error; |
107 | char word_end_chars[256],*pos; |
108 | POINTER_ARRAY from,to; |
109 | REPLACE *replace; |
110 | MY_INIT(argv[0]); |
111 | |
112 | if (static_get_options(&argc,&argv)) |
113 | exit(1); |
114 | if (get_replace_strings(&argc,&argv,&from,&to)) |
115 | exit(1); |
116 | |
117 | for (i=1,pos=word_end_chars ; i < 256 ; i++) |
118 | if (my_isspace(&my_charset_latin1,i)) |
119 | *pos++= (char) i; |
120 | *pos=0; |
121 | if (!(replace=init_replace((char**) from.typelib.type_names, |
122 | (char**) to.typelib.type_names, |
123 | (uint) from.typelib.count,word_end_chars))) |
124 | exit(1); |
125 | free_pointer_array(&from); |
126 | free_pointer_array(&to); |
127 | if (initialize_buffer()) |
128 | return 1; |
129 | |
130 | error=0; |
131 | if (argc == 0) |
132 | error=convert_pipe(replace,stdin,stdout); |
133 | else |
134 | { |
135 | while (argc--) |
136 | { |
137 | error=convert_file(replace,*(argv++)); |
138 | } |
139 | } |
140 | free_buffer(); |
141 | my_free(replace); |
142 | my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR); |
143 | exit(error ? 2 : 0); |
144 | return 0; /* No compiler warning */ |
145 | } /* main */ |
146 | |
147 | |
148 | /* reads options */ |
149 | /* Initiates DEBUG - but no debugging here ! */ |
150 | |
151 | static int static_get_options(argc,argv) |
152 | register int *argc; |
153 | register char **argv[]; |
154 | { |
155 | int help,version; |
156 | char *pos; |
157 | |
158 | silent=verbose=help=0; |
159 | |
160 | while (--*argc > 0 && *(pos = *(++*argv)) == '-' && pos[1] != '-') { |
161 | while (*++pos) |
162 | { |
163 | version=0; |
164 | switch((*pos)) { |
165 | case 's': |
166 | silent=1; |
167 | break; |
168 | case 'v': |
169 | verbose=1; |
170 | break; |
171 | case '#': |
172 | DBUG_PUSH (++pos); |
173 | pos= (char*) " " ; /* Skip rest of arguments */ |
174 | break; |
175 | case 'V': |
176 | version=1; |
177 | /* fall through */ |
178 | case 'I': |
179 | case '?': |
180 | help=1; /* Help text written */ |
181 | printf("%s Ver 1.4 for %s at %s\n" ,my_progname,SYSTEM_TYPE, |
182 | MACHINE_TYPE); |
183 | if (version) |
184 | break; |
185 | puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,\nand you are welcome to modify and redistribute it under the GPL license\n" ); |
186 | puts("This program replaces strings in files or from stdin to stdout.\n" |
187 | "It accepts a list of from-string/to-string pairs and replaces\n" |
188 | "each occurrence of a from-string with the corresponding to-string.\n" |
189 | "The first occurrence of a found string is matched. If there is\n" |
190 | "more than one possibility for the string to replace, longer\n" |
191 | "matches are preferred before shorter matches.\n\n" |
192 | "A from-string can contain these special characters:\n" |
193 | " \\^ Match start of line.\n" |
194 | " \\$ Match end of line.\n" |
195 | " \\b Match space-character, start of line or end of line.\n" |
196 | " For a end \\b the next replace starts locking at the end\n" |
197 | " space-character. A \\b alone in a string matches only a\n" |
198 | " space-character.\n" ); |
199 | printf("Usage: %s [-?svIV] from to from to ... -- [files]\n" , my_progname); |
200 | puts("or" ); |
201 | printf("Usage: %s [-?svIV] from to from to ... < fromfile > tofile\n" , my_progname); |
202 | puts("" ); |
203 | puts("Options: -? or -I \"Info\" -s \"silent\" -v \"verbose\"" ); |
204 | break; |
205 | default: |
206 | fprintf(stderr,"illegal option: -%c\n" ,*pos); |
207 | break; |
208 | } |
209 | } |
210 | } |
211 | if (*argc == 0) |
212 | { |
213 | if (!help) |
214 | my_message(0,"No replace options given" ,MYF(ME_BELL)); |
215 | exit(0); /* Don't use as pipe */ |
216 | } |
217 | return(0); |
218 | } /* static_get_options */ |
219 | |
220 | |
221 | static int get_replace_strings(argc,argv,from_array,to_array) |
222 | register int *argc; |
223 | register char **argv[]; |
224 | POINTER_ARRAY *from_array,*to_array; |
225 | { |
226 | char *pos; |
227 | |
228 | bzero((char*) from_array,sizeof(from_array[0])); |
229 | bzero((char*) to_array,sizeof(to_array[0])); |
230 | while (*argc > 0 && (*(pos = *(*argv)) != '-' || pos[1] != '-' || pos[2])) |
231 | { |
232 | insert_pointer_name(from_array,pos); |
233 | (*argc)--; |
234 | (*argv)++; |
235 | if (!*argc || !strcmp(**argv,"--" )) |
236 | { |
237 | my_message(0,"No to-string for last from-string" ,MYF(ME_BELL)); |
238 | return 1; |
239 | } |
240 | insert_pointer_name(to_array,**argv); |
241 | (*argc)--; |
242 | (*argv)++; |
243 | } |
244 | if (*argc) |
245 | { /* Skip "--" argument */ |
246 | (*argc)--; |
247 | (*argv)++; |
248 | } |
249 | return 0; |
250 | } |
251 | |
252 | static int insert_pointer_name(reg1 POINTER_ARRAY *pa,char * name) |
253 | { |
254 | uint i,length,old_count; |
255 | uchar *new_pos; |
256 | const char **new_array; |
257 | DBUG_ENTER("insert_pointer_name" ); |
258 | |
259 | if (! pa->typelib.count) |
260 | { |
261 | if (!(pa->typelib.type_names=(const char **) |
262 | my_malloc(((PC_MALLOC-MALLOC_OVERHEAD)/ |
263 | (sizeof(char *)+sizeof(*pa->flag))* |
264 | (sizeof(char *)+sizeof(*pa->flag))),MYF(MY_WME)))) |
265 | DBUG_RETURN(-1); |
266 | if (!(pa->str= (uchar*) my_malloc((uint) (PS_MALLOC-MALLOC_OVERHEAD), |
267 | MYF(MY_WME)))) |
268 | { |
269 | my_free((void*) pa->typelib.type_names); |
270 | DBUG_RETURN (-1); |
271 | } |
272 | pa->max_count=(PC_MALLOC-MALLOC_OVERHEAD)/(sizeof(uchar*)+ |
273 | sizeof(*pa->flag)); |
274 | pa->flag= (uint8*) (pa->typelib.type_names+pa->max_count); |
275 | pa->length=0; |
276 | pa->max_length=PS_MALLOC-MALLOC_OVERHEAD; |
277 | pa->array_allocs=1; |
278 | } |
279 | length=(uint) strlen(name)+1; |
280 | if (pa->length+length >= pa->max_length) |
281 | { |
282 | pa->max_length=(pa->length+length+MALLOC_OVERHEAD+PS_MALLOC-1)/PS_MALLOC; |
283 | pa->max_length=pa->max_length*PS_MALLOC-MALLOC_OVERHEAD; |
284 | if (!(new_pos= (uchar*) my_realloc((uchar*) pa->str, |
285 | (uint) pa->max_length, |
286 | MYF(MY_WME)))) |
287 | DBUG_RETURN(1); |
288 | if (new_pos != pa->str) |
289 | { |
290 | my_ptrdiff_t diff=PTR_BYTE_DIFF(new_pos,pa->str); |
291 | for (i=0 ; i < pa->typelib.count ; i++) |
292 | pa->typelib.type_names[i]= ADD_TO_PTR(pa->typelib.type_names[i],diff, |
293 | char*); |
294 | pa->str=new_pos; |
295 | } |
296 | } |
297 | if (pa->typelib.count >= pa->max_count-1) |
298 | { |
299 | int len; |
300 | pa->array_allocs++; |
301 | len=(PC_MALLOC*pa->array_allocs - MALLOC_OVERHEAD); |
302 | if (!(new_array=(const char **) my_realloc((uchar*) pa->typelib.type_names, |
303 | (uint) len/ |
304 | (sizeof(uchar*)+sizeof(*pa->flag))* |
305 | (sizeof(uchar*)+sizeof(*pa->flag)), |
306 | MYF(MY_WME)))) |
307 | DBUG_RETURN(1); |
308 | pa->typelib.type_names=new_array; |
309 | old_count=pa->max_count; |
310 | pa->max_count=len/(sizeof(uchar*) + sizeof(*pa->flag)); |
311 | pa->flag= (uint8*) (pa->typelib.type_names+pa->max_count); |
312 | memcpy((uchar*) pa->flag,(char *) (pa->typelib.type_names+old_count), |
313 | old_count*sizeof(*pa->flag)); |
314 | } |
315 | pa->flag[pa->typelib.count]=0; /* Reset flag */ |
316 | pa->typelib.type_names[pa->typelib.count++]= (char*) (pa->str+pa->length); |
317 | pa->typelib.type_names[pa->typelib.count]= NullS; /* Put end-mark */ |
318 | (void) strmov((char*) pa->str + pa->length, name); |
319 | pa->length+=length; |
320 | DBUG_RETURN(0); |
321 | } /* insert_pointer_name */ |
322 | |
323 | |
324 | /* free pointer array */ |
325 | |
326 | static void free_pointer_array(reg1 POINTER_ARRAY *pa) |
327 | { |
328 | if (pa->typelib.count) |
329 | { |
330 | pa->typelib.count=0; |
331 | my_free((void*) pa->typelib.type_names); |
332 | pa->typelib.type_names=0; |
333 | my_free(pa->str); |
334 | } |
335 | return; |
336 | } /* free_pointer_array */ |
337 | |
338 | |
339 | /* Code for replace rutines */ |
340 | |
341 | #define SET_MALLOC_HUNC 64 |
342 | |
343 | typedef struct st_rep_set { |
344 | uint *bits; /* Pointer to used sets */ |
345 | short next[LAST_CHAR_CODE]; /* Pointer to next sets */ |
346 | uint found_len; /* Best match to date */ |
347 | int found_offset; |
348 | uint table_offset; |
349 | uint size_of_bits; /* For convinience */ |
350 | } REP_SET; |
351 | |
352 | typedef struct st_rep_sets { |
353 | uint count; /* Number of sets */ |
354 | uint ; /* Extra sets in buffer */ |
355 | uint invisible; /* Sets not chown */ |
356 | uint size_of_bits; |
357 | REP_SET *set,*set_buffer; |
358 | uint *bit_buffer; |
359 | } REP_SETS; |
360 | |
361 | typedef struct st_found_set { |
362 | uint table_offset; |
363 | int found_offset; |
364 | } FOUND_SET; |
365 | |
366 | typedef struct st_follow { |
367 | int chr; |
368 | uint table_offset; |
369 | uint len; |
370 | } FOLLOWS; |
371 | |
372 | |
373 | static int init_sets(REP_SETS *sets,uint states); |
374 | static REP_SET *make_new_set(REP_SETS *sets); |
375 | static void make_sets_invisible(REP_SETS *sets); |
376 | static void free_last_set(REP_SETS *sets); |
377 | static void free_sets(REP_SETS *sets); |
378 | static void internal_set_bit(REP_SET *set, uint bit); |
379 | static void internal_clear_bit(REP_SET *set, uint bit); |
380 | static void or_bits(REP_SET *to,REP_SET *from); |
381 | static void copy_bits(REP_SET *to,REP_SET *from); |
382 | static int cmp_bits(REP_SET *set1,REP_SET *set2); |
383 | static int get_next_bit(REP_SET *set,uint lastpos); |
384 | static short find_set(REP_SETS *sets,REP_SET *find); |
385 | static short find_found(FOUND_SET *found_set,uint table_offset, |
386 | int found_offset); |
387 | static uint start_at_word(char * pos); |
388 | static uint end_of_word(char * pos); |
389 | static uint replace_len(char * pos); |
390 | |
391 | static uint found_sets=0; |
392 | |
393 | |
394 | /* Init a replace structure for further calls */ |
395 | |
396 | static REPLACE *init_replace(char * *from, char * *to,uint count, |
397 | char * word_end_chars) |
398 | { |
399 | uint i,j,states,set_nr,len,result_len,max_length,found_end,bits_set,bit_nr; |
400 | int used_sets,chr; |
401 | short default_state; |
402 | char used_chars[LAST_CHAR_CODE],is_word_end[256]; |
403 | char * pos, *to_pos, **to_array; |
404 | REP_SETS sets; |
405 | REP_SET *set,*start_states,*word_states,*new_set; |
406 | FOLLOWS *follow,*follow_ptr; |
407 | REPLACE *replace; |
408 | FOUND_SET *found_set; |
409 | REPLACE_STRING *rep_str; |
410 | DBUG_ENTER("init_replace" ); |
411 | |
412 | /* Count number of states */ |
413 | for (i=result_len=max_length=0 , states=2 ; i < count ; i++) |
414 | { |
415 | len=replace_len(from[i]); |
416 | if (!len) |
417 | { |
418 | errno=EINVAL; |
419 | my_message(0,"No to-string for last from-string" ,MYF(ME_BELL)); |
420 | DBUG_RETURN(0); |
421 | } |
422 | states+=len+1; |
423 | result_len+=(uint) strlen(to[i])+1; |
424 | if (len > max_length) |
425 | max_length=len; |
426 | } |
427 | bzero((char*) is_word_end,sizeof(is_word_end)); |
428 | for (i=0 ; word_end_chars[i] ; i++) |
429 | is_word_end[(uchar) word_end_chars[i]]=1; |
430 | |
431 | if (init_sets(&sets,states)) |
432 | DBUG_RETURN(0); |
433 | found_sets=0; |
434 | if (!(found_set= (FOUND_SET*) my_malloc(sizeof(FOUND_SET)*max_length*count, |
435 | MYF(MY_WME)))) |
436 | { |
437 | free_sets(&sets); |
438 | DBUG_RETURN(0); |
439 | } |
440 | (void) make_new_set(&sets); /* Set starting set */ |
441 | make_sets_invisible(&sets); /* Hide previus sets */ |
442 | used_sets=-1; |
443 | word_states=make_new_set(&sets); /* Start of new word */ |
444 | start_states=make_new_set(&sets); /* This is first state */ |
445 | if (!(follow=(FOLLOWS*) my_malloc((states+2)*sizeof(FOLLOWS),MYF(MY_WME)))) |
446 | { |
447 | free_sets(&sets); |
448 | my_free(found_set); |
449 | DBUG_RETURN(0); |
450 | } |
451 | |
452 | /* Init follow_ptr[] */ |
453 | for (i=0, states=1, follow_ptr=follow+1 ; i < count ; i++) |
454 | { |
455 | if (from[i][0] == '\\' && from[i][1] == '^') |
456 | { |
457 | internal_set_bit(start_states,states+1); |
458 | if (!from[i][2]) |
459 | { |
460 | start_states->table_offset=i; |
461 | start_states->found_offset=1; |
462 | } |
463 | } |
464 | else if (from[i][0] == '\\' && from[i][1] == '$') |
465 | { |
466 | internal_set_bit(start_states,states); |
467 | internal_set_bit(word_states,states); |
468 | if (!from[i][2] && start_states->table_offset == (uint) ~0) |
469 | { |
470 | start_states->table_offset=i; |
471 | start_states->found_offset=0; |
472 | } |
473 | } |
474 | else |
475 | { |
476 | internal_set_bit(word_states,states); |
477 | if (from[i][0] == '\\' && (from[i][1] == 'b' && from[i][2])) |
478 | internal_set_bit(start_states,states+1); |
479 | else |
480 | internal_set_bit(start_states,states); |
481 | } |
482 | for (pos=from[i], len=0; *pos ; pos++) |
483 | { |
484 | if (*pos == '\\' && *(pos+1)) |
485 | { |
486 | pos++; |
487 | switch (*pos) { |
488 | case 'b': |
489 | follow_ptr->chr = SPACE_CHAR; |
490 | break; |
491 | case '^': |
492 | follow_ptr->chr = START_OF_LINE; |
493 | break; |
494 | case '$': |
495 | follow_ptr->chr = END_OF_LINE; |
496 | break; |
497 | case 'r': |
498 | follow_ptr->chr = '\r'; |
499 | break; |
500 | case 't': |
501 | follow_ptr->chr = '\t'; |
502 | break; |
503 | case 'v': |
504 | follow_ptr->chr = '\v'; |
505 | break; |
506 | default: |
507 | follow_ptr->chr = (uchar) *pos; |
508 | break; |
509 | } |
510 | } |
511 | else |
512 | follow_ptr->chr= (uchar) *pos; |
513 | follow_ptr->table_offset=i; |
514 | follow_ptr->len= ++len; |
515 | follow_ptr++; |
516 | } |
517 | follow_ptr->chr=0; |
518 | follow_ptr->table_offset=i; |
519 | follow_ptr->len=len; |
520 | follow_ptr++; |
521 | states+=(uint) len+1; |
522 | } |
523 | |
524 | |
525 | for (set_nr=0,pos=0 ; set_nr < sets.count ; set_nr++) |
526 | { |
527 | set=sets.set+set_nr; |
528 | default_state= 0; /* Start from beginning */ |
529 | |
530 | /* If end of found-string not found or start-set with current set */ |
531 | |
532 | for (i= (uint) ~0; (i=get_next_bit(set,i)) ;) |
533 | { |
534 | if (!follow[i].chr) |
535 | { |
536 | if (! default_state) |
537 | default_state= find_found(found_set,set->table_offset, |
538 | set->found_offset+1); |
539 | } |
540 | } |
541 | copy_bits(sets.set+used_sets,set); /* Save set for changes */ |
542 | if (!default_state) |
543 | or_bits(sets.set+used_sets,sets.set); /* Can restart from start */ |
544 | |
545 | /* Find all chars that follows current sets */ |
546 | bzero((char*) used_chars,sizeof(used_chars)); |
547 | for (i= (uint) ~0; (i=get_next_bit(sets.set+used_sets,i)) ;) |
548 | { |
549 | used_chars[follow[i].chr]=1; |
550 | if ((follow[i].chr == SPACE_CHAR && !follow[i+1].chr && |
551 | follow[i].len > 1) || follow[i].chr == END_OF_LINE) |
552 | used_chars[0]=1; |
553 | } |
554 | |
555 | /* Mark word_chars used if \b is in state */ |
556 | if (used_chars[SPACE_CHAR]) |
557 | for (pos= word_end_chars ; *pos ; pos++) |
558 | used_chars[(int) (uchar) *pos] = 1; |
559 | |
560 | /* Handle other used characters */ |
561 | for (chr= 0 ; chr < 256 ; chr++) |
562 | { |
563 | if (! used_chars[chr]) |
564 | set->next[chr]= (short) (chr ? default_state : -1); |
565 | else |
566 | { |
567 | new_set=make_new_set(&sets); |
568 | set=sets.set+set_nr; /* if realloc */ |
569 | new_set->table_offset=set->table_offset; |
570 | new_set->found_len=set->found_len; |
571 | new_set->found_offset=set->found_offset+1; |
572 | found_end=0; |
573 | |
574 | for (i= (uint) ~0 ; (i=get_next_bit(sets.set+used_sets,i)) ; ) |
575 | { |
576 | if (!follow[i].chr || follow[i].chr == chr || |
577 | (follow[i].chr == SPACE_CHAR && |
578 | (is_word_end[chr] || |
579 | (!chr && follow[i].len > 1 && ! follow[i+1].chr))) || |
580 | (follow[i].chr == END_OF_LINE && ! chr)) |
581 | { |
582 | if ((! chr || (follow[i].chr && !follow[i+1].chr)) && |
583 | follow[i].len > found_end) |
584 | found_end=follow[i].len; |
585 | if (chr && follow[i].chr) |
586 | internal_set_bit(new_set,i+1); /* To next set */ |
587 | else |
588 | internal_set_bit(new_set,i); |
589 | } |
590 | } |
591 | if (found_end) |
592 | { |
593 | new_set->found_len=0; /* Set for testing if first */ |
594 | bits_set=0; |
595 | for (i= (uint) ~0; (i=get_next_bit(new_set,i)) ;) |
596 | { |
597 | if ((follow[i].chr == SPACE_CHAR || |
598 | follow[i].chr == END_OF_LINE) && ! chr) |
599 | bit_nr=i+1; |
600 | else |
601 | bit_nr=i; |
602 | if (follow[bit_nr-1].len < found_end || |
603 | (new_set->found_len && |
604 | (chr == 0 || !follow[bit_nr].chr))) |
605 | internal_clear_bit(new_set,i); |
606 | else |
607 | { |
608 | if (chr == 0 || !follow[bit_nr].chr) |
609 | { /* best match */ |
610 | new_set->table_offset=follow[bit_nr].table_offset; |
611 | if (chr || (follow[i].chr == SPACE_CHAR || |
612 | follow[i].chr == END_OF_LINE)) |
613 | new_set->found_offset=found_end; /* New match */ |
614 | new_set->found_len=found_end; |
615 | } |
616 | bits_set++; |
617 | } |
618 | } |
619 | if (bits_set == 1) |
620 | { |
621 | set->next[chr] = find_found(found_set, |
622 | new_set->table_offset, |
623 | new_set->found_offset); |
624 | free_last_set(&sets); |
625 | } |
626 | else |
627 | set->next[chr] = find_set(&sets,new_set); |
628 | } |
629 | else |
630 | set->next[chr] = find_set(&sets,new_set); |
631 | } |
632 | } |
633 | } |
634 | |
635 | /* Alloc replace structure for the replace-state-machine */ |
636 | |
637 | if ((replace=(REPLACE*) my_malloc(sizeof(REPLACE)*(sets.count)+ |
638 | sizeof(REPLACE_STRING)*(found_sets+1)+ |
639 | sizeof(char *)*count+result_len, |
640 | MYF(MY_WME | MY_ZEROFILL)))) |
641 | { |
642 | rep_str=(REPLACE_STRING*) (replace+sets.count); |
643 | to_array=(char **) (rep_str+found_sets+1); |
644 | to_pos=(char *) (to_array+count); |
645 | for (i=0 ; i < count ; i++) |
646 | { |
647 | to_array[i]=to_pos; |
648 | to_pos=strmov(to_pos,to[i])+1; |
649 | } |
650 | rep_str[0].found=1; |
651 | rep_str[0].replace_string=0; |
652 | for (i=1 ; i <= found_sets ; i++) |
653 | { |
654 | pos=from[found_set[i-1].table_offset]; |
655 | rep_str[i].found= (my_bool) (!memcmp(pos,"\\^" ,3) ? 2 : 1); |
656 | rep_str[i].replace_string=to_array[found_set[i-1].table_offset]; |
657 | rep_str[i].to_offset=found_set[i-1].found_offset-start_at_word(pos); |
658 | rep_str[i].from_offset=found_set[i-1].found_offset-replace_len(pos)+ |
659 | end_of_word(pos); |
660 | } |
661 | for (i=0 ; i < sets.count ; i++) |
662 | { |
663 | for (j=0 ; j < 256 ; j++) |
664 | if (sets.set[i].next[j] >= 0) |
665 | replace[i].next[j]=replace+sets.set[i].next[j]; |
666 | else |
667 | replace[i].next[j]=(REPLACE*) (rep_str+(-sets.set[i].next[j]-1)); |
668 | } |
669 | } |
670 | my_free(follow); |
671 | free_sets(&sets); |
672 | my_free(found_set); |
673 | DBUG_PRINT("exit" ,("Replace table has %d states" ,sets.count)); |
674 | DBUG_RETURN(replace); |
675 | } |
676 | |
677 | |
678 | static int init_sets(REP_SETS *sets,uint states) |
679 | { |
680 | bzero((char*) sets,sizeof(*sets)); |
681 | sets->size_of_bits=((states+7)/8); |
682 | if (!(sets->set_buffer=(REP_SET*) my_malloc(sizeof(REP_SET)*SET_MALLOC_HUNC, |
683 | MYF(MY_WME)))) |
684 | return 1; |
685 | if (!(sets->bit_buffer=(uint*) my_malloc(sizeof(uint)*sets->size_of_bits* |
686 | SET_MALLOC_HUNC,MYF(MY_WME)))) |
687 | { |
688 | my_free(sets->set); |
689 | return 1; |
690 | } |
691 | return 0; |
692 | } |
693 | |
694 | /* Make help sets invisible for nicer codeing */ |
695 | |
696 | static void make_sets_invisible(REP_SETS *sets) |
697 | { |
698 | sets->invisible=sets->count; |
699 | sets->set+=sets->count; |
700 | sets->count=0; |
701 | } |
702 | |
703 | static REP_SET *make_new_set(REP_SETS *sets) |
704 | { |
705 | uint i,count,*bit_buffer; |
706 | REP_SET *set; |
707 | if (sets->extra) |
708 | { |
709 | sets->extra--; |
710 | set=sets->set+ sets->count++; |
711 | bzero((char*) set->bits,sizeof(uint)*sets->size_of_bits); |
712 | bzero((char*) &set->next[0],sizeof(set->next[0])*LAST_CHAR_CODE); |
713 | set->found_offset=0; |
714 | set->found_len=0; |
715 | set->table_offset= (uint) ~0; |
716 | set->size_of_bits=sets->size_of_bits; |
717 | return set; |
718 | } |
719 | count=sets->count+sets->invisible+SET_MALLOC_HUNC; |
720 | if (!(set=(REP_SET*) my_realloc((uchar*) sets->set_buffer, |
721 | sizeof(REP_SET)*count, |
722 | MYF(MY_WME)))) |
723 | return 0; |
724 | sets->set_buffer=set; |
725 | sets->set=set+sets->invisible; |
726 | if (!(bit_buffer=(uint*) my_realloc((uchar*) sets->bit_buffer, |
727 | (sizeof(uint)*sets->size_of_bits)*count, |
728 | MYF(MY_WME)))) |
729 | return 0; |
730 | sets->bit_buffer=bit_buffer; |
731 | for (i=0 ; i < count ; i++) |
732 | { |
733 | sets->set_buffer[i].bits=bit_buffer; |
734 | bit_buffer+=sets->size_of_bits; |
735 | } |
736 | sets->extra=SET_MALLOC_HUNC; |
737 | return make_new_set(sets); |
738 | } |
739 | |
740 | static void free_last_set(REP_SETS *sets) |
741 | { |
742 | sets->count--; |
743 | sets->extra++; |
744 | return; |
745 | } |
746 | |
747 | static void free_sets(REP_SETS *sets) |
748 | { |
749 | my_free(sets->set_buffer); |
750 | my_free(sets->bit_buffer); |
751 | return; |
752 | } |
753 | |
754 | static void internal_set_bit(REP_SET *set, uint bit) |
755 | { |
756 | set->bits[bit / WORD_BIT] |= 1 << (bit % WORD_BIT); |
757 | return; |
758 | } |
759 | |
760 | static void internal_clear_bit(REP_SET *set, uint bit) |
761 | { |
762 | set->bits[bit / WORD_BIT] &= ~ (1 << (bit % WORD_BIT)); |
763 | return; |
764 | } |
765 | |
766 | |
767 | static void or_bits(REP_SET *to,REP_SET *from) |
768 | { |
769 | reg1 uint i; |
770 | for (i=0 ; i < to->size_of_bits ; i++) |
771 | to->bits[i]|=from->bits[i]; |
772 | return; |
773 | } |
774 | |
775 | static void copy_bits(REP_SET *to,REP_SET *from) |
776 | { |
777 | memcpy((uchar*) to->bits,(uchar*) from->bits, |
778 | (size_t) (sizeof(uint) * to->size_of_bits)); |
779 | } |
780 | |
781 | static int cmp_bits(REP_SET *set1,REP_SET *set2) |
782 | { |
783 | return memcmp(set1->bits, set2->bits, |
784 | sizeof(uint) * set1->size_of_bits); |
785 | } |
786 | |
787 | |
788 | /* Get next set bit from set. */ |
789 | |
790 | static int get_next_bit(REP_SET *set,uint lastpos) |
791 | { |
792 | uint pos,*start,*end,bits; |
793 | |
794 | start=set->bits+ ((lastpos+1) / WORD_BIT); |
795 | end=set->bits + set->size_of_bits; |
796 | bits=start[0] & ~((1 << ((lastpos+1) % WORD_BIT)) -1); |
797 | |
798 | while (! bits && ++start < end) |
799 | bits=start[0]; |
800 | if (!bits) |
801 | return 0; |
802 | pos=(uint) (start-set->bits)*WORD_BIT; |
803 | while (! (bits & 1)) |
804 | { |
805 | bits>>=1; |
806 | pos++; |
807 | } |
808 | return pos; |
809 | } |
810 | |
811 | /* find if there is a same set in sets. If there is, use it and |
812 | free given set, else put in given set in sets and return it's |
813 | position */ |
814 | |
815 | static short find_set(REP_SETS *sets,REP_SET *find) |
816 | { |
817 | uint i; |
818 | for (i=0 ; i < sets->count-1 ; i++) |
819 | { |
820 | if (!cmp_bits(sets->set+i,find)) |
821 | { |
822 | free_last_set(sets); |
823 | return (short) i; |
824 | } |
825 | } |
826 | return (short) i; /* return new position */ |
827 | } |
828 | |
829 | |
830 | /* |
831 | find if there is a found_set with same table_offset & found_offset |
832 | If there is return offset to it, else add new offset and return pos. |
833 | Pos returned is -offset-2 in found_set_structure because it's is |
834 | saved in set->next and set->next[] >= 0 points to next set and |
835 | set->next[] == -1 is reserved for end without replaces. |
836 | */ |
837 | |
838 | static short find_found(FOUND_SET *found_set,uint table_offset, |
839 | int found_offset) |
840 | { |
841 | int i; |
842 | for (i=0 ; (uint) i < found_sets ; i++) |
843 | if (found_set[i].table_offset == table_offset && |
844 | found_set[i].found_offset == found_offset) |
845 | return (short) (-i-2); |
846 | found_set[i].table_offset=table_offset; |
847 | found_set[i].found_offset=found_offset; |
848 | found_sets++; |
849 | return (short) (-i-2); /* return new position */ |
850 | } |
851 | |
852 | /* Return 1 if regexp starts with \b or ends with \b*/ |
853 | |
854 | static uint start_at_word(char * pos) |
855 | { |
856 | return (((!memcmp(pos,"\\b" ,2) && pos[2]) || !memcmp(pos,"\\^" ,2)) ? 1 : 0); |
857 | } |
858 | |
859 | static uint end_of_word(char * pos) |
860 | { |
861 | char * end=strend(pos); |
862 | return ((end > pos+2 && !memcmp(end-2,"\\b" ,2)) || |
863 | (end >= pos+2 && !memcmp(end-2,"\\$" ,2))) ? |
864 | 1 : 0; |
865 | } |
866 | |
867 | |
868 | static uint replace_len(char * str) |
869 | { |
870 | uint len=0; |
871 | while (*str) |
872 | { |
873 | if (str[0] == '\\' && str[1]) |
874 | str++; |
875 | str++; |
876 | len++; |
877 | } |
878 | return len; |
879 | } |
880 | |
881 | |
882 | /* The actual loop */ |
883 | |
884 | static uint replace_strings(REPLACE *rep, char **start, uint *max_length, |
885 | char *from) |
886 | { |
887 | reg1 REPLACE *rep_pos; |
888 | reg2 REPLACE_STRING *rep_str; |
889 | char *to, *end, *pos, *new; |
890 | |
891 | end=(to= *start) + *max_length-1; |
892 | rep_pos=rep+1; |
893 | for(;;) |
894 | { |
895 | while (!rep_pos->found) |
896 | { |
897 | rep_pos= rep_pos->next[(uchar) *from]; |
898 | if (to == end) |
899 | { |
900 | (*max_length)+=8192; |
901 | if (!(new=my_realloc(*start,*max_length,MYF(MY_WME)))) |
902 | return (uint) -1; |
903 | to=new+(to - *start); |
904 | end=(*start=new)+ *max_length-1; |
905 | } |
906 | *to++= *from++; |
907 | } |
908 | if (!(rep_str = ((REPLACE_STRING*) rep_pos))->replace_string) |
909 | return (uint) (to - *start)-1; |
910 | updated=1; /* Some char * is replaced */ |
911 | to-=rep_str->to_offset; |
912 | for (pos=rep_str->replace_string; *pos ; pos++) |
913 | { |
914 | if (to == end) |
915 | { |
916 | (*max_length)*=2; |
917 | if (!(new=my_realloc(*start,*max_length,MYF(MY_WME)))) |
918 | return (uint) -1; |
919 | to=new+(to - *start); |
920 | end=(*start=new)+ *max_length-1; |
921 | } |
922 | *to++= *pos; |
923 | } |
924 | if (!*(from-=rep_str->from_offset) && rep_pos->found != 2) |
925 | return (uint) (to - *start); |
926 | rep_pos=rep; |
927 | } |
928 | } |
929 | |
930 | static char *buffer; /* The buffer itself, grown as needed. */ |
931 | static int bufbytes; /* Number of bytes in the buffer. */ |
932 | static int bufread,my_eof; /* Number of bytes to get with each read(). */ |
933 | static uint bufalloc; |
934 | static char *out_buff; |
935 | static uint out_length; |
936 | |
937 | static int initialize_buffer() |
938 | { |
939 | bufread = 8192; |
940 | bufalloc = bufread + bufread / 2; |
941 | if (!(buffer = my_malloc(bufalloc+1,MYF(MY_WME)))) |
942 | return 1; |
943 | bufbytes=my_eof=0; |
944 | out_length=bufread; |
945 | if (!(out_buff=my_malloc(out_length,MYF(MY_WME)))) |
946 | return(1); |
947 | return 0; |
948 | } |
949 | |
950 | static void reset_buffer() |
951 | { |
952 | bufbytes=my_eof=0; |
953 | } |
954 | |
955 | static void free_buffer() |
956 | { |
957 | my_free(buffer); |
958 | my_free(out_buff); |
959 | } |
960 | |
961 | |
962 | /* |
963 | Fill the buffer retaining the last n bytes at the beginning of the |
964 | newly filled buffer (for backward context). Returns the number of new |
965 | bytes read from disk. |
966 | */ |
967 | |
968 | static int fill_buffer_retaining(fd,n) |
969 | File fd; |
970 | int n; |
971 | { |
972 | int i; |
973 | |
974 | /* See if we need to grow the buffer. */ |
975 | if ((int) bufalloc - n <= bufread) |
976 | { |
977 | while ((int) bufalloc - n <= bufread) |
978 | { |
979 | bufalloc *= 2; |
980 | bufread *= 2; |
981 | } |
982 | buffer = my_realloc(buffer, bufalloc+1, MYF(MY_WME)); |
983 | if (! buffer) |
984 | return(-1); |
985 | } |
986 | |
987 | /* Shift stuff down. */ |
988 | bmove(buffer,buffer+bufbytes-n,(uint) n); |
989 | bufbytes = n; |
990 | |
991 | if (my_eof) |
992 | return 0; |
993 | |
994 | /* Read in new stuff. */ |
995 | if ((i=(int) my_read(fd, (uchar*) buffer + bufbytes, |
996 | (size_t) bufread, MYF(MY_WME))) < 0) |
997 | return -1; |
998 | |
999 | /* Kludge to pretend every nonempty file ends with a newline. */ |
1000 | if (i == 0 && bufbytes > 0 && buffer[bufbytes - 1] != '\n') |
1001 | { |
1002 | my_eof = i = 1; |
1003 | buffer[bufbytes] = '\n'; |
1004 | } |
1005 | |
1006 | bufbytes += i; |
1007 | return i; |
1008 | } |
1009 | |
1010 | /* Return 0 if convert is ok */ |
1011 | /* Global variable update is set if something was changed */ |
1012 | |
1013 | static int convert_pipe(rep,in,out) |
1014 | REPLACE *rep; |
1015 | FILE *in,*out; |
1016 | { |
1017 | int retain,error; |
1018 | uint length; |
1019 | char save_char,*end_of_line,*start_of_line; |
1020 | DBUG_ENTER("convert_pipe" ); |
1021 | |
1022 | updated=retain=0; |
1023 | reset_buffer(); |
1024 | |
1025 | while ((error=fill_buffer_retaining(my_fileno(in),retain)) > 0) |
1026 | { |
1027 | end_of_line=buffer ; |
1028 | buffer[bufbytes]=0; /* Sentinel */ |
1029 | for (;;) |
1030 | { |
1031 | start_of_line=end_of_line; |
1032 | while (end_of_line[0] != '\n' && end_of_line[0]) |
1033 | end_of_line++; |
1034 | if (end_of_line == buffer+bufbytes) |
1035 | { |
1036 | retain= (int) (end_of_line - start_of_line); |
1037 | break; /* No end of line, read more */ |
1038 | } |
1039 | save_char=end_of_line[0]; |
1040 | end_of_line[0]=0; |
1041 | end_of_line++; |
1042 | if ((length=replace_strings(rep,&out_buff,&out_length,start_of_line)) == |
1043 | (uint) -1) |
1044 | return 1; |
1045 | if (!my_eof) |
1046 | out_buff[length++]=save_char; /* Don't write added newline */ |
1047 | if (my_fwrite(out, (uchar*) out_buff, length, MYF(MY_WME | MY_NABP))) |
1048 | DBUG_RETURN(1); |
1049 | } |
1050 | } |
1051 | DBUG_RETURN(error); |
1052 | } |
1053 | |
1054 | |
1055 | static int convert_file(REPLACE *rep, char * name) |
1056 | { |
1057 | int error; |
1058 | FILE *in,*out; |
1059 | char dir_buff[FN_REFLEN], tempname[FN_REFLEN], *org_name = name; |
1060 | #ifdef HAVE_READLINK |
1061 | char link_name[FN_REFLEN]; |
1062 | #endif |
1063 | File temp_file; |
1064 | size_t dir_buff_length; |
1065 | DBUG_ENTER("convert_file" ); |
1066 | |
1067 | /* check if name is a symlink */ |
1068 | #ifdef HAVE_READLINK |
1069 | org_name= (!my_disable_symlinks && |
1070 | !my_readlink(link_name, name, MYF(0))) ? link_name : name; |
1071 | #endif |
1072 | if (!(in= my_fopen(org_name,O_RDONLY,MYF(MY_WME)))) |
1073 | DBUG_RETURN(1); |
1074 | dirname_part(dir_buff, org_name, &dir_buff_length); |
1075 | if ((temp_file= create_temp_file(tempname, dir_buff, "PR" , 0, |
1076 | MYF(MY_WME))) < 0) |
1077 | { |
1078 | my_fclose(in,MYF(0)); |
1079 | DBUG_RETURN(1); |
1080 | } |
1081 | if (!(out= my_fdopen(temp_file, tempname, O_WRONLY, MYF(MY_WME)))) |
1082 | { |
1083 | my_fclose(in,MYF(0)); |
1084 | DBUG_RETURN(1); |
1085 | } |
1086 | |
1087 | error=convert_pipe(rep,in,out); |
1088 | my_fclose(in,MYF(0)); my_fclose(out,MYF(0)); |
1089 | |
1090 | if (updated && ! error) |
1091 | my_redel(org_name, tempname, 0, MYF(MY_WME | MY_LINK_WARNING)); |
1092 | else |
1093 | my_delete(tempname,MYF(MY_WME)); |
1094 | if (!silent && ! error) |
1095 | { |
1096 | if (updated) |
1097 | printf("%s converted\n" ,name); |
1098 | else if (verbose) |
1099 | printf("%s left unchanged\n" ,name); |
1100 | } |
1101 | DBUG_RETURN(error); |
1102 | } |
1103 | |