| 1 | // Copyright 2014 Paul Sokolovsky. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | #include "re1.5.h" |
| 6 | |
| 7 | #define INSERT_CODE(at, num, pc) \ |
| 8 | ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num) |
| 9 | #define REL(at, to) (to - at - 2) |
| 10 | #define EMIT(at, byte) (code ? (code[at] = byte) : (at)) |
| 11 | #define EMIT_CHECKED(at, byte) (_emit_checked(at, code, byte, &err)) |
| 12 | #define PC (prog->bytelen) |
| 13 | |
| 14 | static void _emit_checked(int at, char *code, int val, bool *err) { |
| 15 | *err |= val != (int8_t)val; |
| 16 | if (code) { |
| 17 | code[at] = val; |
| 18 | } |
| 19 | } |
| 20 | |
| 21 | static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) |
| 22 | { |
| 23 | char *code = sizecode ? NULL : prog->insts; |
| 24 | bool err = false; |
| 25 | int start = PC; |
| 26 | int term = PC; |
| 27 | int alt_label = 0; |
| 28 | |
| 29 | for (; *re && *re != ')'; re++) { |
| 30 | switch (*re) { |
| 31 | case '\\': |
| 32 | re++; |
| 33 | if (!*re) return NULL; // Trailing backslash |
| 34 | if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') { |
| 35 | term = PC; |
| 36 | EMIT(PC++, NamedClass); |
| 37 | EMIT(PC++, *re); |
| 38 | prog->len++; |
| 39 | break; |
| 40 | } |
| 41 | MP_FALLTHROUGH |
| 42 | default: |
| 43 | term = PC; |
| 44 | EMIT(PC++, Char); |
| 45 | EMIT(PC++, *re); |
| 46 | prog->len++; |
| 47 | break; |
| 48 | case '.': |
| 49 | term = PC; |
| 50 | EMIT(PC++, Any); |
| 51 | prog->len++; |
| 52 | break; |
| 53 | case '[': { |
| 54 | int cnt; |
| 55 | term = PC; |
| 56 | re++; |
| 57 | if (*re == '^') { |
| 58 | EMIT(PC++, ClassNot); |
| 59 | re++; |
| 60 | } else { |
| 61 | EMIT(PC++, Class); |
| 62 | } |
| 63 | PC++; // Skip # of pair byte |
| 64 | prog->len++; |
| 65 | for (cnt = 0; *re != ']'; re++, cnt++) { |
| 66 | if (*re == '\\') { |
| 67 | ++re; |
| 68 | } |
| 69 | if (!*re) return NULL; |
| 70 | EMIT(PC++, *re); |
| 71 | if (re[1] == '-' && re[2] != ']') { |
| 72 | re += 2; |
| 73 | } |
| 74 | EMIT(PC++, *re); |
| 75 | } |
| 76 | EMIT_CHECKED(term + 1, cnt); |
| 77 | break; |
| 78 | } |
| 79 | case '(': { |
| 80 | term = PC; |
| 81 | int sub = 0; |
| 82 | int capture = re[1] != '?' || re[2] != ':'; |
| 83 | |
| 84 | if (capture) { |
| 85 | sub = ++prog->sub; |
| 86 | EMIT(PC++, Save); |
| 87 | EMIT_CHECKED(PC++, 2 * sub); |
| 88 | prog->len++; |
| 89 | } else { |
| 90 | re += 2; |
| 91 | } |
| 92 | |
| 93 | re = _compilecode(re + 1, prog, sizecode); |
| 94 | if (re == NULL || *re != ')') return NULL; // error, or no matching paren |
| 95 | |
| 96 | if (capture) { |
| 97 | EMIT(PC++, Save); |
| 98 | EMIT_CHECKED(PC++, 2 * sub + 1); |
| 99 | prog->len++; |
| 100 | } |
| 101 | |
| 102 | break; |
| 103 | } |
| 104 | case '?': |
| 105 | if (PC == term) return NULL; // nothing to repeat |
| 106 | INSERT_CODE(term, 2, PC); |
| 107 | if (re[1] == '?') { |
| 108 | EMIT(term, RSplit); |
| 109 | re++; |
| 110 | } else { |
| 111 | EMIT(term, Split); |
| 112 | } |
| 113 | EMIT_CHECKED(term + 1, REL(term, PC)); |
| 114 | prog->len++; |
| 115 | term = PC; |
| 116 | break; |
| 117 | case '*': |
| 118 | if (PC == term) return NULL; // nothing to repeat |
| 119 | INSERT_CODE(term, 2, PC); |
| 120 | EMIT(PC, Jmp); |
| 121 | EMIT_CHECKED(PC + 1, REL(PC, term)); |
| 122 | PC += 2; |
| 123 | if (re[1] == '?') { |
| 124 | EMIT(term, RSplit); |
| 125 | re++; |
| 126 | } else { |
| 127 | EMIT(term, Split); |
| 128 | } |
| 129 | EMIT_CHECKED(term + 1, REL(term, PC)); |
| 130 | prog->len += 2; |
| 131 | term = PC; |
| 132 | break; |
| 133 | case '+': |
| 134 | if (PC == term) return NULL; // nothing to repeat |
| 135 | if (re[1] == '?') { |
| 136 | EMIT(PC, Split); |
| 137 | re++; |
| 138 | } else { |
| 139 | EMIT(PC, RSplit); |
| 140 | } |
| 141 | EMIT_CHECKED(PC + 1, REL(PC, term)); |
| 142 | PC += 2; |
| 143 | prog->len++; |
| 144 | term = PC; |
| 145 | break; |
| 146 | case '|': |
| 147 | if (alt_label) { |
| 148 | EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1); |
| 149 | } |
| 150 | INSERT_CODE(start, 2, PC); |
| 151 | EMIT(PC++, Jmp); |
| 152 | alt_label = PC++; |
| 153 | EMIT(start, Split); |
| 154 | EMIT_CHECKED(start + 1, REL(start, PC)); |
| 155 | prog->len += 2; |
| 156 | term = PC; |
| 157 | break; |
| 158 | case '^': |
| 159 | EMIT(PC++, Bol); |
| 160 | prog->len++; |
| 161 | term = PC; |
| 162 | break; |
| 163 | case '$': |
| 164 | EMIT(PC++, Eol); |
| 165 | prog->len++; |
| 166 | term = PC; |
| 167 | break; |
| 168 | } |
| 169 | } |
| 170 | |
| 171 | if (alt_label) { |
| 172 | EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1); |
| 173 | } |
| 174 | return err ? NULL : re; |
| 175 | } |
| 176 | |
| 177 | int re1_5_sizecode(const char *re) |
| 178 | { |
| 179 | ByteProg dummyprog = { |
| 180 | // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code |
| 181 | .bytelen = 5 + NON_ANCHORED_PREFIX |
| 182 | }; |
| 183 | |
| 184 | if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1; |
| 185 | |
| 186 | return dummyprog.bytelen; |
| 187 | } |
| 188 | |
| 189 | int re1_5_compilecode(ByteProg *prog, const char *re) |
| 190 | { |
| 191 | prog->len = 0; |
| 192 | prog->bytelen = 0; |
| 193 | prog->sub = 0; |
| 194 | |
| 195 | // Add code to implement non-anchored operation ("search"), |
| 196 | // for anchored operation ("match"), this code will be just skipped. |
| 197 | // TODO: Implement search in much more efficient manner |
| 198 | prog->insts[prog->bytelen++] = RSplit; |
| 199 | prog->insts[prog->bytelen++] = 3; |
| 200 | prog->insts[prog->bytelen++] = Any; |
| 201 | prog->insts[prog->bytelen++] = Jmp; |
| 202 | prog->insts[prog->bytelen++] = -5; |
| 203 | prog->len += 3; |
| 204 | |
| 205 | prog->insts[prog->bytelen++] = Save; |
| 206 | prog->insts[prog->bytelen++] = 0; |
| 207 | prog->len++; |
| 208 | |
| 209 | re = _compilecode(re, prog, /*sizecode*/0); |
| 210 | if (re == NULL || *re) return 1; |
| 211 | |
| 212 | prog->insts[prog->bytelen++] = Save; |
| 213 | prog->insts[prog->bytelen++] = 1; |
| 214 | prog->len++; |
| 215 | |
| 216 | prog->insts[prog->bytelen++] = Match; |
| 217 | prog->len++; |
| 218 | |
| 219 | return 0; |
| 220 | } |
| 221 | |
| 222 | #if 0 |
| 223 | int main(int argc, char *argv[]) |
| 224 | { |
| 225 | int pc = 0; |
| 226 | ByteProg *code = re1_5_compilecode(argv[1]); |
| 227 | re1_5_dumpcode(code); |
| 228 | } |
| 229 | #endif |
| 230 | |