| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * regis.c |
| 4 | * Fast regex subset |
| 5 | * |
| 6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 7 | * |
| 8 | * |
| 9 | * IDENTIFICATION |
| 10 | * src/backend/tsearch/regis.c |
| 11 | * |
| 12 | *------------------------------------------------------------------------- |
| 13 | */ |
| 14 | |
| 15 | #include "postgres.h" |
| 16 | |
| 17 | #include "tsearch/dicts/regis.h" |
| 18 | #include "tsearch/ts_locale.h" |
| 19 | |
| 20 | #define RS_IN_ONEOF 1 |
| 21 | #define RS_IN_ONEOF_IN 2 |
| 22 | #define RS_IN_NONEOF 3 |
| 23 | #define RS_IN_WAIT 4 |
| 24 | |
| 25 | |
| 26 | /* |
| 27 | * Test whether a regex is of the subset supported here. |
| 28 | * Keep this in sync with RS_compile! |
| 29 | */ |
| 30 | bool |
| 31 | RS_isRegis(const char *str) |
| 32 | { |
| 33 | int state = RS_IN_WAIT; |
| 34 | const char *c = str; |
| 35 | |
| 36 | while (*c) |
| 37 | { |
| 38 | if (state == RS_IN_WAIT) |
| 39 | { |
| 40 | if (t_isalpha(c)) |
| 41 | /* okay */ ; |
| 42 | else if (t_iseq(c, '[')) |
| 43 | state = RS_IN_ONEOF; |
| 44 | else |
| 45 | return false; |
| 46 | } |
| 47 | else if (state == RS_IN_ONEOF) |
| 48 | { |
| 49 | if (t_iseq(c, '^')) |
| 50 | state = RS_IN_NONEOF; |
| 51 | else if (t_isalpha(c)) |
| 52 | state = RS_IN_ONEOF_IN; |
| 53 | else |
| 54 | return false; |
| 55 | } |
| 56 | else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) |
| 57 | { |
| 58 | if (t_isalpha(c)) |
| 59 | /* okay */ ; |
| 60 | else if (t_iseq(c, ']')) |
| 61 | state = RS_IN_WAIT; |
| 62 | else |
| 63 | return false; |
| 64 | } |
| 65 | else |
| 66 | elog(ERROR, "internal error in RS_isRegis: state %d" , state); |
| 67 | c += pg_mblen(c); |
| 68 | } |
| 69 | |
| 70 | return (state == RS_IN_WAIT); |
| 71 | } |
| 72 | |
| 73 | static RegisNode * |
| 74 | newRegisNode(RegisNode *prev, int len) |
| 75 | { |
| 76 | RegisNode *ptr; |
| 77 | |
| 78 | ptr = (RegisNode *) palloc0(RNHDRSZ + len + 1); |
| 79 | if (prev) |
| 80 | prev->next = ptr; |
| 81 | return ptr; |
| 82 | } |
| 83 | |
| 84 | void |
| 85 | RS_compile(Regis *r, bool issuffix, const char *str) |
| 86 | { |
| 87 | int len = strlen(str); |
| 88 | int state = RS_IN_WAIT; |
| 89 | const char *c = str; |
| 90 | RegisNode *ptr = NULL; |
| 91 | |
| 92 | memset(r, 0, sizeof(Regis)); |
| 93 | r->issuffix = (issuffix) ? 1 : 0; |
| 94 | |
| 95 | while (*c) |
| 96 | { |
| 97 | if (state == RS_IN_WAIT) |
| 98 | { |
| 99 | if (t_isalpha(c)) |
| 100 | { |
| 101 | if (ptr) |
| 102 | ptr = newRegisNode(ptr, len); |
| 103 | else |
| 104 | ptr = r->node = newRegisNode(NULL, len); |
| 105 | COPYCHAR(ptr->data, c); |
| 106 | ptr->type = RSF_ONEOF; |
| 107 | ptr->len = pg_mblen(c); |
| 108 | } |
| 109 | else if (t_iseq(c, '[')) |
| 110 | { |
| 111 | if (ptr) |
| 112 | ptr = newRegisNode(ptr, len); |
| 113 | else |
| 114 | ptr = r->node = newRegisNode(NULL, len); |
| 115 | ptr->type = RSF_ONEOF; |
| 116 | state = RS_IN_ONEOF; |
| 117 | } |
| 118 | else /* shouldn't get here */ |
| 119 | elog(ERROR, "invalid regis pattern: \"%s\"" , str); |
| 120 | } |
| 121 | else if (state == RS_IN_ONEOF) |
| 122 | { |
| 123 | if (t_iseq(c, '^')) |
| 124 | { |
| 125 | ptr->type = RSF_NONEOF; |
| 126 | state = RS_IN_NONEOF; |
| 127 | } |
| 128 | else if (t_isalpha(c)) |
| 129 | { |
| 130 | COPYCHAR(ptr->data, c); |
| 131 | ptr->len = pg_mblen(c); |
| 132 | state = RS_IN_ONEOF_IN; |
| 133 | } |
| 134 | else /* shouldn't get here */ |
| 135 | elog(ERROR, "invalid regis pattern: \"%s\"" , str); |
| 136 | } |
| 137 | else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) |
| 138 | { |
| 139 | if (t_isalpha(c)) |
| 140 | { |
| 141 | COPYCHAR(ptr->data + ptr->len, c); |
| 142 | ptr->len += pg_mblen(c); |
| 143 | } |
| 144 | else if (t_iseq(c, ']')) |
| 145 | state = RS_IN_WAIT; |
| 146 | else /* shouldn't get here */ |
| 147 | elog(ERROR, "invalid regis pattern: \"%s\"" , str); |
| 148 | } |
| 149 | else |
| 150 | elog(ERROR, "internal error in RS_compile: state %d" , state); |
| 151 | c += pg_mblen(c); |
| 152 | } |
| 153 | |
| 154 | if (state != RS_IN_WAIT) /* shouldn't get here */ |
| 155 | elog(ERROR, "invalid regis pattern: \"%s\"" , str); |
| 156 | |
| 157 | ptr = r->node; |
| 158 | while (ptr) |
| 159 | { |
| 160 | r->nchar++; |
| 161 | ptr = ptr->next; |
| 162 | } |
| 163 | } |
| 164 | |
| 165 | void |
| 166 | RS_free(Regis *r) |
| 167 | { |
| 168 | RegisNode *ptr = r->node, |
| 169 | *tmp; |
| 170 | |
| 171 | while (ptr) |
| 172 | { |
| 173 | tmp = ptr->next; |
| 174 | pfree(ptr); |
| 175 | ptr = tmp; |
| 176 | } |
| 177 | |
| 178 | r->node = NULL; |
| 179 | } |
| 180 | |
| 181 | static bool |
| 182 | mb_strchr(char *str, char *c) |
| 183 | { |
| 184 | int clen, |
| 185 | plen, |
| 186 | i; |
| 187 | char *ptr = str; |
| 188 | bool res = false; |
| 189 | |
| 190 | clen = pg_mblen(c); |
| 191 | while (*ptr && !res) |
| 192 | { |
| 193 | plen = pg_mblen(ptr); |
| 194 | if (plen == clen) |
| 195 | { |
| 196 | i = plen; |
| 197 | res = true; |
| 198 | while (i--) |
| 199 | if (*(ptr + i) != *(c + i)) |
| 200 | { |
| 201 | res = false; |
| 202 | break; |
| 203 | } |
| 204 | } |
| 205 | |
| 206 | ptr += plen; |
| 207 | } |
| 208 | |
| 209 | return res; |
| 210 | } |
| 211 | |
| 212 | bool |
| 213 | RS_execute(Regis *r, char *str) |
| 214 | { |
| 215 | RegisNode *ptr = r->node; |
| 216 | char *c = str; |
| 217 | int len = 0; |
| 218 | |
| 219 | while (*c) |
| 220 | { |
| 221 | len++; |
| 222 | c += pg_mblen(c); |
| 223 | } |
| 224 | |
| 225 | if (len < r->nchar) |
| 226 | return 0; |
| 227 | |
| 228 | c = str; |
| 229 | if (r->issuffix) |
| 230 | { |
| 231 | len -= r->nchar; |
| 232 | while (len-- > 0) |
| 233 | c += pg_mblen(c); |
| 234 | } |
| 235 | |
| 236 | |
| 237 | while (ptr) |
| 238 | { |
| 239 | switch (ptr->type) |
| 240 | { |
| 241 | case RSF_ONEOF: |
| 242 | if (!mb_strchr((char *) ptr->data, c)) |
| 243 | return false; |
| 244 | break; |
| 245 | case RSF_NONEOF: |
| 246 | if (mb_strchr((char *) ptr->data, c)) |
| 247 | return false; |
| 248 | break; |
| 249 | default: |
| 250 | elog(ERROR, "unrecognized regis node type: %d" , ptr->type); |
| 251 | } |
| 252 | ptr = ptr->next; |
| 253 | c += pg_mblen(c); |
| 254 | } |
| 255 | |
| 256 | return true; |
| 257 | } |
| 258 | |