1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * regis.c |
4 | * Fast regex subset |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * |
8 | * |
9 | * IDENTIFICATION |
10 | * src/backend/tsearch/regis.c |
11 | * |
12 | *------------------------------------------------------------------------- |
13 | */ |
14 | |
15 | #include "postgres.h" |
16 | |
17 | #include "tsearch/dicts/regis.h" |
18 | #include "tsearch/ts_locale.h" |
19 | |
20 | #define RS_IN_ONEOF 1 |
21 | #define RS_IN_ONEOF_IN 2 |
22 | #define RS_IN_NONEOF 3 |
23 | #define RS_IN_WAIT 4 |
24 | |
25 | |
26 | /* |
27 | * Test whether a regex is of the subset supported here. |
28 | * Keep this in sync with RS_compile! |
29 | */ |
30 | bool |
31 | RS_isRegis(const char *str) |
32 | { |
33 | int state = RS_IN_WAIT; |
34 | const char *c = str; |
35 | |
36 | while (*c) |
37 | { |
38 | if (state == RS_IN_WAIT) |
39 | { |
40 | if (t_isalpha(c)) |
41 | /* okay */ ; |
42 | else if (t_iseq(c, '[')) |
43 | state = RS_IN_ONEOF; |
44 | else |
45 | return false; |
46 | } |
47 | else if (state == RS_IN_ONEOF) |
48 | { |
49 | if (t_iseq(c, '^')) |
50 | state = RS_IN_NONEOF; |
51 | else if (t_isalpha(c)) |
52 | state = RS_IN_ONEOF_IN; |
53 | else |
54 | return false; |
55 | } |
56 | else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) |
57 | { |
58 | if (t_isalpha(c)) |
59 | /* okay */ ; |
60 | else if (t_iseq(c, ']')) |
61 | state = RS_IN_WAIT; |
62 | else |
63 | return false; |
64 | } |
65 | else |
66 | elog(ERROR, "internal error in RS_isRegis: state %d" , state); |
67 | c += pg_mblen(c); |
68 | } |
69 | |
70 | return (state == RS_IN_WAIT); |
71 | } |
72 | |
73 | static RegisNode * |
74 | newRegisNode(RegisNode *prev, int len) |
75 | { |
76 | RegisNode *ptr; |
77 | |
78 | ptr = (RegisNode *) palloc0(RNHDRSZ + len + 1); |
79 | if (prev) |
80 | prev->next = ptr; |
81 | return ptr; |
82 | } |
83 | |
84 | void |
85 | RS_compile(Regis *r, bool issuffix, const char *str) |
86 | { |
87 | int len = strlen(str); |
88 | int state = RS_IN_WAIT; |
89 | const char *c = str; |
90 | RegisNode *ptr = NULL; |
91 | |
92 | memset(r, 0, sizeof(Regis)); |
93 | r->issuffix = (issuffix) ? 1 : 0; |
94 | |
95 | while (*c) |
96 | { |
97 | if (state == RS_IN_WAIT) |
98 | { |
99 | if (t_isalpha(c)) |
100 | { |
101 | if (ptr) |
102 | ptr = newRegisNode(ptr, len); |
103 | else |
104 | ptr = r->node = newRegisNode(NULL, len); |
105 | COPYCHAR(ptr->data, c); |
106 | ptr->type = RSF_ONEOF; |
107 | ptr->len = pg_mblen(c); |
108 | } |
109 | else if (t_iseq(c, '[')) |
110 | { |
111 | if (ptr) |
112 | ptr = newRegisNode(ptr, len); |
113 | else |
114 | ptr = r->node = newRegisNode(NULL, len); |
115 | ptr->type = RSF_ONEOF; |
116 | state = RS_IN_ONEOF; |
117 | } |
118 | else /* shouldn't get here */ |
119 | elog(ERROR, "invalid regis pattern: \"%s\"" , str); |
120 | } |
121 | else if (state == RS_IN_ONEOF) |
122 | { |
123 | if (t_iseq(c, '^')) |
124 | { |
125 | ptr->type = RSF_NONEOF; |
126 | state = RS_IN_NONEOF; |
127 | } |
128 | else if (t_isalpha(c)) |
129 | { |
130 | COPYCHAR(ptr->data, c); |
131 | ptr->len = pg_mblen(c); |
132 | state = RS_IN_ONEOF_IN; |
133 | } |
134 | else /* shouldn't get here */ |
135 | elog(ERROR, "invalid regis pattern: \"%s\"" , str); |
136 | } |
137 | else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) |
138 | { |
139 | if (t_isalpha(c)) |
140 | { |
141 | COPYCHAR(ptr->data + ptr->len, c); |
142 | ptr->len += pg_mblen(c); |
143 | } |
144 | else if (t_iseq(c, ']')) |
145 | state = RS_IN_WAIT; |
146 | else /* shouldn't get here */ |
147 | elog(ERROR, "invalid regis pattern: \"%s\"" , str); |
148 | } |
149 | else |
150 | elog(ERROR, "internal error in RS_compile: state %d" , state); |
151 | c += pg_mblen(c); |
152 | } |
153 | |
154 | if (state != RS_IN_WAIT) /* shouldn't get here */ |
155 | elog(ERROR, "invalid regis pattern: \"%s\"" , str); |
156 | |
157 | ptr = r->node; |
158 | while (ptr) |
159 | { |
160 | r->nchar++; |
161 | ptr = ptr->next; |
162 | } |
163 | } |
164 | |
165 | void |
166 | RS_free(Regis *r) |
167 | { |
168 | RegisNode *ptr = r->node, |
169 | *tmp; |
170 | |
171 | while (ptr) |
172 | { |
173 | tmp = ptr->next; |
174 | pfree(ptr); |
175 | ptr = tmp; |
176 | } |
177 | |
178 | r->node = NULL; |
179 | } |
180 | |
181 | static bool |
182 | mb_strchr(char *str, char *c) |
183 | { |
184 | int clen, |
185 | plen, |
186 | i; |
187 | char *ptr = str; |
188 | bool res = false; |
189 | |
190 | clen = pg_mblen(c); |
191 | while (*ptr && !res) |
192 | { |
193 | plen = pg_mblen(ptr); |
194 | if (plen == clen) |
195 | { |
196 | i = plen; |
197 | res = true; |
198 | while (i--) |
199 | if (*(ptr + i) != *(c + i)) |
200 | { |
201 | res = false; |
202 | break; |
203 | } |
204 | } |
205 | |
206 | ptr += plen; |
207 | } |
208 | |
209 | return res; |
210 | } |
211 | |
212 | bool |
213 | RS_execute(Regis *r, char *str) |
214 | { |
215 | RegisNode *ptr = r->node; |
216 | char *c = str; |
217 | int len = 0; |
218 | |
219 | while (*c) |
220 | { |
221 | len++; |
222 | c += pg_mblen(c); |
223 | } |
224 | |
225 | if (len < r->nchar) |
226 | return 0; |
227 | |
228 | c = str; |
229 | if (r->issuffix) |
230 | { |
231 | len -= r->nchar; |
232 | while (len-- > 0) |
233 | c += pg_mblen(c); |
234 | } |
235 | |
236 | |
237 | while (ptr) |
238 | { |
239 | switch (ptr->type) |
240 | { |
241 | case RSF_ONEOF: |
242 | if (!mb_strchr((char *) ptr->data, c)) |
243 | return false; |
244 | break; |
245 | case RSF_NONEOF: |
246 | if (mb_strchr((char *) ptr->data, c)) |
247 | return false; |
248 | break; |
249 | default: |
250 | elog(ERROR, "unrecognized regis node type: %d" , ptr->type); |
251 | } |
252 | ptr = ptr->next; |
253 | c += pg_mblen(c); |
254 | } |
255 | |
256 | return true; |
257 | } |
258 | |