1/*-------------------------------------------------------------------------
2 *
3 * dict_snowball.c
4 * Snowball dictionary
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/backend/snowball/dict_snowball.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "postgres.h"
14
15#include "commands/defrem.h"
16#include "tsearch/ts_locale.h"
17#include "tsearch/ts_utils.h"
18
19/* Some platforms define MAXINT and/or MININT, causing conflicts */
20#ifdef MAXINT
21#undef MAXINT
22#endif
23#ifdef MININT
24#undef MININT
25#endif
26
27/* Now we can include the original Snowball header.h */
28#include "snowball/libstemmer/header.h"
29#include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
30#include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
31#include "snowball/libstemmer/stem_ISO_8859_1_english.h"
32#include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
33#include "snowball/libstemmer/stem_ISO_8859_1_french.h"
34#include "snowball/libstemmer/stem_ISO_8859_1_german.h"
35#include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
36#include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
37#include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
38#include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
39#include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
40#include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
41#include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
42#include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
43#include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
44#include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
45#include "snowball/libstemmer/stem_KOI8_R_russian.h"
46#include "snowball/libstemmer/stem_UTF_8_arabic.h"
47#include "snowball/libstemmer/stem_UTF_8_danish.h"
48#include "snowball/libstemmer/stem_UTF_8_dutch.h"
49#include "snowball/libstemmer/stem_UTF_8_english.h"
50#include "snowball/libstemmer/stem_UTF_8_finnish.h"
51#include "snowball/libstemmer/stem_UTF_8_french.h"
52#include "snowball/libstemmer/stem_UTF_8_german.h"
53#include "snowball/libstemmer/stem_UTF_8_hungarian.h"
54#include "snowball/libstemmer/stem_UTF_8_indonesian.h"
55#include "snowball/libstemmer/stem_UTF_8_irish.h"
56#include "snowball/libstemmer/stem_UTF_8_italian.h"
57#include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
58#include "snowball/libstemmer/stem_UTF_8_nepali.h"
59#include "snowball/libstemmer/stem_UTF_8_norwegian.h"
60#include "snowball/libstemmer/stem_UTF_8_porter.h"
61#include "snowball/libstemmer/stem_UTF_8_portuguese.h"
62#include "snowball/libstemmer/stem_UTF_8_romanian.h"
63#include "snowball/libstemmer/stem_UTF_8_russian.h"
64#include "snowball/libstemmer/stem_UTF_8_spanish.h"
65#include "snowball/libstemmer/stem_UTF_8_swedish.h"
66#include "snowball/libstemmer/stem_UTF_8_tamil.h"
67#include "snowball/libstemmer/stem_UTF_8_turkish.h"
68
/* Magic block identifying this file as a loadable PostgreSQL module. */
PG_MODULE_MAGIC;

/* Version-1 call convention declarations for the two entry points below. */
PG_FUNCTION_INFO_V1(dsnowball_init);
PG_FUNCTION_INFO_V1(dsnowball_lexize);
74
75/* List of supported modules */
76typedef struct stemmer_module
77{
78 const char *name;
79 pg_enc enc;
80 struct SN_env *(*create) (void);
81 void (*close) (struct SN_env *);
82 int (*stem) (struct SN_env *);
83} stemmer_module;
84
/*
 * Build one stemmer_module initializer.
 *
 *	lang	- stemmer name; becomes both the string name and a symbol prefix
 *	pgenc	- PostgreSQL code for the encoding
 *	snowenc - Snowball's name for the encoding, as it appears in symbol names
 *
 * For example, STEMMER_MODULE(danish, PG_UTF8, UTF_8) yields
 * {"danish", PG_UTF8, danish_UTF_8_create_env, danish_UTF_8_close_env,
 * danish_UTF_8_stem}.
 */
#define STEMMER_MODULE(lang, pgenc, snowenc) \
	{ \
		#lang, \
		pgenc, \
		lang##_##snowenc##_create_env, \
		lang##_##snowenc##_close_env, \
		lang##_##snowenc##_stem \
	}
88
89static const stemmer_module stemmer_modules[] =
90{
91 /*
92 * Stemmers list from Snowball distribution
93 */
94 STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
95 STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
96 STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
97 STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
98 STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
99 STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
100 STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
101 STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
102 STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
103 STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
104 STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
105 STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
106 STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
107 STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
108 STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
109 STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
110 STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
111 STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
112 STEMMER_MODULE(danish, PG_UTF8, UTF_8),
113 STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
114 STEMMER_MODULE(english, PG_UTF8, UTF_8),
115 STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
116 STEMMER_MODULE(french, PG_UTF8, UTF_8),
117 STEMMER_MODULE(german, PG_UTF8, UTF_8),
118 STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
119 STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
120 STEMMER_MODULE(irish, PG_UTF8, UTF_8),
121 STEMMER_MODULE(italian, PG_UTF8, UTF_8),
122 STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
123 STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
124 STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
125 STEMMER_MODULE(porter, PG_UTF8, UTF_8),
126 STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
127 STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
128 STEMMER_MODULE(russian, PG_UTF8, UTF_8),
129 STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
130 STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
131 STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
132 STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
133
134 /*
135 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
136 * encoding
137 */
138 STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
139
140 {NULL, 0, NULL, NULL, NULL} /* list end marker */
141};
142
143
144typedef struct DictSnowball
145{
146 struct SN_env *z;
147 StopList stoplist;
148 bool needrecode; /* needs recoding before/after call stem */
149 int (*stem) (struct SN_env *z);
150
151 /*
152 * snowball saves alloced memory between calls, so we should run it in our
153 * private memory context. Note, init function is executed in long lived
154 * context, so we just remember CurrentMemoryContext
155 */
156 MemoryContext dictCtx;
157} DictSnowball;
158
159
160static void
161locate_stem_module(DictSnowball *d, const char *lang)
162{
163 const stemmer_module *m;
164
165 /*
166 * First, try to find exact match of stemmer module. Stemmer with
167 * PG_SQL_ASCII encoding is treated as working with any server encoding
168 */
169 for (m = stemmer_modules; m->name; m++)
170 {
171 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
172 pg_strcasecmp(m->name, lang) == 0)
173 {
174 d->stem = m->stem;
175 d->z = m->create();
176 d->needrecode = false;
177 return;
178 }
179 }
180
181 /*
182 * Second, try to find stemmer for needed language for UTF8 encoding.
183 */
184 for (m = stemmer_modules; m->name; m++)
185 {
186 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
187 {
188 d->stem = m->stem;
189 d->z = m->create();
190 d->needrecode = true;
191 return;
192 }
193 }
194
195 ereport(ERROR,
196 (errcode(ERRCODE_UNDEFINED_OBJECT),
197 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
198 lang, GetDatabaseEncodingName())));
199}
200
201Datum
202dsnowball_init(PG_FUNCTION_ARGS)
203{
204 List *dictoptions = (List *) PG_GETARG_POINTER(0);
205 DictSnowball *d;
206 bool stoploaded = false;
207 ListCell *l;
208
209 d = (DictSnowball *) palloc0(sizeof(DictSnowball));
210
211 foreach(l, dictoptions)
212 {
213 DefElem *defel = (DefElem *) lfirst(l);
214
215 if (strcmp(defel->defname, "stopwords") == 0)
216 {
217 if (stoploaded)
218 ereport(ERROR,
219 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
220 errmsg("multiple StopWords parameters")));
221 readstoplist(defGetString(defel), &d->stoplist, lowerstr);
222 stoploaded = true;
223 }
224 else if (strcmp(defel->defname, "language") == 0)
225 {
226 if (d->stem)
227 ereport(ERROR,
228 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
229 errmsg("multiple Language parameters")));
230 locate_stem_module(d, defGetString(defel));
231 }
232 else
233 {
234 ereport(ERROR,
235 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
236 errmsg("unrecognized Snowball parameter: \"%s\"",
237 defel->defname)));
238 }
239 }
240
241 if (!d->stem)
242 ereport(ERROR,
243 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
244 errmsg("missing Language parameter")));
245
246 d->dictCtx = CurrentMemoryContext;
247
248 PG_RETURN_POINTER(d);
249}
250
251Datum
252dsnowball_lexize(PG_FUNCTION_ARGS)
253{
254 DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
255 char *in = (char *) PG_GETARG_POINTER(1);
256 int32 len = PG_GETARG_INT32(2);
257 char *txt = lowerstr_with_len(in, len);
258 TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
259
260 if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
261 {
262 pfree(txt);
263 }
264 else
265 {
266 MemoryContext saveCtx;
267
268 /*
269 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
270 */
271 if (d->needrecode)
272 {
273 char *recoded;
274
275 recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
276 if (recoded != txt)
277 {
278 pfree(txt);
279 txt = recoded;
280 }
281 }
282
283 /* see comment about d->dictCtx */
284 saveCtx = MemoryContextSwitchTo(d->dictCtx);
285 SN_set_current(d->z, strlen(txt), (symbol *) txt);
286 d->stem(d->z);
287 MemoryContextSwitchTo(saveCtx);
288
289 if (d->z->p && d->z->l)
290 {
291 txt = repalloc(txt, d->z->l + 1);
292 memcpy(txt, d->z->p, d->z->l);
293 txt[d->z->l] = '\0';
294 }
295
296 /* back recode if needed */
297 if (d->needrecode)
298 {
299 char *recoded;
300
301 recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
302 if (recoded != txt)
303 {
304 pfree(txt);
305 txt = recoded;
306 }
307 }
308
309 res->lexeme = txt;
310 }
311
312 PG_RETURN_POINTER(res);
313}
314