dict_snowball.c source code [PostgreSQL/src/backend/snowball/dict_snowball.c]

/*-------------------------------------------------------------------------
 *
 * dict_snowball.c
 *		Snowball dictionary
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/snowball/dict_snowball.c
 *
 *-------------------------------------------------------------------------
 */
13	#include "postgres.h"
14
15	#include "commands/defrem.h"
16	#include "tsearch/ts_locale.h"
17	#include "tsearch/ts_utils.h"
18
/* Some platforms define MAXINT and/or MININT, causing conflicts */
20	#ifdef MAXINT
21	#undef MAXINT
22	#endif
23	#ifdef MININT
24	#undef MININT
25	#endif
26
/* Now we can include the original Snowball header.h */
28	#include "snowball/libstemmer/header.h"
29	#include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
30	#include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
31	#include "snowball/libstemmer/stem_ISO_8859_1_english.h"
32	#include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
33	#include "snowball/libstemmer/stem_ISO_8859_1_french.h"
34	#include "snowball/libstemmer/stem_ISO_8859_1_german.h"
35	#include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
36	#include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
37	#include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
38	#include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
39	#include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
40	#include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
41	#include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
42	#include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
43	#include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
44	#include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
45	#include "snowball/libstemmer/stem_KOI8_R_russian.h"
46	#include "snowball/libstemmer/stem_UTF_8_arabic.h"
47	#include "snowball/libstemmer/stem_UTF_8_danish.h"
48	#include "snowball/libstemmer/stem_UTF_8_dutch.h"
49	#include "snowball/libstemmer/stem_UTF_8_english.h"
50	#include "snowball/libstemmer/stem_UTF_8_finnish.h"
51	#include "snowball/libstemmer/stem_UTF_8_french.h"
52	#include "snowball/libstemmer/stem_UTF_8_german.h"
53	#include "snowball/libstemmer/stem_UTF_8_hungarian.h"
54	#include "snowball/libstemmer/stem_UTF_8_indonesian.h"
55	#include "snowball/libstemmer/stem_UTF_8_irish.h"
56	#include "snowball/libstemmer/stem_UTF_8_italian.h"
57	#include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
58	#include "snowball/libstemmer/stem_UTF_8_nepali.h"
59	#include "snowball/libstemmer/stem_UTF_8_norwegian.h"
60	#include "snowball/libstemmer/stem_UTF_8_porter.h"
61	#include "snowball/libstemmer/stem_UTF_8_portuguese.h"
62	#include "snowball/libstemmer/stem_UTF_8_romanian.h"
63	#include "snowball/libstemmer/stem_UTF_8_russian.h"
64	#include "snowball/libstemmer/stem_UTF_8_spanish.h"
65	#include "snowball/libstemmer/stem_UTF_8_swedish.h"
66	#include "snowball/libstemmer/stem_UTF_8_tamil.h"
67	#include "snowball/libstemmer/stem_UTF_8_turkish.h"
68
/* Module magic block for this loadable library. */
PG_MODULE_MAGIC;

/* Function-info records for the two entry points defined in this file. */
PG_FUNCTION_INFO_V1(dsnowball_init);

PG_FUNCTION_INFO_V1(dsnowball_lexize);
74
75	/ List of supported modules /
76	typedef struct stemmer_module
77	{
78	const char *name;
79	pg_enc enc;
80	struct SN_env (create) (void);
81	void (close) (struct* SN_env *);
82	int (stem) (struct* SN_env *);
83	} stemmer_module;
84
/*
 * Build one stemmer_module initializer.
 *
 * Args: stemmer name, PG code for encoding, Snowball's name for encoding
 *
 * The stemmer name is stringified for the lookup key, and the three entry
 * points are formed by token pasting: <name>_<senc>_create_env,
 * <name>_<senc>_close_env and <name>_<senc>_stem.
 */
#define STEMMER_MODULE(name,enc,senc) \
	{#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
88
89	static const stemmer_module stemmer_modules[] =
90	{
91	/*
92	* Stemmers list from Snowball distribution
93	*/
94	STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
95	STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
96	STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
97	STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
98	STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
99	STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
100	STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
101	STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
102	STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
103	STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
104	STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
105	STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
106	STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
107	STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
108	STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
109	STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
110	STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
111	STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
112	STEMMER_MODULE(danish, PG_UTF8, UTF_8),
113	STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
114	STEMMER_MODULE(english, PG_UTF8, UTF_8),
115	STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
116	STEMMER_MODULE(french, PG_UTF8, UTF_8),
117	STEMMER_MODULE(german, PG_UTF8, UTF_8),
118	STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
119	STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
120	STEMMER_MODULE(irish, PG_UTF8, UTF_8),
121	STEMMER_MODULE(italian, PG_UTF8, UTF_8),
122	STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
123	STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
124	STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
125	STEMMER_MODULE(porter, PG_UTF8, UTF_8),
126	STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
127	STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
128	STEMMER_MODULE(russian, PG_UTF8, UTF_8),
129	STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
130	STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
131	STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
132	STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
133
134	/*
135	* Stemmer with PG_SQL_ASCII encoding should be valid for any server
136	* encoding
137	*/
138	STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
139
140	{NULL, `0`, NULL, NULL, NULL} / list end marker /
141	};
142
143
144	typedef struct DictSnowball
145	{
146	struct SN_env *z;
147	StopList stoplist;
148	bool needrecode; / needs recoding before/after call stem /
149	int (stem) (struct* SN_env *z);
150
151	/*
152	* snowball saves alloced memory between calls, so we should run it in our
153	* private memory context. Note, init function is executed in long lived
154	* context, so we just remember CurrentMemoryContext
155	*/
156	MemoryContext dictCtx;
157	} DictSnowball;
158
159
160	static void
161	locate_stem_module(DictSnowball d, const* char *lang)
162	{
163	const stemmer_module *m;
164
165	/*
166	* First, try to find exact match of stemmer module. Stemmer with
167	* PG_SQL_ASCII encoding is treated as working with any server encoding
168	*/
169	for (m = stemmer_modules; m->name; m++)
170	{
171	if ((m->enc == PG_SQL_ASCII \|\| m->enc == GetDatabaseEncoding()) &&
172	pg_strcasecmp(m->name, lang) == `0`)
173	{
174	d->stem = m->stem;
175	d->z = m->create();
176	d->needrecode = false;
177	return;
178	}
179	}
180
181	/*
182	* Second, try to find stemmer for needed language for UTF8 encoding.
183	*/
184	for (m = stemmer_modules; m->name; m++)
185	{
186	if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == `0`)
187	{
188	d->stem = m->stem;
189	d->z = m->create();
190	d->needrecode = true;
191	return;
192	}
193	}
194
195	ereport(ERROR,
196	(errcode(ERRCODE_UNDEFINED_OBJECT),
197	errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
198	lang, GetDatabaseEncodingName())));
199	}
200
201	Datum
202	dsnowball_init(PG_FUNCTION_ARGS)
203	{
204	List dictoptions = (List ) PG_GETARG_POINTER(`0`);
205	DictSnowball *d;
206	bool stoploaded = false;
207	ListCell *l;
208
209	d = (DictSnowball ) palloc0(sizeof*(DictSnowball));
210
211	foreach(l, dictoptions)
212	{
213	DefElem defel = (DefElem ) lfirst(l);
214
215	if (strcmp(defel->defname, "stopwords") == `0`)
216	{
217	if (stoploaded)
218	ereport(ERROR,
219	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
220	errmsg("multiple StopWords parameters")));
221	readstoplist(defGetString(defel), &d->stoplist, lowerstr);
222	stoploaded = true;
223	}
224	else if (strcmp(defel->defname, "language") == `0`)
225	{
226	if (d->stem)
227	ereport(ERROR,
228	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
229	errmsg("multiple Language parameters")));
230	locate_stem_module(d, defGetString(defel));
231	}
232	else
233	{
234	ereport(ERROR,
235	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
236	errmsg("unrecognized Snowball parameter: \"%s\"",
237	defel->defname)));
238	}
239	}
240
241	if (!d->stem)
242	ereport(ERROR,
243	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
244	errmsg("missing Language parameter")));
245
246	d->dictCtx = CurrentMemoryContext;
247
248	PG_RETURN_POINTER(d);
249	}
250
251	Datum
252	dsnowball_lexize(PG_FUNCTION_ARGS)
253	{
254	DictSnowball d = (DictSnowball ) PG_GETARG_POINTER(`0`);
255	char in = (char* *) PG_GETARG_POINTER(`1`);
256	int32 len = PG_GETARG_INT32(`2`);
257	char *txt = lowerstr_with_len(in, len);
258	TSLexeme res = palloc0(sizeof(TSLexeme) `2`);
259
260	if (*txt == `'\0'` \|\| searchstoplist(&(d->stoplist), txt))
261	{
262	pfree(txt);
263	}
264	else
265	{
266	MemoryContext saveCtx;
267
268	/*
269	* recode to utf8 if stemmer is utf8 and doesn't match server encoding
270	*/
271	if (d->needrecode)
272	{
273	char *recoded;
274
275	recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
276	if (recoded != txt)
277	{
278	pfree(txt);
279	txt = recoded;
280	}
281	}
282
283	/ see comment about d->dictCtx /
284	saveCtx = MemoryContextSwitchTo(d->dictCtx);
285	SN_set_current(d->z, strlen(txt), (symbol *) txt);
286	d->stem(d->z);
287	MemoryContextSwitchTo(saveCtx);
288
289	if (d->z->p && d->z->l)
290	{
291	txt = repalloc(txt, d->z->l + `1`);
292	memcpy(txt, d->z->p, d->z->l);
293	txt[d->z->l] = `'\0'`;
294	}
295
296	/ back recode if needed /
297	if (d->needrecode)
298	{
299	char *recoded;
300
301	recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
302	if (recoded != txt)
303	{
304	pfree(txt);
305	txt = recoded;
306	}
307	}
308
309	res->lexeme = txt;
310	}
311
312	PG_RETURN_POINTER(res);
313	}
314

Browse the source code of PostgreSQL/src/backend/snowball/dict_snowball.c