src/backend/snowball/dict_snowball.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * dict_snowball.c
  *		Snowball dictionary
  *
  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
  *	  src/backend/snowball/dict_snowball.c
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"

 #include "commands/defrem.h"
 #include "tsearch/ts_locale.h"
 #include "tsearch/ts_utils.h"

 /* Some platforms define MAXINT and/or MININT, causing conflicts */
 #ifdef MAXINT
 #undef MAXINT
 #endif
 #ifdef MININT
 #undef MININT
 #endif

 /* Now we can include the original Snowball header.h */
 #include "snowball/libstemmer/header.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
 #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
 #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
 #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
 #include "snowball/libstemmer/stem_KOI8_R_russian.h"
 #include "snowball/libstemmer/stem_UTF_8_arabic.h"
 #include "snowball/libstemmer/stem_UTF_8_armenian.h"
 #include "snowball/libstemmer/stem_UTF_8_basque.h"
 #include "snowball/libstemmer/stem_UTF_8_catalan.h"
 #include "snowball/libstemmer/stem_UTF_8_danish.h"
 #include "snowball/libstemmer/stem_UTF_8_dutch.h"
 #include "snowball/libstemmer/stem_UTF_8_english.h"
 #include "snowball/libstemmer/stem_UTF_8_finnish.h"
 #include "snowball/libstemmer/stem_UTF_8_french.h"
 #include "snowball/libstemmer/stem_UTF_8_german.h"
 #include "snowball/libstemmer/stem_UTF_8_greek.h"
 #include "snowball/libstemmer/stem_UTF_8_hindi.h"
 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
 #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
 #include "snowball/libstemmer/stem_UTF_8_irish.h"
 #include "snowball/libstemmer/stem_UTF_8_italian.h"
 #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
 #include "snowball/libstemmer/stem_UTF_8_nepali.h"
 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
 #include "snowball/libstemmer/stem_UTF_8_porter.h"
 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
 #include "snowball/libstemmer/stem_UTF_8_romanian.h"
 #include "snowball/libstemmer/stem_UTF_8_russian.h"
 #include "snowball/libstemmer/stem_UTF_8_serbian.h"
 #include "snowball/libstemmer/stem_UTF_8_spanish.h"
 #include "snowball/libstemmer/stem_UTF_8_swedish.h"
 #include "snowball/libstemmer/stem_UTF_8_tamil.h"
 #include "snowball/libstemmer/stem_UTF_8_turkish.h"
 #include "snowball/libstemmer/stem_UTF_8_yiddish.h"

 PG_MODULE_MAGIC;

 PG_FUNCTION_INFO_V1(dsnowball_init);

 PG_FUNCTION_INFO_V1(dsnowball_lexize);

 /* List of supported modules */
 typedef struct stemmer_module
 {
 	const char *name;
 	pg_enc		enc;
 	struct SN_env *(*create) (void);
 	void		(*close) (struct SN_env *);
 	int			(*stem) (struct SN_env *);
 } stemmer_module;

 /* Args: stemmer name, PG code for encoding, Snowball's name for encoding */
 #define STEMMER_MODULE(name,enc,senc) \
 	{#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}

 static const stemmer_module stemmer_modules[] =
 {
 	/*
 	 * Stemmers list from Snowball distribution
 	 */
 	STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
 	STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
 	STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
 	STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
 	STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
 	STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
 	STEMMER_MODULE(basque, PG_UTF8, UTF_8),
 	STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
 	STEMMER_MODULE(danish, PG_UTF8, UTF_8),
 	STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
 	STEMMER_MODULE(english, PG_UTF8, UTF_8),
 	STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
 	STEMMER_MODULE(french, PG_UTF8, UTF_8),
 	STEMMER_MODULE(german, PG_UTF8, UTF_8),
 	STEMMER_MODULE(greek, PG_UTF8, UTF_8),
 	STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
 	STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
 	STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
 	STEMMER_MODULE(irish, PG_UTF8, UTF_8),
 	STEMMER_MODULE(italian, PG_UTF8, UTF_8),
 	STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
 	STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
 	STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
 	STEMMER_MODULE(porter, PG_UTF8, UTF_8),
 	STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
 	STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
 	STEMMER_MODULE(russian, PG_UTF8, UTF_8),
 	STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
 	STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
 	STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
 	STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
 	STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
 	STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),

 	/*
 	 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
 	 * encoding
 	 */
 	STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),

 	{NULL, 0, NULL, NULL, NULL} /* list end marker */
 };


 typedef struct DictSnowball
 {
 	struct SN_env *z;
 	StopList	stoplist;
 	bool		needrecode;		/* needs recoding before/after call stem */
 	int			(*stem) (struct SN_env *z);

 	/*
 	 * snowball saves alloced memory between calls, so we should run it in our
 	 * private memory context. Note, init function is executed in long lived
 	 * context, so we just remember CurrentMemoryContext
 	 */
 	MemoryContext dictCtx;
 } DictSnowball;


 static void
 locate_stem_module(DictSnowball *d, const char *lang)
 {
 	const stemmer_module *m;

 	/*
 	 * First, try to find exact match of stemmer module. Stemmer with
 	 * PG_SQL_ASCII encoding is treated as working with any server encoding
 	 */
 	for (m = stemmer_modules; m->name; m++)
 	{
 		if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
 			pg_strcasecmp(m->name, lang) == 0)
 		{
 			d->stem = m->stem;
 			d->z = m->create();
 			d->needrecode = false;
 			return;
 		}
 	}

 	/*
 	 * Second, try to find stemmer for needed language for UTF8 encoding.
 	 */
 	for (m = stemmer_modules; m->name; m++)
 	{
 		if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
 		{
 			d->stem = m->stem;
 			d->z = m->create();
 			d->needrecode = true;
 			return;
 		}
 	}

 	ereport(ERROR,
 			(errcode(ERRCODE_UNDEFINED_OBJECT),
 			 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
 					lang, GetDatabaseEncodingName())));
 }

 Datum
 dsnowball_init(PG_FUNCTION_ARGS)
 {
 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
 	DictSnowball *d;
 	bool		stoploaded = false;
 	ListCell   *l;

 	d = (DictSnowball *) palloc0(sizeof(DictSnowball));

 	foreach(l, dictoptions)
 	{
 		DefElem    *defel = (DefElem *) lfirst(l);

 		if (strcmp(defel->defname, "stopwords") == 0)
 		{
 			if (stoploaded)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
 			readstoplist(defGetString(defel), &d->stoplist, lowerstr);
 			stoploaded = true;
 		}
 		else if (strcmp(defel->defname, "language") == 0)
 		{
 			if (d->stem)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple Language parameters")));
 			locate_stem_module(d, defGetString(defel));
 		}
 		else
 		{
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("unrecognized Snowball parameter: \"%s\"",
 							defel->defname)));
 		}
 	}

 	if (!d->stem)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("missing Language parameter")));

 	d->dictCtx = CurrentMemoryContext;

 	PG_RETURN_POINTER(d);
 }

 Datum
 dsnowball_lexize(PG_FUNCTION_ARGS)
 {
 	DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	int32		len = PG_GETARG_INT32(2);
 	char	   *txt = lowerstr_with_len(in, len);
 	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

 	/*
 	 * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
 	 * surely not words in any human language.  This restriction avoids
 	 * wasting cycles on stuff like base64-encoded data, and it protects us
 	 * against possible inefficiency or misbehavior in the stemmer.  (For
 	 * example, the Turkish stemmer has an indefinite recursion, so it can
 	 * crash on long-enough strings.)  However, Snowball dictionaries are
 	 * defined to recognize all strings, so we can't reject the string as an
 	 * unknown word.
 	 */
 	if (len > 1000)
 	{
 		/* return the lexeme lowercased, but otherwise unmodified */
 		res->lexeme = txt;
 	}
 	else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 	{
 		/* empty or stopword, so report as stopword */
 		pfree(txt);
 	}
 	else
 	{
 		MemoryContext saveCtx;

 		/*
 		 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
 		 */
 		if (d->needrecode)
 		{
 			char	   *recoded;

 			recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
 			if (recoded != txt)
 			{
 				pfree(txt);
 				txt = recoded;
 			}
 		}

 		/* see comment about d->dictCtx */
 		saveCtx = MemoryContextSwitchTo(d->dictCtx);
 		SN_set_current(d->z, strlen(txt), (symbol *) txt);
 		d->stem(d->z);
 		MemoryContextSwitchTo(saveCtx);

 		if (d->z->p && d->z->l)
 		{
 			txt = repalloc(txt, d->z->l + 1);
 			memcpy(txt, d->z->p, d->z->l);
 			txt[d->z->l] = '\0';
 		}

 		/* back recode if needed */
 		if (d->needrecode)
 		{
 			char	   *recoded;

 			recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
 			if (recoded != txt)
 			{
 				pfree(txt);
 				txt = recoded;
 			}
 		}

 		res->lexeme = txt;
 	}

 	PG_RETURN_POINTER(res);
 }
	/*-------------------------------------------------------------------------
	*
	* dict_snowball.c
	* Snowball dictionary
	*
	* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
	*
	* IDENTIFICATION
	* src/backend/snowball/dict_snowball.c
	*
	*-------------------------------------------------------------------------
	*/
	#include "postgres.h"

	#include "commands/defrem.h"
	#include "tsearch/ts_locale.h"
	#include "tsearch/ts_utils.h"

	/* Some platforms define MAXINT and/or MININT, causing conflicts */
	#ifdef MAXINT
	#undef MAXINT
	#endif
	#ifdef MININT
	#undef MININT
	#endif

	/* Now we can include the original Snowball header.h */
	#include "snowball/libstemmer/header.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_english.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_french.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_german.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
	#include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
	#include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
	#include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
	#include "snowball/libstemmer/stem_KOI8_R_russian.h"
	#include "snowball/libstemmer/stem_UTF_8_arabic.h"
	#include "snowball/libstemmer/stem_UTF_8_armenian.h"
	#include "snowball/libstemmer/stem_UTF_8_basque.h"
	#include "snowball/libstemmer/stem_UTF_8_catalan.h"
	#include "snowball/libstemmer/stem_UTF_8_danish.h"
	#include "snowball/libstemmer/stem_UTF_8_dutch.h"
	#include "snowball/libstemmer/stem_UTF_8_english.h"
	#include "snowball/libstemmer/stem_UTF_8_finnish.h"
	#include "snowball/libstemmer/stem_UTF_8_french.h"
	#include "snowball/libstemmer/stem_UTF_8_german.h"
	#include "snowball/libstemmer/stem_UTF_8_greek.h"
	#include "snowball/libstemmer/stem_UTF_8_hindi.h"
	#include "snowball/libstemmer/stem_UTF_8_hungarian.h"
	#include "snowball/libstemmer/stem_UTF_8_indonesian.h"
	#include "snowball/libstemmer/stem_UTF_8_irish.h"
	#include "snowball/libstemmer/stem_UTF_8_italian.h"
	#include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
	#include "snowball/libstemmer/stem_UTF_8_nepali.h"
	#include "snowball/libstemmer/stem_UTF_8_norwegian.h"
	#include "snowball/libstemmer/stem_UTF_8_porter.h"
	#include "snowball/libstemmer/stem_UTF_8_portuguese.h"
	#include "snowball/libstemmer/stem_UTF_8_romanian.h"
	#include "snowball/libstemmer/stem_UTF_8_russian.h"
	#include "snowball/libstemmer/stem_UTF_8_serbian.h"
	#include "snowball/libstemmer/stem_UTF_8_spanish.h"
	#include "snowball/libstemmer/stem_UTF_8_swedish.h"
	#include "snowball/libstemmer/stem_UTF_8_tamil.h"
	#include "snowball/libstemmer/stem_UTF_8_turkish.h"
	#include "snowball/libstemmer/stem_UTF_8_yiddish.h"

	PG_MODULE_MAGIC;

	PG_FUNCTION_INFO_V1(dsnowball_init);

	PG_FUNCTION_INFO_V1(dsnowball_lexize);

	/* List of supported modules */
	typedef struct stemmer_module
	{
	const char *name;
	pg_enc enc;
	struct SN_env (create) (void);
	void (close) (struct SN_env );
	int (stem) (struct SN_env );
	} stemmer_module;

	/* Args: stemmer name, PG code for encoding, Snowball's name for encoding */
	#define STEMMER_MODULE(name,enc,senc) \
	{#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}

	static const stemmer_module stemmer_modules[] =
	{
	/*
	* Stemmers list from Snowball distribution
	*/
	STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
	STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
	STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
	STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
	STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
	STEMMER_MODULE(basque, PG_UTF8, UTF_8),
	STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
	STEMMER_MODULE(danish, PG_UTF8, UTF_8),
	STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
	STEMMER_MODULE(english, PG_UTF8, UTF_8),
	STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
	STEMMER_MODULE(french, PG_UTF8, UTF_8),
	STEMMER_MODULE(german, PG_UTF8, UTF_8),
	STEMMER_MODULE(greek, PG_UTF8, UTF_8),
	STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
	STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
	STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
	STEMMER_MODULE(irish, PG_UTF8, UTF_8),
	STEMMER_MODULE(italian, PG_UTF8, UTF_8),
	STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
	STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
	STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
	STEMMER_MODULE(porter, PG_UTF8, UTF_8),
	STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
	STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
	STEMMER_MODULE(russian, PG_UTF8, UTF_8),
	STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
	STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
	STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
	STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
	STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
	STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),

	/*
	* Stemmer with PG_SQL_ASCII encoding should be valid for any server
	* encoding
	*/
	STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),

	{NULL, 0, NULL, NULL, NULL} /* list end marker */
	};


	typedef struct DictSnowball
	{
	struct SN_env *z;
	StopList stoplist;
	bool needrecode; /* needs recoding before/after call stem */
	int (stem) (struct SN_env z);

	/*
	* snowball saves alloced memory between calls, so we should run it in our
	* private memory context. Note, init function is executed in long lived
	* context, so we just remember CurrentMemoryContext
	*/
	MemoryContext dictCtx;
	} DictSnowball;


	static void
	locate_stem_module(DictSnowball d, const char lang)
	{
	const stemmer_module *m;

	/*
	* First, try to find exact match of stemmer module. Stemmer with
	* PG_SQL_ASCII encoding is treated as working with any server encoding
	*/
	for (m = stemmer_modules; m->name; m++)
	{
	if ((m->enc == PG_SQL_ASCII \|\| m->enc == GetDatabaseEncoding()) &&
	pg_strcasecmp(m->name, lang) == 0)
	{
	d->stem = m->stem;
	d->z = m->create();
	d->needrecode = false;
	return;
	}
	}

	/*
	* Second, try to find stemmer for needed language for UTF8 encoding.
	*/
	for (m = stemmer_modules; m->name; m++)
	{
	if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
	{
	d->stem = m->stem;
	d->z = m->create();
	d->needrecode = true;
	return;
	}
	}

	ereport(ERROR,
	(errcode(ERRCODE_UNDEFINED_OBJECT),
	errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
	lang, GetDatabaseEncodingName())));
	}

	Datum
	dsnowball_init(PG_FUNCTION_ARGS)
	{
	List dictoptions = (List ) PG_GETARG_POINTER(0);
	DictSnowball *d;
	bool stoploaded = false;
	ListCell *l;

	d = (DictSnowball *) palloc0(sizeof(DictSnowball));

	foreach(l, dictoptions)
	{
	DefElem defel = (DefElem ) lfirst(l);

	if (strcmp(defel->defname, "stopwords") == 0)
	{
	if (stoploaded)
	ereport(ERROR,
	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
	errmsg("multiple StopWords parameters")));
	readstoplist(defGetString(defel), &d->stoplist, lowerstr);
	stoploaded = true;
	}
	else if (strcmp(defel->defname, "language") == 0)
	{
	if (d->stem)
	ereport(ERROR,
	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
	errmsg("multiple Language parameters")));
	locate_stem_module(d, defGetString(defel));
	}
	else
	{
	ereport(ERROR,
	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
	errmsg("unrecognized Snowball parameter: \"%s\"",
	defel->defname)));
	}
	}

	if (!d->stem)
	ereport(ERROR,
	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
	errmsg("missing Language parameter")));

	d->dictCtx = CurrentMemoryContext;

	PG_RETURN_POINTER(d);
	}

	Datum
	dsnowball_lexize(PG_FUNCTION_ARGS)
	{
	DictSnowball d = (DictSnowball ) PG_GETARG_POINTER(0);
	char in = (char ) PG_GETARG_POINTER(1);
	int32 len = PG_GETARG_INT32(2);
	char *txt = lowerstr_with_len(in, len);
	TSLexeme res = palloc0(sizeof(TSLexeme) 2);

	/*
	* Do not pass strings exceeding 1000 bytes to the stemmer, as they're
	* surely not words in any human language. This restriction avoids
	* wasting cycles on stuff like base64-encoded data, and it protects us
	* against possible inefficiency or misbehavior in the stemmer. (For
	* example, the Turkish stemmer has an indefinite recursion, so it can
	* crash on long-enough strings.) However, Snowball dictionaries are
	* defined to recognize all strings, so we can't reject the string as an
	* unknown word.
	*/
	if (len > 1000)
	{
	/* return the lexeme lowercased, but otherwise unmodified */
	res->lexeme = txt;
	}
	else if (*txt == '\0' \|\| searchstoplist(&(d->stoplist), txt))
	{
	/* empty or stopword, so report as stopword */
	pfree(txt);
	}
	else
	{
	MemoryContext saveCtx;

	/*
	* recode to utf8 if stemmer is utf8 and doesn't match server encoding
	*/
	if (d->needrecode)
	{
	char *recoded;

	recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
	if (recoded != txt)
	{
	pfree(txt);
	txt = recoded;
	}
	}

	/* see comment about d->dictCtx */
	saveCtx = MemoryContextSwitchTo(d->dictCtx);
	SN_set_current(d->z, strlen(txt), (symbol *) txt);
	d->stem(d->z);
	MemoryContextSwitchTo(saveCtx);

	if (d->z->p && d->z->l)
	{
	txt = repalloc(txt, d->z->l + 1);
	memcpy(txt, d->z->p, d->z->l);
	txt[d->z->l] = '\0';
	}

	/* back recode if needed */
	if (d->needrecode)
	{
	char *recoded;

	recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
	if (recoded != txt)
	{
	pfree(txt);
	txt = recoded;
	}
	}

	res->lexeme = txt;
	}

	PG_RETURN_POINTER(res);
	}