blob: 598e232f4b0b39ade178966cd96dd3297fa37d33 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* The library provides a simple C API. Essentially, a new stemmer can
* be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
* used to stem a word, "sb_stemmer_length" returns the stemmed
* length of the last word processed, and "sb_stemmer_delete" is
* used to delete a stemmer.
*
* Creating a stemmer is a relatively expensive operation - the expected
* usage pattern is that a new stemmer is created when needed, used
* to stem many words, and deleted after some time.
*
* Stemmers are re-entrant, but not thread-safe. In other words, if
* you wish to access the same stemmer object from multiple threads,
* you must ensure that all access is protected by a mutex or similar
* device.
*
* libstemmer does not currently incorporate any mechanism for caching the results
* of stemming operations. Such caching can greatly increase the performance of a
* stemmer under certain situations, so suitable patches will be considered for
* inclusion.
*
* The standard libstemmer sources contain an algorithm for each of the supported
* languages. The algorithm may be selected using the English name of the
* language, or using the 2 or 3 letter ISO 639 language codes. In addition,
* the traditional "Porter" stemming algorithm for English is included for
* backwards compatibility purposes, but we recommend use of the "English"
* stemmer in preference for new projects.
*/
/*
* Postgres headers
*/
#include <postgres.h>
#include <catalog/pg_type.h>
#include <fmgr.h>
#include <sys/time.h>
#include <funcapi.h>
#include <utils/builtins.h>
#include <utils/array.h>
/*
* libstemmer header
*/
#include "libstemmer/include/libstemmer.h"
/*
 * Stem a single token with the given stemmer.
 *
 * The token is first converted to a NUL-terminated C string. An empty token
 * is answered with "" without invoking the stemmer.
 *
 * The returned pointer is either a string literal or the buffer handed back
 * by sb_stemmer_stem(). The caller must not free or modify it, and should
 * copy it before stemming another word or deleting the stemmer (both callers
 * in this file do so via cstring_to_text()).
 *
 * Raises an ERROR if sb_stemmer_stem() returns NULL (libstemmer documents
 * this as its out-of-memory indication) rather than handing NULL back to a
 * caller that would dereference it.
 */
static const char * stem_token_text(struct sb_stemmer *stemmer, text *token) {
    char *word = TextDatumGetCString(PointerGetDatum(token));
    size_t word_len = strlen(word);

    if (word_len == 0) {
        return "";
    }

    const sb_symbol *stemmed = sb_stemmer_stem(stemmer,
                                               (const sb_symbol *) word,
                                               (int) word_len);
    if (stemmed == NULL) {
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("stemmer failed to process token")));
    }

    return (const char *) stemmed;
}
/*
* Plug in stemmer call to be invoked via SQL
*/
PG_FUNCTION_INFO_V1(stem_token);
Datum stem_token(PG_FUNCTION_ARGS)
{
if (PG_ARGISNULL(0)){
PG_RETURN_NULL();
}
text * org_token = PG_GETARG_TEXT_P(0);
struct sb_stemmer *stemmer = sb_stemmer_new(
"english" /* language */, NULL /* language encoding NULL for UTF-8 */);
Assert(stemmer);
text *stemmed = cstring_to_text(stem_token_text(stemmer, org_token));
sb_stemmer_delete(stemmer);
PG_RETURN_TEXT_P(stemmed);
}
/*
* Stemmer function processing text[] input
*/
PG_FUNCTION_INFO_V1(stem_token_arr);
Datum stem_token_arr(PG_FUNCTION_ARGS)
{
if (PG_ARGISNULL(0)){
PG_RETURN_NULL();
}
/* Prepare elements to receive input text[] */
ArrayType *arr = PG_GETARG_ARRAYTYPE_P(0);
Datum *dtum;
bool *nulls;
int ndim;
/* Deconstruct input text[] */
deconstruct_array(arr, TEXTOID, -1, false, 'i', &dtum, &nulls, &ndim);
/* Prepare stemmer */
struct sb_stemmer *stemmer = sb_stemmer_new(
"english" /* language */, NULL /* language encoding NULL for UTF-8 */);
Assert(stemmer);
/* Call stemming code */
text **result = (text **) palloc(ndim * sizeof(text * ));
for(int i=0; i< ndim; i++) {
text *token = dtum[i] == 0 ? NULL : DatumGetTextP(dtum[i]);
char *empty;
if(token == NULL) {
empty = (char *)palloc(sizeof(char));
empty[0] = '\0';
}
result[i] = (token == NULL ?
cstring_to_text(empty) :
cstring_to_text(stem_token_text(stemmer, token)));
}
ArrayType *res = construct_array((Datum*)result, ndim, TEXTOID, -1, false, 'i');
sb_stemmer_delete(stemmer);
PG_RETURN_ARRAYTYPE_P(res);
}