# blob: d237c1d8d3dac0b8205dd1e61002582069a2f6f0 [file] [log] [blame]
# This file contains a list of stemmers to include in the distribution.
# The format is a set of lines of space separated items - on each line:
# First item is name of stemmer.
# Second item is comma separated list of character sets.
# Third item is comma separated list of names to refer to the stemmer by.
#
# Lines starting with a #, or blank lines, are ignored.
# List all the main algorithms for each language, in UTF-8, and also with
# the most commonly used encoding.
danish UTF_8,ISO_8859_1 danish,da,dan
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
english UTF_8,ISO_8859_1 english,en,eng
finnish UTF_8,ISO_8859_1 finnish,fi,fin
french UTF_8,ISO_8859_1 french,fr,fre,fra
german UTF_8,ISO_8859_1 german,de,ger,deu
hungarian UTF_8,ISO_8859_2 hungarian,hu,hun
italian UTF_8,ISO_8859_1 italian,it,ita
norwegian UTF_8,ISO_8859_1 norwegian,no,nor
portuguese UTF_8,ISO_8859_1 portuguese,pt,por
romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
russian UTF_8,KOI8_R russian,ru,rus
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
swedish UTF_8,ISO_8859_1 swedish,sv,swe
turkish UTF_8 turkish,tr,tur
# Also include the traditional porter algorithm for english.
# The porter algorithm is included in the libstemmer distribution to assist
# with backwards compatibility, but for new systems the english algorithm
# should be used in preference.
porter UTF_8,ISO_8859_1 porter
# Some other stemmers in the snowball project are not included in the standard
# distribution. To compile a libstemmer with them in, add them to this list,
# and regenerate the distribution. (You will need a full source checkout for
# this.) They are included in the snowball website as curiosities, but are not
# intended for general use, and use of them is not fully supported. These
# algorithms are:
#
# german2 - This is a slight modification of the german stemmer.
#german2 UTF_8,ISO_8859_1 german2
#
# kraaij_pohlmann - This is a different dutch stemmer.
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann
#
# lovins - This is an english stemmer, but fairly outdated, and
# only really applicable to a restricted type of input text
# (keywords in academic publications).
#lovins UTF_8,ISO_8859_1 lovins