blob: 077f8e2fe5621a5b2c5897870466f5f186206411 [file] [log] [blame]
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
parcel Lucy;
/** Suppress a "stoplist" of common words.
*
* A "stoplist" is a collection of "stopwords": words which are common enough to
* be of little value when determining search results. For example, so many
* documents in English contain "the", "if", and "maybe" that it may improve
* both performance and relevance to block them.
*
* Before filtering stopwords:
*
* ("i", "am", "the", "walrus")
*
* After filtering stopwords:
*
* ("walrus")
*
* SnowballStopFilter provides default stoplists for several languages, courtesy of
* the Snowball project (<http://snowball.tartarus.org>), or you may supply
* your own.
*
* |-----------------------|
* | ISO CODE | LANGUAGE |
* |-----------------------|
* | da | Danish |
* | de | German |
* | en | English |
* | es | Spanish |
* | fi | Finnish |
* | fr | French |
* | hu | Hungarian |
* | it | Italian |
* | nl | Dutch |
* | no | Norwegian |
* | pt | Portuguese |
* | sv | Swedish |
* | ru | Russian |
* |-----------------------|
*/
class Lucy::Analysis::SnowballStopFilter cnick SnowStop
inherits Lucy::Analysis::Analyzer : dumpable {
/* Analyzer subclass (short name "SnowStop", declared dumpable) that
* suppresses stopwords.
*/
/* Per-instance stoplist.
* NOTE(review): presumably the hash supplied to, or generated by, init(),
* with stopwords as its keys -- confirm against the implementation.
*/
Hash *stoplist;
/* Class-level (inert) word tables, one per two-letter language suffix
* (da, de, en, es, fi, fr, hu, it, nl, no, pt, ru, sv).
* NOTE(review): presumably the default Snowball stopword lists that
* gen_stoplist() reads -- confirm against the implementation.
*/
inert const uint8_t** snow_da;
inert const uint8_t** snow_de;
inert const uint8_t** snow_en;
inert const uint8_t** snow_es;
inert const uint8_t** snow_fi;
inert const uint8_t** snow_fr;
inert const uint8_t** snow_hu;
inert const uint8_t** snow_it;
inert const uint8_t** snow_nl;
inert const uint8_t** snow_no;
inert const uint8_t** snow_pt;
inert const uint8_t** snow_ru;
inert const uint8_t** snow_sv;
/* Class-level constructor taking the same arguments as init(); both
* default to NULL. The return value is marked `incremented`.
* NOTE(review): presumably allocates an object and delegates to init() --
* confirm against the implementation.
*/
inert incremented SnowballStopFilter*
new(const CharBuf *language = NULL, Hash *stoplist = NULL);
/**
* @param language The ISO code for a supported language.
* @param stoplist A hash with stopwords as the keys.
*/
public inert SnowballStopFilter*
init(SnowballStopFilter *self, const CharBuf *language = NULL,
Hash *stoplist = NULL);
/** Return a Hash with the Snowball stoplist for the supplied language.
*/
inert incremented Hash*
gen_stoplist(const CharBuf *language);
/* Takes an Inversion and returns an Inversion marked `incremented`.
* NOTE(review): presumably the result omits tokens whose text appears in
* the stoplist -- the implementation is not visible here.
*/
public incremented Inversion*
Transform(SnowballStopFilter *self, Inversion *inversion);
/* Equality test against an arbitrary object.
* NOTE(review): presumably compares the two filters' stoplists -- confirm.
*/
public bool_t
Equals(SnowballStopFilter *self, Obj *other);
/* Destructor.
* NOTE(review): presumably releases the stoplist member -- confirm.
*/
public void
Destroy(SnowballStopFilter *self);
}
class Lucy::Analysis::SnowballStopFilter::NoCloneHash inherits Lucy::Object::Hash {
/* Hash subclass that declares its own Make_Key.
* NOTE(review): the class name suggests keys are stored without being
* cloned; presumably used for stoplists built by SnowballStopFilter --
* confirm against the implementation.
*/
/* Class-level constructor; `capacity` defaults to 0. The return value is
* marked `incremented`.
*/
inert incremented NoCloneHash*
new(uint32_t capacity = 0);
/* Initializer taking the same `capacity` argument (default 0) as new().
*/
inert NoCloneHash*
init(NoCloneHash *self, uint32_t capacity = 0);
/* Produce the key object for `key` given its hash sum; the return value is
* marked `incremented`.
* NOTE(review): presumably returns `key` itself with an added reference
* rather than a clone -- confirm against the implementation.
*/
public incremented Obj*
Make_Key(NoCloneHash *self, Obj *key, int32_t hash_sum);
}