| /************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *************************************************************/ |
| |
| |
| |
| /** |
| * |
| * |
| * |
| * |
| * TODO |
| * - Add exception throwing when h == NULL |
| * - Not init h when implicit constructor is launched |
| */ |
| |
| // MARKER(update_precomp.py): autogen include statement, do not remove |
| #include "precompiled_lingucomponent.hxx" |
| |
| #include <string.h> |
| #include <sstream> |
| #include <iostream> |
| |
| #include <libtextcat/textcat.h> |
| #include <libtextcat/common.h> |
| #include <libtextcat/constants.h> |
| #include <libtextcat/fingerprint.h> |
| #include <libtextcat/utf8misc.h> |
| |
| #include <sal/types.h> |
| |
| #include "altstrfunc.hxx" |
| #include "simpleguesser.hxx" |
| |
| #ifndef _UTF8_ |
| #define _UTF8_ |
| #endif |
| |
| |
| using namespace std; |
| |
| |
| /** |
| * The following 3 structures are from fingerprint.c and textcat.c |
| */ |
| |
| typedef struct ngram_t { |
| |
| sint2 rank; |
| char str[MAXNGRAMSIZE+1]; |
| |
| } ngram_t; |
| |
| typedef struct fp_t { |
| |
| const char *name; |
| ngram_t *fprint; |
| uint4 size; |
| |
| } fp_t; |
| |
| typedef struct textcat_t{ |
| |
| void **fprint; |
| char *fprint_disable; |
| uint4 size; |
| uint4 maxsize; |
| |
| char output[MAXOUTPUTSIZE]; |
| |
| } textcat_t; |
| /** end of the 3 structs */ |
| |
| SimpleGuesser::SimpleGuesser() |
| { |
| h = NULL; |
| } |
| |
| void SimpleGuesser::operator=(SimpleGuesser& sg){ |
| if(h){textcat_Done(h);} |
| h = sg.h; |
| } |
| |
| SimpleGuesser::~SimpleGuesser() |
| { |
| if(h){textcat_Done(h);} |
| } |
| |
| |
| /*! |
| \fn SimpleGuesser::GuessLanguage(char* text) |
| */ |
| vector<Guess> SimpleGuesser::GuessLanguage(char* text) |
| { |
| vector<Guess> guesses; |
| |
| if(!h){return guesses;} |
| |
| //calculate le number of unicode charcters (symbols) |
| int len = utfstrlen(text); |
| |
| if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;} |
| |
| char *guess_list = textcat_Classify(h, text, len); |
| |
| if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){ |
| return guesses; |
| } |
| |
| int current_pointer = 0; |
| |
| for(int i = 0; guess_list[current_pointer] != '\0'; i++) |
| { |
| while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){ |
| current_pointer++; |
| } |
| if(guess_list[current_pointer] != '\0') |
| { |
| Guess g((char*)(guess_list + current_pointer)); |
| |
| guesses.push_back(g); |
| |
| current_pointer++; |
| } |
| } |
| |
| return guesses; |
| } |
| |
| /*! |
| \fn SimpleGuesser::GuessPrimaryLanguage(char* text) |
| */ |
| Guess SimpleGuesser::GuessPrimaryLanguage(char* text) |
| { |
| vector<Guess> ret = GuessLanguage(text); |
| if(ret.size() > 0){ |
| return GuessLanguage(text)[0]; |
| } |
| else{ |
| return Guess(); |
| } |
| } |
| /** |
| * Is used to know wich language is available, unavailable or both |
| * when mask = 0xF0, return only Available |
| * when mask = 0x0F, return only Unavailable |
| * when mask = 0xFF, return both Available and Unavailable |
| */ |
| vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask) |
| { |
| size_t i; |
| textcat_t *tables = (textcat_t*)h; |
| |
| vector<Guess> lang; |
| if(!h){return lang;} |
| |
| for (i=0; i<tables->size; i++) { |
| if(tables->fprint_disable[i] & mask){ |
| string langStr = "["; |
| langStr += (char*)fp_Name(tables->fprint[i]); |
| Guess g( (char *)langStr.c_str()); |
| lang.push_back(g); |
| } |
| } |
| |
| return lang; |
| } |
| |
| vector<Guess> SimpleGuesser::GetAvailableLanguages(){ |
| return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) ); |
| } |
| |
| vector<Guess> SimpleGuesser::GetUnavailableLanguages(){ |
| return GetManagedLanguages( sal::static_int_cast< char >( 0x0F )); |
| } |
| |
| vector<Guess> SimpleGuesser::GetAllManagedLanguages(){ |
| return GetManagedLanguages( sal::static_int_cast< char >( 0xFF )); |
| } |
| |
| void SimpleGuesser::XableLanguage(string lang, char mask){ |
| size_t i; |
| textcat_t *tables = (textcat_t*)h; |
| |
| if(!h){return;} |
| |
| for (i=0; i<tables->size; i++) { |
| string language(fp_Name(tables->fprint[i])); |
| if(start(language,lang) == 0){ |
| //cout << language << endl; |
| tables->fprint_disable[i] = mask; |
| //continue; |
| } |
| } |
| } |
| |
| void SimpleGuesser::EnableLanguage(string lang){ |
| XableLanguage(lang, sal::static_int_cast< char >( 0xF0 )); |
| } |
| |
| void SimpleGuesser::DisableLanguage(string lang){ |
| XableLanguage(lang, sal::static_int_cast< char >( 0x0F )); |
| } |
| |
| /** |
| * |
| */ |
| void SimpleGuesser::SetDBPath(const char* path, const char* prefix){ |
| if(h){ |
| textcat_Done(h); |
| } |
| h = special_textcat_Init(path, prefix); |
| } |