blob: e5a5a461282193d33f01f1b0f748368632ee32c4 [file] [log] [blame]
/*
Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
mkcls - a program for making word classes .
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#ifndef KATEG_OPT_H
#define KATEG_OPT_H
#include <string>
#include <stdlib.h>
#include "Problem.h"
extern double rhoLo;
typedef int Kategory;
typedef int Word;
#ifdef FREQTYPE_DOUBLE
typedef double FreqType;
#else
typedef int FreqType;
#endif
#include "KategProblemWBC.h"
#include "KategProblemKBC.h"
enum {
INIT_RAN=1,
INIT_AIO=2,
INIT_LWRW=3,
INIT_FREQ=4,
INIT_OTHER=5
};
enum {
W_RAN=(8|16),
W_DET_DECR=(16),
W_DET_INCR =(32)
};
#define CHOOSE_WORD (8|16|32)
enum {
K_DET=(64),
K_RAN=(128),
K_BEST=(64|128)
};
#define CHOOSE_KAT (64|128)
enum {
CRITERION_ML=0,
CRITERION_LO=1,
CRITERION_MY=2
};
class NWG
{
private:
Array<FreqType> freq;
Array<int> timeOfFreq;
int curTime;
public:
NWG(int n);
void init();
int anzNot0;
Array<int> not0;
int word;
inline void addFreq(int C,FreqType n);
void sort();
FreqType getFreq(int i)
{
if( timeOfFreq[i]==curTime )
return freq[i];
else
return 0;
};
};
inline void NWG::addFreq(int g,FreqType n)
{
if(timeOfFreq[g]==curTime)
freq[g]+=n;
else
{
timeOfFreq[g]=curTime;
freq[g]=n;
not0[anzNot0++]=g;
}
}
struct KategProblemChange : public ProblemChange
{
void *operator new(size_t size);
void operator delete(void *ptr,size_t size);
int word;
int toKat;
int fromKat;
};
class KategProblem : public Problem
{
private:
double kat_h_full(int n);
double kat_h_full(double n);
double kat_h_part(int n);
double kat_h_part(double n);
double sigmaVerfaelschung;
short katWasEmpty;
int nwgWord;
NWG nwg;
NWG ngw;
FreqType nww;
int ursprung,ziel;
Array<int> _katOfWord;
int _maxComp,_maxCompVal;
double nmo_my(int i,int j);
double nmo(int i,int j);
double nmo_lo(int i,int j,int &e0,int &e1);
void putWord(int word,int to);
void fastPutWord(int word,int to);
void setKatOfWord(int w,int k)
{
if( !(wordFreq.fixedWord[w]==k||wordFreq.fixedWord[w]==-1||k==-1) )
{
cout << "mkcls::setKatOfWord::ERROR: " << w << " " << k << " " << wordFreq.fixedWord[w] << " " << (*words)[w] << endl;
}
_katOfWord[w]=k;
nwgWord=-1;
};
void fillNWG(int w);
inline FreqType nstrich(int i,int j);
void vnstrich(int i,int j);
protected:
virtual int _change(ProblemChange **p);
virtual void _doChange(ProblemChange &c);
virtual void _undoChange(ProblemChange &c);
virtual double _value();
double _valueChange(KategProblemChange &k);
virtual void incrementDirection();
virtual int maxDimensionVal(void) ;
virtual int maxDimension(void) ;
public:
leda_array<string> *words;
typedef leda_set<int> intSet;
leda_array<intSet> *kats;
KategProblemWBC wordFreq;
KategProblemKBC katFreq;
Array<int> initLike;
KategProblem(int aw,int mak,int _initialisierung,int _auswertung,
int _nachbarschaft,int minw=0);
virtual ~KategProblem();
virtual void _initialize(int initTyp);
virtual void _initialize(int initTyp,int specialFixedWord);
virtual double valueChange(ProblemChange&c);
virtual Problem *makeEqualProblem();
virtual double nicevalue(double value=1e100);
void makeKats();
virtual void dumpOn(ostream &strm);
virtual void dumpInfos(ostream &strm);
inline void katwahl(int k);
inline void wortwahl(int w);
inline int katOfWord(int w);
inline short wortwahl();
inline short katwahl() ;
virtual int maxNonBetterIterations();
virtual int expectedNumberOfIterations();
const char *getString(int i);
string getTheString(int i);
void makeTitle(char x[512]);
void fixInitLike();
};
inline int KategProblem::katOfWord(int w){return _katOfWord[w];};
inline short KategProblem::wortwahl(){return nachbarschaft&CHOOSE_WORD;};
inline short KategProblem::katwahl() {return nachbarschaft&CHOOSE_KAT;};
inline void KategProblem::katwahl(int k)
{
nachbarschaft = (nachbarschaft&(~CHOOSE_KAT)) | k;
if(k==K_BEST)
_maxCompVal=1;
else
_maxCompVal=katFreq.nKats-2;
};
inline void KategProblem::wortwahl(int w)
{
nachbarschaft = (nachbarschaft&(~CHOOSE_WORD)) | w;
};
inline FreqType KategProblem::nstrich(int i,int j)
{
FreqType n=0;
if( i==ursprung )
n-=nwg.getFreq(j);
if( i==ziel )
n+=nwg.getFreq(j);
if( j==ursprung )
n-=ngw.getFreq(i);
if( j==ziel )
n+=ngw.getFreq(i);
if( i==ursprung && j==ursprung )
n+=nww;
if( i==ziel && j==ziel )
n+=nww;
if( i==ursprung && j==ziel )
n-=nww;
if( i==ziel && j==ursprung )
n-=nww;
return n;
}
#define MAX_H_TABLE 4000
extern double h_table[],l_table[],hmy_table[],hmy_sigma;
inline double kat_mlog(double x)
{
if(x<=1e-9)
return 0;
else
return log(x);
}
inline double kat_mlog(int s)
{
if(s<=0)
return 0;
else if( s<MAX_H_TABLE )
{
massert( s==0 || l_table[s]==log(s) );
return l_table[s];
}
else
return log((double)(s));
}
inline double kat_hlo(int n)
{
return n*kat_mlog(n-1);
}
inline double kat_hlo(double n)
{
return n*kat_mlog(n-1);
}
inline double kat_h(int n)
{
massert(n>=-1);
if(n<=0)
return 0;
else
if(n<MAX_H_TABLE)
{
massert(n==0||fabs(h_table[n]-n*log((double)n))<1e-8);
return h_table[n];
}
else
return n*log((double)(n));
}
inline double kat_h(double n)
{
if(n<=1e-9)
return 0;
else
return n*log(n);
}
inline double kat_etaFkt(int _e0,int e1,int immer0,int cats)
{
int e0 = _e0 - immer0;
int ePlus = cats*cats - _e0;
if(cats*cats-e0>1)
return e1*log( (ePlus-1.0)/(e0+1.0)*rhoLo );
else
return 0;
}
double mkat_h_full(int n,double tf);
double mkat_h_part(int n,double cf);
int Hash(const string& s);
#endif