blob: df88d18eba39dea53f1e9f1c142662b33b7602aa [file] [log] [blame]
/*
Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
mkcls - a program for making word classes .
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#include "KategProblemTest.h"
#include "ProblemTest.h"
#include "HCOptimization.h"
#include "TAOptimization.h"
#include "RRTOptimization.h"
#include "GDAOptimization.h"
#include <stdlib.h>
#include <stdio.h>
#include <string>
#include <sstream>
typedef pair<string,string> PSS;
#define NEW_SENTENCE_END "mkcls-mapped-dollar-symbol-$"
#ifdef NeXT
char *strdup(char *a)
{
char *p = (char *)malloc(strlen(a)+1);
strcpy(p,a);
return p;
}
#endif
void writeClasses(Array<Kategory> &katOfWord,KategProblem &problem,ostream &to)
{
for(int i=0;i<katOfWord.size();i++)
{
if( strcmp(problem.getString(i),"$") ) {
if( strcmp(problem.getString(i),"mkcls-mapped-dollar-symbol-$")==0 )
to << "$" << "\t" << katOfWord[i] << endl;
else
to << problem.getString(i) << "\t" << katOfWord[i] << endl;
}
}
}
void mysplit(const string &s,string &s1,string &s2)
{
unsigned int i=0;
for(;i<s.length();i++)if( s[i]==' ' || s[i]=='\t' || s[i]==' ')break;
s1=s.substr(0,i);
for(;i<s.length();i++)if( !(s[i]==' ' || s[i]=='\t' || s[i]==' ') )break;
s2=s.substr(i,s.length()-i);
iassert(s1.size());
iassert(s2.size());
}
int fromCatFile(KategProblem *p,const char *fname,bool verb)
{
leda_h_array<string,int> translation(-1);
int maxCat=2;
ifstream in(fname);
if(!in)
{
cerr << "Error: File '" << fname << "' cannot be opened.\n";
exit(1);
}
for(int i=0;i<p->wordFreq.nWords;i++)
(p->initLike)[i]= -1;
translation["1"]=1;
translation["0"]=0;
string s;
while( getline(in,s) )
{
string str,categ;
mysplit(s,str,categ);
int i=p->words->binary_locate(str);
if(i>=0 && (*(p->words))[i]==str )
{
if( translation[categ]==-1 )
translation[categ]=maxCat++;
int cat=translation[categ];
if( (p->initLike)[i]!= -1 )
cerr << "Warning: Word '" << ((*(p->words))[i])<< "' is already in a category.\n";
(p->initLike)[i]=cat;
}
else
cerr << "Warning: Word '" << str << "' " << i << " is not in training corpus.\n";
}
if( verboseMode )
cout << "We have " << maxCat << " read non-empty categories"
" (with words from the corpus).\n";
if(maxCat>p->katFreq.nKats)
{
cerr << "Error: Not enough categories reserved (only "
<< p->katFreq.nKats << ", but i need " << maxCat << ").\n";
exit(1);
}
int i=p->words->binary_locate("$");
if( i>=0 && (*(p->words))[i]=="$" )
(p->initLike)[i]=0;
else
if( verboseMode )
cerr << "Warning: No '$' in vocabulary!\n";
int errors=0;
for(i=0;i<p->wordFreq.nWords;i++)
if((p->initLike)[i]== -1 )
{
if( verb ) cerr << "Error: I don't know the category of word " << i
<< " (" << (*(p->words))[i] << ") " << ".\n";
errors=1;
}
return errors;
}
KategProblem *makeKategProblem(const leda_h_array<PSS,FreqType>&cTbl,const leda_set<string>&setVokabular, int maxClass,int initialisierung,
int auswertung,int nachbarschaft,int minWordFrequency)
{
int nwrd=0;
leda_array<string>&sVok = *new leda_array<string>(setVokabular.size());
string s;
unsigned int ctr=0;
forall_set(leda_set<string>,s,setVokabular)
{
if( verboseMode>2 )
cout << "mkcls:Wort " << ctr << " " << s << endl;
sVok[ctr++]=s;
}
for(unsigned int z=0;z<ctr-1;z++)
iassert( sVok[z]<sVok[z+1] );
sVok.sort();
if( verboseMode>2 )
cout << "*****Vocabulary: " << sVok;
unsigned int vokSize=sVok.size();
massert(vokSize==ctr); massert(vokSize==setVokabular.size());
if(verboseMode)
{cout << "Size of vocabulary: " << vokSize << "\n";cout.flush();}
KategProblem *k = new KategProblem(vokSize,maxClass,initialisierung,
auswertung,nachbarschaft,minWordFrequency);
KategProblemWBC &w=k->wordFreq;
k->words=&sVok;
Array<int> after(vokSize,0);
Array<int> before(vokSize,0);
nwrd=0;
{
PSS s;
forall_defined_h2(PSS,FreqType,s,cTbl)
{
const string&ss1=s.first;
const string&ss2=s.second;
if( ss2.length()&&(ss1!="$" || ss2!="$") )
{
int i1=sVok.binary_search(ss1);
int i2=sVok.binary_search(ss2);
iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 );
after[i1]++;
before[i2]++;
}
if( verboseMode&&((nwrd++)%10000==0) )
{cout<<"Statistiken-1 " << nwrd<< ". \r";cout.flush();}
}
}
for(unsigned int i=0;i<vokSize;i++)
{
w.setAfterWords(i,after[i]);
w.setBeforeWords(i,before[i]);
}
{
nwrd=0;
PSS s;
forall_defined_h2(PSS,FreqType,s,cTbl)
{
const string&ss1=s.first;
const string&ss2=s.second;
FreqType p=cTbl[s];
if( ss2.length()&&(ss1!="$" || ss2!="$") )
{
int i1=sVok.binary_search(ss1);
int i2=sVok.binary_search(ss2);
iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 );
w.setFreq(i1,i2,p);
if( verboseMode>2 )
cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << " "
<< ss2 << ":" << i2 << " " << p << endl;
}
if( verboseMode&&((nwrd++)%10000==0) )
{cout<<"Statistiken-2 " <<nwrd<< ". \r";cout.flush();}
}
}
w.testFull();
if(verboseMode){cout << "Datenintegritaet getestet.\n";cout.flush();}
return k;
}
KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung,
int auswertung,int nachbarschaft,int minWordFrequency)
{
ifstream file(str);
if(!file)return 0;
leda_set<string> setVokabular;
leda_h_array<PSS,FreqType> cTbl;
double c=0;
if( verboseMode )cout << "NGRFILE: " << str << endl;
string s1,s2;
while(file >> c >> s1 >> s2)
{
if( s1.length()==0||s2.length()==0 )
{
cerr << "ERROR: strings are zero: " << s1.length() <<" " << s1 <<" " << s2.length()<<" " << s2 << endl;
return 0;
}
if( c==0 )
{
cerr << "Count ist 0 " << s1 << " " << s2 << endl;
return 0;
}
cTbl[pair<string,string>(s1,s2)]=(FreqType)c;
setVokabular.insert(s1);
setVokabular.insert(s2);
if( verboseMode>1 )
cout << "R: " << s1 << " " << s2 << " " << c << endl;
c=0;
}
return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
}
KategProblem *fromKModel(const char *str,int maxClass,int initialisierung,
int auswertung,int nachbarschaft,int minWordFrequency)
{
string oldText,text,line;
ifstream f(str);
if( !f )
{
cerr << "ERROR: can not open file " << str << ".\n";
return 0;
}
leda_set<string> setVokabular;
leda_h_array<PSS,FreqType> cTbl(0);
oldText="$";
while(1)
{
getline(f,line);
if(f.fail() && !f.bad() && !f.eof())
{
cerr << "WARNING: strange characters in stream (getline) " << endl;f.clear();
}
if(!f)break;
istringstream f2(line);
while( 1 )
{
f2 >> text;
if(f2.fail() && !f2.bad() && !f2.eof())
{
cerr << "WARNING: strange characters in stream (>>) !\n";
f2.clear(ios::failbit);
}
if(!f2){break;}
if( text == "$" )
text = "mkcls-mapped-dollar-symbol-$";
if( !setVokabular.member(text) )setVokabular.insert(text);
cTbl[pair<string,string>(oldText,text)]++;
oldText=text;
}
text="$";
if( !setVokabular.member(text) )setVokabular.insert(text);
cTbl[pair<string,string>(oldText,text)]++;
oldText=text;
}
return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
}
void KategProblemSetParameters(KategProblem &p)
{
if( p.katwahl()==K_BEST )
{
TAOptimization::defaultAnnRate=0.7;
RRTOptimization::defaultAnnRate=0.95;
GDAOptimization::defaultAlpha=0.05;
if( verboseMode )
cout << "Parameter-setting like W-DET-BEST\n";
}
else
{
TAOptimization::defaultAnnRate=0.4;
RRTOptimization::defaultAnnRate=0.6;
GDAOptimization::defaultAlpha=0.0125;
if( verboseMode )
cout << "Parameter-setting like W-DET-DET\n";
}
}
KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue,
int auswertung,int nachbarschaft,float relInit)
{
KategProblem &k=
*new KategProblem(ANZ_WORD,ANZ_CLS,initValue,auswertung,nachbarschaft);
KategProblemWBC &w=k.wordFreq;
Array<int> after(ANZ_WORD,0);
Array<int> before(ANZ_WORD,0);
Array<FreqArray> twoD(ANZ_WORD);
int i;
for(i=0;i<ANZ_WORD;i++) twoD[i].init(ANZ_WORD,0);
for(i=0;i<ANZ_WORD;i++)
{
massert(after[i]==0);
massert(before[i]==0);
for(int j=0;j<ANZ_WORD;j++)
{
massert(twoD[i][j]==0);
}
}
for(i=0;i<ANZ_WORD*ANZ_WORD*relInit;i++)
{
int x=randomInt(ANZ_WORD);
int y=randomInt(ANZ_WORD);
if(twoD[x][y]==0)
{
after[x]++;
before[y]++;
}
twoD[x][y]+=randomInt(10)+1;
}
for(i=0;i<ANZ_WORD;i++)
{
w.setAfterWords(i,after[i]);
w.setBeforeWords(i,before[i]);
}
for(i=0;i<ANZ_WORD;i++)
{
for(int j=0;j<ANZ_WORD;j++)
if( twoD[i][j] )
w.setFreq(i,j,twoD[i][j]);
}
w.testFull();
return k;
}
char *makeTitle(KategProblem &problem,int verfahren)
{
char x[1024];
switch(verfahren)
{
case HC_OPT:
strcpy(x,"HC ");
break;
case SA_OPT:
strcpy(x,"SA ");
break;
case TA_OPT:
strcpy(x,"TA ");
break;
case GDA_OPT:
strcpy(x,"GDA ");
break;
case RRT_OPT:
strcpy(x,"RRT ");
break;
}
problem.makeTitle(x+strlen(x));
return strdup(x);
}
#define MAX_MULTIPLE 10
Array<KategProblem *> &_izrOptimization(Array<KategProblem *> &probs,
int anzprob,double timeForOneRed,double maxClock,Array<Kategory> &katOfWord,
int anzIter,int verfahren)
{
massert(anzprob>1);
massert(probs[0]->wordFreq.mindestAnzahl<=1);
KategProblem *p0=probs[0];
int nWords=p0->wordFreq.nWords;
int nKats=p0->katFreq.nKats;
int minimumNumberOfWords = max(1,int(nWords*0.95));
int indexOfDurchschnitt;
Array<int> newWords(nWords);
int useAnzprob=anzprob;
do
{
int w,k;
indexOfDurchschnitt=0;
for(w=0;w<nWords;w++)
newWords[w]=-1;
for(k=0;k<useAnzprob;k++)
{
massert(probs[k]->wordFreq.nWords==nWords);
probs[k]->makeKats();
}
for(w=0;w<nWords;w++)
{
if( newWords[w]==-1 )
{
leda_set<int> durchschnitt=(*p0->kats)[p0->katOfWord(w)];
for(k=1;k<useAnzprob;k++)
durchschnitt = durchschnitt & (*probs[k]->kats)[probs[k]->katOfWord(w)];
int _anzInDurchschnitt=0;
int nr=0;
forall_set(leda_set<int>,nr,durchschnitt)
{
_anzInDurchschnitt++;
newWords[nr]=indexOfDurchschnitt;
}
if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 )
{
cout << "- (";
forall_set(leda_set<int>,nr,durchschnitt)
{
cout << p0->getString(nr);
if( p0->wordFreq.n1(nr)==1 )
cout << "* ";
else
cout << " ";
}
cout << ")\n";
}
for(k=0;k<useAnzprob;k++)
{
durchschnitt = durchschnitt - (*probs[k]->kats)[probs[k]->katOfWord(w)];
}
indexOfDurchschnitt++;
}
}
if(indexOfDurchschnitt>=minimumNumberOfWords)
{
if(useAnzprob==1)
{
cout << "useAnzProb==1 => mysterious.\n";
break;
}
useAnzprob--;
}
}
while(indexOfDurchschnitt>=minimumNumberOfWords);
Array<KategProblem *> &neu=*new Array<KategProblem *>(MAX_MULTIPLE*anzprob,(KategProblem *)0);
qsort(probs.getPointerToData(),useAnzprob,sizeof(KategProblem *),compareProblem);
massert(useAnzprob<=probs.size());
double startTime=clockSec();
int i, numberOfNew;
for(numberOfNew=0; (clockSec()-startTime<timeForOneRed)
|| (numberOfNew < anzprob) ; numberOfNew++)
{
int w;
if( numberOfNew==anzprob*MAX_MULTIPLE-1 )
break;
KategProblem *p
= neu[numberOfNew]
= new KategProblem(indexOfDurchschnitt,nKats-2,
p0->initialisierung,p0->auswertung,p0->nachbarschaft);
for(w=0;w<indexOfDurchschnitt;w++)
{
p->wordFreq.setAfterWords(w,5);
p->wordFreq.setBeforeWords(w,5);
}
for(w=0;w<nWords;w++)
{
Array<OneFreq> &after=p0->wordFreq.after[w];
int size=after.size();
for(i=0;i<size;i++)
p->wordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n);
}
p->wordFreq.testFull(1);
p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words());
double w1=0.0,w2=0.0;
if(numberOfNew<useAnzprob)
{
for(i=0;i<nWords;i++)
(p->initLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i);
p->_initialize(5);
HCOptimization hc(*p,-1);
if(verboseMode)
{
w1=p->nicevalue();
cout << "from old category system:" << w1 << endl;
}
hc.minimize(-1);
if(verboseMode)
{
w2=p->nicevalue();
if(w2<w1)
cout << "improvement: " << w1-w2 << endl;
}
}
else
{
p->_initialize(1);
double mean;
StatVar end,laufzeit,start;
solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start);
w2=p->value();
if(verboseMode)
cout << "new category system: " << w2 << " (" << p->nicevalue()
<< ") Zeit: " << clockSec() << "\n";
}
}
int p;
for(p=0;p<probs.size();p++)
{
if( probs[p] )
delete probs[p];
}
qsort(neu.getPointerToData(),numberOfNew,sizeof(Problem *),compareProblem);
massert(numberOfNew<=neu.size());
if( verboseMode )
cout << "Iterierte Zustandsraum-Reduktion: " << indexOfDurchschnitt
<< " words. costs: " << neu[0]->value() << " "
<< neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: "
<< clockSec() << endl;
if( indexOfDurchschnitt<=nKats
|| (clockSec()>maxClock&&maxClock) )
{
if( clockSec()>maxClock&&maxClock )
cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n";
for(i=0;i<nWords;i++)
katOfWord[i]=neu[0]->katOfWord(newWords[i]);
return neu;
}
else
{
Array<Kategory> &newKatOfWord=
*(new Array<Kategory>(neu[0]->wordFreq.nWords,-1));
Array<KategProblem *> &erg=_izrOptimization(neu,anzprob,timeForOneRed,
maxClock,newKatOfWord,
anzIter+1,verfahren);
for(i=0;i<nWords;i++)
katOfWord[i]=newKatOfWord[newWords[i]];
return erg;
}
}
KategProblem *izrOptimization(KategProblem &p,int minN,int firstN,
double clockForOneRed,double maxClock,int verfahren)
{
Array<Kategory> katOfWord(p.wordFreq.nWords,-1);
int startN;
if( clockForOneRed<=0 )
startN=firstN;
else
startN=1000;
Array<KategProblem *> probs(startN);
double val1=0.0,val2=0.0;
double endTime=-1;
double startTime=clockSec();
int i;
for(i=0;i<startN;i++)
{
StatVar end,laufzeit,start;
double mean;
probs[i] = (KategProblem *)((KategProblem *)p.makeEqualProblem());
solveProblem(0,*(probs[i]),1,-1,verfahren,mean,end,laufzeit,start);
if( i==minN-1 )
endTime = clockSec();
if( i>=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) )
break;
}
if( endTime<0 )
endTime=clockSec();
massert(i>=firstN);
qsort(probs.getPointerToData(),i,sizeof(KategProblem *),compareProblem);
massert(i<=probs.size());
if( clockForOneRed<=0 )
{
clockForOneRed=endTime-startTime;
if( verboseMode )
cout << "time for one reduction: " << clockForOneRed << endl;
}
_izrOptimization(probs,minN,clockForOneRed,maxClock,katOfWord,0,verfahren);
KategProblem *n=(KategProblem *)(p.makeEqualProblem());
n->initLike= katOfWord;
n->_initialize(5);
if( verboseMode )
val1=n->value();
HCOptimization hc(*n,-1);
hc.minimize(-1);
val2=n->value();
if( verboseMode )
cout << "last improvement: " << val2-val1 << "\n";
cout << "final costs: " << val2 << " " << n->nicevalue() << endl;
if(PrintBestTo)
n->dumpOn(*PrintBestTo);
return n;
}