blob: 1a0d43975093e555ab3c4e1e7ffc481496514ddf [file] [log] [blame]
/*
Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
mkcls - a program for making word classes .
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#include <stdlib.h>
#include "KategProblem.h"
static int oneFreqCompareSteigend(const void *p,const void *j)
{
#ifdef FREQTYPE_DOUBLE
if( (((OneFreq *)p)->n < ((OneFreq *)j)->n) )
return -1;
if( (((OneFreq *)p)->n > ((OneFreq *)j)->n) )
return +1;
else
return 0;
#else
return ((OneFreq *)p)->n - ((OneFreq *)j)->n;
#endif
}
static int oneFreqCompareFallend(const void *p,const void *j)
{
#ifdef FREQTYPE_DOUBLE
if( (((OneFreq *)p)->n > ((OneFreq *)j)->n) )
return -1;
if( (((OneFreq *)p)->n < ((OneFreq *)j)->n) )
return +1;
else
return 0;
#else
return -((OneFreq *)p)->n + ((OneFreq *)j)->n;
#endif
}
KategProblemWBC::KategProblemWBC(int n,int minw)
: _n1(n,0),_n2(n,0),with_h_of_words(0),afterFilled(n,0),beforeFilled(n,0),filled(0),fixedWord(n,-1),absteigend(0),nWords(n),nTranspWords(0),
mindestAnzahl(minw),after(n),before(n),minIndex(n,-1),maxIndex(n,-1)
{
}
KategProblemWBC::~KategProblemWBC()
{
massert( after.size()==nWords);
if( absteigend )
delete absteigend;
}
void KategProblemWBC::init(int specialFixedWord)
{
nTranspWords=0;
int i;
for(i=0;i<_n1.size();i++)
{
if( (_n1[i]<mindestAnzahl && _n2[i]<mindestAnzahl && minIndex[i]<=1) ||i==specialFixedWord )
{
if(!( fixedWord[i]==1 || fixedWord[i]== -1))
cerr << "mkcls:KategProblemWBC::init::ERROR: " << i << " " << fixedWord[i] << endl;
fixedWord[i]=1;
}
else if(fixedWord[i]<0)
nTranspWords++;
}
if( absteigend==0 )
absteigend= &(getSortedList(0));
if(verboseMode && nTranspWords!=_n1.size()-1 )
cout << "Es sind: " <<nTranspWords<<" transportierbar.\n";
}
void KategProblemWBC::set_h_of_words(double s)
{
with_h_of_words=1;
h_of_words = -s;
}
double KategProblemWBC::get_h_of_words()
{
if( with_h_of_words )
return -h_of_words;
else
{
h_of_words=0;
for(int i=0;i<nWords;i++)
h_of_words+=0.5*(kat_h(_n2[i])+kat_h(_n1[i]));
with_h_of_words=1;
return -h_of_words;
}
}
void KategProblemWBC::setAfterWords(int w,int anzahl)
{
OneFreq o;
o.w=-1;
o.n=0;
afterFilled[w]=0;
after[w].init(anzahl,o,1);
}
void KategProblemWBC::setBeforeWords(int w,int anzahl)
{
OneFreq o;
o.w=-1;
o.n=0;
beforeFilled[w]=0;
before[w].init(anzahl,o,1);
}
void KategProblemWBC::setFreq(int w1,int w2,FreqType anzahl)
{
OneFreq o;
o.n=anzahl;
o.w=w2;
after[w1][afterFilled[w1]++]=o;
_n1[w1]+=anzahl;
o.w=w1;
before[w2][beforeFilled[w2]++]=o;
_n2[w2]+=anzahl;
}
void KategProblemWBC::addFreq(int w1,int w2,FreqType anzahl)
{
OneFreq o;
o.n=anzahl;
int pos=-1,i;
for(i=0;i<afterFilled[w1];i++)
if(after[w1][i].w==w2)
pos=i;
if(pos==-1)
{
o.w=w2;
after[w1][afterFilled[w1]++]=o;
}
else
after[w1][pos].n+=anzahl;
_n1[w1]+=anzahl;
pos=-1;
for(i=0;i<beforeFilled[w2];i++)
if(before[w2][i].w==w1)
pos=i;
if(pos==-1)
{
o.w=w1;
before[w2][beforeFilled[w2]++]=o;
}
else
before[w2][pos].n+=anzahl;
_n2[w2]+=anzahl;
}
short KategProblemWBC::testFull(int doIt)
{
int enaNom=0;
int afterFilledSum=0,beforeFilledSum=0;
int ret=1,i;
for(i=0;i<nWords;i++)
{
if( n1(i)==1 && n2(i)==1 )
enaNom++;
afterFilledSum+=afterFilled[i];
beforeFilledSum+=beforeFilled[i];
if(afterFilled[i]!=after[i].size())
{
ret=0;
if( doIt )
after[i].resize(afterFilled[i]);
}
if(beforeFilled[i]!=before[i].size())
{
ret=0;
if( doIt )
before[i].resize(beforeFilled[i]);
}
}
if( ret==0 && !doIt )
{
cerr << "Error: Unfilled word bigram statistics.\n";
exit(1);
}
else
filled=1;
if( verboseMode>1 )
{
cout << "MEAN(|L(w)|+|R(w)|)=" << (beforeFilledSum/(float)nWords)
+(afterFilledSum/(float)nWords) << endl;
cout << "Hapaslegomena: " << enaNom << endl;
}
int symmetrisch=1;
for(i=0;i<nWords;i++)
{
int j;
massert(before[i].size()==beforeFilled[i]);
massert( after[i].size()== afterFilled[i]);
FreqType sum=0;
for(j=0;j<after[i].size();j++)
sum+=after[i][j].n;
massert( sum==_n1[i] );
sum=0;
for(j=0;j<before[i].size();j++)
sum+=before[i][j].n;
massert(sum==_n2[i]);
if(_n1[i]!=_n2[i])
{
symmetrisch=0;
if( verboseMode>1 )
cout << "Asymmetrie: " << i << " " << _n1[i] << " " << _n2[i] << endl;
}
}
if(verboseMode && symmetrisch==0)
cout << "Warning: word bigram statistic is not symmetric "
"(this is possibly an error)\n";
return ret;
}
Array<Word> &KategProblemWBC::getSortedList(int steigend)
{
int siz=_n2.size(),i;
massert(filled);
Array<Word> &sortedList =*new Array<Word>(siz);
Array<OneFreq> list(siz);
int pos=0;
for(i=0;i<siz;i++)
{
if( fixedWord[i]<0 )
{
list[pos].w=i;
list[pos].n=_n1[i];
pos++;
}
}
int anzFree=pos;
for(i=0;i<siz;i++)
{
if( fixedWord[i]>=0 )
{
list[pos].w=i;
list[pos].n=_n1[i];
pos++;
}
}
massert(pos==siz);
if(steigend )
qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),oneFreqCompareSteigend);
else
qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),oneFreqCompareFallend);
massert( anzFree<=list.size() );
for(i=0;i<siz;i++)
{
sortedList[i]=list[i].w;
massert(steigend || i==0 || i>=anzFree || list[i-1].n>=list[i].n );
massert((!steigend) || i==0 || i>=anzFree || list[i-1].n<=list[i].n );
}
return sortedList;
}
FreqType KategProblemWBC::numberOfWords()
{
FreqType n1=0,n2=0;
for(int i=0;i<_n1.size();i++)
{
n1+=_n1[i];
n2+=_n2[i];
}
#ifndef FREQTYPE_DOUBLE
massert(n1==n2);
#endif
return n1;
}
void KategProblemWBC::setDollar(int n)
{
if( fixedWord[n]<0 )
nTranspWords--;
fixedWord[n]=0;
}
void KategProblemWBC::initializeIndex(const leda_array<string>&words,char firstChar,int unten,int oben,bool noHapas)
{
int n=0;
int i;
massert(-1<unten);massert(unten<oben);
if( verboseMode )
cout << "InitializeIndex: " << firstChar << " u:" << unten << " o:" << oben << " " << noHapas << endl;
over_array(words,i)
{
if( words[i][0]==firstChar && (noHapas || ((short)(n1(i)+0.0001))>=mindestAnzahl || ((short)(n2(i)+0.0001))>=mindestAnzahl) )
{
minIndex[i]=unten;
maxIndex[i]=oben;
n++;
}
}
if( verboseMode )
cout << "InitializeIndex gefunden fuer " << n << " Woerter.\n";
}