// blob: 2aae066015043ce829eb0904c9c8a28ebb5da60c [file] [log] [blame]
#include <stdio.h>
#include <stdlib.h>

#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <strstream>
#include <vector>
using namespace std;
namespace {

/// Vocabulary of one side of the corpus.
///
/// Word ids are assigned in order of first appearance, starting at 2;
/// slots 0 and 1 are placeholders that are never written out
/// (presumably reserved by the GIZA++ format -- TODO confirm).
struct Vocabulary
{
  std::map<std::string,int> idOf;    ///< word -> id
  std::vector<std::string>  wordOf;  ///< id -> word
  std::vector<int>          countOf; ///< id -> number of occurrences seen
  Vocabulary() : wordOf(2), countOf(2, 0) {}
};

/// Prints the command-line synopsis to stderr.
void printUsage(const char* program)
{
  std::cerr << program << " txt1 txt2 [txt3 txt4 -weight w -vcb1 output1.vcb -vcb2 output2.vcb -snt1 output1_output2.snt -snt2 output2_output1.snt]\n";
  std::cerr << " Converts plain text into GIZA++ snt-format.\n";
}

/// True when `text` ends with `suffix` AND is strictly longer than it
/// (a name consisting only of the suffix is not stripped).
bool hasSuffix(const std::string& text, const std::string& suffix)
{
  return text.length() > suffix.length()
      && text.compare(text.length() - suffix.length(), suffix.length(), suffix) == 0;
}

/// Returns the part of `path` after the last directory separator.
/// If there is no separator, or the separator is the last character,
/// the whole path is returned unchanged.
std::string baseNameOf(const std::string& path)
{
  std::string::size_type cut = path.rfind('/');
#ifdef WIN32
  if (cut == std::string::npos) cut = path.rfind('\\');
#endif
  if (cut == std::string::npos || cut + 1 >= path.length())
    return path;
  return path.substr(cut + 1);
}

/// Splits `line` on whitespace, registers every token in `vocab`
/// (assigning a fresh id on first sight and bumping its count) and
/// returns the ids of the tokens in order.
std::vector<int> encodeLine(const std::string& line, Vocabulary& vocab)
{
  std::vector<int> ids;
  std::istringstream tokens(line);
  std::string word;
  while (tokens >> word)
  {
    int id;
    const std::map<std::string,int>::const_iterator known = vocab.idOf.find(word);
    if (known == vocab.idOf.end())
    {
      id = static_cast<int>(vocab.wordOf.size());
      vocab.wordOf.push_back(word);
      vocab.countOf.push_back(0);
      vocab.idOf[word] = id;
    }
    else
      id = known->second;
    ++vocab.countOf[id];
    ids.push_back(id);
  }
  return ids;
}

/// Writes the ids separated (and followed) by a single space, then a newline.
void writeIdLine(std::ostream& out, const std::vector<int>& ids)
{
  for (std::vector<int>::size_type j = 0; j < ids.size(); ++j)
    out << ids[j] << ' ';
  out << '\n';
}

/// Writes one "id word count" line per vocabulary entry, skipping the two
/// placeholder slots.
void writeVocabulary(std::ostream& out, const Vocabulary& vocab)
{
  for (std::vector<std::string>::size_type i = 2; i < vocab.wordOf.size(); ++i)
    out << i << ' ' << vocab.wordOf[i] << ' ' << vocab.countOf[i] << '\n';
}

/// Reports on stderr when an output stream failed to open.
/// Returns true when the stream is usable.
bool openedForWriting(const std::ofstream& out, const std::string& path)
{
  if (!out.fail())
    return true;
  std::cerr << "ERROR: " << path << " cannot be written.\n";
  return false;
}

} // namespace

/// Converts pairs of plain-text files into GIZA++ snt/vcb files.
///
/// Positional arguments are taken pairwise as (source, target) text files.
/// Options: `-weight w` (may be repeated; the k-th weight applies to the
/// k-th file pair, default 1.0), `-vcb1`/`-vcb2` (vocabulary output paths),
/// `-snt1`/`-snt2` (sentence output paths). Output names that are not given
/// are derived from the first file pair.
///
/// Returns 1 on a usage error or when an output file cannot be opened;
/// unreadable input files only produce a warning.
int main(int argc,char**argv)
{
  std::string snt1, snt2, vcb1, vcb2;
  std::vector<double> weights;
  std::vector<std::string> filenames;

  for (int i = 1; i < argc; ++i)
  {
    const std::string arg(argv[i]);
    const bool takesValue = arg == "-weight" || arg == "-snt1" || arg == "-snt2"
                         || arg == "-vcb1"   || arg == "-vcb2";
    if (!takesValue)
    {
      filenames.push_back(arg);
      continue;
    }
    // Guard against an option given as the very last argument: argv[argc]
    // is a null pointer and must not be handed to atof/std::string.
    if (i + 1 >= argc)
    {
      std::cerr << argv[0] << ": option " << arg << " requires a value.\n";
      printUsage(argv[0]);
      return 1;
    }
    const char* const value = argv[++i];
    if (arg == "-weight")    weights.push_back(atof(value));
    else if (arg == "-snt1") snt1 = value;
    else if (arg == "-snt2") snt2 = value;
    else if (arg == "-vcb1") vcb1 = value;
    else                     vcb2 = value;
  }

  // Files must come in (source, target) pairs and there must be at least one.
  if ((filenames.size() % 2) == 1 || filenames.empty())
  {
    printUsage(argv[0]);
    return 1;
  }

  // Default output names are built from the first file pair; a common
  // ".tok" or ".txt" extension (present on BOTH names) is dropped first.
  std::string stem1(filenames[0]);
  std::string stem2(filenames[1]);
  if ((hasSuffix(stem1, ".tok") && hasSuffix(stem2, ".tok")) ||
      (hasSuffix(stem1, ".txt") && hasSuffix(stem2, ".txt")))
  {
    stem1 = stem1.substr(0, stem1.length() - 4);
    stem2 = stem2.substr(0, stem2.length() - 4);
    std::cerr << "w1:" << stem1 << " w2:" << stem2 << std::endl;
  }

  const std::string base1(baseNameOf(stem1));
  std::cout << stem1 << " -> " << base1 << std::endl;
  const std::string base2(baseNameOf(stem2));
  std::cout << stem2 << " -> " << base2 << std::endl;

  // An empty option value is treated the same as "not given".
  if (snt1.empty()) snt1 = stem1 + "_" + base2 + std::string(".snt");
  if (snt2.empty()) snt2 = stem2 + "_" + base1 + std::string(".snt");
  const std::string vocabPath1(vcb1.empty() ? stem1 + std::string(".vcb") : vcb1);
  const std::string vocabPath2(vcb2.empty() ? stem2 + std::string(".vcb") : vcb2);

  std::ofstream ovocab1(vocabPath1.c_str()), ovocab2(vocabPath2.c_str());
  std::ofstream osnt1(snt1.c_str()), osnt2(snt2.c_str());
  // Check all four so that every unwritable path is reported in one run.
  bool outputsOk = openedForWriting(ovocab1, vocabPath1);
  outputsOk = openedForWriting(ovocab2, vocabPath2) && outputsOk;
  outputsOk = openedForWriting(osnt1, snt1) && outputsOk;
  outputsOk = openedForWriting(osnt2, snt2) && outputsOk;
  if (!outputsOk)
    return 1;

  Vocabulary vocab1, vocab2;
  std::string line1, line2;
  for (std::vector<std::string>::size_type i = 0; i < filenames.size(); i += 2)
  {
    const std::string& name1 = filenames[i];
    const std::string& name2 = filenames[i + 1];
    std::ifstream in1(name1.c_str()), in2(name2.c_str());
    const bool readable1 = !in1.fail();
    const bool readable2 = !in2.fail();
    if (!readable1) std::cerr << "WARNING: " << name1 << " cannot be read.\n";
    if (!readable2) std::cerr << "WARNING: " << name2 << " cannot be read.\n";

    // The k-th -weight applies to the k-th file pair; later pairs use 1.0.
    const double weight = (i / 2 < weights.size()) ? weights[i / 2] : 1.0;

    while (std::getline(in1, line1) && std::getline(in2, line2))
    {
      // Tokens are counted in the vocabulary even when the pair is
      // filtered out below.
      const std::vector<int> ids1(encodeLine(line1, vocab1));
      const std::vector<int> ids2(encodeLine(line2, vocab2));

      if (ids1.empty() || ids2.empty())
      {
        std::cerr << "WARNING: filtered out empty sentence (source: " << name1 << " " << ids1.size() <<
          " target: " << name2 << " " << ids2.size() << ").\n";
        continue;
      }

      // Each entry is three lines: weight, first side, second side.
      osnt1 << weight << "\n";
      writeIdLine(osnt1, ids1);
      writeIdLine(osnt1, ids2);

      osnt2 << weight << "\n";
      writeIdLine(osnt2, ids2);
      writeIdLine(osnt2, ids1);
    }

    // Detect a length mismatch. If the last read from in1 succeeded, the
    // loop stopped because in2 ran out; otherwise in2 may still hold lines.
    if (readable1 && readable2)
    {
      const bool firstHasExtra  = !in1.fail();
      const bool secondHasExtra = in1.fail()
          && in2.peek() != std::ifstream::traits_type::eof();
      if (firstHasExtra || secondHasExtra)
        std::cerr << "WARNING: " << name1 << " and " << name2
                  << " differ in line count; unpaired trailing lines were ignored.\n";
    }
  }

  writeVocabulary(ovocab1, vocab1);
  writeVocabulary(ovocab2, vocab2);
  return 0;
}