scripts/training/MGIZA/src/plain2snt.cpp - joshua - Git at Google

 #include <iostream>
 #include <string>
 #include <sstream>
 #include <strstream>
 #include <fstream>
 #include <map>
 #include <vector>
 #include <stdlib.h>
 #include <stdio.h>

 using namespace std;

 // Converts pairs of plain-text files into GIZA++ snt format.
 //
 // Command line:
 //   txt1 txt2 [txt3 txt4 ...] [-weight w] [-vcb1 f] [-vcb2 f] [-snt1 f] [-snt2 f]
 //
 // Positional arguments are consumed in pairs (first file, second file). The
 // k-th -weight value is written as the weight of every sentence pair coming
 // from the k-th file pair; pairs without a matching -weight use 1.0.
 //
 // Unless overridden by the options, output names are derived from the first
 // file pair: a trailing ".tok" or ".txt" is stripped when both names carry
 // the same one of these extensions, then ".vcb" is appended for the
 // vocabularies and "<first>_<basename of second>.snt" (and the mirrored
 // name) is used for the sentence files.
 //
 // Every whitespace-separated token gets a numeric id per side, starting at 2.
 // Ids 0 and 1 are never assigned to a token nor written to the vocabulary
 // files (presumably they are reserved by GIZA++ -- confirm against its docs).
 //
 // Returns 0 on success, 1 on a usage error or when an output file cannot be
 // opened. Unreadable input files only produce a warning.
 int main(int argc,char**argv)
 {
   std::string snt1, snt2, vcb1, vcb2;
   std::vector<double> weights;
   std::vector<std::string> filenames;

   for (int i = 1; i < argc; ++i) {
     const std::string arg(argv[i]);
     const bool takesValue = arg == "-weight" || arg == "-snt1" || arg == "-snt2" ||
                             arg == "-vcb1" || arg == "-vcb2";
     if (!takesValue) {
       filenames.push_back(arg);
       continue;
     }
     // argv[argc] is a null pointer, so an option given as the very last
     // argument must be rejected before its value is dereferenced.
     if (i + 1 >= argc) {
       std::cerr << "ERROR: option " << arg << " requires a value.\n";
       return 1;
     }
     const char* const value = argv[++i];
     if (arg == "-weight")
       weights.push_back(atof(value));
     else if (arg == "-snt1")
       snt1 = value;
     else if (arg == "-snt2")
       snt2 = value;
     else if (arg == "-vcb1")
       vcb1 = value;
     else
       vcb2 = value;
   }

   if (filenames.empty() || filenames.size() % 2 == 1) {
     std::cerr << argv[0] << " txt1 txt2 [txt3 txt4 -weight w -vcb1 output1.vcb -vcb2 output2.vcb -snt1 output1_output2.snt -snt2 output2_output1.snt]\n";
     std::cerr << " Converts plain text into GIZA++ snt-format.\n";
     return 1;
   }

   // Base names for the default outputs come from the first file pair.
   std::string w1(filenames[0]);
   std::string w2(filenames[1]);
   if (w1.length() > 4 && w2.length() > 4) {
     const std::string ext1(w1.substr(w1.length() - 4));
     const std::string ext2(w2.substr(w2.length() - 4));
     if (ext1 == ext2 && (ext1 == ".tok" || ext1 == ".txt")) {
       w1.erase(w1.length() - 4);
       w2.erase(w2.length() - 4);
       std::cerr << "w1:"<< w1 << " w2:" << w2 << std::endl;
     }
   }

   std::string vocab1(w1), vocab2(w2);

   // rfind() yields npos when there is no separator; npos + 1 wraps to 0,
   // which selects the whole string as the basename.
   std::string::size_type slashpos = vocab1.rfind('/') + 1;
 #ifdef WIN32
   if (slashpos == 0) slashpos = vocab1.rfind('\\') + 1;
 #endif
   if (slashpos >= vocab1.length()) slashpos = 0;
   const std::string vocab1x(vocab1.substr(slashpos));
   std::cout << vocab1 << " -> " << vocab1x << std::endl;

   slashpos = vocab2.rfind('/') + 1;
 #ifdef WIN32
   // Search vocab2 here (the previous code searched vocab1 by mistake).
   if (slashpos == 0) slashpos = vocab2.rfind('\\') + 1;
 #endif
   if (slashpos >= vocab2.length()) slashpos = 0;
   const std::string vocab2x(vocab2.substr(slashpos));
   std::cout << vocab2 << " -> " << vocab2x << std::endl;

   if (snt1.empty()) snt1 = vocab1 + "_" + vocab2x + ".snt";
   if (snt2.empty()) snt2 = vocab2 + "_" + vocab1x + ".snt";
   if (vcb1.empty()) vocab1 += ".vcb"; else vocab1 = vcb1;
   if (vcb2.empty()) vocab2 += ".vcb"; else vocab2 = vcb2;

   std::ofstream ovocab1(vocab1.c_str());
   std::ofstream ovocab2(vocab2.c_str());
   std::ofstream osnt1(snt1.c_str());
   std::ofstream osnt2(snt2.c_str());

   // Without this check an unwritable destination would silently discard
   // the whole conversion and still exit with status 0.
   bool outputsOk = true;
   if (!ovocab1) { std::cerr << "ERROR: " << vocab1 << " cannot be written.\n"; outputsOk = false; }
   if (!ovocab2) { std::cerr << "ERROR: " << vocab2 << " cannot be written.\n"; outputsOk = false; }
   if (!osnt1) { std::cerr << "ERROR: " << snt1 << " cannot be written.\n"; outputsOk = false; }
   if (!osnt2) { std::cerr << "ERROR: " << snt2 << " cannot be written.\n"; outputsOk = false; }
   if (!outputsOk) return 1;

   std::map<std::string,int> v1, v2;           // token -> number of occurrences
   std::map<std::string,int> id1, id2;         // token -> numeric id
   std::vector<std::string> iid1(2), iid2(2);  // numeric id -> token; slots 0 and 1 stay empty
   std::string line1, line2, word;

   for (size_t i = 0; i < filenames.size(); i += 2) {
     const std::string& name1 = filenames[i];
     const std::string& name2 = filenames[i + 1];
     std::ifstream i1(name1.c_str()), i2(name2.c_str());
     if (!i1) std::cerr << "WARNING: " << name1 << " cannot be read.\n";
     if (!i2) std::cerr << "WARNING: " << name2 << " cannot be read.\n";
     if (!i1 || !i2) continue;  // nothing can be paired up for this file pair

     const double w = (i / 2 < weights.size()) ? weights[i / 2] : 1.0;

     for (;;) {
       const bool got1 = !std::getline(i1, line1).fail();
       const bool got2 = !std::getline(i2, line2).fail();
       if (!got1 || !got2) {
         if (got1 != got2)
           std::cerr << "WARNING: " << name1 << " and " << name2
                     << " have different numbers of lines; extra lines are ignored.\n";
         break;
       }

       // Tokens are recorded as ids. Counts and ids are updated even for
       // sentence pairs that are filtered out below.
       std::vector<int> t1, t2;

       std::istringstream ii1(line1);
       while (ii1 >> word) {
         ++v1[word];
         std::map<std::string,int>::iterator pos = id1.find(word);
         if (pos == id1.end()) {
           iid1.push_back(word);
           pos = id1.insert(std::make_pair(word, static_cast<int>(iid1.size() - 1))).first;
         }
         t1.push_back(pos->second);
       }

       std::istringstream ii2(line2);
       while (ii2 >> word) {
         ++v2[word];
         std::map<std::string,int>::iterator pos = id2.find(word);
         if (pos == id2.end()) {
           iid2.push_back(word);
           pos = id2.insert(std::make_pair(word, static_cast<int>(iid2.size() - 1))).first;
         }
         t2.push_back(pos->second);
       }

       if (t1.empty() || t2.empty()) {
         std::cerr << "WARNING: filtered out empty sentence (source: " << name1 << " " << t1.size() <<
           " target: " << name2 << " " << t2.size() << ").\n";
         continue;
       }

       // Each record is three lines: weight, first sentence, second sentence.
       osnt1 << w << "\n";
       for (size_t j = 0; j < t1.size(); ++j) osnt1 << t1[j] << ' ';
       osnt1 << '\n';
       for (size_t j = 0; j < t2.size(); ++j) osnt1 << t2[j] << ' ';
       osnt1 << '\n';

       // Same record with the two sides swapped.
       osnt2 << w << "\n";
       for (size_t j = 0; j < t2.size(); ++j) osnt2 << t2[j] << ' ';
       osnt2 << '\n';
       for (size_t j = 0; j < t1.size(); ++j) osnt2 << t1[j] << ' ';
       osnt2 << '\n';
     }
   }

   // Vocabulary lines: "<id> <token> <count>", starting at the first real id.
   for (size_t i = 2; i < iid1.size(); ++i)
     ovocab1 << i << ' ' << iid1[i] << ' ' << v1[iid1[i]] << '\n';
   for (size_t i = 2; i < iid2.size(); ++i)
     ovocab2 << i << ' ' << iid2[i] << ' ' << v2[iid2[i]] << '\n';

   return 0;
 }
	#include <iostream>
	#include <string>
	#include <strstream>
	#include <fstream>
	#include <map>
	#include <vector>
	#include <stdlib.h>
	#include <stdio.h>

	using namespace std;

	// Converts pairs of plain-text files into GIZA++ snt format.
	//
	// Command line:
	//   txt1 txt2 [txt3 txt4 ...] [-weight w] [-vcb1 f] [-vcb2 f] [-snt1 f] [-snt2 f]
	//
	// Positional arguments are consumed in pairs (first file, second file). The
	// k-th -weight value is written as the weight of every sentence pair coming
	// from the k-th file pair; pairs without a matching -weight use 1.0.
	//
	// Unless overridden by the options, output names are derived from the first
	// file pair: a trailing ".tok" or ".txt" is stripped when both names carry
	// the same one of these extensions, then ".vcb" is appended for the
	// vocabularies and "<first>_<basename of second>.snt" (and the mirrored
	// name) is used for the sentence files.
	//
	// Every whitespace-separated token gets a numeric id per side, starting at 2.
	// Ids 0 and 1 are never assigned to a token nor written to the vocabulary
	// files (presumably they are reserved by GIZA++ -- confirm against its docs).
	//
	// Returns 0 on success, 1 on a usage error or when an output file cannot be
	// opened. Unreadable input files only produce a warning.
	int main(int argc,char**argv)
	{
	  std::string snt1, snt2, vcb1, vcb2;
	  std::vector<double> weights;
	  std::vector<std::string> filenames;

	  for (int i = 1; i < argc; ++i) {
	    const std::string arg(argv[i]);
	    const bool takesValue = arg == "-weight" || arg == "-snt1" || arg == "-snt2" ||
	                            arg == "-vcb1" || arg == "-vcb2";
	    if (!takesValue) {
	      filenames.push_back(arg);
	      continue;
	    }
	    // argv[argc] is a null pointer, so an option given as the very last
	    // argument must be rejected before its value is dereferenced.
	    if (i + 1 >= argc) {
	      std::cerr << "ERROR: option " << arg << " requires a value.\n";
	      return 1;
	    }
	    const char* const value = argv[++i];
	    if (arg == "-weight")
	      weights.push_back(atof(value));
	    else if (arg == "-snt1")
	      snt1 = value;
	    else if (arg == "-snt2")
	      snt2 = value;
	    else if (arg == "-vcb1")
	      vcb1 = value;
	    else
	      vcb2 = value;
	  }

	  if (filenames.empty() || filenames.size() % 2 == 1) {
	    std::cerr << argv[0] << " txt1 txt2 [txt3 txt4 -weight w -vcb1 output1.vcb -vcb2 output2.vcb -snt1 output1_output2.snt -snt2 output2_output1.snt]\n";
	    std::cerr << " Converts plain text into GIZA++ snt-format.\n";
	    return 1;
	  }

	  // Base names for the default outputs come from the first file pair.
	  std::string w1(filenames[0]);
	  std::string w2(filenames[1]);
	  if (w1.length() > 4 && w2.length() > 4) {
	    const std::string ext1(w1.substr(w1.length() - 4));
	    const std::string ext2(w2.substr(w2.length() - 4));
	    if (ext1 == ext2 && (ext1 == ".tok" || ext1 == ".txt")) {
	      w1.erase(w1.length() - 4);
	      w2.erase(w2.length() - 4);
	      std::cerr << "w1:"<< w1 << " w2:" << w2 << std::endl;
	    }
	  }

	  std::string vocab1(w1), vocab2(w2);

	  // rfind() yields npos when there is no separator; npos + 1 wraps to 0,
	  // which selects the whole string as the basename.
	  std::string::size_type slashpos = vocab1.rfind('/') + 1;
	#ifdef WIN32
	  if (slashpos == 0) slashpos = vocab1.rfind('\\') + 1;
	#endif
	  if (slashpos >= vocab1.length()) slashpos = 0;
	  const std::string vocab1x(vocab1.substr(slashpos));
	  std::cout << vocab1 << " -> " << vocab1x << std::endl;

	  slashpos = vocab2.rfind('/') + 1;
	#ifdef WIN32
	  // Search vocab2 here (the previous code searched vocab1 by mistake).
	  if (slashpos == 0) slashpos = vocab2.rfind('\\') + 1;
	#endif
	  if (slashpos >= vocab2.length()) slashpos = 0;
	  const std::string vocab2x(vocab2.substr(slashpos));
	  std::cout << vocab2 << " -> " << vocab2x << std::endl;

	  if (snt1.empty()) snt1 = vocab1 + "_" + vocab2x + ".snt";
	  if (snt2.empty()) snt2 = vocab2 + "_" + vocab1x + ".snt";
	  if (vcb1.empty()) vocab1 += ".vcb"; else vocab1 = vcb1;
	  if (vcb2.empty()) vocab2 += ".vcb"; else vocab2 = vcb2;

	  std::ofstream ovocab1(vocab1.c_str());
	  std::ofstream ovocab2(vocab2.c_str());
	  std::ofstream osnt1(snt1.c_str());
	  std::ofstream osnt2(snt2.c_str());

	  // Without this check an unwritable destination would silently discard
	  // the whole conversion and still exit with status 0.
	  bool outputsOk = true;
	  if (!ovocab1) { std::cerr << "ERROR: " << vocab1 << " cannot be written.\n"; outputsOk = false; }
	  if (!ovocab2) { std::cerr << "ERROR: " << vocab2 << " cannot be written.\n"; outputsOk = false; }
	  if (!osnt1) { std::cerr << "ERROR: " << snt1 << " cannot be written.\n"; outputsOk = false; }
	  if (!osnt2) { std::cerr << "ERROR: " << snt2 << " cannot be written.\n"; outputsOk = false; }
	  if (!outputsOk) return 1;

	  std::map<std::string,int> v1, v2;           // token -> number of occurrences
	  std::map<std::string,int> id1, id2;         // token -> numeric id
	  std::vector<std::string> iid1(2), iid2(2);  // numeric id -> token; slots 0 and 1 stay empty
	  std::string line1, line2, word;

	  for (size_t i = 0; i < filenames.size(); i += 2) {
	    const std::string& name1 = filenames[i];
	    const std::string& name2 = filenames[i + 1];
	    std::ifstream i1(name1.c_str()), i2(name2.c_str());
	    if (!i1) std::cerr << "WARNING: " << name1 << " cannot be read.\n";
	    if (!i2) std::cerr << "WARNING: " << name2 << " cannot be read.\n";
	    if (!i1 || !i2) continue;  // nothing can be paired up for this file pair

	    const double w = (i / 2 < weights.size()) ? weights[i / 2] : 1.0;

	    for (;;) {
	      const bool got1 = !std::getline(i1, line1).fail();
	      const bool got2 = !std::getline(i2, line2).fail();
	      if (!got1 || !got2) {
	        if (got1 != got2)
	          std::cerr << "WARNING: " << name1 << " and " << name2
	                    << " have different numbers of lines; extra lines are ignored.\n";
	        break;
	      }

	      // Tokens are recorded as ids. Counts and ids are updated even for
	      // sentence pairs that are filtered out below.
	      std::vector<int> t1, t2;

	      std::istringstream ii1(line1);
	      while (ii1 >> word) {
	        ++v1[word];
	        std::map<std::string,int>::iterator pos = id1.find(word);
	        if (pos == id1.end()) {
	          iid1.push_back(word);
	          pos = id1.insert(std::make_pair(word, static_cast<int>(iid1.size() - 1))).first;
	        }
	        t1.push_back(pos->second);
	      }

	      std::istringstream ii2(line2);
	      while (ii2 >> word) {
	        ++v2[word];
	        std::map<std::string,int>::iterator pos = id2.find(word);
	        if (pos == id2.end()) {
	          iid2.push_back(word);
	          pos = id2.insert(std::make_pair(word, static_cast<int>(iid2.size() - 1))).first;
	        }
	        t2.push_back(pos->second);
	      }

	      if (t1.empty() || t2.empty()) {
	        std::cerr << "WARNING: filtered out empty sentence (source: " << name1 << " " << t1.size() <<
	          " target: " << name2 << " " << t2.size() << ").\n";
	        continue;
	      }

	      // Each record is three lines: weight, first sentence, second sentence.
	      osnt1 << w << "\n";
	      for (size_t j = 0; j < t1.size(); ++j) osnt1 << t1[j] << ' ';
	      osnt1 << '\n';
	      for (size_t j = 0; j < t2.size(); ++j) osnt1 << t2[j] << ' ';
	      osnt1 << '\n';

	      // Same record with the two sides swapped.
	      osnt2 << w << "\n";
	      for (size_t j = 0; j < t2.size(); ++j) osnt2 << t2[j] << ' ';
	      osnt2 << '\n';
	      for (size_t j = 0; j < t1.size(); ++j) osnt2 << t1[j] << ' ';
	      osnt2 << '\n';
	    }
	  }

	  // Vocabulary lines: "<id> <token> <count>", starting at the first real id.
	  for (size_t i = 2; i < iid1.size(); ++i)
	    ovocab1 << i << ' ' << iid1[i] << ' ' << v1[iid1[i]] << '\n';
	  for (size_t i = 2; i < iid2.size(); ++i)
	    ovocab2 << i << ' ' << iid2[i] << ' ' << v2[iid2[i]] << '\n';

	  return 0;
	}