blob: 626bc68dffe2da7693830322e95eda0122ca529d [file] [log] [blame]
#!/usr/bin/env python
# Author : Qin Gao
# Date : Dec 31, 2007
# Purpose: Combine multiple alignment files into a single one. The files are
#          produced by MGIZA, every entry carries a sentence ID, and every
#          file is internally ordered by that ID.
import sys
import re
# Matches the first parenthesised integer on an entry's header line; that
# integer is the sentence ID used to order the merged output.
_SENT_ID_RE = re.compile(r"\((\d+)\)")


class AlignmentError(Exception):
    """Raised when the input files cannot be merged consistently."""


def _read_entry(handle):
    """Read the next three-line entry from *handle*.

    Returns a ``(sentence_id, (line1, line2, line3))`` tuple, or ``None``
    once the file is exhausted.  A trailing entry with fewer than three
    lines is treated as end of file and dropped, as the script has always
    done.

    Raises AlignmentError if the header line carries no ``(N)`` sentence ID.
    """
    header = handle.readline()
    second = handle.readline()
    third = handle.readline()
    if not (header and second and third):
        return None
    match = _SENT_ID_RE.search(header)
    if match is None:
        raise AlignmentError(
            "ERROR! MALFORMED ENTRY HEADER in %s: %s"
            % (getattr(handle, "name", "<stream>"), header.rstrip())
        )
    return int(match.group(1)), (header, second, third)


def merge_alignments(handles, out, err):
    """Merge the entries of *handles* into *out* in sentence-ID order.

    handles -- open, readable file objects, each holding three-line entries
               whose IDs increase within the file
    out     -- writable file object receiving the merged entries
    err     -- writable file object receiving "MISSING ENTRy" diagnostics

    Sentence IDs are expected to run 1, 2, 3, ...; an ID that no file
    supplies is reported on *err* and skipped.  Returns the last sentence ID
    that was considered (0 when every input is empty).

    Raises AlignmentError when a pending entry's ID is lower than the ID
    currently being written (a repeated or out-of-order entry) or when an
    entry header cannot be parsed.
    """
    # One look-ahead entry per file; None marks an exhausted file.
    pending = [_read_entry(handle) for handle in handles]
    sent_id = 0
    while True:
        sent_id += 1
        any_active = False
        wrote_one = False
        for index, entry in enumerate(pending):
            if entry is None:
                continue
            any_active = True
            entry_id, lines = entry
            if entry_id == sent_id:
                out.write("%s%s%s" % lines)
                pending[index] = _read_entry(handles[index])
                wrote_one = True
                # Only one entry is emitted per sentence ID; a second file
                # holding the same ID is caught as a duplicate next round.
                break
            if entry_id < sent_id:
                raise AlignmentError("ERROR! DUPLICATED ENTRY %d" % entry_id)
        if not any_active:
            # Every file is exhausted: this round's ID was never used.
            return sent_id - 1
        if not wrote_one:
            err.write("ERROR! MISSING ENTRy %d\n" % sent_id)


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    argv -- argument vector including the program name (defaults to
            ``sys.argv``); every following item is an input file path.

    Merged entries go to stdout, diagnostics and the summary to stderr.
    Returns 0 on success and 1 on a usage error or an unmergeable input, so
    calling pipelines can detect failure.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        # NOTE(review): the text says "at least 2" but a single file has
        # always been accepted; both are kept as they were.
        sys.stderr.write("Provide me the file names (at least 2)\n")
        return 1

    handles = []
    try:
        for path in argv[1:]:
            handles.append(open(path, "r"))
        try:
            total = merge_alignments(handles, sys.stdout, sys.stderr)
        except AlignmentError as exc:
            sys.stderr.write("%s\n" % exc)
            return 1
    finally:
        # Close whatever was opened, even if a later open() or the merge
        # itself failed.
        for handle in handles:
            handle.close()

    sys.stderr.write(
        "Combined %d files, totally %d sents \n" % (len(handles), total)
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())