| #!/usr/bin/env python |
| # Author : Qin Gao |
| # Date : Dec 31, 2007 |
| # Purpose: Combine multiple alignment files into a single one, the files are |
| # prodcuced by MGIZA, which has sentence IDs, and every file is |
| # ordered inside |
| |
| import sys |
| import re |
| |
| if len(sys.argv)<2: |
| sys.stderr.write("Provide me the file names (at least 2)\n"); |
| sys.exit(); |
| |
| sent_id = 0; |
| |
| files = []; |
| ids = []; |
| |
| sents = []; |
| done = []; |
| |
| for i in range(1,len(sys.argv)): |
| files.append(open(sys.argv[i],"r")); |
| ids.append(0); |
| sents.append(""); |
| done.append(False); |
| |
| r = re.compile("\\((\\d+)\\)"); |
| i = 0; |
| while i< len(files): |
| st1 = files[i].readline(); |
| st2 = files[i].readline(); |
| st3 = files[i].readline(); |
| if len(st1)==0 or len(st2)==0 or len(st3)==0: |
| done[i] = True; |
| else: |
| mt = r.search(st1); |
| id = int(mt.group(1)); |
| ids[i] = id; |
| sents[i] = (st1, st2, st3); |
| i += 1 |
| |
| cont = True; |
| while (cont): |
| sent_id += 1; |
| writeOne = False; |
| # Now try to read more sentences |
| i = 0; |
| cont = False; |
| while i < len(files): |
| if done[i]: |
| i+=1 |
| continue; |
| cont = True; |
| if ids[i] == sent_id: |
| sys.stdout.write("%s%s%s"%(sents[i][0],sents[i][1],sents[i][2])); |
| writeOne = True; |
| st1 = files[i].readline(); |
| st2 = files[i].readline(); |
| st3 = files[i].readline(); |
| if len(st1)==0 or len(st2)==0 or len(st3)==0: |
| done[i] = True; |
| else: |
| mt = r.search(st1); |
| id = int(mt.group(1)); |
| ids[i] = id; |
| sents[i] = (st1, st2, st3); |
| cont = True; |
| break; |
| elif ids[i] < sent_id: |
| sys.stderr.write("ERROR! DUPLICATED ENTRY %d\n" % ids[i]); |
| sys.exit(); |
| else: |
| cont = True; |
| i+=1; |
| if (not writeOne) and cont: |
| sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id); |
| #sys.exit(); |
| sys.stderr.write("Combined %d files, totally %d sents \n" %(len(files),sent_id-1)); |