#!/usr/bin/env python2.5

################################################################################
# September 2011                                              TRELLET Mikael   #
#                              format_PDB.py                                   #
#                                                                              #
#     Format and parse reference PDB structure to avoid errors in HADDOCK      #
#                       results analysis by Results_formatting.sh              #
################################################################################

import sys
from sys import argv
from os import popen
from Bio.PDB import PDBParser
from Bio.PDB import PDBIO
from Bio.PDB.Chain import Chain
from Bio.PDB.Polypeptide import PPBuilder
from Bio.Emboss.Applications import NeedleCommandline
from Bio import AlignIO

# Checks that chains are A and B (default for HADDOCK)
def check_chains(struct):
    """Rename the first two chains of the first model to 'A' and 'B'.

    The first two chain ids reported by ``struct.get_chains()`` are looked
    up in ``struct[0]``.  Whichever of the two chains has the larger
    ``len()`` becomes chain 'A'; the other one becomes chain 'B'.

    Both chains are detached from the model before they are renamed and
    are then added back.  Renaming them one after the other while they are
    still attached would overwrite an entry of the model's lookup table
    whenever the two chains have to exchange ids (e.g. the longer chain is
    currently called 'B' and the shorter one 'A').  As a consequence of the
    detach/add cycle, renamed chains end up last in the model, in the
    order 'A' then 'B'.

    Parameters:
        struct: structure object, modified in place.

    Raises:
        IndexError: if fewer than two chain ids are found.
    """
    model = struct[0]
    chain_ids = [chain.id for chain in struct.get_chains()]
    first = model[chain_ids[0]]
    second = model[chain_ids[1]]

    # The larger chain must end up as 'A'.
    if len(first) < len(second):
        first, second = second, first

    # Nothing to do when the naming is already the expected one.
    if first.id == 'A' and second.id == 'B':
        return

    # Detach first, rename second: no id can clash with a sibling chain
    # while the model's lookup table still holds the old names.
    model.detach_child(first.id)
    model.detach_child(second.id)
    first.id = 'A'
    second.id = 'B'
    model.add(first)
    model.add(second)

# Checks water molecules presence (can perturb numbering and residue matching)
def check_hetatm(struct):
    """Detach every water residue found in ``struct``.

    A residue counts as water when the first field of its id is 'W'.
    The residues to drop are collected first and detached afterwards, so
    the structure is never modified while it is being iterated.  Detaching
    is always done through ``struct[0]``, i.e. the first model.

    Parameters:
        struct: structure object, modified in place.
    """
    waters = []
    for residue in struct.get_residues():
        if residue.id[0] == 'W':
            waters.append((residue.parent.id, residue.id))

    for chain_id, residue_id in waters:
        struct[0][chain_id].detach_child(residue_id)

# Checks numbering of structures to make them match
def check_numbering(struct, model):
    """Copy the residue numbers of a model file onto ``struct``.

    The file ``model`` is parsed and the residue numbers of chains 'A' and
    'B' of its first model are recorded in order.  Every residue of
    ``struct`` that belongs to a chain called 'A' or 'B' then receives, by
    position, the corresponding number; its new id is ``(' ', number, ' ')``,
    so the hetero flag and the insertion code are reset to blanks.
    Residues of any other chain are left untouched.

    Parameters:
        struct: structure object to renumber, modified in place.
        model: path of the PDB file providing the numbering.

    Raises:
        KeyError: if the model file has no chain 'A' or no chain 'B'.
        IndexError: if a chain of ``struct`` holds more residues than the
            chain of the same name in the model file.
    """
    # NOTE(review): residue ids are reassigned one by one while siblings
    # still carry their old numbers; Biopython releases that validate id
    # changes against sibling ids may reject this - confirm for the
    # Biopython version in use.
    template = PDBParser(QUIET=True).get_structure('model', model)
    first_model = template[0]
    numbering = {
        'A': [residue.id[1] for residue in first_model['A']],
        'B': [residue.id[1] for residue in first_model['B']],
    }
    # Next position to consume in each chain's list of numbers.
    cursor = {'A': 0, 'B': 0}

    for residue in struct.get_residues():
        chain_id = residue.parent.id
        if chain_id not in numbering:
            continue
        position = cursor[chain_id]
        residue.id = (' ', numbering[chain_id][position], ' ')
        cursor[chain_id] = position + 1

def _write_text(path, text):
    """Write ``text`` to ``path``, closing the file even if writing fails."""
    handle = open(path, 'w')
    try:
        handle.write(text)
    finally:
        handle.close()


# Align reference and model to avoid holes and mismatch in the sequence
def align_receptor(refe, model):
    """Make chain 'A' of ``refe`` match chain 'A' of ``model``.

    The peptide sequences of both chains are written to ``refe.seq`` and
    ``model.seq`` in the current directory, aligned with EMBOSS ``needle``
    (output in ``needle.align``), and the alignment is then walked column
    by column:

    * gap in the reference row: the model residue has no counterpart, only
      the model position advances;
    * gap in the model row: the reference residue is detached from chain
      'A' of ``refe``;
    * otherwise: the reference residue is renumbered with the number of
      the aligned model residue.

    Parameters:
        refe: reference structure, modified in place.
        model: model structure providing the target numbering.
    """
    id_ref = [res.id[1] for res in refe[0]['A']]
    id_model = [res.id[1] for res in model[0]['A']]

    # Build the receptor sequences from the structures passed as arguments
    # (not from module-level variables), so the function works for any caller.
    ppb = PPBuilder()
    ref_seq = ''.join(
        [str(pp.get_sequence()) for pp in ppb.build_peptides(refe[0]['A'])])
    model_seq = ''.join(
        [str(pp.get_sequence()) for pp in ppb.build_peptides(model[0]['A'])])

    # Write sequences into files
    _write_text('refe.seq', '>refe \n' + ref_seq)
    _write_text('model.seq', '>model \n' + model_seq)

    # Use needle to align sequences
    needle_cline = NeedleCommandline(asequence="refe.seq",
                                     bsequence="model.seq",
                                     gapopen=10, gapextend=0.5,
                                     outfile="needle.align")
    needle_cline()

    # Read alignment provided by needle: row 0 is the reference
    # (asequence), row 1 is the model (bsequence).
    align = AlignIO.read("needle.align", "emboss")
    receptor = refe[0]['A']
    count_ref = 0
    count_model = 0
    # NOTE(review): id_ref/id_model are indexed by sequence position, which
    # assumes every residue of chain 'A' appears in the PPBuilder sequence
    # - confirm for references containing non-standard residues or breaks.
    for ref_letter, model_letter in zip(align[0], align[1]):
        if ref_letter == '-':
            count_model = count_model + 1
        elif model_letter == '-':
            receptor.detach_child((' ', id_ref[count_ref], ' '))
            count_ref = count_ref + 1
        else:
            receptor[id_ref[count_ref]].id = (' ', id_model[count_model], ' ')
            count_model = count_model + 1
            count_ref = count_ref + 1

# Report segids unknown to HADDOCK that are provided in the reference file
def return_bad_segid(refe):
    """Print the suspicious segids of ``refe``, one per line.

    A segid is reported when it is longer than one character once the
    surrounding whitespace is stripped.  Each distinct segid is printed
    once, exactly as stored on the residue (not stripped), in sorted order
    so that the output is reproducible.  When nothing is found a single
    empty line is printed.

    Parameters:
        refe: structure object whose residues expose a ``segid`` attribute.

    Returns:
        None; the result is only written to standard output.
    """
    bad_ids = set()
    for residue in refe.get_residues():
        if len(residue.segid.strip()) > 1:
            bad_ids.add(residue.segid)
    # A single parenthesised argument prints identically whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print('\n'.join(sorted(bad_ids)))
                  

if __name__ == '__main__':
    # Two invocation forms, both taking two arguments:
    #   <script> check <reference.pdb>
    #   <script> <reference.pdb> <model.pdb>
    if len(argv) < 3:
        sys.stderr.write(
            'Usage: %s check <reference.pdb>\n'
            '       %s <reference.pdb> <model.pdb>\n' % (argv[0], argv[0]))
        sys.exit(2)

    # "check" mode: only report the suspicious segids of the reference
    # structure, then stop without writing any file.
    if argv[1] == 'check':
        refe = argv[2]
        p = PDBParser(QUIET=True)
        s = p.get_structure('reference', refe)
        return_bad_segid(s)
        # NOTE(review): status 1 is returned even though nothing failed;
        # kept as is because the calling script may rely on it - confirm.
        sys.exit(1)

    refe = argv[1]   # Reference structure provided by the user
    model = argv[2]  # Random model taken from PDB files generated by HADDOCK
    p = PDBParser(QUIET=True)
    s = p.get_structure('reference', refe)
    check_chains(s)
    check_hetatm(s)
    p2 = PDBParser(QUIET=True)
    s2 = p2.get_structure('model', model)

    # Same residue names, in the same order, on chain A of both structures:
    # copying the model numbering is enough.  Otherwise the two receptors
    # have to be aligned first.
    ref_names = [res.resname for res in s[0]['A']]
    model_names = [res.resname for res in s2[0]['A']]
    if ref_names == model_names:
        check_numbering(s, model)
    else:
        align_receptor(s, s2)

    # Write the formatted reference structure
    w = PDBIO()
    w.set_structure(s)
    w.save("reference.pdb")

    # NOTE(review): status 1 on the success path is kept from the original
    # behaviour; 0 would be the conventional value - confirm with the caller.
    sys.exit(1)
