#!/usr/bin/python

"""	REQUIRED INSTALLATIONS:
        -Python 2.4 +
        -Biopython 1.42
        -Numpy/Scipy 1.2.1
        -mxTextTools: http://www.egenix.com/files/python/mxTextTools.html
	REQUIRED INSTALLATIONS:
        -Python 2.4 +
        -Biopython 1.42
        -Numpy/Scipy 1.2.1
        -mxTextTools: http://www.egenix.com/files/python/mxTextTools.html

INPUT REQUIREMENTS:
        -2 PDB files
                PDB file must be in correct PDB format (i.e. chain ID's present, unique atom name within each residue, occupancy, etc...)

Global_wRMSD.py INFORMATION

        NOTE: For similar structures (characterized by a small sRMSD; example: sRMSD < 5), the scaling factor is set to 2
                For nonsimilar structures (characterized by a large sRMSD; example: sRMSD > 5), the scaling factor is set to 5

        TO RUN Global_wRMSD.py:

                Global_wRMSD.py Protein_1.pdb Protein_1_Chain_ID Protein_2.pdb Protein_2_Chain_ID

                Example:
                Global_wRMSD.py 3ERD.pdb A 3ERT.pdb A

                Example output:
                        3ERD_sRMSD.pdb
                        3ERD_wRMSD.pdb
                        Calculated standard RMSD value

To run through cygwin:
        In c:cygwin\etc\profile

        Change PATH to local python (Python25):

        PATH=/usr/local/bin:/cygdrive/c/Python25:/usr/bin:/bin:/usr/X11R6/bin:$PATH
        export PATH

For questions or comments:

Nickolay Khazanov
nickolay@umich.edu
University of Michigan
Carlson Lab """


#define global functions
from __future__ import division
import sys, os
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.PDBIO import PDBIO
import numpy
from scipy import linalg
from numpy import sqrt, array, dot, transpose, exp, multiply

def run_Global_wRMSD(file1, file2, X_Chain_ID, Y_Chain_ID, only_standard):
        #COMPARE RESIDUES FROM PROTEIN X AND PROTEIN Y BY CHAINS, REMOVES NONMATCHING RESIDUE COORDINATES AND RETURNS CA COORDINATES FROM MATCHING RESIDUES
	#print "Source file:", file1, "chain", X_Chain_ID
	#print "Reference file:", file2, "chain", Y_Chain_ID

	parser = PDBParser(PERMISSIVE = 0)
	struct1 = parser.get_structure('str1', file1)
	struct2 = parser.get_structure('str2', file2)

        x_CAcoords, y_CAcoords = EnsureCorrespondence(file1, file2, X_Chain_ID, Y_Chain_ID, struct1, struct2)
        x_CAcoords = transpose(x_CAcoords)
        y_CAcoords = transpose(y_CAcoords)
        #PERFORM STANDARD AND WeigHTED RMSD CALCULATION
        x_all_coords = getAll_Coords(struct1)
        filebase = os.path.splitext(file1)[0]
        finalRMSD = performAlignment(x_CAcoords, y_CAcoords, x_all_coords, struct1, filebase)
#       x_CAcoords,x_all_coords, SRMSD = standard_alignment(x_CAcoords, y_CAcoords, x_all_coords, struct1, filebase)
#	print "The standard RMSD value is", SRMSD
#	if not only_standard:
#		WRMSD = weighted_alignment(x_CAcoords, y_CAcoords, x_all_coords, struct1, filebase, SRMSD)
#		print "The weighted RMSD value is", WRMSD

#ENSURE RESIDUE CORRESPONDENCE
def EnsureCorrespondence(X_name, Y_name, X_Chain_ID, Y_Chain_ID, struct1, struct2):
	# GET ALL RESIDUE NUMBERS THAT HAVE CA ATOM
        x_CAResIDs = get_CAResIDs(struct1, X_Chain_ID)
        y_CAResIDs = get_CAResIDs(struct2, Y_Chain_ID)

        #REMOVE DISORDERED RESIDUES
        x_disordered = get_disordered(struct1, X_Chain_ID)
	if x_disordered:
		print "Disordered residue(s) in source PDB file will not be included:",
		for resid in x_disordered:
			print resid,
		print
        y_disordered = get_disordered(struct2, Y_Chain_ID)
	if y_disordered:
		print "Disordered residue(s) in reference PDB file will not be included:",
		for resid in y_disordered:
			print resid,
		print

        #COMPARE PROTEIN1 TO PROTEIN2 AND REMOVE NONMATCHING RESIDUES
	if X_OFFSET or Y_OFFSET:
		print "Offset is being used: %s for source, %s for reference"%(X_OFFSET,Y_OFFSET)
        x_nonmatch, y_nonmatch = compare(x_CAResIDs, y_CAResIDs, X_OFFSET, Y_OFFSET)
	if y_nonmatch:
		print "Residue(s) not in source PDB file:",
		for y_resi in y_nonmatch:
			print y_resi,
		print
	if x_nonmatch:
		print "Residue(s) not in reference PDB file:",
		for x_resi in x_nonmatch:
			print x_resi,
		print

        #MAKE LIST OF ALL RESIDUES TO BE REMOVED
        x_resis_to_remove = x_disordered + y_disordered + x_nonmatch
        y_resis_to_remove = x_disordered + y_disordered + y_nonmatch

        #RETURNS ALL CA COORDINATES EXCEPT THOSE TO BE REMOVED
        x_CAcoords = get_CAcoords(struct1, X_Chain_ID, x_resis_to_remove)
        y_CAcoords = get_CAcoords(struct2, Y_Chain_ID, y_resis_to_remove)
        x_seq = get_Sequence(struct1, X_Chain_ID, x_resis_to_remove)
        y_seq = get_Sequence(struct2, Y_Chain_ID, y_resis_to_remove)

	#FINAL CHECK TO ENSURE RESIDUE CORRESPONDENCE
	num_CAx, num_CAy = len(x_CAcoords), len(y_CAcoords)
        if num_CAx != num_CAy:
        	message = "ERROR: Difference in numbers of relevant CA atoms in PDB files.\n" + \
			  str(num_CAx) + " CA atoms found in source PDB file.\n" + \
			  str(num_CAy) + " CA atoms found in reference PDB file."
                sys.exit(message)
	else:
		print str(num_CAx) + " corresponding CA atoms found in each PDB file."

        #CHECK TO DETERMINE IF APPROPRIATE NUMBER OF COORDINATES PRESENT
        if (len(x_CAcoords[0]) != 3):
                sys.exit("ERROR: Source PDB file does not have a 3xN atom coordinate set")
        if (len(y_CAcoords[0]) != 3):
                sys.exit("ERROR: Reference PDB file does not have a 3xN atom coordinate set")

        #CHECK TO DETERMINE IF >4 COORDINATES PRESENT FOR EACH PROTEIN
        if num_CAx < 4:
                sys.exit("ERROR: Source PDB file has 3 or fewer CA atoms, 4 or more needed to perform alignment")
        if num_CAy < 4:
                sys.exit("ERROR: Reference PDB file has 3 or fewer CA atoms, 4 or more needed to perform alignment")
        return x_CAcoords, y_CAcoords

##Standard and Weighted RMSD ALIGNMENT##
def performAlignment(x, y, all, set, filebase):

	#Initial standard alignment without weight
	#TRANSLATE PROTEINS X AND Y TO CENTER
	n = len(x[0])
	x_mean = mean(x,n)
	y_mean = mean(y,n)
	x_trans = translation(x, x_mean)
	x_translated = nested_list(x_trans,n)
	y_trans = translation(y, y_mean)
	y_translated = nested_list(y_trans,n)
	x_transpose = transpose(x_translated)

        #CALCULATE COVARIANCE MATRIX (y_translated *x_translated^t)
        R = dot(y_translated, x_transpose)
	R_transpose = transpose(R)
        R2 = dot(R_transpose, R)


	#DETERMINE THE eigENVECTORS AND eigENVALUES of R2
        mu,A = linalg.eig(R2)
	A = transpose(A)

	#SORT eigENVECTORS IN DECREASING ORDER OF eigENVALUES
        a = [(mu[i],A[i]) for i in range(len(A))]
        a.sort()
        a.reverse()
        mu = [x[0] for x in a]
        A = [x[1] for x in a]

        #DETERMINE RIGHT-HANDED SYSTEM
        A_3 = crossproduct(A[0], A[1])
        A = [A[0], A[1], A_3]

        #CALCULATE B, NORMALIZED PRODUCT OF (RxA)
        B_1 = dot(R, A[0])
        B_2 = dot(R, A[1])
        norm_B_1 = normalize(B_1)
        norm_B_2 = normalize(B_2)
        norm_B_3  = crossproduct(norm_B_1,norm_B_2)
        B = [norm_B_1,norm_B_2,norm_B_3]
        B_transpose = transpose(B)

        #CALCULATE ROTATION MATRIX, U
        U = dot(B_transpose, A)
	x_rot = dot(U,x_translated)

        #CALCULATE STANDARD RMSD
        standard_RMSD = sRMSD(x_rot, y_translated)

        #ADD MEAN VALUES OF PROTEIN Y TO ROTATED COORDINATES OF PROTEIN X
        x_coords = add_coords(x_rot, y_mean)
        x = nested_list(x_coords,n)

        #TRANSLATE ALL COORDINATES OF PROTEIN X
        all_trans = translation(all, x_mean)
        j = len(all[0])
        all_translated = nested_list(all_trans,j)
        all_rot = dot(U,all_translated)

        #ADD ALL MEAN VALUES OF PROTEIN Y TO ALL ROTATED COORDINATES OF PROTEIN X
        all_coords = add_coords(all_rot, y_mean)
        all = nested_list(all_coords,j)
        allrot = transpose(all)
        setAll(set,allrot)

        #OUTPUT STANDARD RMSD ALIGNMENT
        writeStructure(set,'%s_sRMSD.pdb'%filebase)

	#return x,all, standard_RMSD
	SRMSD = standard_RMSD

	##Weighted RMSD ALIGNMENT##
        atoms = len(x[0])
        z = 1 #iteration counter
	max_z = 5000 #number of iterations to perform
        weighted_rmsds = []
	final_rmsd = 0
        w_metric = []
        all_list = []

	#SET APPROPRIATE SCALING FACTOR
        scaling_factor = SRMSD
	#scaling_factor = 10.0
	print "Set scaling factor to %s based on initial RMSD."%(scaling_factor)

	while z < max_z:
                n = len(x[0])
                #TRANSLATE WeigHTED CENTROIDS TO ORIGIN
                #CALCULATE WeigHTS (protein1, protein2, scaling_factor)
                weights = weight(x,y,scaling_factor)
 		weighted_x_mean =  weight_trans(x,weights,n)
                weighted_y_mean =  weight_trans(y,weights,n)
                x_trans = translation(x, weighted_x_mean)
                y_trans = translation(y, weighted_y_mean)
                x_translated = nested_list(x_trans,n)
                y_translated = nested_list(y_trans,n)
                #CALCULATE WeigHTED COVARIENCE MATRIX
		weighted_rot = weight(x_translated,y_translated,scaling_factor)
		weighted_x_translated = multiply(weighted_rot,x_translated)
                wx_transpose = transpose(weighted_x_translated)
		R = dot(y_translated,wx_transpose)
                R_transpose = transpose(R)
		R2 = dot(R_transpose, R)

                #DETERMINE THE eigENVECTORS AND eigENVALUES of R2
		mu,A = linalg.eig(R2)
		A = transpose(A)

		#SORT eigENVECTORS IN DECREASING ORDER OF eigENVALUES
		indeces = mu.ravel().argsort()
		indeces[::-1]
		a = []
		for i in indeces:
			 a.append((mu[i],A[i]))
                mu = [x[0] for x in a]
                A = [x[1] for x in a]
                #DETERMINE RIGHT-HANDED SYSTEM
                A_3 = crossproduct(A[0], A[1])
                A = [A[0], A[1], A_3]

                #CALCULATE B, NORMALIZED PRODUCT OF (RxA)
                B_1 = dot(R, A[0])
                B_2 = dot(R, A[1])
                norm_B_1 = normalize(B_1)
                norm_B_2 = normalize(B_2)
                norm_B_3 = crossproduct(norm_B_1,norm_B_2)
                B = [norm_B_1,norm_B_2,norm_B_3]
                B_transpose = transpose(B)

                #CALCULATE WeigHTED ROTATION MATRIX, U
                U = dot(B_transpose, A)
                x_rot = dot(U,x_translated)

                #CALCULATE WeigHTED RMSD
                weighted_rmsds.append(wRMSD(x_rot, y_translated,scaling_factor))
                w_metric.append(wSUM(x_rot, y_translated,scaling_factor,atoms,z))

                #DETERMINE IF CONVERGENCE IS REACHED
                if z > 1:
                        wrmsd_diff = weighted_rmsds[-2] - weighted_rmsds[-1]
			final_rmsd = weighted_rmsds[-1]
		else:
                        wrmsd_diff = []

                if 0 < wrmsd_diff < 0.000001:

                        #ADD MEAN VALUES OF PROTEIN Y TO ROTATED COORDINATES OF PROTEIN X
                        x_coords = add_coords(x_rot, weighted_y_mean)
                        x = nested_list(x_coords,n)

                        #TRANSFORM ALL COORDINATES OF PROTEIN X
                        all_trans = translation(all, weighted_x_mean)
                        j = len(all[0])
                        all_translated = nested_list(all_trans,j)
                        all_rot = dot(U,all_translated)

			#ADD ALL MEAN VALUES OF PROTEIN Y TO ALL ROTATED COORDINATES OF PROTEIN X
                        all_coords = add_coords(all_rot, weighted_y_mean)
                        all = nested_list(all_coords,j)
			print 'Convergence reached after %s iterations, final wRMSD is %s'%(z, final_rmsd)
			allrot = transpose(all)
			setAll(set, allrot)
			writeStructure(set,'%s_wRMSD.pdb'%filebase)
			break
                else:
                        #ADD MEAN VALUES OF PROTEIN Y TO ROTATED COORDINATES OF PROTEIN X
                        x_coords = add_coords(x_rot, weighted_y_mean)
                        x = nested_list(x_coords,n)

                        #TRANSFORM ALL COORDINATES OF PROTEIN X
                        all_trans = translation(all, weighted_x_mean)
                        j = len(all[0])
                        all_translated = nested_list(all_trans,j)
                        all_rot = dot(U,all_translated)

                        #ADD ALL MEAN VALUES OF PROTEIN Y TO ALL ROTATED COORDINATES OF PROTEIN X
                        all_coords = add_coords(all_rot, weighted_y_mean)
                        all = nested_list(all_coords,j)
                        z = z + 1
        else:
                print "Alignment stopped after %s iterations, convergence was never reached"%max_z
	return final_rmsd

#####HELPER FUNCTIONS#####
def get_CAResIDs(struct, chain_key):
        CA_Reslist =[]   # List of residue numbers of the residues that have CA atom
        for model in struct.get_list():
                chain_A = model[chain_key]
                for residue in chain_A.get_list():
                        if residue.has_id("CA"):
                                CA_Reslist.append(residue.get_id()[1])
        return CA_Reslist

def get_disordered(struct, chain_key):
        disord_list =[]  # List of residue numbers of the disordered residues
        for model in struct.get_list():
                chain_A = model[chain_key]
                for residue in chain_A.get_list():
                        if residue.is_disordered():
                                disord_list.append(residue.get_id()[1])
        return disord_list

def compare(x_ResIDs,y_ResIDs, X_OFFSET=0,Y_OFFSET=0):
        x_not_y_list = [] # Residues in source not in reference
        y_not_x_list = [] # Residues in reference not in source
        cx_ResIDs = {}
	cy_ResIDs = {}
	for each in x_ResIDs:
		cx_ResIDs[each-X_OFFSET] = each
	for each in y_ResIDs:
		cy_ResIDs[each-Y_OFFSET] = each
	for key in cx_ResIDs.keys():
		#print "Looking for residue %s(%s) in y"%(key,cx_ResIDs[key])
                if key < 0 or key not in cy_ResIDs.keys():
                        x_not_y_list.append(cx_ResIDs[key])
	for key in cy_ResIDs.keys():
		#print "Looking for residue %s(%s) in x"%(key,cy_ResIDs[key])
                if key < 0 or key not in cx_ResIDs.keys():
                        y_not_x_list.append(cy_ResIDs[key])
        x_not_y_list.sort()
        y_not_x_list.sort()
        return x_not_y_list, y_not_x_list

def get_CAcoords(struct, chain_key, resis_to_remove):
        x =[]
        for model in struct.get_list():
                for residue in model[chain_key].get_list():
			resinum = residue.get_id()[1]
                        if residue.has_id('CA') and resinum not in resis_to_remove:
                                x.append(residue['CA'].get_coord())
		break
	return array(x)

def get_Sequence(struct, chain_key, resis_to_remove):
        x =[]
        for model in struct.get_list():
		for residue in model[chain_key].get_list():
			resinum = residue.get_id()[1]
                        if residue.has_id('CA') and resinum not in resis_to_remove:
                                x.append(residue.get_resname()+str(residue.get_id()[1]))
		break
	return array(x)

def getAll_Coords(struct):
        result = []
        for atom in struct.get_atoms():
                result.append(atom.get_coord())
        x_t = transpose(result)
        return x_t

def writeStructure(structure,filename):
        io = PDBIO()
        io.set_structure(structure)
        io.save(filename)

def setAll(structure,newCACoords):
        allCAs =[]
        for model in structure.get_list():
                chain = model.get_list()
                for each in chain:
                        for res in each.get_list():
				#print "processing residues %s"%(res.get_full_id())
                                for x in res.get_list():
                                        allCAs.append(x)
        if len(allCAs) != len(newCACoords):
                print "wrong number of atoms .. structure had",len(allCAs),"you gave me",len(newCACoords)
                raise Exception("wrong number of atoms")
        for newCoords,ca in zip(newCACoords,allCAs):
                ca.set_coord(newCoords)

def mean(first,n):
        return [sum(each)/n for each in first]

def translation(first,second):
        c = 0
        k = []
        q = len(first)
        while c < q:
                for each in first[c]:
                        subtr = each - second[c]
                        k.append(subtr)
                c = c + 1
        return k

def nested_list(name,n):
        first1 = name[0:n]
        first2 = name[n:2*n]
        first3 = name[2*n:3*n]
        first_translated =[first1,first2,first3]
        return first_translated

def crossproduct(a,b):
        C_0 = a[1]*b[2] - a[2]*b[1]
        C_1 = a[2]*b[0] - a[0]*b[2]
        C_2 = a[0]*b[1] - a[1]*b[0]
        return [C_0, C_1, C_2]

def normalize(a):
        B_0 = (a[0])/(((a[0]**2)+(a[1]**2)+((a[2])**2))**(1/2))
        B_1 = (a[1])/(((a[0]**2)+(a[1]**2)+((a[2])**2))**(1/2))
        B_2 = (a[2])/(((a[0]**2)+(a[1]**2)+((a[2])**2))**(1/2))
        return [B_0, B_1, B_2]

def sRMSD(first,second):
        subtr = first - second
        subtr_s = pow(subtr,2)
        sum_subtr_s = subtr_s.sum(axis=0)
        d = sqrt(sum_subtr_s)
        sq_d = pow(d,2)
        s_sq_d = sum(sq_d)
        tot = (len(first[0]))
        value = sqrt(s_sq_d/tot)
        return value

def add_coords(first,second):
        c = 0
        k = []
        q = len(first)
        while c < q:
                for each in first[c]:
                        add = each + second[c]
                        k.append(add)
                c = c + 1
        return k

def weight_trans(first,weight,n):
        mult = multiply(first, weight)
        sum_mult = sum(mult)
        mean = [sum(each)/n for each in mult]
        return mean

def weight(first,second,constant):
        first = array(first)
	second = array(second)
	subtr = first - second
	#print subtr
	subtr_s = pow(subtr,2)
        sum_subtr_s = subtr_s.sum(axis=0)
	d = sqrt(sum_subtr_s)
	weighted_d = Gaussian(d,constant)
	weighted_d = Gaussian2(weighted_d)
	return weighted_d

def wSUM(first,second,constant,atoms,z):
        first =array(first)
        second = array(second)
        subtr = first - second
        subtr_s = pow(subtr,2)
        sum_subtr_s = subtr_s.sum(axis=0)
        d = sqrt(sum_subtr_s)
        weighted_d = Gaussian(d,constant)
        weighted_d = Gaussian2(weighted_d)
        sum_weighted_d = sum(weighted_d)
        value = sum_weighted_d/atoms
        return value

def wRMSD(first,second,constant):
        first = array(first)
        second = array(second)
        subtr = first - second
        subtr_s = pow(subtr,2)
	sum_subtr_s = subtr_s.sum(axis=0)
	d = sqrt(sum_subtr_s)
        weighted_d = Gaussian(d,constant)
        weights = Gaussian2(weighted_d)
        sq_d = pow(d,2)
        wd = multiply(sq_d,weights)
        s_wd = sum(wd)
        n = len(d)
        s_sq_d_divide = s_wd/n
        value = sqrt(s_sq_d_divide)
        return value

def Gaussian(first,z):
	weights = (-(pow(first,2))/z)
	return weights

def Gaussian2(first):
	values = exp(first)
	return values

#RUN Global_wRMSD.py
if __name__ == "__main__":
	usage = 'Usage: global_wRMSD.py SourcePDBFile1 [SourcePDBChain1]\n' + \
                '                       [SourcePDBFile2 [SourcePDBChain2]] [...]\n' + \
		'                       ReferencePDBFile2 [ReferencePDBChain2]'

	description = """    Aligns the specified chains of the source PDB files to the
specified chain of the reference PDB file according to the weighted
RMSD algorithm.  Any number of source PDB files can be specified.
The last PDB file on the input line will be used as the reference
PDB file.  Chain information, if present, must immediately follow
its corresponding PDB file.  The default chain is 'A'.
    The algorithm is described in detail in the following reference:
KL Damm and HA Carlson. (2006) "Gaussian-weighted RMSD superposition
of proteins:  A structural comparison for flexible proteins and
predicted protein structures." Biophys. J. 90, 4558-4573."""

	only_standard = False
	correct_index = False

	X_OFFSET = 0
	Y_OFFSET = 0

	args = sys.argv[1:]
	if '-h' in args or '--help' in args:
		sys.exit(usage + '\n\n' + description)
	if '-s' in args:
		#use standard alignment only
		only_standard = True


	# Create list of input files.  The list actually contains
	# tuples of (filename, chainID).  If no chain ID is
	# specified on the command line, we use the default 'A'
	in_file_list = []
	i = 0
	while i < len(args):
		if args[i] == '-o':
			X_OFFSET = int(args[i+1])
			Y_OFFSET = int(args[i+2])
			i = i + 3

		chainPresent = False
		in_file = args[i]
		try:
			if len(args[i + 1]) == 1:
				chainPresent = True
		except IndexError:
			pass

		if chainPresent:
			in_file_chain = (in_file, args[i + 1])
			i += 2
		else:
			in_file_chain = (in_file, 'A')
			i += 1
		in_file_list.append(in_file_chain)

	# The last file in the list is the reference PDB
	try:
		referencePDB = in_file_list[-1][0]
	except IndexError:
		sys.exit('ERROR: No reference PDB file specified.\n' + usage)
	referenceChain = in_file_list[-1][1]

	# Check existence of reference PDB file, make sure it's not empty.
	# Empty PDB files result in an obscure error from PDB parser.
	try:
        	size = os.stat(referencePDB)[6]
	except OSError:
		sys.exit('ERROR: Unable to open reference PDB file ' + referencePDB + '\n' + usage)
        if not size:
                sys.exit('ERROR: Reference PDB file ' + referencePDB + ' is empty.\n' + usage)

	# Iterate through the source files and run alignment function for each

	if len(in_file_list[:-1]) < 1:
		sys.exit('ERROR: No source PDB files specified.\n' + usage)

	for sourcePDB, sourceChain in in_file_list[:-1]:

		if sourcePDB != in_file_list[0][0]:  # i.e. if not on first file
			print # Add blank line for nicer output

		# Check existence of source PDB file, make sure it's not empty.
		# Empty PDB files result in an obscure error from PDB parser.
		try:
        		size = os.stat(sourcePDB)[6]
		except OSError:
			#print 'Warning: Ignoring source PDB file ' + sourcePDB + '; Could not be opened.'
			continue
        	if not size:
			print 'Warning: Ignoring source PDB file ' + sourcePDB + '; File is empty.'
			continue

        	run_Global_wRMSD(sourcePDB, referencePDB, sourceChain, referenceChain, only_standard)