#!/usr/bin/python """ REQUIRED INSTALLATIONS: -Python 2.4 + -Biopython 1.42 -Numpy/Scipy 1.2.1 -mxTextTools: http://www.egenix.com/files/python/mxTextTools.html REQUIRED INSTALLATIONS: -Python 2.4 + -Biopython 1.42 -Numpy/Scipy 1.2.1 -mxTextTools: http://www.egenix.com/files/python/mxTextTools.html INPUT REQUIREMENTS: -2 PDB files PDB file must be in correct PDB format (i.e. chain ID's present, unique atom name within each residue, occupancy, etc...) Global_wRMSD.py INFORMATION NOTE: For similar structures (characterized by a small sRMSD; example: sRMSD < 5), the scaling factor is set to 2 For nonsimilar structures (characterized by a large sRMSD; example: sRMSD > 5), the scaling factor is set to 5 TO RUN Global_wRMSD.py: Global_wRMSD.py Protein_1.pdb Protein_1_Chain_ID Protein_2.pdb Protein_2_Chain_ID Example: Global_wRMSD.py 3ERD.pdb A 3ERT.pdb A Example output: 3ERD_sRMSD.pdb 3ERD_wRMSD.pdb Calculated standard RMSD value To run through cygwin: In c:cygwin\etc\profile Change PATH to local python (Python25): PATH=/usr/local/bin:/cygdrive/c/Python25:/usr/bin:/bin:/usr/X11R6/bin:$PATH export PATH For questions or comments: Nickolay Khazanov nickolay@umich.edu University of Michigan Carlson Lab """ #define global functions from __future__ import division import sys, os from Bio.PDB.PDBParser import PDBParser from Bio.PDB.PDBIO import PDBIO import numpy from scipy import linalg from numpy import sqrt, array, dot, transpose, exp, multiply def run_Global_wRMSD(file1, file2, X_Chain_ID, Y_Chain_ID, only_standard): #COMPARE RESIDUES FROM PROTEIN X AND PROTEIN Y BY CHAINS, REMOVES NONMATCHING RESIDUE COORDINATES AND RETURNS CA COORDINATES FROM MATCHING RESIDUES #print "Source file:", file1, "chain", X_Chain_ID #print "Reference file:", file2, "chain", Y_Chain_ID parser = PDBParser(PERMISSIVE = 0) struct1 = parser.get_structure('str1', file1) struct2 = parser.get_structure('str2', file2) x_CAcoords, y_CAcoords = EnsureCorrespondence(file1, file2, X_Chain_ID, Y_Chain_ID, struct1, struct2) x_CAcoords = transpose(x_CAcoords) y_CAcoords = transpose(y_CAcoords) #PERFORM STANDARD AND WeigHTED RMSD CALCULATION x_all_coords = getAll_Coords(struct1) filebase = os.path.splitext(file1)[0] finalRMSD = performAlignment(x_CAcoords, y_CAcoords, x_all_coords, struct1, filebase) # x_CAcoords,x_all_coords, SRMSD = standard_alignment(x_CAcoords, y_CAcoords, x_all_coords, struct1, filebase) # print "The standard RMSD value is", SRMSD # if not only_standard: # WRMSD = weighted_alignment(x_CAcoords, y_CAcoords, x_all_coords, struct1, filebase, SRMSD) # print "The weighted RMSD value is", WRMSD #ENSURE RESIDUE CORRESPONDENCE def EnsureCorrespondence(X_name, Y_name, X_Chain_ID, Y_Chain_ID, struct1, struct2): # GET ALL RESIDUE NUMBERS THAT HAVE CA ATOM x_CAResIDs = get_CAResIDs(struct1, X_Chain_ID) y_CAResIDs = get_CAResIDs(struct2, Y_Chain_ID) #REMOVE DISORDERED RESIDUES x_disordered = get_disordered(struct1, X_Chain_ID) if x_disordered: print "Disordered residue(s) in source PDB file will not be included:", for resid in x_disordered: print resid, print y_disordered = get_disordered(struct2, Y_Chain_ID) if y_disordered: print "Disordered residue(s) in reference PDB file will not be included:", for resid in y_disordered: print resid, print #COMPARE PROTEIN1 TO PROTEIN2 AND REMOVE NONMATCHING RESIDUES if X_OFFSET or Y_OFFSET: print "Offset is being used: %s for source, %s for reference"%(X_OFFSET,Y_OFFSET) x_nonmatch, y_nonmatch = compare(x_CAResIDs, y_CAResIDs, X_OFFSET, Y_OFFSET) if y_nonmatch: print "Residue(s) not in source PDB file:", for y_resi in y_nonmatch: print y_resi, print if x_nonmatch: print "Residue(s) not in reference PDB file:", for x_resi in x_nonmatch: print x_resi, print #MAKE LIST OF ALL RESIDUES TO BE REMOVED x_resis_to_remove = x_disordered + y_disordered + x_nonmatch y_resis_to_remove = x_disordered + y_disordered + y_nonmatch #RETURNS ALL CA COORDINATES EXCEPT THOSE TO BE REMOVED x_CAcoords = get_CAcoords(struct1, X_Chain_ID, x_resis_to_remove) y_CAcoords = get_CAcoords(struct2, Y_Chain_ID, y_resis_to_remove) x_seq = get_Sequence(struct1, X_Chain_ID, x_resis_to_remove) y_seq = get_Sequence(struct2, Y_Chain_ID, y_resis_to_remove) #FINAL CHECK TO ENSURE RESIDUE CORRESPONDENCE num_CAx, num_CAy = len(x_CAcoords), len(y_CAcoords) if num_CAx != num_CAy: message = "ERROR: Difference in numbers of relevant CA atoms in PDB files.\n" + \ str(num_CAx) + " CA atoms found in source PDB file.\n" + \ str(num_CAy) + " CA atoms found in reference PDB file." sys.exit(message) else: print str(num_CAx) + " corresponding CA atoms found in each PDB file." #CHECK TO DETERMINE IF APPROPRIATE NUMBER OF COORDINATES PRESENT if (len(x_CAcoords[0]) != 3): sys.exit("ERROR: Source PDB file does not have a 3xN atom coordinate set") if (len(y_CAcoords[0]) != 3): sys.exit("ERROR: Reference PDB file does not have a 3xN atom coordinate set") #CHECK TO DETERMINE IF >4 COORDINATES PRESENT FOR EACH PROTEIN if num_CAx < 4: sys.exit("ERROR: Source PDB file has 3 or fewer CA atoms, 4 or more needed to perform alignment") if num_CAy < 4: sys.exit("ERROR: Reference PDB file has 3 or fewer CA atoms, 4 or more needed to perform alignment") return x_CAcoords, y_CAcoords ##Standard and Weighted RMSD ALIGNMENT## def performAlignment(x, y, all, set, filebase): #Initial standard alignment without weight #TRANSLATE PROTEINS X AND Y TO CENTER n = len(x[0]) x_mean = mean(x,n) y_mean = mean(y,n) x_trans = translation(x, x_mean) x_translated = nested_list(x_trans,n) y_trans = translation(y, y_mean) y_translated = nested_list(y_trans,n) x_transpose = transpose(x_translated) #CALCULATE COVARIANCE MATRIX (y_translated *x_translated^t) R = dot(y_translated, x_transpose) R_transpose = transpose(R) R2 = dot(R_transpose, R) #DETERMINE THE eigENVECTORS AND eigENVALUES of R2 mu,A = linalg.eig(R2) A = transpose(A) #SORT eigENVECTORS IN DECREASING ORDER OF eigENVALUES a = [(mu[i],A[i]) for i in range(len(A))] a.sort() a.reverse() mu = [x[0] for x in a] A = [x[1] for x in a] #DETERMINE RIGHT-HANDED SYSTEM A_3 = crossproduct(A[0], A[1]) A = [A[0], A[1], A_3] #CALCULATE B, NORMALIZED PRODUCT OF (RxA) B_1 = dot(R, A[0]) B_2 = dot(R, A[1]) norm_B_1 = normalize(B_1) norm_B_2 = normalize(B_2) norm_B_3 = crossproduct(norm_B_1,norm_B_2) B = [norm_B_1,norm_B_2,norm_B_3] B_transpose = transpose(B) #CALCULATE ROTATION MATRIX, U U = dot(B_transpose, A) x_rot = dot(U,x_translated) #CALCULATE STANDARD RMSD standard_RMSD = sRMSD(x_rot, y_translated) #ADD MEAN VALUES OF PROTEIN Y TO ROTATED COORDINATES OF PROTEIN X x_coords = add_coords(x_rot, y_mean) x = nested_list(x_coords,n) #TRANSLATE ALL COORDINATES OF PROTEIN X all_trans = translation(all, x_mean) j = len(all[0]) all_translated = nested_list(all_trans,j) all_rot = dot(U,all_translated) #ADD ALL MEAN VALUES OF PROTEIN Y TO ALL ROTATED COORDINATES OF PROTEIN X all_coords = add_coords(all_rot, y_mean) all = nested_list(all_coords,j) allrot = transpose(all) setAll(set,allrot) #OUTPUT STANDARD RMSD ALIGNMENT writeStructure(set,'%s_sRMSD.pdb'%filebase) #return x,all, standard_RMSD SRMSD = standard_RMSD ##Weighted RMSD ALIGNMENT## atoms = len(x[0]) z = 1 #iteration counter max_z = 5000 #number of iterations to perform weighted_rmsds = [] final_rmsd = 0 w_metric = [] all_list = [] #SET APPROPRIATE SCALING FACTOR scaling_factor = SRMSD #scaling_factor = 10.0 print "Set scaling factor to %s based on initial RMSD."%(scaling_factor) while z < max_z: n = len(x[0]) #TRANSLATE WeigHTED CENTROIDS TO ORIGIN #CALCULATE WeigHTS (protein1, protein2, scaling_factor) weights = weight(x,y,scaling_factor) weighted_x_mean = weight_trans(x,weights,n) weighted_y_mean = weight_trans(y,weights,n) x_trans = translation(x, weighted_x_mean) y_trans = translation(y, weighted_y_mean) x_translated = nested_list(x_trans,n) y_translated = nested_list(y_trans,n) #CALCULATE WeigHTED COVARIENCE MATRIX weighted_rot = weight(x_translated,y_translated,scaling_factor) weighted_x_translated = multiply(weighted_rot,x_translated) wx_transpose = transpose(weighted_x_translated) R = dot(y_translated,wx_transpose) R_transpose = transpose(R) R2 = dot(R_transpose, R) #DETERMINE THE eigENVECTORS AND eigENVALUES of R2 mu,A = linalg.eig(R2) A = transpose(A) #SORT eigENVECTORS IN DECREASING ORDER OF eigENVALUES indeces = mu.ravel().argsort() indeces[::-1] a = [] for i in indeces: a.append((mu[i],A[i])) mu = [x[0] for x in a] A = [x[1] for x in a] #DETERMINE RIGHT-HANDED SYSTEM A_3 = crossproduct(A[0], A[1]) A = [A[0], A[1], A_3] #CALCULATE B, NORMALIZED PRODUCT OF (RxA) B_1 = dot(R, A[0]) B_2 = dot(R, A[1]) norm_B_1 = normalize(B_1) norm_B_2 = normalize(B_2) norm_B_3 = crossproduct(norm_B_1,norm_B_2) B = [norm_B_1,norm_B_2,norm_B_3] B_transpose = transpose(B) #CALCULATE WeigHTED ROTATION MATRIX, U U = dot(B_transpose, A) x_rot = dot(U,x_translated) #CALCULATE WeigHTED RMSD weighted_rmsds.append(wRMSD(x_rot, y_translated,scaling_factor)) w_metric.append(wSUM(x_rot, y_translated,scaling_factor,atoms,z)) #DETERMINE IF CONVERGENCE IS REACHED if z > 1: wrmsd_diff = weighted_rmsds[-2] - weighted_rmsds[-1] final_rmsd = weighted_rmsds[-1] else: wrmsd_diff = [] if 0 < wrmsd_diff < 0.000001: #ADD MEAN VALUES OF PROTEIN Y TO ROTATED COORDINATES OF PROTEIN X x_coords = add_coords(x_rot, weighted_y_mean) x = nested_list(x_coords,n) #TRANSFORM ALL COORDINATES OF PROTEIN X all_trans = translation(all, weighted_x_mean) j = len(all[0]) all_translated = nested_list(all_trans,j) all_rot = dot(U,all_translated) #ADD ALL MEAN VALUES OF PROTEIN Y TO ALL ROTATED COORDINATES OF PROTEIN X all_coords = add_coords(all_rot, weighted_y_mean) all = nested_list(all_coords,j) print 'Convergence reached after %s iterations, final wRMSD is %s'%(z, final_rmsd) allrot = transpose(all) setAll(set, allrot) writeStructure(set,'%s_wRMSD.pdb'%filebase) break else: #ADD MEAN VALUES OF PROTEIN Y TO ROTATED COORDINATES OF PROTEIN X x_coords = add_coords(x_rot, weighted_y_mean) x = nested_list(x_coords,n) #TRANSFORM ALL COORDINATES OF PROTEIN X all_trans = translation(all, weighted_x_mean) j = len(all[0]) all_translated = nested_list(all_trans,j) all_rot = dot(U,all_translated) #ADD ALL MEAN VALUES OF PROTEIN Y TO ALL ROTATED COORDINATES OF PROTEIN X all_coords = add_coords(all_rot, weighted_y_mean) all = nested_list(all_coords,j) z = z + 1 else: print "Alignment stopped after %s iterations, convergence was never reached"%max_z return final_rmsd #####HELPER FUNCTIONS##### def get_CAResIDs(struct, chain_key): CA_Reslist =[] # List of residue numbers of the residues that have CA atom for model in struct.get_list(): chain_A = model[chain_key] for residue in chain_A.get_list(): if residue.has_id("CA"): CA_Reslist.append(residue.get_id()[1]) return CA_Reslist def get_disordered(struct, chain_key): disord_list =[] # List of residue numbers of the disordered residues for model in struct.get_list(): chain_A = model[chain_key] for residue in chain_A.get_list(): if residue.is_disordered(): disord_list.append(residue.get_id()[1]) return disord_list def compare(x_ResIDs,y_ResIDs, X_OFFSET=0,Y_OFFSET=0): x_not_y_list = [] # Residues in source not in reference y_not_x_list = [] # Residues in reference not in source cx_ResIDs = {} cy_ResIDs = {} for each in x_ResIDs: cx_ResIDs[each-X_OFFSET] = each for each in y_ResIDs: cy_ResIDs[each-Y_OFFSET] = each for key in cx_ResIDs.keys(): #print "Looking for residue %s(%s) in y"%(key,cx_ResIDs[key]) if key < 0 or key not in cy_ResIDs.keys(): x_not_y_list.append(cx_ResIDs[key]) for key in cy_ResIDs.keys(): #print "Looking for residue %s(%s) in x"%(key,cy_ResIDs[key]) if key < 0 or key not in cx_ResIDs.keys(): y_not_x_list.append(cy_ResIDs[key]) x_not_y_list.sort() y_not_x_list.sort() return x_not_y_list, y_not_x_list def get_CAcoords(struct, chain_key, resis_to_remove): x =[] for model in struct.get_list(): for residue in model[chain_key].get_list(): resinum = residue.get_id()[1] if residue.has_id('CA') and resinum not in resis_to_remove: x.append(residue['CA'].get_coord()) break return array(x) def get_Sequence(struct, chain_key, resis_to_remove): x =[] for model in struct.get_list(): for residue in model[chain_key].get_list(): resinum = residue.get_id()[1] if residue.has_id('CA') and resinum not in resis_to_remove: x.append(residue.get_resname()+str(residue.get_id()[1])) break return array(x) def getAll_Coords(struct): result = [] for atom in struct.get_atoms(): result.append(atom.get_coord()) x_t = transpose(result) return x_t def writeStructure(structure,filename): io = PDBIO() io.set_structure(structure) io.save(filename) def setAll(structure,newCACoords): allCAs =[] for model in structure.get_list(): chain = model.get_list() for each in chain: for res in each.get_list(): #print "processing residues %s"%(res.get_full_id()) for x in res.get_list(): allCAs.append(x) if len(allCAs) != len(newCACoords): print "wrong number of atoms .. structure had",len(allCAs),"you gave me",len(newCACoords) raise Exception("wrong number of atoms") for newCoords,ca in zip(newCACoords,allCAs): ca.set_coord(newCoords) def mean(first,n): return [sum(each)/n for each in first] def translation(first,second): c = 0 k = [] q = len(first) while c < q: for each in first[c]: subtr = each - second[c] k.append(subtr) c = c + 1 return k def nested_list(name,n): first1 = name[0:n] first2 = name[n:2*n] first3 = name[2*n:3*n] first_translated =[first1,first2,first3] return first_translated def crossproduct(a,b): C_0 = a[1]*b[2] - a[2]*b[1] C_1 = a[2]*b[0] - a[0]*b[2] C_2 = a[0]*b[1] - a[1]*b[0] return [C_0, C_1, C_2] def normalize(a): B_0 = (a[0])/(((a[0]**2)+(a[1]**2)+((a[2])**2))**(1/2)) B_1 = (a[1])/(((a[0]**2)+(a[1]**2)+((a[2])**2))**(1/2)) B_2 = (a[2])/(((a[0]**2)+(a[1]**2)+((a[2])**2))**(1/2)) return [B_0, B_1, B_2] def sRMSD(first,second): subtr = first - second subtr_s = pow(subtr,2) sum_subtr_s = subtr_s.sum(axis=0) d = sqrt(sum_subtr_s) sq_d = pow(d,2) s_sq_d = sum(sq_d) tot = (len(first[0])) value = sqrt(s_sq_d/tot) return value def add_coords(first,second): c = 0 k = [] q = len(first) while c < q: for each in first[c]: add = each + second[c] k.append(add) c = c + 1 return k def weight_trans(first,weight,n): mult = multiply(first, weight) sum_mult = sum(mult) mean = [sum(each)/n for each in mult] return mean def weight(first,second,constant): first = array(first) second = array(second) subtr = first - second #print subtr subtr_s = pow(subtr,2) sum_subtr_s = subtr_s.sum(axis=0) d = sqrt(sum_subtr_s) weighted_d = Gaussian(d,constant) weighted_d = Gaussian2(weighted_d) return weighted_d def wSUM(first,second,constant,atoms,z): first =array(first) second = array(second) subtr = first - second subtr_s = pow(subtr,2) sum_subtr_s = subtr_s.sum(axis=0) d = sqrt(sum_subtr_s) weighted_d = Gaussian(d,constant) weighted_d = Gaussian2(weighted_d) sum_weighted_d = sum(weighted_d) value = sum_weighted_d/atoms return value def wRMSD(first,second,constant): first = array(first) second = array(second) subtr = first - second subtr_s = pow(subtr,2) sum_subtr_s = subtr_s.sum(axis=0) d = sqrt(sum_subtr_s) weighted_d = Gaussian(d,constant) weights = Gaussian2(weighted_d) sq_d = pow(d,2) wd = multiply(sq_d,weights) s_wd = sum(wd) n = len(d) s_sq_d_divide = s_wd/n value = sqrt(s_sq_d_divide) return value def Gaussian(first,z): weights = (-(pow(first,2))/z) return weights def Gaussian2(first): values = exp(first) return values #RUN Global_wRMSD.py if __name__ == "__main__": usage = 'Usage: global_wRMSD.py SourcePDBFile1 [SourcePDBChain1]\n' + \ ' [SourcePDBFile2 [SourcePDBChain2]] [...]\n' + \ ' ReferencePDBFile2 [ReferencePDBChain2]' description = """ Aligns the specified chains of the source PDB files to the specified chain of the reference PDB file according to the weighted RMSD algorithm. Any number of source PDB files can be specified. The last PDB file on the input line will be used as the reference PDB file. Chain information, if present, must immediately follow its corresponding PDB file. The default chain is 'A'. The algorithm is described in detail in the following reference: KL Damm and HA Carlson. (2006) "Gaussian-weighted RMSD superposition of proteins: A structural comparison for flexible proteins and predicted protein structures." Biophys. J. 90, 4558-4573.""" only_standard = False correct_index = False X_OFFSET = 0 Y_OFFSET = 0 args = sys.argv[1:] if '-h' in args or '--help' in args: sys.exit(usage + '\n\n' + description) if '-s' in args: #use standard alignment only only_standard = True # Create list of input files. The list actually contains # tuples of (filename, chainID). If no chain ID is # specified on the command line, we use the default 'A' in_file_list = [] i = 0 while i < len(args): if args[i] == '-o': X_OFFSET = int(args[i+1]) Y_OFFSET = int(args[i+2]) i = i + 3 chainPresent = False in_file = args[i] try: if len(args[i + 1]) == 1: chainPresent = True except IndexError: pass if chainPresent: in_file_chain = (in_file, args[i + 1]) i += 2 else: in_file_chain = (in_file, 'A') i += 1 in_file_list.append(in_file_chain) # The last file in the list is the reference PDB try: referencePDB = in_file_list[-1][0] except IndexError: sys.exit('ERROR: No reference PDB file specified.\n' + usage) referenceChain = in_file_list[-1][1] # Check existence of reference PDB file, make sure it's not empty. # Empty PDB files result in an obscure error from PDB parser. try: size = os.stat(referencePDB)[6] except OSError: sys.exit('ERROR: Unable to open reference PDB file ' + referencePDB + '\n' + usage) if not size: sys.exit('ERROR: Reference PDB file ' + referencePDB + ' is empty.\n' + usage) # Iterate through the source files and run alignment function for each if len(in_file_list[:-1]) < 1: sys.exit('ERROR: No source PDB files specified.\n' + usage) for sourcePDB, sourceChain in in_file_list[:-1]: if sourcePDB != in_file_list[0][0]: # i.e. if not on first file print # Add blank line for nicer output # Check existence of source PDB file, make sure it's not empty. # Empty PDB files result in an obscure error from PDB parser. try: size = os.stat(sourcePDB)[6] except OSError: #print 'Warning: Ignoring source PDB file ' + sourcePDB + '; Could not be opened.' continue if not size: print 'Warning: Ignoring source PDB file ' + sourcePDB + '; File is empty.' continue run_Global_wRMSD(sourcePDB, referencePDB, sourceChain, referenceChain, only_standard)