#!/users/mlerner/software/bin/python
"""
Usage: Prep_for_wRMSD.py [options] PDBFile1 [PDBFile2 [...]]

This program reads any number of PDB files and brings them into the
convention required for appropriate parsing by Biopython. Biopython requires
that all atoms have chain information, occupancy, and temperature factors,
which is not always output by all PDB-generating programs. These three
pieces of information are added to any ATOM or HETATM lines that lack them.

Each input file is read line by line. If occupancy and/or temperature factor
are missing, the values 1.00 and 0.00, respectively, are written.

Chain information is more complicated. The '-c' option is used to specify
any number of chain identifiers (separated by commas), which are then
assigned to the residue ranges specified by the '-r' option. The program
fails if the number of chain IDs is different from the number of residue
ranges. The residue number of each atom is identified, and the chain ID that
corresponds to the first residue range that contains this number is then
used. This number is removed from that range so that later appearances of
this residue number in the file are assigned to a different chain. If the
residue number is not found in any ranges, the default chain ID (specified
by the '-d' option) is assigned. Note that if chain information is actually
present, it is not overwritten.

Notes:
  * The residue numbers may be adjusted by simple addition of an offset,
    using the '-O' option. This is useful for bringing the numbering into a
    convention unique to a particular system, such as BACE1.
  * The format of the names of the output files is set with the '-o' option.
    The input file name (without the extension) will replace a '%s'
    character in this formatting string. If no '%s' character is found, the
    string will be used without modification, which implies that if
    multiple input files are provided and no '%s' character is in the format
    string, the same name will be used for all output files. The user will
    be warned of this outcome. If it is not already present, a '.pdb'
    extension will be automatically appended to the format string.
  * Negative residue numbers are allowed at either end of a residue range
    (e.g. '-10--5').
  * An input file is skipped (with a warning) if the output name resolves to
    the input file itself, so that the input is never overwritten.

Usage: Prep_for_wRMSD.py [options] PDBFile1 [PDBFile2 [...]]

Options:
  -h, --help            show this help message and exit
  -c CHAIN, --chain=CHAIN
                        Comma-separated list of chain identifiers to add.
                        (default: A)
  -r RESIDUES, --residues=RESIDUES
                        Comma-separated list of residue ranges for each
                        chain. If None, the chain will be added to all
                        atoms. (default: none)
  -d DEFAULT, --default=DEFAULT
                        Default chain to add to any residues not found in
                        the list of residue ranges. (default: Z)
  -o OUTPUT_FORMAT, --output-format=OUTPUT_FORMAT
                        Output file name format. A '%s' in this string will
                        be replaced by the name of the input file (without
                        the extension). A '.pdb' extension will
                        automatically be appended. (default: %s_f)
  -O OFFSET, --offset=OFFSET
                        Offset to add to each residue number. (default: 0)
"""

import copy
import os
import re
import sys
from optparse import OptionParser

# Text handed to optparse; '%prog' is expanded by optparse itself.
_USAGE = '''%prog [options] PDBFile1 [PDBFile2 [...]]

This program reads any number of PDB files and brings them into the
convention required for appropriate parsing by Biopython. Biopython requires
that all atoms have chain information, occupancy, and temperature factors,
which is not always output by all PDB-generating programs. These three
pieces of information are added to any ATOM or HETATM lines that lack them.

Each input file is read line by line. If occupancy and/or temperature factor
are missing, the values 1.00 and 0.00, respectively, are written.

Chain information is more complicated. The '-c' option is used to specify
any number of chain identifiers (separated by commas), which are then
assigned to the residue ranges specified by the '-r' option. The program
fails if the number of chain IDs is different from the number of residue
ranges. The residue number of each atom is identified, and the chain ID that
corresponds to the first residue range that contains this number is then
used. This number is removed from that range so that later appearances of
this residue number in the file are assigned to a different chain. If the
residue number is not found in any ranges, the default chain ID (specified
by the '-d' option) is assigned. Note that if chain information is actually
present, it is not overwritten.

Notes:
  * The residue numbers may be adjusted by simple addition of an offset,
    using the '-O' option. This is useful for bringing the numbering into a
    convention unique to a particular system, such as BACE1.
  * The format of the names of the output files is set with the '-o' option.
    The input file name (without the extension) will replace a '%s'
    character in this formatting string. If no '%s' character is found, the
    string will be used without modification, which implies that if
    multiple input files are provided and no '%s' character is in the format
    string, the same name will be used for all output files. The user will
    be warned of this outcome. If it is not already present, a '.pdb'
    extension will be automatically appended to the format string.
  * Negative residue numbers are allowed at either end of a residue range
    (e.g. '-10--5').

Usage: %prog [options] PDBFile1 [PDBFile2 [...]]'''

# Range used when no '-r' option is given, so that the chain applies to
# (practically) every residue number.
_ALL_RESIDUES = '-999-9999'

# A residue range is one integer, or two integers separated by '-'. Either
# integer may carry its own sign, which is why the range cannot simply be
# split on '-'.
_RANGE_RE = re.compile(r'^\s*([+-]?\d+)\s*(?:-\s*([+-]?\d+)\s*)?$')

# Record prefixes of the lines that are rewritten; everything else is copied.
_ATOM_PREFIXES = ('ATOM ', 'HETATM')


def _parse_chain_ids(chain_option):
    """Split the '-c' option value into a list of one-character chain IDs.

    Identifiers longer than one character are truncated to their first
    character, with a warning printed for each.

    Raises:
        ValueError: if any identifier in the list is empty.
    """
    chain_ids = []
    for chain in chain_option.split(','):
        if not chain:
            raise ValueError(
                'ERROR: Empty chain identifier in chain list %s.' % chain_option)
        if len(chain) != 1:
            print('Warning: Chain identifier %s is longer than one character. Will use\n\tonly first character %s.' % (chain, chain[0]))
        chain_ids.append(chain[0])
    return chain_ids


def _parse_residue_range(resis_str):
    """Return the set of residue numbers described by one range string.

    Accepted forms are a single integer ('7') or two integers separated by
    '-' ('3-9', '-10-4', '-10--5'). Both ends are inclusive. A range whose
    second number is smaller than its first yields an empty set.

    Raises:
        ValueError: if the string does not have one of the accepted forms.
    """
    match = _RANGE_RE.match(resis_str)
    if match is None:
        raise ValueError("ERROR: Residue range %s does not have the proper format. Please provide\n\ta comma-separated list of residues ranges (either single residue\n\tintegers or two values separated by a '-')." % resis_str)
    first = int(match.group(1))
    last = first if match.group(2) is None else int(match.group(2))
    return set(range(first, last + 1))


def _output_name(output_format, in_file):
    """Build the output file name for in_file from the '-o' format string.

    The input name without its extension replaces '%s' in the format. A
    format that cannot be applied to a single string (no '%s', or a stray
    '%') is used literally. '.pdb' is appended unless already present.
    """
    stem = os.path.splitext(in_file)[0]
    try:
        out_file = output_format % stem
    except (TypeError, ValueError):
        out_file = output_format
    if not out_file.endswith('.pdb'):
        out_file += '.pdb'
    return out_file


def _convert_lines(lines, chain_list, residue_sets, default_chain, offset):
    """Yield the converted form of each line of one PDB file.

    Lines that are not ATOM/HETATM records are yielded unchanged. For
    ATOM/HETATM records the residue number is shifted by offset, a chain ID
    is filled in when the chain column is blank, and occupancy / temperature
    factor default to 1.00 / 0.00 when they cannot be read.

    Args:
        lines: iterable of input lines (with or without line endings).
        chain_list: chain IDs, parallel to residue_sets.
        residue_sets: one set of residue numbers per chain ID. The sets are
            copied, not modified.
        default_chain: chain ID for residues found in none of the sets.
        offset: integer added to every residue number.
    """
    # Each residue number is consumed from its range on first use, so work
    # on a private copy that starts fresh for every file.
    remaining = copy.deepcopy(residue_sets)
    last_resi_num = None
    last_chain = None
    for line in lines:
        if not line.startswith(_ATOM_PREFIXES):
            # Any non-atom record (e.g. TER) ends the current residue.
            last_resi_num = None
            yield line
            continue

        try:
            resi_num = int(line[22:26]) + offset
        except ValueError:
            print('Warning: Could not read a residue number from the following line;\n\tit is copied unchanged:\n\t%s' % line.rstrip())
            last_resi_num = None
            yield line
            continue

        if resi_num == last_resi_num:
            # Consecutive atoms of the same residue share one chain.
            chain = last_chain
        else:
            chain = default_chain
            for chain_id, resis in zip(chain_list, remaining):
                if resi_num in resis:
                    chain = chain_id
                    resis.remove(resi_num)
                    break
            last_resi_num = resi_num
            last_chain = chain

        # Pad short records so that the fixed-column slices below never pick
        # up the line ending.
        record = line.rstrip('\r\n').ljust(54)

        # Chain information that is already present is never overwritten.
        if record[21] != ' ':
            chain = record[21]

        try:
            occ = float(record[54:60])
        except ValueError:
            occ = 1.
        try:
            bfactor = float(record[60:66])
        except ValueError:
            bfactor = 0.

        # NOTE: anything beyond column 66 (segment ID, element, charge) is
        # not carried over to the output record.
        yield '%s%s%4i%s%6.2f%6.2f\n' % (
            record[:21], chain, resi_num, record[26:54], occ, bfactor)


def _process_file(in_file, out_file, chain_list, residue_sets, default_chain,
                  offset):
    """Convert in_file into out_file; return True if the output was written.

    Problems opening either file produce a warning and a False return value
    rather than an exception, so that the remaining input files are still
    processed.
    """
    try:
        in_f = open(in_file)
    except IOError:
        print('Warning: Unable to open input file %s.' % in_file)
        return False
    with in_f:
        # Opening the output for writing would truncate the input while it
        # is being read.
        if os.path.realpath(in_file) == os.path.realpath(out_file):
            print('Warning: Output file %s is the same as the input file. Skipping.' % out_file)
            return False
        try:
            out_f = open(out_file, 'w')
        except IOError:
            print('Warning: Unable to open output file %s.' % out_file)
            return False
        with out_f:
            out_f.writelines(_convert_lines(
                in_f, chain_list, residue_sets, default_chain, offset))
    return True


def main(argv=None):
    """Parse the command line and convert every input PDB file.

    Args:
        argv: list of command-line arguments without the program name;
            defaults to sys.argv[1:].

    Exits via sys.exit with an error message when no input file is given,
    when a chain ID or residue range is malformed, or when the number of
    chain IDs differs from the number of residue ranges.
    """
    parser = OptionParser(usage=_USAGE)
    parser.add_option('-c', '--chain',
                      dest='chain',
                      default='A',
                      help='Comma-separated list of chain identifiers to add. (default: %default)')
    parser.add_option('-r', '--residues',
                      dest='residues',
                      default=None,
                      help='Comma-separated list of residue ranges for each chain. If None, the chain will be added to all atoms. (default: %default)')
    parser.add_option('-d', '--default',
                      dest='default',
                      default='Z',
                      help='Default chain to add to any residues not found in the list of residue ranges. (default: %default)')
    parser.add_option('-o', '--output-format',
                      dest='output_format',
                      default='%s_f',
                      help="Output file name format. A '%s' in this string will be replaced by the name of the input file (without the extension). A '.pdb' extension will automatically be appended. (default: %default)")
    parser.add_option('-O', '--offset',
                      dest='offset',
                      default=0,
                      type='int',
                      help='Offset to add to each residue number. (default: %default)')

    # Files
    options, in_files = parser.parse_args(argv)
    if not in_files:
        sys.exit('ERROR: No input files provided.')
    if len(in_files) > 1 and '%s' not in options.output_format:
        print('Warning: Multiple input files provided, but output format does not allow\n\tfor different output file names. Only one output file will be written\n\tand overwritten for each input file.')

    # Chain information
    try:
        chain_list = _parse_chain_ids(options.chain)
    except ValueError as err:
        sys.exit(str(err))

    # Residue information
    residues = options.residues
    if residues is None:
        residues = _ALL_RESIDUES
    resis_list = residues.split(',')
    if len(resis_list) != len(chain_list):
        sys.exit('ERROR: Number of chains provided must exactly match the number of\n\tresidue ranges.')
    try:
        residue_sets = [_parse_residue_range(resis) for resis in resis_list]
    except ValueError as err:
        sys.exit(str(err))

    # Read and write altered files
    for in_file in in_files:
        out_file = _output_name(options.output_format, in_file)
        _process_file(in_file, out_file, chain_list, residue_sets,
                      options.default, options.offset)


if __name__ == '__main__':
    main()