#note: this is the version of the evaluation script made on 6/15/16 #it uses the .RR files located in the downloads section of the casp website instead of pdb files import sys import os import math #usage #python evalCasp11Fina.py pdbFile predFile outFileName def main(args): try: tfName = sys.argv[1] pfName = sys.argv[2] proteinName = sys.argv[3] L = sys.argv[4] except: tfName = args[0] pfName = args[1] proteinName = args[2] L = args[3] tFile = open(tfName, 'r') #a file in the pdb format. contains 3D coordinates for the structure. will pick carbon alpha of each residue pFile = open(pfName, 'r') #the .RR file available from casp (the "answer" file) """ if os.path.isfile(outName): sys.exit("output file already exists") else: oFile = open(outName, 'w') """ correct = 0 incorrect = 0 pCount = 1 trueCons = [] #[ [res1, res2], [res1, res2], ... ] #contacts are "binned" into 3 "seq sep" ranges trueSep = [0, 0, 0] #trueContact with sequence separation at >= 6,12,24 truePredSep = [0, 0, 0] #correct pred with sequence separation at >= 6,12,24 falsePredSep = [0, 0, 0] #false pred with sequence separation at >= 6,12,24 #count and bin all the contacts skip = 0 for line in tFile: #the .RR file if skip == 0: skip += 1 continue #skip the first line head = unicode(line[0]) if not (head.isnumeric()): break #end of the list of sequence contacts items = line.split() #[seq sep, res1:res2, ss1:ss2, dist_cb] resPair = items[1].split(':') r1 = int(''.join(i for i in resPair[0] if i.isdigit())) #strip the amino acid code from the residue number. r2 = int(''.join(i for i in resPair[1] if i.isdigit())) dist = items[3] #carbon beta distance from the answer file residueSeparation = int(items[0]) #the sequence separation sep = -1 #a number to indicate which sequence separation level (6,12,24) if residueSeparation >= 6 and residueSeparation < 12: sep = 0 if residueSeparation >= 12 and residueSeparation <24: sep = 1 if residueSeparation >= 24: sep = 2 if float(dist) <= 8: #cb_dist trueSep[sep] += 1 #true prediction at different sep lens trueCons.append([r1, r2]) for line in pFile: if line[0].isalpha(): continue items = line.split() r1 = int(items[0]) #the first and second residues from the prediction file r2 = int(items[1]) residueSeparation = r2 - r1 #the sequence separation sep = -1 #a number to indicate which sequence separation level (6,12,24) if residueSeparation >= 6 and residueSeparation < 12: sep = 0 if residueSeparation >= 12 and residueSeparation < 24: sep = 1 if residueSeparation >= 24: sep = 2 found = 0 for contact in trueCons: if (r2 == contact[0] and r1 == contact[1]) or (r1 == contact[0] and r2 == contact[1]): #prediction is true because found in .RR list truePredSep[sep] += 1 found = 1 if found == 0: falsePredSep[sep] += 1 #prediction is false because not found in .RR list if pCount >= int(L): break pCount += 1 try: acc6 = float(truePredSep[0]) / (truePredSep[0] + falsePredSep[0]) except: acc6 = 0.0 try: acc12 = float(truePredSep[1]) / (truePredSep[1] + falsePredSep[1]) except: acc12 = 0.0 try: acc24 = float(truePredSep[2]) / (truePredSep[2] + falsePredSep[2]) except: acc24 = 0.0 try: cov6 = float(truePredSep[0]) / trueSep[0] except: cov6 = 0.0 try: cov12 = float(truePredSep[1]) / trueSep[1] except: cov12 = 0.0 try: cov24 = float(truePredSep[2]) / trueSep[2] except: cov24 = 0.0 return [acc6, acc12, acc24, cov6, cov12, cov24] if __name__ == '__main__': main(sys.argv)