# A script for finding the average variable importance of each category of
# features.
# Requires a variable importance file and a variable label file.
# 2/22/17 Joseph Luttrell
# Update 7/22/2018 for bmcbioinformatics redo: the sorting is now decreasing.
# Higher values are more important and were used in the manuscript that way
# too. Before, they were sorted with Excel; the new output is already sorted
# correctly.
#
# Usage:
#   script.py IMPORTANCE_FILE LABEL_FILE OUTPUT_AVG_FILE OUTPUT_INDIVIDUAL_FILE

import os
import sys


def parse_labels(lines):
    """Build a dictionary mapping feature number -> feature label.

    Only lines containing a '*' are used. The feature number is the text
    before the first ':' and the label is everything after the first '*'
    (trailing whitespace is stripped from the line first).

    lines -- an iterable of text lines (e.g. an open label file).
    Returns a dict of {featNum (str): featLabel (str)}.
    """
    labels = {}
    for line in lines:
        line = line.rstrip()
        if '*' not in line:
            continue
        feat_num = line[:line.find(':')]
        labels[feat_num] = line[line.find('*') + 1:]
    return labels


def collect_importances(lines, labels):
    """Collect per-feature importances and per-label running totals.

    A line is treated as a feature line when the second character of its
    first whitespace-separated token is 'f'; the feature number is that
    token without its first two characters and its last character, and the
    importance is the second token parsed as a float. All other lines
    (including blank lines) are skipped.

    lines  -- an iterable of text lines (e.g. an open importance file).
    labels -- dict of {featNum: featLabel} as built by parse_labels().
    Returns (labels_with_imp, importance_sums) where labels_with_imp is a
    list of [featNum, importance, featLabel] in input order and
    importance_sums is a dict of {featLabel: [sum of importance, count]}.
    Raises KeyError if a feature number has no entry in labels.
    """
    labels_with_imp = []
    importance_sums = {}
    for line in lines:
        items = line.split()
        # Guard against blank lines and one-character tokens, which would
        # otherwise raise IndexError on items[0][1].
        if not items or len(items[0]) < 2 or items[0][1] != 'f':
            continue
        feat_num = items[0][2:-1]  # skip the leading quote, the f and the last "
        importance = float(items[1])
        feat_label = labels[feat_num]
        labels_with_imp.append([feat_num, importance, feat_label])
        if feat_label in importance_sums:
            importance_sums[feat_label][0] += importance
            importance_sums[feat_label][1] += 1
        else:
            importance_sums[feat_label] = [importance, 1]
    return labels_with_imp, importance_sums


def average_by_label(importance_sums):
    """Return [[featLabel, average importance], ...] sorted by average, descending.

    importance_sums -- dict of {featLabel: [sum of importance, count]}.
    """
    final_avgs = []
    for label, (total, count) in importance_sums.items():
        # sum of importance / num of features in that category
        final_avgs.append([label, total / count])
    final_avgs.sort(key=lambda x: x[1], reverse=True)
    return final_avgs


def main(argv=None):
    """Run the script.

    argv -- argument list laid out like sys.argv (program name first);
            defaults to sys.argv.
    Exits with status 0 after printing a message if either output file
    already exists, so existing results are never overwritten.
    """
    if argv is None:
        argv = sys.argv
    importance_file_name = argv[1]  # importance of each feature
    label_file_name = argv[2]  # each feature labeled by type of feature
    output_file_name = argv[3]  # output importance sorted average
    output_individual_file_name = argv[4]  # output importance sorted by individual features

    if os.path.isfile(output_file_name) or os.path.isfile(output_individual_file_name):
        print("output file exists: exiting")
        sys.exit(0)

    with open(label_file_name, 'r') as label_file:
        labels = parse_labels(label_file)
    with open(importance_file_name, 'r') as importance_file:
        labels_with_imp, importance_sums = collect_importances(importance_file, labels)

    final_avgs = average_by_label(importance_sums)
    labels_with_imp.sort(key=lambda x: x[1], reverse=True)

    # The output files are only created once all parsing has succeeded, so a
    # failed run does not leave empty files that would block the next run.
    with open(output_file_name, 'w') as out_f:
        for item in final_avgs:
            out_f.write(str(item) + "\n")
    with open(output_individual_file_name, 'w') as out_indiv_f:
        for item in labels_with_imp:
            out_indiv_f.write(str(item[0]) + ' ' + str(item[1]) + ' ' + str(item[2]) + '\n')


if __name__ == "__main__":
    main()