"""Copy the balanced examples out of the full-feature example file.

Bailey 7/21/18. Script written for the BMC Bioinformatics redo. It matches
the randomly chosen balanced examples (the exact examples) present in a
feature-selected dataset against the full-feature/full-example dataset.
This is used to create the new balanced, feature-selected dataset with the
new feature selection (meanDecreaseAccuracy) while keeping the same balance
as the meanDecreaseGini dataset in the manuscript.

Every example line ends in a comment (tag) that starts at the first '#'.
Two lines describe the same example when their tags are identical.

NOTE: this old version doesn't match the order of the examples: matching
lines are written in the order they appear in the full file, not in the
order of the balanced file.

Example usage:
    nohup python matchBalancedExamps.py sep6_featSelected.txt \
        original/allFullFeatureExamples.txt sep6_fullFeat.txt &> out1.txt &
"""

import os
import sys


def extract_tag(line):
    """Return the tag (trailing comment) of an example line.

    The tag is everything from the first '#' to the end of the line. The
    line terminator is removed so that the last line of a file, which may
    lack a newline, still compares equal to the same tag elsewhere.

    Returns None when the line contains no '#'.
    """
    hash_pos = line.find('#')
    if hash_pos == -1:
        # Without this guard line[-1:] would silently yield the last
        # character of the line instead of a tag.
        return None
    return line[hash_pos:].rstrip('\r\n')


def collect_balanced_tags(balanced_lines):
    """Build the set of tags found in the balanced (smaller) file.

    balanced_lines: iterable of text lines (an open file works).

    Blank lines are ignored. A non-blank line without a tag raises
    ValueError, because silently dropping it would change the balance of
    the resulting dataset.
    """
    tags = set()
    for line_no, line in enumerate(balanced_lines, 1):
        if not line.strip():
            continue
        tag = extract_tag(line)
        if tag is None:
            raise ValueError(
                "balanced file line {0} has no '#' tag: {1!r}".format(
                    line_no, line))
        tags.add(tag)
    return tags


def select_matching_lines(big_lines, tags):
    """Yield, unchanged, each line of big_lines whose tag is in tags.

    big_lines: iterable of text lines from the full example file.
    tags: set of tags as returned by collect_balanced_tags.

    Lines without a tag can never match and are skipped.
    """
    for line in big_lines:
        # extract_tag gives None for untagged lines; None is never in tags.
        if extract_tag(line) in tags:
            yield line


def main(argv=None):
    """Run the script and return the process exit status.

    argv: [balanced_file, big_file, out_name]; defaults to sys.argv[1:].
        balanced_file is the original balanced file (smaller), big_file is
        the collection of all training data (or whatever file has the full
        examples list), out_name is the name of the output file.

    The output file is never overwritten: if it already exists a message
    is printed and nothing else happens.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.stderr.write(
            "usage: python matchBalancedExamps.py "
            "<balancedFile> <bigFile> <outName>\n")
        return 2
    balanced_name, big_name, out_name = args[:3]

    # Check before opening anything so no file handle is left open on the
    # early exit. The exit status stays 0 here, as it always has been.
    if os.path.isfile(out_name):
        print("output file already exists, exiting")
        return 0

    # Open both inputs up front so a missing input fails before any work
    # is done and before the output file is created.
    with open(balanced_name, 'r') as balanced_file, \
            open(big_name, 'r') as big_file:
        tags = collect_balanced_tags(balanced_file)
        print("finished building dictionary")
        with open(out_name, 'w') as out_file:
            out_file.writelines(select_matching_lines(big_file, tags))
    return 0


if __name__ == "__main__":
    sys.exit(main())