# Bailey 7/21/18. Script written for the BMC Bioinformatics redo to match the
# randomly chosen balanced examples (exact examples) present in the
# feature-selected datasets to the full-feature/full-example dataset.
# This is for creating the new balanced/feature-selected dataset with new
# feature selection (meanDecreaseAccuracy) that also maintains the same
# balance as the meanDecreaseGini dataset in the manuscript.
# Edited 7/22/18 to make the example order match the order of the original
# file. This was not done for the "outOfOrder" RF.
#
# Example usage:
#   nohup python matchBalancedExamps.py sep6_featSelected.txt \
#       original/allFullFeatureExamples.txt sep6_fullFeat.txt &> out1.txt &

import os
import sys


def _parse_tag(line):
    """Return the trailing ``#`` comment (tag) of a raw line, or None.

    The tag runs from the first ``#`` to the end of the line. The line
    terminator is removed so the same example matches whether a file uses
    LF or CRLF endings, or lacks a final newline. A usable tag must have a
    second whitespace-separated field (the protein id); otherwise None is
    returned.
    """
    start = line.find(b"#")
    if start < 0:
        return None
    tag = line[start:].rstrip(b"\r\n")
    if len(tag.split()) < 2:
        return None
    return tag


def match_balanced_examples(balanced_path, big_path, out_path):
    """Copy the big-file lines whose tags appear in the balanced file.

    Lines are written to *out_path* in the order their tags occur in the
    balanced file. If a tag occurs more than once in the big file, the last
    occurrence is the one copied.

    Args:
        balanced_path: the original balanced file (smaller).
        big_path: the file holding the full examples list (bigger).
        out_path: the file to create with the matched full lines.

    Returns:
        The number of lines written.

    Raises:
        ValueError: if a non-blank balanced-file line has no usable tag, or
            if any balanced tag is absent from the big file. In both cases
            the output file is not created.
    """
    ordered_tags = []  # tags in balanced-file order, for ordered output
    offsets = {}       # tag -> byte offset in the big file (None = not found)

    # Files are read as bytes so that accumulated line lengths are exact
    # byte offsets that can be passed to seek().
    with open(balanced_path, "rb") as balanced_file:
        for line_no, line in enumerate(balanced_file, 1):
            if not line.strip():
                continue  # blank lines carry no example
            tag = _parse_tag(line)
            if tag is None:
                raise ValueError(
                    "%s line %d has no '# <protein id> ...' tag"
                    % (balanced_path, line_no)
                )
            ordered_tags.append(tag)
            offsets[tag] = None
    print("finished building dictionary")

    with open(big_path, "rb") as big_file:
        # Record where each wanted tag lives in the big file.
        offset = 0
        for line in big_file:
            tag = _parse_tag(line)
            if tag in offsets:
                offsets[tag] = offset
            offset += len(line)

        # None (rather than 0) marks "not found", so a genuine match on the
        # first line of the big file is not confused with a missing example.
        missing = [tag for tag in ordered_tags if offsets[tag] is None]
        if missing:
            raise ValueError(
                "%d balanced example(s) not found in %s; first missing tag: %s"
                % (len(missing), big_path,
                   missing[0].decode("utf-8", "replace"))
            )

        # The output is only created once every example has been located,
        # so a failed run does not leave a partial file behind.
        with open(out_path, "wb") as out_file:
            for tag in ordered_tags:
                big_file.seek(offsets[tag])
                matched_line = big_file.readline()
                if not matched_line.endswith(b"\n"):
                    # Last line of the big file lacked a newline; add one so
                    # it cannot merge with the next record written.
                    matched_line += b"\n"
                out_file.write(matched_line)

    return len(ordered_tags)


def main(argv=None):
    """Command-line entry point: <balancedFile> <bigFile> <outFile>."""
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.exit(
            "usage: python matchBalancedExamps.py "
            "<balancedFile> <bigFile> <outFile>"
        )
    balanced_path, big_path, out_name = args[:3]

    # Never overwrite an existing result.
    if os.path.isfile(out_name):
        print("output file already exists, exiting")
        sys.exit()

    try:
        match_balanced_examples(balanced_path, big_path, out_name)
    except ValueError as err:
        sys.exit("error: %s" % err)


if __name__ == "__main__":
    main()