"""Randomly keep 50% of a feature file's positive and negative examples.

Positive and negative examples are sampled independently, so the class
(im)balance ratio of the input is maintained.  If a class has an odd number
of examples, one example of that class is dropped (floor division by 2).

Usage:
    python <this script> <input feature file> <output feature file>

A line is treated as a positive example when its first character is '1';
every other non-blank line is treated as a negative example.

History (from the original header):
    Bailey 7/19/2018 - written for the BMC Bioinformatics redo; generates
        the "50% off" data for the paper (50% less input data).
    Modified 8/4/18 to automatically handle the sample size (made for the
        unbalanced p1, p2, etc. files from "sda unbalanced").
"""

import os
import random
import sys


def split_by_class(lines):
    """Partition example lines into (positives, negatives).

    Args:
        lines: iterable of text lines, e.g. an open feature file.

    Returns:
        A ``(pos, neg)`` tuple of lists.  Each kept line ends with a newline.
    """
    pos = []
    neg = []
    for line in lines:
        # A blank line carries no example; counting it would skew the
        # negative class and could emit empty rows in the output.
        if not line.strip():
            continue
        # The final line of a file may lack a newline.  Add one so that the
        # line cannot fuse with the next one when written mid-file.
        if not line.endswith("\n"):
            line += "\n"
        if line[0] == "1":
            pos.append(line)
        else:
            neg.append(line)
    return pos, neg


def sample_half(examples):
    """Return a random sample containing half (rounded down) of *examples*."""
    # Floor division keeps the sample size an int on Python 3 as well;
    # random.sample() rejects a float sample size.
    return random.sample(examples, len(examples) // 2)


def main(argv=None):
    """Downsample the input feature file into a new output file.

    Args:
        argv: argument vector laid out like ``sys.argv`` (program name,
            input path, output path).  Defaults to ``sys.argv``.

    Returns:
        0 on success, 1 if the output file already exists (it is never
        overwritten), 2 if the arguments are missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.stderr.write(
            "usage: %s <input feature file> <output feature file>\n"
            % (argv[0] if argv else "downsample")
        )
        return 2

    feature_file = argv[1]  # input feature file to be resampled
    out_name = argv[2]      # output feature file name

    # Never clobber an existing output file; fail clearly instead.
    if os.path.isfile(out_name):
        sys.stderr.write(
            "output file already exists, not overwriting: %s\n" % out_name
        )
        return 1

    with open(feature_file) as feats:
        pos, neg = split_by_class(feats)

    rand_pos = sample_half(pos)
    rand_neg = sample_half(neg)

    # Open the output only after sampling succeeded, so a failure while
    # reading the input does not leave an empty output file behind.
    with open(out_name, "w") as out_file:
        out_file.writelines(rand_pos)  # positives first, then negatives
        out_file.writelines(rand_neg)

    print("finished")
    return 0


if __name__ == "__main__":
    sys.exit(main())