###
# Bailey 7/19/2018
# Written for the BMC Bioinformatics redo.
# Generates the "50% off" data for the BMC Bioinformatics paper
# (maintains balance but 50% less input data).
###
import random
import os

####
# change these values
fName = "sep24_featSelected_50PercentOff.txt"  # output feature file name
featureFile = "sep24_featSelected.txt"  # input feature file name
# The output file will have 2 times this number of examples (equal pos/neg);
# make sure there are enough lines of each class in the input file.
sampSize = 185900
####


def split_by_label(lines):
    """Partition feature lines into (positives, negatives).

    A line is positive when its first character is '1'; every other line
    (including a blank one) is treated as negative, as in the original script.
    Each returned line is guaranteed to end with a newline so that records
    cannot run together once they are shuffled into the output.
    """
    positives = []
    negatives = []
    for line in lines:
        # Only the last line of a file can lack a terminator; normalise it.
        if not line.endswith("\n"):
            line += "\n"
        if line.startswith("1"):
            positives.append(line)
        else:
            negatives.append(line)
    return positives, negatives


def write_balanced_subsample(feature_path, out_path, samp_size):
    """Write samp_size random negative lines, then samp_size random positive
    lines, from feature_path into a new file at out_path.

    Raises:
        FileExistsError: out_path already exists (it is never overwritten).
        ValueError: either class has fewer than samp_size lines.
    """
    # Fail fast, before reading a potentially large input file.
    if os.path.isfile(out_path):
        raise FileExistsError(
            "output file already exists, refusing to overwrite: " + out_path
        )

    with open(feature_path) as feats:
        positives, negatives = split_by_label(feats)

    if len(negatives) < samp_size or len(positives) < samp_size:
        raise ValueError(
            "need %d examples per class but found %d positive and %d negative"
            % (samp_size, len(positives), len(negatives))
        )

    # Sample before creating the output so that a failure leaves no empty
    # file behind. Negatives are drawn first, matching the original order.
    neg_sample = random.sample(negatives, samp_size)
    pos_sample = random.sample(positives, samp_size)

    # Mode 'x' keeps the never-overwrite guarantee even if the file appeared
    # after the check above; the context manager closes it on any error.
    with open(out_path, "x") as out:
        out.writelines(neg_sample)
        out.writelines(pos_sample)


def main():
    """Build the balanced, down-sampled feature file from the settings above."""
    write_balanced_subsample(featureFile, fName, sampSize)
    print("finished")


if __name__ == "__main__":
    main()