###
#Chooses a random 50% of the input file's negative and positive classes.
#Modified 8/4/18 to automatically handle the sample size (made for the unbalanced p1, p2, etc. files from sda unbalanced).
#NOTE: This version maintains the unbalance ratio and samples neg/pos independently. It will also drop one example from
#pos and/or neg if that class's count is not evenly divisible by 2.

#Bailey 7/19/2018
#written for bmc bioinformatics redo
#generates the 50% off data for the bmc bioinformatics paper (maintains balance but 50% less input data)
import random
import os
import sys

####
#Command-line arguments:
#  argv[1] - input feature file to be resampled (output will have 50% of the
#            examples, possibly missing 1-2 when a class has an odd count)
#  argv[2] - output feature file name (must not already exist)
####


def split_by_class(lines):
    """Split feature lines into ``(pos, neg)`` lists.

    A line is treated as positive when its first character is '1'; every
    other line is treated as negative. The relative order of the lines is
    preserved within each returned list.

    lines -- any iterable of strings (e.g. an open text file)
    """
    pos = []
    neg = []
    for line in lines:
        if line.startswith('1'):
            pos.append(line)
        else:
            neg.append(line)
    return pos, neg


def sample_half(examples):
    """Return a random half of ``examples``, sampled without replacement.

    The sample size is ``len(examples) // 2``, so an odd-sized input loses
    one extra example. An empty or single-element input yields ``[]``.
    """
    # Floor division keeps the size an int: plain "/" produces a float under
    # Python 3, which random.sample() rejects.
    return random.sample(examples, len(examples) // 2)


def main(argv):
    """Downsample the feature file named by ``argv[1]`` into ``argv[2]``.

    Positives and negatives are sampled independently so the class ratio of
    the input is maintained. Exits with an error message when arguments are
    missing or when the output file already exists.
    """
    if len(argv) < 3:
        sys.exit("usage: <script> <input feature file> <output feature file>")
    feature_file = argv[1]
    out_name = argv[2]

    # An existing output file is never overwritten. Fail up front with a
    # clear message instead of crashing later on a missing file handle.
    if os.path.isfile(out_name):
        sys.exit("output file already exists, refusing to overwrite: %s" % out_name)

    # Read the input before creating the output, so a missing or unreadable
    # input does not leave an empty output file behind.
    with open(feature_file) as feats:
        pos, neg = split_by_class(feats)

    # Negatives are sampled before positives so a seeded random generator
    # reproduces the same selections as earlier versions of this script.
    rand_neg = sample_half(neg)
    rand_pos = sample_half(pos)

    # Output layout: all sampled positives first, then all sampled negatives.
    with open(out_name, 'w') as out_file:
        out_file.writelines(rand_pos)
        out_file.writelines(rand_neg)

    print ("finished")


if __name__ == "__main__":
    main(sys.argv)