#This version matches each positive example to a random negative example from the SAME protein.
#Previous versions just selected all positive examples and matched them to a random sample drawn from the
#entire population of negatives.

import random
import sys
import os
from collections import defaultdict

# Usage: <script> INPUT OUTPUT
#
# INPUT holds one example per line. A line starting with '1' is a positive
# example, a line starting with '-' is a negative example, and anything else is
# ignored. Each example carries its protein id as the token following a '#'
# marker, e.g. "-1 1:0.3 2:0.7 # P12345 ...".
#
# OUTPUT receives every positive example immediately followed by one randomly
# chosen, not previously used negative example of the same protein.
#
# All files are handled as bytes so that the recorded byte offsets stay valid
# and line endings are copied through untouched; the code runs unchanged on
# Python 2 and Python 3.


def parse_protein_id(line):
    """Return the protein id of an example line, or None if it has none.

    The id is the second whitespace-separated token of the text that starts
    at the first '#' on the line (the first token being the '#' itself when
    it is followed by whitespace).
    """
    marker = line.find(b'#')
    if marker < 0:
        return None
    tokens = line[marker:].split()
    if len(tokens) < 2:
        return None
    return tokens[1]


def _with_newline(line):
    """Return line terminated by a newline, so a last line lacking one
    cannot be glued to the line written after it."""
    return line if line.endswith(b'\n') else line + b'\n'


def index_negatives(in_name):
    """Map each protein id to the byte offsets of its negative examples.

    Raises ValueError if a negative example carries no protein id.
    """
    neg_offsets = defaultdict(list)
    offset = 0
    with open(in_name, 'rb') as data:
        for line_no, line in enumerate(data, 1):
            if line.startswith(b'-'):
                prot_id = parse_protein_id(line)
                if prot_id is None:
                    raise ValueError(
                        "line %d: negative example without a protein id"
                        % line_no)
                neg_offsets[prot_id].append(offset)
            offset += len(line)
    return neg_offsets


def match_examples(in_name, out_name, rng=random):
    """Write each positive example followed by a same-protein negative.

    The negative is drawn uniformly at random (via rng.randrange) from the
    negatives of that protein that have not been used yet. A positive whose
    protein has no unused negative left is skipped, with a warning on stderr,
    so the output always consists of complete positive/negative pairs.

    Returns a tuple (pairs_written, positives_skipped).
    Raises ValueError if a positive or negative example has no protein id.
    """
    # Index first: if the input is unreadable or malformed we fail before the
    # output file has been created.
    neg_offsets = index_negatives(in_name)
    written = 0
    skipped = 0
    # Two handles on the input: one streams the positives in order, the other
    # jumps around to fetch the chosen negatives.
    with open(in_name, 'rb') as examples, \
            open(in_name, 'rb') as lookup, \
            open(out_name, 'wb') as out:
        for line_no, line in enumerate(examples, 1):
            if not line.startswith(b'1'):
                continue  # only positives need an id and a partner
            prot_id = parse_protein_id(line)
            if prot_id is None:
                raise ValueError(
                    "line %d: positive example without a protein id" % line_no)
            # .get() avoids inserting empty lists into the defaultdict.
            candidates = neg_offsets.get(prot_id)
            if not candidates:
                sys.stderr.write(
                    "line %d: no unused negative left for protein %r; "
                    "positive skipped\n" % (line_no, prot_id))
                skipped += 1
                continue
            # pop() removes the pick so the same negative is never reused.
            pick = rng.randrange(len(candidates))
            lookup.seek(candidates.pop(pick))
            out.write(_with_newline(line))
            out.write(_with_newline(lookup.readline()))
            written += 1
    return written, skipped


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.stderr.write("usage: %s INPUT OUTPUT\n"
                         % (argv[0] if argv else "script"))
        return 2
    in_name, out_name = argv[1], argv[2]
    # Never overwrite an existing result file.
    if os.path.isfile(out_name):
        print("output file exists already")
        return 1
    match_examples(in_name, out_name)
    return 0


if __name__ == "__main__":
    sys.exit(main())