"""Pair every positive example with a random negative from the same protein.

This version matches each positive example (a line whose first character is
``1``) to a randomly chosen negative example (a line whose first character is
``-``) from the SAME protein.  Previous versions just selected all positive
examples and matched them to a sample of the entire population of negatives.

The protein id of a line is the second whitespace-separated token of the text
that begins at the line's first ``#`` (``P1`` for a line ending ``# P1``).

Usage: python <script> INPUT OUTPUT
The script refuses to run when OUTPUT already exists.
"""
import os
import random
import sys
from collections import defaultdict


def _protein_id(line):
    """Return the protein id found after the first ``#`` in *line*, or None.

    *line* is a bytes object.  None is returned when the line has no ``#`` or
    when fewer than two tokens follow it.
    """
    hash_at = line.find(b"#")
    if hash_at == -1:
        return None
    tokens = line[hash_at:].split()
    return tokens[1] if len(tokens) > 1 else None


def _with_newline(line):
    """Return *line* terminated by a newline.

    Only the last line of a file can lack one; without this, copying it into
    the middle of the output would fuse it with the following record.
    """
    return line if line.endswith(b"\n") else line + b"\n"


def _index_negatives(in_path):
    """Map each protein id to the byte offsets of its negative lines.

    The file is read in binary mode so that the accumulated line lengths are
    real byte offsets that can later be passed to ``seek``.  Negative lines
    without a protein id are ignored because they cannot be matched.
    """
    negatives = defaultdict(list)
    offset = 0
    with open(in_path, "rb") as src:
        for line in src:
            if line.startswith(b"-"):
                protein = _protein_id(line)
                if protein is not None:
                    negatives[protein].append(offset)
            offset += len(line)
    return negatives


def match_negatives(in_path, out_path):
    """Write each positive of *in_path* plus one same-protein negative.

    Positives are written to *out_path* in input order, each immediately
    followed by a negative drawn uniformly at random from the not yet used
    negatives of the same protein.  A negative is never used twice.

    When no negative is left for a positive (or the positive carries no
    protein id) the positive is still written, a warning goes to stderr, and
    the positive is counted as unmatched.

    Returns the number of unmatched positives.
    Raises IOError/OSError when a file cannot be opened.
    """
    # Index first: a missing or unreadable input then fails before the output
    # file is created.
    negatives = _index_negatives(in_path)
    unmatched = 0

    # Two handles on the input: one is iterated sequentially, the other is
    # repositioned with seek() to fetch the chosen negative lines.
    with open(in_path, "rb") as sequential, \
            open(in_path, "rb") as lookup, \
            open(out_path, "wb") as out:
        for line_number, line in enumerate(sequential, 1):
            if not line.startswith(b"1"):
                continue
            out.write(_with_newline(line))

            # .get() avoids inserting empty lists into the defaultdict.
            candidates = negatives.get(_protein_id(line))
            if not candidates:
                unmatched += 1
                sys.stderr.write(
                    "warning: no unused negative for the positive on line %d\n"
                    % line_number
                )
                continue

            # pop() removes the choice so the same negative isn't chosen again.
            chosen_offset = candidates.pop(random.randrange(len(candidates)))
            lookup.seek(chosen_offset)
            out.write(_with_newline(lookup.readline()))
    return unmatched


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv``.

    Exits with status 2 on missing arguments and with status 1 (after
    printing "output file exists already") when the output file exists.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.stderr.write("usage: python <script> INPUT OUTPUT\n")
        sys.exit(2)
    in_name, out_name = argv[1], argv[2]

    if os.path.isfile(out_name):
        print("output file exists already")
        sys.exit(1)

    match_negatives(in_name, out_name)


if __name__ == "__main__":
    main()