import os
import sys

def _protein_id(line):
    """Return the protein ID carried by a data line, or None if it has none.

    The ID is the second whitespace-separated token of the text that starts
    at the first '#' in the line (e.g. ``... # P12345 ...`` -> ``P12345``).
    Lines without a '#', or with fewer than two tokens from the '#' onward
    (blank lines, for instance), yield None instead of raising IndexError.
    """
    hash_pos = line.find('#')
    if hash_pos < 0:
        return None
    tokens = line[hash_pos:].split()
    if len(tokens) < 2:
        return None
    return tokens[1]


def _read_lines(path):
    """Return all lines of the file at *path*, closing the file afterwards."""
    with open(path, 'r') as handle:
        return handle.readlines()


def main(argv):
    """Write a copy of the test file without proteins found in the training file.

    argv[1] -- name of the file with training data (bigger)
    argv[2] -- name of the file with test data (smaller)
    argv[3] -- name of the new test file to produce, containing only the
               test lines whose protein ID does not occur in the training file

    Returns the process exit status: 0 on success, 1 if the output file
    already exists (it is never overwritten), 2 if arguments are missing.
    Lines that carry no protein ID are copied to the output unchanged.
    """
    if len(argv) < 4:
        prog = os.path.basename(argv[0]) if argv else "script"
        sys.stderr.write(
            "usage: %s TRAIN_FILE TEST_FILE NEW_TEST_FILE\n" % prog)
        return 2
    train_name, test_name, new_test = argv[1:4]

    # Refuse to clobber an existing file, and do so before any other work.
    if os.path.isfile(new_test):
        print("error: output file already exists")
        return 1

    train_ids = set(_protein_id(line) for line in _read_lines(train_name))
    train_ids.discard(None)  # ID-less lines must never count as a match

    test_lines = _read_lines(test_name)
    test_ids = set(_protein_id(line) for line in test_lines)

    # Sorted so that the report is deterministic from run to run.
    match_ids = sorted(test_ids & train_ids)
    print("removing the matching proteins: " + str(match_ids))

    # A test line is dropped exactly when its ID also occurs in the training
    # set; testing against the set keeps each lookup O(1).
    kept = [line for line in test_lines
            if _protein_id(line) not in train_ids]

    # The output file is only created once all parsing has succeeded, so a
    # failure above cannot leave an empty file that blocks the next run.
    with open(new_test, 'w') as out:
        out.writelines(kept)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))