"""Write a copy of a test file without the proteins that also occur in a training file.

Usage: python <script> TRAIN_FILE TEST_FILE NEW_TEST_FILE

Each data line is expected to carry its protein ID as the second
whitespace-separated token of the text that starts at the first '#'
(for example ``... # P12345 ...``).  Test lines whose ID also appears in the
training file are dropped; every other line is copied unchanged.
"""
import os
import sys


def _protein_id(line):
    """Return the protein ID found on *line*, or None when there is none.

    The ID is token [1] of ``line[line.find('#'):].split()``.  Lines without
    a '#' or with fewer than two tokens after it (blank lines, truncated
    records) have no ID; returning None for them avoids the IndexError the
    unguarded lookup would raise.
    """
    hash_pos = line.find('#')
    if hash_pos < 0:
        return None
    tokens = line[hash_pos:].split()
    if len(tokens) < 2:
        return None
    return tokens[1]


def _read_lines(path):
    """Return all lines of the text file at *path*, closing it afterwards."""
    with open(path, 'r') as handle:
        return handle.readlines()


def main(argv):
    """Run the de-duplication and return the process exit status.

    argv -- full argument vector: [program, train file, test file, new test
            file].  Extra trailing arguments are ignored.

    Returns 0 on success, 1 if the output file already exists, 2 if too few
    arguments were given.  I/O errors on the input files propagate.
    """
    if len(argv) < 4:
        print("usage: python <script> TRAIN_FILE TEST_FILE NEW_TEST_FILE")
        return 2

    train_name = argv[1]  # name of file with training data (bigger)
    test_name = argv[2]   # name of file with test data (smaller)
    new_test = argv[3]    # name of the new test file to be produced

    # Refuse to overwrite an existing result; checked before any work is done.
    if os.path.isfile(new_test):
        print("error: output file already exists")
        return 1

    train_ids = set(_protein_id(line) for line in _read_lines(train_name))
    test_lines = _read_lines(test_name)
    test_ids = set(_protein_id(line) for line in test_lines)

    # Protein IDs present in both files; None marks "line had no ID" and is
    # never treated as a match.
    duplicate_ids = (train_ids & test_ids) - set([None])
    print("removing the matching proteins: " + str(sorted(duplicate_ids)))

    # The output file is created only after both inputs parsed successfully,
    # so a failed run does not leave an empty file that blocks the next run.
    with open(new_test, 'w') as out:
        for line in test_lines:
            if _protein_id(line) in duplicate_ids:
                continue
            # Lines without an ID cannot be duplicates and are kept verbatim.
            out.write(line)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))