"""Report the protein IDs that occur in both a training and a test file.

Usage: python <script> TRAIN_FILE TEST_FILE

The ID of a line is the second whitespace-separated token counted from the
first '#' on that line, e.g. ``... # P12345 ...`` yields ``P12345``.
"""
import os
import sys


def _protein_id(line):
    """Return the ID token of *line*, or None when the line has none.

    A line has no ID when it contains no '#', or when fewer than two
    whitespace-separated tokens follow (and include) the '#'.
    """
    marker = line.find('#')
    if marker < 0:
        return None
    tokens = line[marker:].split()
    return tokens[1] if len(tokens) > 1 else None


def _collect_ids(lines):
    """Return the set of IDs found in an iterable of text lines."""
    ids = set(_protein_id(line) for line in lines)
    # Lines without an ID (blank lines, missing '#') are skipped rather than
    # aborting the whole run with an IndexError.
    ids.discard(None)
    return ids


def matching_ids(train_lines, test_lines):
    """Return the IDs present in both inputs as a sorted list.

    Args:
        train_lines: iterable of text lines from the training data.
        test_lines: iterable of text lines from the test data.

    Returns:
        A list of the IDs common to both inputs, sorted so that the result
        is reproducible from run to run.
    """
    return sorted(_collect_ids(test_lines) & _collect_ids(train_lines))


def main(argv=None):
    """Compare the two files named on the command line and print the matches.

    Args:
        argv: argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            training file (bigger), ``argv[2]`` the test file (smaller).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit("usage: %s TRAIN_FILE TEST_FILE" % argv[0])

    # Both files are opened before either is read, and both are closed on
    # exit even if parsing fails.
    with open(argv[1], 'r') as train_file:
        with open(argv[2], 'r') as test_file:
            matches = matching_ids(train_file, test_file)

    # Parenthesised single argument: prints the same text on Python 2 and 3.
    print("matching: " + str(matches))


if __name__ == "__main__":
    main()