import os
import sys

def protein_id(line):
    """Return the protein ID annotated on *line*, or None if it has none.

    The ID is the second whitespace-separated token of the text that starts
    at the first '#' on the line (so ``... # d1abc_ ...`` yields ``d1abc_``).
    Lines without a '#', or with nothing after the first token, carry no ID.
    """
    hash_pos = line.find('#')
    if hash_pos == -1:
        # str.find() signals a miss with -1; slicing from -1 would silently
        # keep just the last character, so bail out explicitly instead.
        return None
    tokens = line[hash_pos:].split()
    if len(tokens) < 2:
        return None
    return tokens[1]


def collect_ids(lines):
    """Return the set of protein IDs found in an iterable of lines.

    Lines for which protein_id() finds no ID (blank lines, lines without a
    '#' annotation) are skipped rather than aborting the whole run.
    """
    ids = set()
    for line in lines:
        pid = protein_id(line)
        if pid is not None:
            ids.add(pid)
    return ids


def matching_ids(test_ids, train_ids):
    """Return a list of the IDs from *test_ids* that also occur in *train_ids*.

    The result follows the iteration order of *test_ids*.
    """
    return [pid for pid in test_ids if pid in train_ids]


def read_lines(path):
    """Return all lines of the text file at *path*, closing it afterwards."""
    with open(path, 'r') as handle:
        return list(handle)


# Only run the comparison when executed as a script, so the helpers above can
# be imported without touching sys.argv or the filesystem.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("usage: %s TRAIN_FILE TEST_FILE" % sys.argv[0])

    trainName = sys.argv[1]  # name of file with training data (bigger)
    testName = sys.argv[2]  # name of file with test data (smaller)

    traL = read_lines(trainName)  # lines in the training file
    traIds = collect_ids(traL)  # protein IDs in the training file
    tesL = read_lines(testName)  # lines in the test file
    tesIds = collect_ids(tesL)  # protein IDs in the test file

    # find which protein IDs match between the two files
    matchIds = matching_ids(tesIds, traIds)

    # Single-argument parenthesised form prints identically on Python 2 and 3.
    print("matching: " + str(matchIds))