#NOTE: this old version doesn't match the order of the examples: output lines are written in the
#order they appear in the full file, not in the order of the balanced file.
#Bailey 7/21/18. Script written for the BMC Bioinformatics redo. It matches the randomly chosen balanced
#examples (the exact examples) present in the feature-selected datasets to the full-feature/full-example dataset.
#This is for creating the new balanced/feature-selected dataset with the new feature selection
#(meanDecreaseAccuracy) that also maintains the same balance as the meanDecreaseGini dataset in the manuscript.
#example usage: nohup python matchBalancedExamps.py sep6_featSelected.txt original/allFullFeatureExamples.txt sep6_fullFeat.txt &> out1.txt &


import os
import sys

def parse_tag(line):
    """Return the comment (tag) of an example line, or None if it has none.

    The tag is everything from the first '#' to the end of the line, with the
    line terminator removed so that a final line without a trailing newline
    still compares equal to the same example elsewhere. A usable tag must
    contain at least two whitespace-separated tokens (the second one is the
    protein ID); blank lines and lines without such a tag yield None instead
    of raising IndexError.
    """
    start = line.find('#')
    if start == -1:
        return None
    tag = line[start:].rstrip('\r\n')
    if len(tag.split()) < 2:
        return None
    return tag


def load_balanced_tags(lines):
    """Collect the set of tags found in the balanced (smaller) file.

    lines: iterable of text lines (an open file works).
    Returns a set of tag strings. The protein ID is part of the tag itself,
    so a single set of tags selects exactly the same lines as a per-ID index
    while giving constant-time membership tests.

    Non-blank lines without a usable tag are reported on stderr, because the
    example they describe cannot be matched in the full file.
    """
    wanted = set()
    for number, line in enumerate(lines, 1):
        tag = parse_tag(line)
        if tag is not None:
            wanted.add(tag)
        elif line.strip():
            sys.stderr.write(
                "warning: balanced file line %d has no tag, skipped\n" % number)
    return wanted


def iter_matching_lines(lines, wanted_tags):
    """Yield, in input order, each line whose tag is in wanted_tags.

    lines: iterable of text lines from the full example file.
    wanted_tags: set of tags as returned by load_balanced_tags.
    Lines are yielded unmodified. Lines without a usable tag can never match
    a balanced example and are skipped.
    """
    for line in lines:
        tag = parse_tag(line)
        if tag is not None and tag in wanted_tags:
            yield line


def main(argv):
    """Write the lines of the full file that match the balanced examples.

    argv: [program, balancedFile, fullFile, outFile], i.e. sys.argv.
    Refuses to overwrite an existing output file: prints a message and
    returns without opening any input file.
    """
    if len(argv) != 4:
        sys.exit("usage: python matchBalancedExamps.py "
                 "<balancedFile> <fullFile> <outFile>")
    balanced_path, big_path, out_path = argv[1:4]

    # Check before opening anything so no handle is left open on early exit.
    if os.path.isfile(out_path):
        print("output file already exists, exiting")
        return

    # build the set of the comments (tags) that are from the balanced examples
    with open(balanced_path, 'r') as balanced_file:
        wanted_tags = load_balanced_tags(balanced_file)
    print("finished building dictionary")

    # read through the big file and output the lines whose tag is in the set;
    # the context manager closes both files even if an error occurs
    with open(big_path, 'r') as big_file, open(out_path, 'w') as out_file:
        out_file.writelines(iter_matching_lines(big_file, wanted_tags))


if __name__ == "__main__":
    main(sys.argv)