#Bailey 7/21/18. Script written for the bmc bioinformatics redo to match the randomly chosen balanced examples (exact examples) present in the
#feature selected datasets to the full feature/full example dataset.
#this is for creating the new balanced/feature selected dataset with new feature selection (meanDecreaseAccuracy) that also
#maintains the same balance as the meanDecreaseGini dataset in the manuscript 
#edited 7/22/18 to make the example order match the order of the original file. This was not done for the "outOfOrder" RF. 

#example usage: nohup python matchBalancedExamps.py sep6_featSelected.txt original/allFullFeatureExamples.txt sep6_fullFeat.txt &> out1.txt &


import os
import sys

def _extract_tag(line):
    """Return the trailing ``#`` comment (tag) of a raw line, or None.

    ``line`` is a bytes line as read from a file opened in binary mode.
    The tag runs from the first ``#`` to the end of the line with the line
    terminator removed, so the same example still matches when it is the
    last, newline-less line of a file or when the two files differ in
    LF/CRLF endings. Returns None when the line contains no ``#``.
    """
    start = line.find(b"#")
    if start < 0:
        return None
    return line[start:].rstrip(b"\r\n")


def match_balanced_examples(balanced_path, big_path, out_path):
    """Copy the big-file lines whose tag appears in the balanced file.

    The output keeps the order (and any repetition) of the balanced file,
    but every written line is the full line taken from the big file.

    Args:
        balanced_path: path of the balanced (smaller) example file.
        big_path: path of the file holding the full list of examples.
        out_path: path of the file to write; it is created or truncated
            here, so callers that must not clobber should check first.

    Returns:
        A ``(written, missing)`` tuple: the number of lines written and the
        number of balanced examples that had no match in the big file.
    """
    # Tags of the balanced examples, in the order they appear in that file.
    ordered_tags = []
    with open(balanced_path, "rb") as balanced_file:
        for line_no, line in enumerate(balanced_file, 1):
            tag = _extract_tag(line)
            if tag is None:
                if line.strip():  # blank lines are skipped silently
                    sys.stderr.write(
                        "warning: no '#' tag on line %d of %s, skipping\n"
                        % (line_no, balanced_path)
                    )
                continue
            ordered_tags.append(tag)

    # tag -> byte offset of the matching line in the big file. None means
    # "not found"; offset 0 is a real position (the first line), so it must
    # not double as the sentinel. A flat dict is enough: the protein id is
    # derived from the tag, so keying on the tag alone gives the same matches.
    offsets = dict.fromkeys(ordered_tags)
    print("finished building dictionary")

    written = 0
    missing = 0
    # Binary mode makes len(line) a byte count, which is what seek() needs.
    with open(big_path, "rb") as big_file:
        offset = 0
        for line in big_file:
            tag = _extract_tag(line)
            if tag is not None and tag in offsets:
                # If a tag occurs more than once, the last occurrence wins.
                offsets[tag] = offset
            offset += len(line)

        # The output is only created once the scan has succeeded, so a
        # failure above does not leave an empty file that blocks a re-run.
        with open(out_path, "wb") as out_file:
            for tag in ordered_tags:
                position = offsets[tag]
                if position is None:
                    missing += 1
                    sys.stderr.write(
                        "warning: example %r not found in %s, skipping\n"
                        % (tag, big_path)
                    )
                    continue
                big_file.seek(position)
                matched_line = big_file.readline()
                if not matched_line.endswith(b"\n"):
                    # Last line of the big file had no terminator; add one
                    # so it is not fused with the next written line.
                    matched_line += b"\n"
                out_file.write(matched_line)
                written += 1

    if missing:
        sys.stderr.write(
            "warning: %d balanced example(s) had no match in %s\n"
            % (missing, big_path)
        )
    return written, missing


def main(argv=None):
    """Command-line entry point.

    Usage: ``matchBalancedExamps.py BALANCED_FILE BIG_FILE OUT_FILE``.
    Extra arguments are ignored. Returns the process exit status: 2 when
    arguments are missing, otherwise 0 (including when the output file
    already exists and nothing is written).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.stderr.write(
            "usage: %s BALANCED_FILE BIG_FILE OUT_FILE\n" % argv[0]
        )
        return 2

    balanced_path, big_path, out_path = argv[1:4]
    if os.path.isfile(out_path):
        # Never overwrite the result of a previous run.
        print("output file already exists, exiting")
        return 0

    match_balanced_examples(balanced_path, big_path, out_path)
    return 0


if __name__ == "__main__":
    sys.exit(main())