#Program that takes a feature file in the SVM format and removes from each example every feature
#that is not in a given list of features to retain. The retained features are renumbered sequentially from 1.

import os
import sys

#user editable variables ------------------------
#input feature file; each line is parsed as a target value followed by number:value pairs,
#with anything from the first '#' onward treated as a comment and copied through.
originalFeatFileName = 'sep6_fullFeat.txt'
#output file for the filtered examples; the script exits without writing if this file already exists.
outputFileName = 'sep6_featSelected_accVarimp.txt'
#select list: a list of feature numbers (the number label of each feature (before the value) in the number:value pair as it appears in the feature file.)
#for example, the pair 2480:0.03 has a feature number of 2480.
#entries must be strings: they are compared against the text found before the ':' in each pair.
selectList = ['1138', '302', '1420', '1172', '1002', '1070', '968', '1036', '1104', '934', '1386', '1318', '1454', '292', '1216', '1308', '1284', '1148', '1182', '1274', '1488', '1206', '1522', '1250', '1342', '1381', '1211', '166', '1240', '1410', '900', '2088', '334', '134', '2218', '1352', '1658', '2130', '1218', '1279', '2056', '1114', '1228', '1556', '260', '1483', '1415', '102', '1262', '1245', '1150', '1116', '1376', '866', '1449', '1330', '1347', '324', '2162', '1286', '1082', '1444', '1252', '1222', '1478', '1296', '304', '1012', '1290', '1046', '1188', '1888', '1107', '1048', '1174', '2098', '1922', '1288', '2120', '1624', '944', '1955', '1324', '798', '2066', '1038', '1313', '1517', '2070', '1653', '1956', '2238', '1084', '1160', '1432', '261', '2121', '1140', '1760', '2250'] #top100 singleimportance meandecreaseaccuracy features
#------------------------------------------------

def filterFeatureLine(line, selected):
    """Return one feature-file line with only the selected features kept.

    The line is expected to look like ``<target> <num>:<value> ... # comment``.
    Features whose number (the text before the first ':') is in `selected`
    are kept in their original order and renumbered 1, 2, 3, ...; all other
    features are dropped. Everything from the first '#' onward (or, when
    there is no comment, the line terminator) is copied through unchanged.

    Args:
        line: one raw line of the feature file, with or without a trailing
            newline and with or without a '#' comment.
        selected: container of feature numbers to keep, given as strings.

    Returns:
        The filtered line. Blank lines and comment-only lines carry no
        example and are returned unchanged.
    """
    dataPart, hashMark, commentText = line.partition('#')
    if hashMark:
        tail = hashMark + commentText
    else:
        # No comment on this line: only the line terminator (if any) is
        # carried over. Slicing at find('#') == -1 would instead chop the
        # last character of the data when the final line has no newline.
        dataPart = line.rstrip('\n')
        tail = line[len(dataPart):]

    items = dataPart.split()
    if not items:
        # Nothing to filter (blank or comment-only line); pass it through.
        return line

    # The first item is the target value (1, 0, -1, ...). It is scanned in
    # the loop below like every other item and skipped because it has no ':'.
    pieces = [items[0] + ' ']
    newFeatNum = 1  # number assigned to the next retained feature
    for item in items:
        featNum, colon, value = item.partition(':')
        if not colon or featNum not in selected:
            continue  # not a feature, or a feature that is not retained
        pieces.append(str(newFeatNum) + ':' + value + ' ')
        newFeatNum += 1

    pieces.append(tail)
    return ''.join(pieces)


def main():
    """Filter originalFeatFileName into outputFileName using selectList.

    Exits with status 1, without touching anything, if the output file
    already exists, so earlier results are never overwritten.
    """
    if os.path.isfile(outputFileName):
        print("output file exists, exiting")
        sys.exit(1)

    # Build the lookup once; set membership avoids rescanning the list for
    # every feature of every example.
    selected = frozenset(selectList)

    # The context manager closes (and therefore flushes) both files even if
    # a line fails to process.
    with open(originalFeatFileName, 'r') as featFile, open(outputFileName, 'w') as outF:
        for line in featFile:
            outF.write(filterFeatureLine(line, selected))


if __name__ == "__main__":
    main()