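#program that reads a feature file in the svm format and, for each example, removes every feature
#whose number is not in a given list of features to retain. the retained features are renumbered
#consecutively from 1 on each line; the target value and any trailing '#' comment are kept.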

import os
import sys

#user editable variables ------------------------
#svm-format feature file that the examples are read from
originalFeatFileName = 'sep12_fullFeat.txt'
#file that the filtered examples are written to
outputFileName = 'sep12_featSelected_accVarimp.txt'
#select list: the feature numbers to retain, given as strings. a feature number is the label that
#comes before the value in a number:value pair of the feature file.
#for example, the pair 2480:0.03 has a feature number of 2480.
#current contents: top100 singleimportance meandecreaseaccuracy features
selectList = [
	'166', '198', '358', '426', '392', '1888', '1922', '460', '1854', '2024',
	'2186', '2154', '1956', '1990', '230', '1172', '1240', '1820', '2122', '2218',
	'1206', '1002', '1250', '1104', '1036', '494', '102', '1138', '1216', '1786',
	'968', '134', '1070', '528', '1274', '1752', '562', '1080', '1342', '1194',
	'1989', '1308', '1175', '596', '1182', '1318', '1650', '1210', '197', '1114',
	'934', '1718', '1684', '1043', '1284', '1222', '1242', '1410', '2219', '357',
	'1547', '952', '493', '1315', '2023', '1046', '262', '2185', '1616', '527',
	'391', '2090', '1142', '866', '1376', '229', '1352', '1220', '459', '2121',
	'1176', '1174', '70', '630', '978', '2238', '1649', '629', '1254', '1207',
	'1262', '1445', '1228', '696', '1052', '1247', '1738', '595', '172', '1004',
]
#------------------------------------------------

#refuse to overwrite an existing output file: report it and stop with a non-zero exit status.
if os.path.isfile(outputFileName):
	print ("output file exists, exiting")
	#sys.exit is always available; the bare exit() helper is only added by the site module
	#and is missing when the interpreter runs without it (e.g. python -S).
	sys.exit(1)

def filterLine(line, keepFeats):
	"""Return one feature-file line reduced to the features named in keepFeats.

	line      -- one raw line of the feature file, with its line ending (if any).
	keepFeats -- container of feature numbers (as strings) to retain.

	The result is the target value (the first whitespace-separated item), then the
	retained features renumbered 1, 2, 3, ... in the order they appear on the line,
	each followed by a single space, then the original '#' comment. A line without
	a comment keeps its original line ending instead. A line holding no items at all
	(blank or comment-only) is returned unchanged.

	NOTE(review): the new numbering restarts on every line and counts only the
	features present on that line, so a retained feature gets the same new number
	on every example only if each line lists all retained features. confirm the
	input is dense before relying on the new numbers.
	"""
	hashPos = line.find('#')
	if hashPos == -1:
		#no comment: split off only the line ending, so a final line that lacks a
		#newline does not lose its last data character.
		dataLine = line.rstrip('\n')
		comment = line[len(dataLine):]
	else:
		dataLine = line[:hashPos] #the example line without the comment
		comment = line[hashPos:] #comment already has \n

	items = dataLine.split()
	if not items:
		return line #nothing to filter; avoids indexing an empty list

	outParts = [items[0]] #the target value. could be 1, 0, -1, etc
	fc = 1 #new feature count
	for item in items:
		featNum, colon, featVal = item.partition(':')
		if not colon:
			continue #not a feature
		if featNum not in keepFeats:
			continue #feature is not on the retain list
		outParts.append(str(fc) + ':' + featVal)
		fc += 1

	#every item, including the last, is followed by one space before the comment
	return ' '.join(outParts) + ' ' + comment


#built once so each membership test is a hash lookup rather than a scan of the list
keepFeats = frozenset(selectList)

#the with-statement closes both files even if a line raises part-way through.
#the input is opened first, so a missing input file does not leave an empty output file behind.
with open(originalFeatFileName, 'r') as featFile, open(outputFileName, 'w') as outF:
	for line in featFile:
		outF.write(filterLine(line, keepFeats))