"""Filter a feature file in the svm format down to a chosen set of features.

Each example line has the form ``<target> <num>:<value> ... # comment``.
Only the features whose number appears in ``selectList`` are kept, and the
kept features are renumbered consecutively from 1 in the order they appear
on the line. The target value and any trailing ``#`` comment are copied
through unchanged.
"""

import os
import sys

# user editable variables ------------------------
originalFeatFileName = 'sep24_fullFeat.txt'
outputFileName = 'sep24_featSelected_accVarimp.txt'

# Select list: the feature numbers to retain. A feature number is the label
# before the colon in a number:value pair as it appears in the feature file;
# for example, the pair 2480:0.03 has a feature number of 2480.
# Entries are strings because they are compared against the raw text.
# Current contents: top100 singleimportance meandecreaseaccuracy features.
selectList = [
    '1919', '558', '524', '1851', '1885', '386', '167', '592', '490', '2219',
    '1817', '456', '2051', '2186', '2187', '177', '626', '2218', '1783', '420',
    '1953', '134', '171', '2229', '166', '360', '2024', '2224', '2206', '394',
    '172', '1949', '2154', '487', '168', '358', '204', '2238', '232', '2197',
    '2250', '2017', '200', '198', '453', '2026', '135', '1983', '199', '2025',
    '2196', '2092', '2223', '136', '2220', '104', '1694', '1344', '2153', '183',
    '521', '2252', '164', '1448', '1881', '1992', '2190', '40', '346', '38',
    '793', '1746', '364', '359', '186', '1915', '2010', '264', '2044', '231',
    '102', '70', '2023', '2235', '111', '1222', '2126', '1156', '1568', '1916',
    '2301', '2237', '1363', '1643', '165', '213', '76', '1958', '1814', '691',
]
# ------------------------------------------------


def filter_line(line, selected):
    """Return *line* with only the features named in *selected* retained.

    Args:
        line: One raw line from the feature file, including its line ending
            if it has one.
        selected: Container of feature-number strings to keep.

    Returns:
        The rewritten line: the target value, then the retained features
        renumbered from 1, each followed by a single space, then the
        original ``#`` comment (or the original line ending when there is no
        comment). A line with no data items (blank or comment-only) is
        returned unchanged.
    """
    hash_pos = line.find('#')
    if hash_pos == -1:
        # No comment: the tail is just the line ending, which may be absent
        # on the last line of the file. Slicing with the -1 from find()
        # would chop a real character off such a line.
        data = line.rstrip('\r\n')
        tail = line[len(data):]
    else:
        data, tail = line[:hash_pos], line[hash_pos:]

    items = data.split()
    if not items:
        # Nothing to select from; keep blank and comment-only lines as-is
        # instead of failing on the missing target value.
        return line

    # The first item before a space is the target value (1, 0, -1, etc).
    parts = [items[0]]
    for item in items:
        feat_num, colon, value = item.partition(':')
        if colon and feat_num in selected:
            # len(parts) is the next new feature number, since parts[0]
            # is the target and numbering starts at 1.
            parts.append('{0}:{1}'.format(len(parts), value))

    # The tail already carries the line ending when the source line had one.
    return ' '.join(parts) + ' ' + tail


def select_features(in_path, out_path, selected):
    """Copy *in_path* to *out_path*, filtering every line with filter_line.

    Args:
        in_path: Path of the original feature file to read.
        out_path: Path of the filtered feature file to write.
        selected: Iterable of feature-number strings to keep.
    """
    # A set makes the per-feature membership test constant time.
    wanted = frozenset(selected)
    with open(in_path, 'r') as src, open(out_path, 'w') as dst:
        for line in src:
            dst.write(filter_line(line, wanted))


def main():
    """Run the filter using the user editable variables defined above."""
    # Refuse to overwrite an existing output file.
    if os.path.isfile(outputFileName):
        print("output file exists, exiting")
        sys.exit(1)
    select_features(originalFeatFileName, outputFileName, selectList)


if __name__ == '__main__':
    main()