# Program for taking a feature file in the SVM format and removing, from each
# example, every feature that is not in a given list of features to retain.
# The retained features are renumbered consecutively from 1 on each line.
import os
import sys

# user editable variables ------------------------
originalFeatFileName = 'sep6_fullFeat.txt'
outputFileName = 'sep6_featSelected_accVarimp.txt'

# selectList: feature numbers to keep. A feature number is the label before
# the value in a number:value pair as it appears in the feature file.
# For example, the pair 2480:0.03 has a feature number of 2480.
# Entries are strings because they are compared against raw text tokens.
# top100 singleimportance meandecreaseaccuracy features
selectList = [
    '1138', '302', '1420', '1172', '1002', '1070', '968', '1036', '1104', '934',
    '1386', '1318', '1454', '292', '1216', '1308', '1284', '1148', '1182', '1274',
    '1488', '1206', '1522', '1250', '1342', '1381', '1211', '166', '1240', '1410',
    '900', '2088', '334', '134', '2218', '1352', '1658', '2130', '1218', '1279',
    '2056', '1114', '1228', '1556', '260', '1483', '1415', '102', '1262', '1245',
    '1150', '1116', '1376', '866', '1449', '1330', '1347', '324', '2162', '1286',
    '1082', '1444', '1252', '1222', '1478', '1296', '304', '1012', '1290', '1046',
    '1188', '1888', '1107', '1048', '1174', '2098', '1922', '1288', '2120', '1624',
    '944', '1955', '1324', '798', '2066', '1038', '1313', '1517', '2070', '1653',
    '1956', '2238', '1084', '1160', '1432', '261', '2121', '1140', '1760', '2250',
]
# ------------------------------------------------


def filterLine(line, keep):
    """Return `line` with only the features whose number is in `keep`.

    The line is split at the first '#': everything from the '#' onward is
    treated as a trailing comment and copied to the output untouched. In the
    part before it, the first whitespace-separated token is emitted as the
    target value, and every `number:value` token whose number is in `keep`
    is emitted as `k:value`, where k counts the kept features from 1.

    Args:
        line: one raw line of the feature file, with or without its newline.
        keep: container of feature-number strings to retain.

    Returns:
        The rewritten line. Each emitted token is followed by a single space,
        then the original comment (or the original line ending when there is
        no comment). Lines with no data tokens (blank or comment-only) are
        returned unchanged.
    """
    hashPos = line.find('#')
    if hashPos == -1:
        # No comment: only the line terminator is carried over. Slicing with
        # the -1 from find() would eat the last data character whenever the
        # line has no trailing newline.
        dataPart = line.rstrip('\r\n')
        tail = line[len(dataPart):]
    else:
        dataPart = line[:hashPos]
        tail = line[hashPos:]

    items = dataPart.split()
    if not items:
        # Nothing to filter; pass the line through instead of crashing.
        return line

    # The first token is the target value (could be 1, 0, -1, etc).
    outParts = [items[0]]
    for item in items:
        featNum, sep, value = item.partition(':')
        # Tokens without ':' are not features; unknown numbers are dropped.
        if sep and featNum in keep:
            # len(outParts) is the next new feature number: 1, 2, 3, ...
            outParts.append('%d:%s' % (len(outParts), value))

    return ' '.join(outParts) + ' ' + tail


def main():
    """Filter originalFeatFileName into outputFileName using selectList.

    Exits with status 1, without touching anything, if the output file
    already exists.
    """
    if os.path.isfile(outputFileName):
        print("output file exists, exiting")
        sys.exit(1)

    # Build the lookup set once; membership is then O(1) per feature.
    keep = frozenset(selectList)

    # The input is opened first so a missing input never creates an empty
    # output file; both handles are closed (and flushed) on exit.
    with open(originalFeatFileName, 'r') as featFile:
        with open(outputFileName, 'w') as outF:
            for line in featFile:
                outF.write(filterLine(line, keep))


if __name__ == '__main__':
    main()