# Program for taking a feature file in the SVM format and removing from each
# example every feature that is not in a given list of features to retain.
# The retained features are renumbered 1, 2, 3, ... in the order in which they
# appear on each line.
import os
import sys

# user editable variables ------------------------
originalFeatFileName = 'sep12_fullFeat.txt'
outputFileName = 'sep12_featSelected_accVarimp.txt'
# Select list: a list of feature numbers (the number label of each feature,
# i.e. the part before the value in the number:value pair as it appears in the
# feature file). For example, the pair 2480:0.03 has a feature number of 2480.
# Entries are strings because they are compared against text read from the file.
# top100 singleimportance meandecreaseaccuracy features
selectList = [
    '166', '198', '358', '426', '392', '1888', '1922', '460', '1854', '2024',
    '2186', '2154', '1956', '1990', '230', '1172', '1240', '1820', '2122', '2218',
    '1206', '1002', '1250', '1104', '1036', '494', '102', '1138', '1216', '1786',
    '968', '134', '1070', '528', '1274', '1752', '562', '1080', '1342', '1194',
    '1989', '1308', '1175', '596', '1182', '1318', '1650', '1210', '197', '1114',
    '934', '1718', '1684', '1043', '1284', '1222', '1242', '1410', '2219', '357',
    '1547', '952', '493', '1315', '2023', '1046', '262', '2185', '1616', '527',
    '391', '2090', '1142', '866', '1376', '229', '1352', '1220', '459', '2121',
    '1176', '1174', '70', '630', '978', '2238', '1649', '629', '1254', '1207',
    '1262', '1445', '1228', '696', '1052', '1247', '1738', '595', '172', '1004',
]
# ------------------------------------------------


def filterLine(line, selected):
    """Return one example line reduced to the selected features.

    line     -- one raw line of the feature file: a target value followed by
                whitespace-separated ``number:value`` pairs and an optional
                trailing ``# comment``.
    selected -- container of feature numbers (as strings) to keep.

    The result is the target value, then the kept values renumbered from 1 in
    the order they occur on the line, each followed by a single space, then the
    original comment (or the original line ending when there is no comment).
    Lines that carry no data at all (blank or comment-only) are returned
    unchanged instead of raising IndexError.
    """
    # partition() avoids the find() == -1 pitfall: slicing with -1 silently
    # moved the last character of an uncommented line into the "comment".
    data, hashMark, tail = line.partition('#')
    if hashMark:
        comment = hashMark + tail  # already carries the line's own '\n'
    else:
        comment = '\n' if line.endswith('\n') else ''

    items = data.split()
    if not items:
        return line  # nothing to filter on this line

    keptValues = []
    for item in items:
        featNum, colon, value = item.partition(':')
        # Items without ':' (such as the target value) are not features.
        if colon and featNum in selected:
            keptValues.append(value)

    # NOTE(review): the new feature number is the position among the features
    # kept on *this* line. If the input is sparse (examples omit some selected
    # features), the same new number can refer to different original features
    # on different lines -- confirm the input is dense.
    fields = [items[0]]  # the target value: could be 1, 0, -1, etc
    fields.extend('%d:%s' % (newNum, value)
                  for newNum, value in enumerate(keptValues, 1))
    return ' '.join(fields) + ' ' + comment


def main():
    """Filter originalFeatFileName into outputFileName using selectList.

    Exits with status 1, without touching anything, if the output file
    already exists.
    """
    if os.path.isfile(outputFileName):
        print("output file exists, exiting")
        sys.exit(1)

    selected = frozenset(selectList)  # built once; O(1) membership per feature
    with open(originalFeatFileName, 'r') as featFile, \
            open(outputFileName, 'w') as outF:
        for line in featFile:
            outF.write(filterLine(line, selected))


if __name__ == '__main__':
    main()