# BMC Bioinformatics redo: testing to see if meanDecreaseGini-selected
# features can be recreated.
#
# Program for taking a feature file in the SVM format and removing features
# from each example that do not match a given list of features to retain.
import os
import sys

# user editable variables ------------------------
originalFeatFileName = 'ordered_sep6_fullFeat.txt'
outputFileName = 'ordered_sep6_featSelected.txt'
# select list: a list of feature numbers (the number label of each feature
# (before the value) in the number:value pair as it appears in the feature
# file). For example, the pair 2480:0.03 has a feature number of 2480.
selectList = [
    '984', '730', '1194', '950', '1037', '969', '944', '1320', '2238', '308',
    '2282', '1152', '1120', '1313', '1012', '1692', '1347', '1256', '1279', '1590',
    '1211', '764', '1624', '1245', '2070', '1330', '1658', '2098', '1517', '1048',
    '2130', '70', '2223', '38', '696', '1046', '1184', '166', '1483', '1381',
    '1082', '1415', '304', '1080', '832', '1150', '1444', '2250', '1116', '1218',
    '1286', '1262', '1252', '1296', '1478', '1449', '134', '2120', '1556', '1228',
    '798', '102', '2218', '334', '324', '1376', '1410', '1114', '1182', '1522',
    '1148', '1352', '2056', '1284', '1488', '866', '1318', '900', '1454', '1250',
    '260', '1420', '2088', '934', '1308', '1240', '1216', '1386', '1342', '292',
    '302', '1274', '1206', '1002', '1172', '968', '1104', '1070', '1036', '1138',
]
# ------------------------------------------------


def filter_line(line, keep):
    """Return one example line reduced to the features listed in *keep*.

    line -- one raw line of the feature file: a target value followed by
            whitespace-separated number:value pairs, optionally followed
            by a '#' comment and/or a line terminator.
    keep -- container of feature numbers (as strings) to retain.

    Retained features are renumbered consecutively from 1 in the order
    they appear. The trailing comment (or bare line terminator) is
    appended unchanged. Lines with no data tokens (blank or comment-only)
    are returned unchanged.
    """
    hash_pos = line.find('#')
    if hash_pos == -1:
        # No comment on this line: split off only the line terminator.
        # Slicing with the -1 from find() would cut the last character
        # of a final line that has no trailing newline.
        data_part = line.rstrip('\r\n')
        tail = line[len(data_part):]
    else:
        data_part = line[:hash_pos]
        tail = line[hash_pos:]  # the comment already carries its '\n'

    items = data_part.split()
    if not items:
        # Nothing to filter; pass blank/comment-only lines through as-is.
        return line

    # The first item before a space is the target value (1, 0, -1, etc).
    out_parts = [items[0]]
    new_num = 1  # new feature count
    for item in items:
        if ':' not in item:
            continue  # not a feature
        feat_num, _, value = item.partition(':')
        if feat_num not in keep:
            continue
        # Keep just the feature value from the pair, under its new number.
        out_parts.append(str(new_num) + ':' + value)
        new_num += 1

    # Every token, including the last, is followed by a single space.
    return ' '.join(out_parts) + ' ' + tail


def filter_feature_file(src_path, dst_path, select):
    """Write to *dst_path* a copy of *src_path* filtered by filter_line.

    select -- iterable of feature numbers (as strings) to retain.
    """
    keep = frozenset(select)  # built once; constant-time membership tests
    with open(src_path, 'r') as src:
        with open(dst_path, 'w') as dst:
            for line in src:
                dst.write(filter_line(line, keep))


def main():
    """Run the filter using the user editable variables above."""
    if os.path.isfile(outputFileName):
        print("output file exists, exiting")
        sys.exit(1)
    filter_feature_file(originalFeatFileName, outputFileName, selectList)


if __name__ == '__main__':
    main()