From 9f9278ef6e122aed609d1f6884b7075effbc0471 Mon Sep 17 00:00:00 2001 From: Sarvesh Prakash Nikumbh Date: Tue, 25 Apr 2017 11:45:42 +0200 Subject: [PATCH] Fixed: 1. ommittedFastaIds also prinds sequences indices; 2. comikl_main function correct handles the adjustment of testIndices --- comikl_main_with_weight_vector.m | 41 +++++++++++++++++++++----------- comikl_wrapper.m | 2 +- readFastaSequences.m | 10 ++++---- 3 files changed, 33 insertions(+), 20 deletions(-) diff --git a/comikl_main_with_weight_vector.m b/comikl_main_with_weight_vector.m index f285c90..e706cfb 100644 --- a/comikl_main_with_weight_vector.m +++ b/comikl_main_with_weight_vector.m @@ -33,26 +33,31 @@ % Read positives logMessages(debugMsgLocation,sprintf('Positives...'), debugLevel); -[nPos, posIndices, allSeqsRawPosFasta] = readFastaSequences(givenPosFastaFilename, chunkSizeInBps, givenNPos, outputFolder, debugLevel, debugMsgLocation); +[nPos, ommittedPosIndices, allSeqsRawPosFasta] = readFastaSequences(givenPosFastaFilename, chunkSizeInBps, givenNPos, outputFolder, debugLevel, debugMsgLocation); % nPos and nNeg be could smaller than equal to givenNPos and givenNNeg resp. due to ommitting any sequences % whose length is less than the chunkSizeInBps % % Read Negatives logMessages(debugMsgLocation,sprintf('Negatives...'), debugLevel); -[nNeg, negIndices, allSeqsRawNegFasta] = readFastaSequences(givenNegFastaFilename, chunkSizeInBps, givenNNeg, outputFolder, debugLevel, debugMsgLocation); +[nNeg, ommittedNegIndices, allSeqsRawNegFasta] = readFastaSequences(givenNegFastaFilename, chunkSizeInBps, givenNNeg, outputFolder, debugLevel, debugMsgLocation); % Combine positives + negatives nBags = nPos + nNeg; % Check if we need to update the testIndices if some sequences have been ommitted % if posIndices and negIndices are empty, then no update required -if ~isempty(posIndices) - testIndices(find(ismember(testIndices, posIndices))) = []; +if ~isempty(ommittedPosIndices) + %nOmmitted = length(find(ismember(testIndices, ommittedPosIndices))); + testIndices(find(ismember(testIndices, ommittedPosIndices))) = []; + %logMessages(debugMsgLocation,sprintf('%d positive sequences ommitted from test set.\n', nOmmitted), debugLevel); end -if ~isempty(negIndices) - testIndices(find(ismember(testIndices, nPos+negIndices))) = [];%offset by nPos +if ~isempty(ommittedNegIndices) + %nOmmitted = length(find(ismember(testIndices, ommittedNegIndices))); + testIndices(find(ismember(testIndices, nPos+ommittedNegIndices))) = [];%offset by nPos + %logMessages(debugMsgLocation,sprintf('%d negative sequences ommitted from test set.\n', nOmmitted), debugLevel); end -% Additionally, one needs to account for the reduction in the number of sequences affecting the indices +% Additionally, one needs to account for the reduction in the number of sequences affecting the indices. Remove indices > nBags +%testIndices = testIndices(testIndices <= nBags); for i=1:length(testIndices) - testIndices(i) = testIndices(i) - nnz(posIndices < testIndices(i)) - nnz(negIndices < testIndices(i)); + testIndices(i) = testIndices(i) - nnz(ommittedPosIndices < testIndices(i)) - nnz( (nPos+ommittedNegIndices) < testIndices(i)); end % Put them together allSeqsRawFasta.sequence = cell(1, nBags); @@ -64,8 +69,12 @@ allSeqsRawFasta.sequence{i+nPos} = allSeqsRawNegFasta.sequence{i};%offset by nPos end clear allSeqsRawNegFasta; -% Write test indices to disk for reproducibility -dlmwrite(strcat(outputFolder, '/testIndices.txt'), testIndices); +% Write the revised set test indices to disk for reproducibility + +if ~isempty(ommittedPosIndices) | ~isempty(ommittedNegIndices) + sortedTestIndices = sort(testIndices); + dlmwrite(strcat(outputFolder, '/testIndices_New.txt'), sortedTestIndices'); +end % generate labels % handle imbalance, multiply this to C for negatives @@ -112,7 +121,7 @@ nBagsForTest = nBags-idx1; logMessages(debugMsgLocation,sprintf('#Test: %d\n#Train: %d\n', nBagsForTest, nBagsForTrain), debugLevel); logMessages(debugMsgLocation,sprintf('Segmentation statistics:\n'), debugLevel); -logMessages(debugMsgLocation,sprintf(' Mean number of instancesa in bags: %.2f\t Median: %d\t Max.: %d\t Min.: %d\n', mean(thisInstances), median(thisInstances), max(thisInstances), min(thisInstances)), debugLevel); +logMessages(debugMsgLocation,sprintf(' Mean number of instances in bags: %.2f\t Median: %d\t Max.: %d\t Min.: %d\n', mean(thisInstances), median(thisInstances), max(thisInstances), min(thisInstances)), debugLevel); % Variable 'nBags' holds nBagsForTrain + nBagsForTest % % 2. Compute the instanceWide kernel that is used further @@ -250,13 +259,13 @@ % % Use the best param combinations to train using the whole trainiing set and predict the test set here % -logMessages(debugMsgLocation,sprintf('---Test---\n'), debugLevel); +logMessages(debugMsgLocation,sprintf('---Re-training and Test---\n'), debugLevel); % Re-train the model with all train instances together % trainIndicesInBags is 1:nBagsForTrain nClusters = bestParamComb.best_nClusters; conformalXformationParam = bestParamComb.best_sigma; -logMessages(debugMsgLocation,sprintf('nClusters: %d, sigma: %d\n', nClusters, conformalXformationParam), debugLevel); -logMessages(debugMsgLocation,sprintf('Re-training uisng complete trainning set examples...\n'), debugLevel); +logMessages(debugMsgLocation,sprintf('Best param-values:\nSVM-Cost:%.3f, nClusters: %d, sigma: %.3f\n', nClusters, conformalXformationParam), debugLevel); +logMessages(debugMsgLocation,sprintf('Re-training using complete trainning set examples...\n'), debugLevel); logMessages(debugMsgLocation,sprintf('Conformed Multi-instance kernel for all bags...\n'), debugLevel); [allSeqsConformedSetKernel, rawConformedSetKernel, allSeqsTransformationKernel, clusterCentres] = ... @@ -364,7 +373,11 @@ % logMessages(debugMsgLocation,sprintf('Bias value: %.4f\n', biasValue), debugLevel); yhat = yhatTemp + biasValue; + if(length(Youtertest) == length(yhat)) + logMessages(debugMsgLocation, sprintf('%d and %d: Dimensions of yhat and ytest match!\n', length(Youtertest), length(yhat)), debugLevel); + end dlmwrite(strcat(outputFolder,'/predictedLabelsWeightVector.txt'), yhat'); + dlmwrite(strcat(outputFolder,'/givenLabels.txt'), Youtertest'); logMessages(debugMsgLocation,sprintf('Predicted labels written to disk. Computing the auROC/auPRC, this may take some time...\n'), debugLevel); tic; [test_teAUROC, test_teAUPRC] = libsvm_plotroc(Youtertest', yhat', 'personal'); diff --git a/comikl_wrapper.m b/comikl_wrapper.m index a0e1159..fa393ca 100644 --- a/comikl_wrapper.m +++ b/comikl_wrapper.m @@ -357,7 +357,7 @@ case char('distal-proximal-cgi') fprintf(testFid, 'testIndices for this outer fold:\n'); fclose(testFid); if exist(testIndicesFilename, 'file') == 2 - dlmwrite(testIndicesFilename, testIndices','-append', 'roffset', 1); + dlmwrite(testIndicesFilename, sort(testIndices'),'-append', 'roffset', 1); logMessages(debugMsgLocation, sprintf('Test indices for this outer fold written to disk'), debugLevel); end diff --git a/readFastaSequences.m b/readFastaSequences.m index b4050f3..f35e056 100644 --- a/readFastaSequences.m +++ b/readFastaSequences.m @@ -1,4 +1,4 @@ -function [nSeqs, indices, passFasta] = readFastaSequences(txt, chunkSizeInBps, givenNSeqs, outputFolder, debugLevel, debugMsgLocation) +function [nSeqs, ommittedIndices, passFasta] = readFastaSequences(txt, chunkSizeInBps, givenNSeqs, outputFolder, debugLevel, debugMsgLocation) % FUNCTION FASTA=READFASTA(FILENAME) % @@ -25,7 +25,7 @@ fasta.legend = cell(1, givenNSeqs); fasta.sequence = cell(1, givenNSeqs); i=0; -indices = []; +ommittedIndices = []; fasta.title=[txt,', read: ',date]; % READS SEQUENCES AND THEIR LEGENDS fid=fopen(txt,'r'); @@ -44,8 +44,8 @@ if lengths(end) >= chunkSizeInBps fasta.sequence{i}=[fasta.sequence{i},upper(lineRead)]; else % we omit that sequence - indices = [indices i]; - fprintf(ofid, fasta.legend{i}); + ommittedIndices = [ommittedIndices i]; + fprintf(ofid, '%d\t%s', i, fasta.legend{i}); fprintf(ofid, '\n'); end end @@ -54,7 +54,7 @@ fclose(ofid); fclose(fid); ineligibleLengths = lengths(lengths < chunkSizeInBps); -ineligibleIndices = find(lengths <= chunkSizeInBps); +ineligibleIndices = find(lengths < chunkSizeInBps); nSeqs = givenNSeqs; passFasta = fasta; if size(ineligibleLengths, 2) > 0