From c9f81923dab012305f2464566feace46e3d35854 Mon Sep 17 00:00:00 2001 From: Sarvesh Prakash Nikumbh Date: Wed, 19 Jul 2017 15:45:51 +0200 Subject: [PATCH] major edit: name changed to comik. Small edits: various files kernel dimension stored in a separate variable name. --- README.md | 32 +- analyze_wvector.m | 2 +- barplot-heatmap-visualization.R | 52 +++ ...ector.m => comik_main_with_weight_vector.m | 22 +- comikl_wrapper.m => comik_wrapper.m | 17 +- comikl_main.m | 410 ------------------ computeConformedMultiInstanceKernel.m | 15 +- getExpansionPoints.m | 4 +- getInstanceWeights.m | 18 +- getODHFeatureVecInstances.m | 7 +- mkl.py | 5 +- plot_heatmap.m | 26 +- 12 files changed, 149 insertions(+), 461 deletions(-) create mode 100644 barplot-heatmap-visualization.R rename comikl_main_with_weight_vector.m => comik_main_with_weight_vector.m (95%) rename comikl_wrapper.m => comik_wrapper.m (91%) delete mode 100644 comikl_main.m diff --git a/README.md b/README.md index c0b0f99..aaa60b7 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,22 @@ -# CoMIKL : Conformal Multi-Instance Kernel Learning +# CoMIK : Conformal Multi-Instance Kernels Manuscript authors: Sarvesh Nikumbh, Peter Ebert, Nico Pfeifer - -Manuscript under preparation. It can be accessed at the department's SVN repository [link] +To appear in Proceedings of WABI 2017. +Preprint at bioRxiv: ## Requirements: -- MATLAB 9.0.0.341360 (R2016a) [We developed _CoMIKL_ using this version of MATLAB. Compatibility with earlier versions is to be checked.] +- MATLAB 9.0.0.341360 (R2016a) [We developed _comik_ using this version of MATLAB. Compatibility with earlier versions is to be checked.] - Python 2.7 or higher - SHOGUN Release version 3.2.0 -This repository contains Matlab code for the project *CoMIKL*. The various ".m" files define corresponding Matlab functions. +This repository contains Matlab code for the project *CoMIK*. The various ".m" files define corresponding Matlab functions. We use MKL implementation from Shogun's modular interface for Python. The Python script mkl.py handles solving of the MKL problem. ## Installation: ```MATLAB -git clone https://github.molgen.mpg.de/snikumbh/comikl -cd comikl +git clone https://github.molgen.mpg.de/snikumbh/comik +cd comik chmod +x mkl.py export PATH=".:$PATH" ``` @@ -25,14 +25,14 @@ export PATH=".:$PATH" Example function call from inside Matlab: For simulated dataset 1 provided in the folder `sample_data` ```Matlab -comikl_wrapper('sample_data/simulated_dataset1/pos.fasta', 'sample_data/simulated_dataset1/neg.fa', 600, 600, [501:600 1101:1200], 'comikl_run_simulated_dataset1', [2], 10, 10, [2 5 7], 10.^[1:1:2], 10.^[-3:1:3], 2.0, 10, 5, 'No', 'Yes', 2, 'runLog.txt'); +comik_wrapper('sample_data/simulated_dataset1/pos.fasta', 'sample_data/simulated_dataset1/neg.fa', 600, 600, [501:600 1101:1200], 'comik_run_simulated_dataset1', [2], 10, 10, [2 5 7], 10.^[1:1:2], 10.^[-3:1:3], 2.0, 10, 5, 'No', 'Yes', 2, 'runLog.txt'); ``` -_CoMIKL_ accepts two FASTA files as input -- the first FASTA file containing sequences in the positive class followed by a second FASTA file containing the sequences in the negative class. Other params are explained below. +_comik_ accepts two FASTA files as input -- the first FASTA file containing sequences in the positive class followed by a second FASTA file containing the sequences in the negative class. Other params are explained below. Further details: -The CoMIKL wrapper function takes the following arguments as input +The comik wrapper function takes the following arguments as input - positive FASTA filename [type: str] - negative FASTA filename [type: str] @@ -45,7 +45,7 @@ The above are required while the following have default values Param name | type | default value | Additional comments -----------|-------|---------------|--------------------- - oligomer-lengths-as-vector | int | [2] | Required for the _ODH_ representation. Recommended values: 2 or 3 (suffices). Passing a vector [2 3] will run _CoMIKL_ first with oligomer-length 2 followed by an independent second run with oligomer-length 3. Further, see **Note 2** below + oligomer-lengths-as-vector | int | [2] | Required for the _ODH_ representation. Recommended values: 2 or 3 (suffices). Passing a vector [2 3] will run _comik_ first with oligomer-length 2 followed by an independent second run with oligomer-length 3. Further, see **Note 2** below maximum-distance | int | 50 | Required for the _ODH_ representation. Typically, a maximum distance of 100 basepairs suffices even if the segment-size is larger segment-size | int | 100 | See **Note 1** below number-of-clusters | vector of ints | [2 5] | Recommended maximum number of clusters: 7 @@ -55,14 +55,14 @@ Param name | type | default value | Additional comments number-of-inner-folds | int | 10 | number-of-outer-folds | int | 5 | whetherToPlotHeatmap | str | 'No' | Possible values: 'Yes' or 'No'; Set 'Yes' only if all sequences are of the same length. - debugLevel | int | 2 | Possible values: [0, 1, 2]. Value 0 makes _CoMIKL_ completely silent, and 2 makes it maximally verbose. Value 1 may be used in the future. + debugLevel | int | 2 | Possible values: [0, 1, 2]. Value 0 makes _comik_ completely silent, and 2 makes it maximally verbose. Value 1 may be used in the future. debugMsgLocation | int/str | 1 | Value 1 denotes the command prompt, else specify a filename, say 'runLog.txt'. This file is written for each outer fold separately. computationVersion | str | 'Looping'| Possible values: 'Looping' or 'AccumArray'. 'Looping' is faster and preferred/recommended for large datasets when 'AccumArray' can be memory intensive. -The `comikl_wrapper` function handles creation of and running outer cross-validation folds (as part of nested cross-validation). The supplied indices of the test sequences, or `test_indices` are then used to note the proportion of positives and negatives from the whole set that is to be treated as unseen test examples. The given set of sequences are then shuffled before splitting them into training and unseen test examples as per the specified proportions. The indices of the samples treated as unseen test examples is also written to disk per outer fold (filename: testIndices.txt). +The `comik_wrapper` function handles creation of and running outer cross-validation folds (as part of nested cross-validation). The supplied indices of the test sequences, or `test_indices` are then used to note the proportion of positives and negatives from the whole set that is to be treated as unseen test examples. The given set of sequences are then shuffled before splitting them into training and unseen test examples as per the specified proportions. The indices of the samples treated as unseen test examples is also written to disk per outer fold (filename: testIndices.txt). -**Note 1**: In case you are interested in performing a quick, exploratory run using _CoMIKL_ on your data, kindly note that depending on the number of sequences in the collection and their lengths, if there are many very long sequences, a small segment-size may lead to very high number of segments in total thereby increasing the computation time which may be prohibitive for this initial run. Hence, for such an initial run, the following values are recommended: +**Note 1**: In case you are interested in performing a quick, exploratory run using _comik_ on your data, kindly note that depending on the number of sequences in the collection and their lengths, if there are many very long sequences, a small segment-size may lead to very high number of segments in total thereby increasing the computation time which may be prohibitive for this initial run. Hence, for such an initial run, the following values are recommended: oligomer-length: [2] maximum-distance: min(segment-size, 50) @@ -71,10 +71,10 @@ segment-size: 100 or 200 **Note 2**: Kindly note that oligomer-lengths of 3 or larger than 3, depending on the number and length of the sequences, and segment-size used, can lead to very high-dimensional vectors which can be memory-intensive. -**Note 3**: Presently, _CoMIKL_ uses MATLAB parfor-loop to execute the outer cross-validation folds in parallel. +**Note 3**: Presently, _comik_ uses MATLAB parfor-loop to execute the outer cross-validation folds in parallel. During the run, -* _CoMIKL_ omits the sequences whose lengths are shorter than the segment-size specified from the run. It reports the number of sequences that got ommitted, their FASTA-Ids in a separate file named `ommittedFastaIds.txt` per outer fold separately. +* _comik_ omits the sequences whose lengths are shorter than the segment-size specified from the run. It reports the number of sequences that got ommitted, their FASTA-Ids in a separate file named `ommittedFastaIds.txt` per outer fold separately. * the following files are written to the disk per outer fold at any intermediate stage of the pipeline. Most of these are used by the pipeline itself in its subsequent stages. - Run summary file: The resultString is also written to the summary file which is characterized by the segment-size and oligomer-length. The summary file is typically named: 'runSummary_segment-sizeX_oligoLenY.txt' where X and Y are as set for the pipeline run. diff --git a/analyze_wvector.m b/analyze_wvector.m index 8d7aa61..f9b9000 100644 --- a/analyze_wvector.m +++ b/analyze_wvector.m @@ -310,7 +310,7 @@ %set(new_ax, 'Ydir', 'reverse'); pdf_fname = strcat(folderName, '/SeqLogos_MinMax_at_distances_rank', num2str(rankId), '_oligoLen', num2str(oligoLen)); - print(pdf_fname,'-dpdf'); + print(pdf_fname,'-dpdf', '-r900', '-bestfit'); %print(pdf_fname, '-depsc'); pdfcrop_lines = [pdfcrop_lines; sprintf('pdfcrop %s\n', pdf_fname)]; grid off; diff --git a/barplot-heatmap-visualization.R b/barplot-heatmap-visualization.R new file mode 100644 index 0000000..6c2f31b --- /dev/null +++ b/barplot-heatmap-visualization.R @@ -0,0 +1,52 @@ + + +#rscaled <- scale(r, center = FALSE, scale = colSums(r,na.rm=T)) +#may not need re-scaling +pdf('samarth_viz_in_r.pdf', width=10.0, height=6.0) +flds <- count.fields('samarth_test1.csv',sep=',') +s <- read.csv('samarth_test1.csv', fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1))) +r <- s[1:25,] +r_ones <- r +r_ones[!is.na(r)] <- 1 +#ncolors <- max(r[complete.cases(r),]) +#or +ncolors <- max(flds) +#color_palette <- rainbow(ncolors) +color_palette <- heat.colors(ncolors) + +jet.colors <- + colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", + "#7FFF7F", "yellow", "#FF7F00"))#, "red", "#7F0000")) +#color_palette <- jet.colors(ncolors) + +ypos <- apply(r, 2, cumsum) +ypos <- ypos - r/2 + +#xpos <- barplot(t(r_ones), col=rainbow(ncolors), axes=F, horiz=T) +xpos <- barplot(t(r_ones), col=c("white"), axes=F, horiz=T, axisnames=F) +cp <- 0 +for (i in 1:ncol(r_ones)){ + temp <- color_palette[r[seq(1,nrow(r),by=1), i]] + cp <- cbind(cp, temp) + #barplot(t(r_ones[]), col=color_palette[], add=T, axes=F) +} +new_cp <- cp[,-1] + +for (i in 1:nrow(r_ones)){ + #xx = r_ones + #xx[,-i] <- NA + #colnames(xx)[-i] <- NA + barplot(t(r_ones[i,]), col=new_cp[i,], horiz=T, add=T, space=(i-1)*1.2, axes=F, xlab="", axisnames=F) +} + +axis(1,at=0.5+seq(0,ncolors-1,1), labels=seq(1,ncolors,1)) +#Additional axis showing the numcleotide ranges +#axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1)) +legend("topright", legend = seq(1,ncolors), fill = color_palette, title="Segment ranks", ncol = 2, x.intersp = 0.1, bty="n", cex=0.8) +#pie(rep(1,ncolors), col=color_palette, labels=paste0("Rank ", seq(1,ncolors)), add=T) + +dev.off() +#text(rep(xpos,5), as.vector(t(ypos)), labels=t(r)) # ?? + + + diff --git a/comikl_main_with_weight_vector.m b/comik_main_with_weight_vector.m similarity index 95% rename from comikl_main_with_weight_vector.m rename to comik_main_with_weight_vector.m index e706cfb..a1cd369 100644 --- a/comikl_main_with_weight_vector.m +++ b/comik_main_with_weight_vector.m @@ -1,4 +1,4 @@ -function [allSeqsAsBags, allSeqsConformedSetKernel, subkernelWeights, thetaVals, instanceWeightsInEachBag, resultString, bestParamComb, test_teAUROC, test_teAUPRC, predictions] = comikl_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, givenNPos, givenNNeg, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector) +function [allSeqsAsBags, allSeqsConformedSetKernel, subkernelWeights, thetaVals, instanceWeightsInEachBag, resultString, bestParamComb, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, givenNPos, givenNNeg, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector) % Author: snikumbh % @@ -93,11 +93,11 @@ thisInstances = []; for k=1:nBags - if oligoLen == 3 + %if oligoLen == 3 if rem(k,100) == 0 logMessages(debugMsgLocation,sprintf('%d--', k), debugLevel); end - end + %end if any(k == testIndices) % do-nothing now, we collect them later % Collection of test sequences can take place later, @@ -328,7 +328,7 @@ logMessages(debugMsgLocation, sprintf('---Prediction using weight vector---\n'), debugLevel); % 6.1 Use the subkernel weights and get transformed instances among the test examples, may be use applyTransformation.m % instanceWeights for instances in training bags - [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel); + [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel, 'samarth_train1.csv'); % Get the transformations for test examples, and the diagonal elements which will be used for normalization further % Should one proceed block-wise at this juncture if the number of test examples is extremely large as is typically the case? % @@ -338,7 +338,7 @@ logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel); % % Getting the instance weightings for the test examples - [thetaVals, instanceWeightsInEachBag_Test] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel_Test); + [thetaVals, instanceWeightsInEachBag_Test] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel_Test, 'samarth_test1.csv'); % At this point, all (train + test examples) instance weights have been obtained. % % 6.2 Use function obtainWeightVector and procure the weight vectors and the bias value @@ -387,6 +387,16 @@ end % % +% Write the instanceWeightInEachBag to file +%dlmwrite('samarth_test.csv', instanceWeightsInEachBag, '-append'); +%disp(instanceWeightsInEachBag{1}); +%disp(size(instanceWeightsInEachBag)); +%disp(size(instanceWeightsInEachBag{1})); +%disp(size(instanceWeightsInEachBag{2})); +%for i=1:size(instanceWeightsInEachBag,2) +% dlmwrite('samarth_test.csv', cell2mat(instanceWeightsInEachBag{i})', '-append'); +%end + if strcmp(whetherToPlotHeatmap, 'Yes') logMessages(debugMsgLocation,sprintf('Plotting heatmaps...'), debugLevel); if(predictUsingWeightVector == 1) @@ -397,7 +407,7 @@ % heatmapFname = strcat(outputFolder,'/Heatmap-InstanceWeights', '_oligoLen', num2str(oligoLen), '_segmentSize', num2str(chunkSizeInBps) , '_maxDist', num2str(maxDist), '_nClusters', num2str(bestParamComb.best_nClusters), '_svmC', num2str(bestParamComb.best_C) ,'_sigma', num2str(bestParamComb.best_sigma), '.pdf'); % - p = plot_heatmap(normInstanceWeightsInEachBag, heatmapFname, 1, 2); + p = plot_heatmap(normInstanceWeightsInEachBag, heatmapFname, 0, 1); logMessages(debugMsgLocation,sprintf('done!\n'), debugLevel); % end diff --git a/comikl_wrapper.m b/comik_wrapper.m similarity index 91% rename from comikl_wrapper.m rename to comik_wrapper.m index fa393ca..5035eff 100644 --- a/comikl_wrapper.m +++ b/comik_wrapper.m @@ -1,8 +1,8 @@ -function [] = comikl_wrapper(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, testIndices, outputFolder, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, whetherToVisualizeWVector, debugLevel, debugMsgLocation, computationVersion) +function [] = comik_wrapper(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, testIndices, outputFolder, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, whetherToVisualizeWVector, debugLevel, debugMsgLocation, computationVersion) % COMIKL_WRAPPER % Usage: -% comikl_wrapper(givenFastaFilename, nPosSequences, nNegSequences, ... +% comik_wrapper(givenFastaFilename, nPosSequences, nNegSequences, ... % outputFolder, oligoLen, maxDist, chunkSizeInBps, nClusterVals, ... % sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, ... % computationVersion, whetherToVisualizeWVector) @@ -306,8 +306,8 @@ case char('distal-proximal-cgi') posTestIndices = intersect(testIndices, posIndices); negTestIndices = setdiff(testIndices, posIndices) - nPosSequences; -%thisTestIndices{1} = testIndices; -for i=1:nOuterFolds +thisTestIndices{1} = testIndices; +for i=2:nOuterFolds thisPosTestIndices = circshift(posIndices, [0, (i-1)*length(posTestIndices)]); thisNegTestIndices = circshift(negIndices, [0, (i-1)*length(negTestIndices)]) + length(posIndices); thisTestIndices{i} = [thisPosTestIndices(end-(length(posTestIndices))+1:end) thisNegTestIndices(end-(length(negTestIndices))+1:end)]; @@ -327,7 +327,7 @@ case char('distal-proximal-cgi') end parfor (i=1:nOuterFolds, nOuterFolds) thisFoldDirectory = strcat(outputFolder, '/outer_fold_', num2str(i)); - returnStatusVal = perform_one_outer_fold_for_comikl(i, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, ... + returnStatusVal = perform_one_outer_fold_for_comik(i, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, ... maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, ... thisTestIndices{i}, debugLevel, thisFoldDirectory, whetherToPlotHeatmap, ... computationVersion, whetherToVisualizeWVector, debugMsgLocation); @@ -336,9 +336,9 @@ case char('distal-proximal-cgi') logMessages(1, sprintf('All outer folds completed!\n'), debugLevel); -end % comikl_wrapper function ends +end % comik_wrapper function ends -function statusVal = perform_one_outer_fold_for_comikl(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation) +function statusVal = perform_one_outer_fold_for_comik(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation) % Runs one outer fold for CoMIKL % Make the outputFolder for this outer fold @@ -414,11 +414,12 @@ case char('distal-proximal-cgi') fclose(fid); % % - [allSeqsAsBags, allSeqsConformedSetKernel, kernelweights, thetaVals, instanceWeightsInEachBag, resultString, bestVals, test_teAUROC, test_teAUPRC, predictions] = comikl_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen(l), maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector); + [allSeqsAsBags, allSeqsConformedSetKernel, kernelweights, thetaVals, instanceWeightsInEachBag, resultString, bestVals, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen(l), maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector); fid=fopen(runSummaryFilename, 'a'); fprintf(fid, '\n\n'); fprintf(fid, '=====Validation results - SAMARTH=====\n'); fprintf(fid, 'OligoLen: %d, maxDist: %d, segment size: %d\n', oligoLen(l), maxDist, chunkSizeInBps); + fprintf(fid, 'Validation teAUPRC: %.4f\n', test_teAUPRC); fprintf(fid, 'Validation teAUROC: %.4f\n', test_teAUROC); fprintf(fid, 'Best C: %.3f\n', bestVals.best_C); fprintf(fid, 'Best sigma: %.3f\n', bestVals.best_sigma); diff --git a/comikl_main.m b/comikl_main.m deleted file mode 100644 index c8b6f17..0000000 --- a/comikl_main.m +++ /dev/null @@ -1,410 +0,0 @@ -function [allSeqsAsBagsNonShifted, allSeqsAsBagsShifted, allSeqsNonShiftedConformedSetKernel, allSeqsShiftedConformedSetKernel, subkernelWeights, thetaVals, instanceWeightsInEachNonShiftedBag, instanceWeightsInEachShiftedBag, resultString, bestParamComb, test_teAUROC, predictions] = comikl_main(givenFastaFilename, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debug, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector) - -% Author: snikumbh -% -% Sample cmd: -% [a, b, c, d, e] = nips_short_paper('sample1_5C.fasta', 3, 100, 300, 10, 0.5, 1.0, 2.0, 0) -% -% this file is the main function for the project 'nips_short_paper' -% 1. A FASTA file has to be given with a small variation -- the first line -% of the FASTA file gives #positves|#negatives, then follow the FASTA -% file as usual. -% 2. The variable allSeqsAsBagsNonShifted_Test/ allSeqsAsBagsShifted_Test hold the test seqs -% for which we need not compute the kernel value. These are the unseen test examples -% - -%RandStream('mt19937ar','seed',1); - -% Handle arguments and values/defaults -predictUsingWeightVector = 0; -if nargin < 15 - computationVersion = 'Looping'; -end -if nargin < 16 - whetherToVisualizeWVector = 'Yes'; - computationVersion = 'Looping'; -end - - -[nPos, nNeg, allSeqsRawFasta] = readFastaSequences(givenFastaFilename, chunkSizeInBps); -nBags = nPos + nNeg; -% generate labels -% handle imbalance, multiply this to C for negatives -Y = [ones(1, nPos) -ones(1, nNeg)]; -fprintf('#Bags: %d\n', nBags); - -% default for 'acgt' -numDist = maxDist + 1; -alphabet = 'acgt'; -fvLength = (4^oligoLen)^2 * numDist; -chunkSizeInPercentage = 0; %default -% non-shifted chunks -tic; -%chunkingStart = 'no-shift'; -fprintf('Non-shifted bags...'); -idx1 = 0; idx2 = 0; -for k=1:nBags - if debug - fprintf('%d--', k); - end - %fprintf('Bag-%d',k); - %if any(k == testIndices) - % idx1 = idx1 + 1; - % %% Non-shifted - % allSeqsAsBagsNonShifted_Test{idx1} = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ... - % oligoLen, maxDist, chunkSizeInPercentage, ... - % chunkSizeInBps, 'no-shift', alphabet, debug); - % %% Shifted - % allSeqsAsBagsShifted_Test{idx1} = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ... - % oligoLen, maxDist, chunkSizeInPercentage, ... - % chunkSizeInBps, 'shift', alphabet, debug); - %else - idx2 = idx2 + 1; - %% Non-shifted - allSeqsAsBagsNonShifted{idx2} = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ... - oligoLen, maxDist, chunkSizeInPercentage, ... - chunkSizeInBps, 'no-shift', alphabet, debug); - %% Shifted - allSeqsAsBagsShifted{idx2} = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ... - oligoLen, maxDist, chunkSizeInPercentage, ... - chunkSizeInBps, 'shift', alphabet, debug); - %end -end -fprintf('done in %.3f seconds\n', toc); -%fprintf('#Test: %d\n#Train: %d\n', idx1, idx2); -%nBagsForTrain = idx2; -%nBagsForTest = idx1; -% nBags holds nBagsForTrain + nBagsForTest - -% shifted chunks -%chunkingStart = 'shift'; -%tic; -%fprintf('Shifted bags...'); -%idx1 = 0; idx2 = 0; -%for k=1:nBags -% if debug -% fprintf('%d--', k); -% end -% if any(k == testIndices) -% idx = idx1 + 1; -% allSeqsAsBagsShifted_Test{idx1} = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ... -% oligoLen, maxDist, chunkSizeInPercentage, ... -% chunkSizeInBps, chunkingStart, alphabet, debug); -% else -% idx2 = idx2 + 1; -% allSeqsAsBagsShifted{idx2} = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ... -% oligoLen, maxDist, chunkSizeInPercentage, ... -% chunkSizeInBps, chunkingStart, alphabet, debug); -% end -%end -%fprintf('done in %.3f seconds\n', toc); -% -% all bags are now ready, perform a sanity check -% don't leave any bag without an instance, especially -% the shifted case -fprintf('Performing sanity check for the shifted bags...'); -for k=1:nBags - if size(allSeqsAsBagsShifted{k}, 2) == 0 - fprintf('\n**Replacing for bag %d**', k); - allSeqsAsBagsShifted{k} = allSeqsAsBagsNonShifted{k}; - end -end -%for k=1:nBagsForTest -% if size(allSeqsAsBagsShifted_Test{k}, 2) == 0 -% fprintf('\n**Replacing for bag %d**', k); -% allSeqsAsBagsShifted_Test{k} = allSeqsAsBagsNonShifted_Test{k}; -% end -%end -fprintf('\n...done\n'); -% -% -% 2. Compute the standard set kernel for non-shifted bags -% -- the instanceWide kernel is used further -% -fprintf('Standard set kernel for non-shifted bags...\n'); -direct = 0; % for direct computation from bags -sparseComputation = 1; -[instanceStartsNonShifted, instanceEndsNonShifted, instanceWideNonShiftedKernel, allSeqsNonShiftedStdSetKernel] = computeStdMultiInstanceKernel(allSeqsAsBagsNonShifted, direct, sparseComputation); -fprintf('done in %.3f seconds\n', toc); -% -% 3. Compute the standard set kernel for shifted bags -% -- the instanceWide kernel is used further -% -fprintf('Standard set kernel for shifted bags...\n'); -[instanceStartsShifted, instanceEndsShifted, instanceWideShiftedKernel, allSeqsShiftedStdSetKernel] = computeStdMultiInstanceKernel(allSeqsAsBagsShifted, direct, sparseComputation); -fprintf('done in %.3f seconds\n', toc); -% -% Further are functions that will be recomputed per CROSS-VALIDATION iteration -% Cross-validation loop is also handled simultaneously -% -% 4. Computing the conformed multi-instance kernel is done via multiple kernel -% learning, for either shifted or non-shifted bags -% -- conformalXformationParam is the Gaussian bandwidth (sigma) we wish to use -% -resultString = []; -bestParamComb.best_C = 0.0; -bestParamComb.best_teAUROC = 0.0; -bestParamComb.best_nClusters = 0; -bestParamComb.best_sigma = 0.0; -fid=fopen(runSummaryFilename, 'a'); -%fid=fopen('toydataset2-results/result_string.txt', 'w'); -%fid=fopen('new_simulated_dataset1/result_string.txt', 'w'); -Youtertest = Y(testIndices); -%%kirmes-gen: 1:nBags is 1:1200 -%% thus trainIndices will have [1:500 601:1100] -trainIndices = setdiff([1:nBags], testIndices); -YwholeTrain = Y(trainIndices); -nPosTr = size(find(YwholeTrain > 0),2); -nNegTr = size(find(YwholeTrain < 0),2); -imbalance = nPosTr/nNegTr; - -for nc=1:size(nClusterVals,2) - for sigmaItr=1:size(sigmaVals,2) - nClusters = nClusterVals(1,nc); - conformalXformationParam = sigmaVals(1, sigmaItr); - fprintf('Conformed Multi-instance kernel for non-shifted bags...\n'); - % send only YwholeTrain and training set of bags - foldIdForIndices = cvpartition(YwholeTrain, 'Kfold', nFolds); - %crossvalind('Kfold', trainIndices, nFolds); - %[trSplit, teSplit] = create_cvsplit(YwholeTrain, nFolds); % reusing function from Peter Gehler's code for Infinite Kernel Learning - % - % 5. All kernels ready. Pose this as a multiple kernel learning problem next. - % - We have used shogun's python modular interface to perform MKL. - % - Upon performing this MKL, the kernel weights obtained are squared theta values - % - for c=1:size(Cs,2) - C = Cs(c); - test_auROCs = []; test_auPRCs = []; - for f=1:nFolds - % For fold f, corresponding examples will be used as test - thisTrainIndices = find(foldIdForIndices.training(f) == 1); - thisTestIndices = find(foldIdForIndices.test(f) == 1); - - % Computation of conformal MI kernel also inside folds loop, because it - % shouldn't use instances from the test bags - % This restriction is for clustering step, rest, the kernel has to be computed for all wholeTrain indices - indicesForConformalComputation = - %setdiff(trainIndices, thisTestIndices); - %indicesForConformalComputation = setdiff(trainIndices, teSplit{f}); - tic; - [allSeqsNonShiftedConformedSetKernel, rawNonShiftedConformedSetKernel , allSeqsNonShiftedTransformationKernel] = ... - computeConformedMultiInstanceKernel(instanceStartsNonShifted, instanceEndsNonShifted, ... - instanceWideNonShiftedKernel, indicesForConformalComputation, allSeqsAsBagsNonShifted, ... - nClusters, allSeqsNonShiftedStdSetKernel, conformalXformationParam, YwholeTrain, ... - computationVersion); - fprintf('\ndone in %.3f seconds\n', toc); - % - % - fprintf('Conformed Multi-instance kernel for shifted bags...\n'); - tic; - [allSeqsShiftedConformedSetKernel, rawShiftedConformedSetKernel , allSeqsShiftedTransformationKernel] = ... - computeConformedMultiInstanceKernel(instanceStartsShifted, instanceEndsShifted, instanceWideShiftedKernel, ... - indicesForConformalComputation, allSeqsAsBagsShifted, nClusters, allSeqsShiftedStdSetKernel, conformalXformationParam, ... - YwholeTrain, computationVersion); - fprintf('\ndone in %.3f seconds\n', toc); - fprintf('MKL, Fold %d, C-SVM: %.3f, nClusters: %d, Sigma: %.3f\n', f, C, nClusters, conformalXformationParam); - Ytrain = Y(thisTrainIndices); - Ytest = Y(thisTestIndices); - %Ytrain = YwholeTrain(trSplit{f}'); - %Ytest = YwholeTrain(teSplit{f}'); - for k=1:nClusters - % 'Non-shift' conformed kernels - trainConformedKernelNonShifts{k} = allSeqsNonShiftedConformedSetKernel{k}(thisTrainIndices, thisTrainIndices); - testConformedKernelNonShifts{k} = allSeqsNonShiftedConformedSetKernel{k}(thisTrainIndices, thisTestIndices); - % 'Shift' conformed kernels - trainConformedKernelShifts{k} = allSeqsShiftedConformedSetKernel{k}(thisTrainIndices, thisTrainIndices); - testConformedKernelShifts{k} = allSeqsShiftedConformedSetKernel{k}(thisTrainIndices, thisTestIndices); - %% 'Non-shift' conformed kernels - %trainConformedKernelNonShifts{k} = allSeqsNonShiftedConformedSetKernel{k}(trSplit{f}, trSplit{f}); - %testConformedKernelNonShifts{k} = allSeqsNonShiftedConformedSetKernel{k}(trSplit{f}, teSplit{f}); - %% 'Shift' conformed kernels - %trainConformedKernelShifts{k} = allSeqsShiftedConformedSetKernel{k}(trSplit{f}, trSplit{f}); - %testConformedKernelShifts{k} = allSeqsShiftedConformedSetKernel{k}(trSplit{f}, teSplit{f}); - end - % perform MKL, weights, predictions should be returned after mkl - whetherPerformTest = 1; - [subkernelWeights, predictions] = performMKLWithShogunPython(C, Ytrain, Ytest, mklNorm, imbalance, ... - trainConformedKernelNonShifts, trainConformedKernelShifts, ... - testConformedKernelNonShifts, testConformedKernelShifts, outputFolder, whetherPerformTest); - % - fprintf('done in %.3f seconds\n', toc); - [this_teAUROC, this_teAUPRC] = libsvm_plotroc(Ytest', predictions', 'personal'); - % print this to the summary file - this_resultString = [datestr(now), '--#Clusters: ', num2str(nClusters), '--SigmaGaussian: ', ... - num2str(conformalXformationParam), '--C-SVM: ', num2str(C), '--mklNorm: ', ... - num2str(mklNorm), '--teAUROC: ', num2str(this_teAUROC), '--teAUPRC: ', ... - num2str(this_teAUPRC), '\n']; - resultString = [resultString this_resultString]; - fprintf(fid, this_resultString); - test_auROCs = [test_auROCs this_teAUROC]; - test_auPRCs = [test_auPRCs this_teAUPRC]; - end % nFolds for loop ends - % Note best values based on the average performance on the n-folds - if mean(test_auROCs) >= bestParamComb.best_teAUROC - fprintf('mean_teAUROC: %.3f\n', mean(test_auROCs)); - bestParamComb.best_teAUROC = mean(test_auROCs); - bestParamComb.best_C = C; - bestParamComb.best_nClusters = nClusters; - bestParamComb.best_sigma = conformalXformationParam; - end - end % for loop/different values of C ends - % -- Noted best performing pair of values for nClusters and Gaussian RBF sigma - % -- Model selected after cross-validation will be used further to get the instance weights - end -end -fclose(fid); -% Clear some variables created/used during model selection phase -clear trainConformedKernelNonShifts; -clear testConformedKernelNonShifts; -clear trainConformedKernelShifts; -clear testConformedKernelShifts; -% -% Use the best param combinations to train using the whole trainiing set and predict the test set here -% -fprintf('---Test---\n'); -% Re-train the model with all train instances together -nClusters = bestParamComb.best_nClusters; -conformalXformationParam = bestParamComb.best_sigma; -fprintf('nClusters: %d, sigma: %d\n', nClusters, conformalXformationParam); -[allSeqsNonShiftedConformedSetKernel, rawNonShiftedConformedSetKernel, allSeqsNonShiftedTransformationKernel, clusterCentresNonShiftedCase] = ... - computeConformedMultiInstanceKernel(instanceStartsNonShifted, instanceEndsNonShifted, instanceWideNonShiftedKernel, ... - trainIndices, allSeqsAsBagsNonShifted, nClusters, allSeqsNonShiftedStdSetKernel, conformalXformationParam, ... - YwholeTrain, computationVersion); -fprintf('\ndone in %.3f seconds\n', toc); -% -% -fprintf('Conformed Multi-instance kernel for shifted bags...\n'); -[allSeqsShiftedConformedSetKernel, rawShiftedConformedSetKernel, allSeqsShiftedTransformationKernel, clusterCentresShiftedCase] = ... - computeConformedMultiInstanceKernel(instanceStartsShifted, instanceEndsShifted, instanceWideShiftedKernel, ... - trainIndices, allSeqsAsBagsShifted, nClusters, allSeqsShiftedStdSetKernel, conformalXformationParam, ... - YwholeTrain, computationVersion); -fprintf('\ndone in %.3f seconds\n', toc); - -% -for k=1:nClusters - % 'Non-shift' conformed kernels - VtrainConformedKernelNonShifts{k} = allSeqsNonShiftedConformedSetKernel{k}(trainIndices, trainIndices); - VtestConformedKernelNonShifts{k} = allSeqsNonShiftedConformedSetKernel{k}(trainIndices, testIndices); - % 'Shift' conformed kernels - VtrainConformedKernelShifts{k} = allSeqsShiftedConformedSetKernel{k}(trainIndices, trainIndices); - VtestConformedKernelShifts{k} = allSeqsShiftedConformedSetKernel{k}(trainIndices, testIndices); -end -% -whetherPerformTest = 1; %set this to 0, since we will make predictions using computed weight vector -% -[subkernelWeights, predictions] = performMKLWithShogunPython(bestParamComb.best_C, YwholeTrain, Youtertest, mklNorm, imbalance, ... - VtrainConformedKernelNonShifts, VtrainConformedKernelShifts, ... - VtestConformedKernelNonShifts, VtestConformedKernelShifts, outputFolder, whetherPerformTest); -% Predictions returned at this juncture are dummy values, they are not to be used. -% Below, we make our own predictions using the weight vector -% -[test_teAUROC, test_teAUPRC] = libsvm_plotroc(Youtertest', predictions', 'personal'); -fprintf('Validation_teAUROC with test-kernels: %.3f\n', test_teAUROC); - -if(predictUsingWeightVector == 1) - fprintf('---Prediction using weight vector---\n'); - % 1. Use the subkernel weights and get transformed instances among the test examples, may be use applyTransformation.m - [thetaVals, instanceWeightsInEachNonShiftedBag, instanceWeightsInEachShiftedBag] = getInstanceWeights(subkernelWeights, ... - allSeqsNonShiftedTransformationKernel, allSeqsShiftedTransformationKernel); - % Get the transformations for test examples, and the diagonal elements which will be used for normalization further - % Non-shifted case - % - [transformationKernelNonShifted, allSeqsNonShiftedTransformationKernel_Test] = applyTransformation(allSeqsAsBagsNonShifted_Test, clusterCentresNonShiftedCase, conformalXformationParam); - % - % Shifted case - % - [transformationKernelShifted, allSeqsShiftedTransformationKernel_Test] = applyTransformation(allSeqsAsBagsShifted_Test, clusterCentresShiftedCase, conformalXformationParam); - % - % - [thetaVals, instanceWeightsInEachNonShiftedBag_Test, instanceWeightsInEachShiftedBag_Test] = getInstanceWeights(subkernelWeights, ... - allSeqsNonShiftedTransformationKernel_Test, allSeqsShiftedTransformationKernel_Test); - % 2. Use function obtainWeightVector and procure the weight vectors and the bias value - [weightVectors, biasValue] = obtainWeightVector(char(outputFolder), oligoLen, maxDist, subkernelWeights, allSeqsAsBagsNonShifted, allSeqsAsBagsShifted, rawNonShiftedConformedSetKernel, allSeqsNonShiftedTransformationKernel, rawShiftedConformedSetKernel, allSeqsShiftedTransformationKernel); - % 3. Use these to make predictions using y = w*x + b - yhatTemp = 0.0; - - % bagPhi for Test is nBagForTest-by-nClusters - bagPhiNonShifted_Test = zeros( size(allSeqsAsBagsNonShifted_Test{1},1), nBagsForTest, nClusters); - bagPhiShifted_Test = zeros( size(allSeqsAsBagsShifted_Test{1},1), nBagsForTest, nClusters); - % - % For normalization of test samples, we need to obtain the normalization factor for them. - % - for i=1:length(allSeqsAsBagsNonShifted_Test)% this is nBagsForTest - for j=1:nClusters - %allSeqs NonShifted/Shifted TransformationKernel already has nInstances x nClusters - % ALSO NEED NORMALIZATION - term = sum(allSeqsAsBagsNonShifted_Test{i} .* repmat(allSeqsNonShiftedTransformationKernel_Test{i}(:,j)', size(allSeqsAsBagsNonShifted_Test{i},1), 1), 2); - bagPhiNonShifted_Test(:, i, j) = term ./ norm(term, 2); - term = sum(allSeqsAsBagsShifted_Test{i} .* repmat(allSeqsShiftedTransformationKernel_Test{i}(:,j)', size(allSeqsAsBagsShifted_Test{i},1), 1), 2); - bagPhiShifted_Test(:, i, j) = term ./ norm(term, 2); - end - end - fprintf('Bag Phi for test samples done.\n'); - for i=1:nClusters - fprintf('Samarth yhat...\n'); - %disp(size(bagPhiNonShifted_Test(:, 1, 1))); - %disp(size(weightVectors(:, i))); - %disp(size(bagPhiNonShifted_Test(:, :, 1))); - yhatTemp = yhatTemp + weightVectors(:, i)' * bagPhiNonShifted_Test(:, :, i); - %disp(size(yhatTemp)); - end - for i=(nClusters+1):2*nClusters - yhatTemp = yhatTemp + weightVectors(:, i)' * bagPhiShifted_Test(:, :, (i-nClusters)); - end - fprintf('Bias value: %.4f\n', biasValue); - %disp(yhatTemp(1,1:15)); - %disp(yhatTemp(1,186:200)); - yhat = yhatTemp + biasValue; - %disp(yhat(1,1:15)); - %disp(yhat(1,186:200)); - %disp(size(Youtertest)); - %disp(size(yhat)); - [test_teAUROC, test_teAUPRC] = libsvm_plotroc(Youtertest', yhat', 'personal'); - fprintf('Validation_teAUROC with test-kernels: %.3f\n', test_teAUROC); - - % 4. Check and ensure that these are equal to what the Shogun implementation is giving us? - - % If all OK, we can use this for fast prediction of unseen test examples. -end -% -% 6. Obtain instance weights from the subkernel weights -% -[thetaVals, instanceWeightsInEachNonShiftedBag, instanceWeightsInEachShiftedBag] = getInstanceWeights(subkernelWeights, ... - allSeqsNonShiftedTransformationKernel, allSeqsShiftedTransformationKernel); -% -% -if strcmp(whetherToPlotHeatmap, 'Yes') - fprintf('Plotting heatmaps...'); - % Non-shifted segment weights - if(predictUsingWeightVector == 1) - normInstanceWeightsInEachNonShiftedBag = standardizeMatrix(vertcat(cell2mat(instanceWeightsInEachNonShiftedBag)', cell2mat(instanceWeightsInEachNonShiftedBag_Test)')); - else - normInstanceWeightsInEachNonShiftedBag = standardizeMatrix(cell2mat(instanceWeightsInEachNonShiftedBag)'); - end - % - heatmapFname = strcat(outputFolder,'/Heatmap-InstanceWeights-NonShiftedSegments', '_oligoLen', num2str(oligoLen), '_segmentSize', num2str(chunkSizeInBps) , '_maxDist', num2str(maxDist), '_nClusters', num2str(bestParamComb.best_nClusters), '_svmC', num2str(bestParamComb.best_C) ,'_sigma', num2str(bestParamComb.best_sigma), '.pdf'); - % - p = plot_heatmap(normInstanceWeightsInEachNonShiftedBag, heatmapFname, 1, 2); - % - % Shifted segment weights - if(predictUsingWeightVector == 1) - normInstanceWeightsInEachShiftedBag = standardizeMatrix(vertcat(cell2mat(instanceWeightsInEachShiftedBag)', cell2mat(instanceWeightsInEachShiftedBag_Test)')); - else - normInstanceWeightsInEachShiftedBag = standardizeMatrix(cell2mat(instanceWeightsInEachShiftedBag)'); - end - % - heatmapFname = strcat(outputFolder,'/Heatmap-InstanceWeights-ShiftedSegments', '_oligoLen', num2str(oligoLen), '_segmentSize', num2str(chunkSizeInBps) , '_maxDist', num2str(maxDist), '_nClusters', num2str(bestParamComb.best_nClusters), '_svmC', num2str(bestParamComb.best_C) ,'_sigma', num2str(bestParamComb.best_sigma), '.pdf'); - % - p = plot_heatmap(normInstanceWeightsInEachShiftedBag, heatmapFname, 1, 2); - fprintf('done!\n'); -end -% -% 7. Visualize weight vector as motifs -% -if strcmp(whetherToVisualizeWVector, 'Yes') - fprintf('Visualizing weight vectors...\n'); - visualize_wvector(char(outputFolder), oligoLen, maxDist, subkernelWeights, allSeqsAsBagsNonShifted, allSeqsAsBagsShifted, rawNonShiftedConformedSetKernel, allSeqsNonShiftedTransformationKernel, rawShiftedConformedSetKernel, allSeqsShiftedTransformationKernel); -end -fprintf('done!\n'); -% -end % function ends diff --git a/computeConformedMultiInstanceKernel.m b/computeConformedMultiInstanceKernel.m index b0255dd..dd0aacf 100644 --- a/computeConformedMultiInstanceKernel.m +++ b/computeConformedMultiInstanceKernel.m @@ -73,6 +73,7 @@ nSampled = round(sqrt(nClusters * currentTotal)); logMessages(debugMsgLocation, sprintf('--- for kmeans, Total: %d, nSampled: %d, nClusters: %d\n', currentTotal, nSampled, nClusters), debugLevel); XtoPass = XtoPassFrom( transpose(randsample( currentTotal, round(sqrt(nClusters * currentTotal)) )), :); +%XtoPass = XtoPassFrom; clear('XtoPassFrom'); [clusterCentres, matrixOfDistancesFromCentresForPassedVectors] = getExpansionPoints(XtoPass, nClusters, nofRep, debugLevel, debugMsgLocation); clear('XtoPass'); @@ -151,7 +152,7 @@ tempToBeSummed = bsxfun(@times, b1b2_transformationProducts, (b1BagAsMat' * b2BagAsMat)); %tempToBeSummed = bsxfun(@times, b1b2_transformationProducts, full(allSeqsAsBags{b1}' * allSeqsAsBags{b2})); %tempToBeSummed = bsxfun(@times, b1b2_transformationProducts, instanceWideKernel(instanceStarts(b1):instanceEnds(b1), instanceStarts(b2):instanceEnds(b2)) ); - tempKernelCollection(b1, b2, :) = sum(sum(tempToBeSummed,1)); + tempKernelCollection(b1, b2, :) = 1/(idxInstances(b1) * idxInstances(b2)) * sum(sum(tempToBeSummed,1)); end end clear b1BagAsMat; @@ -162,6 +163,7 @@ tempKernel = zeros(nBags); tempKernel = triu(tempKernelCollection(:,:,i), 1) + tempKernelCollection(:,:,i)'; rawConformedMultiInstanceKernel{i} = tempKernel; + %fprintf('***Max.: %.4f --- Min.:%.4f***\n', max(max(tempKernel)), min(min(tempKernel))); conformedMultiInstanceKernel{i} = normalizeKernel(tempKernel); end logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel); @@ -182,6 +184,7 @@ b1b2_transformationProducts = bsxfun(@times, b1r , b2rt); %tempToBeSummed = bsxfun(@times, b1b2_transformationProducts, full(allSeqsAsBags{b1}' * allSeqsAsBags{b2})); tempToBeSummed = bsxfun(@times, b1b2_transformationProducts, instanceWideKernel(instanceStarts(b1):instanceEnds(b1), instanceStarts(b2):instanceEnds(b2)) ); + term = 1/(idxInstances(b1) * idxInstances(b2)); tempKernelCollection(b1, b2, :) = sum(sum(tempToBeSummed,1)); end end @@ -189,9 +192,17 @@ tic; for i=1:nClusters tempKernel = zeros(nBags); + %tempKernel = tempKernelCollection(:,:,i)'; tempKernel = triu(tempKernelCollection(:,:,i), 1) + tempKernelCollection(:,:,i)'; + %if issymmetric(tempKernel) + % fprintf('Samarth symmetric\n'); + %else + % fprintf('Samarth.. not sysmmetric.. triu operation is needed\n'); + %end rawConformedMultiInstanceKernel{i} = tempKernel; + %fprintf('***Max.: %.4f --- Min.:%.4f***\n', max(max(tempKernel)), min(min(tempKernel))); conformedMultiInstanceKernel{i} = normalizeKernel(tempKernel); + %fprintf('***Max.: %.4f --- Min.:%.4f***\n', max(max(conformedMultiInstanceKernel{i})), min(min(conformedMultiInstanceKernel{i}))); end logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel); else % if do-reshaped ends @@ -219,7 +230,7 @@ [~,p] = chol(conformedMultiInstanceKernel{i}); if p==0 else - logMessages(debugMsgLocation,sprintf('Cholesky p: %.4f\n', p), debugLevel); + logMessages(debugMsgLocation,sprintf('cluster %d Cholesky p: %.4f\n', i, p), debugLevel); % p~=0 means there is an eigen value < 0 % print the min(eigen values) in the log logMessages(debugMsgLocation, sprintf('%e,', min(eig(conformedMultiInstanceKernel{i}))), debugLevel); diff --git a/getExpansionPoints.m b/getExpansionPoints.m index 69d8598..18a70dd 100644 --- a/getExpansionPoints.m +++ b/getExpansionPoints.m @@ -18,9 +18,9 @@ rng('default'); % or can be set to 'default' logMessages(debugMsgLocation, sprintf('with K-means: \n'), debugLevel); if debugLevel == 2 - [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%Display: final + [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000);%Display: final elseif debugLevel == 0 - [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off'); + [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000); end logMessages(debugMsgLocation, sprintf('--- Took %.3f seconds\n',toc), debugLevel); diff --git a/getInstanceWeights.m b/getInstanceWeights.m index e2eccce..234abbc 100644 --- a/getInstanceWeights.m +++ b/getInstanceWeights.m @@ -1,4 +1,4 @@ -function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel) +function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel, fname) % % Return the thetaals and @@ -20,9 +20,19 @@ % compute instance weights for instances in each bag % for j=1:nBags - instanceWeightsInEachBag{j} = sum(repmat(thetaVals(1, 1 : nKernels ), ... - size(allSeqsTransformationKernel{j},1), 1) ... - .* allSeqsTransformationKernel{j}, 2); + temp = sum(repmat(thetaVals(1, 1 : nKernels ), ... + size(allSeqsTransformationKernel{j},1), 1) ... + .* allSeqsTransformationKernel{j}, 2); + instanceWeightsInEachBag{j} = temp; + % To write to file + stdTemp = standardizeMatrix(temp'); + finalTemp = zeros(size(stdTemp)); + %using rankings + [~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend'); + %for i=1:size(plotDataM,1) + finalTemp(1,ranking) = 1:size(stdTemp,2); + %end + dlmwrite(fname, finalTemp, '-append'); end diff --git a/getODHFeatureVecInstances.m b/getODHFeatureVecInstances.m index a5f7049..f8111f8 100644 --- a/getODHFeatureVecInstances.m +++ b/getODHFeatureVecInstances.m @@ -138,13 +138,15 @@ %modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1); % Refer paper. -%Chunking already done. +% Chunking already done. +% Set variables before entering loop +singleChunkODHFeatureVecLen = (M^2) * numDist; +multiplier = alphabetLen.^[oligoLen-1:-1:0]; for chunkIndex = 1:numOfChunks thisChunk = seq(chunkStarts(chunkIndex) : chunkBoundaries(chunkIndex)); thisChunkLen = length(thisChunk); - singleChunkODHFeatureVecLen = (M^2) * numDist; singleChunkODHFeatureVec = zeros( singleChunkODHFeatureVecLen , 1); seq_numerical = zeros(1, thisChunkLen); @@ -155,7 +157,6 @@ seq_numerical(seq_numerical==0) = -Inf; seq_numerical = seq_numerical - 1; - multiplier = alphabetLen.^[oligoLen-1:-1:0]; for posInChunk = 1:thisChunkLen-oligoLen+1 ind_list(posInChunk) = sum(multiplier .* seq_numerical(posInChunk:posInChunk+oligoLen-1)); end diff --git a/mkl.py b/mkl.py index 9655585..c7b1cd2 100755 --- a/mkl.py +++ b/mkl.py @@ -7,7 +7,7 @@ try: from modshogun import BinaryLabels - from modshogun import CombinedKernel, CustomKernel + from modshogun import CombinedKernel, CustomKernel, SqrtDiagKernelNormalizer from modshogun import MKLClassification from modshogun import LibSVM except ImportError: @@ -67,7 +67,7 @@ else: raise OptionValueError("Cannot open %s as a file. Please check if it exists." % thisFname) - +#trainkernel.set_normalizer(SqrtDiagKernelNormalizer(True)) # train MKL mkl = MKLClassification(LibSVM()) @@ -79,6 +79,7 @@ mkl.set_C_mkl(options.C) # set kernel and labels mkl.set_kernel(trainkernel) +#kernel.set_normalizer(SqrtDiagKernelNormalizer(True)) mkl.set_labels(trainlabels) mkl.io.disable_progress() diff --git a/plot_heatmap.m b/plot_heatmap.m index 28a87af..4a9d41a 100644 --- a/plot_heatmap.m +++ b/plot_heatmap.m @@ -18,29 +18,41 @@ end normDataM = standardizeMatrix(dataM); +%plotDataM = normDataM; plotDataM = zeros(size(normDataM)); %using rankings [~, rankingDataM] = sort(normDataM(1:size(dataM,1),1:size(dataM,2)), 2, 'descend'); for i=1:size(plotDataM,1) plotDataM(i,rankingDataM(i,:)) = 1:size(dataM,2); end +% adjust indices +%nCols = (size(plotDataM,2)+1)/2; +%idx = []; +%for i=1:nCols +% idx = [idx i i+(nCols)]; +%end +%nidx = idx(1:(end-1)); +%plotDataM_adjusted = plotDataM(:,nidx); fig = figure; %colormap(map) % -colormap(gray(climmax-climmin+1)); +%colormap(gray(climmax-climmin+1)); % clims = [climmin climmax]; -imagesc(plotDataM, clims); +%clims = [0 1]; +imagesc(plotDataM); +%imagesc(standardizeMatrix(dataM)); cb = colorbar; % -caxismin = 1; -caxismax = climmax-climmin+1; -caxis([caxismin caxismax]); % sets colorbar limits -set(cb,'xtick',[climmin climmax]); +%caxismin = 1; +%caxismax = climmax-climmin+1; +%caxis([caxismin caxismax]); % sets colorbar limits +%set(cb,'xtick',[climmin climmax]); %set(cb, 'ylim', climmin+0.5:1:climmax); %labels = {'Rank 1', 'Rank 2','Rank 3 \& beyond'}; %lcolorbar(labels,'fontweight','bold'); % -saveas(gcf, filename, 'pdf'); +%saveas(gcf, filename, 'pdf'); +print(gcf, filename, '-dpdf', '-r900'); end