Updated help text for functions, sanitized the barplotting r-script a…

…nd the main functions for comik.
snikumbh · Jul 28, 2017 · f674716 · f674716
1 parent 5799b81
commit f674716
Show file tree

Hide file tree

Showing 7 changed files with 185 additions and 52 deletions.
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ export PATH=".:$PATH"
 Example function call from inside Matlab:
 For simulated dataset 1 provided in the folder `sample_data`
 ```Matlab
-comik_wrapper('sample_data/simulated_dataset1/pos.fasta', 'sample_data/simulated_dataset1/neg.fa', 600, 600, [501:600 1101:1200], 'comik_run_simulated_dataset1', [2], 10, 10, [2 5 7], 10.^[1:1:2], 10.^[-3:1:3], 2.0, 10, 5, 'No', 'Yes', 2, 'runLog.txt');
+comik_wrapper('sample_data/simulated_dataset1/pos.fasta', 'sample_data/simulated_dataset1/neg.fasta', 600, 600, [501:600 1101:1200], 'comik_run_simulated_dataset1', [2], 10, 10, [2 5 7], 10.^[1:1:2], 10.^[-3:1:3], 2.0, 10, 5, 'No', 'Yes', 2, 'runLog.txt');
 ```
 
 _CoMIK_ accepts two FASTA files as input -- the first FASTA file containing sequences in the positive class followed by a second FASTA file containing the sequences in the negative class. Other params are explained below.
@@ -56,6 +56,7 @@ Param name | type  | default value | Additional comments
  number-of-inner-folds | int | 10 |  
  number-of-outer-folds | int | 5 |  
  whetherToPlotHeatmap | str | 'No' | Possible values: 'Yes' or 'No'; Set 'Yes' only if all sequences are of the same length.  
+ whetherToVisualizeWVector | str | 'Yes' | Possible values: 'Yes' or 'No'; Set 'Yes' when you wish to have the distance-centric k-mer visualization.
  debugLevel | int | 2 | Possible values: [0, 1, 2]. Value 0 makes _comik_ completely silent, and 2 makes it maximally verbose. Value 1 may be used in the future.   
  debugMsgLocation | int/str | 1 | Value 1 denotes the command prompt, else specify a filename, say 'runLog.txt'. This file is written for each outer fold separately.  
  computationVersion | str | 'Looping'| Possible values: 'Looping' or 'AccumArray'. 'Looping' is faster and preferred/recommended for large datasets when 'AccumArray' can be memory intensive.  

diff --git a/barplot-heatmap-visualization.R b/barplot-heatmap-visualization.R
@@ -1,40 +1,56 @@
 
-
-
-pdf('sample_viz_in_r.pdf', width=10.0, height=6.0)
-csv_filename <- '<enter-name>.csv'
-flds <- count.fields(csv_filename, sep=',')
-s <- read.csv(csv_filename, fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1)))
-r <- s[1:25,]
-r_ones <- r
-r_ones[!is.na(r)] <- 1
+# cmdline arguments to be given in the following order
+# 1. inputFilename
+# 2. width
+# 3. height
+# 4. outputFilename (csv format)
+# 5. # of sequences to visualize 
+# (This is useful because there could be hugely many sequences in the 
+# collection and visualizing them all together may not be a good idea!)
+#
+# author: snikumbh@mpi-inf.mpg.de
+
+args <- commandArgs(TRUE)
+inputFilename <- args[1]
+widthVal <- args[2]
+heightVal <- args[3]
+outputFilename <- args[4]
+numOfSequences <- args[5]
+
+# If a file with this name already exists, this overwrites it!
+pdf('barplot-visualization-variable-len-seq.pdf', width=10.0, height=6.0)
+csv_filename <- paste0(outputFilename, '.csv')
+flds <- count.fields(csv_filename, sep=',') # get the number of segments in all sequences
+seqs <- read.csv(csv_filename, fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1)))
+seqSegs <- seqs[1:numOfSequences,]
+seqSegs_ones <- seqSegs
+seqSegs_ones[!is.na(seqSegs)] <- 1 # sequences which had less than the maximum number of segments for any sequence in the collection, will have NA
 
 ncolors <- max(flds)
 color_palette <- heat.colors(ncolors)
 
-# If a different color palette is required
-# jet.colors <-
-#  colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan",
+# If a different color palette is required could use the following with experimenting:
+# jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan",
 #                     "#7FFF7F", "yellow", "#FF7F00"))#, "red", "#7F0000"))
 
-ypos <- apply(r, 2, cumsum)
-ypos <- ypos - r/2
+ypos <- apply(seqSegs, 2, cumsum)
+ypos <- ypos - seqSegs/2
 
-xpos <- barplot(t(r_ones), col=c("white"), axes=F, horiz=T, axisnames=F)
+xpos <- barplot(t(seqSegs_ones), col=c("white"), axes=F, horiz=T, axisnames=F)
 cp <- 0
-for (i in 1:ncol(r_ones)){
-    temp <- color_palette[r[seq(1,nrow(r),by=1), i]]
+for (i in 1:ncol(seqSegs_ones)){
+    temp <- color_palette[seqSegs[seq(1,nrow(seqSegs),by=1), i]]
     cp <- cbind(cp, temp)
 }
 new_cp <- cp[,-1]
 
-for (i in 1:nrow(r_ones)){
-    barplot(t(r_ones[i,]), col=new_cp[i,], horiz=T, add=T, space=(i-1)*1.2, axes=F, xlab="", axisnames=F)
+for (i in 1:nrow(seqSegs_ones)){
+    barplot(t(seqSegs_ones[i,]), col=new_cp[i,], horiz=T, add=T, space=(i-1)*1.2, axes=F, xlab="", axisnames=F)
 }
 
 axis(1,at=0.5+seq(0,ncolors-1,1), labels=seq(1,ncolors,1))
-#Additional axis showing the nucleotide ranges, if required
-## axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1))
+# Additional axis showing the nucleotide ranges, if required
+# axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1))
 legend("topright", legend = seq(1,ncolors), fill = color_palette, title="Segment ranks", ncol = 2, x.intersp = 0.1, bty="n", cex=0.8)
 
 dev.off()

diff --git a/comik_main_with_weight_vector.m b/comik_main_with_weight_vector.m
@@ -2,7 +2,7 @@
 
 % Author: snikumbh 
 %
-% All segments, shifted and non-shifted in one bag; order of segments: non-shifted segments followed by shifted segments.
+% All segments, shifted and non-shifted in one bag; Order of segments: non-shifted segments followed by shifted segments.
 % This results in #kernels = #clusterCentres
 %
 % debugLevel 2	prints all messages
@@ -131,7 +131,7 @@
 % This hasn't been tested for either correctness or efficiency. We recommend using sparseComputation.
 % We are using sparseComputation.
 sparseComputation = 1;
-[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation);
+[instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation);
 %
 % Further are functions that will be recomputed per inner cross-validation iteration
 % Cross-validation loop is also handled simultaneously
@@ -145,6 +145,7 @@
 bestParamComb.best_teAUROC = 0.0;
 bestParamComb.best_nClusters = 0;
 bestParamComb.best_sigma = 0.0;
+
 % open summary file for writing
 fid=fopen(runSummaryFilename, 'a');
 

diff --git a/comik_wrapper.m b/comik_wrapper.m
@@ -53,6 +53,7 @@
 % Param 'Cs': Cost values for SVM
 % 
 % Author: snikumbh
+
 totalArguments = 20;
 if nargin < totalArguments
     computationVersion = 'Looping';

diff --git a/computeConformedMultiInstanceKernel.m b/computeConformedMultiInstanceKernel.m
@@ -1,6 +1,65 @@
 function [conformedMultiInstanceKernel, rawConformedMultiInstanceKernel, instancesTransformationKernel, clusterCentres] = computeConformedMultiInstanceKernel(instanceStarts, instanceEnds, instanceWideKernel, trainIndices, allSeqsAsBags, nClusters, sig, Y, computationVersion, debugLevel, debugMsgLocation)
+
+% COMPUTECONFORMEDMULTIINSTANCEKERNEL
+% Computes the conformally transformed multi-instance kernel
+
+% INPUT PARAMS
+% param 'instanceStarts' (vector)
+% Indices where instances (segments) of any sequence begin
+%
+% param 'instanceEnds' (vector)
+% Indices where instances (segments) of any sequence end
+% 
+% param 'instanceWideKernel' (matrix)
+% The instance-wide kernel matrix of dimension #Instances x #Instances
+%
+% param 'trainIndices' (vector)
+% Indices of the training sequences
+%
+% param 'allSeqsAsBags' (cell array)
+% Collection of all segments per sequence
+% 
+% param 'nClusters'
+% number of clusters for k-means
+%
+% param 'sig' 
+% Gaussian bandwidth (sigma) value for the Gaussian transformation
+%
+% param 'Y' (vector)
+% Vector of labels
+%
+% param 'computationVersion' ('AccumArray'/'Looping')
+% Flag specifying what approach to use for computing the kernel
+%
+% param 'debugLevel' (0/1/2)
+%
+% param 'debugMsgLocation' (1/fileID)
+%
+%
+% OUTPUT PARAMS
+% param 'conformedMultiInstanceKernel' (matrix)
+% Normalized conformally transformed MI kernel
 % 
-% Obtain expansion points
+% param 'rawConformedMultiInstanceKernel' (matrix)
+% Unnormalized conformally transformed MI kernel
+%
+% param 'instancesTransformationKernel' (matrix)
+% Transformation of the training instances
+%
+% param 'clusterCentres' (matrix)
+% Cluster centres obtained upon k-means
+%
+% ADDITIONAL NOTES:
+% -- Stratification
+%    . If stratification is required, see below, set stratified = 1
+% -- Replicates for k-means
+%    . currently set to 1
+%    . can be increased
+%
+% author: snikumbh@mpi-inf.mpg.de
+
+
+% 1.1 Obtain expansion points
 %
 % -- Convert allSeqsAsBags in to a matrix X of dimensions nInstances x FVlength 
 % 
@@ -16,8 +75,13 @@
 nBags = length(allSeqsAsBags);
 % nBags, only as many bags as training examples
 idxInstances = zeros(1,nBags);
-posInd = size(find(Y > 0),2);
-negInd = size(find(Y < 0),2);
+stratified = 0;
+% The following two lines also used for stratified representation.
+if stratified
+    posInd = size(find(Y > 0),2);
+    negInd = size(find(Y < 0),2);
+end
+
 if strcmp(computationVersion, 'AccumArray')
     instanceIDVector = zeros(1,instanceEnds(end));
 end
@@ -32,15 +96,17 @@
     if strcmp(computationVersion, 'AccumArray')
         instanceIDVector(instanceStarts(i):instanceEnds(i)) = repmat(i, [1 size(allSeqsAsBags{i}, 2)]);
     end
-    %% useful to have stratified repressentation, we don't need this now.
-    % if i <= posInd
-    %     nInstancesPos = nInstancesPos + size(allSeqsAsBags{i}, 2);
-    % end 
-    %% useful to have stratified repressentation
+    % The following is useful to have stratified repressentation, 
+    % we don't need this now.
+    if stratified
+        if i <= posInd
+            nInstancesPos = nInstancesPos + size(allSeqsAsBags{i}, 2);
+        end 
+    end
     if any(i==trainIndices) == 1 %check, if a trainingIndex, get instances
         currentTotal = currentTotal+idxInstances(i);
     end 
-    %% useful to count instances only in training bags
+    % useful to count instances only in training bags
 end
 forIndicesInX = cumsum(idxInstances);
 nInstances = sum(idxInstances);
@@ -49,7 +115,6 @@
     assert(size(instanceIDVector,2) == nInstances);
 end
 
-%X = zeros( nInstances, size(allSeqsAsBags{1},1) );
 logMessages(debugMsgLocation, sprintf('\n--- currentTotal for setting a zeros XtoPassFrom: %d\n', currentTotal), debugLevel);
 XtoPassFrom = zeros( currentTotal, size(allSeqsAsBags{1},1) );
 % Make sure that only instances from training bags are taken
@@ -68,7 +133,7 @@
 %           data points from nInstances
 %         - this will give nClusters
 
-% If sending complete X for k-means clustering set XtoPass = X;
+% If sending complete XtoPassFrom for k-means clustering set XtoPass = XtoPassFrom;
 % Replicates
 nofRep = 1;
 % Randomly sample without replacement
@@ -110,16 +175,16 @@
 if strcmp(computationVersion, 'AccumArray')
     logMessages(debugMsgLocation, sprintf('\nAccumarray version---\n'), debugLevel);tic;
     instanceSubs = combvec(instanceIDVector, instanceIDVector)';
-    %instanceSubs = tempInstanceSubs(tempInstanceSubs(:,1) <= tempInstanceSubs(:,2),:);
-    %%instanceSubs are only the upper-triangular indices including the diagonal
-    %clear('tempInstanceSubs');
+    % instanceSubs = tempInstanceSubs(tempInstanceSubs(:,1) <= tempInstanceSubs(:,2),:);
+    % instanceSubs are only the upper-triangular indices including the diagonal
+    % clear('tempInstanceSubs');
     instanceWideKernelRepeated = repmat(instanceWideKernel, 1, 1, nClusters);
     allInstancesTransformations = cat(1,instancesTransformationKernel{:});
     for i=1:nClusters
         KernelAsVector = (allInstancesTransformations(:,i) * allInstancesTransformations(:,i)') .* instanceWideKernelRepeated(:,:,i);
         tempKernel = accumarray(instanceSubs, KernelAsVector(:));
         assert(size(tempKernel,1) == size(zeros(nBags),1));
-        %tempKernel = triu(tempKernel, 1) + tempKernel'; %because instanceSubs stored indices for only the upper-triangular part of the matrix
+        % tempKernel = triu(tempKernel, 1) + tempKernel'; %because instanceSubs stored indices for only the upper-triangular part of the matrix
         conformedMultiInstanceKernel{i} = normalizeKernel(tempKernel);
     end
     clear('instanceWideKernelRepeated');
@@ -129,12 +194,12 @@
     logMessages(debugMsgLocation, sprintf('done in %.3f  seconds', toc), debugLevel);
 end
 if strcmp(computationVersion, 'Looping')
-    %% one can convert the transformationKernels into 3D matrices
+    % One can convert the transformationKernels into 3D matrices
     logMessages(debugMsgLocation, sprintf('\n--- Looping-over-bags version...'), debugLevel);
-    %% Performing the reshaped version
+    % Performing the reshaped version
     if issparse(allSeqsAsBags{1})
 	logMessages(debugMsgLocation, sprintf('Doing the reshaped version..\n'), debugLevel);
-        %% Pre-allocate memory, doesn't help in speed
+        % Pre-allocate memory, doesn't help in speed
         % for i=1:nClusters
         %     rawConformedMultiInstanceKernel{i} = zeros(nBags);
         %     conformedMultiInstanceKernel{i} = zeros(nBags);

diff --git a/computeInstanceWideKernel.m b/computeInstanceWideKernel.m
@@ -1,7 +1,36 @@
 function  [instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation)
 
 % COMPUTEINSTANCEWIDEKERNEL 
-% 
+% Computes the instance-wide kernel matrix given a bag of instances
+%
+% INPUT PARAMS
+% param 'allSeqsAsBags' (cell array)
+% Collection of all segments per sequence
+%
+% param 'thisInstances' (vector)
+% #instances in each bag (in other words, #segments per sequence) 
+%
+% param 'sparseComputation' (0/1)
+% Flag specifying whether sparseComputation is to be performed (0/1)
+%
+% param 'debugLevel' (0/1/2)
+%
+% param 'debugMsgLocation' (1/fileID)
+%
+%
+% OUTPUT PARAMS
+% param 'instanceStarts' (vector)
+% Indexes in the vector where instances (segments) of any sequence begin
+%
+% param 'instanceEnds' (vector)
+% Indexes in the vector where istances (segments) of any sequence end
+%
+% param 'instanceWideKernel' (matrix)
+% The instance-wide kernel matrix of dimension #Instances x #Instances
+%
+% ADDITIONAL NOTES:
+%
+% author: snikumbh@mpi-inf.mpg.de
 
 if nargin < 3
     debugMsgLocation = 1;
@@ -26,7 +55,8 @@
 	allSeqsAsMat(:, instanceStarts(i): instanceEnds(i)) = allSeqsAsBags{i};
     end
     logMessages(debugMsgLocation, sprintf('Loop completed in %.4f seconds\n', toc), debugLevel);
-    % Above, looping version instead of cell2mat. This takes nearly half the time for really long sequences and too many instances.
+    % Above is a looping version instead of cell2mat (see next line). This takes 
+    % nearly half the time for really long sequences and too many instances.
     % tic; allSeqsAsMat = cell2mat(allSeqsAsBags); toc;
     tic;
     instanceWideKernel = full(allSeqsAsMat' * allSeqsAsMat);

diff --git a/readFastaSequences.m b/readFastaSequences.m
@@ -1,17 +1,36 @@
 function [nSeqs, ommittedIndices, passFasta] = readFastaSequences(txt, segmentSizeInBps, givenNSeqs, outputFolder, debugLevel, debugMsgLocation)
 
 % READFASTASEQUENCES
-%
-% READFASTA reads fasta file and extracts sequence to fasta variable.
+% 
+% READFASTASEQUENCES reads fasta file and extracts sequence to fasta variable.
 % Modified version of readfasta shipped with ODH Matlab code.
+% 
+% INPUT arguments:
+% txt - name of FASTA filename
+% 
+% segmentSizeInBps - segment size in basepairs
 %
-% fasta - struct containing cell arrays with legends and sequences
-%	  fasta.legend   = cell array with sequence legends (headers)
-%	  fasta.seqeunce = cell array with sequences
+% givenNSeqs - # sequences as given from the main function (expected in 
+%	the FASTA file). This is needed since the returned number of 
+%	sequences can be different than givenNSeqs
 %
+% outputFolder - path to the output folder (omitted fasta IDs are written 
+%	at this location
 %
-
-% Edits: snikumbh@mpi-inf,mpg.de
+% debugLevel and debugMsgLocation - (default) 2, and (default) 1 for stdout 
+%	or filename if to be written to file
+%
+% RETURNS:
+% passfasta - struct containing cell arrays with legends and sequences
+%	passFasta.legend   = cell array with sequence legends (headers)
+%	passFasta.seqeunce = cell array with sequences
+% 
+% omittedIndices - List of indices that were omittedIndices due to their 
+%	length shorter than the segment size
+% 
+% nSeqs - #sequences read from the given file (
+%
+% author: snikumbh@mpi-inf,mpg.de
 
 totalArguments = 6;
 if nargin < totalArguments
@@ -27,7 +46,7 @@
 i=0;
 ommittedIndices = [];
 fasta.title=[txt,',  read: ', date];
-% READS SEQUENCES AND THEIR LEGENDS
+% Read Sequences and their FASTA legends
 fid=fopen(txt,'r');
 ofid=fopen(strcat(outputFolder, '/ommittedFastaIds.txt'), 'a'); 
 fprintf(ofid, '----%s----\n', txt);