From f674716b88a1a033eb21a77c9f88b9619d3c29bb Mon Sep 17 00:00:00 2001 From: Sarvesh Prakash Nikumbh Date: Fri, 28 Jul 2017 12:16:29 +0200 Subject: [PATCH] Updated help text for functions, sanitized the barplotting r-script and the main functions for comik. --- README.md | 3 +- barplot-heatmap-visualization.R | 58 +++++++++------ comik_main_with_weight_vector.m | 5 +- comik_wrapper.m | 1 + computeConformedMultiInstanceKernel.m | 101 +++++++++++++++++++++----- computeInstanceWideKernel.m | 34 ++++++++- readFastaSequences.m | 35 +++++++-- 7 files changed, 185 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index ff26c58..2fffd3d 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ export PATH=".:$PATH" Example function call from inside Matlab: For simulated dataset 1 provided in the folder `sample_data` ```Matlab -comik_wrapper('sample_data/simulated_dataset1/pos.fasta', 'sample_data/simulated_dataset1/neg.fa', 600, 600, [501:600 1101:1200], 'comik_run_simulated_dataset1', [2], 10, 10, [2 5 7], 10.^[1:1:2], 10.^[-3:1:3], 2.0, 10, 5, 'No', 'Yes', 2, 'runLog.txt'); +comik_wrapper('sample_data/simulated_dataset1/pos.fasta', 'sample_data/simulated_dataset1/neg.fasta', 600, 600, [501:600 1101:1200], 'comik_run_simulated_dataset1', [2], 10, 10, [2 5 7], 10.^[1:1:2], 10.^[-3:1:3], 2.0, 10, 5, 'No', 'Yes', 2, 'runLog.txt'); ``` _CoMIK_ accepts two FASTA files as input -- the first FASTA file containing sequences in the positive class followed by a second FASTA file containing the sequences in the negative class. Other params are explained below. @@ -56,6 +56,7 @@ Param name | type | default value | Additional comments number-of-inner-folds | int | 10 | number-of-outer-folds | int | 5 | whetherToPlotHeatmap | str | 'No' | Possible values: 'Yes' or 'No'; Set 'Yes' only if all sequences are of the same length. + whetherToVisualizeWVector | str | 'Yes' | Possible values: 'Yes' or 'No'; Set 'Yes' when you wish to have the distance-centric k-mer visualization. debugLevel | int | 2 | Possible values: [0, 1, 2]. Value 0 makes _comik_ completely silent, and 2 makes it maximally verbose. Value 1 may be used in the future. debugMsgLocation | int/str | 1 | Value 1 denotes the command prompt, else specify a filename, say 'runLog.txt'. This file is written for each outer fold separately. computationVersion | str | 'Looping'| Possible values: 'Looping' or 'AccumArray'. 'Looping' is faster and preferred/recommended for large datasets when 'AccumArray' can be memory intensive. diff --git a/barplot-heatmap-visualization.R b/barplot-heatmap-visualization.R index fb0fc0d..0781424 100644 --- a/barplot-heatmap-visualization.R +++ b/barplot-heatmap-visualization.R @@ -1,40 +1,56 @@ - - -pdf('sample_viz_in_r.pdf', width=10.0, height=6.0) -csv_filename <- '.csv' -flds <- count.fields(csv_filename, sep=',') -s <- read.csv(csv_filename, fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1))) -r <- s[1:25,] -r_ones <- r -r_ones[!is.na(r)] <- 1 +# cmdline arguments to be given in the following order +# 1. inputFilename +# 2. width +# 3. height +# 4. outputFilename (csv format) +# 5. # of sequences to visualize +# (This is useful because there could be hugely many sequences in the +# collection and visualizing them all together may not be a good idea!) +# +# author: snikumbh@mpi-inf.mpg.de + +args <- commandArgs(TRUE) +inputFilename <- args[1] +widthVal <- args[2] +heightVal <- args[3] +outputFilename <- args[4] +numOfSequences <- args[5] + +# If a file with this name already exists, this overwrites it! +pdf('barplot-visualization-variable-len-seq.pdf', width=10.0, height=6.0) +csv_filename <- paste0(outputFilename, '.csv') +flds <- count.fields(csv_filename, sep=',') # get the number of segments in all sequences +seqs <- read.csv(csv_filename, fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1))) +seqSegs <- seqs[1:numOfSequences,] +seqSegs_ones <- seqSegs +seqSegs_ones[!is.na(seqSegs)] <- 1 # sequences which had less than the maximum number of segments for any sequence in the collection, will have NA ncolors <- max(flds) color_palette <- heat.colors(ncolors) -# If a different color palette is required -# jet.colors <- -# colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", +# If a different color palette is required could use the following with experimenting: +# jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", # "#7FFF7F", "yellow", "#FF7F00"))#, "red", "#7F0000")) -ypos <- apply(r, 2, cumsum) -ypos <- ypos - r/2 +ypos <- apply(seqSegs, 2, cumsum) +ypos <- ypos - seqSegs/2 -xpos <- barplot(t(r_ones), col=c("white"), axes=F, horiz=T, axisnames=F) +xpos <- barplot(t(seqSegs_ones), col=c("white"), axes=F, horiz=T, axisnames=F) cp <- 0 -for (i in 1:ncol(r_ones)){ - temp <- color_palette[r[seq(1,nrow(r),by=1), i]] +for (i in 1:ncol(seqSegs_ones)){ + temp <- color_palette[seqSegs[seq(1,nrow(seqSegs),by=1), i]] cp <- cbind(cp, temp) } new_cp <- cp[,-1] -for (i in 1:nrow(r_ones)){ - barplot(t(r_ones[i,]), col=new_cp[i,], horiz=T, add=T, space=(i-1)*1.2, axes=F, xlab="", axisnames=F) +for (i in 1:nrow(seqSegs_ones)){ + barplot(t(seqSegs_ones[i,]), col=new_cp[i,], horiz=T, add=T, space=(i-1)*1.2, axes=F, xlab="", axisnames=F) } axis(1,at=0.5+seq(0,ncolors-1,1), labels=seq(1,ncolors,1)) -#Additional axis showing the nucleotide ranges, if required -## axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1)) +# Additional axis showing the nucleotide ranges, if required +# axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1)) legend("topright", legend = seq(1,ncolors), fill = color_palette, title="Segment ranks", ncol = 2, x.intersp = 0.1, bty="n", cex=0.8) dev.off() diff --git a/comik_main_with_weight_vector.m b/comik_main_with_weight_vector.m index 192c8f7..9f35df4 100644 --- a/comik_main_with_weight_vector.m +++ b/comik_main_with_weight_vector.m @@ -2,7 +2,7 @@ % Author: snikumbh % -% All segments, shifted and non-shifted in one bag; order of segments: non-shifted segments followed by shifted segments. +% All segments, shifted and non-shifted in one bag; Order of segments: non-shifted segments followed by shifted segments. % This results in #kernels = #clusterCentres % % debugLevel 2 prints all messages @@ -131,7 +131,7 @@ % This hasn't been tested for either correctness or efficiency. We recommend using sparseComputation. % We are using sparseComputation. sparseComputation = 1; -[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation); +[instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation); % % Further are functions that will be recomputed per inner cross-validation iteration % Cross-validation loop is also handled simultaneously @@ -145,6 +145,7 @@ bestParamComb.best_teAUROC = 0.0; bestParamComb.best_nClusters = 0; bestParamComb.best_sigma = 0.0; + % open summary file for writing fid=fopen(runSummaryFilename, 'a'); diff --git a/comik_wrapper.m b/comik_wrapper.m index d0f6c7c..7a50157 100644 --- a/comik_wrapper.m +++ b/comik_wrapper.m @@ -53,6 +53,7 @@ % Param 'Cs': Cost values for SVM % % Author: snikumbh + totalArguments = 20; if nargin < totalArguments computationVersion = 'Looping'; diff --git a/computeConformedMultiInstanceKernel.m b/computeConformedMultiInstanceKernel.m index 2d4b6a2..383a289 100644 --- a/computeConformedMultiInstanceKernel.m +++ b/computeConformedMultiInstanceKernel.m @@ -1,6 +1,65 @@ function [conformedMultiInstanceKernel, rawConformedMultiInstanceKernel, instancesTransformationKernel, clusterCentres] = computeConformedMultiInstanceKernel(instanceStarts, instanceEnds, instanceWideKernel, trainIndices, allSeqsAsBags, nClusters, sig, Y, computationVersion, debugLevel, debugMsgLocation) + +% COMPUTECONFORMEDMULTIINSTANCEKERNEL +% Computes the conformally transformed multi-instance kernel + +% INPUT PARAMS +% param 'instanceStarts' (vector) +% Indices where instances (segments) of any sequence begin +% +% param 'instanceEnds' (vector) +% Indices where instances (segments) of any sequence end +% +% param 'instanceWideKernel' (matrix) +% The instance-wide kernel matrix of dimension #Instances x #Instances +% +% param 'trainIndices' (vector) +% Indices of the training sequences +% +% param 'allSeqsAsBags' (cell array) +% Collection of all segments per sequence +% +% param 'nClusters' +% number of clusters for k-means +% +% param 'sig' +% Gaussian bandwidth (sigma) value for the Gaussian transformation +% +% param 'Y' (vector) +% Vector of labels +% +% param 'computationVersion' ('AccumArray'/'Looping') +% Flag specifying what approach to use for computing the kernel +% +% param 'debugLevel' (0/1/2) +% +% param 'debugMsgLocation' (1/fileID) +% +% +% OUTPUT PARAMS +% param 'conformedMultiInstanceKernel' (matrix) +% Normalized conformally transformed MI kernel % -% Obtain expansion points +% param 'rawConformedMultiInstanceKernel' (matrix) +% Unnormalized conformally transformed MI kernel +% +% param 'instancesTransformationKernel' (matrix) +% Transformation of the training instances +% +% param 'clusterCentres' (matrix) +% Cluster centres obtained upon k-means +% +% ADDITIONAL NOTES: +% -- Stratification +% . If stratification is required, see below, set stratified = 1 +% -- Replicates for k-means +% . currently set to 1 +% . can be increased +% +% author: snikumbh@mpi-inf.mpg.de + + +% 1.1 Obtain expansion points % % -- Convert allSeqsAsBags in to a matrix X of dimensions nInstances x FVlength % @@ -16,8 +75,13 @@ nBags = length(allSeqsAsBags); % nBags, only as many bags as training examples idxInstances = zeros(1,nBags); -posInd = size(find(Y > 0),2); -negInd = size(find(Y < 0),2); +stratified = 0; +% The following two lines also used for stratified representation. +if stratified + posInd = size(find(Y > 0),2); + negInd = size(find(Y < 0),2); +end + if strcmp(computationVersion, 'AccumArray') instanceIDVector = zeros(1,instanceEnds(end)); end @@ -32,15 +96,17 @@ if strcmp(computationVersion, 'AccumArray') instanceIDVector(instanceStarts(i):instanceEnds(i)) = repmat(i, [1 size(allSeqsAsBags{i}, 2)]); end - %% useful to have stratified repressentation, we don't need this now. - % if i <= posInd - % nInstancesPos = nInstancesPos + size(allSeqsAsBags{i}, 2); - % end - %% useful to have stratified repressentation + % The following is useful to have stratified repressentation, + % we don't need this now. + if stratified + if i <= posInd + nInstancesPos = nInstancesPos + size(allSeqsAsBags{i}, 2); + end + end if any(i==trainIndices) == 1 %check, if a trainingIndex, get instances currentTotal = currentTotal+idxInstances(i); end - %% useful to count instances only in training bags + % useful to count instances only in training bags end forIndicesInX = cumsum(idxInstances); nInstances = sum(idxInstances); @@ -49,7 +115,6 @@ assert(size(instanceIDVector,2) == nInstances); end -%X = zeros( nInstances, size(allSeqsAsBags{1},1) ); logMessages(debugMsgLocation, sprintf('\n--- currentTotal for setting a zeros XtoPassFrom: %d\n', currentTotal), debugLevel); XtoPassFrom = zeros( currentTotal, size(allSeqsAsBags{1},1) ); % Make sure that only instances from training bags are taken @@ -68,7 +133,7 @@ % data points from nInstances % - this will give nClusters -% If sending complete X for k-means clustering set XtoPass = X; +% If sending complete XtoPassFrom for k-means clustering set XtoPass = XtoPassFrom; % Replicates nofRep = 1; % Randomly sample without replacement @@ -110,16 +175,16 @@ if strcmp(computationVersion, 'AccumArray') logMessages(debugMsgLocation, sprintf('\nAccumarray version---\n'), debugLevel);tic; instanceSubs = combvec(instanceIDVector, instanceIDVector)'; - %instanceSubs = tempInstanceSubs(tempInstanceSubs(:,1) <= tempInstanceSubs(:,2),:); - %%instanceSubs are only the upper-triangular indices including the diagonal - %clear('tempInstanceSubs'); + % instanceSubs = tempInstanceSubs(tempInstanceSubs(:,1) <= tempInstanceSubs(:,2),:); + % instanceSubs are only the upper-triangular indices including the diagonal + % clear('tempInstanceSubs'); instanceWideKernelRepeated = repmat(instanceWideKernel, 1, 1, nClusters); allInstancesTransformations = cat(1,instancesTransformationKernel{:}); for i=1:nClusters KernelAsVector = (allInstancesTransformations(:,i) * allInstancesTransformations(:,i)') .* instanceWideKernelRepeated(:,:,i); tempKernel = accumarray(instanceSubs, KernelAsVector(:)); assert(size(tempKernel,1) == size(zeros(nBags),1)); - %tempKernel = triu(tempKernel, 1) + tempKernel'; %because instanceSubs stored indices for only the upper-triangular part of the matrix + % tempKernel = triu(tempKernel, 1) + tempKernel'; %because instanceSubs stored indices for only the upper-triangular part of the matrix conformedMultiInstanceKernel{i} = normalizeKernel(tempKernel); end clear('instanceWideKernelRepeated'); @@ -129,12 +194,12 @@ logMessages(debugMsgLocation, sprintf('done in %.3f seconds', toc), debugLevel); end if strcmp(computationVersion, 'Looping') - %% one can convert the transformationKernels into 3D matrices + % One can convert the transformationKernels into 3D matrices logMessages(debugMsgLocation, sprintf('\n--- Looping-over-bags version...'), debugLevel); - %% Performing the reshaped version + % Performing the reshaped version if issparse(allSeqsAsBags{1}) logMessages(debugMsgLocation, sprintf('Doing the reshaped version..\n'), debugLevel); - %% Pre-allocate memory, doesn't help in speed + % Pre-allocate memory, doesn't help in speed % for i=1:nClusters % rawConformedMultiInstanceKernel{i} = zeros(nBags); % conformedMultiInstanceKernel{i} = zeros(nBags); diff --git a/computeInstanceWideKernel.m b/computeInstanceWideKernel.m index 01e1330..a6ae7f4 100644 --- a/computeInstanceWideKernel.m +++ b/computeInstanceWideKernel.m @@ -1,7 +1,36 @@ function [instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation) % COMPUTEINSTANCEWIDEKERNEL -% +% Computes the instance-wide kernel matrix given a bag of instances +% +% INPUT PARAMS +% param 'allSeqsAsBags' (cell array) +% Collection of all segments per sequence +% +% param 'thisInstances' (vector) +% #instances in each bag (in other words, #segments per sequence) +% +% param 'sparseComputation' (0/1) +% Flag specifying whether sparseComputation is to be performed (0/1) +% +% param 'debugLevel' (0/1/2) +% +% param 'debugMsgLocation' (1/fileID) +% +% +% OUTPUT PARAMS +% param 'instanceStarts' (vector) +% Indexes in the vector where instances (segments) of any sequence begin +% +% param 'instanceEnds' (vector) +% Indexes in the vector where istances (segments) of any sequence end +% +% param 'instanceWideKernel' (matrix) +% The instance-wide kernel matrix of dimension #Instances x #Instances +% +% ADDITIONAL NOTES: +% +% author: snikumbh@mpi-inf.mpg.de if nargin < 3 debugMsgLocation = 1; @@ -26,7 +55,8 @@ allSeqsAsMat(:, instanceStarts(i): instanceEnds(i)) = allSeqsAsBags{i}; end logMessages(debugMsgLocation, sprintf('Loop completed in %.4f seconds\n', toc), debugLevel); - % Above, looping version instead of cell2mat. This takes nearly half the time for really long sequences and too many instances. + % Above is a looping version instead of cell2mat (see next line). This takes + % nearly half the time for really long sequences and too many instances. % tic; allSeqsAsMat = cell2mat(allSeqsAsBags); toc; tic; instanceWideKernel = full(allSeqsAsMat' * allSeqsAsMat); diff --git a/readFastaSequences.m b/readFastaSequences.m index 821ee93..8617896 100644 --- a/readFastaSequences.m +++ b/readFastaSequences.m @@ -1,17 +1,36 @@ function [nSeqs, ommittedIndices, passFasta] = readFastaSequences(txt, segmentSizeInBps, givenNSeqs, outputFolder, debugLevel, debugMsgLocation) % READFASTASEQUENCES -% -% READFASTA reads fasta file and extracts sequence to fasta variable. +% +% READFASTASEQUENCES reads fasta file and extracts sequence to fasta variable. % Modified version of readfasta shipped with ODH Matlab code. +% +% INPUT arguments: +% txt - name of FASTA filename +% +% segmentSizeInBps - segment size in basepairs % -% fasta - struct containing cell arrays with legends and sequences -% fasta.legend = cell array with sequence legends (headers) -% fasta.seqeunce = cell array with sequences +% givenNSeqs - # sequences as given from the main function (expected in +% the FASTA file). This is needed since the returned number of +% sequences can be different than givenNSeqs % +% outputFolder - path to the output folder (omitted fasta IDs are written +% at this location % - -% Edits: snikumbh@mpi-inf,mpg.de +% debugLevel and debugMsgLocation - (default) 2, and (default) 1 for stdout +% or filename if to be written to file +% +% RETURNS: +% passfasta - struct containing cell arrays with legends and sequences +% passFasta.legend = cell array with sequence legends (headers) +% passFasta.seqeunce = cell array with sequences +% +% omittedIndices - List of indices that were omittedIndices due to their +% length shorter than the segment size +% +% nSeqs - #sequences read from the given file ( +% +% author: snikumbh@mpi-inf,mpg.de totalArguments = 6; if nargin < totalArguments @@ -27,7 +46,7 @@ i=0; ommittedIndices = []; fasta.title=[txt,', read: ', date]; -% READS SEQUENCES AND THEIR LEGENDS +% Read Sequences and their FASTA legends fid=fopen(txt,'r'); ofid=fopen(strcat(outputFolder, '/ommittedFastaIds.txt'), 'a'); fprintf(ofid, '----%s----\n', txt);