From eb8bd568975e2c539388331d85645271c6710efe Mon Sep 17 00:00:00 2001 From: Sarvesh Prakash Nikumbh Date: Wed, 19 Jul 2017 16:46:58 +0200 Subject: [PATCH] Cleaning up function definitions and comments phase-I --- analyze_wvector.m | 28 +------ barplot-heatmap-visualization.R | 33 +++----- comik_main_with_weight_vector.m | 7 +- comik_wrapper.m | 129 ++++---------------------------- computeInstanceWideKernel.m | 62 +++++++++++++++ computeStdMultiInstanceKernel.m | 123 ------------------------------ readFastaSequences.m | 24 +++--- 7 files changed, 108 insertions(+), 298 deletions(-) create mode 100644 computeInstanceWideKernel.m delete mode 100644 computeStdMultiInstanceKernel.m diff --git a/analyze_wvector.m b/analyze_wvector.m index f9b9000..86237cb 100644 --- a/analyze_wvector.m +++ b/analyze_wvector.m @@ -1,9 +1,5 @@ function [run_this_cmd, pdfcrop_lines] = analyse_wvector(folderName, rankId, wvector, oligoLen, maxDist, topN, verbose, max_or_topN, debugLevel, debugMsgLocation) -%if(nargin < 4) -% topN = 50; -% verbose = 0; -%end totalArguments = 10; if nargin < totalArguments debugMsgLocation = 1; @@ -27,18 +23,14 @@ % different based on whether the given maxDist is same as the segment size or not. % This is more reliable. max_dist = length(wvector) / (alphabet_size^oligoLen)^2; -%assert((max_dist) == maxDist); pos_thresholds = []; neg_thresholds = []; twoD_means = wvector; logMessages(debugMsgLocation, sprintf('\nNow analyzing weight vector ranked: %d\n', rankId), debugLevel); -%disp(size(wvector)); -%disp(max_dist); -%%Getting a 3D heatmap + mat_rows = alphabet_size^oligoLen; mat_cols = mat_rows; -%single_wvector_length = (alphabet_size^oligoLen)^2 * (max_dist); norms_pair_matrix = zeros(mat_rows, mat_cols); for i= 0 : (alphabet_size^oligoLen)-1 % i iterates over the first oligomer in the pair @@ -316,7 +308,7 @@ grid off; else %% Motif files are written - %% We will have R scripting runnign on it to generate sequencelogos + %% We will have R script running on it to generate sequencelogos %%Write out the motifs, and their weights to a file, which is read from R and plot logos %for k = 1 : length(Xd) @@ -341,22 +333,10 @@ % end end -if max_or_topN == 1 %%-- generating inidividual EPS figures +if max_or_topN == 1 + %% generate inidividual EPS figures collated_pdf_name = strcat(folderName, '/MinMax_at_distances_oligoLen', num2str(oligoLen) ,'_combined_colored.pdf'); - %fprintf('=====Cmds for unix:====\n'); - %for i= 1:size(pdfcrop_lines,1) - % fprintf('%s\n', [pdfcrop_lines(i,1:end-1)]); - % %strcat('%s\n', [pdfcrop_lines(i,1:end-1)]); - %end - - %% Combine/append all PDFs - %fprintf('\npdftk '); - %for i= 1:size(pdfcrop_lines,1) - % fprintf('%s-crop.pdf ', [pdfcrop_lines(i,9:end-1)]); - %end - %fprintf('cat output %s\n\n', collated_pdf_name); run_this_cmd = collated_pdf_name; - %strcat(); else run_this_cmd = ''; all_thresholds = [neg_thresholds; pos_thresholds]'; diff --git a/barplot-heatmap-visualization.R b/barplot-heatmap-visualization.R index 6c2f31b..fb0fc0d 100644 --- a/barplot-heatmap-visualization.R +++ b/barplot-heatmap-visualization.R @@ -1,52 +1,43 @@ -#rscaled <- scale(r, center = FALSE, scale = colSums(r,na.rm=T)) -#may not need re-scaling -pdf('samarth_viz_in_r.pdf', width=10.0, height=6.0) -flds <- count.fields('samarth_test1.csv',sep=',') -s <- read.csv('samarth_test1.csv', fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1))) + +pdf('sample_viz_in_r.pdf', width=10.0, height=6.0) +csv_filename <- '.csv' +flds <- count.fields(csv_filename, sep=',') +s <- read.csv(csv_filename, fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1))) r <- s[1:25,] r_ones <- r r_ones[!is.na(r)] <- 1 -#ncolors <- max(r[complete.cases(r),]) -#or + ncolors <- max(flds) -#color_palette <- rainbow(ncolors) color_palette <- heat.colors(ncolors) -jet.colors <- - colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", - "#7FFF7F", "yellow", "#FF7F00"))#, "red", "#7F0000")) -#color_palette <- jet.colors(ncolors) +# If a different color palette is required +# jet.colors <- +# colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", +# "#7FFF7F", "yellow", "#FF7F00"))#, "red", "#7F0000")) ypos <- apply(r, 2, cumsum) ypos <- ypos - r/2 -#xpos <- barplot(t(r_ones), col=rainbow(ncolors), axes=F, horiz=T) xpos <- barplot(t(r_ones), col=c("white"), axes=F, horiz=T, axisnames=F) cp <- 0 for (i in 1:ncol(r_ones)){ temp <- color_palette[r[seq(1,nrow(r),by=1), i]] cp <- cbind(cp, temp) - #barplot(t(r_ones[]), col=color_palette[], add=T, axes=F) } new_cp <- cp[,-1] for (i in 1:nrow(r_ones)){ - #xx = r_ones - #xx[,-i] <- NA - #colnames(xx)[-i] <- NA barplot(t(r_ones[i,]), col=new_cp[i,], horiz=T, add=T, space=(i-1)*1.2, axes=F, xlab="", axisnames=F) } axis(1,at=0.5+seq(0,ncolors-1,1), labels=seq(1,ncolors,1)) -#Additional axis showing the numcleotide ranges -#axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1)) +#Additional axis showing the nucleotide ranges, if required +## axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1)) legend("topright", legend = seq(1,ncolors), fill = color_palette, title="Segment ranks", ncol = 2, x.intersp = 0.1, bty="n", cex=0.8) -#pie(rep(1,ncolors), col=color_palette, labels=paste0("Rank ", seq(1,ncolors)), add=T) dev.off() -#text(rep(xpos,5), as.vector(t(ypos)), labels=t(r)) # ?? diff --git a/comik_main_with_weight_vector.m b/comik_main_with_weight_vector.m index a1cd369..31cf9a6 100644 --- a/comik_main_with_weight_vector.m +++ b/comik_main_with_weight_vector.m @@ -132,10 +132,11 @@ logMessages(debugMsgLocation,sprintf('Total instances from all bags: %d\n', nInstances), debugLevel); direct = 0; -% use this flag for direct computation from bags instead of the 'sparseComputation' flag -% IMP: the 'direct' flag uses parfor for speeding up the instances' kernel computation. +% Using this flag for direct computation from bags (by setting to 1) instead of the 'sparseComputation' flag. +% IMP: the 'direct' flag uses parfor for speeding up the instance-wide kernel computation. +% We are using sparseComputation. sparseComputation = 1; -[instanceStarts, instanceEnds, instanceWideKernel, allSeqsStdSetKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation); +[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation); % % Further are functions that will be recomputed per inner cross-validation iteration % Cross-validation loop is also handled simultaneously diff --git a/comik_wrapper.m b/comik_wrapper.m index 5035eff..d0f6c7c 100644 --- a/comik_wrapper.m +++ b/comik_wrapper.m @@ -1,9 +1,9 @@ -function [] = comik_wrapper(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, testIndices, outputFolder, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, whetherToVisualizeWVector, debugLevel, debugMsgLocation, computationVersion) +function [] = comik_wrapper(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, testIndices, outputFolder, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, whetherToVisualizeWVector, debugLevel, debugMsgLocation, computationVersion) -% COMIKL_WRAPPER +% COMIK_WRAPPER % Usage: % comik_wrapper(givenFastaFilename, nPosSequences, nNegSequences, ... -% outputFolder, oligoLen, maxDist, chunkSizeInBps, nClusterVals, ... +% outputFolder, oligoLen, maxDist, segmentSizeInBps, nClusterVals, ... % sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, ... % computationVersion, whetherToVisualizeWVector) % @@ -154,7 +154,7 @@ Cs = 10.^[-3:1:3]; sigmaVals = 10.^[-1:1:2]; nClusterVals = [2 5]; - chunkSizeInBps = 100; + segmentSizeInBps = 100; end if nargin < totalArguments - 12 computationVersion = 'Looping'; @@ -168,7 +168,7 @@ Cs = 10.^[-3:1:3]; sigmaVals = 10.^[-1:1:2]; nClusterVals = [2 5]; - chunkSizeInBps = 100; + segmentSizeInBps = 100; maxDist = 50; end if nargin < totalArguments - 13 @@ -183,111 +183,12 @@ Cs = 10.^[-3:1:3]; sigmaVals = 10.^[-1:1:2]; nClusterVals = [2 5]; - chunkSizeInBps = 100; + segmentSizeInBps = 100; maxDist = 50; oligoLen = [2]; end -problemDesc = ''; -switch problemDesc - %chr19-- 535|3340 -- 400 positives for training, 800 negatives for training - %pos train : 1:400, pos test : 401:535 - %neg train : 536:1335 , neg test : 1336:3875 - case char('nucleosome_chr19') - givenFastaFilename = 'dataGeneration/nucleosome_positioning/01_HepG2_LiHG_CtX.isect.ovl.chr19.fasta'; - testIndices = [401:535 1336:3875]; - % - % - %chr17 -- 649|3669 -- 475 positives for training, 900 negatives for training - %pos train : 1:475, pos test : 476:649 - %neg train : 650:1549 , neg test : 1550:4318 - case char('nucleosome_chr17') - givenFastaFilename = 'dataGeneration/nucleosome_positioning/01_HepG2_LiHG_CtX.isect.ovl.chr17.fasta'; - testIndices = [476:649 1550:4318]; - % - % - case char('WUS=LFY_instersect') - givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS=LFY_intersect-prepared1.fasta'; - testIndices = [51:65 2066:16712]; - % - % - case char('WUS-loop-logic') - givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-loop-logic-prepared1.fasta'; - testIndices = [81:104 2105:16751]; - % - % - case char('kirmes-athaliana-WUS-CLV3-subtract5') - %42 pos, 16647 neg, as test examples, shall predict using weight vector - givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3_subtract-prepared5.fasta'; - testIndices = [31:42 2043:16689]; - % - % - case char('kirmes-gen') - givenFastaFilename = 'data/kirmes-gen.fasta'; - testIndices = [501:600 1101:1200]; - % - % - case char('new-simulated-dataset1') - givenFastaFilename = 'dataGeneration/new_simulated_dataset1.fasta'; - testIndices = [901:1000 1901:2000]; - % - % - case char('new-simulated-dataset2') - givenFastaFilename = 'dataGeneration/new_simulated_dataset2.fasta'; - testIndices = [501:600 1101:1200]; - % - % - case char('new-simulated-dataset3') - givenFastaFilename = 'dataGeneration/new_simulated_dataset3.fasta'; - testIndices = [401:500 901:1000]; - % - % - case char('new-simulated-dataset4') - givenFastaFilename = 'dataGeneration/new_simulated_dataset4.fasta'; - testIndices = [501:600 1101:1200]; - % - % - case char('cgi-noncgi-dataset') - givenFastaFilename = 'dataGeneration/cgi-noncgi-dataset/chr17_19_22.fasta'; - testIndices = [1301:1954 2605:2926]; - % - % - case char('cgi-noncgi-chr19only') - givenFastaFilename = 'dataGeneration/cgi-noncgi-dataset/chr19.fasta'; - testIndices = [601:934 1235:1401]; - % - % - case char('kirmes-athaliana-WUS-CLV3-subtract') - givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared.fasta'; - testIndices = [31:42 193:242]; - % - % - case char('kirmes-athaliana-WUS-CLV3-subtract2') - givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared2.fasta'; - testIndices = [31:42 843:1042]; - % - % - case char('kirmes-athaliana-WUS-CLV3-subtract3') - %42 pos, 5000 neg, 4200 as test examples, shall predict using weight vector - givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared3.fasta'; - testIndices = [31:42 843:5042]; - % - % - case char('kirmes-athaliana-WUS-CLV3-subtract4') - %42 pos, 16647 neg, as test examples, shall predict using weight vector - givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared4.fasta'; - testIndices = [31:42 843:16689]; - % - % - case char('distal-proximal-cgi') - givenFastaFilename = 'dataGeneration/distal_proximal_cgi/all-combined.fasta'; - testIndices = [1501:2260 2461:2541]; - % - % - otherwise - %fprintf('No problem specified with param problemDesc.\n'); -end if exist(outputFolder, 'dir') == 7 %do-nothing else @@ -328,7 +229,7 @@ case char('distal-proximal-cgi') parfor (i=1:nOuterFolds, nOuterFolds) thisFoldDirectory = strcat(outputFolder, '/outer_fold_', num2str(i)); returnStatusVal = perform_one_outer_fold_for_comik(i, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, ... - maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, ... + maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, ... thisTestIndices{i}, debugLevel, thisFoldDirectory, whetherToPlotHeatmap, ... computationVersion, whetherToVisualizeWVector, debugMsgLocation); logMessages(1, sprintf('\nOuter fold %d: Status %s\n', i, returnStatusVal), debugLevel); @@ -338,7 +239,7 @@ case char('distal-proximal-cgi') end % comik_wrapper function ends -function statusVal = perform_one_outer_fold_for_comik(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation) +function statusVal = perform_one_outer_fold_for_comik(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation) % Runs one outer fold for CoMIKL % Make the outputFolder for this outer fold @@ -362,7 +263,7 @@ case char('distal-proximal-cgi') end for l=1:size(oligoLen,2) - filenameSuffix = strcat('_segment-size', num2str(chunkSizeInBps), '_oligoLen', num2str(oligoLen(l))); + filenameSuffix = strcat('_segment-size', num2str(segmentSizeInBps), '_oligoLen', num2str(oligoLen(l))); runSummaryFilename = strcat(outputFolder, '/runSummary', filenameSuffix, '.txt'); fid = fopen(runSummaryFilename, 'a'); @@ -382,8 +283,8 @@ case char('distal-proximal-cgi') logMessages(debugMsgLocation, sprintf(' NegFasta: %s \n', givenNegFastaFilename), debugLevel); - fprintf(fid, ' oligoLength: %d, maxDist: %d, chunkSize: %d bps \n', oligoLen(l), maxDist, chunkSizeInBps); - logMessages(debugMsgLocation, sprintf(' oligoLength: %d, maxDist: %d, chunkSize: %d bps \n', oligoLen(l), maxDist, chunkSizeInBps), debugLevel); + fprintf(fid, ' oligoLength: %d, maxDist: %d, segmentSize: %d bps \n', oligoLen(l), maxDist, segmentSizeInBps); + logMessages(debugMsgLocation, sprintf(' oligoLength: %d, maxDist: %d, segmentSize: %d bps \n', oligoLen(l), maxDist, segmentSizeInBps), debugLevel); fprintf(fid, ' mklNorm: %.2f\n', mklNorm); logMessages(debugMsgLocation, sprintf(' mklNorm: %.2f\n', mklNorm), debugLevel); @@ -414,18 +315,18 @@ case char('distal-proximal-cgi') fclose(fid); % % - [allSeqsAsBags, allSeqsConformedSetKernel, kernelweights, thetaVals, instanceWeightsInEachBag, resultString, bestVals, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen(l), maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector); + [allSeqsAsBags, allSeqsConformedSetKernel, kernelweights, thetaVals, instanceWeightsInEachBag, resultString, bestVals, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen(l), maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector); fid=fopen(runSummaryFilename, 'a'); fprintf(fid, '\n\n'); - fprintf(fid, '=====Validation results - SAMARTH=====\n'); - fprintf(fid, 'OligoLen: %d, maxDist: %d, segment size: %d\n', oligoLen(l), maxDist, chunkSizeInBps); + fprintf(fid, '===== Validation results =====\n'); + fprintf(fid, 'OligoLen: %d, maxDist: %d, segment size: %d\n', oligoLen(l), maxDist, segmentSizeInBps); fprintf(fid, 'Validation teAUPRC: %.4f\n', test_teAUPRC); fprintf(fid, 'Validation teAUROC: %.4f\n', test_teAUROC); fprintf(fid, 'Best C: %.3f\n', bestVals.best_C); fprintf(fid, 'Best sigma: %.3f\n', bestVals.best_sigma); fprintf(fid, 'Best nClusters: %d\n', bestVals.best_nClusters); fprintf(fid, 'Best teAUROC: %.4f\n', bestVals.best_teAUROC); - fprintf(fid, '======END======\n\n'); + fprintf(fid, '====== END ======\n\n'); fclose(fid); end if debugMsgLocation ~= 1 diff --git a/computeInstanceWideKernel.m b/computeInstanceWideKernel.m new file mode 100644 index 0000000..c2086f4 --- /dev/null +++ b/computeInstanceWideKernel.m @@ -0,0 +1,62 @@ +function [instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation) + +% COMPUTEINSTANCEWIDEKERNEL +% + +if nargin < 3 + debugMsgLocation = 1; + debugLevel = 2; + sparceComputation = 0; +end +logMessages(debugMsgLocation, sprintf('Onto compute the Std MI kernel\n'), debugLevel); +nBags = length(allSeqsAsBags); + +tic; +instanceEnds = cumsum(thisInstances); +instanceStarts = cumsum(thisInstances) - thisInstances + 1; +nInstances = instanceEnds(end); +logMessages(debugMsgLocation, sprintf('instances starts and ends collected in %.4f seconds\n', toc), debugLevel); + + +if sparseComputation + logMessages(debugMsgLocation, sprintf('Performing sparse computation for kernel on instances\n'), debugLevel); + tic; + allSeqsAsMat = zeros( size( allSeqsAsBags{1}, 1) , nInstances); + for i=1:nBags + allSeqsAsMat(:, instanceStarts(i): instanceEnds(i)) = allSeqsAsBags{i}; + end + logMessages(debugMsgLocation, sprintf('Loop completed in %.4f seconds\n', toc), debugLevel); + % Above, looping version instead of cell2mat. This takes nearly half the time for really long sequences and too many instances. + % tic; allSeqsAsMat = cell2mat(allSeqsAsBags); toc; + tic; + instanceWideKernel = full(allSeqsAsMat' * allSeqsAsMat); + logMessages(debugMsgLocation, sprintf('full kernel computed in %.4f seconds\n', toc), debugLevel); + % nBags x nBags kernel actually need not be computed + clear allSeqsAsMat; + % +else + % direct (not sparseComputation) + % This is tested neither for correctness nor efficiency. We recommend using sparseComputation. + logMessages(debugMsgLocation, sprintf('Computing directly, %d rows.\n', nBags), debugLevel); + stdMultiInstanceKernel = zeros(nBags); + for i=1:nBags + if mod(i, 10) == 0 + logMessages(debugMsgLocation, sprintf('%d, ', i), debugLevel); + end + iInstances = size(allSeqsAsBags{i}, 2); + for j=i:nBags + term = transpose(allSeqsAsBags{i}) * allSeqsAsBags{j}; + jInstances = size(allSeqsAsBags{j}, 2); + stdMultiInstanceKernel(i,j) = sum(term(:)) * (1/iInstances) * (1/jInstances); + end + end + logMessages(debugMsgLocation, sprintf('\n'), debugLevel); + % Fill-up the other half + stdMultiInstanceKernel = triu(stdMultiInstanceKernel, 1) + stdMultiInstanceKernel'; + % Normalize the kernel matrix -- done by calling normalizeKernel + stdMultiInstanceKernel = normalizeKernel(stdMultiInstanceKernel); + % + % +end %if-else ends + +end % function ends diff --git a/computeStdMultiInstanceKernel.m b/computeStdMultiInstanceKernel.m deleted file mode 100644 index b62ac4d..0000000 --- a/computeStdMultiInstanceKernel.m +++ /dev/null @@ -1,123 +0,0 @@ -function [instanceStarts, instanceEnds, instanceWideKernel, stdMultiInstanceKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation) - -% Input Params: -% - -if nargin < 3 - sparceComputation = 0; -end -logMessages(debugMsgLocation, sprintf('Onto compute the Std MI kernel\n'), debugLevel); -nBags = length(allSeqsAsBags); -%%instancesInEachBag = [0]; - -% the for loop below can be replaced by a cellfun -%thisInstances = cellfun(@(x) size(x,2), allSeqsAsBags); -% get this beforehand -tic; -instanceEnds = cumsum(thisInstances); -instanceStarts = cumsum(thisInstances) - thisInstances + 1; -nInstances = instanceEnds(end); -logMessages(debugMsgLocation, sprintf('instances starts and ends collected in %.4f seconds\n', toc), debugLevel); -%instanceStarts = [1]; -%instanceEnds = [0]; -%for i=1:nBags -% thisInstances = size(allSeqsAsBags{i}, 2); - % this happens because the number of chunks in shifted case - % are one less than the non-shifted case. So if there is exactly - % one chunk in the non-shifted case, we copy it to the shifted case. - % This is done in the main script. - % --- Not needed anymore - %if(thisInstances == 0) - % fprintf('YES, ZERO --%d \n', i); - %end - %instancesInEachBag = [instancesInEachBag thisInstances]; -% instanceEnds = [instanceEnds instanceEnds(end)+thisInstances]; -% instanceStarts = [instanceStarts instanceStarts(end)+thisInstances]; -%end -%%instancesInEachBag = instancesInEachBag(2:end); -%instanceEnds = instanceEnds(2:end); -%instanceStarts = instanceStarts(1:end-1); -%nInstances = instanceEnds(end); - - -if sparseComputation - logMessages(debugMsgLocation, sprintf('Performing sparse computation for kernel on instances\n'), debugLevel); - tic; - allSeqsAsMat = zeros( size( allSeqsAsBags{1}, 1) , nInstances); - for i=1:nBags - allSeqsAsMat(:, instanceStarts(i): instanceEnds(i)) = allSeqsAsBags{i}; - end - logMessages(debugMsgLocation, sprintf('Loop completed in %.4f seconds\n', toc), debugLevel); - % IMplemented looping instead of cell2mat. It takes nearly half the time - %tic; - %allSeqsAsMat = cell2mat(allSeqsAsBags); - %fprintf('cell2mat completed in %.4f seconds\n', toc); - tic; - instanceWideKernel = full(allSeqsAsMat' * allSeqsAsMat); - logMessages(debugMsgLocation, sprintf('full kernel computed in %.4f seconds\n', toc), debugLevel); - % nBags x nBags kernel actually need not be computed - stdMultiInstanceKernel = zeros(nBags); - clear allSeqsAsMat; - % -elseif direct - logMessages(debugMsgLocation, sprintf('Computing directly, %d rows.\n', nBags), debugLevel); - stdMultiInstanceKernel = zeros(nBags); - for i=1:nBags - if mod(i, 10) == 0 - logMessages(debugMsgLocation, sprintf('%d, ', i), debugLevel); - end - iInstances = size(allSeqsAsBags{i}, 2); - for j=i:nBags - term = transpose(allSeqsAsBags{i}) * allSeqsAsBags{j}; - %jInstances = size(allSeqsAsBags{j}, 2); - %stdMultiInstanceKernel(i,j) = sum(term(:)) * (1/iInstances) * (1/jInstances); - end - end - logMessages(debugMsgLocation, sprintf('\n'), debugLevel); - % Fill-up the other half - stdMultiInstanceKernel = triu(stdMultiInstanceKernel, 1) + stdMultiInstanceKernel'; - % Normalize the kernel matrix -- done by calling normalizeKernel - stdMultiInstanceKernel = normalizeKernel(stdMultiInstanceKernel); - %disp(stdMultiInstanceKernel(1:5,1:5)); - % - % -else %breakdown - %tic; - %spead out the instances - X = zeros( size(allSeqsAsBags{1},1), nInstances); - disp(size(X)); - for i=1:nBags - X(:,instanceStarts(i):instanceEnds(i)) = allSeqsAsBags{i}; - end - - allInstanceKernel = zeros(nInstances, nInstances); - - start = 1; - step_size = 1; - for j=step_size:step_size:nInstances - parfor(i=start:j, 25) - allInstanceKernel(i, :) = [zeros(1, i-1) sum(bsxfun(@times, X(:,i), X(:,i:end)), 1)]; - end - start = j+1; - logMessages(debugMsgLocation, sprintf('%d', j), debugLevel); - end - % Remainder of the rows - parfor(i=start:nInstances, 25)%%i iterates over rows - allInstancesKernel(i,:) = [zeros(1,i-1) sum(bsxfun(@times, X(:,i), X(:,i:end)), 1)]; - end - % Fill-up the other half - allInstancesKernel = triu(allInstancesKernel, 1) + allInstancesKernel'; - disp(size(allInstancesKernel)); - - stdMultiInstanceKernel = zeros(nBags, nBags); - %for i=1:nBags - % for j=1:nBags - % stdMultiInstanceKernel(i,j) = sum(allInstancesKernel( instanceStarts(i):instanceEnds(i), instanceStarts(j):instanceEnds(j) )) * ... - % (1 / (instanceEnds(i)-instanceStarts(i)+1) ) * (1 / (instanceEnds(j)-instanceStarts(j)+1) ); - % end - %end -end -%t = toc; -%fprintf('%s\n',t); - -end diff --git a/readFastaSequences.m b/readFastaSequences.m index f35e056..821ee93 100644 --- a/readFastaSequences.m +++ b/readFastaSequences.m @@ -1,17 +1,17 @@ -function [nSeqs, ommittedIndices, passFasta] = readFastaSequences(txt, chunkSizeInBps, givenNSeqs, outputFolder, debugLevel, debugMsgLocation) +function [nSeqs, ommittedIndices, passFasta] = readFastaSequences(txt, segmentSizeInBps, givenNSeqs, outputFolder, debugLevel, debugMsgLocation) -% FUNCTION FASTA=READFASTA(FILENAME) +% READFASTASEQUENCES % -% READFASTA reads fasta file and extracts sequence to fasta variable -% speed-optimized version of Jonas Almeidas readfasta.m +% READFASTA reads fasta file and extracts sequence to fasta variable. +% Modified version of readfasta shipped with ODH Matlab code. % % fasta - struct containing cell arrays with legends and sequences % fasta.legend = cell array with sequence legends (headers) % fasta.seqeunce = cell array with sequences % -%** +% -% Edits: March 20, 2017, snikumbh +% Edits: snikumbh@mpi-inf,mpg.de totalArguments = 6; if nargin < totalArguments @@ -26,7 +26,7 @@ fasta.sequence = cell(1, givenNSeqs); i=0; ommittedIndices = []; -fasta.title=[txt,', read: ',date]; +fasta.title=[txt,', read: ', date]; % READS SEQUENCES AND THEIR LEGENDS fid=fopen(txt,'r'); ofid=fopen(strcat(outputFolder, '/ommittedFastaIds.txt'), 'a'); @@ -36,12 +36,11 @@ lineRead=fgetl(fid); if isempty(strfind(lineRead, '#')) if strfind(lineRead,'>') - %seqLegends{i} = lineRead; fasta.legend{i+1} = lineRead; else i=i+1; lengths = [lengths length(lineRead)]; - if lengths(end) >= chunkSizeInBps + if lengths(end) >= segmentSizeInBps fasta.sequence{i}=[fasta.sequence{i},upper(lineRead)]; else % we omit that sequence ommittedIndices = [ommittedIndices i]; @@ -53,15 +52,14 @@ end fclose(ofid); fclose(fid); -ineligibleLengths = lengths(lengths < chunkSizeInBps); -ineligibleIndices = find(lengths < chunkSizeInBps); +ineligibleLengths = lengths(lengths < segmentSizeInBps); +ineligibleIndices = find(lengths < segmentSizeInBps); nSeqs = givenNSeqs; passFasta = fasta; if size(ineligibleLengths, 2) > 0 - %fprintf('TIP: FASTA file has sequences of length smaller than the segment-size specified. Please rectify by either removing these sequences or specifying a suitable segment-size.\n'); logMessages(debugMsgLocation, sprintf('%d sequence(s) have been ommitted as they are shorter than the segment-size specified. Check ommittedFastaIds.txt\n', length(ineligibleIndices)), debugLevel); % Change to what would be passed, if it is different than 'fasta' above - nSeqs = length(find(lengths >= chunkSizeInBps)); + nSeqs = length(find(lengths >= segmentSizeInBps)); passFasta.sequence = cell(1, nSeqs); j = 1; for i=1:givenNSeqs