From eb8bd568975e2c539388331d85645271c6710efe Mon Sep 17 00:00:00 2001
From: Sarvesh Prakash Nikumbh <snikumbh@mpi-inf.mpg.de>
Date: Wed, 19 Jul 2017 16:46:58 +0200
Subject: [PATCH] Cleaning up function definitions and comments phase-I

---
 analyze_wvector.m               |  28 +------
 barplot-heatmap-visualization.R |  33 +++-----
 comik_main_with_weight_vector.m |   7 +-
 comik_wrapper.m                 | 129 ++++----------------------------
 computeInstanceWideKernel.m     |  62 +++++++++++++++
 computeStdMultiInstanceKernel.m | 123 ------------------------------
 readFastaSequences.m            |  24 +++---
 7 files changed, 108 insertions(+), 298 deletions(-)
 create mode 100644 computeInstanceWideKernel.m
 delete mode 100644 computeStdMultiInstanceKernel.m
diff --git a/analyze_wvector.m b/analyze_wvector.m
index f9b9000..86237cb 100644
--- a/analyze_wvector.m
+++ b/analyze_wvector.m
@@ -1,9 +1,5 @@
 function [run_this_cmd, pdfcrop_lines] = analyse_wvector(folderName, rankId, wvector, oligoLen, maxDist, topN, verbose, max_or_topN, debugLevel, debugMsgLocation)
 
-%if(nargin < 4)
-%    topN = 50;
-%    verbose = 0;
-%end
 totalArguments = 10;
 if nargin < totalArguments
     debugMsgLocation = 1;
@@ -27,18 +23,14 @@
 % different based on whether the given maxDist is same as the segment size or not.
 % This is more reliable.
 max_dist = length(wvector) / (alphabet_size^oligoLen)^2;
-%assert((max_dist) == maxDist);
 
 pos_thresholds = []; neg_thresholds = [];
 
 twoD_means = wvector;
 logMessages(debugMsgLocation, sprintf('\nNow analyzing weight vector ranked: %d\n', rankId), debugLevel);
-%disp(size(wvector));
-%disp(max_dist);
-%%Getting a 3D heatmap
+
 mat_rows = alphabet_size^oligoLen;
 mat_cols = mat_rows;
-%single_wvector_length = (alphabet_size^oligoLen)^2 * (max_dist);
 norms_pair_matrix = zeros(mat_rows, mat_cols);
 for i= 0 : (alphabet_size^oligoLen)-1
     % i iterates over the first oligomer in the pair
@@ -316,7 +308,7 @@
 	grid off;
 else
     %% Motif files are written
-    %% We will have R scripting runnign on it to generate sequencelogos
+    %% We will have R script running on it to generate sequencelogos
 
     %%Write out the motifs, and their weights to a file, which is read from R and plot logos
     %for k = 1 : length(Xd)
@@ -341,22 +333,10 @@
 %    end
 end
 
-if max_or_topN == 1 %%-- generating inidividual EPS figures
+if max_or_topN == 1 
+	%% generate inidividual EPS figures
 	collated_pdf_name = strcat(folderName, '/MinMax_at_distances_oligoLen', num2str(oligoLen) ,'_combined_colored.pdf');
-	%fprintf('=====Cmds for unix:====\n');
-	%for i= 1:size(pdfcrop_lines,1)
-    	%	fprintf('%s\n', [pdfcrop_lines(i,1:end-1)]);
-	%	%strcat('%s\n', [pdfcrop_lines(i,1:end-1)]);
-	%end
-
-	%% Combine/append all PDFs
-	%fprintf('\npdftk ');
-	%for i= 1:size(pdfcrop_lines,1)
-    	%	fprintf('%s-crop.pdf ', [pdfcrop_lines(i,9:end-1)]);
-	%end
-	%fprintf('cat output %s\n\n', collated_pdf_name);
  	run_this_cmd = collated_pdf_name;
-	%strcat();
 else
     run_this_cmd = '';
     all_thresholds = [neg_thresholds; pos_thresholds]';
diff --git a/barplot-heatmap-visualization.R b/barplot-heatmap-visualization.R
index 6c2f31b..fb0fc0d 100644
--- a/barplot-heatmap-visualization.R
+++ b/barplot-heatmap-visualization.R
@@ -1,52 +1,43 @@
 
 
-#rscaled <- scale(r, center = FALSE, scale = colSums(r,na.rm=T))
-#may not need re-scaling
-pdf('samarth_viz_in_r.pdf', width=10.0, height=6.0)
-flds <- count.fields('samarth_test1.csv',sep=',')
-s <- read.csv('samarth_test1.csv', fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1)))
+
+pdf('sample_viz_in_r.pdf', width=10.0, height=6.0)
+csv_filename <- '<enter-name>.csv'
+flds <- count.fields(csv_filename, sep=',')
+s <- read.csv(csv_filename, fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1)))
 r <- s[1:25,]
 r_ones <- r
 r_ones[!is.na(r)] <- 1
-#ncolors <- max(r[complete.cases(r),])
-#or 
+
 ncolors <- max(flds)
-#color_palette <- rainbow(ncolors)
 color_palette <- heat.colors(ncolors)
 
-jet.colors <-
-  colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan",
-                     "#7FFF7F", "yellow", "#FF7F00"))#, "red", "#7F0000"))
-#color_palette <- jet.colors(ncolors)
+# If a different color palette is required
+# jet.colors <-
+#  colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan",
+#                     "#7FFF7F", "yellow", "#FF7F00"))#, "red", "#7F0000"))
 
 ypos <- apply(r, 2, cumsum)
 ypos <- ypos - r/2
 
-#xpos <- barplot(t(r_ones), col=rainbow(ncolors), axes=F, horiz=T)
 xpos <- barplot(t(r_ones), col=c("white"), axes=F, horiz=T, axisnames=F)
 cp <- 0
 for (i in 1:ncol(r_ones)){
     temp <- color_palette[r[seq(1,nrow(r),by=1), i]]
     cp <- cbind(cp, temp)
-    #barplot(t(r_ones[]), col=color_palette[], add=T, axes=F) 
 }
 new_cp <- cp[,-1]
 
 for (i in 1:nrow(r_ones)){
-    #xx = r_ones
-    #xx[,-i] <- NA
-    #colnames(xx)[-i] <- NA
     barplot(t(r_ones[i,]), col=new_cp[i,], horiz=T, add=T, space=(i-1)*1.2, axes=F, xlab="", axisnames=F)
 }
 
 axis(1,at=0.5+seq(0,ncolors-1,1), labels=seq(1,ncolors,1))
-#Additional axis showing the numcleotide ranges
-#axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1))
+#Additional axis showing the nucleotide ranges, if required
+## axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1))
 legend("topright", legend = seq(1,ncolors), fill = color_palette, title="Segment ranks", ncol = 2, x.intersp = 0.1, bty="n", cex=0.8)
-#pie(rep(1,ncolors), col=color_palette, labels=paste0("Rank ", seq(1,ncolors)), add=T)
 
 dev.off()
-#text(rep(xpos,5), as.vector(t(ypos)), labels=t(r)) # ?? 
 
 
 
diff --git a/comik_main_with_weight_vector.m b/comik_main_with_weight_vector.m
index a1cd369..31cf9a6 100644
--- a/comik_main_with_weight_vector.m
+++ b/comik_main_with_weight_vector.m
@@ -132,10 +132,11 @@
 logMessages(debugMsgLocation,sprintf('Total instances from all bags: %d\n', nInstances), debugLevel);
 
 direct = 0; 
-% use this flag for direct computation from bags instead of the 'sparseComputation' flag
-% IMP: the 'direct' flag uses parfor for speeding up the instances' kernel computation.
+% Using this flag for direct computation from bags (by setting to 1) instead of the 'sparseComputation' flag.
+% IMP: the 'direct' flag uses parfor for speeding up the instance-wide kernel computation.
+% We are using sparseComputation.
 sparseComputation = 1;
-[instanceStarts, instanceEnds, instanceWideKernel, allSeqsStdSetKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation);
+[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation);
 %
 % Further are functions that will be recomputed per inner cross-validation iteration
 % Cross-validation loop is also handled simultaneously
diff --git a/comik_wrapper.m b/comik_wrapper.m
index 5035eff..d0f6c7c 100644
--- a/comik_wrapper.m
+++ b/comik_wrapper.m
@@ -1,9 +1,9 @@
-function [] = comik_wrapper(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, testIndices, outputFolder, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, whetherToVisualizeWVector, debugLevel, debugMsgLocation, computationVersion)
+function [] = comik_wrapper(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, testIndices, outputFolder, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, whetherToVisualizeWVector, debugLevel, debugMsgLocation, computationVersion)
 
-% COMIKL_WRAPPER
+% COMIK_WRAPPER
 % Usage:
 %     comik_wrapper(givenFastaFilename, nPosSequences, nNegSequences, ...
-%     outputFolder, oligoLen, maxDist, chunkSizeInBps, nClusterVals, ...
+%     outputFolder, oligoLen, maxDist, segmentSizeInBps, nClusterVals, ...
 %     sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, ...
 %     computationVersion, whetherToVisualizeWVector)
 %
@@ -154,7 +154,7 @@
     Cs = 10.^[-3:1:3];
     sigmaVals = 10.^[-1:1:2];
     nClusterVals = [2 5];
-    chunkSizeInBps = 100;
+    segmentSizeInBps = 100;
 end
 if nargin < totalArguments - 12
     computationVersion = 'Looping';
@@ -168,7 +168,7 @@
     Cs = 10.^[-3:1:3];
     sigmaVals = 10.^[-1:1:2];
     nClusterVals = [2 5];
-    chunkSizeInBps = 100;
+    segmentSizeInBps = 100;
     maxDist = 50;
 end
 if nargin < totalArguments - 13
@@ -183,111 +183,12 @@
     Cs = 10.^[-3:1:3];
     sigmaVals = 10.^[-1:1:2];
     nClusterVals = [2 5];
-    chunkSizeInBps = 100;
+    segmentSizeInBps = 100;
     maxDist = 50;
     oligoLen = [2];
 end
 
 
-problemDesc = '';
-switch problemDesc
-    %chr19-- 535|3340 -- 400 positives for training, 800 negatives for training
-    %pos train : 1:400, pos test : 401:535
-    %neg train : 536:1335 , neg test : 1336:3875
-    case char('nucleosome_chr19')
-        givenFastaFilename = 'dataGeneration/nucleosome_positioning/01_HepG2_LiHG_CtX.isect.ovl.chr19.fasta';
-        testIndices = [401:535 1336:3875];
-        %
-        %
-    %chr17 -- 649|3669 -- 475 positives for training, 900 negatives for training
-    %pos train : 1:475, pos test : 476:649
-    %neg train : 650:1549 , neg test : 1550:4318
-    case char('nucleosome_chr17')
-        givenFastaFilename = 'dataGeneration/nucleosome_positioning/01_HepG2_LiHG_CtX.isect.ovl.chr17.fasta';
-        testIndices = [476:649 1550:4318];
-        %
-        %
-    case char('WUS=LFY_instersect')
-        givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS=LFY_intersect-prepared1.fasta';
-        testIndices = [51:65 2066:16712];
-        %
-        %
-    case char('WUS-loop-logic')
-        givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-loop-logic-prepared1.fasta';
-        testIndices = [81:104 2105:16751];
-        %
-        %
-    case char('kirmes-athaliana-WUS-CLV3-subtract5')
-        %42 pos, 16647 neg,  as test examples, shall predict using weight vector
-        givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3_subtract-prepared5.fasta';
-        testIndices = [31:42 2043:16689];
-        %
-        %
-    case char('kirmes-gen')
-	givenFastaFilename = 'data/kirmes-gen.fasta';
-        testIndices = [501:600 1101:1200];
-        %
-        %
-    case char('new-simulated-dataset1')
-        givenFastaFilename = 'dataGeneration/new_simulated_dataset1.fasta';
-        testIndices = [901:1000 1901:2000];
-        %
-        %
-    case char('new-simulated-dataset2')
-	givenFastaFilename = 'dataGeneration/new_simulated_dataset2.fasta';
-        testIndices = [501:600 1101:1200];
-        %
-        %
-    case char('new-simulated-dataset3')
-        givenFastaFilename = 'dataGeneration/new_simulated_dataset3.fasta';
-        testIndices = [401:500 901:1000];
-        %
-        %
-    case char('new-simulated-dataset4')
-        givenFastaFilename = 'dataGeneration/new_simulated_dataset4.fasta';
-        testIndices = [501:600 1101:1200];
-        %
-        %
-    case char('cgi-noncgi-dataset')
-        givenFastaFilename = 'dataGeneration/cgi-noncgi-dataset/chr17_19_22.fasta';
-        testIndices = [1301:1954 2605:2926];
-        %
-        %
-    case char('cgi-noncgi-chr19only')
-        givenFastaFilename = 'dataGeneration/cgi-noncgi-dataset/chr19.fasta';
-        testIndices = [601:934 1235:1401];
-        %
-        %
-    case char('kirmes-athaliana-WUS-CLV3-subtract')
-        givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared.fasta';
-        testIndices = [31:42 193:242];
-        %
-        %
-    case char('kirmes-athaliana-WUS-CLV3-subtract2')
-        givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared2.fasta';
-        testIndices = [31:42 843:1042];
-        %
-        %
-    case char('kirmes-athaliana-WUS-CLV3-subtract3')
-	%42 pos, 5000 neg, 4200 as test examples, shall predict using weight vector	
-        givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared3.fasta';
-        testIndices = [31:42 843:5042];
-        %
-        %
-    case char('kirmes-athaliana-WUS-CLV3-subtract4')
-        %42 pos, 16647 neg,  as test examples, shall predict using weight vector
-        givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared4.fasta';
-        testIndices = [31:42 843:16689];
-        %
-        %
-    case char('distal-proximal-cgi')
-        givenFastaFilename = 'dataGeneration/distal_proximal_cgi/all-combined.fasta';
-        testIndices = [1501:2260 2461:2541];
-	%
-	%
-    otherwise
-        %fprintf('No problem specified with param problemDesc.\n');
-end
 if exist(outputFolder, 'dir') == 7
     %do-nothing
 else
@@ -328,7 +229,7 @@ case char('distal-proximal-cgi')
 parfor (i=1:nOuterFolds, nOuterFolds)
     thisFoldDirectory = strcat(outputFolder, '/outer_fold_', num2str(i));
     returnStatusVal = perform_one_outer_fold_for_comik(i, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, ...
-							maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, ...
+							maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, ...
 							thisTestIndices{i}, debugLevel, thisFoldDirectory, whetherToPlotHeatmap, ...
 							computationVersion, whetherToVisualizeWVector, debugMsgLocation);
     logMessages(1, sprintf('\nOuter fold %d: Status %s\n', i, returnStatusVal), debugLevel);
@@ -338,7 +239,7 @@ case char('distal-proximal-cgi')
 
 end % comik_wrapper function ends
 
-function statusVal = perform_one_outer_fold_for_comik(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation)
+function statusVal = perform_one_outer_fold_for_comik(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation)
 % Runs one outer fold for CoMIKL
 
 % Make the outputFolder for this outer fold
@@ -362,7 +263,7 @@ case char('distal-proximal-cgi')
 end
 
 for l=1:size(oligoLen,2)
-    filenameSuffix = strcat('_segment-size', num2str(chunkSizeInBps), '_oligoLen', num2str(oligoLen(l)));
+    filenameSuffix = strcat('_segment-size', num2str(segmentSizeInBps), '_oligoLen', num2str(oligoLen(l)));
     runSummaryFilename = strcat(outputFolder, '/runSummary',  filenameSuffix, '.txt');
     fid = fopen(runSummaryFilename, 'a');
 
@@ -382,8 +283,8 @@ case char('distal-proximal-cgi')
     logMessages(debugMsgLocation, sprintf('   NegFasta: %s \n', givenNegFastaFilename), debugLevel);
 
 
-    fprintf(fid, '   oligoLength: %d, maxDist: %d, chunkSize: %d bps \n', oligoLen(l), maxDist, chunkSizeInBps);
-    logMessages(debugMsgLocation, sprintf('   oligoLength: %d, maxDist: %d, chunkSize: %d bps \n', oligoLen(l), maxDist, chunkSizeInBps), debugLevel);
+    fprintf(fid, '   oligoLength: %d, maxDist: %d, segmentSize: %d bps \n', oligoLen(l), maxDist, segmentSizeInBps);
+    logMessages(debugMsgLocation, sprintf('   oligoLength: %d, maxDist: %d, segmentSize: %d bps \n', oligoLen(l), maxDist, segmentSizeInBps), debugLevel);
     
     fprintf(fid, '   mklNorm: %.2f\n', mklNorm);
     logMessages(debugMsgLocation, sprintf('   mklNorm: %.2f\n', mklNorm), debugLevel);
@@ -414,18 +315,18 @@ case char('distal-proximal-cgi')
     fclose(fid);
     %
     %
-    [allSeqsAsBags, allSeqsConformedSetKernel, kernelweights, thetaVals, instanceWeightsInEachBag, resultString, bestVals, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen(l), maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector);
+    [allSeqsAsBags, allSeqsConformedSetKernel, kernelweights, thetaVals, instanceWeightsInEachBag, resultString, bestVals, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen(l), maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector);
     fid=fopen(runSummaryFilename, 'a');
     fprintf(fid, '\n\n');
-    fprintf(fid, '=====Validation results - SAMARTH=====\n');
-    fprintf(fid, 'OligoLen: %d, maxDist: %d, segment size: %d\n', oligoLen(l), maxDist, chunkSizeInBps);
+    fprintf(fid, '===== Validation results =====\n');
+    fprintf(fid, 'OligoLen: %d, maxDist: %d, segment size: %d\n', oligoLen(l), maxDist, segmentSizeInBps);
     fprintf(fid, 'Validation teAUPRC: %.4f\n', test_teAUPRC);
     fprintf(fid, 'Validation teAUROC: %.4f\n', test_teAUROC);
     fprintf(fid, 'Best C: %.3f\n', bestVals.best_C);
     fprintf(fid, 'Best sigma: %.3f\n', bestVals.best_sigma);
     fprintf(fid, 'Best nClusters: %d\n', bestVals.best_nClusters);
     fprintf(fid, 'Best teAUROC: %.4f\n', bestVals.best_teAUROC);
-    fprintf(fid, '======END======\n\n');
+    fprintf(fid, '====== END ======\n\n');
     fclose(fid);
 end
 if debugMsgLocation ~= 1
diff --git a/computeInstanceWideKernel.m b/computeInstanceWideKernel.m
new file mode 100644
index 0000000..c2086f4
--- /dev/null
+++ b/computeInstanceWideKernel.m
@@ -0,0 +1,62 @@
+function  [instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation)
+
+% COMPUTEINSTANCEWIDEKERNEL 
+% 
+
+if nargin < 3
+    debugMsgLocation = 1;
+    debugLevel = 2;
+    sparceComputation = 0;
+end
+logMessages(debugMsgLocation, sprintf('Onto compute the Std MI kernel\n'), debugLevel);
+nBags = length(allSeqsAsBags);
+
+tic;
+instanceEnds = cumsum(thisInstances);
+instanceStarts = cumsum(thisInstances) - thisInstances + 1;
+nInstances = instanceEnds(end);
+logMessages(debugMsgLocation, sprintf('instances starts and ends collected in %.4f seconds\n', toc), debugLevel);
+
+
+if sparseComputation
+    logMessages(debugMsgLocation, sprintf('Performing sparse computation for kernel on instances\n'), debugLevel);
+    tic;
+    allSeqsAsMat = zeros( size( allSeqsAsBags{1}, 1) , nInstances);
+    for i=1:nBags
+	allSeqsAsMat(:, instanceStarts(i): instanceEnds(i)) = allSeqsAsBags{i};
+    end
+    logMessages(debugMsgLocation, sprintf('Loop completed in %.4f seconds\n', toc), debugLevel);
+    % Above, looping version instead of cell2mat. This takes nearly half the time for really long sequences and too many instances.
+    % tic; allSeqsAsMat = cell2mat(allSeqsAsBags); toc;
+    tic;
+    instanceWideKernel = full(allSeqsAsMat' * allSeqsAsMat);
+    logMessages(debugMsgLocation, sprintf('full kernel computed in %.4f seconds\n', toc), debugLevel);
+    % nBags x nBags kernel actually need not be computed
+    clear allSeqsAsMat;
+    %
+else
+    % direct (not sparseComputation)
+    % This is tested neither for correctness nor efficiency. We recommend using sparseComputation.
+    logMessages(debugMsgLocation, sprintf('Computing directly, %d rows.\n', nBags), debugLevel);
+    stdMultiInstanceKernel = zeros(nBags);
+    for i=1:nBags
+        if mod(i, 10) == 0
+            logMessages(debugMsgLocation, sprintf('%d, ', i), debugLevel);
+        end
+        iInstances = size(allSeqsAsBags{i}, 2);
+        for j=i:nBags
+            term = transpose(allSeqsAsBags{i}) * allSeqsAsBags{j};
+	    jInstances = size(allSeqsAsBags{j}, 2);
+            stdMultiInstanceKernel(i,j) = sum(term(:)) * (1/iInstances) * (1/jInstances);
+        end
+    end
+    logMessages(debugMsgLocation, sprintf('\n'), debugLevel);
+    % Fill-up the other half
+    stdMultiInstanceKernel = triu(stdMultiInstanceKernel, 1) + stdMultiInstanceKernel';
+    % Normalize the kernel matrix -- done by calling normalizeKernel
+    stdMultiInstanceKernel = normalizeKernel(stdMultiInstanceKernel);
+    %
+    %
+end %if-else ends
+
+end % function ends
diff --git a/computeStdMultiInstanceKernel.m b/computeStdMultiInstanceKernel.m
deleted file mode 100644
index b62ac4d..0000000
--- a/computeStdMultiInstanceKernel.m
+++ /dev/null
@@ -1,123 +0,0 @@
-function  [instanceStarts, instanceEnds, instanceWideKernel, stdMultiInstanceKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation)
-
-% Input Params:
-% 
-
-if nargin < 3
-    sparceComputation = 0;
-end
-logMessages(debugMsgLocation, sprintf('Onto compute the Std MI kernel\n'), debugLevel);
-nBags = length(allSeqsAsBags);
-%%instancesInEachBag = [0];
-
-% the for loop below can be replaced by a cellfun
-%thisInstances = cellfun(@(x) size(x,2), allSeqsAsBags);
-% get this beforehand
-tic;
-instanceEnds = cumsum(thisInstances);
-instanceStarts = cumsum(thisInstances) - thisInstances + 1;
-nInstances = instanceEnds(end);
-logMessages(debugMsgLocation, sprintf('instances starts and ends collected in %.4f seconds\n', toc), debugLevel);
-%instanceStarts = [1];
-%instanceEnds = [0];
-%for i=1:nBags
-%    thisInstances = size(allSeqsAsBags{i}, 2);
-    % this happens because the number of chunks in shifted case
-    % are one less than the non-shifted case. So if there is exactly
-    % one chunk in the non-shifted case, we copy it to the shifted case.
-    % This is done in the main script.
-    % --- Not needed anymore
-    %if(thisInstances == 0)
-    %	fprintf('YES, ZERO --%d \n', i);
-    %end
-    %instancesInEachBag = [instancesInEachBag thisInstances];
-%    instanceEnds = [instanceEnds instanceEnds(end)+thisInstances];
-%    instanceStarts = [instanceStarts instanceStarts(end)+thisInstances];
-%end
-%%instancesInEachBag = instancesInEachBag(2:end);
-%instanceEnds = instanceEnds(2:end);
-%instanceStarts = instanceStarts(1:end-1);
-%nInstances = instanceEnds(end);
-
-
-if sparseComputation
-    logMessages(debugMsgLocation, sprintf('Performing sparse computation for kernel on instances\n'), debugLevel);
-    tic;
-    allSeqsAsMat = zeros( size( allSeqsAsBags{1}, 1) , nInstances);
-    for i=1:nBags
-	allSeqsAsMat(:, instanceStarts(i): instanceEnds(i)) = allSeqsAsBags{i};
-    end
-    logMessages(debugMsgLocation, sprintf('Loop completed in %.4f seconds\n', toc), debugLevel);
-    % IMplemented looping instead of cell2mat. It takes nearly half the time
-    %tic;
-    %allSeqsAsMat = cell2mat(allSeqsAsBags);
-    %fprintf('cell2mat completed in %.4f seconds\n', toc);
-    tic;
-    instanceWideKernel = full(allSeqsAsMat' * allSeqsAsMat);
-    logMessages(debugMsgLocation, sprintf('full kernel computed in %.4f seconds\n', toc), debugLevel);
-    % nBags x nBags kernel actually need not be computed
-    stdMultiInstanceKernel = zeros(nBags);
-    clear allSeqsAsMat;
-    %
-elseif direct
-    logMessages(debugMsgLocation, sprintf('Computing directly, %d rows.\n', nBags), debugLevel);
-    stdMultiInstanceKernel = zeros(nBags);
-    for i=1:nBags
-        if mod(i, 10) == 0
-            logMessages(debugMsgLocation, sprintf('%d, ', i), debugLevel);
-        end
-        iInstances = size(allSeqsAsBags{i}, 2);
-        for j=i:nBags
-            term = transpose(allSeqsAsBags{i}) * allSeqsAsBags{j};
-	    %jInstances = size(allSeqsAsBags{j}, 2);
-            %stdMultiInstanceKernel(i,j) = sum(term(:)) * (1/iInstances) * (1/jInstances);
-        end
-    end
-    logMessages(debugMsgLocation, sprintf('\n'), debugLevel);
-    % Fill-up the other half
-    stdMultiInstanceKernel = triu(stdMultiInstanceKernel, 1) + stdMultiInstanceKernel';
-    % Normalize the kernel matrix -- done by calling normalizeKernel
-    stdMultiInstanceKernel = normalizeKernel(stdMultiInstanceKernel);
-    %disp(stdMultiInstanceKernel(1:5,1:5));
-    %
-    %
-else %breakdown
-    %tic;
-    %spead out the instances
-    X = zeros( size(allSeqsAsBags{1},1), nInstances);
-    disp(size(X));
-    for i=1:nBags
-        X(:,instanceStarts(i):instanceEnds(i)) = allSeqsAsBags{i};
-    end
-
-    allInstanceKernel = zeros(nInstances, nInstances);
-
-    start = 1;
-    step_size = 1;
-    for j=step_size:step_size:nInstances
-        parfor(i=start:j, 25)
-	    allInstanceKernel(i, :) = [zeros(1, i-1) sum(bsxfun(@times, X(:,i), X(:,i:end)), 1)];
-        end
-        start = j+1;
-        logMessages(debugMsgLocation, sprintf('%d', j), debugLevel);
-    end
-    % Remainder of the rows
-    parfor(i=start:nInstances, 25)%%i iterates over rows
-        allInstancesKernel(i,:) = [zeros(1,i-1) sum(bsxfun(@times, X(:,i), X(:,i:end)), 1)];
-    end
-    % Fill-up the other half
-    allInstancesKernel = triu(allInstancesKernel, 1) + allInstancesKernel';
-    disp(size(allInstancesKernel));
-
-    stdMultiInstanceKernel = zeros(nBags, nBags);
-    %for i=1:nBags
-    %    for j=1:nBags
-    %    stdMultiInstanceKernel(i,j) = sum(allInstancesKernel( instanceStarts(i):instanceEnds(i), instanceStarts(j):instanceEnds(j) )) * ...
-    %					(1 / (instanceEnds(i)-instanceStarts(i)+1) ) * (1 / (instanceEnds(j)-instanceStarts(j)+1) );
-    %    end
-    %end
-end
-%t = toc;
-%fprintf('%s\n',t);
-
-end
diff --git a/readFastaSequences.m b/readFastaSequences.m
index f35e056..821ee93 100644
--- a/readFastaSequences.m
+++ b/readFastaSequences.m
@@ -1,17 +1,17 @@
-function [nSeqs, ommittedIndices, passFasta] = readFastaSequences(txt, chunkSizeInBps, givenNSeqs, outputFolder, debugLevel, debugMsgLocation)
+function [nSeqs, ommittedIndices, passFasta] = readFastaSequences(txt, segmentSizeInBps, givenNSeqs, outputFolder, debugLevel, debugMsgLocation)
 
-% FUNCTION FASTA=READFASTA(FILENAME)
+% READFASTASEQUENCES
 %
-% READFASTA reads fasta file and extracts sequence to fasta variable
-% speed-optimized version of Jonas Almeidas readfasta.m
+% READFASTA reads fasta file and extracts sequence to fasta variable.
+% Modified version of readfasta shipped with ODH Matlab code.
 %
 % fasta - struct containing cell arrays with legends and sequences
 %	  fasta.legend   = cell array with sequence legends (headers)
 %	  fasta.seqeunce = cell array with sequences
 %
-%**
+%
 
-% Edits: March 20, 2017, snikumbh
+% Edits: snikumbh@mpi-inf,mpg.de
 
 totalArguments = 6;
 if nargin < totalArguments
@@ -26,7 +26,7 @@
 fasta.sequence = cell(1, givenNSeqs);
 i=0;
 ommittedIndices = [];
-fasta.title=[txt,',  read: ',date];
+fasta.title=[txt,',  read: ', date];
 % READS SEQUENCES AND THEIR LEGENDS
 fid=fopen(txt,'r');
 ofid=fopen(strcat(outputFolder, '/ommittedFastaIds.txt'), 'a'); 
@@ -36,12 +36,11 @@
     lineRead=fgetl(fid);
     if isempty(strfind(lineRead, '#'))
         if strfind(lineRead,'>')
-	    %seqLegends{i} = lineRead;
             fasta.legend{i+1} = lineRead;
         else
             i=i+1;
 	    lengths = [lengths length(lineRead)];
-	    if lengths(end) >= chunkSizeInBps
+	    if lengths(end) >= segmentSizeInBps
                 fasta.sequence{i}=[fasta.sequence{i},upper(lineRead)];
 	    else % we omit that sequence
 		ommittedIndices = [ommittedIndices i];
@@ -53,15 +52,14 @@
 end
 fclose(ofid);
 fclose(fid);
-ineligibleLengths = lengths(lengths < chunkSizeInBps);
-ineligibleIndices = find(lengths < chunkSizeInBps);
+ineligibleLengths = lengths(lengths < segmentSizeInBps);
+ineligibleIndices = find(lengths < segmentSizeInBps);
 nSeqs = givenNSeqs;
 passFasta = fasta;
 if size(ineligibleLengths, 2) > 0
-    %fprintf('TIP: FASTA file has sequences of length smaller than the segment-size specified. Please rectify by either removing these sequences or specifying a suitable segment-size.\n');
     logMessages(debugMsgLocation, sprintf('%d sequence(s) have been ommitted as they are shorter than the segment-size specified. Check ommittedFastaIds.txt\n', length(ineligibleIndices)), debugLevel);
     % Change to what would be passed, if it is different than 'fasta' above
-    nSeqs = length(find(lengths >= chunkSizeInBps));
+    nSeqs = length(find(lengths >= segmentSizeInBps));
     passFasta.sequence = cell(1, nSeqs);
     j = 1;
     for i=1:givenNSeqs