Skip to content

Commit

Permalink
Cleaning up function definitions and comments phase-I
Browse files Browse the repository at this point in the history
  • Loading branch information
snikumbh committed Jul 19, 2017
1 parent ecf9401 commit eb8bd56
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 298 deletions.
28 changes: 4 additions & 24 deletions analyze_wvector.m
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
function [run_this_cmd, pdfcrop_lines] = analyse_wvector(folderName, rankId, wvector, oligoLen, maxDist, topN, verbose, max_or_topN, debugLevel, debugMsgLocation)

%if(nargin < 4)
% topN = 50;
% verbose = 0;
%end
totalArguments = 10;
if nargin < totalArguments
debugMsgLocation = 1;
Expand All @@ -27,18 +23,14 @@
% different based on whether the given maxDist is same as the segment size or not.
% This is more reliable.
max_dist = length(wvector) / (alphabet_size^oligoLen)^2;
%assert((max_dist) == maxDist);

pos_thresholds = []; neg_thresholds = [];

twoD_means = wvector;
logMessages(debugMsgLocation, sprintf('\nNow analyzing weight vector ranked: %d\n', rankId), debugLevel);
%disp(size(wvector));
%disp(max_dist);
%%Getting a 3D heatmap

mat_rows = alphabet_size^oligoLen;
mat_cols = mat_rows;
%single_wvector_length = (alphabet_size^oligoLen)^2 * (max_dist);
norms_pair_matrix = zeros(mat_rows, mat_cols);
for i= 0 : (alphabet_size^oligoLen)-1
% i iterates over the first oligomer in the pair
Expand Down Expand Up @@ -316,7 +308,7 @@
grid off;
else
%% Motif files are written
%% We will have R scripting runnign on it to generate sequencelogos
%% We will have R script running on it to generate sequencelogos

%%Write out the motifs, and their weights to a file, which is read from R and plot logos
%for k = 1 : length(Xd)
Expand All @@ -341,22 +333,10 @@
% end
end

if max_or_topN == 1 %%-- generating inidividual EPS figures
if max_or_topN == 1
%% generate inidividual EPS figures
collated_pdf_name = strcat(folderName, '/MinMax_at_distances_oligoLen', num2str(oligoLen) ,'_combined_colored.pdf');
%fprintf('=====Cmds for unix:====\n');
%for i= 1:size(pdfcrop_lines,1)
% fprintf('%s\n', [pdfcrop_lines(i,1:end-1)]);
% %strcat('%s\n', [pdfcrop_lines(i,1:end-1)]);
%end

%% Combine/append all PDFs
%fprintf('\npdftk ');
%for i= 1:size(pdfcrop_lines,1)
% fprintf('%s-crop.pdf ', [pdfcrop_lines(i,9:end-1)]);
%end
%fprintf('cat output %s\n\n', collated_pdf_name);
run_this_cmd = collated_pdf_name;
%strcat();
else
run_this_cmd = '';
all_thresholds = [neg_thresholds; pos_thresholds]';
Expand Down
33 changes: 12 additions & 21 deletions barplot-heatmap-visualization.R
Original file line number Diff line number Diff line change
@@ -1,52 +1,43 @@


#rscaled <- scale(r, center = FALSE, scale = colSums(r,na.rm=T))
#may not need re-scaling
pdf('samarth_viz_in_r.pdf', width=10.0, height=6.0)
flds <- count.fields('samarth_test1.csv',sep=',')
s <- read.csv('samarth_test1.csv', fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1)))

pdf('sample_viz_in_r.pdf', width=10.0, height=6.0)
csv_filename <- '<enter-name>.csv'
flds <- count.fields(csv_filename, sep=',')
s <- read.csv(csv_filename, fill=TRUE, nrows=length(flds), header=F, sep=",", col.names=paste0("Seg-", seq(1,max(flds),by=1)))
r <- s[1:25,]
r_ones <- r
r_ones[!is.na(r)] <- 1
#ncolors <- max(r[complete.cases(r),])
#or

ncolors <- max(flds)
#color_palette <- rainbow(ncolors)
color_palette <- heat.colors(ncolors)

jet.colors <-
colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan",
"#7FFF7F", "yellow", "#FF7F00"))#, "red", "#7F0000"))
#color_palette <- jet.colors(ncolors)
# If a different color palette is required
# jet.colors <-
# colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan",
# "#7FFF7F", "yellow", "#FF7F00"))#, "red", "#7F0000"))

ypos <- apply(r, 2, cumsum)
ypos <- ypos - r/2

#xpos <- barplot(t(r_ones), col=rainbow(ncolors), axes=F, horiz=T)
xpos <- barplot(t(r_ones), col=c("white"), axes=F, horiz=T, axisnames=F)
cp <- 0
for (i in 1:ncol(r_ones)){
temp <- color_palette[r[seq(1,nrow(r),by=1), i]]
cp <- cbind(cp, temp)
#barplot(t(r_ones[]), col=color_palette[], add=T, axes=F)
}
new_cp <- cp[,-1]

for (i in 1:nrow(r_ones)){
#xx = r_ones
#xx[,-i] <- NA
#colnames(xx)[-i] <- NA
barplot(t(r_ones[i,]), col=new_cp[i,], horiz=T, add=T, space=(i-1)*1.2, axes=F, xlab="", axisnames=F)
}

axis(1,at=0.5+seq(0,ncolors-1,1), labels=seq(1,ncolors,1))
#Additional axis showing the numcleotide ranges
#axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1))
#Additional axis showing the nucleotide ranges, if required
## axis(1,at=seq(0,ncolors-1,1), labels=seq(1,ncolors,1))
legend("topright", legend = seq(1,ncolors), fill = color_palette, title="Segment ranks", ncol = 2, x.intersp = 0.1, bty="n", cex=0.8)
#pie(rep(1,ncolors), col=color_palette, labels=paste0("Rank ", seq(1,ncolors)), add=T)

dev.off()
#text(rep(xpos,5), as.vector(t(ypos)), labels=t(r)) # ??



7 changes: 4 additions & 3 deletions comik_main_with_weight_vector.m
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,11 @@
logMessages(debugMsgLocation,sprintf('Total instances from all bags: %d\n', nInstances), debugLevel);

direct = 0;
% use this flag for direct computation from bags instead of the 'sparseComputation' flag
% IMP: the 'direct' flag uses parfor for speeding up the instances' kernel computation.
% Using this flag for direct computation from bags (by setting to 1) instead of the 'sparseComputation' flag.
% IMP: the 'direct' flag uses parfor for speeding up the instance-wide kernel computation.
% We are using sparseComputation.
sparseComputation = 1;
[instanceStarts, instanceEnds, instanceWideKernel, allSeqsStdSetKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation);
[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation);
%
% Further are functions that will be recomputed per inner cross-validation iteration
% Cross-validation loop is also handled simultaneously
Expand Down
129 changes: 15 additions & 114 deletions comik_wrapper.m
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
function [] = comik_wrapper(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, testIndices, outputFolder, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, whetherToVisualizeWVector, debugLevel, debugMsgLocation, computationVersion)
function [] = comik_wrapper(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, testIndices, outputFolder, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, whetherToVisualizeWVector, debugLevel, debugMsgLocation, computationVersion)

% COMIKL_WRAPPER
% COMIK_WRAPPER
% Usage:
% comik_wrapper(givenFastaFilename, nPosSequences, nNegSequences, ...
% outputFolder, oligoLen, maxDist, chunkSizeInBps, nClusterVals, ...
% outputFolder, oligoLen, maxDist, segmentSizeInBps, nClusterVals, ...
% sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, ...
% computationVersion, whetherToVisualizeWVector)
%
Expand Down Expand Up @@ -154,7 +154,7 @@
Cs = 10.^[-3:1:3];
sigmaVals = 10.^[-1:1:2];
nClusterVals = [2 5];
chunkSizeInBps = 100;
segmentSizeInBps = 100;
end
if nargin < totalArguments - 12
computationVersion = 'Looping';
Expand All @@ -168,7 +168,7 @@
Cs = 10.^[-3:1:3];
sigmaVals = 10.^[-1:1:2];
nClusterVals = [2 5];
chunkSizeInBps = 100;
segmentSizeInBps = 100;
maxDist = 50;
end
if nargin < totalArguments - 13
Expand All @@ -183,111 +183,12 @@
Cs = 10.^[-3:1:3];
sigmaVals = 10.^[-1:1:2];
nClusterVals = [2 5];
chunkSizeInBps = 100;
segmentSizeInBps = 100;
maxDist = 50;
oligoLen = [2];
end


problemDesc = '';
switch problemDesc
%chr19-- 535|3340 -- 400 positives for training, 800 negatives for training
%pos train : 1:400, pos test : 401:535
%neg train : 536:1335 , neg test : 1336:3875
case char('nucleosome_chr19')
givenFastaFilename = 'dataGeneration/nucleosome_positioning/01_HepG2_LiHG_CtX.isect.ovl.chr19.fasta';
testIndices = [401:535 1336:3875];
%
%
%chr17 -- 649|3669 -- 475 positives for training, 900 negatives for training
%pos train : 1:475, pos test : 476:649
%neg train : 650:1549 , neg test : 1550:4318
case char('nucleosome_chr17')
givenFastaFilename = 'dataGeneration/nucleosome_positioning/01_HepG2_LiHG_CtX.isect.ovl.chr17.fasta';
testIndices = [476:649 1550:4318];
%
%
case char('WUS=LFY_instersect')
givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS=LFY_intersect-prepared1.fasta';
testIndices = [51:65 2066:16712];
%
%
case char('WUS-loop-logic')
givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-loop-logic-prepared1.fasta';
testIndices = [81:104 2105:16751];
%
%
case char('kirmes-athaliana-WUS-CLV3-subtract5')
%42 pos, 16647 neg, as test examples, shall predict using weight vector
givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3_subtract-prepared5.fasta';
testIndices = [31:42 2043:16689];
%
%
case char('kirmes-gen')
givenFastaFilename = 'data/kirmes-gen.fasta';
testIndices = [501:600 1101:1200];
%
%
case char('new-simulated-dataset1')
givenFastaFilename = 'dataGeneration/new_simulated_dataset1.fasta';
testIndices = [901:1000 1901:2000];
%
%
case char('new-simulated-dataset2')
givenFastaFilename = 'dataGeneration/new_simulated_dataset2.fasta';
testIndices = [501:600 1101:1200];
%
%
case char('new-simulated-dataset3')
givenFastaFilename = 'dataGeneration/new_simulated_dataset3.fasta';
testIndices = [401:500 901:1000];
%
%
case char('new-simulated-dataset4')
givenFastaFilename = 'dataGeneration/new_simulated_dataset4.fasta';
testIndices = [501:600 1101:1200];
%
%
case char('cgi-noncgi-dataset')
givenFastaFilename = 'dataGeneration/cgi-noncgi-dataset/chr17_19_22.fasta';
testIndices = [1301:1954 2605:2926];
%
%
case char('cgi-noncgi-chr19only')
givenFastaFilename = 'dataGeneration/cgi-noncgi-dataset/chr19.fasta';
testIndices = [601:934 1235:1401];
%
%
case char('kirmes-athaliana-WUS-CLV3-subtract')
givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared.fasta';
testIndices = [31:42 193:242];
%
%
case char('kirmes-athaliana-WUS-CLV3-subtract2')
givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared2.fasta';
testIndices = [31:42 843:1042];
%
%
case char('kirmes-athaliana-WUS-CLV3-subtract3')
%42 pos, 5000 neg, 4200 as test examples, shall predict using weight vector
givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared3.fasta';
testIndices = [31:42 843:5042];
%
%
case char('kirmes-athaliana-WUS-CLV3-subtract4')
%42 pos, 16647 neg, as test examples, shall predict using weight vector
givenFastaFilename = 'dataGeneration/athaliana_kirmes/WUS-CLV3-subtract-prepared4.fasta';
testIndices = [31:42 843:16689];
%
%
case char('distal-proximal-cgi')
givenFastaFilename = 'dataGeneration/distal_proximal_cgi/all-combined.fasta';
testIndices = [1501:2260 2461:2541];
%
%
otherwise
%fprintf('No problem specified with param problemDesc.\n');
end
if exist(outputFolder, 'dir') == 7
%do-nothing
else
Expand Down Expand Up @@ -328,7 +229,7 @@ case char('distal-proximal-cgi')
parfor (i=1:nOuterFolds, nOuterFolds)
thisFoldDirectory = strcat(outputFolder, '/outer_fold_', num2str(i));
returnStatusVal = perform_one_outer_fold_for_comik(i, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, ...
maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, ...
maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, ...
thisTestIndices{i}, debugLevel, thisFoldDirectory, whetherToPlotHeatmap, ...
computationVersion, whetherToVisualizeWVector, debugMsgLocation);
logMessages(1, sprintf('\nOuter fold %d: Status %s\n', i, returnStatusVal), debugLevel);
Expand All @@ -338,7 +239,7 @@ case char('distal-proximal-cgi')

end % comik_wrapper function ends

function statusVal = perform_one_outer_fold_for_comik(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation)
function statusVal = perform_one_outer_fold_for_comik(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation)
% Runs one outer fold for CoMIKL

% Make the outputFolder for this outer fold
Expand All @@ -362,7 +263,7 @@ case char('distal-proximal-cgi')
end

for l=1:size(oligoLen,2)
filenameSuffix = strcat('_segment-size', num2str(chunkSizeInBps), '_oligoLen', num2str(oligoLen(l)));
filenameSuffix = strcat('_segment-size', num2str(segmentSizeInBps), '_oligoLen', num2str(oligoLen(l)));
runSummaryFilename = strcat(outputFolder, '/runSummary', filenameSuffix, '.txt');
fid = fopen(runSummaryFilename, 'a');

Expand All @@ -382,8 +283,8 @@ case char('distal-proximal-cgi')
logMessages(debugMsgLocation, sprintf(' NegFasta: %s \n', givenNegFastaFilename), debugLevel);


fprintf(fid, ' oligoLength: %d, maxDist: %d, chunkSize: %d bps \n', oligoLen(l), maxDist, chunkSizeInBps);
logMessages(debugMsgLocation, sprintf(' oligoLength: %d, maxDist: %d, chunkSize: %d bps \n', oligoLen(l), maxDist, chunkSizeInBps), debugLevel);
fprintf(fid, ' oligoLength: %d, maxDist: %d, segmentSize: %d bps \n', oligoLen(l), maxDist, segmentSizeInBps);
logMessages(debugMsgLocation, sprintf(' oligoLength: %d, maxDist: %d, segmentSize: %d bps \n', oligoLen(l), maxDist, segmentSizeInBps), debugLevel);

fprintf(fid, ' mklNorm: %.2f\n', mklNorm);
logMessages(debugMsgLocation, sprintf(' mklNorm: %.2f\n', mklNorm), debugLevel);
Expand Down Expand Up @@ -414,18 +315,18 @@ case char('distal-proximal-cgi')
fclose(fid);
%
%
[allSeqsAsBags, allSeqsConformedSetKernel, kernelweights, thetaVals, instanceWeightsInEachBag, resultString, bestVals, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen(l), maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector);
[allSeqsAsBags, allSeqsConformedSetKernel, kernelweights, thetaVals, instanceWeightsInEachBag, resultString, bestVals, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen(l), maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector);
fid=fopen(runSummaryFilename, 'a');
fprintf(fid, '\n\n');
fprintf(fid, '=====Validation results - SAMARTH=====\n');
fprintf(fid, 'OligoLen: %d, maxDist: %d, segment size: %d\n', oligoLen(l), maxDist, chunkSizeInBps);
fprintf(fid, '===== Validation results =====\n');
fprintf(fid, 'OligoLen: %d, maxDist: %d, segment size: %d\n', oligoLen(l), maxDist, segmentSizeInBps);
fprintf(fid, 'Validation teAUPRC: %.4f\n', test_teAUPRC);
fprintf(fid, 'Validation teAUROC: %.4f\n', test_teAUROC);
fprintf(fid, 'Best C: %.3f\n', bestVals.best_C);
fprintf(fid, 'Best sigma: %.3f\n', bestVals.best_sigma);
fprintf(fid, 'Best nClusters: %d\n', bestVals.best_nClusters);
fprintf(fid, 'Best teAUROC: %.4f\n', bestVals.best_teAUROC);
fprintf(fid, '======END======\n\n');
fprintf(fid, '====== END ======\n\n');
fclose(fid);
end
if debugMsgLocation ~= 1
Expand Down
Loading

0 comments on commit eb8bd56

Please sign in to comment.