Skip to content
Permalink
a056a1c421
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
351 lines (330 sloc) 13.8 KB
function [run_this_cmd, pdfcrop_lines] = analyse_wvector(folderName, rankId, wvector, oligoLen, maxDist, topN, verbose, max_or_topN, debugLevel, debugMsgLocation)
% ANALYSE_WVECTOR
% Analyses the weight vector and plot distance-centric k-mer visualizations
%
% INPUT PARAMS
% Param 'folderName'
% Location/Path on disk of the output folder
%
% Param 'rankId'
% Rank of the weight vector in CoMIK
%
% Param 'wvector'
% The weight vector itself which is to be used here
%
% Param 'oligoLen'
% Oligomer length for the ODH representation
%
% Param 'maxDist'
% Maximum distance value for the ODH representation
%
% Param 'topN'
% Specify a value for topN
%
% Param 'verbose'
% Set to 1 for verbose output
%
% Param 'max_or_topN'
% Flag to specify if topN vizualization or max (Absolute Max Per Distance) visualization is to be performed
% Refer to Nikumbh and Pfeifer, BMC Bioinformatics, 2017.
%
% Param 'debugLevel'
%
% Param 'debugMsgLocation'
%
% OUTPUT PARAMS
% Param 'run_this_cmd'
% This could be used create a cmd that is to be run after the program ends to
% stitch together pages from the separate PDF files
%
% Param 'pdfcrop_lines'
% Cropping the PDF pages/files to cut out the unused area
%
%
% ADDITIONAL NOTES
%
%
% Author: snikumbh@mpi-inf.mpg.de
totalArguments = 10;
if nargin < totalArguments
debugMsgLocation = 1;
end
if nargin < totalArguments - 1
debugMsgLocation = 1;
debugLevel = 2;
end
pdfcrop_lines = [];
alphabet_size = 4;
alphabet = 'acgt';
% We will fetch max_dist from the weightvector length because the maxDist can be
% different based on whether the given maxDist is same as the segment size or not.
% This is more reliable.
max_dist = length(wvector) / (alphabet_size^oligoLen)^2;
pos_thresholds = []; neg_thresholds = [];
twoD_means = wvector;
logMessages(debugMsgLocation, sprintf('\nNow analyzing weight vector ranked: %d\n', rankId), debugLevel);
mat_rows = alphabet_size^oligoLen;
mat_cols = mat_rows;
norms_pair_matrix = zeros(mat_rows, mat_cols);
for i= 0 : (alphabet_size^oligoLen)-1
% i iterates over the first oligomer in the pair
start_pos = (i * (alphabet_size^oligoLen) * max_dist) + 1;
for j=0:(alphabet_size^oligoLen)-1
% j iterates over the second oligomer in the pair
jump_pos = j*max_dist;
perfect_pos = start_pos + jump_pos;
norms_pair_matrix(i+1,j+1) = norm( twoD_means(perfect_pos : perfect_pos + max_dist-1), 2);
end % inner for ends
end % outer for ends
heatmapCSV_fname = strcat(folderName, '/weightVectorNormed_rank', num2str(rankId), '.csv');
csvwrite(heatmapCSV_fname, norms_pair_matrix);
mat_layers = max_dist;
threeD_wvectors = zeros(mat_rows, mat_cols, mat_layers);
for i= 0 : (alphabet_size^oligoLen)-1
start_pos = (i * (alphabet_size^oligoLen) * max_dist) + 1;
for j=0:(alphabet_size^oligoLen)-1
jump_pos = j*max_dist;
perfect_pos = start_pos + jump_pos;
threeD_wvectors(i+1,j+1, :) = twoD_means(1, perfect_pos : perfect_pos + max_dist-1); %%Could use twoD_wvectors instead for individual vectors
end % inner for ends
end % outer for ends
if(verbose)
fprintf('2D wvector made into 3D\n');
end
% generate a switch case or mapping function that maps 3-mers
nums = [0:(alphabet_size^oligoLen)-1];
nums = dec2base(nums,4);
oligos = [];
for i=1:size(nums,1)
temp = [];
for j=1:size(nums,2)
temp = [temp alphabet(str2num(nums(i,j))+1)];
end
oligos = [oligos; temp];
end
% oligos would be a 64 x 3 char array for oligoLen 3
sorted_wvectors = sort(twoD_means, 'descend');
threshold_for_topN_pos = sorted_wvectors(topN);
pos_thresholds = [pos_thresholds threshold_for_topN_pos];
sorted_wvectors = sort(twoD_means, 'ascend');
threshold_for_topN_neg = sorted_wvectors(topN);
neg_thresholds = [neg_thresholds threshold_for_topN_neg];
% Which distance has most positive or negative weight
max_at_distances = []; max_motifs = {}; max_motif_weights = [];
min_at_distances = []; min_motifs = {}; min_motif_weights = [];
pos_motif_count_at_each_distance = [];
neg_motif_count_at_each_distance = [];
if max_or_topN
for i=1:max_dist
% Maximums
max_at_distances = [max_at_distances max(max(threeD_wvectors(:,:,i)))];
[a b] = find(threeD_wvectors(:,:,i) == max_at_distances(i));
max_motifs = [max_motifs; [oligos(a,:) oligos(b,:)]];
% Minimums
min_at_distances = [min_at_distances min(min(threeD_wvectors(:,:,i)))];
[a b] = find(threeD_wvectors(:,:,i) == min_at_distances(i));
min_motifs = [min_motifs; [oligos(a,:) oligos(b,:)]];
end
else
for i=1:max_dist
% Maximums
[a b] = find(threeD_wvectors(:,:,i) >= threshold_for_topN_pos);
pos_motif_count_at_each_distance = [pos_motif_count_at_each_distance; [i size(a,1)]];
temp_motifs = {};
for j=1:size(a)
temp_motifs{j} = [oligos(a(j),:) oligos(b(j),:)];
end
max_motifs{i} = temp_motifs;
max_motif_weights = [max_motif_weights; diag(threeD_wvectors(a,b,i))];
% Minimums
[a b] = find(threeD_wvectors(:,:,i) <= threshold_for_topN_neg);
neg_motif_count_at_each_distance = [neg_motif_count_at_each_distance; [i size(a,1)]];
temp_motifs = {};
for j=1:size(a)
temp_motifs{j} = [oligos(a(j),:) oligos(b(j),:)];
end
min_motifs{i} = temp_motifs;
min_motif_weights = [min_motif_weights; diag(threeD_wvectors(a,b,i))];
end
end
if max_or_topN == 0
% Write motifs to file, also write info. distance, weights
% 1. Write positive motifs
pos_motifs_fname = strcat(folderName, '/motifs/pos_motifs_oligoLen', num2str(oligoLen), '_thresholding_top', num2str(topN), '.new_motifs_rank', num2str(rankId));
fid = fopen(pos_motifs_fname, 'w');
% INFO lines
fprintf(fid, '#File created by MATLAB script @author: snikumbh@mpi-inf.mpg.de\n');
fprintf(fid, '#Motifs contributing towards classifying a sample as positive\n');
fprintf(fid, '#\n');
adder = 0; % required for indexing max_motif_weights
non_empty_count = 0;
fmt=[repmat('%1.4f ', 1, length(alphabet)) '\n'];
for i=1:length(max_motifs)
if ~isempty(max_motifs{i})
non_empty_count = non_empty_count + 1;
fprintf(fid, '#Distance = %d, Max.score: %1.4f, Min.score: %1.4f\n#K-mers: ', i-1, ...
max( max_motif_weights(adder+1:adder+length(max_motifs{i}))), ...
min( max_motif_weights(adder+1:adder+length(max_motifs{i}))));
temp_pssm = zeros(length(alphabet), 2*oligoLen);
for j=1:length(max_motifs{i})
fprintf(fid, '%s, ', max_motifs{i}{j});
for k=1:(2*oligoLen)
temp_pssm( find(alphabet == max_motifs{i}{j}(k)), k) = temp_pssm( find(alphabet == max_motifs{i}{j}(k)), k) + max_motif_weights(adder + j);
end
pssm = bsxfun(@rdivide, temp_pssm, sum(temp_pssm));
end
fprintf(fid, '\n');
fprintf(fid, fmt, pssm);
else
fprintf(fid, '#Distance = %d, EMPTY', i-1);
end
fprintf(fid, '\n');
adder = adder + length(max_motifs{i});
end
fclose(fid);
fprintf('\n#%d positive motifs in: %s.\n', non_empty_count, pos_motifs_fname);
% 2. Write negative motifs
neg_motifs_fname = strcat(folderName, '/motifs/neg_motifs_oligoLen', num2str(oligoLen), '_thresholding_top', num2str(topN), '.new_motifs_rank', num2str(rankId));
fid = fopen(neg_motifs_fname, 'w');
% INFO lines
fprintf(fid, '#File created by MATLAB script @author: snikumbh@mpi-inf.mpg.de\n');
fprintf(fid, '#Motifs contributing towards classifying a sample as negative\n');
fprintf(fid, '#\n');
adder = 0; % required for indexing max_motif_weights
non_empty_count = 0;
fmt=[repmat('%1.4f ', 1, length(alphabet)) '\n'];
for i=1:length(min_motifs)
if ~isempty(min_motifs{i})
non_empty_count = non_empty_count + 1;
fprintf(fid, '#Distance = %d, Max.score: %1.4f, Min.score: %1.4f\n#K-mers: ', i-1, ...
max(min_motif_weights(adder+1:adder+length(min_motifs{i}))), ...
min(min_motif_weights(adder+1:adder+length(min_motifs{i}))));
temp_pssm = zeros(length(alphabet), 2*oligoLen);
for j=1:length(min_motifs{i})
fprintf(fid, '%s, ', min_motifs{i}{j});
for k=1:(2*oligoLen)
temp_pssm( find(alphabet == min_motifs{i}{j}(k)), k) = temp_pssm( find(alphabet == min_motifs{i}{j}(k)), k) + abs(min_motif_weights(adder + j));
end
pssm = bsxfun(@rdivide, temp_pssm, sum(temp_pssm));
end
fprintf(fid, '\n');
fprintf(fid, fmt, pssm);
else
fprintf(fid, '#Distance = %d, EMPTY', i-1);
end
fprintf(fid, '\n');
adder = adder + length(min_motifs{i});
end
fclose(fid);
fprintf('#%d negative motifs in: %s.\n', non_empty_count, neg_motifs_fname);
end % max_or_topN condition ends
if max_or_topN
X = 0:(max_dist-1);
csv_fname = strcat(folderName, '/motifs/Max_and_Mins_oligolen', num2str(oligoLen), '_rank', num2str(rankId), '.csv');
max_and_mins = [min_at_distances; max_at_distances]';
csvwrite(csv_fname, max_and_mins);
Y = [min_at_distances; max_at_distances]';
h = stem(X,Y);
h(1).Color = 'black';
h(2).Color = 'black';
view([90 90]);
grid on;
xlim([-1 max_dist]);
ylim([-1.0 1.0]);
% ylim([min(min_at_distances)-1.0 max(max_at_distances)+1.0]);
xstr = sprintf('Distances (in basepairs)');
xlabel(xstr, 'Interpreter', 'latex', 'FontSize', 13);
ylabel('Weights', 'Interpreter', 'latex', 'FontSize', 13);
% [s,w] = strtok(cellline, '_');
% celltype_name_for_title = char(s);
tstr = sprintf('Highest scoring motifs for kernel ranked %d', rankId);
tstr2 = strrep(tstr,'_','\_');
title(tstr2, 'Interpreter', 'latex', 'FontSize', 14);
% Annotate with the oligomer-pair at the positions for which the maximum or minimum is recorded
% And place these annotations
Xd = get(h(1), 'XData');
Y1d = get(h(1), 'YData');
Y2d = get(h(2), 'YData');
if oligoLen == 3
plot_shift = 0.1;
else
plot_shift = 0.2;
end
for k = 1 : length(Xd)
% disp(upper(min_motifs(k,:)));
%if(k == oligoLen+1)
% %%this is odd distance value
% t1 = text(Xd(k), -0.6 + plot_shift, getColored(upper(min_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6);
%else
if(mod(k,2) ~= 0) %Y1d(k) - 0.1 or + 0.1
if(Y1d(k) ~= 0.0)
%y-position : Y1d(k)-0.70
t1 = text(Xd(k), -0.5, getColored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8);
%t1 = text(Xd(k), -0.65, 'Odd distances', 'HorizontalAlignment', 'center', 'FontSize', 6);
end
else
if(Y1d(k) ~= 0.0)
%y-position: Y1d(k)-0.70
t1 = text(Xd(k), -0.7, getColored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8);
%t1 = text(Xd(k), -0.65+plot_shift, 'Odd distances', 'HorizontalAlignment', 'center', 'FontSize', 6);
end
end
%%set(t1, 'Rotation', 90);
%if(k == oligoLen+1)
% %%this is odd distance value
% t2 = text(Xd(k), 0.6 - plot_shift, getColored(upper(max_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6);
%else
if(mod(k,2) ~= 0)
if(Y2d(k) ~= 0.0)
t2 = text(Xd(k), 0.5, getColored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8);
end
else
if(Y2d(k) ~= 0.0)
t2 = text(Xd(k), 0.7, getColored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8);
end
end
%set(t2, 'Rotation', 90);
end
if oligoLen == 3
t = text(length(Xd)/2, -0.85, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, -0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, 0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, 0.85, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
else
t = text(length(Xd)/2, -0.9, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, -0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, 0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, 0.9, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
end
%yyaxis right;
%new_ax = axes('ylim', [-1 max_dist], 'color', 'none', 'XTick', [], 'YAxisLocation', 'right');
%set(new_ax, 'Ydir', 'reverse');
pdf_fname = strcat(folderName, '/SeqLogos_MinMax_at_distances_rank', num2str(rankId), '_oligoLen', num2str(oligoLen));
print(pdf_fname,'-dpdf', '-r900');
% to produce EPS figures, use
% print(pdf_fname, '-depsc');
pdfcrop_lines = [pdfcrop_lines; sprintf('pdfcrop %s\n', pdf_fname)];
grid off;
else
%% Motif files are written
%% We will have R script running on it to generate sequence logos
end
if max_or_topN == 1
% generate inidividual EPS figures
collated_pdf_name = strcat(folderName, '/MinMax_at_distances_oligoLen', num2str(oligoLen) ,'_combined_colored.pdf');
run_this_cmd = collated_pdf_name;
else
run_this_cmd = '';
all_thresholds = [neg_thresholds; pos_thresholds]';
thresholds_fname = strcat(folderName, '/top', num2str(topN), '_thresholds_oligoLen', num2str(oligoLen),'.csv');
csvwrite(thresholds_fname, all_thresholds);
end
end %function ends