Skip to content
Permalink
master
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
function [run_this_cmd, pdfcrop_lines] = analyse_wvector(folderName, rankId, wvector, oligoLen, maxDist, topN, verbose, max_or_topN, debugLevel, debugMsgLocation)
% ANALYSE_WVECTOR
% Analyses the weight vector and plot distance-centric k-mer visualizations
%
% INPUT PARAMS
% Param 'folderName'
% Location/Path on disk of the output folder
%
% Param 'rankId'
% Rank of the weight vector in CoMIK
%
% Param 'wvector'
% The weight vector itself which is to be used here
%
% Param 'oligoLen'
% Oligomer length for the ODH representation
%
% Param 'maxDist'
% Maximum distance value for the ODH representation
%
% Param 'topN'
% Specify a value for topN
%
% Param 'verbose'
% Set to 1 for verbose output
%
% Param 'max_or_topN'
% Flag to specify if topN vizualization or max (Absolute Max Per Distance) visualization is to be performed
% Refer to Nikumbh and Pfeifer, BMC Bioinformatics, 2017.
%
% Param 'debugLevel'
%
% Param 'debugMsgLocation'
%
% OUTPUT PARAMS
% Param 'run_this_cmd'
% This could be used create a cmd that is to be run after the program ends to
% stitch together pages from the separate PDF files
%
% Param 'pdfcrop_lines'
% Cropping the PDF pages/files to cut out the unused area
%
%
% ADDITIONAL NOTES
%
%
% Author: snikumbh@mpi-inf.mpg.de
totalArguments = 10;
if nargin < totalArguments
debugMsgLocation = 1;
end
if nargin < totalArguments - 1
debugMsgLocation = 1;
debugLevel = 2;
end
pdfcrop_lines = [];
alphabet_size = 4;
alphabet = 'acgt';
% We will fetch max_dist from the weightvector length because the maxDist can be
% different based on whether the given maxDist is same as the segment size or not.
% This is more reliable.
max_dist = length(wvector) / (alphabet_size^oligoLen)^2;
pos_thresholds = []; neg_thresholds = [];
twoD_means = wvector;
logMessages(debugMsgLocation, sprintf('\nNow analyzing weight vector ranked: %d\n', rankId), debugLevel);
mat_rows = alphabet_size^oligoLen;
mat_cols = mat_rows;
norms_pair_matrix = zeros(mat_rows, mat_cols);
for i= 0 : (alphabet_size^oligoLen)-1
% i iterates over the first oligomer in the pair
start_pos = (i * (alphabet_size^oligoLen) * max_dist) + 1;
for j=0:(alphabet_size^oligoLen)-1
% j iterates over the second oligomer in the pair
jump_pos = j*max_dist;
perfect_pos = start_pos + jump_pos;
norms_pair_matrix(i+1,j+1) = norm( twoD_means(perfect_pos : perfect_pos + max_dist-1), 2);
end % inner for ends
end % outer for ends
heatmapCSV_fname = strcat(folderName, '/weightVectorNormed_rank', num2str(rankId), '.csv');
csvwrite(heatmapCSV_fname, norms_pair_matrix);
mat_layers = max_dist;
threeD_wvectors = zeros(mat_rows, mat_cols, mat_layers);
for i= 0 : (alphabet_size^oligoLen)-1
start_pos = (i * (alphabet_size^oligoLen) * max_dist) + 1;
for j=0:(alphabet_size^oligoLen)-1
jump_pos = j*max_dist;
perfect_pos = start_pos + jump_pos;
threeD_wvectors(i+1,j+1, :) = twoD_means(1, perfect_pos : perfect_pos + max_dist-1); %%Could use twoD_wvectors instead for individual vectors
end % inner for ends
end % outer for ends
if(verbose)
fprintf('2D wvector made into 3D\n');
end
% generate a switch case or mapping function that maps 3-mers
nums = [0:(alphabet_size^oligoLen)-1];
nums = dec2base(nums,4);
oligos = [];
for i=1:size(nums,1)
temp = [];
for j=1:size(nums,2)
temp = [temp alphabet(str2num(nums(i,j))+1)];
end
oligos = [oligos; temp];
end
% oligos would be a 64 x 3 char array for oligoLen 3
sorted_wvectors = sort(twoD_means, 'descend');
threshold_for_topN_pos = sorted_wvectors(topN);
pos_thresholds = [pos_thresholds threshold_for_topN_pos];
sorted_wvectors = sort(twoD_means, 'ascend');
threshold_for_topN_neg = sorted_wvectors(topN);
neg_thresholds = [neg_thresholds threshold_for_topN_neg];
% Which distance has most positive or negative weight
max_at_distances = []; max_motifs = {}; max_motif_weights = [];
min_at_distances = []; min_motifs = {}; min_motif_weights = [];
pos_motif_count_at_each_distance = [];
neg_motif_count_at_each_distance = [];
if max_or_topN
for i=1:max_dist
% Maximums
max_at_distances = [max_at_distances max(max(threeD_wvectors(:,:,i)))];
[a b] = find(threeD_wvectors(:,:,i) == max_at_distances(i));
max_motifs = [max_motifs; [oligos(a,:) oligos(b,:)]];
% Minimums
min_at_distances = [min_at_distances min(min(threeD_wvectors(:,:,i)))];
[a b] = find(threeD_wvectors(:,:,i) == min_at_distances(i));
min_motifs = [min_motifs; [oligos(a,:) oligos(b,:)]];
end
else
for i=1:max_dist
% Maximums
[a b] = find(threeD_wvectors(:,:,i) >= threshold_for_topN_pos);
pos_motif_count_at_each_distance = [pos_motif_count_at_each_distance; [i size(a,1)]];
temp_motifs = {};
for j=1:size(a)
temp_motifs{j} = [oligos(a(j),:) oligos(b(j),:)];
end
max_motifs{i} = temp_motifs;
max_motif_weights = [max_motif_weights; diag(threeD_wvectors(a,b,i))];
% Minimums
[a b] = find(threeD_wvectors(:,:,i) <= threshold_for_topN_neg);
neg_motif_count_at_each_distance = [neg_motif_count_at_each_distance; [i size(a,1)]];
temp_motifs = {};
for j=1:size(a)
temp_motifs{j} = [oligos(a(j),:) oligos(b(j),:)];
end
min_motifs{i} = temp_motifs;
min_motif_weights = [min_motif_weights; diag(threeD_wvectors(a,b,i))];
end
end
if max_or_topN == 0
% Write motifs to file, also write info. distance, weights
% 1. Write positive motifs
pos_motifs_fname = strcat(folderName, '/motifs/pos_motifs_oligoLen', num2str(oligoLen), '_thresholding_top', num2str(topN), '.new_motifs_rank', num2str(rankId));
fid = fopen(pos_motifs_fname, 'w');
% INFO lines
fprintf(fid, '#File created by MATLAB script @author: snikumbh@mpi-inf.mpg.de\n');
fprintf(fid, '#Motifs contributing towards classifying a sample as positive\n');
fprintf(fid, '#\n');
adder = 0; % required for indexing max_motif_weights
non_empty_count = 0;
fmt=[repmat('%1.4f ', 1, length(alphabet)) '\n'];
for i=1:length(max_motifs)
if ~isempty(max_motifs{i})
non_empty_count = non_empty_count + 1;
fprintf(fid, '#Distance = %d, Max.score: %1.4f, Min.score: %1.4f\n#K-mers: ', i-1, ...
max( max_motif_weights(adder+1:adder+length(max_motifs{i}))), ...
min( max_motif_weights(adder+1:adder+length(max_motifs{i}))));
temp_pssm = zeros(length(alphabet), 2*oligoLen);
for j=1:length(max_motifs{i})
fprintf(fid, '%s, ', max_motifs{i}{j});
for k=1:(2*oligoLen)
temp_pssm( find(alphabet == max_motifs{i}{j}(k)), k) = temp_pssm( find(alphabet == max_motifs{i}{j}(k)), k) + max_motif_weights(adder + j);
end
pssm = bsxfun(@rdivide, temp_pssm, sum(temp_pssm));
end
fprintf(fid, '\n');
fprintf(fid, fmt, pssm);
else
fprintf(fid, '#Distance = %d, EMPTY', i-1);
end
fprintf(fid, '\n');
adder = adder + length(max_motifs{i});
end
fclose(fid);
fprintf('\n#%d positive motifs in: %s.\n', non_empty_count, pos_motifs_fname);
% 2. Write negative motifs
neg_motifs_fname = strcat(folderName, '/motifs/neg_motifs_oligoLen', num2str(oligoLen), '_thresholding_top', num2str(topN), '.new_motifs_rank', num2str(rankId));
fid = fopen(neg_motifs_fname, 'w');
% INFO lines
fprintf(fid, '#File created by MATLAB script @author: snikumbh@mpi-inf.mpg.de\n');
fprintf(fid, '#Motifs contributing towards classifying a sample as negative\n');
fprintf(fid, '#\n');
adder = 0; % required for indexing max_motif_weights
non_empty_count = 0;
fmt=[repmat('%1.4f ', 1, length(alphabet)) '\n'];
for i=1:length(min_motifs)
if ~isempty(min_motifs{i})
non_empty_count = non_empty_count + 1;
fprintf(fid, '#Distance = %d, Max.score: %1.4f, Min.score: %1.4f\n#K-mers: ', i-1, ...
max(min_motif_weights(adder+1:adder+length(min_motifs{i}))), ...
min(min_motif_weights(adder+1:adder+length(min_motifs{i}))));
temp_pssm = zeros(length(alphabet), 2*oligoLen);
for j=1:length(min_motifs{i})
fprintf(fid, '%s, ', min_motifs{i}{j});
for k=1:(2*oligoLen)
temp_pssm( find(alphabet == min_motifs{i}{j}(k)), k) = temp_pssm( find(alphabet == min_motifs{i}{j}(k)), k) + abs(min_motif_weights(adder + j));
end
pssm = bsxfun(@rdivide, temp_pssm, sum(temp_pssm));
end
fprintf(fid, '\n');
fprintf(fid, fmt, pssm);
else
fprintf(fid, '#Distance = %d, EMPTY', i-1);
end
fprintf(fid, '\n');
adder = adder + length(min_motifs{i});
end
fclose(fid);
fprintf('#%d negative motifs in: %s.\n', non_empty_count, neg_motifs_fname);
end % max_or_topN condition ends
if max_or_topN
X = 0:(max_dist-1);
csv_fname = strcat(folderName, '/motifs/Max_and_Mins_oligolen', num2str(oligoLen), '_rank', num2str(rankId), '.csv');
max_and_mins = [min_at_distances; max_at_distances]';
csvwrite(csv_fname, max_and_mins);
Y = [min_at_distances; max_at_distances]';
h = stem(X,Y);
h(1).Color = 'black';
h(2).Color = 'black';
view([90 90]);
grid on;
xlim([-1 max_dist]);
ylim([-1.0 1.0]);
% ylim([min(min_at_distances)-1.0 max(max_at_distances)+1.0]);
xstr = sprintf('Distances (in basepairs)');
xlabel(xstr, 'Interpreter', 'latex', 'FontSize', 13);
ylabel('Weights', 'Interpreter', 'latex', 'FontSize', 13);
% [s,w] = strtok(cellline, '_');
% celltype_name_for_title = char(s);
tstr = sprintf('Highest scoring motifs for kernel ranked %d', rankId);
tstr2 = strrep(tstr,'_','\_');
title(tstr2, 'Interpreter', 'latex', 'FontSize', 14);
% Annotate with the oligomer-pair at the positions for which the maximum or minimum is recorded
% And place these annotations
Xd = get(h(1), 'XData');
Y1d = get(h(1), 'YData');
Y2d = get(h(2), 'YData');
if oligoLen == 3
plot_shift = 0.1;
else
plot_shift = 0.2;
end
for k = 1 : length(Xd)
% disp(upper(min_motifs(k,:)));
%if(k == oligoLen+1)
% %%this is odd distance value
% t1 = text(Xd(k), -0.6 + plot_shift, getColored(upper(min_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6);
%else
if(mod(k,2) ~= 0) %Y1d(k) - 0.1 or + 0.1
if(Y1d(k) ~= 0.0)
%y-position : Y1d(k)-0.70
t1 = text(Xd(k), -0.5, getColored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8);
%t1 = text(Xd(k), -0.65, 'Odd distances', 'HorizontalAlignment', 'center', 'FontSize', 6);
end
else
if(Y1d(k) ~= 0.0)
%y-position: Y1d(k)-0.70
t1 = text(Xd(k), -0.7, getColored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8);
%t1 = text(Xd(k), -0.65+plot_shift, 'Odd distances', 'HorizontalAlignment', 'center', 'FontSize', 6);
end
end
%%set(t1, 'Rotation', 90);
%if(k == oligoLen+1)
% %%this is odd distance value
% t2 = text(Xd(k), 0.6 - plot_shift, getColored(upper(max_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6);
%else
if(mod(k,2) ~= 0)
if(Y2d(k) ~= 0.0)
t2 = text(Xd(k), 0.5, getColored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8);
end
else
if(Y2d(k) ~= 0.0)
t2 = text(Xd(k), 0.7, getColored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8);
end
end
%set(t2, 'Rotation', 90);
end
if oligoLen == 3
t = text(length(Xd)/2, -0.85, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, -0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, 0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, 0.85, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
else
t = text(length(Xd)/2, -0.9, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, -0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, 0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
t = text(length(Xd)/2, 0.9, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex');
set(t, 'Rotation', 90);
end
%yyaxis right;
%new_ax = axes('ylim', [-1 max_dist], 'color', 'none', 'XTick', [], 'YAxisLocation', 'right');
%set(new_ax, 'Ydir', 'reverse');
pdf_fname = strcat(folderName, '/SeqLogos_MinMax_at_distances_rank', num2str(rankId), '_oligoLen', num2str(oligoLen));
print(pdf_fname,'-dpdf', '-r900');
% to produce EPS figures, use
% print(pdf_fname, '-depsc');
pdfcrop_lines = [pdfcrop_lines; sprintf('pdfcrop %s\n', pdf_fname)];
grid off;
else
%% Motif files are written
%% We will have R script running on it to generate sequence logos
end
if max_or_topN == 1
% generate inidividual EPS figures
collated_pdf_name = strcat(folderName, '/MinMax_at_distances_oligoLen', num2str(oligoLen) ,'_combined_colored.pdf');
run_this_cmd = collated_pdf_name;
else
run_this_cmd = '';
all_thresholds = [neg_thresholds; pos_thresholds]';
thresholds_fname = strcat(folderName, '/top', num2str(topN), '_thresholds_oligoLen', num2str(oligoLen),'.csv');
csvwrite(thresholds_fname, all_thresholds);
end
end %function ends