Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
comik/analyze_wvector.m
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
351 lines (330 sloc)
13.8 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [run_this_cmd, pdfcrop_lines] = analyse_wvector(folderName, rankId, wvector, oligoLen, maxDist, topN, verbose, max_or_topN, debugLevel, debugMsgLocation) | |
% ANALYSE_WVECTOR | |
% Analyses the weight vector and plot distance-centric k-mer visualizations | |
% | |
% INPUT PARAMS | |
% Param 'folderName' | |
% Location/Path on disk of the output folder | |
% | |
% Param 'rankId' | |
% Rank of the weight vector in CoMIK | |
% | |
% Param 'wvector' | |
% The weight vector itself which is to be used here | |
% | |
% Param 'oligoLen' | |
% Oligomer length for the ODH representation | |
% | |
% Param 'maxDist' | |
% Maximum distance value for the ODH representation | |
% | |
% Param 'topN' | |
% Specify a value for topN | |
% | |
% Param 'verbose' | |
% Set to 1 for verbose output | |
% | |
% Param 'max_or_topN' | |
% Flag to specify if topN vizualization or max (Absolute Max Per Distance) visualization is to be performed | |
% Refer to Nikumbh and Pfeifer, BMC Bioinformatics, 2017. | |
% | |
% Param 'debugLevel' | |
% | |
% Param 'debugMsgLocation' | |
% | |
% OUTPUT PARAMS | |
% Param 'run_this_cmd' | |
% This could be used create a cmd that is to be run after the program ends to | |
% stitch together pages from the separate PDF files | |
% | |
% Param 'pdfcrop_lines' | |
% Cropping the PDF pages/files to cut out the unused area | |
% | |
% | |
% ADDITIONAL NOTES | |
% | |
% | |
% Author: snikumbh@mpi-inf.mpg.de | |
totalArguments = 10; | |
if nargin < totalArguments | |
debugMsgLocation = 1; | |
end | |
if nargin < totalArguments - 1 | |
debugMsgLocation = 1; | |
debugLevel = 2; | |
end | |
pdfcrop_lines = []; | |
alphabet_size = 4; | |
alphabet = 'acgt'; | |
% We will fetch max_dist from the weightvector length because the maxDist can be | |
% different based on whether the given maxDist is same as the segment size or not. | |
% This is more reliable. | |
max_dist = length(wvector) / (alphabet_size^oligoLen)^2; | |
pos_thresholds = []; neg_thresholds = []; | |
twoD_means = wvector; | |
logMessages(debugMsgLocation, sprintf('\nNow analyzing weight vector ranked: %d\n', rankId), debugLevel); | |
mat_rows = alphabet_size^oligoLen; | |
mat_cols = mat_rows; | |
norms_pair_matrix = zeros(mat_rows, mat_cols); | |
for i= 0 : (alphabet_size^oligoLen)-1 | |
% i iterates over the first oligomer in the pair | |
start_pos = (i * (alphabet_size^oligoLen) * max_dist) + 1; | |
for j=0:(alphabet_size^oligoLen)-1 | |
% j iterates over the second oligomer in the pair | |
jump_pos = j*max_dist; | |
perfect_pos = start_pos + jump_pos; | |
norms_pair_matrix(i+1,j+1) = norm( twoD_means(perfect_pos : perfect_pos + max_dist-1), 2); | |
end % inner for ends | |
end % outer for ends | |
heatmapCSV_fname = strcat(folderName, '/weightVectorNormed_rank', num2str(rankId), '.csv'); | |
csvwrite(heatmapCSV_fname, norms_pair_matrix); | |
mat_layers = max_dist; | |
threeD_wvectors = zeros(mat_rows, mat_cols, mat_layers); | |
for i= 0 : (alphabet_size^oligoLen)-1 | |
start_pos = (i * (alphabet_size^oligoLen) * max_dist) + 1; | |
for j=0:(alphabet_size^oligoLen)-1 | |
jump_pos = j*max_dist; | |
perfect_pos = start_pos + jump_pos; | |
threeD_wvectors(i+1,j+1, :) = twoD_means(1, perfect_pos : perfect_pos + max_dist-1); %%Could use twoD_wvectors instead for individual vectors | |
end % inner for ends | |
end % outer for ends | |
if(verbose) | |
fprintf('2D wvector made into 3D\n'); | |
end | |
% generate a switch case or mapping function that maps 3-mers | |
nums = [0:(alphabet_size^oligoLen)-1]; | |
nums = dec2base(nums,4); | |
oligos = []; | |
for i=1:size(nums,1) | |
temp = []; | |
for j=1:size(nums,2) | |
temp = [temp alphabet(str2num(nums(i,j))+1)]; | |
end | |
oligos = [oligos; temp]; | |
end | |
% oligos would be a 64 x 3 char array for oligoLen 3 | |
sorted_wvectors = sort(twoD_means, 'descend'); | |
threshold_for_topN_pos = sorted_wvectors(topN); | |
pos_thresholds = [pos_thresholds threshold_for_topN_pos]; | |
sorted_wvectors = sort(twoD_means, 'ascend'); | |
threshold_for_topN_neg = sorted_wvectors(topN); | |
neg_thresholds = [neg_thresholds threshold_for_topN_neg]; | |
% Which distance has most positive or negative weight | |
max_at_distances = []; max_motifs = {}; max_motif_weights = []; | |
min_at_distances = []; min_motifs = {}; min_motif_weights = []; | |
pos_motif_count_at_each_distance = []; | |
neg_motif_count_at_each_distance = []; | |
if max_or_topN | |
for i=1:max_dist | |
% Maximums | |
max_at_distances = [max_at_distances max(max(threeD_wvectors(:,:,i)))]; | |
[a b] = find(threeD_wvectors(:,:,i) == max_at_distances(i)); | |
max_motifs = [max_motifs; [oligos(a,:) oligos(b,:)]]; | |
% Minimums | |
min_at_distances = [min_at_distances min(min(threeD_wvectors(:,:,i)))]; | |
[a b] = find(threeD_wvectors(:,:,i) == min_at_distances(i)); | |
min_motifs = [min_motifs; [oligos(a,:) oligos(b,:)]]; | |
end | |
else | |
for i=1:max_dist | |
% Maximums | |
[a b] = find(threeD_wvectors(:,:,i) >= threshold_for_topN_pos); | |
pos_motif_count_at_each_distance = [pos_motif_count_at_each_distance; [i size(a,1)]]; | |
temp_motifs = {}; | |
for j=1:size(a) | |
temp_motifs{j} = [oligos(a(j),:) oligos(b(j),:)]; | |
end | |
max_motifs{i} = temp_motifs; | |
max_motif_weights = [max_motif_weights; diag(threeD_wvectors(a,b,i))]; | |
% Minimums | |
[a b] = find(threeD_wvectors(:,:,i) <= threshold_for_topN_neg); | |
neg_motif_count_at_each_distance = [neg_motif_count_at_each_distance; [i size(a,1)]]; | |
temp_motifs = {}; | |
for j=1:size(a) | |
temp_motifs{j} = [oligos(a(j),:) oligos(b(j),:)]; | |
end | |
min_motifs{i} = temp_motifs; | |
min_motif_weights = [min_motif_weights; diag(threeD_wvectors(a,b,i))]; | |
end | |
end | |
if max_or_topN == 0 | |
% Write motifs to file, also write info. distance, weights | |
% 1. Write positive motifs | |
pos_motifs_fname = strcat(folderName, '/motifs/pos_motifs_oligoLen', num2str(oligoLen), '_thresholding_top', num2str(topN), '.new_motifs_rank', num2str(rankId)); | |
fid = fopen(pos_motifs_fname, 'w'); | |
% INFO lines | |
fprintf(fid, '#File created by MATLAB script @author: snikumbh@mpi-inf.mpg.de\n'); | |
fprintf(fid, '#Motifs contributing towards classifying a sample as positive\n'); | |
fprintf(fid, '#\n'); | |
adder = 0; % required for indexing max_motif_weights | |
non_empty_count = 0; | |
fmt=[repmat('%1.4f ', 1, length(alphabet)) '\n']; | |
for i=1:length(max_motifs) | |
if ~isempty(max_motifs{i}) | |
non_empty_count = non_empty_count + 1; | |
fprintf(fid, '#Distance = %d, Max.score: %1.4f, Min.score: %1.4f\n#K-mers: ', i-1, ... | |
max( max_motif_weights(adder+1:adder+length(max_motifs{i}))), ... | |
min( max_motif_weights(adder+1:adder+length(max_motifs{i})))); | |
temp_pssm = zeros(length(alphabet), 2*oligoLen); | |
for j=1:length(max_motifs{i}) | |
fprintf(fid, '%s, ', max_motifs{i}{j}); | |
for k=1:(2*oligoLen) | |
temp_pssm( find(alphabet == max_motifs{i}{j}(k)), k) = temp_pssm( find(alphabet == max_motifs{i}{j}(k)), k) + max_motif_weights(adder + j); | |
end | |
pssm = bsxfun(@rdivide, temp_pssm, sum(temp_pssm)); | |
end | |
fprintf(fid, '\n'); | |
fprintf(fid, fmt, pssm); | |
else | |
fprintf(fid, '#Distance = %d, EMPTY', i-1); | |
end | |
fprintf(fid, '\n'); | |
adder = adder + length(max_motifs{i}); | |
end | |
fclose(fid); | |
fprintf('\n#%d positive motifs in: %s.\n', non_empty_count, pos_motifs_fname); | |
% 2. Write negative motifs | |
neg_motifs_fname = strcat(folderName, '/motifs/neg_motifs_oligoLen', num2str(oligoLen), '_thresholding_top', num2str(topN), '.new_motifs_rank', num2str(rankId)); | |
fid = fopen(neg_motifs_fname, 'w'); | |
% INFO lines | |
fprintf(fid, '#File created by MATLAB script @author: snikumbh@mpi-inf.mpg.de\n'); | |
fprintf(fid, '#Motifs contributing towards classifying a sample as negative\n'); | |
fprintf(fid, '#\n'); | |
adder = 0; % required for indexing max_motif_weights | |
non_empty_count = 0; | |
fmt=[repmat('%1.4f ', 1, length(alphabet)) '\n']; | |
for i=1:length(min_motifs) | |
if ~isempty(min_motifs{i}) | |
non_empty_count = non_empty_count + 1; | |
fprintf(fid, '#Distance = %d, Max.score: %1.4f, Min.score: %1.4f\n#K-mers: ', i-1, ... | |
max(min_motif_weights(adder+1:adder+length(min_motifs{i}))), ... | |
min(min_motif_weights(adder+1:adder+length(min_motifs{i})))); | |
temp_pssm = zeros(length(alphabet), 2*oligoLen); | |
for j=1:length(min_motifs{i}) | |
fprintf(fid, '%s, ', min_motifs{i}{j}); | |
for k=1:(2*oligoLen) | |
temp_pssm( find(alphabet == min_motifs{i}{j}(k)), k) = temp_pssm( find(alphabet == min_motifs{i}{j}(k)), k) + abs(min_motif_weights(adder + j)); | |
end | |
pssm = bsxfun(@rdivide, temp_pssm, sum(temp_pssm)); | |
end | |
fprintf(fid, '\n'); | |
fprintf(fid, fmt, pssm); | |
else | |
fprintf(fid, '#Distance = %d, EMPTY', i-1); | |
end | |
fprintf(fid, '\n'); | |
adder = adder + length(min_motifs{i}); | |
end | |
fclose(fid); | |
fprintf('#%d negative motifs in: %s.\n', non_empty_count, neg_motifs_fname); | |
end % max_or_topN condition ends | |
if max_or_topN | |
X = 0:(max_dist-1); | |
csv_fname = strcat(folderName, '/motifs/Max_and_Mins_oligolen', num2str(oligoLen), '_rank', num2str(rankId), '.csv'); | |
max_and_mins = [min_at_distances; max_at_distances]'; | |
csvwrite(csv_fname, max_and_mins); | |
Y = [min_at_distances; max_at_distances]'; | |
h = stem(X,Y); | |
h(1).Color = 'black'; | |
h(2).Color = 'black'; | |
view([90 90]); | |
grid on; | |
xlim([-1 max_dist]); | |
ylim([-1.0 1.0]); | |
% ylim([min(min_at_distances)-1.0 max(max_at_distances)+1.0]); | |
xstr = sprintf('Distances (in basepairs)'); | |
xlabel(xstr, 'Interpreter', 'latex', 'FontSize', 13); | |
ylabel('Weights', 'Interpreter', 'latex', 'FontSize', 13); | |
% [s,w] = strtok(cellline, '_'); | |
% celltype_name_for_title = char(s); | |
tstr = sprintf('Highest scoring motifs for kernel ranked %d', rankId); | |
tstr2 = strrep(tstr,'_','\_'); | |
title(tstr2, 'Interpreter', 'latex', 'FontSize', 14); | |
% Annotate with the oligomer-pair at the positions for which the maximum or minimum is recorded | |
% And place these annotations | |
Xd = get(h(1), 'XData'); | |
Y1d = get(h(1), 'YData'); | |
Y2d = get(h(2), 'YData'); | |
if oligoLen == 3 | |
plot_shift = 0.1; | |
else | |
plot_shift = 0.2; | |
end | |
for k = 1 : length(Xd) | |
% disp(upper(min_motifs(k,:))); | |
%if(k == oligoLen+1) | |
% %%this is odd distance value | |
% t1 = text(Xd(k), -0.6 + plot_shift, getColored(upper(min_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6); | |
%else | |
if(mod(k,2) ~= 0) %Y1d(k) - 0.1 or + 0.1 | |
if(Y1d(k) ~= 0.0) | |
%y-position : Y1d(k)-0.70 | |
t1 = text(Xd(k), -0.5, getColored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8); | |
%t1 = text(Xd(k), -0.65, 'Odd distances', 'HorizontalAlignment', 'center', 'FontSize', 6); | |
end | |
else | |
if(Y1d(k) ~= 0.0) | |
%y-position: Y1d(k)-0.70 | |
t1 = text(Xd(k), -0.7, getColored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8); | |
%t1 = text(Xd(k), -0.65+plot_shift, 'Odd distances', 'HorizontalAlignment', 'center', 'FontSize', 6); | |
end | |
end | |
%%set(t1, 'Rotation', 90); | |
%if(k == oligoLen+1) | |
% %%this is odd distance value | |
% t2 = text(Xd(k), 0.6 - plot_shift, getColored(upper(max_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6); | |
%else | |
if(mod(k,2) ~= 0) | |
if(Y2d(k) ~= 0.0) | |
t2 = text(Xd(k), 0.5, getColored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8); | |
end | |
else | |
if(Y2d(k) ~= 0.0) | |
t2 = text(Xd(k), 0.7, getColored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8); | |
end | |
end | |
%set(t2, 'Rotation', 90); | |
end | |
if oligoLen == 3 | |
t = text(length(Xd)/2, -0.85, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex'); | |
set(t, 'Rotation', 90); | |
t = text(length(Xd)/2, -0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex'); | |
set(t, 'Rotation', 90); | |
t = text(length(Xd)/2, 0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex'); | |
set(t, 'Rotation', 90); | |
t = text(length(Xd)/2, 0.85, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex'); | |
set(t, 'Rotation', 90); | |
else | |
t = text(length(Xd)/2, -0.9, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex'); | |
set(t, 'Rotation', 90); | |
t = text(length(Xd)/2, -0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex'); | |
set(t, 'Rotation', 90); | |
t = text(length(Xd)/2, 0.35, 'Even distances, $$d = \{2\bf{N}_{0}\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex'); | |
set(t, 'Rotation', 90); | |
t = text(length(Xd)/2, 0.9, 'Odd distances, $$d = \{2\bf{N}_{0}+1\}$$', 'HorizontalAlignment', 'center', 'FontSize', 8, 'Interpreter','latex'); | |
set(t, 'Rotation', 90); | |
end | |
%yyaxis right; | |
%new_ax = axes('ylim', [-1 max_dist], 'color', 'none', 'XTick', [], 'YAxisLocation', 'right'); | |
%set(new_ax, 'Ydir', 'reverse'); | |
pdf_fname = strcat(folderName, '/SeqLogos_MinMax_at_distances_rank', num2str(rankId), '_oligoLen', num2str(oligoLen)); | |
print(pdf_fname,'-dpdf', '-r900'); | |
% to produce EPS figures, use | |
% print(pdf_fname, '-depsc'); | |
pdfcrop_lines = [pdfcrop_lines; sprintf('pdfcrop %s\n', pdf_fname)]; | |
grid off; | |
else | |
%% Motif files are written | |
%% We will have R script running on it to generate sequence logos | |
end | |
if max_or_topN == 1 | |
% generate inidividual EPS figures | |
collated_pdf_name = strcat(folderName, '/MinMax_at_distances_oligoLen', num2str(oligoLen) ,'_combined_colored.pdf'); | |
run_this_cmd = collated_pdf_name; | |
else | |
run_this_cmd = ''; | |
all_thresholds = [neg_thresholds; pos_thresholds]'; | |
thresholds_fname = strcat(folderName, '/top', num2str(topN), '_thresholds_oligoLen', num2str(oligoLen),'.csv'); | |
csvwrite(thresholds_fname, all_thresholds); | |
end | |
end %function ends |