Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Help functions for analyse and visualize_wvector functions. get_color…
…ed changed to getColored
  • Loading branch information
snikumbh committed Jul 28, 2017
1 parent 34bf9c0 commit 3aa7aa2
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 109 deletions.
114 changes: 34 additions & 80 deletions analyze_wvector.m
Expand Up @@ -46,8 +46,6 @@
%
%
% Author: snikumbh@mpi-inf.mpg.de
%


totalArguments = 10;
if nargin < totalArguments
Expand All @@ -58,16 +56,10 @@
debugLevel = 2;
end

%folder created in visualize_wvector.m
%if exist(strcat(folderName,'/motifs'), 'dir') == 7
% %do-nothing
%else
% status = mkdir(strcat(folderName,'/motifs'));
%end
pdfcrop_lines = [];
alphabet_size = 4;
alphabet = 'acgt';
%max_dist = maxDist + 1;

% We will fetch max_dist from the weightvector length because the maxDist can be
% different based on whether the given maxDist is same as the segment size or not.
% This is more reliable.
Expand Down Expand Up @@ -99,18 +91,16 @@
threeD_wvectors = zeros(mat_rows, mat_cols, mat_layers);
for i= 0 : (alphabet_size^oligoLen)-1
start_pos = (i * (alphabet_size^oligoLen) * max_dist) + 1;
%disp(start_pos);
for j=0:(alphabet_size^oligoLen)-1
jump_pos = j*max_dist;
perfect_pos = start_pos + jump_pos;
%disp(start_pos);
threeD_wvectors(i+1,j+1, :) = twoD_means(1, perfect_pos : perfect_pos + max_dist-1); %%Could use twoD_wvectors instead for individual vectors
end%inner for ends
end%outer for ends
end % inner for ends
end % outer for ends
if(verbose)
fprintf('2D wvector made into 3D\n');
end
%%generate a switch case or mapping function that maps 3-mers
% generate a switch case or mapping function that maps 3-mers
nums = [0:(alphabet_size^oligoLen)-1];
nums = dec2base(nums,4);
oligos = [];
Expand All @@ -121,54 +111,46 @@
end
oligos = [oligos; temp];
end
%%oligos would be a 64 x 3 char array for oligoLen 3
% oligos would be a 64 x 3 char array for oligoLen 3

%%Reduce or filter even further
%% 1. Pick every 10th distance OR
%% 2. get a threshold which separates the top-100 dimensions positive and negative separately
sorted_wvectors = sort(twoD_means, 'descend');
%topN = 100;
threshold_for_topN_pos = sorted_wvectors(topN);
%%disp(sorted_wvectors(5+topN));
pos_thresholds = [pos_thresholds threshold_for_topN_pos];
sorted_wvectors = sort(twoD_means, 'ascend');
threshold_for_topN_neg = sorted_wvectors(topN);
neg_thresholds = [neg_thresholds threshold_for_topN_neg];
%max_or_topN = 0; %%1 for max, 0 for topN
%% 3.
%% Which distance has most positive or negative weight

% Which distance has most positive or negative weight
max_at_distances = []; max_motifs = {}; max_motif_weights = [];
min_at_distances = []; min_motifs = {}; min_motif_weights = [];
pos_motif_count_at_each_distance = [];
neg_motif_count_at_each_distance = [];

if max_or_topN
for i=1:max_dist
%% Maximums
% Maximums
max_at_distances = [max_at_distances max(max(threeD_wvectors(:,:,i)))];
[a b] = find(threeD_wvectors(:,:,i) == max_at_distances(i));
max_motifs = [max_motifs; [oligos(a,:) oligos(b,:)]];
%% Now minimums
% Minimums
min_at_distances = [min_at_distances min(min(threeD_wvectors(:,:,i)))];
[a b] = find(threeD_wvectors(:,:,i) == min_at_distances(i));
min_motifs = [min_motifs; [oligos(a,:) oligos(b,:)]];
end
else
for i=1:max_dist
%% Maximums
% Maximums
[a b] = find(threeD_wvectors(:,:,i) >= threshold_for_topN_pos);
pos_motif_count_at_each_distance = [pos_motif_count_at_each_distance; [i size(a,1)]];
%%max_motifs = [max_motifs; [oligos(a,:) oligos(b,:)]];
temp_motifs = {};
for j=1:size(a)
temp_motifs{j} = [oligos(a(j),:) oligos(b(j),:)];
end
max_motifs{i} = temp_motifs;
max_motif_weights = [max_motif_weights; diag(threeD_wvectors(a,b,i))];
%% Now minimums
% Minimums
[a b] = find(threeD_wvectors(:,:,i) <= threshold_for_topN_neg);
neg_motif_count_at_each_distance = [neg_motif_count_at_each_distance; [i size(a,1)]];
%min_motifs = [min_motifs; [oligos(a,:) oligos(b,:)]];
temp_motifs = {};
for j=1:size(a)
temp_motifs{j} = [oligos(a(j),:) oligos(b(j),:)];
Expand All @@ -179,15 +161,15 @@
end

if max_or_topN == 0
%% Write motifs to file, also write info. distance, weights
%% 1. Write positive motifs
% Write motifs to file, also write info. distance, weights
% 1. Write positive motifs
pos_motifs_fname = strcat(folderName, '/motifs/pos_motifs_oligoLen', num2str(oligoLen), '_thresholding_top', num2str(topN), '.new_motifs_rank', num2str(rankId));
fid = fopen(pos_motifs_fname, 'w');
%% INFO lines
% INFO lines
fprintf(fid, '#File created by MATLAB script @author: snikumbh@mpi-inf.mpg.de\n');
fprintf(fid, '#Motifs contributing towards classifying a sample as positive\n');
fprintf(fid, '#\n');
adder = 0; %%required for indexing max_motif_weights
adder = 0; % required for indexing max_motif_weights
non_empty_count = 0;
fmt=[repmat('%1.4f ', 1, length(alphabet)) '\n'];
for i=1:length(max_motifs)
Expand All @@ -214,14 +196,14 @@
end
fclose(fid);
fprintf('\n#%d positive motifs in: %s.\n', non_empty_count, pos_motifs_fname);
%% 2. Write negative motifs
% 2. Write negative motifs
neg_motifs_fname = strcat(folderName, '/motifs/neg_motifs_oligoLen', num2str(oligoLen), '_thresholding_top', num2str(topN), '.new_motifs_rank', num2str(rankId));
fid = fopen(neg_motifs_fname, 'w');
%% INFO lines
% INFO lines
fprintf(fid, '#File created by MATLAB script @author: snikumbh@mpi-inf.mpg.de\n');
fprintf(fid, '#Motifs contributing towards classifying a sample as negative\n');
fprintf(fid, '#\n');
adder = 0; %%required for indexing max_motif_weights
adder = 0; % required for indexing max_motif_weights
non_empty_count = 0;
fmt=[repmat('%1.4f ', 1, length(alphabet)) '\n'];
for i=1:length(min_motifs)
Expand All @@ -248,40 +230,33 @@
end
fclose(fid);
fprintf('#%d negative motifs in: %s.\n', non_empty_count, neg_motifs_fname);
end %%max_or_topN condition ends
end % max_or_topN condition ends

if max_or_topN
%disp(max_motifs);
%max_motifs = max_motifs';
%min_motifs = min_motifs';
%disp(max_motifs);
%% May be we plot this information
X = 0:(max_dist-1);
csv_fname = strcat(folderName, '/motifs/Max_and_Mins_oligolen', num2str(oligoLen), '_rank', num2str(rankId), '.csv');
max_and_mins = [min_at_distances; max_at_distances]';
csvwrite(csv_fname, max_and_mins);

Y = [min_at_distances; max_at_distances]';
%disp(size(X));
%disp(size(Y));
h = stem(X,Y);
h(1).Color = 'black';
h(2).Color = 'black';
view([90 90]);
grid on;
xlim([-1 max_dist]);
ylim([-1.0 1.0]);
%ylim([min(min_at_distances)-1.0 max(max_at_distances)+1.0]);
% ylim([min(min_at_distances)-1.0 max(max_at_distances)+1.0]);
xstr = sprintf('Distances (in basepairs)');
xlabel(xstr, 'Interpreter', 'latex', 'FontSize', 13);
ylabel('Weights', 'Interpreter', 'latex', 'FontSize', 13);
%[s,w] = strtok(cellline, '_');
%celltype_name_for_title = char(s);
% [s,w] = strtok(cellline, '_');
% celltype_name_for_title = char(s);
tstr = sprintf('Highest scoring motifs for kernel ranked %d', rankId);
tstr2 = strrep(tstr,'_','\_');
title(tstr2, 'Interpreter', 'latex', 'FontSize', 14);
%% Annotate with the oligomer-pair at the positions for which the maximum or minimum is recorded
%% And place these annotations
% Annotate with the oligomer-pair at the positions for which the maximum or minimum is recorded
% And place these annotations
Xd = get(h(1), 'XData');
Y1d = get(h(1), 'YData');
Y2d = get(h(2), 'YData');
Expand All @@ -294,33 +269,33 @@
% disp(upper(min_motifs(k,:)));
%if(k == oligoLen+1)
% %%this is odd distance value
% t1 = text(Xd(k), -0.6 + plot_shift, get_colored(upper(min_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6);
% t1 = text(Xd(k), -0.6 + plot_shift, getColored(upper(min_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6);
%else
if(mod(k,2) ~= 0) %Y1d(k) - 0.1 or + 0.1
if(Y1d(k) ~= 0.0)
%y-position : Y1d(k)-0.70
t1 = text(Xd(k), -0.5, get_colored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8);
t1 = text(Xd(k), -0.5, getColored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8);
%t1 = text(Xd(k), -0.65, 'Odd distances', 'HorizontalAlignment', 'center', 'FontSize', 6);
end
else
if(Y1d(k) ~= 0.0)
%y-position: Y1d(k)-0.70
t1 = text(Xd(k), -0.7, get_colored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8);
t1 = text(Xd(k), -0.7, getColored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8);
%t1 = text(Xd(k), -0.65+plot_shift, 'Odd distances', 'HorizontalAlignment', 'center', 'FontSize', 6);
end
end
%%set(t1, 'Rotation', 90);
%if(k == oligoLen+1)
% %%this is odd distance value
% t2 = text(Xd(k), 0.6 - plot_shift, get_colored(upper(max_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6);
% t2 = text(Xd(k), 0.6 - plot_shift, getColored(upper(max_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6);
%else
if(mod(k,2) ~= 0)
if(Y2d(k) ~= 0.0)
t2 = text(Xd(k), 0.5, get_colored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8);
t2 = text(Xd(k), 0.5, getColored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8);
end
else
if(Y2d(k) ~= 0.0)
t2 = text(Xd(k), 0.7, get_colored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8);
t2 = text(Xd(k), 0.7, getColored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8);
end
end
%set(t2, 'Rotation', 90);
Expand Down Expand Up @@ -352,38 +327,17 @@

pdf_fname = strcat(folderName, '/SeqLogos_MinMax_at_distances_rank', num2str(rankId), '_oligoLen', num2str(oligoLen));
print(pdf_fname,'-dpdf', '-r900', '-bestfit');
%print(pdf_fname, '-depsc');
% to produce EPS figures, use
% print(pdf_fname, '-depsc');
pdfcrop_lines = [pdfcrop_lines; sprintf('pdfcrop %s\n', pdf_fname)];
grid off;
else
%% Motif files are written
%% We will have R script running on it to generate sequencelogos

%%Write out the motifs, and their weights to a file, which is read from R and plot logos
%for k = 1 : length(Xd)
% %disp(min_motifs{k});
% if ~isempty(min_motifs{k})
% fprintf('Min = %d--', k);
% t1 = text(Xd(k), Y1d(k) - 0.1, seqlogo(min_motifs{k}), 'HorizontalAlignment', 'center', 'FontSize', 6);
% else
% fprintf('Min = %d-EMPTY-\n', k);
% disp(min_motifs{k});
% end
%%set(t1, 'Rotation', 90);
%if ~isempty(max_motifs{k})
% fprintf('Max = %d--\n', k);
% t2 = text(Xd(k), Y2d(k) + 0.1, seqlogo(max_motifs{k}), 'HorizontalAlignment', 'center', 'Fontsize', 6);
% disp(max_motifs{k});
%else
% fprintf('Max = %d-Empty-\n', k);
% disp(max_motifs{k});
%end
%%set(t2, 'Rotation', 90);
% end
%% We will have R script running on it to generate sequence logos
end

if max_or_topN == 1
%% generate inidividual EPS figures
% generate inidividual EPS figures
collated_pdf_name = strcat(folderName, '/MinMax_at_distances_oligoLen', num2str(oligoLen) ,'_combined_colored.pdf');
run_this_cmd = collated_pdf_name;
else
Expand Down
2 changes: 1 addition & 1 deletion get_colored.m → getColored.m
@@ -1,4 +1,4 @@
function [colored_string] = get_colored(given_string, oligoLen)
function [colored_string] = getColored(given_string, oligoLen)

% GETCOLORED
% Colors the given string (DNA motif) using the IUPAC color scheme for the nucleotides A, C, G, T.
Expand Down

0 comments on commit 3aa7aa2

Please sign in to comment.