From 3aa7aa2960d9a9b42da834b0a6821585e35fa564 Mon Sep 17 00:00:00 2001 From: Sarvesh Prakash Nikumbh Date: Sat, 29 Jul 2017 00:31:50 +0200 Subject: [PATCH] Help functions for analyse and visualize_wvector functions. get_colored changed to getColored --- analyze_wvector.m | 114 ++++++++++------------------------ get_colored.m => getColored.m | 2 +- visualize_wvector.m | 89 +++++++++++++++++--------- 3 files changed, 96 insertions(+), 109 deletions(-) rename get_colored.m => getColored.m (95%) diff --git a/analyze_wvector.m b/analyze_wvector.m index 35f0d07..687dc1f 100644 --- a/analyze_wvector.m +++ b/analyze_wvector.m @@ -46,8 +46,6 @@ % % % Author: snikumbh@mpi-inf.mpg.de -% - totalArguments = 10; if nargin < totalArguments @@ -58,16 +56,10 @@ debugLevel = 2; end -%folder created in visualize_wvector.m -%if exist(strcat(folderName,'/motifs'), 'dir') == 7 -% %do-nothing -%else -% status = mkdir(strcat(folderName,'/motifs')); -%end pdfcrop_lines = []; alphabet_size = 4; alphabet = 'acgt'; -%max_dist = maxDist + 1; + % We will fetch max_dist from the weightvector length because the maxDist can be % different based on whether the given maxDist is same as the segment size or not. % This is more reliable. @@ -99,18 +91,16 @@ threeD_wvectors = zeros(mat_rows, mat_cols, mat_layers); for i= 0 : (alphabet_size^oligoLen)-1 start_pos = (i * (alphabet_size^oligoLen) * max_dist) + 1; - %disp(start_pos); for j=0:(alphabet_size^oligoLen)-1 jump_pos = j*max_dist; perfect_pos = start_pos + jump_pos; - %disp(start_pos); threeD_wvectors(i+1,j+1, :) = twoD_means(1, perfect_pos : perfect_pos + max_dist-1); %%Could use twoD_wvectors instead for individual vectors - end%inner for ends -end%outer for ends + end % inner for ends +end % outer for ends if(verbose) fprintf('2D wvector made into 3D\n'); end -%%generate a switch case or mapping function that maps 3-mers +% generate a switch case or mapping function that maps 3-mers nums = [0:(alphabet_size^oligoLen)-1]; nums = dec2base(nums,4); oligos = []; @@ -121,22 +111,16 @@ end oligos = [oligos; temp]; end -%%oligos would be a 64 x 3 char array for oligoLen 3 +% oligos would be a 64 x 3 char array for oligoLen 3 -%%Reduce or filter even further -%% 1. Pick every 10th distance OR -%% 2. get a threshold which separates the top-100 dimensions positive and negative separately sorted_wvectors = sort(twoD_means, 'descend'); -%topN = 100; threshold_for_topN_pos = sorted_wvectors(topN); -%%disp(sorted_wvectors(5+topN)); pos_thresholds = [pos_thresholds threshold_for_topN_pos]; sorted_wvectors = sort(twoD_means, 'ascend'); threshold_for_topN_neg = sorted_wvectors(topN); neg_thresholds = [neg_thresholds threshold_for_topN_neg]; -%max_or_topN = 0; %%1 for max, 0 for topN -%% 3. -%% Which distance has most positive or negative weight + +% Which distance has most positive or negative weight max_at_distances = []; max_motifs = {}; max_motif_weights = []; min_at_distances = []; min_motifs = {}; min_motif_weights = []; pos_motif_count_at_each_distance = []; @@ -144,31 +128,29 @@ if max_or_topN for i=1:max_dist - %% Maximums + % Maximums max_at_distances = [max_at_distances max(max(threeD_wvectors(:,:,i)))]; [a b] = find(threeD_wvectors(:,:,i) == max_at_distances(i)); max_motifs = [max_motifs; [oligos(a,:) oligos(b,:)]]; - %% Now minimums + % Minimums min_at_distances = [min_at_distances min(min(threeD_wvectors(:,:,i)))]; [a b] = find(threeD_wvectors(:,:,i) == min_at_distances(i)); min_motifs = [min_motifs; [oligos(a,:) oligos(b,:)]]; end else for i=1:max_dist - %% Maximums + % Maximums [a b] = find(threeD_wvectors(:,:,i) >= threshold_for_topN_pos); pos_motif_count_at_each_distance = [pos_motif_count_at_each_distance; [i size(a,1)]]; - %%max_motifs = [max_motifs; [oligos(a,:) oligos(b,:)]]; temp_motifs = {}; for j=1:size(a) temp_motifs{j} = [oligos(a(j),:) oligos(b(j),:)]; end max_motifs{i} = temp_motifs; max_motif_weights = [max_motif_weights; diag(threeD_wvectors(a,b,i))]; - %% Now minimums + % Minimums [a b] = find(threeD_wvectors(:,:,i) <= threshold_for_topN_neg); neg_motif_count_at_each_distance = [neg_motif_count_at_each_distance; [i size(a,1)]]; - %min_motifs = [min_motifs; [oligos(a,:) oligos(b,:)]]; temp_motifs = {}; for j=1:size(a) temp_motifs{j} = [oligos(a(j),:) oligos(b(j),:)]; @@ -179,15 +161,15 @@ end if max_or_topN == 0 - %% Write motifs to file, also write info. distance, weights - %% 1. Write positive motifs + % Write motifs to file, also write info. distance, weights + % 1. Write positive motifs pos_motifs_fname = strcat(folderName, '/motifs/pos_motifs_oligoLen', num2str(oligoLen), '_thresholding_top', num2str(topN), '.new_motifs_rank', num2str(rankId)); fid = fopen(pos_motifs_fname, 'w'); - %% INFO lines + % INFO lines fprintf(fid, '#File created by MATLAB script @author: snikumbh@mpi-inf.mpg.de\n'); fprintf(fid, '#Motifs contributing towards classifying a sample as positive\n'); fprintf(fid, '#\n'); - adder = 0; %%required for indexing max_motif_weights + adder = 0; % required for indexing max_motif_weights non_empty_count = 0; fmt=[repmat('%1.4f ', 1, length(alphabet)) '\n']; for i=1:length(max_motifs) @@ -214,14 +196,14 @@ end fclose(fid); fprintf('\n#%d positive motifs in: %s.\n', non_empty_count, pos_motifs_fname); - %% 2. Write negative motifs + % 2. Write negative motifs neg_motifs_fname = strcat(folderName, '/motifs/neg_motifs_oligoLen', num2str(oligoLen), '_thresholding_top', num2str(topN), '.new_motifs_rank', num2str(rankId)); fid = fopen(neg_motifs_fname, 'w'); - %% INFO lines + % INFO lines fprintf(fid, '#File created by MATLAB script @author: snikumbh@mpi-inf.mpg.de\n'); fprintf(fid, '#Motifs contributing towards classifying a sample as negative\n'); fprintf(fid, '#\n'); - adder = 0; %%required for indexing max_motif_weights + adder = 0; % required for indexing max_motif_weights non_empty_count = 0; fmt=[repmat('%1.4f ', 1, length(alphabet)) '\n']; for i=1:length(min_motifs) @@ -248,22 +230,15 @@ end fclose(fid); fprintf('#%d negative motifs in: %s.\n', non_empty_count, neg_motifs_fname); -end %%max_or_topN condition ends +end % max_or_topN condition ends if max_or_topN - %disp(max_motifs); - %max_motifs = max_motifs'; - %min_motifs = min_motifs'; - %disp(max_motifs); - %% May be we plot this information X = 0:(max_dist-1); csv_fname = strcat(folderName, '/motifs/Max_and_Mins_oligolen', num2str(oligoLen), '_rank', num2str(rankId), '.csv'); max_and_mins = [min_at_distances; max_at_distances]'; csvwrite(csv_fname, max_and_mins); Y = [min_at_distances; max_at_distances]'; - %disp(size(X)); - %disp(size(Y)); h = stem(X,Y); h(1).Color = 'black'; h(2).Color = 'black'; @@ -271,17 +246,17 @@ grid on; xlim([-1 max_dist]); ylim([-1.0 1.0]); - %ylim([min(min_at_distances)-1.0 max(max_at_distances)+1.0]); + % ylim([min(min_at_distances)-1.0 max(max_at_distances)+1.0]); xstr = sprintf('Distances (in basepairs)'); xlabel(xstr, 'Interpreter', 'latex', 'FontSize', 13); ylabel('Weights', 'Interpreter', 'latex', 'FontSize', 13); - %[s,w] = strtok(cellline, '_'); - %celltype_name_for_title = char(s); + % [s,w] = strtok(cellline, '_'); + % celltype_name_for_title = char(s); tstr = sprintf('Highest scoring motifs for kernel ranked %d', rankId); tstr2 = strrep(tstr,'_','\_'); title(tstr2, 'Interpreter', 'latex', 'FontSize', 14); - %% Annotate with the oligomer-pair at the positions for which the maximum or minimum is recorded - %% And place these annotations + % Annotate with the oligomer-pair at the positions for which the maximum or minimum is recorded + % And place these annotations Xd = get(h(1), 'XData'); Y1d = get(h(1), 'YData'); Y2d = get(h(2), 'YData'); @@ -294,33 +269,33 @@ % disp(upper(min_motifs(k,:))); %if(k == oligoLen+1) % %%this is odd distance value - % t1 = text(Xd(k), -0.6 + plot_shift, get_colored(upper(min_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6); + % t1 = text(Xd(k), -0.6 + plot_shift, getColored(upper(min_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6); %else if(mod(k,2) ~= 0) %Y1d(k) - 0.1 or + 0.1 if(Y1d(k) ~= 0.0) %y-position : Y1d(k)-0.70 - t1 = text(Xd(k), -0.5, get_colored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8); + t1 = text(Xd(k), -0.5, getColored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8); %t1 = text(Xd(k), -0.65, 'Odd distances', 'HorizontalAlignment', 'center', 'FontSize', 6); end else if(Y1d(k) ~= 0.0) %y-position: Y1d(k)-0.70 - t1 = text(Xd(k), -0.7, get_colored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8); + t1 = text(Xd(k), -0.7, getColored(upper(min_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'FontSize', 8); %t1 = text(Xd(k), -0.65+plot_shift, 'Odd distances', 'HorizontalAlignment', 'center', 'FontSize', 6); end end %%set(t1, 'Rotation', 90); %if(k == oligoLen+1) % %%this is odd distance value - % t2 = text(Xd(k), 0.6 - plot_shift, get_colored(upper(max_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6); + % t2 = text(Xd(k), 0.6 - plot_shift, getColored(upper(max_motifs(k,:)), 0), 'HorizontalAlignment', 'center', 'FontSize', 6); %else if(mod(k,2) ~= 0) if(Y2d(k) ~= 0.0) - t2 = text(Xd(k), 0.5, get_colored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8); + t2 = text(Xd(k), 0.5, getColored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8); end else if(Y2d(k) ~= 0.0) - t2 = text(Xd(k), 0.7, get_colored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8); + t2 = text(Xd(k), 0.7, getColored(upper(max_motifs(k,:)), oligoLen), 'HorizontalAlignment', 'center', 'Fontsize', 8); end end %set(t2, 'Rotation', 90); @@ -352,38 +327,17 @@ pdf_fname = strcat(folderName, '/SeqLogos_MinMax_at_distances_rank', num2str(rankId), '_oligoLen', num2str(oligoLen)); print(pdf_fname,'-dpdf', '-r900', '-bestfit'); - %print(pdf_fname, '-depsc'); + % to produce EPS figures, use + % print(pdf_fname, '-depsc'); pdfcrop_lines = [pdfcrop_lines; sprintf('pdfcrop %s\n', pdf_fname)]; grid off; else %% Motif files are written - %% We will have R script running on it to generate sequencelogos - - %%Write out the motifs, and their weights to a file, which is read from R and plot logos - %for k = 1 : length(Xd) -% %disp(min_motifs{k}); -% if ~isempty(min_motifs{k}) -% fprintf('Min = %d--', k); -% t1 = text(Xd(k), Y1d(k) - 0.1, seqlogo(min_motifs{k}), 'HorizontalAlignment', 'center', 'FontSize', 6); -% else -% fprintf('Min = %d-EMPTY-\n', k); -% disp(min_motifs{k}); -% end - %%set(t1, 'Rotation', 90); - %if ~isempty(max_motifs{k}) - % fprintf('Max = %d--\n', k); - % t2 = text(Xd(k), Y2d(k) + 0.1, seqlogo(max_motifs{k}), 'HorizontalAlignment', 'center', 'Fontsize', 6); - % disp(max_motifs{k}); - %else - % fprintf('Max = %d-Empty-\n', k); - % disp(max_motifs{k}); - %end - %%set(t2, 'Rotation', 90); -% end + %% We will have R script running on it to generate sequence logos end if max_or_topN == 1 - %% generate inidividual EPS figures + % generate inidividual EPS figures collated_pdf_name = strcat(folderName, '/MinMax_at_distances_oligoLen', num2str(oligoLen) ,'_combined_colored.pdf'); run_this_cmd = collated_pdf_name; else diff --git a/get_colored.m b/getColored.m similarity index 95% rename from get_colored.m rename to getColored.m index 3665c7a..d48537c 100644 --- a/get_colored.m +++ b/getColored.m @@ -1,4 +1,4 @@ -function [colored_string] = get_colored(given_string, oligoLen) +function [colored_string] = getColored(given_string, oligoLen) % GETCOLORED % Colors the given string (DNA motif) using the IUPAC color scheme for the nucleotides A, C, G, T. diff --git a/visualize_wvector.m b/visualize_wvector.m index 355e6e2..4270985 100644 --- a/visualize_wvector.m +++ b/visualize_wvector.m @@ -1,12 +1,49 @@ function [] = visualize_wvector(folderName, oligoLen, maxDist, subkernelWeights, allSeqsAsBags, rawConformedSetKernel, allSeqsTransformationKernel, debugLevel, debugMsgLocation) -% 1. We have subkernel weights given, all corresponding weight vectors are to be computed -% 2. Alpha values are given from the trained model -% 3. Location for alpha values: folderName specified in arguments -% 4. Output the list of ranked weight vectors, specifying the individual weights doesn't seem -% seem to serve any/much purpose -% can't work without folderName -% thus, there are no default set of values of subkernelWeights +% VISUALIZE_WVECTOR +% For the given weight vector, plot distance-centric k-mer visualizations with +% the help of analyse_wvector +% +% INPUT PARAMS +% Param 'folderName' (string) +% Location/Path on disk of the output folder +% +% Param 'oligoLen' +% Oligomer length for the ODH representation +% +% Param 'maxDist' +% Maximum distance value for the ODH representation +% +% Param 'subkernelWeights' (vector) +% Weights assigned to the sub-kernel by MKL +% +% Param 'allSeqsAsBags' (cell array) +% Collection of all sequences +% +% Param 'rawConformedSetKernel' (matrix) +% Unnormalized conformally transformed MI kernel +% +% Param 'allSeqsTransformationKernel' (matrix) +% Transformation for all instances of all sequences +% +% Param 'debugLevel' +% +% Param 'debugMsgLocation' +% +% OUTPUT PARAMS +% +% ADDITIONAL NOTES +% -- We have subkernel weights given, all corresponding weight vectors are to be computed +% -- Alpha values are given from the trained model +% -- Location for alpha values: folderName specified in arguments +% -- Output the list of ranked weight vectors, specifying the individual +% weights doesn't seem to serve any/much purpose +% -- Can't work without folderName, thus, there are no default set of values +% of subkernelWeights +% +% +% Author: snikumbh@mpi-inf.mpg.de + totalArguments = 9; if nargin < totalArguments debugMsgLocation = 1; @@ -16,32 +53,33 @@ debugLevel = 2; end +max_or_topN = 1; +% max_or_topN to be set to 1 for AMPD visualization, 0 for topN visualization +topN = 25; % Although this value is set, it is used when max_or_topN is set to 0 + alphaFilename = 'alphas.txt'; svFilename = 'sv.txt'; alphaValues = dlmread(strcat(folderName, '/', alphaFilename)); alphaIndices = dlmread(strcat(folderName, '/', svFilename)); alphaIndices = alphaIndices + 1; %these are nothing but bagIndices % -% Compute the weight vectors corresponding to all subkernels, keep them in the same order as the kernels -% % 3. read in the output from the python script and -% return back the values. +% Compute the weight vectors corresponding to all subkernels, keep them in the +% same order as the kernels +% -- read in the output from the python script and return back the values. % nClusters = size(subkernelWeights, 1); nKernels = nClusters; -% prepare bag-level phi, restrict only to support vectorsa +% prepare bag-level phi, restrict only to support vectors % bagPhi is nSVBags-by-nClusters bagPhi = zeros( size(allSeqsAsBags{1},1), size(alphaIndices,1), nClusters); idx = 0; for i=1:length(allSeqsAsBags)% this is nBags - %restrict to sv indices + % restrict to sv indices if any(i == alphaIndices) - idx = idx + 1;%this is second dimension of the bagPhi, ranges from 1 to #alphaIndices + idx = idx + 1;% this is second dimension of the bagPhi, ranges from 1 to #alphaIndices for j=1:nClusters - %allSeqs NonShifted/Shifted TransformationKernel already has nInstances x nClusters - %disp(size(bagPhiNonShifted(:, idx, j))); - %bagPhiNonShifted(:, idx, j) = allSeqsNonShiftedTransformationKernel{i}(:,j); - %bagPhiShifted(:, idx, j) = allSeqsShiftedTransformationKernel{i}(:,j); + % allSeqs TransformationKernel already has nInstances x nClusters bagPhi(:, idx, j) = (1/sqrt(rawConformedSetKernel{j}(i,i))) * sum(allSeqsAsBags{i} .* repmat(allSeqsTransformationKernel{i}(:,j)', size(allSeqsAsBags{i},1), 1), 2); end end @@ -62,22 +100,17 @@ rankedOrderWeightVectors = weightVectors(:,rankedIndices'); overallWeightVector = sum(weightVectors,2); -topN = 25; verbose_for_analyze_wvector_call = 0; if exist(strcat(folderName,'/motifs'), 'dir') == 7 %do-nothing else status = mkdir(strcat(folderName,'/motifs')); end -max_or_topN = 1; % 1 for max, 0 for topN %pdfcrop_lines = []; -%fprintf('\npdftk '); for i=1:nKernels - %fprintf('In analyze_wvector kernel-id %d rank %d\n', rankedIndices(i), i); - %analyze_wvectors(subkernelWeights, oligoLen, max_dist, topN, verbose) [collated_pdf_name, pdfcrop_line] = analyze_wvector(folderName, i, weightVectors(:, rankedIndices(i))' , oligoLen, maxDist, topN, verbose_for_analyze_wvector_call, max_or_topN, debugLevel, debugMsgLocation); logMessages(debugMsgLocation, sprintf('%s-crop.pdf ', [pdfcrop_line(1,9:end-1)]), debugLevel); - %% Plotting l2-norm heatmaps of weight vector + % Plotting l2-norm heatmaps of weight vector inputCSVFile = strcat(folderName, '/weightVectorNormed_rank', num2str(i), '.csv'); [status, output] = system(['Rscript plot_heatmap.R ' inputCSVFile ' ' folderName ' ' num2str(i) ' ' num2str(oligoLen)]); logMessages(debugMsgLocation, sprintf('R script to plot heatmaps\n'), debugLevel); @@ -90,11 +123,11 @@ % Analyze weight vector for the combined kernel [collated_pdf_name, pdfcrop_line] = analyze_wvector(folderName, 0, overallWeightVector' , oligoLen, maxDist, topN, verbose_for_analyze_wvector_call, max_or_topN, debugLevel, debugMsgLocation); -%% Combine/append all PDFs -%fprintf('\npdftk '); -%for i= 1:size(pdfcrop_lines,1) +% Combine/append all PDFs +% fprintf('\npdftk '); +% for i= 1:size(pdfcrop_lines,1) % fprintf('%s-crop.pdf ', [pdfcrop_lines(i,9:end-1)]); -%end -%fprintf('cat output %s\n\n', collated_pdf_name); +% end +% fprintf('cat output %s\n\n', collated_pdf_name); end %function end