Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Help for functions added for many smaller functions
  • Loading branch information
snikumbh committed Jul 28, 2017
1 parent 1ae57b9 commit 4d48120
Show file tree
Hide file tree
Showing 14 changed files with 342 additions and 102 deletions.
49 changes: 49 additions & 0 deletions analyze_wvector.m
@@ -1,5 +1,54 @@
function [run_this_cmd, pdfcrop_lines] = analyse_wvector(folderName, rankId, wvector, oligoLen, maxDist, topN, verbose, max_or_topN, debugLevel, debugMsgLocation)

% ANALYSE_WVECTOR
% Analyses the weight vector and plot distance-centric k-mer visualizations
%
% INPUT PARAMS
% Param 'folderName'
% Location/Path on disk of the output folder
%
% Param 'rankId'
% Rank of the weight vector in CoMIK
%
% Param 'wvector'
% The weight vector itself which is to be used here
%
% Param 'oligoLen'
% Oligomer length for the ODH representation
%
% Param 'maxDist'
% Maximum distance value for the ODH representation
%
% Param 'topN'
% Specify a value for topN
%
% Param 'verbose'
% Set to 1 for verbose output
%
% Param 'max_or_topN'
% Flag to specify if topN visualization or max (Absolute Max Per Distance) visualization is to be performed
% Refer to Nikumbh and Pfeifer, BMC Bioinformatics, 2017.
%
% Param 'debugLevel'
%
% Param 'debugMsgLocation'
%
% OUTPUT PARAMS
% Param 'run_this_cmd'
% This could be used to create a cmd that is to be run after the program ends to
% stitch together pages from the separate PDF files
%
% Param 'pdfcrop_lines'
% Cropping the PDF pages/files to cut out the unused area
%
%
% ADDITIONAL NOTES
%
%
% Author: snikumbh@mpi-inf.mpg.de
%


totalArguments = 10;
if nargin < totalArguments
debugMsgLocation = 1;
Expand Down
41 changes: 33 additions & 8 deletions applyTransformation.m
@@ -1,14 +1,39 @@
function [transformationKernel, instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation)
function [instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation)

% APPLYTRANSFORMATION
% Transforms the individual segments w.r.t. the cluster centres obtained by k-means
%
% we apply Gaussian tranformation
% INPUT PARAMS
% Param 'allSeqsAsBags' (Cell array)
% Collection of all segments per sequence
%
% Param 'clusterCentres' (matrix)
% A matrix where each cluster centre is represented row-wise; dimensions: nClusters x FVlength
%
% Param 'sig'
% Gaussian bandwidth value
%
% Param 'debugLevel'
%
% Param 'debugMsgLocation'
%
% OUTPUT PARAMS
% Param 'instancesTransformationKernel'
% Transformation of the instances w.r.t. the cluster centres
%
% ADDITIONAL NOTES
%
% Author: snikumbh@mpi-inf.mpg.de


% We apply Gaussian transformation
%

nBags = size(allSeqsAsBags, 2);
transformationKernel = zeros(nBags, size(clusterCentres, 1)); % bag level information
% transformationKernel = zeros(nBags, size(clusterCentres, 1)); % bag level information
nClusters = size(clusterCentres,1);
FVDim1 = size(clusterCentres, 2);
do_reshaped = 1;
do_reshaped = 1; % by default, we use the reshaped version. This is tested to be the fastest in most experiments.
if ~issparse(allSeqsAsBags{1}) && do_reshaped == 1
logMessages(debugMsgLocation, sprintf('Doing reshaped version..\n'), debugLevel);
for i=1:nBags
Expand All @@ -24,15 +49,15 @@
end
elseif issparse(allSeqsAsBags{1}) && do_reshaped == 1
logMessages(debugMsgLocation, sprintf('Doing reshaped version for sparse vectors..\n'), debugLevel);
% Because N-D sparse arrays are not possible
% Hence, we opt to reshape/repmat the clusterCentres instead of the sparse FVs
% Because N-D sparse arrays are not possible, hence, we opt to reshape/repmat
% the clusterCentres instead of the sparse FVs
for i=1:nBags
% length of instancesTransformationKernel is nBags
% dimensions of each instancesTransformationKernel nInstances-by-nClusters
numOfInstances = size(allSeqsAsBags{i},2);
instancesTransformationKernel{i} = zeros(numOfInstances, nClusters);
reshapedClusterCentres = reshape(repmat(clusterCentres, numOfInstances,1), FVDim1, numOfInstances, nClusters);
%dim are now nClusters-by-weightVectorDim-by-nClusters
% dims are now FVDim1-by-numOfInstances-by-nClusters
temp = bsxfun(@minus, allSeqsAsBags{i}, reshapedClusterCentres);
temp = sum(temp.^2,1);
temp = exp(-0.5*temp./sig^2);
Expand All @@ -46,7 +71,7 @@
temp = sum(temp'.^2,1);
temp = exp(-0.5*temp./sig^2);
instancesTransformationKernelA{i}(:,j) = temp';
transformationKernel(i,j) = sum(temp(:));
% transformationKernel(i,j) = sum(temp(:));
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion comik_main_with_weight_vector.m
Expand Up @@ -420,7 +420,7 @@
%
tic;
logMessages(debugMsgLocation, sprintf('Applying transformation to the test samples, this might take some time...\n'), debugLevel);
[~, allSeqsTransformationKernel_Test] = applyTransformation(allSeqsAsBags_Test, clusterCentres, conformalXformationParam, debugLevel, debugMsgLocation);
[allSeqsTransformationKernel_Test] = applyTransformation(allSeqsAsBags_Test, clusterCentres, conformalXformationParam, debugLevel, debugMsgLocation);
logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel);
%
% Getting the instance weightings for the test examples
Expand Down
3 changes: 1 addition & 2 deletions computeConformedMultiInstanceKernel.m
Expand Up @@ -159,9 +159,8 @@
% transformationKernel is Gaussian RBF
tic;
logMessages(debugMsgLocation, sprintf('--- applying transformation...'), debugLevel);
[transformationKernel, instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation);
[instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation);
logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel);
% transformationKernel is nBags-by-nClusters
% instancesTransformationKernel is cell array for number of bags
% instancesTransformationKernel{eachBag} is nInstanceInBag-by-nClusters

Expand Down
5 changes: 0 additions & 5 deletions getCompleteSequenceODHFeatureVec.m

This file was deleted.

43 changes: 30 additions & 13 deletions getExpansionPoints.m
@@ -1,24 +1,41 @@
function [expansionPoints, matrixOfDistancesFromCentres] = getExpansionPoints(X, nClusters, rep, debugLevel, debugMsgLocation)

% Params
% X : All bags opened up into instances
% nClusters : #clusters
% GETEXPANSIONPOINTS
% Performs k-means and returns the clusterCentres as the expansion points
%
% INPUT PARAMS
% Param X (matrix)
% All bags opened up into instances
%
% Param nClusters (scalar)
% Number of clusters for k-means
%
% Returns
% a set of expansionPoints
% OUTPUT PARAMS
% Param 'expansionPoints' (matrix)
% a set of expansionPoints, in other words cluster centres
%
% Param 'matrixOfDistancesFromCentres' (matrix)
% As the name suggests, an n-by-k matrix of distances from each point to every cluster centre
%
% ADDITIONAL NOTES
%
% Author: snikumbh@mpi-inf.mpg.de

logMessages(debugMsgLocation, sprintf('--- Obtaining a set of expansion points '), debugLevel);
% obtained X is an n x p matrix, ready for kmeans
% expansionPoints is a k-by-p matrix
% D is a n-by-k matrix giving distances from each
% point to every centroid. This will be useful
% if we use Gaussian form for the transformation
% function
% Obtained X in the input arguments is an n x p matrix, ready for kmeans
% expansionPoints, to be returned, is a k-by-p matrix

% Below, matrixOfDistancesFromCentres is a n-by-k matrix giving distances from each point to every
% cluster centre.
%
% sumD, from the Matlab documentation, is the within-cluster sums of point-to-centroid distances in a k-by-1 vector
%

tic;
rng('default'); % or can be set to 'default'
rng('default');
logMessages(debugMsgLocation, sprintf('with K-means: \n'), debugLevel);
if debugLevel == 2
[idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000);%Display: final
[idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','on');%, 'MaxIter',1000);%Display: final
elseif debugLevel == 0
[idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000);
end
Expand Down
45 changes: 33 additions & 12 deletions getInstanceWeights.m
@@ -1,12 +1,33 @@
function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel)

% GETINSTANCEWEIGHTS
% Computes the theta values and instance weights from the sub-kernel weights and segment
% transformations
%
% INPUT PARAMS
% Param 'kernelWeights'
% Weights assigned by MKL to each of the sub-kernels in the collection
%
% Param 'allSeqsTransformationKernel'
% Per sequence transformation of segments used to compute the instance weights
%
% OUTPUT PARAMS
% Param 'thetaVals'
% The theta values corresponding to each cluster centre
%
% Param 'instanceWeightsInEachBag'
% Weight for each segment per sequence
%
% ADDITIONAL NOTES
% -- Theta vals and sub-kernel weights follow a squared relation (Refer paper)
%
% -- Order followed is:
% . non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels
%
% -- Number of kernels is same as number of clusters
%
% Return the theta values and instance weights
% for all bags
%
% following a squared relation
% order followed is: non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels
% Edit: now number of kernels is same as number of clusters
% Author: snikumbh@mpi-inf.mpg.de

thetaVals = sqrt(kernelWeights);
thetaVals = thetaVals';
Expand All @@ -23,13 +44,13 @@
size(allSeqsTransformationKernel{j},1), 1) ...
.* allSeqsTransformationKernel{j}, 2);
instanceWeightsInEachBag{j} = temp;
%% if to be written to file, pass fname to the function
%stdTemp = standardizeMatrix(temp');
%finalTemp = zeros(size(stdTemp));
%% Using rankings
%[~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend');
%finalTemp(1,ranking) = 1:size(stdTemp,2);
%dlmwrite(fname, finalTemp, '-append');
% if to be written to file, pass fname to the function
% stdTemp = standardizeMatrix(temp');
% finalTemp = zeros(size(stdTemp));
% Using rankings
% [~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend');
% finalTemp(1,ranking) = 1:size(stdTemp,2);
% dlmwrite(fname, finalTemp, '-append');
end


Expand Down
80 changes: 51 additions & 29 deletions getODHFeatureVecInstances.m
@@ -1,36 +1,58 @@
function [ODHFeatureVecInstances, numOfSegments] = getODHFeatureVecInstances(seq, oligoLen, maxDist, segmentSizeInPercentage, segmentSizeInBps, segmentingStart, alphabet, debug)

% Author: snikumbh
% GETODHFEATUREVECINSTANCES
%
% Params
% 1. seq: candidate sequence (length L given in #base pairs)
% 2. oligoLen: oligomer length for ODH (K, default: 2)
% 3. maxDist: maximum distance value to consider for ODH;
% INPUT PARAMS
% Param 'seq'
% candidate sequence (length L given in #base pairs)
%
% Param 'oligoLen'
% Oligomer length for ODH (K, default: 2)
%
% Param 'maxDist'
% Maximum distance value to consider for ODH;
% (default: 50)
% 4. segmentSizeInPercentage: Percentage value used to segment
% the complete sequence; set this
% to 0 if percentage-based segmenting
% is not to be used (default: 0)
% 5. segmentSizeInBps: If percentage value for segmenting is 0
% this has to be non-zero and less than
% complete sequence length. It is given
% in number of basepairs. (default:
% 100 bps)
% 6. segmentingStart: 'shift' specifies shifting the segmentStart
% to (round(segmentSizeInBps/2) + 1);
% otherwise 'no-shift' (default)
% 7. alphabet: DNA/Protein sequence alphabet (default: DNA)
%
%
% Returns
% the ODH FeatureVec representation of (ordered) instances
% (segments) of the sequence supplied; each in sparse format.
% Sparsify the end result again? May be not needed.
% [These can be concatenated to obtain the one final modifiedODHFeatureVec]
%
% Param 'segmentSizeInPercentage'
% Percentage value used to segment the complete sequence; set this to 0 if
% percentage-based segmenting is not to be used (default: 0)
%
% Param 'segmentSizeInBps'
% If percentage value for segmenting is 0 this has to be non-zero and less
% than complete sequence length. It is given in number of basepairs.
% (default: 100 bps)
%
% Param 'segmentingStart'
% 'shift' specifies shifting the segmentStart to
% (round(segmentSizeInBps/2) + 1);
% otherwise 'no-shift' (default)
%
% Param 'alphabet'
% DNA/Protein sequence alphabet (default: DNA)
%
%
% OUTPUT PARAMS
% Param 'ODHFeatureVecInstances'
% Representation of (ordered) instances (or segments) of the sequence supplied;
% each in sparse format.
%
% Param numOfSegments
%
% Edits: 06, February, 2017, By: snikumbh
% maxDist is now as per the original ODH paper, Lmax - K.
% ADDITIONAL NOTES
% -- Order of segments
% . This is only to help the user in determining where the non-shifted
% segments end in the collection, and where the shifted segments start
% . The subsequent (set) kernel computation is agnostic to this arrangement
%
% -- maxDist is now as per the original ODH paper, Lmax - K
% . Edits: 06, February, 2017, By: snikumbh
%
% -- Sparsify the end result again? Maybe not needed.
% . [These can be concatenated to obtain the one final modifiedODHFeatureVec]
%
%
%
% Author: snikumbh@mpi-inf.mpg.de

% Set default values when not supplied
totalArguments = 8;
Expand Down Expand Up @@ -103,7 +125,7 @@
numOfSegments = length(segmentBoundaries);
end
% Assert same number of segmentStarts and Boundaries
%assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!');
% assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!');
if debug
fprintf('Segments with %s\n', segmentingStart);
fprintf('#Segments = %d\n', numOfSegments);
Expand Down Expand Up @@ -135,7 +157,7 @@
%
%columns correspond to different instances; rows to FVs
ODHFeatureVecInstances = zeros((M^2) * numDist, numOfSegments);
%modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1);
% modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1);
% Refer paper.

% Segmenting already done.
Expand Down

0 comments on commit 4d48120

Please sign in to comment.