From 4d48120142947f24a744868be8029c40b74196e0 Mon Sep 17 00:00:00 2001 From: Sarvesh Prakash Nikumbh Date: Fri, 28 Jul 2017 17:04:49 +0200 Subject: [PATCH] Help for functions added for many smaller functions --- analyze_wvector.m | 49 ++++++++++++++++ applyTransformation.m | 41 +++++++++++--- comik_main_with_weight_vector.m | 2 +- computeConformedMultiInstanceKernel.m | 3 +- getCompleteSequenceODHFeatureVec.m | 5 -- getExpansionPoints.m | 43 +++++++++----- getInstanceWeights.m | 45 +++++++++++---- getODHFeatureVecInstances.m | 80 +++++++++++++++++---------- get_colored.m | 18 ++++++ mkl.py | 1 - normalizeKernel.m | 16 ++++++ obtainWeightVector.m | 78 +++++++++++++++++--------- performMKLWithShogunPython.m | 48 ++++++++++++++++ readFastaSequences.m | 15 +++-- 14 files changed, 342 insertions(+), 102 deletions(-) delete mode 100644 getCompleteSequenceODHFeatureVec.m diff --git a/analyze_wvector.m b/analyze_wvector.m index 86237cb..35f0d07 100644 --- a/analyze_wvector.m +++ b/analyze_wvector.m @@ -1,5 +1,54 @@ function [run_this_cmd, pdfcrop_lines] = analyse_wvector(folderName, rankId, wvector, oligoLen, maxDist, topN, verbose, max_or_topN, debugLevel, debugMsgLocation) +% ANALYSE_WVECTOR +% Analyses the weight vector and plot distance-centric k-mer visualizations +% +% INPUT PARAMS +% Param 'folderName' +% Location/Path on disk of the output folder +% +% Param 'rankId' +% Rank of the weight vector in CoMIK +% +% Param 'wvector' +% The weight vector itself which is to be used here +% +% Param 'oligoLen' +% Oligomer length for the ODH representation +% +% Param 'maxDist' +% Maximum distance value for the ODH representation +% +% Param 'topN' +% Specify a value for topN +% +% Param 'verbose' +% Set to 1 for verbose output +% +% Param 'max_or_topN' +% Flag to specify if topN vizualization or max (Absolute Max Per Distance) visualization is to be performed +% Refer to Nikumbh and Pfeifer, BMC Bioinformatics, 2017. +% +% Param 'debugLevel' +% +% Param 'debugMsgLocation' +% +% OUTPUT PARAMS +% Param 'run_this_cmd' +% This could be used create a cmd that is to be run after the program ends to +% stitch together pages from the separate PDF files +% +% Param 'pdfcrop_lines' +% Cropping the PDF pages/files to cut out the unused area +% +% +% ADDITIONAL NOTES +% +% +% Author: snikumbh@mpi-inf.mpg.de +% + + totalArguments = 10; if nargin < totalArguments debugMsgLocation = 1; diff --git a/applyTransformation.m b/applyTransformation.m index 1cb6ab9..9b68482 100644 --- a/applyTransformation.m +++ b/applyTransformation.m @@ -1,14 +1,39 @@ -function [transformationKernel, instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation) +function [instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation) +% APPLYTRANSFORMATION +% Transforms the individual segments w.r.t. the cluster centres obtained by k-means % -% we apply Gaussian tranformation +% INPUT PARAMS +% Param 'allSeqsAsBags' (Cell array) +% Collection of all segments per sequence +% +% Param 'clusterCentres' (matrix) +% A matrix where each cluster centre is represented coulnwise; dimensions: nClusters x FVlength +% +% Param 'sig' +% Gaussian bandwidth value +% +% Param 'debugLevel' +% +% Param 'debugMsgLocation' +% +% OUTPUT PARAMS +% Param 'instancesTransformationKernel' +% Transformation of the instances w.r.t. the cluster centres +% +% ADDITIONAL NOTES +% +% Author: snikumbh@mpi-inif.mpg.de + + +% We apply Gaussian tranformation % nBags = size(allSeqsAsBags, 2); -transformationKernel = zeros(nBags, size(clusterCentres, 1)); % bag level information +% transformationKernel = zeros(nBags, size(clusterCentres, 1)); % bag level information nClusters = size(clusterCentres,1); FVDim1 = size(clusterCentres, 2); -do_reshaped = 1; +do_reshaped = 1; % by default, we use th reshaped version. This is tested to be the fastest is most experiments. if ~issparse(allSeqsAsBags{1}) && do_reshaped == 1 logMessages(debugMsgLocation, sprintf('Doing reshaped version..\n'), debugLevel); for i=1:nBags @@ -24,15 +49,15 @@ end elseif issparse(allSeqsAsBags{1}) && do_reshaped == 1 logMessages(debugMsgLocation, sprintf('Doing reshaped version for sparse vectors..\n'), debugLevel); - % Because N-D sparse arrays are not possible - % Hence, we opt to reshape/repmat the clusterCentres instead of the sparse FVs + % Because N-D sparse arrays are not possible, hence, we opt to reshape/repmat + % the clusterCentres instead of the sparse FVs for i=1:nBags % length of instancesTransformationKernel is nBags % dimensions of each instancesTransformationKernel nInstances-by-nClusters numOfInstances = size(allSeqsAsBags{i},2); instancesTransformationKernel{i} = zeros(numOfInstances, nClusters); reshapedClusterCentres = reshape(repmat(clusterCentres, numOfInstances,1), FVDim1, numOfInstances, nClusters); - %dim are now nClusters-by-weightVectorDim-by-nClusters + % dim are now nClusters-by-weightVectorDim-by-nClusters temp = bsxfun(@minus, allSeqsAsBags{i}, reshapedClusterCentres); temp = sum(temp.^2,1); temp = exp(-0.5*temp./sig^2); @@ -46,7 +71,7 @@ temp = sum(temp'.^2,1); temp = exp(-0.5*temp./sig^2); instancesTransformationKernelA{i}(:,j) = temp'; - transformationKernel(i,j) = sum(temp(:)); + % transformationKernel(i,j) = sum(temp(:)); end end end diff --git a/comik_main_with_weight_vector.m b/comik_main_with_weight_vector.m index 7887379..b35da36 100644 --- a/comik_main_with_weight_vector.m +++ b/comik_main_with_weight_vector.m @@ -420,7 +420,7 @@ % tic; logMessages(debugMsgLocation, sprintf('Applying transformation to the test samples, this might take some time...\n'), debugLevel); - [~, allSeqsTransformationKernel_Test] = applyTransformation(allSeqsAsBags_Test, clusterCentres, conformalXformationParam, debugLevel, debugMsgLocation); + [allSeqsTransformationKernel_Test] = applyTransformation(allSeqsAsBags_Test, clusterCentres, conformalXformationParam, debugLevel, debugMsgLocation); logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel); % % Getting the instance weightings for the test examples diff --git a/computeConformedMultiInstanceKernel.m b/computeConformedMultiInstanceKernel.m index 0c32ab1..ef4623b 100644 --- a/computeConformedMultiInstanceKernel.m +++ b/computeConformedMultiInstanceKernel.m @@ -159,9 +159,8 @@ % transformationKernel is Gaussian RBF tic; logMessages(debugMsgLocation, sprintf('--- applying transformation...'), debugLevel); -[transformationKernel, instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation); +[instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation); logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel); -% transformationKernel is nBags-by-nClusters % instancesTransformationKernel is cell array for number of bags % instancesTransformationKernel{eachBag} is nInstanceInBag-by-nClusters diff --git a/getCompleteSequenceODHFeatureVec.m b/getCompleteSequenceODHFeatureVec.m deleted file mode 100644 index a4cd8ff..0000000 --- a/getCompleteSequenceODHFeatureVec.m +++ /dev/null @@ -1,5 +0,0 @@ -function getCompleteSequenceODHFeatureVec() - -%returns the ODH feature vector using the complete sequence as one - -end diff --git a/getExpansionPoints.m b/getExpansionPoints.m index 18a70dd..676392c 100644 --- a/getExpansionPoints.m +++ b/getExpansionPoints.m @@ -1,24 +1,41 @@ function [expansionPoints, matrixOfDistancesFromCentres] = getExpansionPoints(X, nClusters, rep, debugLevel, debugMsgLocation) -% Params -% X : All bags opened up into instances -% nClusters : #clusters +% GETEXPANSIONPOINTS +% Performs k-means and returns the clusterCentres as the expansion points +% +% INPUT PARAMS +% Param X (matrix) +% All bags opened up into instances +% +% Param nClusters (matrix) +% Number of clusters for k-means % -% Returns -% a set of expansionPoints +% OUTPUT PARAMS +% Param 'expansionPoints' (matrix) +% a set of expansionPoints, in other words cluster centres +% +% Param 'matrixOfDistancesFromCentres' (matrix) +% As the name suggests, % +% ADDITIONAL NOTES +% +% Author: snikumbh@mpi-inf.mpg.de + logMessages(debugMsgLocation, sprintf('--- Obtaining a set of expansion points '), debugLevel); -% obtained X is an n x p matrix, ready for kmeans -% expansionPoints is a k-by-p matrix -% D is a n-by-k matrix giving distances from each -% point to every centroid. This will be useful -% if we use Gaussian form for the transformation -% function +% Obtained X in the input arguments is an n x p matrix, ready for kmeans +% expansionPoints, to be returned, is a k-by-p matrix + +% Below, matrixOfDistancesFromCentres is a n-by-k matrix giving distances from each point to every +% cluster centre. +% +% sumD, from the Matlab documentation, is the within-cluster sums of point-to-centroid distances in a k-by-1 vector +% + tic; -rng('default'); % or can be set to 'default' +rng('default'); logMessages(debugMsgLocation, sprintf('with K-means: \n'), debugLevel); if debugLevel == 2 - [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000);%Display: final + [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','on');%, 'MaxIter',1000);%Display: final elseif debugLevel == 0 [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000); end diff --git a/getInstanceWeights.m b/getInstanceWeights.m index 62b49c7..447e83d 100644 --- a/getInstanceWeights.m +++ b/getInstanceWeights.m @@ -1,12 +1,33 @@ function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel) +% GETINSTANCEWEIGHTS +% Computes the theta values and instance weights from the sub-kernel weights and segment +% transformations +% +% INPUT PARAMS +% Param 'kernelWeights' +% Weights assigned by MKL tf each of the sub-kernels in the collection +% +% Param 'allSeqsTransformationKernel' +% Per sequence transformation of segments used to compute the instance weights +% +% OUTPUT PARAMS +% Param 'thetaVals' +% The theta values corresponding to each cluster centre +% +% Param 'instanceWeightsInEachBag' +% Weight for each segment per sequence +% +% ADDITIONAL NOTES +% -- Theta vals and sub-kernel weights follow a squared relation (Refer paper) +% +% -- Order followed is: +% . non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels +% +% -- Number of kernels is same as number of clusters % -% Return the theta values and instance weights -% for all bags % -% following a squared relation -% order followed is: non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels -% Edit: now number of kernels is same as number of clusters +% Author: snikumbh@mpi-inf.mpg.de thetaVals = sqrt(kernelWeights); thetaVals = thetaVals'; @@ -23,13 +44,13 @@ size(allSeqsTransformationKernel{j},1), 1) ... .* allSeqsTransformationKernel{j}, 2); instanceWeightsInEachBag{j} = temp; - %% if to be written to file, pass fname to the function - %stdTemp = standardizeMatrix(temp'); - %finalTemp = zeros(size(stdTemp)); - %% Using rankings - %[~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend'); - %finalTemp(1,ranking) = 1:size(stdTemp,2); - %dlmwrite(fname, finalTemp, '-append'); + % if to be written to file, pass fname to the function + % stdTemp = standardizeMatrix(temp'); + % finalTemp = zeros(size(stdTemp)); + % Using rankings + % [~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend'); + % finalTemp(1,ranking) = 1:size(stdTemp,2); + % dlmwrite(fname, finalTemp, '-append'); end diff --git a/getODHFeatureVecInstances.m b/getODHFeatureVecInstances.m index 70744b1..4c7e5fc 100644 --- a/getODHFeatureVecInstances.m +++ b/getODHFeatureVecInstances.m @@ -1,36 +1,58 @@ function [ODHFeatureVecInstances, numOfSegments] = getODHFeatureVecInstances(seq, oligoLen, maxDist, segmentSizeInPercentage, segmentSizeInBps, segmentingStart, alphabet, debug) -% Author: snikumbh +% GETODHFEATUREVECINSTANCES % -% Params -% 1. seq: candidate sequence (length L given in #base pairs) -% 2. oligoLen: oligomer length for ODH (K, default: 2) -% 3. maxDist: maximum distance value to consider for ODH; +% INPUT PARAMS +% Param 'seq' +% candidate sequence (length L given in #base pairs) +% +% Param 'oligoLen' +% Oligomer length for ODH (K, default: 2) +% +% Param 'maxDist' +% Maximum distance value to consider for ODH; % (default: 50) -% 4. segmentSizeInPercentage: Percentage value used to segment -% the complete sequence; set this -% to 0 if percentage-based segmenting -% is not to be used (default: 0) -% 5. segmentSizeInBps: If percentage value for segmenting is 0 -% this has to be non-zero and less than -% complete sequence length. It is given -% in number of basepairs. (default: -% 100 bps) -% 6. segmentingStart: 'shift' specifies shifting the segmentStart -% to (round(segmentSizeInBps/2) + 1); -% otherwise 'no-shift' (default) -% 7. alphabet: DNA/Protein sequence alphabet (default: DNA) -% -% -% Returns -% the ODH FeatureVec representation of (ordered) instances -% (segments) of the sequence supplied; each in sparse format. -% Sparsify the end result again? May be not needed. -% [These can be concatenated to obtain the one final modifiedODHFeatureVec] +% +% Param 'segmentSizeInPercentage' +% Percentage value used to segment the complete sequence; set this to 0 if +% percentage-based segmenting is not to be used (default: 0) +% +% Param 'segmentSizeInBps' +% If percentage value for segmenting is 0 this has to be non-zero and less +% than complete sequence length. It is given in number of basepairs. +% (default: 100 bps) +% +% Param 'segmentingStart' +% 'shift' specifies shifting the segmentStart to +% (round(segmentSizeInBps/2) + 1); +% otherwise 'no-shift' (default) +% +% Param 'alphabet' +% DNA/Protein sequence alphabet (default: DNA) +% +% +% OUTPUT PARAMS +% Param 'ODHFeatureVecInstances' +% Representation of (ordered) instances (or segments) of the sequence supplied; +% each in sparse format. +% +% Param numOfSegments % -% Edits: 06, February, 2017, By: snikumbh -% maxDist is now as per the original ODH paper, Lmax - K. +% ADDITIONAL NOTES +% -- Order of segments +% . This is only to help the user in determining the where non-shifted +% segments end in the collection, and where shifted segment start +% . The subsequent (set) kernel computation is agnostic to this arrangement +% +% -- maxDist is now as per the original ODH paper, Lmax - K +% . Edits: 06, February, 2017, By: snikumbh +% +% -- Sparsify the end result again? May be not needed. +% . [These can be concatenated to obtain the one final modifiedODHFeatureVec] +% +% % +% Author: snikumbh@mpi-inf.mpg.de % Set default values when not supplied totalArguments = 8; @@ -103,7 +125,7 @@ numOfSegments = length(segmentBoundaries); end % Asert same number of segmentStarts and Boundaries -%assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!'); +% assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!'); if debug fprintf('Segments with %s\n', segmentingStart); fprintf('#Segments = %d\n', numOfSegments); @@ -135,7 +157,7 @@ % %columns correspond to different instances; rows to FVs ODHFeatureVecInstances = zeros((M^2) * numDist, numOfSegments); -%modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1); +% modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1); % Refer paper. % Segmenting already done. diff --git a/get_colored.m b/get_colored.m index 6f09d16..3665c7a 100644 --- a/get_colored.m +++ b/get_colored.m @@ -1,11 +1,29 @@ function [colored_string] = get_colored(given_string, oligoLen) +% GETCOLORED % Colors the given string (DNA motif) using the IUPAC color scheme for the nucleotides A, C, G, T. +% +% INPUT PARAMS +% Param 'given_string' +% The DNA oligomer +% +% Param 'oligoLen' +% Length of the oligomer +% +% OUTPUT PARAMS +% Param 'colored_string' +% String with each DNA sequence character appropriately colored according to the IUPAC scheme +% +% ADDITIONAL NOTES +% +% +% Author: snikumbh@mpi-inf.mpg.de given_string = char(given_string); alphabet = upper('acgt'); colored_string = []; for i=1:size(given_string, 2) + % i iterates over the number of characters in the string if isscalar(given_string(1,i)) if oligoLen~= 0 && i == oligoLen+1 colored_string = strcat(colored_string, '\color{black}|'); diff --git a/mkl.py b/mkl.py index c7b1cd2..dd4793e 100755 --- a/mkl.py +++ b/mkl.py @@ -133,7 +133,6 @@ testkernel = CombinedKernel() for k in range(0, len(kernelFilenames)): thisFname = kernelFilenames[k] - #print thisFname if os.path.exists(thisFname): testkernel.append_kernel(CustomKernel(numpy.loadtxt(thisFname, delimiter=','))) else: diff --git a/normalizeKernel.m b/normalizeKernel.m index f801afa..27a7f6b 100644 --- a/normalizeKernel.m +++ b/normalizeKernel.m @@ -1,5 +1,21 @@ function normalizedKernel = normalizeKernel(Kmat) +% NORMALIZEKERNEL +% Diagonal normalizes a given kernel matrix +% +% INPUT PARAMS +% Param 'Kmat' (matrix) +% The input kernel matrix +% +% OUTPUT PARAMS +% Param 'normalizedKernel' +% The diagonal-normalized kernel matrix +% +% ADDITIONAL NOTES +% +% +% Author: snikumbh@mpi-inf.mpg.de + KdiagVec = sqrt(diag(Kmat)); normalizedKernel = Kmat ./ (KdiagVec * KdiagVec'); diff --git a/obtainWeightVector.m b/obtainWeightVector.m index 124f742..32914d8 100644 --- a/obtainWeightVector.m +++ b/obtainWeightVector.m @@ -1,15 +1,55 @@ function [overallWeightVector, weightVectors, biasValue] = obtainWeightVector(folderName, oligoLen, maxDist, subkernelWeights, allSeqsAsBags, rawConformedSetKernel, allSeqsTransformationKernel, debugLevel, debugMsgLocation) + +% OBTAINWEIGHTVECTOR % -% Author: snikumbh@mpi-inf.mpg.de +% INPUT PARAMS +% Param 'folderName' +% Location/Path on disk +% +% Param 'oligoLen' +% Value of oligomer length for the ODH representation +% +% Param 'maxDist' +% Value of the maximum distance for the ODH representation +% +% Param 'subkernelWeights' (vector) +% Weights assigned to each sub-kernel upon solving MKL +% +% Param 'allSeqsAsBags' (cell array) +% All sequences represented as bags +% +% Param 'rawConformedSetKernel' (matrix) +% The unnormalized conformally transformed MI kernel +% +% Param 'allSeqsTransformationKernel' (matrix) +% Transformations of all segments for each sequence in the collection % -% 1. We have subkernel weights given, all corresponding weight vectors are to be computed -% 2. Alpha values are given from the trained model -% 3. Location for alpha values: folderName specified in arguments -% 4. Output arguments: overallWeightVector and biasValue +% Param 'debugLevel' % -% Additional comment: -% Passing a value for 'folderName' is essential -% Therefore, there are no default set of values of subkernelWeights +% Param 'debugMsgLocation' +% +% OUTPUT PARAMS +% Param 'overallWeightVector' +% The overall weight vector which is a linear combination of the individual +% weight vectors +% +% Param 'weightVectors' +% The individual weight vectors corresponding the multiple kernels in CoMIK +% +% Param 'biasValue' +% The bias value obtained solving SVM +% +% +% ADDITIONAL NOTES +% -- Alpha values are given from the trained model +% -- Location for alpha values: folderName specified in arguments +% -- Output arguments: overallWeightVector and biasValue +% -- We have subkernel weights given, all corresponding weight vectors are to be computed +% -- Passing a value for 'folderName', is essential, where one can find the subkernel weights +% . Therefore, there are no default set of values of subkernelWeights +% +% Author: snikumbh@mpi-inf.mpg.de + alphaFilename = 'alphas.txt'; svFilename = 'sv.txt'; @@ -20,12 +60,8 @@ biasValue = dlmread(strcat(folderName, '/', biasFilename)); % Compute the weight vectors corresponding to all subkernels, keep them in the same order as the kernels -% % 3. read in the output from the python script and +% -- Read in the output from the python script and % return back the values. -% Order: The kernels are arranged as -% Non-shifted conformed kernels (#kernels = nClusters) -% Shifted conformed kernels (#kernels = nClusters) -% Total kernels= 2*nCluster % nClusters = size(subkernelWeights, 1); nKernels = nClusters; @@ -33,19 +69,16 @@ % prepare bag-level phi, restrict only to support vectorsa % bagPhi is nSVBags-by-nClusters bagPhi = zeros( size(allSeqsAsBags{1},1), size(alphaIndices,1), nClusters); -%bagPhiShifted = zeros( size(allSeqsAsBagsShifted{1},1), size(alphaIndices,1), nClusters); idx = 0; for i=1:length(allSeqsAsBags)% this is nBags - %restrict to sv indices + % restrict to sv indices if any(i == alphaIndices) - idx = idx + 1;%this is second dimension of the bagPhi, ranges from 1 to #alphaIndices + idx = idx + 1; % this is second dimension of the bagPhi, ranges from 1 to #alphaIndices for j=1:nClusters - %allSeqs NonShifted/Shifted TransformationKernel already has nInstances x nClusters + % allSeqs TransformationKernel already has nInstances x nClusters normFactor = (1/sqrt(rawConformedSetKernel{j}(i,i))); bagPhi(:, idx, j) = normFactor * sum(allSeqsAsBags{i} .* repmat(allSeqsTransformationKernel{i}(:,j)', size(allSeqsAsBags{i},1), 1), 2); - %normFactor = (1/sqrt(rawShiftedConformedSetKernel{j}(i,i))); - %bagPhiShifted(:, idx, j) = normFactor * sum(allSeqsAsBagsShifted{i} .* repmat(allSeqsShiftedTransformationKernel{i}(:,j)', size(allSeqsAsBagsShifted{i},1), 1), 2); end end end @@ -54,17 +87,10 @@ % The first dimension of the weight vector comes from the featurevector length which depends on oligoLength param weightVectors = zeros(size(allSeqsAsBags{1},1), nClusters); -%disp(size(alphaValues)); -%disp(size(bagPhiNonShifted(:, :, 1))); for i=1:nClusters % subkernels for non-shifted bags weightVectors(:, i) = (subkernelWeights(i,1) * alphaValues' * bagPhi(:, :, i)')'; end -%disp(size(weightVectors(:,1))); -%for i=(nClusters+1):2*nClusters -% % subkernels for non-shifted bags -% weightVectors(:, i) = (subkernelWeights(i,1) * alphaValues' * bagPhiShifted(:,:, (i-nClusters))')'; -%end % Get the ranked order (weight vector corresponding to the subkernel with the highest weight is ranked at the top) [sortedSubkernelWeights, rankedIndices] = sort(subkernelWeights, 'descend'); diff --git a/performMKLWithShogunPython.m b/performMKLWithShogunPython.m index 05b3666..4be814b 100644 --- a/performMKLWithShogunPython.m +++ b/performMKLWithShogunPython.m @@ -1,5 +1,53 @@ function [subkernelWeights, predLabels] = performMKLwithShogunPython(Cval, Ytrain, Ytest, mklNorm, imbalance, train_allSeqsConformedSetKernel, test_allSeqsConformedSetKernel, outputFolder, whetherPerformTest, debugLevel, debugMsgLocation) +% PERFORMMKLWITHSHOGUNPYTHON +% +% INPUT PARAMS +% Param 'Cval' +% Cost value for SVM +% +% Param 'Ytrain' +% Labels of the training examples +% +% Param 'Ytest' +% Labels of the test examples +% +% Param 'mklNorm' +% p-norm for the MKL +% +% Param 'imbalance' +% Class imbalance, to be used for upweighting the mis-classification costs of the +% minority class +% +% Param 'train_allSeqsConformedSetKernel' +% Kernel matrix corresponding to the training sequences (train x train dimension) +% +% Param 'test_allSeqsConformedSetKernel' +% Kernel matrix corresponding to the test sequences (possibly test x train dimension) +% +% Param 'outputFolder' +% Location/path on disk of the output folder +% +% Param 'whetherPerformTest' +% Flag to specify to the python script that is run from the function if test +% predictions are to be made using the test kernel +% +% Param 'debugLevel' +% +% Param 'debugMsgLocation +% +% OUTPUT PARAMS +% Param 'subkernelWeights' +% Weights assigned to the sub kernel matrices by MKL +% +% Param 'predLabels' +% Prediction labels assigned by the trained model +% +% ADDITIONAL NOTES +% +% Author: snikumbh@mpi-inf.mpg.de + + totalArguments = 10; if nargin < totalArguments debugLevel = 2; diff --git a/readFastaSequences.m b/readFastaSequences.m index 8617896..cea2d72 100644 --- a/readFastaSequences.m +++ b/readFastaSequences.m @@ -5,7 +5,7 @@ % READFASTASEQUENCES reads fasta file and extracts sequence to fasta variable. % Modified version of readfasta shipped with ODH Matlab code. % -% INPUT arguments: +% INPUT PARAMS: % txt - name of FASTA filename % % segmentSizeInBps - segment size in basepairs @@ -20,7 +20,7 @@ % debugLevel and debugMsgLocation - (default) 2, and (default) 1 for stdout % or filename if to be written to file % -% RETURNS: +% OUTPUT PARAMS: % passfasta - struct containing cell arrays with legends and sequences % passFasta.legend = cell array with sequence legends (headers) % passFasta.seqeunce = cell array with sequences @@ -28,9 +28,11 @@ % omittedIndices - List of indices that were omittedIndices due to their % length shorter than the segment size % -% nSeqs - #sequences read from the given file ( +% nSeqs - #sequences read from the given file % -% author: snikumbh@mpi-inf,mpg.de +% ADDITIONAL NOTES +% +% Author: snikumbh@mpi-inf,mpg.de totalArguments = 6; if nargin < totalArguments @@ -51,7 +53,7 @@ ofid=fopen(strcat(outputFolder, '/ommittedFastaIds.txt'), 'a'); fprintf(ofid, '----%s----\n', txt); lengths = []; -while i') @@ -71,10 +73,13 @@ end fclose(ofid); fclose(fid); + ineligibleLengths = lengths(lengths < segmentSizeInBps); ineligibleIndices = find(lengths < segmentSizeInBps); + nSeqs = givenNSeqs; passFasta = fasta; + if size(ineligibleLengths, 2) > 0 logMessages(debugMsgLocation, sprintf('%d sequence(s) have been ommitted as they are shorter than the segment-size specified. Check ommittedFastaIds.txt\n', length(ineligibleIndices)), debugLevel); % Change to what would be passed, if it is different than 'fasta' above