diff --git a/cluster_peptides/CalculateMeasures.m b/cluster_peptides/CalculateMeasures.m new file mode 100755 index 0000000..68d1697 --- /dev/null +++ b/cluster_peptides/CalculateMeasures.m @@ -0,0 +1,43 @@ +function [ ary_cor, ary_iAB ] = CalculateMeasures( data ) + % calculate measures + + count = size(data.seqaa,1); + ary_rep = zeros(count,1); + ary_exp = zeros(count,1); + ary_smp = zeros(count,1); + + ary_iAB = zeros(count,1); + ary_cor = zeros(count,1); + + for k = 1 : count + + % get each experiment ICC + EI = data.imtx(k,data.idx_exp == 1)'; + EII = data.imtx(k,data.idx_exp == 2)'; + ary_exp(k) = ICC([EI,EII], '1-1'); + + % get each sample ICC + SA = data.imtx(k,data.idx_smp == 'A')'; + SB = data.imtx(k,data.idx_smp == 'B')'; + ary_smp(k) = -ICC([SA,SB],'1-1'); + + % get each replica ICC + EIRI = data.imtx(k,data.idx_exp == 1 & data.idx_rep == 1)'; + EIRII = data.imtx(k,data.idx_exp == 1 & data.idx_rep == 2)'; + EIIRI = data.imtx(k,data.idx_exp == 2 & data.idx_rep == 1)'; + EIIRII = data.imtx(k,data.idx_exp == 2 & data.idx_rep == 2)'; + ary_rep(k) = ICC([EIRI,EIRII,EIIRI,EIIRII], '1-1'); + + ary_cor(k) = ary_exp(k) * ary_rep(k);% ary_smp(k); + + % difference + iA = mean(data.imtx(k,data.idx_smp == 'A')); + iB = mean(data.imtx(k,data.idx_smp == 'B')); + %iPeak = data.intensity(k); + iPeak = max(data.imtx(k,:)); + ary_iAB(k) = (iA - iB)/iPeak; + %} + end + +end + diff --git a/cluster_peptides/ICC.m b/cluster_peptides/ICC.m new file mode 100755 index 0000000..386a065 --- /dev/null +++ b/cluster_peptides/ICC.m @@ -0,0 +1,189 @@ +function [r, LB, UB, F, df1, df2, p] = ICC(M, type, alpha, r0) +% Intraclass correlation +% [r, LB, UB, F, df1, df2, p] = ICC(M, type, alpha, r0) +% +% M is matrix of observations. Each row is an object of measurement and +% each column is a judge or measurement. +% +% 'type' is a string that can be one of the six possible codes for the desired +% type of ICC: +% '1-1': The degree of absolute agreement among measurements made on +% randomly seleted objects. It estimates the correlation of any two +% measurements. +% '1-k': The degree of absolute agreement of measurements that are +% averages of k independent measurements on randomly selected +% objects. +% 'C-1': case 2: The degree of consistency among measurements. Also known +% as norm-referenced reliability and as Winer's adjustment for +% anchor points. case 3: The degree of consistency among measurements maded under +% the fixed levels of the column factor. This ICC estimates the +% corrlation of any two measurements, but when interaction is +% present, it underestimates reliability. +% 'C-k': case 2: The degree of consistency for measurements that are +% averages of k independent measurements on randomly selected +% onbjectgs. Known as Cronbach's alpha in psychometrics. case 3: +% The degree of consistency for averages of k independent +% measures made under the fixed levels of column factor. +% 'A-1': case 2: The degree of absolute agreement among measurements. Also +% known as criterion-referenced reliability. case 3: The absolute +% agreement of measurements made under the fixed levels of the column factor. +% 'A-k': case 2: The degree of absolute agreement for measurements that are +% averages of k independent measurements on randomly selected objects. +% case 3: he degree of absolute agreement for measurements that are +% based on k independent measurements maded under the fixed levels +% of the column factor. +% +% ICC is the estimated intraclass correlation. LB and UB are upper +% and lower bounds of the ICC with alpha level of significance. +% +% In addition to estimation of ICC, a hypothesis test is performed +% with the null hypothesis that ICC = r0. The F value, degrees of +% freedom and the corresponding p-value of the this test are +% reported. +% +% (c) Arash Salarian, 2008 +% +% Reference: McGraw, K. O., Wong, S. P., "Forming Inferences About +% Some Intraclass Correlation Coefficients", Psychological Methods, +% Vol. 1, No. 1, pp. 30-46, 1996 +% + +if nargin < 3 + alpha = .05; +end + +if nargin < 4 + r0 = 0; +end + +[n, k] = size(M); +[p, table] = anova_rm(M, 'off'); + +SSR = table{3,2}; +SSE = table{4,2}; +SSC = table{2,2}; +SSW = SSE + SSC; + +MSR = SSR / (n-1); +MSE = SSE / ((n-1)*(k-1)); +MSC = SSC / (k-1); +MSW = SSW / (n*(k-1)); + +switch type + case '1-1' + [r, LB, UB, F, df1, df2, p] = ICC_case_1_1(MSR, MSE, MSC, MSW, alpha, r0, n, k); + case '1-k' + [r, LB, UB, F, df1, df2, p] = ICC_case_1_k(MSR, MSE, MSC, MSW, alpha, r0, n, k); + case 'C-1' + [r, LB, UB, F, df1, df2, p] = ICC_case_C_1(MSR, MSE, MSC, MSW, alpha, r0, n, k); + case 'C-k' + [r, LB, UB, F, df1, df2, p] = ICC_case_C_k(MSR, MSE, MSC, MSW, alpha, r0, n, k); + case 'A-1' + [r, LB, UB, F, df1, df2, p] = ICC_case_A_1(MSR, MSE, MSC, MSW, alpha, r0, n, k); + case 'A-k' + [r, LB, UB, F, df1, df2, p] = ICC_case_A_k(MSR, MSE, MSC, MSW, alpha, r0, n, k); +end + + +%---------------------------------------- +function [r, LB, UB, F, df1, df2, p] = ICC_case_1_1(MSR, MSE, MSC, MSW, alpha, r0, n, k) +r = (MSR - MSW) / (MSR + (k-1)*MSW); + +F = (MSR/MSW) * (1-r0)/(1+(k-1)*r0); +df1 = n-1; +df2 = n*(k-1); +p = 1-fcdf(F, df1, df2); + +FL = F / finv(1-alpha/2, n-1, n*(k-1)); +FU = F * finv(1-alpha/2, n*(k-1), n-1); + +LB = (FL - 1) / (FL + (k-1)); +UB = (FU - 1) / (FU + (k-1)); + +%---------------------------------------- +function [r, LB, UB, F, df1, df2, p] = ICC_case_1_k(MSR, MSE, MSC, MSW, alpha, r0, n, k) +r = (MSR - MSW) / MSR; + +F = (MSR/MSW) * (1-r0); +df1 = n-1; +df2 = n*(k-1); +p = 1-fcdf(F, df1, df2); + +FL = F / finv(1-alpha/2, n-1, n*(k-1)); +FU = F * finv(1-alpha/2, n*(k-1), n-1); + +LB = 1 - 1 / FL; +UB = 1 - 1 / FU; + +%---------------------------------------- +function [r, LB, UB, F, df1, df2, p] = ICC_case_C_1(MSR, MSE, MSC, MSW, alpha, r0, n, k) +r = (MSR - MSE) / (MSR + (k-1)*MSE); + +F = (MSR/MSE) * (1-r0)/(1+(k-1)*r0); +df1 = n - 1; +df2 = (n-1)*(k-1); +p = 1-fcdf(F, df1, df2); + +FL = F / finv(1-alpha/2, n-1, (n-1)*(k-1)); +FU = F * finv(1-alpha/2, (n-1)*(k-1), n-1); + +LB = (FL - 1) / (FL + (k-1)); +UB = (FU - 1) / (FU + (k-1)); + +%---------------------------------------- +function [r, LB, UB, F, df1, df2, p] = ICC_case_C_k(MSR, MSE, MSC, MSW, alpha, r0, n, k) +r = (MSR - MSE) / MSR; + +F = (MSR/MSE) * (1-r0); +df1 = n - 1; +df2 = (n-1)*(k-1); +p = 1-fcdf(F, df1, df2); + +FL = F / finv(1-alpha/2, n-1, (n-1)*(k-1)); +FU = F * finv(1-alpha/2, (n-1)*(k-1), n-1); + +LB = 1 - 1 / FL; +UB = 1 - 1 / FU; + +%---------------------------------------- +function [r, LB, UB, F, df1, df2, p] = ICC_case_A_1(MSR, MSE, MSC, MSW, alpha, r0, n, k) +r = (MSR - MSE) / (MSR + (k-1)*MSE + k*(MSC-MSE)/n); + +a = (k*r0) / (n*(1-r0)); +b = 1 + (k*r0*(n-1))/(n*(1-r0)); +F = MSR / (a*MSC + b*MSE); +df1 = n - 1; +df2 = (a*MSC + b*MSE)^2/((a*MSC)^2/(k-1) + (b*MSE)^2/((n-1)*(k-1))); +p = 1-fcdf(F, df1, df2); + +a = k*r/(n*(1-r)); +b = 1+k*r*(n-1)/(n*(1-r)); +v = (a*MSC + b*MSE)^2/((a*MSC)^2/(k-1) + (b*MSE)^2/((n-1)*(k-1))); + +Fs = finv(1-alpha/2, n-1, v); +LB = n*(MSR - Fs*MSE)/(Fs*(k*MSC + (k*n - k - n)*MSE) + n*MSR); + +Fs = finv(1-alpha/2, v, n-1); +UB = n*(Fs*MSR-MSE)/(k*MSC + (k*n - k - n)*MSE + n*Fs*MSR); + +%---------------------------------------- +function [r, LB, UB, F, df1, df2, p] = ICC_case_A_k(MSR, MSE, MSC, MSW, alpha, r0, n, k) +r = (MSR - MSE) / (MSR + (MSC-MSE)/n); + +c = r0/(n*(1-r0)); +d = 1 + (r0*(n-1))/(n*(1-r0)); +F = MSR / (c*MSC + d*MSE); +df1 = n - 1; +df2 = (c*MSC + d*MSE)^2/((c*MSC)^2/(k-1) + (d*MSE)^2/((n-1)*(k-1))); +p = 1-fcdf(F, df1, df2); + +a = r/(n*(1-r)); +b = 1+r*(n-1)/(n*(1-r)); +v = (a*MSC + b*MSE)^2/((a*MSC)^2/(k-1) + (b*MSE)^2/((n-1)*(k-1))); + +Fs = finv(1-alpha/2, n-1, v); +LB = n*(MSR - Fs*MSE)/(Fs*(MSC-MSE) + n*MSR); + +Fs = finv(1-alpha/2, v, n-1); +UB = n*(Fs*MSR - MSE)/(MSC - MSE + n*Fs*MSR); + diff --git a/cluster_peptides/LogPeptideList.m b/cluster_peptides/LogPeptideList.m new file mode 100755 index 0000000..7e25860 --- /dev/null +++ b/cluster_peptides/LogPeptideList.m @@ -0,0 +1,17 @@ +function [] = LogPeptideList( idx, pep_data, ref_ann, label) +% Write out lists +gid_list = cat(1,pep_data.gid_list{idx}); +gid_unq = unique(gid_list); +[~,~,idx_R] = intersect(gid_unq,ref_ann.gid); +sym_unq = unique(ref_ann.sym(idx_R)); + + fW = fopen(sprintf('resultSet_GI_%s.txt',label),'w'); + fprintf(fW,'%s\n',gid_unq{:}); + fclose(fW); + + fW = fopen(sprintf('resultSet_SYM_%s.txt',label),'w'); + fprintf(fW,'%s\n',sym_unq{:}); + fclose(fW); + +end + diff --git a/cluster_peptides/MatchIDList.m b/cluster_peptides/MatchIDList.m new file mode 100755 index 0000000..f8ff2db --- /dev/null +++ b/cluster_peptides/MatchIDList.m @@ -0,0 +1,18 @@ +function [ idx ] = MatchIDList(map, data, qry) +% get symbols that match + +qry_sym = unique(qry); + +idx_sym = cellfun(@(x) any(strcmp(x,qry_sym)), map.sym_unq); + +qry_pep = unique(map.gid(idx_sym(map.idx_sym_rev))); + +[gid_unq,~,gid_unq_rev] = unique(data.gid_ary); + +idx_pep = cellfun(@(x) any(strcmp(x,qry_pep)), gid_unq); +pos_pep = data.gid_ary_rev(idx_pep(gid_unq_rev)); +idx = false(size(data.length,1),1); +idx(unique(pos_pep)) = true; + +end + diff --git a/cluster_peptides/PeptideAnalysis.m b/cluster_peptides/PeptideAnalysis.m new file mode 100755 index 0000000..6a62f9f --- /dev/null +++ b/cluster_peptides/PeptideAnalysis.m @@ -0,0 +1,201 @@ +% PeptideAnalysis +clc +clear all +close all + +%%% --- calculate peptides metrics --- %%% +if exist('PeptidesMetrics.mat','file') ~= 2 + fprintf('Calculate metrics ...\n'); + % read data + pep_data = ReadPeptideData('./data/865_peptides_refseq_07Sep2015.txt'); + ref_ann = ReadReferenceAnnotation('./data/proteinRefSeq_rat_14Sep2015.txt'); + + % calculate metrics + [ary_icc, ary_dab] = CalculateMeasures(pep_data); + + % save data file + save('PeptidesMetrics.mat','pep_data','ref_ann','ary_icc','ary_dab'); +else + fprintf('Load metrics ...\n'); + load('PeptidesMetrics.mat'); +end + +%%% --- simulation --- %%% +if exist('PeptidesSimulation.mat','file') ~= 2 + fprintf('Simulate thresholds ... \n'); + + % define initial thresholds + thresh_dab = 0.5; + thresh_icc = 0.75; + thresh_scr = prctile(pep_data.score, 5); + + Nsim = 10000; + Ndrw = 100; + + % index of training sets + idx_train_pos = (ary_icc > thresh_icc) & (ary_dab > thresh_dab) & (pep_data.score > thresh_scr); + idx_train_neg = (~idx_train_pos); + + % position of trained peptides + psn_train_pos = find(idx_train_pos); + psn_train_neg = find(idx_train_neg); + + % count of sets + cnt_train_pos = sum(idx_train_pos); + cnt_train_neg = sum(idx_train_neg); + + % define training matrix (ICC,dAB, Score) + mtx_train = [ary_icc, ary_dab, pep_data.score]; + + % accumulate training choice + idx_used_pos = zeros(size(pep_data.score,1),1); + idx_used_neg = zeros(size(pep_data.score,1),1); + + % define training groups + grp_train = true(2*Ndrw,1); + grp_train(Ndrw+1:end) = false; + + % allocate class matrix + mtx_class = false(size(pep_data.score,1),Nsim); + for k = 1 : Nsim + + % shuffle random sampling position + psn_smp_pos = psn_train_pos(randperm(cnt_train_pos)); + psn_smp_neg = psn_train_neg(randperm(cnt_train_neg)); + + % draw random sampling position + psn_smp_pos = psn_smp_pos(1:Ndrw); + psn_smp_neg = psn_smp_neg(1:Ndrw); + + % accumulate choice + idx_used_pos(psn_smp_pos) = idx_used_pos(psn_smp_pos) + 1; + idx_used_neg(psn_smp_neg) = idx_used_neg(psn_smp_neg) + 1; + + % build training matrix + mtx_train_pos = mtx_train(psn_smp_pos,:); + mtx_train_neg = mtx_train(psn_smp_neg,:); + + % train svm classifier + svm = svmtrain([mtx_train_pos;mtx_train_neg],... + grp_train,... + 'kernel_function','quadratic'); + + % classify matrix + mtx_class(:,k) = svmclassify(svm, mtx_train); + + end + + % calculate simulation result + idx_sim_pos = sum(mtx_class,2)/Nsim >= 0.95; + idx_sim_neg = sum(mtx_class,2)/Nsim <= 0.05; + idx_sim_buf = ~(idx_sim_pos | idx_sim_neg); + fprintf('Positive: %d\n',sum(idx_sim_pos)); + fprintf('Buffer: %d\n',sum(idx_sim_buf)); + fprintf('Negative: %d\n',sum(idx_sim_neg)); + fprintf('Total: %d\n',size(idx_sim_pos,1)); + + save('PeptidesSimulation.mat','idx_sim_pos','idx_sim_neg','idx_sim_buf','Nsim','Ndrw','thresh_icc','thresh_dab','thresh_scr'); + %} +else + fprintf('Load simulation ... \n'); + %load('PeptidesSimulation.mat'); + idx_sim_pos = (ary_icc > 0.9) & (ary_dab > 0.8); + idx_sim_neg = ~idx_sim_pos; +end + +%%% --- write out lists --- %%% +LogPeptideList( idx_sim_pos, pep_data, ref_ann, 'CoreGlyco_13Oct2015'); +LogPeptideList( idx_sim_neg, pep_data, ref_ann, 'Filtered_13Oct2015'); +%LogPeptideList( idx_sim_buf, pep_data, ref_ann, 'Buffer_13Oct2015'); + +%%% --- plot data --- %%% +% control genes +lbl_pos = {'Cdh2';'Gabra1';'Gabrb3';'Gabrg2';'Gria1';'Gria2';'Gria3';'Gria4';'Grin1';'Grin2a';'Grin2b'}; +lbl_neg = {'Cacng8','Nlgn2'}; + +% index controls +idx_ctrl_pos = MatchIDList(ref_ann, pep_data, lbl_pos); +idx_ctrl_neg = MatchIDList(ref_ann, pep_data, lbl_neg); + +x_lim = [-0.2,1]; +y_lim = [-1,1]; +X = ary_icc; +Y = ary_dab; +figure('Color','w'); +hold on; +plot(x_lim,[0,0],'k-.','LineWidth',0.1); +plot([0,0],y_lim,'k-.','LineWidth',0.1); +%plot(x_lim,[thresh_dab,thresh_dab],'k:','LineWidth',0.1); +%plot([thresh_icc,thresh_icc],y_lim,'k:','LineWidth',0.1); + +plot(X(idx_sim_neg),Y(idx_sim_neg),'.','Color',[.7,.7,.7],'MarkerSize',3); +%plot(X(idx_sim_buf),Y(idx_sim_buf),'.','Color',[.9,.9,.9],'MarkerSize',3); +plot(X(idx_sim_pos),Y(idx_sim_pos),'.','Color',[.25,.25,.25],'MarkerSize',3); + +plot(X(idx_ctrl_neg),Y(idx_ctrl_neg),'.','Color',[255,69,0]./255,'MarkerSize',7.5); +plot(X(idx_ctrl_pos),Y(idx_ctrl_pos),'.','Color',[50,205,50]./255,'MarkerSize',7.5); + +%{ +text(0.6,-0.2,... + sprintf('SVM clustering\n%d simulations\n %d peptides',Nsim,size(idx_sim_pos,1)),... + 'VerticalAlignment','top','HorizontalAlignment','left','FontSize',8,'BackgroundColor','w'); +%} + +plot(0.6,-0.6,'.','Color',[.7,.7,.7],'MarkerSize',12); +text(0.625,-0.6,sprintf('%d "bad" peptides',sum(idx_sim_neg)),... + 'VerticalAlignment','middle','HorizontalAlignment','left','FontSize',8,'BackgroundColor','w'); + +%{ +plot(0.6,-0.6,'.','Color',[.9,.9,.9],'MarkerSize',12); +text(0.625,-0.6,sprintf('%d "buffer" peptides',sum(idx_sim_buf)),... + 'VerticalAlignment','middle','HorizontalAlignment','left','FontSize',8,'BackgroundColor','w'); +%} + +plot(0.6,-0.7,'.','Color',[.25,.25,.25],'MarkerSize',12); +text(0.625,-0.7,sprintf('%d "good" peptides',sum(idx_sim_pos)),... + 'VerticalAlignment','middle','HorizontalAlignment','left','FontSize',8,'BackgroundColor','w'); + +plot(0.6,-0.8,'.','Color',[255,69,0]./255,'MarkerSize',12); +text(0.625,-0.8,sprintf('%d neg.controls',sum(idx_ctrl_neg)),... + 'VerticalAlignment','middle','HorizontalAlignment','left','FontSize',8,'BackgroundColor','w'); + +plot(0.6,-0.9,'.','Color',[50,205,50]./255,'MarkerSize',12); +text(0.625,-0.9,sprintf('%d pos.controls',sum(idx_ctrl_pos)),... + 'VerticalAlignment','middle','HorizontalAlignment','left','FontSize',8,'BackgroundColor','w'); +hold off; + +xlabel('reproducibility [intraclass correlation]','FontSize',12); +ylabel('normalized expression [(A-B)/max(A,B)]','FontSize',12); +%print(gcf,'-dpsc2','-r300','figureX_Clustering_22Sep2015.eps'); + +figure('Color','w'); +[frq_all, x_bins] = hist(ary_dab(~idx_ctrl_pos), 100); +frq_cor = hist(ary_dab(idx_ctrl_pos), x_bins); +hold on; +plot(x_bins,cumsum(frq_all./sum(frq_all)),'k'); +plot(x_bins,cumsum(frq_cor./sum(frq_cor)),'g'); +plot([-0.9,-0.8],[.9,.9],'k'); +text(-0.7,0.9,'overall peptides','FontSize',12,'VerticalALignment','middle','HorizontalALignment','left'); +plot([-0.9,-0.8],[.8,.8],'g'); +text(-0.7,0.8,'pos.control peptides','FontSize',12,'VerticalALignment','middle','HorizontalALignment','left'); +hold off; +set(gca,'Box','off','YLim',[-0.05,1.05],'YTick',0:.2:1); +xlabel('normalized expression [(A-B)/max(A,B)]','FontSize',12); +ylabel('cumulative ratio','FontSize',12); +print(gcf,'-dpsc2','-r300','figureX_Difference_11Nov2015.eps'); + +figure('Color','w'); +[frq_all, x_bins] = hist(ary_icc(~idx_ctrl_pos), 100); +frq_cor = hist(ary_icc(idx_ctrl_pos), x_bins); +hold on; +plot(x_bins,cumsum(frq_all./sum(frq_all)),'k'); +plot(x_bins,cumsum(frq_cor./sum(frq_cor)),'g'); +plot([-0.9,-0.8],[.9,.9],'k'); +text(-0.7,0.9,'overall peptides','FontSize',12,'VerticalALignment','middle','HorizontalALignment','left'); +plot([-0.9,-0.8],[.8,.8],'g'); +text(-0.7,0.8,'pos.control peptides','FontSize',12,'VerticalALignment','middle','HorizontalALignment','left'); +hold off; +set(gca,'Box','off','YLim',[-0.05,1.05],'YTick',0:.2:1); +xlabel('reproducibility [intraclass correlation]','FontSize',12); +ylabel('cumulative ratio','FontSize',12); +print(gcf,'-dpsc2','-r300','figureX_ICC_11Nov2015.eps'); \ No newline at end of file diff --git a/cluster_peptides/PeptideClustering.m b/cluster_peptides/PeptideClustering.m new file mode 100755 index 0000000..e8a6b16 --- /dev/null +++ b/cluster_peptides/PeptideClustering.m @@ -0,0 +1,80 @@ +% PeptideClustering +clc +close all +% +clear all + +%%% --- calculate peptides metrics --- %%% +if exist('PeptidesMetrics.mat','file') ~= 2 + fprintf('Calculate metrics ...\n'); + % read data + pep_data = ReadPeptideData('./data/865_peptides_refseq_07Sep2015.txt'); + ref_ann = ReadReferenceAnnotation('./data/proteinRefSeq_rat_14Sep2015.txt'); + + % calculate metrics + [ary_icc, ary_dab] = CalculateMeasures(pep_data); + + % save data file + save('PeptidesMetrics.mat','pep_data','ref_ann','ary_icc','ary_dab'); +else + fprintf('Load metrics ...\n'); + load('PeptidesMetrics.mat'); +end +%%% --- --- --- %%% +%} +%clearvars -except ary_icc ary_dab; +x = ary_icc; +y = ary_dab; + +% hierarchical clustering +D = pdist([x,y],'euclidean'); +Z = linkage(D,'average'); +idx = cluster(Z,'cutoff',0.1,'criterion','distance'); +%c = cophenet(Z,D); + +% cluster centers +ctr_x = accumarray(idx,x,[max(idx),1],@mean); +ctr_y = accumarray(idx,y,[max(idx),1],@mean); + +cls_weight = sqrt((ctr_y + 1).^2 + (ctr_x +1).^2); + +clr_mtx = (cls_weight - min(cls_weight))./(max(cls_weight)-min(cls_weight)); +clr_mtx = [clr_mtx,rand(max(idx),1),1-clr_mtx]; +% thresholds +% +thresh_icc = 0.5; +thresh_dab = 0.5; + +cls_max_x = accumarray(idx,x,[max(idx),1],@max); +cls_max_y = accumarray(idx,y,[max(idx),1],@max); + +cls_min_x = accumarray(idx,x,[max(idx),1],@min); +cls_min_y = accumarray(idx,y,[max(idx),1],@min); + + + +T = (cls_min_x >= thresh_icc) & (cls_min_y >= thresh_dab); +%P = T(idx); +P = idx == max(idx); + +%LogPeptideList( P, pep_data, ref_ann, 'CoreGlyco_15Oct2015'); +%LogPeptideList( ~P, pep_data, ref_ann, 'Filtered_15Oct2015'); + +figure('Color','w'); +hold on; +plot(x(~P),y(~P),'.','Color',[.7,.7,.7]); +plot(x(P),y(P),'.','Color',[.0,.9,.0]); +hold off; +%} +% + +figure('Color','w'); +hold on; +for k = 1 : max(idx) + + plot(x(idx==k),y(idx==k),'.','Color',clr_mtx(k,:)); +end +hold off; +xlabel('reproducibility [intra class correlation]','fontsize',10); +ylabel('expression [(A-B)/max(A,B)]','fontsize',10); +%} \ No newline at end of file diff --git a/cluster_peptides/PeptideSupplementary.m b/cluster_peptides/PeptideSupplementary.m new file mode 100755 index 0000000..2bb9af4 --- /dev/null +++ b/cluster_peptides/PeptideSupplementary.m @@ -0,0 +1,98 @@ +% PeptideSupplementary +clc +close all +% +clear all +load PeptidesMetrics; +%} +clearvars -except ary_dab ary_icc pep_data ref_ann; + +idx_good = (ary_icc > 0.9) & (ary_dab > 0.8); + +gid_good = unique(cat(1,pep_data.gid_list{idx_good})); + + +cnt = size(pep_data.gid_list,1); +idx_good_all = false(cnt,1); +k = 1; +tic +for k = 1 : cnt + i = intersect(pep_data.gid_list{k},gid_good); + idx_good_all(k) = ~isempty(i); +end +toc + + +figure('Color','w'); +hold on; +plot(ary_icc(~idx_good_all),ary_dab(~idx_good_all),'.','Color',[.75,.75,.75],'MarkerSize',3); +%plot(ary_icc(idx_good_all),ary_dab(idx_good_all),'k.','MarkerSize',3); +hold off; +set(gca,'Box','Off',... + 'XLim',[-0.2,1.05],... + 'YLim',[-1.05,1.05]); +xlabel('reproducibility [intraclass correlation]','FontSize',12); +ylabel('normalized expression [(A-B)/max(A,B)]','FontSize',12); +print(gcf,'-dpsc2','-r300','figureX_Clustering_NoGlyco_12Nov2015.eps'); + +figure('Color','w'); +hold on; +%plot(ary_icc(~idx_good_all),ary_dab(~idx_good_all),'.','Color',[.75,.75,.75],'MarkerSize',3); +plot(ary_icc(idx_good_all),ary_dab(idx_good_all),'k.','MarkerSize',3); +hold off; +set(gca,'Box','Off',... + 'XLim',[-0.2,1.05],... + 'YLim',[-1.05,1.05]); +xlabel('reproducibility [intraclass correlation]','FontSize',12); +ylabel('normalized expression [(A-B)/max(A,B)]','FontSize',12); +print(gcf,'-dpsc2','-r300','figureX_Clustering_Glyco_12Nov2015.eps'); + + +idx_ctrl_pos = idx_good_all; +disp(sum(idx_ctrl_pos)); +disp(sum(~idx_ctrl_pos)); +figure('Color','w'); +[frq_all, x_bins] = hist(ary_dab(~idx_ctrl_pos), 100); +frq_cor = hist(ary_dab(idx_ctrl_pos), x_bins); +y_all = cumsum(frq_all./sum(frq_all)); +y_crr = cumsum(frq_cor./sum(frq_cor)); +[~,p] = kstest2(y_all,y_crr,'Alpha',0.01); +hold on; +plot(x_bins,cumsum(frq_all./sum(frq_all)),'k'); +plot(x_bins,cumsum(frq_cor./sum(frq_cor)),'g'); +plot([-0.9,-0.8],[.9,.9],'k'); +text(-0.7,0.9,'"bad" peptides','FontSize',12,'VerticalALignment','middle','HorizontalALignment','left'); +plot([-0.9,-0.8],[.8,.8],'g'); +text(-0.7,0.8,'core-glyco peptides','FontSize',12,'VerticalALignment','middle','HorizontalALignment','left'); +text(-0.7,0.7,sprintf('KSTest %.2E',p),'FontSize',12,'VerticalAlignment','middle','HorizontalAlignment','left'); +hold off; + + +set(gca,'Box','off','YLim',[-0.05,1.05],'YTick',0:.2:1); +xlabel('normalized expression [(A-B)/max(A,B)]','FontSize',12); +ylabel('cumulative ratio','FontSize',12); +print(gcf,'-dpsc2','-r300','figureX_Difference_12Nov2015.eps'); + +figure('Color','w'); +[frq_all, x_bins] = hist(ary_icc(~idx_ctrl_pos), 100); +frq_cor = hist(ary_icc(idx_ctrl_pos), x_bins); +x_all = cumsum(frq_all./sum(frq_all)); +x_crr = cumsum(frq_cor./sum(frq_cor)); +[~,p] = kstest2(x_all,x_crr,'Alpha',0.01); +hold on; +plot(x_bins,cumsum(frq_all./sum(frq_all)),'k'); +plot(x_bins,cumsum(frq_cor./sum(frq_cor)),'g'); +plot([-0.9,-0.8],[.9,.9],'k'); +text(-0.7,0.9,'"bad" peptides','FontSize',12,'VerticalALignment','middle','HorizontalALignment','left'); +plot([-0.9,-0.8],[.8,.8],'g'); +text(-0.7,0.8,'core-glyco peptides','FontSize',12,'VerticalALignment','middle','HorizontalALignment','left'); +text(-0.7,0.7,sprintf('KSTest %.2E',p),'FontSize',12,'VerticalAlignment','middle','HorizontalAlignment','left'); +hold off; +set(gca,'Box','off','YLim',[-0.05,1.05],'YTick',0:.2:1); +xlabel('reproducibility [intraclass correlation]','FontSize',12); +ylabel('cumulative ratio','FontSize',12); +print(gcf,'-dpsc2','-r300','figureX_ICC_12Nov2015.eps'); + + + + diff --git a/cluster_peptides/ReadPeptideData.m b/cluster_peptides/ReadPeptideData.m new file mode 100755 index 0000000..ea29523 --- /dev/null +++ b/cluster_peptides/ReadPeptideData.m @@ -0,0 +1,153 @@ +function [ data ] = ReadPeptideData(query_file) +% read peptide information + + %%% --- read raw data --- %%% + data = ReadRawData(query_file); + + %%% --- clean peptide naming --- %%% + data = CleanPeptideLabels(data); + + %%% --- clean empty intensity --- %%% + data = CleanEmptyIntensity(data); + + %%% --- set peptide list and index --- %%% + data = GetPeptideList(data); + + %%% --- set experiment, sample, replica index --- %%% + data = SetConfigColumns(data); + +end + +function [data] = SetConfigColumns(data) + + %%% --- set experiment --- %%% + idx_exp = ones(24,1); + idx_exp(13:end) = 2; + data.idx_exp = idx_exp; + + %%% --- set sample --- %%% + idx_smp = [repmat('A',6,1);repmat('B',6,1);repmat('A',6,1);repmat('B',6,1)]; + data.idx_smp = idx_smp; + + %%% --- set replica --- %%% + idx_rep = ones(6,1); + idx_rep(4:end) = 2; + idx_rep = repmat(idx_rep, 4,1); + data.idx_rep = idx_rep; +end + +function [data] = GetPeptideList(data) + + %%% --- update gid list with razor --- %%% + gid_list = cellfun(@(x,y) {unique([x;y])},data.gid_razor,data.gid_list); + data.gid_list = gid_list; + + %%% --- symbol dictionary --- %%% + data.gid_cnt = cellfun('size',gid_list,1); + data.gid_ary = cat(1,gid_list{:}); + + %%% --- order matrix index --- %%% + ary_idx = false(sum(data.gid_cnt,1),1); + ary_idx(cumsum([1;data.gid_cnt(1:end-1)])) = true; + data.gid_ary_rev = cumsum(ary_idx); + + %%% --- order razor ids --- %%% + idx_raz = cellfun(@(x,y) {strcmp(x,y)},data.gid_razor,gid_list); + data.gid_ary_razor = cat(1,idx_raz{:}); +%} +end + +function [data] = CleanEmptyIntensity(data) + + % get empty index + idx_empty = all(data.intensity == 0, 2); + idx_list = strcmp('',data.gid_list); + idx_razor = strcmp('',data.gid_razor); + idx_empty = idx_empty | (idx_list & idx_razor); + + % remove from table + data.seqaa(idx_empty) = []; + data.length(idx_empty) = []; + data.gid_list(idx_empty) = []; + data.gid_razor(idx_empty) = []; + data.mass(idx_empty) = []; + data.score(idx_empty) = []; + data.intensity(idx_empty) = []; + data.imtx(idx_empty,:) = []; + + % put back razor to list + idx_list = strcmp('',data.gid_list); + data.gid_list(idx_list) = data.gid_razor(idx_list); + data.gid_list = regexp(data.gid_list,'\;','split'); + data.gid_list = cellfun(@(x) {unique(x')},data.gid_list); + +end + +function [data] = CleanPeptideLabels(data) + + list = data.gid_list; + razor = data.gid_razor; + + %%% --- clean " --- %%% + list = regexprep(list,'\"',''); + razor = regexprep(razor,'\"',''); + + %%% --- clean gi| --- %%% + list = regexprep(list,'gi\|',''); + razor = regexprep(razor,'gi\|',''); + + %%% --- clean CON__ --- %%% + list = regexprep(list,'CON\_\_',''); + razor = regexprep(razor,'CON\_\_',''); + + %%% --- clean REV__ --- %%% + list = regexprep(list,'REV\_\_',''); + razor = regexprep(razor,'REV\_\_',''); + + + %%% --- clean ENSEMBL: --- %%% + list = regexprep(list,'ENSEMBL\:',''); + razor = regexprep(razor,'ENSEMBL\:',''); + + %%% --- clean REFSEQ: --- %%% + list = regexprep(list,'REFSEQ\:',''); + razor = regexprep(razor,'REFSEQ\:',''); + %{ + + %%% --- clean all numbers --- %%% + for k = 1 : 20 + peps = regexprep(peps,sprintf('\\-%d',k),''); + razor = regexprep(razor,sprintf('\\-%d',k),''); + end + %} + + %%% --- update data --- %%% + data.gid_list = list; + data.gid_razor = razor; + +end + + +function [data] = ReadRawData(query_file) + % define text format + fmt = repmat({'%*s'},92,1); + fmt([2,9,58,59:82]) = {'%n'}; + fmt([1,3,4]) = {'%s'}; + fmt = sprintf('%s ',fmt{:}); + fmt(end) = []; + + % read file + fRead = fopen(query_file,'r'); + fgetl(fRead); + txt = textscan(fRead,fmt,'delimiter','\t'); + fclose(fRead); + data.seqaa = txt{1}; + data.length = cellfun('size',data.seqaa,2); + data.gid_list = txt{3}; + data.gid_razor = txt{4}; + data.mass = txt{2}; + data.score = txt{5}; + data.intensity = log10(txt{6}+1); + data.imtx = log10([txt{7:end}]+1); + +end diff --git a/cluster_peptides/ReadReferenceAnnotation.m b/cluster_peptides/ReadReferenceAnnotation.m new file mode 100755 index 0000000..9504d13 --- /dev/null +++ b/cluster_peptides/ReadReferenceAnnotation.m @@ -0,0 +1,27 @@ +function [ map ] = ReadReferenceAnnotation( file_map ) +% read id map + + fRead = fopen(file_map, 'r'); + fgetl(fRead); + txt = textscan(fRead,'%s %s %s %s %n %s %*s %n','delimiter','\t'); + fclose(fRead); + map.protid = txt{1}; + map.gid = txt{2}; + map.sym = txt{3}; + map.rnaid = txt{4}; + map.entrez = txt{5}; + map.description = txt{6}; + map.length = txt{7}; + + % unique symbols + [sym_unq,~,idx_sym_rev] = unique(map.sym); + map.idx_sym_rev = idx_sym_rev; + map.sym_unq = sym_unq; + + % unique protein gids + [pids_unq,~,idx_pids_rev] = unique(map.protid); + map.idx_pids_rev = idx_pids_rev; + map.pids_unq = pids_unq; + +end + diff --git a/cluster_peptides/anova_rm.m b/cluster_peptides/anova_rm.m new file mode 100755 index 0000000..10e38cd --- /dev/null +++ b/cluster_peptides/anova_rm.m @@ -0,0 +1,207 @@ +function [p, table] = anova_rm(X, displayopt) +% [p, table] = anova_rm(X, displayopt) +% Single factor, tepeated measures ANOVA. +% +% [p, table] = anova_rm(X, displayopt) performs a repeated measures ANOVA +% for comparing the means of two or more columns (time) in one or more +% samples(groups). Unbalanced samples (i.e. different number of subjects +% per group) is supported though the number of columns (followups)should +% be the same. +% +% DISPLAYOPT can be 'on' (the default) to display the ANOVA table, or +% 'off' to skip the display. For a design with only one group and two or +% more follow-ups, X should be a matrix with one row for each subject. +% In a design with multiple groups, X should be a cell array of matrixes. +% +% Example: Gait-Cycle-times of a group of 7 PD patients have been +% measured 3 times, in one baseline and two follow-ups: +% +% patients = [ +% 1.1015 1.0675 1.1264 +% 0.9850 1.0061 1.0230 +% 1.2253 1.2021 1.1248 +% 1.0231 1.0573 1.0529 +% 1.0612 1.0055 1.0600 +% 1.0389 1.0219 1.0793 +% 1.0869 1.1619 1.0827 ]; +% +% more over, a group of 8 controls has been measured in the same protocol: +% +% controls = [ +% 0.9646 0.9821 0.9709 +% 0.9768 0.9735 0.9576 +% 1.0140 0.9689 0.9328 +% 0.9391 0.9532 0.9237 +% 1.0207 1.0306 0.9482 +% 0.9684 0.9398 0.9501 +% 1.0692 1.0601 1.0766 +% 1.0187 1.0534 1.0802 ]; +% +% We are interested to see if the performance of the patients for the +% followups were the same or not: +% +% p = anova_rm(patients); +% +% By considering the both groups, we can also check to see if the +% follow-ups were significantly different and also check two see that the +% two groups had a different performance: +% +% p = anova_rm({patients controls}); +% +% +% ref: Statistical Methods for the Analysis of Repeated Measurements, +% C. S. Daivs, Springer, 2002 +% +% Copyright 2008, Arash Salarian +% mailto://arash.salarian@ieee.org +% + +if nargin < 2 + displayopt = 'on'; +end + +if ~iscell(X) + X = {X}; +end + +%number of groups +s = size(X,2); + +%subjects per group +n_h = zeros(s, 1); +for h=1:s + n_h(h) = size(X{h}, 1); +end +n = sum(n_h); + +%number of follow-ups +t = size(X{1},2); + +% overall mean +y = 0; +for h=1:s + y = y + sum(sum(X{h})); +end +y = y / (n * t); + +% allocate means +y_h = zeros(s,1); +y_j = zeros(t,1); +y_hj = zeros(s,t); +y_hi = cell(s,1); +for h=1:s + y_hi{h} = zeros(n_h(h),1); +end + +% group means +for h=1:s + y_h(h) = sum(sum(X{h})) / (n_h(h) * t); +end + +% follow-up means +for j=1:t + y_j(j) = 0; + for h=1:s + y_j(j) = y_j(j) + sum(X{h}(:,j)); + end + y_j(j) = y_j(j) / n; +end + +% group h and time j mean +for h=1:s + for j=1:t + y_hj(h,j) = sum(X{h}(:,j) / n_h(h)); + end +end + +% subject i'th of group h mean +for h=1:s + for i=1:n_h(h) + y_hi{h}(i) = sum(X{h}(i,:)) / t; + end +end + +% calculate the sum of squares +ssG = 0; +ssSG = 0; +ssT = 0; +ssGT = 0; +ssR = 0; + +for h=1:s + for i=1:n_h(h) + for j=1:t + ssG = ssG + (y_h(h) - y)^2; + ssSG = ssSG + (y_hi{h}(i) - y_h(h))^2; + ssT = ssT + (y_j(j) - y)^2; + ssGT = ssGT + (y_hj(h,j) - y_h(h) - y_j(j) + y)^2; + ssR = ssR + (X{h}(i,j) - y_hj(h,j) - y_hi{h}(i) + y_h(h))^2; + end + end +end + +% calculate means +if s > 1 + msG = ssG / (s-1); + msGT = ssGT / ((s-1)*(t-1)); +end +msSG = ssSG / (n-s); +msT = ssT / (t-1); +msR = ssR / ((n-s)*(t-1)); + + +% calculate the F-statistics +if s > 1 + FG = msG / msSG; + FGT = msGT / msR; +end +FT = msT / msR; +FSG = msSG / msR; + + +% single or multiple sample designs? +if s > 1 + % case for multiple samples + pG = 1 - fcdf(FG, s-1, n-s); + pT = 1 - fcdf(FT, t-1, (n-s)*(t-1)); + pGT = 1 - fcdf(FGT, (s-1)*(t-1), (n-s)*(t-1)); + pSG = 1 - fcdf(FSG, n-s, (n-s)*(t-1)); + + p = [pT, pG, pSG, pGT]; + + table = { 'Source' 'SS' 'df' 'MS' 'F' 'Prob>F' + 'Time' ssT t-1 msT FT pT + 'Group' ssG s-1 msG FG pG + 'Ineratcion' ssGT (s-1)*(t-1) msGT FGT pGT + 'Subjects (matching)' ssSG n-s msSG FSG pSG + 'Error' ssR (n-s)*(t-1) msR [] [] + 'Total' [] [] [] [] [] + }; + table{end, 2} = sum([table{2:end-1,2}]); + table{end, 3} = sum([table{2:end-1,3}]); + + if (isequal(displayopt, 'on')) + digits = [-1 -1 0 -1 2 4]; + statdisptable(table, 'multi-sample repeated measures ANOVA', 'ANOVA Table', '', digits); + end +else + % case for only one sample + pT = 1 - fcdf(FT, t-1, (n-s)*(t-1)); + pSG = 1 - fcdf(FSG, n-s, (n-s)*(t-1)); + + p = [pT, pSG]; + + table = { 'Source' 'SS' 'df' 'MS' 'F' 'Prob>F' + 'Time' ssT t-1 msT FT pT + 'Subjects (matching)' ssSG n-s msSG FSG pSG + 'Error' ssR (n-s)*(t-1) msR [] [] + 'Total' [] [] [] [] [] + }; + table{end, 2} = sum([table{2:end-1,2}]); + table{end, 3} = sum([table{2:end-1,3}]); + + if (isequal(displayopt, 'on')) + digits = [-1 -1 0 -1 2 4]; + statdisptable(table, 'repeated measures ANOVA', 'ANOVA Table', '', digits); + end +end