## This source code file is part of the analysis of variable protein complexes (VariableComplexes). ## Copyright (C) 2016 Murat Iskar, Alessandro Ori ## ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program. If not, see . ## ## Please check the publication "Spatiotemporal variation of mammalian protein complex stoichiometries", XXX ## and the supplementary website: www.bork.embl.de/variable_complexes ## ## identifying_differentially_expressed_stoichiometric_members.R ## v1 ## Murat Iskar ## Alessandro Ori ## 20.1.2016 ## ## ## input files: ## matched-protein-complexes-reprogramming-dataset.txt ## Reprogramming-dataset-only-Protein-complexes.txt ## ## output files: ## reprogramming-limma-results-adj-P-values.txt ## reprogramming-limma-results-fold-changes.txt ## options(max.print=200) options(stringsAsFactors=FALSE) library("limma") # reading in the proteomics dataset subsetted to the space of protein complexes: dataset = read.table("input-files/Reprogramming-dataset-only-Protein-complexes.txt",sep="\t") # reading definitions of the protein complexes comps = read.table("input-files/matched-protein-complexes-reprogramming-dataset.txt",header=TRUE,sep="\t") # we will generate a normalized protein expression of complex members # by taking into account the average expression of the corresponding complex. datasetNormalized = matrix(data=NA,nrow=length(dataset[,1]),ncol=length(dataset[1,])) for(i in 1:length(dataset[,1])) { # we process each protein quantified in the whole proteomic data matrix prot = rownames(dataset)[i] # complex stores the info of which complexes is this protein member of? complex = comps[comps[,3]%in%prot,1] # the average expression of protein complexes that we will use later to normalize the expression of the protein. averageexp = c() # for each complex, we calculate the (trimmed) average expression of protein members. for(s in 1:length(complex)) { # l contains all the members of the protein complex in interest. l = comps[comps[,1]%in%complex[s],3] # we exclude the protein that will be normalized. l = l[!l%in%prot] # sm, small matrix contains the proteomic expression of the protein complex (excluding the protein in interest.) sm = dataset[rownames(dataset)%in%l,] # calculating the trimmed mean for each complex. sm2 = (sm-apply(sm,1,function(x) mean(x,na.rm=TRUE,trim=0.25))) # Due to missing values, if there are less than 4 members to estimate the trimmed mean, it was set to NA and excluded from further analysis. topna = apply(!is.na(sm2),2,sum) sm2[,topna<4] = NA # in case of redundant proteins that are members of multiple protein complexes, # trimmed means for each complex were stored in averageexp. averageexp = rbind(averageexp,apply(sm2,2,function(x) mean(x,na.rm=TRUE))) } # As a means of normalization, from the expression of the protein, # we substract the average expression of the rest of the complex members. # If the protein is part of multiple complexes, the average is calculated from the trimmed means of all complexes. datasetNormalized[i,] = t(dataset[i,]-apply(averageexp,2,function(x) mean(x,na.rm=TRUE))) } # carrying the protein IDS and the sample information to the normalized matrix. rownames(datasetNormalized) = rownames(dataset) colnames(datasetNormalized) = colnames(dataset) # Each time point will be tested against the rest of the time points. # Due to 5 distinct time points, we will perform 5 comparisons. # P values (pval) and fold changes (fc) were saved for these comparisons. pval = matrix(data=NA,nrow=length(datasetNormalized[,1]),ncol=length(datasetNormalized[1,])/2) fc = matrix(data=NA,nrow=length(datasetNormalized[,1]),ncol=length(datasetNormalized[1,])/2) for(i in 1:5) { # preparing the design matrix by considering replicates Treatment=rep(0,10) # labeling which samples will be tested against. ch=((i*2):(i*2+1))-1 Treatment[ch]=1 # renaming the design matrix. design <- model.matrix(~ -1+factor(Treatment)) colnames(design)<-c("control","treatment") fit = lmFit(datasetNormalized, design=design) # Fit the original matrix to the above design. contrastsMatrix = makeContrasts("control-treatment",levels = design) fit2 = contrasts.fit(fit, contrasts = contrastsMatrix) # Making the comparison. fit2 = eBayes(fit2) # Moderating the t-tetst by eBayes method. # extracting the results: a = topTable(fit2, coef = "control-treatment",number = length(datasetNormalized[,1]),adjust.method="fdr") # extracting the FDR adjusted P value pval[,i]<-a[rownames(datasetNormalized),5] # fold changes were also stored for each comparison fc[,i]<-a[rownames(datasetNormalized),2] } # renaming the row and column names for the limma results ensemblid=comps[,2] names(ensemblid)=comps[,3] reprogrammingpval = pval rownames(reprogrammingpval) = ensemblid[rownames(datasetNormalized)] colnames(reprogrammingpval) = c("D3.D0", "D6.D3", "D9.D6", "D12.D9", "iPS.D12") reprogrammingfc = fc rownames(reprogrammingfc) = ensemblid[rownames(datasetNormalized)] colnames(reprogrammingfc) = c("D3.D0", "D6.D3", "D9.D6", "D12.D9", "iPS.D12") # generating the output files: write.table(reprogrammingpval,"output-files/reprogramming-limma-results-adj-P-values.txt",sep="\t",quote=FALSE) write.table(reprogrammingfc,"output-files/reprogramming-limma-results-fold-changes.txt",sep="\t",quote=FALSE) writeLines(capture.output(sessionInfo()), "sessionInfo.txt")