## This source code file is part of the analysis of variable protein complexes (VariableComplexes). ## Copyright (C) 2016 Murat Iskar, Alessandro Ori ## ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program. If not, see . ## ## Please check the publication "Spatiotemporal variation of mammalian protein complex stoichiometries", XXX ## and the supplementary website: www.bork.embl.de/variable_complexes ## ## identifying_differentially_expressed_stoichiometric_members.R ## v1 ## Murat Iskar ## Alessandro Ori ## 20.1.2016 ## ## ## input files: ## 11-cell-line-dataset-only-Protein-complexes.txt ## matched-protein-complexes-11-cell-line-dataset.txt ## ## output files: ## 11-cell-lines-limma-results-adj-P-values.txt ## 11-cell-lines-limma-results-fold-changes.txt ## options(max.print=200) options(stringsAsFactors=FALSE) library("limma") # reading in the proteomics dataset subsetted to the space of protein complexes: dataset = read.table("input-files/11-cell-line-dataset-only-Protein-complexes.txt",sep="\t") # reading definitions of the protein complexes comps = read.table("input-files/matched-protein-complexes-11-cell-line-dataset.txt",header=TRUE,sep="\t") # we will generate a normalized protein expression of complex members # by taking into account the average expression of the corresponding complex. datasetNormalized = matrix(data=NA,nrow=length(dataset[,1]),ncol=length(dataset[1,])) for(i in 1:length(dataset[,1])) { # we process each protein quantified in the whole proteomic data matrix prot = rownames(dataset)[i] # complex stores the info of which complexes is this protein member of? complex = comps[comps[,3]%in%prot,1] # the average expression of protein complexes that we will use later to normalize the expression of the protein. averageexp = c() # for each complex, we calculate the (trimmed) average expression of protein members. for(s in 1:length(complex)) { # l contains all the members of the protein complex in interest. l = comps[comps[,1]%in%complex[s],3] # we exclude the protein that will be normalized. l = l[!l%in%prot] # sm, small matrix contains the proteomic expression of the protein complex (excluding the protein in interest.) sm = dataset[rownames(dataset)%in%l,] # calculating the trimmed mean for each complex. sm2 = (sm-apply(sm,1,function(x) mean(x,na.rm=TRUE,trim=0.25))) # Due to missing values, if there are less than 4 members to estimate the trimmed mean, it was set to NA and excluded from further analysis. topna = apply(!is.na(sm2),2,sum) sm2[,topna<4] = NA # in case of redundant proteins that are members of multiple protein complexes, # trimmed means for each complex were stored in averageexp. averageexp = rbind(averageexp,apply(sm2,2,function(x) mean(x,na.rm=TRUE))) } # As a means of normalization, from the expression of the protein, # we substract the average expression of the rest of the complex members. # If the protein is part of multiple complexes, the average is calculated from the trimmed means of all complexes. datasetNormalized[i,] = t(dataset[i,]-apply(averageexp,2,function(x) mean(x,na.rm=TRUE))) } # carrying the protein IDS and the sample information to the normalized matrix. rownames(datasetNormalized) = rownames(dataset) colnames(datasetNormalized) = colnames(dataset) # Each cell line will be tested against the rest of the cell lines. # Due to 11 cell lines, we will perform 11 comparisons. # P values (pval) and fold changes (fc) were kept for these comparisons. pval = matrix(data=NA,nrow=length(datasetNormalized[,1]),ncol=11) fc = matrix(data=NA,nrow=length(datasetNormalized[,1]),ncol=11) for(i in 1:11) { # preparing the design matrix by considering replicate samples as well as samples that were filtered as being outliers. Treatment = rep(0,33) # labeling which samples will be tested against. ch=((i*3):(i*3+2))-2 Treatment[ch] = 1 # removing outlier samples from the design matrix. design = model.matrix(~ -1+factor(Treatment[c(-3,-21,-30)])) # renaming the design matrix. colnames(design) = c("control","treatment") fit = lmFit(datasetNormalized, design=design) # Fit the original matrix to the above design. contrastsMatrix = makeContrasts("control-treatment",levels = design) fit2 = contrasts.fit(fit, contrasts = contrastsMatrix) # Making the comparison. fit2 = eBayes(fit2) # Moderating the t-tests by empirical Bayes smoothing method. # extracting the results: a = topTable(fit2, coef = "control-treatment",number=length(datasetNormalized[,1]),adjust.method="fdr") pval[,i] = a[rownames(datasetNormalized),5] fc[,i] = a[rownames(datasetNormalized),2] } # renaming the row and column names for the limma results ensemblid=comps[,2] names(ensemblid)=comps[,3] celllinespval = pval rownames(celllinespval) = ensemblid[rownames(datasetNormalized)] colnames(celllinespval) = c("A549","GAMG", "HEK293", "HeLa", "HepG2", "Jurkat", "K562", "LnCap", "MCF7", "RKO", "U2OS") celllinesfc = fc rownames(celllinesfc) = ensemblid[rownames(datasetNormalized)] colnames(celllinesfc) = c("A549","GAMG", "HEK293", "HeLa", "HepG2", "Jurkat", "K562", "LnCap", "MCF7", "RKO", "U2OS") # generating the output files: write.table(celllinespval,"output-files/11-cell-lines-limma-results-adj-P-values.txt",sep="\t",quote=FALSE) write.table(celllinesfc,"output-files/11-cell-lines-limma-results-fold-changes.txt",sep="\t",quote=FALSE) writeLines(capture.output(sessionInfo()), "sessionInfo.txt")