## This source code file is part of the analysis of variable protein complexes (VariableComplexes).
## Copyright (C) 2016 Murat Iskar, Alessandro Ori
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see .
##
## Please check the publication "Spatiotemporal variation of mammalian protein complex stoichiometries", XXX
## and the supplementary website: www.bork.embl.de/variable_complexes
##
## identifying_differentially_expressed_stoichiometric_members.R
## v1
## Murat Iskar
## Alessandro Ori
## 20.1.2016
##
##
## input files:
## 11-cell-line-dataset-only-Protein-complexes.txt
## matched-protein-complexes-11-cell-line-dataset.txt
##
## output files:
## 11-cell-lines-limma-results-adj-P-values.txt
## 11-cell-lines-limma-results-fold-changes.txt
##
options(max.print=200)
options(stringsAsFactors=FALSE)
library("limma")
# reading in the proteomics dataset subsetted to the space of protein complexes:
dataset = read.table("input-files/11-cell-line-dataset-only-Protein-complexes.txt",sep="\t")
# reading definitions of the protein complexes
comps = read.table("input-files/matched-protein-complexes-11-cell-line-dataset.txt",header=TRUE,sep="\t")
# we will generate a normalized protein expression of complex members
# by taking into account the average expression of the corresponding complex.
datasetNormalized = matrix(data=NA,nrow=length(dataset[,1]),ncol=length(dataset[1,]))
for(i in 1:length(dataset[,1]))
{
# we process each protein quantified in the whole proteomic data matrix
prot = rownames(dataset)[i]
# complex stores the info of which complexes is this protein member of?
complex = comps[comps[,3]%in%prot,1]
# the average expression of protein complexes that we will use later to normalize the expression of the protein.
averageexp = c()
# for each complex, we calculate the (trimmed) average expression of protein members.
for(s in 1:length(complex))
{
# l contains all the members of the protein complex in interest.
l = comps[comps[,1]%in%complex[s],3]
# we exclude the protein that will be normalized.
l = l[!l%in%prot]
# sm, small matrix contains the proteomic expression of the protein complex (excluding the protein in interest.)
sm = dataset[rownames(dataset)%in%l,]
# calculating the trimmed mean for each complex.
sm2 = (sm-apply(sm,1,function(x) mean(x,na.rm=TRUE,trim=0.25)))
# Due to missing values, if there are less than 4 members to estimate the trimmed mean, it was set to NA and excluded from further analysis.
topna = apply(!is.na(sm2),2,sum)
sm2[,topna<4] = NA
# in case of redundant proteins that are members of multiple protein complexes,
# trimmed means for each complex were stored in averageexp.
averageexp = rbind(averageexp,apply(sm2,2,function(x) mean(x,na.rm=TRUE)))
}
# As a means of normalization, from the expression of the protein,
# we substract the average expression of the rest of the complex members.
# If the protein is part of multiple complexes, the average is calculated from the trimmed means of all complexes.
datasetNormalized[i,] = t(dataset[i,]-apply(averageexp,2,function(x) mean(x,na.rm=TRUE)))
}
# carrying the protein IDS and the sample information to the normalized matrix.
rownames(datasetNormalized) = rownames(dataset)
colnames(datasetNormalized) = colnames(dataset)
# Each cell line will be tested against the rest of the cell lines.
# Due to 11 cell lines, we will perform 11 comparisons.
# P values (pval) and fold changes (fc) were kept for these comparisons.
pval = matrix(data=NA,nrow=length(datasetNormalized[,1]),ncol=11)
fc = matrix(data=NA,nrow=length(datasetNormalized[,1]),ncol=11)
for(i in 1:11)
{
# preparing the design matrix by considering replicate samples as well as samples that were filtered as being outliers.
Treatment = rep(0,33)
# labeling which samples will be tested against.
ch=((i*3):(i*3+2))-2
Treatment[ch] = 1
# removing outlier samples from the design matrix.
design = model.matrix(~ -1+factor(Treatment[c(-3,-21,-30)]))
# renaming the design matrix.
colnames(design) = c("control","treatment")
fit = lmFit(datasetNormalized, design=design) # Fit the original matrix to the above design.
contrastsMatrix = makeContrasts("control-treatment",levels = design)
fit2 = contrasts.fit(fit, contrasts = contrastsMatrix) # Making the comparison.
fit2 = eBayes(fit2) # Moderating the t-tests by empirical Bayes smoothing method.
# extracting the results:
a = topTable(fit2, coef = "control-treatment",number=length(datasetNormalized[,1]),adjust.method="fdr")
pval[,i] = a[rownames(datasetNormalized),5]
fc[,i] = a[rownames(datasetNormalized),2]
}
# renaming the row and column names for the limma results
ensemblid=comps[,2]
names(ensemblid)=comps[,3]
celllinespval = pval
rownames(celllinespval) = ensemblid[rownames(datasetNormalized)]
colnames(celllinespval) = c("A549","GAMG", "HEK293", "HeLa", "HepG2", "Jurkat", "K562", "LnCap", "MCF7", "RKO", "U2OS")
celllinesfc = fc
rownames(celllinesfc) = ensemblid[rownames(datasetNormalized)]
colnames(celllinesfc) = c("A549","GAMG", "HEK293", "HeLa", "HepG2", "Jurkat", "K562", "LnCap", "MCF7", "RKO", "U2OS")
# generating the output files:
write.table(celllinespval,"output-files/11-cell-lines-limma-results-adj-P-values.txt",sep="\t",quote=FALSE)
write.table(celllinesfc,"output-files/11-cell-lines-limma-results-fold-changes.txt",sep="\t",quote=FALSE)
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")