## This source code file is part of the analysis of variable protein complexes (VariableComplexes).
## Copyright (C) 2016 Murat Iskar, Alessandro Ori
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see .
##
## Please check the publication "Spatiotemporal variation of mammalian protein complex stoichiometries", XXX
## and the supplementary website: www.bork.embl.de/variable_complexes
##
## mRNA_co_expression.R
## v1
## Murat Iskar
## Alessandro Ori
## 20.1.2016
##
##
## input files:
## affy-Ensembl-Uniprot-links.RData
## matched-protein-complexes-11-cell-line-dataset.txt
## raw-CEL-files folder
## selected-profiles.txt
##
## output files:
## 10-cell-lines-expression-matrix.txt
## higher-mRNA-co-expression-complex.pdf
##
options(max.print=200)
options(stringsAsFactors=FALSE)
library(affy)
library(ggplot2)
# input files and RMA normalize the microarray data:
#check file for the input-files/selected-profiles.txt for the randomly picked microarray data available from GEO (HG-U133A) as representatives.
setwd("input-files/raw-CEL-files/")
microarraydata = ReadAffy()
setwd("../../")
eset = rma(microarraydata)
expmat = exprs(eset)
write.exprs(eset, file="output-files/10-cell-lines-expression-matrix.txt", sep="\t")
###################################
# reading protein complex definitions:
load("input-files/affy-Ensembl-Uniprot-links.RData")
# replacing affy names with the uniprot ids:
# extracting probes
pr = rownames(expmat)
# in case of multiple matches per uniprot id, we will select the probe with highest standard deviation.
expmatsd = apply(expmat,1,sd)
names(expmatsd) = rownames(expmat)
#match with proteomics data:
un = unique(affyids[,3])
expmatUniprot = mat.or.vec(nc = 29, nr = length(un))
for(i in 1:length(un))
{
# for each Uniprot ID, we check the affy id(s) available.
pr = affyids[affyids[,3]%in%un[i],1]
# in case of multiple entries per Uniprot ID, we select the one with the highest standard deviation.
expmatUniprot[i,] = expmat[pr[which.max(expmatsd[pr])],]
}
rownames(expmatUniprot) = un
colnames(expmatUniprot) = colnames(expmat)
# We will use the expmatUniprot matrix for the co-expression analysis.
#############################################################################################
# Now, we start the mRNA co-expression analysis.
# loading the complex definition that was considered for the protein-co-expression analysis:
comps = read.table("input-files/matched-protein-complexes-11-cell-line-dataset.txt",sep="\t",header=TRUE)
# filter complexes that is not represented by affy probes.
comps = comps[comps[,3]%in%un,]
# removing complexes that has lower than 4 members.
comp.count = table(comps[,"ProteinComplex"])
compsFiltered = comps[comps[,"ProteinComplex"]%in%names(comp.count)[comp.count>=4],]
# now, we limit the data matrix to the protein space of complex definition that was quantified in 11 cell line dataset.
expmatUniprot = expmatUniprot[rownames(expmatUniprot)%in%compsFiltered[,3],]
# calculating pearson correlation between all the genes considered as protein members
compcor = cor(t(expmatUniprot),method="pearson",use="pairwise.complete.obs")
# a matrix same size as the correlation matrix
samecomplexmat = compcor
# background value was set to zero, indicating as protein pair are not members of the same complex.
samecomplexmat[] = "different comp."
uniqcomp = unique(comps[,1])
# Gene pairs from the same protein complex were labeled as positive in the samecomplexmat.
for(i in 1:length(uniqcomp))
{
#retrieve all members of a protein complex
g = comps[comps[,1]%in%uniqcomp[i],3]
g = g[g%in%rownames(samecomplexmat)]
samecomplexmat[g,g]="same comp.";
}
# we exclude the diagonal of the matrix from the plot.
diag(compcor) = NA
# preparing the data frame.
df = data.frame(correlationvalue=c(compcor),fromsamecomplex=as.factor(c(samecomplexmat)))
# plot the data using ggplot.
ggplot(df, aes(x=correlationvalue, fill=fromsamecomplex))+ylab("Density")+xlab("Pearson correlation") + geom_density(alpha=.3)+ theme_bw() +scale_x_continuous(limits=c(-1, 1), breaks=c(-1,-0.5,0,0.5,1))
ggsave(filename="output-files/higher-mRNA-co-expression-complex.pdf",height=5,width=8)
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")