##  This source code file is part of the analysis of variable protein complexes (VariableComplexes).
##  Copyright (C) 2016  Murat Iskar, Alessandro Ori
## 
##  This program is free software: you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation, either version 3 of the License, or
##  (at your option) any later version.
## 
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
## 
##  You should have received a copy of the GNU General Public License
##  along with this program.  If not, see <http://www.gnu.org/licenses/>.
##
##  Please check the publication "Spatiotemporal variation of mammalian protein complex stoichiometries", XXX
##  and the supplementary website: www.bork.embl.de/variable_complexes
##
## mRNA_co_expression.R
## v1
## Murat Iskar <muratiskar at gmail.com>
## Alessandro Ori <alessandro.ori at embl.de> 
##  20.1.2016
## 
## 
## input files: 
## affy-Ensembl-Uniprot-links.RData
## matched-protein-complexes-11-cell-line-dataset.txt
## raw-CEL-files folder
## selected-profiles.txt
##
## output files: 
## 10-cell-lines-expression-matrix.txt
## higher-mRNA-co-expression-complex.pdf
## 

options(max.print=200)
options(stringsAsFactors=FALSE)

library(affy)
library(ggplot2)

# input files and RMA normalize the microarray data:
#check file for the input-files/selected-profiles.txt for the randomly picked microarray data available from GEO (HG-U133A) as representatives.
setwd("input-files/raw-CEL-files/")
microarraydata = ReadAffy()
setwd("../../")
eset = rma(microarraydata)
expmat = exprs(eset)
write.exprs(eset, file="output-files/10-cell-lines-expression-matrix.txt", sep="\t")

###################################
# reading protein complex definitions:
load("input-files/affy-Ensembl-Uniprot-links.RData")
# replacing affy names with the uniprot ids:
# extracting probes
pr = rownames(expmat)

# in case of multiple matches per uniprot id, we will select the probe with highest standard deviation.
expmatsd = apply(expmat,1,sd)
names(expmatsd) = rownames(expmat)

#match with proteomics data:
un = unique(affyids[,3])
expmatUniprot = mat.or.vec(nc = 29, nr = length(un))

for(i in 1:length(un))
{
  # for each Uniprot ID, we check the affy id(s) available.
  pr = affyids[affyids[,3]%in%un[i],1]
  # in case of multiple entries per Uniprot ID, we select the one with the highest standard deviation.
  expmatUniprot[i,] = expmat[pr[which.max(expmatsd[pr])],]
}
rownames(expmatUniprot) = un
colnames(expmatUniprot) = colnames(expmat)
# We will use the expmatUniprot matrix for the co-expression analysis.

#############################################################################################
# Now, we start the mRNA co-expression analysis.
# loading the complex definition that was considered for the protein-co-expression analysis:
comps = read.table("input-files/matched-protein-complexes-11-cell-line-dataset.txt",sep="\t",header=TRUE)
# filter complexes that is not represented by affy probes.
comps = comps[comps[,3]%in%un,]
# removing complexes that has lower than 4 members.
comp.count = table(comps[,"ProteinComplex"])
compsFiltered = comps[comps[,"ProteinComplex"]%in%names(comp.count)[comp.count>=4],]

# now, we limit the data matrix to the protein space of complex definition that was quantified in 11 cell line dataset.
expmatUniprot = expmatUniprot[rownames(expmatUniprot)%in%compsFiltered[,3],]

# calculating pearson correlation between all the genes considered as protein members
compcor = cor(t(expmatUniprot),method="pearson",use="pairwise.complete.obs")
# a matrix same size as the correlation matrix
samecomplexmat = compcor
# background value was set to zero, indicating as protein pair are not members of the same complex.
samecomplexmat[] = "different comp."
uniqcomp = unique(comps[,1])
# Gene pairs from the same protein complex were labeled as positive in the samecomplexmat.
for(i in 1:length(uniqcomp))
{
  #retrieve all members of a protein complex
  g = comps[comps[,1]%in%uniqcomp[i],3]
  g = g[g%in%rownames(samecomplexmat)]
  samecomplexmat[g,g]="same comp.";
}
# we exclude the diagonal of the matrix from the plot.
diag(compcor) = NA

# preparing the data frame.
df = data.frame(correlationvalue=c(compcor),fromsamecomplex=as.factor(c(samecomplexmat)))
# plot the data using ggplot.
ggplot(df, aes(x=correlationvalue, fill=fromsamecomplex))+ylab("Density")+xlab("Pearson correlation") + geom_density(alpha=.3)+ theme_bw() +scale_x_continuous(limits=c(-1, 1), breaks=c(-1,-0.5,0,0.5,1)) 
ggsave(filename="output-files/higher-mRNA-co-expression-complex.pdf",height=5,width=8)

writeLines(capture.output(sessionInfo()), "sessionInfo.txt")