## This source code file is part of the analysis of variable protein complexes (VariableComplexes). ## Copyright (C) 2016 Murat Iskar, Alessandro Ori ## ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program. If not, see . ## ## Please check the publication "Spatiotemporal variation of mammalian protein complex stoichiometries", XXX ## and the supplementary website: www.bork.embl.de/variable_complexes ## ## mRNA_co_expression.R ## v1 ## Murat Iskar ## Alessandro Ori ## 20.1.2016 ## ## ## input files: ## affy-Ensembl-Uniprot-links.RData ## matched-protein-complexes-11-cell-line-dataset.txt ## raw-CEL-files folder ## selected-profiles.txt ## ## output files: ## 10-cell-lines-expression-matrix.txt ## higher-mRNA-co-expression-complex.pdf ## options(max.print=200) options(stringsAsFactors=FALSE) library(affy) library(ggplot2) # input files and RMA normalize the microarray data: #check file for the input-files/selected-profiles.txt for the randomly picked microarray data available from GEO (HG-U133A) as representatives. setwd("input-files/raw-CEL-files/") microarraydata = ReadAffy() setwd("../../") eset = rma(microarraydata) expmat = exprs(eset) write.exprs(eset, file="output-files/10-cell-lines-expression-matrix.txt", sep="\t") ################################### # reading protein complex definitions: load("input-files/affy-Ensembl-Uniprot-links.RData") # replacing affy names with the uniprot ids: # extracting probes pr = rownames(expmat) # in case of multiple matches per uniprot id, we will select the probe with highest standard deviation. expmatsd = apply(expmat,1,sd) names(expmatsd) = rownames(expmat) #match with proteomics data: un = unique(affyids[,3]) expmatUniprot = mat.or.vec(nc = 29, nr = length(un)) for(i in 1:length(un)) { # for each Uniprot ID, we check the affy id(s) available. pr = affyids[affyids[,3]%in%un[i],1] # in case of multiple entries per Uniprot ID, we select the one with the highest standard deviation. expmatUniprot[i,] = expmat[pr[which.max(expmatsd[pr])],] } rownames(expmatUniprot) = un colnames(expmatUniprot) = colnames(expmat) # We will use the expmatUniprot matrix for the co-expression analysis. ############################################################################################# # Now, we start the mRNA co-expression analysis. # loading the complex definition that was considered for the protein-co-expression analysis: comps = read.table("input-files/matched-protein-complexes-11-cell-line-dataset.txt",sep="\t",header=TRUE) # filter complexes that is not represented by affy probes. comps = comps[comps[,3]%in%un,] # removing complexes that has lower than 4 members. comp.count = table(comps[,"ProteinComplex"]) compsFiltered = comps[comps[,"ProteinComplex"]%in%names(comp.count)[comp.count>=4],] # now, we limit the data matrix to the protein space of complex definition that was quantified in 11 cell line dataset. expmatUniprot = expmatUniprot[rownames(expmatUniprot)%in%compsFiltered[,3],] # calculating pearson correlation between all the genes considered as protein members compcor = cor(t(expmatUniprot),method="pearson",use="pairwise.complete.obs") # a matrix same size as the correlation matrix samecomplexmat = compcor # background value was set to zero, indicating as protein pair are not members of the same complex. samecomplexmat[] = "different comp." uniqcomp = unique(comps[,1]) # Gene pairs from the same protein complex were labeled as positive in the samecomplexmat. for(i in 1:length(uniqcomp)) { #retrieve all members of a protein complex g = comps[comps[,1]%in%uniqcomp[i],3] g = g[g%in%rownames(samecomplexmat)] samecomplexmat[g,g]="same comp."; } # we exclude the diagonal of the matrix from the plot. diag(compcor) = NA # preparing the data frame. df = data.frame(correlationvalue=c(compcor),fromsamecomplex=as.factor(c(samecomplexmat))) # plot the data using ggplot. ggplot(df, aes(x=correlationvalue, fill=fromsamecomplex))+ylab("Density")+xlab("Pearson correlation") + geom_density(alpha=.3)+ theme_bw() +scale_x_continuous(limits=c(-1, 1), breaks=c(-1,-0.5,0,0.5,1)) ggsave(filename="output-files/higher-mRNA-co-expression-complex.pdf",height=5,width=8) writeLines(capture.output(sessionInfo()), "sessionInfo.txt")