## This source code file is part of the analysis of variable protein complexes (VariableComplexes).
## Copyright (C) 2016 Murat Iskar, Alessandro Ori
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see .
##
## Please check the publication "Spatiotemporal variation of mammalian protein complex stoichiometries", XXX
## and the supplementary website: www.bork.embl.de/variable_complexes
##
## preprocessing_id_matching.R
## v1
## Murat Iskar
## Alessandro Ori
## 20.1.2016
##
##
## input files:
## Geiger_etal_Mol_Cell_Proteomics_2012_11_cell_lines_proteomic_dataset.csv
## Protein-Complexes-UniprotIds.txt
##
## output files:
## 11-cell-line-dataset-only-Protein-complexes.txt
## matched-protein-complexes-11-cell-line-dataset.txt
## Outlier-detection-correlation-of-samples.pdf
##
options(max.print=200)
options(stringsAsFactors=FALSE)
# to use heatmap.2 function
library("gplots")
########################################################
########################################################
# INPUTS
########################################################
# proteomic dataset, log2 normalized, rownames = protein ids ";" separated
dataset<-read.delim("input-files/Geiger_etal_Mol_Cell_Proteomics_2012_11_cell_lines_proteomic_dataset.csv", sep=",", row.names=1,quote="",comment.char="")
dataset<-as.matrix(dataset)
# reading in complex definitions
comps<-read.table("input-files/Protein-Complexes-UniprotIds.txt",sep="\t",quote="",comment.char="",header=TRUE)
##########
# missing value (NA) removal function
##########
#ns: number of samples
#nrep: number of replicates per sample
#maxna: maximum number of missing values allowed per sample
na.removal<-function(data,ns,nrep,maxna){
NAcounts<-NULL
p<-seq(1,(nrep*ns),by=nrep)
for(i in 1:ns){
tmp=apply(is.na(data[,p[i]:(p[i]+nrep-1)]),1,sum)
NAcounts<-cbind(NAcounts,tmp)
}
datafiltered = data[apply(NAcounts,1,function(x) all(x <=maxna)),]
datafiltered
}
# Filtering quantified proteins if there are multiple missing values in any cell line.
#defining number of samples
ns = 11
#number of replicates per sample
nrep = 3
#max number of missing value allowed per sample
maxna = 1
# filtering of proteins with missing values in any of the samples
dataset = na.removal(dataset,ns,nrep,maxna)
correlation = cor(dataset, method="pearson",use="pairwise.complete.obs")
pdf("output-files/Outlier-detection-correlation-of-samples.pdf", width=11, height=11)
heatmap.2(correlation,trace = "none",density.info="none", cexRow=0.9, cexCol=0.9,
colsep=c(seq(1,(nrep*ns),1)),
rowsep=c(seq(1,(nrep*ns),1)),
sepcolor='white',sepwidth=c(0.0125,0.02))
dev.off()
# samples A549.3 (3rd position), RKO.3 (30) and K562.3 (21) were identified as outliers and were discarded from further analysis.
dataset = dataset[,c(-3,-21,-30)]
#preprocessing of the proteomics profiles
# profiles were centered around their trimmed mean.
medpoint = apply(dataset,2,function(x) mean(x,trim=0.5,na.rm=TRUE))
dataset = t(t(dataset)-medpoint)
##########################################################
comp.sel<-as.character(unique(comps$ProteinComplex))
comps2<-c()
data2<-c()
for(h in 1:length(comp.sel))
{
print(h)
subcomp = as.matrix(comps[comps$ProteinComplex==comp.sel[h],])
subunitRedundantNames = subcomp[,"UniProtIDS"]
for(m in 1:length(subunitRedundantNames))
{
subunitname = unique(unlist(strsplit(as.character(subunitRedundantNames[m]),":")))
match = c()
idmatch = c()
for(s in 1:length(dataset[,1]))
{
subjectids = unique(unlist(strsplit(rownames(dataset)[s],";")))
if(sum(subunitname%in%subjectids))
{
if(length(match)==0)
{
match = subunitname[subunitname%in%subjectids][1];
idmatch = s;
}
else
{
print(c("reporting redundantcase in name matching:",h,m,s,subunitname))
}
}
}
if(length(match)>0)
{
subcomp[m,"UniProtIDS"] = match;
rownames(dataset)[idmatch] = match;
comps2 = rbind(comps2,subcomp[m,]);
}
}
}
# we only retain complexes having at least 5 members quantified.
comp.count = table(comps2[,"ProteinComplex"])
compsmin5members = comps2[comps2[,"ProteinComplex"]%in%names(comp.count)[comp.count>=5],]
# Finally, we also get the subset of the proteomics dataset that matches with the protein complexes.
ProteinCompDataset = dataset[rownames(dataset)%in%compsmin5members[,3],]
# writing output files that will be used in further analysis.
write.table(compsmin5members,"output-files/matched-protein-complexes-11-cell-line-dataset.txt",sep="\t",quote=FALSE,row.names=FALSE)
write.table(ProteinCompDataset,"output-files/11-cell-line-dataset-only-Protein-complexes.txt",sep="\t",quote=FALSE)
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")