## This source code file is part of the analysis of variable protein complexes (VariableComplexes).
## Copyright (C) 2016 Murat Iskar, Alessandro Ori
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
##
## Please check the publication "Spatiotemporal variation of mammalian protein complex stoichiometries", XXX
## and the supplementary website: www.bork.embl.de/variable_complexes
##
## preprocessing_id_matching.R
## v1
## Murat Iskar
## Alessandro Ori
## 20.1.2016
##
##
## input files:
## Hansson_etal_Cell_Rep_2012_reprogramming_proteomic_dataset.csv
## Protein-Complexes-Ensembl_Protein_Ids_Mus_musculus.txt
##
## output files:
## matched-protein-complexes-reprogramming-dataset.txt
## Reprogramming-dataset-only-Protein-complexes.txt
# Session-wide settings: cap console printing at 200 entries and keep
# character columns as plain strings when tables are read.
options(max.print = 200, stringsAsFactors = FALSE)

# ---- Inputs ----

# Proteomics dataset. Per the original script notes the values are already
# log2 transformed; the first CSV column supplies the row names, which hold
# the ";"-separated protein IDs of each entry.
dataset <- as.matrix(
  read.csv(
    "input-files/Hansson_etal_Cell_Rep_2012_reprogramming_proteomic_dataset.csv",
    row.names = 1
  )
)

# Protein complex definitions: tab-separated with a header line, read with
# quote and comment-character handling switched off.
comps <- read.delim(
  "input-files/Protein-Complexes-Ensembl_Protein_Ids_Mus_musculus.txt",
  quote = ""
)

# Replace the column headers of the dataset with the ten sample names.
colnames(dataset) <- c(
  "D3.D0.1", "D3.D0.2",
  "D6.D3.1", "D6.D3.2",
  "D9.D6.1", "D9.D6.2",
  "D12.D9.1", "D12.D9.2",
  "iPS.D12.1", "iPS.D12.2"
)
##########################################################
# ---- Match complex subunits to dataset rows ----
#
# For every complex, each subunit's ":"-separated list of candidate IDs is
# compared against the ";"-separated IDs held in the row names of `dataset`.
# The first dataset row (in row order) sharing at least one ID is taken as
# the match; further rows that also share an ID are only reported on the
# console. On a match, the subunit entry and the dataset row name are both
# collapsed to the single shared ID and the subunit row is kept for `comps2`.
#
# NOTE(review): row names are collapsed while the loop is still running, so
# a later subunit can only hit an already-renamed row through the ID that
# was kept. The result therefore depends on processing order; this is the
# original behaviour and is preserved here.
comp.sel <- as.character(unique(comps$ProteinComplex))

# IDs carried by each dataset row, split once up front instead of once per
# (subunit, row) pair. A row's entry is refreshed whenever that row is
# renamed below, so the list always mirrors the current row names.
row_ids <- lapply(strsplit(as.character(rownames(dataset)), ";"), unique)

# Matched subunit rows are gathered in a list and bound once at the end;
# calling rbind() inside the loop re-copies the growing matrix every time.
matched_rows <- vector("list", nrow(comps))
n_matched <- 0L

# seq_along() (rather than 1:length()) keeps the loops from running on
# empty input.
for (h in seq_along(comp.sel)) {
  print(h)
  subcomp <- as.matrix(comps[comps$ProteinComplex == comp.sel[h], ])
  subunitRedundantNames <- subcomp[, "OrthologEnsemblinMusmusculus"]

  for (m in seq_along(subunitRedundantNames)) {
    subunitname <- unique(unlist(
      strsplit(as.character(subunitRedundantNames[m]), ":")
    ))

    # Indices of all dataset rows sharing at least one ID with this subunit,
    # in increasing row order.
    hit_rows <- which(vapply(
      row_ids,
      function(ids) any(subunitname %in% ids),
      logical(1),
      USE.NAMES = FALSE
    ))
    if (length(hit_rows) == 0) {
      next
    }

    # The first hit wins; the kept ID is the first candidate (in the order
    # listed for the subunit) that the winning row carries.
    idmatch <- hit_rows[1]
    matched_id <- subunitname[subunitname %in% row_ids[[idmatch]]][1]

    # Any additional hit is ambiguous and is only reported.
    for (s in hit_rows[-1]) {
      print(c("reporting redundantcase in name matching:", h, m, s, subunitname))
    }

    subcomp[m, "OrthologEnsemblinMusmusculus"] <- matched_id
    rownames(dataset)[idmatch] <- matched_id
    # Keep the cached split in step with the renamed row (assigned as a
    # one-element list so a zero-length split cannot delete the entry).
    row_ids[idmatch] <- list(unique(strsplit(matched_id, ";")[[1]]))

    n_matched <- n_matched + 1L
    matched_rows[[n_matched]] <- subcomp[m, ]
  }
}

# One row per matched subunit; NULL when nothing matched.
comps2 <- do.call(rbind, matched_rows[seq_len(n_matched)])
# ---- Filter complexes and write outputs ----

# We only retain complexes having at least 5 members quantified.
# drop = FALSE keeps the result a matrix whatever the number of rows kept.
comp.count <- table(comps2[, "ProteinComplex"])
compsmin5members <- comps2[
  comps2[, "ProteinComplex"] %in% names(comp.count)[comp.count >= 5], ,
  drop = FALSE
]

# Finally, we also get the subset of the proteomics dataset that matches with
# the protein complexes. The lookup uses the subunit ID column by name (the
# same column the matching step reads and writes) instead of a fixed column
# position, so it does not depend on the column order of the input table.
ProteinCompDataset <- dataset[
  rownames(dataset) %in% compsmin5members[, "OrthologEnsemblinMusmusculus"], ,
  drop = FALSE
]

# Writing output files that will be used in further analysis. The output
# directory is created first if it does not exist, so the run does not fail
# at the very last step.
dir.create("output-files", showWarnings = FALSE)
write.table(
  compsmin5members,
  "output-files/matched-protein-complexes-reprogramming-dataset.txt",
  sep = "\t", quote = FALSE, row.names = FALSE
)
write.table(
  ProteinCompDataset,
  "output-files/Reprogramming-dataset-only-Protein-complexes.txt",
  sep = "\t", quote = FALSE
)

# Record the R session details (written to the working directory).
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")