## This source code file is part of the analysis of variable protein complexes (VariableComplexes).
## Copyright (C) 2016 Murat Iskar, Alessandro Ori
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
##
## Please check the publication "Spatiotemporal variation of mammalian protein complex stoichiometries", XXX
## and the supplementary website: www.bork.embl.de/variable_complexes
##
## preprocessing_id_matching.R
## v1
## Murat Iskar
## Alessandro Ori
## 20.1.2016
##
##
## input files:
## Hansson_etal_Cell_Rep_2012_reprogramming_proteomic_dataset.csv
## Protein-Complexes-Ensembl_Protein_Ids_Mus_musculus.txt
##
## output files:
## matched-protein-complexes-reprogramming-dataset.txt
## Reprogramming-dataset-only-Protein-complexes.txt

options(max.print = 200)
options(stringsAsFactors = FALSE)

########################################################
########################################################
# INPUTS
########################################################

# DEFINE DATASET, ALREADY LOG2 TRANSFORMED AND WITH ROWNAME = PROTEIN IDs ";" SEPARATED
dataset <- read.delim(
  "input-files/Hansson_etal_Cell_Rep_2012_reprogramming_proteomic_dataset.csv",
  sep = ",", row.names = 1
)
dataset <- as.matrix(dataset)

# GET COMPLEXES DEFINITIONS
comps <- read.csv(
  "input-files/Protein-Complexes-Ensembl_Protein_Ids_Mus_musculus.txt",
  sep = "\t", quote = "", comment.char = "", header = TRUE
)

# DEFINE SAMPLE NAMES
colnames(dataset) <- c(
  "D3.D0.1", "D3.D0.2", "D6.D3.1", "D6.D3.2", "D9.D6.1", "D9.D6.2",
  "D12.D9.1", "D12.D9.2", "iPS.D12.1", "iPS.D12.2"
)

##########################################################
# ID MATCHING
#
# For every subunit of every complex, the ":"-separated IDs listed in the
# "OrthologEnsemblinMusmusculus" column are compared with the ";"-separated
# IDs in the dataset row names. The first dataset row sharing an ID wins:
# both the complex entry and the dataset row name are rewritten to the single
# shared ID. Further rows that also share an ID are only reported.
##########################################################

if (is.null(rownames(dataset)) || nrow(dataset) == 0) {
  stop("The proteomic dataset has no rows (or no row names) to match against.",
       call. = FALSE)
}

comp.sel <- as.character(unique(comps$ProteinComplex))

# Matched complex rows are collected in a list and bound once after the loop
# instead of growing a matrix with rbind() on every iteration.
matched_rows <- list()

# Cache of the split, de-duplicated IDs of every dataset row. It is updated
# whenever a row name is rewritten below, so it always mirrors
# strsplit(rownames(dataset), ";").
row_ids <- lapply(strsplit(rownames(dataset), ";"), unique)

for (h in seq_along(comp.sel)) {
  print(h)
  subcomp <- as.matrix(comps[comps$ProteinComplex == comp.sel[h], ])
  subunitRedundantNames <- subcomp[, "OrthologEnsemblinMusmusculus"]

  for (m in seq_along(subunitRedundantNames)) {
    subunitname <- unique(unlist(strsplit(as.character(subunitRedundantNames[m]), ":")))

    # Indices of all dataset rows sharing at least one ID with this subunit.
    hits <- which(vapply(
      row_ids,
      function(ids) any(subunitname %in% ids),
      logical(1)
    ))

    if (length(hits) > 0) {
      # The first matching row (in dataset order) defines the match.
      idmatch <- hits[1]
      matched_id <- subunitname[subunitname %in% row_ids[[idmatch]]][1]

      # Any additional matching rows are reported but otherwise ignored.
      for (s in hits[-1]) {
        print(c("reporting redundantcase in name matching:", h, m, s, subunitname))
      }

      subcomp[m, "OrthologEnsemblinMusmusculus"] <- matched_id
      # NOTE(review): the row name is overwritten with the single matched ID,
      # so subunits processed later are compared against that ID only, not
      # against the original ";"-separated list. Matching is therefore
      # order-dependent; kept as in the original analysis.
      rownames(dataset)[idmatch] <- matched_id
      row_ids[[idmatch]] <- unique(unlist(strsplit(matched_id, ";")))

      matched_rows[[length(matched_rows) + 1L]] <- subcomp[m, ]
    }
  }
}

comps2 <- do.call(rbind, matched_rows)
if (is.null(comps2)) {
  stop("No protein complex subunit could be matched to the proteomic dataset.",
       call. = FALSE)
}

# we only retain complexes having at least 5 members quantified.
comp.count <- table(comps2[, "ProteinComplex"])
compsmin5members <- comps2[
  comps2[, "ProteinComplex"] %in% names(comp.count)[comp.count >= 5],
  ,
  drop = FALSE
]

# Finally, we also get the subset of the proteomics dataset that matches with the protein complexes.
# NOTE(review): column 3 is presumably "OrthologEnsemblinMusmusculus" (the
# column rewritten to the matched ID above) - confirm against the input file.
ProteinCompDataset <- dataset[
  rownames(dataset) %in% compsmin5members[, 3],
  ,
  drop = FALSE
]

# writing output files that will be used in further analysis.
write.table(
  compsmin5members,
  "output-files/matched-protein-complexes-reprogramming-dataset.txt",
  sep = "\t", quote = FALSE, row.names = FALSE
)
write.table(
  ProteinCompDataset,
  "output-files/Reprogramming-dataset-only-Protein-complexes.txt",
  sep = "\t", quote = FALSE
)

# Record the R session (version and loaded packages) for reproducibility.
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")