Impact on Proteotype: STEP2

STEP 2: Python Code

									
class file_Loader(object):
	def __init__(self, **kwargs):
		self.folder = kwargs.get('folder','PATH')

		print('load_standard_dataframe_into_memory')
		self.load_standard_dataframe_into_memory()

	@staticmethod
	def load_file(folder, filename):
		return DataFrameAnalyzer.open_in_chunks(folder, filename, compression = 'gzip')

	def load_standard_dataframe_into_memory(self):
		self.gygi3 = file_Loader.load_file(self.folder,"dataset_gygi3_remapped.tsv.gz")
		self.gygi2 = file_Loader.load_file(self.folder,"dataset_gygi2_remapped.tsv.gz")
		self.gygi1 = file_Loader.load_file(self.folder,"dataset_gygi1_remapped.tsv.gz")
		self.battle_protein = file_Loader.load_file(self.folder,"dataset_battle_protein_remapped.tsv.gz")
		self.battle_ribo = file_Loader.load_file(self.folder,"dataset_battle_ribo_remapped.tsv.gz")
		self.battle_rna = file_Loader.load_file(self.folder,"dataset_battle_rna_remapped.tsv.gz")
		self.mann_all_log2 = file_Loader.load_file(self.folder,"dataset_mann_all_log2_remapped.tsv.gz")
		self.wu = file_Loader.load_file(self.folder,"dataset_wu_remapped.tsv.gz")
		self.tiannan = file_Loader.load_file(self.folder,"dataset_tiannan_remapped.tsv.gz")
		self.primatePRO = file_Loader.load_file(self.folder,"dataset_primatePRO_remapped.tsv.gz")
		self.primateRNA = file_Loader.load_file(self.folder,"dataset_primateRNA_remapped.tsv.gz")
		self.tcga_ovarian = file_Loader.load_file(self.folder,"dataset_tcga_ovarian_remapped.tsv.gz")
		self.tcga_breast = file_Loader.load_file(self.folder,"dataset_tcga_breast_remapped.tsv.gz")
		self.bxd_protein  =  file_Loader.load_file(self.folder, 'dataset_bxdMouse_remapped.tsv.gz')
		self.colo_cancer  =  file_Loader.load_file(self.folder, 'dataset_coloCa_remapped.tsv.gz')

	@staticmethod
	def get_quantCols(fileData):
		columnList = list(fileData.columns)
		return utilsFacade.filtering(columnList, 'quant')

class step2_check_normalization:

	@staticmethod
	def execute(wp, **kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		datasets = [wp.gygi1, wp.gygi2, wp.gygi3, wp.battle_protein, wp.battle_ribo,
					wp.battle_rna, wp.mann_all_log2, wp.wu, wp.tiannan, wp.primateRNA,
					wp.primatePRO, wp.tcga_ovarian, wp.tcga_breast, wp.colo_cancer,
					wp.bxd_protein]
		names = ['gygi1','gygi2','gygi3','battle_protein','battle_ribo',
				 'battle_rna','mann_all_log2','wu','tiannan','primateRNA',
				 'primatePRO','tcga_ovarian','tcga_breast','colo_cancer',
				 'bxd_protein']

		for dat,name in zip(datasets, names):
			step2_check_normalization.check_normalization_boxplot(dat, name, folder)

	@staticmethod
	def check_normalization_boxplot(data, name, folder):
		quant_cols = utilsFacade.filtering(list(data.columns),'quant_')
		data = data[quant_cols]

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize =  (10,5))
		ax = fig.add_subplot(111)
		plottingFacade.boxplot(ax, data)
		plt.savefig(folder + 'normalization_check_' + name + '.pdf', bbox_inches = 'tight', dpi =  400)

		#normal distribution of protein abundance within samples ensured
		normal_check_list = list()
		normal_pval_list = list()
		for q in quant_cols:
			w, p = scipy.stats.shapiro(utilsFacade.finite(list(data[q])))
			normal_check_list.append(w)
			normal_pval_list.append(p)
		print(mean(normal_check_list))
		print(mean(normal_pval_list))
		
class step2_complex_filtering(object):
	def __init__(self, wp, **kwargs):
		self.folder = kwargs.get('folder','PATH')

		print("remapping_cycle")
		self.remapping_cycle(wp, self.folder)

	def remapping_cycle(self, wp, folder):

		dataList = [wp.battle_protein, wp.battle_ribo, wp.battle_rna,
					wp.mann_all_log2, wp.primatePRO, wp.primateRNA,
					wp.tiannan, wp.wu, wp.tcga_ovarian, wp.tcga_breast,
					wp.colo_cancer, wp.bxd_protein, wp.gygi1, wp.gygi2,
					wp.gygi3]
		nameList = ["battle_protein", "battle_ribo", "battle_rna",
					"mann_all_log2", "primatePRO", "primateRNA",
					"tiannan", "wu", 'tcga_ovarian', 'tcga_breast',
					'tcga_colo','bxd_protein','gygi1','gygi2','gygi3']

		for d,name in zip(dataList,nameList):
			data = self.reduce_to_complex_proteins(d, name, folder)

	def reduce_to_complex_proteins(self, data, name, folder):
		complex_proteinData = data[data["complex_formal_ID"]>0]
		complex_id_list = list(complex_proteinData["complex_formal_ID"])
		complex_name_list = list(complex_proteinData["complexName"])
		dfList = map(list,complex_proteinData.values)
		idxList = list(complex_proteinData.index)
		new_dfList = list()
		count = 0
		proteinList = list()
		for com_id,com_name in zip(complex_id_list,complex_name_list):
			df = dfList[count]
			com_id = com_id.split(";;")
			com_name = com_name.split(";;")
			for i,n in zip(com_id,com_name):
				tempList = df + [i,n]
				new_dfList.append(tempList)
				proteinList.append(idxList[count])
			count +=  1
		new_df = pd.DataFrame(new_dfList)
		new_df.columns = list(complex_proteinData.columns) + ['ComplexID','ComplexName']
		new_df.index = proteinList
		new_df["complex_search"] = new_df["ComplexID"] + "::" + new_df["ComplexName"]
		complex_lst = list(new_df["complex_search"])
		member_numList = list()
		for complexID in complex_lst:
			sub = new_df[new_df["complex_search"] == complexID]
			member_numList.append(len(sub))
		new_df["member_num"] = pd.Series(member_numList,index = new_df.index)
		'''
		new_df   =   new_df.rename(columns   =   {'complex_formal_ID':'complexID',
												  'complexName':'ComplexName'})
		'''
		new_df.to_csv(folder  +  'complex_filtered_' + name + 'tsv.gz', sep  =  "\t", compression   =   'gzip')
		return new_df

class step2_stoichiometry_preparation(object):
	def __init__(self,com_data,name,**kwargs):
		self.name = name
		self.com_data = com_data
		self.folder = kwargs.get('folder','PATH')
		self.log2_method = kwargs.get("log2_method","yes")
		self.output_folder = kwargs.get('output_folder','PATH')

		print("get_stoichiometry_data")
		self.stoch_data = self.get_stoichiometry_data(com_data)

		print("export_stoichiometry_data")
		self.export_stoichiometry_data()

	def export_stoichiometry_data(self):
		stoch_data = self.stoch_data

		stoch_data.to_csv(folder + self.name + "_remapped_complexStoichiometry.tsv.gz",
						  sep = "\t", compression = 'gzip')

	def get_stoichiometry_data(self, com_data):
		quant_cols = file_Loader.get_quantCols(com_data)
		complex_lst = list(set(com_data["complex_search"]))
		concat_list = list()
		count = 0
		for complex_id in complex_lst:
			print(count,complex_id)
			sub = com_data[com_data["complex_search"] == complex_id]
			quant_sub = sub[quant_cols]
			proteinList = list(quant_sub.index)
			other_dict = dict()
			for protein in proteinList:
				other_complexes = self.check_protein_in_other_complexes(protein,com_data,complex_id)
				if len(other_complexes) > 0:
					other_dict.setdefault(protein,[]).append(other_complexes)
			if len(other_dict) > 0:
				mean_dict = self.get_average_value_from_other_complexes(other_dict,com_data,quant_cols)
			else:
				other_dict = dict()
				mean_dict = dict()
			quant_list = map(list,quant_sub.values)
			trimmed_list = np.array(self.getTrimmedMeanList(quant_list))
			stoch_list,relative_variance = self.calculate_relative_abundances(quant_list,proteinList,trimmed_list,mean_dict,other_dict)
			stoch_data = pd.DataFrame(stoch_list)
			stoch_data.columns = quant_sub.columns
			stoch_data.index = quant_sub.index
			stoch_data["relative_variance"] = pd.Series(relative_variance,index = stoch_data.index)
			additional_columns = list(set(sub.columns).difference(set(stoch_data.columns)))
			for col in additional_columns:
				if col! = "SYMBOL.1.1":
					stoch_data[col] = pd.Series(list(sub[col]),index = stoch_data.index)
			concat_list.append(stoch_data)
			count+ = 1
		concat_stoch = pd.concat(concat_list)
		return concat_stoch

	def calculate_relative_abundances(self, quant_list, proteinList, trimmed_list, mean_dict, other_dict):
		
		log2_method = self.log2_method

		stoch_list = list()
		relative_variance = list()
		for q,prot in zip(quant_list,proteinList):
			if len(other_dict) = 0:
				if log2_method = "no":
					tempList = np.array(q) - trimmed_list
				else:
					ratio_list = numpy.array(q)/numpy.array(trimmed_list)
					ratio_list = np.array([x if x! = 0 else np.nan for x in list(ratio_list)])
					tempList = numpy.log2(ratio_list)
			else:
				get_prot = mean_dict.get(prot,0)
				if get_prot! = 0:
					if log2_method = "no":
						tempList = np.array(q) - np.array(mean_dict[prot][0])
					else:
						ratio_list = numpy.array(q)/numpy.array(mean_dict[prot][0])
						ratio_list = np.array([x if x! = 0 else np.nan for x in list(ratio_list)])
						tempList = numpy.log2(ratio_list)
				else:
					if log2_method = "no":
						tempList = np.array(q) - trimmed_list
					if log2_method = "yes":
						ratio_list = numpy.array(q)/numpy.array(trimmed_list)
						ratio_list = np.array([x if x! = 0 else np.nan for x in list(ratio_list)])
						tempList = numpy.log2(ratio_list)
			stoch_list.append(tempList)
			relative_variance.append(np.var(utilsFacade.finite(tempList)))
		return stoch_list,relative_variance

	def get_average_value_from_other_complexes(self,other_dict,com_data,quant_cols):
		mean_dict = dict()
		for prot in other_dict:
			trim_list = list()
			for com_id in other_dict[prot][0]:
				s = com_data[com_data["complex_search"] =  = com_id]
				quant_s = s[quant_cols]
				quant = map(list,quant_s.values)
				trimmed = self.getTrimmedMeanList(quant)
				trim_list.append(trimmed)
			mean_list = self.getMeanList(trim_list)
			mean_dict.setdefault(prot,[]).append(mean_list)
		return mean_dict

	def getMeanList(self,lists):
		log2_method = self.log2_method
		"""
		input: lists (nested lists)
		This function calculates the medianProfile out of several lists.
		"""	
		meanList = list()
		for i in xrange(len(lists[0])):
		    tempList = list()
		    for l in lists:
		    	if str(l[i])!= "nan":
		    		tempList.append(l[i])
		    meanList.append(numpy.mean(filter(lambda a:str(a)!= "nan",tempList)))
		if log2_method == "yes":
			meanList = [x if x!= 0 else np.nan for x in meanList]
		return meanList

	def getTrimmedMeanList(self,lists):
		log2_method = self.log2_method
		"""
		input: lists (nested lists)
		This function calculates the medianProfile out of several lists.
		""" 
		medianList = list()
		for i in xrange(len(lists[0])):
		    tempList = list()
		    for l in lists:
		    	if str(l[i])!= "nan":
		    		tempList.append(l[i])
		    trimValue = utilsFacade.trimmean(filter(lambda a:str(a)!= "nan",tempList))
		    if trimValue = 0:
		    	medianList.append(np.nan)
		    else:
		    	medianList.append(trimValue)
		if log2_method = "yes":
			medianList = [x if x!=0 else np.nan for x in medianList]
		return medianList

	def check_protein_in_other_complexes(self,protein,com_data,complex_id):
		temp = com_data.loc[protein]
		if type(temp) = pd.DataFrame:
			other_complexes = list(temp["complex_search"])
		else:
			other_complexes = list()
		return other_complexes

class step2_variability_testing_subunits(object):
	def __init__(self,data,name,**kwargs):
		self.data = data
		self.name = name

		print("checking_variability")
		self.concat_stoch = self.checking_variability()

		print("export_concat_stoch")
		self.export_concat_stoch()

	def checking_variability(self):
		stoch_data = self.data

		stoch_data['complex_search'] = stoch_data['ComplexID'] + ':' + stoch_data['ComplexName']
		concatList = list()
		quant_cols = file_Loader.get_quantCols(stoch_data)
		complex_list = list(set(stoch_data["complex_search"]))
		outer_count = 0
		for com in complex_list:
			print(outer_count,com)
			stoch = stoch_data[stoch_data["complex_search"] =  = com]
			sub = stoch[quant_cols].T
			dataList = list()
			proteins = list()
			for p,protein in enumerate(list(sub.columns)):
				d = utilsFacade.finite(list(sub.iloc[:,p]))
				dataList.append(d)
				proteins.append(protein)
			levene_pval_list = list()
			stateList = list()
			coverageList = list()
			count = 0
			for p,d in zip(proteins,dataList):
				others = utilsFacade.flatten(dataList[0:count] + dataList[count+1:])
				try:
					levene_pval_list.append(scipy.stats.levene(d,others)[1])
				except:
					levene_pval_list.append(np.nan)
				coverageList.append((len(d)/len(quant_cols))*100)
				if np.var(d) > np.var(others):
					stateList.append("variable")
				else:
					stateList.append("stable")
				count += 1
			stoch["coverage"] = pd.Series(coverageList,index = stoch.index)
			stoch["levene_pval"] = pd.Series(levene_pval_list,index = stoch.index)
			stoch["state"] = pd.Series(stateList,index = stoch.index)
			concatList.append(stoch)
			outer_count += 1
		concat_stoch = pd.concat(concatList)
		levene = list(concat_stoch["levene_pval"])
		lev = utilsFacade.finite(levene)
		levene_corr = utilsFacade.correct_pvalues(lev)
		adj_list = list()
		for l in levene:
			if str(l)! = "nan":
				idx = lev.index(l)
				adj_list.append(levene_corr[idx])
			else:
				adj_list.append(np.nan)
		concat_stoch["levene_pval.adj"] = pd.Series(adj_list,index = concat_stoch.index)
		return concat_stoch
		
	def export_concat_stoch(self):
		concat_stoch = self.concat_stoch
		concat_stoch.to_csv(self.folder + self.name + "_remapped_complexStoichiometry_withVar.tsv.gz",
							sep = "\t", compression  =  'gzip')

class step2:
	@staticmethod
	def execute(wp, **kwargs):
		folder = kwargs.get('folder','PATH')

		print('run_norm_check')
		step2.run_norm_check(wp, folder)

		print('run_complex_filtering')
		step2.run_complex_filtering(wp, folder)

		print('run_complex_stoichiometry_preparation')
		step2.run_complex_stoichiometry_preparation(wp, folder)

	@staticmethod
	def run_norm_check(wp, folder):
		step2_check_normalization(wp, folder = folder)

	@staticmethod
	def run_complex_filtering(wp, folder):
		step2_complex_filtering(wp, folder = folder)

	@staticmethod
	def run_complex_stoichiometry_preparation(wp, folder):
		nameList  =  ["battle_protein", "battle_ribo", "battle_rna",
					  "mann_all_log2", "primatePRO", "primateRNA",
					  "tiannan", "wu", 'tcga_ovarian', 'tcga_breast',
					  'tcga_colo','bxd_protein','gygi1','gygi2','gygi3']
		log_list = ['no','yes','yes','no','no','no','yes','no','yes',
					'yes','yes','yes','yes','no','yes']
		for name,log in zip(nameList, log_list):
			d = DataFrameAnalyzer.getFile(folder + 'complex_filtered_' + name + '.tsv.gz')
			stoch = step2_stoichiometry_preparation(d, name, log2_method = log)
			stochvar = step2_variability_testing_subunits(stoch.stoch_data, name)

if __name__ == "__main__":
	## EXECUTE STEP2
	wp = file_Loader(folder = sys.argv[1])
	step2.execute(wp, folder = sys.argv[1])
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource is Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.
GNU LICENSE
Download script here
Computational Pipeline

All Code Material

STEP 2: Python Code