STEP 18: Python Code

									
class stringData_yeast:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder', 'PATH')

		prepare_string_dictionary = kwargs.get('prepare_string_dictionary', False)
		collect_corr_values = kwargs.get('collect_corr_values', False)
		visualize = kwargs.get('visualize', True)

		if prepare_string_dictionary == True:
			print('load_all_string_data_yeast')
			interactor_dict, interactor_dict700 = stringData_yeast.load_all_string_data_yeast(folder)

		if collect_corr_values == True:
			print('load_interactor_dictionaries')
			interactor_dict, interactor_dict700 = stringData_yeast.load_interactor_dictionaries(folder)
			print('collect_corr_yeast_datasets')
			stringData_yeast.collect_corr_yeast_datasets(folder, interactor_dict, interactor_dict700)

		if visualize == True:
			stringData_yeast.iterate_visualisation(folder)

		print('VISUALIZATION: FINISHED')

	@staticmethod
	def load_interactor_dictionaries(folder):
		"""Read the two pickled interactor dictionaries stored under `folder`.

		`folder` is used as a plain string prefix for both file names.
		Returns the pair (all-scores dictionary, 700-scores dictionary).
		"""
		all_scores_path = folder + '4932.interaction.dict.allScores.pkl'
		scores700_path = folder + '4932.interaction.dict.700Scores.pkl'
		all_scores = DataFrameAnalyzer.read_pickle(all_scores_path)
		scores700 = DataFrameAnalyzer.read_pickle(scores700_path)
		return all_scores, scores700

	@staticmethod
	def load_all_string_data_yeast(folder):
		fname = '4932.protein.links.detailed.v10.5.txt'
		data = pd.read_csv(folder + fname, sep = ' ')

		p1_list = list()
		p2_list = list()
		protein1_list = list(data.protein1)
		protein2_list = list(data.protein2)
		for protein1, protein2 in zip(protein1_list, protein2_list):
			p1_list.append(protein1.split('.')[1])
			p2_list.append(protein2.split('.')[1])

		data['pro1'] = pd.Series(p1_list, index = data.index)
		data['pro2'] = pd.Series(p2_list, index = data.index)

		proteins1 = list(set(data.pro1))
		interactor_dict = dict((e1,list()) for e1 in proteins1)
		for i,protein in enumerate(proteins1):
			print(i, protein)
			sub = data[data.pro1==protein]
			interactors = list(sub.pro2)
			interactor_dict[protein] = interactors
		DataFrameAnalyzer.to_pickle(interactor_dict, folder + '4932.interaction.dict.allScores.pkl')

		data = data[data.combined_score >= 700]
		proteins1 = list(set(data.pro1))
		interactor_dict700 = dict((e1,list()) for e1 in proteins1)
		for i,protein in enumerate(proteins1):
			print(i, protein)
			sub = data[data.pro1==protein]
			interactors = list(sub.pro2)
			interactor_dict700[protein] = interactors
		DataFrameAnalyzer.to_pickle(interactor_dict700, folder + '4932.interaction.dict.700Scores.pkl')
		return interactor_dict, interactor_dict700

	@staticmethod
	def _protein_corrs(transposed, columns, protein):
		"""Return `protein`'s correlations with `columns` as a list.

		`transposed` has one column per protein. Returns None when selecting
		`protein` from the correlation matrix does not give a single Series
		(e.g. the label occurs more than once), so callers can skip it.
		"""
		corr_column = transposed[columns].corr()[protein]
		if isinstance(corr_column, pd.Series):
			return list(corr_column)
		return None

	@staticmethod
	def _clean_corr_values(nested_values):
		"""Flatten per-protein correlation lists and filter them.

		Keeps values that are < 1, pass utilsFacade.finite (project helper;
		presumably drops non-finite entries) and are > -1. The number of
		remaining values is printed after each of the three steps.
		"""
		values = np.array(utilsFacade.flatten(nested_values))
		values = values[values < 1]
		print(len(values))
		values = np.array(utilsFacade.finite(list(values)))
		print(len(values))
		values = values[values > -1]
		print(len(values))
		return list(values)

	@staticmethod
	def get_string_and_other_correlations(folder, data, quant_cols, interactor_dict, interactor_dict700, name):
		"""Collect correlations between proteins and their interactors / non-interactors.

		Parameters:
			folder: string prefix for the output pickle.
			data: DataFrame whose index labels are matched against the keys of
				the interactor dictionaries.
			quant_cols: columns of `data` used for the correlations.
			interactor_dict: protein -> interaction partners (all scores).
			interactor_dict700: protein -> interaction partners (700 cutoff).
			name: dataset name used in the output file name.

		For every dictionary key present in the data index, the protein is
		correlated with its available interactors ('string' / 'string700') and,
		for interactor_dict only, with all remaining proteins ('other').

		Returns the dict {'string': [...], 'string700': [...], 'other': [...]},
		which is also pickled to folder + name + '.string.corrs.pkl'.
		"""
		quant_data = data[quant_cols]
		available_proteins = set(quant_data.index)
		# Transposed once: proteins become columns, which is what
		# DataFrame.corr() correlates.
		transposed = quant_data.T

		string_corr_values = list()
		other_corr_values = list()
		for p, protein in enumerate(interactor_dict):
			print(p)
			# Set membership; no per-iteration list copy.
			if protein not in available_proteins:
				continue
			interactors = set(interactor_dict[protein])
			interactors.add(protein)
			available_interactors = list(available_proteins.intersection(interactors))
			non_interactors = list(available_proteins.difference(interactors))
			string_corrs = stringData_yeast._protein_corrs(transposed, available_interactors, protein)
			if string_corrs is not None:
				string_corr_values.append(string_corrs)
			other_corrs = stringData_yeast._protein_corrs(transposed, [protein] + non_interactors, protein)
			if other_corrs is not None:
				other_corr_values.append(other_corrs)
		string_corr_values = stringData_yeast._clean_corr_values(string_corr_values)
		other_corr_values = stringData_yeast._clean_corr_values(other_corr_values)

		string700_corr_values = list()
		for p, protein in enumerate(interactor_dict700):
			print(p)
			if protein not in available_proteins:
				continue
			interactors = set(interactor_dict700[protein])
			interactors.add(protein)
			available_interactors = list(available_proteins.intersection(interactors))
			string_corrs = stringData_yeast._protein_corrs(transposed, available_interactors, protein)
			if string_corrs is not None:
				string700_corr_values.append(string_corrs)
		string700_corr_values = stringData_yeast._clean_corr_values(string700_corr_values)

		yeast_corrs = {'string': string_corr_values,
					   'string700': string700_corr_values,
					   'other': other_corr_values}
		DataFrameAnalyzer.to_pickle(yeast_corrs, folder + name + '.string.corrs.pkl')
		return yeast_corrs

	@staticmethod
	def collect_corr_yeast_datasets(folder, interactor_dict, interactor_dict700):
		"""Compute and pickle the STRING correlation values of every yeast dataset.

		Each dataset file is loaded from `folder` with DataFrameAnalyzer.getFile,
		its quantification columns are selected with utilsFacade.filtering, and
		get_string_and_other_correlations is run on it (which writes
		'<name>.string.corrs.pkl'). Datasets are processed in the listed order.
		Returns None.
		"""
		no_options = {}
		startswith = {'condition': 'startswith'}
		# (dataset name, label printed, file name, column pattern,
		#  extra keyword arguments for utilsFacade.filtering, drop last match)
		datasets = (
			('yeast3', 'YEAST3', 'yeast3_quant_Proteome_carbonSources_MAPPED_complexes_pathways_NORM2.tsv.gz',
				'rel. Intensity', no_options, False),
			('yeast4', 'yeast4', 'yeast4_quant_Proteome_naclStress_MAPPED_complexes_pathways_NORM2.tsv.gz',
				'rel.Intensity', no_options, False),
			# The last 'iBAQ' column is excluded for this dataset.
			('yeast5', 'yeast5', 'yeast5_quant_Proteome_MAPPED_complexes_pathways_NORM2_IBAQ.tsv.gz',
				'iBAQ', no_options, True),
			('yeast8', 'yeast8', 'yeast8_quant_GFP_Proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
				'quant', no_options, False),
			('yeast10', 'yeast10', 'yeast10_quant_RNA_MAPPED_complexes_pathways_NORM2.tsv.gz',
				'quant', no_options, False),
			('yeast11', 'yeast11', 'yeast11_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
				'quant', no_options, False),
			('yeast14', 'yeast14', 'yeast14_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
				'quant_', startswith, False),
			('yeast16', 'yeast16', 'yeast16_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
				'quant_', startswith, False),
			('yeast18', 'yeast18', 'yeast18_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
				'quant_', startswith, False),
			('yeast19', 'yeast19', 'yeast19_quant_transcriptome_MAPPED_complexes_pathways_NORM2.tsv.gz',
				'quant_', startswith, False),
			('yeast20', 'yeast20', 'yeast20_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
				'quant_', startswith, False),
			('yeast21', 'yeast21', 'yeast21_quant_proteome_MAPPED_complexes_pathways_NORM2_REP.tsv.gz',
				'quant_', startswith, False),
		)

		for name, label, fname, pattern, filter_options, drop_last in datasets:
			data = DataFrameAnalyzer.getFile(folder, fname)
			quant_cols = utilsFacade.filtering(list(data.columns), pattern, **filter_options)
			if drop_last:
				quant_cols = quant_cols[:-1]
			print('get_string_and_other_correlations: ' + label)
			stringData_yeast.get_string_and_other_correlations(folder, data,
															   quant_cols, interactor_dict,
															   interactor_dict700, name)

	@staticmethod
	def yeast_string_visualisation_signal(folder, name):
		"""Plot the correlation distributions of one dataset and save them as PDF.

		Reads folder + name + '.string.corrs.pkl' (keys 'string', 'string700',
		'other'), draws the three distributions with
		plottingFacade.func_plotDensities_border in the upper panel and as
		horizontal box plots in the lower panel, and writes
		folder + name + '_quant_proteome_STRINGcheckup.pdf'.

		The legend receives two p-values: the mean Mann-Whitney U p-value of
		'other' versus 'string' and of 'other' versus 'string700', each
		averaged over 999 random subsamples of up to 1000 values per group.
		"""
		yeast_dict = DataFrameAnalyzer.read_pickle(folder + name + '.string.corrs.pkl')
		string_corrs = yeast_dict['string']
		string700_corrs = yeast_dict['string700']
		other_corrs = yeast_dict['other']

		def mean_subsampled_pvalue(candidate_corrs):
			# Sample sizes are capped at the population size because
			# random.sample raises ValueError when asked for more items
			# than are available.
			other_size = min(1000, len(other_corrs))
			candidate_size = min(1000, len(candidate_corrs))
			pvals = list()
			for _ in range(1, 1000):
				pvals.append(scipy.stats.mannwhitneyu(random.sample(other_corrs, other_size),
													  random.sample(candidate_corrs, candidate_size))[1])
			return np.mean(pvals)

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		fig = plt.figure(figsize = (5,5))
		try:
			gs = gridspec.GridSpec(10,10)
			ax = plt.subplot(gs[0:7,0:])
			plottingFacade.func_plotDensities_border(ax, other_corrs, facecolor = 'grey')
			plottingFacade.func_plotDensities_border(ax, string_corrs, facecolor = '#F9A22C')
			plottingFacade.func_plotDensities_border(ax, string700_corrs, facecolor = '#EC7723')
			ax.set_xlim(-1,1)
			ax.set_xticklabels([])

			pval1 = mean_subsampled_pvalue(string_corrs)
			# The second legend entry uses the string700 colour, so it must be
			# computed from the string700 values rather than the 'string'
			# values a second time.
			pval2 = mean_subsampled_pvalue(string700_corrs)
			plottingFacade.make_full_legend(ax, [pval1, pval2], ['#F9A22C','#EC7723'])

			ax = plt.subplot(gs[7:,0:])
			dataList = [string700_corrs, string_corrs, other_corrs]
			box_colors = ["#EC7723", "#F9A22C", "grey"]
			bp = ax.boxplot(dataList, notch=0, sym="", vert=0, patch_artist=True, widths=(0.5,0.5,0.5))
			plt.setp(bp['medians'], color="black")
			plt.setp(bp['whiskers'], color="black",linestyle="-")
			for patch, box_color in zip(bp['boxes'], box_colors):
				patch.set_facecolor(box_color)
				patch.set_edgecolor("black")
				patch.set_alpha(1)
			ax.set_xlim(-1,1)
			ax.set_yticklabels([])
			ax.set_xticklabels([])
			plt.savefig(folder + name + '_quant_proteome_STRINGcheckup.pdf',
						bbox_inches = 'tight', dpi = 300)
		finally:
			# Release the figure so repeated calls do not accumulate open figures.
			plt.close(fig)

	@staticmethod
	def iterate_visualisation(folder):
		name_list = [3, 4, 5, 10, 11, 14, 16, 18, 19, 20, 21]
		for name in name_list:
			print('yeast_string_visualisation_signal: yeast'+ str(name))
			stringData_yeast.yeast_string_visualisation_signal(folder, 'yeast' + str(name))

if __name__ == "__main__":
	# Command line:
	#   <folder> <output_folder> <visualize> <collect_corr_values> <prepare_string_dictionary>
	# The three flags are given as text (e.g. True / False).
	import sys

	def _parse_flag(text):
		# Command-line arguments are strings; comparing them with the boolean
		# True is always False, so they are converted explicitly here.
		return text.strip().lower() in ('true', '1', 'yes', 'y', 't', 'on')

	if len(sys.argv) < 6:
		sys.exit('usage: ' + sys.argv[0] + ' <folder> <output_folder> <visualize> '
				 '<collect_corr_values> <prepare_string_dictionary>')

	stringData_yeast.execute(folder = sys.argv[1], output_folder = sys.argv[2],
		visualize = _parse_flag(sys.argv[3]),
		collect_corr_values = _parse_flag(sys.argv[4]),
		prepare_string_dictionary = _parse_flag(sys.argv[5]))
									
								

All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource are Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.

GNU LICENSE

Download script here