Impact on Proteotype: STEP10

STEP 10: Python Code

									
class file_Loader:

	@staticmethod
	def get_complex_dictionary(**kwargs):
		folder = kwargs.get('folder','PATH')

		complexDict = DataFrameAnalyzer.read_pickle(folder + 'complex_dictionary.pkl')
		return complexDict

class step10_preparation:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('prepare_data_male_female')
		data_dict = step10_preparation.prepare_data_male_female(folder = folder)

	@staticmethod
	def prepare_data_highfat_chow(**kwargs):
		folder = kwargs.get('folder','PATH')

		data = DataFrameAnalyzer.open_in_chunks(folder,'complex_filtered_gygi3.tsv.gz')
		quant_list = utilsFacade.filtering(list(data.columns),'quant_')
		male_list = utilsFacade.filtering(quant_list,'H')
		female_list = utilsFacade.filtering(quant_list,'S')
		relevant_complexes = list(set(data.complex_search))

		#demonstration in Figure 5
		relevant_complexes = ['HC663:COPII','HC2402:COPI',
							  'SMC_AM000047:Cohesin complex',
							  'tRNA splicing endonuclease:tRNA splicing endonuclease']

		data_dict = dict((e1,list()) for e1 in relevant_complexes)
		for complexID in relevant_complexes:
			sub = data[data.complex_search==complexID]
			male_sub = sub[male_list]
			female_sub = sub[female_list]
			if len(sub) >= 5:
				male_data_list = list(male_sub.median())
				female_data_list = list(female_sub.median())
				tt,tt_pval = scipy.stats.ttest_ind(male_data_list, female_data_list)
				data_dict[complexID] = (male_data_list, female_data_list,pval, tt_pval)
				print(pval,tt_pval)

		complex_list = list()
		pvalues = list()
		tt_pvalues = list()
		cohen_list = list()
		for complexId in data_dict.keys():
			complex_list.append(complexId)
			m, f, p, t = data_dict[complexId]
			cohen_list.append(statsFacade.cohen_d(m,f))
			pvalues.append(p)
			tt_pvalues.append(t)

		df = pd.DataFrame({'complex':complex_list, 'pval':pvalues,
						   'tt_pval':tt_pvalues, 'cohen': cohen_list})
		df['qval'] = pd.Series(statsFacade.correct_nan_pvalues(tt_pvalues), index = df.index)
		df[df.qval < 0.01]

		DataFrameAnalyzer.to_pickle(data_dict, folder + 'figure5a_diet_data_dictionary.pkl')
		return data_dict
	
	@staticmethod
	def prepare_data_male_female(**kwargs):
		folder = kwargs.get('folder','PATH')

		data = DataFrameAnalyzer.open_in_chunks(folder,'complex_filtered_gygi3.tsv.gz')
		quant_list = utilsFacade.filtering(list(data.columns),'quant_')
		male_list = utilsFacade.filtering(quant_list,'M')
		female_list = utilsFacade.filtering(quant_list,'F')

		relevant_complexes = ['HC663:COPII','HC2402:COPI','HC1479:retromer complex',
							  'SMC_AM000047:Cohesin complex',
							  'tRNA splicing endonuclease:tRNA splicing endonuclease']

		data_dict = dict((e1,list()) for e1 in relevant_complexes)
		for complexID in relevant_complexes:
			sub = data[data.complex_search==complexID]
			male_sub = sub[male_list]
			female_sub = sub[female_list]
			male_data_list = list(male_sub.median())
			female_data_list = list(female_sub.median())
			wc,pval = scipy.stats.ranksums(male_data_list, female_data_list)
			tt,tt_pval = scipy.stats.ttest_ind(male_data_list, female_data_list)
			data_dict[complexID] = (male_data_list, female_data_list,pval, tt_pval)
			print(pval,tt_pval)
		DataFrameAnalyzer.to_pickle(data_dict, folder + 'figure5a_data_dictionary.pkl')
		return data_dict

	@staticmethod
	def get_complex_abundance_data(cov_sig_dict, num_dict, **kwargs):
		folder = kwargs.get('folder','PATH')

		#########################################################
		#Notice that cov_sig_dict is prepared in wp_step11_code.py
		#########################################################

		complexDict = file_Loader.get_complex_dictionary(folder = folder)

		abu_data = DataFrameAnalyzer.open_in_chunks(folder,
				   'figure5a_gygi3_complex_diet_total_abundance_difference.tsv.gz')
		key_list = list(set(abu_data.index).intersection(set(cov_sig_dict.keys())))

		cohen_list = list()
		tt_pval_list  = list()
		complex_list = list()
		stable_fractions = list()
		variable_fractions = list()
		for key in key_list:
			num = num_dict[key]
			complexID = key.split(':')[0]
			gold = complexDict[complexID]['goldComplex'][0]
			if gold=='yes' and num >= 4:
				coverage = cov_sig_dict[key]
				cohen_list.append(abu_data.loc[key]['cohen_distance'])
				tt_pval_list.append(abu_data.loc[key]['ttest_pval'])
				complex_list.append(':'.join(key.split(':')[1:]))
				stable_fractions.append(100 - coverage)
				variable_fractions.append(coverage)
			if complexID.find('tRNA splicing')!=-1:
				coverage = cov_sig_dict[key]
				cohen_list.append(abu_data.loc[key]['cohen_distance'])
				tt_pval_list.append(abu_data.loc[key]['ttest_pval'])
				complex_list.append(':'.join(key.split(':')[1:]))
				stable_fractions.append(100 - coverage)
				variable_fractions.append(coverage)

		cohen_list,complex_list,tt_pval_list,stable_fractions, variable_fractions = utilsFacade.sort_multiple_lists([cohen_list, complex_list,tt_pval_list, stable_fractions, variable_fractions], reverse = True)
		tt_pvalCorrs = statsFacade.correct_pvalues(tt_pval_list)

		#gather numbers
		print(scipy.stats.spearmanr(cohen_list,variable_fractions))

		tt1 = 'bla'
		for idx,tt in enumerate(tt_pvalCorrs):
			if tt>=0.01 and tt1=='bla':
				tt1 = cohen_list[idx]
			if tt1!='bla' and tt<=0.01:
				tt2 = cohen_list[idx]
				break

		df = {'complex':complex_list,
			  'cohen':cohen_list,
			  'tt_pval':tt_pval_list,
			  'stable':stable_fractions,
			  'variable':variable_fractions,
			  'tt_pvalCorr':tt_pvalCorrs,
			  'tt1':[tt1]*len(complex_list),
			  'tt2':[tt2]*len(complex_list)}

		df = pd.DataFrame(df)
		df.to_csv(folder + 'figure5a_bigPicture_underlying_data.tsv.gz',
				  sep = '\t', compression = 'gzip')
		return df

class step10_figure:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('FIGURE5A: figure5a_complex_abundanceChange_demonstration')
		step10_figure.figure5a_complex_abundanceChange_demonstration(folder = folder, output_folder = output_folder)

	@staticmethod
	def get_data(**kwargs):
		folder = kwargs.get('folder','PATH')

		data_dict = DataFrameAnalyzer.read_pickle(folder + 'figure5a_data_dictionary.pkl')
		relevant_complexes = ['HC663:COPII','HC2402:COPI',
							  'HC1479:retromer complex',
					  		  'SMC_AM000047:Cohesin complex']
		return data_dict, relevant_complexes

	@staticmethod
	def figure5a_complex_abundanceChange_demonstration(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		data_dict, relevant_complexes = step10_figure.get_data(folder = folder)

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		positions = [1,1.5]
		plt.clf()
		fig = plt.figure(figsize = (10,5))
		ax = fig.add_subplot(141)
		dataList = [data_dict[relevant_complexes[0]][0],data_dict[relevant_complexes[0]][1]]
		bp = ax.boxplot(dataList,notch=0,sym="",vert=1,patch_artist=True,
						widths=[0.4]*len(dataList), positions=positions)
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black",linestyle="--",alpha=0.8)
		for i,patch in enumerate(bp['boxes']):
			if i%2==0:
				patch.set_facecolor("#7FBF7F")	
			else:
				patch.set_facecolor("orange")	
			patch.set_edgecolor("black")
			patch.set_alpha(0.9)
			sub_group = dataList[i]
			x = numpy.random.normal(positions[i], 0.04, size=len(dataList[i]))
			ax.scatter(x,sub_group,color='white', alpha=0.5,edgecolor="black",s=5)
		ax.set_ylim(7,14)
		ax = fig.add_subplot(142)
		dataList = [data_dict[relevant_complexes[1]][0],data_dict[relevant_complexes[1]][1]]
		bp = ax.boxplot(dataList,notch=0,sym="",vert=1,patch_artist=True,
						widths=[0.4]*len(dataList), positions=positions)
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black",linestyle="--",alpha=0.8)
		for i,patch in enumerate(bp['boxes']):
			if i%2==0:
				patch.set_facecolor("#7FBF7F")	
			else:
				patch.set_facecolor("orange")	
			patch.set_edgecolor("black")
			patch.set_alpha(0.9)
			sub_group = dataList[i]
			x = numpy.random.normal(positions[i], 0.04, size=len(dataList[i]))
			ax.scatter(x,sub_group,color='white', alpha=0.5,edgecolor="black",s=5)
		ax.set_ylim(7,14)
		ax = fig.add_subplot(143)
		dataList = [data_dict[relevant_complexes[2]][0],data_dict[relevant_complexes[2]][1]]
		bp = ax.boxplot(dataList,notch=0,sym="",vert=1,patch_artist=True,
						widths=[0.4]*len(dataList), positions=positions)
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black",linestyle="--",alpha=0.8)
		for i,patch in enumerate(bp['boxes']):
			if i%2==0:
				patch.set_facecolor("#7FBF7F")	
			else:
				patch.set_facecolor("orange")	
			patch.set_edgecolor("black")
			patch.set_alpha(0.9)
			sub_group = dataList[i]
			x = numpy.random.normal(positions[i], 0.04, size=len(dataList[i]))
			ax.scatter(x,sub_group,color='white', alpha=0.5,edgecolor="black",s=5)
		ax.set_ylim(7,14)
		ax = fig.add_subplot(144)
		dataList = [data_dict[relevant_complexes[3]][0],data_dict[relevant_complexes[3]][1]]
		bp = ax.boxplot(dataList,notch=0,sym="",vert=1,patch_artist=True, 
						widths=[0.4]*len(dataList), positions=positions)
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black",linestyle="--",alpha=0.8)
		for i,patch in enumerate(bp['boxes']):
			if i%2==0:
				patch.set_facecolor("#7FBF7F")	
			else:
				patch.set_facecolor("orange")	
			patch.set_edgecolor("black")
			patch.set_alpha(0.9)
			sub_group = dataList[i]
			x = numpy.random.normal(positions[i], 0.04, size=len(dataList[i]))
			ax.scatter(x,sub_group, color='white', alpha=0.5,edgecolor="black",s=5)
		ax.set_ylim(7,14)
		plt.savefig(output_folder + 'fig5a_complex_abundances.pdf', bbox_inches = 'tight',dpi = 400)

if __name__ == "__main__":
	## EXECUTE STEP10
	step10_preparation.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	step10_figure.execute(folder = sys.argv[1], output_folder = sys.argv[2])
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource is Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.
GNU LICENSE
Download script here
Computational Pipeline

All Code Material

STEP 10: Python Code