Impact on Proteotype: STEP11

STEP 11: Python Code

									
class step11_preparation:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		step11_preparation.prepare_data_for_second_graph(folder = folder)

	@staticmethod
	def get_limma_data(**kwargs):
		folder = kwargs.get('folder','PATH')
		
		data = DataFrameAnalyzer.open_in_chunks(folder, 'gygi3_complex_mf_merged.tsv.gz',
				    							sep = '\t', compression = 'gzip')
		data = data.drop_duplicates()
		quant_data = data[data.analysis_type=='quant_male-female']		
		stoch_data = data[data.analysis_type=='stoch_male-female']

		'''
		#same for diet for Supplementary information
		data = DataFrameAnalyzer.open_in_chunks(folder, 'gygi3_complex_hs_merged.tsv.gz',
				    							sep = '\t', compression = 'gzip')
		data = data.drop_duplicates()
		quant_data = data[data.analysis_type=='quant_highfat-chow']		
		stoch_data = data[data.analysis_type=='stoch_highfat-chow']
		'''
		return data, quant_data, stoch_data

	@staticmethod
	def prepare_stochiometry_hit_dictionary(**kwargs):
		folder = kwargs.get('folder','PATH')

		data = step11_preparation.get_limma_data(folder = folder)

		complex_set = list(set(data.complex_search))
		cov_sig_dict = dict()
		num_dict = dict()
		for complexID in complex_set:
			quant_sub = quant_data[quant_data.complex_search==complexID]
			stoch_sub = stoch_data[stoch_data.complex_search==complexID]
			if len(stoch_sub) >= 4:
				num_dict[complexID] = len(stoch_sub)
				stoch_sig_sub = stoch_sub[stoch_sub['pval.adj'] <= 0.01]
				stoch_non_sub = stoch_sub[stoch_sub['pval.adj'] > 0.01]
				coverage = float(len(stoch_sig_sub))/float(len(stoch_sub))*100
				cov_sig_dict[complexID] = coverage

		DataFrameAnalyzer.to_pickle(cov_sig_dict, folder + 'figure5a_stoichiometric_hits_per_complex_dict.pkl')
		DataFrameAnalyzer.to_pickle(num_dict, folder + 'figure5a_stoichiometric_hitsNUM_per_complex_dict.pkl')

		#run similarly for diet differences as well
		#DataFrameAnalyzer.to_pickle(cov_sig_dict, folder + 'figure5a_stoichiometric_diet_hits_per_complex_dict.pkl')
		#DataFrameAnalyzer.to_pickle(num_dict, folder + 'figure5a_stoichiometric_diet_hitsNUM_per_complex_dict.pkl')
		return cov_sig_dict, num_dict

	@staticmethod
	def get_stoichiometric_hits_per_complex_dict(**kwargs):
		folder = kwargs.get('folder','PATH')

		cov_sig_dict = DataFrameAnalyzer.read_pickle(folder + 'figure5a_stoichiometric_hits_per_complex_dict.pkl')
		return cov_sig_dict

	@staticmethod
	def get_complex_dictionary(**kwargs):
		folder = kwargs.get('folder','PATH')

		complexDict = DataFrameAnalyzer.read_pickle(folder + 'complex_dictionary.pkl')
		return complexDict

	@staticmethod
	def get_complex_abundance_data(cov_sig_dict, num_dict, **kwargs):
		folder = kwargs.get('folder','PATH')

		complexDict = step11_preparation.get_complex_dictionary(folder = folder)

		#data retrieved from previous Step 10
		abu_data = DataFrameAnalyzer.open_in_chunks(folder,
				   'figure5a_gygi3_complex_diet_total_abundance_difference.tsv.gz')
		key_list = list(set(abu_data.index).intersection(set(cov_sig_dict.keys())))

		cohen_list = list()
		tt_pval_list  = list()
		complex_list = list()
		stable_fractions = list()
		variable_fractions = list()
		for key in key_list:
			num = num_dict[key]
			complexID = key.split(':')[0]
			gold = complexDict[complexID]['goldComplex'][0]
			if gold=='yes' and num >= 4:
				coverage = cov_sig_dict[key]
				cohen_list.append(abu_data.loc[key]['cohen_distance'])
				tt_pval_list.append(abu_data.loc[key]['ttest_pval'])
				complex_list.append(':'.join(key.split(':')[1:]))
				stable_fractions.append(100 - coverage)
				variable_fractions.append(coverage)
			if complexID.find('tRNA splicing')!=-1:
				coverage = cov_sig_dict[key]
				cohen_list.append(abu_data.loc[key]['cohen_distance'])
				tt_pval_list.append(abu_data.loc[key]['ttest_pval'])
				complex_list.append(':'.join(key.split(':')[1:]))
				stable_fractions.append(100 - coverage)
				variable_fractions.append(coverage)

		cohen_list,complex_list,tt_pval_list,stable_fractions, variable_fractions = utilsFacade.sort_multiple_lists([cohen_list, complex_list,tt_pval_list, stable_fractions, variable_fractions], reverse = True)
		tt_pvalCorrs = statsFacade.correct_pvalues(tt_pval_list)

		#gather numbers
		print(scipy.stats.spearmanr(cohen_list,variable_fractions))

		tt1 = 'bla'
		for idx,tt in enumerate(tt_pvalCorrs):
			if tt>=0.01 and tt1=='bla':
				tt1 = cohen_list[idx]
			if tt1!='bla' and tt<=0.01:
				tt2 = cohen_list[idx]
				break

		df = {'complex':complex_list, 'cohen':cohen_list, 'tt_pval':tt_pval_list,
			  'stable':stable_fractions, 'variable':variable_fractions,
			  'tt_pvalCorr':tt_pvalCorrs, 'tt1':[tt1]*len(complex_list),
			  'tt2':[tt2]*len(complex_list)}
		df = pd.DataFrame(df)

		'''
		similar workflow for diet differences as well:
		"figure5a_gygi3_complex_diet_total_abundance_difference.tsv.gz"
		'''
		df.to_csv(folder + 'figure5a_bigPicture_underlying_data.tsv.gz',
				  sep = '\t', compression = 'gzip')
		return df

	@staticmethod
	def prepare_data_for_second_graph(**kwargs):
		folder = kwargs.get('folder','PATH')

		print('get_limma_data')
		data, quant_data, stoch_data = step11_preparation.get_limma_data(folder = folder)

		print('prepare_stochiometry_hit_dictionary')
		cov_sig_dict, num_dict = step11_preparation.prepare_stochiometry_hit_dictionary(folder = folder)

		print('get_complex_abundance_data')
		df = step11_preparation.get_complex_abundance_data(folder = folder)

class step11_limma_stoichiometry:
	@staticmethod
	def complex_limma_stoichiometry(dat, name, dat_type):
		stoch_data = dat.copy()
		complexSet = list(set(stoch_data.complex_search))
		quant_list = utilsFacade.filtering(list(stoch_data.columns),'quant_')
		quant_data = stoch_data[quant_list]

		sex_list = [item.split('.')[0].split('_')[-1][-2] for item in list(quant_data.columns)]
		male_list = [1 if item=='M' else 0 for item in sex_list]
		female_list = [1 if item=='F' else 0 for item in sex_list]
		design_df = pd.DataFrame({'male':male_list, 'female':female_list})
		design_df.index = list(quant_data.columns)

		concat_list = list()
		for complexID in complexSet:
			print(complexID)
			complex_quant_data = stoch_data[stoch_data.complex_search==complexID][quant_list]
			try:
				concat_list_all = list()
				limma_res = RFacade.perform_limma_analysis(complex_quant_data, design_df, 'male-female')
				concat_list_all.append(limma_res)
				limma_res_concat = pd.concat(concat_list_all)
				limma_res_concat['complex_search'] = pd.Series([complexID]*len(limma_res_concat), index = limma_res_concat.index)	
				limma_res_concat['analysis_type'] = pd.Series(['all']*len(limma_res_concat), index = limma_res_concat.index)	
			except:
				print('problem with ',complexID, len(complex_quant_data))
			concat_list.append(limma_res)
		complex_stoch_data = pd.concat(concat_list)
		complex_stoch_data.to_csv(folder + 'gygi3_stoichiometry_limma_results.tsv.gz', sep = '\t',compression = 'gzip')
		return complex_stoch_data

	@staticmethod
	def complex_limma_interpretation(folder):
		filename = 'gygi3_stoichiometry_limma_results.tsv.gz'
		complex_stoch_data = DataFrameAnalyzer.open_in_chunks(folder, filename)
		complex_stoch_data = complex_stoch_data.drop_duplicates()
		pvals = utilsFacade.finite(list(complex_stoch_data['adj.P.Val']))
		pvalCorrs = statsFacade.correct_pvalues(pvals)
		pval_df = pd.DataFrame({'pvalCorr':pvalCorrs})
		pval_df.index = pd.Series(pvals, index = pval_df.index)

		pvals = list(complex_stoch_data['adj.P.Val'])
		pvalCorrs = list()
		for p,pval in enumerate(pvals):
			try:
				if type(pval_df.loc[pval]) == pd.DataFrame:
					pvalCorrs.append(list(pval_df.loc[pval].iloc[0])[0])
				else:
					pvalCorrs.append(list(pval_df.loc[pval])[0])
			except:
				pvalCorrs.append(np.nan)
		complex_stoch_data['pval.adj'] = pd.Series(pvalCorrs, index = complex_stoch_data.index)
		complex_stoch_data = complex_stoch_data.sort_values('pval.adj')
		complex_stoch_data.to_csv(folder + filename, sep = '\t', compression = 'gzip')

class step11_check_stats:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('sec24_paralogs_egfr_correlation')
		pvalues = step11_check_stats.sec24_paralogs_egfr_correlation(folder = folder)

		print('calculate_combined_pvalues_complex_effect')
		d = step11_check_stats.calculate_combined_pvalues_complex_effect(folder = folder)

	@staticmethod
	def sec24_paralogs_egfr_correlation(**kwargs):
		folder = kwargs.get('folder','PATH')

		dat_gygi3 = DataFrameAnalyzer.getFile(folder, 'gygi3_entire_dataset.tsv.gz')
		quant_cols = utilsFacade.filtering(dat_gygi3.columns, 'quant_')
		dat_gygi3 = dat_gygi3[quant_cols]

		table_list = list()
		for i,row in dat_gygi3.iterrows():
			temp = list(row)
			np.random.shuffle(temp)
			table_list.append(temp)
		rand_data = pd.DataFrame(table_list)
		rand_data = rand_data.T
		table_list = list()
		for i,row in rand_data.iterrows():
			temp = list(row)
			np.random.shuffle(temp)
			table_list.append(temp)
		rand_data = pd.DataFrame(table_list)
		rand_data = rand_data.T

		rand_corr = rand_data.T.corr()
		rand_corr_values = utilsFacade.get_correlation_values(rand_corr)
		rand_corr_values = np.array(rand_corr_values)

		sec24b = list(dat_gygi3.loc['Sec24b'])
		sec24d = list(dat_gygi3.loc['Sec24d'])
		egfr = list(dat_gygi3.loc['Egfr'])

		data_dict = {'Sec24b':sec24b, 'Sec24d':sec24d, 'Egfr':egfr}
		data_df = pd.DataFrame(data_dict)
		data_corr = data_df.corr()

		data_corr = data_corr[['Egfr']]

		zscores = list()
		pvalues = list()
		for protein in list(data_corr.index):
			zscore = statsFacade.getSampleZScore(rand_corr_values, list(data_corr.loc[protein])[0])
			p1,p2 = statsFacade.zscore_to_pvalue(zscore)
			zscores.append(zscore)
			pvalues.append(p1)
		#SEC24D: 3.443234354026811e-15
		return pvalues

	@staticmethod
	def calculate_combined_pvalues_complex_effect(**kwargs):
		folder = kwargs.get('folder','PATH')

		filename = 'gygi3_complex_mf_merged.tsv.gz'
		data = DataFrameAnalyzer.open_in_chunks(folder, filename)
		data = data.drop_duplicates()
		data = data[data.analysis_type=='stoch_male-female']
		data = data.drop(['C9','C2','Psd3'], axis = 0)

		complex_list = list(set(data.complex_search))
		combined_dict = dict()
		combined_list = list()
		for complex_id in complex_list:
			complex_sub = data[data.complex_search==complex_id]
			combined_list.append(scipy.stats.combine_pvalues(list(complex_sub['pval.adj']))[1])
			combined_dict[complex_id] = scipy.stats.combine_pvalues(list(complex_sub['pval.adj']))[1]

		df = pd.DataFrame({'complex':complex_list,
						   'combined':combined_list,
						   'adj':statsFacade.correct_nan_pvalues(combined_list)})
		return df[df.complex=='HC2402:COPI']

class step11_figures:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('FIGURE5A: overview_stoichiometry')
		step11_figures.figure5a_overview_stoichiometry(folder = folder, output_folder = output_folder)
	
		print('FIGURE5B: main_figure5b_volcanoPlots')
		step11_figures.main_figure5b_volcanoPlots()

		print('FIGURE5B: main_figure5b_boxPlots')
		step11_figures.main_figure5b_boxPlots()

	@staticmethod
	def figure5a_overview_stoichiometry(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		df = DataFrameAnalyzer.open_in_chunks(folder,'figure5a_bigPicture_underlying_data.tsv.gz')

		complex_list = list(df['complex'])
		cohen_list = list(df['cohen'])
		tt_pval_list = list(df['tt_pval'])
		stable_fractions = list(df['stable'])
		variable_fractions = list(df['variable'])
		tt_pvalCorrs = list(df['tt_pvalCorr'])
		tt1 = list(df.tt1)[0]
		tt2 = list(df.tt2)[0]

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = False

		plt.clf()
		fig = plt.figure(figsize = (15,5))
		gs = gridspec.GridSpec(10,32)
		ax = plt.subplot(gs[0:5,0:])
		ind = np.arange(len(complex_list))
		width = 0.85
		ax.axhline(0, color = 'k')
		ax.axhline(0.5,linestyle = '--', color = 'k')
		ax.axhline(-0.5,linestyle = '--', color = 'k')
		ax.axhline(1.0,linestyle = '--', color = 'k')
		ax.axhline(-1.0,linestyle = '--', color = 'k')
		ax.axhline(1.5,linestyle = '--', color = 'k')
		ax.axhline(-1.5,linestyle = '--', color = 'k')
		ax.axhline(tt1, color = 'red')
		ax.axhline(tt2, color = 'red')
		colors = list()
		for p,pval in enumerate(tt_pval_list):
			if str(pval)=='nan':
				colors.append('grey')
			else:
				if cohen_list[p] >= tt1:
					colors.append('purple')
				elif cohen_list[p] <= tt2:
					colors.append('lightgreen')
				else:
					colors.append('grey')
		rects = ax.bar(ind, cohen_list, width, color=colors, edgecolor = 'white')
		ax.set_ylim(-2,2)
		ax.set_xlim(-0.1, len(complex_list)+0.1)
		ax.set_xticklabels([])

		ax = plt.subplot(gs[5:7,0:])
		rects1 = ax.bar(ind, stable_fractions, width, color='darkblue',
						edgecolor = 'white')
		rects2 = ax.bar(ind, variable_fractions, width, color='red',
						bottom =np.array(stable_fractions),
						edgecolor = 'white')
		ax.set_ylim(0,100)
		ax.set_xlim(-0.1, len(complex_list) + 0.1)
		plt.xticks(list(utilsFacade.frange(0.5,len(complex_list) + 0.5,1)))
		ax.set_xticklabels(complex_list,rotation = 90, fontsize = 7)
		plt.savefig(output_folder + 'fig5a_abundance_stoichiometry_comparison.pdf',
					bbox_inches = 'tight', dpi = 400)

	@staticmethod
	def figure5a_overview_stoichiometry_diet(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		df = DataFrameAnalyzer.open_in_chunks(folder,'figure5a_bigPicture_diet_underlying_data.tsv.gz')
		complex_list = list(df['complex'])
		cohen_list = list(df['cohen'])
		tt_pval_list = list(df['tt_pval'])
		stable_fractions = list(df['stable'])
		variable_fractions = list(df['variable'])
		tt_pvalCorrs = list(df['tt_pvalCorr'])
		tt1 = list(df.tt1)[0]
		tt2 = list(df.tt2)[0]

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = False

		plt.clf()
		fig = plt.figure(figsize = (15,5))
		gs = gridspec.GridSpec(10,32)
		ax = plt.subplot(gs[0:5,0:])
		ind = np.arange(len(complex_list))
		width = 0.85
		ax.axhline(0, color = 'k')
		ax.axhline(0.5,linestyle = '--', color = 'k')
		ax.axhline(-0.5,linestyle = '--', color = 'k')
		ax.axhline(1.0,linestyle = '--', color = 'k')
		ax.axhline(-1.0,linestyle = '--', color = 'k')
		ax.axhline(1.5,linestyle = '--', color = 'k')
		ax.axhline(-1.5,linestyle = '--', color = 'k')
		ax.axhline(tt1, color = 'red')
		ax.axhline(tt2, color = 'red')
		colors = list()
		for p,pval in enumerate(tt_pval_list):
			if str(pval)=='nan':
				colors.append('grey')
			else:
				if cohen_list[p] >= tt1:
					colors.append('purple')
				elif cohen_list[p] <= tt2:
					colors.append('lightgreen')
				else:
					colors.append('grey')
		rects = ax.bar(ind, cohen_list, width, color=colors,
			edgecolor = 'white')
		ax.set_ylim(-2,2)
		ax.set_xlim(-0.1, len(complex_list) + 0.1)
		ax.set_xticklabels([])

		ax = plt.subplot(gs[5:7,0:])
		rects1 = ax.bar(ind, stable_fractions, width, color='darkblue', edgecolor = 'white')
		rects2 = ax.bar(ind, variable_fractions, width, color='red',bottom = np.array(stable_fractions),
						edgecolor = 'white')
		ax.set_ylim(0,100)
		ax.set_xlim(-0.1, len(complex_list) + 0.1)
		plt.xticks(list(utilsFacade.frange(0.5,len(complex_list) + 0.5,1)))
		ax.set_xticklabels(complex_list,rotation = 90, fontsize = 7)
		plt.savefig(folder + 'fig5a_abundance_diet_stoichiometry_comparison.pdf',
					bbox_inches = 'tight', dpi = 400)

	@staticmethod
	def get_data(name,**kwargs):
		folder = kwargs.get('folder','PATH')

		filename = 'gygi3_complex_mf_merged.tsv.gz'
		data = DataFrameAnalyzer.open_in_chunks(folder, filename)
		data = data.drop_duplicates()
		data = data[data.analysis_type=='stoch_male-female']
		data = data.drop(['C9','C2','Psd3'], axis = 0)
		complex_data = data[data.complex_search.str.contains(name)]

		pval_list = -np.log10(np.array(data['pval.adj']))
		fc_list = np.array(data['logFC'])
		complex_pval_list = -np.log10(np.array(complex_data['pval.adj']))
		complex_fc_list = np.array(complex_data['logFC'])
		complex_label_list = list(complex_data.index)
		complex_colors = ['red' if item<=0.01 else 'darkblue' for item in list(complex_data['pval.adj'])]
		return {'pval':pval_list,
				'fc': fc_list,
				'complex_pval': complex_pval_list,
				'complex_fc':complex_fc_list,
				'complex_label': complex_label_list,
				'complex_color':complex_colors}

	@staticmethod
	def main_figure5b_volcanoPlots(**kwargs):
		output_folder = kwargs.get('output_folder','PATH')

		print('volcano_plot_stoichiometry:COPI')
		step11_figures.figure5b_volcanoPlot_cop1_mfComparison(folder = output_folder)
		print('volcano_plot_stoichiometry:COPII')
		step11_figures.figure5b_volcanoPlot_cop2_mfComparison(folder = output_folder)
		print('volcano_plot_stoichiometry:Cohesin')
		step11_figures.figure5b_volcanoPlot_cohesin_mfComparison(folder = output_folder)
		print('volcano_plot_stoichiometry:retromer complex')
		step11_figures.figure5b_volcanoPlot_retromer_mfComparison(folder = output_folder)

	@staticmethod
	def main_figure5b_boxPlots(**kwargs):
		output_folder = kwargs.get('output_folder','PATH/')

		print('boxplot:COPI')
		step11_figures.figure5b_cop1_boxplot_logFC_male_female(folder = output_folder)
		print('boxplot:COPII')
		step11_figures.figure5b_cop2_boxplot_logFC_male_female(folder = output_folder)
		print('boxplot:retromer complex')
		step11_figures.figure5b_retromer_boxplot_logFC_male_female(folder = output_folder)
		print('boxplot:Cohesin')
		step11_figures.figure5b_cohesin_boxplot_logFC_male_female(folder = output_folder)

	@staticmethod
	def figure5b_volcanoPlot_retromer_mfComparison(**kwargs):
		folder = kwargs.get('folder','PATH')

		data_dict = step11_figures.get_data('retromer')
		pval_list = data_dict['pval']
		fc_list = data_dict['fc']
		complex_pval_list = data_dict['complex_pval']
		complex_fc_list = data_dict['complex_fc']
		complex_label_list = data_dict['complex_label']
		complex_colors = data_dict['complex_color']

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize = (5,5))
		ax = fig.add_subplot(111)
		ax.scatter(fc_list, pval_list, edgecolor = 'black', s = 30, color = 'white', alpha = 0.2)
		ax.scatter(complex_fc_list, complex_pval_list, edgecolor = 'black', s = 50, color = complex_colors)
		for count, label in enumerate(complex_label_list):
			x,y = complex_fc_list[count], complex_pval_list[count]
			ax.annotate(label, xy = (x,y), color = complex_colors[count])
		ax.set_xlabel('logFC (male/female)')
		ax.set_ylabel('p.value[-log10]')
		ax.set_xlim(-1.25,1.25)
		ax.set_ylim(-0.01, 35)
		plt.savefig(folder + 'fig5b_retromer_volcano_mf_comparison.pdf',
					bbox_inches = 'tight', dpi = 400)

	@staticmethod
	def figure5b_volcanoPlot_cop1_mfComparison(**kwargs):
		folder = kwargs.get('folder','PATH')

		data_dict = step11_figures.get_data('HC2402:COPI')
		pval_list = data_dict['pval']
		fc_list = data_dict['fc']
		complex_pval_list = data_dict['complex_pval']
		complex_fc_list = data_dict['complex_fc']
		complex_label_list = data_dict['complex_label']
		complex_colors = data_dict['complex_color']

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize = (5,5))
		ax = fig.add_subplot(111)
		ax.scatter(fc_list, pval_list, edgecolor = 'black', s = 30, color = 'white', alpha = 0.2)
		ax.scatter(complex_fc_list, complex_pval_list, edgecolor = 'black', s = 50, color = complex_colors)
		for count, label in enumerate(complex_label_list):
			x,y = complex_fc_list[count], complex_pval_list[count]
			ax.annotate(label, xy = (x,y), color = complex_colors[count])
		ax.set_xlabel('logFC (male/female)')
		ax.set_ylabel('p.value[-log10]')
		ax.set_xlim(-1.25,1.25)
		ax.set_ylim(-0.01, 35)
		plt.savefig(folder + 'fig5b_cop1_volcano_mf_comparison.pdf',
					bbox_inches = 'tight', dpi = 400)

	@staticmethod
	def figure5b_volcanoPlot_cohesin_mfComparison(**kwargs):
		folder = kwargs.get('folder','PATH')

		data_dict = step11_figures.get_data('Cohesin')
		pval_list = data_dict['pval']
		fc_list = data_dict['fc']
		complex_pval_list = data_dict['complex_pval']
		complex_fc_list = data_dict['complex_fc']
		complex_label_list = data_dict['complex_label']
		complex_colors = data_dict['complex_color']

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize = (5,5))
		ax = fig.add_subplot(111)
		ax.scatter(fc_list, pval_list, edgecolor = 'black', s = 30, color = 'white', alpha = 0.2)
		ax.scatter(complex_fc_list, complex_pval_list, edgecolor = 'black', s = 50, color = complex_colors)
		for count, label in enumerate(complex_label_list):
			x,y = complex_fc_list[count], complex_pval_list[count]
			ax.annotate(label, xy = (x,y), color = complex_colors[count])
		ax.set_xlabel('logFC (male/female)')
		ax.set_ylabel('p.value[-log10]')
		ax.set_xlim(-1.25,1.25)
		ax.set_ylim(-0.01, 35)
		plt.savefig(folder + 'fig5b_cohesin_volcano_mf_comparison.pdf',
					bbox_inches = 'tight', dpi = 400)

	@staticmethod
	def figure5b_volcanoPlot_cop2_mfComparison(**kwargs):
		folder = kwargs.get('folder','PATH')

		data_dict = step11_figures.get_data('HC663:COPII')
		pval_list = data_dict['pval']
		fc_list = data_dict['fc']
		complex_pval_list = data_dict['complex_pval']
		complex_fc_list = data_dict['complex_fc']
		complex_label_list = data_dict['complex_label']
		complex_colors = data_dict['complex_color']

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize = (5,5))
		ax = fig.add_subplot(111)
		ax.scatter(fc_list, pval_list, edgecolor = 'black', s = 30, color = 'white', alpha = 0.2)
		ax.scatter(complex_fc_list, complex_pval_list, edgecolor = 'black', s = 50, color = complex_colors)
		for count, label in enumerate(complex_label_list):
			x,y = complex_fc_list[count], complex_pval_list[count]
			ax.annotate(label, xy = (x,y), color = complex_colors[count])
		ax.set_xlabel('logFC (male/female)')
		ax.set_ylabel('p.value[-log10]')
		ax.set_xlim(-1.25,1.25)
		ax.set_ylim(-0.01, 35)
		plt.savefig(folder + 'fig5b_cop2_volcano_mf_comparison.pdf',
					bbox_inches = 'tight', dpi = 400)

	@staticmethod
	def get_data_boxplot(name,**kwargs):
		folder = kwargs.get('folder','PATH')

		filename = 'gygi3_complex_mf_merged.tsv.gz'
		data = DataFrameAnalyzer.open_in_chunks(folder, filename)
		data = data.drop_duplicates()
		data = data.drop(['C9','C2','Psd3'], axis = 0)
		complex_data = data[data.complex_search.str.contains(name)]
		male_data = complex_data[complex_data.analysis_type=='male']
		female_data = complex_data[complex_data.analysis_type=='female']
		stoch_maleFemale_data = complex_data[complex_data.analysis_type=='stoch_male-female']
		pval_dict = stoch_maleFemale_data['pval.adj'].to_dict()

		wpc_data = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
		wpc_data = wpc_data[wpc_data.complex_search.str.contains(name)]
		quant_list = utilsFacade.filtering(list(wpc_data.columns),'quant_')
		male_list = utilsFacade.filtering(quant_list, 'M')
		female_list = utilsFacade.filtering(quant_list, 'F')
		male_data = wpc_data[male_list].T
		female_data = wpc_data[female_list].T

		protein_list = list(set(male_data.columns).intersection(set(female_data.columns)))
		pval_list = [pval_dict[protein] for protein in protein_list]
		pval_list, protein_list = zip(*sorted(zip(pval_list, protein_list)))
		data_list = list()
		color_list = list()
		sig_protein_list = list()
		positions = [0.5]
		for protein in protein_list:
			if pval_dict[protein]<=0.01:
				try:
					data_list.append(utilsFacade.finite(list(male_data[protein])))
					data_list.append(utilsFacade.finite(list(female_data[protein])))
					color_list.append('purple')
					color_list.append('green')
					sig_protein_list.append(protein)
					sig_protein_list.append(protein)
					positions.append(positions[-1] + 0.4)
					positions.append(positions[-1] + 0.6)
				except:
					data_list.append(utilsFacade.finite(list(male_data[protein].T.iloc[0].T)))
					data_list.append(utilsFacade.finite(list(female_data[protein].T.iloc[0].T)))
					color_list.append('purple')
					color_list.append('green')
					sig_protein_list.append(protein)
					sig_protein_list.append(protein)
					positions.append(positions[-1] + 0.4)
					positions.append(positions[-1] + 0.6)					

		male_median_data = male_data.T.median().T
		female_median_data = female_data.T.median().T
		data_list.append(list(male_median_data))
		data_list.append(list(female_median_data))
		color_list.append('grey')
		color_list.append('grey')
		if name=='HC2402:COPI':
			sig_protein_list.append('COPI-male')
			sig_protein_list.append('COPI-female')
		elif name == 'HC663:COPII':
			sig_protein_list.append('COPII-male')
			sig_protein_list.append('COPII-female')
		elif name=='MAPK':
			sig_protein_list.append('MAPK-male')
			sig_protein_list.append('MAPK-female')
		elif name == 'Cohesin':
			sig_protein_list.append('Cohesin-male')
			sig_protein_list.append('Cohesin-female')			
		positions.append(positions[-1] + 0.5)
		return {'data':data_list,
				'color': color_list,
				'sig_protein':sig_protein_list,
				'positions':positions}

	@staticmethod
	def figure5b_retromer_boxplot_logFC_male_female(**kwargs):
		folder = kwargs.get('folder','PATH')

		data_dict = step11_figures.get_data_boxplot('retromer')
		data_list = data_dict['data']
		color_list = data_dict['color']
		sig_protein_list = data_dict['sig_protein']
		positions = data_dict['positions']

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize = (5,5))
		ax = fig.add_subplot(111)
		bp = ax.boxplot(data_list,notch=0,sym="",vert=1,patch_artist=True,
			 			widths=[0.4]*len(data_list), positions = positions)
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black",linestyle="--",alpha=0.8)
		for i,patch in enumerate(bp['boxes']):
			patch.set_edgecolor("black")
			patch.set_alpha(0.6)
			patch.set_color(color_list[i])
			sub_group = data_list[i]
			x = numpy.random.normal(positions[i], 0.04, size=len(data_list[i]))
			ax.scatter(x,sub_group,color='white', alpha=0.5,edgecolor="black",s=5)
		ax.set_title('COPI complex')
		ax.set_xticklabels(sig_protein_list, rotation = 90)
		ax.set_ylim(-1,1)
		ax.set_ylabel('normalized abundances')
		plt.savefig(folder + 'fig5b_retromer_boxplot_males_females.pdf',
					bbox_inches = 'tight', dpi = 400)
	
	@staticmethod
	def figure5b_cop1_boxplot_logFC_male_female(**kwargs):
		folder = kwargs.get('folder','PATH')

		data_dict = step11_figures.get_data_boxplot('HC2402:COPI')
		data_list = data_dict['data']
		color_list = data_dict['color']
		sig_protein_list = data_dict['sig_protein']
		positions = data_dict['positions']

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize = (5,5))
		ax = fig.add_subplot(111)
		bp = ax.boxplot(data_list,notch=0,sym="",vert=1,patch_artist=True,
			 			widths=[0.4]*len(data_list), positions = positions)
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black",linestyle="--",alpha=0.8)
		for i,patch in enumerate(bp['boxes']):
			patch.set_edgecolor("black")
			patch.set_alpha(0.6)
			patch.set_color(color_list[i])
			sub_group = data_list[i]
			x = numpy.random.normal(positions[i], 0.04, size=len(data_list[i]))
			ax.scatter(x,sub_group,color='white', alpha=0.5,edgecolor="black",s=5)
		ax.set_title('COPI complex')
		ax.set_xticklabels(sig_protein_list, rotation = 90)
		ax.set_ylim(-1,1)
		ax.set_ylabel('normalized abundances')
		plt.savefig(folder + 'fig5b_cop1_boxplot_males_females.pdf',
					bbox_inches = 'tight', dpi = 400)
	
	@staticmethod
	def figure5b_cop2_boxplot_logFC_male_female(**kwargs):
		folder = kwargs.get('folder','PATH')

		data_dict = step11_figures.get_data_boxplot('HC663:COPII')
		data_list = data_dict['data']
		color_list = data_dict['color']
		sig_protein_list = data_dict['sig_protein']
		positions = data_dict['positions']

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize = (5,5))
		ax = fig.add_subplot(111)
		bp = ax.boxplot(data_list,notch=0,sym="",vert=1,patch_artist=True,
						widths=[0.4]*len(data_list), positions = positions)
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black",linestyle="--",alpha=0.8)
		for i,patch in enumerate(bp['boxes']):
			patch.set_edgecolor("black")
			patch.set_alpha(0.6)
			patch.set_color(color_list[i])
			sub_group = data_list[i]
			x = numpy.random.normal(positions[i], 0.04, size=len(data_list[i]))
			ax.scatter(x,sub_group,color='white', alpha=0.5,edgecolor="black",s=5)
		ax.set_title('COPII complex')
		ax.set_xticklabels(sig_protein_list, rotation = 90)
		ax.set_ylim(-1,1)
		ax.set_ylabel('normalized abundances')
		plt.savefig(folder + 'fig5b_cop2_boxplot_males_females.pdf',
					bbox_inches = 'tight', dpi = 400)

	@staticmethod
	def figure5b_cohesin_boxplot_logFC_male_female(**kwargs):
		folder = kwargs.get('folder','PATH')

		data_dict = figure5b.get_data_boxplot('Cohesin')
		data_list = data_dict['data']
		color_list = data_dict['color']
		sig_protein_list = data_dict['sig_protein']
		positions = data_dict['positions']

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize = (5,5))
		ax = fig.add_subplot(111)
		bp = ax.boxplot(data_list,notch=0,sym="",vert=1,patch_artist=True,
						widths=[0.4]*len(data_list), positions = positions)
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black",linestyle="--",alpha=0.8)
		for i,patch in enumerate(bp['boxes']):
			patch.set_edgecolor("black")
			patch.set_alpha(0.6)
			patch.set_color(color_list[i])
			sub_group = data_list[i]
			x = numpy.random.normal(positions[i], 0.04, size=len(data_list[i]))
			ax.scatter(x,sub_group,color='white', alpha=0.5,edgecolor="black",s=5)
		ax.set_title('Cohesin complex')
		ax.set_xticklabels(sig_protein_list, rotation = 90)
		ax.set_ylim(-1,1)
		ax.set_ylabel('normalized abundances')
		plt.savefig(folder + 'fig5b_cohesin_boxplot_males_females.pdf',
					bbox_inches = 'tight', dpi = 400)

if __name__ == "__main__":
	## EXECUTE STEP11
	step11_preparation.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	step11_check_stats.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	step11_figures.execute(folder = sys.argv[1], output_folder = sys.argv[2])
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource is Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.
GNU LICENSE
Download script here
Computational Pipeline

All Code Material

STEP 11: Python Code