STEP 9: Python Code

									
class file_Loader:
	"""Loads the complex-filtered tables that step 9 operates on."""

	# Dataset identifiers in the order in which load_data returns the tables.
	# Every table is read from 'complex_filtered_<name>.tsv.gz'.
	# NOTE(review): the original code listed dat_gygi2 in the result but never
	# loaded it; the 'gygi2' file name is inferred from the naming pattern of
	# the other datasets -- confirm that this file exists.
	DATASET_NAMES = ('gygi1', 'gygi2', 'gygi3', 'battle_protein',
					'mann', 'tcga_colo', 'tcga_breast', 'tcga_ovarian')

	@staticmethod
	def load_data(**kwargs):
		"""Load every complex-filtered dataset from one folder.

		Keyword arguments:
			folder: location of the '*.tsv.gz' files (default 'PATH').
			loader: optional callable taking (folder, file_name) and returning
				the loaded table; defaults to DataFrameAnalyzer.open_in_chunks.

		Returns:
			A list with one loaded table per entry of DATASET_NAMES, in that
			order (gygi1, gygi2, gygi3, battle_protein, mann, tcga_colo,
			tcga_breast, tcga_ovarian).
		"""
		folder = kwargs.get('folder','PATH')

		# Resolve the default lazily so that a caller-supplied loader works
		# even where DataFrameAnalyzer is not available.
		loader = kwargs.get('loader')
		if loader is None:
			loader = DataFrameAnalyzer.open_in_chunks

		return [loader(folder, 'complex_filtered_%s.tsv.gz' % name)
				for name in file_Loader.DATASET_NAMES]

class step9_preparation:
	"""Builds the Entrez gene lists of stable and variable complexes for GO analysis."""

	@staticmethod
	def execute(**kwargs):
		"""Load all datasets and write the two gene-list files.

		Keyword arguments:
			folder: location of the complex-filtered datasets (default 'PATH').
			output_folder: location of 'figure2B_underlying_data.tsv' and
				destination of the exported lists (default 'PATH').
		"""
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('load_data')
		data_list = file_Loader.load_data(folder = folder)

		print('load_GO_input_df')
		step9_preparation.load_GO_input_df(data_list, output_folder)

	@staticmethod
	def load_GO_input_df(data_list, output_folder):
		"""Export Entrez gene IDs of proteins in stable and in variable complexes.

		The column means of 'figure2B_underlying_data.tsv' decide the grouping:
		complexes with a mean above zero are treated as stable, complexes with
		a mean below zero as variable (a mean of exactly zero belongs to
		neither group). For each group the member proteins are gathered from
		every table in data_list, translated from symbols to Entrez gene IDs
		and written, one ID per line, to
		'suppFigure3_stable_protein_list.tsv' and
		'suppFigure3_variable_protein_list.tsv' inside output_folder.

		Parameters:
			data_list: tables with a 'ComplexName' column whose index holds
				the protein names.
			output_folder: folder holding the input table and receiving the
				two output files.
		"""
		import os

		df = DataFrameAnalyzer.getFile(output_folder ,'figure2B_underlying_data.tsv')

		mean_df = df.mean()
		stable_ids = list(mean_df[mean_df > 0].index)
		variable_ids = list(mean_df[mean_df < 0].index)

		stable_proteins = step9_preparation._collect_proteins(stable_ids, data_list)
		variable_proteins = step9_preparation._collect_proteins(variable_ids, data_list)

		# os.path.join yields the same path as plain concatenation when
		# output_folder ends with a separator and a valid one when it does not.
		step9_preparation._export_entrezgenes(
			stable_proteins,
			os.path.join(output_folder, 'suppFigure3_stable_protein_list.tsv'))
		step9_preparation._export_entrezgenes(
			variable_proteins,
			os.path.join(output_folder, 'suppFigure3_variable_protein_list.tsv'))

		#these files are submitted to DAVID v6.8

	@staticmethod
	def _collect_proteins(complex_ids, data_list):
		"""Return the unique upper-cased protein names of the given complexes.

		Each complex ID is printed as it is processed. The result is sorted so
		that the order no longer depends on set iteration order.
		"""
		proteins = set()
		for complexID in complex_ids:
			print(complexID)
			for d in data_list:
				sub = d[d.ComplexName==complexID]
				proteins.update(item.upper() for item in sub.index)
		return sorted(str(protein) for protein in proteins)

	@staticmethod
	def _export_entrezgenes(protein_symbols, path):
		"""Map symbols to Entrez gene IDs and write them to path, one per line."""
		m = Mapper(protein_symbols, input = 'symbol', output = 'entrezgene')
		entrezgenes = [str(int(gene))
					for gene in utilsFacade.finite(list(m.trans_df['entrezgene']))]
		exportText = '\n'.join(entrezgenes)
		# The file is opened only once the content exists, so a failing lookup
		# leaves no empty file behind, and 'with' closes the handle on errors.
		# Encoding explicitly keeps the binary-mode write valid on Python 2 and 3.
		with open(path, 'wb') as o:
			o.write(exportText.encode('ascii'))

class step9_figure:
	"""Draws one GO fold-enrichment bar plot per GO category from DAVID result tables."""

	@staticmethod
	def execute(**kwargs):
		"""Run the GO-enrichment figure generation.

		Keyword arguments:
			folder: forwarded unchanged (default 'PATH').
			output_folder: folder holding the DAVID result tables and
				receiving the PDF figures (default 'PATH').
		"""
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('SUPP-FIGURE3: main_supp_figure2c_GOenrichment')
		step9_figure.main_supp_figure2c_GOenrichment(folder = folder, output_folder = output_folder)

	@staticmethod
	def main_supp_figure2c_GOenrichment(**kwargs):
		"""Load the filtered GO tables and plot them.

		Keyword arguments:
			folder: accepted for call compatibility; not used here.
			output_folder: folder with the DAVID tables and for the figures
				(default 'PATH').
		"""
		output_folder = kwargs.get('output_folder','PATH')

		print('load_data')
		go_dict = step9_figure.load_data(output_folder)

		print('make_go_barplot')
		step9_figure.make_go_barplot(go_dict, output_folder)

	@staticmethod
	def load_data(output_folder):
		"""Read and filter the DAVID tables of stable and variable proteins.

		Returns:
			A dict with the keys 'BP', 'CC' and 'MF'. Each value is a list
			holding one (stable_table, variable_table) tuple; both tables are
			restricted to that category, filtered by FDR, Count and '%', and
			extended with a 'go_term' column.
		"""
		category_list = ['GOTERM_BP_DIRECT','GOTERM_MF_DIRECT','GOTERM_CC_DIRECT']
		stable_data = DataFrameAnalyzer.getFile(output_folder,
					'wpc_suppFigure2C_goEnrichment_stableProteins_DAVID6_8.txt')
		variable_data = DataFrameAnalyzer.getFile(output_folder,
						'wpc_suppFigure2C_goEnrichment_variableProteins_DAVID6_8.txt')

		go_dict = dict((e1,list()) for e1 in ['BP','CC','MF'])
		for category in category_list:
			stable_sub = step9_figure._select_category(stable_data, category)
			variable_sub = step9_figure._select_category(variable_data, category)
			# 'GOTERM_BP_DIRECT' -> 'BP'
			go_dict[category.split('_')[1]].append((stable_sub, variable_sub))

		return go_dict

	@staticmethod
	def _select_category(data, category):
		"""Return the strictly filtered rows of one GO category.

		Keeps rows whose index equals category and that have FDR <= 0.01,
		Count >= 50 and '%' >= 10, and adds a 'go_term' column with the text
		following the first '~' of 'Term' (the whole term if there is no '~').
		"""
		sub = data[data.index == category]
		sub = sub[sub.FDR <= 0.01]
		sub = sub[sub.Count >= 50]
		sub = sub[sub['%'] >= 10]
		term_list = [item.split('~', 1)[-1] for item in sub['Term']]
		# assign() returns a new frame, so nothing is written onto a slice of
		# the input table; the list is matched to the rows by position.
		return sub.assign(go_term = term_list)

	@staticmethod
	def _build_bar_values(stable_data, variable_data):
		"""Align the fold enrichments of both groups on one list of GO terms.

		Rows with FDR >= 0.01 are dropped first. Terms of the stable table
		come first, ordered by decreasing fold enrichment; terms found only in
		the variable table follow, ordered by increasing fold enrichment. A
		term missing from one group gets the value 0 for that group.

		Returns:
			(label_list, stable_fe_list, variable_fe_list), three lists of
			equal length.
		"""
		stable_data = stable_data[stable_data.FDR < 0.01]
		variable_data = variable_data[variable_data.FDR < 0.01]

		stable_data = stable_data.sort_values('Fold Enrichment', ascending = False)
		stable_terms = list(stable_data.go_term)
		stable_fe_list = list(stable_data['Fold Enrichment'])

		# First fold enrichment seen per term, replacing a table scan per term.
		variable_fe_by_term = dict()
		for term, fold_enrichment in zip(variable_data.go_term, variable_data['Fold Enrichment']):
			variable_fe_by_term.setdefault(term, fold_enrichment)
		variable_fe_list = [variable_fe_by_term.get(term, 0) for term in stable_terms]

		variable_other_data = variable_data[~variable_data.go_term.isin(stable_terms)]
		variable_other_data = variable_other_data.sort_values('Fold Enrichment', ascending = True)
		label_list = stable_terms + list(variable_other_data.go_term)
		variable_fe_list.extend(variable_other_data['Fold Enrichment'])

		# Pad the stable values for the terms that only the variable group has.
		stable_fe_list = stable_fe_list + [0]*(len(variable_fe_list)-len(stable_fe_list))
		return label_list, stable_fe_list, variable_fe_list

	@staticmethod
	def make_go_barplot(go_dict, output_folder):
		"""Save one mirrored horizontal bar plot per GO category as a PDF.

		Stable fold enrichments are drawn as negative bars, variable ones as
		positive bars. The figure is written to
		'suppFigure3_<category>_complexes_DAVID6_8.pdf' in output_folder.

		Parameters:
			go_dict: mapping as returned by load_data.
			output_folder: destination folder of the PDF files.
		"""
		import os

		sns.set(context='notebook', style='white',
			palette='deep', font='Liberation Sans', font_scale=1,
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		for category in go_dict:
			stable_data, variable_data = go_dict[category][0]
			label_list, stable_fe_list, variable_fe_list = step9_figure._build_bar_values(
				stable_data, variable_data)

			sc_stable,colors_stable = colorFacade.get_specific_color_gradient(plt.cm.Blues, np.array(stable_fe_list))
			sc_variable,colors_variable = colorFacade.get_specific_color_gradient(plt.cm.Reds, np.array(variable_fe_list))

			fig = plt.figure()
			ax = fig.add_subplot(111)
			y_pos = np.arange(len(label_list))
			ax.barh(y_pos, list((-1)*np.array(stable_fe_list)), align='center',color=colors_stable, ecolor='black')
			ax.barh(y_pos, list(variable_fe_list), align='center',color=colors_variable, ecolor='black')
			ax.set_yticklabels([])

			# The term labels go on a twin axis, i.e. on the right-hand side.
			ax2 = ax.twinx()
			ax2.set_yticks(list(range(len(label_list))))
			ax2.set_yticklabels(label_list)
			ax.set_xlabel('Fold Enrichment')
			ax.set_title('GO:'+str(category))
			ax.set_ylim(-1,len(label_list) + 1)
			ax2.set_ylim(-1,len(label_list) + 1)
			fig.savefig(os.path.join(output_folder, 'suppFigure3_' + category + '_complexes_DAVID6_8.pdf'),
						bbox_inches = 'tight', dpi = 400)
			# Release the figure; otherwise every category leaves one open.
			plt.close(fig)

def main(argv=None):
	"""Run step 9: the preparation of the gene lists followed by the figures.

	Parameters:
		argv: argument list laid out like sys.argv, i.e.
			[program, folder, output_folder]; defaults to sys.argv.

	Returns:
		0 after both steps ran, 2 when folder or output_folder is missing.
	"""
	import sys

	if argv is None:
		argv = sys.argv
	if len(argv) < 3:
		# Report missing arguments instead of failing with an IndexError.
		program = argv[0] if argv else 'step9'
		sys.stderr.write('usage: %s <folder> <output_folder>\n' % program)
		return 2

	step9_preparation.execute(folder = argv[1], output_folder = argv[2])
	step9_figure.execute(folder = argv[1], output_folder = argv[2])
	return 0

if __name__ == "__main__":
	## EXECUTE STEP9
	raise SystemExit(main())
									
								

All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource are Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.

GNU LICENSE

Download script here