Impact on Proteotype: Figure4

FIGURE 4: Python Code

									
class figure4a:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('FIGURE4A: main_figure4a_complex_intern_landscapes')
		figure4a.main_figure4a_complex_intern_landscapes(folder = folder, output_folder = output_folder)

	@staticmethod
	def main_figure4a_complex_intern_landscapes(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('get_data')
		data_dict = figure4a.get_data(folder = folder)

		name_list = ['gygi3','gygi1','battle_protein','mann_all_log2',
					 'tcgaBreast','tcgaOvarian', 'tcgaColoCancer']

		data_list = [data_dict['gygi3'], data_dict['gygi1'],
					 data_dict['battle_protein'], data_dict['mann'],
					 data_dict['tcgaBreast'], data_dict['tcgaOvarian'],
					 data_dict['tcgaColoCancer']]

		print('iteration_complexes')
		figure4a.iteration_complexes(data_list, name_list, output_folder = output_folder)

	@staticmethod
	def get_data(**kwargs):
		folder = kwargs.get('folder','PATH')

		stoch_gygi3 = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
		stoch_gygi1 = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi1.tsv.gz')
		stoch_mann = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_mann.tsv.gz')
		stoch_battle_protein = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_battle_protein.tsv.gz')
		stoch_tcgaBreast = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaBreast.tsv.gz')
		stoch_tcgaOvarian = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaOvarian.tsv.gz')
		stoch_tcgaColorCancer = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaColoCancer.tsv.gz')

		return {'gygi3':stoch_gygi3,
				'gygi1':stoch_gygi1,
				'mann':stoch_mann,
				'battle_protein':stoch_battle_protein,
				'tcgaBreast':stoch_tcgaBreast,
				'tcgaOvarian':stoch_tcgaOvarian,
				'tcgaColoCancer':stoch_tcgaColorCancer}

	@staticmethod
	def load_relevant_complexes(**kwargs):
		folder = kwargs.get('folder','PATH')

		com_data = DataFrameAnalyzer.getFile(folder, 'figure4a_relevant_complexes.tsv')
		relevant_complexes = list(com_data.index)
		return relevant_complexes

	@staticmethod
	def prepare_variance_dataframe(name_list, data_list, complex_df, proteins):
		df_list = list()
		for protein in proteins:
			temp = list()
			for d,n in zip(data_list, name_list):
				sub = complex_df[complex_df.fileName==n]
				if protein in list(sub.index):
					if type(sub.loc[protein]) == pd.DataFrame:
						quant_cols = utilsFacade.filtering(d, 'quant_', condition = 'startswith')
						s = sub.loc[protein][quant_cols].T
						coverage_list = list()
						for c,col in enumerate(list(s.columns)):
							temp_finite = utilsFacade.finite(list(np.array(s)[:,c]))
							tempSmall = list(np.array(s)[:,c])
							coverage_list.append(float(len(temp_finite))/float(len(tempSmall))*100)
						s = sub.loc[protein]
						s['coverage'] = pd.Series(coverage_list, index = s.index)
						s = s.sort_values('coverage', ascending = False)
						var_value = s.iloc[0]['relative_variance']
						temp.append(var_value)
					else:
						var_value = sub.loc[protein]['relative_variance']
						temp.append(var_value)
				else:
					temp.append(np.nan)
			df_list.append(temp)
		df = pd.DataFrame(df_list)
		df.index = proteins
		df.columns = name_list
		df = df.dropna(thresh = int(len(df.columns)/2.0))
		return df.T

	@staticmethod
	def rank_dataset(df, proteins, name_list):
		df_lst = list()
		dfList = map(list,df.values)
		for f in dfList:
			rank_list = rankdata(f)
			all_ranks = rankdata(utilsFacade.finite(f))
			if len(all_ranks)>0:
				minimum_rank = all_ranks.min() 
			temp_list = list()
			for fitem, rank in zip(f, rank_list):
				if str(fitem) != 'nan':
					temp_list.append(rank)
				else:
					temp_list.append(np.nan)
			df_lst.append(temp_list)
		df = pd.DataFrame(df_lst)
		df.columns = proteins
		df.index = name_list
		return df

	@staticmethod
	def get_zscores(df):
		myArray = np.array(df)
		normalizedArray = []
		for row in range(0, len(myArray)):
			list_values = []
			Min =  min(utilsFacade.finite(list(myArray[row])))
			Max = max(utilsFacade.finite(list(myArray[row])))
			mean = np.mean(utilsFacade.finite(list(myArray[row])))
			std = np.std(utilsFacade.finite(list(myArray[row])))
			for element in myArray[row]:
				list_values.append((element - mean)/std)
			normalizedArray.append(list_values)

		newArray = []
		for row in range(0, len(normalizedArray)):
			list_values = normalizedArray[row]
			newArray.append(list_values)

		new_df = pd.DataFrame(newArray)
		new_df.columns = list(df.columns)
		new_df.index = df.index
		df = new_df.copy()
		df = df.iloc[::-1]
		dfList = map(list,df.values)
		return df, dfList

	@staticmethod
	def manage_proteasome_data(df):
		try:
			df = df.drop(['PSD3','C9','C2','C6','RPN1'], axis = 1)
		except:
			remove_list = ['PSD3','C9','C2','C6','RPN1']
			remove_list1 = [item[0] + item[1:].lower() for item in remove_list]
			remove_list = remove_list + remove_list1
			df = df[~df.index.isin(remove_list)]
		return df		

	@staticmethod
	def plot_heatmap(df, complex_id, altName, **kwargs):
		output_folder = kwargs.get('output_folder','PATH')

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = False

		sc, color_list = colorFacade.get_specific_color_gradient(plt.cm.RdBu_r,
						 np.array(utilsFacade.finite(utilsFacade.flatten(map(list, df.values)))),
						 vmin = -2, vmax = 2)
		
		plt.clf()
		if complex_id.find('26S')!=-1 or complex_id.find('NPC')!=-1:
			fig = plt.figure(figsize = (10,3))
		else:
			fig = plt.figure(figsize = (5,3))
		ax = fig.add_subplot(111)
		mask = df.isnull()
		mask1 = mask.copy()
		mask1 = mask1.replace(True,'bla')
		mask1 = mask1.replace(False,'20')
		mask1 = mask1.replace('bla',1)
		mask1 = mask1.replace('20',0)
		sns.heatmap(mask1, cmap = ['grey'], linewidth = 0.2,
					cbar = False, xticklabels = [], yticklabels = [])
		sns.heatmap(df, cmap = plt.cm.RdBu_r, linewidth = 0.2,
					mask = mask, vmin = -2, vmax = 2)
		plt.savefig(output_folder + 'fig4a_' + altName + '_heatmap_subunits.pdf',
					bbox_inches = 'tight', dpi = 400)

	@staticmethod
	def get_alternative_name(complex_id):
		altName = complex_id
		altName = '_'.join(altName.split(' '))
		altName = altName.replace('/','')
		altName = altName.replace(':','')
		return altName

	@staticmethod
	def iteration_complexes(data_list, name_list, **kwargs):
		do_ranking = kwargs.get('do_ranking', False)
		output_folder = kwargs.get('output_folder','PATH')

		relevant_complexes = figure4a.load_relevant_complexes()
		concatanated_complex_dfs = list()
		for complex_id in relevant_complexes:
			print('*************************')
			print(complex_id)
			altName = figure4a.get_alternative_name(complex_id)

			concat_list = list()
			for d,n in zip(data_list, name_list):
				sub = d[d.ComplexName == complex_id]
				if len(sub)>0:
					sub['fileName'] = pd.Series([n]*len(sub), index = sub.index)
					concat_list.append(sub)
			complex_df = pd.concat(concat_list)
			complex_df.index = [item.upper() for item in list(complex_df.index)]
			proteins = list(set(complex_df.index))			

			df = figure4a.prepare_variance_dataframe(name_list, data_list, complex_df, proteins)
			if do_ranking == True:
				df = figure4a.rank_dataset(df, proteins, name_list)
			df, dfList = figure4a.get_zscores(df)
			if complex_id.find('26S')!=-1:
				df = figure4a.manage_proteasome_data(df)

			protein_list = list(df.columns)
			label_list = list()
			median_list = list()
			for c,col in enumerate(df.columns):
				median_list.append(np.mean(utilsFacade.finite(list(np.array(df)[:,c]))))
				label_list.append(col)
			median_list, label_list = zip(*sorted(zip(median_list, label_list), reverse = False))
			df = df[list(label_list)]
			if complex_id.find('Kornberg')!=-1:
				complex_id = 'Kornbergs mediator (SRB) complex'

			df1 = df.T.copy()
			df1['complex_id'] = pd.Series([complex_id]*len(df1), index = df1.index)
			concatanated_complex_dfs.append(df1)
			print('plot_heatmap')
			print('*************************')

			figure4a.plot_heatmap(df, complex_id, altName, output_folder = output_folder)

		complex_data = pd.concat(concatanated_complex_dfs)
		complex_data.to_csv(output_folder + 'fig4a_underlyingData_complexes_RNA_newData_' + time.strftime('%Y%m%d') + '.tsv',
							sep = '\t')
		return complex_data

class figure4b:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('FIGURE4B: main_figure4b_26SProteasome_analysis')
		figure4b.main_figure4b_26SProteasome_analysis()

	@staticmethod
	def main_figure4b_26SProteasome_analysis(**kwargs):
		output_folder = kwargs.get('output_folder','PATH')

		print("load_boxplot_data")
		sig_data_gygi3, sig_data_bp = figure4b.load_boxplot_data()

		print('plot_boxplot_complexVariance')
		figure4b.plot_boxplot_complexVariance(sig_data_gygi3,
											  output_folder = output_folder)

	@staticmethod
	def load_boxplot_data(**kwargs):
		folder = kwargs.get('folder','PATH')

		data = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
		sig_data_gygi3 = data[data.ComplexID=="26S Proteasome"]

		data = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_battle_protein.tsv.gz')
		sig_data_bp = data[data.ComplexID=="26S Proteasome"]
		return sig_data_gygi3,sig_data_bp

	@staticmethod
	def plot_boxplot_complexVariance(sig_data, **kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('folder','PATH')
		
		dataset = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
		dataset = dataset[dataset.ComplexID=="26S Proteasome"]
		quantCols = utilsFacade.get_quantCols(sig_data)
		dataset = dataset[quantCols]
		proteinList = list(dataset.index)
		dataset.index = proteinList
		dataset = dataset.drop_duplicates()

		complexID = "26S Proteasome"
		sub = sig_data[sig_data.ComplexID==complexID]
		sub = sub.sort_values("relative_variance")
		pvalueDict = sub.to_dict()["levene_pval.adj"]
		varianceDict = sub.to_dict()["relative_variance"]
		mean_variance = np.mean(utilsFacade.finite(sub["relative_variance"]))
		stateDict = sub.to_dict()["state"]
		quant_sub = sub[quantCols].T
		dsub = dataset.T[quant_sub.columns]
		dsub = dsub.T.drop_duplicates()
		dsub = dsub.T

		original_dataList = list()
		dataList1 = list()
		proteins1 = list()
		dataList2 = list()
		proteins = list()
		proteins2 = ["Psmb7","Psmd10","Psmb6","Psmb5","Psmb8","Psmd9","Psmb9","Psmb10"]
		for c,col in enumerate(dsub.columns):
			if col!="Psma8" and col!="Psmd4":#not enough datapoints/coverage
				if col in proteins2:
					dataList2.append(utilsFacade.finite(dsub.iloc[:,c]))
					proteins.append(col)
				else:
					dataList1.append(utilsFacade.finite(dsub.iloc[:,c]))
					proteins1.append(col)
		proteins2 = proteins

		immuno_proteasome = ["Psmb5","Psmb6","Psmb7","Psmb8","Psmb9","Psmb10"]


		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize=(15,6))
		gs = gridspec.GridSpec(3,3)
		ax = plt.subplot(gs[0:,0:2])
		ax.axhline(1, color = 'k', linestyle = '--')
		ax.axhline(0.5, color = 'k', linestyle = '--')
		ax.axhline(-1, color = 'k', linestyle = '--')
		ax.axhline(0, color = 'k', linestyle = '--')
		ax.axhline(-0.5, color = 'k', linestyle = '--')
		bp=ax.boxplot(dataList1,notch=0,sym="",vert=1,patch_artist=True,widths=[0.8]*len(dataList1))
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black")
		plt.setp(bp['whiskers'], color="black")
		for i,patch in enumerate(bp['boxes']):
			protein=proteins1[i]
			x = numpy.random.normal(i+1, 0.04, size=len(dataList1[i]))
			if protein in immuno_proteasome:
				patch.set_facecolor("orange")	
				patch.set_edgecolor("orange")
			else:
				patch.set_facecolor("grey")	
				patch.set_edgecolor("grey")
			patch.set_alpha(0.8)
		plt.xticks(list(xrange(len(proteins1)+1)))
		ax.set_xlim(0,len(proteins1)+1)
		ax.set_xticklabels([""]+proteins1,fontsize=13,rotation=90)
		ax.set_ylabel("Relative Abundance",fontsize=13)
		plt.tick_params(axis="y",which="both",bottom="off",top="off",labelsize=12)
		complex_name = complexID.replace("/","_")
		complex_name = complex_name.replace(":","_")
		ax.set_ylim(-1.5,1.5)

		ax = plt.subplot(gs[0:,2:])
		ax.axhline(1, color = 'k', linestyle = '--')
		ax.axhline(0.5, color = 'k', linestyle = '--')
		ax.axhline(-1, color = 'k', linestyle = '--')
		ax.axhline(-0.5, color = 'k', linestyle = '--')
		ax.axhline(0, color = 'k', linestyle = '--')
		bp = ax.boxplot(dataList2,notch=0,sym="",vert=1,patch_artist=True,widths=[0.8]*len(dataList2))
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black")
		plt.setp(bp['whiskers'], color="black")
		for i,patch in enumerate(bp['boxes']):
			protein = proteins2[i]
			x = numpy.random.normal(i+1, 0.04, size=len(dataList2[i]))
			if protein in immuno_proteasome:
				patch.set_facecolor("orange")	
				ax.scatter(x,dataList2[i],color='white', alpha=0.9,edgecolor="brown",s=20)		
			elif protein.find("a")!=-1 or protein.find("b")!=-1:
				patch.set_facecolor("#95DCEC")	
				ax.scatter(x,dataList2[i],color='white', alpha=0.9,edgecolor="darkblue",s=20)		
			elif protein.find("c")!=-1 or protein.find("d")!=-1:
				patch.set_facecolor("grey")	
				ax.scatter(x,dataList2[i],color='white', alpha=0.9,edgecolor="black",s=20)		
			patch.set_edgecolor("black")
			patch.set_alpha(0.8)
		plt.xticks(list(xrange(len(proteins2)+1)))
		ax.set_xlim(0,len(proteins2)+1)
		ax.set_xticklabels([""]+[item.upper() for item in proteins2],fontsize=13,rotation=90)
		ax.set_yticklabels([])
		plt.tick_params(axis="y",which="both",bottom="off",top="off",labelsize=12)
		complex_name = complexID.replace("/","_")
		complex_name = complex_name.replace(":","_")
		ax.set_ylim(-1.5,1.5)
		plt.savefig(output_folder + "fig4b_stochiometry_data_complexes_26SProteasome_gygi3.pdf",
					bbox_inches="tight",dpi=600)

if __name__ == "__main__":
	## EXECUTE FIGURE4
	figure4a.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	figure4b.execute(folder = sys.argv[1], output_folder = sys.argv[2])
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource is Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.
GNU LICENSE
Download script here
Computational Pipeline

All Code Material

FIGURE 4: Python Code