Impact on Proteotype: STEP8

STEP 8: Python Code

									
class file_Loader:

	@staticmethod
	def get_data(**kwargs):
		folder = kwargs.get('folder','PATH')

		stoch_gygi3 = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
		stoch_gygi1 = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi1.tsv.gz')
		stoch_mann = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_mann.tsv.gz')
		stoch_battle_protein = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_battle_protein.tsv.gz')
		stoch_tcgaBreast = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaBreast.tsv.gz')
		stoch_tcgaOvarian = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaOvarian.tsv.gz')
		stoch_tcgaColorCancer = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaColoCancer.tsv.gz')

		data_dict = {'gygi3':stoch_gygi3, 
					 'gygi1':stoch_gygi1,
					 'mann':stoch_mann,
					 'battle_protein':stoch_battle_protein,
					 'tcgaBreast':stoch_tcgaBreast,
					 'tcgaOvarian':stoch_tcgaOvarian,
					 'tcgaColoCancer':stoch_tcgaColorCancer}

		return data_dict

	@staticmethod
	def load_complex_dictionary(**kwargs):
		folder = kwargs.get('folder','PATH')

		complexDict = DataFrameAnalyzer.read_pickle(folder + 'complex_dictionary.pkl')
		return complexDict

class step8_preparation:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('select_complexes for figure presentation; or iterate through all possible complexes')
		relevant_complexes = step8_preparation.select_complexes(folder = folder)

	@staticmethod
	def select_complexes(**kwargs):
		folder = kwargs.get('folder','PATH')

		data_dict = file_Loader.get_data(folder = folder)
		name_list = ['gygi3','gygi1','battle_protein','mann_all_log2',
					 'tcgaBreast','tcgaOvarian', 'tcgaColoCancer']
		data_list = [data_dict['gygi3'], data_dict['gygi1'], data_dict['battle_protein'],
					data_dict['mann'], data_dict['tcgaBreast'], data_dict['tcgaOvarian'],
					data_dict['tcgaColoCancer']]
		
		complexSet = list()
		for d, n in zip(data_list, name_list):
			complexSet.append(set(d.ComplexName))
		complexSet = set.union(*complexSet)		
		complexSet = list(complexSet)
		
		#for visualization purpose show the dissection for some stable/variable complexes
		relevant_complexes = list()
		for complex_id in complexSet:
			if complex_id.find('COP')!=-1 or complex_id.find('26S')!=-1 or complex_id.find('eIF4F')!=-1 or complex_id.find('F0')!=-1 or complex_id.find('NPC')!=-1 or complex_id.find('NuRD')!=-1 or complex_id.find('RSC')!=-1 or complex_id.find('mediator')!=-1 or complex_id.find('HAT')!=-1:
				relevant_complexes.append(complex_id)		
		
		o = open(folder + 'figure4a_relevant_complexes.tsv', 'w')		
		export_text = '\n'.join(relevant_complexes)
		o.write(export_text)
		o.close()
		return relevant_complexes

class step8_stats_check:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('check_on_overall_agreement_between_datasets')
		mean_pval = step8_stats_check.check_on_overall_agreement_between_datasets(folder = folder)

	@staticmethod
	def check_on_overall_agreement_between_datasets(**kwargs):
		folder = kwargs.get('folder','PATH')

		fname = 'fig4_stability_complexSubunits.txt'
		data = DataFrameAnalyzer.getFile(folder, fname)

		complex_list = list(set(data.index))
		datasets = list(set(data['dataset of origin']))
		all_complex_corrs = list()
		all_rand_complex_corrs = list()
		for complex_id in complex_list:
			print(complex_id)
			sub = data.loc[complex_id]
			complex_dict = dict((e1,dict()) for e1 in datasets)
			for d in datasets:
				dsub = sub[sub['dataset of origin']==d]
				temp_df = dsub[['z-score']]
				temp_df.index = dsub.Protein
				temp_dict = temp_df['z-score'].to_dict()
				complex_dict[d] = temp_dict
			complex_df = pd.DataFrame(complex_dict)

			#reshuffle dataset to extract random correlations
			table_list = list()
			for i,row in complex_df.iterrows():
				temp = list(row)
				np.random.shuffle(temp)
				table_list.append(temp)
			rand_data = pd.DataFrame(table_list)
			rand_data = rand_data.T
			table_list = list()
			for i,row in rand_data.iterrows():
				temp = list(row)
				np.random.shuffle(temp)
				table_list.append(temp)
			rand_data = pd.DataFrame(table_list)
			rand_data = rand_data.T

			complex_corrs = utilsFacade.get_correlation_values(complex_df.corr())
			rand_complex_corrs = utilsFacade.get_correlation_values(rand_data.corr())

			all_complex_corrs.append(complex_corrs)
			all_rand_complex_corrs.append(rand_complex_corrs)
		all_complex_corrs = utilsFacade.flatten(all_complex_corrs)
		all_rand_complex_corrs = utilsFacade.flatten(all_rand_complex_corrs)

		#check normality of distribution with Shapiro-test
		w_true, p_true = scipy.stats.shapiro(all_complex_corrs)
		w_rand, p_rand = scipy.stats.shapiro(all_rand_complex_corrs)
		print(w_true, w_rand)

		pvalues = list()
		for i in np.arange(1,100):
			pvalues.append(scipy.stats.ttest_ind(random.sample(all_complex_corrs, 1000), random.sample(all_rand_complex_corrs, 1000))[1])
		mean_pvalue = np.mean(pvalues)#5.66x10-17
		return mean_pvalue

	@staticmethod
	def calculate_enrichment_for_variable_member(df1):
		#df1 calculated in step8_figures
		df1 = df1.drop('complex_id', axis = 1)

		pvalue_dict = dict()
		for col in list(df1.columns)[:-1]:
			zscores = list(df1[col])
			pvalues = list()
			for item in zscores:
				pvalues.append(statsFacade.zscore_to_pvalue(item)[0])
			pvalue_dict[col] = pvalues
		pvalue_df = pd.DataFrame(pvalue_dict)
		pvalue_df.index = df1.index

		pvalue_dict = dict()
		for protein in list(df1.index):
			sub = np.array(df1.loc[protein])
			temp = sub[sub >= 1]
			diff_proteins = list(set(df1.index).difference(set(protein)))
			rest_sub = df1.T[diff_proteins].T
			rest_var = 0
			rest_stab = 0
			for protein1 in list(rest_sub.index):
				rsub = np.array(df1.loc[protein1])
				rtemp = rsub[rsub >= 1]
				rest_var += len(rtemp)
				rest_stab += len(rsub) - len(rtemp)
			table = [[len(temp),len(sub) - len(temp)],[rest_var, rest_stab]]
			if len(temp)>0:
				odds, pval = scipy.stats.fisher_exact(table)
				print(protein, pval, odds)
				pvalue_dict[protein] = pval
		return pvalue_dict

	@staticmethod
	def gather_numbers(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		complexDict = file_Loader.load_complex_dictionary(folder = folder)

		complex_data = DataFrameAnalyzer.getFile(folder,'wpc_figure2B_underlyingData_complexes_RNA_newData.tsv')
		complex_set = list(set(complex_data.complex_id))
		corr_dict = dict()
		gold_corr_dict = dict()
		for complexID in complex_set:
			sub = complex_data[complex_data.complex_id==complexID]
			quant_data = sub.drop(['complex_id','mean','p.value','median','label'],1)
			if len(quant_data)>=5:
				corr_data = quant_data.corr()
				corr_values = utilsFacade.get_correlation_values(corr_data)
				gold = ''
				for complex_idName in complexDict:
					altName = complexDict[complex_idName]['altName'][0]
					if altName==complexID:
						gold = complexDict[complex_idName]['goldComplex'][0]
						break
				if gold=='yes':
					gold_corr_dict[complexID] = np.mean(corr_values)
				corr_dict[complexID] = np.mean(corr_values)
		
		label_list = list()
		avCorr_list = list()
		for k,v in zip(gold_corr_dict.keys(), gold_corr_dict.values()):
			label_list.append(k)
			avCorr_list.append(v)
		avCorr_list, label_list = zip(*sorted(zip(avCorr_list, label_list)))

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = False
		
		plt.clf()
		fig = plt.figure()
		ax = fig.add_subplot(111)
		ax.scatter(list(xrange(len(avCorr_list))), avCorr_list, edgecolor = 'black', s = 40)
		ax.set_ylabel('mean correlation')
		plt.savefig(output_folder + 'consistency_figure2b_overview.pdf', bbox_inches = 'tight', dpi = 400)

		################################################################################################
		complex_set = list(set(complex_data.complex_id))
		corr_dict = dict()
		gold_corr_dict = dict()
		var_dict = dict()
		for complexID in complex_set:
			sub = complex_data[complex_data.complex_id==complexID]
			gold = ''
			for complex_idName in complexDict:
				altName = complexDict[complex_idName]['altName'][0]
				if altName==complexID:
					gold = complexDict[complex_idName]['goldComplex'][0]
					break
			if gold=='yes' and len(sub) >= 5:
				var_sub = sub[sub['p.value'] < 0.1]
				fraction = float(len(var_sub))/float(len(sub))*100
				var_dict[complexID] = fraction
		label_list = list()
		var_list = list()
		for k,v in zip(var_dict.keys(), var_dict.values()):
			label_list.append(k)
			var_list.append(v)
		var_list,label_list = zip(*sorted(zip(var_list,label_list)))
		var_df = pd.DataFrame({'var':var_list})
		var_df.index = label_list
		var_df.to_csv(output_folder + 'variation_dataframe_figure2B_fractionOfVariableComponents.tsv', sep = '\t')

class step8_figure4a:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		do_ranking = kwargs.get('do_ranking', False)
		output_folder = kwargs.get('output_folder','PATH')

		print('FIGURE4A: main_figure4a_complex_intern_landscapes')
		step8_figure4a.main_figure4a_complex_intern_landscapes(folder = folder, output_folder = output_folder)

	@staticmethod
	def main_figure4a_complex_intern_landscapes(**kwargs):
		folder = kwargs.get('folder','PATH')
		do_ranking = kwargs.get('do_ranking', False)
		output_folder = kwargs.get('output_folder','PATH')

		print('get_data')
		data_dict = file_Loader.get_data(folder = folder)

		name_list = ['gygi3','gygi1','battle_protein','mann_all_log2',
					 'tcgaBreast','tcgaOvarian', 'tcgaColoCancer']

		data_list = [data_dict['gygi3'], data_dict['gygi1'], data_dict['battle_protein'],
					data_dict['mann'], data_dict['tcgaBreast'], data_dict['tcgaOvarian'],
					data_dict['tcgaColoCancer']]

		print('iteration_complexes')
		step8_figure4a.iteration_complexes(data_list, name_list,
										  do_ranking = do_ranking,
										  output_folder = output_folder)

	@staticmethod
	def load_relevant_complexes(**kwargs):
		folder = kwargs.get('folder','PATH')

		com_data = DataFrameAnalyzer.getFile(folder, 'figure4a_relevant_complexes.tsv')
		relevant_complexes = list(com_data.index)
		return relevant_complexes

	@staticmethod
	def get_alternative_name(complex_id):
		altName = complex_id
		altName = '_'.join(altName.split(' '))
		altName = altName.replace('/','')
		altName = altName.replace(':','')
		return altName

	@staticmethod
	def rank_dataset(df, proteins, name_list):
		df_lst = list()
		dfList = map(list,df.values)
		for f in dfList:
			rank_list = rankdata(f)
			all_ranks = rankdata(utilsFacade.finite(f))
			if len(all_ranks) > 0:
				minimum_rank = all_ranks.min() 
			temp_list = list()
			for fitem, rank in zip(f, rank_list):
				if str(fitem) != 'nan':
					temp_list.append(rank)
				else:
					temp_list.append(np.nan)
			df_lst.append(temp_list)
		df = pd.DataFrame(df_lst)
		df.columns = proteins
		df.index = name_list
		return df

	@staticmethod
	def get_zscores(df):
		myArray = np.array(df)
		normalizedArray = []
		for row in range(0, len(myArray)):
			list_values = []
			Min =  min(utilsFacade.finite(list(myArray[row])))
			Max = max(utilsFacade.finite(list(myArray[row])))
			mean = np.mean(utilsFacade.finite(list(myArray[row])))
			std = np.std(utilsFacade.finite(list(myArray[row])))
			for element in myArray[row]:
				list_values.append((element - mean)/std)
			normalizedArray.append(list_values)

		newArray = []
		for row in range(0, len(normalizedArray)):
			list_values = normalizedArray[row]
			newArray.append(list_values)

		new_df = pd.DataFrame(newArray)
		new_df.columns = list(df.columns)
		new_df.index = df.index
		df = new_df.copy()
		df = df.iloc[::-1]
		dfList = map(list,df.values)
		return df, dfList

	@staticmethod
	def manage_proteasome_data(df):
		try:
			df = df.drop(['PSD3','C9','C2','C6','RPN1'], axis = 1)
		except:
			remove_list = ['PSD3','C9','C2','C6','RPN1']
			remove_list1 = [item[0] + item[1:].lower() for item in remove_list]
			remove_list = remove_list + remove_list1
			df = df[~df.index.isin(remove_list)]
		return df		

	@staticmethod
	def prepare_variance_dataframe(name_list, data_list, complex_df, proteins):
		df_list = list()
		for protein in proteins:
			temp = list()
			for d,n in zip(data_list, name_list):
				sub = complex_df[complex_df.fileName==n]
				if protein in list(sub.index):
					if type(sub.loc[protein]) == pd.DataFrame:
						quant_cols = utilsFacade.filtering(d, 'quant_', condition = 'startswith')
						s = sub.loc[protein][quant_cols].T
						coverage_list = list()
						for c,col in enumerate(list(s.columns)):
							temp_finite = utilsFacade.finite(list(np.array(s)[:,c]))
							tempSmall = list(np.array(s)[:,c])
							coverage_list.append(float(len(temp_finite))/float(len(tempSmall))*100)
						s = sub.loc[protein]
						s['coverage'] = pd.Series(coverage_list, index = s.index)
						s = s.sort_values('coverage', ascending = False)
						var_value = s.iloc[0]['relative_variance']
						temp.append(var_value)
					else:
						var_value = sub.loc[protein]['relative_variance']
						temp.append(var_value)
				else:
					temp.append(np.nan)
			df_list.append(temp)
		df = pd.DataFrame(df_list)
		df.index = proteins
		df.columns = name_list
		df = df.dropna(thresh = int(len(df.columns)/2.0))
		df = df.T
		return df

	@staticmethod
	def plot_heatmap(df, complex_id, altName, **kwargs):
		output_folder = kwargs.get('output_folder','PATH')

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = False

		sc, color_list = colorFacade.get_specific_color_gradient(plt.cm.RdBu_r,
									 np.array(utilsFacade.finite(utilsFacade.flatten(map(list, df.values)))),
									 vmin = -2, vmax = 2)
		
		plt.clf()
		if complex_id.find('26S')!=-1 or complex_id.find('NPC')!=-1:
			fig = plt.figure(figsize = (10,3))
		else:
			fig = plt.figure(figsize = (5,3))
		ax = fig.add_subplot(111)
		mask = df.isnull()
		mask1 = mask.copy()
		mask1 = mask1.replace(True,'bla')
		mask1 = mask1.replace(False,'20')
		mask1 = mask1.replace('bla',1)
		mask1 = mask1.replace('20',0)
		sns.heatmap(mask1, cmap = ['grey'], linewidth = 0.2, cbar = False, xticklabels = [], yticklabels = [])
		sns.heatmap(df, cmap = plt.cm.RdBu_r, linewidth = 0.2, mask = mask, vmin = -2, vmax = 2)
		plt.savefig(output_folder + 'fig4a_' + altName + '_heatmap_subunits.pdf', bbox_inches = 'tight', dpi = 400)

	@staticmethod
	def iteration_complexes(data_list, name_list, **kwargs):
		do_ranking = kwargs.get('do_ranking', False)
		output_folder = kwargs.get('output_folder','PATH')

		relevant_complexes = step8_figure4a.load_relevant_complexes()

		concatanated_complex_dfs = list()
		for complex_id in relevant_complexes:
			print('*************************')
			print(complex_id)
			altName = step8_figure4a.get_alternative_name(complex_id)

			concat_list = list()
			for d,n in zip(data_list, name_list):
				sub = d[d.ComplexName == complex_id]
				if len(sub) > 0:
					sub['fileName'] = pd.Series([n]*len(sub), index = sub.index)
					concat_list.append(sub)
			complex_df = pd.concat(concat_list)
			complex_df.index = [item.upper() for item in list(complex_df.index)]
			proteins = list(set(complex_df.index))			

			df = step8_figures.prepare_variance_dataframe(name_list, data_list, complex_df, proteins)

			if do_ranking == True:
				df = step8_figures.rank_dataset(df, proteins, name_list)

			df, dfList = step8_figures.get_zscores(df)

			if complex_id.find('26S')!=-1:
				df = step8_figures.manage_proteasome_data(df)

			protein_list = list(df.columns)
			label_list = list()
			median_list = list()
			for c,col in enumerate(df.columns):
				median_list.append(np.mean(utilsFacade.finite(list(np.array(df)[:,c]))))
				label_list.append(col)

			median_list, label_list = zip(*sorted(zip(median_list, label_list), reverse = False))
			df = df[list(label_list)]
			if complex_id.find('Kornberg')!=-1:
				complex_id = 'Kornbergs mediator (SRB) complex'

			df1 = df.T.copy()
			df1['complex_id'] = pd.Series([complex_id]*len(df1), index = df1.index)
			concatanated_complex_dfs.append(df1)
			print('plot_heatmap')
			print('*************************')
			figure4a.plot_heatmap(df, complex_id, altName, output_folder = output_folder)

		complex_data = pd.concat(concatanated_complex_dfs)
		complex_data.to_csv(output_folder + 'fig4a_underlyingData_complexes_RNA_newData.tsv',
							sep = '\t')
		return complex_data

class step8_figure4b:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('FIGURE4B: main_figure4b_26SProteasome_analysis')
		step8_figure4b.main_figure4b_26SProteasome_analysis()

	@staticmethod
	def main_figure4b_26SProteasome_analysis(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print("load_boxplot_data")
		sig_data_gygi3 = step8_figure4b.load_boxplot_data(folder = folder)

		print('plot_boxplot_complexVariance')
		figure4b.plot_boxplot_complexVariance(sig_data_gygi3, folder = folder, output_folder =  output_folder)

	@staticmethod
	def load_boxplot_data(**kwargs):
		folder = kwargs.get('folder','PATH')

		data = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
		sig_data_gygi3 = data[data.ComplexID=="26S Proteasome"]
		return sig_data_gygi3

	@staticmethod
	def plot_boxplot_complexVariance(sig_data, **kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		dataset = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
		dataset = dataset[dataset.ComplexID=="26S Proteasome"]
		quantCols = utilsFacade.get_quantCols(sig_data)
		dataset = dataset[quantCols]
		proteinList = list(dataset.index)
		dataset.index = proteinList
		dataset = dataset.drop_duplicates()

		complexID = "26S Proteasome"
		sub = sig_data[sig_data.ComplexID==complexID]
		sub = sub.sort_values("relative_variance")
		pvalueDict = sub.to_dict()["levene_pval.adj"]
		varianceDict = sub.to_dict()["relative_variance"]
		mean_variance = np.mean(utilsFacade.finite(sub["relative_variance"]))
		stateDict = sub.to_dict()["state"]
		quant_sub = sub[quantCols].T
		dsub = dataset.T[quant_sub.columns]
		dsub = dsub.T.drop_duplicates()
		dsub = dsub.T

		original_dataList = list()
		dataList1 = list()
		proteins1 = list()
		dataList2 = list()
		proteins = list()
		proteins2 = ["Psmb7","Psmd10","Psmb6","Psmb5","Psmb8","Psmd9","Psmb9","Psmb10"]
		for c,col in enumerate(dsub.columns):
			if col!="Psma8" and col!="Psmd4":#not enough datapoints/coverage
				if col in proteins2:
					dataList2.append(utilsFacade.finite(dsub.iloc[:,c]))
					proteins.append(col)
				else:
					dataList1.append(utilsFacade.finite(dsub.iloc[:,c]))
					proteins1.append(col)
		proteins2 = proteins
		immuno_proteasome = ["Psmb5","Psmb6","Psmb7","Psmb8","Psmb9","Psmb10"]

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize=(15,6))
		gs = gridspec.GridSpec(3,3)
		ax = plt.subplot(gs[0:,0:2])
		ax.axhline(1, color = 'k', linestyle = '--')
		ax.axhline(0.5, color = 'k', linestyle = '--')
		ax.axhline(-1, color = 'k', linestyle = '--')
		ax.axhline(0, color = 'k', linestyle = '--')
		ax.axhline(-0.5, color = 'k', linestyle = '--')
		bp=ax.boxplot(dataList1,notch=0,sym="",vert=1,patch_artist=True,widths=[0.8]*len(dataList1))
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black")
		plt.setp(bp['whiskers'], color="black")
		for i,patch in enumerate(bp['boxes']):
			protein=proteins1[i]
			x = numpy.random.normal(i+1, 0.04, size=len(dataList1[i]))
			if protein in immuno_proteasome:
				patch.set_facecolor("orange")	
				patch.set_edgecolor("orange")
			else:
				patch.set_facecolor("grey")	
				patch.set_edgecolor("grey")
			patch.set_alpha(0.8)
		plt.xticks(list(xrange(len(proteins1)+1)))
		ax.set_xlim(0,len(proteins1)+1)
		ax.set_xticklabels([""]+proteins1,fontsize=13,rotation=90)
		ax.set_ylabel("Relative Abundance",fontsize=13)
		plt.tick_params(axis="y",which="both",bottom="off",top="off",labelsize=12)
		complex_name = complexID.replace("/","_")
		complex_name = complex_name.replace(":","_")
		ax.set_ylim(-1.5,1.5)

		ax = plt.subplot(gs[0:,2:])
		ax.axhline(1, color = 'k', linestyle = '--')
		ax.axhline(0.5, color = 'k', linestyle = '--')
		ax.axhline(-1, color = 'k', linestyle = '--')
		ax.axhline(-0.5, color = 'k', linestyle = '--')
		ax.axhline(0, color = 'k', linestyle = '--')
		bp = ax.boxplot(dataList2,notch=0,sym="",vert=1,patch_artist=True,widths=[0.8]*len(dataList2))
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black")
		plt.setp(bp['whiskers'], color="black")
		for i,patch in enumerate(bp['boxes']):
			protein = proteins2[i]
			x = numpy.random.normal(i+1, 0.04, size=len(dataList2[i]))
			if protein in immuno_proteasome:
				patch.set_facecolor("orange")	
				ax.scatter(x,dataList2[i],color='white', alpha=0.9,edgecolor="brown",s=20)		
			elif protein.find("a")!=-1 or protein.find("b")!=-1:
				patch.set_facecolor("#95DCEC")	
				ax.scatter(x,dataList2[i],color='white', alpha=0.9,edgecolor="darkblue",s=20)		
			elif protein.find("c")!=-1 or protein.find("d")!=-1:
				patch.set_facecolor("grey")	
				ax.scatter(x,dataList2[i],color='white', alpha=0.9,edgecolor="black",s=20)		
			patch.set_edgecolor("black")
			patch.set_alpha(0.8)
		plt.xticks(list(xrange(len(proteins2)+1)))
		ax.set_xlim(0,len(proteins2)+1)
		ax.set_xticklabels([""]+[item.upper() for item in proteins2],fontsize=13,rotation=90)
		ax.set_yticklabels([])
		plt.tick_params(axis="y",which="both",bottom="off",top="off",labelsize=12)
		complex_name = complexID.replace("/","_")
		complex_name = complex_name.replace(":","_")
		ax.set_ylim(-1.5,1.5)
		plt.savefig(output_folder + "fig4b_stochiometry_data_complexes_26SProteasome_gygi3.pdf",
					bbox_inches="tight", dpi=400)

class step8_export:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('export_underlying_data_part1')
		concat_df = step8_export.export_underlying_data_part1(folder = folder, output_folder = output_folder)

	@staticmethod
	def export_underlying_data_part1(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		stoch_gygi3 = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
		stoch_gygi1 = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi1.tsv.gz')
		stoch_mann = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_mann.tsv.gz')
		stoch_battle_protein = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_battle_protein.tsv.gz')
		stoch_tcgaBreast = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaBreast.tsv.gz')
		stoch_tcgaOvarian = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaOvarian.tsv.gz')
		stoch_tcgaColorCancer = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaColoCancer.tsv.gz')

		name_list = ['gygi3','gygi1','battle_protein','mann_all_log2', 'tcgaBreast',
					 'tcgaOvarian', 'tcgaColoCancer']
		data_list = [stoch_gygi3, stoch_gygi1, stoch_battle_protein, stoch_mann_all_log2,
					 stoch_tcgaBreast, stoch_tcgaOvarian,stoch_colo_cancer]
		temp_columns = ['relative_variance','coverage','levene_pval','levene_pval.adj',
						'state','phos','ac','sumo','ub','met','location_humanProteinAtlas',
						'ComplexID','ComplexName']

		names = list()
		state_list = list()
		zscore_list = list()
		concat_list = list()
		combined_list = list()
		member_num_list = list()
		combined_corr_list = list()
		
		for name,d in zip(name_list, data_list):
			quant_cols = utilsFacade.filtering(list(d.columns),'quant_')
			quant_data = d[quant_cols]
			coverage_list = list()
			for i,row in quant_data.iterrows():
				temp = utilsFacade.finite(list(row))
				coverage_list.append(float(len(temp))/float(len(quant_cols))*100)
			subdf = d[temp_columns]
			subdf['coverage'] = pd.Series(coverage_list, index = subdf.index)
			complex_list = list(set(subdf.ComplexName))
			for complexID in complex_list:
				print(name,complexID)
				com_sub = subdf[subdf.ComplexName==complexID]
				myArray = list(com_sub['relative_variance'])

				try:
					list_values = []
					Min =  min(utilsFacade.finite(myArray))
					Max = max(utilsFacade.finite(myArray))
					mean = np.mean(utilsFacade.finite(myArray))
					std = np.std(utilsFacade.finite(myArray))
					for element in myArray:
						list_values.append((element - mean)/std)
				except:
					print('Failed_zscore')

				combined = scipy.stats.combine_pvalues(list(com_sub['levene_pval']))[1]
				combined_corr = scipy.stats.combine_pvalues(list(com_sub['levene_pval.adj']))[1]
				concat_list.append(com_sub)
				for i in xrange(len(com_sub)):
					combined_list.append(combined)
					combined_corr_list.append(combined_corr)
					names.append(name)
					member_num_list.append(len(com_sub))
					if len(list_values)==0:
						zscore_list.append(np.nan)
						state_list.append('')
					else:
						zscore_list.append(list_values[i])
						if list_values[i]>1:
							state_list.append('variable')
						elif list_values[i]<-1:
							state_list.append('stable')
						else:
							state_list.append('')

		concat_df = pd.concat(concat_list)
		concat_df['member.num'] = pd.Series(member_num_list, index = concat_df.index)
		concat_df['name'] = pd.Series(names, index = concat_df.index)
		concat_df['combined_pvalue'] = pd.Series(combined_list, index = concat_df.index)
		concat_df['combined_pvalCorr'] = pd.Series(combined_corr_list, index = concat_df.index)
		corrs = statsFacade.correct_nan_pvalues(combined_corr_list)
		concat_df['combined_pvalCorrFinal'] = pd.Series(corrs, index = concat_df.index)
		concat_df['z-score'] = pd.Series(zscore_list, index = concat_df.index)
		concat_df['new_state'] = pd.Series(state_list, index = concat_df.index)

		concat_df.to_csv(output_folder + 'suppTable3_fig4_underlyingData_subunitStability.tsv', sep = '\t')
		return concat_df

	@staticmethod
	def export_underlying_data_part2(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		filename = 'fig4_stability_complexSubunits.txt'
		data = DataFrameAnalyzer.getFile(folder, filename)

		comData_corr = DataFrameAnalyzer.getFile(folder, 'figure3_underlying_data_real_corrValues.tsv')
		comData_zsco = DataFrameAnalyzer.getFile(folder, 'figure3_underlying_data_ranked_zscores.tsv')
		comDict_corr = comData_corr.to_dict()
		comDict_zsco = comData_zsco.to_dict()

		corr_list = list()
		zscore_list = list()
		for i,row in data.iterrows():
			dataset = row['dataset of origin']
			complexID = i
			try:
				corr_value = comDict_corr[dataset][complexID]
			except:
				corr_value = ''
			corr_list.append(corr_value)

		data['complex_corr'] = pd.Series(corr_list, index = data.index)
		data.to_csv(output_folder + 'fig4_with3_underlyingData_subunitStability.tsv', sep = '\t')

	@staticmethod
	def read_data(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		complex_data = DataFrameAnalyzer.getFile(folder,'wpc_figure2B_underlyingData_complexes_RNA_newData.tsv')
		quant_data = complex_data.drop(['complex_id','mean','p.value','label'],1)
		mean_vector = quant_data.T.mean()
		median_vector = quant_data.T.median()
		pvalues = list()
		label_list = list()
		for zscore in median_vector:
			pval = utilsFacade.zscore_to_pvalue(zscore)[1]
			pvalues.append(pval)
			if pval < 0.1 and zscore > 0:
				label_list.append('variable')
			elif pval < 0.1 and zscore < 0:
				label_list.append('stable')
			else:
				label_list.append('')
		complex_data['median'] = pd.Series(list(median_vector), index = complex_data.index)
		complex_data['p.value'] = pd.Series(pvalues, index = complex_data.index)
		complex_data['label'] = pd.Series(label_list, index = complex_data.index)
		complex_data.to_csv(output_folder + 'wpc_figure2B_underlyingData_complexes_RNA_newData.tsv', sep = '\t')

if __name__ == "__main__":
	## EXECUTE STEP8
	step8_preparation.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	step8_stats_check.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	step8_figure4a.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	step8_figure4b.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	step8_export.execute(folder = sys.argv[1], output_folder = sys.argv[2])
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource is Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.
GNU LICENSE
Download script here
Computational Pipeline

All Code Material

STEP 8: Python Code