STEP 3: Python Code

									
class file_Loader(object):
	"""Load the gzipped TSV tables consumed by the step 3 classes.

	Both loaders return a dict that maps a short dataset key to whatever
	``DataFrameAnalyzer.open_in_chunks(folder, file_name)`` returns for
	that dataset (the rest of this file treats those values as pandas
	DataFrames).
	"""

	# (dataset key, file name) pairs of the complex-filtered input tables.
	_PREP_FILES = (
		('mann', 'complex_filtered_mann.tsv.gz'),
		('gygi2', 'complex_filtered_gygi2.tsv.gz'),
		('gygi3', 'complex_filtered_gygi3.tsv.gz'),
		('gygi1', 'complex_filtered_gygi1.tsv.gz'),
		('tcga_colo', 'complex_filtered_tcga_colo.tsv.gz'),
		('battle_rna', 'complex_filtered_battle_rna.tsv.gz'),
		('tcga_breast', 'complex_filtered_tcga_breast.tsv.gz'),
		('tcga_ovarian', 'complex_filtered_tcga_ovarian.tsv.gz'),
		('battle_ribo', 'complex_filtered_battle_ribo.tsv.gz'),
		('battle_protein', 'complex_filtered_battle_protein.tsv.gz'),
	)

	# (dataset key, file name) pairs of the per-complex summary tables.
	# NOTE(review): step3_preparation.export_data writes
	# 'suppFig2a_wpc_<prep key>_complex_data.tsv.gz', e.g.
	# 'suppFig2a_wpc_mann_complex_data.tsv.gz'. Several names below
	# ('mann_all_log2', 'battleRNA', 'battleRibo', '*Cancer') do not follow
	# that pattern -- confirm where these files are produced.
	_FIGURE_FILES = (
		('mann', 'suppFig2a_wpc_mann_all_log2_complex_data.tsv.gz'),
		('gygi2', 'suppFig2a_wpc_gygi2_complex_data.tsv.gz'),
		('gygi3', 'suppFig2a_wpc_gygi3_complex_data.tsv.gz'),
		('gygi1', 'suppFig2a_wpc_gygi1_complex_data.tsv.gz'),
		('bp', 'suppFig2a_wpc_battle_protein_complex_data.tsv.gz'),
		('br', 'suppFig2a_wpc_battleRNA_complex_data.tsv.gz'),
		('bribo', 'suppFig2a_wpc_battleRibo_complex_data.tsv.gz'),
		('colo', 'suppFig2a_wpc_coloCancer_complex_data.tsv.gz'),
		('breast', 'suppFig2a_wpc_breastCancer_complex_data.tsv.gz'),
		('ovarian', 'suppFig2a_wpc_ovarianCancer_complex_data.tsv.gz'),
	)

	@staticmethod
	def _load_tables(folder, file_table):
		"""Open every file of ``file_table`` from ``folder``.

		Args:
			folder: location passed unchanged to
				``DataFrameAnalyzer.open_in_chunks``.
			file_table: iterable of ``(key, file_name)`` pairs.

		Returns:
			dict mapping each key to the loaded table.
		"""
		return dict((key, DataFrameAnalyzer.open_in_chunks(folder, file_name))
					for key, file_name in file_table)

	@staticmethod
	def load_step3_prep_data(folder):
		"""Load the ten ``complex_filtered_*.tsv.gz`` tables from ``folder``.

		Returns:
			dict keyed by 'mann', 'gygi1', 'gygi2', 'gygi3', 'battle_rna',
			'battle_ribo', 'battle_protein', 'tcga_colo', 'tcga_breast' and
			'tcga_ovarian'.
		"""
		return file_Loader._load_tables(folder, file_Loader._PREP_FILES)

	@staticmethod
	def load_step3_figure_data(folder):
		"""Load the ten ``suppFig2a_wpc_*_complex_data.tsv.gz`` tables.

		Returns:
			dict keyed by 'mann', 'gygi1', 'gygi2', 'gygi3', 'bp', 'br',
			'bribo', 'colo', 'breast' and 'ovarian'.
		"""
		return file_Loader._load_tables(folder, file_Loader._FIGURE_FILES)

class step3_preparation:
	"""Compute per-complex summary statistics for each dataset and export them."""

	@staticmethod
	def execute(**kwargs):
		"""Summarise and export every dataset returned by the prep loader.

		Keyword Args:
			folder: location handed to ``file_Loader.load_step3_prep_data``
				(default ``'PATH'``).
			output_folder: string prefix of the exported files
				(default ``'PATH'``).
		"""
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		print('load_step3_prep_data')
		data_dict = file_Loader.load_step3_prep_data(folder)

		for name, data in data_dict.items():
			complex_specific_dict = step3_preparation.get_complex_specific_data(data)
			step3_preparation.export_data(complex_specific_dict, name,
										  output_folder = output_folder)

	@staticmethod
	def _standardize(values, reference):
		"""Z-score ``values`` against the mean and std of ``reference``.

		Args:
			values: numbers to standardise.
			reference: numbers whose mean and (population) standard
				deviation define the scale.

		Returns:
			list of floats, one per entry of ``values``. When the standard
			deviation of ``reference`` is zero the z-score is undefined, so
			NaN is returned for every entry instead of raising
			ZeroDivisionError.
		"""
		center = np.mean(reference)
		spread = float(np.std(reference))
		if spread == 0:
			return [float('nan')] * len(values)
		return [float(item - center)/spread for item in values]

	@staticmethod
	def get_complex_specific_data(data):
		"""Build one record of summary statistics per complex.

		Args:
			data: table with a ``ComplexID`` column and the quantification
				columns selected by ``utilsFacade.filtering(columns, 'quant_')``.

		Returns:
			dict mapping each complex ID to a dict with the keys
			'abundance' (mean of the per-member medians), 'corr' (median of
			the values returned by ``utilsFacade.get_correlation_values``),
			'var' (standard deviation of the per-member medians), 'num'
			(number of rows of the complex), 'zscore' and 'var_zscore'
			(z-scores of 'abundance' and 'var' across all complexes).
		"""
		complexList = list(set(data["ComplexID"]))
		quantCols = utilsFacade.filtering(list(data.columns), 'quant_')

		abundances = list()
		spreads = list()
		correlations = list()
		sizes = list()
		for complexID in complexList:
			sub = data[data.ComplexID == complexID]
			# After transposing, every column holds the quant values of one member.
			quant_sub = sub[quantCols].T
			medianList = [np.median(utilsFacade.finite(list(quant_sub.iloc[:,p])))
						  for p in range(len(quant_sub.columns))]
			corr_values = utilsFacade.get_correlation_values(quant_sub.corr())

			# The output keys say 'abundance'/'var', but the statistics are
			# a mean and a standard deviation of the member medians.
			abundances.append(np.mean(medianList))
			spreads.append(np.std(medianList))
			correlations.append(np.median(corr_values))
			sizes.append(len(sub))

		# Scale is estimated on the finite entries only; every entry is scored.
		zscores = step3_preparation._standardize(
			abundances, utilsFacade.finite(abundances))
		var_zscores = step3_preparation._standardize(
			spreads, utilsFacade.finite(spreads))

		complex_specific_dict = dict()
		for i, complexID in enumerate(complexList):
			complex_specific_dict[complexID] = {'abundance': abundances[i],
												'corr': correlations[i],
												'var': spreads[i],
												'num': sizes[i],
												'var_zscore': var_zscores[i],
												'zscore': zscores[i]}
		return complex_specific_dict

	@staticmethod
	def export_data(complex_specific_dict, name, **kwargs):
		"""Write the per-complex records as a gzipped TSV and return the table.

		Args:
			complex_specific_dict: dict of per-complex dicts; it becomes a
				DataFrame with one column per complex.
			name: dataset name embedded in the file name.

		Keyword Args:
			output_folder: string prepended verbatim to the file name
				(default ``'PATH'``), so it needs its own trailing separator.

		Returns:
			The exported pandas DataFrame.
		"""
		output_folder = kwargs.get('output_folder', 'PATH')

		complex_data = pd.DataFrame(complex_specific_dict)
		complex_data.to_csv(output_folder + 'suppFig2a_wpc_' + name + '_complex_data.tsv.gz',
							sep = '\t', compression = 'gzip')
		return complex_data

class step3_figure:
	"""Draw the supplementary figure 2a box plots for every dataset."""

	# Slice bounds applied to the complexes after sorting; the matching
	# dictionary keys / bin labels are '<start>:<stop>'.
	# NOTE(review): the labels read like percentile ranges, but the bounds
	# are used as positional slices, so only the first 100 sorted complexes
	# are ever binned -- confirm this is intended.
	_BIN_EDGES = ((0,25), (25,50), (50,75), (75,100))

	# (name used in the output file, key in the loaded data dict)
	_DATASETS = (('gygi1', 'gygi1'),
				 ('gygi2', 'gygi2'),
				 ('gygi3', 'gygi3'),
				 ('battleRNA', 'br'),
				 ('battleRibo', 'bribo'),
				 ('battle_protein', 'bp'),
				 ('mann_all_log2', 'mann'),
				 ('coloCancer', 'colo'),
				 ('breastCancer', 'breast'),
				 ('ovarianCancer', 'ovarian'))

	@staticmethod
	def execute(**kwargs):
		"""Entry point of the figure step.

		Keyword Args:
			output_folder: location the summary tables are read from and
				prefix the PDFs are written with (default ``'PATH'``).
			folder: accepted for symmetry with the preparation step; not used.
		"""
		output_folder = kwargs.get('output_folder','PATH')

		print('SUPP-FIGURE2A: main_supp_figure2a_bins_boxplots_effect')
		step3_figure.main_supp_figure2a_bins_boxplots_effect(output_folder = output_folder)

	@staticmethod
	def main_supp_figure2a_bins_boxplots_effect(**kwargs):
		"""Load the per-complex summary tables and plot them.

		Keyword Args:
			output_folder: used both as the load location and as the output
				prefix (default ``'PATH'``).
		"""
		output_folder = kwargs.get('output_folder','PATH')

		print('load_data')
		data_dict = file_Loader.load_step3_figure_data(output_folder)

		print('make_boxplots')
		step3_figure.make_boxplots(data_dict, output_folder)

	@staticmethod
	def bin_abundances_variances(complex_data, keyword):
		"""Group the 'corr' values of complexes ordered by ``keyword``.

		Args:
			complex_data: table with one column per complex and the
				statistics (including 'corr' and ``keyword``) as rows.
			keyword: statistic to sort the complexes by.

		Returns:
			dict mapping '0:25', '25:50', '50:75' and '75:100' to a
			one-element list holding ``utilsFacade.finite`` of the 'corr'
			values found at those positions of the sorted complexes.
		"""
		ordered = complex_data.copy().T.sort_values(keyword)
		corr_list = list(ordered['corr'])

		zscore_dict = dict()
		for c1,c2 in step3_figure._BIN_EDGES:
			# Callers read element [0], so keep the one-element list wrapper.
			zscore_dict[str(c1) + ':' + str(c2)] = [utilsFacade.finite(corr_list[c1:c2])]
		return zscore_dict

	@staticmethod
	def _draw_panel(ax, groups, positions, widths):
		"""Draw one box-plot panel with jittered points and pairwise t-tests.

		Args:
			ax: matplotlib axes to draw on.
			groups: one sequence of values per box.
			positions: x position of each box.
			widths: width of each box.
		"""
		bp = ax.boxplot(groups,notch=0,sym="",vert=1, patch_artist=True,widths=widths,
						positions = positions)
		plt.setp(bp['medians'], color="black")
		plt.setp(bp['whiskers'], color="black")
		for i,patch in enumerate(bp['boxes']):
			patch.set_facecolor("lightgrey")
			patch.set_edgecolor("black")
			patch.set_alpha(0.6)
			# Horizontal jitter so the individual points do not overlap.
			x = np.random.normal(positions[i], 0.04, size=len(groups[i]))
			ax.scatter(x,groups[i], color='white', alpha=0.9,edgecolor="black",s=40)

		# The title lists the t-test p-value of every pair of groups
		# (presumably index pairs from utilsFacade.getCombinations -- verify).
		pvals = list()
		for c1,c2 in utilsFacade.getCombinations(np.arange(0,len(groups),1)):
			pval = scipy.stats.ttest_ind(groups[c1],groups[c2])[1]
			pvals.append(str(c1) + ':' + str(c2) + '=' + str(pval))
		ax.set_title('\n'.join(pvals), fontsize =5)
		ax.set_ylim(-0.5,1)
		# Tick labels show how many values each box of THIS panel contains.
		ax.set_xticklabels([len(group) for group in groups])

	@staticmethod
	def make_boxplots(data_dict, folder):
		"""Save one two-panel PDF per dataset.

		The left panel bins complexes sorted by 'zscore', the right panel
		bins complexes sorted by 'var_zscore'; both show the 'corr' values.

		Args:
			data_dict: dict with the keys 'gygi1', 'gygi2', 'gygi3', 'bp',
				'br', 'bribo', 'mann', 'colo', 'breast' and 'ovarian'.
			folder: string prepended verbatim to
				'suppFig2a_boxplots_overview_<name>.pdf'.

		Raises:
			KeyError: if ``data_dict`` lacks one of the expected keys.
		"""
		# Resolve every dataset first so a missing key fails before plotting.
		datasets = [(name, data_dict[key]) for name, key in step3_figure._DATASETS]
		bin_list = [str(c1) + ':' + str(c2) for c1,c2 in step3_figure._BIN_EDGES]

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans',
			font_scale=1, color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		positions = list(np.arange(1,len(bin_list)/5.0+5,0.5)[0:(len(bin_list))])
		widths = [0.4]*len(bin_list)

		for n,d in datasets:
			# Keep only the complexes whose 'num' is >= 0.
			d = d.T
			d = d[d.num >= 0]
			d = d.T

			print(n)
			zscore_dict_abundances = step3_figure.bin_abundances_variances(d,'zscore')
			zscore_dict_variances = step3_figure.bin_abundances_variances(d,'var_zscore')
			dataList = [zscore_dict_abundances[dbin][0] for dbin in bin_list]
			var_dataList = [zscore_dict_variances[dbin][0] for dbin in bin_list]

			fig = plt.figure(figsize = (7,4))
			step3_figure._draw_panel(fig.add_subplot(121), dataList, positions, widths)
			step3_figure._draw_panel(fig.add_subplot(122), var_dataList, positions, widths)

			plt.savefig(folder + 'suppFig2a_boxplots_overview_' + n + '.pdf',
						bbox_inches = 'tight', dpi = 400)
			# Release the figure; otherwise one stays open per dataset.
			plt.close(fig)

if __name__ == "__main__":
	## EXECUTE STEP 3
	# Expects two command-line arguments: the input folder and the output
	# folder. The output folder is used as a plain string prefix when files
	# are written, so it should end with a path separator.
	if len(sys.argv) < 3:
		sys.exit('usage: ' + sys.argv[0] + ' <folder> <output_folder>')
	input_folder, result_folder = sys.argv[1], sys.argv[2]
	step3_preparation.execute(folder = input_folder, output_folder = result_folder)
	step3_figure.execute(folder = input_folder, output_folder = result_folder)
									
								

All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource are Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.

GNU LICENSE

Download script here