STEP 4: Python Code

									
class file_Loader(object):
	"""Readers for the step-4 input tables and the per-dataset JSON result files."""

	# Dataset keys, in the order they appear in the returned dictionary.
	# Each key <k> is read from the file 'complex_filtered_<k>.tsv.gz'.
	STEP4_DATASETS = ('mann', 'gygi2', 'gygi3', 'gygi1',
					  'tcga_colo', 'tcga_breast', 'tcga_ovarian',
					  'battle_rna', 'battle_ribo', 'battle_protein')

	# File-name suffixes of the three JSON dictionaries stored per dataset,
	# in the order in which load_step4_dictionaries returns them.
	STEP4_DICT_SUFFIXES = ('_real_complexSet_dict.json',
						   '_randomized_complexSet_dict.json',
						   '_randomized_complexSet_reshuffledData_dict.json')

	@staticmethod
	def load_step4_prep_data(folder):
		"""Load every complex-filtered dataset from *folder*.

		Args:
			folder: location handed unchanged to DataFrameAnalyzer.open_in_chunks.

		Returns:
			dict mapping each dataset key to the object returned by
			DataFrameAnalyzer.open_in_chunks (presumably a pandas DataFrame;
			confirm against that helper).
		"""
		return dict((key, DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_' + key + '.tsv.gz'))
					for key in file_Loader.STEP4_DATASETS)

	@staticmethod
	def load_step4_dictionaries(folder, n):
		"""Read the three JSON correlation dictionaries stored for dataset *n*.

		Args:
			folder: path prefix; it is concatenated directly with the file name,
				so it has to end with a path separator.
			n: dataset name used as the file-name prefix.

		Returns:
			tuple (real, randomized, randomized_reshuffled) of decoded JSON objects.
		"""
		loaded = list()
		for suffix in file_Loader.STEP4_DICT_SUFFIXES:
			# 'with' guarantees the handle is closed even if decoding fails
			with open(folder + n + suffix, 'rb') as handle:
				loaded.append(json.load(handle))
		return tuple(loaded)

class step4_preparation:
	"""Compute per-complex median correlations and store them as JSON.

	Three variants are produced for every dataset: the real complexes, random
	protein sets of matching size, and random protein sets whose quantification
	values were additionally shuffled within each row.
	"""

	# Number of random protein sets drawn for every complex.
	N_RANDOM_DRAWS = 10

	@staticmethod
	def execute(**kwargs):
		"""Run all three computations for every dataset found in 'folder'.

		Keyword Args:
			folder: location of the input tables. The JSON results are written
				to this same location (a passed 'output_folder' is not used
				here), which is where step4_figure reads them from via its own
				'folder' argument.
		"""
		folder = kwargs.get('folder','PATH')

		print('load_data')
		data_dict = file_Loader.load_step4_prep_data(folder)
		for name in data_dict:
			data = data_dict[name]
			step4_preparation.get_randomized_complex_set(data, name, folder)
			step4_preparation.get_randomized_complex_set_reshuffled_data(data, name, folder)
			step4_preparation.get_real_complex_set(data, name, folder)

	@staticmethod
	def _prepare(data):
		"""Return (copy of data, list of distinct complex names, quantification sub-table)."""
		d = data.copy()
		complexSet = list(set(d['ComplexName']))
		# presumably selects the columns whose name starts with 'quant' -- confirm in utilsFacade
		quant_list = utilsFacade.filtering(d, 'quant', condition = 'startswith')
		return d, complexSet, d[quant_list]

	@staticmethod
	def _median_correlation(quant_sub):
		"""Median of the correlation values computed between the rows of quant_sub."""
		corr_data = quant_sub.T.corr()
		corr_values = utilsFacade.get_correlation_values(corr_data)
		return np.median(corr_values)

	@staticmethod
	def _shuffle_within_rows(quant_sub):
		"""Return a new DataFrame in which the values of every row are randomly permuted."""
		shuffled_rows = list()
		for _, row in quant_sub.iterrows():
			values = list(row)
			random.shuffle(values)
			shuffled_rows.append(values)
		return pd.DataFrame(shuffled_rows)

	@staticmethod
	def _dump_json(obj, path):
		"""Write obj as JSON to path; text mode works on both Python 2 and 3."""
		with open(path, 'w') as handle:
			json.dump(obj, handle)

	@staticmethod
	def _randomized_correlations(data, name, output_folder, file_suffix, shuffle_rows):
		"""Shared implementation of the two randomized-control computations."""
		d, complexSet, quant_data = step4_preparation._prepare(data)
		all_random_corrDict = dict()
		all_random_corrs = list()
		for count, complexID in enumerate(complexSet):
			print(name, count, complexID)
			complex_proteins = list(set(d[d.ComplexName==complexID].index))
			# candidates are all rows whose index label is not part of the complex
			other_proteins = list(d[~d.index.isin(complex_proteins)].index)
			corr_list = list()
			for _ in range(step4_preparation.N_RANDOM_DRAWS):
				# random.sample raises ValueError when fewer candidates than
				# complex members are available
				random_proteins = random.sample(other_proteins, len(complex_proteins))
				quant_sub = quant_data.T[random_proteins].T.drop_duplicates()
				if shuffle_rows:
					quant_sub = step4_preparation._shuffle_within_rows(quant_sub)
				corr_list.append(step4_preparation._median_correlation(quant_sub))
			all_random_corrs.append(corr_list)
			all_random_corrDict[complexID] = corr_list
		all_random_corrs = utilsFacade.flatten(all_random_corrs)

		all_random_corrDict['ALL'] = utilsFacade.finite(all_random_corrs)
		step4_preparation._dump_json(all_random_corrDict, output_folder + name + file_suffix)
		return all_random_corrDict, all_random_corrs

	@staticmethod
	def get_randomized_complex_set(data, name, output_folder):
		"""Median correlations of random protein sets sized like each complex.

		Args:
			data: table with a 'ComplexName' column and quantification columns.
			name: dataset name, used as prefix of the output file.
			output_folder: path prefix for the output file (concatenated as-is).

		Returns:
			(dict, flat list): the dict maps each complex name to its list of
			medians (one per draw) and 'ALL' to utilsFacade.finite of the flat
			list. The dict is also written to
			'<name>_randomized_complexSet_dict.json'.
		"""
		return step4_preparation._randomized_correlations(
			data, name, output_folder,
			'_randomized_complexSet_dict.json', shuffle_rows = False)

	@staticmethod
	def get_randomized_complex_set_reshuffled_data(data, name, output_folder):
		"""Same as get_randomized_complex_set, with the values of each drawn row shuffled.

		Args:
			data: table with a 'ComplexName' column and quantification columns.
			name: dataset name, used as prefix of the output file.
			output_folder: path prefix for the output file (concatenated as-is).

		Returns:
			(dict, flat list) in the same layout as get_randomized_complex_set.
			The dict is also written to
			'<name>_randomized_complexSet_reshuffledData_dict.json'.
		"""
		return step4_preparation._randomized_correlations(
			data, name, output_folder,
			'_randomized_complexSet_reshuffledData_dict.json', shuffle_rows = True)

	@staticmethod
	def get_real_complex_set(data, name, output_folder):
		"""Median correlation among the members of every real complex.

		Args:
			data: table with a 'ComplexName' column and quantification columns.
			name: dataset name, used as prefix of the output file.
			output_folder: path prefix for the output file (concatenated as-is).

		Returns:
			(dict, list): the dict maps each complex name to its median and
			'ALL' to utilsFacade.finite of the list of medians. The dict is
			also written to '<name>_real_complexSet_dict.json'.
		"""
		d, complexSet, quant_data = step4_preparation._prepare(data)
		all_real_corrDict = dict()
		all_real_corrs = list()
		for count, complexID in enumerate(complexSet):
			print(name, count, complexID)
			complex_proteins = list(set(d[d.ComplexName==complexID].index))
			quant_sub = quant_data.T[complex_proteins].T.drop_duplicates()
			median_corr = step4_preparation._median_correlation(quant_sub)
			all_real_corrs.append(median_corr)
			all_real_corrDict[complexID] = median_corr

		all_real_corrDict['ALL'] = utilsFacade.finite(all_real_corrs)
		step4_preparation._dump_json(all_real_corrDict, output_folder + name + '_real_complexSet_dict.json')
		return all_real_corrDict, all_real_corrs

class step4_figure:
	"""Draws supplementary figure 2B: real vs. randomized complex correlations per dataset."""

	# Datasets in plotting order. The names must equal the keys under which the
	# preparation step stored its JSON files (file_Loader uses 'tcga_colo',
	# 'tcga_breast', 'tcga_ovarian'), otherwise the files cannot be found.
	DATASET_NAMES = ['gygi1','gygi2','gygi3','battle_rna',
					 'battle_ribo','battle_protein',
					 'mann','tcga_colo','tcga_breast','tcga_ovarian']

	@staticmethod
	def execute(**kwargs):
		"""Entry point; forwards 'folder', 'num_subunits' and 'output_folder' to the figure routine."""
		folder = kwargs.get('folder','PATH')
		num_subunits = kwargs.get('num_subunits',0)
		output_folder = kwargs.get('output_folder','PATH')
		
		print('SUPP-FIGURE2B: main_supp_figure2b_boxplot_randomControl')
		step4_figure.main_supp_figure2b_boxplot_randomControl(output_folder = output_folder,
															  num_subunits = num_subunits,
															  folder = folder)

	@staticmethod
	def main_supp_figure2b_boxplot_randomControl(**kwargs):
		"""Collect the box-plot data from 'folder' and save the figure to 'output_folder'."""
		folder = kwargs.get('folder','PATH')
		num_subunits = kwargs.get('num_subunits',0)
		output_folder = kwargs.get('output_folder','PATH')

		print('prepare_data_for_boxplot')
		bp_data_dict = step4_figure.prepare_data_for_boxplot(num_subunits = num_subunits,
					   folder = folder, output_folder = output_folder)
		
		print('plot_boxplot')
		step4_figure.plot_boxplot(bp_data_dict, output_folder)

	@staticmethod
	def prepare_data_for_boxplot(**kwargs):
		"""Assemble the three value groups shown per dataset.

		Keyword Args:
			folder: path prefix of the JSON dictionaries written by the
				preparation step. 'num_subunits' and 'output_folder' are
				accepted for call compatibility but not used here.

		Returns:
			dict mapping dataset name to
			[randomized_shuffled_values, randomized_values, real_values],
			restricted to complexes present in all three dictionaries.
		"""
		folder = kwargs.get('folder','PATH')

		bp_data_dict = dict()
		for n in step4_figure.DATASET_NAMES:
			print(n)
			d1, d2, d3 = file_Loader.load_step4_dictionaries(folder, n)
			# only complexes available in all three dictionaries are compared
			complexList = set(d1.keys()) & set(d2.keys()) & set(d3.keys())

			randomized_shuffled_list = list()
			randomized_list = list()
			real_list = list()
			for complexID in complexList:
				# 'ALL' is the pooled summary entry, not an individual complex
				if complexID == 'ALL':
					continue
				randomized_shuffled_list.append(d3[complexID])
				randomized_list.append(d2[complexID])
				real_list.append(d1[complexID])
			randomized_shuffled_list = utilsFacade.finite(utilsFacade.flatten(randomized_shuffled_list))
			randomized_list = utilsFacade.finite(utilsFacade.flatten(randomized_list))
			# a real list (not a lazy filter object) is needed for len() and boxplot
			real_list = [a for a in real_list if str(a) != 'nan']

			bp_data_dict[n] = [randomized_shuffled_list, randomized_list, real_list]

		return bp_data_dict

	@staticmethod
	def plot_boxplot(bp_data_dict, output_folder):
		"""Draw one three-box panel per dataset and save the figure as PDF.

		Args:
			bp_data_dict: mapping as returned by prepare_data_for_boxplot.
			output_folder: path prefix for the PDF (concatenated as-is).
		"""
		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		positions = [1,1.5,2]
		plt.clf()
		plt.figure(figsize = (15,4))
		gs = gridspec.GridSpec(6,30)
		for n,name in enumerate(step4_figure.DATASET_NAMES):
			dataList = bp_data_dict[name]
			# every dataset occupies three of the thirty grid columns
			ax = plt.subplot(gs[0:, (3*n):(3*n)+3])
			bp = ax.boxplot(dataList,notch=0,sym="",vert=1,patch_artist=True,
							widths=[0.45,0.45,0.45], positions = positions)
			plt.setp(bp['medians'], color="black")
			plt.setp(bp['whiskers'], color="black")
			# rank-sum p-values of each randomized group against the real values
			pval1 = scipy.stats.ranksums(dataList[0], dataList[-1])[1]
			pval2 = scipy.stats.ranksums(dataList[1], dataList[-1])[1]
			print(name, pval1, pval2)

			for i,patch in enumerate(bp['boxes']):
				patch.set_facecolor("lightgrey")	
				patch.set_edgecolor("black")
				patch.set_alpha(0.6)
				# overlay as many jittered points as there are real values, capped
				# at the group size so random.sample cannot raise ValueError
				points = list(dataList[i])
				num_points = min(len(points), len(dataList[2]))
				x = numpy.random.normal(positions[i], 0.04, size=num_points)
				ax.scatter(x, random.sample(points, num_points), color='white',
						   alpha=0.9, edgecolor="black",s=10)
			ax.set_ylim(-0.7, 1)
			# only the left-most panel keeps its y tick labels
			if n > 0:
				ax.set_yticklabels([])

		plt.savefig(output_folder + 'suppFigure2B_boxplot_random_complex_variability_all.pdf',
					bbox_inches = 'tight', dpi = 400)

if __name__ == "__main__":
	## EXECUTE STEP4
	# Expected arguments: <folder> <output_folder>. Paths are built by plain
	# string concatenation, so both should end with a path separator.
	if len(sys.argv) < 3:
		# fail with a readable message instead of a bare IndexError
		sys.exit('usage: ' + sys.argv[0] + ' <folder> <output_folder>')
	input_folder, figure_folder = sys.argv[1], sys.argv[2]
	step4_preparation.execute(folder = input_folder, output_folder = figure_folder)
	step4_figure.execute(folder = input_folder, output_folder = figure_folder)

									
								

All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource are Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.

GNU LICENSE

Download script here