Impact on Proteotype: STEP5

STEP 5: Python Code

									

class file_Loader:
	"""Loaders for the STRING interaction tables and the remapped datasets.

	Every file is read with ``DataFrameAnalyzer.open_in_chunks(folder, file_name)``,
	a project helper that is not defined in this file (presumably it returns a
	pandas DataFrame -- confirm against its definition).
	"""

	# (key in the returned data_dict, <stem> of "dataset_<stem>_remapped.tsv.gz")
	_DATASET_FILES = (
		('gygi1', 'gygi1'),
		('gygi2', 'gygi2'),
		('gygi3', 'gygi3'),
		('battle_protein', 'battle_protein'),
		('battle_ribo', 'battle_ribo'),
		('battle_rna', 'battle_rna'),
		('wu', 'wu'),
		('tiannan', 'tiannan'),
		('colo_cancer', 'coloCa'),
		('tcga_breast', 'tcga_breast'),
		('tcga_ovarian', 'tcga_ovarian'),
		('bxd_protein', 'bxdMouse'),
		('primateRNA', 'primateRNA'),
		('primatePRO', 'primatePRO'),
		('mann', 'mann_all_log2'),
	)

	@staticmethod
	def _load_string_data(folder, species):
		"""Load the full and the "700" STRING tables for one species tag."""
		prefix = "stringData_" + species
		full_table = DataFrameAnalyzer.open_in_chunks(folder, prefix + "_onlyGeneNames.tsv.gz")
		table_700 = DataFrameAnalyzer.open_in_chunks(folder, prefix + "_700_onlyGeneNames.tsv.gz")
		return full_table, table_700

	@staticmethod
	def load_string_data_mouse(folder):
		"""Return ``(stringData_mouse, stringData_mouse_700)`` read from ``folder``."""
		return file_Loader._load_string_data(folder, "mouse")

	@staticmethod
	def load_string_data_human(folder):
		"""Return ``(stringData_human, stringData_human_700)`` read from ``folder``."""
		return file_Loader._load_string_data(folder, "human")

	@staticmethod
	def load_data(**kwargs):
		"""Load every remapped dataset and return them keyed by dataset name.

		Keyword arguments:
			folder -- directory handed to ``DataFrameAnalyzer.open_in_chunks``
				(default ``'PATH'``).

		Returns:
			dict mapping the dataset keys listed in ``_DATASET_FILES`` to the
			loaded tables.
		"""
		folder = kwargs.get('folder', 'PATH')

		# The original dict literal referenced the undefined names `coloCa` and
		# `mann`; building the dict from the key/stem table removes that class of
		# NameError while keeping the public keys ('colo_cancer', 'mann') as-is.
		data_dict = dict()
		for key, stem in file_Loader._DATASET_FILES:
			file_name = "dataset_" + stem + "_remapped.tsv.gz"
			data_dict[key] = DataFrameAnalyzer.open_in_chunks(folder, file_name)

		return data_dict

class step5_preparation:
	"""Collect correlation values for STRING and non-STRING protein pairs.

	``execute`` is the entry point; for one dataset it writes three JSON files
	(all STRING pairs, the "700" STRING subset, and all remaining pairs) into
	``folder``. All methods are static.
	"""

	@staticmethod
	def execute(data, name, **kwargs):
		"""Run the whole preparation for one dataset.

		Arguments:
			data -- table whose index holds the protein labels and whose
				quantification columns are selected via the 'quant_' filter
				(see ``load_corr_data``).
			name -- dataset name; it becomes the suffix of the output files.

		Keyword arguments:
			folder -- directory of the STRING tables; the JSON output is written
				here as well (default ``'PATH'``).
			species -- ``"mouse"`` (default) or ``"human"``.
			output_folder -- accepted for call compatibility; not used here.

		Raises:
			ValueError -- if ``species`` is neither ``"mouse"`` nor ``"human"``.
		"""
		folder = kwargs.get('folder','PATH')
		species = kwargs.get("species","mouse")

		if species == "mouse":
			print("load_string_data:MOUSE")
			stringData, stringData_700 = file_Loader.load_string_data_mouse(folder)
		elif species == "human":
			print("load_string_data:HUMAN")
			stringData, stringData_700 = file_Loader.load_string_data_human(folder)
		else:
			# Previously fell through and failed later with an unbound `stringData`.
			raise ValueError("unsupported species: %r (expected 'mouse' or 'human')" % (species,))

		interactions_all = list(stringData["interaction"])
		interactions_700 = list(stringData_700["interaction"])

		print("get_other_list")
		other_interactions = step5_preparation.get_other_list(data, interactions_all)

		print("load_corr_data")
		corrData = step5_preparation.load_corr_data(data)

		print("overlap_STRING_interactions_with_data")
		proteinList_left = step5_preparation.overlap_STRING_interactions_with_data(corrData, stringData)
		proteinList_left700 = step5_preparation.overlap_STRING_interactions_with_data(corrData, stringData_700)

		print("overlap_OTHER_interactions_with_data")
		other_proteinList_left = step5_preparation.overlap_OTHER_interactions_with_data(corrData, other_interactions)

		print("get_export_relevant_STRING_correlation_values")
		info1 = corrData, proteinList_left, interactions_all, "string_correlations_allFINAL_" + name
		step5_preparation.get_export_relevant_STRING_correlation_values(info1, folder = folder)
		info2 = corrData, proteinList_left700, interactions_700, "string_correlations_FINAL700_" + name
		step5_preparation.get_export_relevant_STRING_correlation_values(info2, folder = folder)
		info3 = corrData, other_proteinList_left, other_interactions, 'other_string_correlations_allFINAL_' + name
		step5_preparation.get_export_relevant_OTHER_correlation_values(info3, folder = folder)

	@staticmethod
	def _exclude_known_interactions(pairs, interactions_all):
		"""Return the unique pairs from ``pairs`` that are not listed in ``interactions_all``.

		``pairs`` holds 2-tuples while the interaction list holds "a:b" strings
		(that is the label format the export methods match against), so a plain
		set difference between the two never removes anything. A pair is dropped
		when it appears as "a:b", as "b:a", or as the tuple itself.
		"""
		known = set(interactions_all)
		kept = set()
		for pair in pairs:
			first, second = pair
			forward = str(first) + ":" + str(second)
			backward = str(second) + ":" + str(first)
			if pair in known or forward in known or backward in known:
				continue
			kept.add(pair)
		return list(kept)

	@staticmethod
	def get_other_list(data, interactions_all):
		"""Return all protein pairs of ``data.index`` that are not STRING interactions.

		The pairs come from ``utilsFacade.getCombinations`` (project helper; the
		rest of this class unpacks its items as 2-element pairs).
		"""
		proteinList = list(data.index)
		protein_combiList = utilsFacade.getCombinations(proteinList)
		return step5_preparation._exclude_known_interactions(protein_combiList, interactions_all)

	@staticmethod
	def load_corr_data(data):
		"""Return the protein-by-protein correlation matrix of the quantification columns.

		Columns are selected with ``utilsFacade.filtering(columns, 'quant_')``
		(project helper; presumably keeps the names containing 'quant_').
		"""
		quant_cols = utilsFacade.filtering(list(data.columns), 'quant_')
		quant_data = data[quant_cols]
		# DataFrame.corr correlates *columns*. The proteins are on the index
		# (see get_other_list) and the overlap methods expect protein labels on
		# corrData.index, so the table is transposed first.
		corrData = quant_data.T.corr()
		return corrData

	@staticmethod
	def overlap_STRING_interactions_with_data(corrData, stringData):
		"""Return the labels of ``corrData.index`` that occur in ``stringData.sym1``/``sym2``.

		Labels whose string form is "nan" are dropped. A real list is returned
		(``filter`` would be a one-shot iterator on Python 3).
		"""
		symbols = set(stringData.sym1).union(set(stringData.sym2))
		shared = set(corrData.index).intersection(symbols)
		return [label for label in shared if str(label) != "nan"]

	@staticmethod
	def overlap_OTHER_interactions_with_data(corrData, other_interactions):
		"""Return the labels of ``corrData.index`` that occur in any pair of ``other_interactions``.

		Labels whose string form is "nan" are dropped; the result is a list.
		"""
		other_data = pd.DataFrame(other_interactions)
		other_vals = set(other_data[0]).union(set(other_data[1]))
		shared = set(corrData.index).intersection(other_vals)
		return [label for label in shared if str(label) != "nan"]

	@staticmethod
	def _export_masked_correlations(corrData, proteinList_left, pair_labels, output_path):
		"""Write the finite, unique correlations of the labelled pairs to ``output_path``.

		A frame of "row:column" labels with the same shape as the selected part
		of ``corrData`` is built; cells whose label is in ``pair_labels`` are
		kept, everything else becomes NaN and is filtered out before the dump.
		"""
		# Column selection only, as before: every row of corrData is kept.
		sub_corrData = corrData[list(proteinList_left)]
		row_names = list(sub_corrData.index)
		col_names = list(sub_corrData.columns)

		label_rows = list()
		for count, r in enumerate(row_names):
			label_rows.append([str(r) + ":" + str(c) for c in col_names])
			print(count)
		labels = pd.DataFrame(label_rows, index=row_names, columns=col_names)

		mask = labels.isin(list(pair_labels))
		sub = sub_corrData[mask]
		unique_vals = np.unique(np.asarray(sub.values, dtype=float))
		# tolist() yields plain Python floats, which json can always serialise.
		finite_unique_vals = unique_vals[np.isfinite(unique_vals)].tolist()

		with open(output_path, 'w') as outfile:
			json.dump(finite_unique_vals, outfile)

	@staticmethod
	def get_export_relevant_STRING_correlation_values(info1, **kwargs):
		"""Export the correlations of STRING pairs as ``folder + output_fileName + '.json'``.

		Arguments:
			info1 -- tuple ``(corrData, proteinList_left, interactions_all,
				output_fileName)``; ``interactions_all`` holds "a:b" labels.

		Keyword arguments:
			folder -- output prefix, concatenated directly with the file name, so
				it needs a trailing path separator (default ``'PATH'``).
		"""
		folder = kwargs.get('folder','PATH')

		corrData, proteinList_left, interactions_all, output_fileName = info1
		step5_preparation._export_masked_correlations(corrData,
				proteinList_left,
				interactions_all,
				folder + output_fileName + '.json')

	@staticmethod
	def get_export_relevant_OTHER_correlation_values(info2, **kwargs):
		"""Export the correlations of non-STRING pairs as ``folder + output_fileName + '.json'``.

		Arguments:
			info2 -- tuple ``(corrData, proteinList_left, interactions_all,
				output_fileName)``; here ``interactions_all`` holds 2-element
				pairs, which are turned into "a:b" labels first.

		Keyword arguments:
			folder -- output prefix, concatenated directly with the file name, so
				it needs a trailing path separator (default ``'PATH'``).
		"""
		folder = kwargs.get('folder','PATH')

		# The original body used `other_proteinList_left` / `other_interactions`,
		# which are not defined in this scope; the unpacked values are the ones meant.
		corrData, proteinList_left, interactions_all, output_fileName = info2

		otherList = list()
		for count, (o1, o2) in enumerate(interactions_all):
			if count%100000 == 0:
				print(count)
			otherList.append(str(o1) + ":" + str(o2))

		step5_preparation._export_masked_correlations(corrData,
				proteinList_left,
				otherList,
				folder + output_fileName + '.json')

class step5:
	"""Driver that runs ``step5_preparation.execute`` on every loaded dataset."""

	@staticmethod
	def execute(**kwargs):
		"""Load all datasets and prepare each one with its matching species.

		Keyword arguments:
			folder -- input directory, forwarded to the loader and to the
				preparation step (default ``'PATH'``).
			output_folder -- forwarded unchanged to the preparation step
				(default ``'PATH'``).
		"""
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		# Datasets handled with the mouse STRING tables; all others use human.
		mouse_datasets = ['gygi1','gygi2','gygi3','bxd_protein']

		datasets = file_Loader.load_data(folder = folder)

		for name, data in datasets.items():
			species = 'mouse' if name in mouse_datasets else 'human'
			step5_preparation.execute(data,
					name,
					folder = folder,
					output_folder = output_folder,
					species = species)

class step5_figures:
	"""Plot and test the correlation values exported by ``step5_preparation``.

	For every dataset three value lists are used: "all" (all STRING pairs),
	"700" (the STRING "700" subset) and "other" (all remaining pairs).
	"""

	# File-name prefixes exactly as written by step5_preparation.execute; the
	# dataset name and ".json" are appended.
	_CORR_FILE_PREFIX = {
		"all": "string_correlations_allFINAL_",
		"700": "string_correlations_FINAL700_",
		"other": "other_string_correlations_allFINAL_",
	}

	@staticmethod
	def execute(**kwargs):
		"""Load the exported correlations, draw one figure per dataset and run the tests.

		Keyword arguments:
			folder -- directory prefix of the JSON files (default ``'PATH'``).
			output_folder -- directory prefix for the PDF figures (default ``'PATH'``).
		"""
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		# The JSON files are named after the keys of file_Loader.load_data's
		# dict, so 'mann' and 'colo_cancer' are used here (not the file stems
		# 'mann_all_log2' / 'coloCa').
		nameList = ["gygi1","gygi3","battle_protein", "wu",
					"mann","tiannan", "primateRNA",
					"primatePRO","gygi2","battle_rna","battle_ribo",
					'colo_cancer','tcga_breast','tcga_ovarian','bxd_protein']

		print("load_data")
		corr_dict = step5_figures.load_data(nameList, folder = folder)

		print('make_supplementary_plot')
		step5_figures.make_supplementary_plot(nameList, corr_dict, output_folder = output_folder)

		print("get_significancies")
		pval_distribution = step5_figures.get_significancies(nameList,
				folder = folder,
				corr_dict = corr_dict)

	@staticmethod
	def load_data(nameList, **kwargs):
		"""Read the three JSON value lists of every dataset in ``nameList``.

		Keyword arguments:
			folder -- prefix concatenated directly with the file name, so it
				needs a trailing path separator (default ``'PATH'``).

		Returns:
			dict mapping ``"<name>:<type>"`` (type in "all", "700", "other") to
			the list stored in the corresponding file.
		"""
		folder = kwargs.get('folder','PATH')

		corr_dict = dict()
		for name in nameList:
			print(name)
			for ty in ["all","700","other"]:
				print(ty)
				file_name = step5_figures._CORR_FILE_PREFIX[ty] + name + ".json"
				with open(folder + file_name) as data_file:
					corr_dict[name + ":" + ty] = json.load(data_file)
		return corr_dict

	@staticmethod
	def make_supplementary_plot(nameList, corr_dict, **kwargs):
		"""Save one density-plus-boxplot figure per dataset.

		Arguments:
			nameList -- dataset names to plot.
			corr_dict -- mapping as returned by ``load_data``.

		Keyword arguments:
			output_folder -- prefix concatenated directly with the PDF file name
				(default ``'PATH'``).
		"""
		output_folder = kwargs.get('output_folder','PATH')

		for name in nameList:
			print(name)
			best_string_correlation_values = list(corr_dict[name + ":700"])
			string_correlation_values = list(corr_dict[name + ":all"])
			other_correlation_values = list(corr_dict[name + ":other"])
			# Box order (index 0..2) must match the colours assigned below.
			dataList = [corr_dict[name + ":700"],
						corr_dict[name + ":all"],
						corr_dict[name + ":other"]]

			sns.set_style("white")
			plt.rcParams["axes.grid"] = True

			fig = plt.figure(figsize=(5,5))
			gs = gridspec.GridSpec(10,10)

			# Upper panel: the three densities on a shared axis.
			ax = plt.subplot(gs[0:7,0:])
			plottingFacade.func_plotDensities_border(ax, other_correlation_values, facecolor="grey")
			plottingFacade.func_plotDensities_border(ax, string_correlation_values, facecolor="orange")
			plottingFacade.func_plotDensities_border(ax, best_string_correlation_values, facecolor="#EE7600")
			ax.set_xlim(-1,1)
			ax.set_xticklabels([])
			# Booleans rather than "off": a non-empty string is truthy.
			plt.tick_params(axis="y",which="both",left=False,right=False,labelsize=10)

			# Lower panel: horizontal boxplots of the same three lists.
			ax = plt.subplot(gs[7:,0:])
			bp = ax.boxplot(dataList,notch=0,sym="",vert=0,patch_artist=True,widths=(0.5,0.5,0.5))
			plt.setp(bp['medians'], color="black")
			plt.setp(bp['whiskers'], color="black",linestyle="-")
			box_colors = ["#EE7600", "orange"]
			for i,patch in enumerate(bp['boxes']):
				patch.set_facecolor(box_colors[i] if i < len(box_colors) else "#D8D8D8")
				patch.set_edgecolor("black")
				patch.set_alpha(1)
			ax.set_xlim(-1,1)
			ax.set_yticklabels([])
			plt.tick_params(axis="y",which="both",left=False,right=False,labelsize=15)
			plt.savefig(output_folder + "suppFig1a_string_correlation_recovery_" + name + ".pdf",
						bbox_inches="tight", dpi = 400)
			# Release the figure; otherwise one figure per dataset stays open.
			plt.close(fig)

	@staticmethod
	def get_significancies(nameList, **kwargs):
		"""Mann-Whitney U p-values for "other" versus "all" STRING correlations on random subsamples.

		Keyword arguments:
			folder -- used to load the correlation values when ``corr_dict`` is
				not supplied (default ``'PATH'``).
			corr_dict -- mapping as returned by ``load_data``; loaded from
				``folder`` when omitted. (The original body read an undefined
				``corr_dict`` name.)
			n_resamples -- number of subsampling rounds per dataset (default
				999, the length of the original ``xrange(1, 1000)``).
			sample_size -- values drawn without replacement from each list per
				round (default 1000); ``random.sample`` raises ValueError when a
				list is shorter than this.

		Returns:
			list of p-values for the LAST dataset in ``nameList`` (empty list if
			``nameList`` is empty).
			NOTE(review): the values of earlier datasets are computed and then
			discarded, exactly as before -- confirm whether a per-dataset result
			was intended.
		"""
		folder = kwargs.get('folder','PATH')
		corr_dict = kwargs.get('corr_dict')
		n_resamples = kwargs.get('n_resamples', 999)
		sample_size = kwargs.get('sample_size', 1000)

		if corr_dict is None:
			corr_dict = step5_figures.load_data(nameList, folder = folder)

		pval_distribution_mann = list()
		for name in nameList:
			print(name)
			string_correlation_values = list(corr_dict[name + ":all"])
			other_correlation_values = list(corr_dict[name + ":other"])

			pval_distribution_mann = list()
			for _ in range(n_resamples):
				other_sample = random.sample(other_correlation_values, sample_size)
				string_sample = random.sample(string_correlation_values, sample_size)
				# Index 1 of the result is the p-value.
				pval_mann_all = scipy.stats.mannwhitneyu(other_sample, string_sample)[1]
				pval_distribution_mann.append(pval_mann_all)
		return pval_distribution_mann

if __name__ == "__main__":
	## EXECUTE STEP5
	# Expected call: <script> <folder> <output_folder>
	# Both values are concatenated directly with file names further down, so
	# they should end with a path separator.
	if len(sys.argv) < 3:
		# Fail with a usage hint instead of a bare IndexError on sys.argv.
		sys.exit("usage: " + sys.argv[0] + " <folder> <output_folder>")
	data_folder = sys.argv[1]
	figure_folder = sys.argv[2]
	step5.execute(folder = data_folder, output_folder = figure_folder)
	step5_figures.execute(folder = data_folder, output_folder = figure_folder)
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource are Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.
GNU LICENSE
Download script here
Computational Pipeline

All Code Material

STEP 5: Python Code