STEP 6: Python Code

									
class file_Loader:

	@staticmethod
	def load_data(**kwargs):
		folder = kwargs.get('folder', 'PATH')

		gygi3 = DataFrameAnalyzer.open_in_chunks(folder, "dataset_gygi3_remapped.tsv.gz")
		gygi2 = DataFrameAnalyzer.open_in_chunks(folder, "dataset_gygi2_remapped.tsv.gz")
		gygi1 = DataFrameAnalyzer.open_in_chunks(folder, "dataset_gygi1_remapped.tsv.gz")
		battle_protein = DataFrameAnalyzer.open_in_chunks(folder, "dataset_battle_protein_remapped.tsv.gz")
		battle_ribo = DataFrameAnalyzer.open_in_chunks(folder, "dataset_battle_ribo_remapped.tsv.gz")
		battle_rna = DataFrameAnalyzer.open_in_chunks(folder, "dataset_battle_rna_remapped.tsv.gz")
		mann_all_log2 = DataFrameAnalyzer.open_in_chunks(folder, "dataset_mann_all_log2_remapped.tsv.gz")
		wu = DataFrameAnalyzer.open_in_chunks(folder, "dataset_wu_remapped.tsv.gz")
		tiannan = DataFrameAnalyzer.open_in_chunks(folder, "dataset_tiannan_remapped.tsv.gz")
		primatePRO = DataFrameAnalyzer.open_in_chunks(folder, "dataset_primatePRO_remapped.tsv.gz")
		primateRNA = DataFrameAnalyzer.open_in_chunks(folder, "dataset_primateRNA_remapped.tsv.gz")
		tcga_ovarian = DataFrameAnalyzer.open_in_chunks(folder, "dataset_tcga_ovarian_remapped.tsv.gz")
		tcga_breast = DataFrameAnalyzer.open_in_chunks(folder, "dataset_tcga_breast_remapped.tsv.gz")
		bxd_protein = DataFrameAnalyzer.open_in_chunks(folder, "dataset_bxdMouse_remapped.tsv.gz")
		colo_cancer = DataFrameAnalyzer.open_in_chunks(folder, "dataset_coloCa_remapped.tsv.gz")

		data_dict = {'gygi1':gygi1,
					 'gygi2':gygi2,
					 'gygi3': gygi3,
					 'battle_protein':battle_protein,
					 'battle_ribo': battle_ribo,
					 'battle_rna':battle_rna,
					 'wu':wu,
					 'tiannan':tiannan,
					 'colo_cancer':coloCa, 
					 'tcga_breast':tcga_breast, 
					 'tcga_ovarian':tcga_ovarian,
					 'bxd_protein':bxd_protein, 
					 'primateRNA':primateRNA, 
					 'primatePRO':primatePRO,
					 'mann':mann}

		return data_dict

	@staticmethod
	def load_housekeeping_data(folder):
		file_name = "housekeeping_genes.txt"
		housekeeping_data = DataFrameAnalyzer.getFile(folder,file_name)
		housekeeping_genes = housekeeping_data.index
		return housekeeping_genes

	@staticmethod
	def load_essential_genes(folder):
		file_name = "essentiality_genes.txt"
		essentiality_data = DataFrameAnalyzer.getFile(folder,file_name)
		kbm7_essentiality_data = essentiality_data[essentiality_data["KBM7 adjusted p-value"] < 0.05]
		k562_essentiality_data = essentiality_data[essentiality_data["K562 adjusted p-value"] < 0.05]
		jiyoye_essentiality_data = essentiality_data[essentiality_data["Jiyoye adjusted p-value"] < 0.05]
		raji_essentiality_data = essentiality_data[essentiality_data["Raji adjusted p-value"] < 0.05]
		kbm7_essential_genes = kbm7_essentiality_data.index
		k562_essential_genes = k562_essentiality_data.index
		jiyoye_essential_genes = jiyoye_essentiality_data.index
		raji_essential_genes = raji_essentiality_data.index
		return kbm7_essential_genes, k562_essential_genes, jiyoye_essential_genes, raji_essential_genes

	@staticmethod
	def load_string_data(folder, species):
		file_name = species + "_STRING_geneName_per_protein_allInteractingProteins_dict.json"
		with open(string_folder + file_name) as json_data:
			string_dict_all = json.load(json_data)

		file_name = species + "_STRING_only500_geneName_per_protein_allInteractingProteins_dict.json"
		with open(string_folder + file_name) as json_data:
			string_dict_500 = json.load(json_data)

		file_name = species + "_STRING_only700_geneName_per_protein_allInteractingProteins_dict.json"
		with open(string_folder + file_name) as json_data:
			string_dict_700 = json.load(json_data)

		return string_dict_all, string_dict_500, string_dict_700

	@staticmethod
	def get_complex_dictionary(**kwargs):
		folder = kwargs.get('folder','PATH')

		complexDict = DataFrameAnalyzer.read_pickle(folder + 'complex_dictionary.pkl')
		return complexDict

class step6_preparation(object):
	def __init__(self, data, name, species, **kwargs):
		self.data = data
		self.name = name
		self.species = species

		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('folder','PATH')

		complexDict = file_Loader.get_complex_dictionary(folder)

		print('load_modules')
		self.load_modules(folder, data, species)

		print('load_correlations')
		self.load_correlations()

		print('load_features')
		self.load_features()

		print("get_summary_auc")
		self.get_summary_auc()

		print("export_dataframes")
		self.export_dataframes(output_folder)

		print("make_auc_dict")
		self.auc_dict = self.make_auc_dict(output_folder)

		print('prepare_auc_matrix')
		self.prepare_auc_matrix(folder)

		print('summarize_essentiality')
		self.summarize_essentiality(folder)

	@staticmethod
	def get_corr_data(data):
		quant_cols = utilsFacade.filtering(list(data.columns), 'quant_')
		quant_data = data[quant_cols]
		corrData = quant_data.corr()
		return corrData

	def load_modules(self, folder, data, species):
		self.housekeeping_genes = file_Loader.load_housekeeping_data(folder)
		essential_genes = file_Loader.load_essential_genes(folder)
		essential_genes = self.kbm7_essential_genes, self.k562_essential_genes, self.jiyoye_essential_genes, self.raji_essential_genes

		self.corrData = step6_preparation.get_corr_data(data)

		pathway_data = data[data['reactome']>0]#"PATHWAY_REACTOME"
		if data.shape == pathway_data.shape:
			pathway_data = data[data['reactome']!=""]#"PATHWAY_REACTOME"

		self.pathway_proteins = list(pathway_data.index)

		other_data = data[~data.index.isin(pathway_proteins)]
		self.other_genes = filter(lambda a:str(a)!="nan",list(set(other_data.index)))
		
		self.string_dict_all, self.string_dict_500, self.string_dict_700 = file_Loader.load_string_data(folder, species)

	def load_correlations(self):
		print("get_complex_correlation_values")
		self.get_complex_correlation_values()
		print("get_pathway_correlation_values")
		self.get_pathway_correlation_values()
		print("get_housekeeping_correlation_values")
		self.get_housekeeping_correlation_values()
		print("get_compartment_correlation_values")
		self.get_compartment_correlation_values()
		print("get_essentiality_correlation_values")
		self.get_essentiality_correlation_values()
		print("get_chromosome_correlation_values")
		self.get_chromosome_correlation_values()
		print("get_other_correlation_values")
		self.get_other_correlation_values()
		print("get_STRING_correlation_values")
		self.get_STRING_correlation_values()

	def load_features(self):
		print("prepare_complex_df")
		self.prepare_complex_df()
		print("prepare_pathway_df")
		self.prepare_pathway_df()

		print("prepare_feature_df: COMPARTMENT")
		self.compartment_df = self.prepare_feature_df(self.compartment_correlation_values, self.pathway_other_correlations)
		
		print("prepare_feature_df: ESSENTIALITY")
		self.kbm7_essential_df = self.prepare_feature_df(self.kbm7_essential_correlation_values,self.pathway_other_correlations)
		self.k562_essential_df = self.prepare_feature_df(self.k562_essential_correlation_values,self.pathway_other_correlations)
		self.jiyoye_essential_df = self.prepare_feature_df(self.jiyoye_essential_correlation_values,self.pathway_other_correlations)
		self.raji_essential_df = self.prepare_feature_df(self.raji_essential_correlation_values,self.pathway_other_correlations)
		
		print("prepare_feature_df: CHROMOSOME")
		self.chromosome_df = self.prepare_feature_df(self.chromosome_correlation_values,self.pathway_other_correlations)
		
		print("prepare_feature_df: HOUSEKEEPING")
		self.housekeeping_df = self.prepare_feature_df(self.housekeeping_correlation_values,self.pathway_other_correlations)		
		
		print("prepare_feature_df: STRING")
		self.string_all_df = self.prepare_feature_df(self.string_correlation_values_all,self.pathway_other_correlations)
		self.string_500_df = self.prepare_feature_df(self.string_correlation_values_500,self.pathway_other_correlations)
		self.string_700_df = self.prepare_feature_df(self.string_correlation_values_700,self.pathway_other_correlations)

	def get_housekeeping_correlation_values(self):
		corrData = self.corrData
		all_genes = self.all_genes
		housekeeping_genes = self.housekeeping_genes

		housekeeping_genes_reformatted = list()
		for h in housekeeping_genes:
			housekeeping_genes_reformatted.append(h.strip())
		housekeeping_genes = housekeeping_genes_reformatted
		housekeeping_genes = [item[0] + item[1:].lower() for item in housekeeping_genes]

		housekeeping_genes = list(set(housekeeping_genes).difference(set(all_genes)))
		overlapping_genes = list(set(corrData.index).intersection(set(housekeeping_genes)))
		sub = corrData[overlapping_genes].T[overlapping_genes].T
		housekeeping_correlation_values = utilsFacade.get_correlation_values(sub)
		self.housekeeping_correlation_values = housekeeping_correlation_values

	def get_complex_correlation_values(self, complexDict):
		corrData = self.corrData
		species = self.species

		complex_correlation_values = list()
		all_genes = list()
		for complexID in complexDict:
			human_genes = complexDict[complexID][species + "GeneNames"]
			overlapping_genes = list(set(human_genes).intersection(set(corrData.index)))
			sub_corr = corrData[overlapping_genes].T[overlapping_genes].T
			corr_values = utilsFacade.get_correlation_values(sub_corr)
			complex_correlation_values.append(corr_values)
			all_genes.append(overlapping_genes)
		self.complex_correlation_values = utilsFacade.flatten(complex_correlation_values)
		self.all_genes = utilsFacade.flatten(all_genes)
		
	def get_pathway_correlation_values(self):
		data = self.data
		all_genes = self.all_genes
		pathway_data = self.pathway_data

		pathway_data = pathway_data[~pathway_data.index.isin(all_genes)]
		pathway_list = list()
		for pat in list(set(pathway_data['reactome'])):#"PATHWAY_REACTOME"
			for item in pat.split(",name:"):
				try:
					pathway_list.append(item.split("id:")[1])
				except:
					continue
		pathway_list = list(set(pathway_list))
		quant_cols = utilsFacade.get_quantCols(data)
		pathway_correlation_values = list()
		for pat in pathway_list:
			sub = pathway_data[pathway_data['reactome'].str.contains(pat)]#"PATHWAY_REACTOME"
			quant_sub = sub[quant_cols]
			corr_data = quant_sub.T.corr()
			corr_values = utilsFacade.get_correlation_values(corr_data)
			pathway_correlation_values.append(corr_values)
		pathway_correlation_values = utilsFacade.flatten(pathway_correlation_values)
		return pathway_correlation_values

	def get_other_correlation_values(self):
		corrData = self.corrData
		other_genes = self.other_genes
		complex_correlation_values = self.complex_correlation_values
		pathway_correlation_values = self.pathway_correlation_values 

		overlapping_genes = list(set(other_genes).intersection(set(corrData.index)))
		otherData = corrData[overlapping_genes]
		other_correlations = utilsFacade.get_correlation_values(otherData)
		try:
			self.complex_other_correlations = random.sample(other_correlations,len(complex_correlation_values))
			self.pathway_other_correlations = random.sample(other_correlations,len(pathway_correlation_values))
		except:
			self.complex_other_correlations = other_correlations
			self.pathway_other_correlations = other_correlations
		
	def get_essentiality_correlation_values(self):
		corrData = self.corrData
		all_genes = self.all_genes
		kbm7_essential_genes = self.kbm7_essential_genes
		k562_essential_genes = self.k562_essential_genes
		jiyoye_essential_genes = self.jiyoye_essential_genes
		raji_essential_genes = self.raji_essential_genes

		kbm7_essential_genes = list(set(kbm7_essential_genes).difference(set(all_genes)))
		k562_essential_genes = list(set(k562_essential_genes).difference(set(all_genes)))
		jiyoye_essential_genes = list(set(jiyoye_essential_genes).difference(set(all_genes)))
		raji_essential_genes = list(set(raji_essential_genes).difference(set(all_genes)))

		kbm7_essential_genes = [item[0] + item[1:].lower() for item in kbm7_essential_genes]
		k562_essential_genes = [item[0] + item[1:].lower() for item in k562_essential_genes]
		jiyoye_essential_genes = [item[0] + item[1:].lower() for item in jiyoye_essential_genes]
		raji_essential_genes = [item[0] + item[1:].lower() for item in raji_essential_genes]

		kbm7_genes = list(set(corrData.index).intersection(set(kbm7_essential_genes)))
		k562_genes = list(set(corrData.index).intersection(set(k562_essential_genes)))
		jiyoye_genes = list(set(corrData.index).intersection(set(jiyoye_essential_genes)))
		raji_genes = list(set(corrData.index).intersection(set(raji_essential_genes)))

		kbm7_corr = corrData[kbm7_genes].T[kbm7_genes].T
		k562_corr = corrData[k562_genes].T[k562_genes].T
		jiyoye_corr = corrData[jiyoye_genes].T[jiyoye_genes].T
		raji_corr = corrData[raji_genes].T[raji_genes].T

		self.kbm7_essential_correlation_values = utilsFacade.get_correlation_values(kbm7_corr)
		self.k562_essential_correlation_values = utilsFacade.get_correlation_values(k562_corr)
		self.jiyoye_essential_correlation_values = utilsFacade.get_correlation_values(jiyoye_corr)
		self.raji_essential_correlation_values = utilsFacade.get_correlation_values(raji_corr)

	def get_compartment_correlation_values(self):
		data = self.data
		compartment_data = data[data["location_humanProteinAtlas"]>0]
		compartment_list = list()
		for com in list(compartment_data["location_humanProteinAtlas"]):
			for item in com.split(";"):
				compartment_list.append(item)
		compartment_list = list(set(compartment_list))
		quant_cols = utilsFacade.get_quantCols(compartment_data)
		compartment_correlation_values = list()
		for compartment in compartment_list:
			sub = data[data["location_humanProteinAtlas"] == compartment][quant_cols]
			sub_corr = sub.T.corr()
			corr_values = utilsFacade.get_correlation_values(sub_corr)
			compartment_correlation_values.append(corr_values)
		self.compartment_correlation_values = utilsFacade.flatten(compartment_correlation_values)

	def get_chromosome_correlation_values(self):
		data = self.data
		data_chrom = data.copy()
		chrom_list = list()
		for chrom_pos in list(data_chrom["GENOMIC_POS"]):
			try:
				chrom_list.append(chrom_pos.split("chr:")[1].split(",start")[0])
			except:
				chrom_list.append(np.nan)
		data_chrom["chrom"] = pd.Series(chrom_list, index = data_chrom.index)
		quant_cols = utilsFacade.get_quantCols(data_chrom)
		chromosome_correlation_values = list()
		for chrom in map(str,list(xrange(1,23)))+["X"]:
			sub = data_chrom[data_chrom["chrom"] == chrom][quant_cols]
			sub_corr = sub.T.corr()
			corr_values = utilsFacade.get_correlation_values(sub_corr)
			chromosome_correlation_values.append(corr_values)
		self.chromosome_correlation_values = utilsFacade.flatten(chromosome_correlation_values)

	def get_STRING_correlation_values(self):
		corrData = self.corrData
		string_dict_all = self.string_dict_all
		string_dict_500 = self.string_dict_500
		string_dict_700 = self.string_dict_700

		overlapping_all = list(set(corrData.index).intersection(set(string_dict_all.keys())))
		overlapping_500 = list(set(corrData.index).intersection(set(string_dict_500.keys())))
		overlapping_700 = list(set(corrData.index).intersection(set(string_dict_700.keys())))

		string_correlation_values_all = list()
		string_correlation_values_500 = list()
		string_correlation_values_700 = list()
		for key in overlapping_700:
			interactors = string_dict_700[key]
			overlapping_interactors = list(set(corrData.index).intersection(set(interactors)))
			if len(overlapping_interactors) > 0:
				sub_corr = corrData[overlapping_interactors].T[key].T
			string_correlation_values_700.append(sub_corr)
		for key in overlapping_500:
			interactors = string_dict_500[key]
			overlapping_interactors = list(set(corrData.index).intersection(set(interactors)))
			if len(overlapping_interactors) > 0:
				sub_corr = corrData[overlapping_interactors].T[key].T
			string_correlation_values_500.append(sub_corr)
		for key in overlapping_all:
			interactors = string_dict_all[key]
			overlapping_interactors = list(set(corrData.index).intersection(set(interactors)))
			if len(overlapping_interactors) > 0:
				sub_corr = corrData[overlapping_interactors].T[key].T
			string_correlation_values_all.append(sub_corr)
		self.string_correlation_values_700 = utilsFacade.flatten(string_correlation_values_700)
		self.string_correlation_values_500 = utilsFacade.flatten(string_correlation_values_500)
		self.string_correlation_values_all = utilsFacade.flatten(string_correlation_values_all)

	def prepare_pathway_df(self):
		pathway_correlation_values = self.pathway_correlation_values
		pathway_other_correlations = self.pathway_other_correlations

		df = pd.DataFrame({"correlations": pathway_correlation_values + pathway_other_correlations,
						   "label": [True]*len(pathway_correlation_values) + [False]*len(pathway_other_correlations)})
		df = df.sort_values("correlations", ascending = True)

		count_list = list()
		dec_list = list(utilsFacade.frange(-1,1,0.01))
		max_corr = df["correlations"].max()
		for count,dec in enumerate(list(reversed(np.array(dec_list)))):
			if dec < max_corr:
				break
		idx = count + 1

		df_small = pd.DataFrame({"correlations":dec_list[:-idx]})
		fpr_list = list()
		tpr_list = list()
		recall_list = list()
		accuracy_list = list()
		precision_list = list()
		all_true_positives = len(df[df.label == True])
		all_false_positives = len(df[df.label == False])
		for t in dec_list[:-idx]:
			rejected = df[df["correlations"]=t]

			total_info = self.calculate_tp_fp(sub, rejected)
			true_positives, false_positives, false_negatives, true_negatives, tpr, fpr, precision, recall, accuracy = total_info

			fpr_list.append(fpr)
			tpr_list.append(tpr)
			recall_list.append(recall)
			accuracy_list.append(accuracy)
			precision_list.append(precision)
		df_small["fpr"] = pd.Series(fpr_list, index = df_small.index)
		df_small["tpr"] = pd.Series(tpr_list, index = df_small.index)
		df_small["recall"] = pd.Series(recall_list, index = df_small.index)
		df_small["accuracy"] = pd.Series(accuracy_list, index = df_small.index)
		df_small["precision"] = pd.Series(precision_list, index = df_small.index)
		auc_value = sklearn.metrics.auc(fpr_list,tpr_list)
		df_small["auc"] = [auc_value]*len(df_small)
		self.pathway_df = df_small

	def prepare_complex_df(self):
		complex_correlation_values = self.complex_correlation_values
		complex_other_correlations = self.complex_other_correlations

		df = pd.DataFrame({"correlations": complex_correlation_values + complex_other_correlations,
						   "label": [True]*len(complex_correlation_values) + [False]*len(complex_other_correlations)})
		df = df.sort_values("correlations", ascending = True)

		fpr_list = list()
		tpr_list = list()
		recall_list = list()
		accuracy_list = list()
		precision_list = list()
		all_true_positives = len(df[df.label == True])
		all_false_positives = len(df[df.label == False])
		count = 0
		for i,row in df.iterrows():
			rejected = df[0:count]
			sub = df[count:]

			total_info = self.calculate_tp_fp(sub, rejected)
			true_positives, false_positives, false_negatives, true_negatives, tpr, fpr, precision, recall, accuracy = total_info

			fpr_list.append(fpr)
			tpr_list.append(tpr)
			recall_list.append(recall)
			accuracy_list.append(accuracy)
			precision_list.append(precision)
			count +=1 
		df["tpr"] = pd.Series(tpr_list,index = df.index)
		df["fpr"] = pd.Series(fpr_list,index = df.index)
		df["auc"] = [sklearn.metrics.auc(fpr_list,tpr_list)] * len(df)
		df["recall"] = pd.Series(recall_list, index = df.index)
		df["accuracy"] = pd.Series(accuracy_list, index = df.index)
		df["precision"] = pd.Series(precision_list, index = df.index)
		df.sort_values("correlations",ascending=False).head()
		self.complex_df = df

	def prepare_feature_df(self,correlation_values,other_correlations):
		compartment_correlation_values = correlation_values
		pathway_other_correlations = other_correlations
		if len(pathway_other_correlations) > len(compartment_correlation_values):
			compartment_other_correlations = random.sample(other_correlations,len(compartment_correlation_values))
		else:
			compartment_other_correlations = pathway_other_correlations

		df = pd.DataFrame({"correlations": compartment_correlation_values + compartment_other_correlations,
						   "label": [True]*len(compartment_correlation_values) + [False]*len(compartment_other_correlations)})
		df = df.sort_values("correlations", ascending = True)

		count_list = list()
		dec_list = list(utilsFacade.frange(-1,1,0.01))
		max_corr = df["correlations"].max()
		for count,dec in enumerate(list(reversed(np.array(dec_list)))):
			if dec < max_corr:
				break
		idx = count + 1

		df_small = pd.DataFrame({"correlations":dec_list[:-idx]})
		fpr_list = list()
		tpr_list = list()
		recall_list = list()
		accuracy_list = list()
		precision_list = list()
		all_true_positives = len(df[df.label == True])
		all_false_positives = len(df[df.label == False])
		for t in dec_list[:-idx]:
			rejected = df[df["correlations"]=t]

			total_info = self.calculate_tp_fp(sub, rejected)
			true_positives, false_positives, false_negatives, true_negatives, tpr, fpr, precision, recall, accuracy = total_info

			fpr_list.append(fpr)
			tpr_list.append(tpr)
			recall_list.append(recall)
			accuracy_list.append(accuracy)
			precision_list.append(precision)

		df_small["fpr"] = pd.Series(fpr_list, index = df_small.index)
		df_small["tpr"] = pd.Series(tpr_list, index = df_small.index)
		df_small["recall"] = pd.Series(recall_list, index = df_small.index)
		df_small["accuracy"] = pd.Series(accuracy_list, index = df_small.index)
		df_small["precision"] = pd.Series(precision_list, index = df_small.index)
		auc_value = sklearn.metrics.auc(fpr_list,tpr_list)
		df_small["auc"] = [auc_value]*len(df_small)

		return df_small

	def calculate_tp_fp(self, sub, rejected):
		true_positives = float(len(sub[sub.label == True]))
		false_positives = float(len(sub[sub.label == False]))
		false_negatives = float(len(rejected.label == True))
		true_negatives = float(len(rejected.label == False))
		tpr = true_positives/all_true_positives
		fpr = false_positives/all_false_positives
		precision = true_positives/(true_positives + false_positives)
		recall = true_positives/(true_positives + false_negatives)
		accuracy = (true_positives + true_negatives)/(true_positives + true_negatives + false_positives + false_negatives)

		total_info = (true_positives, false_positives, false_negatives, true_negatives,
					  tpr, fpr, precision, recall, accuracy)
		return total_info

	def get_summary_auc(self):
		complex_auc = self.complex_df["auc"][0]
		pathway_auc = self.pathway_df["auc"][0]
		compartment_auc = self.compartment_df["auc"][0]
		string_all_auc = self.string_all_df["auc"][0]
		string_500_auc = self.string_500_df["auc"][0]
		string_700_auc = self.string_700_df["auc"][0]
		chromosome_auc = self.chromosome_df["auc"][0]
		housekeeping_auc = self.housekeeping_df["auc"][0]
		kbm7_essential_auc = self.kbm7_essential_df["auc"][0]
		k562_essential_auc = self.k562_essential_df["auc"][0]
		raji_essential_auc = self.raji_essential_df["auc"][0]
		jiyoye_essential_auc = self.jiyoye_essential_df["auc"][0]

		auc_list = [complex_auc,pathway_auc,compartment_auc, string_all_auc,
					string_500_auc, string_700_auc, chromosome_auc, housekeeping_auc,
					kbm7_essential_auc, k562_essential_auc, raji_essential_auc,
					jiyoye_essential_auc]
		name_list = ["complex","pathway", "compartment", "string_all",
					 "string_500","string_700", "chromosome", "housekeeping",
					 "kbm7_essential", "k562_essential", "raji_essential", "jiyoye_essential"]
		self.auc_df = pd.DataFrame({"name":name_list,"auc":auc_list})

	def export_dataframes(self, output_folder):
		name = self.name
		auc_df = self.auc_df
		complex_df = self.complex_df
		pathway_df = self.pathway_df
		compartment_df = self.compartment_df
		string_all_df = self.string_all_df
		string_500_df = self.string_500_df
		string_700_df = self.string_700_df
		chromosome_df = self.chromosome_df
		housekeeping_df = self.housekeeping_df
		kbm7_essential_df = self.kbm7_essential_df
		k562_essential_df = self.k562_essential_df
		raji_essential_df = self.raji_essential_df
		jiyoye_essential_df = self.jiyoye_essential_df

		#creating roc_temporary files
		auc_df.to_csv(output_folder + name + "_figure1_data_AUC.tsv.gz",sep = "\t", compression = "gzip")
		complex_df.to_csv(output_folder + name + "_figure1_data_COMPLEX.tsv.gz",sep = "\t", compression = "gzip")
		pathway_df.to_csv(output_folder + name + "_figure1_data_PATHWAY.tsv.gz",sep = "\t", compression = "gzip")
		compartment_df.to_csv(output_folder + name + "_figure1_data_COMPARTMENT.tsv.gz",sep = "\t", compression = "gzip")
		string_all_df.to_csv(output_folder + name + "_figure1_data_STRING_ALL.tsv.gz",sep = "\t", compression = "gzip")
		string_500_df.to_csv(output_folder + name + "_figure1_data_STRING_500.tsv.gz",sep = "\t", compression = "gzip")
		string_700_df.to_csv(output_folder + name + "_figure1_data_STRING_700.tsv.gz",sep = "\t", compression = "gzip")
		chromosome_df.to_csv(output_folder + name + "_figure1_data_CHROMOSOME.tsv.gz",sep = "\t", compression = "gzip")
		housekeeping_df.to_csv(output_folder + name + "_figure1_data_HOUSEKEEPING.tsv.gz",sep = "\t", compression = "gzip")
		kbm7_essential_df.to_csv(output_folder + name + "_figure1_data_KBM7_ESSENTIALITY.tsv.gz",sep = "\t", compression = "gzip")
		k562_essential_df.to_csv(output_folder + name + "_figure1_data_K562_ESSENTIALITY.tsv.gz",sep = "\t", compression = "gzip")
		jiyoye_essential_df.to_csv(output_folder + name + "_figure1_data_JIYOYE_ESSENTIALITY.tsv.gz",sep = "\t", compression = "gzip")
		raji_essential_df.to_csv(output_folder + name + "_figure1_data_RAJI_ESSENTIALITY.tsv.gz",sep = "\t", compression = "gzip")

	def make_auc_dict(self, folder):
		name_list = ["battle_protein","tiannan","battle_ribo", "battle_rna",
					 "primateRNA", "primatePRO", "wu","mann_all_log2",
					 "gygi1","gygi2","gygi3","tcga_ovarian",'tcga_breast',
					 'bxd_protein','colo_cancer']		

		auc_dict = dict((e1,list()) for e1 in name_list)
		for name in name_list:
			file_name = name + "_figure1_data_AUC.tsv.gz"
			data = DataFrameAnalyzer.open_in_chunks(ffolder, file_name)
			data.index = data["name"]
			data = data.drop(["string_all","string_500"],0)
			data_dict = data["auc"].to_dict()
			auc_dict[name] = data_dict

		DataFrameAnalyzer.to_pickle(auc_dict, folder + 'fig2a_auc_dictionary.pkl')
		return auc_dict

	def prepare_auc_matrix(self,folder):
		auc_dict = self.auc_dict

		linkage_method = 'average'
		metric = 'euclidean'

		auc_matrix = pd.DataFrame(auc_dict)
		auc_matrix = auc_matrix.replace(np.nan, 0.5)
		auc_matrix = utilsFacade.recluster_matrix(auc_matrix,
					 linkage_method = linkage_method, metric = metric)
		auc_matrix.to_csv(folder + 'fig2a_auc_matrix.tsv.gz', sep = '\t', compression = 'gzip')
		return auc_matrix

	def summarize_essentiality(self,folder):
		auc_df = self.auc_df

		auc_df = auc_df.T
		essential_columns = filter(lambda a:str(a).find("ess")!=-1, list(auc_df.columns))
		mean_values = np.mean(auc_df[essential_columns].T)
		auc_df["essential"] = mean_values
		auc_df = auc_df.drop(essential_columns,1)
		auc_matrix = auc_df.T
		auc_matrix.to_csv(folder + 'fig2a_auc_matrix.tsv.gz', sep = '\t', compression = 'gzip')
		return auc_matrix

class step6:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		data_dict = file_Loader.load_data(folder = folder)

		for name in data_dict.keys():
			data = data_dict[name]
			if name in ['gygi1','gygi2','gygi3','bxd_protein']:
				step6_preparation.execute(data, name, species, folder = folder, output_folder = output_folder) 
			else:
				step6_preparation.execute(data, name, species, folder = folder, output_folder = output_folder) 

class step6_figure2a:
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('folder','PATH')

		print('FIGURE2A: main_figure2a_rocMatrix')
		step6_figure2a.main_figure2a_rocMatrix(folder = folder, output_folder = output_folder)

	@staticmethod
	def main_figure2a_rocMatrix(**kwargs):	
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('folder','PATH')

		print('get_auc_matrix')
		auc_matrix = step6_figure2a.get_auc_matrix(folder = folder)

		print("plot_auc_matrix")
		step6_figure2a.plot_auc_matrix(auc_matrix, output_folder = output_folder)

		print('export_underlying_data')
		step6_figure2a.export_underlying_data(auc_matrix, folder = folder, output_folder = output_folder)

	@staticmethod
	def get_auc_dict(**kwargs):
		folder = kwargs.get('folder','PATH')

		auc_dict = DataFrameAnalyzer.read_pickle(folder + 'fig2a_auc_dictionary.pkl')
		return auc_dict

	@staticmethod
	def get_specific_color_gradient(colormap,inputList, **kwargs):
		vmin = kwargs.get("vmin", False)
		vmax = kwargs.get("vmax", False)

		cm = plt.get_cmap(colormap)
		if type(inputList)==list:
			if vmin == False and vmax == False:
				cNorm = mpl.colors.Normalize(vmin=min(inputList), vmax=max(inputList))
			else:
				cNorm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
		else:
			if vmin == False and vmax == False:
				cNorm = mpl.colors.Normalize(vmin=inputList.min(), vmax=inputList.max())
			else:
				cNorm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
		scalarMap = mpl.cm.ScalarMappable(norm=cNorm, cmap=cm)
		scalarMap.set_array(inputList)
		colorList = scalarMap.to_rgba(inputList)
		return scalarMap,colorList

	@staticmethod
	def plot_auc_matrix(auc_df, **kwargs):
		output_folder = kwargs.get('folder','PATH')

		datasets = ["tcga_ovarian","battle_protein",'colo_cancer',"gygi3",
					'tcga_breast',"gygi1","mann_all_log2","primatePRO","wu",
					"battle_ribo","battle_rna","gygi2",'bxd_protein',
					"primateRNA","tiannan"]
		categories = ["chromosome","housekeeping","essential","pathway",
					  "compartment","string_700","complex"]
		data = auc_df.T
		data = data[categories]
		data = data.T[datasets].T

		data.index = ["TCGA Ovarian Cancer(P)",'Human Individuals(Battle,P)',
					  'TCGA Colorectal Cancer(P)','DO mouse strains(P)',
					  "TCGA Breast Cancer(P)",'Founder mouse strains (P)',
					  'Human cell lines(P)','Primate cells(P)',
					  'Human Individuals(Wu,P)', 'Human Individuals(RP)',
					  'Human Individuals(RS)','DO mouse strains(RS)',
					  'BXD80 mouse strains(P)','Primate cells(RS)',
					  'Kidney cancer cells(P)']
		data.columns = ['chromosome','housekeeping','essential',
						'pathway','compartment','STRING','complex']


		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = False

		plt.clf()
		x_mean_list = list()
		for col in data.columns:
			x_mean_list.append(np.mean(utilsFacade.finite(list(data[col])))-0.5)
		y_mean_list = list()
		for col in data.T.columns:
			y_mean_list.append(max(utilsFacade.finite(list(data.T[col])))-0.5)

		plt.clf()
		fig = plt.figure(figsize = (5,8))
		gs = gridspec.GridSpec(16,11)
		ax1_density = plt.subplot(gs[0:2,0:8])
		ax1_density.set_ylim(0.5,0.8)
		ax1_density.axhline(0.55, alpha = 0.6, color="grey", linestyle='--', linewidth = 0.2, zorder=1)
		ax1_density.axhline(0.6, alpha = 0.6, color="grey", linestyle='--', linewidth = 0.2, zorder=1)
		ax1_density.axhline(0.65, alpha = 0.6, color="grey", linestyle='--', linewidth = 0.2, zorder=1)
		ax1_density.axhline(0.7, alpha = 0.6, color="grey", linestyle='--', linewidth = 0.2, zorder=1)
		ax1_density.axhline(0.75, alpha = 0.6, color="grey", linestyle='--', linewidth = 0.2, zorder=1)
		scalarmap_x, colorList_x = figure2a.get_specific_color_gradient(plt.cm.Greys,
															   			np.array(xrange(len(x_mean_list))))
		ax1_density.bar(np.arange(len(x_mean_list)), 
			x_mean_list, 0.95, color = colorList_x, bottom = 0.5,
			edgecolor = "white", linewidth = 2, zorder = 3)
		plt.xticks(list(xrange(len(data.columns))))
		ax1_density.set_xticklabels([])
		ax1_density.set_xlim(0,len(data.columns))
		ax = plt.subplot(gs[2:10,0:8])
		scalarmap, colorList = figure2a.get_specific_color_gradient(plt.cm.RdBu,
														   			np.array(data), vmin = 0.4, vmax = 0.7)
		sns.heatmap(data, cmap = plt.cm.RdBu, vmin = 0.4,vmax = 0.7,
			linecolor = "white", linewidth = 2, cbar = False)
		y_mean_list = y_mean_list[::-1]
		ax2_density = plt.subplot(gs[2:10,8:10])
		plt.yticks(list(xrange(len(data.index))))
		ax2_density.set_ylim(0,len(data.index))
		ax2_density.set_xlim(0.5,0.85)
		scalarmap_y, colorList_y = figure2a.get_specific_color_gradient(plt.cm.Greys,
															   			np.array(xrange(len(y_mean_list))))
		plt.setp(ax2_density.get_xticklabels(), rotation = 90)
		ax2_density.set_yticklabels([])
		ax2_density.axvline(0.7, color = "red", linestyle = "--", linewidth = 0.5)
		ax2_density.axvline(0.55, alpha = 0.6, color="grey", linestyle='--', linewidth = 0.2, zorder=1)
		ax2_density.axvline(0.6, alpha = 0.6, color="grey", linestyle='--', linewidth = 0.2, zorder=1)
		ax2_density.axvline(0.65, alpha = 0.6, color="grey", linestyle='--', linewidth = 0.2, zorder=1)
		ax2_density.axvline(0.75, alpha = 0.6, color="grey", linestyle='--', linewidth = 0.2, zorder=1)
		ax2_density.axvline(0.8, alpha = 0.6, color="grey", linestyle='--', linewidth = 0.2, zorder=1)
		ax2_density.barh(np.arange(len(y_mean_list)), y_mean_list,
						 0.95, color = colorList_y, left = 0.5,
						 edgecolor = "white", linewidth = 2, zorder = 3)
		ax_category = plt.subplot(gs[2:10,10:11])
		df = pd.DataFrame({"color":["green"]*7+["lightgreen"]+["magenta"]*2+["green","magenta","green"]})
		df = pd.DataFrame({'color':14*[1]})
		sns.heatmap(df, cbar = False, linewidth = 2, linecolor = "white")
		ax_category.axis("off")
		ax_cbar = plt.subplot(gs[13:14,0:8])
		cbar = fig.colorbar(scalarmap, cax = ax_cbar, 
			   orientation = "horizontal")
		cbar.set_label("Area under curve (AUC)")
		plt.savefig(output_folder + "fig2a_auc_matrix.pdf", bbox_inches = "tight", dpi=600)

class step6_figure2b:

	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('folder','PATH')

		print('FIGURE2B: main_figure2b_vignette_complexEffect')
		step6_figure2b.main_figure2b_vignette_complexEffect(folder = folder, output_folder = output_folder)

	@staticmethod
	def main_figure2b_vignette_complexEffect(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('folder','PATH')

		print('plot_complex_effect:tcga_ovarian')
		step6_figure2b.plot_complex_effect('tcga_ovarian')
		print('plot_complex_effect:battle_protein')
		step6_figure2b.plot_complex_effect('battle_protein')
		print('plot_complex_effect:gygi1')
		step6_figure2b.plot_complex_effect('gygi1')
		print('plot_complex_effect:gygi3')
		step6_figure2b.plot_complex_effect('gygi3')
		print('plot_complex_effect:tcga_breast')
		step6_figure2b.plot_complex_effect('tcga_breast')
		print('plot_complex_effect:colo_cancer')
		step6_figure2b.plot_complex_effect('colo_cancer')

	@staticmethod
	def get_data(name, **kwargs):
		folder = kwargs.get('folder','PATH')

		complex_data = DataFrameAnalyzer.open_in_chunks(folder, name + '_figure1_data_COMPLEX.tsv.gz')
		return complex_data

	@staticmethod
	def plot_complex_effect(name, **kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('folder','PATH')

		complex_data = step6_figure2b.get_data(name, folder = folder)
		other_correlations = utilsFacade.finite(list(complex_data[complex_data.label==False]["correlations"]))
		complex_correlations = utilsFacade.finite(list(complex_data[complex_data.label==True]["correlations"]))

		pval_list1 = list()
		for i in xrange(1,1000):
			corr1 = random.sample(complex_correlations,100)
			corr2 = random.sample(other_correlations,100)
			odds1, pval1 = scipy.stats.mannwhitneyu(corr1, corr2)
			pval_list1.append(pval1)
		print(np.mean(pval_list1))

		sns.set(context='notebook', style='white', 
			palette='deep', font='Liberation Sans', font_scale=1, 
			color_codes=False, rc=None)
		plt.rcParams["axes.grid"] = True

		plt.clf()
		fig = plt.figure(figsize = (7,5))
		ax = fig.add_subplot(111)
		ax.set_ylabel('Density', fontsize=12)
		ax.set_xlabel('correlation coefficient (pearson)', fontsize=12)
		ax.hist(other_correlations, bins = 50, color='grey', alpha =0.3, normed = 1)
		plottingFacade.func_plotDensities_border(ax, other_correlations, 
												 linewidth = 2, alpha = 1, facecolor = 'grey')
		ax.hist(complex_correlations, bins = 50, color='#AF2D2D', alpha =0.3, normed =1)
		plottingFacade.func_plotDensities_border(ax, complex_correlations, 
												 linewidth = 2, alpha = 1, facecolor = '#AF2D2D')
		plottingFacade.make_full_legend(ax,['n(other)='+str(len(other_correlations)),
			'n(complex)='+str(len(complex_correlations)),
			'pvalComplex(Wilcoxon)='+str(np.mean(pval_list1))],['grey']*3, loc = 'upper left')
		ax.set_xlim(-1,1)
		plt.savefig(output_folder + 'fig2b_' + name + '_complex_density_effect.pdf',
					bbox_inches = 'tight', dpi = 400)

class step6_export:
	
	@staticmethod
	def execute(**kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('folder','PATH')

		print('get_auc_dict')
		auc_matrix = step6_export.get_auc_dict(folder = folder)

		print('export_underlying_data')
		step6_export.export_underlying_data(auc_matrix, folder = folder, output_folder = output_folder)

	@staticmethod
	def get_auc_dict(**kwargs):
		folder = kwargs.get('folder','PATH')

		auc_dict = DataFrameAnalyzer.read_pickle(folder + 'fig2a_auc_dictionary.pkl')
		auc_matrix = pd.DataFrame(auc_dict)
		return auc_matrix

	@staticmethod
	def export_underlying_data(auc_matrix, **kwargs):
		folder = kwargs.get('folder','PATH')
		output_folder = kwargs.get('output_folder','PATH')

		category_list = ['chromosome', 'compartment', 'complex',
						 'housekeeping', 'jiyoye_essentiality', 
						 'k562_essentiality','kbm7_essentiality',
						 'pathway', 'raji_essentiality','string_700']
		name_list = ["battle_protein","tiannan","battle_ribo", "battle_rna",
					 "primateRNA", "primatePRO", "wu","mann_all_log2",
					 "gygi1","gygi2","gygi3","tcga_ovarian",'tcga_breast',
					 'bxd_protein','colo_cancer']		

		concat_list = list()
		names_columns = list()
		category_columns = list()
		for name in name_list:
			for category in category_list:
				print(name,category)
				file_name = name + '_figure1_data_' + category.upper() + '.tsv.gz'
				data = DataFrameAnalyzer.open_in_chunks(folder, file_name)
				concat_list.append(data)
				for i in xrange(len(data)):
					names_columns.append(name)
					category_columns.append(category)

		data = pd.concat(concat_list)
		data['name'] = pd.Series(names_columns, index = data.index)
		data['category'] = pd.Series(category_columns, index = data.index)

		#add numbers of interactions
		category_list = ['chromosome', 'compartment', 'complex',
						 'housekeeping', 'jiyoye', 'k562',
						 'kbm7', 'pathway', 'raji',
						 'STRING_700']
		name_list = ["battle_protein","tiannan","battle_ribo", "battle_rna",
					 "primateRNA", "primatePRO", "wu","mann_all_log2",
					 "gygi1","gygi2","gygi3","tcga_ovarian",'tcga_breast',
					 'bxd_protein','coloCa']	

		num_dict = dict((e1,dict()) for e1 in name_list)
		for name in name_list:
			num_data = DataFrameAnalyzer.getFile(folder, 'numInteractions_aurocAnalysis_' + name + '_forMethods.tsv')
			num_data = num_data.T
			temp_dict = num_data.to_dict()
			num_dict[name] = dict((e1,dict()) for e1 in category_list)
			for category in category_list:
				num_dict[name][category] = temp_dict[category]

		pos_num_list = list()
		neg_num_list = list()
		real_neg_num_list = list()
		for i,row in data.iterrows():
			name = row['name']
			category = row['category']

			if category.endswith('_essentiality')==True:
				category = category.split('_')[0]
			if category == 'string_700':
				category = 'STRING_700'
			if name == 'colo_cancer':
				name = 'coloCa'

			pos_num_list.append('')
			neg_num_list.append('')
			real_neg_num_list.append('')

		data['pos_num'] = pd.Series(pos_num_list, index = data.index)
		data['neg_num'] = pd.Series(neg_num_list, index = data.index)
		data['real_neg_num'] = pd.Series(real_neg_num_list, index = data.index)

		data.to_csv(output_folder + 'suppTable2_fig2a_underlying_data_ROCanalysis.tsv',
					sep = '\t')

if __name__ == "__main__":
	## EXECUTE STEP6
	step6.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	step6_figure2a.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	step6_figure2b.execute(folder = sys.argv[1], output_folder = sys.argv[2])
	step6_export.execute(folder = sys.argv[1], output_folder = sys.argv[2])
									
								

All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource is Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.

GNU LICENSE

Download script here