# STEP 5: Python Code
class file_Loader:
    """Load the STRING interaction tables and the remapped datasets.

    Every table is read through ``DataFrameAnalyzer.open_in_chunks(folder,
    file_name)``; the objects it returns are passed on unchanged.
    """

    # (dictionary key, file stem) for every dataset returned by load_data().
    # The file that is read is "dataset_<stem>_remapped.tsv.gz".
    _DATASETS = [
        ('gygi1', 'gygi1'),
        ('gygi2', 'gygi2'),
        ('gygi3', 'gygi3'),
        ('battle_protein', 'battle_protein'),
        ('battle_ribo', 'battle_ribo'),
        ('battle_rna', 'battle_rna'),
        ('wu', 'wu'),
        ('tiannan', 'tiannan'),
        ('colo_cancer', 'coloCa'),
        ('tcga_breast', 'tcga_breast'),
        ('tcga_ovarian', 'tcga_ovarian'),
        ('bxd_protein', 'bxdMouse'),
        ('primateRNA', 'primateRNA'),
        ('primatePRO', 'primatePRO'),
        ('mann', 'mann_all_log2'),
    ]

    @staticmethod
    def _load_string_pair(folder, species):
        """Return the (all, 700) STRING tables for ``species``."""
        all_table = DataFrameAnalyzer.open_in_chunks(
            folder, "stringData_" + species + "_onlyGeneNames.tsv.gz")
        table_700 = DataFrameAnalyzer.open_in_chunks(
            folder, "stringData_" + species + "_700_onlyGeneNames.tsv.gz")
        return all_table, table_700

    @staticmethod
    def load_string_data_mouse(folder):
        """Load the mouse STRING tables.

        Args:
            folder: location handed to ``DataFrameAnalyzer.open_in_chunks``.

        Returns:
            Tuple ``(stringData_mouse, stringData_mouse_700)``.
        """
        return file_Loader._load_string_pair(folder, "mouse")

    @staticmethod
    def load_string_data_human(folder):
        """Load the human STRING tables.

        Args:
            folder: location handed to ``DataFrameAnalyzer.open_in_chunks``.

        Returns:
            Tuple ``(stringData_human, stringData_human_700)``.
        """
        return file_Loader._load_string_pair(folder, "human")

    @staticmethod
    def load_data(**kwargs):
        """Load every remapped dataset into a dictionary.

        Keyword Args:
            folder: location of the ``dataset_*_remapped.tsv.gz`` files
                (default ``'PATH'``).

        Returns:
            dict mapping dataset key (``'gygi1'``, ..., ``'colo_cancer'``,
            ``'mann'``) to the loaded table.
        """
        folder = kwargs.get('folder', 'PATH')
        # The previous dict literal referenced the undefined names `coloCa`
        # and `mann` (NameError); the table above binds each key directly to
        # the file that is loaded for it.
        data_dict = dict()
        for key, stem in file_Loader._DATASETS:
            data_dict[key] = DataFrameAnalyzer.open_in_chunks(
                folder, "dataset_" + stem + "_remapped.tsv.gz")
        return data_dict
class step5_preparation:
    """Export correlation values of STRING and non-STRING pairs for one dataset."""

    @staticmethod
    def execute(data, name, **kwargs):
        """Run the whole preparation for one dataset and write three JSON files.

        Args:
            data: dataset table; its index and its ``quant_`` columns are used.
            name: dataset name, appended to every output file name.

        Keyword Args:
            folder: prefix for the STRING input files and the JSON output
                files (default ``'PATH'``).
            species: ``'mouse'`` or ``'human'`` (default ``'mouse'``).
            output_folder: accepted for call compatibility; the JSON files
                are written under ``folder``.

        Raises:
            ValueError: if ``species`` is neither ``'mouse'`` nor ``'human'``.
        """
        folder = kwargs.get('folder', 'PATH')
        species = kwargs.get("species", "mouse")
        if species == "mouse":
            print("load_string_data:MOUSE")
            stringData, stringData_700 = file_Loader.load_string_data_mouse(folder)
        elif species == "human":
            print("load_string_data:HUMAN")
            stringData, stringData_700 = file_Loader.load_string_data_human(folder)
        else:
            # Previously fell through and failed later with a NameError.
            raise ValueError("species must be 'mouse' or 'human', got %r" % (species,))
        interactions_all = list(stringData["interaction"])
        interactions_700 = list(stringData_700["interaction"])
        print("get_other_list")
        other_interactions = step5_preparation.get_other_list(data, interactions_all)
        print("load_corr_data")
        corrData = step5_preparation.load_corr_data(data)
        print("overlap_STRING_interactions_with_data")
        proteinList_left = step5_preparation.overlap_STRING_interactions_with_data(
            corrData, stringData)
        proteinList_left700 = step5_preparation.overlap_STRING_interactions_with_data(
            corrData, stringData_700)
        print("overlap_OTHER_interactions_with_data")
        other_proteinList_left = step5_preparation.overlap_OTHER_interactions_with_data(
            corrData, other_interactions)
        print("get_export_relevant_STRING_correlation_values")
        info1 = (corrData, proteinList_left, interactions_all,
                 "string_correlations_allFINAL_" + name)
        step5_preparation.get_export_relevant_STRING_correlation_values(info1, folder=folder)
        info2 = (corrData, proteinList_left700, interactions_700,
                 "string_correlations_FINAL700_" + name)
        step5_preparation.get_export_relevant_STRING_correlation_values(info2, folder=folder)
        info3 = (corrData, other_proteinList_left, other_interactions,
                 'other_string_correlations_allFINAL_' + name)
        step5_preparation.get_export_relevant_OTHER_correlation_values(info3, folder=folder)

    @staticmethod
    def get_other_list(data, interactions_all):
        """Return the combinations of ``data.index`` not listed in ``interactions_all``.

        Args:
            data: dataset table whose index supplies the labels to combine.
            interactions_all: known STRING interactions.

        Returns:
            list of the remaining combinations, in arbitrary (set) order.
        """
        proteinList = list(data.index)
        protein_combiList = utilsFacade.getCombinations(proteinList)
        # NOTE(review): the export step unpacks each entry of this result as a
        # pair, while interactions_all is matched against "a:b" strings there.
        # If getCombinations returns tuples, this difference removes nothing
        # -- confirm against utilsFacade.
        other_interactions = list(set(protein_combiList).difference(set(interactions_all)))
        return other_interactions

    @staticmethod
    def load_corr_data(data):
        """Return the pairwise correlation matrix of the ``quant_`` columns.

        Args:
            data: dataset table; columns are selected with
                ``utilsFacade.filtering(columns, 'quant_')``.

        Returns:
            The result of ``DataFrame.corr()`` on the selected columns.
        """
        quant_cols = utilsFacade.filtering(list(data.columns), 'quant_')
        quant_data = data[quant_cols]
        # NOTE(review): DataFrame.corr correlates columns, so the result is
        # labelled by the quant_ column names, whereas downstream code
        # intersects these labels with gene symbols -- confirm the
        # orientation of `data` (a transpose may be intended).
        corrData = quant_data.corr()
        return corrData

    @staticmethod
    def _labels_in_index(corrData, candidates):
        """Return the index labels of ``corrData`` found in ``candidates``, skipping "nan"."""
        shared = set(corrData.index).intersection(set(candidates))
        # A list (not a lazy filter object) so the result can index a DataFrame
        # on Python 3 as well.
        return [label for label in shared if str(label) != "nan"]

    @staticmethod
    def overlap_STRING_interactions_with_data(corrData, stringData):
        """List the labels of ``corrData`` that appear in ``sym1`` or ``sym2``.

        Args:
            corrData: correlation matrix; its index supplies the labels.
            stringData: table with ``sym1`` and ``sym2`` columns.

        Returns:
            list of shared labels (arbitrary order), without "nan" entries.
        """
        symbols = set(stringData.sym1).union(set(stringData.sym2))
        return step5_preparation._labels_in_index(corrData, symbols)

    @staticmethod
    def overlap_OTHER_interactions_with_data(corrData, other_interactions):
        """List the labels of ``corrData`` that appear in any of the given pairs.

        Args:
            corrData: correlation matrix; its index supplies the labels.
            other_interactions: iterable of 2-element pairs.

        Returns:
            list of shared labels (arbitrary order), without "nan" entries;
            empty when ``other_interactions`` is empty.
        """
        members = set()
        for first, second in other_interactions:
            members.add(first)
            members.add(second)
        return step5_preparation._labels_in_index(corrData, members)

    @staticmethod
    def _export_masked_correlations(corrData, proteins, pair_labels, output_path):
        """Write the unique finite correlations of the labelled pairs as JSON.

        The square sub-matrix for ``proteins`` is taken from ``corrData``, each
        cell is labelled "row:column", and the cells whose label occurs in
        ``pair_labels`` are kept.
        """
        proteins = list(proteins)
        sub_corr = corrData.loc[proteins, proteins]
        row_names = list(sub_corr.index)
        col_names = list(sub_corr.columns)
        labels = pd.DataFrame(
            [[str(r) + ":" + str(c) for c in col_names] for r in row_names],
            index=row_names, columns=col_names)
        mask = labels.isin(list(pair_labels)).values.astype(bool)
        # Positional selection: no index alignment is involved.
        unique_vals = np.unique(sub_corr.values[mask])
        finite_unique_vals = unique_vals[np.isfinite(unique_vals)].tolist()
        with open(output_path, 'w') as outfile:
            json.dump(finite_unique_vals, outfile)

    @staticmethod
    def get_export_relevant_STRING_correlation_values(info1, **kwargs):
        """Export the correlations of STRING interaction pairs.

        Args:
            info1: tuple ``(corrData, proteinList_left, interactions_all,
                output_fileName)``; ``interactions_all`` holds "a:b" labels.

        Keyword Args:
            folder: prefix of the output path (default ``'PATH'``); the file
                written is ``folder + output_fileName + '.json'``.
        """
        folder = kwargs.get('folder', 'PATH')
        corrData, proteinList_left, interactions_all, output_fileName = info1
        step5_preparation._export_masked_correlations(
            corrData, proteinList_left, interactions_all,
            folder + output_fileName + '.json')

    @staticmethod
    def get_export_relevant_OTHER_correlation_values(info2, **kwargs):
        """Export the correlations of the non-STRING pairs.

        Args:
            info2: tuple ``(corrData, other_proteinList_left,
                other_interactions, output_fileName)``; ``other_interactions``
                is an iterable of 2-element pairs.

        Keyword Args:
            folder: prefix of the output path (default ``'PATH'``); the file
                written is ``folder + output_fileName + '.json'``.
        """
        folder = kwargs.get('folder', 'PATH')
        # The previous body used the undefined names other_proteinList_left /
        # other_interactions instead of the values unpacked here.
        corrData, other_proteinList_left, other_interactions, output_fileName = info2
        pair_labels = [str(o1) + ":" + str(o2) for o1, o2 in other_interactions]
        step5_preparation._export_masked_correlations(
            corrData, other_proteinList_left, pair_labels,
            folder + output_fileName + '.json')
class step5:
    """Driver that runs step5_preparation for every dataset from file_Loader."""

    @staticmethod
    def execute(**kwargs):
        """Prepare the correlation exports for all loaded datasets.

        Keyword Args:
            folder: forwarded to the loader and to the preparation step
                (default ``'PATH'``).
            output_folder: forwarded to the preparation step
                (default ``'PATH'``).
        """
        folder = kwargs.get('folder', 'PATH')
        output_folder = kwargs.get('output_folder', 'PATH')
        # Datasets processed with the mouse STRING tables; all others use human.
        mouse_datasets = ['gygi1', 'gygi2', 'gygi3', 'bxd_protein']
        for name, data in file_Loader.load_data(folder=folder).items():
            organism = 'mouse' if name in mouse_datasets else 'human'
            step5_preparation.execute(data,
                                      name,
                                      folder=folder,
                                      output_folder=output_folder,
                                      species=organism)
class step5_figures:
    """Plot and test the exported correlation distributions."""

    @staticmethod
    def execute(**kwargs):
        """Load the exported correlations, draw the plots and run the tests.

        Keyword Args:
            folder: prefix of the JSON input files (default ``'PATH'``).
            output_folder: prefix of the PDF output files (default ``'PATH'``).

        Returns:
            The p-value list produced by ``get_significancies``.
        """
        folder = kwargs.get('folder', 'PATH')
        output_folder = kwargs.get('output_folder', 'PATH')
        # NOTE(review): "mann_all_log2" and 'coloCa' differ from the keys
        # 'mann' and 'colo_cancer' that file_Loader.load_data() returns and
        # that end up in the exported file names -- confirm which spelling
        # the files on disk use.
        nameList = ["gygi1", "gygi3", "battle_protein", "wu",
                    "mann_all_log2", "tiannan", "primateRNA",
                    "primatePRO", "gygi2", "battle_rna", "battle_ribo",
                    'coloCa', 'tcga_breast', 'tcga_ovarian', 'bxd_protein']
        print("load_data")
        corr_dict = step5_figures.load_data(nameList, folder=folder)
        print('make_supplementary_plot')
        step5_figures.make_supplementary_plot(nameList, corr_dict,
                                              output_folder=output_folder)
        print("get_significancies")
        # Reuse the values already in memory instead of reading them again.
        return step5_figures.get_significancies(nameList, folder=folder,
                                                corr_dict=corr_dict)

    @staticmethod
    def _correlation_file(folder, name, ty):
        """Return the path of the JSON file for dataset ``name`` and type ``ty``.

        The first candidate is the file name this loader has always used; the
        second is the name written by step5_preparation.execute. The first
        candidate is returned when neither exists, so the error raised on
        opening names the original file.
        """
        import os

        if ty == "other":
            candidates = ["other_string_correlations_all_" + name + ".json",
                          "other_string_correlations_allFINAL_" + name + ".json"]
        elif ty == "700":
            candidates = ["string_correlations_700_" + name + ".json",
                          "string_correlations_FINAL700_" + name + ".json"]
        else:
            candidates = ["string_correlations_" + ty + "_" + name + ".json",
                          "string_correlations_" + ty + "FINAL_" + name + ".json"]
        for file_name in candidates:
            if os.path.isfile(folder + file_name):
                return folder + file_name
        return folder + candidates[0]

    @staticmethod
    def load_data(nameList, **kwargs):
        """Read the exported correlation lists for every dataset.

        Args:
            nameList: dataset names.

        Keyword Args:
            folder: prefix of the JSON files (default ``'PATH'``).

        Returns:
            dict mapping ``"<name>:<type>"`` (type in ``all``, ``700``,
            ``other``) to the list stored in the corresponding JSON file.
        """
        folder = kwargs.get('folder', 'PATH')
        corr_dict = dict()
        for name in nameList:
            print(name)
            for ty in ["all", "700", "other"]:
                print(ty)
                path = step5_figures._correlation_file(folder, name, ty)
                with open(path) as data_file:
                    corr_dict[name + ":" + ty] = json.load(data_file)
        return corr_dict

    @staticmethod
    def make_supplementary_plot(nameList, corr_dict, **kwargs):
        """Save one density + box plot PDF per dataset.

        Args:
            nameList: dataset names to plot.
            corr_dict: mapping as returned by ``load_data``.

        Keyword Args:
            output_folder: prefix of the PDF paths (default ``'PATH'``); each
                file is ``output_folder +
                "suppFig1a_string_correlation_recovery_<name>.pdf"``.
        """
        output_folder = kwargs.get('output_folder', 'PATH')
        sns.set_style("white")
        plt.rcParams["axes.grid"] = True
        box_colors = ["#EE7600", "orange", "#D8D8D8"]
        for name in nameList:
            print(name)
            # Box order (and colour order) is 700, all, other.
            dataList = [corr_dict[name + ":" + ty] for ty in ["700", "all", "other"]]
            best_string_correlation_values = list(dataList[0])
            string_correlation_values = list(dataList[1])
            other_correlation_values = list(dataList[2])

            fig = plt.figure(figsize=(5, 5))
            try:
                gs = gridspec.GridSpec(10, 10)
                ax = plt.subplot(gs[0:7, 0:])
                plottingFacade.func_plotDensities_border(
                    ax, other_correlation_values, facecolor="grey")
                plottingFacade.func_plotDensities_border(
                    ax, string_correlation_values, facecolor="orange")
                plottingFacade.func_plotDensities_border(
                    ax, best_string_correlation_values, facecolor="#EE7600")
                ax.set_xlim(-1, 1)
                ax.set_xticklabels([])
                # Booleans: the string "off" is truthy on current matplotlib.
                plt.tick_params(axis="y", which="both", left=False, right=False,
                                labelsize=10)

                ax = plt.subplot(gs[7:, 0:])
                bp = ax.boxplot(dataList, notch=0, sym="", vert=0,
                                patch_artist=True, widths=(0.5, 0.5, 0.5))
                plt.setp(bp['medians'], color="black")
                plt.setp(bp['whiskers'], color="black", linestyle="-")
                for patch, color in zip(bp['boxes'], box_colors):
                    patch.set_facecolor(color)
                    patch.set_edgecolor("black")
                    patch.set_alpha(1)
                ax.set_xlim(-1, 1)
                ax.set_yticklabels([])
                plt.tick_params(axis="y", which="both", left=False, right=False,
                                labelsize=15)
                plt.savefig(output_folder + "suppFig1a_string_correlation_recovery_"
                            + name + ".pdf",
                            bbox_inches="tight", dpi=400)
            finally:
                # One figure per dataset: release it so figures do not pile up.
                plt.close(fig)

    @staticmethod
    def get_significancies(nameList, **kwargs):
        """Mann-Whitney U p-values of "other" vs. "all" correlations on random subsamples.

        Args:
            nameList: dataset names to test.

        Keyword Args:
            folder: prefix of the JSON files, used only when ``corr_dict`` is
                not supplied (default ``'PATH'``).
            corr_dict: mapping as returned by ``load_data``; loaded from
                ``folder`` when omitted.
            n_resamples: number of subsample comparisons per dataset
                (default 999, the count of the original ``xrange(1, 1000)``).
            sample_size: values drawn from each list per comparison
                (default 1000).

        Returns:
            list of p-values for the last dataset in ``nameList``; an empty
            list when ``nameList`` is empty.

        Raises:
            ValueError: from ``random.sample`` when a list holds fewer than
                ``sample_size`` values.
        """
        folder = kwargs.get('folder', 'PATH')
        corr_dict = kwargs.get('corr_dict')
        n_resamples = kwargs.get('n_resamples', 999)
        sample_size = kwargs.get('sample_size', 1000)
        if corr_dict is None:
            # The previous body read an undefined `corr_dict` (NameError).
            corr_dict = step5_figures.load_data(nameList, folder=folder)

        # NOTE(review): only one list can be returned through this interface,
        # so the distributions of earlier datasets are computed and then
        # overwritten -- confirm whether a per-dataset result was intended.
        pval_distribution_mann = list()
        for name in nameList:
            print(name)
            string_correlation_values = list(corr_dict[name + ":all"])
            other_correlation_values = list(corr_dict[name + ":other"])
            pval_distribution_mann = [
                scipy.stats.mannwhitneyu(
                    random.sample(other_correlation_values, sample_size),
                    random.sample(string_correlation_values, sample_size))[1]
                for _ in range(n_resamples)
            ]
        return pval_distribution_mann
if __name__ == "__main__":
    # Both arguments are used as plain string prefixes (folder + file name)
    # when JSON and PDF files are opened, so each needs a trailing path
    # separator.
    if len(sys.argv) < 3:
        sys.exit("usage: " + sys.argv[0] + " <data_folder/> <output_folder/>")
    data_folder = sys.argv[1]
    figure_folder = sys.argv[2]
    ## EXECUTE STEP5
    step5.execute(folder=data_folder, output_folder=figure_folder)
    step5_figures.execute(folder=data_folder, output_folder=figure_folder)
# All scripts were developed by Natalie Romanov (Bork group, EMBL).
# The source code used in the analysis of protein complex variability across
# individuals is released under the GNU General Public License v3.0.
# All scripts on this website/web resource are Copyright (C) 2019 Natalie
# Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck,
# Peer Bork and EMBL.