STEP 18: Python Code
class stringData_yeast:
    """Compare correlations between STRING interaction partners with those of all other pairs.

    The pipeline has three optional stages, driven by :meth:`execute`:

    1. build protein -> interactor dictionaries from the STRING links table
       (all links, and links with ``combined_score >= 700``) and pickle them;
    2. for every dataset listed in ``_DATASETS``, collect per-protein
       correlation values for interactors / high-score interactors / all other
       proteins and pickle them as ``<name>.string.corrs.pkl``;
    3. plot the three distributions per dataset and save them as PDF.

    All paths are built by plain string concatenation (``folder + filename``),
    so ``folder`` has to end with a path separator.
    """

    # One row per dataset processed by collect_corr_yeast_datasets:
    # (output name, label used in the progress message, input file,
    #  pattern handed to utilsFacade.filtering, extra keyword arguments for
    #  that call, whether the last matching column is dropped).
    # The patterns are kept exactly as they were written per dataset
    # (note 'rel. Intensity' vs 'rel.Intensity').
    _DATASETS = (
        ('yeast3', 'YEAST3',
         'yeast3_quant_Proteome_carbonSources_MAPPED_complexes_pathways_NORM2.tsv.gz',
         'rel. Intensity', {}, False),
        ('yeast4', 'yeast4',
         'yeast4_quant_Proteome_naclStress_MAPPED_complexes_pathways_NORM2.tsv.gz',
         'rel.Intensity', {}, False),
        ('yeast5', 'yeast5',
         'yeast5_quant_Proteome_MAPPED_complexes_pathways_NORM2_IBAQ.tsv.gz',
         'iBAQ', {}, True),
        ('yeast8', 'yeast8',
         'yeast8_quant_GFP_Proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
         'quant', {}, False),
        ('yeast10', 'yeast10',
         'yeast10_quant_RNA_MAPPED_complexes_pathways_NORM2.tsv.gz',
         'quant', {}, False),
        ('yeast11', 'yeast11',
         'yeast11_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
         'quant', {}, False),
        ('yeast14', 'yeast14',
         'yeast14_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
         'quant_', {'condition': 'startswith'}, False),
        ('yeast16', 'yeast16',
         'yeast16_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
         'quant_', {'condition': 'startswith'}, False),
        ('yeast18', 'yeast18',
         'yeast18_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
         'quant_', {'condition': 'startswith'}, False),
        ('yeast19', 'yeast19',
         'yeast19_quant_transcriptome_MAPPED_complexes_pathways_NORM2.tsv.gz',
         'quant_', {'condition': 'startswith'}, False),
        ('yeast20', 'yeast20',
         'yeast20_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz',
         'quant_', {'condition': 'startswith'}, False),
        ('yeast21', 'yeast21',
         'yeast21_quant_proteome_MAPPED_complexes_pathways_NORM2_REP.tsv.gz',
         'quant_', {'condition': 'startswith'}, False),
    )

    @staticmethod
    def _as_bool(value):
        """Interpret a flag that may be a real boolean or its textual form.

        Strings such as ``'True'``, ``'true'``, ``'1'``, ``'yes'`` or ``'y'``
        count as true and any other string as false; non-string values go
        through ``bool()``. This lets flags taken verbatim from ``sys.argv``
        behave as expected.
        """
        if isinstance(value, str):
            return value.strip().lower() in ('true', '1', 'yes', 'y')
        return bool(value)

    @staticmethod
    def execute(**kwargs):
        """Run the selected stages of the pipeline.

        Keyword arguments:
            folder: directory prefix for all input and output files
                (default ``'PATH'``); must end with a path separator.
            prepare_string_dictionary: build and pickle the interactor
                dictionaries from the STRING table (default ``False``).
            collect_corr_values: compute and pickle the correlation values for
                every dataset (default ``False``).
            visualize: create the PDF figures (default ``True``).

        Flags may be booleans or strings (see :meth:`_as_bool`). Any other
        keyword argument is accepted and ignored. Returns ``None``.
        """
        folder = kwargs.get('folder', 'PATH')
        prepare_string_dictionary = stringData_yeast._as_bool(
            kwargs.get('prepare_string_dictionary', False))
        collect_corr_values = stringData_yeast._as_bool(
            kwargs.get('collect_corr_values', False))
        visualize = stringData_yeast._as_bool(kwargs.get('visualize', True))

        interactor_dict = None
        interactor_dict700 = None
        if prepare_string_dictionary:
            print('load_all_string_data_yeast')
            interactor_dict, interactor_dict700 = stringData_yeast.load_all_string_data_yeast(folder)
        if collect_corr_values:
            # Reuse the dictionaries when they were just built; only read the
            # pickles when the preparation stage was skipped.
            if interactor_dict is None or interactor_dict700 is None:
                print('load_interactor_dictionaries')
                interactor_dict, interactor_dict700 = stringData_yeast.load_interactor_dictionaries(folder)
            print('collect_corr_yeast_datasets')
            stringData_yeast.collect_corr_yeast_datasets(folder, interactor_dict, interactor_dict700)
        if visualize:
            stringData_yeast.iterate_visualisation(folder)
            print('VISUALIZATION: FINISHED')

    @staticmethod
    def load_interactor_dictionaries(folder):
        """Read both pickled interactor dictionaries from ``folder``.

        Returns the tuple ``(interactor_dict, interactor_dict700)`` as written
        by :meth:`load_all_string_data_yeast`.
        """
        interactor_dict = DataFrameAnalyzer.read_pickle(folder + '4932.interaction.dict.allScores.pkl')
        interactor_dict700 = DataFrameAnalyzer.read_pickle(folder + '4932.interaction.dict.700Scores.pkl')
        return interactor_dict, interactor_dict700

    @staticmethod
    def _build_interactor_dict(data):
        """Map every value of ``data.pro1`` to the list of its ``pro2`` partners.

        Partners keep the row order of ``data``. A single grouped pass replaces
        one full-table scan per protein. The running index and protein name are
        printed as progress output.
        """
        interactors_by_protein = dict()
        for i, (protein, sub) in enumerate(data.groupby('pro1', sort=False)):
            print(i, protein)
            interactors_by_protein[protein] = list(sub.pro2)
        return interactors_by_protein

    @staticmethod
    def load_all_string_data_yeast(folder):
        """Build and pickle the interactor dictionaries from the STRING links table.

        Reads the space-separated file ``4932.protein.links.detailed.v10.5.txt``
        from ``folder``, writes ``4932.interaction.dict.allScores.pkl`` (all
        rows) and ``4932.interaction.dict.700Scores.pkl`` (rows with
        ``combined_score >= 700``) and returns both dictionaries as
        ``(interactor_dict, interactor_dict700)``.
        """
        fname = '4932.protein.links.detailed.v10.5.txt'
        data = pd.read_csv(folder + fname, sep=' ')
        # Keep the second dot-separated field of each identifier
        # (presumably '<taxon id>.<protein name>' -- verify against the file).
        data['pro1'] = [identifier.split('.')[1] for identifier in data.protein1]
        data['pro2'] = [identifier.split('.')[1] for identifier in data.protein2]

        interactor_dict = stringData_yeast._build_interactor_dict(data)
        DataFrameAnalyzer.to_pickle(interactor_dict, folder + '4932.interaction.dict.allScores.pkl')

        high_score_data = data[data.combined_score >= 700]
        interactor_dict700 = stringData_yeast._build_interactor_dict(high_score_data)
        DataFrameAnalyzer.to_pickle(interactor_dict700, folder + '4932.interaction.dict.700Scores.pkl')
        return interactor_dict, interactor_dict700

    @staticmethod
    def _filter_corr_values(values):
        """Keep only values strictly between -1 and 1 that pass ``utilsFacade.finite``.

        The number of remaining values is printed after each of the three
        filtering steps. NaN entries fail the ``< 1`` comparison and are
        therefore dropped by the first step. Returns a list.
        """
        values = np.array(values)
        values = values[values < 1]
        print(len(values))
        values = utilsFacade.finite(list(values))
        print(len(values))
        values = np.array(values)
        values = values[values > -1]
        print(len(values))
        return list(values)

    @staticmethod
    def get_string_and_other_correlations(folder, data, quant_cols, interactor_dict, interactor_dict700, name):
        """Collect correlation values for interactors, high-score interactors and other proteins.

        Parameters:
            folder: output directory prefix (must end with a path separator).
            data: table indexed by protein name; only ``quant_cols`` are used.
            quant_cols: columns of ``data`` holding the quantitative values.
            interactor_dict: protein -> interactors (all links).
            interactor_dict700: protein -> interactors (``combined_score >= 700``).
            name: dataset name used as prefix of the output pickle.

        For every dictionary key present in ``data.index`` the correlation of
        that protein with its available interactors ('string' / 'string700')
        and with every remaining protein ('other') is collected. Values are
        then filtered with :meth:`_filter_corr_values`, which among others
        removes the self-correlation of exactly 1.

        The result ``{'string': [...], 'string700': [...], 'other': [...]}`` is
        pickled to ``folder + name + '.string.corrs.pkl'`` and returned.
        """
        quant_data = data[quant_cols]
        available_proteins = set(quant_data.index)
        # Proteins become columns so that DataFrame.corr() correlates proteins.
        profiles = quant_data.T

        string_corr_values = list()
        other_corr_values = list()
        for p, protein in enumerate(interactor_dict.keys()):
            print(p)
            if protein not in available_proteins:
                continue
            interactors = set(list(interactor_dict[protein]) + [protein])
            available_interactors = list(available_proteins.intersection(interactors))
            non_interactors = list(available_proteins.difference(interactors))
            string_corrData = profiles[available_interactors].corr()
            other_corrData = profiles[[protein] + non_interactors].corr()
            # NOTE(review): unlike the score>=700 loop below, there is no
            # pd.Series guard here (it was commented out in the original);
            # confirm whether duplicated index labels can occur in the inputs.
            string_corr_values.append(list(string_corrData[protein]))
            other_corr_values.append(list(other_corrData[protein]))
        string_corr_values = utilsFacade.flatten(string_corr_values)
        other_corr_values = utilsFacade.flatten(other_corr_values)
        string_corr_values = stringData_yeast._filter_corr_values(string_corr_values)
        other_corr_values = stringData_yeast._filter_corr_values(other_corr_values)

        string700_corr_values = list()
        for p, protein in enumerate(interactor_dict700.keys()):
            print(p)
            if protein not in available_proteins:
                continue
            interactors = set(list(interactor_dict700[protein]) + [protein])
            available_interactors = list(available_proteins.intersection(interactors))
            string_corrData = profiles[available_interactors].corr()
            protein_corrs = string_corrData[protein]
            # Selecting a label that occurs more than once yields a DataFrame
            # instead of a Series; such proteins are skipped.
            if isinstance(protein_corrs, pd.Series):
                string700_corr_values.append(list(protein_corrs))
        string700_corr_values = utilsFacade.flatten(string700_corr_values)
        string700_corr_values = stringData_yeast._filter_corr_values(string700_corr_values)

        yeast_corrs = {'string': string_corr_values,
                       'string700': string700_corr_values,
                       'other': other_corr_values}
        DataFrameAnalyzer.to_pickle(yeast_corrs, folder + name + '.string.corrs.pkl')
        return yeast_corrs

    @staticmethod
    def collect_corr_yeast_datasets(folder, interactor_dict, interactor_dict700):
        """Run :meth:`get_string_and_other_correlations` on every dataset in ``_DATASETS``.

        Each dataset is loaded with ``DataFrameAnalyzer.getFile``, its
        quantitative columns are selected with ``utilsFacade.filtering`` and
        the resulting correlations are pickled by the called method.
        Returns ``None``.
        """
        for name, label, fname, pattern, filter_kwargs, drop_last in stringData_yeast._DATASETS:
            data = DataFrameAnalyzer.getFile(folder, fname)
            quant_cols = utilsFacade.filtering(list(data.columns), pattern, **filter_kwargs)
            if drop_last:
                quant_cols = quant_cols[:-1]
            print('get_string_and_other_correlations: ' + label)
            stringData_yeast.get_string_and_other_correlations(folder, data,
                                                               quant_cols, interactor_dict,
                                                               interactor_dict700, name)

    @staticmethod
    def _mean_subsampled_pvalue(reference, target, n_iter=999, sample_size=1000):
        """Return the mean Mann-Whitney U p-value over repeated random subsamples.

        In each of ``n_iter`` rounds up to ``sample_size`` values are drawn
        without replacement from ``reference`` and from ``target``. Groups
        holding fewer values are used in full instead of raising ``ValueError``.
        """
        n_reference = min(sample_size, len(reference))
        n_target = min(sample_size, len(target))
        pval_distribution = list()
        for _ in range(n_iter):
            pval = scipy.stats.mannwhitneyu(random.sample(reference, n_reference),
                                            random.sample(target, n_target))[1]
            pval_distribution.append(pval)
        return np.mean(pval_distribution)

    @staticmethod
    def yeast_string_visualisation_signal(folder, name):
        """Plot the three correlation distributions of one dataset and save them as PDF.

        Reads ``folder + name + '.string.corrs.pkl'``. The upper panel shows the
        densities of 'other' (grey), 'string' (#F9A22C) and 'string700'
        (#EC7723) together with a legend built from two subsampled
        Mann-Whitney U p-values ('other' vs 'string' and 'other' vs
        'string700'). The lower panel shows the matching horizontal box plots.
        The figure is written to
        ``folder + name + '_quant_proteome_STRINGcheckup.pdf'``.
        """
        yeast_dict = DataFrameAnalyzer.read_pickle(folder + name + '.string.corrs.pkl')
        string_corrs = yeast_dict['string']
        string700_corrs = yeast_dict['string700']
        other_corrs = yeast_dict['other']

        sns.set(context='notebook', style='white',
                palette='deep', font='Liberation Sans', font_scale=1,
                color_codes=False, rc=None)
        plt.rcParams["axes.grid"] = True
        plt.clf()
        fig = plt.figure(figsize=(5, 5))
        gs = gridspec.GridSpec(10, 10)

        ax = plt.subplot(gs[0:7, 0:])
        plottingFacade.func_plotDensities_border(ax, other_corrs, facecolor='grey')
        plottingFacade.func_plotDensities_border(ax, string_corrs, facecolor='#F9A22C')
        plottingFacade.func_plotDensities_border(ax, string700_corrs, facecolor='#EC7723')
        ax.set_xlim(-1, 1)
        ax.set_xticklabels([])

        pval1 = stringData_yeast._mean_subsampled_pvalue(other_corrs, string_corrs)
        # The second legend entry carries the 'string700' colour, so it has to
        # be tested against the score>=700 values rather than 'string' again.
        pval2 = stringData_yeast._mean_subsampled_pvalue(other_corrs, string700_corrs)
        plottingFacade.make_full_legend(ax, [pval1, pval2], ['#F9A22C', '#EC7723'])

        ax = plt.subplot(gs[7:, 0:])
        dataList = [string700_corrs, string_corrs, other_corrs]
        bp = ax.boxplot(dataList, notch=0, sym="", vert=0, patch_artist=True, widths=(0.5, 0.5, 0.5))
        plt.setp(bp['medians'], color="black")
        plt.setp(bp['whiskers'], color="black", linestyle="-")
        # Box colours follow the order of dataList.
        for patch, facecolor in zip(bp['boxes'], ["#EC7723", "#F9A22C", "grey"]):
            patch.set_facecolor(facecolor)
            patch.set_edgecolor("black")
            patch.set_alpha(1)
        ax.set_xlim(-1, 1)
        ax.set_yticklabels([])
        ax.set_xticklabels([])
        plt.savefig(folder + name + '_quant_proteome_STRINGcheckup.pdf',
                    bbox_inches='tight', dpi=300)
        # Release the figure; otherwise one stays open per plotted dataset.
        plt.close(fig)

    @staticmethod
    def iterate_visualisation(folder):
        """Create the figure for every dataset number in the list below."""
        # NOTE(review): 'yeast8' gets its correlations collected but is not
        # plotted here -- confirm whether that omission is intended.
        name_list = [3, 4, 5, 10, 11, 14, 16, 18, 19, 20, 21]
        for name in name_list:
            print('yeast_string_visualisation_signal: yeast' + str(name))
            stringData_yeast.yeast_string_visualisation_signal(folder, 'yeast' + str(name))
if __name__ == "__main__":
    # Expected arguments:
    #   FOLDER OUTPUT_FOLDER VISUALIZE COLLECT_CORR_VALUES PREPARE_STRING_DICTIONARY
    if len(sys.argv) < 6:
        sys.exit('usage: ' + sys.argv[0] +
                 ' FOLDER OUTPUT_FOLDER VISUALIZE COLLECT_CORR_VALUES PREPARE_STRING_DICTIONARY')
    # Command-line arguments arrive as strings; convert the three flags to real
    # booleans before handing them over ('true', '1', 'yes', 'y' mean True).
    truthy = ('true', '1', 'yes', 'y')
    visualize, collect_corr_values, prepare_string_dictionary = [
        argument.strip().lower() in truthy for argument in sys.argv[3:6]]
    s = stringData_yeast.execute(folder=sys.argv[1], output_folder=sys.argv[2],
                                 visualize=visualize,
                                 collect_corr_values=collect_corr_values,
                                 prepare_string_dictionary=prepare_string_dictionary)
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource are Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.