STEP 8: Python Code
class file_Loader:
@staticmethod
def get_data(**kwargs):
folder = kwargs.get('folder','PATH')
stoch_gygi3 = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
stoch_gygi1 = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi1.tsv.gz')
stoch_mann = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_mann.tsv.gz')
stoch_battle_protein = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_battle_protein.tsv.gz')
stoch_tcgaBreast = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaBreast.tsv.gz')
stoch_tcgaOvarian = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaOvarian.tsv.gz')
stoch_tcgaColorCancer = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaColoCancer.tsv.gz')
data_dict = {'gygi3':stoch_gygi3,
'gygi1':stoch_gygi1,
'mann':stoch_mann,
'battle_protein':stoch_battle_protein,
'tcgaBreast':stoch_tcgaBreast,
'tcgaOvarian':stoch_tcgaOvarian,
'tcgaColoCancer':stoch_tcgaColorCancer}
return data_dict
@staticmethod
def load_complex_dictionary(**kwargs):
folder = kwargs.get('folder','PATH')
complexDict = DataFrameAnalyzer.read_pickle(folder + 'complex_dictionary.pkl')
return complexDict
class step8_preparation:
@staticmethod
def execute(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
print('select_complexes for figure presentation; or iterate through all possible complexes')
relevant_complexes = step8_preparation.select_complexes(folder = folder)
@staticmethod
def select_complexes(**kwargs):
folder = kwargs.get('folder','PATH')
data_dict = file_Loader.get_data(folder = folder)
name_list = ['gygi3','gygi1','battle_protein','mann_all_log2',
'tcgaBreast','tcgaOvarian', 'tcgaColoCancer']
data_list = [data_dict['gygi3'], data_dict['gygi1'], data_dict['battle_protein'],
data_dict['mann'], data_dict['tcgaBreast'], data_dict['tcgaOvarian'],
data_dict['tcgaColoCancer']]
complexSet = list()
for d, n in zip(data_list, name_list):
complexSet.append(set(d.ComplexName))
complexSet = set.union(*complexSet)
complexSet = list(complexSet)
#for visualization purpose show the dissection for some stable/variable complexes
relevant_complexes = list()
for complex_id in complexSet:
if complex_id.find('COP')!=-1 or complex_id.find('26S')!=-1 or complex_id.find('eIF4F')!=-1 or complex_id.find('F0')!=-1 or complex_id.find('NPC')!=-1 or complex_id.find('NuRD')!=-1 or complex_id.find('RSC')!=-1 or complex_id.find('mediator')!=-1 or complex_id.find('HAT')!=-1:
relevant_complexes.append(complex_id)
o = open(folder + 'figure4a_relevant_complexes.tsv', 'w')
export_text = '\n'.join(relevant_complexes)
o.write(export_text)
o.close()
return relevant_complexes
class step8_stats_check:
@staticmethod
def execute(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
print('check_on_overall_agreement_between_datasets')
mean_pval = step8_stats_check.check_on_overall_agreement_between_datasets(folder = folder)
@staticmethod
def check_on_overall_agreement_between_datasets(**kwargs):
folder = kwargs.get('folder','PATH')
fname = 'fig4_stability_complexSubunits.txt'
data = DataFrameAnalyzer.getFile(folder, fname)
complex_list = list(set(data.index))
datasets = list(set(data['dataset of origin']))
all_complex_corrs = list()
all_rand_complex_corrs = list()
for complex_id in complex_list:
print(complex_id)
sub = data.loc[complex_id]
complex_dict = dict((e1,dict()) for e1 in datasets)
for d in datasets:
dsub = sub[sub['dataset of origin']==d]
temp_df = dsub[['z-score']]
temp_df.index = dsub.Protein
temp_dict = temp_df['z-score'].to_dict()
complex_dict[d] = temp_dict
complex_df = pd.DataFrame(complex_dict)
#reshuffle dataset to extract random correlations
table_list = list()
for i,row in complex_df.iterrows():
temp = list(row)
np.random.shuffle(temp)
table_list.append(temp)
rand_data = pd.DataFrame(table_list)
rand_data = rand_data.T
table_list = list()
for i,row in rand_data.iterrows():
temp = list(row)
np.random.shuffle(temp)
table_list.append(temp)
rand_data = pd.DataFrame(table_list)
rand_data = rand_data.T
complex_corrs = utilsFacade.get_correlation_values(complex_df.corr())
rand_complex_corrs = utilsFacade.get_correlation_values(rand_data.corr())
all_complex_corrs.append(complex_corrs)
all_rand_complex_corrs.append(rand_complex_corrs)
all_complex_corrs = utilsFacade.flatten(all_complex_corrs)
all_rand_complex_corrs = utilsFacade.flatten(all_rand_complex_corrs)
#check normality of distribution with Shapiro-test
w_true, p_true = scipy.stats.shapiro(all_complex_corrs)
w_rand, p_rand = scipy.stats.shapiro(all_rand_complex_corrs)
print(w_true, w_rand)
pvalues = list()
for i in np.arange(1,100):
pvalues.append(scipy.stats.ttest_ind(random.sample(all_complex_corrs, 1000), random.sample(all_rand_complex_corrs, 1000))[1])
mean_pvalue = np.mean(pvalues)#5.66x10-17
return mean_pvalue
@staticmethod
def calculate_enrichment_for_variable_member(df1):
#df1 calculated in step8_figures
df1 = df1.drop('complex_id', axis = 1)
pvalue_dict = dict()
for col in list(df1.columns)[:-1]:
zscores = list(df1[col])
pvalues = list()
for item in zscores:
pvalues.append(statsFacade.zscore_to_pvalue(item)[0])
pvalue_dict[col] = pvalues
pvalue_df = pd.DataFrame(pvalue_dict)
pvalue_df.index = df1.index
pvalue_dict = dict()
for protein in list(df1.index):
sub = np.array(df1.loc[protein])
temp = sub[sub >= 1]
diff_proteins = list(set(df1.index).difference(set(protein)))
rest_sub = df1.T[diff_proteins].T
rest_var = 0
rest_stab = 0
for protein1 in list(rest_sub.index):
rsub = np.array(df1.loc[protein1])
rtemp = rsub[rsub >= 1]
rest_var += len(rtemp)
rest_stab += len(rsub) - len(rtemp)
table = [[len(temp),len(sub) - len(temp)],[rest_var, rest_stab]]
if len(temp)>0:
odds, pval = scipy.stats.fisher_exact(table)
print(protein, pval, odds)
pvalue_dict[protein] = pval
return pvalue_dict
@staticmethod
def gather_numbers(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
complexDict = file_Loader.load_complex_dictionary(folder = folder)
complex_data = DataFrameAnalyzer.getFile(folder,'wpc_figure2B_underlyingData_complexes_RNA_newData.tsv')
complex_set = list(set(complex_data.complex_id))
corr_dict = dict()
gold_corr_dict = dict()
for complexID in complex_set:
sub = complex_data[complex_data.complex_id==complexID]
quant_data = sub.drop(['complex_id','mean','p.value','median','label'],1)
if len(quant_data)>=5:
corr_data = quant_data.corr()
corr_values = utilsFacade.get_correlation_values(corr_data)
gold = ''
for complex_idName in complexDict:
altName = complexDict[complex_idName]['altName'][0]
if altName==complexID:
gold = complexDict[complex_idName]['goldComplex'][0]
break
if gold=='yes':
gold_corr_dict[complexID] = np.mean(corr_values)
corr_dict[complexID] = np.mean(corr_values)
label_list = list()
avCorr_list = list()
for k,v in zip(gold_corr_dict.keys(), gold_corr_dict.values()):
label_list.append(k)
avCorr_list.append(v)
avCorr_list, label_list = zip(*sorted(zip(avCorr_list, label_list)))
sns.set(context='notebook', style='white',
palette='deep', font='Liberation Sans', font_scale=1,
color_codes=False, rc=None)
plt.rcParams["axes.grid"] = False
plt.clf()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(list(xrange(len(avCorr_list))), avCorr_list, edgecolor = 'black', s = 40)
ax.set_ylabel('mean correlation')
plt.savefig(output_folder + 'consistency_figure2b_overview.pdf', bbox_inches = 'tight', dpi = 400)
################################################################################################
complex_set = list(set(complex_data.complex_id))
corr_dict = dict()
gold_corr_dict = dict()
var_dict = dict()
for complexID in complex_set:
sub = complex_data[complex_data.complex_id==complexID]
gold = ''
for complex_idName in complexDict:
altName = complexDict[complex_idName]['altName'][0]
if altName==complexID:
gold = complexDict[complex_idName]['goldComplex'][0]
break
if gold=='yes' and len(sub) >= 5:
var_sub = sub[sub['p.value'] < 0.1]
fraction = float(len(var_sub))/float(len(sub))*100
var_dict[complexID] = fraction
label_list = list()
var_list = list()
for k,v in zip(var_dict.keys(), var_dict.values()):
label_list.append(k)
var_list.append(v)
var_list,label_list = zip(*sorted(zip(var_list,label_list)))
var_df = pd.DataFrame({'var':var_list})
var_df.index = label_list
var_df.to_csv(output_folder + 'variation_dataframe_figure2B_fractionOfVariableComponents.tsv', sep = '\t')
class step8_figure4a:
@staticmethod
def execute(**kwargs):
folder = kwargs.get('folder','PATH')
do_ranking = kwargs.get('do_ranking', False)
output_folder = kwargs.get('output_folder','PATH')
print('FIGURE4A: main_figure4a_complex_intern_landscapes')
step8_figure4a.main_figure4a_complex_intern_landscapes(folder = folder, output_folder = output_folder)
@staticmethod
def main_figure4a_complex_intern_landscapes(**kwargs):
folder = kwargs.get('folder','PATH')
do_ranking = kwargs.get('do_ranking', False)
output_folder = kwargs.get('output_folder','PATH')
print('get_data')
data_dict = file_Loader.get_data(folder = folder)
name_list = ['gygi3','gygi1','battle_protein','mann_all_log2',
'tcgaBreast','tcgaOvarian', 'tcgaColoCancer']
data_list = [data_dict['gygi3'], data_dict['gygi1'], data_dict['battle_protein'],
data_dict['mann'], data_dict['tcgaBreast'], data_dict['tcgaOvarian'],
data_dict['tcgaColoCancer']]
print('iteration_complexes')
step8_figure4a.iteration_complexes(data_list, name_list,
do_ranking = do_ranking,
output_folder = output_folder)
@staticmethod
def load_relevant_complexes(**kwargs):
folder = kwargs.get('folder','PATH')
com_data = DataFrameAnalyzer.getFile(folder, 'figure4a_relevant_complexes.tsv')
relevant_complexes = list(com_data.index)
return relevant_complexes
@staticmethod
def get_alternative_name(complex_id):
altName = complex_id
altName = '_'.join(altName.split(' '))
altName = altName.replace('/','')
altName = altName.replace(':','')
return altName
@staticmethod
def rank_dataset(df, proteins, name_list):
df_lst = list()
dfList = map(list,df.values)
for f in dfList:
rank_list = rankdata(f)
all_ranks = rankdata(utilsFacade.finite(f))
if len(all_ranks) > 0:
minimum_rank = all_ranks.min()
temp_list = list()
for fitem, rank in zip(f, rank_list):
if str(fitem) != 'nan':
temp_list.append(rank)
else:
temp_list.append(np.nan)
df_lst.append(temp_list)
df = pd.DataFrame(df_lst)
df.columns = proteins
df.index = name_list
return df
@staticmethod
def get_zscores(df):
myArray = np.array(df)
normalizedArray = []
for row in range(0, len(myArray)):
list_values = []
Min = min(utilsFacade.finite(list(myArray[row])))
Max = max(utilsFacade.finite(list(myArray[row])))
mean = np.mean(utilsFacade.finite(list(myArray[row])))
std = np.std(utilsFacade.finite(list(myArray[row])))
for element in myArray[row]:
list_values.append((element - mean)/std)
normalizedArray.append(list_values)
newArray = []
for row in range(0, len(normalizedArray)):
list_values = normalizedArray[row]
newArray.append(list_values)
new_df = pd.DataFrame(newArray)
new_df.columns = list(df.columns)
new_df.index = df.index
df = new_df.copy()
df = df.iloc[::-1]
dfList = map(list,df.values)
return df, dfList
@staticmethod
def manage_proteasome_data(df):
try:
df = df.drop(['PSD3','C9','C2','C6','RPN1'], axis = 1)
except:
remove_list = ['PSD3','C9','C2','C6','RPN1']
remove_list1 = [item[0] + item[1:].lower() for item in remove_list]
remove_list = remove_list + remove_list1
df = df[~df.index.isin(remove_list)]
return df
@staticmethod
def prepare_variance_dataframe(name_list, data_list, complex_df, proteins):
df_list = list()
for protein in proteins:
temp = list()
for d,n in zip(data_list, name_list):
sub = complex_df[complex_df.fileName==n]
if protein in list(sub.index):
if type(sub.loc[protein]) == pd.DataFrame:
quant_cols = utilsFacade.filtering(d, 'quant_', condition = 'startswith')
s = sub.loc[protein][quant_cols].T
coverage_list = list()
for c,col in enumerate(list(s.columns)):
temp_finite = utilsFacade.finite(list(np.array(s)[:,c]))
tempSmall = list(np.array(s)[:,c])
coverage_list.append(float(len(temp_finite))/float(len(tempSmall))*100)
s = sub.loc[protein]
s['coverage'] = pd.Series(coverage_list, index = s.index)
s = s.sort_values('coverage', ascending = False)
var_value = s.iloc[0]['relative_variance']
temp.append(var_value)
else:
var_value = sub.loc[protein]['relative_variance']
temp.append(var_value)
else:
temp.append(np.nan)
df_list.append(temp)
df = pd.DataFrame(df_list)
df.index = proteins
df.columns = name_list
df = df.dropna(thresh = int(len(df.columns)/2.0))
df = df.T
return df
@staticmethod
def plot_heatmap(df, complex_id, altName, **kwargs):
output_folder = kwargs.get('output_folder','PATH')
sns.set(context='notebook', style='white',
palette='deep', font='Liberation Sans', font_scale=1,
color_codes=False, rc=None)
plt.rcParams["axes.grid"] = False
sc, color_list = colorFacade.get_specific_color_gradient(plt.cm.RdBu_r,
np.array(utilsFacade.finite(utilsFacade.flatten(map(list, df.values)))),
vmin = -2, vmax = 2)
plt.clf()
if complex_id.find('26S')!=-1 or complex_id.find('NPC')!=-1:
fig = plt.figure(figsize = (10,3))
else:
fig = plt.figure(figsize = (5,3))
ax = fig.add_subplot(111)
mask = df.isnull()
mask1 = mask.copy()
mask1 = mask1.replace(True,'bla')
mask1 = mask1.replace(False,'20')
mask1 = mask1.replace('bla',1)
mask1 = mask1.replace('20',0)
sns.heatmap(mask1, cmap = ['grey'], linewidth = 0.2, cbar = False, xticklabels = [], yticklabels = [])
sns.heatmap(df, cmap = plt.cm.RdBu_r, linewidth = 0.2, mask = mask, vmin = -2, vmax = 2)
plt.savefig(output_folder + 'fig4a_' + altName + '_heatmap_subunits.pdf', bbox_inches = 'tight', dpi = 400)
@staticmethod
def iteration_complexes(data_list, name_list, **kwargs):
do_ranking = kwargs.get('do_ranking', False)
output_folder = kwargs.get('output_folder','PATH')
relevant_complexes = step8_figure4a.load_relevant_complexes()
concatanated_complex_dfs = list()
for complex_id in relevant_complexes:
print('*************************')
print(complex_id)
altName = step8_figure4a.get_alternative_name(complex_id)
concat_list = list()
for d,n in zip(data_list, name_list):
sub = d[d.ComplexName == complex_id]
if len(sub) > 0:
sub['fileName'] = pd.Series([n]*len(sub), index = sub.index)
concat_list.append(sub)
complex_df = pd.concat(concat_list)
complex_df.index = [item.upper() for item in list(complex_df.index)]
proteins = list(set(complex_df.index))
df = step8_figures.prepare_variance_dataframe(name_list, data_list, complex_df, proteins)
if do_ranking == True:
df = step8_figures.rank_dataset(df, proteins, name_list)
df, dfList = step8_figures.get_zscores(df)
if complex_id.find('26S')!=-1:
df = step8_figures.manage_proteasome_data(df)
protein_list = list(df.columns)
label_list = list()
median_list = list()
for c,col in enumerate(df.columns):
median_list.append(np.mean(utilsFacade.finite(list(np.array(df)[:,c]))))
label_list.append(col)
median_list, label_list = zip(*sorted(zip(median_list, label_list), reverse = False))
df = df[list(label_list)]
if complex_id.find('Kornberg')!=-1:
complex_id = 'Kornbergs mediator (SRB) complex'
df1 = df.T.copy()
df1['complex_id'] = pd.Series([complex_id]*len(df1), index = df1.index)
concatanated_complex_dfs.append(df1)
print('plot_heatmap')
print('*************************')
figure4a.plot_heatmap(df, complex_id, altName, output_folder = output_folder)
complex_data = pd.concat(concatanated_complex_dfs)
complex_data.to_csv(output_folder + 'fig4a_underlyingData_complexes_RNA_newData.tsv',
sep = '\t')
return complex_data
class step8_figure4b:
@staticmethod
def execute(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
print('FIGURE4B: main_figure4b_26SProteasome_analysis')
step8_figure4b.main_figure4b_26SProteasome_analysis()
@staticmethod
def main_figure4b_26SProteasome_analysis(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
print("load_boxplot_data")
sig_data_gygi3 = step8_figure4b.load_boxplot_data(folder = folder)
print('plot_boxplot_complexVariance')
figure4b.plot_boxplot_complexVariance(sig_data_gygi3, folder = folder, output_folder = output_folder)
@staticmethod
def load_boxplot_data(**kwargs):
folder = kwargs.get('folder','PATH')
data = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
sig_data_gygi3 = data[data.ComplexID=="26S Proteasome"]
return sig_data_gygi3
@staticmethod
def plot_boxplot_complexVariance(sig_data, **kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
dataset = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
dataset = dataset[dataset.ComplexID=="26S Proteasome"]
quantCols = utilsFacade.get_quantCols(sig_data)
dataset = dataset[quantCols]
proteinList = list(dataset.index)
dataset.index = proteinList
dataset = dataset.drop_duplicates()
complexID = "26S Proteasome"
sub = sig_data[sig_data.ComplexID==complexID]
sub = sub.sort_values("relative_variance")
pvalueDict = sub.to_dict()["levene_pval.adj"]
varianceDict = sub.to_dict()["relative_variance"]
mean_variance = np.mean(utilsFacade.finite(sub["relative_variance"]))
stateDict = sub.to_dict()["state"]
quant_sub = sub[quantCols].T
dsub = dataset.T[quant_sub.columns]
dsub = dsub.T.drop_duplicates()
dsub = dsub.T
original_dataList = list()
dataList1 = list()
proteins1 = list()
dataList2 = list()
proteins = list()
proteins2 = ["Psmb7","Psmd10","Psmb6","Psmb5","Psmb8","Psmd9","Psmb9","Psmb10"]
for c,col in enumerate(dsub.columns):
if col!="Psma8" and col!="Psmd4":#not enough datapoints/coverage
if col in proteins2:
dataList2.append(utilsFacade.finite(dsub.iloc[:,c]))
proteins.append(col)
else:
dataList1.append(utilsFacade.finite(dsub.iloc[:,c]))
proteins1.append(col)
proteins2 = proteins
immuno_proteasome = ["Psmb5","Psmb6","Psmb7","Psmb8","Psmb9","Psmb10"]
sns.set(context='notebook', style='white',
palette='deep', font='Liberation Sans', font_scale=1,
color_codes=False, rc=None)
plt.rcParams["axes.grid"] = True
plt.clf()
fig = plt.figure(figsize=(15,6))
gs = gridspec.GridSpec(3,3)
ax = plt.subplot(gs[0:,0:2])
ax.axhline(1, color = 'k', linestyle = '--')
ax.axhline(0.5, color = 'k', linestyle = '--')
ax.axhline(-1, color = 'k', linestyle = '--')
ax.axhline(0, color = 'k', linestyle = '--')
ax.axhline(-0.5, color = 'k', linestyle = '--')
bp=ax.boxplot(dataList1,notch=0,sym="",vert=1,patch_artist=True,widths=[0.8]*len(dataList1))
plt.setp(bp['medians'], color="black")
plt.setp(bp['whiskers'], color="black")
plt.setp(bp['whiskers'], color="black")
for i,patch in enumerate(bp['boxes']):
protein=proteins1[i]
x = numpy.random.normal(i+1, 0.04, size=len(dataList1[i]))
if protein in immuno_proteasome:
patch.set_facecolor("orange")
patch.set_edgecolor("orange")
else:
patch.set_facecolor("grey")
patch.set_edgecolor("grey")
patch.set_alpha(0.8)
plt.xticks(list(xrange(len(proteins1)+1)))
ax.set_xlim(0,len(proteins1)+1)
ax.set_xticklabels([""]+proteins1,fontsize=13,rotation=90)
ax.set_ylabel("Relative Abundance",fontsize=13)
plt.tick_params(axis="y",which="both",bottom="off",top="off",labelsize=12)
complex_name = complexID.replace("/","_")
complex_name = complex_name.replace(":","_")
ax.set_ylim(-1.5,1.5)
ax = plt.subplot(gs[0:,2:])
ax.axhline(1, color = 'k', linestyle = '--')
ax.axhline(0.5, color = 'k', linestyle = '--')
ax.axhline(-1, color = 'k', linestyle = '--')
ax.axhline(-0.5, color = 'k', linestyle = '--')
ax.axhline(0, color = 'k', linestyle = '--')
bp = ax.boxplot(dataList2,notch=0,sym="",vert=1,patch_artist=True,widths=[0.8]*len(dataList2))
plt.setp(bp['medians'], color="black")
plt.setp(bp['whiskers'], color="black")
plt.setp(bp['whiskers'], color="black")
for i,patch in enumerate(bp['boxes']):
protein = proteins2[i]
x = numpy.random.normal(i+1, 0.04, size=len(dataList2[i]))
if protein in immuno_proteasome:
patch.set_facecolor("orange")
ax.scatter(x,dataList2[i],color='white', alpha=0.9,edgecolor="brown",s=20)
elif protein.find("a")!=-1 or protein.find("b")!=-1:
patch.set_facecolor("#95DCEC")
ax.scatter(x,dataList2[i],color='white', alpha=0.9,edgecolor="darkblue",s=20)
elif protein.find("c")!=-1 or protein.find("d")!=-1:
patch.set_facecolor("grey")
ax.scatter(x,dataList2[i],color='white', alpha=0.9,edgecolor="black",s=20)
patch.set_edgecolor("black")
patch.set_alpha(0.8)
plt.xticks(list(xrange(len(proteins2)+1)))
ax.set_xlim(0,len(proteins2)+1)
ax.set_xticklabels([""]+[item.upper() for item in proteins2],fontsize=13,rotation=90)
ax.set_yticklabels([])
plt.tick_params(axis="y",which="both",bottom="off",top="off",labelsize=12)
complex_name = complexID.replace("/","_")
complex_name = complex_name.replace(":","_")
ax.set_ylim(-1.5,1.5)
plt.savefig(output_folder + "fig4b_stochiometry_data_complexes_26SProteasome_gygi3.pdf",
bbox_inches="tight", dpi=400)
class step8_export:
@staticmethod
def execute(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
print('export_underlying_data_part1')
concat_df = step8_export.export_underlying_data_part1(folder = folder, output_folder = output_folder)
@staticmethod
def export_underlying_data_part1(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
stoch_gygi3 = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi3.tsv.gz')
stoch_gygi1 = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_gygi1.tsv.gz')
stoch_mann = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_mann.tsv.gz')
stoch_battle_protein = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_battle_protein.tsv.gz')
stoch_tcgaBreast = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaBreast.tsv.gz')
stoch_tcgaOvarian = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaOvarian.tsv.gz')
stoch_tcgaColorCancer = DataFrameAnalyzer.open_in_chunks(folder, 'complex_filtered_stoch_tcgaColoCancer.tsv.gz')
name_list = ['gygi3','gygi1','battle_protein','mann_all_log2', 'tcgaBreast',
'tcgaOvarian', 'tcgaColoCancer']
data_list = [stoch_gygi3, stoch_gygi1, stoch_battle_protein, stoch_mann_all_log2,
stoch_tcgaBreast, stoch_tcgaOvarian,stoch_colo_cancer]
temp_columns = ['relative_variance','coverage','levene_pval','levene_pval.adj',
'state','phos','ac','sumo','ub','met','location_humanProteinAtlas',
'ComplexID','ComplexName']
names = list()
state_list = list()
zscore_list = list()
concat_list = list()
combined_list = list()
member_num_list = list()
combined_corr_list = list()
for name,d in zip(name_list, data_list):
quant_cols = utilsFacade.filtering(list(d.columns),'quant_')
quant_data = d[quant_cols]
coverage_list = list()
for i,row in quant_data.iterrows():
temp = utilsFacade.finite(list(row))
coverage_list.append(float(len(temp))/float(len(quant_cols))*100)
subdf = d[temp_columns]
subdf['coverage'] = pd.Series(coverage_list, index = subdf.index)
complex_list = list(set(subdf.ComplexName))
for complexID in complex_list:
print(name,complexID)
com_sub = subdf[subdf.ComplexName==complexID]
myArray = list(com_sub['relative_variance'])
try:
list_values = []
Min = min(utilsFacade.finite(myArray))
Max = max(utilsFacade.finite(myArray))
mean = np.mean(utilsFacade.finite(myArray))
std = np.std(utilsFacade.finite(myArray))
for element in myArray:
list_values.append((element - mean)/std)
except:
print('Failed_zscore')
combined = scipy.stats.combine_pvalues(list(com_sub['levene_pval']))[1]
combined_corr = scipy.stats.combine_pvalues(list(com_sub['levene_pval.adj']))[1]
concat_list.append(com_sub)
for i in xrange(len(com_sub)):
combined_list.append(combined)
combined_corr_list.append(combined_corr)
names.append(name)
member_num_list.append(len(com_sub))
if len(list_values)==0:
zscore_list.append(np.nan)
state_list.append('')
else:
zscore_list.append(list_values[i])
if list_values[i]>1:
state_list.append('variable')
elif list_values[i]<-1:
state_list.append('stable')
else:
state_list.append('')
concat_df = pd.concat(concat_list)
concat_df['member.num'] = pd.Series(member_num_list, index = concat_df.index)
concat_df['name'] = pd.Series(names, index = concat_df.index)
concat_df['combined_pvalue'] = pd.Series(combined_list, index = concat_df.index)
concat_df['combined_pvalCorr'] = pd.Series(combined_corr_list, index = concat_df.index)
corrs = statsFacade.correct_nan_pvalues(combined_corr_list)
concat_df['combined_pvalCorrFinal'] = pd.Series(corrs, index = concat_df.index)
concat_df['z-score'] = pd.Series(zscore_list, index = concat_df.index)
concat_df['new_state'] = pd.Series(state_list, index = concat_df.index)
concat_df.to_csv(output_folder + 'suppTable3_fig4_underlyingData_subunitStability.tsv', sep = '\t')
return concat_df
@staticmethod
def export_underlying_data_part2(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
filename = 'fig4_stability_complexSubunits.txt'
data = DataFrameAnalyzer.getFile(folder, filename)
comData_corr = DataFrameAnalyzer.getFile(folder, 'figure3_underlying_data_real_corrValues.tsv')
comData_zsco = DataFrameAnalyzer.getFile(folder, 'figure3_underlying_data_ranked_zscores.tsv')
comDict_corr = comData_corr.to_dict()
comDict_zsco = comData_zsco.to_dict()
corr_list = list()
zscore_list = list()
for i,row in data.iterrows():
dataset = row['dataset of origin']
complexID = i
try:
corr_value = comDict_corr[dataset][complexID]
except:
corr_value = ''
corr_list.append(corr_value)
data['complex_corr'] = pd.Series(corr_list, index = data.index)
data.to_csv(output_folder + 'fig4_with3_underlyingData_subunitStability.tsv', sep = '\t')
@staticmethod
def read_data(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
complex_data = DataFrameAnalyzer.getFile(folder,'wpc_figure2B_underlyingData_complexes_RNA_newData.tsv')
quant_data = complex_data.drop(['complex_id','mean','p.value','label'],1)
mean_vector = quant_data.T.mean()
median_vector = quant_data.T.median()
pvalues = list()
label_list = list()
for zscore in median_vector:
pval = utilsFacade.zscore_to_pvalue(zscore)[1]
pvalues.append(pval)
if pval < 0.1 and zscore > 0:
label_list.append('variable')
elif pval < 0.1 and zscore < 0:
label_list.append('stable')
else:
label_list.append('')
complex_data['median'] = pd.Series(list(median_vector), index = complex_data.index)
complex_data['p.value'] = pd.Series(pvalues, index = complex_data.index)
complex_data['label'] = pd.Series(label_list, index = complex_data.index)
complex_data.to_csv(output_folder + 'wpc_figure2B_underlyingData_complexes_RNA_newData.tsv', sep = '\t')
if __name__ == "__main__":
## EXECUTE STEP8
step8_preparation.execute(folder = sys.argv[1], output_folder = sys.argv[2])
step8_stats_check.execute(folder = sys.argv[1], output_folder = sys.argv[2])
step8_figure4a.execute(folder = sys.argv[1], output_folder = sys.argv[2])
step8_figure4b.execute(folder = sys.argv[1], output_folder = sys.argv[2])
step8_export.execute(folder = sys.argv[1], output_folder = sys.argv[2])
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource is Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.