STEP 13: Python Code
class step13_protein_variance_machine_learning(object):
def __init__(self, jobid, dataset, module, datatype,**kwargs):
self.folder = kwargs.get('folder','PATH')
self.jobid = int(jobid)-1
self.module = module
self.dataset = dataset
self.datatype = datatype
print('load_dictionary')
self.load_dictionary()
print('load_data')
self.data, self.name, self.subunit_names, self.relevant_filename = self.load_data()
print(self.relevant_filename)
r = self.run_allProteins_multivariate_analysis(folder = folder)
def load_dictionary(self):
folder = self.folder
datatype = self.datatype
file_list = os.listdir(folder)
json_list = filter(lambda a:a.endswith('.json')==True, file_list)
json_list = filter(lambda a:a.find(datatype)!=-1, json_list)
self.name_dict = json.load(open(folder + json_list[0], 'rb')) #53 is eif2b
def load_data(self):
jobid = self.jobid
folder = self.folder
module = self.module
dataset = self.dataset
datatype = self.datatype
print(datatype)
if module.find('essential')==-1 and module.find('housekeeping')==-1:
name_dict = self.name_dict
file_list = os.listdir(folder)
final_inputs = utilsFacade.filtering(final_inputs, module,condition = 'find')
if len(final_inputs)==0:
final_inputs = utilsFacade.filtering(input_list, module,
condition = 'startswith')
relevant_filename = utilsFacade.filtering(final_inputs, 'Num' + str(jobid) + '.')
if len(relevant_filename)==0:
return None
else:
print(relevant_filename[0])
m = DataFrameAnalyzer.getFile(folder, relevant_filename[0])
diet_cols = m.loc['diet'][:-1]
sex_cols = m.loc['sex'][:-1]
m = m[:-2].dropna(axis = 0, how='any')
m = m.T
m['sex'] = sex_cols
m['diet'] = diet_cols
m = m.T
subunit_names = [c for c in m.index if not c in {'sex', 'diet'}]
return m, name, subunit_names, relevant_filename[0]
def identify_dataset(self, **kwargs):
folder = kwargs.get('folder','PATH')
name = 'gygi3_entire_dataset.tsv.gz'
m = DataFrameAnalyzer.getFile(folder, name)
quantCols = utilsFacade.filtering(list(m.columns),'quant_')
sex_cols = [item.split('.')[0][-2] for item in quantCols]
diet_cols = [item.split('.')[0][-1] for item in quantCols]
data = m[quantCols].T
data['sex'] = pd.Series(sex_cols, index = data.index)
data['diet'] = pd.Series(diet_cols, index = data.index)
return data
def run_allProteins_multivariate_analysis(self, **kwargs):
folder = kwargs.get('folder','PATH')
m = self.identify_dataset(folder = folder)
complex_name = self.name
columns = list(m.columns)
subunit_names = list(m.columns)[0:-2]
passed_subunits = list()
idx_list = list()
for su in subunit_names:
c = passed_subunits.count(su)
idx_list.append(c)
passed_subunits.append(su)
columns = list()
for su,idx in zip(subunit_names, idx_list):
if idx != 0:
columns.append(str(su) + '.' + str(idx))
else:
columns.append(str(su))
m.columns = columns + ['sex','diet']
m['sex'] = [sex == 'M' for sex in m['sex']]
m['diet'] = [diet == 'H' for diet in m['diet']]
covariate_list = [['sex'], ['diet']]
m = m.convert_objects(convert_numeric=True)
print(m.shape)
nkfold = 10
counter = 0
table = list()
subunit_names = list(m.columns)[:-2]
if m.shape[1]>2:
for covariate in [['sex'],['diet'],['sex','diet']]:
for subunit in subunit_names:
print(subunit)
msub = m[[subunit] + covariate].dropna()
if len(msub)>10:
y_ref = np.array(msub[subunit])
X = np.array(msub[covariate])
kf = KFold(n_splits=nkfold, shuffle=True, random_state=500)
ki = 0
for train, test in kf.split(X):
X_train, X_test = X[train], X[test] #sex should be the explanatory variable
y_ref_train, y_ref_test = y_ref[train], y_ref[test] #subunits of complex/module in their abundance
clf = Ridge(random_state=500)
model = clf.fit(X_train,y_ref_train)
model_all = clf.fit(X, y_ref)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
coefficients = model.coef_
mean_se = mean_squared_error(y_ref_test,y_pred_test) #Mean squared error
r2_all = r2_score(y_ref, model_all.predict(X))
r2_train = r2_score(y_ref_train, y_pred_train)
r2_test = r2_score(y_ref_test, y_pred_test) #Variance score
explained_score_train = explained_variance_score(y_ref_train, y_pred_train)
explained_score_test = explained_variance_score(y_ref_test, y_pred_test)
explained_score_all = explained_variance_score(y_ref, model_all.predict(X))
t_stats_per_module = np.mean(model.coef_)/mean_se
p_per_module = 2 * (1 - scipy.stats.t.cdf(np.abs(t_stats_per_module), y_pred_test.shape[0] - X_test.shape[1]))
temp = [ki,complex_name,subunit,len(msub),'_'.join(covariate),
mean_se, r2_all, r2_train, r2_test, explained_score_all,
explained_score_train, explained_score_test,
t_stats_per_module, p_per_module]
table.append(temp)
res = pd.DataFrame(table, columns=['ki','complex.name','subunit','num',
'covariates','mean_se',
'r2.all.module','r2.train.module', 'r2.test.module',
'explained_variance_score.all.module',
'explained_variance_score.training.module',
'explained_variance_score.testing.module',
't_stats_per_module','pvalue_per_module'])
grouped_module = []
for idx, grp in res.groupby(['subunit', 'covariates']):
grouped_module.append(list(idx) + [list(grp['num'])[0]] +
[np.median(grp['mean_se']),
np.median(grp['r2.all.module']),
np.median(grp['r2.train.module']), np.median(grp['r2.test.module']),
np.median(grp['explained_variance_score.all.module']),
np.median(grp['explained_variance_score.training.module']),
np.median(grp['explained_variance_score.testing.module']),
np.median(grp['t_stats_per_module']), np.median(grp['pvalue_per_module'])])
res_module = pd.DataFrame(grouped_module, columns=['subunit', 'covariates','num',
'mean_se', 'r2.all.module',
'r2.train.module','r2.test.module', 'explained_variance_score.all.module',
'explained_variance_score.training.module',
'explained_variance_score.testing.module','t_stats_per_module','pvalue_per_module'])
res_module.to_csv(folder + 'protein_variation.tsv.gz',compression = 'gzip', sep ='\t')
class step13_preparation:
@staticmethod
def execute(**kwargs):
folder = kwargs.get('folder','PATH')
step = step13_protein_variance_machine_learning(1, 'gygi3','complex', 'quant', folder = folder)
class step13_figure:
@staticmethod
def execute(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
print('SUPP-FIGURE4A: main_suppFig5a_aebersold_liu_comparison')
step13_figure.main_suppFig4a_aebersold_liu_comparison(folder = folder, output_folder = output_folder)
@staticmethod
def main_suppFig4a_aebersold_liu_comparison(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
print('load_aebersold_liu_data')
data = step13_figure.load_aebersold_liu_data(folder = folder, output_folder = output_folder)
print('comparison_aebersold_liu_data')
step13_figure.comparison_aebersold_liu_data(data, folder = folder, output_folder = output_folder)
@staticmethod
def load_aebersold_liu_data(**kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
filename = 'aebersold_liu_twin_data.txt'
data = DataFrameAnalyzer.getFile(folder, filename)
data.index = data['Protein Symbol']
return data
@staticmethod
def comparison_aebersold_liu_data(aedata, **kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
fname = 'protein_variation.tsv.gz'
data = DataFrameAnalyzer.open_in_chunks(folder, fname)
data.index = [su.upper() for su in map(str,list(data['subunit']))]
sex_all_quant = data[data.covariates=='sex']
diet_all_quant = data[data.covariates=='diet']
overlapping_proteins = list(set(aedata.index).intersection(set(sex_all_quant.index)))
aedata = aedata.T[overlapping_proteins].T
print(aedata.shape)
heritability_dict = aedata['Heritability'].to_dict()
environment_dict = aedata['Individual environment'].to_dict()
sub_sex_data = sex_all_quant.loc[overlapping_proteins]
sex_dict = sub_sex_data['r2.all.module'].to_dict()
sub_diet_data = diet_all_quant.loc[overlapping_proteins]
diet_dict = sub_diet_data['r2.all.module'].to_dict()
ae_heritability_list = list()
ae_environment_list = list()
ou_gender_list = list()
ou_diet_list = list()
genetic_protein_list = list()
env_protein_list = list()
for protein in overlapping_proteins:
if heritability_dict[protein]>0 and environment_dict[protein]>0:
ou_gender_list.append(sex_dict[protein])
ae_heritability_list.append(heritability_dict[protein])
genetic_protein_list.append(protein)
ou_diet_list.append(diet_dict[protein])
ae_environment_list.append(environment_dict[protein])
env_protein_list.append(protein)
#51 proteins compared
genetic_df = pd.DataFrame({'gender':ou_gender_list,
'heritability':ae_heritability_list})
env_df = pd.DataFrame({'environment':ae_environment_list,'diet':ou_diet_list})
genetic_df.index = genetic_protein_list
env_df.index = env_protein_list
df = pd.DataFrame({'gender':ou_gender_list,
'heritability':ae_heritability_list,
'environment':ae_environment_list,
'diet':ou_diet_list})
df.index = env_protein_list
rand_corrs = list()
for i in np.arange(1,100):
table_list = list()
for i,row in df.iterrows():
temp = list(row)
np.random.shuffle(temp)
table_list.append(temp)
rand_data = pd.DataFrame(table_list)
rand_data = rand_data.T
table_list = list()
for i,row in rand_data.iterrows():
temp = list(row)
np.random.shuffle(temp)
table_list.append(temp)
rand_data = pd.DataFrame(table_list)
rand_data = rand_data.T
rand_corrs.append(utilsFacade.get_correlation_values(rand_data.corr(method = 'spearman')))
rand_corrs = utilsFacade.flatten(rand_corrs)
rand_corrs = np.array(rand_corrs)
corrData = df.corr(method = 'spearman')
corrData = corrData[['gender','diet']]
corrData = corrData.T[['heritability','environment']].T
pvalues = list()
for i,row in corrData.iterrows():
c1,c2 = list(row)
zscore1 = utilsFacade.getSampleZScore(rand_corrs, c1)
zscore2 = utilsFacade.getSampleZScore(rand_corrs, c2)
p1,p2 = utilsFacade.zscore_to_pvalue(zscore1)
temp = [p1]
p1,p2 = utilsFacade.zscore_to_pvalue(zscore2)
temp.append(p1)
pvalues.append(temp)
sns.set(context='notebook', style='white',
palette='deep', font='Liberation Sans', font_scale=1,
color_codes=False, rc=None)
plt.rcParams["axes.grid"] = False
plt.clf()
fig = plt.figure(figsize = (1,1))
ax = fig.add_subplot(111)
sns.heatmap(corrData, cmap = plt.cm.RdBu, linewidth = 0.5, cbar = False, annot = True)
plt.savefig(output_folder + 'suppFig5a_comparison_aebersold.pdf', bbox_inches = 'tight', dpi = 400)
#make a heatmap with ranking for supp
rank_list = list()
for col in list(df.columns):
rank_list.append(rankdata(list(df[col])))
ranked_df = pd.DataFrame(rank_list)
ranked_df = ranked_df.T
ranked_df.index = df.index
ranked_df.columns = df.columns
ranked_df, pro = utilsFacade.recluster_matrix_only_rows(ranked_df)
ranked_df = ranked_df.T
plt.clf()
fig = plt.figure(figsize = (20,3))
ax = fig.add_subplot(111)
sns.heatmap(ranked_df, cmap = plt.cm.coolwarm, linewidth = 1.0)
plt.savefig(output_folder + 'suppFig5a_comparison_aebersold_heatmap_rankedDF.pdf',
bbox_inches = 'tight', dpi = 400)
if __name__ == "__main__":
## EXECUTE STEP13
step13_preparation.execute(folder = sys.argv[1])
step13_figure.execute(folder = sys.argv[1], output_folder = sys.argv[2])
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource is Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.