STEP 20: Python Code
class genetic_environmental_effect_prep:
@staticmethod
def execute(**kwargs):
folder = kwargs.get('folder', 'PATH')
print('load_data')
data_dict = genetic_environmental_effect_prep.load_data(folder)
print('load_stoichiometry_data')
com_yeast_dict, pat_yeast_dict = genetic_environmental_effect_prep.load_stoichiometry_data(folder)
print('prepare_dataframes_complex_abundances')
complex_data_dict = genetic_environmental_effect_prep.prepare_dataframes_complex_abundances(data_dict)
print('prepare_dataframes_pathway_abundances')
pat_data_dict = genetic_environmental_effect_prep.prepare_dataframes_pathway_abundances(data_dict)
print('prepare_dataframes_complex_stoichiometries')
complexStoch_data_dict = genetic_environmental_effect_prep.prepare_dataframes_complex_stoichiometries(data_dict, com_yeast_dict)
print('prepare_dataframes_pathway_stoichiometries')
pathwayStoch_data_dict = genetic_environmental_effect_prep.prepare_dataframes_pathway_stoichiometries(data_dict, pat_yeast_dict)
print('prepare_dataframes_protein_abundances')
protein_data_dict = genetic_environmental_effect_prep.prepare_dataframes_protein_abundances(data_dict)
all_dictionaries = data_dict, complex_data_dict, pat_data_dict, complexStoch_data_dict, pathwayStoch_data_dict, protein_data_dict
print('gather_explained_variance_values')
genetic_environmental_effect_prep.gather_explained_variance_values(all_dictionaries, folder)
@staticmethod
def load_data(folder):
data_dict = dict()
fname = 'yeast3_quant_Proteome_carbonSources_MAPPED_complexes_pathways_NORM2.tsv.gz'
data = DataFrameAnalyzer.getFile(folder, fname)
quant_cols = utilsFacade.filtering(list(data.columns), 'rel. Intensity')
group1 = utilsFacade.filtering(quant_cols, 'RF')
group2 = utilsFacade.filtering(quant_cols, 'GAL')
groups = [group1, group2]
data_dict.setdefault('ENV_yeast3',[])
data_dict['ENV_yeast3'] = data, quant_cols, groups
###################################################################################
fname = 'yeast5_quant_Proteome_MAPPED_complexes_pathways_NORM2_IBAQ.tsv.gz'
data = DataFrameAnalyzer.getFile(folder, fname)
#quant_cols = utilsFacade.filtering(list(data.columns), 'LFQ intensity')
quant_cols = utilsFacade.filtering(list(data.columns), 'iBAQ')[:-1]
group1 = utilsFacade.filtering(quant_cols, 'DBVPG')
group2 = utilsFacade.filtering(quant_cols, 'UWOPS')
group3 = utilsFacade.filtering(quant_cols, 'YJM')
group4 = utilsFacade.filtering(quant_cols, 'YPS')
group5 = utilsFacade.filtering(quant_cols, 'YS')
groups = [set(group1), set(group2), set(group3), set(group4), set(group5)]
group6 = list(set(quant_cols).difference(set.union(*groups)))
groups = [group1, group2, group3, group4, group5, group6]
data_dict.setdefault('GEN_yeast5',[])
data_dict['GEN_yeast5'] = data, quant_cols, groups
###################################################################################
fname = 'yeast11_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz'
data = DataFrameAnalyzer.getFile(folder, fname)
quant_cols = utilsFacade.filtering(list(data.columns), 'quant')
group1 = utilsFacade.filtering(quant_cols, 'REF')
group2 = utilsFacade.filtering(quant_cols, 'EtOH')
group3 = utilsFacade.filtering(quant_cols, 'Osmo')
group4 = utilsFacade.filtering(quant_cols, 'Temp')
groups = [group1, group2, group3, group4]
data_dict.setdefault('ENV_yeast11',[])
data_dict['ENV_yeast11'] = data, quant_cols, groups
###################################################################################
fname = 'yeast14_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz'
data = DataFrameAnalyzer.getFile(folder, fname)
quant_cols = utilsFacade.filtering(list(data.columns), 'quant_', condition = 'startswith')
group1a = utilsFacade.filtering(quant_cols, 'BY')
group1b = utilsFacade.filtering(quant_cols, 'RM')
group1 = group1a + group1b
group2 = list(set(quant_cols).difference(set(group1)))
groups = [group1, group2]
data_dict.setdefault('GEN_yeast14',[])
data_dict['GEN_yeast14'] = data, quant_cols, groups
###################################################################################
fname = 'yeast20_quant_proteome_MAPPED_complexes_pathways_NORM2.tsv.gz'
data = DataFrameAnalyzer.getFile(folder, fname)
quant_cols = utilsFacade.filtering(list(data.columns), 'quant_', condition = 'startswith')
group1 = utilsFacade.filtering(quant_cols, '6 h')
group2 = utilsFacade.filtering(quant_cols, '3 days')
group3 = utilsFacade.filtering(quant_cols, '8 days')
group4 = utilsFacade.filtering(quant_cols, '9 days')
groups = [group1, group2, group3, group4]
data_dict.setdefault('ENV_yeast20',[])
data_dict['ENV_yeast20'] = data, quant_cols, groups
return data_dict
@staticmethod
def load_stoichiometry_data(folder):
com_yeast_dict = DataFrameAnalyzer.read_pickle(folder + 'modulewise_norm_complex_stoichiometry.pkl')
pat_yeast_dict = DataFrameAnalyzer.read_pickle(folder + 'modulewise_norm_pathway_stoichiometry.pkl')
return com_yeast_dict, pat_yeast_dict
@staticmethod
def prepare_dataframes_complex_abundances(data_dict):
complex_data_dict = dict((e1,dict()) for e1 in list(data_dict.keys()))
for key in data_dict.keys():
print(key)
data, qcols, groups = data_dict[key]
letters = ['a','b','c','d','e','f','g']
complexes = list(set(data.complexId))
all_complexes = list()
for com in complexes:
if str(com)!='nan':
for c in com.split(';'):
all_complexes.append(c)
all_complexes = list(set(all_complexes))
complex_dict = dict()
for complexId in all_complexes:
sub = data[data.complexId.str.contains(complexId, na = False, regex = False)]
if len(sub)>=4:
qsub = sub[qcols].T
group_indices = list()
for item in list(qsub.index):
for g,group in enumerate(groups):
if item in group:
group_indices.append(letters[g])
qsub['group'] = pd.Series(group_indices, index = qsub.index)
complex_dict.setdefault(complexId, [])
complex_dict[complexId] = qsub
complex_data_dict[key] = complex_dict
return complex_data_dict
@staticmethod
def prepare_dataframes_pathway_abundances(data_dict):
pat_data_dict = dict((e1,dict()) for e1 in list(data_dict.keys()))
for key in data_dict.keys():
print(key)
data, qcols, groups = data_dict[key]
letters = ['a','b','c','d','e','f','g']
complexes = list(set(data.pathway))
all_complexes = list()
for com in complexes:
if str(com)!='nan':
for c in com.split(';'):
all_complexes.append(c)
all_complexes = list(set(all_complexes))
complex_dict = dict()
for complexId in all_complexes:
sub = data[data.pathway.str.contains(complexId, na = False, regex = False)]
if len(sub)>=4:
qsub = sub[qcols].T
group_indices = list()
for item in list(qsub.index):
for g,group in enumerate(groups):
if item in group:
group_indices.append(letters[g])
qsub['group'] = pd.Series(group_indices, index = qsub.index)
complex_dict.setdefault(complexId, [])
complex_dict[complexId] = qsub
pat_data_dict[key] = complex_dict
return pat_data_dict
@staticmethod
def prepare_dataframes_protein_abundances(data_dict):
protein_data_dict = dict((e1,dict()) for e1 in list(data_dict.keys()))
for key in data_dict.keys():
print(key)
data, qcols, groups = data_dict[key]
letters = ['a','b','c','d','e','f','g']
all_proteins = list(data.index)
protein_dict = dict()
for p,protein in enumerate(all_proteins):
sub = data.iloc[p]
qsub = sub[qcols].T
qsub = qsub.to_frame()
group_indices = list()
for item in list(qsub.index):
for g,group in enumerate(groups):
if item in group:
group_indices.append(letters[g])
qsub['group'] = pd.Series(group_indices, index = qsub.index)
protein_dict.setdefault(protein, [])
protein_dict[protein] = qsub
protein_data_dict[key] = protein_dict
return protein_data_dict
@staticmethod
def prepare_dataframes_complex_stoichiometries(data_dict, com_yeast_dict):
complex_data_dict = dict((e1,dict()) for e1 in list(data_dict.keys()))
for key in data_dict.keys():
print(key)
ds = key.split('_')[1]
stoch_data = com_yeast_dict[ds]
data, qcols, groups = data_dict[key]
letters = ['a','b','c','d','e','f','g']
all_complexes = list(set(stoch_data.complexId))
complex_dict = dict()
for complexId in all_complexes:
sub = stoch_data[stoch_data.complexId.str.contains(complexId, na = False, regex = False)]
if len(sub)>=4:
qsub = sub[qcols].T
group_indices = list()
for item in list(qsub.index):
for g,group in enumerate(groups):
if item in group:
group_indices.append(letters[g])
qsub['group'] = pd.Series(group_indices, index = qsub.index)
complex_dict.setdefault(complexId, [])
complex_dict[complexId] = qsub
complex_data_dict[key] = complex_dict
return complex_data_dict
@staticmethod
def prepare_dataframes_pathway_stoichiometries(data_dict, pat_yeast_dict):
pathway_data_dict = dict((e1,dict()) for e1 in list(data_dict.keys()))
for key in data_dict.keys():
print(key)
ds = key.split('_')[1]
stoch_data = pat_yeast_dict[ds]
data, qcols, groups = data_dict[key]
letters = ['a','b','c','d','e','f','g']
all_complexes = list(set(stoch_data.complexId))
complex_dict = dict()
for complexId in all_complexes:
sub = stoch_data[stoch_data.complexId.str.contains(complexId, na = False, regex = False)]
if len(sub)>=4:
qsub = sub[qcols].T
group_indices = list()
for item in list(qsub.index):
for g,group in enumerate(groups):
if item in group:
group_indices.append(letters[g])
qsub['group'] = pd.Series(group_indices, index = qsub.index)
complex_dict.setdefault(complexId, [])
complex_dict[complexId] = qsub
pathway_data_dict[key] = complex_dict
return pathway_data_dict
@staticmethod
def gather_explained_variance_values(all_dictionaries, folder):
data_dict, complex_data_dict, pat_data_dict, complexStoch_data_dict, pathwayStoch_data_dict, protein_data_dict = all_dictionaries
print('test_framework_abundance_complexes')
genetic_environmental_effect_prep.test_framework_abundance_complexes(folder, complex_data_dict)
print('test_framework_abundance_pathways')
genetic_environmental_effect_prep.test_framework_abundance_pathways(folder, pat_data_dict)
print('test_framework_stoichiometry_complexes')
genetic_environmental_effect_prep.test_framework_stoichiometry_complexes(folder, complexStoch_data_dict)
print('test_framework_stoichiometry_pathways')
genetic_environmental_effect_prep.test_framework_stoichiometry_pathways(folder, pathwayStoch_data_dict)
print('test_framework_all_proteins ')
genetic_environmental_effect_prep.test_framework_all_proteins(folder, protein_data_dict)
print('summarize_results_all_proteins')
genetic_environmental_effect_prep.summarize_results_all_proteins(folder)
print('summarize_results_complex_abundances')
genetic_environmental_effect_prep.summarize_results_complex_abundances(folder)
print('summarize_results_complex_stoichiometry')
genetic_environmental_effect_prep.summarize_results_complex_stoichiometry(folder)
print('summarize_results_pathway_abundances')
genetic_environmental_effect_prep.summarize_results_pathway_abundances(folder)
print('summarize_results_pathway_stoichiometry')
genetic_environmental_effect_prep.summarize_results_pathway_stoichiometry(folder)
print('reorganize_folders_all_proteins')
genetic_environmental_effect_prep.reorganize_folders_all_proteins(folder)
print('reorganize_folders_complex_abundances')
genetic_environmental_effect_prep.reorganize_folders_complex_abundances(folder)
print('reorganize_folders_complex_stoichiometry')
genetic_environmental_effect_prep.reorganize_folders_complex_stoichiometry(folder)
print('reorganize_folders_pathway_abundances')
genetic_environmental_effect_prep.reorganize_folders_pathway_abundances(folder)
print('reorganize_folders_pathway_stoichiometry')
genetic_environmental_effect_prep.reorganize_folders_pathway_stoichiometry(folder)
@staticmethod
def run_multivariate_analysis(m, covariate_list, complex_name):
subunit_names = utilsFacade.filtering(list(m.columns), 'group', condition = 'notfind')
covariate = covariate_list[0]
nkfold = len(m)
counter = 0
table = list()
y_ref = np.array(m[subunit_names])
X = np.array(m[covariate])
kf = KFold(n_splits = nkfold, shuffle = True, random_state=500)
ki = 0
for train, test in kf.split(X):
X_train, X_test = X[train], X[test] #sex should be the explanatory variable
y_ref_train, y_ref_test = y_ref[train], y_ref[test] #subunits of complex/module in their abundance
clf = Ridge(random_state=500)
model = clf.fit(X_train,y_ref_train)
model_all = clf.fit(X, y_ref)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
coefficients = model.coef_
mean_se = mean_squared_error(y_ref_test,y_pred_test) #Mean squared error
r2_all = r2_score(y_ref, model_all.predict(X))
r2_train = r2_score(y_ref_train, y_pred_train)
r2_test = r2_score(y_ref_test, y_pred_test) #Variance score
r2_list_train = list()
for x,y in zip(y_ref_train.T, y_pred_train.T):
r2_list_train.append(r2_score(x,y))
r2_list_test = list()
for x,y in zip(y_ref_test.T, y_pred_test.T):
r2_list_test.append(r2_score(x,y))
r2_list_all = list()
for x,y in zip(y_ref.T, model_all.predict(X).T):
r2_list_all.append(r2_score(x,y))
explained_score_train = explained_variance_score(y_ref_train, y_pred_train)
explained_score_test = explained_variance_score(y_ref_test, y_pred_test)
explained_score_all = explained_variance_score(y_ref, model_all.predict(X))
sse = np.sum((y_pred_test - y_ref_test) ** 2, axis= 0) / float(X_test.shape[0] - X_test.shape[1])
try:
se = np.array([np.sqrt(np.diagonal(sse[i] * np.linalg.inv(np.dot(X_test.T, X_test))))
for i in range(sse.shape[0])])
except:
se = np.array([np.sqrt(np.diagonal(sse[i] * np.linalg.inv(np.array([[True]]))))
for i in range(sse.shape[0])])
t = model.coef_ / se
p = 2 * (1 - scipy.stats.t.cdf(np.abs(t), y_pred_test.shape[0] - X_test.shape[1]))
pval_adj = list(RFacade.get_bh_pvalues(p))
t_stats_per_module = np.mean(model.coef_)/mean_se
p_per_module = 2 * (1 - scipy.stats.t.cdf(np.abs(t_stats_per_module), y_pred_test.shape[0] - X_test.shape[1]))
ki += 1
ccount = 0
for su,r2 in zip(subunit_names, r2_list_train):
temp = [ki,complex_name,su, y_ref.shape[1],'_'.join(covariate),
mean_se, r2_list_all[ccount], r2, r2_list_test[ccount],
r2_all, r2_train, r2_test, explained_score_all,
explained_score_train, explained_score_test, coefficients[ccount][0],
sse[ccount], se[ccount][0], t[ccount][0], p[ccount][0],
pval_adj[ccount], t_stats_per_module, p_per_module]
table.append(temp)
ccount +=1
res = pd.DataFrame(table, columns=['ki','complex.name','subunit','n.subunits',
'covariates','mean_se','r2.all.subunit',
'r2.train.subunit','r2.test.subunit',
'r2.all.module','r2.train.module', 'r2.test.module',
'explained_variance_score.all.module',
'explained_variance_score.training.module',
'explained_variance_score.testing.module',
'coefficient','sse','se','t','pvalue','pval.adj',
't_stats_per_module','pvalue_per_module'])
grouped_module = []
for idx, grp in res.groupby(['complex.name', 'covariates']):
grouped_module.append(list(idx) + [list(grp['n.subunits'])[0]] +
[np.median(grp['mean_se']), np.median(grp['r2.all.subunit']),
np.median(grp['r2.train.subunit']),
np.median(grp['r2.test.subunit']), np.median(grp['r2.all.module']),
np.median(grp['r2.train.module']), np.median(grp['r2.test.module']),
np.median(grp['explained_variance_score.all.module']),
np.median(grp['explained_variance_score.training.module']),
np.median(grp['explained_variance_score.testing.module']),
np.median(grp['coefficient']), np.median(grp['sse']),
np.median(grp['se']), np.median(grp['t']),
np.median(grp['pvalue']),np.median(grp['pval.adj']),
np.median(grp['t_stats_per_module']), np.median(grp['pvalue_per_module'])])
res_module = pd.DataFrame(grouped_module, columns=['complex.name', 'covariates','n.subunits',
'mean_se', 'r2.all.subunit', 'r2.train.subunit','r2.test.subunit', 'r2.all.module',
'r2.train.module','r2.test.module', 'explained_variance_score.all.module',
'explained_variance_score.training.module',
'explained_variance_score.testing.module','coefficient','sse','se','t',
'pvalue','pval.adj','t_stats_per_module','pvalue_per_module'])
#group per subunit in module
grouped_subunit = []
for idx, grp in res.groupby(['complex.name', 'covariates', 'subunit']):
grouped_subunit.append(list(idx) + [list(grp['n.subunits'])[0]] +
[np.median(grp['mean_se']), np.median(grp['r2.all.subunit']),
np.median(grp['r2.train.subunit']),
np.median(grp['r2.test.subunit']), np.median(grp['r2.all.module']),
np.median(grp['r2.train.module']), np.median(grp['r2.test.module']),
np.median(grp['explained_variance_score.all.module']),
np.median(grp['explained_variance_score.training.module']),
np.median(grp['explained_variance_score.testing.module']),
np.median(grp['coefficient']), np.median(grp['sse']),
np.median(grp['se']), np.median(grp['t']),
np.median(grp['pvalue']),np.median(grp['pval.adj']),
np.median(grp['t_stats_per_module']), np.median(grp['pvalue_per_module'])])
res_subunit = pd.DataFrame(grouped_subunit, columns=['complex.name', 'covariates', 'subunit',
'n.subunits','mean_se', 'r2.all.subunit', 'r2.train.subunit','r2.test.subunit', 'r2.all.module',
'r2.train.module','r2.test.module', 'explained_variance_score.all.module',
'explained_variance_score.training.module',
'explained_variance_score.testing.module','coefficient','sse','se','t',
'pvalue','pval.adj','t_stats_per_module','pvalue_per_module'])
return res, res_module, res_subunit
@staticmethod
def test_framework_abundance_complexes(folder, complex_data_dict):
keys = list(complex_data_dict.keys())
for key in keys:
res_concat_list = list()
res_module_concat_list = list()
res_subunit_concat_list = list()
complexes = list(complex_data_dict[key].keys())
for com,complexId in enumerate(complexes):
print(key, complexId)
df = complex_data_dict[key][complexId]
combined_groups = utilsFacade.getCombinations(list(set(df['group'])))
for cg in combined_groups:
altName = 'complex' + str(com)
group_label = '::'.join(cg)
group_label_for_file = group_label.replace('::','_')
existence1 = os.path.exists(folder + '_'.join(['RES_test_explainVar', altName, key, group_label_for_file]) + '.tsv')
existence2 = os.path.exists(folder + '_'.join(['RESMODULE_test_explainVar', altName, key, group_label_for_file]) + '.tsv')
existence3 = os.path.exists(folder + '_'.join(['RESSUBUNIT_test_explainVar', altName, key, group_label_for_file]) + '.tsv')
if (existence1 == False) or (existence2 == False) or (existence3 == False):
sub1 = df[df['group']==cg[0]]
sub2 = df[df['group']==cg[1]]
sub = pd.concat([sub1, sub2])
sub = sub.dropna(axis = 1)
sub['group'] = [item == cg[0] for item in sub['group']]
if len(sub.columns)>1:
covariate_list = [['group']]
sub = sub.convert_objects(convert_numeric=True)
print(sub.shape)
res, res_module, res_subunit = genetic_environmental_effect_prep.run_multivariate_analysis(sub,
covariate_list, complexId)
res['group_comparison'] = pd.Series([group_label]*len(res), index = res.index)
res_module['group_comparison'] = pd.Series([group_label]*len(res_module), index = res_module.index)
res_subunit['group_comparison'] = pd.Series([group_label]*len(res_subunit), index = res_subunit.index)
res_concat_list.append(res)
res_module_concat_list.append(res_module)
res_subunit_concat_list.append(res_subunit)
res.to_csv(folder + '_'.join(['RES_test_explainVar',altName, key, group_label_for_file]) + '.tsv',
sep = '\t')
res_module.to_csv(folder + '_'.join(['RESMODULE_test_explainVar', altName, key, group_label_for_file]) + '.tsv', sep = '\t')
res_subunit.to_csv(folder + '_'.join(['RESSUBUNIT_test_explainVar', altName, key, group_label_for_file]) + '.tsv', sep = '\t')
res = pd.concat(res_concat_list)
res_module = pd.concat(res_module_concat_list)
res_subunit = pd.concat(res_subunit_concat_list)
res.to_csv(folder + 'RES_test_explainVar_' + key + '.tsv', sep = '\t')
res_module.to_csv(folder + 'RESMODULE_test_explainVar_' + key + '.tsv', sep = '\t')
res_subunit.to_csv(folder + 'RESSUBUNIT_test_explainVar_' + key + '.tsv', sep = '\t')
return res, res_module, res_subunit
@staticmethod
def test_framework_abundance_pathways(folder, pat_data_dict):
keys = list(pat_data_dict.keys())
for key in keys:
res_concat_list = list()
res_module_concat_list = list()
res_subunit_concat_list = list()
complexes = list(pat_data_dict[key].keys())
for com,complexId in enumerate(complexes):
print(key, complexId)
df = pat_data_dict[key][complexId]
combined_groups = utilsFacade.getCombinations(list(set(df['group'])))
for cg in combined_groups:
altName = 'complex' + str(com)
group_label = '::'.join(cg)
group_label_for_file = group_label.replace('::','_')
existence1 = os.path.exists(folder + '_'.join(['RES_test_pathway_explainVar', altName, key, group_label_for_file]) + '.tsv')
existence2 = os.path.exists(folder + '_'.join(['RESMODULE_test_pathway_explainVar', altName, key, group_label_for_file]) + '.tsv')
existence3 = os.path.exists(folder + '_'.join(['RESSUBUNIT_test_pathway_explainVar', altName, key, group_label_for_file]) + '.tsv')
if (existence1 == False) or (existence2 == False) or (existence3 == False):
sub1 = df[df['group']==cg[0]]
sub2 = df[df['group']==cg[1]]
sub = pd.concat([sub1, sub2])
sub = sub.dropna(axis = 1)
sub['group'] = [item == cg[0] for item in sub['group']]
if len(sub.columns)>1:
covariate_list = [['group']]
sub = sub.convert_objects(convert_numeric=True)
print(sub.shape)
res, res_module, res_subunit = genetic_environmental_effect_prep.run_multivariate_analysis(sub,
covariate_list, complexId)
res['group_comparison'] = pd.Series([group_label]*len(res), index = res.index)
res_module['group_comparison'] = pd.Series([group_label]*len(res_module), index = res_module.index)
res_subunit['group_comparison'] = pd.Series([group_label]*len(res_subunit), index = res_subunit.index)
res_concat_list.append(res)
res_module_concat_list.append(res_module)
res_subunit_concat_list.append(res_subunit)
res.to_csv(folder + '_'.join(['RES_test_pathway_explainVar', altName, key, group_label_for_file]) + '.tsv',sep = '\t')
res_module.to_csv(folder + '_'.join(['RESMODULE_test_pathway_explainVar', altName, key,group_label_for_file]) + '.tsv',sep = '\t')
res_subunit.to_csv(folder + '_'.join(['RESSUBUNIT_test_pathway_explainVar', altName, key, group_label_for_file]) + '.tsv',sep = '\t')
res = pd.concat(res_concat_list)
res_module = pd.concat(res_module_concat_list)
res_subunit = pd.concat(res_subunit_concat_list)
res.to_csv(folder + 'RES_test_pathway_explainVar_' + key + '.tsv', sep = '\t')
res_module.to_csv(folder + 'RESMODULE_test_pathway_explainVar_' + key + '.tsv', sep = '\t')
res_subunit.to_csv(folder + 'RESSUBUNIT_test_pathway_explainVar_' + key + '.tsv', sep = '\t')
return res, res_module, res_subunit
@staticmethod
def test_framework_stoichiometry_complexes(folder, complexStoch_data_dict):
keys = list(complexStoch_data_dict.keys())
for key in keys:
res_concat_list = list()
res_module_concat_list = list()
res_subunit_concat_list = list()
complexes = list(complexStoch_data_dict[key].keys())
for com,complexId in enumerate(complexes):
print(key, complexId)
df = complexStoch_data_dict[key][complexId]
combined_groups = utilsFacade.getCombinations(list(set(df['group'])))
for cg in combined_groups:
altName = 'complex' + str(com)
group_label = '::'.join(cg)
group_label_for_file = group_label.replace('::','_')
existence1 = os.path.exists(folder + '_'.join(['RES_test_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv')
existence2 = os.path.exists(folder + '_'.join(['RESMODULE_test_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv')
existence3 = os.path.exists(folder + '_'.join(['RESSUBUNIT_test_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv')
if (existence1 == False) or (existence2 == False) or (existence3 == False):
sub1 = df[df['group']==cg[0]]
sub2 = df[df['group']==cg[1]]
sub = pd.concat([sub1, sub2])
sub = sub.dropna(axis = 1)
sub['group'] = [item == cg[0] for item in sub['group']]
if len(sub.columns)>1:
covariate_list = [['group']]
sub = sub.convert_objects(convert_numeric=True)
print(sub.shape)
res, res_module, res_subunit = genetic_environmental_effect_prep.run_multivariate_analysis(sub,
covariate_list, complexId)
res['group_comparison'] = pd.Series([group_label]*len(res), index = res.index)
res_module['group_comparison'] = pd.Series([group_label]*len(res_module), index = res_module.index)
res_subunit['group_comparison'] = pd.Series([group_label]*len(res_subunit), index = res_subunit.index)
res_concat_list.append(res)
res_module_concat_list.append(res_module)
res_subunit_concat_list.append(res_subunit)
res.to_csv(folder + '_'.join(['RES_test_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv', sep = '\t')
res_module.to_csv(folder + '_'.join(['RESMODULE_test_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv',sep = '\t')
res_subunit.to_csv(folder + '_'.join(['RESSUBUNIT_test_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv',sep = '\t')
res = pd.concat(res_concat_list)
res_module = pd.concat(res_module_concat_list)
res_subunit = pd.concat(res_subunit_concat_list)
res.to_csv(folder + 'RES_test_stoch_explainVar_' + key + '.tsv', sep = '\t')
res_module.to_csv(folder + 'RESMODULE_test_stoch_explainVar_' + key + '.tsv', sep = '\t')
res_subunit.to_csv(folder + 'RESSUBUNIT_test_stoch_explainVar_' + key + '.tsv', sep = '\t')
return res, res_module, res_subunit
@staticmethod
def test_framework_stoichiometry_pathways(folder, pathwayStoch_data_dict):
keys = list(pathwayStoch_data_dict.keys())
for key in keys:
res_concat_list = list()
res_module_concat_list = list()
res_subunit_concat_list = list()
complexes = list(pathwayStoch_data_dict[key].keys())
for com,complexId in enumerate(complexes):
print(key, complexId)
df = pathwayStoch_data_dict[key][complexId]
combined_groups = utilsFacade.getCombinations(list(set(df['group'])))
for cg in combined_groups:
altName = 'complex' + str(com)
group_label = '::'.join(cg)
group_label_for_file = group_label.replace('::','_')
existence1 = os.path.exists(folder + '_'.join(['RES_test_pathway_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv')
existence2 = os.path.exists(folder + '_'.join(['RESMODULE_test_pathway_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv')
existence3 = os.path.exists(folder + '_'.join(['RESSUBUNIT_test_pathway_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv')
if (existence1 == False) or (existence2 == False) or (existence3 == False):
sub1 = df[df['group']==cg[0]]
sub2 = df[df['group']==cg[1]]
sub = pd.concat([sub1, sub2])
sub = sub.dropna(axis = 1)
sub['group'] = [item == cg[0] for item in sub['group']]
if len(sub.columns)>1:
covariate_list = [['group']]
sub = sub.convert_objects(convert_numeric=True)
print(sub.shape)
res, res_module, res_subunit = genetic_environmental_effects.run_multivariate_analysis(sub,
covariate_list, complexId)
res['group_comparison'] = pd.Series([group_label]*len(res), index = res.index)
res_module['group_comparison'] = pd.Series([group_label]*len(res_module), index = res_module.index)
res_subunit['group_comparison'] = pd.Series([group_label]*len(res_subunit), index = res_subunit.index)
res_concat_list.append(res)
res_module_concat_list.append(res_module)
res_subunit_concat_list.append(res_subunit)
res.to_csv(folder + '_'.join(['RES_test_pathway_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv',sep = '\t')
res_module.to_csv(folder + '_'.join(['RESMODULE_test_pathway_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv',sep = '\t')
res_subunit.to_csv(folder + '_'.join(['RESSUBUNIT_test_pathway_stoch_explainVar', altName, key, group_label_for_file]) + '.tsv',sep = '\t')
res = pd.concat(res_concat_list)
res_module = pd.concat(res_module_concat_list)
res_subunit = pd.concat(res_subunit_concat_list)
res.to_csv(folder + 'RES_test_pathway_stoch_explainVar_' + key + '.tsv', sep = '\t')
res_module.to_csv(folder + 'RESMODULE_test_pathway_stoch_explainVar_' + key + '.tsv', sep = '\t')
res_subunit.to_csv(folder + 'RESSUBUNIT_test_pathway_stoch_explainVar_' + key + '.tsv', sep = '\t')
return res, res_module, res_subunit
@staticmethod
def test_framework_all_proteins(folder, protein_data_dict):
keys = list(protein_data_dict.keys())
for key in keys:
res_concat_list = list()
res_module_concat_list = list()
res_subunit_concat_list = list()
proteins = list(protein_data_dict[key].keys())
for pro,protein in enumerate(proteins):
print(key, protein)
df = protein_data_dict[key][protein]
combined_groups = utilsFacade.getCombinations(list(set(df['group'])))
for cg in combined_groups:
altName = 'protein' + str(pro)
group_label = '::'.join(cg)
group_label_for_file = group_label.replace('::','_')
existence1 = os.path.exists(folder + 'RES_ALL_proteins_' + key + '/RES_test_explainVar_' + altName + '_' + key + '_' + group_label_for_file + '.tsv')
existence2 = os.path.exists(folder + 'RES_ALL_proteins_' + key + '/RESMODULE_test_explainVar_' + altName + '_' + key + '_' + group_label_for_file + '.tsv')
existence3 = os.path.exists(folder + 'RES_ALL_proteins_' + key + '/RESSUBUNIT_test_explainVar_' + altName + '_' + key + '_' + group_label_for_file + '.tsv')
if (existence1 == False) or (existence2 == False) or (existence3 == False):
sub1 = df[df['group']==cg[0]]
sub2 = df[df['group']==cg[1]]
sub = pd.concat([sub1, sub2])
sub = sub.dropna(axis = 1)
sub['group'] = [item == cg[0] for item in sub['group']]
if len(sub.columns)>1:
covariate_list = [['group']]
sub = sub.convert_objects(convert_numeric=True)
print(sub.shape)
res, res_module, res_subunit = genetic_environmental_effects.run_multivariate_analysis(sub,
covariate_list, protein)
res['group_comparison'] = pd.Series([group_label]*len(res), index = res.index)
res_module['group_comparison'] = pd.Series([group_label]*len(res_module), index = res_module.index)
res_subunit['group_comparison'] = pd.Series([group_label]*len(res_subunit), index = res_subunit.index)
res_concat_list.append(res)
res_module_concat_list.append(res_module)
res_subunit_concat_list.append(res_subunit)
res.to_csv(folder + 'RES_test_ALLprotein_explainVar_' + altName + '_' + key + '_' + group_label_for_file + '.tsv',sep = '\t')
res_module.to_csv(folder + 'RESMODULE_test_ALLprotein_explainVar_' + altName + '_' + key + '_' + group_label_for_file + '.tsv',sep = '\t')
res_subunit.to_csv(folder + 'RESSUBUNIT_test_ALLprotein_explainVar_' + altName + '_' + key + '_' + group_label_for_file + '.tsv',sep = '\t')
res = pd.concat(res_concat_list)
res_module = pd.concat(res_module_concat_list)
res_subunit = pd.concat(res_subunit_concat_list)
res.to_csv(folder + 'RES_test_ALLprotein_explainVar_' + key + '.tsv', sep = '\t')
res_module.to_csv(folder + 'RESMODULE_test_ALLprotein_explainVar_' + key + '.tsv', sep = '\t')
res_subunit.to_csv(folder + 'RESSUBUNIT_test_ALLprotein_explainVar_' + key + '.tsv', sep = '\t')
return res, res_module, res_subunit
@staticmethod
def summarize_results_all_proteins(folder):
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
for key in keys:
print(key)
ffolder = folder + 'RES_ALL_proteins_' + key + '/'
file_list = os.listdir(ffolder)
res_files = utilsFacade.filtering(file_list, 'RES_', condition = 'startswith')
resmodule_files = utilsFacade.filtering(file_list, 'RESMODULE_', condition = 'startswith')
ressubunit_files = utilsFacade.filtering(file_list, 'RESSUBUNIT_', condition = 'startswith')
concat_list = list()
for fname in resmodule_files:
print(fname)
data = DataFrameAnalyzer.getFile(ffolder, fname)
concat_list.append(data)
data = pd.concat(concat_list)
data = data.sort_values('complex.name')
grouped_module = []
for idx, grp in data.groupby(['complex.name']):
grouped_module.append([idx] + [list(grp['n.subunits'])[0]] +
[np.median(grp['mean_se']), np.median(grp['r2.all.subunit']),
np.median(grp['r2.train.subunit']),
np.median(grp['r2.test.subunit']), np.median(grp['r2.all.module']),
np.median(grp['r2.train.module']), np.median(grp['r2.test.module']),
np.median(grp['explained_variance_score.all.module']),
np.median(grp['explained_variance_score.training.module']),
np.median(grp['explained_variance_score.testing.module']),
np.median(grp['coefficient']), np.median(grp['sse']),
np.median(grp['se']), np.median(grp['t']),
np.median(grp['pvalue']),np.median(grp['pval.adj']),
np.median(grp['t_stats_per_module']), np.median(grp['pvalue_per_module'])])
res_module = pd.DataFrame(grouped_module, columns=['complex.name','n.subunits',
'mean_se', 'r2.all.subunit', 'r2.train.subunit','r2.test.subunit', 'r2.all.module',
'r2.train.module','r2.test.module', 'explained_variance_score.all.module',
'explained_variance_score.training.module',
'explained_variance_score.testing.module','coefficient','sse','se','t',
'pvalue','pval.adj','t_stats_per_module','pvalue_per_module'])
res_module.to_csv(ffolder + 'RESMODULE_' + key + '_all_proteins.tsv', sep = '\t')
@staticmethod
def summarize_results_complex_abundances(folder):
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
for key in keys:
print(key)
ffolder = folder + 'RES_complex_abundance_' + key + '/'
file_list = os.listdir(ffolder)
res_files = utilsFacade.filtering(file_list, 'RES_', condition = 'startswith')
resmodule_files = utilsFacade.filtering(file_list, 'RESMODULE_', condition = 'startswith')
ressubunit_files = utilsFacade.filtering(file_list, 'RESSUBUNIT_', condition = 'startswith')
concat_list = list()
for fname in resmodule_files:
print(fname)
data = DataFrameAnalyzer.getFile(ffolder, fname)
concat_list.append(data)
data = pd.concat(concat_list)
data = data.sort_values('complex.name')
grouped_module = []
for idx, grp in data.groupby(['complex.name']):
grouped_module.append([idx] + [list(grp['n.subunits'])[0]] +
[np.median(grp['mean_se']), np.median(grp['r2.all.subunit']),
np.median(grp['r2.train.subunit']),
np.median(grp['r2.test.subunit']), np.median(grp['r2.all.module']),
np.median(grp['r2.train.module']), np.median(grp['r2.test.module']),
np.median(grp['explained_variance_score.all.module']),
np.median(grp['explained_variance_score.training.module']),
np.median(grp['explained_variance_score.testing.module']),
np.median(grp['coefficient']), np.median(grp['sse']),
np.median(grp['se']), np.median(grp['t']),
np.median(grp['pvalue']),np.median(grp['pval.adj']),
np.median(grp['t_stats_per_module']), np.median(grp['pvalue_per_module'])])
res_module = pd.DataFrame(grouped_module, columns=['complex.name','n.subunits',
'mean_se', 'r2.all.subunit', 'r2.train.subunit','r2.test.subunit', 'r2.all.module',
'r2.train.module','r2.test.module', 'explained_variance_score.all.module',
'explained_variance_score.training.module',
'explained_variance_score.testing.module','coefficient','sse','se','t',
'pvalue','pval.adj','t_stats_per_module','pvalue_per_module'])
res_module.to_csv(ffolder + 'RESMODULE_' + key + '_complex_abundance.tsv', sep = '\t')
@staticmethod
def summarize_results_complex_stoichiometry(folder):
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
for key in keys:
print(key)
ffolder = folder + 'RES_complex_stoichiometry_' + key + '/'
file_list = os.listdir(ffolder)
res_files = utilsFacade.filtering(file_list, 'RES_', condition = 'startswith')
resmodule_files = utilsFacade.filtering(file_list, 'RESMODULE_', condition = 'startswith')
ressubunit_files = utilsFacade.filtering(file_list, 'RESSUBUNIT_', condition = 'startswith')
concat_list = list()
for fname in resmodule_files:
print(fname)
data = DataFrameAnalyzer.getFile(ffolder, fname)
concat_list.append(data)
data = pd.concat(concat_list)
data = data.sort_values('complex.name')
grouped_module = []
for idx, grp in data.groupby(['complex.name']):
grouped_module.append([idx] + [list(grp['n.subunits'])[0]] +
[np.median(grp['mean_se']), np.median(grp['r2.all.subunit']),
np.median(grp['r2.train.subunit']),
np.median(grp['r2.test.subunit']), np.median(grp['r2.all.module']),
np.median(grp['r2.train.module']), np.median(grp['r2.test.module']),
np.median(grp['explained_variance_score.all.module']),
np.median(grp['explained_variance_score.training.module']),
np.median(grp['explained_variance_score.testing.module']),
np.median(grp['coefficient']), np.median(grp['sse']),
np.median(grp['se']), np.median(grp['t']),
np.median(grp['pvalue']),np.median(grp['pval.adj']),
np.median(grp['t_stats_per_module']), np.median(grp['pvalue_per_module'])])
res_module = pd.DataFrame(grouped_module, columns=['complex.name','n.subunits',
'mean_se', 'r2.all.subunit', 'r2.train.subunit','r2.test.subunit', 'r2.all.module',
'r2.train.module','r2.test.module', 'explained_variance_score.all.module',
'explained_variance_score.training.module',
'explained_variance_score.testing.module','coefficient','sse','se','t',
'pvalue','pval.adj','t_stats_per_module','pvalue_per_module'])
res_module.to_csv(ffolder + 'RESMODULE_' + key + '_complex_stoichiometry.tsv', sep = '\t')
@staticmethod
def summarize_results_pathway_abundances(folder):
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
for key in keys:
print(key)
ffolder = folder + 'RES_pathway_abundance_' + key + '/'
file_list = os.listdir(ffolder)
res_files = utilsFacade.filtering(file_list, 'RES_', condition = 'startswith')
resmodule_files = utilsFacade.filtering(file_list, 'RESMODULE_', condition = 'startswith')
ressubunit_files = utilsFacade.filtering(file_list, 'RESSUBUNIT_', condition = 'startswith')
concat_list = list()
for fname in resmodule_files:
print(fname)
data = DataFrameAnalyzer.getFile(ffolder, fname)
concat_list.append(data)
data = pd.concat(concat_list)
data = data.sort_values('complex.name')
grouped_module = []
for idx, grp in data.groupby(['complex.name']):
grouped_module.append([idx] + [list(grp['n.subunits'])[0]] +
[np.median(grp['mean_se']), np.median(grp['r2.all.subunit']),
np.median(grp['r2.train.subunit']),
np.median(grp['r2.test.subunit']), np.median(grp['r2.all.module']),
np.median(grp['r2.train.module']), np.median(grp['r2.test.module']),
np.median(grp['explained_variance_score.all.module']),
np.median(grp['explained_variance_score.training.module']),
np.median(grp['explained_variance_score.testing.module']),
np.median(grp['coefficient']), np.median(grp['sse']),
np.median(grp['se']), np.median(grp['t']),
np.median(grp['pvalue']),np.median(grp['pval.adj']),
np.median(grp['t_stats_per_module']), np.median(grp['pvalue_per_module'])])
res_module = pd.DataFrame(grouped_module, columns=['complex.name','n.subunits',
'mean_se', 'r2.all.subunit', 'r2.train.subunit','r2.test.subunit', 'r2.all.module',
'r2.train.module','r2.test.module', 'explained_variance_score.all.module',
'explained_variance_score.training.module',
'explained_variance_score.testing.module','coefficient','sse','se','t',
'pvalue','pval.adj','t_stats_per_module','pvalue_per_module'])
res_module.to_csv(ffolder + 'RESMODULE_' + key + '_pathway_abundance.tsv', sep = '\t')
@staticmethod
def summarize_results_pathway_stoichiometry(folder):
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
for key in keys:
print(key)
ffolder = folder + 'RES_pathway_stoichiometry_' + key + '/'
file_list = os.listdir(ffolder)
res_files = utilsFacade.filtering(file_list, 'RES_', condition = 'startswith')
resmodule_files = utilsFacade.filtering(file_list, 'RESMODULE_', condition = 'startswith')
ressubunit_files = utilsFacade.filtering(file_list, 'RESSUBUNIT_', condition = 'startswith')
concat_list = list()
for fname in resmodule_files:
print(fname)
data = DataFrameAnalyzer.getFile(ffolder, fname)
concat_list.append(data)
data = pd.concat(concat_list)
data = data.sort_values('complex.name')
grouped_module = []
for idx, grp in data.groupby(['complex.name']):
grouped_module.append([idx] + [list(grp['n.subunits'])[0]] +
[np.median(grp['mean_se']), np.median(grp['r2.all.subunit']),
np.median(grp['r2.train.subunit']),
np.median(grp['r2.test.subunit']), np.median(grp['r2.all.module']),
np.median(grp['r2.train.module']), np.median(grp['r2.test.module']),
np.median(grp['explained_variance_score.all.module']),
np.median(grp['explained_variance_score.training.module']),
np.median(grp['explained_variance_score.testing.module']),
np.median(grp['coefficient']), np.median(grp['sse']),
np.median(grp['se']), np.median(grp['t']),
np.median(grp['pvalue']),np.median(grp['pval.adj']),
np.median(grp['t_stats_per_module']), np.median(grp['pvalue_per_module'])])
res_module = pd.DataFrame(grouped_module, columns=['complex.name','n.subunits',
'mean_se', 'r2.all.subunit', 'r2.train.subunit','r2.test.subunit', 'r2.all.module',
'r2.train.module','r2.test.module', 'explained_variance_score.all.module',
'explained_variance_score.training.module',
'explained_variance_score.testing.module','coefficient','sse','se','t',
'pvalue','pval.adj','t_stats_per_module','pvalue_per_module'])
res_module.to_csv(ffolder + 'RESMODULE_' + key + '_pathway_stoichiometry.tsv', sep = '\t')
@staticmethod
def reorganize_folders_all_proteins(folder):
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
for key in keys:
ffolder = folder + 'RES_ALL_proteins_' + key + '/'
existence1 = os.path.exists(ffolder + 'RES/')
existence2 = os.path.exists(ffolder + 'RESMODULE/')
existence3 = os.path.exists(ffolder + 'RESSUBUNIT/')
if existence1 == False:
systemAnalyzer.make_folder(ffolder + 'RES/')
if existence2 == False:
systemAnalyzer.make_folder(ffolder + 'RESMODULE/')
if existence3 == False:
systemAnalyzer.make_folder(ffolder + 'RESSUBUNIT/')
file_list = os.listdir(ffolder)
final_fname = 'RESMODULE_' + key + '_all_proteins.tsv'
res_files = utilsFacade.filtering(file_list, 'RES_', condition = 'startswith')
resmodule_files = utilsFacade.filtering(file_list, 'RESMODULE_', condition = 'startswith')
ressubunit_files = utilsFacade.filtering(file_list, 'RESSUBUNIT_', condition = 'startswith')
res_files = utilsFacade.filtering(res_files, key + '_')
resmodule_files = utilsFacade.filtering(resmodule_files, key + '_')
ressubunit_files = utilsFacade.filtering(ressubunit_files, key + '_')
for r in res_files:
print(r)
shutil.move(ffolder + r, ffolder + 'data/RES/' + r)
for r in resmodule_files:
print(r)
shutil.move(ffolder + r, ffolder + 'data/RESMODULE/' + r)
shutil.move(ffolder + 'RESMODULE/' + final_fname, ffolder + final_fname)
for r in ressubunit_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RESSUBUNIT/' + r)
archive_name = ffolder + 'RES.tar.gz'
source_dir = ffolder + 'RES/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
archive_name = ffolder + 'RESMODULE.tar.gz'
source_dir = ffolder + 'RESMODULE/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
archive_name = ffolder + 'RESSUBUNIT.tar.gz'
source_dir = ffolder + 'RESSUBUNIT/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
@staticmethod
def reorganize_folders_complex_abundances(folder):
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
for key in keys:
ffolder = folder + 'RES_complex_abundance_' + key + '/'
existence1 = os.path.exists(ffolder + 'RES/')
existence2 = os.path.exists(ffolder + 'RESMODULE/')
existence3 = os.path.exists(ffolder + 'RESSUBUNIT/')
if existence1 == False:
systemAnalyzer.make_folder(ffolder + 'RES/')
if existence2 == False:
systemAnalyzer.make_folder(ffolder + 'RESMODULE/')
if existence3 == False:
systemAnalyzer.make_folder(ffolder + 'RESSUBUNIT/')
file_list = os.listdir(ffolder)
final_fname = 'RESMODULE_' + key + '_complex_abundance.tsv'
res_files = utilsFacade.filtering(file_list, 'RES_', condition = 'startswith')
resmodule_files = utilsFacade.filtering(file_list, 'RESMODULE_', condition = 'startswith')
ressubunit_files = utilsFacade.filtering(file_list, 'RESSUBUNIT_', condition = 'startswith')
res_files = utilsFacade.filtering(res_files, key + '_')
resmodule_files = utilsFacade.filtering(resmodule_files, key + '_')
ressubunit_files = utilsFacade.filtering(ressubunit_files, key + '_')
for r in res_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RES/' + r)
for r in resmodule_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RESMODULE/' + r)
shutil.move(ffolder + 'RESMODULE/' + final_fname, ffolder + final_fname)
for r in ressubunit_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RESSUBUNIT/' + r)
archive_name = ffolder + 'RES.tar.gz'
source_dir = ffolder + 'RES/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
archive_name = ffolder + 'RESMODULE.tar.gz'
source_dir = ffolder + 'RESMODULE/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
archive_name = ffolder + 'RESSUBUNIT.tar.gz'
source_dir = ffolder + 'RESSUBUNIT/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
@staticmethod
def reorganize_folders_complex_stoichiometry(folder):
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
for key in keys:
ffolder = folder + 'RES_complex_stoichiometry_' + key + '/'
existence1 = os.path.exists(ffolder + 'RES/')
existence2 = os.path.exists(ffolder + 'RESMODULE/')
existence3 = os.path.exists(ffolder + 'RESSUBUNIT/')
if existence1 == False:
systemAnalyzer.make_folder(ffolder + 'RES/')
if existence2 == False:
systemAnalyzer.make_folder(ffolder + 'RESMODULE/')
if existence3 == False:
systemAnalyzer.make_folder(ffolder + 'RESSUBUNIT/')
file_list = os.listdir(ffolder)
final_fname = 'RESMODULE_' + key + '_complex_stoichiometry.tsv'
res_files = utilsFacade.filtering(file_list, 'RES_', condition = 'startswith')
resmodule_files = utilsFacade.filtering(file_list, 'RESMODULE_', condition = 'startswith')
ressubunit_files = utilsFacade.filtering(file_list, 'RESSUBUNIT_', condition = 'startswith')
res_files = utilsFacade.filtering(res_files, key + '_')
resmodule_files = utilsFacade.filtering(resmodule_files, key + '_')
ressubunit_files = utilsFacade.filtering(ressubunit_files, key + '_')
for r in res_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RES/' + r)
for r in resmodule_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RESMODULE/' + r)
shutil.move(ffolder + 'RESMODULE/' + final_fname, ffolder + final_fname)
for r in ressubunit_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RESSUBUNIT/' + r)
archive_name = ffolder + 'RES.tar.gz'
source_dir = ffolder + 'RES/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
archive_name = ffolder + 'RESMODULE.tar.gz'
source_dir = ffolder + 'RESMODULE/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
archive_name = ffolder + 'RESSUBUNIT.tar.gz'
source_dir = ffolder + 'RESSUBUNIT/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
@staticmethod
def reorganize_folders_pathway_abundances(folder):
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
for key in keys:
ffolder = folder + 'RES_pathway_abundance_' + key + '/'
existence1 = os.path.exists(ffolder + 'RES/')
existence2 = os.path.exists(ffolder + 'RESMODULE/')
existence3 = os.path.exists(ffolder + 'RESSUBUNIT/')
if existence1 == False:
systemAnalyzer.make_folder(ffolder + 'RES/')
if existence2 == False:
systemAnalyzer.make_folder(ffolder + 'RESMODULE/')
if existence3 == False:
systemAnalyzer.make_folder(ffolder + 'RESSUBUNIT/')
file_list = os.listdir(ffolder)
final_fname = 'RESMODULE_' + key + '_pathway_abundance.tsv'
res_files = utilsFacade.filtering(file_list, 'RES_', condition = 'startswith')
resmodule_files = utilsFacade.filtering(file_list, 'RESMODULE_', condition = 'startswith')
ressubunit_files = utilsFacade.filtering(file_list, 'RESSUBUNIT_', condition = 'startswith')
res_files = utilsFacade.filtering(res_files, key + '_')
resmodule_files = utilsFacade.filtering(resmodule_files, key + '_')
ressubunit_files = utilsFacade.filtering(ressubunit_files, key + '_')
for r in res_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RES/' + r)
for r in resmodule_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RESMODULE/' + r)
shutil.move(ffolder + 'RESMODULE/' + final_fname, ffolder + final_fname)
for r in ressubunit_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RESSUBUNIT/' + r)
archive_name = ffolder + 'RES.tar.gz'
source_dir = ffolder + 'RES/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
archive_name = ffolder + 'RESMODULE.tar.gz'
source_dir = ffolder + 'RESMODULE/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
archive_name = ffolder + 'RESSUBUNIT.tar.gz'
source_dir = ffolder + 'RESSUBUNIT/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
@staticmethod
def reorganize_folders_pathway_stoichiometry(folder):
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
for key in keys:
ffolder = folder + 'RES_pathway_stoichiometry_' + key + '/'
existence1 = os.path.exists(ffolder + 'RES/')
existence2 = os.path.exists(ffolder + 'RESMODULE/')
existence3 = os.path.exists(ffolder + 'RESSUBUNIT/')
if existence1 == False:
systemAnalyzer.make_folder(ffolder + 'RES/')
if existence2 == False:
systemAnalyzer.make_folder(ffolder + 'RESMODULE/')
if existence3 == False:
systemAnalyzer.make_folder(ffolder + 'RESSUBUNIT/')
file_list = os.listdir(ffolder)
final_fname = 'RESMODULE_' + key + '_pathway_stoichiometry.tsv'
res_files = utilsFacade.filtering(file_list, 'RES_', condition = 'startswith')
resmodule_files = utilsFacade.filtering(file_list, 'RESMODULE_', condition = 'startswith')
ressubunit_files = utilsFacade.filtering(file_list, 'RESSUBUNIT_', condition = 'startswith')
res_files = utilsFacade.filtering(res_files, key + '_')
resmodule_files = utilsFacade.filtering(resmodule_files, key + '_')
ressubunit_files = utilsFacade.filtering(ressubunit_files, key + '_')
for r in res_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RES/' + r)
for r in resmodule_files:
print(r)
shutil.move(ffolde + r, ffolder + 'RESMODULE/' + r)
shutil.move(ffolder + 'RESMODULE/' + final_fname, ffolder + final_fname)
for r in ressubunit_files:
print(r)
shutil.move(ffolder + r, ffolder + 'RESSUBUNIT/' + r)
archive_name = ffolder + 'RES.tar.gz'
source_dir = ffolder + 'RES/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
archive_name = ffolder + 'RESMODULE.tar.gz'
source_dir = ffolder + 'RESMODULE/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
archive_name = ffolder + 'RESSUBUNIT.tar.gz'
source_dir = ffolder + 'RESSUBUNIT/'
print("Compressing files to %s..." % archive_name)
tar = tarfile.open(archive_name, "w:gz")
for file_name in glob.glob(os.path.join(source_dir, "*")):
print(" Adding %s..." % file_name)
tar.add(file_name, os.path.basename(file_name))
tar.close()
class genetic_environmental_effects:
@staticmethod
def execute(**kwargs):
folder = kwargs.get('folder', 'PATH')
print('load_data')
data_dict = genetic_environmental_effects.load_data(folder)
print('plot_boxplot_distributions')
genetic_environmental_effects.plot_boxplot_distributions(folder, data_dict)
@staticmethod
def load_data(folder):
categories = ['ALL_proteins', 'complex_abundance', 'complex_stoichiometry',
'pathway_abundance', 'pathway_stoichiometry']
keys = ['ENV_yeast11','ENV_yeast3','GEN_yeast14', 'ENV_yeast20', 'GEN_yeast5']
data_dict = dict((e1,dict()) for e1 in categories)
for cat in categories:
data_dict[cat] = dict((e1,list()) for e1 in keys)
for key in keys:
print(cat, key)
ffolder = folder + 'RES_' + cat + '_' + key + '/'
fname = 'RESMODULE_' + key + '_' + cat.lower() + '.tsv'
data = DataFrameAnalyzer.getFile(ffolder, fname)
r2_list = np.median(utilsFacade.finite(list(data['r2.all.module'])))
data_dict[cat][key] = r2_list
return data_dict
@staticmethod
def plot_boxplot_distributions(folder, data_dict):
categories = ['ALL_proteins', 'complex_abundance', 'complex_stoichiometry']
keys = ['ENV_yeast11','ENV_yeast3','ENV_yeast20', 'GEN_yeast5','GEN_yeast14']
data_list = list()
xlabel_list = list()
for key in keys:
for cat in categories:
dat = data_dict[cat][key]
data_list.append(dat)
xlabel_list.append(key + '::' + cat)
sns.set(context='notebook', style='white',
palette='deep', font='Liberation Sans', font_scale=1,
color_codes=False, rc=None)
plt.rcParams["axes.grid"] = True
plt.clf()
fig = plt.figure(figsize = (10,5))
ax = fig.add_subplot(111)
bp = ax.boxplot(data_list, notch=0, sym="", vert=1, patch_artist=True, widths=[0.5]*len(data_list))
plt.setp(bp['medians'], color="black")
plt.setp(bp['whiskers'], color="black",linestyle="-")
for i,patch in enumerate(bp['boxes']):
if i%3==0:
patch.set_facecolor('white')
patch.set_edgecolor('blue')
elif i%3==1:
patch.set_facecolor('lightblue')
patch.set_edgecolor('black')
else:
patch.set_facecolor('darkblue')
patch.set_edgecolor('black')
patch.set_alpha(0.8)
plt.xticks(list(xrange(len([''] + xlabel_list))))
ax.set_xticklabels([''] + xlabel_list, rotation = 90)
plt.savefig(folder + 'explained_variance_yeast_datasets.pdf', bbox_inches = 'tight', dpi = 300)
if __name__=="__main__":
gen_env_prep = genetic_environmental_effect_prep.execute(folder = sys.argv[1])
gen_env = genetic_environmental_effects.execute(folder = sys.argv[1])
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource is Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.