# STEP 12: Python code -- machine-learning analysis (Ridge regression) of
# sex/diet effects on protein-complex subunit abundances.
class step12_machine_learning_setup(object):
    """Ridge-regression analysis of sex/diet effects on one protein complex.

    Instantiating the class runs the whole per-job pipeline:

    1. ``load_dictionary``  -- map job id -> complex name (JSON file).
    2. ``load_data``        -- read the job-specific split input matrix.
    3. ``run_multivariate_analysis``          (if run_multivariate == 'True')
    4. ``run_combined_multivariate_analysis`` (if run_combined == 'True')

    NOTE(review): boolean switches arrive as the *strings* 'True'/'False'
    because the class is driven from the command line; that convention is
    kept for backward compatibility.
    """

    def __init__(self, jobid, dataset, module, datatype, **kwargs):
        self.folder = kwargs.get('folder', 'PATH')
        self.randomize = kwargs.get('randomize', 'False')
        self.run_combined = kwargs.get('run_combined', 'False')
        self.run_multivariate = kwargs.get('run_multivariate', 'True')
        self.jobid = int(jobid) - 1  # cluster job ids are 1-based
        self.module = module
        self.dataset = dataset
        self.datatype = datatype
        print('load_dictionary')
        self.load_dictionary()
        print('load_data')
        self.data, self.name, self.subunit_names, self.relevant_filename = self.load_data()
        if self.run_multivariate == 'True':
            print('run_multivariate_analysis')
            self.res, self.res_module, self.res_subunit = self.run_multivariate_analysis()
        if self.run_combined == 'True':
            print('run_combined_multivariate_analysis')
            self.res = self.run_combined_multivariate_analysis()

    def load_dictionary(self):
        """Load the jobid -> complex-name dictionary from the first JSON
        file in ``self.folder`` whose name contains the datatype."""
        file_list = os.listdir(self.folder)
        # materialize the filter so indexing also works on Python 3,
        # where filter() returns a lazy iterator
        json_list = [f for f in file_list
                     if f.endswith('.json') and f.find(self.datatype) != -1]
        print(json_list)
        self.name_dict = json.load(open(self.folder + json_list[0], 'rb'))  # 53 is eif2b

    def load_data(self):
        """Read the job-specific split input file.

        Returns (matrix, complex_name, subunit_names, filename), or a
        (None, None, None, None) tuple when no file matches this job id
        (the original returned a bare None, which broke the caller's
        4-way unpacking).
        """
        jobid = self.jobid
        folder = self.folder + 'split_complex_input_files/'
        file_list = os.listdir(folder)
        # BUGFIX: the original filtered an undefined `final_inputs`;
        # the candidate list is the directory listing
        final_inputs = utilsFacade.filtering(file_list, self.module, condition='find')
        final_inputs = utilsFacade.filtering(final_inputs, self.datatype, condition='find')
        relevant_filename = utilsFacade.filtering(final_inputs, 'Num' + str(jobid) + '.')
        if len(relevant_filename) == 0:
            return None, None, None, None
        print(relevant_filename[0])
        m = DataFrameAnalyzer.getFile(folder, relevant_filename[0])
        # sex/diet annotation rows; the last column is dropped below
        diet_cols = m.loc['diet'][:-1]
        sex_cols = m.loc['sex'][:-1]
        m = m[:-2].dropna(axis=0, how='any')
        m = m.T
        m['sex'] = sex_cols
        m['diet'] = diet_cols
        m = m.T
        name = self.name_dict[str(jobid)]
        subunit_names = [c for c in m.index if c not in {'sex', 'diet'}]
        return m, name, subunit_names, relevant_filename[0]

    def randomize_dataset(self, m):
        """Shuffle each protein row independently; the sex/diet columns
        are carried over untouched."""
        sex_list = list(m.sex)
        diet_list = list(m.diet)
        values_only = m.drop(['sex', 'diet'], axis=1)
        shuffled_rows = list()
        for _, row in values_only.iterrows():
            vals = list(row)
            np.random.shuffle(vals)
            shuffled_rows.append(vals)
        shuffled = pd.DataFrame(shuffled_rows,
                                columns=values_only.columns,
                                index=values_only.index)
        shuffled['sex'] = sex_list
        shuffled['diet'] = diet_list
        return shuffled

    def _prepare_design_matrix(self):
        """Return the samples x (subunits + sex + diet) design matrix.

        Drops the trailing column, transposes to samples-as-rows,
        disambiguates duplicated subunit names ('NAME', 'NAME.1', ...),
        encodes sex (M -> True) and diet (H -> True) as booleans, and
        optionally row-shuffles the data (self.randomize == 'True').
        """
        m = self.data
        last_col = list(m.columns)[-1]
        m = m.drop([last_col], axis=1).T
        # count previous occurrences of each subunit name to build
        # pandas-style disambiguated column labels
        occurrences = list()
        seen = list()
        for su in self.subunit_names:
            occurrences.append(seen.count(su))
            seen.append(su)
        renamed = [su if c == 0 else su + '.' + str(c)
                   for su, c in zip(self.subunit_names, occurrences)]
        m.columns = renamed + ['sex', 'diet']
        m['sex'] = [sex == 'M' for sex in m['sex']]
        m['diet'] = [diet == 'H' for diet in m['diet']]
        # DataFrame.convert_objects() was removed from pandas; apply()
        # keeps the old best-effort numeric conversion semantics
        m = m.apply(pd.to_numeric, errors='ignore')
        print(m.shape)
        if self.randomize == 'True':
            # BUGFIX: randomize_dataset is a method, not a free function
            m = self.randomize_dataset(m)
        return m

    @staticmethod
    def _median_summary(res, group_cols, value_cols):
        """Collapse `res` to one row per `group_cols` combination, keeping
        the group's n.subunits and the median of every value column."""
        rows = list()
        for idx, grp in res.groupby(group_cols):
            rows.append(list(idx) + [list(grp['n.subunits'])[0]] +
                        [np.median(grp[col]) for col in value_cols])
        return pd.DataFrame(rows, columns=group_cols + ['n.subunits'] + value_cols)

    def run_multivariate_analysis(self):
        """10-fold Ridge regression of subunit abundances on each covariate
        (sex, diet) separately.

        Writes three tables (per-fold, module-level medians, subunit-level
        medians) to <folder>/(randomized_)multivariate/ and returns them;
        returns ('failed', 'failed', 'failed') when the matrix is too small.
        """
        folder = self.folder
        dataset = self.dataset
        complex_name = self.name
        subunit_names = self.subunit_names
        relevant_filename = self.relevant_filename
        m = self._prepare_design_matrix()
        nkfold = 10
        table = list()
        if m.shape[1] > 2:
            for covariate in [['sex'], ['diet']]:
                y_ref = np.array(m[subunit_names])
                X = np.array(m[covariate])
                kf = KFold(n_splits=nkfold, shuffle=True, random_state=500)
                ki = 0
                for train, test in kf.split(X):
                    X_train, X_test = X[train], X[test]
                    y_ref_train, y_ref_test = y_ref[train], y_ref[test]
                    # BUGFIX: use two separate estimators -- the original
                    # refit a single Ridge instance, so the "train-only"
                    # model silently became the full-data model
                    model = Ridge(random_state=500).fit(X_train, y_ref_train)
                    model_all = Ridge(random_state=500).fit(X, y_ref)
                    y_pred_train = model.predict(X_train)
                    y_pred_test = model.predict(X_test)
                    y_pred_all = model_all.predict(X)
                    coefficients = model.coef_
                    mean_se = mean_squared_error(y_ref_test, y_pred_test)  # mean squared error
                    r2_all = r2_score(y_ref, y_pred_all)
                    r2_train = r2_score(y_ref_train, y_pred_train)
                    r2_test = r2_score(y_ref_test, y_pred_test)  # variance score
                    # per-subunit R^2 (one score per output column)
                    r2_list_train = [r2_score(a, b) for a, b in zip(y_ref_train.T, y_pred_train.T)]
                    r2_list_test = [r2_score(a, b) for a, b in zip(y_ref_test.T, y_pred_test.T)]
                    r2_list_all = [r2_score(a, b) for a, b in zip(y_ref.T, y_pred_all.T)]
                    explained_score_train = explained_variance_score(y_ref_train, y_pred_train)
                    explained_score_test = explained_variance_score(y_ref_test, y_pred_test)
                    explained_score_all = explained_variance_score(y_ref, y_pred_all)
                    # per-subunit residual variance and coefficient standard errors
                    sse = np.sum((y_pred_test - y_ref_test) ** 2, axis=0) / float(X_test.shape[0] - X_test.shape[1])
                    try:
                        xtx_inv = np.linalg.inv(np.dot(X_test.T, X_test))
                    except np.linalg.LinAlgError:
                        # singular design (e.g. constant boolean covariate):
                        # fall back to the identity, as in the original
                        xtx_inv = np.linalg.inv(np.array([[True]]))
                    se = np.array([np.sqrt(np.diagonal(sse[i] * xtx_inv))
                                   for i in range(sse.shape[0])])
                    t = model.coef_ / se
                    dof = y_pred_test.shape[0] - X_test.shape[1]
                    p = 2 * (1 - scipy.stats.t.cdf(np.abs(t), dof))
                    pval_adj = list(RFacade.get_bh_pvalues(p))  # Benjamini-Hochberg
                    t_stats_per_module = np.mean(model.coef_) / mean_se
                    p_per_module = 2 * (1 - scipy.stats.t.cdf(np.abs(t_stats_per_module), dof))
                    ki += 1
                    for ccount, (su, r2) in enumerate(zip(subunit_names, r2_list_train)):
                        table.append([ki, complex_name, su, y_ref.shape[1], '_'.join(covariate),
                                      mean_se, r2_list_all[ccount], r2, r2_list_test[ccount],
                                      r2_all, r2_train, r2_test, explained_score_all,
                                      explained_score_train, explained_score_test,
                                      coefficients[ccount][0], sse[ccount], se[ccount][0],
                                      t[ccount][0], p[ccount][0], pval_adj[ccount],
                                      t_stats_per_module, p_per_module])
            value_cols = ['mean_se', 'r2.all.subunit', 'r2.train.subunit', 'r2.test.subunit',
                          'r2.all.module', 'r2.train.module', 'r2.test.module',
                          'explained_variance_score.all.module',
                          'explained_variance_score.training.module',
                          'explained_variance_score.testing.module',
                          'coefficient', 'sse', 'se', 't', 'pvalue', 'pval.adj',
                          't_stats_per_module', 'pvalue_per_module']
            res = pd.DataFrame(table, columns=['ki', 'complex.name', 'subunit',
                                               'n.subunits', 'covariates'] + value_cols)
            res_module = self._median_summary(res, ['complex.name', 'covariates'], value_cols)
            # group per subunit in module
            res_subunit = self._median_summary(res, ['complex.name', 'covariates', 'subunit'], value_cols)
            print('writing')
            if self.randomize == 'True':
                suffix = '_randomized_multi_covariate.tsv.gz'
                subfolder = 'randomized_multivariate/'
            else:
                suffix = '_multi_covariate.tsv.gz'
                subfolder = 'multivariate/'
            output_filename = dataset + '_' + relevant_filename.split('.')[0] + suffix
            output_folder = folder + subfolder
            if os.path.exists(output_folder) == False:
                os.makedirs(output_folder)
            print(folder, output_folder)
            res.to_csv(output_folder + output_filename,
                       compression='gzip', sep='\t')
            stem = output_folder + output_filename.split('.')[0]
            res_module.to_csv(stem + '_moduleSpecific.tsv.gz',
                              compression='gzip', sep='\t')
            res_subunit.to_csv(stem + '_subunitSpecific.tsv.gz',
                               compression='gzip', sep='\t')
            return res, res_module, res_subunit
        else:
            return 'failed', 'failed', 'failed'

    def run_combined_multivariate_analysis(self):
        """10-fold Ridge regression on sex and diet jointly ('sex_diet').

        Same outputs as run_multivariate_analysis (without per-coefficient
        p-values), written to <folder>/combined_(randomized_)multivariate/.
        """
        folder = self.folder
        dataset = self.dataset
        complex_name = self.name
        subunit_names = self.subunit_names
        relevant_filename = self.relevant_filename
        print('randomize:', self.randomize)
        m = self._prepare_design_matrix()
        nkfold = 10
        table = list()
        if m.shape[1] > 2:
            y_ref = np.array(m[subunit_names])
            X = np.array(m[['sex', 'diet']])
            kf = KFold(n_splits=nkfold, shuffle=True, random_state=500)
            ki = 0
            for train, test in kf.split(X):
                X_train, X_test = X[train], X[test]
                y_ref_train, y_ref_test = y_ref[train], y_ref[test]
                # BUGFIX: separate estimators (see run_multivariate_analysis)
                model = Ridge(random_state=500).fit(X_train, y_ref_train)
                model_all = Ridge(random_state=500).fit(X, y_ref)
                y_pred_train = model.predict(X_train)
                y_pred_test = model.predict(X_test)
                y_pred_all = model_all.predict(X)
                coefficients = model.coef_
                mean_se = mean_squared_error(y_ref_test, y_pred_test)  # mean squared error
                r2_all = r2_score(y_ref, y_pred_all)
                r2_train = r2_score(y_ref_train, y_pred_train)
                r2_test = r2_score(y_ref_test, y_pred_test)  # variance score
                r2_list_train = [r2_score(a, b) for a, b in zip(y_ref_train.T, y_pred_train.T)]
                r2_list_test = [r2_score(a, b) for a, b in zip(y_ref_test.T, y_pred_test.T)]
                r2_list_all = [r2_score(a, b) for a, b in zip(y_ref.T, y_pred_all.T)]
                explained_score_train = explained_variance_score(y_ref_train, y_pred_train)
                explained_score_test = explained_variance_score(y_ref_test, y_pred_test)
                explained_score_all = explained_variance_score(y_ref, y_pred_all)
                sse = np.sum((y_pred_test - y_ref_test) ** 2, axis=0) / float(X_test.shape[0] - X_test.shape[1])
                try:
                    xtx_inv = np.linalg.inv(np.dot(X_test.T, X_test))
                except np.linalg.LinAlgError:
                    # singular design: fall back to the identity
                    xtx_inv = np.linalg.inv(np.array([[True]]))
                se = np.array([np.sqrt(np.diagonal(sse[i] * xtx_inv))
                               for i in range(sse.shape[0])])
                t = model.coef_ / se
                dof = y_pred_test.shape[0] - X_test.shape[1]
                t_stats_per_module = np.mean(model.coef_) / mean_se
                p_per_module = 2 * (1 - scipy.stats.t.cdf(np.abs(t_stats_per_module), dof))
                ki += 1
                for ccount, (su, r2) in enumerate(zip(subunit_names, r2_list_train)):
                    table.append([ki, complex_name, su, y_ref.shape[1], '_'.join(['sex', 'diet']),
                                  mean_se, r2_list_all[ccount], r2, r2_list_test[ccount],
                                  r2_all, r2_train, r2_test, explained_score_all,
                                  explained_score_train, explained_score_test,
                                  coefficients[ccount][0], sse[ccount], se[ccount][0],
                                  t[ccount][0], t_stats_per_module, p_per_module])
            value_cols = ['mean_se', 'r2.all.subunit', 'r2.train.subunit', 'r2.test.subunit',
                          'r2.all.module', 'r2.train.module', 'r2.test.module',
                          'explained_variance_score.all.module',
                          'explained_variance_score.training.module',
                          'explained_variance_score.testing.module',
                          'coefficient', 'sse', 'se', 't',
                          't_stats_per_module', 'pvalue_per_module']
            res = pd.DataFrame(table, columns=['ki', 'complex.name', 'subunit',
                                               'n.subunits', 'covariates'] + value_cols)
            res_module = self._median_summary(res, ['complex.name', 'covariates'], value_cols)
            # group per subunit in module
            res_subunit = self._median_summary(res, ['complex.name', 'covariates', 'subunit'], value_cols)
            print('writing')
            if self.randomize == 'True':
                suffix = '_randomized_multi_covariate.tsv.gz'
                subfolder = 'combined_randomized_multivariate/'
            else:
                suffix = '_multi_covariate.tsv.gz'
                subfolder = 'combined_multivariate/'
            output_filename = dataset + '_' + relevant_filename.split('.')[0] + suffix
            output_folder = folder + subfolder
            if os.path.exists(output_folder) == False:
                os.makedirs(output_folder)
            print(folder, output_folder)
            res.to_csv(output_folder + output_filename,
                       compression='gzip', sep='\t')
            stem = output_folder + output_filename.split('.')[0]
            res_module.to_csv(stem + '_moduleSpecific.tsv.gz',
                              compression='gzip', sep='\t')
            res_subunit.to_csv(stem + '_subunitSpecific.tsv.gz',
                               compression='gzip', sep='\t')
            return res, res_module, res_subunit
        else:
            return 'failed', 'failed', 'failed'
class step12_summarizing:
    """Static helpers that merge the per-job result files written by
    step12_machine_learning_setup and attach empirical FDR estimates."""

    @staticmethod
    def _filter_inputs(file_list, datatype, suffix, drop_randomized):
        """Select result files ending in `suffix` for `datatype`, excluding
        already-merged ('_all_') files and, optionally, randomized runs."""
        hits = utilsFacade.filtering(file_list, suffix)
        hits = utilsFacade.filtering(hits, datatype)
        if drop_randomized:
            hits = utilsFacade.filtering(hits, 'randomized', condition='notfind')
        return utilsFacade.filtering(hits, '_all_', condition='notfind')

    @staticmethod
    def _concat_files(folder, input_list):
        """Load every file in `input_list` from `folder` and concatenate
        them into one DataFrame."""
        chunks = list()
        for fname in input_list:
            print(fname)
            chunks.append(DataFrameAnalyzer.open_in_chunks(folder, fname))
        return pd.concat(chunks)

    @staticmethod
    def _empirical_fdr(real_scores, random_scores):
        """Empirical FDR per score.

        Walk all scores from high to low and record the running fraction of
        random ('label 0') entries seen so far; returns a dict mapping each
        score to its FDR (for duplicated scores the last-seen value wins,
        as in the original implementation).
        """
        df = pd.DataFrame({'label': [1] * len(real_scores) + [0] * len(random_scores),
                           'r2': list(real_scores) + list(random_scores)})
        df = df.sort_values('r2', ascending=False)
        df = df.reset_index(drop=True)
        tp = 0
        fp = 0
        fdrs = []
        for _, row in df.iterrows():
            label = int(row['label'])
            tp += 1 if label == 1 else 0
            fp += 1 if label == 0 else 0
            fdrs.append(float(fp) / float(fp + tp))
        df['FDR'] = fdrs
        df.index = df.r2
        print(df.sort_values('FDR').head())
        return df['FDR'].to_dict()

    @staticmethod
    def merge_multivariate_result_files(dataset, module_type, datatype, **kwargs):
        """Concatenate all per-job module/subunit result tables for one
        datatype into single '_all_' files written next to the inputs."""
        folder = kwargs.get('folder', 'PATH')
        randomize = kwargs.get('randomize', False)
        if randomize == True:
            # BUGFIX(review): randomized per-job files are written by
            # step12_machine_learning_setup to <folder>/randomized_multivariate/
            # (not nested under multivariate/) -- read them from there
            folder = folder + 'randomized_multivariate/'
            file_list = os.listdir(folder)
            module_input_list = step12_summarizing._filter_inputs(
                file_list, datatype, '_randomized_multi_covariate_moduleSpecific.tsv.gz', False)
            subunit_input_list = step12_summarizing._filter_inputs(
                file_list, datatype, '_randomized_multi_covariate_subunitSpecific.tsv.gz', False)
            print(len(module_input_list), len(subunit_input_list))
            infix = '_data_all_randomized_multi_covariate_'
        else:
            folder = folder + 'multivariate/'
            file_list = os.listdir(folder)
            module_input_list = step12_summarizing._filter_inputs(
                file_list, datatype, '_multi_covariate_moduleSpecific.tsv.gz', True)
            subunit_input_list = step12_summarizing._filter_inputs(
                file_list, datatype, '_multi_covariate_subunitSpecific.tsv.gz', True)
            infix = '_data_all_multi_covariate_'
        print(module_type, len(module_input_list), len(subunit_input_list))
        module_data = step12_summarizing._concat_files(folder, module_input_list)
        subunit_data = step12_summarizing._concat_files(folder, subunit_input_list)
        print('writing')
        base = folder + '_'.join([dataset, module_type, datatype]) + infix
        module_data.to_csv(base + 'moduleSpecific.tsv.gz', sep='\t', compression='gzip')
        subunit_data.to_csv(base + 'subunitSpecific.tsv.gz', sep='\t', compression='gzip')

    @staticmethod
    def merge_multivariate_result_files_combined(dataset, module_type, datatype, **kwargs):
        """As merge_multivariate_result_files, but for the combined
        (sex+diet) analysis; outputs carry a _COMBINEDEFFECT suffix."""
        folder = kwargs.get('folder', 'PATH')
        randomize = kwargs.get('randomize', False)
        if randomize == True:
            # BUGFIX(review): combined randomized per-job files live directly
            # in <folder>/combined_randomized_multivariate/ -- match the writer
            folder = folder + 'combined_randomized_multivariate/'
            file_list = os.listdir(folder)
            module_input_list = step12_summarizing._filter_inputs(
                file_list, datatype, '_randomized_multi_covariate_moduleSpecific.tsv.gz', False)
            subunit_input_list = step12_summarizing._filter_inputs(
                file_list, datatype, '_randomized_multi_covariate_subunitSpecific.tsv.gz', False)
            infix = '_data_all_randomized_multi_covariate_'
        else:
            folder = folder + 'combined_multivariate/'
            file_list = os.listdir(folder)
            module_input_list = step12_summarizing._filter_inputs(
                file_list, datatype, '_multi_covariate_moduleSpecific.tsv.gz', True)
            subunit_input_list = step12_summarizing._filter_inputs(
                file_list, datatype, '_multi_covariate_subunitSpecific.tsv.gz', True)
            infix = '_data_all_multi_covariate_'
        print(module_type, len(module_input_list), len(subunit_input_list))
        module_data = step12_summarizing._concat_files(folder, module_input_list)
        subunit_data = step12_summarizing._concat_files(folder, subunit_input_list)
        print('writing')
        base = folder + '_'.join([dataset, module_type, datatype]) + infix
        module_data.to_csv(base + 'moduleSpecific_COMBINEDEFFECT.tsv.gz',
                           sep='\t', compression='gzip')
        subunit_data.to_csv(base + 'subunitSpecific_COMBINEDEFFECT.tsv.gz',
                            sep='\t', compression='gzip')

    @staticmethod
    def load_multivariate_module_specific_data(dataset, module_type, datatype, covariate, **kwargs):
        """Load the module-level empirical-FDR table, keeping only modules
        with at least `n_subunit` subunits."""
        n_subunit = kwargs.get('n_subunit', 0)
        folder = kwargs.get('folder', 'PATH')
        joint_name = '_'.join([dataset, module_type, datatype, covariate])
        filename = joint_name + '_data_all_multi_covariate_moduleSpecific_empiricalFDR.tsv.gz'
        data = DataFrameAnalyzer.open_in_chunks(folder, filename)
        data = data[data['n.subunits'] >= n_subunit]
        return data

    @staticmethod
    def load_multivariate_subunit_specific_data(dataset, module_type, datatype, covariate, **kwargs):
        """Load the subunit-level empirical-FDR table."""
        folder = kwargs.get('folder', 'PATH')
        filename = '_'.join([dataset, module_type, datatype, covariate]) + \
            '_data_all_multi_covariate_subunitSpecific_empiricalFDR.tsv.gz'
        data = DataFrameAnalyzer.open_in_chunks(folder, filename)
        return data

    @staticmethod
    def add_empirical_fdr_for_moduleSpecific_data(dataset, module_type, datatype, covariate, **kwargs):
        """Attach empirical FDR columns (derived from the randomized runs)
        to the module- and subunit-level tables and write them out."""
        folder = kwargs.get('folder', 'PATH')
        data = step12_summarizing.load_multivariate_module_specific_data(
            dataset, module_type, datatype, covariate, folder=folder)
        subunit_data = step12_summarizing.load_multivariate_subunit_specific_data(
            dataset, module_type, datatype, covariate, folder=folder)
        filename = dataset + '_randomized_sex_covariates.tsv.gz'
        rand_data = DataFrameAnalyzer.open_in_chunks(folder, filename)
        # module-level FDR
        fdr_dict = step12_summarizing._empirical_fdr(list(data['r2.all.module']),
                                                     list(rand_data['r2.all']))
        data['empirical.FDR.module'] = pd.Series(
            [fdr_dict[r2] for r2 in list(data['r2.all.module'])], index=data.index)
        fname = '_'.join([dataset, module_type, datatype, covariate]) + \
            '_data_all_multi_covariate_moduleSpecific_empiricalFDR.tsv.gz'
        data.to_csv(folder + fname, sep='\t', compression='gzip')
        ###########################################################################################
        # subunit-level FDR (column name kept as in the original output)
        fdr_dict = step12_summarizing._empirical_fdr(list(subunit_data['r2.all.subunit']),
                                                     list(rand_data['r2.all']))
        subunit_data['empirical.FDR.module'] = pd.Series(
            [fdr_dict[r2] for r2 in list(subunit_data['r2.all.subunit'])],
            index=subunit_data.index)
        fname = '_'.join([dataset, module_type, datatype, covariate]) + \
            '_data_all_multi_covariate_subunitSpecific_empiricalFDR.tsv.gz'
        subunit_data.to_csv(folder + fname, sep='\t', compression='gzip')
class step12:
    """Top-level driver for STEP 12 on the gygi3 dataset."""

    @staticmethod
    def execute(**kwargs):
        """Run the per-complex analyses for both data types, then merge the
        per-job outputs and attach empirical FDRs.

        Keyword args (all optional): folder, randomize, run_combined,
        run_multivariate. The flags are the *strings* 'True'/'False',
        matching the command-line interface.
        """
        folder = kwargs.get('folder', 'PATH')
        randomize = kwargs.get('randomize', 'False')
        run_combined = kwargs.get('run_combined', 'True')
        run_multivariate = kwargs.get('run_multivariate', 'True')
        for datatype in ['quant', 'stoichiometry']:
            # BUGFIX: forward `folder` -- the original omitted it, so the
            # analysis silently fell back to the 'PATH' default
            var = step12_machine_learning_setup(1, 'gygi3', 'complex', datatype,
                                                folder=folder,
                                                randomize=randomize,
                                                run_multivariate=run_multivariate,
                                                run_combined=run_combined)
            if run_multivariate == "True":
                m1 = step12_summarizing.merge_multivariate_result_files(
                    'gygi3', 'complex', datatype, folder=folder, randomize=randomize)
                step12_summarizing.add_empirical_fdr_for_moduleSpecific_data(
                    'gygi3', 'complex', datatype, 'sex', folder=folder)
                step12_summarizing.add_empirical_fdr_for_moduleSpecific_data(
                    'gygi3', 'complex', datatype, 'diet', folder=folder)
            if run_combined == "True":
                m2 = step12_summarizing.merge_multivariate_result_files_combined(
                    'gygi3', 'complex', datatype, folder=folder, randomize=randomize)
if __name__ == "__main__":
    # EXECUTE STEP12: positional command-line arguments are
    # folder, randomize, run_combined, run_multivariate
    step12.execute(folder=sys.argv[1],
                   randomize=sys.argv[2],
                   run_combined=sys.argv[3],
                   run_multivariate=sys.argv[4])
# All scripts were developed by Natalie Romanov (Bork group, EMBL). The source
# code used in the analysis of protein complex variability across individuals
# is released under the GNU General Public License v3.0. All scripts on this
# website/web resource are Copyright (C) 2019 Natalie Romanov, Michael Kuhn,
# Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.