STEP 2: Python Code
class file_Loader(object):
def __init__(self, **kwargs):
self.folder = kwargs.get('folder','PATH')
print('load_standard_dataframe_into_memory')
self.load_standard_dataframe_into_memory()
@staticmethod
def load_file(folder, filename):
return DataFrameAnalyzer.open_in_chunks(folder, filename, compression = 'gzip')
def load_standard_dataframe_into_memory(self):
self.gygi3 = file_Loader.load_file(self.folder,"dataset_gygi3_remapped.tsv.gz")
self.gygi2 = file_Loader.load_file(self.folder,"dataset_gygi2_remapped.tsv.gz")
self.gygi1 = file_Loader.load_file(self.folder,"dataset_gygi1_remapped.tsv.gz")
self.battle_protein = file_Loader.load_file(self.folder,"dataset_battle_protein_remapped.tsv.gz")
self.battle_ribo = file_Loader.load_file(self.folder,"dataset_battle_ribo_remapped.tsv.gz")
self.battle_rna = file_Loader.load_file(self.folder,"dataset_battle_rna_remapped.tsv.gz")
self.mann_all_log2 = file_Loader.load_file(self.folder,"dataset_mann_all_log2_remapped.tsv.gz")
self.wu = file_Loader.load_file(self.folder,"dataset_wu_remapped.tsv.gz")
self.tiannan = file_Loader.load_file(self.folder,"dataset_tiannan_remapped.tsv.gz")
self.primatePRO = file_Loader.load_file(self.folder,"dataset_primatePRO_remapped.tsv.gz")
self.primateRNA = file_Loader.load_file(self.folder,"dataset_primateRNA_remapped.tsv.gz")
self.tcga_ovarian = file_Loader.load_file(self.folder,"dataset_tcga_ovarian_remapped.tsv.gz")
self.tcga_breast = file_Loader.load_file(self.folder,"dataset_tcga_breast_remapped.tsv.gz")
self.bxd_protein = file_Loader.load_file(self.folder, 'dataset_bxdMouse_remapped.tsv.gz')
self.colo_cancer = file_Loader.load_file(self.folder, 'dataset_coloCa_remapped.tsv.gz')
@staticmethod
def get_quantCols(fileData):
columnList = list(fileData.columns)
return utilsFacade.filtering(columnList, 'quant')
class step2_check_normalization:
@staticmethod
def execute(wp, **kwargs):
folder = kwargs.get('folder','PATH')
output_folder = kwargs.get('output_folder','PATH')
datasets = [wp.gygi1, wp.gygi2, wp.gygi3, wp.battle_protein, wp.battle_ribo,
wp.battle_rna, wp.mann_all_log2, wp.wu, wp.tiannan, wp.primateRNA,
wp.primatePRO, wp.tcga_ovarian, wp.tcga_breast, wp.colo_cancer,
wp.bxd_protein]
names = ['gygi1','gygi2','gygi3','battle_protein','battle_ribo',
'battle_rna','mann_all_log2','wu','tiannan','primateRNA',
'primatePRO','tcga_ovarian','tcga_breast','colo_cancer',
'bxd_protein']
for dat,name in zip(datasets, names):
step2_check_normalization.check_normalization_boxplot(dat, name, folder)
@staticmethod
def check_normalization_boxplot(data, name, folder):
quant_cols = utilsFacade.filtering(list(data.columns),'quant_')
data = data[quant_cols]
sns.set(context='notebook', style='white',
palette='deep', font='Liberation Sans', font_scale=1,
color_codes=False, rc=None)
plt.rcParams["axes.grid"] = True
plt.clf()
fig = plt.figure(figsize = (10,5))
ax = fig.add_subplot(111)
plottingFacade.boxplot(ax, data)
plt.savefig(folder + 'normalization_check_' + name + '.pdf', bbox_inches = 'tight', dpi = 400)
#normal distribution of protein abundance within samples ensured
normal_check_list = list()
normal_pval_list = list()
for q in quant_cols:
w, p = scipy.stats.shapiro(utilsFacade.finite(list(data[q])))
normal_check_list.append(w)
normal_pval_list.append(p)
print(mean(normal_check_list))
print(mean(normal_pval_list))
class step2_complex_filtering(object):
def __init__(self, wp, **kwargs):
self.folder = kwargs.get('folder','PATH')
print("remapping_cycle")
self.remapping_cycle(wp, self.folder)
def remapping_cycle(self, wp, folder):
dataList = [wp.battle_protein, wp.battle_ribo, wp.battle_rna,
wp.mann_all_log2, wp.primatePRO, wp.primateRNA,
wp.tiannan, wp.wu, wp.tcga_ovarian, wp.tcga_breast,
wp.colo_cancer, wp.bxd_protein, wp.gygi1, wp.gygi2,
wp.gygi3]
nameList = ["battle_protein", "battle_ribo", "battle_rna",
"mann_all_log2", "primatePRO", "primateRNA",
"tiannan", "wu", 'tcga_ovarian', 'tcga_breast',
'tcga_colo','bxd_protein','gygi1','gygi2','gygi3']
for d,name in zip(dataList,nameList):
data = self.reduce_to_complex_proteins(d, name, folder)
def reduce_to_complex_proteins(self, data, name, folder):
complex_proteinData = data[data["complex_formal_ID"]>0]
complex_id_list = list(complex_proteinData["complex_formal_ID"])
complex_name_list = list(complex_proteinData["complexName"])
dfList = map(list,complex_proteinData.values)
idxList = list(complex_proteinData.index)
new_dfList = list()
count = 0
proteinList = list()
for com_id,com_name in zip(complex_id_list,complex_name_list):
df = dfList[count]
com_id = com_id.split(";;")
com_name = com_name.split(";;")
for i,n in zip(com_id,com_name):
tempList = df + [i,n]
new_dfList.append(tempList)
proteinList.append(idxList[count])
count += 1
new_df = pd.DataFrame(new_dfList)
new_df.columns = list(complex_proteinData.columns) + ['ComplexID','ComplexName']
new_df.index = proteinList
new_df["complex_search"] = new_df["ComplexID"] + "::" + new_df["ComplexName"]
complex_lst = list(new_df["complex_search"])
member_numList = list()
for complexID in complex_lst:
sub = new_df[new_df["complex_search"] == complexID]
member_numList.append(len(sub))
new_df["member_num"] = pd.Series(member_numList,index = new_df.index)
'''
new_df = new_df.rename(columns = {'complex_formal_ID':'complexID',
'complexName':'ComplexName'})
'''
new_df.to_csv(folder + 'complex_filtered_' + name + 'tsv.gz', sep = "\t", compression = 'gzip')
return new_df
class step2_stoichiometry_preparation(object):
def __init__(self,com_data,name,**kwargs):
self.name = name
self.com_data = com_data
self.folder = kwargs.get('folder','PATH')
self.log2_method = kwargs.get("log2_method","yes")
self.output_folder = kwargs.get('output_folder','PATH')
print("get_stoichiometry_data")
self.stoch_data = self.get_stoichiometry_data(com_data)
print("export_stoichiometry_data")
self.export_stoichiometry_data()
def export_stoichiometry_data(self):
stoch_data = self.stoch_data
stoch_data.to_csv(folder + self.name + "_remapped_complexStoichiometry.tsv.gz",
sep = "\t", compression = 'gzip')
def get_stoichiometry_data(self, com_data):
quant_cols = file_Loader.get_quantCols(com_data)
complex_lst = list(set(com_data["complex_search"]))
concat_list = list()
count = 0
for complex_id in complex_lst:
print(count,complex_id)
sub = com_data[com_data["complex_search"] == complex_id]
quant_sub = sub[quant_cols]
proteinList = list(quant_sub.index)
other_dict = dict()
for protein in proteinList:
other_complexes = self.check_protein_in_other_complexes(protein,com_data,complex_id)
if len(other_complexes) > 0:
other_dict.setdefault(protein,[]).append(other_complexes)
if len(other_dict) > 0:
mean_dict = self.get_average_value_from_other_complexes(other_dict,com_data,quant_cols)
else:
other_dict = dict()
mean_dict = dict()
quant_list = map(list,quant_sub.values)
trimmed_list = np.array(self.getTrimmedMeanList(quant_list))
stoch_list,relative_variance = self.calculate_relative_abundances(quant_list,proteinList,trimmed_list,mean_dict,other_dict)
stoch_data = pd.DataFrame(stoch_list)
stoch_data.columns = quant_sub.columns
stoch_data.index = quant_sub.index
stoch_data["relative_variance"] = pd.Series(relative_variance,index = stoch_data.index)
additional_columns = list(set(sub.columns).difference(set(stoch_data.columns)))
for col in additional_columns:
if col! = "SYMBOL.1.1":
stoch_data[col] = pd.Series(list(sub[col]),index = stoch_data.index)
concat_list.append(stoch_data)
count+ = 1
concat_stoch = pd.concat(concat_list)
return concat_stoch
def calculate_relative_abundances(self, quant_list, proteinList, trimmed_list, mean_dict, other_dict):
log2_method = self.log2_method
stoch_list = list()
relative_variance = list()
for q,prot in zip(quant_list,proteinList):
if len(other_dict) = 0:
if log2_method = "no":
tempList = np.array(q) - trimmed_list
else:
ratio_list = numpy.array(q)/numpy.array(trimmed_list)
ratio_list = np.array([x if x! = 0 else np.nan for x in list(ratio_list)])
tempList = numpy.log2(ratio_list)
else:
get_prot = mean_dict.get(prot,0)
if get_prot! = 0:
if log2_method = "no":
tempList = np.array(q) - np.array(mean_dict[prot][0])
else:
ratio_list = numpy.array(q)/numpy.array(mean_dict[prot][0])
ratio_list = np.array([x if x! = 0 else np.nan for x in list(ratio_list)])
tempList = numpy.log2(ratio_list)
else:
if log2_method = "no":
tempList = np.array(q) - trimmed_list
if log2_method = "yes":
ratio_list = numpy.array(q)/numpy.array(trimmed_list)
ratio_list = np.array([x if x! = 0 else np.nan for x in list(ratio_list)])
tempList = numpy.log2(ratio_list)
stoch_list.append(tempList)
relative_variance.append(np.var(utilsFacade.finite(tempList)))
return stoch_list,relative_variance
def get_average_value_from_other_complexes(self,other_dict,com_data,quant_cols):
mean_dict = dict()
for prot in other_dict:
trim_list = list()
for com_id in other_dict[prot][0]:
s = com_data[com_data["complex_search"] = = com_id]
quant_s = s[quant_cols]
quant = map(list,quant_s.values)
trimmed = self.getTrimmedMeanList(quant)
trim_list.append(trimmed)
mean_list = self.getMeanList(trim_list)
mean_dict.setdefault(prot,[]).append(mean_list)
return mean_dict
def getMeanList(self,lists):
log2_method = self.log2_method
"""
input: lists (nested lists)
This function calculates the medianProfile out of several lists.
"""
meanList = list()
for i in xrange(len(lists[0])):
tempList = list()
for l in lists:
if str(l[i])!= "nan":
tempList.append(l[i])
meanList.append(numpy.mean(filter(lambda a:str(a)!= "nan",tempList)))
if log2_method == "yes":
meanList = [x if x!= 0 else np.nan for x in meanList]
return meanList
def getTrimmedMeanList(self,lists):
log2_method = self.log2_method
"""
input: lists (nested lists)
This function calculates the medianProfile out of several lists.
"""
medianList = list()
for i in xrange(len(lists[0])):
tempList = list()
for l in lists:
if str(l[i])!= "nan":
tempList.append(l[i])
trimValue = utilsFacade.trimmean(filter(lambda a:str(a)!= "nan",tempList))
if trimValue = 0:
medianList.append(np.nan)
else:
medianList.append(trimValue)
if log2_method = "yes":
medianList = [x if x!=0 else np.nan for x in medianList]
return medianList
def check_protein_in_other_complexes(self,protein,com_data,complex_id):
temp = com_data.loc[protein]
if type(temp) = pd.DataFrame:
other_complexes = list(temp["complex_search"])
else:
other_complexes = list()
return other_complexes
class step2_variability_testing_subunits(object):
def __init__(self,data,name,**kwargs):
self.data = data
self.name = name
print("checking_variability")
self.concat_stoch = self.checking_variability()
print("export_concat_stoch")
self.export_concat_stoch()
def checking_variability(self):
stoch_data = self.data
stoch_data['complex_search'] = stoch_data['ComplexID'] + ':' + stoch_data['ComplexName']
concatList = list()
quant_cols = file_Loader.get_quantCols(stoch_data)
complex_list = list(set(stoch_data["complex_search"]))
outer_count = 0
for com in complex_list:
print(outer_count,com)
stoch = stoch_data[stoch_data["complex_search"] = = com]
sub = stoch[quant_cols].T
dataList = list()
proteins = list()
for p,protein in enumerate(list(sub.columns)):
d = utilsFacade.finite(list(sub.iloc[:,p]))
dataList.append(d)
proteins.append(protein)
levene_pval_list = list()
stateList = list()
coverageList = list()
count = 0
for p,d in zip(proteins,dataList):
others = utilsFacade.flatten(dataList[0:count] + dataList[count+1:])
try:
levene_pval_list.append(scipy.stats.levene(d,others)[1])
except:
levene_pval_list.append(np.nan)
coverageList.append((len(d)/len(quant_cols))*100)
if np.var(d) > np.var(others):
stateList.append("variable")
else:
stateList.append("stable")
count += 1
stoch["coverage"] = pd.Series(coverageList,index = stoch.index)
stoch["levene_pval"] = pd.Series(levene_pval_list,index = stoch.index)
stoch["state"] = pd.Series(stateList,index = stoch.index)
concatList.append(stoch)
outer_count += 1
concat_stoch = pd.concat(concatList)
levene = list(concat_stoch["levene_pval"])
lev = utilsFacade.finite(levene)
levene_corr = utilsFacade.correct_pvalues(lev)
adj_list = list()
for l in levene:
if str(l)! = "nan":
idx = lev.index(l)
adj_list.append(levene_corr[idx])
else:
adj_list.append(np.nan)
concat_stoch["levene_pval.adj"] = pd.Series(adj_list,index = concat_stoch.index)
return concat_stoch
def export_concat_stoch(self):
concat_stoch = self.concat_stoch
concat_stoch.to_csv(self.folder + self.name + "_remapped_complexStoichiometry_withVar.tsv.gz",
sep = "\t", compression = 'gzip')
class step2:
@staticmethod
def execute(wp, **kwargs):
folder = kwargs.get('folder','PATH')
print('run_norm_check')
step2.run_norm_check(wp, folder)
print('run_complex_filtering')
step2.run_complex_filtering(wp, folder)
print('run_complex_stoichiometry_preparation')
step2.run_complex_stoichiometry_preparation(wp, folder)
@staticmethod
def run_norm_check(wp, folder):
step2_check_normalization(wp, folder = folder)
@staticmethod
def run_complex_filtering(wp, folder):
step2_complex_filtering(wp, folder = folder)
@staticmethod
def run_complex_stoichiometry_preparation(wp, folder):
nameList = ["battle_protein", "battle_ribo", "battle_rna",
"mann_all_log2", "primatePRO", "primateRNA",
"tiannan", "wu", 'tcga_ovarian', 'tcga_breast',
'tcga_colo','bxd_protein','gygi1','gygi2','gygi3']
log_list = ['no','yes','yes','no','no','no','yes','no','yes',
'yes','yes','yes','yes','no','yes']
for name,log in zip(nameList, log_list):
d = DataFrameAnalyzer.getFile(folder + 'complex_filtered_' + name + '.tsv.gz')
stoch = step2_stoichiometry_preparation(d, name, log2_method = log)
stochvar = step2_variability_testing_subunits(stoch.stoch_data, name)
if __name__ == "__main__":
## EXECUTE STEP2
wp = file_Loader(folder = sys.argv[1])
step2.execute(wp, folder = sys.argv[1])
All scripts were developed by Natalie Romanov (Bork group, EMBL). The source code used in the analysis of protein complex variability across individuals is released under the GNU General Public License v3.0. All scripts on this website/web resource is Copyright (C) 2019 Natalie Romanov, Michael Kuhn, Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.