# FIGURE 1: Python Code
class figure1:
    """Build the panels of figure 1.

    The class is a namespace of static methods that load pre-computed input
    tables and write four PDF files: example abundance profiles, a
    co-expression matrix and two ROC-curve panels ("track 1" and "track 2").

    Folder arguments are concatenated directly with file names (no separator
    is inserted), so every folder passed in must end with a path separator.

    The methods rely on names that are expected to be imported at module
    level: ``DataFrameAnalyzer``, ``utilsFacade``, ``plottingFacade``,
    ``sns`` and ``plt``.
    """

    # ------------------------------------------------------------------
    # Entry points
    # ------------------------------------------------------------------
    @staticmethod
    def execute(**kwargs):
        """Produce every panel of the figure.

        Keyword arguments:
            folder: input folder (default ``'PATH'``).
            output_folder: folder the PDFs are written to (default ``'PATH'``).
        """
        folder = kwargs.get('folder', 'PATH')
        output_folder = kwargs.get('output_folder', 'PATH')

        print('FIGURE1:main_figure1_elements')
        figure1.main_figure1_elements(folder=folder, output_folder=output_folder)

        print('FIGURE1:main_figure1_rocCurves')
        figure1.main_figure1_rocCurves(folder=folder, output_folder=output_folder)

    @staticmethod
    def main_figure1_elements(**kwargs):
        """Load the protein table and draw the profile and matrix panels.

        Keyword arguments:
            folder: input folder (default ``'PATH'``).
            output_folder: folder the PDFs are written to (default ``'PATH'``).
        """
        folder = kwargs.get('folder', 'PATH')
        output_folder = kwargs.get('output_folder', 'PATH')

        print('get_data')
        data = figure1.get_data(folder=folder)

        print('sub_figure_abundanceProfiles')
        figure1.sub_figure_abundanceProfiles(data, output_folder=output_folder)

        print('sub_figure_coexpressionMatrix')
        figure1.sub_figure_coexpressionMatrix(data, output_folder=output_folder)

    @staticmethod
    def main_figure1_rocCurves(**kwargs):
        """Collect the ROC inputs and draw both ROC panels.

        STRING inputs are read from ``folder + 'string_roc_data/'``; all other
        inputs are read from ``folder`` itself.

        Keyword arguments:
            folder: input folder (default ``'PATH'``).
            output_folder: folder the PDFs are written to (default ``'PATH'``).
        """
        folder = kwargs.get('folder', 'PATH')
        output_folder = kwargs.get('output_folder', 'PATH')
        string_fileFolder = folder + 'string_roc_data/'
        folders = [string_fileFolder, folder, output_folder]

        print("return_areaDict")
        # The threshold lists returned alongside the dictionaries are not
        # needed for plotting.
        areaDict1, _ = figure1.return_areaDict_track1(folders)
        areaDict2, _ = figure1.return_areaDict_track2(string_fileFolder)

        print("sub_figure_rocCurve")
        figure1.sub_figure_rocCurve_track1(areaDict1, output_folder)
        figure1.sub_figure_rocCurve_track2(areaDict2, output_folder)

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_rates(text):
        """Turn a comma-separated string of numbers into a list of floats.

        A real list (not a lazy ``map`` object) is returned so the values can
        be iterated more than once on Python 3 as well as Python 2.
        """
        return [float(value) for value in text.split(",")]

    @staticmethod
    def _roc_entry(table):
        """Return ``(area, tp, fp)`` taken from the first row of ``table``.

        ``table`` must offer the columns ``"area"``, ``"TP"`` and ``"FP"``
        via ``table[name]``; the ``TP``/``FP`` cells are comma-separated
        strings. Raises ``IndexError`` when ``table`` has no rows.
        """
        area = list(table["area"])[0]
        tp = figure1._parse_rates(list(table["TP"])[0])
        fp = figure1._parse_rates(list(table["FP"])[0])
        return area, tp, fp

    @staticmethod
    def _new_figure(grid):
        """Apply the shared plotting style and return a fresh 5x5 figure.

        ``grid`` is written to ``plt.rcParams["axes.grid"]``.
        """
        sns.set(context='notebook', style='white',
                palette='deep', font='Liberation Sans', font_scale=1,
                color_codes=False, rc=None)
        plt.rcParams["axes.grid"] = grid
        plt.clf()
        return plt.figure(figsize=(5, 5))

    @staticmethod
    def _save_figure(fig, path):
        """Write ``fig`` to ``path`` and close it so figures do not pile up."""
        fig.savefig(path, bbox_inches="tight", dpi=400)
        plt.close(fig)

    @staticmethod
    def _plot_roc_panel(curves, labels, colors, out_path):
        """Draw one ROC panel and save it to ``out_path``.

        ``curves`` is a sequence of ``(fp, tp)`` pairs; curve *i* is drawn in
        ``colors[i]``. ``labels`` and the full ``colors`` list are handed to
        ``plottingFacade.make_full_legend`` unchanged.
        """
        fig = figure1._new_figure(grid=True)
        ax = fig.add_subplot(111)

        # Dashed grey diagonal as visual reference.
        diagonal = list(utilsFacade.frange(0, 1, 0.1))
        ax.plot(diagonal, diagonal,
                color="grey", alpha=0.85, linewidth=2, linestyle="--")

        for (fp, tp), color in zip(curves, colors):
            ax.plot(fp, tp, color=color, alpha=0.85, linewidth=2)

        plottingFacade.make_full_legend(ax, labels, colors, fontsize=12)
        ax.set_xlabel("False Positive Rate", fontsize=13)
        ax.set_ylabel("True Positive Rate", fontsize=13)
        # Booleans instead of the legacy "off" strings: a non-empty string is
        # truthy, so recent matplotlib versions would keep the ticks visible.
        ax.tick_params(axis="x", which="both", bottom=False, top=False, labelsize=12)
        ax.tick_params(axis="y", which="both", left=False, right=False, labelsize=12)
        ax.set_title("ROC curves for collecting AUC-values", fontsize=14)

        figure1._save_figure(fig, out_path)

    # ------------------------------------------------------------------
    # Data loading
    # ------------------------------------------------------------------
    @staticmethod
    def get_data(**kwargs):
        """Load ``complex_filtered_battle_protein.tsv.gz`` from ``folder``.

        Keyword arguments:
            folder: input folder (default ``'PATH'``).

        Returns whatever ``DataFrameAnalyzer.open_in_chunks`` returns.
        """
        folder = kwargs.get('folder', 'PATH')
        return DataFrameAnalyzer.open_in_chunks(
            folder, 'complex_filtered_battle_protein.tsv.gz')

    # ------------------------------------------------------------------
    # Panels built from the protein table
    # ------------------------------------------------------------------
    @staticmethod
    def sub_figure_coexpressionMatrix(data, output_folder):
        """Plot the correlation matrix of four selected complexes.

        Rows whose ``ComplexID`` is one of the four hard-coded identifiers
        are kept, restricted to the quantitative columns, cut to the first
        600 rows and stripped of rows with missing values. The result is
        written to ``fig1a_coexpression_matrix.pdf`` in ``output_folder``.
        """
        quantCols = utilsFacade.get_quantCols(data)
        selected = data[data.ComplexID.isin(
            ["HC2308", "Nucleopore Complex", "HC2224", "HC2191"])]
        quantData = selected[quantCols][0:600].dropna()

        fig = figure1._new_figure(grid=False)
        ax = fig.add_subplot(111)
        altProteinList = list(quantData.index)
        plottingFacade.func_plotCorrelationMatrix(ax, quantData, altProteinList)

        figure1._save_figure(fig, output_folder + "fig1a_coexpression_matrix.pdf")

    @staticmethod
    def sub_figure_abundanceProfiles(data, output_folder):
        """Plot three example profiles stacked in one figure.

        After dropping rows with missing values, rows 0, 100 and 200 (by
        position) of the quantitative columns are drawn in orange, blue and
        green. ``IndexError`` is raised when fewer than 201 complete rows
        exist. The result is written to
        ``fig1a_protein_profile_examples.pdf`` in ``output_folder``.
        """
        quantCols = utilsFacade.get_quantCols(data)
        quantData = data[quantCols].dropna()

        # The original set axes.grid to False and immediately to True; only
        # the final value had any effect.
        fig = figure1._new_figure(grid=True)

        # (subplot position, row position in quantData, line colour)
        panels = ((311, 0, "orange"), (312, 100, "blue"), (313, 200, "green"))
        for position, row_index, color in panels:
            ax = fig.add_subplot(position)
            profile = list(quantData.iloc[row_index])
            ax.plot(range(len(profile)), profile, color=color, linewidth=2)
            ax.set_yticklabels([])
            ax.set_xticklabels([])
            if position == 312:
                # Shared y label sits on the middle panel.
                ax.set_ylabel("expression/abundance", fontsize=20)
            elif position == 313:
                # Shared x label sits on the bottom panel.
                ax.set_xlabel("individuals", fontsize=20)

        figure1._save_figure(
            fig, output_folder + "fig1a_protein_profile_examples.pdf")

    # ------------------------------------------------------------------
    # ROC input collection
    # ------------------------------------------------------------------
    @staticmethod
    def return_areaDict_track1(folders):
        """Collect all ROC inputs for the ``battle_protein`` dataset.

        Args:
            folders: ``[string_fileFolder, folder, output_folder]``.

        Returns:
            ``(areaDict, thresholdList)``. ``areaDict["battle_protein"]``
            maps ``"STRING"`` to ``{score: [(area, tp, fp)]}`` for scores
            100..900 and each other category name to ``[(area, tp, fp)]``.
            ``thresholdList`` is parsed from the first ``threshold`` cell of
            the STRING table.
        """
        keyname = "battle_protein"
        category = "combined_score"
        string_fileFolder, folder, output_folder = folders

        areaDict = {keyname: {'STRING': {}}}
        fileName = '_'.join(['string_rocCurves_inputData', keyname,
                             category, 'complexExcluded']) + '.txt.gz'
        fileData = DataFrameAnalyzer.getFile(string_fileFolder, fileName)
        thresholdList = figure1._parse_rates(fileData["threshold"].iloc[0])

        for score in range(100, 1000, 100):
            rows = fileData[fileData["stringScore"] == score]
            areaDict[keyname]["STRING"].setdefault(score, []).append(
                figure1._roc_entry(rows))

        areaDict = figure1.get_compartment(areaDict, folder, keyname)
        areaDict = figure1.get_pathways(areaDict, folder, keyname)
        areaDict = figure1.get_complexes(areaDict, folder, keyname)
        return areaDict, thresholdList

    @staticmethod
    def return_areaDict_track2(string_fileFolder):
        """Collect the STRING ROC inputs for all five datasets.

        Returns:
            ``(areaDict, thresholdList)`` where ``areaDict[dataset]["STRING"]``
            maps each score 100..900 to ``[(area, tp, fp)]``.
            ``thresholdList`` comes from the last dataset processed.
        """
        category = "combined_score"
        keynames = ["battle_protein", "battle_ribo", "battle_rna", "gygi1", "gygi2"]
        areaDict = dict((name, dict()) for name in keynames)
        thresholdList = None

        for keyname in keynames:
            print(keyname)
            string_curves = areaDict[keyname].setdefault("STRING", {})
            fileName = '_'.join(['string_rocCurves_inputData', keyname,
                                 category, 'complexExcluded']) + '.txt.gz'
            fileData = DataFrameAnalyzer.getFile(string_fileFolder, fileName, sep="\t")
            thresholdList = figure1._parse_rates(fileData["threshold"].iloc[0])

            for score in range(100, 1000, 100):
                rows = fileData[fileData["stringScore"] == score]
                string_curves.setdefault(score, []).append(
                    figure1._roc_entry(rows))

        return areaDict, thresholdList

    @staticmethod
    def get_pathways(areaDict, pat_fileFolder, keyname="battle_protein"):
        """Append the pathway ROC entry to ``areaDict[keyname]["pathway"]``.

        ``keyname`` was an undefined name in the original body (NameError);
        it is now a parameter defaulting to the dataset the input file name
        refers to. Returns the updated ``areaDict``.
        """
        fileName = "pathway_rocCurves_inputData_battle_protein_diffMethod.txt.gz"
        fileData = DataFrameAnalyzer.getFile(pat_fileFolder, fileName)
        areaDict.setdefault(keyname, {}).setdefault("pathway", []).append(
            figure1._roc_entry(fileData))
        return areaDict

    @staticmethod
    def get_complexes(areaDict, com_fileFolder, keyname="battle_protein"):
        """Append the 3-member and 5-member complex ROC entries.

        Entries are stored under ``"complex_3members"`` and
        ``"complex_5members"`` in ``areaDict[keyname]``. ``keyname`` was an
        undefined name in the original body (NameError); it is now a
        parameter. Returns the updated ``areaDict``.
        """
        target = areaDict.setdefault(keyname, {})
        for size in ("3members", "5members"):
            fileName = "complex_rocCurves_inputData_battle_protein_" + size + ".txt.gz"
            fileData = DataFrameAnalyzer.getFile(com_fileFolder, fileName)
            target.setdefault("complex_" + size, []).append(
                figure1._roc_entry(fileData))
        return areaDict

    @staticmethod
    def get_compartment(areaDict, loc_fileFolder, keyname="battle_protein"):
        """Append the compartment and MitoCarta ROC entries.

        Four entries are added to ``areaDict[keyname]``: ``"compartment"``,
        ``"nucleus"``, ``"mitocarta_mitoDomain"`` and
        ``"mitocarta_sharedDomain"``. Returns the updated ``areaDict``.

        Fixes relative to the original body:
          * ``keyname`` was undefined (NameError); it is now a parameter.
          * the "Nucleus" area was read from the ``FP`` column.
          * TP/FP were taken from the first row of the *unfiltered* table,
            so every category received the same curve; they now come from
            the same filtered row as the area.
        """
        target = areaDict.setdefault(keyname, {})

        # (input file, column used for filtering, ((column value, result key), ...))
        sources = (
            ("compartment_rocCurves_inputData_battle_protein.txt", "dataset_name",
             (("localization", "compartment"), ("Nucleus", "nucleus"))),
            ("mitocarta_rocCurves_inputData_battle_protein.txt", "category",
             (("mitoDomain", "mitocarta_mitoDomain"),
              ("sharedDomain", "mitocarta_sharedDomain"))),
        )
        for fileName, column, selections in sources:
            fileData = DataFrameAnalyzer.getFile(loc_fileFolder, fileName)
            for value, result_key in selections:
                rows = fileData[fileData[column] == value]
                target.setdefault(result_key, []).append(
                    figure1._roc_entry(rows))
        return areaDict

    # ------------------------------------------------------------------
    # ROC panels
    # ------------------------------------------------------------------
    @staticmethod
    def sub_figure_rocCurve_track1(areaDict, output_folder):
        """Plot the ROC curves of four annotation sources for ``battle_protein``.

        Curves drawn: 5-member complexes, pathways, compartments and STRING
        at score 700. Output: ``fig1a_roc_curve_examples_TRACK1.pdf`` in
        ``output_folder``.
        """
        dataset = areaDict["battle_protein"]
        _, tp_complex, fp_complex = dataset["complex_5members"][0]
        _, tp_pat, fp_pat = dataset["pathway"][0]
        _, tp_compa, fp_compa = dataset["compartment"][0]
        _, tp_string700, fp_string700 = dataset["STRING"][700][0]

        figure1._plot_roc_panel(
            curves=[(fp_complex, tp_complex), (fp_pat, tp_pat),
                    (fp_compa, tp_compa), (fp_string700, tp_string700)],
            labels=["complexes", "pathways", "compartments", "STRING interactions"],
            colors=["red", "green", "blue", "darkblue"],
            out_path=output_folder + "fig1a_roc_curve_examples_TRACK1.pdf")

    @staticmethod
    def sub_figure_rocCurve_track2(areaDict, output_folder):
        """Plot the STRING (score 700) ROC curve of each of the five datasets.

        Output: ``fig1a_roc_curve_examples_TRACK2.pdf`` in ``output_folder``.
        """
        datasets = ["battle_protein", "battle_ribo", "battle_rna", "gygi1", "gygi2"]
        curves = []
        for name in datasets:
            _, tp, fp = areaDict[name]["STRING"][700][0]
            curves.append((fp, tp))

        figure1._plot_roc_panel(
            curves=curves,
            labels=["Dataset1", "Dataset2", "Dataset3", "Dataset4", "Dataset5"],
            # Six colours for five curves, as in the original colour list that
            # is handed to the legend helper.
            colors=["red", "green", "blue", "darkblue", "darkgreen", "darkred"],
            out_path=output_folder + "fig1a_roc_curve_examples_TRACK2.pdf")
if __name__ == "__main__":
    # Imported here because no module-level import of sys is visible.
    import sys

    # EXECUTE FIGURE1
    # Usage: <script> <input folder> <output folder>
    if len(sys.argv) < 3:
        sys.exit("usage: " + sys.argv[0] + " <input folder> <output folder>")

    # execute() reads the keyword 'output_folder'; the original passed
    # 'input_folder', which was silently ignored, so output went to the
    # default location.
    figure1.execute(folder=sys.argv[1], output_folder=sys.argv[2])
# All scripts were developed by Natalie Romanov (Bork group, EMBL). The source
# code used in the analysis of protein complex variability across individuals
# is released under the GNU General Public License v3.0. All scripts on this
# website/web resource are Copyright (C) 2019 Natalie Romanov, Michael Kuhn,
# Ruedi Aebersold, Alessandro Ori, Martin Beck, Peer Bork and EMBL.