Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the protein table and the per-source mutation tables.
# The files are tab-separated, so the separator must be exactly "\t"; a
# multi-character separator would be interpreted by pandas as a regex.
protein = pd.read_csv("summarys/protein.tsv", sep="\t", usecols=["uniprot_acc", "id_protein"])
mutation_clinvar = pd.read_csv("summarys/mutation_clinvar.tsv", sep="\t", usecols=["id_protein"])
mutation_humsavar = pd.read_csv("summarys/mutation_humsavar.tsv", sep="\t", usecols=["id_protein"])

# Count how many mutations each protein has in each source.
humsavar_counts = mutation_humsavar["id_protein"].value_counts().reset_index()
humsavar_counts.columns = ["id_protein", "count_humsavar"]
clinvar_counts = mutation_clinvar["id_protein"].value_counts().reset_index()
clinvar_counts.columns = ["id_protein", "count_clinvar"]

# Attach the counts to every protein; left joins keep proteins without mutations.
protein = protein.merge(humsavar_counts, on="id_protein", how="left")
protein = protein.merge(clinvar_counts, on="id_protein", how="left")

# Proteins missing from a source get a count of 0. Only the count columns are
# filled so that missing values in other columns are not turned into 0.
count_columns = ["count_humsavar", "count_clinvar"]
protein[count_columns] = protein[count_columns].fillna(0).astype(int)

# Total number of mutations across both sources.
protein["total_mutations"] = protein["count_humsavar"] + protein["count_clinvar"]

# Link proteins to their MLO and role through the association table.
protein_has_mlo = pd.read_csv("summarys/protein_has_mlo.tsv", sep="\t")
protein_has_mlo["id_mlo"] = protein_has_mlo["id_mlo"].astype(int)
protein_and_mlo = protein.merge(right=protein_has_mlo, on="id_protein")

rol = pd.read_csv("summarys/rol.tsv", sep="\t")
mlo = pd.read_csv("summarys/mlo.tsv", sep="\t")
mlo["id_mlo"] = mlo["id_mlo"].astype(int)

# Replace the foreign keys with the readable role and MLO columns.
protein_and_mlo = protein_and_mlo.merge(rol, on="id_rol")
protein_and_mlo.drop(columns="id_rol", inplace=True)
protein_and_mlo = protein_and_mlo.merge(mlo, on="id_mlo")
protein_and_mlo.drop(columns="id_mlo", inplace=True)

# Keep only the rows that belong to the 5 most frequent MLOs.
top_5_mlo = protein_and_mlo["mlo"].value_counts().nlargest(5).index
df_filtered = protein_and_mlo[protein_and_mlo["mlo"].isin(top_5_mlo)]
df_filtered
Code
# Horizontal box plot: one group of boxes per MLO, split by role.
plt.figure(figsize=(12, 8))
sns.set(style="whitegrid", palette="muted")
sns.boxplot(data=df_filtered, y="mlo", x="total_mutations", hue="rol")
plt.xticks(rotation=45)
# The x axis carries the mutation counts and the y axis the MLO categories,
# so the labels must follow that orientation.
plt.xlabel("Mutaciones totales")
plt.ylabel("MLO")
plt.title("Distribución de Mutaciones Totales por MLO y Rol", fontsize=14)
plt.legend(title="Rol", loc="upper right", bbox_to_anchor=(1.15, 1))
plt.tight_layout()
plt.show()
Code
# Horizontal box plot: one group of boxes per role, split by MLO.
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_filtered, y="rol", x="total_mutations", hue="mlo")
plt.xticks(rotation=45)
# x holds the mutation counts, y the roles and the hue encodes the MLO, so the
# axis labels and the legend title are set to match.
plt.xlabel("Mutaciones totales")
plt.ylabel("Rol")
plt.title("Distribución de Mutaciones Totales por MLO y Rol", fontsize=14)
plt.legend(title="MLO", loc="upper right", bbox_to_anchor=(1.15, 1))
plt.tight_layout()
plt.show()
Code
import plotly.express as px
# Interactive box plot of total mutations per MLO, built from 'df_filtered'
# (which must already exist). The hover tooltip shows the MLO, the mutation
# total and the UniProt accession.
hover_columns = {"mlo": True, "total_mutations": True, "uniprot_acc": True}
fig = px.box(
    df_filtered,
    x="mlo",
    y="total_mutations",
    title="Distribución de Mutaciones por MLO",
    hover_data=hover_columns,
)
fig.show()
Code
# Box plot (outliers hidden) with the individual data points overlaid as a
# dodged strip plot, grouped by MLO and coloured by role.
plt.figure(figsize=(14, 8))
sns.set(style="whitegrid", palette="Set2")

shared_kwargs = dict(data=df_filtered, y="mlo", x="total_mutations", hue="rol")
ax = sns.boxplot(showfliers=False, width=0.6, **shared_kwargs)
sns.stripplot(dodge=True, size=4, alpha=1, jitter=True, **shared_kwargs)

# Only the first n legend entries are kept (n = number of distinct roles),
# presumably because both layers register their own entries for each role.
handles, labels = ax.get_legend_handles_labels()
n = len(set(df_filtered["rol"]))
ax.legend(handles[:n], labels[:n], title="Rol", bbox_to_anchor=(1.05, 1), loc="upper left")

ax.set_xlabel("Mutaciones totales", fontsize=14)
ax.set_ylabel("MLO", fontsize=14)
ax.set_title("Distribución de Mutaciones Totales por MLO y Rol", fontsize=16)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Code
# Strip plot only: every protein is one point, grouped by MLO and coloured by role.
plt.figure(figsize=(14, 8))
sns.set(style="whitegrid", palette="Set2")
# Capture the axes of THIS figure; the legend handles must come from the plot
# drawn here, not from an axes object left over from an earlier cell.
ax = sns.stripplot(data=df_filtered, y="mlo", x="total_mutations", hue="rol", dodge=True, size=4, alpha=1, jitter=True)
handles, labels = ax.get_legend_handles_labels()
n = len(set(df_filtered["rol"]))
plt.legend(handles[:n], labels[:n], title="Rol", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.xlabel("Mutaciones totales", fontsize=14)
plt.ylabel("MLO", fontsize=14)
plt.title("Distribución de Mutaciones Totales por MLO y Rol", fontsize=16)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Code
# Box plot (outliers hidden) plus strip plot on a logarithmic x axis.
plt.figure(figsize=(10, 8))
ax = sns.boxplot(data=df_filtered, y="mlo", x="total_mutations", hue="rol", showfliers=False, width=0.6)
sns.stripplot(data=df_filtered, y="mlo", x="total_mutations", hue="rol", dodge=True, size=4, alpha=0.6, jitter=True)
# NOTE: a log axis cannot display non-positive values, so any rows with
# total_mutations == 0 are not drawn in this figure.
plt.xscale("log")
plt.xlabel("Mutaciones totales (escala logarítmica)", fontsize=14)
plt.ylabel("MLO", fontsize=14)
plt.title("Distribución de Mutaciones Totales por MLO y Rol (escala logarítmica)", fontsize=16)
# Read the legend entries from this figure's axes instead of reusing values
# left over from an earlier cell; keep the first n (one per distinct role).
handles, labels = ax.get_legend_handles_labels()
n = len(set(df_filtered["rol"]))
plt.legend(handles[:n], labels[:n], title="Rol", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Code
# Strip plot only, on a logarithmic x axis.
plt.figure(figsize=(10, 8))
# Capture the axes of THIS figure so the legend is built from the points
# drawn here rather than from objects left over from an earlier cell.
ax = sns.stripplot(data=df_filtered, y="mlo", x="total_mutations", hue="rol", dodge=True, size=4, alpha=0.6, jitter=True)
# NOTE: a log axis cannot display non-positive values, so any rows with
# total_mutations == 0 are not drawn in this figure.
plt.xscale("log")
plt.xlabel("Mutaciones totales (escala logarítmica)", fontsize=14)
plt.ylabel("MLO", fontsize=14)
plt.title("Distribución de Mutaciones Totales por MLO y Rol (escala logarítmica)", fontsize=16)
handles, labels = ax.get_legend_handles_labels()
n = len(set(df_filtered["rol"]))
plt.legend(handles[:n], labels[:n], title="Rol", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
El outlier más marcado es Q8WZ42
Análisis de mutaciones benignas, patogénicas y de significado incierto en Humsavar
Code
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap
from datetime import datetime
import os
# Load the datasets. The files are tab-separated, so the separator must be
# exactly "\t"; a multi-character separator would be treated as a regex.
humsavar = pd.read_csv("data/humsavar.tsv", sep="\t")  # carries the pathogenicity category
protein = pd.read_csv("summarys/protein.tsv", sep="\t", usecols=["uniprot_acc"])
mutation_disease = pd.read_csv("results/humsavar_diseases.tsv", sep="\t")  # links mutations to diseases

# Keep only variants whose disease name contains an OMIM reference ("MIM:").
# na=False makes rows with a missing disease name evaluate to False instead of
# producing a mask with missing values; this filter also discards the "-"
# placeholder, so no further clean-up of disease_name is required.
humsavar = humsavar[humsavar["disease_name"].str.contains("MIM:", na=False)]

# Inner merge: keep only variants of proteins listed in the protein table.
humsavar = humsavar.merge(protein, left_on="uniprot", right_on="uniprot_acc")

# Collapse to one row per OMIM entry: take the last disease name of the group
# and gather all of its non-missing variant ids into a comma-separated string.
mutation_disease = (
    mutation_disease.groupby("OMIM")
    .agg({
        "disease": "last",
        "ft_id": lambda ft_ids: ",".join(str(ft_id) for ft_id in ft_ids if pd.notna(ft_id)),
    })
    .reset_index()
    .drop(columns=["OMIM"])
)

# Expand back to one row per (disease, variant id) pair.
mutation_disease["ft_id"] = mutation_disease["ft_id"].str.split(",")
mutation_disease = mutation_disease.explode("ft_id")
mutation_disease["ft_id"] = mutation_disease["ft_id"].str.strip()

# A single inner merge both restricts the table to variants kept in humsavar
# and attaches their category; duplicated rows are removed afterwards.
variant_categories = humsavar[["ft_id", "category"]].drop_duplicates()
mutation_disease = mutation_disease.merge(variant_categories, on="ft_id")
mutation_disease.drop_duplicates(inplace=True)
def wrap_text(text, max_width=20):
    """Wrap *text* onto several lines so it fits as a plot label.

    Args:
        text: The label to wrap. Values that are not strings (for example a
            missing value) are returned unchanged.
        max_width: Maximum number of characters allowed on each line.

    Returns:
        The wrapped lines joined with a bare newline, so that no line gains
        leading or trailing spaces and none exceeds ``max_width``.
    """
    if not isinstance(text, str):
        return text
    return "\n".join(textwrap.wrap(text, width=max_width))
# Wrap long disease names so they fit as axis labels.
mutation_disease["disease"] = mutation_disease["disease"].apply(wrap_text)

# Number of mutations per disease (rows) and category (columns).
counts = mutation_disease.groupby(["disease", "category"]).size().unstack(fill_value=0)

# The 5 diseases with the most mutations overall.
top_5 = counts.sum(axis=1).nlargest(5).index

# Fix the column order used by the plots. reindex (rather than plain column
# selection) adds a zero-filled column when a category never occurs, instead
# of raising a KeyError.
counts_top5 = counts.loc[top_5].reindex(columns=["LP/P", "US", "LB/B"], fill_value=0)

# Share of each category within every disease, in percent.
percentages = counts_top5.div(counts_top5.sum(axis=1), axis=0) * 100
counts_top5
Code
# Segment colours, in the column order of counts_top5: red, green, blue.
colores = ["#ff5656", "#a2d41c", "#5bbad3"]

# Horizontal stacked bars: one bar per disease, one segment per category.
ax = counts_top5.plot(
    kind="barh",
    stacked=True,
    color=colores,
    width=0.7,
    figsize=(10, 5),
)

# Label every non-empty segment at its centre with the count and its share of
# the bar total.
for i, (_, row) in enumerate(counts_top5.iterrows()):
    total = row.sum()
    accum = 0  # left edge of the current segment
    for value in row:
        if value > 0:
            ax.text(
                accum + value / 2,
                i,
                f"{value}\n({value / total * 100:.1f}%)",
                ha="center",
                va="center",
                color="white",
                fontweight="bold",
                fontsize=7,
            )
        accum += value

plt.title("Top 5 enfermedades con mutaciones en proteínas LLPS", fontsize=14, pad=20)
plt.xlabel("Número de mutaciones")
plt.ylabel("")
plt.legend(
    ["Patogénicas (LP/P)", "Significado incierto (US)", "Benignas (LB/B)"],
    frameon=False,
    bbox_to_anchor=(1, 1),
)
sns.despine(left=True, bottom=True)
plt.tight_layout()
plt.show()
Code
# Grouped (non-stacked) bars: for each of the top 5 diseases, one bar per
# mutation category.
grouped_ax = counts_top5.plot(
    kind="bar",
    stacked=False,
    color=colores,
    figsize=(12, 6),
)
grouped_ax.set_title("Mutaciones por categoría (top 5 enfermedades)", fontsize=14)
grouped_ax.set_ylabel("Número de mutaciones")
plt.xticks(rotation=45, ha="right")
grouped_ax.legend(title="Categoría", bbox_to_anchor=(1.05, 1))
sns.despine()
plt.tight_layout()
plt.show()