Reporte de datos para SEH2 Bioinfo

Author

Garcia Justo

Published

March 26, 2024

Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load only the columns needed to count mutation records per protein.
protein = pd.read_csv("summarys/protein.tsv", sep="\t", usecols=["uniprot_acc", "id_protein"])
mutation_clinvar = pd.read_csv("summarys/mutation_clinvar.tsv", sep="\t", usecols=["id_protein"])
mutation_humsavar = pd.read_csv("summarys/mutation_humsavar.tsv", sep="\t", usecols=["id_protein"])

# Number of mutation rows per protein in each source. rename_axis() plus
# reset_index(name=...) names both output columns explicitly instead of
# overwriting whatever labels value_counts() happens to produce.
humsavar_counts = (
    mutation_humsavar["id_protein"]
    .value_counts()
    .rename_axis("id_protein")
    .reset_index(name="count_humsavar")
)
clinvar_counts = (
    mutation_clinvar["id_protein"]
    .value_counts()
    .rename_axis("id_protein")
    .reset_index(name="count_clinvar")
)

# Left merges keep every protein, including those with no mutation rows.
protein = protein.merge(humsavar_counts, on="id_protein", how="left")
protein = protein.merge(clinvar_counts, on="id_protein", how="left")

# A protein absent from a counts table gets NaN from the left merge. Fill
# only the count columns with 0 (a frame-wide fillna would also overwrite
# missing values in unrelated columns such as uniprot_acc) and restore the
# integer dtype that the NaN forced to float.
for count_col in ("count_humsavar", "count_clinvar"):
    protein[count_col] = protein[count_col].fillna(0).astype(int)

# Total number of mutation rows across both sources.
protein["total_mutations"] = protein["count_humsavar"] + protein["count_clinvar"]

# Lookup tables joined onto `protein`: the protein<->MLO association table
# and the tables that are joined through id_rol and id_mlo.
protein_has_mlo = pd.read_csv("summarys/protein_has_mlo.tsv", sep="\t")
protein_has_mlo["id_mlo"] = protein_has_mlo["id_mlo"].astype(int)

rol = pd.read_csv("summarys/rol.tsv", sep="\t")

mlo = pd.read_csv("summarys/mlo.tsv", sep="\t")
mlo["id_mlo"] = mlo["id_mlo"].astype(int)

# Inner joins: protein -> association rows -> role -> MLO. Each surrogate
# key is dropped as soon as it has served its join.
protein_and_mlo = (
    protein.merge(right=protein_has_mlo, on="id_protein")
    .merge(rol, on="id_rol")
    .drop(columns="id_rol")
    .merge(mlo, on="id_mlo")
    .drop(columns="id_mlo")
)

# The five MLO values with the most rows in the joined table.
mlo_frequencies = protein_and_mlo["mlo"].value_counts()
top_5_mlo = mlo_frequencies.nlargest(5).index

# Keep only the rows that belong to one of those five MLOs.
is_top_mlo = protein_and_mlo["mlo"].isin(top_5_mlo)
df_filtered = protein_and_mlo[is_top_mlo]
df_filtered
Table 1: Resumen de proteínas asociadas con los 5 MLOs más frecuentes
uniprot_acc id_protein count_humsavar count_clinvar total_mutations id_dataset id_proteinmlo rol mlo
2 O00571 11 20 566 586 2 28 driver nucleolus
3 O00571 11 20 566 586 2 29 driver p-body
4 O00571 11 20 566 586 2 30 driver stress_granule
5 O00571 11 20 566 586 2 31 driver postsynaptic_density
6 O00571 11 20 566 586 4 32 driver stress_granule
... ... ... ... ... ... ... ... ... ...
11364 Q9Y6U7 5739 1 33 34 5 11655 client nucleolus
11365 Q9Y6V0 2623 2 2772 2774 2 11656 client postsynaptic_density
11366 Q9Y6W6 5740 0 30 30 5 11648 client nucleolus
11370 V9GYY5 1751 0 2023 2023 2 11652 client nucleolus
11372 V9GYY5 1751 0 2023 2023 2 11654 client p-body

8560 rows × 9 columns

Code
# Figure 1: horizontal box plots of total mutations, one group per MLO,
# split by role through the hue.
plt.figure(figsize=(12, 8))
sns.set(style="whitegrid", palette="muted")

sns.boxplot(data=df_filtered, y="mlo", x="total_mutations", hue="rol")
plt.xticks(rotation=45)
# The x axis carries total_mutations and the y axis carries mlo, so the
# labels must follow that mapping (they were previously swapped).
plt.xlabel("Mutaciones totales")
plt.ylabel("MLO")
plt.title("Distribución de Mutaciones Totales por MLO y Rol", fontsize=14)

# Anchor the legend past the right edge of the axes.
plt.legend(title="Rol", loc="upper right", bbox_to_anchor=(1.15, 1))
plt.tight_layout()

plt.show()
Figure 1: Boxplot de mutaciones por Mlo y Rol
Code
# Figure 2: same data as the previous box plot with the grouping inverted:
# one group per role on the y axis, split by MLO through the hue.
plt.figure(figsize=(12, 8))

sns.boxplot(data=df_filtered, y="rol", x="total_mutations", hue="mlo")
plt.xticks(rotation=45)
# Labels follow the actual mapping: x = total_mutations, y = rol.
plt.xlabel("Mutaciones totales")
plt.ylabel("Rol")
plt.title("Distribución de Mutaciones Totales por MLO y Rol", fontsize=14)

# The hue variable is mlo, so the legend lists MLOs, not roles.
plt.legend(title="MLO", loc="upper right", bbox_to_anchor=(1.15, 1))
plt.tight_layout()

plt.show()
Figure 2: Boxplot de mutaciones por Rol y MLO, con la agrupación invertida respecto de la Figura 1 para ver cómo se diferencian
Code
import plotly.express as px

# Interactive box plot of total mutations per MLO, built from df_filtered.
# The hover tooltip shows the MLO, the mutation total and the UniProt
# accession of each point.
hover_columns = {"mlo": True, "total_mutations": True, "uniprot_acc": True}
fig = px.box(
    df_filtered,
    x="mlo",
    y="total_mutations",
    title="Distribución de Mutaciones por MLO",
    hover_data=hover_columns,
)

fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
Figure 3: Boxplot interactivo de mutaciones por MLO con plotly
Code
# Figure 4: box plots (outliers hidden) overlaid with the individual
# observations as a dodged strip plot.
plt.figure(figsize=(14, 8))
sns.set(style="whitegrid", palette="Set2")

# Both layers share the same data and variable mapping.
mapping = dict(data=df_filtered, y="mlo", x="total_mutations", hue="rol")
ax = sns.boxplot(**mapping, showfliers=False, width=0.6)
sns.stripplot(**mapping, dodge=True, size=4, alpha=1, jitter=True)

# Keep only the first n legend entries, where n is the number of distinct
# role values, so each role is listed once.
handles, labels = ax.get_legend_handles_labels()
n = len(set(df_filtered["rol"]))
ax.legend(handles[:n], labels[:n], title="Rol", bbox_to_anchor=(1.05, 1), loc="upper left")

ax.set_xlabel("Mutaciones totales", fontsize=14)
ax.set_ylabel("MLO", fontsize=14)
ax.set_title("Distribución de Mutaciones Totales por MLO y Rol", fontsize=16)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Figure 4: Boxplot y stripplot de mutaciones por Mlo y Rol
Code
# Figure 5: strip plot only (no boxes) of total mutations per MLO and role.
plt.figure(figsize=(14, 8))
sns.set(style="whitegrid", palette="Set2")

# Capture the Axes of THIS figure. Previously `ax` was only assigned in a
# commented-out line, so the legend handles below were read from whatever
# Axes an earlier cell had left behind (or raised NameError when this cell
# was run on its own).
ax = sns.stripplot(data=df_filtered, y="mlo", x="total_mutations", hue="rol", dodge=True, size=4, alpha=1, jitter=True)

# Keep at most one legend entry per distinct role value.
handles, labels = ax.get_legend_handles_labels()
n = len(set(df_filtered["rol"]))
plt.legend(handles[:n], labels[:n], title="Rol", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.xlabel("Mutaciones totales", fontsize=14)
plt.ylabel("MLO", fontsize=14)
plt.title("Distribución de Mutaciones Totales por MLO y Rol", fontsize=16)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Figure 5: Stripplot de mutaciones por Mlo y Rol
Code
# Figure 6: box plots (outliers hidden) plus strip plot on a log x axis.
plt.figure(figsize=(10, 8))

ax = sns.boxplot(data=df_filtered, y="mlo", x="total_mutations", hue="rol", showfliers=False, width=0.6)
sns.stripplot(data=df_filtered, y="mlo", x="total_mutations", hue="rol", dodge=True, size=4, alpha=0.6, jitter=True)

# NOTE(review): a log axis cannot display total_mutations == 0; confirm
# such rows are absent or that omitting them is acceptable.
plt.xscale("log")
plt.xlabel("Mutaciones totales (escala logarítmica)", fontsize=14)
plt.ylabel("MLO", fontsize=14)
plt.title("Distribución de Mutaciones Totales por MLO y Rol (escala logarítmica)", fontsize=16)

# Read the legend handles from THIS figure's Axes. The lookup used to be
# commented out, so the legend was built from handles left over from an
# earlier cell. Keep the first n entries, one per distinct role value.
handles, labels = ax.get_legend_handles_labels()
n = len(set(df_filtered["rol"]))
plt.legend(handles[:n], labels[:n], title="Rol", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Figure 6: Boxplot y stripplot de mutaciones por Mlo y Rol en escala logarítmica
Code
# Figure 7: strip plot only (no boxes) on a log x axis.
plt.figure(figsize=(10, 8))

# Capture the Axes of THIS figure so the legend below is built from it;
# `ax` used to be assigned only in a commented-out line.
ax = sns.stripplot(data=df_filtered, y="mlo", x="total_mutations", hue="rol", dodge=True, size=4, alpha=0.6, jitter=True)

# NOTE(review): a log axis cannot display total_mutations == 0; confirm
# such rows are absent or that omitting them is acceptable.
plt.xscale("log")
plt.xlabel("Mutaciones totales (escala logarítmica)", fontsize=14)
plt.ylabel("MLO", fontsize=14)
plt.title("Distribución de Mutaciones Totales por MLO y Rol (escala logarítmica)", fontsize=16)

# The handle lookup used to be commented out as well, so the legend reused
# handles from an earlier cell. Keep at most one entry per distinct role.
handles, labels = ax.get_legend_handles_labels()
n = len(set(df_filtered["rol"]))
plt.legend(handles[:n], labels[:n], title="Rol", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Figure 7: Stripplot de mutaciones por Mlo y Rol en escala logarítmica

El outlier más marcado es Q8WZ42

Análisis de Benignas/Patogénicas/Incierto para Humsavar

Code
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap
from datetime import datetime
import os

# Load datasets.
humsavar = pd.read_csv("data/humsavar.tsv", sep="\t")  # provides the pathogenicity "category" column
protein = pd.read_csv("summarys/protein.tsv", sep="\t", usecols=["uniprot_acc"])
mutation_disease = pd.read_csv("results/humsavar_diseases.tsv", sep="\t")  # links mutations (ft_id) to diseases

# Keep only variants whose disease annotation contains "MIM:".
# na=False makes a missing annotation count as "no match"; without it the
# mask contains NaN and boolean indexing raises. This filter also removes
# the "-" placeholder rows, so no separate replace/dropna step is needed.
humsavar = humsavar[humsavar["disease_name"].str.contains("MIM:", na=False)]

# Inner join: keep only variants of proteins listed in the protein table.
humsavar = humsavar.merge(protein, left_on="uniprot", right_on="uniprot_acc")

# One row per OMIM value: the last disease name of the group and all of the
# group's ft_id values joined into a single comma-separated string
# (missing ids skipped). OMIM is only the grouping key and is dropped.
mutation_disease = (
    mutation_disease.groupby("OMIM")
    .agg({
        "disease": "last",
        "ft_id": lambda ids: ",".join(str(ft) for ft in ids if pd.notna(ft)),
    })
    .reset_index()
    .drop(columns=["OMIM"])
)

# Flatten back to one row per (disease, ft_id); strip the whitespace that
# may surround individual ids after splitting on commas.
mutation_disease["ft_id"] = mutation_disease["ft_id"].str.split(",")
mutation_disease = mutation_disease.explode("ft_id")
mutation_disease["ft_id"] = mutation_disease["ft_id"].str.strip()

# Attach the category of each variant. A single inner merge against the
# de-duplicated (ft_id, category) pairs both restricts the table to the
# variants kept in humsavar and adds the category column.
variant_categories = humsavar[["ft_id", "category"]].drop_duplicates()
mutation_disease = mutation_disease.merge(variant_categories, on="ft_id")
mutation_disease.drop_duplicates(inplace=True)



def wrap_text(text, max_width=20):
    """Wrap *text* into newline-separated lines of at most *max_width* characters."""
    return textwrap.fill(text, width=max_width)

# Wrap long disease names so they fit as axis tick labels.
mutation_disease['disease'] = mutation_disease['disease'].map(wrap_text)

# Rows = disease, columns = category, cells = number of mutation rows
# (0 where a disease has no mutation in a category).
counts = mutation_disease.groupby(['disease', 'category']).size().unstack(fill_value=0)

# The five diseases with the most mutations across all categories, with the
# category columns in a fixed order.
disease_totals = counts.sum(axis=1)
top_5 = disease_totals.nlargest(5).index
counts_top5 = counts.loc[top_5, ['LP/P', 'US', 'LB/B']]

# Share of each category within a disease, in percent.
percentages = counts_top5.div(counts_top5.sum(axis=1), axis=0).mul(100)

counts_top5
Table 2: Enfermedades y recuento de mutaciones pertenecientes a cada categoría
category LP/P US LB/B
disease
Hemophilia A (HEMA) 466 3 1
Marfan syndrome\n(MFS) 318 2 1
Brugada syndrome 1\n(BRGDA1) 86 143 0
Breast cancer (BC) 85 71 46
Cardiomyopathy,\nfamilial\nhypertrophic, 1\n(CMH1) 170 2 0
Code
# One colour per category column, in column order: red, green, blue.
colores = ['#ff5656', '#a2d41c', '#5bbad3']

# Horizontal stacked bars: one bar per disease, one segment per category.
ax = counts_top5.plot.barh(stacked=True, color=colores, width=0.7, figsize=(10, 5))

# Write "count (percentage)" at the centre of every non-empty segment.
for i, (_, row) in enumerate(counts_top5.iterrows()):
    total = row.sum()
    left_edge = 0  # x position where the current segment starts
    for value in row:
        if value <= 0:
            continue
        ax.text(
            left_edge + value / 2,
            i,
            f"{value} \n ({value/total*100:.1f}%)",
            ha='center',
            va='center',
            color='white',
            fontweight='bold',
            fontsize=7,
        )
        left_edge += value

plt.title("Top 5 enfermedades con mutaciones en proteínas LLPS", fontsize=14, pad=20)
plt.xlabel("Número de mutaciones")
plt.ylabel("")
category_labels = ['Patogénicas (LP/P)', 'Significado incierto (US)', 'Benignas (LB/B)']
plt.legend(category_labels, frameon=False, bbox_to_anchor=(1, 1))
sns.despine(left=True, bottom=True)
plt.tight_layout()

plt.show()
Figure 8: Barplot para las 5 enfermedades más frecuentes en Humsavar y su categoría de patogenicidad.
Code
# Same counts as the stacked chart, drawn as grouped (side-by-side)
# vertical bars: one group per disease, one bar per category.
bar_ax = counts_top5.plot.bar(stacked=False, color=colores, figsize=(12, 6))
bar_ax.set_title("Mutaciones por categoría (top 5 enfermedades)", fontsize=14)
bar_ax.set_ylabel("Número de mutaciones")
# Tilt the disease names and right-align them under their bar groups.
plt.xticks(rotation=45, ha='right')
bar_ax.legend(title="Categoría", bbox_to_anchor=(1.05, 1))
sns.despine()
plt.tight_layout()

plt.show()
Figure 9: Barplot para las 5 enfermedades más frecuentes en Humsavar y su categoría de patogenicidad, con barras agrupadas (sin apilar).