# summary.py
import os
import re
import time
from ast import literal_eval

import numpy as np
import pandas as pd

from find_patterns import substitute_or_remove_prot_id
def readData(archivoEntrada, enfermedad, patrones_file, Sal):
    """
    Reads the protein table, optionally filters it by disease, and records which proteins contain each pattern.

    Parameters:
    - archivoEntrada (str): Path to the Excel file. It must provide the columns
      'protein_id' and 'protein_sequence' (and 'disease_id' when filtering).
    - enfermedad (str): Disease ID for filtering; a falsy value disables the filter.
    - patrones_file (str): Path to the CSV file containing patterns (column 'Patron').
    - Sal (str): Suffix inserted in the output file name "ProtByPattern<Sal>.xlsx".

    Returns:
    - data (pd.DataFrame): The (optionally disease-filtered) protein table.

    Side effects:
    - Writes "ProtByPattern<Sal>.xlsx" with one row per pattern and the list of
      [protein_id, position] pairs of the proteins that contain it.
    - Prints progress counters to stdout.
    """
    data = pd.read_excel(archivoEntrada)

    # Remember the size BEFORE filtering so the discarded count below is real.
    rows_before_filter = len(data)
    if enfermedad:
        data = data.loc[data["disease_id"] == enfermedad]

    dataB = pd.read_csv(patrones_file)

    print(len(data))
    print("Proteins discarded after the main filter: " + str(rows_before_filter - len(data)))
    # No filter is applied to the pattern table in this function, so this count
    # is always 0; the line is kept so the log output keeps its shape.
    print("Proteins discarded after the common Alzheimer's filter: " + str(0))

    proteins_by_pattern = {}
    for pattern in dataB["Patron"].unique():
        # Skip missing values (NaN is not a string) and patterns of 3 characters or fewer.
        if not isinstance(pattern, str) or len(pattern) <= 3:
            continue

        # regex=False: match the pattern literally, exactly like str.find below,
        # so the membership test and the reported position always agree.
        # na=False: rows without a sequence count as "no match" instead of
        # producing a NaN mask that cannot be used for indexing.
        hit_mask = data["protein_sequence"].str.contains(pattern, regex=False, na=False)
        hits = data.loc[hit_mask]

        protein_ids = hits["protein_id"].to_list()
        # Index of the first occurrence of the pattern in each matching sequence.
        first_positions = hits["protein_sequence"].str.find(pattern).to_list()
        print(len(first_positions))
        print(len(protein_ids))

        proteins_by_pattern[pattern] = [
            [protein_id, position]
            for protein_id, position in zip(protein_ids, first_positions)
        ]

    dataG = pd.DataFrame(list(proteins_by_pattern.items()), columns=["pattern", "proteins"])
    dataG.to_excel("ProtByPattern" + Sal + ".xlsx")

    return data
def add_protein_info_to_data(main_data_path, patterns_info_path, protein_names_path):
    """
    Add protein names and the proteins listed in the original pattern file to the per-pattern table.

    Parameters:
    - main_data_path (str): The path to the Excel file containing the main data.
    - patterns_info_path (str): The path to the CSV file containing patterns and protein information.
    - protein_names_path (str): The path to the CSV file containing protein names.

    Returns:
    None: The result is written to a new Excel file named "<main_data_path without
    extension>_summary.xlsx"; the input file itself is not modified.

    Example:
    ```python
    add_protein_info_to_data("main_data.xlsx", "patterns_info.csv", "protein_names.csv")
    ```

    Note:
    - The function assumes that the provided Excel file ('main_data_path') contains
      'pattern' and 'proteins' columns, where 'proteins' holds the text form of a
      list of [protein_id, position] pairs.
    - The 'patterns_info_path' CSV file is expected to have columns 'Patron', 'Proteina', and 'Posiciones'.
    - The 'protein_names_path' CSV file is expected to have columns 'Entry' and 'Entry_Name'.
    - Rows whose pattern does not appear in 'patterns_info' keep an empty string in both new columns.
    - 'proteins_treat' holds the row's own proteins followed by EVERY
      [Proteina, Posiciones] pair listed for that pattern, in file order.
    - 'protein_names' holds, for each protein in 'proteins', the list of matching
      'Entry_Name' values (or "N/A" when the protein id is empty).
    """
    # Read data from files
    main_data = pd.read_excel(main_data_path)
    patterns_info = pd.read_csv(patterns_info_path)
    protein_names = pd.read_csv(protein_names_path)

    # pattern -> every [protein_id, positions] pair listed for it, so that all
    # rows of a pattern are kept rather than only the last one.
    extra_by_pattern = {}
    for pattern, protein_id, positions in zip(
        patterns_info["Patron"].tolist(),
        patterns_info["Proteina"].tolist(),
        patterns_info["Posiciones"].tolist(),
    ):
        if pd.isna(pattern):
            # A missing pattern can never equal a value in main_data["pattern"].
            continue
        extra_by_pattern.setdefault(pattern, []).append([protein_id, positions])

    # Entry -> list of Entry_Name values, built once instead of scanning the
    # whole names table for every protein of every row.
    names_by_entry = {}
    for entry, entry_name in zip(
        protein_names["Entry"].tolist(), protein_names["Entry_Name"].tolist()
    ):
        if pd.isna(entry):
            continue
        names_by_entry.setdefault(entry, []).append(entry_name)

    names_column = []
    treat_column = []
    for pattern, raw_proteins in zip(
        main_data["pattern"].tolist(), main_data["proteins"].tolist()
    ):
        extra = extra_by_pattern.get(pattern)
        if extra is None:
            # Pattern not present in the pattern file: leave both cells empty.
            names_column.append("")
            treat_column.append("")
            continue

        # 'proteins' is stored as text in the Excel file; a missing cell is
        # treated as "no proteins" for both derived columns.
        own_proteins = literal_eval(raw_proteins) if pd.notna(raw_proteins) else []

        names_column.append(
            [
                list(names_by_entry.get(protein_id, [])) if protein_id else "N/A"
                for protein_id, _ in own_proteins
            ]
        )
        treat_column.append(own_proteins + extra)

    # dtype=object keeps each list as a single cell value.
    main_data["protein_names"] = pd.Series(names_column, index=main_data.index, dtype=object)
    main_data["proteins_treat"] = pd.Series(treat_column, index=main_data.index, dtype=object)

    # Save the updated data. splitext only strips the final extension, so dots
    # elsewhere in the path (e.g. "./Data/x.xlsx") are preserved.
    main_data_base_name = os.path.splitext(main_data_path)[0]
    main_data.to_excel(f"{main_data_base_name}_summary.xlsx", index=False)


def add_entry_name(archivoEntrada, protein_name_file, archNom):
    """
    Exports the name records of the proteins found in the input table and lists the proteins that have none.

    Parameters:
    - archivoEntrada (str): Path to the Excel file (must contain a 'protein_id' column).
    - protein_name_file (str): Path to the protein name CSV file.
    - archNom (str): Path to the id substitution file.

    Returns:
    - None

    Side effects:
    - Writes "<archivoEntrada>_nombre.csv" with the name records whose 'Entry'
      appears among the input 'protein_id' values.
    - Writes "Proteinas_sin_nombre" with the input rows whose 'protein_id' has
      no matching 'Entry'.
    """
    protein_table = pd.read_excel(archivoEntrada)

    name_table = pd.read_csv(
        protein_name_file,
        usecols=['Entry', "Entry_Name", "Protein_names", "Length"],
    )
    # Helper imported from find_patterns; it receives the names table, the
    # substitution file path and the literal "na".
    name_table = substitute_or_remove_prot_id(name_table, archNom, "na")
    print("PASA")

    # Fix the column order of the exported name records.
    name_table = name_table.reindex(
        columns=['Entry', "Entry_Name", "Length", "Protein_names"]
    )

    # Name records that belong to a protein present in the input table.
    entry_is_used = name_table["Entry"].isin(protein_table["protein_id"])
    name_table.loc[entry_is_used].to_csv(archivoEntrada + "_nombre.csv")

    # Input proteins for which no name record exists.
    protein_has_name = protein_table["protein_id"].isin(name_table["Entry"])
    protein_table.loc[~protein_has_name].to_csv("Proteinas_sin_nombre")

    # NOTE: a previous version also attached the 'Length' and protein-name
    # columns to the input table and re-exported it; that step is disabled.
if __name__ == "__main__":
    # Export name records for the disease proteins, then merge the disease and
    # treatment tables into a single workbook.
    disease_path = "Data/data_lung_cancer_desease.xlsx"
    add_entry_name(disease_path, "Data/protein_name.csv", "Data/nombres_sust.txt")

    disease_table = pd.read_excel(disease_path)
    treatment_table = pd.read_excel("Data/data_lung_cancer_treatment.xlsx")
    merged_table = pd.concat([disease_table, treatment_table])
    merged_table.to_excel("Data/data_lung_cancer_desease_full.xlsx")

    # Other pipeline steps, currently disabled:
    #data=readData("Data/data_lung_cancer_desease.xlsx","","patronesIdenticos10Treat.csv","Lung01")
    #add_names_prot("ProtByPatternLung01.xlsx","patronesIdenticos10Treat.csv","protein_name.csv")