# summary.py
import os
import re
import time
from ast import literal_eval

import numpy as np
import pandas as pd

from find_patterns import substitute_or_remove_prot_id
def _group_pattern_hits(data, pattern):
    """
    Locate one literal pattern in the protein sequences and group the hits.

    Parameters:
    - data (pd.DataFrame): Frame with 'protein_id', 'protein_sequence' and 'disease_id' columns.
    - pattern (str): Substring searched for (matched literally, not as a regex).

    Returns:
    - hit_count (int): Number of rows whose sequence contains the pattern.
    - proteins (list): Unique [protein_id, position] pairs in first-seen order.
    - diseases (list): For each entry of `proteins`, the list of disease ids of the rows that produced it.
    """
    # regex=False keeps the search consistent with str.find below (which is always
    # literal); na=False turns missing sequences into "no match" instead of a NaN
    # mask that cannot be used for boolean indexing.
    mask = data["protein_sequence"].str.contains(pattern, regex=False, na=False)
    hits = data.loc[mask]
    positions = hits["protein_sequence"].str.find(pattern).tolist()

    # dict keyed by (protein_id, position): keeps insertion order and avoids a
    # linear search through the already-seen pairs for every row.
    grouped = {}
    for protein_id, position, disease in zip(
        hits["protein_id"].tolist(), positions, hits["disease_id"].tolist()
    ):
        grouped.setdefault((protein_id, position), []).append(disease)

    proteins = [[protein_id, position] for protein_id, position in grouped]
    diseases = list(grouped.values())
    return len(hits), proteins, diseases


def readData(archivoEntrada, enfermedad, patrones_file, Sal):
    """
    Read the protein sheet, optionally filter it by disease, and export the proteins found per pattern.

    For every unique value of the 'Patron' column of `patrones_file` that is a
    string longer than 3 characters, the proteins whose sequence contains it
    are collected and written to "ProtByPattern" + Sal + ".xlsx" with the
    columns 'pattern', 'proteins' and 'desease_id'.

    Parameters:
    - archivoEntrada (str): Path to the Excel file with 'protein_id', 'protein_sequence' and 'disease_id' columns.
    - enfermedad (str): Disease ID for filtering; a falsy value disables the filter.
    - patrones_file (str): Path to the CSV file containing a 'Patron' column.
    - Sal (str): Suffix inserted into the output file name.

    Returns:
    - data (pd.DataFrame): The (optionally disease-filtered) protein DataFrame.
    """
    data = pd.read_excel(archivoEntrada)
    # Row counts must be taken BEFORE filtering, otherwise the report is always 0.
    total_proteins = len(data)

    if enfermedad:
        data = data.loc[data["disease_id"] == enfermedad]

    dataB = pd.read_csv(patrones_file)
    total_pattern_rows = len(dataB)

    print(len(data))
    print("Proteins discarded after the main filter: " + str(total_proteins - len(data)))
    # No filter is applied to the pattern file here, so this reports 0.
    print("Proteins discarded after the common Alzheimer's filter: " + str(total_pattern_rows - len(dataB)))

    proteins_by_pattern = {}
    diseases_by_pattern = {}
    for pattern in dataB["Patron"].unique():
        # Skip missing values (NaN is a float and has no len) and short patterns.
        if not isinstance(pattern, str) or len(pattern) <= 3:
            continue
        hit_count, proteins, diseases = _group_pattern_hits(data, pattern)
        print(hit_count)
        proteins_by_pattern[pattern] = proteins
        diseases_by_pattern[pattern] = diseases

    dataG = pd.DataFrame({
        "pattern": list(proteins_by_pattern),
        "proteins": list(proteins_by_pattern.values()),
        "desease_id": list(diseases_by_pattern.values()),
    })
    dataG.to_excel("ProtByPattern" + Sal + ".xlsx")

    return data
def _annotate_patterns(main_data, patterns_info, protein_names):
    """
    Return a copy of `main_data` with 'protein_names', 'proteins_treat' and 'names_Treat' columns added.

    Parameters:
    - main_data (pd.DataFrame): Frame with a 'pattern' column and, for rows whose pattern
      occurs in `patterns_info`, a 'proteins' column holding the text of a list of
      (protein_id, position) pairs.
    - patterns_info (pd.DataFrame): Frame with 'Patron', 'Proteina' and 'Posiciones' columns;
      'Posiciones' holds the text of a Python literal.
    - protein_names (pd.DataFrame): Frame with 'Entry' and 'Entry_Name' columns.

    Returns:
    - pd.DataFrame: Copy of `main_data`. Rows whose pattern is absent from `patterns_info`
      keep the defaults "" / "{}" / "".
    """
    # Entry -> list of entry names, built once instead of scanning the whole
    # names table for every protein.
    names_by_entry = {}
    for entry, entry_name in zip(
        protein_names["Entry"].tolist(), protein_names["Entry_Name"].tolist()
    ):
        names_by_entry.setdefault(entry, []).append(entry_name)

    def lookup(protein_id):
        return list(names_by_entry.get(protein_id, ["N/A"]))

    # pattern -> {protein_id: positions}; only patterns present in main_data are
    # parsed, so unrelated rows of the pattern file are never evaluated.
    wanted = set(main_data["pattern"].tolist())
    treat_by_pattern = {}
    for pattern, protein_id, positions in zip(
        patterns_info["Patron"].tolist(),
        patterns_info["Proteina"].tolist(),
        patterns_info["Posiciones"].tolist(),
    ):
        if pd.notna(pattern) and pattern in wanted:
            treat_by_pattern.setdefault(pattern, {})[protein_id] = literal_eval(positions)

    annotated = main_data.copy()
    protein_names_col = []
    proteins_treat_col = []
    names_treat_col = []
    for row_pos, pattern in enumerate(annotated["pattern"].tolist()):
        treat = treat_by_pattern.get(pattern)
        if treat is None:
            protein_names_col.append("")
            proteins_treat_col.append("{}")
            names_treat_col.append("")
            continue
        proteins_cell = annotated["proteins"].iloc[row_pos]
        protein_names_col.append([lookup(protein_id) for protein_id, _ in literal_eval(proteins_cell)])
        proteins_treat_col.append(str(treat))
        names_treat_col.append([lookup(protein_id) for protein_id in treat])

    # dtype=object so that cells can hold either a string or a list.
    annotated["protein_names"] = pd.Series(protein_names_col, index=annotated.index, dtype=object)
    annotated["proteins_treat"] = pd.Series(proteins_treat_col, index=annotated.index, dtype=object)
    annotated["names_Treat"] = pd.Series(names_treat_col, index=annotated.index, dtype=object)
    return annotated


def add_protein_info_to_data(main_data_path, patterns_info_path, protein_names_path):
    """
    Add protein names and per-pattern protein information to the main data and save the result.

    Parameters:
    - main_data_path (str): The path to the Excel file containing the main data.
    - patterns_info_path (str): The path to the CSV file containing patterns and protein information.
    - protein_names_path (str): The path to the CSV file containing protein names.

    Returns:
    None: The result is written to "<main_data_path without extension>_summary.xlsx";
    the input file itself is not modified.

    Example:
    ```python
    add_protein_info_to_data("main_data.xlsx", "patterns_info.csv", "protein_names.csv")
    ```

    Note:
    - The function assumes that the provided Excel file ('main_data_path') contains a 'pattern' column
      and a 'proteins' column.
    - The 'patterns_info_path' CSV file is expected to have columns 'Patron', 'Proteina', and 'Posiciones'.
    - The 'protein_names_path' CSV file is expected to have columns 'Entry' and 'Entry_Name'.
    """
    # Read data from files
    main_data = pd.read_excel(main_data_path)
    patterns_info = pd.read_csv(patterns_info_path)
    protein_names = pd.read_csv(protein_names_path)

    annotated = _annotate_patterns(main_data, patterns_info, protein_names)

    # splitext strips only the final extension, so dots elsewhere in the path
    # (e.g. "./data/v1.2/file.xlsx") no longer truncate the output name.
    main_data_base_name, _ = os.path.splitext(main_data_path)
    annotated.to_excel(f"{main_data_base_name}_summary.xlsx", index=False)


        
def add_entry_name(archivoEntrada, protein_name_file, archNom):
    """
    Export the name records of the proteins listed in an Excel file, and the proteins that have none.

    Two CSV files are written:
    - archivoEntrada + "_nombre.csv": rows of the name table whose 'Entry' appears
      in the 'protein_id' column of the Excel file.
    - "Proteinas_sin_nombre": rows of the Excel file whose 'protein_id' has no
      'Entry' in the name table.

    Parameters:
    - archivoEntrada (str): Path to the Excel file; must contain a 'protein_id' column.
    - protein_name_file (str): Path to the protein name CSV file with the columns
      'Entry', 'Entry_Name', 'Protein_names' and 'Length'.
    - archNom (str): Path to the id substitution file, forwarded to
      substitute_or_remove_prot_id (its exact semantics are defined in find_patterns).

    Returns:
    - None
    """
    proteins = pd.read_excel(archivoEntrada)
    names = pd.read_csv(
        protein_name_file,
        usecols=['Entry', "Entry_Name", "Protein_names", "Length"],
    )
    names = substitute_or_remove_prot_id(names, archNom, "na")
    print("PASA")

    # Fix the column order of the exported name table.
    names = names.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])

    # Name records that belong to a protein present in the Excel file.
    entry_is_listed = names["Entry"].isin(proteins["protein_id"])
    names[entry_is_listed].to_csv(archivoEntrada + "_nombre.csv")

    # Proteins of the Excel file for which no name record exists.
    protein_is_named = proteins["protein_id"].isin(names["Entry"])
    proteins[~protein_is_named].to_csv("Proteinas_sin_nombre")
if __name__ == "__main__":
    # Earlier invocations, kept for reference:
    # data = add_entry_name("Data/data_cancers_desease.xlsx", "Data/protein_name.csv", "Data/nombres_sust.txt")
    # data = pd.read_excel("Data/data_lung_cancer_desease.xlsx")
    # dd = pd.read_excel("Data/data_lung_cancer_treatment.xlsx")
    # dds = pd.concat([data, dd])
    # dds.to_excel("Data/data_lung_cancer_desease_full.xlsx")

    # Step 1: build the per-pattern protein table (no disease filter).
    data = readData(
        "Data/data_immune_desease.xlsx",
        "",
        "patronesIdenticos10Treat.csv",
        "Immun01",
    )
    # Step 2: enrich the table written by step 1 with names and treatment proteins.
    add_protein_info_to_data(
        "ProtByPatternImmun01.xlsx",
        "patronesIdenticos10Treat.csv",
        "Data/protein_name.csv",
    )