# summary.py
import pandas as pd
import time
import numpy as np
import re
from ast import literal_eval
def readData(archivoEntrada, enfermedad, patrones_file, Sal):
    """
    Load the protein table, optionally filter it by disease, and index it by pattern.

    Every distinct value of the ``Patron`` column in ``patrones_file`` that is
    longer than three characters is searched, as a literal substring, in each
    protein sequence. The result is written to ``ProtByPattern<Sal>.xlsx`` in
    the current working directory: one row per pattern, whose ``proteins``
    cell holds ``[protein_id, position_of_first_match]`` pairs.

    Parameters:
    - archivoEntrada (str): Path to the Excel file. It must provide the columns
      ``protein_id`` and ``protein_sequence`` (plus ``disease_id`` when
      ``enfermedad`` is given).
    - enfermedad (str): Disease ID to keep; a falsy value disables the filter.
    - patrones_file (str): Path to the CSV file containing a ``Patron`` column.
    - Sal (str): Suffix inserted into the output file name.

    Returns:
    - data (pd.DataFrame): The input table after the optional disease filter.
    """
    data = pd.read_excel(archivoEntrada)
    # Count rows before filtering so the report below shows the real loss.
    rows_before_filter = len(data)

    if enfermedad:
        data = data.loc[data["disease_id"] == enfermedad]

    dataB = pd.read_csv(patrones_file)
    patterns_loaded = len(dataB)

    print(len(data))
    print("Proteins discarded after the main filter: " + str(rows_before_filter - len(data)))
    # No filter is applied to the pattern table in this function, so this is always 0.
    print("Proteins discarded after the common Alzheimer's filter: " + str(patterns_loaded - len(dataB)))

    matches_by_pattern = {}
    for pattern in dataB["Patron"].unique():
        # Skip missing values and patterns of three characters or fewer.
        if not isinstance(pattern, str) or len(pattern) <= 3:
            continue
        # regex=False keeps the membership test consistent with the literal
        # str.find below; na=False drops rows that have no sequence.
        hits = data[data["protein_sequence"].str.contains(pattern, regex=False, na=False)]
        protein_ids = hits["protein_id"].to_list()
        positions = hits["protein_sequence"].str.find(pattern).to_list()
        print(len(positions))
        print(len(protein_ids))
        matches_by_pattern[pattern] = [[pid, pos] for pid, pos in zip(protein_ids, positions)]

    dataG = pd.DataFrame(list(matches_by_pattern.items()), columns=["pattern", "proteins"])
    dataG.to_excel("ProtByPattern" + Sal + ".xlsx")

    return data
def add_protein_info_to_data(main_data_path, patterns_info_path, protein_names_path):
    """
    Enrich the per-pattern table with entry names and the proteins listed in the pattern file.

    Two columns are added to the table read from ``main_data_path``:

    - ``protein_names``: for each ``[protein_id, position]`` pair in the row's
      ``proteins`` cell, the list of ``Entry_Name`` values whose ``Entry``
      equals that protein id (``"N/A"`` when the id is falsy).
    - ``proteins_treat``: the row's own ``proteins`` pairs followed by every
      ``[Proteina, Posiciones]`` pair that the pattern file lists for the same
      pattern, in file order.

    Rows whose pattern does not appear in the pattern file keep an empty
    string in both columns.

    Parameters:
    - main_data_path (str): The path to the Excel file containing the main data.
    - patterns_info_path (str): The path to the CSV file containing patterns and protein information.
    - protein_names_path (str): The path to the CSV file containing protein names.

    Returns:
    None: The input file is left untouched; the enriched table is written next
    to it as ``<main_data_path without extension>_aux.xlsx``.

    Example:
    ```python
    add_protein_info_to_data("main_data.xlsx", "patterns_info.csv", "protein_names.csv")
    ```

    Note:
    - The Excel file ('main_data_path') must contain the columns 'pattern' and
      'proteins'; 'proteins' holds the string form of a list of pairs.
    - The 'patterns_info_path' CSV file is expected to have columns 'Patron', 'Proteina', and 'Posiciones'.
    - The 'protein_names_path' CSV file is expected to have columns 'Entry' and 'Entry_Name'.
    """
    import os  # local import: the module header does not import os

    # Read data from files
    main_data = pd.read_excel(main_data_path)
    patterns_info = pd.read_csv(patterns_info_path)
    protein_names = pd.read_csv(protein_names_path)

    # Gather every [protein, positions] pair per pattern so that a pattern
    # listed on several rows keeps all of its proteins, not only the last one.
    extra_by_pattern = {}
    for pattern, protein_id, positions in zip(
        patterns_info["Patron"], patterns_info["Proteina"], patterns_info["Posiciones"]
    ):
        extra_by_pattern.setdefault(pattern, []).append([protein_id, positions])

    # Entry -> list of Entry_Name values, built once instead of scanning the
    # whole names table for every protein id.
    names_by_entry = {}
    for entry, entry_name in zip(protein_names["Entry"], protein_names["Entry_Name"]):
        names_by_entry.setdefault(entry, []).append(entry_name)

    names_column = []
    treat_column = []
    for pattern, raw_proteins in zip(main_data["pattern"], main_data["proteins"]):
        extra = extra_by_pattern.get(pattern)
        if extra is None:
            # Pattern absent from the pattern file: leave both cells empty.
            names_column.append("")
            treat_column.append("")
            continue
        # The 'proteins' cell is the string form of a list of pairs; a missing
        # cell is treated as an empty list.
        own = literal_eval(raw_proteins) if pd.notna(raw_proteins) else []
        names_column.append(
            [list(names_by_entry.get(pid, [])) if pid else "N/A" for pid, _ in own]
        )
        treat_column.append(own + extra)

    # dtype=object keeps each list as a single cell value.
    main_data["protein_names"] = pd.Series(names_column, index=main_data.index, dtype=object)
    main_data["proteins_treat"] = pd.Series(treat_column, index=main_data.index, dtype=object)

    # Save the updated data; splitext strips only the final extension, so dots
    # elsewhere in the path are preserved.
    main_data_base_name, _ = os.path.splitext(main_data_path)
    main_data.to_excel(f"{main_data_base_name}_aux.xlsx", index=False)
        
def add_entry_name(archivoEntrada, protein_name_file):
    """
    Split the proteins of an Excel table into those with and without a known entry name.

    Rows of the names table whose ``Entry`` appears in the input's
    ``protein_id`` column are written to ``<archivoEntrada>_nombre.csv``.
    Input rows whose ``protein_id`` has no ``Entry`` in the names table are
    written to ``Proteinas_sin_nombre`` in the current working directory.

    Parameters:
    - archivoEntrada (str): Path to the Excel file; it must contain a
      ``protein_id`` column.
    - protein_name_file (str): Path to the protein name CSV file with the
      columns ``Entry``, ``Entry_Name``, ``Protein_names`` and ``Length``.

    Returns:
    - None
    """
    # The parameter is archivoEntrada; reading 'archivo_entrada' raised NameError.
    data = pd.read_excel(archivoEntrada)
    dataB = pd.read_csv(protein_name_file, usecols=['Entry', "Entry_Name", "Protein_names", "Length"])
    # NOTE(review): substitute_or_remove_prot_id is neither defined nor imported
    # in the visible part of this file; confirm where it is expected to come from.
    dataB = substitute_or_remove_prot_id(dataB, "na")
    dataB = dataB.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])

    # Names-table rows for proteins that occur in the input.
    datas = dataB[dataB["Entry"].isin(data["protein_id"])]
    datas.to_csv(archivoEntrada + "_nombre.csv")

    # Input rows whose protein has no entry in the names table.
    doo = data[~(data["protein_id"].isin(dataB["Entry"]))]
    doo.to_csv("Proteinas_sin_nombre")
    # Disabled alternative kept from the original:
    #data.assign(lenght=datas["Length"].to_list())
    #data.assign(name=datas["Protein names"].to_list())
    #data.to_csv(archivoEntrada+"_nombre.csv")
if __name__ == "__main__":
    # Earlier runs, kept for reference:
    #data=add_entry_name("data_nervous_genes_xf.xlsx","protein_name.csv")
    #data=pd.read_excel("Data/data_nervous_genes_xf.xlsx")
    #dd=pd.read_excel("Data/data_nervous_genes_xfal.xlsx")
    #dds=pd.concat([data,dd])
    #dds.to_excel("Data/data_nervous_genes_xfull.xlsx")

    # Step 1: build ProtByPatternLung01.xlsx from the full table (no disease filter).
    data = readData("Data/data_nervous_genes_xf.xlsx", "", "patronesIdenticos10Treat.csv", "Lung01")
    # Step 2: enrich that file. The original called the undefined name
    # add_names_prot; add_protein_info_to_data is the function in this module
    # that takes these three paths.
    add_protein_info_to_data("ProtByPatternLung01.xlsx", "patronesIdenticos10Treat.csv", "protein_name.csv")