Commit b5e1a75c authored by Belen Otero Carrasco's avatar Belen Otero Carrasco

Adding final results finding patterns

parents
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import pandas as pd
import time
import numpy as np
import re
from ast import literal_eval
from find_patterns import substitute_or_remove_prot_id
def readData(archivoEntrada, enfermedad, patrones_file, Sal):
    """
    Read protein data from an Excel file, optionally filter it by disease,
    and group the proteins that contain each pattern.

    Parameters:
    - archivoEntrada (str): Path to the input Excel file (expects columns
      'protein_id', 'protein_sequence', 'disease_id').
    - enfermedad (str): Disease ID for filtering; an empty string keeps all rows.
    - patrones_file (str): Path to the CSV file with patterns (column 'Patron').
    - Sal (str): Suffix appended to the output file name.

    Returns:
    - data (pd.DataFrame): The (possibly disease-filtered) input DataFrame.

    Side effects:
    - Writes "ProtByPattern<Sal>.xlsx" with one row per pattern, listing the
      [protein_id, first-match-position] pairs and the disease ids matched.
    """
    data = pd.read_excel(archivoEntrada)
    # Capture the size BEFORE filtering so the "discarded" count below is
    # meaningful (previously it was measured after filtering and always
    # printed 0).
    filt_data = len(data)
    if enfermedad:
        data = data.loc[data["disease_id"] == enfermedad]
    dataB = pd.read_csv(patrones_file)
    alz_filt_data = len(dataB)
    print(len(data))
    print("Proteins discarded after the main filter: " + str(filt_data - len(data)))
    print("Proteins discarded after the common Alzheimer's filter: " + str(alz_filt_data - len(dataB)))
    dataC = {}
    dataz = {}
    for u in dataB["Patron"].unique():
        # Patterns of length <= 3 are skipped entirely (too unspecific).
        if len(u) <= 3:
            continue
        # NOTE(review): str.contains treats the pattern as a regex; confirm
        # patterns never contain regex metacharacters.
        mask = data.protein_sequence.str.contains(u)
        prot_ids = data[mask]["protein_id"].to_list()
        positions = data[mask]["protein_sequence"].str.find(u).to_list()
        diseases = data[mask]["disease_id"].to_list()
        print(len(positions))
        print(len(prot_ids))
        # One entry per match: [[protein_id, first_position], disease_id].
        rows = [[[pid, pos], dis] for pid, pos, dis in zip(prot_ids, positions, diseases)]
        # Merge entries that share the same [protein_id, position] pair,
        # accumulating their disease ids into one list.
        res = []
        for row in rows:
            matching_sublist = next((s for s in res if s[0] == row[0]), None)
            if matching_sublist is not None:
                matching_sublist[1].append(row[1])
            else:
                res.append([row[0], row[1:]])
        dataC[u] = [s[0] for s in res]
        dataz[u] = [s[1] for s in res]
    # Column name "desease_id" kept as-is for compatibility with downstream
    # readers of the generated file.
    dataG = pd.DataFrame({"pattern": dataC.keys(), "proteins": dataC.values(), "desease_id": dataz.values()})
    dataG.to_excel("ProtByPattern" + Sal + ".xlsx")
    return data
def add_protein_info_to_data(main_data_path, patterns_info_path, protein_names_path):
    """
    Add protein names and protein information from the original pattern file and the names Dataset to a DataFrame based on matching patterns.
    Parameters:
    - main_data_path (str): The path to the Excel file containing the main data.
    - patterns_info_path (str): The path to the CSV file containing patterns and protein information.
    - protein_names_path (str): The path to the CSV file containing protein names.
    Returns:
    None: The function updates the provided Excel file with additional protein information.
    Example:
    ```python
    add_protein_info_to_data("main_data.xlsx", "patterns_info.csv", "protein_names.csv")
    ```
    Note:
    - The function assumes that the provided Excel file ('main_data_path') contains a 'pattern' column.
    - The 'patterns_info_path' CSV file is expected to have columns 'Patron', 'Proteina', and 'Posiciones'.
    - The 'protein_names_path' CSV file is expected to have columns 'Entry' and 'Entry_Name'.
    - Output is written to '<main_data_base_name>_summary.xlsx'; the input file itself is not modified.
    """
    # Read data from files
    main_data = pd.read_excel(main_data_path)
    patterns_info = pd.read_csv(patterns_info_path)
    protein_names = pd.read_csv(protein_names_path)
    # Group patterns in 'patterns_info' DataFrame
    patterns_grouped = patterns_info.groupby("Patron")
    # Initialize columns in 'main_data' DataFrame.
    # 'proteins_treat' holds a dict serialized as a string; it is parsed with
    # literal_eval and re-serialized on every update below.
    main_data["protein_names"] = ""
    main_data["proteins_treat"] = "{}"
    main_data["names_Treat"] = ""
    for pattern, group_data in patterns_grouped:
        # Iterate over patterns in 'patterns_info'
        for index, row in group_data.iterrows():
            protein_id = row["Proteina"]
            positions = row["Posiciones"]
            # Find matching rows in 'main_data' DataFrame.
            # NOTE(review): 'matching_rows' is a copy; it is deliberately
            # updated in parallel with 'main_data' so the .apply() calls
            # below see the accumulated 'proteins_treat' values.
            matching_rows = main_data[main_data["pattern"] == pattern]
            # Initialize or get the current 'proteins_treat' list
            current_proteins_treat = {}
            # Update 'proteins_treat' field for each matching row
            for matching_index, matching_row in matching_rows.iterrows():
                # NOTE(review): with 'or' this condition is effectively always
                # true whenever the cell is non-null OR differs from "[]";
                # possibly 'and' was intended — confirm before changing.
                current_proteins_treat = literal_eval(matching_row["proteins_treat"]) if pd.notna(matching_row["proteins_treat"]) or matching_row["proteins_treat"] != "[]" else {}
                # 'positions' is a string representation of a list; parse it
                # before storing under this protein id.
                current_proteins_treat.update({protein_id: literal_eval(positions)})
                main_data.at[matching_index, "proteins_treat"] = str(current_proteins_treat)
                matching_rows.at[matching_index, "proteins_treat"] = str(current_proteins_treat)
            # Debug trace of the protein ids accumulated so far per row.
            print(matching_rows["proteins_treat"].apply(
                lambda lst: [protein_idee for protein_idee, _ in literal_eval(lst).items()]))
            # Map each accumulated protein id to its Entry_Name(s); rows whose
            # id is absent from 'protein_names' get ["N/A"].
            main_data.loc[main_data["pattern"] == pattern, "names_Treat"] = matching_rows["proteins_treat"].apply(
                lambda lst: [protein_names[protein_names["Entry"] == protein_idee]["Entry_Name"].to_list() if protein_names[protein_names["Entry"] == protein_idee]["Entry_Name"].to_list() != [] else ["N/A"] for protein_idee, _ in literal_eval(lst).items()]
            )
            # Same lookup for the original 'proteins' column — assumes each
            # cell parses to an iterable of [protein_id, position] pairs
            # (TODO confirm against the file written by readData).
            main_data.loc[main_data["pattern"] == pattern, "protein_names"] = matching_rows["proteins"].apply(
                lambda lst: [protein_names[protein_names["Entry"] == protein_idee]["Entry_Name"].to_list() if protein_names[protein_names["Entry"] == protein_idee]["Entry_Name"].to_list() != [] else ["N/A"] for protein_idee, _ in literal_eval(lst)]
            )
    # Save the updated data
    main_data_base_name = main_data_path.split(".")[0]
    main_data.to_excel(f"{main_data_base_name}_summary.xlsx", index=False)
def add_entry_name(archivoEntrada, protein_name_file, archNom):
    """
    Attach entry names from a protein-name CSV to the proteins in an Excel
    file, writing matched and unmatched proteins to separate CSV files.

    Parameters:
    - archivoEntrada (str): Path to the Excel file with a 'protein_id' column.
    - protein_name_file (str): Path to the protein name CSV file.
    - archNom (str): Path to the id substitution file.

    Returns:
    - None. Writes "<archivoEntrada>_nombre.csv" (matched names) and
      "Proteinas_sin_nombre" (proteins without a name entry).
    """
    proteins = pd.read_excel(archivoEntrada)
    names = pd.read_csv(protein_name_file, usecols=['Entry', "Entry_Name", "Protein_names", "Length"])
    # Normalize/substitute protein ids before matching.
    names = substitute_or_remove_prot_id(names, archNom, "na")
    print("PASA")
    names = names.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])
    # Name rows whose Entry appears among the input proteins.
    matched = names[names["Entry"].isin(proteins["protein_id"])]
    matched.to_csv(archivoEntrada + "_nombre.csv")
    # Input proteins with no corresponding name entry.
    unmatched = proteins[~(proteins["protein_id"].isin(names["Entry"]))]
    unmatched.to_csv("Proteinas_sin_nombre")
# Script entry point: build the pattern/protein grouping for the immune
# disease dataset, then enrich the generated file with protein names.
if __name__=="__main__":
    #data=add_entry_name("Data/data_cancers_desease.xlsx","Data/protein_name.csv","Data/nombres_sust.txt")
    #data=pd.read_excel("Data/data_lung_cancer_desease.xlsx")
    #dd=pd.read_excel("Data/data_lung_cancer_treatment.xlsx")
    #dds=pd.concat([data,dd])
    #dds.to_excel("Data/data_lung_cancer_desease_full.xlsx")
    # Empty disease id -> no disease filtering; writes ProtByPatternImmun01.xlsx.
    data=readData("Data/data_immune_desease.xlsx","","patronesIdenticos10Treat.csv","Immun01")
    # Consumes the file written by readData above.
    add_protein_info_to_data("ProtByPatternImmun01.xlsx","patronesIdenticos10Treat.csv","Data/protein_name.csv")
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment