Commit b4953dc3 authored by Rafael Artinano's avatar Rafael Artinano

Refactored and documented the code

parent caafdbcc
# Build configuration for the nw_wrapper Python extension module
# (pybind11 binding of the Needleman-Wunsch aligner in nw_wrapper.cpp).
cmake_minimum_required(VERSION 3.12)
project(nw_wrapper)
# pybind11 needs at least C++11; the wrapper is written against C++14.
set(CMAKE_CXX_STANDARD 14)
find_package(pybind11 REQUIRED)
# MODULE: build a Python extension, not a regular shared library.
add_library(nw_wrapper MODULE nw_wrapper.cpp)
target_link_libraries(nw_wrapper PRIVATE pybind11::module)
# Use the platform's Python naming convention (e.g. nw_wrapper.cpython-*.so)
# so the interpreter can import the module directly.
set_target_properties(nw_wrapper PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}" SUFFIX "${PYTHON_MODULE_EXTENSION}")
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import pandas as pd
import time
import numpy as np
import re
globi=0
df_b=None
def substitute_or_remove_prot_id(data, archSubs, sub_rem, archivSal=None):
    """
    Substitute or remove protein identifiers based on a substitution file.

    Parameters:
    - data: DataFrame containing protein data.
    - archSubs: path to a whitespace-separated file; the header row names the
      columns (one of them containing "Primary") and each following row maps
      a secondary protein id to its primary entry.
    - sub_rem: operation type:
        's'  -> substitute secondary ids by their primary entry;
        'p'  -> keep only the listed ids, drop duplicates and dump the
                discarded rows to a CSV;
        'c'  -> substitution + duplicate removal keyed on class columns;
        'na' -> substitution + duplicate removal keyed on the 'Entry' column;
        anything else -> substitution + duplicate removal keyed on
                disease_id / protein_id / protein_sequence.
    - archivSal: optional output Excel file name (default None).

    Returns:
    - Modified DataFrame after performing the substitution or removal.
    """
    print("inside the problem")
    # Parse the substitution file: the header row gives the column names,
    # every other row contributes one value per column.
    # BUGFIX: the original opened 'archSubss' (undefined name) -> NameError.
    with open(archSubs) as prottosubs:
        index = prottosubs.readline()
        acept = index.split()
        listtosubs = {}
        for i in range(0, len(acept)):
            listtosubs[acept[i]] = []
        while line := prottosubs.readline():
            newline = line.split()
            for i in range(0, len(newline)):
                listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip())
    # resub indexes the column holding the replacement (primary) ids;
    # (resub+1)%2 indexes the column holding the ids to be replaced.
    resub = 1
    if re.search("Primary", list(listtosubs.keys())[0]):
        resub = 0
    print((resub + 1) % 2)
    global globi
    if(sub_rem == "s"):
        # BUGFIX: Series.replace returns a new Series; the original discarded
        # the result, so the substitution was silently a no-op.
        data["protein_id"] = data["protein_id"].replace(
            list(listtosubs.values())[(resub + 1) % 2],
            list(listtosubs.values())[resub])
    elif(sub_rem == "p"):
        # Split the rows into the ones listed in the primary column (kept)
        # and the rest (discarded).
        datas = data[data["protein_id"].isin(list(listtosubs.values())[resub]) == False]
        data = data[data["protein_id"].isin(list(listtosubs.values())[resub]) == True]
        data = data.drop_duplicates(keep="first", inplace=False)
        did = data.copy()
        data = data.drop_duplicates(subset=['disease_id', 'protein_sequence'], keep="first", inplace=False)
        # Rows removed by the second de-duplication join the discarded set.
        did = did[~did.isin(data).all(axis=1)]
        did = did.drop_duplicates()
        print(did)
        datas = pd.concat([datas, did], ignore_index=True)
        if(archivSal != None):
            data.to_excel(archivSal, index=False, columns=data.columns)
        datas.to_csv('resultados/proteinasDescartadassp_' + str(globi) + '.csv', index=False)
    elif(sub_rem == "c"):
        datas = data[data["protein_id"].isin(list(listtosubs.values())[(resub + 1) % 2]) == True]
        # BUGFIX: assign the replace() result back (was a no-op).
        data["protein_id"] = data["protein_id"].replace(
            list(listtosubs.values())[(resub + 1) % 2],
            list(listtosubs.values())[resub])
        print("tamaño original: " + str(len(data)))
        dats = data.drop_duplicates(subset=['protein_id', 'class_id'], keep='first', inplace=False)
        print("Despues de tirar duplicados en id: " + str(len(dats)))
        dats = dats.drop_duplicates(subset=['protein_sequence', 'class_id'], keep='first', inplace=False)
        print("Despues de tirar duplicados en secuencia: " + str(len(dats)))
        if(archivSal != None):
            dats.to_excel(archivSal, index=False, columns=data.columns)
        datas.to_csv('resultados/clasesDescartadasc_' + str(globi) + '.csv', index=False)
        globi = globi + 1
        data = dats
    elif(sub_rem == "na"):
        datas = data[data["Entry"].isin(list(listtosubs.values())[(resub + 1) % 2]) == True]
        # BUGFIX: assign the replace() result back (was a no-op).
        data["Entry"] = data["Entry"].replace(
            list(listtosubs.values())[(resub + 1) % 2],
            list(listtosubs.values())[resub])
        print("tamaño original: " + str(len(data)))
        dats = data.drop_duplicates(subset=['Entry'], keep='first', inplace=False)
        print("Despues de tirar duplicados en id: " + str(len(dats)))
        print("Despues de tirar duplicados en secuencia: " + str(len(dats)))
        if(archivSal != None):
            dats.to_excel(archivSal, index=False, columns=data.columns)
        datas.to_csv('resultados/proteinasDescartadasna_' + str(globi) + '.csv', index=False)
        globi = globi + 1
        data = dats
    else:
        datas = data[data["protein_id"].isin(list(listtosubs.values())[(resub + 1) % 2]) == True]
        # BUGFIX: assign the replace() result back (was a no-op).
        data["protein_id"] = data["protein_id"].replace(
            list(listtosubs.values())[(resub + 1) % 2],
            list(listtosubs.values())[resub])
        print("tamaño original: " + str(len(data)))
        dats = data.drop_duplicates(subset=['disease_id', 'protein_id'], keep='first', inplace=False)
        print("Despues de tirar duplicados en id: " + str(len(dats)))
        dats = dats.drop_duplicates(subset=['disease_id', 'protein_sequence'], keep='first', inplace=False)
        print("Despues de tirar duplicados en secuencia: " + str(len(dats)))
        if(archivSal != None):
            dats.to_excel(archivSal, index=False, columns=data.columns)
        datas.to_csv('resultados/proteinasDescartadas_' + str(globi) + '.csv', index=False)
        globi = globi + 1
        data = dats
    return data
def readData(archivoEntrada, archivoEnt2, enfermedad, archivoSal):
    """
    Read protein data from an Excel file, find every protein whose sequence
    contains one of the patterns listed in a CSV file, and save the matches
    (pattern -> list of matching protein ids) to a new Excel file.

    Parameters:
    - archivoEntrada: input Excel file path with the protein data.
    - archivoEnt2: input CSV file with a 'Patron' column of patterns to search.
    - enfermedad: optional disease id used to filter rows ('' disables it).
    - archivoSal: output Excel file path.
    """
    data = pd.read_excel(archivoEntrada)
    # BUGFIX: capture the row count BEFORE filtering; the original measured
    # it afterwards, so the "descartadas" statistics always printed 0.
    filt_data = len(data)
    if (enfermedad != ''):
        data = data.loc[data["disease_id"] == enfermedad]
    dataB = pd.read_csv(archivoEnt2)
    alz_filt_data = len(dataB)
    print(len(data))
    print("proteinas descartadas post filtro, principal: " + str(filt_data - len(data)))
    print("proteinas descartadas post filtro, comun alz: " + str(alz_filt_data - len(dataB)))
    dataC = {}
    daa = dataB["Patron"].unique()
    for u in daa:
        # Only patterns longer than 3 characters are searched for.
        if(len(u) > 3):
            # NOTE(review): str.contains treats the pattern as a regex;
            # assumes patterns are plain amino-acid strings — confirm.
            dataC[u] = data[data.protein_sequence.str.contains(u)]["protein_id"].to_list()
    dataG = pd.DataFrame(dataC.items(), columns=["pattern", "proteins"])
    dataG.to_excel(archivoSal)
def add_name_patterns(archivoEntrada, archivNom, EqvData, OutName):
    """
    Attach protein names (read from a CSV of UniProt-style entries) to the
    proteins present in an Excel file and save the results to CSV files.

    Parameters:
    - archivoEntrada: input Excel file with a 'protein_id' column.
    - archivNom: input CSV with 'Entry', 'Entry_Name', 'Protein_names'
      and 'Length' columns.
    - EqvData: substitution file forwarded to substitute_or_remove_prot_id.
    - OutName: output file name forwarded to substitute_or_remove_prot_id.
    """
    data = pd.read_excel(archivoEntrada)
    dataB = pd.read_csv(archivNom, usecols=['Entry', "Entry_Name", "Protein_names", "Length"])
    # Normalise the 'Entry' ids through the substitution table ('na' mode).
    dataB = substitute_or_remove_prot_id(dataB, EqvData, "na", OutName)
    dataB = dataB.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])
    # Entries whose id appears in the main data: these carry the names.
    datas = dataB[dataB["Entry"].isin(data["protein_id"])]
    datas.to_csv(archivoEntrada + "_nombre.csv")
    # BUGFIX: the ids live in the 'Entry' column; the original indexed
    # 'Entry_name', a column that does not exist (KeyError at runtime).
    doo = data[~(data["protein_id"].isin(dataB["Entry"]))]
    doo.to_csv("Proteinas_sin_nombre.csv")
if __name__=="__main__":
    # Script entry point: enrich the nervous-genes dataset with protein names.
    #readData("data_nervous_genes_xf.xlsx","resultados/patronesIdenticosTreat_005.csv","C0007131","ProtByPatternLung005.xlsx")
    add_name_patterns("data_nervous_genes_xf.xlsx","protein_name.csv","nombres_sust.txt","protein_name_clean.csv")
This diff is collapsed.
Hydrophobic I L V C A G M F Y W H K T
Small V C A G D N S T P
Aromatic F Y W H
Positive H K R
Negative E D
Tiny A G S
Aliphatic I L V
Aromatic F Y W H
Positive H K R
Negative E D
......@@ -53,11 +53,11 @@ def metrica_distanciaProteinas():
df.to_csv('resultados/Metrica_distanciaProteinasMismoPatron.csv',
index=False)
def patronesComun(patronesComun):
def patronesComun(patronesComun,archivoEntrada,ocurrencia,archivoClases):
# Leer el archivo CSV y cargar los datos en una lista de diccionarios
registros = []
cl=pd.read_excel("alzheimer_protein_class 2.xlsx")
cl=pd.read_excel(archivoClases)
#cl=substitute_or_remove_prot_id(cl,"r")
#data2=data.copy()
cli=cl.groupby('protein_id')
......@@ -69,7 +69,7 @@ def patronesComun(patronesComun):
do[k]=di
di=[]
class_dict=do
with open("resultados/patronesIdenticos.csv", 'r') as file:
with open("resultados/patronesIdenticos"+str(int((float(ocurrencia)%1)*100))+".csv", 'r') as file:
reader = csv.DictReader(file)
for row in reader:
registros.append(row)
......@@ -96,7 +96,7 @@ def patronesComun(patronesComun):
# Diccionario para almacenar las proteinas que tienen en común cada par de proteinas
proteinas_comunes = {}
rr=[]
df_p = pd.read_excel("data_nervous_genes_xf.xlsx")
df_p = pd.read_excel(archivoEntrada)
#df_p = pd.read_excel("proteinasClase_PC00060.xlsx")
#df_p=substitute_or_remove_prot_id(df_p,"r")
proteinas_dict2 = dict(df_p[['protein_id','protein_sequence']].values)
......@@ -182,11 +182,11 @@ def patronesComun(patronesComun):
df = pd.DataFrame(output, columns=['Patrones', 'Proteina1', 'Proteina2',"classesProt1","classesProt2"])
output=[]
if(first2):
df.to_csv('resultados/Metrica_patronesComunes.csv',
df.to_csv('resultados/Metrica_patronesComunes'+str(int((float(ocurrencia)%1)*100))+'.csv',
index=False)
first2=False
else:
df.to_csv('resultados/Metrica_patronesComunes.csv',index=False,header=False,mode='a')
df.to_csv('resultados/Metrica_patronesComunes'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False,header=False,mode='a')
#else:
#output.append([sorted_pattern_lengths, proteina1, proteina2,
......@@ -199,10 +199,10 @@ def patronesComun(patronesComun):
df2=pd.DataFrame(output2,columns=['proteina1','proteina2','%Coincidencia',"classesProt1","classesProt2"])
output2=[]
if(first):
df2.to_csv('resultados/Metrica_Coincidencia.csv',index=False)
df2.to_csv('resultados/Metrica_Coincidencia'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False)
first=False
else:
df2.to_csv('resultados/Metrica_Coincidencia.csv',index=False,header=False,mode='a')
df2.to_csv('resultados/Metrica_Coincidencia'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False,header=False,mode='a')
......@@ -213,11 +213,11 @@ def patronesComun(patronesComun):
def patronesComunClas(patronesComun,name):
def patronesComunClas(patronesComun,name,archivoEntrada,ocurrencia,archivoClases):
# Leer el archivo CSV y cargar los datos en una lista de diccionarios
registros = []
cl=pd.read_excel("alzheimer_protein_class 2.xlsx")
cl=pd.read_excel(archivoClases)
#cl=substitute_or_remove_prot_id(cl,"r")
#data2=data.copy()
cli=cl.groupby('protein_id')
......@@ -229,7 +229,7 @@ def patronesComunClas(patronesComun,name):
do[k]=di
di=[]
class_dict=do
with open("clases/"+name+"/patronesIdenticos.csv", 'r') as file:
with open("clases/"+name+"/patronesIdenticos"+str(int((float(ocurrencia)%1)*100))+".csv", 'r') as file:
reader = csv.DictReader(file)
for row in reader:
registros.append(row)
......@@ -256,7 +256,7 @@ def patronesComunClas(patronesComun,name):
# Diccionario para almacenar las proteinas que tienen en común cada par de proteinas
proteinas_comunes = {}
rr=[]
df_p = pd.read_excel("data_nervous_genes_xf.xlsx")
df_p = pd.read_excel(archivoEntrada)
#df_p = pd.read_excel("proteinasClase_PC00060.xlsx")
#df_p=substitute_or_remove_prot_id(df_p,"r")
proteinas_dict2 = dict(df_p[['protein_id','protein_sequence']].values)
......@@ -342,11 +342,11 @@ def patronesComunClas(patronesComun,name):
df = pd.DataFrame(output, columns=['Patrones', 'Proteina1', 'Proteina2',"classesProt1","classesProt2"])
output=[]
if(first2):
df.to_csv('clases/'+name+'/Metrica_patronesComunes.csv',
df.to_csv('clases/'+name+'/Metrica_patronesComunes'+str(int((float(ocurrencia)%1)*100))+'.csv',
index=False)
first2=False
else:
df.to_csv('clases/'+name+'/Metrica_patronesComunes.csv',index=False,header=False,mode='a')
df.to_csv('clases/'+name+'/Metrica_patronesComunes'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False,header=False,mode='a')
#else:
#output.append([sorted_pattern_lengths, proteina1, proteina2,
......@@ -359,10 +359,10 @@ def patronesComunClas(patronesComun,name):
df2=pd.DataFrame(output2,columns=['proteina1','proteina2','%Coincidencia',"classesProt1","classesProt2"])
output2=[]
if(first):
df2.to_csv('clases/'+name+'/Metrica_Coincidencia.csv',index=False)
df2.to_csv('clases/'+name+'/Metrica_Coincidencia'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False)
first=False
else:
df2.to_csv('clases/'+name+'/Metrica_Coincidencia.csv',index=False,header=False,mode='a')
df2.to_csv('clases/'+name+'/Metrica_Coincidencia'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False,header=False,mode='a')
......@@ -372,8 +372,8 @@ def patronesComunClas(patronesComun,name):
# index=False)
def remplazar_sequence_for_ID(output):
df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
def remplazar_sequence_for_ID(output,archivoEntrada):
df_b = pd.read_excel(archivoEntrada)
#df_b = pd.read_excel("proteinasClase_PC00060.xlsx")
#df_b=substitute_or_remove_prot_id(df_b,"r")
# Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico
......
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include <iostream>
#include <string>
#include <vector>
#include <map>
#define alphabets 26
#define SHIFTBITS 31
using namespace std;
#ifndef _LOGGER_HPP_
#define _LOGGER_HPP_
#include <iostream>
#include <sstream>
/* consider adding boost thread id since we'll want to know whose writting and
* won't want to repeat it for every single call */
/* consider adding policy class to allow users to redirect logging to specific
* files via the command line
*/
enum loglevel_e
{logERROR, logWARNING, logINFO, logDEBUG, logDEBUG1, logDEBUG2, logDEBUG3, logDEBUG4};
// Minimal stream-style logger: accumulates a message in an internal buffer
// and flushes it to stderr in a single write when the temporary object is
// destroyed (typically at the end of the `log(level) << ...;` statement).
class logIt
{
public:
    // Prefix the message with the numeric level, then indent DEBUG levels
    // by four spaces per level beyond logDEBUG (one space otherwise).
    logIt(loglevel_e _loglevel = logERROR) {
        _buffer << _loglevel << " :"
            << std::string(
                _loglevel > logDEBUG
                ? (_loglevel - logDEBUG) * 4
                : 1
                , ' ');
    }
    // Stream any value into the pending message.
    template <typename T>
    logIt & operator<<(T const & value)
    {
        _buffer << value;
        return *this;
    }
    // Flush the whole message at once on destruction.
    ~logIt()
    {
        _buffer << std::endl;
        // This is atomic according to the POSIX standard
        // http://www.gnu.org/s/libc/manual/html_node/Streams-and-Threads.html
        std::cerr << _buffer.str();
    }
private:
    std::ostringstream _buffer;
};
extern loglevel_e loglevel;
#define log(level) \
if (level > loglevel) ; \
else logIt(level)
#endif
// Needleman-Wunsch global sequence aligner.  The score is computed once in
// the constructor and retrieved with get_score().
class NW{
private:
    //const size_t alphabets = 26;
    // Alignment score computed at construction (may be negative).
    // BUGFIX: was size_t, which silently wrapped negative scores.
    int score = 0;

    /*
     * Fill the (mm+1) x (n+1) dynamic-programming table and return the
     * global-alignment score of aa vs bb.  alpha holds the per-letter-pair
     * scores and alpha_gap the gap penalty.  match/mismatch and the
     * a_aligned/b_aligned outputs are kept for interface compatibility but
     * are not used (no traceback is performed).
     */
    int align(const std::string &aa, const std::string &bb, int alpha_gap, int match, int mismatch,
              int alpha[alphabets][alphabets], std::string &a_aligned,
              std::string &b_aligned)
    {
        size_t n = aa.size();
        size_t mm = bb.size();
        cout << n << " tamaño de la prot1\n";
        cout << mm << "tamaño de la prot2\n";
        long int nRows = mm + 1;
        long int nCols = n + 1;
        int a, b, c; // temps for max calculation
        // BUGFIX: the original called reserve() and then indexed the vector,
        // which is undefined behaviour (reserve does not change the size).
        // Allocate the full table up front instead.
        vector<int> t(static_cast<size_t>(nRows) * nCols);
        t[0] = 0;
        // Populate the first row and column (all-gap prefixes).
        for (long int i = 1; i < nCols; ++i) { t[i] = t[i-1] + alpha_gap; }
        for (long int i = 1; i < nRows; ++i) { t[nCols*i] = t[nCols*(i-1)] + alpha_gap; }
        cout << "tamaño de t " << (nRows*nCols) << "\n";
        // Populate the remaining table in row-major order.
        for (long int i = 1; i < nRows; ++i) {
            for (long int j = 1; j < nCols; ++j) {
                // NOTE(review): assumes sequences contain only 'A'..'Z';
                // any other character indexes alpha out of bounds — confirm.
                a = t[((i-1) * nCols) + j-1] + alpha[aa[j-1]-'A'][bb[i-1]-'A'];
                b = t[((i-1) * nCols) + j] + alpha_gap;
                c = t[(i * nCols) + j-1] + alpha_gap;
                // Branch-free max of a, b, c (SHIFTBITS isolates the sign bit).
                a = a - (((a - b) >> SHIFTBITS) & (a - b));
                a = a - (((a - c) >> SHIFTBITS) & (a - c));
                t[(i * nCols) + j] = a;
            }
        }
        // The bottom-right cell holds the global alignment score.
        return t[(nRows*nCols)-1];
    }

    // Debug helper: print a 2D matrix, one value per line.
    void print2DVector(const vector<vector<int> > &A)
    {
        for (auto& i : A)
        {
            for (auto j : i)
                cout << j << "\n";
            cout << endl;
        }
    }

public:
    /*
     * Build the 26x26 scoring matrix from the amino-acid group map c
     * (two different letters score as a match when the map records a
     * non-zero weight between them) and align a1 against b1.
     */
    NW(std::string a1, std::string b1, std::map<char,std::map<char,float>> c, int match, int mismatch, int gap)
    {
        int gap_penalty = gap;
        int alpha[alphabets][alphabets];
        for (size_t i = 0; i < alphabets; ++i)
        {
            for (size_t j = 0; j < alphabets; ++j)
            {
                // Default to mismatch; identical letters are a match.
                alpha[i][j] = mismatch;
                if (i == j) alpha[i][j] = match;
                // Letters that belong to the same amino-acid group also match.
                else if (c.find((char)('A'+i)) != c.end() || c.find((char)('A'+j)) != c.end()) {
                    float count = 0;
                    if (c.find((char)('A'+i)) != c.end()) {
                        if (c[(char)('A'+i)].find((char)('A'+j)) != c[(char)('A'+i)].end())
                            count = float(c[(char)('A'+i)][(char)('A'+j)]);
                    }
                    else {
                        if (c[(char)('A'+j)].find((char)('A'+i)) != c[(char)('A'+j)].end())
                            count = float(c[(char)('A'+j)][(char)('A'+i)]);
                    }
                    if (count != 0) {
                        alpha[i][j] = match;
                        alpha[j][i] = match;
                    }
                }
            }
        }
        // Aligned sequences (interface placeholders; align() does not fill them).
        std::string a2, b2;
        cout << "aqui se llega" << "\n";
        int penalty = align(a1, b1, gap_penalty, match, mismatch, alpha, a2, b2);
        this->score = penalty;
    }
    // Return the alignment score computed at construction.
    int get_score(){
        return this->score;
    }
};
namespace py = pybind11;

// Expose the NW aligner to Python as module "nw_wrapper".
PYBIND11_MODULE(nw_wrapper, m) {
    py::class_<NW>(m, "NW")
        // NW(seq_a, seq_b, group_map, match, mismatch, gap)
        .def(py::init<std::string, std::string, std::map<char, std::map<char, float>>, int, int, int>())
        .def("get_score", &NW::get_score);
}
{
"NombreArchivoEntrada" : "Data/data_nervous_genes_xf.xlsx",
"CodigoEnfermedad":"C0007131",
"NombreArchivoTarget":"",
"NombreArchivoClases":"lung_cancer_protein_class.xlsx",
"NombreArchivoAA": "Data/aminoacidos_mod_2.txt",
"Similitud":"0.00001",
"OcurrenciaMin":"0.05",
"Metrica":"0",
"NWMatrix":[""],
"NWmatch":"3",
"NWmismatch":"0",
"NWgap":"-1"
}
This diff is collapsed.
......@@ -3,6 +3,15 @@ import Levenshtein
import math
def patrones_similares(pattern_freqMin):
"""
Identifies similar patterns in a dictionary of patterns based on Levenshtein distance.
Parameters:
- pattern_freqMin: dict, dictionary of patterns and their positions.
Returns:
- pattern_freqMin: dict, updated dictionary of patterns with similar patterns merged.
"""
similar_patterns = {} # Guarda los patrones similares relacionados con el patron similar del que parten
num_op = 3
similar_patterns = {} # Guarda los patrones similares relacionados con el patron similar del que parten
......@@ -78,4 +87,4 @@ if __name__ == "__main__":
fin = time.time()
tiempo_total = fin - inicio
print(tiempo_total, "segundos")
\ No newline at end of file
print(tiempo_total, "segundos")
......@@ -16,6 +16,15 @@ from collections import defaultdict
classes={}
min_ocurrence=0
def swap_dict(d):
"""
Swaps keys and values in a dictionary.
Parameters:
- d: dict, input dictionary.
Returns:
- new_dict: dict, dictionary with keys and values swapped.
"""
new_dict = {}
for key, values in d.items():
for value in values:
......@@ -23,47 +32,23 @@ def swap_dict(d):
new_dict[value] = []
new_dict[value].append(key)
return new_dict
def readData(archivoEntrada, enfermedad, archivoTarget):
data = pd.read_excel(archivoEntrada)
dataC = pd.read_csv("resultados/proteinasDescartadas2.csv")
#data=substitute_or_remove_prot_id(data,"r")
#dataC=substitute_or_remove_prot_id(dataC,"r")
#Descarte de proteinas
data = data[~data['protein_id'].isin(dataC['ProteinasDescartadas'])]
print("Se ha realizado el descarte de proteínas")
cla={}
global classes
with open('aminoacidos.txt','r') as op:
line=op.readline()
print(line)
oo=line.split()
key=oo.pop(0)
cla[key]=oo
classes=swap_dict(cla)
# "C0002395"
if(enfermedad != ''):
data = data.loc[data["disease_id"] == enfermedad]
# dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
# print("Se han seleccionado las proteínas de la enfermedad elegida")
# dataB=substitute_or_remove_prot_id(dataB,"r")
#if(archivoTarget != ''):
# dataB=substitute_or_remove_prot_id(dataB,"r")
#Eliminar las proteinas target
# data = data[~((data["disease_id"] == enfermedad) &
# (data["protein_id"].isin(dataB["protein_id"])))]
# print("Se han descartado las proteínas del archivo target")
sequences = data["protein_sequence"]
print(sequences)
num_filas = sequences.shape[0]
return sequences, num_filas
def read_aminoacidos(archivoAA):
"""
Reads amino acid information from a file and returns a dictionary with
swapped keys and values, and the original dictionary.
Parameters:
- archivoAA: str, path to the amino acid information file.
def read_aminoacidos():
Returns:
- classes: dict, dictionary with swapped keys and values.
- cla: dict, original dictionary with amino acid information.
"""
cla = {}
with open('aminoacidos.txt', 'r') as op:
with open(archivoAA, 'r') as op:
lines = op.readlines()
for line in lines:
oo = line.replace('\n', '').split('\t')
......@@ -71,10 +56,25 @@ def read_aminoacidos():
cla[key] = oo
return swap_dict(cla), cla
def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence):
def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence,archivoAA):
"""
Processes protein sequences to find patterns of length 1 and their positions,
filters patterns based on minimum occurrence, and saves results to a CSV file.
Parameters:
- sequences: pandas Series, protein sequences.
- pattern_freqMin: dict, dictionary to store patterns and their occurrences.
- min_ocurrence: int, minimum occurrence threshold.
- archivoAA: str, path to the amino acid information file.
Returns:
- pattern_freqMin: dict, updated dictionary of patterns.
- posicion_patterns: dict, positions of each character in the sequences.
- longitud_max: int, maximum length of protein sequences.
"""
all_patterns = defaultdict(list)
longitud_max = 0
classes, cla = read_aminoacidos()
classes, cla = read_aminoacidos(archivoAA)
for protein in sequences:
longitud = len(protein)
......@@ -111,11 +111,24 @@ def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence):
df.to_csv('prueba2.csv', index=False)
return pattern_freqMin, posicion_patterns, longitud_max
def buscar_patrones_simAA(sequences, min_ocurr):
def buscar_patrones_simAA(sequences, min_ocurr,archivoAA):
"""
Searches for similar patterns in protein sequences based on amino acid information,
filters patterns based on minimum occurrence, and returns results.
Parameters:
- sequences: pandas Series, protein sequences.
- min_ocurr: int, minimum occurrence threshold.
- archivoAA: str, path to the amino acid information file.
Returns:
- pattern_freqMin: dict, dictionary of patterns and their positions.
- num_patrones: int, number of unique patterns found.
"""
min_ocurrence = min_ocurr
pattern_freqMin = {}
pattern_freqMin, posicion_patterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence)
classes, cla = read_aminoacidos()
pattern_freqMin, posicion_patterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence,archivoAA)
classes, cla = read_aminoacidos(archivoAA)
if not bool(pattern_freqMin):
return pattern_freqMin, 0
......@@ -212,13 +225,26 @@ def buscar_patrones_simAA(sequences, min_ocurr):
num_patrones = df.shape[0]
#pattern_freqMin = {k: v for k, v in pattern_freqMin.items() if len(k) >= 4}
return pattern_freqMin, num_patrones
def buscar_patrones_identicos(sequences,min_ocurr):
def buscar_patrones_identicos(sequences,min_ocurr,archivoAA):
"""
Searches for identical patterns of different lengths in protein sequences
based on amino acid information, and returns results.
Parameters:
- sequences: pandas Series, protein sequences.
- min_ocurr: int, minimum occurrence threshold.
- archivoAA: str, path to the amino acid information file.
Returns:
- pattern_freqMin: dict, dictionary of patterns and their positions.
- num_patrones: int, number of unique patterns found.
"""
pattern_freqMin = {}
min_ocurrence=min_ocurr
pattern_freqMin, posicionPatterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin)
pattern_freqMin, posicionPatterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin,archivoAA)
cla={}
num_patrones=0
with open('aminoacidos.txt','r') as op:
with open(archivoAA,'r') as op:
lines=op.readlines()
print(lines)
for line in lines:
......@@ -325,13 +351,23 @@ def buscar_patrones_identicos(sequences,min_ocurr):
return pattern_freqMin, num_patrones
def remplazar_sequence_for_ID(pattern_freqMin):
df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
def remplazar_sequence_for_ID(pattern_freqMin,archivoEntrada,ArchivoAA,ocurrencia):
"""
Replaces identified patterns in the original data with their corresponding IDs,
saves the results to a CSV file, and prints a success message.
Parameters:
- pattern_freqMin: dict, dictionary of patterns and their positions.
- archivoEntrada: str, path to the input Excel file.
- ArchivoAA: str, path to the amino acid information file.
- ocurrencia: float, occurrence parameter (not currently in use).
"""
df_b = pd.read_excel(archivoEntrada)
#df_b=substitute_or_remove_prot_id(df_b,'r')
output = []
global classes
cla={}
with open('aminoacidos.txt','r') as op:
with open(ArchivoAA,'r') as op:
lines=op.readlines()
#print(lines)
for line in lines:
......@@ -378,5 +414,5 @@ def remplazar_sequence_for_ID(pattern_freqMin):
df_a = pd.DataFrame(output_ordered, columns=['Patron', 'Proteina', 'Posiciones'])
# Guardar el DataFrame actualizado en un archivo CSV
df_a.to_csv('resultados/patronesSimilaresAA.csv', index=False)
df_a.to_csv('resultados/patronesSimilaresAA'+str(int((float(ocurrencia)%1)*100))+'.csv', index=False)
print("Se ha generado el .csv con los patrones idénticos encontrados")
......@@ -2,14 +2,57 @@ import pandas as pd
import Levenshtein
from minineedle import needle, smith, core
from descarteProteinas import substitute_or_remove_prot_id
from ast import literal_eval
def readData(archivoEntrada):
"""
Read protein sequences from an Excel file.
Parameters:
- archivoEntrada: Input Excel file path
Returns:
- List of protein sequences
This function reads protein sequences from an Excel file specified by 'archivoEntrada' and extracts the
'protein_sequence' column from the DataFrame. The sequences are returned as a list.
Example:
>>> sequences = readData("protein_data.xlsx")
>>> print(sequences)
['MTCG...', 'MCTA...', ...]
"""
data = pd.read_excel(archivoEntrada)
data=substitute_or_remove_prot_id(data,'r')
#data=substitute_or_remove_prot_id(data,'r')
sequences = data["protein_sequence"]
return sequences
def similitudProteinas(sequences):
"""
Calculate pairwise similarity scores between protein sequences using Levenshtein distance.
Parameters:
- sequences: List of protein sequences
Returns:
- List of lists containing pairwise similarity information:
- [protein_sequence_1, protein_sequence_2, similarity_score]
This function takes a list of protein sequences and calculates pairwise similarity scores
between each pair of protein sequences using Levenshtein distance. The results are returned
in a list of lists.
Example:
>>> sequences = ["MACG", "MACC", "MGCA"]
>>> result = similitudProteinas(sequences)
>>> print(result)
[['MACG', 'MACC', 75.0],
['MACG', 'MGCA', 50.0],
['MACC', 'MACG', 75.0],
['MACC', 'MGCA', 66.67],
['MGCA', 'MACG', 50.0],
['MGCA', 'MACC', 66.67]]
"""
output = []
for row1 in sequences:
for row2 in sequences:
......@@ -20,15 +63,59 @@ def similitudProteinas(sequences):
output.append([row1, row2, similarity*100])
return output
def remplazar_sequence_for_ID(output):
df_b = pd.read_excel("data_nervous_genes_1.xlsx")
df_b=substitute_or_remove_prot_id(df_b,"r")
# Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico
output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))
def remplazar_sequence_for_ID(output,archivoEntrada,Sal,mode="default"):
"""
Replace protein sequences with protein IDs using a pre-existing DataFrame.
proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
Parameters:
- output: List of lists containing similarity information
- mode: Replacement mode (default or drug)
- archivoEntrada: Path to protein information file
- Sal: Extension for output file
This function takes a list of lists containing pairwise similarity information, and replaces
protein sequences with their corresponding protein IDs. The replacement is based on the information
provided in a pre-existing DataFrame. The updated information is saved to a CSV file.
for item in output_ordered:
Example:
>>> data = [['MACG', 'MGCA', 75.0], ['MACC', 'MGCA', 66.67]]
>>> inputFile = "protein_data.xlsx"
>>> outputExt = "protein"
>>> remplazar_sequence_for_ID(data,inputFile,OutputExt, mode="default")
"""
df_b = pd.read_excel(archivoEntrada)
#df_b=substitute_or_remove_prot_id(df_b,"r")
# Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico
#output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))
proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
if(mode=="drug"):
drug_dict=dict(df_b[['protein_sequence','drug_id']].values)
for item in output:
protein_sequence1 = item[0]
protein_sequence2 = item[1]
res=[]
[res.append(x) for x in literal_eval(drug_dict[item[0]]) if x not in res and ( x != '[' or x != ']') ]
if(len(res) == 1):
item.append(res[0])
elif(len(res)>1):
item.append(res)
else:
item.append("")
res=[]
[res.append(x) for x in literal_eval(drug_dict[item[1]]) if x not in res and ( x != '[' or x != ']')]
if(len(res) == 1):
item.append(res[0])
elif(len(res)>1):
item.append(res)
else:
item.append("")
if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
item[0] = proteinas_dict[protein_sequence1]
item[1] = proteinas_dict[protein_sequence2]
df_a=pd.DataFrame(output, columns=['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2','drug_id_p1','drug_id_p2'])
else:
for item in output:
protein_sequence1 = item[0]
protein_sequence2 = item[1]
if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
......@@ -37,15 +124,66 @@ def remplazar_sequence_for_ID(output):
df_a = pd.DataFrame(output_ordered, columns=['Proteina1', 'Proteina2', 'Similaridad'])
df_a = pd.DataFrame(output, columns=['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2'])
# Guardar el DataFrame actualizado en un archivo CSV
df_a.to_csv('AllProteins_%Similitud.csv', index=False)
df_a.to_csv('AllProteins_%Similitud'+Sal+'.csv', index=False)
def similitudMatProteinas(sequences, matrix, matrix2, matriz3):
    """
    Build pairwise similarity percentages between protein sequences from
    three precomputed similarity matrices.

    Parameters:
    - sequences: List of protein sequences.
    - matrix: First similarity matrix (values in [0, 1]).
    - matrix2: Second similarity matrix (values in [0, 1]).
    - matriz3: Third similarity matrix (values in [0, 1]).

    Returns:
    - List of rows, one per ordered pair (i, j) with i != j:
      [sequence_i, sequence_j, matrix[i][j]*100, matrix2[i][j]*100, matriz3[i][j]*100]

    Note: all three matrices are assumed to be square with dimensions
    matching len(sequences); each score is converted to a percentage.

    Example:
    >>> sequences = ["MACG", "MACC"]
    >>> m = [[1.0, 0.8], [0.8, 1.0]]
    >>> similitudMatProteinas(sequences, m, m, m)
    [['MACG', 'MACC', 80.0, 80.0, 80.0], ['MACC', 'MACG', 80.0, 80.0, 80.0]]
    """
    total = len(sequences)
    # Ordered pairs (i, j), i != j, in the same order a nested loop produces.
    return [
        [sequences[i], sequences[j],
         matrix[i][j] * 100, matrix2[i][j] * 100, matriz3[i][j] * 100]
        for i in range(total)
        for j in range(total)
        if i != j
    ]
if __name__ == "__main__":
    # Input dataset with the protein sequences and ids.
    # NOTE(review): readData is called here with a single argument, while the
    # readData defined later in this file takes four positional parameters —
    # confirm which version of readData this entry point targets.
    archivoEntrada = "Data/data_nervous_genes_xf.xlsx"
    sequences = readData(archivoEntrada)
    # Rescale each precomputed similarity matrix: x -> |3*x + 1| / 4.
    # BUG FIX: the previous code called `matrix.abs()` without assigning the
    # result (DataFrame.abs() is not in-place), so the absolute value was
    # silently discarded; the chained/assigned form below applies it.
    matrix = (pd.read_csv('matrizNWc.csv', header=None, index_col=False) * 3 + 1).abs() / 4
    matrix2 = (pd.read_csv('matrizNWmod1.csv', header=None, index_col=False) * 3 + 1.0).abs() / 4
    matrix3 = (pd.read_csv('matrizNWmod2.csv', header=None, index_col=False) * 3 + 1.0).abs() / 4
    # (removed a superseded similitudProteinas / one-argument
    # remplazar_sequence_for_ID pass whose `output` was immediately
    # overwritten below and whose call arity did not match the
    # three-argument call used here)
    output = similitudMatProteinas(sequences, matrix, matrix2, matrix3)
    print("Generada la tabla de con las matrices de similaridad especificadas")
    remplazar_sequence_for_ID(output, archivoEntrada, "Desease")
import pandas as pd
import time
import numpy as np
import re
from ast import literal_eval
def readData(archivoEntrada, enfermedad, patrones_file, Sal):
    """
    Read the protein dataset, optionally filter it by disease, and write a
    pattern -> proteins mapping file.

    Parameters:
    - archivoEntrada (str): Path to the Excel file with the protein data.
    - enfermedad (str): Disease id used to filter the proteins; a falsy value
      keeps the whole dataset.
    - patrones_file (str): Path to the CSV file with the patterns
      (column "Patron").
    - Sal (str): Suffix appended to the output file name.

    Returns:
    - data (pd.DataFrame): The (possibly filtered) protein DataFrame.

    Side effects:
    - Writes "ProtByPattern<Sal>.xlsx", mapping each pattern longer than
      3 characters to the [protein_id, match_position] pairs of the proteins
      whose sequence contains it.
    """
    data = pd.read_excel(archivoEntrada)
    # BUG FIX: the "discarded proteins" report used to snapshot len(data)
    # *after* the filter, so it always printed 0. Snapshot it beforehand.
    initial_count = len(data)
    if enfermedad:
        data = data.loc[data["disease_id"] == enfermedad]
    dataB = pd.read_csv(patrones_file)
    print(len(data))
    alz_filt_data = len(dataB)
    print("Proteins discarded after the main filter: " + str(initial_count - len(data)))
    # NOTE(review): dataB is never filtered in this function, so this line
    # always reports 0 — confirm whether a filtering step is missing here.
    print("Proteins discarded after the common Alzheimer's filter: " + str(alz_filt_data - len(dataB)))
    dataC = {}
    for pattern in dataB["Patron"].unique():
        if len(pattern) > 3:  # very short patterns are too unspecific to map
            # assumes patterns are plain amino-acid strings (no regex
            # metacharacters) — str.contains treats them as regexes; verify.
            mask = data.protein_sequence.str.contains(pattern)
            prot_ids = data[mask]["protein_id"].to_list()
            positions = data[mask]["protein_sequence"].str.find(pattern).to_list()
            print(len(positions))
            print(len(prot_ids))
            # Pair each matching protein id with the position of the match.
            dataC[pattern] = [[prot, pos] for prot, pos in zip(prot_ids, positions)]
    dataG = pd.DataFrame(dataC.items(), columns=["pattern", "proteins"])
    dataG.to_excel("ProtByPattern" + Sal + ".xlsx")
    return data
def add_protein_info_to_data(main_data_path, patterns_info_path, protein_names_path):
    """
    Add protein names and protein information from the original pattern file
    and the names dataset to a DataFrame based on matching patterns.

    Parameters:
    - main_data_path (str): The path to the Excel file containing the main data.
    - patterns_info_path (str): The path to the CSV file containing patterns
      and protein information.
    - protein_names_path (str): The path to the CSV file containing protein names.

    Returns:
    - None: The function writes the enriched data to "<main_data_path base>_aux.xlsx".

    Note:
    - The Excel file at 'main_data_path' must contain 'pattern' and 'proteins'
      columns.
    - The 'patterns_info_path' CSV is expected to have columns 'Patron',
      'Proteina' and 'Posiciones'.
    - The 'protein_names_path' CSV is expected to have columns 'Entry' and
      'Entry_Name'.
    """
    # Read data from files
    main_data = pd.read_excel(main_data_path)
    patterns_info = pd.read_csv(patterns_info_path)
    protein_names = pd.read_csv(protein_names_path)
    # (removed an unused `patterns_info.groupby("Patron")` whose result was
    # never referenced)
    # Initialize the columns that will be filled below.
    main_data["protein_names"] = ""
    main_data["proteins_treat"] = ""
    # One pass per (pattern, protein, positions) row of the patterns file.
    for index, row in patterns_info.iterrows():
        pattern = row["Patron"]
        protein_id = row["Proteina"]
        positions = row["Posiciones"]
        # Rows of the main file that carry this pattern.
        matching_rows = main_data[main_data["pattern"] == pattern]
        # Append [protein_id, positions] to the stringified list held in the
        # "proteins" column. NOTE(review): each iteration rebuilds from the
        # original "proteins" value, so a pattern that appears in several
        # patterns_info rows keeps only the last appended pair — confirm
        # this accumulation behavior is intended.
        main_data.loc[main_data["pattern"] == pattern, "proteins_treat"] = matching_rows["proteins"].apply(
            lambda x: literal_eval(x) + [[protein_id, positions]] if pd.notna(x) else [[protein_id, positions]]
        )
        # Resolve the Entry_Name for every protein id listed in "proteins".
        # NOTE(review): the comprehension variable `protein_id` shadows the
        # loop variable of the same name above — verify this is intentional.
        main_data.loc[main_data["pattern"] == pattern, "protein_names"] = matching_rows["proteins"].apply(
            lambda lst: [protein_names[protein_names["Entry"] == protein_id]["Entry_Name"].to_list() if protein_id else "N/A" for protein_id, _ in literal_eval(lst)]
        )
    # Save the updated data next to the input file with an "_aux" suffix.
    main_data_base_name = main_data_path.split(".")[0]
    main_data.to_excel(f"{main_data_base_name}_aux.xlsx", index=False)
def add_entry_name(archivoEntrada, protein_name_file):
    """
    Export name/length information for the proteins present in the dataset.

    Parameters:
    - archivoEntrada (str): Path to the Excel file with the protein data.
    - protein_name_file (str): Path to the CSV file with the protein names
      (columns 'Entry', 'Entry_Name', 'Protein_names', 'Length').

    Returns:
    - None

    Side effects:
    - Writes "<archivoEntrada>_nombre.csv" with the proteins that have a
      name entry, and "Proteinas_sin_nombre" with those that do not.
    """
    # BUG FIX: the body referenced an undefined name `archivo_entrada`
    # (NameError on every call); the parameter is spelled `archivoEntrada`.
    data = pd.read_excel(archivoEntrada)
    dataB = pd.read_csv(protein_name_file, usecols=['Entry', "Entry_Name", "Protein_names", "Length"])
    # NOTE(review): substitute_or_remove_prot_id is called here with two
    # arguments; other versions of that helper in this project take more —
    # confirm the signature matches.
    dataB = substitute_or_remove_prot_id(dataB, "na")
    dataB = dataB.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])
    # Proteins from the dataset that have a matching name entry.
    datas = dataB[dataB["Entry"].isin(data["protein_id"])]
    datas.to_csv(archivoEntrada + "_nombre.csv")
    # Proteins with no name entry.
    # NOTE(review): output file name has no extension — presumably ".csv"
    # was intended; kept as-is for output compatibility.
    doo = data[~(data["protein_id"].isin(dataB["Entry"]))]
    doo.to_csv("Proteinas_sin_nombre")
if __name__ == "__main__":
    # Build the pattern -> proteins mapping for the full dataset (empty
    # disease id disables the filter), then enrich the resulting file with
    # protein names.
    data = readData("Data/data_nervous_genes_xf.xlsx", "", "patronesIdenticos10Treat.csv", "Lung01")
    # BUG FIX: the previous code called `add_names_prot`, which is not
    # defined anywhere in this file; the function matching this
    # (main_data, patterns_info, protein_names) call signature is
    # `add_protein_info_to_data`.
    add_protein_info_to_data("ProtByPatternLung01.xlsx", "patronesIdenticos10Treat.csv", "protein_name.csv")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment