# find_patterns.py

import pandas as pd
import time
import numpy as np
import re
# Running counter appended to the names of the CSV files that
# substitute_or_remove_prot_id writes under 'resultados/'; its 'c', 'na' and
# fallback modes increment it after each write.
globi=0
# Not referenced anywhere in the visible part of this module.
# NOTE(review): looks like a leftover placeholder — confirm before removing.
df_b=None
def _load_substitution_table(path):
    """Parse the whitespace-separated substitution file at *path*.

    The first line holds the column headers; each following line holds one
    value per column.  Returns ``(headers, columns)`` where ``columns[i]`` is
    the list of values read under ``headers[i]``.  Blank lines are ignored.
    """
    with open(path) as handle:
        headers = handle.readline().split()
        columns = [[] for _ in headers]
        for line in handle:
            # zip() ignores surplus tokens instead of indexing past the header.
            for column, token in zip(columns, line.split()):
                column.append(token)
    return headers, columns


def _write_table(frame, path, columns):
    """Write *frame* to *path*: CSV when the name ends in ``.csv``, else Excel."""
    # DataFrame.to_excel rejects a ".csv" target (no Excel engine for that
    # extension), so CSV destinations are routed through to_csv instead.
    if str(path).lower().endswith(".csv"):
        frame.to_csv(path, index=False, columns=columns)
    else:
        frame.to_excel(path, index=False, columns=columns)


def _replace_and_dedupe(data, id_column, secondary_ids, primary_ids,
                        id_subset, sequence_subset):
    """Replace secondary ids by primary ones in *id_column* and drop duplicates.

    Returns ``(deduped, replaced_rows)`` where ``replaced_rows`` are the rows of
    the untouched input whose id appears in *secondary_ids*.
    """
    replaced_rows = data[data[id_column].isin(secondary_ids)]
    # Work on a copy so the caller's frame is not modified, and assign the
    # result back: Series.replace returns a new Series.
    data = data.copy()
    data[id_column] = data[id_column].replace(secondary_ids, primary_ids)
    print("tamaño original: "+str(len(data)))
    deduped = data.drop_duplicates(subset=id_subset, keep='first')
    print("Despues de tirar duplicados en id: "+str(len(deduped)))
    if sequence_subset is not None:
        deduped = deduped.drop_duplicates(subset=sequence_subset, keep='first')
    print("Despues de tirar duplicados en secuencia: "+str(len(deduped)))
    return deduped, replaced_rows


def substitute_or_remove_prot_id(data,archSubs,sub_rem,archivSal=None):
    """
    Substitute or remove protein identifiers based on a substitution file.

    The substitution file is whitespace separated with a header line.  If the
    first header contains "Primary", column 0 holds the primary ids and
    column 1 the secondary ids; otherwise the roles are swapped.

    Parameters:
    - data: DataFrame containing protein data (it is not modified)
    - archSubs: Path of the substitution file described above
    - sub_rem: Operation type
        's'  : replace secondary ids by primary ids in "protein_id"
        'p'  : keep only rows whose "protein_id" is a primary id, drop duplicates
               (full row, then disease_id + protein_sequence) and write the
               removed rows to 'resultados/proteinasDescartadassp_<n>.csv'
        'c'  : replace in "protein_id", then drop duplicates per class_id
        'na' : replace in "Entry", then drop duplicated entries
        anything else: replace in "protein_id", then drop duplicates per disease_id
    - archivSal: (Optional) output path for the resulting table; written as CSV
                 when the name ends in ".csv", as Excel otherwise.  Not used
                 by the 's' operation.  Defaults to None (nothing written).

    Returns:
    - Modified DataFrame after performing substitution or removal operation

    Raises:
    - ValueError: if the substitution file has fewer than two columns

    Side effects:
    - Every operation except 's' writes a CSV under 'resultados/' whose name
      ends in the module-level counter ``globi``; 'c', 'na' and the fallback
      operation increment that counter afterwards.
    """
    global globi
    print("inside the problem")
    headers, columns = _load_substitution_table(archSubs)
    if len(columns) < 2:
        raise ValueError(
            "substitution file %r must have at least two columns" % (archSubs,)
        )
    primary_idx = 0 if re.search("Primary", headers[0]) else 1
    secondary_idx = 1 - primary_idx
    print(secondary_idx)
    primary_ids = columns[primary_idx]
    secondary_ids = columns[secondary_idx]

    if sub_rem == "s":
        data = data.copy()
        data["protein_id"] = data["protein_id"].replace(secondary_ids, primary_ids)
        return data

    if sub_rem == "p":
        is_primary = data["protein_id"].isin(primary_ids)
        discarded = data[~is_primary]
        unique_rows = data[is_primary].drop_duplicates(keep="first")
        kept = unique_rows.drop_duplicates(
            subset=['disease_id', 'protein_sequence'], keep="first"
        )
        # Rows removed by the sequence-level de-duplication.  Selecting them by
        # index label (instead of comparing cell values) keeps rows that
        # contain NaN from being reported as dropped.
        dropped = unique_rows[~unique_rows.index.isin(kept.index)]
        print(dropped)
        discarded = pd.concat([discarded, dropped], ignore_index=True)
        if archivSal is not None:
            _write_table(kept, archivSal, kept.columns)
        # NOTE(review): unlike the other operations, 'p' does not advance
        # globi, so consecutive 'p' calls reuse the same file name — confirm
        # whether that is intended.
        discarded.to_csv('resultados/proteinasDescartadassp_'+ str(globi) +'.csv', index=False)
        return kept

    # Remaining operations share one shape and differ only in the id column,
    # the de-duplication keys and the name of the CSV with the replaced rows.
    modes = {
        "c": ("protein_id", ['protein_id', 'class_id'],
              ['protein_sequence', 'class_id'],
              'resultados/clasesDescartadasc_'),
        "na": ("Entry", ['Entry'], None,
               'resultados/proteinasDescartadasna_'),
    }
    fallback = ("protein_id", ['disease_id', 'protein_id'],
                ['disease_id', 'protein_sequence'],
                'resultados/proteinasDescartadas_')
    id_column, id_subset, sequence_subset, csv_prefix = modes.get(sub_rem, fallback)

    deduped, replaced_rows = _replace_and_dedupe(
        data, id_column, secondary_ids, primary_ids, id_subset, sequence_subset
    )
    if archivSal is not None:
        _write_table(deduped, archivSal, data.columns)
    replaced_rows.to_csv(csv_prefix + str(globi) + '.csv', index=False)
    globi = globi + 1
    return deduped

def readData(archivoEntrada,archivoEnt2 ,enfermedad,archivoSal):
    """
    Find, for every pattern of a CSV file, the proteins of an Excel file whose
    sequence contains it, and save the pattern -> proteins table to Excel.

    Parameters:
    - archivoEntrada: Input Excel file path; must provide the columns
      "protein_id" and "protein_sequence" (and "disease_id" when filtering)
    - archivoEnt2: Input CSV file with a "Patron" column holding the patterns
      to be searched in the sequences of archivoEntrada
    - enfermedad: Optional disease ID; '' or None disables the filter,
      any other value keeps only the rows with that "disease_id"
    - archivoSal: Output Excel file path; receives the columns "pattern" and
      "proteins" (list of matching protein ids)

    Returns:
    - None; the result is only written to archivoSal
    """
    data = pd.read_excel(archivoEntrada)

    if enfermedad is not None and enfermedad != '':
        data = data.loc[data["disease_id"] == enfermedad]

    dataB = pd.read_csv(archivoEnt2)

    print(len(data))
    # No filtering step runs between these measurements and the prints below,
    # so both differences are currently always 0.
    filt_data = len(data)
    alz_filt_data = len(dataB)
    print("proteinas descartadas post filtro, principal: " + str(filt_data-len(data)))
    print("proteinas descartadas post filtro, comun alz: " + str(alz_filt_data-len(dataB)))

    matches = {}
    for pattern in dataB["Patron"].unique():
        # Skip missing/non-text patterns (e.g. NaN from empty CSV cells) and
        # patterns of three characters or fewer.
        if not isinstance(pattern, str) or len(pattern) <= 3:
            continue
        # The pattern is interpreted as a regular expression (pandas default).
        # na=False makes rows without a sequence count as "no match" instead
        # of producing a NaN mask that cannot be used for row selection.
        hits = data["protein_sequence"].str.contains(pattern, na=False)
        matches[pattern] = data.loc[hits, "protein_id"].to_list()

    result = pd.DataFrame(list(matches.items()), columns=["pattern","proteins"])
    result.to_excel(archivoSal)
def add_name_patterns(archivoEntrada,archivNom,EqvData,OutName):
    """
    Look up the names of the proteins listed in an Excel file.

    The name table is first cleaned with substitute_or_remove_prot_id in 'na'
    mode. The function then writes two CSV files: the name rows whose "Entry"
    matches a "protein_id" of the Excel file, and the Excel rows for which no
    matching "Entry" exists.

    Parameters:
    - archivoEntrada: Input Excel file; must provide a "protein_id" column
    - archivNom: Input CSV with the columns "Entry", "Entry_Name",
      "Protein_names" and "Length"
    - EqvData: Substitution file forwarded to substitute_or_remove_prot_id
      as its archSubs argument
    - OutName: Output path forwarded to substitute_or_remove_prot_id as its
      archivSal argument

    Output files:
    - archivoEntrada + "_nombre.csv": name rows of the proteins that were found
    - "Proteinas_sin_nombre.csv": rows of archivoEntrada without a name entry
    """
    data = pd.read_excel(archivoEntrada)
    names = pd.read_csv(archivNom, usecols=['Entry',"Entry_Name","Protein_names","Length"])
    names = substitute_or_remove_prot_id(names, EqvData, "na", OutName)
    names = names.reindex(columns=['Entry',"Entry_Name","Length","Protein_names"])

    named = names[names["Entry"].isin(data["protein_id"])]
    named.to_csv(archivoEntrada+"_nombre.csv")

    # protein_id is matched against "Entry" above, so the complement must use
    # the same column to identify the proteins that have no name row.
    unnamed = data[~data["protein_id"].isin(names["Entry"])]
    unnamed.to_csv("Proteinas_sin_nombre.csv")
     
     
if __name__ == "__main__":
    # Alternative run kept for reference: pattern search restricted to one disease.
    # readData("data_nervous_genes_xf.xlsx","resultados/patronesIdenticosTreat_005.csv","C0007131","ProtByPatternLung005.xlsx")
    add_name_patterns(
        archivoEntrada="data_nervous_genes_xf.xlsx",
        archivNom="protein_name.csv",
        EqvData="nombres_sust.txt",
        OutName="protein_name_clean.csv",
    )