# similitudAllProteins.py
# Last committed by: Rafael Artinano
import pandas as pd
import Levenshtein
from minineedle import needle, smith, core
from descarteProteinas import substitute_or_remove_prot_id 
def readData(archivoEntrada):
    """Load the protein sequences stored in an Excel workbook.

    The workbook at ``archivoEntrada`` is read with pandas, the resulting
    frame is passed through ``substitute_or_remove_prot_id`` in mode ``'r'``,
    and the ``protein_sequence`` column of that result is returned.
    """
    raw_frame = pd.read_excel(archivoEntrada)
    cleaned_frame = substitute_or_remove_prot_id(raw_frame, 'r')
    return cleaned_frame["protein_sequence"]

def similitudProteinas(sequences):
    """Score every ordered pair of unequal sequences.

    For each ``(a, b)`` taken from ``sequences`` x ``sequences`` with
    ``a != b``, a row ``[a, b, score]`` is produced, where ``score`` is the
    Levenshtein distance divided by the length of the longer sequence,
    multiplied by 100. Both ``(a, b)`` and ``(b, a)`` appear in the result.

    NOTE(review): the score is a normalized edit *distance*, so 0 means the
    sequences are identical and larger values mean more different, even
    though downstream naming calls it a similarity -- confirm this is the
    intended direction.
    """
    # Alternative alignment-based scores that were tried before (minineedle):
    #   abs(smith.SmithWaterman(a, b).get_score()-1) / max(len(a), len(b))
    #   abs(needle.NeedlemanWunsch(a, b).get_score()-1) / (2*max(len(a), len(b)))

    def _percent_distance(left, right):
        # The distance is computed before the lengths, as in the original.
        edits = abs(Levenshtein.distance(left, right))
        return edits / max(len(left), len(right)) * 100

    return [
        [left, right, _percent_distance(left, right)]
        for left in sequences
        for right in sequences
        if left != right
    ]

def remplazar_sequence_for_ID(output, archivoEntrada="data_nervous_genes_1.xlsx",
                              archivoSalida='AllProteins_%Similitud.csv'):
    """Replace protein sequences by their IDs and write the scores to CSV.

    Args:
        output: list of ``[sequence1, sequence2, score]`` lists (the shape
            built by ``similitudProteinas``). The inner lists are updated
            in place: when both sequences of a row are known, they are
            overwritten with the matching protein IDs.
        archivoEntrada: Excel workbook supplying the ``protein_sequence`` and
            ``protein_id`` columns used for the lookup. Defaults to the file
            name that used to be hard-coded here.
        archivoSalida: path of the CSV file to write. Defaults to the file
            name that used to be hard-coded here.

    Returns:
        None. The result is written to ``archivoSalida`` with the columns
        ``Proteina1``, ``Proteina2`` and ``Similaridad``.
    """
    df_b = pd.read_excel(archivoEntrada)
    df_b = substitute_or_remove_prot_id(df_b, "r")

    # Sort by the first sequence: longest first, equal lengths alphabetically.
    # sorted() returns a new outer list, but the row lists are shared with
    # ``output``.
    output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))

    # sequence -> protein_id lookup; with repeated sequences the last row wins.
    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)

    for item in output_ordered:
        protein_sequence1, protein_sequence2 = item[0], item[1]
        # A row is only rewritten when BOTH sequences have an ID; otherwise
        # it keeps its raw sequences.
        if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
            item[0] = proteinas_dict[protein_sequence1]
            item[1] = proteinas_dict[protein_sequence2]

    df_a = pd.DataFrame(output_ordered, columns=['Proteina1', 'Proteina2', 'Similaridad'])

    # Save the updated DataFrame as a CSV file (without the index column).
    df_a.to_csv(archivoSalida, index=False)

if __name__ == "__main__":
    # Script entry point: load the sequences, score every pair, then export
    # the scores with sequences replaced by protein IDs.
    archivoEntrada = "data_nervous_genes_1.xlsx"
    sequences = readData(archivoEntrada)
    output = similitudProteinas(sequences)
    remplazar_sequence_for_ID(output)