similitudAllProteins.py 7.79 KB
Newer Older
Rafael Artinano's avatar
Rafael Artinano committed
1 2 3 4
import pandas as pd
import Levenshtein
from minineedle import needle, smith, core
from descarteProteinas import substitute_or_remove_prot_id 
5
from ast import literal_eval
Rafael Artinano's avatar
Rafael Artinano committed
6
def readData(archivoEntrada):
    """
    Load the protein sequences stored in an Excel workbook.

    Parameters:
    - archivoEntrada: Path of the Excel file to read. The sheet must contain
      a 'protein_sequence' column.

    Returns:
    - The 'protein_sequence' column exactly as pandas selects it from the
      loaded DataFrame (a pandas Series rather than a plain Python list).

    Example:
    >>> sequences = readData("protein_data.xlsx")
    >>> list(sequences)
    ['MTCG...', 'MCTA...', ...]
    """
    # The optional substitute_or_remove_prot_id(data, 'r') preprocessing step
    # is currently disabled, so the column is returned untouched.
    return pd.read_excel(archivoEntrada)["protein_sequence"]

def similitudProteinas(sequences):
    """
    Score every ordered pair of distinct protein sequences with Levenshtein distance.

    Parameters:
    - sequences: Iterable of protein sequences (strings)

    Returns:
    - List of lists, one per ordered pair whose two sequences differ:
        - [protein_sequence_1, protein_sequence_2, score]
      where score = 100 * edit_distance / max(len(seq1), len(seq2)).

    Pairs are skipped when the two sequences compare equal, so repeated
    sequences in the input are never scored against each other.

    NOTE(review): the score is a normalised edit *distance* (larger means
    more different), even though the function name and the downstream
    column names talk about similarity. Confirm with the consumers of this
    value whether 100 - score was intended.

    Example:
    >>> sequences = ["MACG", "MACC", "MGCA"]
    >>> result = similitudProteinas(sequences)
    >>> print(result)
    [['MACG', 'MACC', 25.0],
     ['MACG', 'MGCA', 50.0],
     ['MACC', 'MACG', 25.0],
     ['MACC', 'MGCA', 50.0],
     ['MGCA', 'MACG', 50.0],
     ['MGCA', 'MACC', 50.0]]
    """
    return [
        [first, second,
         abs(Levenshtein.distance(first, second)) / max(len(first), len(second)) * 100]
        for first in sequences
        for second in sequences
        if first != second
    ]

66 67 68
def _unique_drug_ids(raw_value):
    """
    Parse a serialized drug-id collection and collapse it for export.

    Parameters:
    - raw_value: String holding a Python literal (e.g. "['D1', 'D2', 'D1']"),
      parsed with ast.literal_eval.

    Returns:
    - The single element when exactly one distinct entry remains,
    - the list of distinct entries (first-seen order) when several remain,
    - "" when nothing remains.
    """
    distinct = []
    for entry in literal_eval(raw_value):
        # Stray bracket characters are not drug ids. The previous test
        # (x != '[' or x != ']') was always true and never filtered them.
        if entry in ('[', ']'):
            continue
        # A list (not a set) keeps first-seen order and tolerates
        # unhashable entries.
        if entry not in distinct:
            distinct.append(entry)
    if len(distinct) == 1:
        return distinct[0]
    return distinct if distinct else ""


def remplazar_sequence_for_ID(output, archivoEntrada, Sal, mode="default"):
    """
    Replace protein sequences with protein IDs and export the table as CSV.

    Parameters:
    - output: List of rows shaped
      [sequence_1, sequence_2, score_1, score_2, score_3]. The rows are
      modified in place.
    - archivoEntrada: Path to the Excel file providing the
      'protein_sequence' and 'protein_id' columns (plus 'drug_id' when
      mode is "drug").
    - Sal: Suffix inserted in the output file name.
    - mode: "drug" appends two extra columns with the drug ids associated
      with each sequence; any other value exports the five base columns.

    Returns:
    - None. The result is written to 'AllProteins_%Similitud<Sal>.csv'.

    Raises:
    - KeyError: in "drug" mode, when a sequence of `output` is absent from
      the Excel file.

    The sequences of a row are replaced by their IDs only when BOTH are
    found in the Excel file; otherwise the row keeps its raw sequences.

    Example:
    >>> data = [['MACG', 'MGCA', 75.0, 70.0, 60.0]]
    >>> inputFile = "protein_data.xlsx"
    >>> outputExt = "protein"
    >>> remplazar_sequence_for_ID(data, inputFile, outputExt, mode="default")
    """
    df_b = pd.read_excel(archivoEntrada)
    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    columns = ['Proteina1', 'Proteina2', 'Similaridad', 'SimilaridadAA', 'similaridadAA_2']

    drug_dict = None
    if mode == "drug":
        drug_dict = dict(df_b[['protein_sequence', 'drug_id']].values)
        columns = columns + ['drug_id_p1', 'drug_id_p2']

    for item in output:
        protein_sequence1, protein_sequence2 = item[0], item[1]

        if drug_dict is not None:
            # Drug ids are looked up by sequence, so this must run before
            # the sequences are overwritten with protein IDs below.
            item.append(_unique_drug_ids(drug_dict[protein_sequence1]))
            item.append(_unique_drug_ids(drug_dict[protein_sequence2]))

        if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
            item[0] = proteinas_dict[protein_sequence1]
            item[1] = proteinas_dict[protein_sequence2]

    df_a = pd.DataFrame(output, columns=columns)

    # Save the updated table to a CSV file.
    df_a.to_csv('AllProteins_%Similitud' + Sal + '.csv', index=False)
def similitudMatProteinas(sequences, matrix,matrix2,matriz3):
    """
    Build percentage scores for every ordered pair of sequences from three matrices.

    Parameters:
    - sequences: Indexable collection of protein sequences
    - matrix: First similarity matrix
    - matrix2: Second similarity matrix
    - matriz3: Third similarity matrix

    Returns:
    - List of lists, one per ordered pair of different positions (i, j):
        - [sequences[i], sequences[j], matrix[i][j]*100, matrix2[i][j]*100, matriz3[i][j]*100]

    Pairs are selected by position, so equal sequences stored at different
    positions are still paired. Each matrix must be indexable as m[i][j] for
    every position of 'sequences'.

    Note: for nested lists m[i][j] is row i, column j. For a pandas
    DataFrame m[i][j] selects column i and then row j, so both readings
    agree only when the matrix is symmetric.

    Example:
    >>> sequences = ["MACG", "MACC"]
    >>> matrix1 = [[1, 0.5], [0.5, 1]]
    >>> matrix2 = [[1, 0.25], [0.25, 1]]
    >>> matrix3 = [[1, 0.75], [0.75, 1]]
    >>> similitudMatProteinas(sequences, matrix1, matrix2, matrix3)
    [['MACG', 'MACC', 50.0, 25.0, 75.0], ['MACC', 'MACG', 50.0, 25.0, 75.0]]
    """
    total = len(sequences)
    return [
        [
            sequences[i],
            sequences[j],
            matrix[i][j] * 100,
            matrix2[i][j] * 100,
            matriz3[i][j] * 100,
        ]
        for i in range(total)
        for j in range(total)
        if i != j
    ]
Rafael Artinano's avatar
Rafael Artinano committed
173
if __name__ == "__main__":
    # Script entry point: load the sequences and three precomputed score
    # matrices, then export the pairwise table with protein IDs.
    archivoEntrada = "Data/data_nervous_genes_xf.xlsx"
    sequences = readData(archivoEntrada)

    def _load_scaled_matrix(path):
        """Read a header-less CSV matrix and rescale it as |3*x + 1| / 4."""
        raw = pd.read_csv(path, header=None, index_col=False)
        # DataFrame.abs() returns a new object. The previous code called it
        # without keeping the result, so the absolute value was never
        # actually applied before dividing.
        return (raw * 3 + 1).abs() / 4

    matrix = _load_scaled_matrix('matrizNWc.csv')
    matrix2 = _load_scaled_matrix('matrizNWmod1.csv')
    matrix3 = _load_scaled_matrix('matrizNWmod2.csv')

    # The Levenshtein-based alternative, similitudProteinas(sequences), is
    # not used here; the scores come from the matrices loaded above.
    output = similitudMatProteinas(sequences, matrix, matrix2, matrix3)
    print("Generada la tabla de con las matrices de similaridad especificadas") 
    remplazar_sequence_for_ID(output, archivoEntrada, "Desease")