similarityAllProteins.py 10.3 KB
Newer Older
1 2
import pandas as pd
import Levenshtein
Rafael Artinano's avatar
Rafael Artinano committed
3
from minineedle import needle, smith, core 
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
from ast import literal_eval
import blosum as bl
def readData(archivoEntrada):
    """
    Read protein sequences from an Excel file.

    Parameters:
    - archivoEntrada: Input Excel file path
    
    Returns:
    - List of protein sequences

    This function reads protein sequences from an Excel file specified by 'archivoEntrada' and extracts the
    'protein_sequence' column from the DataFrame. The sequences are returned as a list.
    
    Example:
    >>> sequences = readData("protein_data.xlsx")
    >>> print(sequences)
    ['MTCG...', 'MCTA...', ...]
    """
    data = pd.read_excel(archivoEntrada)
    #data=substitute_or_remove_prot_id(data,'r')
    sequences = data["protein_sequence"]

    return sequences

def similitudProteinas(sequences):
    """
    Calculate pairwise similarity scores between protein sequences using Levenshtein distance.

    Parameters:
    - sequences: List of protein sequences
    
    Returns:
    - List of lists containing pairwise similarity information:
        - [protein_sequence_1, protein_sequence_2, similarity_score]

    This function takes a list of protein sequences and calculates pairwise similarity scores
    between each pair of protein sequences using Levenshtein distance. The results are returned
    in a list of lists.

    Example:
    >>> sequences = ["MACG", "MACC", "MGCA"]
    >>> result = similitudProteinas(sequences)
    >>> print(result)
    [['MACG', 'MACC', 75.0],
     ['MACG', 'MGCA', 50.0],
     ['MACC', 'MACG', 75.0],
     ['MACC', 'MGCA', 66.67],
     ['MGCA', 'MACG', 50.0],
     ['MGCA', 'MACC', 66.67]]
    """
    output = []
    for row1 in sequences:
        for row2 in sequences:
            if row1 != row2:
                #similarity = abs(smith.SmithWaterman(row1, row2).get_score()-1) / max(len(row1), len(row2))
                #similarity = abs(needle.NeedlemanWunsch(row1, row2).get_score()-1) / (2*max(len(row1), len(row2)))
                similarity = abs(Levenshtein.distance(row1, row2)) / max(len(row1), len(row2))
                output.append([row1, row2, similarity*100])
    return output

def remplazar_sequence_for_ID(output,archivoEntrada,archivoEntrada2,Sal,mode="default"):
    """
    Replace protein sequences with protein IDs using a pre-existing DataFrame.

    Parameters:
    - output: List of lists containing similarity information
    - mode: Replacement mode (default or drug)
    - archivoEntrada: Path to protein information file
    - Sal: Extension for output file

    This function takes a list of lists containing pairwise similarity information, and replaces
    protein sequences with their corresponding protein IDs. The replacement is based on the information
    provided in a pre-existing DataFrame. The updated information is saved to a CSV file.

    Example:
    >>> data = [['MACG', 'MGCA', 75.0], ['MACC', 'MGCA', 66.67]]
    >>> inputFile = "protein_data.xlsx"
    >>> outputExt = "protein"
    >>> remplazar_sequence_for_ID(data,inputFile,OutputExt, mode="default")
    """
    df_b = pd.read_excel(archivoEntrada)
    df_c= pd.read_excel(archivoEntrada2)
    common_cols = list(set.intersection(*(set(df_b.columns),set(df_c.columns) )))
    df_b=pd.concat([df_b[common_cols],df_c[common_cols]], ignore_index=True)
    #df_b=substitute_or_remove_prot_id(df_b,"r")
    # Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico
    #output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))
    
    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    if(mode=="drug"):
       drug_dict=dict(df_b[['protein_sequence','drug_id']].values)
       for item in output:
        protein_sequence1 = item[0]
        protein_sequence2 = item[1]
        res=[]
        [res.append(x) for x in literal_eval(drug_dict[item[0]]) if x not in res and ( x != '[' or x != ']') ] 
        if(len(res) == 1):
          item.append(res[0])
        elif(len(res)>1):
          item.append(res)
        else:
          item.append("")    
        res=[]
        [res.append(x) for x in literal_eval(drug_dict[item[1]]) if x not in res and ( x != '[' or x != ']')] 
        if(len(res) == 1):
          item.append(res[0])
        elif(len(res)>1):
          item.append(res)
        else:
          item.append("")  
        if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
            item[0] = proteinas_dict[protein_sequence1]
            item[1] = proteinas_dict[protein_sequence2]
       df_a=pd.DataFrame(output, columns=['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2','similaridadBlosum','drug_id_p1','drug_id_p2'])    
    else:
       for item in output:
        protein_sequence1 = item[0]
        protein_sequence2 = item[1]
        if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
            item[0] = proteinas_dict[protein_sequence1]
            item[1] = proteinas_dict[protein_sequence2]



       df_a = pd.DataFrame(output, columns=['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2','similaridadBlosum'])

    # Guardar el DataFrame actualizado en un archivo CSV
    df_a.to_csv('AllProteins_%Similitud'+Sal+'.csv', index=False)
def similitudMatProteinas(sequences,sequences2, matrix,matrix2,matriz3,matriz4,equal=False):
    """
    Create percentages of pairwise similarity scores between protein sequences based on three similarity matrices.

    Parameters:
    - sequences: List of protein sequences
    - matrix: First similarity matrix
    - matrix2: Second similarity matrix
    - matriz3: Third similarity matrix

    Returns:
    - List of lists containing pairwise similarity information:
        - [protein_sequence_1, protein_sequence_2, similarity_score_matrix1, similarity_score_matrix2, similarity_score_matrix3]

    This function takes a list of protein sequences and three similarity matrices and calculates pairwise similarity scores
    between each pair of protein sequences. The similarity scores are computed using the provided matrices, and the results
    are returned in a list of lists.

    Note: The function assumes that the matrices are square matrices with dimensions matching the length of the 'sequences' list.

    Example:
    >>> sequences = ["MACG", "MACC", "MGCA"]
    >>> matrix1 = [[1.0, 0.8, 0.6], [0.8, 1.0, 0.7], [0.6, 0.7, 1.0]]
    >>> matrix2 = [[0.9, 0.7, 0.5], [0.7, 0.9, 0.6], [0.5, 0.6, 0.9]]
    >>> matrix3 = [[0.8, 0.6, 0.4], [0.6, 0.8, 0.5], [0.4, 0.5, 0.8]]
    >>> result = similitudMatProteinas(sequences, matrix1, matrix2, matrix3)
    >>> print(result)
    [['MACG', 'MACC', 80.0, 70.0, 60.0],
     ['MACG', 'MGCA', 60.0, 50.0, 40.0],
     ['MACC', 'MACG', 80.0, 70.0, 60.0],
     ['MACC', 'MGCA', 70.0, 60.0, 50.0],
     ['MGCA', 'MACG', 60.0, 50.0, 40.0],
     ['MGCA', 'MACC', 70.0, 60.0, 50.0]]
    """
    output = []
    for row1 in range(0,len(sequences2)):
        for row2 in range(0,len(sequences)):
           if equal:
             if row1 != row2:
                #similarity = abs(smith.SmithWaterman(row1, row2).get_score()-1) / max(len(row1), len(row2))
                #similarity = abs(needle.NeedlemanWunsch(row1, row2).get_score()-1) / (2*max(len(row1), len(row2)))
                output.append([sequences[row2], sequences2[row1], matrix[row1][row2]*100,matrix2[row1][row2]*100,matriz3[row1][row2]*100,matriz4[row1][row2]*100])
           else:
                output.append([sequences[row2], sequences2[row1], matrix[row1][row2]*100,matrix2[row1][row2]*100,matriz3[row1][row2]*100,matriz4[row1][row2]*100])     
    return output
if __name__ == "__main__":
    archivoEntrada = "Data/data_lung_cancer_treatment.xlsx"
    sequences1 = readData(archivoEntrada)
    archivoEntrada2 = "Data/data_autoimmume_desease.xlsx"
    sequences2 = readData(archivoEntrada2)
    matrix=pd.read_csv('matrizNWAutoimmuneDiseaseC.csv',header=None,index_col=False)*3+1.0
    matrix.abs()
    matrix/=4
    print(matrix.shape)
    matrix2=pd.read_csv('matrizNWAutoimmuneDiseaseMod1.csv',header=None,index_col=False)*3+1.0
    matrix2.abs()
    matrix2/=4
    print(matrix2.shape)
    matrix3=pd.read_csv('matrizNWAutoimmuneDiseaseMod2.csv',header=None,index_col=False)*3+1.0
    matrix3.abs()
    matrix3/=4
    print(matrix3.shape)
    matrix4=pd.read_csv('matrizNWAutoimmuneDiseaseBlosum62.csv',header=None,index_col=False)
    dic= bl.BLOSUM(62)
    print(dic)
    mismatch=0
    match=1
    minn=min(min(min(min(list(row.values())) for row in list(dic.values())),-4),mismatch)
    print(matrix4.shape)
    print(len(sequences1))
    for row1 in range(0,len(sequences2)):
        for row2 in range(0,len(sequences1)):
            len_sec1=0
            len_min_sec1=0
            dic_seq=set()
            minf_letters='a'
            for i in sequences1[row2]:
               dic_seq.add(dic[i][i])
               minf_letters= i if(dic[i][i] == float('-inf')) else minf_letters
               
               len_sec1+=dic[i][i] if(dic[i][i] != float('-inf')) else match
               len_min_sec1+=min(list(dic[i].values())) if(dic[i][i] != float('-inf')) else mismatch
            len_sec2=0
            len_min_sec2=0   
            for i in sequences2[row1]:
               dic_seq.add(dic[i][i])
               minf_letters= i if(dic[i][i] == float('-inf')) else minf_letters
               len_sec2+=dic[i][i] if(dic[i][i] != float('-inf')) else match
               len_min_sec2+=min(list(dic[i].values())) if(dic[i][i] != float('-inf')) else mismatch
            if(max(len_sec2,len_sec1) == float('-inf')):   
               print(max(len_sec2,len_sec1))
               print(dic_seq)
               print(minf_letters)   
            matrix4[row1][row2]*=max(len_sec2,len_sec1)
            matrix4[row1][row2]-=(minn*max(len(sequences1[row2]),len(sequences2[row1])))
            matrix4[row1][row2]/= (max(len_sec2,len_sec1)-minn*max(len(sequences1[row2]),len(sequences2[row1])))
    print(matrix[0][0])
    print(matrix2[0][0])
    print(matrix3[0][0])
    print(matrix4[0][0])          
    #output = similitudProteinas(sequences)
    output=similitudMatProteinas(sequences1,sequences2, matrix,matrix2,matrix3,matrix4,equal=False)
    print("Generada la tabla de con las matrices de similaridad especificadas")
     
    remplazar_sequence_for_ID(output,archivoEntrada,archivoEntrada2,"AutoimmuneDisease")