import pandas as pd import Levenshtein from minineedle import needle, smith, core from ast import literal_eval import blosum as bl def readData(archivoEntrada): """ Read protein sequences from an Excel file. Parameters: - archivoEntrada: Input Excel file path Returns: - List of protein sequences This function reads protein sequences from an Excel file specified by 'archivoEntrada' and extracts the 'protein_sequence' column from the DataFrame. The sequences are returned as a list. Example: >>> sequences = readData("protein_data.xlsx") >>> print(sequences) ['MTCG...', 'MCTA...', ...] """ data = pd.read_excel(archivoEntrada) #data=substitute_or_remove_prot_id(data,'r') sequences = data["protein_sequence"] return sequences def similitudProteinas(sequences): """ Calculate pairwise similarity scores between protein sequences using Levenshtein distance. Parameters: - sequences: List of protein sequences Returns: - List of lists containing pairwise similarity information: - [protein_sequence_1, protein_sequence_2, similarity_score] This function takes a list of protein sequences and calculates pairwise similarity scores between each pair of protein sequences using Levenshtein distance. The results are returned in a list of lists. Example: >>> sequences = ["MACG", "MACC", "MGCA"] >>> result = similitudProteinas(sequences) >>> print(result) [['MACG', 'MACC', 75.0], ['MACG', 'MGCA', 50.0], ['MACC', 'MACG', 75.0], ['MACC', 'MGCA', 66.67], ['MGCA', 'MACG', 50.0], ['MGCA', 'MACC', 66.67]] """ output = [] for row1 in sequences: for row2 in sequences: if row1 != row2: #similarity = abs(smith.SmithWaterman(row1, row2).get_score()-1) / max(len(row1), len(row2)) #similarity = abs(needle.NeedlemanWunsch(row1, row2).get_score()-1) / (2*max(len(row1), len(row2))) similarity = abs(Levenshtein.distance(row1, row2)) / max(len(row1), len(row2)) output.append([row1, row2, similarity*100]) return output def remplazar_sequence_for_ID(output,archivoEntrada,archivoEntrada2,Sal,mode="default"): """ Replace protein sequences with protein IDs using a pre-existing DataFrame. Parameters: - output: List of lists containing similarity information - mode: Replacement mode (default or drug) - archivoEntrada: Path to protein information file - Sal: Extension for output file This function takes a list of lists containing pairwise similarity information, and replaces protein sequences with their corresponding protein IDs. The replacement is based on the information provided in a pre-existing DataFrame. The updated information is saved to a CSV file. Example: >>> data = [['MACG', 'MGCA', 75.0], ['MACC', 'MGCA', 66.67]] >>> inputFile = "protein_data.xlsx" >>> outputExt = "protein" >>> remplazar_sequence_for_ID(data,inputFile,OutputExt, mode="default") """ df_b = pd.read_excel(archivoEntrada) df_c= pd.read_excel(archivoEntrada2) common_cols = list(set.intersection(*(set(df_b.columns),set(df_c.columns) ))) df_b=pd.concat([df_b[common_cols],df_c[common_cols]], ignore_index=True) #df_b=substitute_or_remove_prot_id(df_b,"r") # Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico #output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0])) proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values) if(mode=="drug"): drug_dict=dict(df_b[['protein_sequence','drug_id']].values) for item in output: protein_sequence1 = item[0] protein_sequence2 = item[1] res=[] [res.append(x) for x in literal_eval(drug_dict[item[0]]) if x not in res and ( x != '[' or x != ']') ] if(len(res) == 1): item.append(res[0]) elif(len(res)>1): item.append(res) else: item.append("") res=[] [res.append(x) for x in literal_eval(drug_dict[item[1]]) if x not in res and ( x != '[' or x != ']')] if(len(res) == 1): item.append(res[0]) elif(len(res)>1): item.append(res) else: item.append("") if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict: item[0] = proteinas_dict[protein_sequence1] item[1] = proteinas_dict[protein_sequence2] df_a=pd.DataFrame(output, columns=['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2','similaridadBlosum','drug_id_p1','drug_id_p2']) else: for item in output: protein_sequence1 = item[0] protein_sequence2 = item[1] if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict: item[0] = proteinas_dict[protein_sequence1] item[1] = proteinas_dict[protein_sequence2] df_a = pd.DataFrame(output, columns=['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2','similaridadBlosum']) # Guardar el DataFrame actualizado en un archivo CSV df_a.to_csv('AllProteins_%Similitud'+Sal+'.csv', index=False) def similitudMatProteinas(sequences,sequences2, matrix,matrix2,matriz3,matriz4,equal=False): """ Create percentages of pairwise similarity scores between protein sequences based on three similarity matrices. Parameters: - sequences: List of protein sequences - matrix: First similarity matrix - matrix2: Second similarity matrix - matriz3: Third similarity matrix Returns: - List of lists containing pairwise similarity information: - [protein_sequence_1, protein_sequence_2, similarity_score_matrix1, similarity_score_matrix2, similarity_score_matrix3] This function takes a list of protein sequences and three similarity matrices and calculates pairwise similarity scores between each pair of protein sequences. The similarity scores are computed using the provided matrices, and the results are returned in a list of lists. Note: The function assumes that the matrices are square matrices with dimensions matching the length of the 'sequences' list. Example: >>> sequences = ["MACG", "MACC", "MGCA"] >>> matrix1 = [[1.0, 0.8, 0.6], [0.8, 1.0, 0.7], [0.6, 0.7, 1.0]] >>> matrix2 = [[0.9, 0.7, 0.5], [0.7, 0.9, 0.6], [0.5, 0.6, 0.9]] >>> matrix3 = [[0.8, 0.6, 0.4], [0.6, 0.8, 0.5], [0.4, 0.5, 0.8]] >>> result = similitudMatProteinas(sequences, matrix1, matrix2, matrix3) >>> print(result) [['MACG', 'MACC', 80.0, 70.0, 60.0], ['MACG', 'MGCA', 60.0, 50.0, 40.0], ['MACC', 'MACG', 80.0, 70.0, 60.0], ['MACC', 'MGCA', 70.0, 60.0, 50.0], ['MGCA', 'MACG', 60.0, 50.0, 40.0], ['MGCA', 'MACC', 70.0, 60.0, 50.0]] """ output = [] for row1 in range(0,len(sequences2)): for row2 in range(0,len(sequences)): if equal: if row1 != row2: #similarity = abs(smith.SmithWaterman(row1, row2).get_score()-1) / max(len(row1), len(row2)) #similarity = abs(needle.NeedlemanWunsch(row1, row2).get_score()-1) / (2*max(len(row1), len(row2))) output.append([sequences[row2], sequences2[row1], matrix[row1][row2]*100,matrix2[row1][row2]*100,matriz3[row1][row2]*100,matriz4[row1][row2]*100]) else: output.append([sequences[row2], sequences2[row1], matrix[row1][row2]*100,matrix2[row1][row2]*100,matriz3[row1][row2]*100,matriz4[row1][row2]*100]) return output if __name__ == "__main__": archivoEntrada = "Data/data_lung_cancer_treatment.xlsx" sequences1 = readData(archivoEntrada) archivoEntrada2 = "Data/data_autoimmume_desease.xlsx" sequences2 = readData(archivoEntrada2) matrix=pd.read_csv('matrizNWAutoimmuneDiseaseC.csv',header=None,index_col=False)*3+1.0 matrix.abs() matrix/=4 print(matrix.shape) matrix2=pd.read_csv('matrizNWAutoimmuneDiseaseMod1.csv',header=None,index_col=False)*3+1.0 matrix2.abs() matrix2/=4 print(matrix2.shape) matrix3=pd.read_csv('matrizNWAutoimmuneDiseaseMod2.csv',header=None,index_col=False)*3+1.0 matrix3.abs() matrix3/=4 print(matrix3.shape) matrix4=pd.read_csv('matrizNWAutoimmuneDiseaseBlosum62.csv',header=None,index_col=False) dic= bl.BLOSUM(62) print(dic) mismatch=0 match=1 minn=min(min(min(min(list(row.values())) for row in list(dic.values())),-4),mismatch) print(matrix4.shape) print(len(sequences1)) for row1 in range(0,len(sequences2)): for row2 in range(0,len(sequences1)): len_sec1=0 len_min_sec1=0 dic_seq=set() minf_letters='a' for i in sequences1[row2]: dic_seq.add(dic[i][i]) minf_letters= i if(dic[i][i] == float('-inf')) else minf_letters len_sec1+=dic[i][i] if(dic[i][i] != float('-inf')) else match len_min_sec1+=min(list(dic[i].values())) if(dic[i][i] != float('-inf')) else mismatch len_sec2=0 len_min_sec2=0 for i in sequences2[row1]: dic_seq.add(dic[i][i]) minf_letters= i if(dic[i][i] == float('-inf')) else minf_letters len_sec2+=dic[i][i] if(dic[i][i] != float('-inf')) else match len_min_sec2+=min(list(dic[i].values())) if(dic[i][i] != float('-inf')) else mismatch if(max(len_sec2,len_sec1) == float('-inf')): print(max(len_sec2,len_sec1)) print(dic_seq) print(minf_letters) matrix4[row1][row2]*=max(len_sec2,len_sec1) matrix4[row1][row2]-=(minn*max(len(sequences1[row2]),len(sequences2[row1]))) matrix4[row1][row2]/= (max(len_sec2,len_sec1)-minn*max(len(sequences1[row2]),len(sequences2[row1]))) print(matrix[0][0]) print(matrix2[0][0]) print(matrix3[0][0]) print(matrix4[0][0]) #output = similitudProteinas(sequences) output=similitudMatProteinas(sequences1,sequences2, matrix,matrix2,matrix3,matrix4,equal=False) print("Generada la tabla de con las matrices de similaridad especificadas") remplazar_sequence_for_ID(output,archivoEntrada,archivoEntrada2,"AutoimmuneDisease")