import pandas as pd
import Levenshtein
import time
from sklearn.cluster import OPTICS, DBSCAN, AgglomerativeClustering, BisectingKMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy.spatial.distance import pdist, squareform
from pyclustering.cluster.dbscan import dbscan
from pyclustering.utils import timedcall
from Levenshtein import distance
import re
from minineedle import needle, smith, core
from Bio.Blast.Applications import NcbiblastpCommandline
from io import StringIO
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import swalign
import multiprocessing as mp

# Counter used to number the CSV files of discarded proteins.
globi = 0


def substitute_or_remove_prot_id(data, sub_rem):
    """Substitute ("s") or remove (any other value) protein IDs listed in nombres_sust.txt."""
    print("inside the problem")
    with open("nombres_sust.txt") as prottosubs:
        index = prottosubs.readline()
        acept = index.split()
        # One list of IDs per column header found on the first line.
        listtosubs = {}
        for i in range(0, len(acept)):
            listtosubs[acept[i]] = []
        while line := prottosubs.readline():
            newline = line.split()
            #print(len(newline))
            for i in range(0, len(newline)):
                listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip())
    # The column whose header contains "Primary" holds the IDs to keep.
    resub = 1
    if re.search("Primary", list(listtosubs.keys())[0]):
        resub = 0
    print((resub + 1) % 2)
    #print(data)
    #data2=data.copy()
    if sub_rem == "s":
        # Replace secondary IDs with their primary counterparts; assign the result back.
        data["protein_id"] = data["protein_id"].replace(
            list(listtosubs.values())[(resub + 1) % 2],
            list(listtosubs.values())[resub])
        #datacp=data.copy()
        #print(pd.concat([data2,datacp]).drop_duplicates())
    else:
        global globi
        datas = data[data["protein_id"].isin(list(listtosubs.values())[(resub + 1) % 2]) == True]
        data = data[data["protein_id"].isin(list(listtosubs.values())[(resub + 1) % 2]) == False]
        datas.to_csv('resultados/proteinasDescartadas_' + str(globi) + '.csv', index=False)
        globi = globi + 1
    return data


def readData(archivoEntrada, enfermedad):
    data = pd.read_excel(archivoEntrada)
    if enfermedad != '':
        #datar=substitute_or_remove_prot_id(data,"r")
        #print("total discarded protein rows, main set: " + str(len(data)-len(datar)))
        data = data.loc[data["disease_id"] == enfermedad]
    dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
    filt_data = len(data)
    alz_filt_data = len(dataB)
    #data=substitute_or_remove_prot_id(data,"r")
    #dataB=substitute_or_remove_prot_id(dataB,"r")
    print("proteins discarded after filtering, main set: " + str(filt_data - len(data)))
    print("proteins discarded after filtering, shared with Alzheimer: " + str(alz_filt_data - len(dataB)))
    # Drop proteins of this disease that also appear in the shared Alzheimer set.
    data = data[~((data["disease_id"] == enfermedad) &
                  (data["protein_id"].isin(dataB["protein_id"])) &
                  (data["gene_id"].isin(dataB["gene_id"])))]
    sequences = data["protein_sequence"]
    return sequences


def calculate_matriz_levens(data):
    data = list(data)  # positional indexing regardless of the incoming pandas index
    num_points = len(data)
    with mp.Pool(processes=20) as pool:
        sim_matrix = pool.starmap(levenshtein_similarity,
                                  [(data[i], data[j]) for i in range(num_points) for j in range(num_points)])
    # Rebuild the flat result list into an n x n matrix, one row per sequence.
    similarity = [sim_matrix[idx * num_points:(idx + 1) * num_points] for idx in range(num_points)]
    datf = pd.DataFrame(np.array(similarity))
    datf.to_csv('resultados/matrizLevenshtein.csv', index=False, header=False)
    return
def calculate_matrix_needle(data):
    data = list(data)
    num_points = len(data)
    with mp.Pool(processes=20) as pool:
        sim_matrix = pool.starmap(needleman_wunsch_similarity,
                                  [(data[i], data[j]) for i in range(num_points) for j in range(num_points)])
    # Rebuild the flat result list into an n x n matrix, one row per sequence.
    similarity = [sim_matrix[idx * num_points:(idx + 1) * num_points] for idx in range(num_points)]
    datf = pd.DataFrame(np.array(similarity))
    datf.to_csv('resultados/matrizNeedleWunch.csv', index=False, header=False)
    return


def calculate_matrix_smith(data):
    data = list(data)
    num_points = len(data)
    with mp.Pool(processes=20) as pool:
        sim_matrix = pool.starmap(smith_waterman_similarity,
                                  [(data[i], data[j]) for i in range(num_points) for j in range(num_points)])
    similarity = [sim_matrix[idx * num_points:(idx + 1) * num_points] for idx in range(num_points)]
    datf = pd.DataFrame(np.array(similarity))
    datf.to_csv('resultados/matrizSmithWater.csv', index=False, header=False)
    return


def calculate_matrix_blasto(data):
    data = list(data)
    num_points = len(data)
    with mp.Pool(processes=20) as pool:
        sim_matrix = pool.starmap(blast_similarity,
                                  [(data[i], data[j]) for i in range(num_points) for j in range(num_points)])
    similarity = [sim_matrix[idx * num_points:(idx + 1) * num_points] for idx in range(num_points)]
    datf = pd.DataFrame(np.array(similarity))
    datf.to_csv('resultados/matrizBlast.csv', index=False, header=False)


def remplazar_sequence_for_ID(output):
    """Map protein sequences back to their protein IDs using the reference spreadsheet."""
    df_b = pd.read_excel("data_nervous_genes_2.xlsx")
    df_b = substitute_or_remove_prot_id(df_b, "s")
    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    for i in range(len(output)):
        protein_sequence = output[i]
        if protein_sequence in proteinas_dict:
            output[i] = proteinas_dict[protein_sequence]
    return output


def smith_waterman_similarity(pattern1, pattern2):
    # minineedle needs align() to be called before the score can be read.
    alignment = smith.SmithWaterman(pattern1, pattern2)
    alignment.align()
    return alignment.get_score() / max(len(pattern1), len(pattern2))


def levenshtein_similarity(pattern1, pattern2):
    # Normalised edit distance in [0, 1]; the __main__ block converts it to a
    # similarity later via |x - 1|.
    return Levenshtein.distance(pattern1, pattern2) / max(len(pattern1), len(pattern2))


def needleman_wunsch_similarity(pattern1, pattern2):
    alignment = needle.NeedlemanWunsch(pattern1, pattern2)
    alignment.align()
    return alignment.get_score() / max(len(pattern1), len(pattern2))


def to_raw(string):
    return "{0}".format(string)


def blast_similarity(pattern1, pattern2):
    seq1 = SeqRecord(Seq(pattern1), id="seq1")
    seq2 = SeqRecord(Seq(pattern2), id="seq2")
    assert pattern1
    assert pattern2
    # NOTE: fixed file names, so concurrent Pool workers can overwrite each other's input.
    SeqIO.write(seq1, "seq1.fasta", "fasta")
    SeqIO.write(seq2, "seq2.fasta", "fasta")
    SeqIO.write(seq1, "seqx.fasta", "fasta")
    SeqIO.write(seq1, "seqy.fasta", "fasta")
    output = NcbiblastpCommandline(query="seq1.fasta", subject="seq2.fasta", outfmt=5)()[0]
    #print(output)
    blast_result_record = NCBIXML.read(StringIO(output))
    result = 0
    with open("seq1.fasta", 'w') as target:
        target.truncate()
    with open("seq2.fasta", 'w') as target:
        target.truncate()
    # Sum the scores of every HSP in every alignment.
    for alignment in blast_result_record.alignments:
        for hsp in alignment.hsps:
            result = result + hsp.score
    return int(result) / max(len(pattern1), len(pattern2))
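# --- Illustrative sketch, not part of the original pipeline -------------------
# The four calculate_matrix_* functions above repeat the same pattern: score every
# ordered pair of sequences in a process pool, reshape the flat result into an
# n x n matrix and write it to CSV. A single shared helper could do the same job.
# The name calculate_matrix_generic and the symmetry shortcut are assumptions, not
# something this script defines: all four scores are symmetric in their arguments,
# so only the upper triangle actually needs to be computed.
def calculate_matrix_generic(data, similarity_fn, out_path, processes=20):
    seqs = list(data)
    num_points = len(seqs)
    # Only pairs (i, j) with j >= i are scored; the lower triangle is mirrored below.
    pairs = [(seqs[i], seqs[j]) for i in range(num_points) for j in range(i, num_points)]
    with mp.Pool(processes=processes) as pool:
        scores = pool.starmap(similarity_fn, pairs)
    matrix = np.zeros((num_points, num_points))
    k = 0
    for i in range(num_points):
        for j in range(i, num_points):
            matrix[i, j] = matrix[j, i] = scores[k]  # mirror the upper triangle
            k += 1
    pd.DataFrame(matrix).to_csv(out_path, index=False, header=False)

# Hypothetical usage:
#   calculate_matrix_generic(data, levenshtein_similarity, 'resultados/matrizLevenshtein.csv')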
def readData(archivoEntrada, enfermedad):
    # NOTE: this second definition overrides the readData above; it returns protein IDs
    # instead of sequences, which is what the __main__ block below expects.
    data = pd.read_excel(archivoEntrada)
    #data = substitute_or_remove_prot_id(data,"r")
    if enfermedad != '':
        #datar=substitute_or_remove_prot_id(data,"r")
        #print("total discarded protein rows, main set: " + str(len(data)-len(datar)))
        data = data.loc[data["disease_id"] == enfermedad]
    #dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
    #dataB = substitute_or_remove_prot_id(dataB,"r")
    filt_data = len(data)
    #alz_filt_data=len(dataB)
    #data=substitute_or_remove_prot_id(data,"r")
    #dataB=substitute_or_remove_prot_id(dataB,"r")
    #print("proteins discarded after filtering, main set: " + str(filt_data-len(data)))
    #print("proteins discarded after filtering, shared with Alzheimer: " + str(alz_filt_data-len(dataB)))
    #data = data[~((data["disease_id"] == enfermedad) &
    #              (data["protein_id"].isin(dataB["protein_id"])) &
    #              ())]
    sequences = data["protein_id"]
    return sequences
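# --- Illustrative sketch, not part of the original pipeline -------------------
# blast_similarity above writes the fixed files seq1.fasta / seq2.fasta, so two of
# the 20 pool workers can clobber each other's input. A hedged variant that gives
# every call its own temporary files could look like this; blast_similarity_tmp is
# an assumed name and the scoring simply mirrors the function above.
import os
import tempfile

def blast_similarity_tmp(pattern1, pattern2):
    with tempfile.NamedTemporaryFile("w", suffix=".fasta", delete=False) as f1, \
         tempfile.NamedTemporaryFile("w", suffix=".fasta", delete=False) as f2:
        SeqIO.write(SeqRecord(Seq(pattern1), id="seq1"), f1, "fasta")
        SeqIO.write(SeqRecord(Seq(pattern2), id="seq2"), f2, "fasta")
    try:
        output = NcbiblastpCommandline(query=f1.name, subject=f2.name, outfmt=5)()[0]
        record = NCBIXML.read(StringIO(output))
        # Same aggregation as blast_similarity: sum every HSP score, then normalise.
        score = sum(hsp.score for aln in record.alignments for hsp in aln.hsps)
        return score / max(len(pattern1), len(pattern2))
    finally:
        os.remove(f1.name)
        os.remove(f2.name)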
if __name__ == "__main__":
    data = readData("data_nervous_genes_x.xlsx", "C0002395")
    calculate_matrix_needle(data)
    calculate_matrix_smith(data)
    output = data.to_list()
    #output=remplazar_sequence_for_ID(data)

    # The CSV stores normalised Levenshtein distances; |x - 1| turns them into similarities.
    similarity_matrix = pd.read_csv('resultados/matrizLevenshtein.csv', header=None, index_col=False) - 1
    #similarity_matrix=similarity_matrix/2
    similarity_matrix = similarity_matrix.abs()
    similarity_matrix = similarity_matrix.to_numpy()
    sim_mat_40 = similarity_matrix.copy()
    sim_mat_20 = similarity_matrix.copy()
    sim_mat_10 = similarity_matrix.copy()

    data_40 = pd.read_csv('resultados/Metrica_Coincidencia_40.csv',
                          names=['proteina1', 'proteina2', '%Coincidencia'], index_col=False)
    data_40 = data_40.drop([0])
    data_20 = pd.read_csv('resultados/Metrica_Coincidencia_20.csv',
                          names=['proteina1', 'proteina2', '%Coincidencia'], index_col=False)
    data_20 = data_20.drop([0])
    data_10 = pd.read_csv('resultados/Metrica_Coincidencia_10.csv',
                          names=['proteina1', 'proteina2', '%Coincidencia'], index_col=False)
    data_10 = data_10.drop([0])

    new_sim = np.copy(similarity_matrix)
    print(output)
    new_sim_mean = np.copy(similarity_matrix)

    # Blend the 40% coincidence metric into the similarity matrix.
    for i, ks in data_40.iterrows():
        sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])] += float(ks['%Coincidencia']) * 0.3
    #for i,kks in data_20.iterrows():
    #    sim_mat_20[output.index(kks['proteina1'])][output.index(kks['proteina2'])]+=float(kks['%Coincidencia'])*0.3
    #for i,ksk in data_10.iterrows():
    #    sim_mat_10[output.index(ksk['proteina1'])][output.index(ksk['proteina2'])]+=float(ksk['%Coincidencia'])*0.3

    #dfx=pd.DataFrame(sim_mat_20)
    #dfx=df/1.3
    #dfx=df-1
    #dfx.abs()
    #dfx.to_csv("resultados/matrizLevenshteinFS_20.csv",header=False,index=False)

    dfx = pd.DataFrame(sim_mat_40)
    dfx = dfx / 1.3
    dfx = dfx - 1
    dfx = dfx.abs()
    dfx.to_csv("resultados/matrizLevenshteinFS_40.csv", header=False, index=False)

    # The following block is disabled (kept as a string literal in the original source).
    """
    dfx=pd.DataFrame(sim_mat_10)
    dfx=df/1.3
    dfx=df-1
    dfx.abs()
    dfx.to_csv("resultados/matrizLevenshteinFS_10.csv",header=False,index=False)

    s1 = pd.merge(data_40, data_20, how='inner', on=['proteina1','proteina2'])
    s2 = pd.merge(s1, data_10, how='inner', on=['proteina1','proteina2'])
    ss = s1[(~(s1['proteina1'].isin(s2['proteina1'])) & ~(s1['proteina2'].isin(s2['proteina2'])))]
    s3 = pd.merge(data_20, data_10, how='inner', on=['proteina1','proteina2'])
    print(s3['proteina2'].isin(s2['proteina2']))
    s4 = s3[~(s3['proteina1'].isin(s2['proteina1'])) & ~(s3['proteina2'].isin(s2['proteina2']))]
    s5 = pd.merge(data_40, data_10, how='inner', on=['proteina1','proteina2'])
    s6 = s5.loc[~(s5['proteina1'].isin(s2['proteina1'])) & (s5['proteina2'].isin(s2['proteina2']))]
    data_401 = data_40[~(data_40['proteina1'].isin(data_20['proteina1'])) & ~(data_40['proteina2'].isin(data_20['proteina2']))]
    data_402 = data_40[~(data_40['proteina1'].isin(data_10['proteina1'])) & ~(data_40['proteina2'].isin(data_10['proteina2']))]
    data_40X = data_402[~(data_402['proteina1'].isin(data_20['proteina1'])) & ~(data_402['proteina2'].isin(data_20['proteina2']))]
    data_201 = data_20[~(data_20['proteina1'].isin(data_40['proteina1'])) & (data_20['proteina2'].isin(data_40['proteina2']))]
    data_202 = data_20[~(data_20['proteina1'].isin(data_10['proteina1'])) & (data_20['proteina2'].isin(data_10['proteina2']))]
    data_20X = data_202[~(data_202['proteina1'].isin(data_40['proteina1'])) & (data_202['proteina2'].isin(data_40['proteina2']))]
    data_101 = data_10[~(data_10['proteina1'].isin(data_40['proteina1'])) & (data_10['proteina2'].isin(data_40['proteina2']))]
    data_102 = data_10[~(data_10['proteina1'].isin(data_20['proteina1'])) & (data_10['proteina2'].isin(data_20['proteina2']))]
    data_10X = data_102[~(data_102['proteina1'].isin(data_40['proteina1'])) & (data_102['proteina2'].isin(data_40['proteina2']))]
    #print(s3)
    print(data_40X)
    print(data_20X)
    print(data_10X)
    #print(data_402)
    for i in range(0, similarity_matrix.shape[0]):
        for j in range(0, similarity_matrix.shape[1]):
            cross = 0
            cross_over = 0
            dd_10_check = False
            dd_20_check = False
            dd_40_check = False
            if ((data_40['proteina1']==output[i]) & (data_40['proteina2']==output[j])).any() or ((data_40['proteina1']==output[j]) & (data_40['proteina2']==output[i])).any():
                dd_40_check = True
                if ((data_40['proteina1']==output[i]) & (data_40['proteina2']==output[j])).any():
                    dd_40 = float(data_40[(data_40['proteina1']==output[i]) & (data_40['proteina2']==output[j])]['%Coincidencia'].to_list()[0])/100
                else:
                    dd_40 = float(data_40[(data_40['proteina1']==output[j]) & (data_40['proteina2']==output[i])]['%Coincidencia'].to_list()[0])/100
            if ((data_20['proteina1']==output[i]) & (data_20['proteina2']==output[j])).any() or ((data_20['proteina1']==output[j]) & (data_20['proteina2']==output[i])).any():
                dd_20_check = True
                if ((data_20['proteina1']==output[i]) & (data_20['proteina2']==output[j])).any():
                    dd_20 = float(data_20[(data_20['proteina1']==output[i]) & (data_20['proteina2']==output[j])]['%Coincidencia'].to_list()[0])/100
                else:
                    dd_20 = float(data_20[(data_20['proteina1']==output[j]) & (data_20['proteina2']==output[i])]['%Coincidencia'].to_list()[0])/100
            if ((data_10['proteina1']==output[i]) & (data_10['proteina2']==output[j])).any() or ((data_10['proteina1']==output[j]) & (data_10['proteina2']==output[i])).any():
                dd_10_check = True
                if ((data_10['proteina1']==output[i]) & (data_10['proteina2']==output[j])).any():
                    dd_10 = float(data_10[(data_10['proteina1']==output[i]) & (data_10['proteina2']==output[j])]['%Coincidencia'].to_list()[0])/100
                else:
                    dd_10 = float(data_10[(data_10['proteina1']==output[j]) & (data_10['proteina2']==output[i])]['%Coincidencia'].to_list()[0])/100
            if dd_10_check and dd_40_check and dd_20_check:
                #print(dd_40)
                #print(dd_20)
                #print(dd_10)
                cross = (dd_40-dd_20)-(dd_20-dd_10)
                cross_over = (dd_40+dd_20+dd_10)/len([dd_20,dd_10,dd_40])
            elif dd_20_check and dd_40_check:
                cross = (dd_40-dd_20)
                cross_over = (dd_40+dd_20)/len([dd_20,dd_40])
            elif dd_10_check and dd_20_check:
                cross = (dd_20-dd_10)
                cross_over = (dd_20+dd_10)/len([dd_20,dd_10])
            elif dd_40_check and dd_10_check:
                cross = (dd_40-dd_10)
                cross_over = (dd_40+dd_10)/len([dd_10,dd_40])
            elif dd_40_check:
                cross = -dd_40
                cross_over = dd_40
            elif dd_20_check:
                cross = -dd_20
                cross_over = dd_20
            elif dd_10_check:
                cross = -dd_10
                cross_over = dd_10
            if cross != 0:
                print(cross)
            if cross == 0:
                cross = -1
            new_sim[i][j] += 0.3*cross
            new_sim_mean[i][j] += 0.3*cross_over
    df = pd.DataFrame(new_sim)
    df = df-1
    df.abs()
    df = df/1.3
    df.to_csv("resultados/matrizLevenshteinF.csv",header=False,index=False)
    df2 = pd.DataFrame(new_sim_mean)
    df2 = df/1.3
    df2 = df-1
    df2.abs()
    """
    #df2.to_csv("resultados/matrizLevenshteinFMean.csv",header=False,index=False)
    #calculate_matrix_blasto(data)
    #calculate_matriz_levens(data)
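# --- Illustrative sketch, not part of the original pipeline -------------------
# The sklearn clustering classes imported at the top of this file are never used
# here; presumably the exported CSV matrices feed a separate clustering step. A
# hedged example of such a step is sketched below. The helper name
# cluster_from_matrix, the choice of matrizLevenshteinFS_40.csv and the assumption
# that the exported values behave as precomputed distances in [0, 1] are all
# assumptions, not facts taken from this script.
def cluster_from_matrix(path='resultados/matrizLevenshteinFS_40.csv', n_clusters=5):
    dist = pd.read_csv(path, header=None).to_numpy()
    # "metric" is the sklearn >= 1.2 spelling; older releases use affinity="precomputed".
    model = AgglomerativeClustering(n_clusters=n_clusters, metric="precomputed",
                                    linkage="average")
    return model.fit_predict(dist)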