Commit b5e1a75c authored by Belen Otero Carrasco

Adding final results finding patterns

parents
import pandas as pd
import Levenshtein
import time
import numpy as np
from Levenshtein import distance
import re
from minineedle import needle, smith, core
from Bio.Blast.Applications import NcbiblastpCommandline
from io import StringIO
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import swalign
import multiprocessing as mp
globi=0
import nw_wrapper
import nw_wrapper_matrix
import math
import blosum as bl
def substitute_or_remove_prot_id(data,archSubs,sub_rem):
    """
    Substitute or remove protein IDs based on a substitution file.

    The substitution file is whitespace-separated: its first line holds the
    column names and each following line holds one value per column. Only the
    first two columns are used. If the first column name contains "Primary",
    the first column is the primary one, otherwise the second column is.
    Parameters:
    - data: DataFrame containing protein data (must have a "protein_id" column)
    - archSubs: Path of the substitution file
    - sub_rem: 's' to substitute; any other value (e.g. 'r') to remove
    Returns:
    - New DataFrame after substitution or removal (the input is left untouched)
    Raises:
    - ValueError: if the substitution file has fewer than two columns
    """
    global globi
    with open(archSubs) as prottosubs:
        headers = prottosubs.readline().split()
        if len(headers) < 2:
            raise ValueError("substitution file needs at least two columns: " + str(archSubs))
        columns = [[] for _ in headers]
        for line in prottosubs:
            for column, value in zip(columns, line.split()):
                column.append(value)
    primary_idx = 0 if re.search("Primary", headers[0]) else 1
    primary_ids = columns[primary_idx]
    secondary_ids = columns[1 - primary_idx]
    if sub_rem == "s":
        # Series.replace returns a new Series; the result has to be assigned
        # back, otherwise the substitution is silently lost.
        data = data.copy()
        data["protein_id"] = data["protein_id"].replace(secondary_ids, primary_ids)
        return data
    # Removal mode: rows whose id is in the non-primary column are written to a
    # numbered CSV and dropped from the returned frame.
    discarded = data["protein_id"].isin(secondary_ids)
    data[discarded].to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False)
    globi = globi + 1
    return data[~discarded]
def readData(archivoEntrada, enfermedad):
    """
    Load the protein sequences stored in an Excel file.
    Parameters:
    - archivoEntrada: Excel file containing a "protein_sequence" column
    - enfermedad: Disease ID; accepted but currently not used (no filtering is applied)
    Returns:
    - The "protein_sequence" column as a pandas Series
    """
    return pd.read_excel(archivoEntrada)["protein_sequence"]
def calculate_matrix_similarity(data, similarity_function, output_filename, processes=20):
    """
    Calculate the pairwise similarity matrix of all sequences using multiprocessing
    and write it to a CSV file without header or index.
    Parameters:
    - data: Protein sequences, indexable by 0..len(data)-1
    - similarity_function: Picklable function taking two sequences and returning a score
    - output_filename: Filename to save the similarity matrix
    - processes: Number of worker processes (default 20)
    """
    num_points = len(data)
    pairs = [(data[i], data[j]) for i in range(num_points) for j in range(num_points)]
    if pairs:
        with mp.Pool(processes=processes) as pool:
            scores = pool.starmap(similarity_function, pairs)
    else:
        # Nothing to compare: skip the pool and write an empty matrix.
        scores = []
    # starmap keeps input order, so the flat list is the matrix in row-major order.
    matrix = np.array(scores).reshape(num_points, num_points)
    pd.DataFrame(matrix).to_csv(output_filename, index=False, header=False)
def remplazar_sequence_for_ID(output,archivoEntrada,archSubs):
    """
    Replace, in place, every protein sequence in output with its protein id.
    Parameters:
    - output: Indexable collection of sequences (positions 0..len-1); it is modified in place
    - archivoEntrada: Excel file with "protein_sequence" and "protein_id" columns
    - archSubs: Substitution file handed to substitute_or_remove_prot_id in 's' mode
    Returns:
    - The same output object; sequences without a known id are left as they are
    """
    equivalences = substitute_or_remove_prot_id(pd.read_excel(archivoEntrada), archSubs, "s")
    id_by_sequence = dict(equivalences[['protein_sequence', 'protein_id']].values)
    for position in range(len(output)):
        current = output[position]
        if current not in id_by_sequence:
            continue
        output[position] = id_by_sequence[current]
    return output
def smith_waterman_similarity(pattern1,pattern2):
    """
    Smith-Waterman score of two sequences using minineedle's default settings.
    Parameters:
    - pattern1: Protein sequence 1
    - pattern2: Protein sequence 2
    Returns:
    - Alignment score divided by the length of the longer sequence
      (intended to lie in [0,1]; NOTE(review): depends on minineedle's default scoring - confirm)
    """
    raw_score = smith.SmithWaterman(pattern1, pattern2).get_score()
    longest = max(len(pattern1), len(pattern2))
    return raw_score / longest
def levenshtein_similarity(pattern1, pattern2):
    """
    Calculate the normalized Levenshtein distance between two sequences.

    Despite the function name this is a distance: 0 means the sequences are
    identical and larger values mean more edits are needed.
    Parameters:
    - pattern1: Protein sequence 1
    - pattern2: Protein sequence 2
    Returns:
    - Levenshtein distance divided by the length of the longer sequence
      (0.0 when both sequences are empty)
    """
    longest = max(len(pattern1), len(pattern2))
    if longest == 0:
        # Two empty sequences are identical; avoid dividing by zero.
        return 0.0
    return Levenshtein.distance(pattern1, pattern2) / longest
def needleman_wunsch_similarity(pattern1, pattern2):
    """
    Needleman-Wunsch score of two sequences using minineedle's default settings.
    Parameters:
    - pattern1: Protein sequence 1
    - pattern2: Protein sequence 2
    Returns:
    - Alignment score divided by the length of the longer sequence
      (intended to lie in [-1,1]; NOTE(review): depends on minineedle's default scoring - confirm)
    """
    raw_score = needle.NeedlemanWunsch(pattern1, pattern2).get_score()
    longest = max(len(pattern1), len(pattern2))
    return raw_score / longest
def to_raw(string):
    """Return the default string formatting of the given value."""
    return f"{string}"
def blast_similarity(pattern1,pattern2):
    """
    Run BLAST (blastp) to calculate similarity between two protein sequences.

    The two FASTA files blastp needs are created in a private temporary
    directory, so concurrent calls (e.g. from a multiprocessing pool) cannot
    overwrite each other's input, and nothing is left behind in the working
    directory.
    Parameters:
    - pattern1: Protein sequence 1 (query)
    - pattern2: Protein sequence 2 (subject)
    Returns:
    - Sum of the scores of all HSPs divided by the length of the longer sequence
    Raises:
    - ValueError: if either sequence is empty
    """
    import os
    import tempfile

    if not pattern1 or not pattern2:
        raise ValueError("blast_similarity requires two non-empty sequences")
    with tempfile.TemporaryDirectory() as workdir:
        query_path = os.path.join(workdir, "seq1.fasta")
        subject_path = os.path.join(workdir, "seq2.fasta")
        SeqIO.write(SeqRecord(Seq(pattern1), id="seq1"), query_path, "fasta")
        SeqIO.write(SeqRecord(Seq(pattern2), id="seq2"), subject_path, "fasta")
        # outfmt=5 is BLAST's XML output; calling the command returns (stdout, stderr).
        output = NcbiblastpCommandline(query=query_path, subject=subject_path, outfmt=5)()[0]
    blast_result_record = NCBIXML.read(StringIO(output))
    result = 0
    for alignment in blast_result_record.alignments:
        for hsp in alignment.hsps:
            result = result + hsp.score
    return float(result)/max(len(pattern1), len(pattern2))
def nwmodScore(sec1,sec2,dic,match,mismatch,gap):
    """
    Normalized Needleman-Wunsch score computed through nw_wrapper.NW.
    Parameters:
    - sec1: Protein sequence 1
    - sec2: Protein sequence 2
    - dic: Substitution dictionary of the letters that belong to the same group
    - match: match value
    - mismatch: mismatch value
    - gap: gap value
    Returns:
    - Integer alignment score divided by match times the length of the longer sequence
    """
    aligner = nw_wrapper.NW(sec1, sec2, dic, match, mismatch, gap)
    raw_score = int(aligner.get_score())
    longest = max(len(sec1), len(sec2))
    return raw_score / (match * longest)
def nwmodScoreMt(sec1,sec2,dic,match,mismatch,gap):
    """
    Normalized Needleman-Wunsch score computed through nw_wrapper_matrix.NWM.

    The raw score is divided by the larger of the two self-alignment scores,
    where a sequence's self-alignment score is the sum of dic[x][x] over its
    residues.
    Parameters:
    - sec1: Protein sequence 1
    - sec2: Protein sequence 2
    - dic: Substitution matrix indexable as dic[a][b]
    - match: match value
    - mismatch: mismatch value
    - gap: gap value
    Returns:
    - Integer alignment score divided by max(self-score of sec1, self-score of sec2)
    """
    nw_instance = nw_wrapper_matrix.NWM(sec1, sec2, dic, match, mismatch, gap)
    score = int(nw_instance.get_score())
    self_score1 = sum(dic[residue][residue] for residue in sec1)
    self_score2 = sum(dic[residue][residue] for residue in sec2)
    return score / max(self_score1, self_score2)
def _write_pairwise_rows(data, score_pair, output_path):
    """Score every (i, j) pair with score_pair and write the matrix to output_path row by row."""
    num_points = len(data)
    for i in range(num_points):
        row = [score_pair(data[i], data[j]) for j in range(num_points)]
        # The first row (re)creates the file, later rows are appended, so only
        # one row of scores is held in memory at a time.
        pd.DataFrame(np.array(row).reshape(1, -1)).to_csv(
            output_path, index=False, header=False, mode="w" if i == 0 else "a")


def generate_nwmod(data,dic,dd,sal):
    """
    Compute the pairwise Needleman-Wunsch score (nwmodScore) of every pair of
    proteins and write the matrix to 'resultados/matrizNW<sal>.csv'.
    Parameters:
    - data: Protein sequences, indexable by 0..len(data)-1
    - dic: Substitution dictionary of the letters that belong to the same group
    - dd: Dictionary with the "match", "mismatch" and "gap" values
    - sal: Extension added to the output file name
    Returns:
    - None; the matrix is only written to disk (no file is written for empty data)
    """
    match, mismatch, gap = dd["match"], dd["mismatch"], dd["gap"]
    _write_pairwise_rows(
        data,
        lambda sec1, sec2: nwmodScore(sec1, sec2, dic, match, mismatch, gap),
        'resultados/matrizNW'+sal+'.csv')
    return


def generate_nwmodpremade(data,dic,dd,sal):
    """
    Compute the pairwise Needleman-Wunsch score (nwmodScoreMt, premade
    substitution matrix) of every pair of proteins and write the matrix to
    'resultados/matrizNW<sal>.csv'.
    Parameters:
    - data: Protein sequences, indexable by 0..len(data)-1
    - dic: Substitution matrix indexable as dic[a][b]
    - dd: Dictionary with the "match", "mismatch" and "gap" values
    - sal: Extension added to the output file name
    Returns:
    - None; the matrix is only written to disk (no file is written for empty data)
    """
    match, mismatch, gap = dd["match"], dd["mismatch"], dd["gap"]
    _write_pairwise_rows(
        data,
        lambda sec1, sec2: nwmodScoreMt(sec1, sec2, dic, match, mismatch, gap),
        'resultados/matrizNW'+sal+'.csv')
    return
def swap_dict(d):
    """
    Swap keys and values in a dictionary of lists.
    Parameters:
    - d: Input dictionary mapping each key to an iterable of values
    Returns:
    - Dictionary mapping every value to the list of keys it appeared under.
      A value of length 1 is stored under its single element (value[0]).
    """
    new_dict = {}
    for key, values in d.items():
        for value in values:
            # Normalise the key first so the lookup and the store use the same
            # key; otherwise one-element containers reset their list each time.
            target = value[0] if len(value) == 1 else value
            new_dict.setdefault(target, []).append(key)
    return new_dict
def read_aminoacidos(afile):
    """
    Read a tab-separated amino acid file and build two dictionaries from it.

    On each line the first field is the key and the remaining fields are its values.
    Parameters:
    - afile: Amino acid data file
    Returns:
    - Tuple (swap_dict of the mapping, mapping of first field -> remaining fields)
    """
    by_first_field = {}
    with open(afile, 'r') as handle:
        for raw_line in handle.readlines():
            name, *members = raw_line.replace('\n', '').split('\t')
            by_first_field[name] = members
    return swap_dict(by_first_field), by_first_field
def get_matrix(data, similarity_function, output_filename):
    """
    Build the square similarity matrix of data and write it to a file.

    Thin wrapper that forwards its arguments to calculate_matrix_similarity.
    Parameters:
    - data: Protein sequences
    - similarity_function: Similarity function to apply to each pair
    - output_filename: Name of the file in which the output matrix will be written
    """
    calculate_matrix_similarity(
        data=data,
        similarity_function=similarity_function,
        output_filename=output_filename,
    )
def get_clases(clas):
    """
    Compute how substitutable two amino acids are from the classes they share.
    Parameters:
    - clas: Dictionary mapping each amino acid to the classes it belongs to
    Returns:
    - Dictionary of dictionaries where result[a][b] is the number of classes
      shared by a and b divided by the number of classes either of them has
    """
    class_sets = {amino: set(groups) for amino, groups in clas.items()}
    return {
        first: {
            second: float(len(first_set & second_set) / len(first_set | second_set))
            for second, second_set in class_sets.items()
        }
        for first, first_set in class_sets.items()
    }
if __name__=="__main__":
    # Load the sequences and write one similarity matrix per configured metric.
    inputFile="Data/data_lung_cancer_treatment.xlsx"
    data=readData(inputFile,"")
    similarity_functions = [levenshtein_similarity]
    output_filenames = ["levenshtein_similarity.csv"]
    for sim_func, output_filename in zip(similarity_functions, output_filenames):
        get_matrix(data, sim_func, output_filename)
    afile="Data/aminoacidos_mod.txt"
    sal="mod1"
    # Smoke test of nw_wrapper.NW on two hard-coded sequences. Each literal is
    # written as adjacent string pieces inside parentheses because a plain
    # quoted string cannot span a physical line break.
    a = (
        "MACWPQLRLLLWKNLTFRRRQTCQLLLEVAWPLFIFLILISVRLSYPPYEQHECHFPNKAMPSAGTLPWVQGIICNANNPCFRYPTPGEAPGVVGNFNKSIVARLFSDARRLLLYSQKDTSMKDMRKVLRTLQQIKKSSSNLKLQDFLVDNETFSGFLYHNLSLPKSTVDKMLRADVILHKVFLQGYQLHLTSLCNGSKSEEMIQLGDQEVSELCGLPREKLAAAERVLRSNMDILKPILRTLNSTSPFPSKELAEATKTLLHSLGTLAQELFSMRSWSDMRQEVMFLTNVNSSSSSTQIYQAVSRIVCGHPEGGGLKIKSLNWYEDNNYKALFGGNGTEEDAETFYDNSTTPYCNDLMKNLESSPLSRIIWKALKPLLVGKILYTPDTPATRQVMAEVNKTFQELAVFHDLEGMWEELSPKIWTFMENSQEMDLVRMLLDSRDNDHFWEQQLDGLDWTAQDIVAFLAKHPEDVQSSNGSVYTWREAFNETNQAIRTISRFMECVNLNKLEPIATEVWLINKSMELLDERKFWAGIVFTGITPGSIELPHHVKYKIRMDIDNVERTNKIKDGYWDPGPRADPFEDMRYVWGGFAYLQDVVEQAIIRVLTGTEKKTGVYMQQMPYPCYVDDIFLRVMSRSMPLFMTLAWIYSVAVIIKGIVYEKEARLKETMRIMGLDNSILWFSWFISSLIPLLVSAGLLVVILKLGNLLPYSDPSVVFVFLSVFAVVTILQCFLISTLFSRANLAAACGGIIYFTLYLPYVLCVAWQDYVGFTLKIFASLLSPVAFGFGCEYFALFEEQGIGVQWDNLFESPVEEDGFNLTTSVSMMLFDTFLYGVMTWYIEAVFPGQYGIPRPWYFPCTKSYWFGEESDEKSHPGSNQKRISEICMEEEPTHLKLGVSIQNLVKVYRDGMKVAVDGLALNFYEGQITSFLGHNGAGKTTTMSILTGLFPPTSGTAYILGKDIRSEMSTIRQNLGVCPQHNVLFDMLTVEEHIWFYARLKGLSEKHVKAEMEQMALDVGLPSSKLKSKTSQLSGGMQRKLSVALAFVGGSKVVILDEPTAGVDPYSRRGIWELLLKYRQGRTIILSTHHMDEADVLGDRIAIISHGKLCCVGSSLFLKNQLGTGYYLTLVKKDVESSLSSCRNSSSTVSYLKKEDSVSQSSSDAGLGSDHESDTLTIDVSAISNLIRKHVSEARLVEDIGHELTYVLPYEAAKEGAFVELFHEIDDRLSDLGISSYGISETTLEEIFLKVAEESGVDAETSDGTLPARRNRRAFGDKQSCLRPFTEDDAADPNDSDIDPESRETDLLSGMDGKGSYQVKGWKLTQQQFVALLWKRLLIARRSRKGFFAQIVLPAVFVCIALVFSLIVPPFGKYPSLELQPWMYNEQYTFVSNDAPEDTGTLELLNALTKDPGFGTRCMEGNPIPDTPCQAGEEEWTTAPVPQTIMDLFQNGNWTMQNPSPACQCSSDKIKKMLPVCPPGAGGLPPPQRKQNTADILQDLTGRNISDYLVKTYVQIIAKSLKNKIWVNEFRYGGFSLGVSNTQALPPSQEVNDAIKQMKKHLKLAKDSSADRFLNSLGRFMTGLDTKNNVKVWFNNKGWHAISSFLNVINNAILRANLQKGENPSHYGITAFNHPLNLTKQQLSEVALMTTSVDVLVSICVIFAMSFVPASFVVFLIQERVSKAKHLQFISGVKPVIYWLSNFVWDMCNYVVPATLVIIIFICFQQKSYVSSTNLPVLALLLLLYGWSITPLMYPASFVFKIPSTAYVVLTSVNLFIGINGSVATFVLELFTDNKLNNINDILKSVFLIFPHFCLGRGLIDMVKNQAMADALERFGENRFVSPLSWDLVGRNLFAMAVEGVVFFLITVLIQYRFFIRPRPVNAKLSPLNDEDEDVRRERQRILDGGGQNDILEIKELTKIYRRKRKPAVDRICVGIPPGECFGLLGVNGAGKSSTFKMLTGDTTVTRGDAFLNKNSILSNIHEVHQNMGYCPQFD"
        "AITELLTGREHVEFFALLRGVPEKEVGKVGEWAIRKLGLVKYGEKYAGNYSGGNKRKLSTAMALIGGPPVVFLDEPTTGMDPKARRFLWNCALSVVKEGRSVVLTSHSMEECEALCTRMAIMVNGRFRCLGSVQHLKNRFGDGYTIVVRIAGSNPDLKPVQDFFGLAFPGSVLKEKHRNMLQYQLPSSLSSLARIFSILSQSKKRLHIEDYSVSQTTLDQVFVNFAKDQSDDDHLKDLSLHKNQTVVDVAVLTSFLQDEKVKESYV"
    )
    b = (
        "MACWPQLRLLLWKNLTFRRRQTCQLLLEVAWPLFIFLILISVRLSYPPYEQHECHFPNKAMPSAGTLPWVQGIICNANNPCFRYPTPGEAPGVVGNFNKSIVARLFSDARRLLLYSQKDTSMKDMRKVLRTLQQIKKSSSNLKLQDFLVDNETFSGFLYHNLSLPKSTVDKMLRADVILHKVFLQGYQLHLTSLCNGSKSEEMIQLGDQEVSELCGLPREKLAAAERVLRSNMDILKPILRTLNSTSPFPSKELAEATKTLLHSLGTLAQELFSMRSWSDMRQEVMFLTNVNSSSSSTQIYQAVSRIVCGHPEGGGLKIKSLNWYEDNNYKALFGGNGTEEDAETFYDNSTTPYCNDLMKNLESSPLSRIIWKALKPLLVGKILYTPDTPATRQVMAEVNKTFQELAVFHDLEGMWEELSPKIWTFMENSQEMDLVRMLLDSRDNDHFWEQQLDGLDWTAQDIVAFLAKHPEDVQSSNGSVYTWREAFNETNQAIRTISRFMECVNLNKLEPIATEVWLINKSMELLDERKFWAGIVFTGITPGSIELPHHVKYKIRMDIDNVERTNKIKDGYWDPGPRADPFEDMRYVWGGFAYLQDVVEQAIIRVLTGTEKKTGVYMQQMPYPCYVDDIFLRVMSRSMPLFMTLAWIYSVAVIIKGIVYEKEARLKETMRIMGLDNSILWFSWFISSLIPLLVSAGLLVVILKLGNLLPYSDPSVVFVFLSVFAVVTILQCFLISTLFSRANLAAACGGIIYFTLYLPYVLCVAWQDYVGFTLKIFASLLSPVAFGFGCEYFALFEEQGIGVQWDNLFESPVEEDGFNLTTSVSMMLFDTFLYGVMTWYIEAVFPGQYGIPRPWYFPCTKSYWFGEESDEKSHPGSNQKRISEICMEEEPTHLKLGVSIQNLVKVYRDGMKVAVDGLALNFYEGQITSFLGHNGAGKTTTMSILTGLFPPTSGTAYILGKDIRSEMSTIRQNLGVCPQHNVLFDMLTVEEHIWFYARLKGLSEKHVKAEMEQMALDVGLPSSKLKSKTSQLSGGMQRKLSVALAFVGGSKVVILDEPTAGVDPYSRRGIWELLLKYRQGRTIILSTHHMDEADVLGDRIAIISHGKLCCVGSSLFLKNQLGTGYYLTLVKKDVESSLSSCRNSSSTVSYLKKEDSVSQSSSDAGLGSDHESDTLTIDVSAISNLIRKHVSEARLVEDIGHELTYVLPYEAAKEGAFVELFHEIDDRLSDLGISSYGISETTLEEIFLKVAEESGVDAETSDGTLPARRNRRAFGDKQSCLRPFTEDDAADPNDSDIDPESRETDLLSGMDGKGSYQVKGWKLTQQQFVALLWKRLLIARRSRKGFFAQIVLPAVFVCIALVFSLIVPPFGKYPSLELQPWMYNEQYTFVSNDAPEDTGTLELLNALTKDPGFGTRCMEGNPIPDTPCQAGEEEWTTAPVPQTIMDLFQNGNWTMQNPSPACQCSSDKIKKMLPVCPPGAGGLPPPQRKQNTADILQDLTGRNISDYLVKTYVQIIAKSLKNKIWVNEFRYGGFSLGVSNTQALPPSQEVNDAIKQMKKHLKLAKDSSADRFLNSLGRFMTGLDTKNNVKVWFNNKGWHAISSFLNVINNAILRANLQKGENPSHYGITAFNHPLNLTKQQLSEVALMTTSVDVLVSICVIFAMSFVPASFVVFLIQERVSKAKHLQFISGVKPVIYWLSNFVWDMCNYVVPATLVIIIFICFQQKSYVSSTNLPVLALLLLLYGWSITPLMYPASFVFKIPSTAYVVLTSVNLFIGINGSVATFVLELFTDNKLNNINDILKSVFLIFPHFCLGRGLIDMVKNQAMADALERFGENRFVSPLSWDLVGRNLFAMAVEGVVFFLITVLIQYRFFIRPRPVNAKLSPLNDEDEDVRRERQRILDGGGQNDILEIKELTKIYRRKRKPAVDRICVGIPPGECFGLLGVNGAGKSSTFKMLTGDTTVTRGDAFLNKNSILSNIHEVHQNMGYCPQFD"
        "AITELLTGREHVEFFALLRGVPEKEVGKVGEWAIRKLGLVKYGEKYAGNYSGGNKRKLSTAMALIGGPPVVFLDEPTTGMDPKARRFLWNCALSVVKEGRSVVLTSHSMEECEALCTRMAIMVNGRFRCLGSVQHLKNRFGDGYTIVVRIAGSNPDLKPVQDFFGLAFPGSVLKEKHRNMLQYQLPSSLSSLARIFSILSQSKKRLHIEDYSVSQTTLDQVFVNFAKDQSDDDHLKDLSLHKNQTVVDVAVLTSFLQDEKVKESYV"
    )
    c = {'A': {'A': 1, 'C': 1}, 'C': {'A': 1, 'C': 1}}
    dd={"match":3,"mismatch":0,"gap":-1}
    nw_instance = nw_wrapper.NW(a, b, c, dd["match"], dd["mismatch"], dd["gap"])
    score = nw_instance.get_score()
    print(f"Alignment Score: {score/max(len(a),len(b))}")
    # Matrix using the amino acid class overlap as substitution dictionary.
    clas,_=read_aminoacidos(afile)
    clases=get_clases(clas)
    generate_nwmod(data,clases,dd,sal)
    # Matrix using the premade BLOSUM62 substitution matrix.
    matrix = bl.BLOSUM(62)
    dd={"match":1,"mismatch":-4,"gap":0}
    sal="blosum62"
    generate_nwmodpremade(data,matrix,dd,sal)
"""
output=data.to_list()
output=remplazar_sequence_for_ID(data,inputFile,"nombres_sust.txt")
similarity_matrix=pd.read_csv('resultados/matrizLevenshtein.csv',header=None,index_col=False)
#similarity_matrix=similarity_matrix/2
#similarity_matrix=similarity_matrix.abs()
#similarity_matrix.to_numpy()
sim_mat_40=similarity_matrix.copy()
sim_mat_20=similarity_matrix.copy()
#sim_mat_10=similarity_matrix.copy()
data_40=pd.read_csv('resultados/Metrica_Coincidencia.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
#data_40=data_40.drop([0])
#data_20=pd.read_csv('resultados/Metrica_Coincidencia_20.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
#data_20=data_20.drop([0])
#data_10=pd.read_csv('resultados/Metrica_Coincidencia_10.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
#data_10=data_10.drop([0])
#new_sim=np.copy(similarity_matrix)
#print(output)
#new_sim_mean=np.copy(similarity_matrix)
for i,ks in data_40.iterrows():
sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])*0.3/70
sim_mat_20[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])
#for i,kks in data_20.iterrows():
# sim_mat_20[output.index(kks['proteina1'])][output.index(kks['proteina2'])]+=float(kks['%Coincidencia'])*0.3
#for i,ksk in data_10.iterrows():
# sim_mat_10[output.index(ksk['proteina1'])][output.index(ksk['proteina2'])]+=float(ksk['%Coincidencia'])*0.3
dfx=pd.DataFrame(sim_mat_20)
dfx=df/2
dfx=df-1
dfx.abs()
dfx.to_csv("resultados/matrizLevenshteinFS_Mean.csv",header=False,index=False)
dfx=pd.DataFrame(sim_mat_40)
dfx=dfx*0.7
dfx=dfx-1
dfx.abs()
dfx.to_csv("resultados/matrizLevenshteinFS_70.csv",header=False,index=False)
similarity_matrix=pd.read_csv('resultados/matrizNeedleWunch.csv',header=None,index_col=False)-1
similarity_matrix=similarity_matrix/2
similarity_matrix=similarity_matrix.abs()
#similarity_matrix.to_numpy()
sim_mat_40=similarity_matrix.copy()
sim_mat_20=similarity_matrix.copy()
#sim_mat_10=similarity_matrix.copy()
data_40=pd.read_csv('resultados/Metrica_Coincidencia.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
#data_40=data_40.drop([0])
#data_20=pd.read_csv('resultados/Metrica_Coincidencia_20.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
#data_20=data_20.drop([0])
#data_10=pd.read_csv('resultados/Metrica_Coincidencia_10.csv',names=['proteina1','proteina2','%Coincidencia'],index_col=False)
#data_10=data_10.drop([0])
#new_sim=np.copy(similarity_matrix)
#print(output)
#new_sim_mean=np.copy(similarity_matrix)
indexes=[]
for i,ks in data_40.iterrows():
indexes.append(output.index(ks['proteina1']),output.index(ks['proteina2']))
sim_mat_40[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])*0.3/70
sim_mat_20[output.index(ks['proteina1'])][output.index(ks['proteina2'])]+=float(ks['%Coincidencia'])/100
print(indexes)
#for i,kks in data_20.iterrows():
# sim_mat_20[output.index(kks['proteina1'])][output.index(kks['proteina2'])]+=float(kks['%Coincidencia'])*0.3
#for i,ksk in data_10.iterrows():
# sim_mat_10[output.index(ksk['proteina1'])][output.index(ksk['proteina2'])]+=float(ksk['%Coincidencia'])*0.3
dfx=pd.DataFrame(sim_mat_20)
dfx=df/2
dfx=df-1
dfx.abs()
dfx.to_csv("resultados/matrizNeedleWunchFS_Mean.csv",header=False,index=False)
dfx=pd.DataFrame(sim_mat_40)
dfx=dfx*0.7
dfx=dfx-1
dfx.abs()
"""
# dfx.to_csv("resultados/mmatrizNeedleWunchFS_70.csv",header=False,index=False)
"""
dfx=pd.DataFrame(sim_mat_10)
dfx=df/1.3
dfx=df-1
dfx.abs()
dfx.to_csv("resultados/matrizLevenshteinFS_10.csv",header=False,index=False)
s1 = pd.merge(data_40, data_20, how='inner', on=['proteina1','proteina2'])
s2= pd.merge(s1,data_10, how='inner', on=['proteina1','proteina2'])
ss=s1[(~(s1['proteina1'].isin(s2['proteina1']))& ~(s1['proteina2'].isin(s2['proteina2'])))]
s3 = pd.merge(data_20, data_10, how='inner', on=['proteina1','proteina2'])
print(s3['proteina2'].isin(s2['proteina2']))
s4=s3[~(s3['proteina1'].isin(s2['proteina1']))&~(s3['proteina2'].isin(s2['proteina2']))]
s5 = pd.merge(data_40, data_10, how='inner', on=['proteina1','proteina2'])
s6=s5.loc[~(s5['proteina1'].isin(s2['proteina1']))&(s5['proteina2'].isin(s2['proteina2']))]
data_401=data_40[~(data_40['proteina1'].isin(data_20['proteina1']))& ~(data_40['proteina2'].isin(data_20['proteina2']))]
data_402=data_40[~(data_40['proteina1'].isin(data_10['proteina1']))& ~(data_40['proteina2'].isin(data_10['proteina2']))]
data_40X=data_402[~(data_402['proteina1'].isin(data_20['proteina1']))& ~(data_402['proteina2'].isin(data_20['proteina2']))]
data_201=data_20[~(data_20['proteina1'].isin(data_40['proteina1']))&(data_20['proteina2'].isin(data_40['proteina2']))]
data_202=data_20[~(data_20['proteina1'].isin(data_10['proteina1']))&(data_20['proteina2'].isin(data_10['proteina2']))]
data_20X=data_202[~(data_202['proteina1'].isin(data_40['proteina1']))&(data_202['proteina2'].isin(data_40['proteina2']))]
data_101=data_10[~(data_10['proteina1'].isin(data_40['proteina1']))&(data_10['proteina2'].isin(data_40['proteina2']))]
data_102=data_10[~(data_10['proteina1'].isin(data_20['proteina1']))&(data_10['proteina2'].isin(data_20['proteina2']))]
data_10X=data_102[~(data_102['proteina1'].isin(data_40['proteina1']))&(data_102['proteina2'].isin(data_40['proteina2']))]
#print(s3)
print(data_40X)
print(data_20X)
print(data_10X)
#print(data_402)
for i in range(0,similarity_matrix.shape[0]):
for j in range(0,similarity_matrix.shape[1]):
cross=0
cross_over=0
dd_10_check=False
dd_20_check=False
dd_40_check=False
if ((data_40['proteina1']==output[i]) & (data_40['proteina2']==output[j])).any() or ((data_40['proteina1']==output[j]) & (data_40['proteina2']==output[i])).any():
dd_40_check=True
if ((data_40['proteina1']==output[i]) & (data_40['proteina2']==output[j])).any():
dd_40=float(data_40[(data_40['proteina1']==output[i]) & (data_40['proteina2']==output[j])]['%Coincidencia'].to_list()[0])/100
else:
dd_40=float(data_40[(data_40['proteina1']==output[j]) & (data_40['proteina2']==output[i])]['%Coincidencia'].to_list()[0])/100
if ((data_20['proteina1']==output[i]) & (data_20['proteina2']==output[j])).any() or ((data_20['proteina1']==output[j]) & (data_20['proteina2']==output[i])).any():
dd_20_check=True
if ((data_20['proteina1']==output[i]) & (data_20['proteina2']==output[j])).any():
dd_20=float(data_20[(data_20['proteina1']==output[i]) & (data_20['proteina2']==output[j])]['%Coincidencia'].to_list()[0])/100
else:
dd_20=float(data_20[(data_20['proteina1']==output[j]) & (data_20['proteina2']==output[i])]['%Coincidencia'].to_list()[0])/100
if ((data_10['proteina1']==output[i]) & (data_10['proteina2']==output[j])).any() or ((data_10['proteina1']==output[j]) & (data_10['proteina2']==output[i])).any():
dd_10_check=True
if ((data_10['proteina1']==output[i]) & (data_10['proteina2']==output[j])).any():
dd_10=float(data_10[(data_10['proteina1']==output[i]) & (data_10['proteina2']==output[j])]['%Coincidencia'].to_list()[0])/100
else:
dd_10=float(data_10[(data_10['proteina1']==output[j]) & (data_10['proteina2']==output[i])]['%Coincidencia'].to_list()[0])/100
if dd_10_check and dd_40_check and dd_20_check:
#print(dd_40)
#print(dd_20)
#print(dd_10)
cross=(dd_40-dd_20)-(dd_20-dd_10)
cross_over=(dd_40+dd_20+dd_10)/len([dd_20,dd_10,dd_40])
elif dd_20_check and dd_40_check:
cross=(dd_40-dd_20)
cross_over=(dd_40+dd_20)/len([dd_20,dd_40])
elif dd_10_check and dd_20_check:
cross=(dd_20-dd_10)
cross_over=(dd_20+dd_10)/len([dd_20,dd_10])
elif dd_40_check and dd_10_check:
cross=(dd_40-dd_10)
cross_over=(dd_40+dd_10)/len([dd_10,dd_40])
elif dd_40_check:
cross=-dd_40
cross_over=dd_40
elif dd_20_check:
cross=-dd_20
cross_over=dd_20
elif dd_10_check:
cross=-dd_10
cross_over=dd_10
if(cross!=0):
print(cross)
if(cross==0):
cross=-1
new_sim[i][j]+=0.3*cross
new_sim_mean[i][j]+=0.3*cross_over
df=pd.DataFrame(new_sim)
df=df-1
df.abs()
df=df/1.3
df.to_csv("resultados/matrizLevenshteinF.csv",header=False,index=False)
df2=pd.DataFrame(new_sim_mean)
df2=df/1.3
df2=df-1
df2.abs()
"""
#df2.to_csv("resultados/matrizLevenshteinFMean.csv",header=False,index=False)
import time
import pandas as pd
import Levenshtein
import csv
import ast
import numpy as np
from descarteProteinas import remplazar_ID_for_sequence as rp
from descarteProteinas import substitute_or_remove_prot_id
from generate_the_excel import substitute_or_remove_prot_id
def similitudProteinas(sequences):
    """
    Score every ordered pair of different sequences with Needleman-Wunsch.
    Parameters:
    - sequences: Iterable of protein sequences (it is iterated twice, so it must be re-iterable)
    Returns:
    - List of [sequence1, sequence2, value] rows, where value is
      abs(score - 1) / (2 * length of the longer sequence) * 100.
      Pairs of equal sequences are skipped.
    """
    # This module does not import minineedle at the top, so bring it in here.
    from minineedle import needle

    output = []
    for row1 in sequences:
        for row2 in sequences:
            if row1 == row2:
                continue
            score = needle.NeedlemanWunsch(row1, row2).get_score()
            similarity = abs(score-1) / (2*max(len(row1), len(row2)))
            output.append([row1, row2, similarity*100])
    return output
def metrica_distanciaProteinas():
    """
    Report the stored similarity of every pair of different rows sharing a pattern.

    Reads "resultados/patronesIdenticos.csv" (columns Patron, Proteina) and
    "AllProteins_%Similitud.csv" (columns Proteina1, Proteina2, Similaridad),
    and writes 'resultados/Metrica_distanciaProteinasMismoPatron.csv' with the
    columns Patron, Proteina1, Proteina2, Similitud. Pairs without a stored
    similarity get an empty Similitud.
    """
    data = pd.read_csv("resultados/patronesIdenticos.csv")
    df_b = pd.read_csv("AllProteins_%Similitud.csv")
    # Lookup table: (Proteina1, Proteina2) -> Similaridad
    similarity_by_pair = {}
    for first, second, value in zip(df_b['Proteina1'], df_b['Proteina2'], df_b['Similaridad']):
        similarity_by_pair[(first, second)] = value
    rows = []
    # Walk the rows pattern by pattern and pair each row with every other row
    # of the same pattern (both orders are produced).
    for _, grupo in data.groupby('Patron'):
        for i in grupo.index:
            for j in grupo.index:
                # Skip a row paired with itself and pairs of identical rows.
                if i == j or data.loc[i].equals(data.loc[j]):
                    continue
                proteina_i = data.loc[i, 'Proteina']
                proteina_j = data.loc[j, 'Proteina']
                rows.append([data.loc[i, 'Patron'], proteina_i, proteina_j,
                             similarity_by_pair.get((proteina_i, proteina_j), '')])
    # Save the collected rows as a CSV file.
    report = pd.DataFrame(rows, columns=['Patron', 'Proteina1', 'Proteina2', 'Similitud'])
    report.to_csv('resultados/Metrica_distanciaProteinasMismoPatron.csv', index=False)
def _append_csv_row(path, row, columns, first):
    """Write one row to path: create the file with a header when first is True, append otherwise."""
    pd.DataFrame([row], columns=columns).to_csv(
        path, index=False, header=first, mode='w' if first else 'a')


def patronesComun(patronesComun,archivoEntrada,ocurrencia,sal,archivoClases):
    """
    Find the protein pairs sharing at least `patronesComun` patterns and write two CSV reports.

    Input: "resultados/patronesIdenticos<NN><sal>.csv" with the columns
    Proteina, Patron and Posiciones, where <NN> is int((float(ocurrencia) % 1) * 100)
    (truncated, not rounded).
    Output, one row per qualifying pair, appended as soon as it is computed:
    - 'resultados/Metrica_patronesComunes<NN><sal>.csv': shared patterns grouped
      by length (longest first), both protein ids and their classes
    - 'resultados/Metrica_Coincidencia<NN><sal>.csv': both protein ids, the mean
      percentage of each sequence covered by the shared patterns, and their classes
    Parameters:
    - patronesComun: Minimum number of shared patterns a pair needs
    - archivoEntrada: Excel file with "protein_id" and "protein_sequence" columns
    - ocurrencia: Occurrence value used to build the file names
    - sal: Extension added to the input and output file names
    - archivoClases: Excel file with "protein_id" and "class_name" columns
    Returns:
    - None
    """
    from itertools import combinations

    suffix = str(int((float(ocurrencia)%1)*100))+sal

    # protein id -> list of its class names
    class_dict = {
        protein_id: group['class_name'].tolist()
        for protein_id, group in pd.read_excel(archivoClases).groupby('protein_id')
    }

    # Patterns of each protein and, per protein and pattern, every sequence
    # position covered by an occurrence of that pattern.
    patrones_por_proteina = {}
    posiciones_patron = {}
    with open("resultados/patronesIdenticos"+suffix+".csv", 'r') as file:
        for registro in csv.DictReader(file):
            proteina = registro['Proteina']
            patron = registro['Patron']
            patrones_por_proteina.setdefault(proteina, set()).add(patron)
            starts = [oo for oo in ast.literal_eval(registro['Posiciones']) if oo != '[' and oo != ']']
            posiciones_patron.setdefault(proteina, {})[patron] = [
                offset + int(start) for start in starts for offset in range(len(patron))
            ]

    df_p = pd.read_excel(archivoEntrada)
    proteinas_dict2 = dict(df_p[['protein_id','protein_sequence']].values)

    # Each unordered pair of proteins is evaluated once.
    proteinas_comunes = {}
    for (proteina1, patrones1), (proteina2, patrones2) in combinations(patrones_por_proteina.items(), 2):
        patrones_comunes = patrones1 & patrones2
        if len(patrones_comunes) >= patronesComun:
            proteinas_comunes[(proteina1, proteina2)] = patrones_comunes

    patrones_path = 'resultados/Metrica_patronesComunes'+suffix+'.csv'
    coincidencia_path = 'resultados/Metrica_Coincidencia'+suffix+'.csv'
    first = True
    for (proteina1, proteina2), patrones_comunes in proteinas_comunes.items():
        pattern_lengths = {}
        covered = {proteina1: set(), proteina2: set()}
        for pattern in patrones_comunes:
            pattern_lengths.setdefault(f'Longitud {len(pattern)}', []).append([pattern])
            for proteina in (proteina1, proteina2):
                covered[proteina].update(posiciones_patron[proteina][pattern])
        # Keys look like 'Longitud <n>'; sort by <n>, longest patterns first.
        sorted_pattern_lengths = dict(sorted(pattern_lengths.items(), key=lambda x: int(x[0][9:]), reverse=True))
        prot = [proteinas_dict2[proteina1], proteinas_dict2[proteina2]]
        # Nothing to report without shared patterns or with an empty sequence.
        if not patrones_comunes or not (len(prot[0])>0 and len(prot[1])>0):
            continue
        classes1 = class_dict.get(proteina1, "N/A")
        classes2 = class_dict.get(proteina2, "N/A")
        _append_csv_row(
            patrones_path,
            [sorted_pattern_lengths, proteina1, proteina2, classes1, classes2],
            ['Patrones', 'Proteina1', 'Proteina2',"classesProt1","classesProt2"],
            first)
        coverage = np.mean([len(covered[proteina1])/len(prot[0]),len(covered[proteina2])/len(prot[1])])*100
        _append_csv_row(
            coincidencia_path,
            [proteina1, proteina2, coverage, classes1, classes2],
            ['proteina1','proteina2','%Coincidencia',"classesProt1","classesProt2"],
            first)
        first = False
def patronesComunClas(patronesComun,name,archivoEntrada,ocurrencia,sal,archivoClases):
    """
    Reports, for one protein class, the patterns shared by each pair of proteins
    and how much of both sequences those shared patterns cover.

    Parameters:
    - patronesComun: int, minimum number of shared patterns a pair needs to be reported.
    - name: str, class folder; files are read from and written to 'clases/<name>/'.
    - archivoEntrada: str, path to the Excel file with 'protein_id' and 'protein_sequence'.
    - ocurrencia: float or str, occurrence parameter, only used to build the file suffix.
    - sal: str, extra suffix appended to the input and output file names.
    - archivoClases: str, path to the Excel file with 'protein_id' and 'class_name'.

    Side effects (rows are appended pair by pair so memory stays bounded):
    - 'clases/<name>/Metrica_patronesComunes<suffix>.csv': shared patterns grouped by length.
    - 'clases/<name>/Metrica_Coincidencia<suffix>.csv': mean percentage of the two
      sequences covered by the shared patterns.
    No file is written when no pair qualifies.

    Raises:
    - KeyError: a protein of the patterns file is missing from archivoEntrada.
    """
    from itertools import combinations

    # The truncating int() is kept on purpose: every producer/consumer of these
    # files builds the suffix the same way, so the names must keep matching.
    suffix = str(int((float(ocurrencia)%1)*100))+sal
    carpeta = 'clases/'+name+'/'

    # protein_id -> list of class names
    class_dict = {
        protein_id: [fila['class_name'] for _, fila in grupo.iterrows()]
        for protein_id, grupo in pd.read_excel(archivoClases).groupby('protein_id')
    }

    with open("clases/"+name+"/patronesIdenticos"+suffix+".csv", 'r') as file:
        registros = list(csv.DictReader(file))

    # patrones_por_proteina: protein -> set of patterns it contains
    # posiciones_patron: protein -> pattern -> every residue index the pattern covers
    patrones_por_proteina = {}
    posiciones_patron = {}
    for registro in registros:
        proteina = registro['Proteina']
        patron = registro['Patron']
        patrones_por_proteina.setdefault(proteina, set()).add(patron)
        # 'Posiciones' holds the repr of a list of start offsets; stray bracket
        # tokens are compared by value (the original identity test never matched reliably).
        inicios = [p for p in ast.literal_eval(registro['Posiciones']) if p not in ('[', ']')]
        posiciones_patron.setdefault(proteina, {})[patron] = [
            int(inicio)+desplazamiento
            for inicio in inicios
            for desplazamiento in range(len(patron))
        ]

    proteinas_dict2 = dict(pd.read_excel(archivoEntrada)[['protein_id','protein_sequence']].values)

    columnas_patrones = ['Patrones', 'Proteina1', 'Proteina2',"classesProt1","classesProt2"]
    columnas_coincidencia = ['proteina1','proteina2','%Coincidencia',"classesProt1","classesProt2"]
    ruta_patrones = carpeta+'Metrica_patronesComunes'+suffix+'.csv'
    ruta_coincidencia = carpeta+'Metrica_Coincidencia'+suffix+'.csv'

    first = True
    # combinations() visits each unordered pair exactly once, in file order,
    # which is what the nested loop plus "already processed" set achieved.
    for (proteina1, patrones1), (proteina2, patrones2) in combinations(patrones_por_proteina.items(), 2):
        patrones_comunes = patrones1 & patrones2
        if len(patrones_comunes) < patronesComun:
            continue

        prot = [proteinas_dict2[proteina1], proteinas_dict2[proteina2]]
        # Nothing to report without shared patterns or with an empty sequence.
        if not patrones_comunes or len(prot[0]) == 0 or len(prot[1]) == 0:
            continue

        por_longitud = {}
        cubiertas1 = set()
        cubiertas2 = set()
        for pattern in patrones_comunes:
            por_longitud.setdefault(f'Longitud {len(pattern)}', []).append([pattern])
            # Sets absorb overlaps between patterns, so each residue counts once.
            cubiertas1.update(posiciones_patron[proteina1][pattern])
            cubiertas2.update(posiciones_patron[proteina2][pattern])
        # Longest patterns first; the numeric length follows the 9-char 'Longitud ' prefix.
        sorted_pattern_lengths = dict(sorted(por_longitud.items(), key=lambda x: int(x[0][9:]), reverse=True))

        clases1 = class_dict.get(proteina1, "N/A")
        clases2 = class_dict.get(proteina2, "N/A")
        coincidencia = np.mean([len(cubiertas1)/len(prot[0]), len(cubiertas2)/len(prot[1])])*100

        modo = 'w' if first else 'a'
        pd.DataFrame(
            [[sorted_pattern_lengths, proteina1, proteina2, clases1, clases2]],
            columns=columnas_patrones,
        ).to_csv(ruta_patrones, index=False, header=first, mode=modo)
        pd.DataFrame(
            [[proteina1, proteina2, coincidencia, clases1, clases2]],
            columns=columnas_coincidencia,
        ).to_csv(ruta_coincidencia, index=False, header=first, mode=modo)
        first = False
def remplazar_sequence_for_ID(output,archivoEntrada):
    """
    Swaps the two sequences of every similarity row for their protein IDs and
    saves the table to 'AllProteins_%Similitud.csv'.

    Parameters:
    - output: list of [sequence1, sequence2, similarity] lists; the rows are
      modified in place.
    - archivoEntrada: str, path to the Excel file with 'protein_sequence' and 'protein_id'.

    Rows are ordered by decreasing length of the first sequence, ties broken
    alphabetically. A row is only translated when both sequences are known.
    """
    tabla = pd.read_excel(archivoEntrada)
    filas = list(output)
    filas.sort(key=lambda fila: (-len(fila[0]), fila[0]))
    id_por_secuencia = dict(tabla[['protein_sequence', 'protein_id']].values)
    for fila in filas:
        primera = fila[0]
        segunda = fila[1]
        if primera not in id_por_secuencia or segunda not in id_por_secuencia:
            continue
        fila[0] = id_por_secuencia[primera]
        fila[1] = id_por_secuencia[segunda]
    resultado = pd.DataFrame(filas, columns=['Proteina1', 'Proteina2', 'Similaridad'])
    # Persist the translated table
    resultado.to_csv('AllProteins_%Similitud.csv', index=False)
import pandas as pd
import time
import ast
import csv
import math
from interfazGrafica import interfaz
from descarteProteinas import ejecutar,remplazar_ID_for_sequence
from generate_the_excel import substitute_or_remove_prot_id
import metricas
from graficas import grafica
import os
import json
import ast
import re
from patrones_similares_aa import remplazar_sequence_for_ID as remplazar_s
from patrones_similares_aa import buscar_patrones_simAA
from collections import defaultdict
def readData(archivoEntrada, enfermedad, archivoTarget):
    """
    Loads the protein table from Excel, optionally keeps a single disease,
    and hands back the sequence column with its row count.

    Parameters:
    - archivoEntrada: str, path to the input Excel file.
    - enfermedad: str, disease ID to keep; '' disables the filter.
    - archivoTarget: str, accepted for compatibility but not used.

    Returns:
    - sequences: pandas Series, the 'protein_sequence' column of the kept rows.
    - num_filas: int, number of kept rows.
    """
    tabla = pd.read_excel(archivoEntrada)
    print("Se ha realizado el descarte de proteínas")
    if enfermedad != '':
        es_de_la_enfermedad = tabla["disease_id"] == enfermedad
        tabla = tabla.loc[es_de_la_enfermedad]
    sequences = tabla["protein_sequence"]
    print(sequences)
    return sequences, sequences.shape[0]
def guardar_patrones_len1(sequences, pattern_freqMin):
    """
    Indexes every single-character pattern of the sequences, keeps those present
    in enough proteins, and dumps the kept ones to 'prueba2.csv'.

    Parameters:
    - sequences: iterable of str, protein sequences.
    - pattern_freqMin: dict, filled in place with {character: {sequence: [positions]}}.

    Returns:
    - pattern_freqMin: dict, the same dictionary that was passed in.
    - posicionPatterns: dict, {character: [positions]} of the LAST sequence
      processed (empty when there are no sequences).
    - longitud_max: int, length of the longest sequence (0 when there are none).

    Note: the threshold is read from the module-level `min_ocurrence`.
    """
    longitud_max = 0
    # Bound up front so that an empty input returns {} instead of raising.
    posicionPatterns = {}
    all_patterns = {}
    for protein in sequences:
        longitud_max = max(longitud_max, len(protein))
        posicionPatterns = {}
        for index, letter in enumerate(protein):
            # Append in place; rebuilding the list per residue was quadratic.
            posicionPatterns.setdefault(letter, []).append(index)
        all_patterns[protein] = posicionPatterns

    # character -> {sequence: positions}; positions are copied so the returned
    # per-sequence index never aliases the lists stored in pattern_freqMin.
    pattern_proteins = {}
    for protein, patterns in all_patterns.items():
        for pattern, positions in patterns.items():
            pattern_proteins.setdefault(pattern, {})[protein] = list(positions)

    for pattern, proteins in pattern_proteins.items():
        if len(proteins) >= min_ocurrence:
            pattern_freqMin[pattern] = proteins

    # NOTE(review): 'prueba2.csv' looks like a debugging dump; kept because
    # other tooling may read it.
    df = pd.DataFrame(list(pattern_freqMin.items()), columns=['pattern', 'proteins'])
    df.to_csv('prueba2.csv', index=False)
    return pattern_freqMin, posicionPatterns, longitud_max
def buscar_patrones_identicos(sequences):
    """
    Grows identical patterns one residue at a time and keeps those that appear
    in at least `min_ocurrence` different proteins.

    Parameters:
    - sequences: pandas Series (or iterable) of str, protein sequences.

    Returns:
    - pattern_freqMin: dict, {pattern: {sequence: [start positions]}}.
    - num_patrones: int, number of patterns kept.

    Notes:
    - The threshold is read from the module-level `min_ocurrence`.
    - When a pattern longer than 2 is accepted, its prefix and suffix of length
      len-1 are dropped; single characters are never dropped.
    - The current pattern length is printed once per round as progress output.
    """
    pattern_freqMin, _, longitud_max = guardar_patrones_len1(sequences, {})
    if pattern_freqMin:
        for pattern_length in range(2, longitud_max + 1):
            # Candidate patterns of this length: pattern -> {sequence: [starts]}
            candidatos = {}
            for pattern, proteins in pattern_freqMin.items():
                if len(pattern) != pattern_length - 1:
                    continue
                for prot, positions in proteins.items():
                    ultimo_inicio = len(prot) - pattern_length
                    for position in positions:
                        # The extended pattern would run past the end of the sequence.
                        if position > ultimo_inicio:
                            continue
                        sub_seq = prot[position:position + pattern_length]
                        if sub_seq in pattern_freqMin:
                            continue
                        # A pattern can only be frequent if its new last residue is.
                        # The single-character index lists every position of every
                        # sequence, so no per-position lookup is needed here.
                        if sub_seq[-1] not in pattern_freqMin:
                            continue
                        candidatos.setdefault(sub_seq, {}).setdefault(prot, []).append(position)
            print(pattern_length)

            candidatos = {
                patron: proteinas
                for patron, proteinas in candidatos.items()
                if len(proteinas) >= min_ocurrence
            }
            # No pattern of this length survives, so no longer one can exist.
            if not candidatos:
                break

            for pattern, proteins in candidatos.items():
                # Candidates are always new keys, so the positions are stored as-is.
                pattern_freqMin[pattern] = {prot: list(pos) for prot, pos in proteins.items()}
                if len(pattern) > 2:
                    pattern_freqMin.pop(pattern[:-1], None)
                    pattern_freqMin.pop(pattern[1:], None)

    return pattern_freqMin, len(pattern_freqMin)
def remplazar_sequence_for_ID(pattern_freqMin,archivoEntrada,ocurrencia,Sal,archivoClases=None):
    """
    Flattens the pattern dictionary into rows, replaces each sequence with its
    protein ID, adds the protein classes and saves the table as CSV.

    Parameters:
    - pattern_freqMin: dict, {pattern: {sequence: [start positions]}}.
    - archivoEntrada: str, path to the Excel file with 'protein_sequence' and 'protein_id'.
    - ocurrencia: float or str, occurrence parameter, only used in the file name.
    - Sal: str, extra suffix for the output file name.
    - archivoClases (Optional): str, path to the Excel file with 'protein_id'
      and 'class_name'. When omitted every protein gets "N/A" as class.

    Output: 'resultados/patronesIdenticos<suffix>.csv' with the columns
    Patron, Proteina, Posiciones, classesProt. Single-character patterns are
    left out; rows go from longest to shortest pattern, ties alphabetically.
    """
    df_b = pd.read_excel(archivoEntrada)

    # Always bound, so the documented default archivoClases=None works.
    class_dict = {}
    if archivoClases is not None:
        class_dict = {
            protein_id: [fila['class_name'] for _, fila in grupo.iterrows()]
            for protein_id, grupo in pd.read_excel(archivoClases).groupby('protein_id')
        }

    output = [
        [patron, secuencia, posiciones]
        for patron, por_proteina in pattern_freqMin.items()
        for secuencia, posiciones in por_proteina.items()
        if len(patron) != 1
    ]
    # Stable sort: longest pattern first, equal lengths alphabetically.
    output.sort(key=lambda fila: (-len(fila[0]), fila[0]))

    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    for fila in output:
        # Unknown sequences are kept verbatim in the 'Proteina' column.
        fila[1] = proteinas_dict.get(fila[1], fila[1])
        fila.append(class_dict.get(fila[1], "N/A"))

    df_a = pd.DataFrame(output, columns=['Patron', 'Proteina', 'Posiciones','classesProt'])
    # float() accepts the raw string from the config file, like the other call sites.
    df_a.to_csv('resultados/patronesIdenticos'+str(int((float(ocurrencia)%1)*100))+Sal+'.csv', index=False)
    print("Se ha generado el .csv con los patrones idénticos encontrados")
def calculate_sequence_length(sequences):
    """
    Adds up the lengths of all protein sequences.

    Parameters:
    - sequences: pandas Series (or any iterable) of sized items, protein sequences.

    Returns:
    - seq_len: int, combined length of every sequence (0 when there are none).
    """
    return sum(len(secuencia) for secuencia in sequences)
def group_classes_by_protein(cl):
    """
    Collects the class names of each protein.

    Parameters:
    - cl: pandas DataFrame with the columns 'protein_id' and 'class_name'.

    Returns:
    - class_dict: dict, {protein_id: [class_name, ...]} with the names in row order.
    """
    return {
        protein_id: [fila['class_name'] for _, fila in grupo.iterrows()]
        for protein_id, grupo in cl.groupby('protein_id')
    }
def compute_pattern_ocurrence(df,sal):
    """
    Counts, for every pattern, its occurrences and the number of proteins that
    contain it, and saves the result to a CSV file.

    Parameters:
    - df: pandas DataFrame with the columns 'Patron' and 'Posiciones', one row
      per (pattern, protein); 'Posiciones' is the text of a list of start offsets.
    - sal: str, extra suffix for the output file name.

    Output: 'resultados/patronesOcurrencia<suffix>.csv' with exactly the columns
    Patron, total_Patrones_por_prot, numero_prot (only the header when df is empty).
    The file previously carried a fourth, always-zero column named after the
    first pattern; it was an accident of the row construction and is gone.

    Note: the file-name suffix is read from the module-level `datosInterfaz`.
    """
    filas = []
    for patron, grupo in df.groupby('Patron'):
        total = 0
        numero_prot = 0
        for _, row in grupo.iterrows():
            posiciones = [p for p in ast.literal_eval(row['Posiciones']) if p not in ('[', ']')]
            numero_prot += 1
            if len(posiciones) > 2:
                # NOTE(review): this drops the NEXT occurrence when it does NOT
                # overlap the current one, and only for lists longer than 2.
                # That looks inverted, but the counting rule is kept as it was;
                # confirm the intended definition before changing it.
                u = 0
                while u + 1 < len(posiciones):
                    if posiciones[u] + len(patron) <= posiciones[u + 1]:
                        del posiciones[u + 1]
                    else:
                        u += 1
            total += len(posiciones)
        filas.append([str(patron), total, numero_prot])

    ruta = 'resultados/patronesOcurrencia'+str(int((float(datosInterfaz["OcurrenciaMin"])%1)*100))+sal+'.csv'
    pd.DataFrame(
        filas, columns=['Patron', 'total_Patrones_por_prot', 'numero_prot']
    ).to_csv(ruta, index=False)
def compute_proteinas_ocurrencia(df,sal):
    """
    Computes, for every protein, the fraction of its residues that belong to at
    least one pattern, and saves it together with the protein classes.

    Parameters:
    - df: pandas DataFrame with the columns 'Proteina', 'Patron' and 'Posiciones',
      one row per (pattern, protein); 'Posiciones' is the text of a list of
      start offsets.
    - sal: str, extra suffix for the output file name.

    Output: 'resultados/proteinasOcurrencia<suffix>.csv' with the columns
    proteinas, global_ocurrence, classesProt (only the header when df is empty).

    global_ocurrence = covered residues / sequence length, where every residue
    is counted once however many occurrences cover it. Overlapping occurrences
    of the same pattern used to be counted twice, which could push the value
    above 1.

    Note: reads the module-level `archivoEntrada`, `class_dict` and `datosInterfaz`.

    Raises:
    - KeyError: a protein of df is missing from archivoEntrada.
    """
    grupos = df.groupby('Proteina')
    df_b = pd.read_excel(archivoEntrada)
    proteinas_dict = dict(df_b[['protein_id','protein_sequence']].values)

    filas = []
    for proteina, grupo in grupos:
        seq = proteinas_dict[proteina]
        cubiertas = set()
        for _, row in grupo.iterrows():
            longitud = len(str(row['Patron']))
            for inicio in ast.literal_eval(row['Posiciones']):
                if inicio in ('[', ']'):
                    continue
                # Every residue of this occurrence, from its start offset onwards.
                cubiertas.update(range(int(inicio), int(inicio) + longitud))
        filas.append([proteina, len(cubiertas)/len(seq), class_dict.get(proteina, "N/A")])

    ruta = 'resultados/proteinasOcurrencia'+str(int((float(datosInterfaz["OcurrenciaMin"])%1)*100))+sal+'.csv'
    pd.DataFrame(
        filas, columns=['proteinas', 'global_ocurrence', 'classesProt']
    ).to_csv(ruta, index=False)
if __name__ == "__main__":
    # Pipeline entry point: read the configuration, find the identical patterns
    # and write every derived table under 'resultados/'.
    if not os.path.exists("resultados"):
        # Create it when missing (exist_ok guards against a concurrent creation)
        os.makedirs("resultados", exist_ok=True)
        print("La carpeta resultados se ha creado correctamente.")
    else:
        print("La carpeta resultados ya existe.")

    inicio = time.time()
    # The context manager closes the configuration file right after parsing.
    with open("param_file.conf","r") as jsonfile:
        datosInterfaz=json.load(jsonfile)
    print(datosInterfaz)

    # These names are module-level on purpose: the functions above read
    # datosInterfaz, archivoEntrada, class_dict, min_ocurrence and seq_len.
    archivoEntrada = datosInterfaz["NombreArchivoEntrada"]
    enfermedad = datosInterfaz["CodigoEnfermedad"]
    archivoTarget = datosInterfaz["NombreArchivoTarget"]
    similitud = float(datosInterfaz["Similitud"])
    archivoClases = datosInterfaz["NombreArchivoClases"]
    archivoAA=datosInterfaz["NombreArchivoAA"]
    sal=datosInterfaz["ExtensionSalida"]
    ocurrenciaMin = float(datosInterfaz["OcurrenciaMin"])
    # Same truncating suffix the functions use to name their files.
    sufijo = str(int((ocurrenciaMin%1)*100))+sal

    cl=pd.read_excel(archivoClases)
    class_dict=group_classes_by_protein(cl)

    sequences, num_filas = readData(archivoEntrada, enfermedad, archivoTarget)
    min_ocurrence = math.floor(num_filas * ocurrenciaMin)
    seq_len=calculate_sequence_length(sequences)
    print(min_ocurrence)
    print(sequences)

    pattern_freqMin, num_patrones = buscar_patrones_identicos(sequences)
    remplazar_sequence_for_ID(pattern_freqMin,archivoEntrada,ocurrenciaMin,sal,archivoClases)

    # keep_default_na=False: the pattern "NA" (Asn-Ala) and the class marker
    # "N/A" are real values here, not missing data. With the default parsing
    # they became NaN and the file was then rewritten with those cells blank;
    # the file written above is now left untouched.
    df=pd.read_csv('resultados/patronesIdenticos'+sufijo+'.csv', usecols=['Patron', 'Proteina', 'Posiciones',"classesProt"],index_col=False,keep_default_na=False)

    compute_pattern_ocurrence(df,sal)
    compute_proteinas_ocurrencia(df,sal)
    print("Se han obtenido los resultados de la métrica para la distancia entre dos proteínas que poseen el mismo patrón")

    metrica = math.floor(num_patrones * float(datosInterfaz["Metrica"]))
    metricas.patronesComun(metrica,archivoEntrada,ocurrenciaMin,sal,archivoClases)
    print("Se han obtenido los resultados de la métrica para la distancia entre dos proteínas que poseen mas de un patrón en común")

    fin = time.time()
    tiempo_total = fin - inicio
    print(tiempo_total, "segundos")
import pandas as pd
import Levenshtein
from minineedle import needle, smith, core
from descarteProteinas import substitute_or_remove_prot_id
from ast import literal_eval
import blosum as bl
def readData(archivoEntrada):
    """
    Loads the protein sequences stored in an Excel file.

    Parameters:
    - archivoEntrada: Input Excel file path

    Returns:
    - pandas Series with the contents of the 'protein_sequence' column
    """
    return pd.read_excel(archivoEntrada)["protein_sequence"]
def similitudProteinas(sequences):
    """
    Scores every ordered pair of different sequences with their Levenshtein
    distance, normalized by the longer sequence and expressed as a percentage.

    Parameters:
    - sequences: iterable of protein sequences (iterated twice)

    Returns:
    - List of [protein_sequence_1, protein_sequence_2, score] lists, one per
      ordered pair of unequal sequences.

    NOTE(review): the score is distance / max length * 100, so identical-looking
    sequences score close to 0 and unrelated ones close to 100, i.e. it measures
    dissimilarity despite the function name.
    """
    return [
        [primera, segunda,
         abs(Levenshtein.distance(primera, segunda)) / max(len(primera), len(segunda)) * 100]
        for primera in sequences
        for segunda in sequences
        if primera != segunda
    ]
def remplazar_sequence_for_ID(output,archivoEntrada,archivoEntrada2,Sal,mode="default"):
    """
    Replaces the sequences of every similarity row with protein IDs and saves
    the table to 'AllProteins_%Similitud<Sal>.csv'.

    Parameters:
    - output: list of [sequence1, sequence2, score, score, score, score] lists;
      the rows are modified in place.
    - archivoEntrada: path to the first Excel file with protein information
    - archivoEntrada2: path to the second Excel file with protein information
    - Sal: suffix for the output file name
    - mode: "drug" also appends the drugs of both proteins to each row
      (columns 'drug_id_p1' and 'drug_id_p2'); any other value does not.

    Both files must contain 'protein_sequence' and 'protein_id' (and 'drug_id'
    in drug mode, holding the text of a list). When a sequence appears in both
    files the second file wins. A row is only translated when both of its
    sequences are known.

    Raises:
    - KeyError: in drug mode, a sequence has no 'drug_id' entry.
    """
    def _drug_cell(raw):
        # Unique drugs in first-seen order: a single drug is stored bare,
        # several as a list, none as "".
        unicos = []
        for drug in literal_eval(raw):
            # The old guard `x != '[' or x != ']'` was always true.
            if drug in ('[', ']') or drug in unicos:
                continue
            unicos.append(drug)
        if len(unicos) == 1:
            return unicos[0]
        return unicos if unicos else ""

    df_b = pd.read_excel(archivoEntrada)
    df_c = pd.read_excel(archivoEntrada2)
    common_cols = [columna for columna in df_b.columns if columna in set(df_c.columns)]
    df_b = pd.concat([df_b[common_cols], df_c[common_cols]], ignore_index=True)

    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
    columnas = ['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2','similaridadBlosum']
    con_farmacos = mode == "drug"
    if con_farmacos:
        drug_dict = dict(df_b[['protein_sequence','drug_id']].values)
        columnas = columnas + ['drug_id_p1','drug_id_p2']

    for item in output:
        protein_sequence1 = item[0]
        protein_sequence2 = item[1]
        if con_farmacos:
            item.append(_drug_cell(drug_dict[protein_sequence1]))
            item.append(_drug_cell(drug_dict[protein_sequence2]))
        if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
            item[0] = proteinas_dict[protein_sequence1]
            item[1] = proteinas_dict[protein_sequence2]

    df_a = pd.DataFrame(output, columns=columnas)
    # Save the updated table to a CSV file
    df_a.to_csv('AllProteins_%Similitud'+Sal+'.csv', index=False)
def similitudMatProteinas(sequences,sequences2, matrix,matrix2,matriz3,matriz4,equal=False):
    """
    Builds the table of pairwise scores, as percentages, from four score matrices.

    Parameters:
    - sequences: indexable collection of sequences, addressed by the inner index
    - sequences2: indexable collection of sequences, addressed by the outer index
    - matrix, matrix2, matriz3, matriz4: score containers read as
      m[outer][inner], where outer runs over sequences2 and inner over sequences
    - equal: when True, pairs with the same outer and inner index are left out

    Returns:
    - List of [sequences[inner], sequences2[outer], score1*100, score2*100,
      score3*100, score4*100], ordered by outer index and then inner index.
    """
    matrices = (matrix, matrix2, matriz3, matriz4)
    return [
        [sequences[interior], sequences2[exterior]]
        + [tabla[exterior][interior]*100 for tabla in matrices]
        for exterior in range(len(sequences2))
        for interior in range(len(sequences))
        if not (equal and exterior == interior)
    ]
if __name__ == "__main__":
    # Builds the cross-dataset similarity table from four precomputed score matrices.
    archivoEntrada = "Data/data_lung_cancer_treatment.xlsx"
    sequences1 = readData(archivoEntrada)
    archivoEntrada2 = "Data/data_autoimmume_desease.xlsx"
    sequences2 = readData(archivoEntrada2)

    def _cargar_matriz_escalada(ruta):
        # Scores are mapped with |3x + 1| / 4. The absolute value is now
        # assigned; `matrix.abs()` used to discard its result.
        escalada = (pd.read_csv(ruta,header=None,index_col=False)*3+1.0).abs()/4
        print(escalada.shape)
        return escalada

    matrix=_cargar_matriz_escalada('matrizNWAutoimmuneDiseaseC.csv')
    matrix2=_cargar_matriz_escalada('matrizNWAutoimmuneDiseaseMod1.csv')
    matrix3=_cargar_matriz_escalada('matrizNWAutoimmuneDiseaseMod2.csv')

    # Float dtype up front so the normalized values can be stored cell by cell.
    matrix4=pd.read_csv('matrizNWAutoimmuneDiseaseBlosum62.csv',header=None,index_col=False).astype(float)
    dic= bl.BLOSUM(62)
    print(dic)
    mismatch=0
    match=1
    # Lowest score any residue pair can get (never above -4 nor above mismatch).
    minn=min(min(min(fila.values()) for fila in list(dic.values())),-4,mismatch)
    print(matrix4.shape)
    print(len(sequences1))

    def _puntuacion_propia(secuencia):
        # Score of a sequence aligned with itself; residues the matrix does not
        # know (score -inf) count as a plain match.
        total=0
        for residuo in secuencia:
            propia=dic[residuo][residuo]
            total+=propia if propia != float('-inf') else match
        return total

    # Self-scores depend on one sequence only, so they are computed once
    # instead of once per pair.
    lista1=list(sequences1)
    lista2=list(sequences2)
    propias1=[_puntuacion_propia(secuencia) for secuencia in lista1]
    propias2=[_puntuacion_propia(secuencia) for secuencia in lista2]

    # Rescale every BLOSUM cell to [0, 1] between the worst and best score the
    # pair could reach. Column = index in sequences2, row = index in sequences1,
    # i.e. the same cell the old `matrix4[row1][row2]` addressed. `.at` writes
    # into matrix4 itself, which chained indexing does not guarantee.
    for row1 in range(len(lista2)):
        for row2 in range(len(lista1)):
            mejor=max(propias2[row1],propias1[row2])
            peor=minn*max(len(lista1[row2]),len(lista2[row1]))
            matrix4.at[row2,row1]=(matrix4.at[row2,row1]*mejor-peor)/(mejor-peor)

    print(matrix[0][0])
    print(matrix2[0][0])
    print(matrix3[0][0])
    print(matrix4[0][0])
    output=similitudMatProteinas(sequences1,sequences2, matrix,matrix2,matrix3,matrix4,equal=False)
    print("Generada la tabla de con las matrices de similaridad especificadas")
    remplazar_sequence_for_ID(output,archivoEntrada,archivoEntrada2,"AutoimmuneDisease")
import pandas as pd
import time
import numpy as np
import re
from ast import literal_eval
from find_patterns import substitute_or_remove_prot_id
def readData(archivoEntrada, enfermedad,patrones_file,Sal):
    """
    Read protein data from an Excel file and record which proteins contain each pattern.

    For every distinct pattern longer than three characters in the ``Patron``
    column of ``patrones_file``, the proteins whose ``protein_sequence``
    contains the pattern are collected together with the offset of its first
    occurrence and the associated disease ids. The summary is written to
    ``"ProtByPattern" + Sal + ".xlsx"``.

    Parameters:
    - archivoEntrada (str): Path to the Excel file. It must provide the columns
      ``protein_id``, ``protein_sequence`` and ``disease_id``.
    - enfermedad (str): Disease ID for filtering; a falsy value disables the filter.
    - patrones_file (str): Path to the CSV file containing patterns (column ``Patron``).
    - Sal (str): Suffix inserted into the name of the generated Excel file.
    Returns:
    - data (pd.DataFrame): The rows read from ``archivoEntrada`` after the
      optional disease filter.
    """
    data = pd.read_excel(archivoEntrada)
    if enfermedad:
        data = data.loc[data["disease_id"] == enfermedad]
    dataB = pd.read_csv(patrones_file)
    print(len(data))
    # NOTE(review): both counters are taken after any filtering has already
    # happened, so the two messages below always report 0. Kept as-is so the
    # console output does not change; confirm what they were meant to count.
    filt_data = len(data)
    alz_filt_data = len(dataB)
    print("Proteins discarded after the main filter: " + str(filt_data - len(data)))
    print("Proteins discarded after the common Alzheimer's filter: " + str(alz_filt_data - len(dataB)))

    proteins_by_pattern = {}
    diseases_by_pattern = {}
    for pattern in dataB["Patron"].unique():
        # Skip missing values and patterns of three characters or fewer.
        if not isinstance(pattern, str) or len(pattern) <= 3:
            continue
        # Literal substring search (regex=False) so the match agrees with the
        # literal str.find() below; rows without a sequence never match.
        contains_pattern = data["protein_sequence"].str.contains(pattern, regex=False, na=False)
        hits = data[contains_pattern]
        protein_ids = hits["protein_id"].to_list()
        offsets = hits["protein_sequence"].str.find(pattern).to_list()
        disease_ids = hits["disease_id"].to_list()
        print(len(offsets))
        print(len(protein_ids))

        # Group disease ids by (protein, offset), keeping first-seen order.
        grouped = {}
        for protein_id, offset, disease_id in zip(protein_ids, offsets, disease_ids):
            grouped.setdefault((protein_id, offset), []).append(disease_id)
        proteins_by_pattern[pattern] = [list(key) for key in grouped]
        diseases_by_pattern[pattern] = list(grouped.values())

    dataG = pd.DataFrame({
        "pattern": list(proteins_by_pattern.keys()),
        "proteins": list(proteins_by_pattern.values()),
        "desease_id": list(diseases_by_pattern.values()),
    })
    dataG.to_excel("ProtByPattern"+Sal+".xlsx")
    return data
def add_protein_info_to_data(main_data_path, patterns_info_path, protein_names_path):
    """
    Add protein names and per-pattern protein information to the pattern summary.

    For every pattern of ``patterns_info_path`` that also appears in the
    ``pattern`` column of the main data, three columns are filled in:

    - ``proteins_treat``: string form of a dict mapping each ``Proteina`` of
      the pattern to its parsed ``Posiciones`` value.
    - ``names_Treat``: for each key of that dict, the list of ``Entry_Name``
      values whose ``Entry`` equals the protein id, or ``["N/A"]`` if none.
    - ``protein_names``: the same lookup applied to the protein ids stored in
      the row's ``proteins`` column.

    The result is written next to the input as ``<main_data_path without
    extension>_summary.xlsx``.

    Parameters:
    - main_data_path (str): The path to the Excel file containing the main data.
    - patterns_info_path (str): The path to the CSV file containing patterns and protein information.
    - protein_names_path (str): The path to the CSV file containing protein names.
    Returns:
    None: The function writes a new Excel file with the additional protein information.
    Example:
    ```python
    add_protein_info_to_data("main_data.xlsx", "patterns_info.csv", "protein_names.csv")
    ```
    Note:
    - The function assumes that the provided Excel file ('main_data_path') contains
      a 'pattern' column and a 'proteins' column holding the string form of a list
      of ``[protein_id, position]`` pairs.
    - The 'patterns_info_path' CSV file is expected to have columns 'Patron', 'Proteina', and 'Posiciones'.
    - The 'protein_names_path' CSV file is expected to have columns 'Entry' and 'Entry_Name'.
    """
    import os

    # Read data from files
    main_data = pd.read_excel(main_data_path)
    patterns_info = pd.read_csv(patterns_info_path)
    protein_names = pd.read_csv(protein_names_path)

    # Index the names table once instead of rescanning it for every protein.
    names_by_entry = {}
    for entry, entry_name in zip(protein_names["Entry"], protein_names["Entry_Name"]):
        names_by_entry.setdefault(entry, []).append(entry_name)

    def entry_names(protein_id):
        """Return the entry names recorded for ``protein_id`` or ``["N/A"]``."""
        return list(names_by_entry.get(protein_id, ["N/A"]))

    # Initialize the new columns with object dtype so they can hold lists.
    main_data["protein_names"] = pd.Series("", index=main_data.index, dtype=object)
    main_data["proteins_treat"] = pd.Series("{}", index=main_data.index, dtype=object)
    main_data["names_Treat"] = pd.Series("", index=main_data.index, dtype=object)

    for pattern, group_data in patterns_info.groupby("Patron"):
        is_match = main_data["pattern"] == pattern
        if not is_match.any():
            continue

        # Collect every protein of this pattern; a repeated protein id keeps
        # its first position in the dict but takes the latest positions value.
        proteins_treat = {}
        for protein_id, positions in zip(group_data["Proteina"], group_data["Posiciones"]):
            proteins_treat[protein_id] = literal_eval(positions)

        # Write straight into main_data rather than into a filtered copy.
        main_data.loc[is_match, "proteins_treat"] = str(proteins_treat)
        treat_column = main_data.loc[is_match, "proteins_treat"]
        print(treat_column.apply(lambda text: list(literal_eval(text))))

        main_data.loc[is_match, "names_Treat"] = treat_column.apply(
            lambda text: [entry_names(protein_id) for protein_id in literal_eval(text)]
        )
        main_data.loc[is_match, "protein_names"] = main_data.loc[is_match, "proteins"].apply(
            lambda text: [entry_names(protein_id) for protein_id, _ in literal_eval(text)]
        )

    # Save the updated data. splitext strips only the final extension, so
    # paths that contain other dots (e.g. "./out/file.xlsx") stay intact.
    main_data_base_name = os.path.splitext(main_data_path)[0]
    main_data.to_excel(f"{main_data_base_name}_summary.xlsx", index=False)
def add_entry_name(archivoEntrada,protein_name_file,archNom):
    """
    Export the name records of the proteins listed in an Excel file.

    Two CSV files are produced:
    - ``archivoEntrada + "_nombre.csv"``: rows of the name table whose ``Entry``
      appears in the ``protein_id`` column of the Excel file.
    - ``"Proteinas_sin_nombre"``: rows of the Excel file whose ``protein_id``
      has no ``Entry`` in the name table.

    Parameters:
    - archivoEntrada (str): Path to the Excel file (needs a ``protein_id`` column).
    - protein_name_file (str): Path to the protein name CSV file; only the columns
      ``Entry``, ``Entry_Name``, ``Protein_names`` and ``Length`` are read.
    - archNom (str): Path to the id substitution file handed to
      ``substitute_or_remove_prot_id`` together with the mode ``"na"``.
    Returns:
    - None
    """
    proteins = pd.read_excel(archivoEntrada)
    name_table = pd.read_csv(
        protein_name_file,
        usecols=['Entry', "Entry_Name", "Protein_names", "Length"],
    )
    # NOTE(review): the mode "na" is neither "s" nor "r"; check which branch
    # of substitute_or_remove_prot_id it is expected to select.
    name_table = substitute_or_remove_prot_id(name_table, archNom, "na")
    print("PASA")
    name_table = name_table.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])

    # Name records that belong to a protein present in the input file.
    is_listed = name_table["Entry"].isin(proteins["protein_id"])
    named = name_table[is_listed]
    named.to_csv(archivoEntrada + "_nombre.csv")

    # Input proteins for which no name record exists.
    has_name = proteins["protein_id"].isin(name_table["Entry"])
    unnamed = proteins[~has_name]
    unnamed.to_csv("Proteinas_sin_nombre")
if __name__ == "__main__":
    # Build the per-pattern protein table for the immune-disease data set and
    # then enrich it with protein names. Earlier runs (adding entry names to
    # the cancer data, merging the lung-cancer disease and treatment sheets)
    # were one-off steps and are not part of this pipeline.
    patterns_csv = "patronesIdenticos10Treat.csv"
    data = readData(
        "Data/data_immune_desease.xlsx",
        "",
        patterns_csv,
        "Immun01",
    )
    add_protein_info_to_data(
        "ProtByPatternImmun01.xlsx",
        patterns_csv,
        "Data/protein_name.csv",
    )
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment