Adding final results finding patterns

b5e1a75c · Belen Otero Carrasco · b5e1a75c · b5e1a75c · b5e1a75c · b5e1a75c
Commit b5e1a75c authored Aug 30, 2024 by Belen Otero Carrasco
16 changed files
--- a/Code Approach 1 and 2 (2.1 - 2.2)/compute_distance_mat.py
+++ b/Code Approach 1 and 2 (2.1 - 2.2)/compute_distance_mat.py
+import pandas as pd
+import Levenshtein
+import time
+import numpy as np
+from Levenshtein import distance
+import re
+from minineedle import needle, smith, core
+from Bio.Blast.Applications import NcbiblastpCommandline
+from io import StringIO
+from Bio.Blast import NCBIXML
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+import swalign
+import multiprocessing as mp
+globi=0
+import nw_wrapper
+import nw_wrapper_matrix
+import math
+
+import blosum as bl
+
def substitute_or_remove_prot_id(data,archSubs,sub_rem):
    """
    Substitute or remove protein IDs based on a substitution file.

    The substitution file is whitespace-separated. Its first line holds the
    column names and each following line holds one row of equivalent IDs.
    When the first column name contains "Primary", the IDs of the second
    column are the ones replaced/removed and the first column supplies the
    replacements; otherwise the two columns swap roles.

    Parameters:
    - data: DataFrame containing protein data (must have a "protein_id" column)
    - archSubs: Substitution file
    - sub_rem: 's' to substitute; any other value removes the matching rows

    Returns:
    - Updated DataFrame after substitution or removal (the input is not modified)

    Raises:
    - ValueError: if the substitution file has fewer than two columns
    """
    global globi

    with open(archSubs) as prottosubs:
        header = prottosubs.readline().split()
        columns = [[] for _ in header]
        for line in prottosubs:
            # Blank lines split into an empty list and contribute nothing.
            for column, field in zip(columns, line.split()):
                column.append(field)

    if len(columns) < 2:
        raise ValueError(
            "Substitution file %r must have at least two columns" % (archSubs,)
        )

    # Index of the column holding the IDs we want to keep.
    keep = 0 if "Primary" in header[0] else 1
    new_ids = columns[keep]
    old_ids = columns[(keep + 1) % 2]

    if sub_rem == "s":
        # Series.replace returns a new Series; the result has to be stored,
        # otherwise the substitution is silently lost.
        data = data.copy()
        if old_ids:
            data["protein_id"] = data["protein_id"].replace(old_ids, new_ids)
        return data

    discarded = data["protein_id"].isin(old_ids)
    # Keep a record of the rows that were dropped; globi numbers the files so
    # successive calls do not overwrite each other.
    data[discarded].to_csv('resultados/proteinasDescartadas_'+ str(globi) +'.csv', index=False)
    globi = globi + 1
    return data[~discarded]
+    
def readData(archivoEntrada, enfermedad):
    """
    Load the protein sequences column from an Excel workbook.

    Parameters:
    - archivoEntrada: Excel file containing protein data
    - enfermedad: Disease ID; accepted for interface compatibility but not
      used, no filtering is applied here

    Returns:
    - The "protein_sequence" column of the workbook
    """
    workbook = pd.read_excel(archivoEntrada)
    return workbook["protein_sequence"]
def calculate_matrix_similarity(data, similarity_function, output_filename, processes=20):
    """
    Calculate the pairwise similarity matrix of the sequences using multiprocessing.

    Parameters:
    - data: Iterable of protein sequences (e.g. a pandas Series or a list)
    - similarity_function: Picklable function taking two sequences and returning a score
    - output_filename: Filename to save the similarity matrix (CSV, no header, no index)
    - processes: Number of worker processes in the pool (default 20)

    Returns:
    - None; the matrix is written to output_filename
    """
    # Materialise the values so the pairing is positional and does not depend
    # on the index labels of a pandas Series.
    sequences = list(data)
    num_points = len(sequences)

    if num_points == 0:
        # Nothing to compare: leave an empty file instead of dividing by zero.
        with open(output_filename, "w"):
            pass
        return

    pairs = [(first, second) for first in sequences for second in sequences]
    with mp.Pool(processes=processes) as pool:
        scores = pool.starmap(similarity_function, pairs)

    # starmap keeps the input order, so the flat result is the matrix in
    # row-major order.
    matrix = np.array(scores).reshape(num_points, num_points)
    pd.DataFrame(matrix).to_csv(output_filename, index=False, header=False)
+
+
+
def remplazar_sequence_for_ID(output,archivoEntrada,archSubs):
    """
    Replace, in place, every protein sequence in output by its protein id.

    Parameters:
    - output: Indexable collection of sequences; entries are overwritten in place
    - archivoEntrada: Excel file providing the sequence/id equivalences
    - archSubs: Substitution file passed to substitute_or_remove_prot_id in
      substitute ("s") mode before the lookup table is built

    Returns:
    - The same output object, with known sequences swapped for their id
    """
    equivalences = substitute_or_remove_prot_id(pd.read_excel(archivoEntrada), archSubs, "s")
    sequence_to_id = dict(equivalences[['protein_sequence', 'protein_id']].values)

    for position in range(len(output)):
        candidate = output[position]
        if candidate not in sequence_to_id:
            # Unknown sequences are left untouched.
            continue
        output[position] = sequence_to_id[candidate]

    return output
def smith_waterman_similarity(pattern1,pattern2):
    """
    Length-normalised Smith-Waterman score with minineedle's default parameters.

    Parameters:
    - pattern1: Protein sequence 1
    - pattern2: Protein sequence 2

    Returns:
    - Smith-Waterman alignment score divided by the length of the longer sequence
    """
    raw_score = smith.SmithWaterman(pattern1, pattern2).get_score()
    longest = max(len(pattern1), len(pattern2))
    return raw_score / longest
def levenshtein_similarity(pattern1, pattern2):
    """
    Calculate the normalised Levenshtein distance between two sequences.

    Despite the function name, the value is a distance: 0 means the sequences
    are identical and 1 means every position had to be edited.

    Parameters:
    - pattern1: Protein sequence 1
    - pattern2: Protein sequence 2

    Returns:
    - Levenshtein distance divided by the length of the longer sequence, in [0,1]
    """
    longest = max(len(pattern1), len(pattern2))
    if longest == 0:
        # Two empty sequences are identical; avoid dividing by zero.
        return 0.0
    return Levenshtein.distance(pattern1, pattern2) / longest
def needleman_wunsch_similarity(pattern1, pattern2):
    """
    Length-normalised Needleman-Wunsch score with minineedle's default parameters.

    Parameters:
    - pattern1: Protein sequence 1
    - pattern2: Protein sequence 2

    Returns:
    - Needleman-Wunsch alignment score divided by the length of the longer sequence
    """
    raw_score = needle.NeedlemanWunsch(pattern1, pattern2).get_score()
    longest = max(len(pattern1), len(pattern2))
    return raw_score / longest
+    
def to_raw(string):
    """Return the default string formatting of the given value."""
    return f"{string}"
def blast_similarity(pattern1,pattern2):
    """
    Run BLAST (blastp) to calculate similarity between two protein sequences.

    The two sequences are written to FASTA files inside a private temporary
    directory, so concurrent calls (e.g. from a multiprocessing pool) cannot
    overwrite each other's input files.

    Parameters:
    - pattern1: Protein sequence 1 (query)
    - pattern2: Protein sequence 2 (subject)

    Returns:
    - Sum of the scores of all HSPs divided by the length of the longer sequence

    Raises:
    - ValueError: if either sequence is empty
    """
    import os
    import tempfile

    # Validate before touching the sequences; an assert would be stripped
    # when Python runs with -O.
    if not pattern1 or not pattern2:
        raise ValueError("blast_similarity requires two non-empty sequences")

    with tempfile.TemporaryDirectory() as workdir:
        query_path = os.path.join(workdir, "seq1.fasta")
        subject_path = os.path.join(workdir, "seq2.fasta")
        SeqIO.write(SeqRecord(Seq(pattern1), id="seq1"), query_path, "fasta")
        SeqIO.write(SeqRecord(Seq(pattern2), id="seq2"), subject_path, "fasta")
        # outfmt=5 asks blastp for XML; element [0] is the captured stdout.
        output = NcbiblastpCommandline(query=query_path, subject=subject_path, outfmt=5)()[0]

    blast_result_record = NCBIXML.read(StringIO(output))
    result = sum(
        hsp.score
        for alignment in blast_result_record.alignments
        for hsp in alignment.hsps
    )
    return float(result)/max(len(pattern1), len(pattern2))
+                       
def nwmodScore(sec1,sec2,dic,match,mismatch,gap):
    """
    Normalised Needleman-Wunsch score computed through nw_wrapper.NW.

    Parameters:
    - sec1: Protein sequence 1
    - sec2: Protein sequence 2
    - dic: Substitution dictionary of the letters that belong to the same group
    - match: match value
    - mismatch: mismatch value
    - gap: gap value

    Returns:
    - Integer alignment score divided by match times the length of the longer sequence
    """
    aligner = nw_wrapper.NW(sec1, sec2, dic, match, mismatch, gap)
    raw_score = int(aligner.get_score())
    longest = max(len(sec1), len(sec2))
    return raw_score / (match * longest)
def nwmodScoreMt(sec1,sec2,dic,match,mismatch,gap):
    """
    Normalised Needleman-Wunsch score computed through nw_wrapper_matrix.NWM.

    The raw score is divided by the larger of the two self-alignment scores,
    where a sequence's self-alignment score is the sum of dic[x][x] over its
    residues.

    Parameters:
    - sec1: Protein sequence 1
    - sec2: Protein sequence 2
    - dic: Substitution matrix indexable as dic[a][b]
    - match: match value
    - mismatch: mismatch value
    - gap: gap value

    Returns:
    - Integer alignment score divided by the larger self-alignment score
    """
    nw_instance = nw_wrapper_matrix.NWM(sec1, sec2, dic, match, mismatch, gap)
    # No per-pair printing here: this runs once for every pair of sequences.
    self_score_1 = sum(dic[residue][residue] for residue in sec1)
    self_score_2 = sum(dic[residue][residue] for residue in sec2)
    return int(nw_instance.get_score()) / max(self_score_1, self_score_2)
def generate_nwmod(data,dic,dd,sal):
    """
    Write the pairwise Needleman-Wunsch score matrix (custom matches) to CSV.

    Scores come from nwmodScore. The matrix is written one row at a time to
    'resultados/matrizNW<sal>.csv', so only a single row is held in memory.

    Parameters:
    - data: Protein sequences
    - dic: Substitution dictionary of the letters that belong to the same group
    - dd: match, mismatch and gap values dictionary
    - sal: Extension added to the output file

    Returns:
    - None; the matrix is written to disk
    """
    match = dd["match"]
    mismatch = dd["mismatch"]
    gap = dd["gap"]
    output_path = 'resultados/matrizNW'+sal+'.csv'

    sequences = list(data)
    for i, current in enumerate(sequences):
        row = [nwmodScore(current, other, dic, match, mismatch, gap) for other in sequences]
        # The first row creates/overwrites the file, later rows are appended.
        pd.DataFrame([row]).to_csv(
            output_path, index=False, header=False, mode="w" if i == 0 else "a"
        )
    return
def generate_nwmodpremade(data,dic,dd,sal):
    """
    Write the pairwise Needleman-Wunsch score matrix (premade matrix) to CSV.

    Scores come from nwmodScoreMt. The matrix is written one row at a time to
    'resultados/matrizNW<sal>.csv', so only a single row is held in memory.

    Parameters:
    - data: Protein sequences
    - dic: Substitution matrix indexable as dic[a][b]
    - dd: match, mismatch and gap values dictionary
    - sal: Extension added to the output file

    Returns:
    - None; the matrix is written to disk
    """
    match = dd["match"]
    mismatch = dd["mismatch"]
    gap = dd["gap"]
    output_path = 'resultados/matrizNW'+sal+'.csv'

    sequences = list(data)
    for i, current in enumerate(sequences):
        row = [nwmodScoreMt(current, other, dic, match, mismatch, gap) for other in sequences]
        # The first row creates/overwrites the file, later rows are appended.
        pd.DataFrame([row]).to_csv(
            output_path, index=False, header=False, mode="w" if i == 0 else "a"
        )
    return
def swap_dict(d):
    """
    Invert a dictionary of lists.

    Every element found in the value lists becomes a key of the result, mapped
    to the list of original keys whose list contained it (in iteration order,
    repeated if the element is repeated). Elements of length 1 are stored
    under their first item; for one-character strings that is the string itself.

    Parameters:
    - d: Input dictionary mapping a key to an iterable of values

    Returns:
    - Dictionary with swapped keys and values
    """
    new_dict = {}
    for key, values in d.items():
        for value in values:
            # Normalise first so the membership test and the storage use the
            # same key; otherwise single-element containers reset their list
            # on every occurrence.
            target = value[0] if len(value) == 1 else value
            new_dict.setdefault(target, []).append(key)
    return new_dict
def read_aminoacidos(afile):
    """
    Parse a tab-separated amino acid file into two dictionaries.

    On each line the first field is used as the key and the remaining fields
    as its list of values.

    Parameters:
    - afile: Amino acid data file

    Returns:
    - Tuple (swapped, direct): direct maps each first field to the rest of
      its line, swapped is swap_dict(direct)
    """
    cla = {}
    with open(afile, 'r') as op:
        for raw_line in op:
            key, *members = raw_line.replace('\n', '').split('\t')
            cla[key] = members
    swapped = swap_dict(cla)
    return swapped, cla
def get_matrix(data, similarity_function, output_filename):
    """
    Build the square pairwise matrix for data and write it to a file.

    Thin convenience wrapper around calculate_matrix_similarity.

    Parameters:
    - data: Protein sequences
    - similarity_function: similarity function to apply to each pair
    - output_filename: Name of the file in which the output matrix will be written
    """
    calculate_matrix_similarity(
        data,
        similarity_function,
        output_filename,
    )
def get_clases(clas):
    """
    Compute, for every pair of keys, the overlap of their associated values.

    The overlap of two keys is the size of the intersection of their value
    sets divided by the size of the union.

    Parameters:
    - clas: Dictionary mapping each key to an iterable of hashable values
    Returns:
    - Dictionary of dictionaries: result[k][k2] is the overlap between k and k2
    """
    members = {name: set(group) for name, group in clas.items()}
    return {
        first: {
            second: float(len(own & other) / len(own | other))
            for second, other in members.items()
        }
        for first, own in members.items()
    }
if __name__=="__main__":
    inputFile="Data/data_lung_cancer_treatment.xlsx"
    data=readData(inputFile,"")

    # 1) Pairwise matrices for the plain similarity functions.
    similarity_functions = [levenshtein_similarity]
    output_filenames = ["levenshtein_similarity.csv"]
    for sim_func, output_filename in zip(similarity_functions, output_filenames):
        get_matrix(data, sim_func, output_filename)

    afile="Data/aminoacidos_mod.txt"
    sal="mod1"

    # 2) Smoke test of the custom Needleman-Wunsch wrapper on one long sequence
    #    aligned against itself. The literal is split into two adjacent string
    #    literals (joined by the parser) so it is a valid single string.
    a = (
        "MACWPQLRLLLWKNLTFRRRQTCQLLLEVAWPLFIFLILISVRLSYPPYEQHECHFPNKAMPSAGTLPWVQGIICNANNPCFRYPTPGEAPGVVGNFNKSIVARLFSDARRLLLYSQKDTSMKDMRKVLRTLQQIKKSSSNLKLQDFLVDNETFSGFLYHNLSLPKSTVDKMLRADVILHKVFLQGYQLHLTSLCNGSKSEEMIQLGDQEVSELCGLPREKLAAAERVLRSNMDILKPILRTLNSTSPFPSKELAEATKTLLHSLGTLAQELFSMRSWSDMRQEVMFLTNVNSSSSSTQIYQAVSRIVCGHPEGGGLKIKSLNWYEDNNYKALFGGNGTEEDAETFYDNSTTPYCNDLMKNLESSPLSRIIWKALKPLLVGKILYTPDTPATRQVMAEVNKTFQELAVFHDLEGMWEELSPKIWTFMENSQEMDLVRMLLDSRDNDHFWEQQLDGLDWTAQDIVAFLAKHPEDVQSSNGSVYTWREAFNETNQAIRTISRFMECVNLNKLEPIATEVWLINKSMELLDERKFWAGIVFTGITPGSIELPHHVKYKIRMDIDNVERTNKIKDGYWDPGPRADPFEDMRYVWGGFAYLQDVVEQAIIRVLTGTEKKTGVYMQQMPYPCYVDDIFLRVMSRSMPLFMTLAWIYSVAVIIKGIVYEKEARLKETMRIMGLDNSILWFSWFISSLIPLLVSAGLLVVILKLGNLLPYSDPSVVFVFLSVFAVVTILQCFLISTLFSRANLAAACGGIIYFTLYLPYVLCVAWQDYVGFTLKIFASLLSPVAFGFGCEYFALFEEQGIGVQWDNLFESPVEEDGFNLTTSVSMMLFDTFLYGVMTWYIEAVFPGQYGIPRPWYFPCTKSYWFGEESDEKSHPGSNQKRISEICMEEEPTHLKLGVSIQNLVKVYRDGMKVAVDGLALNFYEGQITSFLGHNGAGKTTTMSILTGLFPPTSGTAYILGKDIRSEMSTIRQNLGVCPQHNVLFDMLTVEEHIWFYARLKGLSEKHVKAEMEQMALDVGLPSSKLKSKTSQLSGGMQRKLSVALAFVGGSKVVILDEPTAGVDPYSRRGIWELLLKYRQGRTIILSTHHMDEADVLGDRIAIISHGKLCCVGSSLFLKNQLGTGYYLTLVKKDVESSLSSCRNSSSTVSYLKKEDSVSQSSSDAGLGSDHESDTLTIDVSAISNLIRKHVSEARLVEDIGHELTYVLPYEAAKEGAFVELFHEIDDRLSDLGISSYGISETTLEEIFLKVAEESGVDAETSDGTLPARRNRRAFGDKQSCLRPFTEDDAADPNDSDIDPESRETDLLSGMDGKGSYQVKGWKLTQQQFVALLWKRLLIARRSRKGFFAQIVLPAVFVCIALVFSLIVPPFGKYPSLELQPWMYNEQYTFVSNDAPEDTGTLELLNALTKDPGFGTRCMEGNPIPDTPCQAGEEEWTTAPVPQTIMDLFQNGNWTMQNPSPACQCSSDKIKKMLPVCPPGAGGLPPPQRKQNTADILQDLTGRNISDYLVKTYVQIIAKSLKNKIWVNEFRYGGFSLGVSNTQALPPSQEVNDAIKQMKKHLKLAKDSSADRFLNSLGRFMTGLDTKNNVKVWFNNKGWHAISSFLNVINNAILRANLQKGENPSHYGITAFNHPLNLTKQQLSEVALMTTSVDVLVSICVIFAMSFVPASFVVFLIQERVSKAKHLQFISGVKPVIYWLSNFVWDMCNYVVPATLVIIIFICFQQKSYVSSTNLPVLALLLLLYGWSITPLMYPASFVFKIPSTAYVVLTSVNLFIGINGSVATFVLELFTDNKLNNINDILKSVFLIFPHFCLGRGLIDMVKNQAMADALERFGENRFVSPLSWDLVGRNLFAMAVEGVVFFLITVLIQYRFFIRPRPVNAKLSPLNDEDEDVRRERQRILDGGGQNDILEIKELTKIYRRKRKPAVDRICVGIPPGECFGLLGVNGAGKSSTFKMLTGDTTVTRGDAFLNKNSILSNIHEVHQNMGYC"
        "PQFDAITELLTGREHVEFFALLRGVPEKEVGKVGEWAIRKLGLVKYGEKYAGNYSGGNKRKLSTAMALIGGPPVVFLDEPTTGMDPKARRFLWNCALSVVKEGRSVVLTSHSMEECEALCTRMAIMVNGRFRCLGSVQHLKNRFGDGYTIVVRIAGSNPDLKPVQDFFGLAFPGSVLKEKHRNMLQYQLPSSLSSLARIFSILSQSKKRLHIEDYSVSQTTLDQVFVNFAKDQSDDDHLKDLSLHKNQTVVDVAVLTSFLQDEKVKESYV"
    )
    # NOTE(review): the original spelled the literal out a second time for b;
    # both copies appear to be identical -- confirm before relying on this.
    b = a
    # Toy substitution dictionary used only by the smoke test.
    c = {'A': {'A': 1, 'C': 1}, 'C': {'A': 1, 'C': 1}}

    dd={"match":3,"mismatch":0,"gap":-1}
    nw_instance = nw_wrapper.NW(a, b, c, dd["match"], dd["mismatch"], dd["gap"])
    score = nw_instance.get_score()
    print(f"Alignment Score: {score/max(len(a),len(b))}")

    # 3) Needleman-Wunsch matrix using the amino acid class overlaps.
    clas,_=read_aminoacidos(afile)
    clases=get_clases(clas)
    generate_nwmod(data,clases,dd,sal)

    # 4) Needleman-Wunsch matrix using the premade BLOSUM62 matrix.
    matrix = bl.BLOSUM(62)
    dd={"match":1,"mismatch":-4,"gap":0}
    sal="blosum62"
    generate_nwmodpremade(data,matrix,dd,sal)

    # The experimental post-processing that used to be parked here inside bare
    # triple-quoted strings was never executed and has been removed; it is
    # still available in version control.
+   
--- a/Code Approach 1 and 2 (2.1 - 2.2)/metrics.py
+++ b/Code Approach 1 and 2 (2.1 - 2.2)/metrics.py
+import time
+import pandas as pd
+import Levenshtein
+import csv
+import ast
+import numpy as np
+from descarteProteinas import remplazar_ID_for_sequence as rp
+from descarteProteinas import substitute_or_remove_prot_id
+from generate_the_excel import substitute_or_remove_prot_id
+    
def similitudProteinas(sequences):
    """
    Compute a Needleman-Wunsch based percentage for every ordered pair of
    distinct sequences.

    Parameters:
    - sequences: Collection of protein sequences

    Returns:
    - List of [sequence1, sequence2, value] entries, one per ordered pair of
      different sequences, where value is
      abs(score - 1) / (2 * length of the longer sequence) * 100
    """
    pairs = [(row1, row2) for row1 in sequences for row2 in sequences if row1 != row2]
    if not pairs:
        return []

    # This module does not import minineedle at the top, so bring the aligner
    # into scope here; without it the first comparison raises NameError.
    from minineedle import needle

    output = []
    for row1, row2 in pairs:
        score = needle.NeedlemanWunsch(row1, row2).get_score()
        similarity = abs(score - 1) / (2 * max(len(row1), len(row2)))
        output.append([row1, row2, similarity * 100])
    return output
+
def metrica_distanciaProteinas():
    """
    Pair up the proteins that share a pattern and attach their similarity.

    Reads "resultados/patronesIdenticos.csv" (columns 'Patron', 'Proteina')
    and "AllProteins_%Similitud.csv" (columns 'Proteina1', 'Proteina2',
    'Similaridad'). For every ordered pair of different rows within the same
    pattern group whose contents are not equal, one output row is produced
    with the pattern, both proteins and the similarity recorded for that
    (Proteina1, Proteina2) pair, or an empty string when none is recorded.
    The result is written to
    'resultados/Metrica_distanciaProteinasMismoPatron.csv'.
    """
    patrones = pd.read_csv("resultados/patronesIdenticos.csv")
    similitudes = pd.read_csv("AllProteins_%Similitud.csv")

    # (protein1, protein2) -> similarity lookup table
    lookup = {}
    for prot_a, prot_b, valor in zip(
        similitudes['Proteina1'], similitudes['Proteina2'], similitudes['Similaridad']
    ):
        lookup[(prot_a, prot_b)] = valor

    filas = []
    for _, grupo in patrones.groupby('Patron'):
        etiquetas = grupo.index
        for izquierda in etiquetas:
            for derecha in etiquetas:
                if izquierda == derecha:
                    continue
                # Rows with identical content are duplicates, not real pairs.
                if patrones.loc[izquierda].equals(patrones.loc[derecha]):
                    continue
                prot_izquierda = patrones.loc[izquierda, 'Proteina']
                prot_derecha = patrones.loc[derecha, 'Proteina']
                filas.append([
                    patrones.loc[izquierda, 'Patron'],
                    prot_izquierda,
                    prot_derecha,
                    lookup.get((prot_izquierda, prot_derecha), ''),
                ])

    resultado = pd.DataFrame(filas, columns=['Patron', 'Proteina1', 'Proteina2', 'Similitud'])
    resultado.to_csv('resultados/Metrica_distanciaProteinasMismoPatron.csv',
                     index=False)
+
+def _etiqueta_ocurrencia(ocurrencia):
+    """Return the file-name tag built from the fractional part of *ocurrencia* (e.g. 0.5 -> "50")."""
+    return str(int((float(ocurrencia) % 1) * 100))
+
+
+def _metricas_patrones_comunes(min_comunes, archivoEntrada, archivoClases,
+                               ruta_patrones, ruta_comunes, ruta_coincidencia):
+    """
+    Shared implementation of patronesComun / patronesComunClas.
+
+    Parameters:
+    - min_comunes: minimum number of shared patterns a protein pair must have.
+    - archivoEntrada: str, Excel file with 'protein_id' and 'protein_sequence'.
+    - archivoClases: str, Excel file with 'protein_id' and 'class_name'.
+    - ruta_patrones: str, CSV to read ('Proteina', 'Patron', 'Posiciones').
+    - ruta_comunes: str, CSV that receives the shared patterns of each pair,
+      grouped by pattern length (longest first).
+    - ruta_coincidencia: str, CSV that receives, per pair, the mean percentage
+      of each sequence covered by the shared patterns.
+
+    Note: rows are written one pair at a time (header on the first row, append
+    afterwards); when no pair qualifies neither output file is touched.
+    """
+    # Imported locally so the function does not depend on the module's import
+    # block, which (as far as it is visible) does not bring these names in.
+    import ast
+    import csv
+
+    # protein_id -> list of class names.
+    class_dict = {}
+    for protein_id, filas in pd.read_excel(archivoClases).groupby('protein_id'):
+        class_dict[protein_id] = [fila['class_name'] for _, fila in filas.iterrows()]
+
+    with open(ruta_patrones, 'r') as file:
+        registros = list(csv.DictReader(file))
+
+    # protein -> set of patterns, and protein -> pattern -> covered residue indices.
+    patrones_por_proteina = {}
+    posiciones_patron = {}
+    for registro in registros:
+        proteina = registro['Proteina']
+        patron = registro['Patron']
+        # 'Posiciones' holds the textual form of a list of start offsets.
+        inicios = [p for p in ast.literal_eval(registro['Posiciones']) if p != '[' and p != ']']
+        patrones_por_proteina.setdefault(proteina, set()).add(patron)
+        # Expand every start offset into the indices the pattern spans; a later
+        # record for the same (protein, pattern) replaces the earlier one.
+        posiciones_patron.setdefault(proteina, {})[patron] = [
+            int(inicio) + offset for inicio in inicios for offset in range(len(patron))
+        ]
+
+    df_p = pd.read_excel(archivoEntrada)
+    secuencias = dict(df_p[['protein_id', 'protein_sequence']].values)
+
+    columnas_comunes = ['Patrones', 'Proteina1', 'Proteina2', "classesProt1", "classesProt2"]
+    columnas_coincidencia = ['proteina1', 'proteina2', '%Coincidencia', "classesProt1", "classesProt2"]
+    primera = True
+    proteinas = list(patrones_por_proteina.items())
+    # Each unordered pair is visited once, in first-seen order.
+    for i, (proteina1, patrones1) in enumerate(proteinas):
+        for proteina2, patrones2 in proteinas[i + 1:]:
+            patrones_comunes = patrones1.intersection(patrones2)
+            if not (len(patrones_comunes) >= min_comunes):
+                continue
+
+            secuencia1 = secuencias[proteina1]
+            secuencia2 = secuencias[proteina2]
+            # Nothing to report without shared patterns or with an empty sequence.
+            if not patrones_comunes or len(secuencia1) == 0 or len(secuencia2) == 0:
+                continue
+
+            por_longitud = {}
+            cubiertas1 = set()
+            cubiertas2 = set()
+            for pattern in patrones_comunes:
+                por_longitud.setdefault(f'Longitud {len(pattern)}', []).append([pattern])
+                cubiertas1.update(posiciones_patron[proteina1][pattern])
+                cubiertas2.update(posiciones_patron[proteina2][pattern])
+            # The numeric part of the key starts after the 9 characters of "Longitud ".
+            patrones_ordenados = dict(
+                sorted(por_longitud.items(), key=lambda kv: int(kv[0][9:]), reverse=True)
+            )
+
+            clases1 = class_dict.get(proteina1, "N/A")
+            clases2 = class_dict.get(proteina2, "N/A")
+            coincidencia = np.mean([len(cubiertas1) / len(secuencia1),
+                                    len(cubiertas2) / len(secuencia2)]) * 100
+
+            modo = 'w' if primera else 'a'
+            pd.DataFrame(
+                [[patrones_ordenados, proteina1, proteina2, clases1, clases2]],
+                columns=columnas_comunes,
+            ).to_csv(ruta_comunes, index=False, header=primera, mode=modo)
+            pd.DataFrame(
+                [[proteina1, proteina2, coincidencia, clases1, clases2]],
+                columns=columnas_coincidencia,
+            ).to_csv(ruta_coincidencia, index=False, header=primera, mode=modo)
+            primera = False
+
+
+def patronesComun(patronesComun,archivoEntrada,ocurrencia,sal,archivoClases):
+    """
+    Computes the common-pattern metrics for the whole dataset.
+
+    Parameters:
+    - patronesComun: minimum number of patterns two proteins must share.
+    - archivoEntrada: str, path to the Excel file with the protein sequences.
+    - ocurrencia: minimum occurrence; its fractional part names the files.
+    - sal: str, suffix appended to the file names.
+    - archivoClases: str, path to the Excel file with the protein classes.
+
+    Reads "resultados/patronesIdenticos<tag><sal>.csv" and writes
+    "resultados/Metrica_patronesComunes<tag><sal>.csv" and
+    "resultados/Metrica_Coincidencia<tag><sal>.csv".
+    """
+    etiqueta = _etiqueta_ocurrencia(ocurrencia) + sal
+    _metricas_patrones_comunes(
+        patronesComun, archivoEntrada, archivoClases,
+        "resultados/patronesIdenticos" + etiqueta + ".csv",
+        'resultados/Metrica_patronesComunes' + etiqueta + '.csv',
+        'resultados/Metrica_Coincidencia' + etiqueta + '.csv',
+    )
+
+
+def patronesComunClas(patronesComun,name,archivoEntrada,ocurrencia,sal,archivoClases):
+    """
+    Computes the common-pattern metrics for a single class folder.
+
+    Parameters:
+    - patronesComun: minimum number of patterns two proteins must share.
+    - name: str, name of the sub-folder of "clases/" to read from and write to.
+    - archivoEntrada: str, path to the Excel file with the protein sequences.
+    - ocurrencia: minimum occurrence; its fractional part names the files.
+    - sal: str, suffix appended to the file names.
+    - archivoClases: str, path to the Excel file with the protein classes.
+
+    Reads "clases/<name>/patronesIdenticos<tag><sal>.csv" and writes
+    "clases/<name>/Metrica_patronesComunes<tag><sal>.csv" and
+    "clases/<name>/Metrica_Coincidencia<tag><sal>.csv".
+    """
+    etiqueta = _etiqueta_ocurrencia(ocurrencia) + sal
+    _metricas_patrones_comunes(
+        patronesComun, archivoEntrada, archivoClases,
+        "clases/" + name + "/patronesIdenticos" + etiqueta + ".csv",
+        'clases/' + name + '/Metrica_patronesComunes' + etiqueta + '.csv',
+        'clases/' + name + '/Metrica_Coincidencia' + etiqueta + '.csv',
+    )
+
+
+def remplazar_sequence_for_ID(output,archivoEntrada):
+    """
+    Replaces protein sequences by protein IDs in the similarity rows and saves
+    them to 'AllProteins_%Similitud.csv'.
+
+    Parameters:
+    - output: list of [sequence1, sequence2, similarity] lists; the inner lists
+      are modified in place.
+    - archivoEntrada: str, path to the Excel file with 'protein_sequence' and
+      'protein_id' columns.
+
+    Note: a row is only translated when both of its sequences are known;
+    otherwise it is written unchanged.
+    """
+    source = pd.read_excel(archivoEntrada)
+
+    # Longest first sequence first; ties are broken alphabetically.
+    ranked = list(output)
+    ranked.sort(key=lambda entry: (-len(entry[0]), entry[0]))
+
+    id_by_sequence = dict(source[['protein_sequence', 'protein_id']].values)
+
+    for entry in ranked:
+        first_seq, second_seq = entry[0], entry[1]
+        if first_seq not in id_by_sequence or second_seq not in id_by_sequence:
+            continue
+        entry[0] = id_by_sequence[first_seq]
+        entry[1] = id_by_sequence[second_seq]
+
+    table = pd.DataFrame(ranked, columns=['Proteina1', 'Proteina2', 'Similaridad'])
+
+    # Persist the translated table.
+    table.to_csv('AllProteins_%Similitud.csv', index=False)
+
--- a/Code Approach 1 and 2 (2.1 - 2.2)/patterns.py
+++ b/Code Approach 1 and 2 (2.1 - 2.2)/patterns.py
+import pandas as pd
+import time
+import ast
+import csv
+import math
+from interfazGrafica import interfaz
+from descarteProteinas import ejecutar,remplazar_ID_for_sequence
+from generate_the_excel import substitute_or_remove_prot_id
+import metricas
+from graficas import grafica
+import os
+import json
+import ast
+import re
+from patrones_similares_aa import remplazar_sequence_for_ID as remplazar_s
+from patrones_similares_aa import buscar_patrones_simAA
+from collections import defaultdict
+
+
+
+def readData(archivoEntrada, enfermedad, archivoTarget):
+    """
+    Loads the protein table from an Excel file, optionally keeps only the rows
+    of one disease, and returns the sequence column with its row count.
+
+    Parameters:
+    - archivoEntrada: str, path to the input Excel file.
+    - enfermedad: str, value of 'disease_id' to keep ('' keeps every row).
+    - archivoTarget: str, path to the target Excel file (accepted but unused).
+
+    Returns:
+    - sequences: pandas Series, the 'protein_sequence' column.
+    - num_filas: int, number of entries in that column.
+
+    Note: the first message is printed unconditionally; no protein is actually
+    discarded by this function.
+    """
+    frame = pd.read_excel(archivoEntrada)
+    print("Se ha realizado el descarte de proteínas")
+
+    if enfermedad != '':
+        disease_mask = frame["disease_id"] == enfermedad
+        frame = frame.loc[disease_mask]
+
+    sequences = frame["protein_sequence"]
+    print(sequences)
+
+    return sequences, sequences.shape[0]
+
+def guardar_patrones_len1(sequences, pattern_freqMin):
+    """
+    Finds the patterns of length 1 (single residues) and their positions in
+    every sequence, keeps those present in at least `min_ocurrence` sequences,
+    and dumps the result to 'prueba2.csv'.
+
+    Parameters:
+    - sequences: iterable of str, protein sequences (repeated sequences count once).
+    - pattern_freqMin: dict, filled in place as {pattern: {sequence: [positions]}}.
+
+    Returns:
+    - pattern_freqMin: dict, the same dictionary that was passed in.
+    - posicionPatterns: dict, {residue: [positions]} of the LAST sequence
+      processed only (empty when there are no sequences).
+    - longitud_max: int, length of the longest sequence (0 when there are none).
+
+    Note: `min_ocurrence` is read from module scope; it is not defined in this
+    function and is presumably assigned by the script entry point.
+    """
+    all_patterns = {}
+    longitud_max = 0
+    # Defined up front so an empty input returns {} instead of raising.
+    posicionPatterns = {}
+    for protein in sequences:
+        longitud_max = max(longitud_max, len(protein))
+
+        # residue -> indices where it occurs in this sequence.
+        posicionPatterns = {}
+        for index, letter in enumerate(protein):
+            posicionPatterns.setdefault(letter, []).append(index)
+        all_patterns[protein] = posicionPatterns
+
+    # Invert the mapping: residue -> sequence -> positions (lists are copied).
+    pattern_proteins = {}
+    for protein, patterns in all_patterns.items():
+        for pattern, positions in patterns.items():
+            pattern_proteins.setdefault(pattern, {}).setdefault(protein, []).extend(positions)
+
+    for pattern, proteins in pattern_proteins.items():
+        if len(proteins) >= min_ocurrence:
+            pattern_freqMin[pattern] = proteins
+
+    df = pd.DataFrame(pattern_freqMin.items(), columns=['pattern', 'proteins'])
+    df.to_csv('prueba2.csv', index=False)
+    return pattern_freqMin, posicionPatterns, longitud_max
+
+def buscar_patrones_identicos(sequences):
+    """
+    Searches for identical patterns of growing length in the protein sequences.
+
+    Starting from the frequent single residues, every frequent pattern of
+    length L-1 is extended by one residue to the right; the extensions present
+    in at least `min_ocurrence` sequences are kept, and the two length L-1
+    sub-patterns they contain (prefix and suffix) are dropped when L > 2.
+    The search stops at the first length that yields no pattern.
+
+    Parameters:
+    - sequences: iterable of str, protein sequences.
+
+    Returns:
+    - pattern_freqMin: dict, {pattern: {sequence: [start positions]}}.
+    - num_patrones: int, number of patterns in that dictionary (0 when no
+      single residue is frequent enough).
+
+    Note: `min_ocurrence` is read from module scope; it is not defined in this
+    function and is presumably assigned by the script entry point.
+    """
+    pattern_freqMin = {}
+    pattern_freqMin, _, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin)
+
+    if not pattern_freqMin:
+        return pattern_freqMin, 0
+
+    for pattern_length in range(2, longitud_max + 1):
+        print(pattern_length)
+        # candidate pattern -> sequence -> start positions
+        auxPos = {}
+        for pattern, proteins in pattern_freqMin.items():
+            if len(pattern) != pattern_length - 1:
+                continue
+            for prot, positions in proteins.items():
+                protein_len = len(prot)
+                for position in positions:
+                    # The extended pattern would run past the end of the sequence.
+                    if protein_len < position + pattern_length:
+                        continue
+                    sub_seq = prot[position:position + pattern_length]
+                    if sub_seq in pattern_freqMin:
+                        continue
+                    # The candidate is only viable if the residue it adds is
+                    # itself frequent.
+                    ultima_letra = sub_seq[-1]
+                    pos_ultima_letra = position + pattern_length - 1
+                    if ultima_letra in pattern_freqMin and pos_ultima_letra in pattern_freqMin[ultima_letra][prot]:
+                        auxPos.setdefault(sub_seq, {}).setdefault(prot, []).append(position)
+
+        # A candidate is only ever produced from its own prefix, so filtering
+        # once per length gives the same survivors as filtering per prefix.
+        auxPos = {p: prots for p, prots in auxPos.items() if len(prots) >= min_ocurrence}
+
+        # No pattern of this length: longer ones cannot exist either.
+        if not auxPos:
+            break
+
+        for pattern, proteins in auxPos.items():
+            pattern_freqMin[pattern] = {prot: list(pos) for prot, pos in proteins.items()}
+            if len(pattern) > 2:
+                pattern_freqMin.pop(pattern[:-1], None)
+                pattern_freqMin.pop(pattern[1:], None)
+
+    num_patrones = len(pattern_freqMin)
+    return pattern_freqMin, num_patrones
+
+def remplazar_sequence_for_ID(pattern_freqMin,archivoEntrada,ocurrencia,Sal,archivoClases=None):
+    """
+    Replaces the protein sequences of the identified patterns by their
+    protein IDs, saves the result to a CSV file, and prints a success message.
+
+    Parameters:
+    - pattern_freqMin: dict, {pattern: {sequence: [positions]}}.
+    - archivoEntrada: str, path to the input Excel file ('protein_sequence',
+      'protein_id').
+    - ocurrencia: float, occurrence parameter; its fractional part names the file.
+    - Sal: str, suffix appended to the file name.
+    - archivoClases (Optional): str, path to the classes Excel file
+      ('protein_id', 'class_name'). Without it every protein gets "N/A".
+
+    Note: patterns of length 1 are not written. The output goes to
+    'resultados/patronesIdenticos<tag><Sal>.csv'.
+    """
+    df_b = pd.read_excel(archivoEntrada)
+
+    # protein_id -> list of class names; stays empty when no class file is given.
+    class_dict = {}
+    if archivoClases is not None:
+        cl = pd.read_excel(archivoClases)
+        for protein_id, filas in cl.groupby('protein_id'):
+            class_dict[protein_id] = [fila['class_name'] for _, fila in filas.iterrows()]
+
+    output = [
+        [patron, proteina, posiciones]
+        for patron, proteinas in pattern_freqMin.items()
+        for proteina, posiciones in proteinas.items()
+        if len(patron) != 1
+    ]
+
+    # Longest pattern first; patterns of equal length in alphabetical order.
+    output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))
+
+    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
+    for item in output_ordered:
+        # Unknown sequences are left as they are.
+        item[1] = proteinas_dict.get(item[1], item[1])
+        item.append(class_dict.get(item[1], "N/A"))
+
+    df_a = pd.DataFrame(output_ordered, columns=['Patron', 'Proteina', 'Posiciones','classesProt'])
+
+    # Save the translated table to a CSV file.
+    df_a.to_csv('resultados/patronesIdenticos'+str(int((float(ocurrencia)%1)*100))+Sal+'.csv', index=False)
+    print("Se ha generado el .csv con los patrones idénticos encontrados")
+def calculate_sequence_length(sequences):
+    """
+    Adds up the lengths of all protein sequences.
+
+    Parameters:
+    - sequences: iterable of sized items (e.g. a pandas Series of strings).
+
+    Returns:
+    - int, sum of the lengths (0 for an empty input).
+    """
+    return sum(len(sequence) for sequence in sequences)
+def group_classes_by_protein(cl):
+    """
+    Collects the class names of every protein.
+
+    Parameters:
+    - cl: pandas DataFrame with 'protein_id' and 'class_name' columns.
+
+    Returns:
+    - dict mapping each protein ID to the list of its class names, in row order.
+    """
+    grouped = {}
+    for protein_id, rows in cl.groupby('protein_id'):
+        names = []
+        for _, record in rows.iterrows():
+            names.append(record['class_name'])
+        grouped[protein_id] = names
+    return grouped
+def compute_pattern_ocurrence(df,sal):
+    """
+    Computes the occurrence of each pattern and saves one CSV row per pattern.
+
+    Parameters:
+    - df: pandas DataFrame with 'Patron' and 'Posiciones' columns, where
+      'Posiciones' holds the textual form of a list of start offsets.
+    - sal: str, suffix appended to the output file name.
+
+    Note: each row holds the pattern ('Patron'), the number of occurrences
+    counted over its proteins ('total_Patrones_por_prot') and the number of
+    rows (proteins) that contain it ('numero_prot'). The output file is
+    'resultados/patronesOcurrencia<tag><sal>.csv', where <tag> comes from the
+    module-level `datosInterfaz["OcurrenciaMin"]`. Nothing is written when
+    `df` has no rows.
+    """
+    ruta = 'resultados/patronesOcurrencia'+str(int((float(datosInterfaz["OcurrenciaMin"])%1)*100))+sal+'.csv'
+    first = True
+    for k, v in df.groupby('Patron'):
+        total = 0
+        numero_prot = 0
+        for _, row in v.iterrows():
+            Poss = [oo for oo in ast.literal_eval(row['Posiciones']) if oo != '[' and oo != ']']
+            numero_prot += 1
+            # NOTE(review): with more than two offsets, an offset is dropped
+            # when it lies at least len(pattern) after the one kept before it,
+            # i.e. the non-overlapping successor is discarded. Kept as it was;
+            # confirm this is the intended counting rule.
+            if len(Poss) > 2:
+                u = 0
+                while u + 1 < len(Poss):
+                    if Poss[u] + len(k) <= Poss[u + 1]:
+                        del Poss[u + 1]
+                    else:
+                        u += 1
+            total += len(Poss)
+
+        fila = pd.DataFrame({
+            'Patron': [str(k)],
+            'total_Patrones_por_prot': [total],
+            'numero_prot': [numero_prot],
+        })
+        # Header and truncation only for the first row; append afterwards.
+        fila.to_csv(ruta, index=False, header=first, mode='w' if first else 'a')
+        first = False
+def compute_proteinas_ocurrencia(df,sal):
+    """
+    Computes the occurrence of patterns inside each protein and saves one CSV
+    row per protein.
+
+    Parameters:
+    - df: pandas DataFrame with 'Proteina', 'Patron' and 'Posiciones' columns,
+      where 'Posiciones' holds the textual form of a list of start offsets.
+    - sal: str, suffix appended to the output file name.
+
+    Note: each row holds the protein ID, its global occurrence and its classes
+    ("N/A" when unknown). Global occurrence is the number of residues covered
+    by patterns divided by the sequence length; residues already covered by a
+    previous pattern of the same protein are not counted again. The function
+    reads `archivoEntrada`, `class_dict` and `datosInterfaz` from module scope
+    and writes 'resultados/proteinasOcurrencia<tag><sal>.csv'. Nothing is
+    written when `df` has no rows.
+    """
+    df_b = pd.read_excel(archivoEntrada)
+    proteinas_dict = dict(df_b[['protein_id','protein_sequence']].values)
+    ruta = 'resultados/proteinasOcurrencia'+str(int((float(datosInterfaz["OcurrenciaMin"])%1)*100))+sal+'.csv'
+    first = True
+    for k, v in df.groupby('Proteina'):
+        seq = proteinas_dict[k]
+        cubiertas = set()
+        glob_ocurrence = 0
+        for _, row in v.iterrows():
+            patron = str(row['Patron'])
+            Posit = [oo for oo in ast.literal_eval(row['Posiciones']) if oo != '[' and oo != ']']
+            # Residue indices spanned by every occurrence of this pattern.
+            Add = {int(i) + kaa for i in Posit for kaa in range(len(patron))}
+            # Count the residues of this pattern minus those already covered.
+            glob_ocurrence += len(Posit) * len(patron) - len(cubiertas & Add)
+            cubiertas |= Add
+
+        fila = pd.DataFrame({
+            'proteinas': [k],
+            'global_ocurrence': [glob_ocurrence / len(seq)],
+            "classesProt": [class_dict[k] if k in class_dict else "N/A"],
+        })
+        # Header and truncation only for the first row; append afterwards.
+        fila.to_csv(ruta, index=False, header=first, mode='w' if first else 'a')
+        first = False
+if __name__ == "__main__":
+    if not os.path.exists("resultados"):
+        # Si no existe, crearla
+        os.makedirs("resultados")
+        print(f"La carpeta resultados se ha creado correctamente.")
+    else:
+        print(f"La carpeta resultados ya existe.")
+
+
+    inicio = time.time()
+    jsonfile=open("param_file.conf","r")
+    datosInterfaz=json.load(jsonfile)
+    #datosInterfaz = interfaz()
+    print(datosInterfaz)
+
+    archivoEntrada = datosInterfaz["NombreArchivoEntrada"]
+    enfermedad = datosInterfaz["CodigoEnfermedad"]
+    archivoTarget = datosInterfaz["NombreArchivoTarget"]
+    similitud = float(datosInterfaz["Similitud"])
+    archivoClases = datosInterfaz["NombreArchivoClases"]
+    archivoAA=datosInterfaz["NombreArchivoAA"]
+    sal=datosInterfaz["ExtensionSalida"]
+    cl=pd.read_excel(archivoClases)
+    #cl=substitute_or_remove_prot_id(cl,"r")
+    #data2=data.copy()
+    cli=cl.groupby('protein_id')
+    class_dict=group_classes_by_protein(cl)
+    #ejecutar(archivoEntrada, enfermedad, similitud)
+    pattern_freqMin = dict()
+    sequences, num_filas = readData(archivoEntrada, enfermedad, archivoTarget)
+    df_b = pd.read_excel(archivoEntrada)
+    #df_b=pd.read_excel("proteinasClase_PC00060.xlsx")
+    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
+    ka=""
+    for item in sequences:
+            ka=proteinas_dict[item]        
+    min_ocurrence = math.floor(num_filas * float(datosInterfaz["OcurrenciaMin"]))
+    seq_len=calculate_sequence_length(sequences)  
+    print(min_ocurrence)    
+    #pattern_freq, num_patrones = buscar_patrones_simAA(sequences,min_ocurrence,archivoAA)
+    #remplazar_s(pattern_freqMin,archivoEntrada,ArchivoAA,float(datosInterfaz["OcurrenciaMin"]),sal)  
+    print(sequences)
+    #pattern_freqMin, num_patrones = buscar_patrones_identicos(sequences,archivoEntrada,archivoAA,float(datosInterfaz["OcurrenciaMin"]))
+    pattern_freqMin, num_patrones = buscar_patrones_identicos(sequences)
+    remplazar_sequence_for_ID(pattern_freqMin,archivoEntrada,float(datosInterfaz["OcurrenciaMin"]),sal,archivoClases)
+   
+    df=pd.read_csv('resultados/patronesIdenticos'+str(int((float(datosInterfaz["OcurrenciaMin"])%1)*100))+sal+'.csv', usecols=['Patron', 'Proteina', 'Posiciones',"classesProt"],index_col=False)
+    
+    df.to_csv('resultados/patronesIdenticos'+str(int((float(datosInterfaz["OcurrenciaMin"])%1)*100))+sal+'.csv', index=False)
+    
+    #dfx=df.copy()
+    compute_pattern_ocurrence(df,sal)
+    compute_proteinas_ocurrencia(df,sal)
+    #metricas.metrica_distanciaProteinas()
+    #grafica(archivo, nombreOutput)
+    
+    print("Se han obtenido los resultados de la métrica para la distancia entre dos proteínas que poseen el mismo patrón")
+
+    metrica = math.floor(num_patrones * float(datosInterfaz["Metrica"]))
+
+    metricas.patronesComun(metrica,archivoEntrada,float(datosInterfaz["OcurrenciaMin"]),sal,archivoClases)
+    
+    
+    #grafica(archivo, nombreOutput)
+    print("Se han obtenido los resultados de la métrica para la distancia entre dos proteínas que poseen mas de un patrón en común")
+    
+    fin = time.time()
+    
+    tiempo_total = fin - inicio
+    print(tiempo_total, "segundos")
--- a/Code Approach 1 and 2 (2.1 - 2.2)/similarityAllProteins.py
+++ b/Code Approach 1 and 2 (2.1 - 2.2)/similarityAllProteins.py
+import pandas as pd
+import Levenshtein
+from minineedle import needle, smith, core
+from descarteProteinas import substitute_or_remove_prot_id 
+from ast import literal_eval
+import blosum as bl
+def readData(archivoEntrada):
+    """
+    Read protein sequences from an Excel file.
+
+    Parameters:
+    - archivoEntrada: Input Excel file path
+    
+    Returns:
+    - List of protein sequences
+
+    This function reads protein sequences from an Excel file specified by 'archivoEntrada' and extracts the
+    'protein_sequence' column from the DataFrame. The sequences are returned as a list.
+    
+    Example:
+    >>> sequences = readData("protein_data.xlsx")
+    >>> print(sequences)
+    ['MTCG...', 'MCTA...', ...]
+    """
+    data = pd.read_excel(archivoEntrada)
+    #data=substitute_or_remove_prot_id(data,'r')
+    sequences = data["protein_sequence"]
+
+    return sequences
+
+def similitudProteinas(sequences):
+    """
+    Calculate pairwise similarity scores between protein sequences using Levenshtein distance.
+
+    Parameters:
+    - sequences: List of protein sequences
+    
+    Returns:
+    - List of lists containing pairwise similarity information:
+        - [protein_sequence_1, protein_sequence_2, similarity_score]
+
+    This function takes a list of protein sequences and calculates pairwise similarity scores
+    between each pair of protein sequences using Levenshtein distance. The results are returned
+    in a list of lists.
+
+    Example:
+    >>> sequences = ["MACG", "MACC", "MGCA"]
+    >>> result = similitudProteinas(sequences)
+    >>> print(result)
+    [['MACG', 'MACC', 75.0],
+     ['MACG', 'MGCA', 50.0],
+     ['MACC', 'MACG', 75.0],
+     ['MACC', 'MGCA', 66.67],
+     ['MGCA', 'MACG', 50.0],
+     ['MGCA', 'MACC', 66.67]]
+    """
+    output = []
+    for row1 in sequences:
+        for row2 in sequences:
+            if row1 != row2:
+                #similarity = abs(smith.SmithWaterman(row1, row2).get_score()-1) / max(len(row1), len(row2))
+                #similarity = abs(needle.NeedlemanWunsch(row1, row2).get_score()-1) / (2*max(len(row1), len(row2)))
+                similarity = abs(Levenshtein.distance(row1, row2)) / max(len(row1), len(row2))
+                output.append([row1, row2, similarity*100])
+    return output
+
+def remplazar_sequence_for_ID(output,archivoEntrada,archivoEntrada2,Sal,mode="default"):
+    """
+    Replace protein sequences with protein IDs using a pre-existing DataFrame.
+
+    Parameters:
+    - output: List of lists containing similarity information
+    - mode: Replacement mode (default or drug)
+    - archivoEntrada: Path to protein information file
+    - Sal: Extension for output file
+
+    This function takes a list of lists containing pairwise similarity information, and replaces
+    protein sequences with their corresponding protein IDs. The replacement is based on the information
+    provided in a pre-existing DataFrame. The updated information is saved to a CSV file.
+
+    Example:
+    >>> data = [['MACG', 'MGCA', 75.0], ['MACC', 'MGCA', 66.67]]
+    >>> inputFile = "protein_data.xlsx"
+    >>> outputExt = "protein"
+    >>> remplazar_sequence_for_ID(data,inputFile,OutputExt, mode="default")
+    """
+    df_b = pd.read_excel(archivoEntrada)
+    df_c= pd.read_excel(archivoEntrada2)
+    common_cols = list(set.intersection(*(set(df_b.columns),set(df_c.columns) )))
+    df_b=pd.concat([df_b[common_cols],df_c[common_cols]], ignore_index=True)
+    #df_b=substitute_or_remove_prot_id(df_b,"r")
+    # Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico
+    #output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))
+    
+    proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
+    if(mode=="drug"):
+       drug_dict=dict(df_b[['protein_sequence','drug_id']].values)
+       for item in output:
+        protein_sequence1 = item[0]
+        protein_sequence2 = item[1]
+        res=[]
+        [res.append(x) for x in literal_eval(drug_dict[item[0]]) if x not in res and ( x != '[' or x != ']') ] 
+        if(len(res) == 1):
+          item.append(res[0])
+        elif(len(res)>1):
+          item.append(res)
+        else:
+          item.append("")    
+        res=[]
+        [res.append(x) for x in literal_eval(drug_dict[item[1]]) if x not in res and ( x != '[' or x != ']')] 
+        if(len(res) == 1):
+          item.append(res[0])
+        elif(len(res)>1):
+          item.append(res)
+        else:
+          item.append("")  
+        if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
+            item[0] = proteinas_dict[protein_sequence1]
+            item[1] = proteinas_dict[protein_sequence2]
+       df_a=pd.DataFrame(output, columns=['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2','similaridadBlosum','drug_id_p1','drug_id_p2'])    
+    else:
+       for item in output:
+        protein_sequence1 = item[0]
+        protein_sequence2 = item[1]
+        if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
+            item[0] = proteinas_dict[protein_sequence1]
+            item[1] = proteinas_dict[protein_sequence2]
+
+
+
+       df_a = pd.DataFrame(output, columns=['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2','similaridadBlosum'])
+
+    # Guardar el DataFrame actualizado en un archivo CSV
+    df_a.to_csv('AllProteins_%Similitud'+Sal+'.csv', index=False)
+def similitudMatProteinas(sequences,sequences2, matrix,matrix2,matriz3,matriz4,equal=False):
+    """
+    Create percentages of pairwise similarity scores between protein sequences based on three similarity matrices.
+
+    Parameters:
+    - sequences: List of protein sequences
+    - matrix: First similarity matrix
+    - matrix2: Second similarity matrix
+    - matriz3: Third similarity matrix
+
+    Returns:
+    - List of lists containing pairwise similarity information:
+        - [protein_sequence_1, protein_sequence_2, similarity_score_matrix1, similarity_score_matrix2, similarity_score_matrix3]
+
+    This function takes a list of protein sequences and three similarity matrices and calculates pairwise similarity scores
+    between each pair of protein sequences. The similarity scores are computed using the provided matrices, and the results
+    are returned in a list of lists.
+
+    Note: The function assumes that the matrices are square matrices with dimensions matching the length of the 'sequences' list.
+
+    Example:
+    >>> sequences = ["MACG", "MACC", "MGCA"]
+    >>> matrix1 = [[1.0, 0.8, 0.6], [0.8, 1.0, 0.7], [0.6, 0.7, 1.0]]
+    >>> matrix2 = [[0.9, 0.7, 0.5], [0.7, 0.9, 0.6], [0.5, 0.6, 0.9]]
+    >>> matrix3 = [[0.8, 0.6, 0.4], [0.6, 0.8, 0.5], [0.4, 0.5, 0.8]]
+    >>> result = similitudMatProteinas(sequences, matrix1, matrix2, matrix3)
+    >>> print(result)
+    [['MACG', 'MACC', 80.0, 70.0, 60.0],
+     ['MACG', 'MGCA', 60.0, 50.0, 40.0],
+     ['MACC', 'MACG', 80.0, 70.0, 60.0],
+     ['MACC', 'MGCA', 70.0, 60.0, 50.0],
+     ['MGCA', 'MACG', 60.0, 50.0, 40.0],
+     ['MGCA', 'MACC', 70.0, 60.0, 50.0]]
+    """
+    output = []
+    for row1 in range(0,len(sequences2)):
+        for row2 in range(0,len(sequences)):
+           if equal:
+             if row1 != row2:
+                #similarity = abs(smith.SmithWaterman(row1, row2).get_score()-1) / max(len(row1), len(row2))
+                #similarity = abs(needle.NeedlemanWunsch(row1, row2).get_score()-1) / (2*max(len(row1), len(row2)))
+                output.append([sequences[row2], sequences2[row1], matrix[row1][row2]*100,matrix2[row1][row2]*100,matriz3[row1][row2]*100,matriz4[row1][row2]*100])
+           else:
+                output.append([sequences[row2], sequences2[row1], matrix[row1][row2]*100,matrix2[row1][row2]*100,matriz3[row1][row2]*100,matriz4[row1][row2]*100])     
+    return output
+if __name__ == "__main__":
+    archivoEntrada = "Data/data_lung_cancer_treatment.xlsx"
+    sequences1 = readData(archivoEntrada)
+    archivoEntrada2 = "Data/data_autoimmume_desease.xlsx"
+    sequences2 = readData(archivoEntrada2)
+    matrix=pd.read_csv('matrizNWAutoimmuneDiseaseC.csv',header=None,index_col=False)*3+1.0
+    matrix.abs()
+    matrix/=4
+    print(matrix.shape)
+    matrix2=pd.read_csv('matrizNWAutoimmuneDiseaseMod1.csv',header=None,index_col=False)*3+1.0
+    matrix2.abs()
+    matrix2/=4
+    print(matrix2.shape)
+    matrix3=pd.read_csv('matrizNWAutoimmuneDiseaseMod2.csv',header=None,index_col=False)*3+1.0
+    matrix3.abs()
+    matrix3/=4
+    print(matrix3.shape)
+    matrix4=pd.read_csv('matrizNWAutoimmuneDiseaseBlosum62.csv',header=None,index_col=False)
+    dic= bl.BLOSUM(62)
+    print(dic)
+    mismatch=0
+    match=1
+    minn=min(min(min(min(list(row.values())) for row in list(dic.values())),-4),mismatch)
+    print(matrix4.shape)
+    print(len(sequences1))
+    for row1 in range(0,len(sequences2)):
+        for row2 in range(0,len(sequences1)):
+            len_sec1=0
+            len_min_sec1=0
+            dic_seq=set()
+            minf_letters='a'
+            for i in sequences1[row2]:
+               dic_seq.add(dic[i][i])
+               minf_letters= i if(dic[i][i] == float('-inf')) else minf_letters
+               
+               len_sec1+=dic[i][i] if(dic[i][i] != float('-inf')) else match
+               len_min_sec1+=min(list(dic[i].values())) if(dic[i][i] != float('-inf')) else mismatch
+            len_sec2=0
+            len_min_sec2=0   
+            for i in sequences2[row1]:
+               dic_seq.add(dic[i][i])
+               minf_letters= i if(dic[i][i] == float('-inf')) else minf_letters
+               len_sec2+=dic[i][i] if(dic[i][i] != float('-inf')) else match
+               len_min_sec2+=min(list(dic[i].values())) if(dic[i][i] != float('-inf')) else mismatch
+            if(max(len_sec2,len_sec1) == float('-inf')):   
+               print(max(len_sec2,len_sec1))
+               print(dic_seq)
+               print(minf_letters)   
+            matrix4[row1][row2]*=max(len_sec2,len_sec1)
+            matrix4[row1][row2]-=(minn*max(len(sequences1[row2]),len(sequences2[row1])))
+            matrix4[row1][row2]/= (max(len_sec2,len_sec1)-minn*max(len(sequences1[row2]),len(sequences2[row1])))
+    print(matrix[0][0])
+    print(matrix2[0][0])
+    print(matrix3[0][0])
+    print(matrix4[0][0])          
+    #output = similitudProteinas(sequences)
+    output=similitudMatProteinas(sequences1,sequences2, matrix,matrix2,matrix3,matrix4,equal=False)
+    print("Generada la tabla de con las matrices de similaridad especificadas")
+     
+    remplazar_sequence_for_ID(output,archivoEntrada,archivoEntrada2,"AutoimmuneDisease")
--- a/Code Approach 1 and 2 (2.1 - 2.2)/summary.py
+++ b/Code Approach 1 and 2 (2.1 - 2.2)/summary.py
+import pandas as pd
+import time
+import numpy as np
+import re
+from ast import literal_eval
+from find_patterns import substitute_or_remove_prot_id
+def readData(archivoEntrada, enfermedad,patrones_file,Sal):
+    """
+    Reads data from an Excel file, filters it based on the disease, and performs additional processing.
+
+    Parameters:
+    - archivo_entrada (str): Path to the Excel file.
+    - enfermedad (str): Disease ID for filtering.
+    - patrones_file (str): Path to the file containing patterns.
+    - Sal: Output file extension
+    Returns:
+    - data (pd.DataFrame): Processed DataFrame based on the given parameters.
+    """
+    data = pd.read_excel(archivoEntrada)
+
+    if enfermedad:
+        data = data.loc[data["disease_id"] == enfermedad]
+
+    dataB = pd.read_csv(patrones_file)
+
+    print(len(data))
+    filt_data = len(data)
+    alz_filt_data = len(dataB)
+    print("Proteins discarded after the main filter: " + str(filt_data - len(data)))
+    print("Proteins discarded after the common Alzheimer's filter: " + str(alz_filt_data - len(dataB)))
+
+    dataC = {}
+    dataz={}
+    daa = dataB["Patron"].unique()
+    das={}
+    pos={}
+    deas={}
+    for u in daa:
+        if len(u) > 3:
+          kk=data.protein_sequence.str.contains(u)
+          das[u] = data[kk]["protein_id"].to_list()
+          pos[u]= data[kk]['protein_sequence'].str.find(u).to_list()
+          deas[u]=data[kk]['disease_id'].to_list()
+          print(len(pos[u]))
+          print(len(das[u]))
+          dataC[u]=[[[das[u][ii],pos[u][ii]],deas[u][ii]] for ii in range(0,len(das[u]))]
+          res = []
+          for row in dataC[u]:
+              matching_sublist = next((sublist for sublist in res if sublist[0] == row[0]), None)
+
+              if matching_sublist is not None:
+            # If a matching sublist is found, append only non-matching elements to it
+                  matching_sublist[1].append(row[1])
+              else:
+            # If no matching sublist is found, create a new sublist with only non-matching elements
+                 res.append([row[0],row[1:]])
+          dataC[u]=[sublist[0] for sublist in res]
+          dataz[u]=[sublist[1] for sublist in res]
+    dataG = pd.DataFrame({"pattern": dataC.keys(),"proteins":dataC.values(),"desease_id":dataz.values()})
+    dataG.to_excel("ProtByPattern"+Sal+".xlsx")
+
+    sequences = data["protein_sequence"]
+    return data
+def add_protein_info_to_data(main_data_path, patterns_info_path, protein_names_path):
+    """
+    Add protein names and protein information from the original pattern file and the names Dataset to a DataFrame based on matching patterns.
+
+    Parameters:
+    - main_data_path (str): The path to the Excel file containing the main data.
+    - patterns_info_path (str): The path to the CSV file containing patterns and protein information.
+    - protein_names_path (str): The path to the CSV file containing protein names.
+
+    Returns:
+    None: The function updates the provided Excel file with additional protein information.
+
+    Example:
+    ```python
+    add_protein_info_to_data("main_data.xlsx", "patterns_info.csv", "protein_names.csv")
+    ```
+
+    Note:
+    - The function assumes that the provided Excel file ('main_data_path') contains a 'pattern' column.
+    - The 'patterns_info_path' CSV file is expected to have columns 'Patron', 'Proteina', and 'Posiciones'.
+    - The 'protein_names_path' CSV file is expected to have columns 'Entry' and 'Entry_Name'.
+    """
+
+    # Read data from files
+    main_data = pd.read_excel(main_data_path)
+    patterns_info = pd.read_csv(patterns_info_path)
+    protein_names = pd.read_csv(protein_names_path)
+
+    # Group patterns in 'patterns_info' DataFrame
+    patterns_grouped = patterns_info.groupby("Patron")
+
+    # Initialize columns in 'main_data' DataFrame
+    main_data["protein_names"] = ""
+    main_data["proteins_treat"] = "{}"
+    main_data["names_Treat"]=""
+    for pattern, group_data in patterns_grouped:
+    # Iterate over patterns in 'patterns_info'
+       for index, row in group_data.iterrows():
+         protein_id = row["Proteina"]
+         positions = row["Posiciones"]
+
+         # Find matching rows in 'main_data' DataFrame
+         matching_rows = main_data[main_data["pattern"] == pattern]
+
+         # Initialize or get the current 'proteins_treat' list
+         current_proteins_treat = {}
+
+         # Update 'proteins_treat' field for each matching row
+         for matching_index, matching_row in matching_rows.iterrows():
+            current_proteins_treat = literal_eval(matching_row["proteins_treat"]) if pd.notna(matching_row["proteins_treat"]) or matching_row["proteins_treat"] != "[]" else {}
+            current_proteins_treat.update({protein_id: literal_eval(positions)})
+            main_data.at[matching_index, "proteins_treat"] = str(current_proteins_treat)
+            matching_rows.at[matching_index, "proteins_treat"] = str(current_proteins_treat)
+       print(matching_rows["proteins_treat"].apply(
+            lambda lst: [protein_idee for protein_idee, _ in literal_eval(lst).items()]))
+       main_data.loc[main_data["pattern"] == pattern, "names_Treat"] = matching_rows["proteins_treat"].apply(
+            lambda lst: [protein_names[protein_names["Entry"] == protein_idee]["Entry_Name"].to_list() if protein_names[protein_names["Entry"] == protein_idee]["Entry_Name"].to_list() != [] else ["N/A"] for protein_idee, _ in literal_eval(lst).items()]
+        )
+       main_data.loc[main_data["pattern"] == pattern, "protein_names"] = matching_rows["proteins"].apply(
+            lambda lst: [protein_names[protein_names["Entry"] == protein_idee]["Entry_Name"].to_list() if protein_names[protein_names["Entry"] == protein_idee]["Entry_Name"].to_list() != [] else ["N/A"] for protein_idee, _ in literal_eval(lst)]
+        )
+    # Save the updated data
+    main_data_base_name = main_data_path.split(".")[0]
+    main_data.to_excel(f"{main_data_base_name}_summary.xlsx", index=False) 
+
+
+        
+def add_entry_name(archivoEntrada,protein_name_file,archNom):
+     """
+     Adds entry names to the DataFrame based on an additional CSV file and performs additional processing.
+
+     Parameters:
+     - archivo_entrada (str): Path to the Excel file.
+     - protein_name_file (str): Path to the protein name CSV file.
+     - archNom (str): Path to the id sustitution file
+     
+     Returns:
+     - None
+     """
+     data = pd.read_excel(archivoEntrada)
+     dataB = pd.read_csv(protein_name_file, usecols=['Entry', "Entry_Name", "Protein_names", "Length"])
+     dataB = substitute_or_remove_prot_id(dataB, archNom, "na")
+     print("PASA")
+     dataB = dataB.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])
+     datas = dataB[dataB["Entry"].isin(data["protein_id"])]
+     datas.to_csv(archivoEntrada + "_nombre.csv")
+     doo = data[~(data["protein_id"].isin(dataB["Entry"]))]
+     doo.to_csv("Proteinas_sin_nombre")
+     #data.assign(lenght=datas["Length"].to_list())
+     #data.assign(name=datas["Protein names"].to_list())
+     #data.to_csv(archivoEntrada+"_nombre.csv")    
+if __name__=="__main__":
+       #data=add_entry_name("Data/data_cancers_desease.xlsx","Data/protein_name.csv","Data/nombres_sust.txt")
+       #data=pd.read_excel("Data/data_lung_cancer_desease.xlsx")
+       #dd=pd.read_excel("Data/data_lung_cancer_treatment.xlsx")
+       #dds=pd.concat([data,dd])
+       #dds.to_excel("Data/data_lung_cancer_desease_full.xlsx")
+       data=readData("Data/data_immune_desease.xlsx","","patronesIdenticos10Treat.csv","Immun01")
+       add_protein_info_to_data("ProtByPatternImmun01.xlsx","patronesIdenticos10Treat.csv","Data/protein_name.csv")
+            
--- a/Code statistical methods/Analysis of similarities - patterns significance - Simi_AA.ipynb
+++ b/Code statistical methods/Analysis of similarities - patterns significance - Simi_AA.ipynb
--- a/Code statistical methods/Analysis of similarities - patterns significance - Simi_AA2.ipynb
+++ b/Code statistical methods/Analysis of similarities - patterns significance - Simi_AA2.ipynb
--- a/Code statistical methods/Analysis of similarities - patterns significance - Simi_BLOSUM.ipynb
+++ b/Code statistical methods/Analysis of similarities - patterns significance - Simi_BLOSUM.ipynb
--- a/Code statistical methods/Analysis of similarities - patterns significance _ DR .ipynb
+++ b/Code statistical methods/Analysis of similarities - patterns significance _ DR .ipynb
--- a/Code statistical methods/Pattern found - Sankey plots.ipynb
+++ b/Code statistical methods/Pattern found - Sankey plots.ipynb
--- a/Input/autoimmune_protein_ids.xlsx
+++ b/Input/autoimmune_protein_ids.xlsx
--- a/Input/cancers_proteins_ids.xlsx
+++ b/Input/cancers_proteins_ids.xlsx
--- a/Input/data_cancers_disease.xlsx
+++ b/Input/data_cancers_disease.xlsx
--- a/Input/data_rare_disease.xlsx
+++ b/Input/data_rare_disease.xlsx
--- a/Input/protein_lung_cancer_C0007131.csv
+++ b/Input/protein_lung_cancer_C0007131.csv
--- a/Input/treatment_lung_cancer.xlsx
+++ b/Input/treatment_lung_cancer.xlsx