# patrones_similares.py
# Author (from repository history): Rafael Artinano
import time
import Levenshtein
import math

def patrones_similares(pattern_freqMin, tolerance=0.1):
    """
    Merge the occurrences of patterns that are similar by Levenshtein distance.

    Every unordered pair of keys is compared once. Two patterns are treated as
    similar when their edit distance is at most ``ceil(tolerance * L)``, where
    ``L`` is the length of the longer pattern, so with the default tolerance a
    pair of short patterns may differ by one insertion, deletion or
    substitution. For each similar pair, the occurrences of each pattern are
    merged into the other one.

    NOTE: merging is done in place on the dictionaries stored in
    ``pattern_freqMin``. Later comparisons therefore see the already merged
    data, so occurrences acquired from an earlier pair are carried along into
    later merges.

    Parameters:
    - pattern_freqMin: dict, maps each pattern (str) to a dict that maps an
      identifier to a list of positions. It is modified in place.
    - tolerance: float, fraction of the longer pattern's length that may be
      edited for two patterns to count as similar. Defaults to 0.1.

    Returns:
    - pattern_freqMin: dict, the same dictionary object that was passed in,
      with the occurrences of similar patterns merged into each other.
    """
    # Snapshot the keys: the loops below mutate the inner dictionaries.
    patterns = list(pattern_freqMin)

    for i, pattern1 in enumerate(patterns):
        proteins1 = pattern_freqMin[pattern1]
        len_pattern1 = len(pattern1)

        for pattern2 in patterns[i + 1:]:
            proteins2 = pattern_freqMin[pattern2]

            # Comparing the raw distance against the allowed number of edit
            # operations is equivalent to comparing both values normalised by
            # the longer length, without the intermediate float divisions.
            max_length = max(len_pattern1, len(pattern2))
            max_operations = math.ceil(tolerance * max_length)

            if Levenshtein.distance(pattern1, pattern2) > max_operations:
                continue

            # Order matters: pattern2 first receives pattern1's occurrences,
            # then pattern1 receives the (already extended) ones of pattern2.
            _merge_positions(proteins2, proteins1)
            _merge_positions(proteins1, proteins2)

    # Kept for script usage: the result is echoed to stdout.
    print(pattern_freqMin)
    return pattern_freqMin


def _merge_positions(target, source):
    """Merge the identifier -> positions mapping ``source`` into ``target`` in place."""
    for protein, positions in source.items():
        if protein not in target:
            # Store a copy so the two patterns never share one list object.
            target[protein] = list(positions) if positions else []
        elif positions:
            merged = target[protein]
            for position in positions:
                if position not in merged:
                    merged.append(position)
            # A single sort after the merge yields the same final order as
            # re-sorting after every appended position.
            merged.sort()


if __name__ == "__main__":
    # Small manual run: 'Al' and 'AF' differ by a single substitution, so the
    # call below exercises the merge path. perf_counter() is used because it is
    # a monotonic clock intended for measuring elapsed time.
    inicio = time.perf_counter()

    lista_similares = {'Al': {"ALFAS": [0], 'AFNERALDAL': [5, 7]}, 'AF': {"JDLKAJSDL": [8, 9, 10], "JLADA": [3, 7], "ALFAS": [6]}}
    patrones_similares(lista_similares)
    fin = time.perf_counter()

    tiempo_total = fin - inicio
    print(tiempo_total, "segundos")