Added comments to metricas.py

60a7d13a · Rafael Artinano · da384ed5 · 60a7d13a
Commit 60a7d13a authored Jan 23, 2024 by Rafael Artinano
Hide whitespace changes
Inline Side-by-side

Showing with 49 additions and 2 deletions

TFM-main/src/metricas.py TFM-main/src/metricas.py +49 -2

No files found.
--- a/TFM-main/src/metricas.py
+++ b/TFM-main/src/metricas.py
@@ -8,6 +8,15 @@ from descarteProteinas import substitute_or_remove_prot_id
 from generate_the_excel import substitute_or_remove_prot_id
    
 def similitudProteinas(sequences):
+    """
+    Calculates the similarity between protein sequences using the Needleman-Wunsch algorithm.
+
+    Parameters:
+    - sequences (list): List of protein sequences.
+
+    Returns:
+    - list: List of lists containing pairs of protein sequences and their calculated similarity.
+    """
    output = []

    for row1 in sequences:
@@ -22,6 +31,11 @@ def similitudProteinas(sequences):
    return output

 def metrica_distanciaProteinas():
+    """
+    Calculates a metric for the distance between proteins based on preprocessed data.
+
+    Reads CSV files, creates a dictionary of similarities, and performs calculations to generate a new CSV file.
+    """
    # Leer los archivos CSV
    data = pd.read_csv("resultados/patronesIdenticos.csv")
    df_b = pd.read_csv("AllProteins_%Similitud.csv")
@@ -54,7 +68,18 @@ def metrica_distanciaProteinas():
              index=False)

 def patronesComun(patronesComun,archivoEntrada,ocurrencia,sal,archivoClases):
-    
+    """
+    Computes common patterns between proteins and generates metrics based on pattern occurrences.
+
+    Parameters:
+    - patronesComun (int): Minimum number of common patterns required.
+    - archivoEntrada (str): Path to the input CSV file.
+    - ocurrencia (float): Occurrence value.
+    - sal (str): Output file suffix.
+    - archivoClases (str): Path to the input Excel file containing protein classes.
+
+    Reads data, extracts unique patterns, finds common patterns, and generates metrics and CSV files.
+    """
    # Leer el archivo CSV y cargar los datos en una lista de diccionarios
    registros = []
    cl=pd.read_excel(archivoClases)
@@ -214,7 +239,19 @@ def patronesComun(patronesComun,archivoEntrada,ocurrencia,sal,archivoClases):


 def patronesComunClas(patronesComun,name,archivoEntrada,ocurrencia,sal,archivoClases):
-    
+    """
+    Computes common patterns between proteins within a class and generates class-specific metrics.
+
+    Parameters:
+    - patronesComun (int): Minimum number of common patterns required.
+    - name (str): Name of the class for output file organization.
+    - archivoEntrada (str): Path to the input CSV file.
+    - ocurrencia (float): Occurrence value.
+    - sal (str): Output file suffix.
+    - archivoClases (str): Path to the input Excel file containing protein classes.
+
+    Reads data, extracts unique patterns, finds common patterns within a class, generates metrics on the number of patterns and on whether they match the whole sequence, and saves them to CSV files.
+    """
    # Leer el archivo CSV y cargar los datos en una lista de diccionarios
    registros = []
    cl=pd.read_excel(archivoClases)
@@ -373,6 +410,16 @@ def patronesComunClas(patronesComun,name,archivoEntrada,ocurrencia,sal,archivoCl


 def remplazar_sequence_for_ID(output,archivoEntrada):
+    """
+    Replaces protein sequences with corresponding protein IDs in the output based on a reference CSV file.
+
+    Parameters:
+    - output (list): List of lists containing protein pairs and their similarity.
+    - archivoEntrada (str): Path to the input Excel file containing protein information.
+
+    Reads the reference Excel file, sorts the output, and replaces sequences with protein IDs.
+    Updates the output CSV file.
+    """
    df_b = pd.read_excel(archivoEntrada)
    #df_b = pd.read_excel("proteinasClase_PC00060.xlsx")
    #df_b=substitute_or_remove_prot_id(df_b,"r")