# Setting up LLM model

In [None]:
from openai import OpenAI
import pandas as pd
import json
import os
import re
import tiktoken

# Model used for every request made in this notebook.
# CHANGE THIS TO USE A DIFFERENT MODEL, for example "gpt-4o"
model_name = "gpt-4o-mini"

# Client for the OpenAI API; without arguments the key defaults to os.environ.get("OPENAI_API_KEY")
openAI_client = OpenAI()

print(f"Using {model_name} model")

def get_response_LLM(prompt):
    """
    Send a list of chat messages to the configured model and return the reply text.

    `prompt` is passed unchanged as the `messages` argument of the chat completion
    request. The request asks for a JSON-object response with temperature 0, and the
    content of the first returned choice is handed back to the caller.
    """
    request_settings = {
        "model": model_name,
        "response_format": {"type": "json_object"},
        "messages": prompt,
        "temperature": 0,
    }
    response = openAI_client.chat.completions.create(**request_settings)
    first_choice = response.choices[0]
    return first_choice.message.content

# Input folders with the disease texts
path_to_texts = r"[TFM - MUCD] Textos de las enfermedades\100 Clean Disease Texts" # Texts after cleaning, these are the ones actually used
path_to_texts_old = r"[TFM - MUCD] Textos de las enfermedades\100 Texts LLM TFM" # Original texts, before cleaning

# Output folders for the results of the prompts
path_to_general_results = r"Resultados de Prompts"
failed_texts_path_to_store = r"Resultados de Prompts\Failed Texts from each Prompt"

# When True, the simplified entity dictionary is used starting from Prompt #2
useSimplifiedEntityDescription = False

# Auxiliary function to transform ChatGPT output into JSON using the first key
def transform_GPT_output(answer_ChatGPT):
    """
    Convert the JSON string returned by the model into a DataFrame.

    The answer is parsed as JSON and only the value stored under its first key is
    used to build the DataFrame; any further keys are ignored.

    Returns an empty DataFrame when the parsed object has no keys at all or when
    the value under the first key is empty.

    Raises json.JSONDecodeError when the answer is not valid JSON.
    """
    answer_ChatGPT_json = json.loads(answer_ChatGPT)
    # An empty JSON object has no first key: treat it like an empty result
    # instead of failing with an IndexError.
    if not answer_ChatGPT_json:
        return pd.DataFrame()
    first_key = next(iter(answer_ChatGPT_json))
    filtered_json = answer_ChatGPT_json[first_key]
    # If the values of the first key are empty, return a blank DataFrame
    if not filtered_json:
        return pd.DataFrame()
    return pd.DataFrame(filtered_json)

# Auxiliary function to rename files in a folder that have ÔÇô for a –
def fix_file_list(path: str):
    """
    Rename every file in the folder `path` whose name contains the garbled sequence ÔÇô,
    replacing each occurrence of that sequence with an en dash (–).
    It was only used once to fix the names of the files in the folder with the texts of the diseases.
    """
    garbled, en_dash = "ÔÇô", "–"
    for entry in os.listdir(path):
        # Names without the garbled sequence are left untouched
        if garbled not in entry:
            continue
        source = os.path.join(path, entry)
        target = os.path.join(path, entry.replace(garbled, en_dash))
        os.rename(source, target)

def save_prompt(prompt_name, prompt_message, prompt_description):
    """
    Save a prompt message as a txt file and update the Prompt Summary Excel file with it.

    prompt_name: name of the txt file (without extension) and value of the "Prompt" column.
    prompt_message: full text of the prompt, written to the txt file and to "Prompt Message".
    prompt_description: text stored in the "Prompt Description" column.

    If the summary already has a row with the same "Prompt" value, only the new row is kept.
    After updating the summary, create_full_prompt_record_production() is called.
    """
    path_to_save_prompt = r"Prompt Engineering\List of Prompts"
    # The file is written as UTF-8 explicitly: create_full_prompt_record_production reads these
    # files back with encoding='utf-8', and the prompts contain non-ASCII characters (curly
    # quotes) that the platform default encoding may store differently.
    with open(os.path.join(path_to_save_prompt, prompt_name + ".txt"), "w", encoding="utf-8") as file:
        file.write(prompt_message)
    # Transform the prompt message and name into a DataFrame
    prompt_df = pd.DataFrame({"Prompt": [prompt_name], "Prompt Message": [prompt_message], "Prompt Description": [prompt_description]})
    prompt_summary_path = r"Prompt Engineering\List of Prompts\Prompt_Summary.xlsx"
    if os.path.exists(prompt_summary_path):
        # Add the new prompt to the existing summary
        prompt_summary_df = pd.concat([pd.read_excel(prompt_summary_path), prompt_df], ignore_index=True)
    else:
        # No summary yet: the new prompt is the whole summary
        prompt_summary_df = prompt_df
    # Remove the duplicates as per column "Prompt", keeping the most recent entry
    prompt_summary_df = prompt_summary_df.drop_duplicates(subset=["Prompt"], keep="last")
    # Save the DataFrame to the Excel file
    prompt_summary_df.to_excel(prompt_summary_path, index=False)
    create_full_prompt_record_production()

def filter_prompt6_removeOther(path_to_prompt_results):
    """
    Filter the results of Prompt 6, removing the entities classified as "Other" in the TUI Code.
    They were classified as "Other" because they were not in the list of TUI Codes used in the prompt.

    For every .xlsx file in `path_to_prompt_results`, the rows of the "PartB_Classification" sheet
    whose "TUI_Code" equals "Other" are dropped and the "PartA_Extraction" sheet is copied unchanged.
    The new files are saved next to the original folder, in a folder with the suffix "_OtherRemoved",
    and each new file name also carries that suffix.
    """
    abbreviation = "_OtherRemoved"
    # Normalise first so that a trailing path separator does not yield an empty folder name
    # (which would place the output folder inside the folder being read).
    normalized_path = os.path.normpath(path_to_prompt_results)
    new_folder_name = os.path.basename(normalized_path) + abbreviation
    new_path_to_prompt_results = os.path.join(os.path.dirname(normalized_path), new_folder_name)
    # Create the new folder if it does not exist
    os.makedirs(new_path_to_prompt_results, exist_ok=True)
    # Loop through the files in the folder
    for file in os.listdir(path_to_prompt_results):
        if not file.endswith(".xlsx"):
            continue
        # splitext only strips the extension, so dots inside the name are preserved
        file_name = os.path.splitext(file)[0]
        new_file_path = os.path.join(new_path_to_prompt_results, file_name + abbreviation + ".xlsx")
        # Read both sheets in a single pass over the workbook
        sheets = pd.read_excel(
            os.path.join(path_to_prompt_results, file),
            sheet_name=["PartB_Classification", "PartA_Extraction"],
        )
        df_PartB = sheets["PartB_Classification"]
        df_PartA = sheets["PartA_Extraction"]
        df_PartB = df_PartB[df_PartB["TUI_Code"] != "Other"]
        # Write both sheets to the new workbook in one go (Part B first, then Part A)
        with pd.ExcelWriter(new_file_path, engine='openpyxl') as writer:
            df_PartB.to_excel(writer, sheet_name="PartB_Classification", index=False)
            df_PartA.to_excel(writer, sheet_name="PartA_Extraction", index=False)

def count_tokens_in_text(text: str, model = 'gpt-3.5-turbo'):
    """
    Return how many tokens `text` is split into by the tokenizer of `model`.
    The default model is GPT-3.5-turbo. Use 'gpt-4o' for Chat GPT-4o or any other model.
    The tokenizer is obtained from the tiktoken library.
    """
    # Look up the encoding for the model, encode the text and count the resulting tokens
    return len(tiktoken.encoding_for_model(model).encode(text))

def transform_df_into_json(df):
    """
    Serialize a DataFrame into a JSON string with one object per row
    """
    records_json = df.to_json(orient="records")
    return records_json

def generate_summary_failed_texts(path_to_model_results: str):
    """
    Gather every "Failed texts" workbook of a results folder into a single summary.

    The first sheet of each file whose name contains "Failed texts" is read and tagged
    with a "Prompt" value parsed from the file name (from "Prompt " up to the extension).
    The accumulated rows are saved as "Failed Texts Summary.xlsx" in the same folder
    and also returned.
    """
    prompt_pattern = r'Prompt .*?(?=\.\w+$)'
    summary = pd.DataFrame()
    for entry in os.listdir(path_to_model_results):
        # Only the files with "Failed texts" in the name are of interest
        if "Failed texts" not in entry:
            continue
        failed = pd.read_excel(os.path.join(path_to_model_results, entry), sheet_name=0)
        match = re.search(prompt_pattern, entry)
        failed["Prompt"] = match.group() if match else "Error in parsing prompt_name"
        # Move the last column (where a newly added "Prompt" column lands) to the first position
        ordered = failed.columns.tolist()
        failed = failed[ordered[-1:] + ordered[:-1]]
        summary = pd.concat([summary, failed])
    # Save the summary of the failed texts
    summary.to_excel(os.path.join(path_to_model_results, "Failed Texts Summary.xlsx"), index=False)
    return summary

def create_full_prompt_record_production():
    """
    Write, for every saved prompt, one txt file per disease text containing the prompt
    followed by that text.

    Prompts are the .txt files in the list-of-prompts folder; the texts are all the files
    in `path_to_texts`. The output goes to one sub-folder per prompt inside the detailed
    prompt production folder, with files named "<condition>_<prompt>.txt".
    """
    path_to_prompts = r"Prompt Engineering\List of Prompts"
    path_detailed_prompt_production_sink = r"Prompt Engineering\Detailed Prompt Production"

    for prompt_file in os.listdir(path_to_prompts):
        if not prompt_file.endswith(".txt"):
            continue
        with open(os.path.join(path_to_prompts, prompt_file), 'r', encoding='utf-8') as prompt_handle:
            prompt_text = prompt_handle.read()
        prompt_name = prompt_file.split(".")[0]

        # Create the folder with the prompt name in the production sink path if it does not exist
        sink_folder = os.path.join(path_detailed_prompt_production_sink, prompt_name)
        if not os.path.exists(sink_folder):
            os.makedirs(sink_folder)

        for condition in os.listdir(path_to_texts):
            condition_name = condition.split(".")[0]
            with open(os.path.join(path_to_texts, condition), 'r', encoding='utf-8') as text_handle:
                text = text_handle.read()
            full_prompt = prompt_text + "\n\n" + "text to analyze: \n\n" + text
            # Save the full_prompt in the production sink folder
            target_file = os.path.join(sink_folder, condition_name + "_" + prompt_name + ".txt")
            with open(target_file, 'w', encoding='utf-8') as sink_handle:
                sink_handle.write(full_prompt)

Using gpt-4o-mini model


# Processing Texts, cleaning them (only do once)

In [90]:
# def clean_text(text):
#     # Remove [number] or [number, number] citations
#     text = re.sub(r'\[\d+(,\s*\d+)*\]', '', text)
    
#     # Replace all occurrences of '&/or' with 'and/or'
#     text = re.sub(r'&/or', 'and/or', text)
    
#     # Remove incomplete bracketed numbers like [11 or similar cases
#     text = re.sub(r'\[\d+', '', text)
    
#     # Remove non-numeric bracketed terms like [citation needed], [edit], ?, !, etc.
#     text = re.sub(r'\[[a-zA-Z\s?.,!]+\]', '', text)
    
#     # Add spaces around the & symbol if missing
#     text = re.sub(r'\s*&\s*', ' & ', text)
    
#     # Remove any leftover unclosed brackets (e.g., [ or ] without a pair)
#     text = text.replace('[', '').replace(']', '')
    
#     # Add a period after each paragraph if missing
#     text = re.sub(r'([^\.\n])(\n)', r'\1.\2', text)

#     # Ensure consistent spacing after periods and commas
#     text = re.sub(r'([.,;:!?])([^\s])', r'\1 \2', text)

#     # Replace multiple spaces with a single space within paragraphs
#     text = re.sub(r'([^\n\S]+)', ' ', text)

#     # Correct capitalization after periods
#     text = re.sub(r'(\. )([a-z])', lambda m: m.group(1) + m.group(2).upper(), text)

#     # Handle percentage values (e.g., "70 of cases" -> "70% of cases")
#     text = re.sub(r'(\d+)\s*of cases', r'\1% of cases', text)
    
#     # Ensure correct abbreviation punctuation (e.g., "e.g" -> "e.g.")
#     text = re.sub(r'\b(e\.g|i\.e|etc)\b', r'\1.', text)
    
#     # Add degree symbol for temperatures (e.g., "25 C" -> "25°C")
#     text = re.sub(r'(\d+)\s*C', r'\1°C', text)
    
#     # Ensure proper formatting for numeric ranges (e.g., "69-89" -> "69 – 89")
#     text = re.sub(r'(\d+)-(\d+)', r'\1 – \2', text)
    
#     # Remove extra punctuation like double commas or periods
#     text = re.sub(r'\s*[.,!?;:]{2,}\s*', '. ', text)

#     # Remove spaces between numbers and commas (e.g., "25, 000" -> "25,000")
#     text = re.sub(r'(\d+),\s+(\d+)', r'\1,\2', text)

#     # Remove extra spaces before periods, commas, semicolons, etc.
#     text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    
#     # Replace double periods with a single period
#     text = re.sub(r'\.\.', '.', text)
    
#     # Remove unnecessary spaces just inside parentheses (e.g., "( text )" -> "(text)")
#     text = re.sub(r'\(\s*', '(', text)
#     text = re.sub(r'\s*\)', ')', text)
    
#     # Normalize the spacing around equals signs to one space on each side and remove spaces around hyphens (e.g., "-itis"=inflammation -> "-itis" = inflammation)
#     text = re.sub(r'\s*=\s*', ' = ', text)
#     text = re.sub(r'\s*-\s*', '-', text)
    
#     # Correct "i. E." to "i.e."
#     text = re.sub(r'\bi\.\s*E\.', 'i.e.', text, flags=re.IGNORECASE)

#     # Correct "i. e. " to "i.e."
#     text = re.sub(r'\bi\.\s*e\.\s*', 'i.e. ', text)

#     # Remove space between '=' and '>'
#     text = re.sub(r'\=\s+\>', '=>', text)

#     # Remove the period after a colon at the end of a paragraph
#     text = re.sub(r'(:)\.\s*$', r'\1', text, flags=re.MULTILINE)

#     # Remove space after a period when surrounded by numbers (e.g., "99. 0" -> "99.0")
#     text = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', text)
    
#     # Add a bullet point before the first occurrence after a period or a newline, followed by an '&'
#     text = re.sub(r'(?<=[\.\n])\s*(&)\s*', r'\n- ', text)
    
#     # Replace all remaining '&' symbols with bullet points
#     text = re.sub(r'\s*&\s*', r'\n- ', text)

#     # Find a sentence preceded by a line break and followed by a line break with a bullet point
#     text = re.sub(r'(\n)([^\n]+)(\n-)', r'\1- \2\3', text)

#     # Replace any duplicate bullet points like '- -' with a single bullet point '-'
#     text = re.sub(r'-\s*-', '-', text)

#     return text

# # Apply the cleaning to each text
# df_texts = pd.DataFrame()
# path_to_save_clean_texts = r"[TFM - MUCD] Textos de las enfermedades\100 Clean Disease Texts"
# for index, disease_text in enumerate(os.listdir(path_to_texts_old)):
#     with open(os.path.join(path_to_texts_old, disease_text), 'r', encoding='utf-8') as f:
#         disease_title = disease_text.split(".")[0]
#         text = f.read()
#     cleanText = clean_text(text)
#     # Save the cleanText to a text file
#     with open(os.path.join(path_to_save_clean_texts, disease_text), 'w', encoding='utf-8') as file:
#         file.write(cleanText)
#     df_texts = pd.concat([df_texts, pd.DataFrame({"Original Text": [text], "Clean Text": [cleanText]})], ignore_index=True)

# # Save the cleaned texts to a new Excel file
# df_texts.to_excel(r"[TFM - MUCD] Textos de las enfermedades\Textos Limpios.xlsx", index=False)


# Counting tokens of text and results - Only do once if needed

In [91]:
# # Function to count the number of tokens in the results of the prompts

# path_to_results = r"Resultados de Prompts\GPT35turbo"
# token_accumulator = 0
# for folder in os.listdir(path_to_results):
#     path_to_folder = os.path.join(path_to_results, folder)
#     # if folder is a folder and doesn't contain "DON'T USE" in the name
#     if os.path.isdir(path_to_folder) and "DON'T USE" not in folder:
#         for file in os.listdir(path_to_folder):
#             if file.endswith(".xlsx"):
#                 path_to_file = os.path.join(path_to_folder, file)
#                 df = pd.read_excel(path_to_file)
#                 df_json = transform_df_into_json(df)
#                 token_prompt = count_tokens_in_text(df_json)
#                 token_accumulator += token_prompt
#         print(token_accumulator)
#     token_accumulator = 0
# df_store_size_of_texts = pd.DataFrame(columns=["Disease", "Number of Words", "Number of Tokens"])

# # Function to count the number of tokens in the texts, it's auxiliary

# for file in os.listdir(path_to_texts):
#     if file.endswith(".txt"):
#         # Read the text file
#         with open(os.path.join(path_to_texts, file), "r", encoding="utf-8") as f:
#             text = f.read()
#         # Get the file name without the extension
#         file_name = file.split(".")[0]
#         # Count the number of words in the text
#         num_words = len(text.split())
#         # Count the number of tokens in the text
#         num_tokens = count_tokens_in_text(text)
#         # Append the results to the DataFrame
#         df_store_size_of_texts = pd.concat([df_store_size_of_texts, pd.DataFrame({"Disease": [file_name], "Number of Words": [num_words], "Number of Tokens": [num_tokens]})], ignore_index=True)
# # Save the results to an Excel file
# path_to_save = r"[TFM - MUCD] Textos de las enfermedades"
# df_store_size_of_texts.to_excel(os.path.join(path_to_save, "Texts Length and Number of Tokens.xlsx"), index=False)

# Prompt 1: Zero-shot Learning

In [92]:
prompt_name = "Prompt 1"
abbreviation = "_P1"

# Create the path for the results of the prompt
result_path_primer_prompt = os.path.join(path_to_general_results, model_name, prompt_name)
print(result_path_primer_prompt)

# Create the folder if it does not exist
os.makedirs(result_path_primer_prompt, exist_ok=True)

# Primer prompt text
primer_prompt_text = "Extract and list all the phenotypic manifestations of the condition found \
in the “text to analyze”. Classify each extracted term according to its corresponding TUI code and semantic type."

prompt_description = "Basic Prompt"

# Open the JSON format output
with open(r"Prompt Engineering\JSON format output.txt", 'r', encoding='utf-8') as f:
    JSON_format_output = f.read()

primer_prompt_text = primer_prompt_text + "\n" + JSON_format_output
save_prompt(prompt_name, primer_prompt_text, prompt_description=prompt_description)

failed_texts_name = []
failed_texts = []
failed_texts_error_message = []

# Create a list of files for which the text has already been analyzed
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_primer_prompt) if file.endswith(".xlsx")]

for index, disease_text in enumerate(os.listdir(path_to_texts)):
    # Skip the files that have already been analyzed
    if disease_text.split(".")[0] in analyzed_files:
        continue
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:
        text = f.read()
        condition_name = disease_text.split(".")[0]
        message_text = [{"role":"system","content":primer_prompt_text},
                        {"role":"user", "content":"Text to analyze: \n\n" + text}]
        print(f"{index+1} - Analyzing disease: ", condition_name)
        # Reset the answer for this disease: if the request itself fails, the failure log must not
        # record the previous disease's answer (or hit a NameError on the first iteration).
        GPT_answer = None
        try:
            GPT_answer = get_response_LLM(prompt=message_text)
            df = transform_GPT_output(GPT_answer)
            df.to_excel(os.path.join(result_path_primer_prompt, condition_name + abbreviation + ".xlsx"), sheet_name=model_name ,index=False)
            print("Disease analyzed and saved: ", condition_name)
        except Exception as e:
            # Best effort: record the failure (name, raw answer if any, error message) and move on
            failed_texts_name.append(condition_name)
            failed_texts.append(GPT_answer)
            failed_texts_error_message.append(str(e))
            print("Failed to analyze disease: ", condition_name)

# Saving failed texts summary
failed_df = pd.DataFrame({"Disease name failed": failed_texts_name, "GPT Output": failed_texts, "Error message": failed_texts_error_message})
failed_path = os.path.join(path_to_general_results, model_name)
failed_df.to_excel(os.path.join(failed_path, "Failed texts " + prompt_name + ".xlsx"), sheet_name=model_name ,index=False)
generate_summary_failed_texts(failed_path)

Resultados de Prompts\gpt-4o-mini\Prompt 1


Unnamed: 0,Prompt,Disease name failed,GPT Output,Error message,GPT Output Part A,GPT Output Part B


# Prompt 1: Zero-shot Learning with guidance

In [93]:
prompt_name = "Prompt 1_guided"
abbreviation = "_P1_guided"

# Create the path for the results of the prompt
result_path_seven_prompt = os.path.join(path_to_general_results, model_name, prompt_name)

# Create the folder if it does not exist
os.makedirs(result_path_seven_prompt, exist_ok=True)

# Open the JSON format output
with open(r"Prompt Engineering\JSON format output.txt", 'r', encoding='utf-8') as f:
    JSON_format_output = f.read()

# Creating and saving the prompt (the saved version keeps the literal {condition_name} placeholder)
primer_prompt_text = """Extract and list the terms found in the “text to analyze” that are phenotypic manifestations of {condition_name}.
Classify each extracted term according to its corresponding TUI code and Semantic type."""
primer_prompt_text = primer_prompt_text + "\n" + JSON_format_output
prompt_description = "Basic Prompt but saying in advance the condition to analyze in the text."
save_prompt(prompt_name, primer_prompt_text, prompt_description=prompt_description)

failed_texts_name = []
failed_texts = []
failed_texts_error_message = []

# Create a list of files for which the text has already been analyzed
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_seven_prompt) if file.endswith(".xlsx")]

for index, disease_text in enumerate(os.listdir(path_to_texts)):
    # Skip the files that have already been analyzed
    if disease_text.split(".")[0] in analyzed_files:
        continue
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:
        text = f.read()
        condition_name = disease_text.split(".")[0]
        # Primer prompt text with the condition name filled in
        # NOTE(review): the text sent here differs slightly from the template saved above
        # (the second line is indented and says "semantic" instead of "Semantic"). It is left
        # unchanged so results stay comparable with files already generated — confirm.
        primer_prompt_text = f"""Extract and list the terms found in the “text to analyze” that are phenotypic manifestations of {condition_name}.
        Classify each extracted term according to its corresponding TUI code and semantic type."""
        primer_prompt_text = primer_prompt_text + "\n" + JSON_format_output

        message_text = [{"role":"system","content":primer_prompt_text},
                        {"role":"user", "content":"Text to analyze: \n\n" + text}]
        print(f"{index+1} - Analyzing disease: ", condition_name)
        # Reset the answer for this disease: if the request itself fails, the failure log must not
        # record the previous disease's answer (or hit a NameError on the first iteration).
        GPT_answer = None
        try:
            GPT_answer = get_response_LLM(prompt=message_text)
            df = transform_GPT_output(GPT_answer)
            df.to_excel(os.path.join(result_path_seven_prompt, condition_name + abbreviation + ".xlsx"), sheet_name=model_name ,index=False)
            print("Disease analyzed and saved: ", condition_name)
        except Exception as e:
            # Best effort: record the failure (name, raw answer if any, error message) and move on
            failed_texts_name.append(condition_name)
            failed_texts.append(GPT_answer)
            failed_texts_error_message.append(str(e))
            print("Failed to analyze disease: ", condition_name)

# Saving failed texts summary
failed_df = pd.DataFrame({"Disease name failed": failed_texts_name, "GPT Output": failed_texts, "Error message": failed_texts_error_message})
failed_path = os.path.join(path_to_general_results, model_name)
failed_df.to_excel(os.path.join(failed_path, "Failed texts " + prompt_name + ".xlsx"), sheet_name=model_name ,index=False)
generate_summary_failed_texts(failed_path)

Unnamed: 0,Prompt,Disease name failed,GPT Output,Error message,GPT Output Part A,GPT Output Part B


# Prompt 2: Zero-Shot Learning + Entity Dictionary

In [94]:
if useSimplifiedEntityDescription:
    prompt_name = "Prompt 2_simplified"
    abbreviation = "_P2_simplified"
    # Open Prompt Engineering\TUI_description_text_simplified.txt to get the simplified TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text_simplified.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description = "Basic Prompt but extracting according to semantic type categories without extensive description, it's simplified."
else:
    prompt_name = "Prompt 2"
    abbreviation = "_P2"
    # Open Prompt Engineering\TUI_description_text.txt to get the TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description = "Basic Prompt but extracting according to semantic type categories."

# Create the path for the results of the prompt
result_path_seventh_prompt = os.path.join(path_to_general_results, model_name, prompt_name)

# Create the folder if it does not exist
os.makedirs(result_path_seventh_prompt, exist_ok=True)

second_prompt_text = "Extract and list all the terms that can be classified into any of the categories in the \
“TUI code and Semantic type description text” and are related to the condition found in the “text to analyze”. \
Classify each extracted term into one of the categories in the “TUI code and Semantic type description text."

# Open the JSON format output
with open(r"Prompt Engineering\JSON format output.txt", 'r', encoding='utf-8') as f:
    JSON_format_output = f.read()

# Creating and saving the prompt
second_prompt_text = second_prompt_text + "\n" + JSON_format_output
second_prompt_text = second_prompt_text + "\n\n" + "TUI code and Semantic type description text:\n\n" + TUI_description_text
save_prompt(prompt_name, second_prompt_text, prompt_description=prompt_description)

failed_texts_name = []
failed_texts = []
failed_texts_error_message = []

# Create a list of files for which the text has already been analyzed
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_seventh_prompt) if file.endswith(".xlsx")]

for index, disease_text in enumerate(os.listdir(path_to_texts)):
    # Skip the files that have already been analyzed
    if disease_text.split(".")[0] in analyzed_files:
        continue
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:
        text = f.read() # Disease text
        condition_name = disease_text.split(".")[0]
        message_text = [{"role":"system","content":second_prompt_text},
                        {"role":"user", "content":"Text to analyze: \n\n" + text}]
        print(f"{index+1} - Analyzing disease: ", condition_name)
        # Reset the answer for this disease: if the request itself fails, the failure log must not
        # record the previous disease's answer (or hit a NameError on the first iteration).
        GPT_answer = None
        try:
            GPT_answer = get_response_LLM(prompt=message_text)
            df = transform_GPT_output(GPT_answer)
            df.to_excel(os.path.join(result_path_seventh_prompt, condition_name + abbreviation + ".xlsx"), sheet_name=model_name ,index=False)
            print("Disease analyzed and saved: ", condition_name)
        except Exception as e:
            # Best effort: record the failure (name, raw answer if any, error message) and move on
            failed_texts_name.append(condition_name)
            failed_texts.append(GPT_answer)
            failed_texts_error_message.append(str(e))
            print("Failed to analyze disease: ", condition_name)

# Save failed texts
failed_df = pd.DataFrame({"Disease name failed": failed_texts_name, "GPT Output": failed_texts, "Error message": failed_texts_error_message})
failed_path = os.path.join(path_to_general_results, model_name)
failed_df.to_excel(os.path.join(failed_path, "Failed texts " + prompt_name + ".xlsx"), sheet_name=model_name ,index=False)
generate_summary_failed_texts(failed_path)

1 - Analyzing disease:  Acute decompensated heart failure
Disease analyzed and saved:  Acute decompensated heart failure
2 - Analyzing disease:  Acute intermittent porphyria
Disease analyzed and saved:  Acute intermittent porphyria
3 - Analyzing disease:  Anthrax
Disease analyzed and saved:  Anthrax
4 - Analyzing disease:  Arterial embolism
Disease analyzed and saved:  Arterial embolism
5 - Analyzing disease:  Arteriovenous malformation
Disease analyzed and saved:  Arteriovenous malformation
6 - Analyzing disease:  Ascites
Disease analyzed and saved:  Ascites
7 - Analyzing disease:  Autonomic dysreflexia
Disease analyzed and saved:  Autonomic dysreflexia
8 - Analyzing disease:  Benzodiazepine withdrawal syndrome
Disease analyzed and saved:  Benzodiazepine withdrawal syndrome
9 - Analyzing disease:  Blastomycosis
Disease analyzed and saved:  Blastomycosis
10 - Analyzing disease:  Breast cancer
Disease analyzed and saved:  Breast cancer
11 - Analyzing disease:  Campylobacteriosis
Disease

Unnamed: 0,Prompt,Disease name failed,GPT Output,Error message,GPT Output Part A,GPT Output Part B


# Prompt 2: Zero-Shot Learning + Entity Dictionary with guidance

In [104]:
if useSimplifiedEntityDescription:
    prompt_name = "Prompt 2_simplified_guided"
    abbreviation = "_P2_simplified_guided"
    # Open Prompt Engineering\TUI_description_text_simplified.txt to get the simplified TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text_simplified.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
else:
    prompt_name = "Prompt 2_guided"
    abbreviation = "_P2_guided"
    # Open Prompt Engineering\TUI_description_text.txt to get the TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()

# Create the path for the results of the prompt
result_path_second_prompt = os.path.join(path_to_general_results, model_name, prompt_name)

# Create the folder if it does not exist
os.makedirs(result_path_second_prompt, exist_ok=True)

# Open the JSON format output
with open(r"Prompt Engineering\JSON format output.txt", 'r', encoding='utf-8') as f:
    JSON_format_output = f.read()

# Creating and saving the prompt (the saved version keeps the literal {condition_name} placeholder)
second_prompt_text = """Extract and list all the terms from the “text to analyze” that can be classified into any of the categories in the
“TUI code and Semantic type description text” and are related to {condition_name}. Classify each extracted 
term into one of the categories in the “TUI code and Semantic type description text."""
second_prompt_text = second_prompt_text + "\n" + JSON_format_output
second_prompt_text = second_prompt_text + "\n\n" + "TUI code and Semantic type description text:\n\n" + TUI_description_text
prompt_description = "Basic Prompt but saying in advance the condition to analyze in the text and extracting the terms that can be classified into one of the categories."
save_prompt(prompt_name, second_prompt_text, prompt_description=prompt_description)

failed_texts_name = []
failed_texts = []
failed_texts_error_message = []

# Create a list of files for which the text has already been analyzed
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_second_prompt) if file.endswith(".xlsx")]

for index, disease_text in enumerate(os.listdir(path_to_texts)):
    # Skip the files that have already been analyzed
    if disease_text.split(".")[0] in analyzed_files:
        continue
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:
        text = f.read() # Disease text
        condition_name = disease_text.split(".")[0]
        # NOTE(review): the text sent here differs from the template saved above: there is a stray ”
        # right after the condition name and the continuation lines are indented. It is left
        # unchanged so results stay comparable with files already generated — confirm.
        second_prompt_text = f"""Extract and list all the terms from the “text to analyze” that can be classified into any of the categories in the
        “TUI code and Semantic type description text” and are related to {condition_name}”. Classify each extracted 
        term into one of the categories in the “TUI code and Semantic type description text."""
        second_prompt_text = second_prompt_text + "\n" + JSON_format_output
        message_text = [{"role":"system","content":second_prompt_text + "\n\n" + "TUI code and Semantic type description text:\n\n" + TUI_description_text},
                        {"role":"user", "content":"Text to analyze: \n\n" + text}]
        print(f"{index+1} - Analyzing disease: ", condition_name)
        # Reset the answer for this disease: if the request itself fails, the failure log must not
        # record the previous disease's answer (or hit a NameError on the first iteration).
        GPT_answer = None
        try:
            GPT_answer = get_response_LLM(prompt=message_text)
            df = transform_GPT_output(GPT_answer)
            df.to_excel(os.path.join(result_path_second_prompt, condition_name + abbreviation + ".xlsx"), sheet_name=model_name ,index=False)
            print("Disease analyzed and saved: ", condition_name)
        except Exception as e:
            # Best effort: record the failure (name, raw answer if any, error message) and move on
            failed_texts_name.append(condition_name)
            failed_texts.append(GPT_answer)
            failed_texts_error_message.append(str(e))
            print("Failed to analyze disease: ", condition_name)

# Save failed texts
failed_df = pd.DataFrame({"Disease name failed": failed_texts_name, "GPT Output": failed_texts, "Error message": failed_texts_error_message})
failed_path = os.path.join(path_to_general_results, model_name)
failed_df.to_excel(os.path.join(failed_path, "Failed texts " + prompt_name + ".xlsx"), sheet_name=model_name ,index=False)
generate_summary_failed_texts(failed_path)

10 - Analyzing disease:  Breast cancer
Disease analyzed and saved:  Breast cancer


Unnamed: 0,Prompt,Disease name failed,GPT Output,Error message,GPT Output Part A,GPT Output Part B


# Prompt 2: Zero-Shot Learning + Entity Dictionary plus manifestations

In [96]:
# Prompt 2 + manifestations: zero-shot extraction guided by the entity dictionary.
# The flag selects which dictionary text (simplified or full) is appended to the prompt.
if useSimplifiedEntityDescription:
    prompt_name = "Prompt 2_simplified_plusManifestations"
    abbreviation = "_P2_simplified_plusManifestations"
    # Load the simplified TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text_simplified.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description = "Basic Prompt but extracting according to semantic type categories without extensive description, it's simplified."
else:
    prompt_name = "Prompt 2_plusManifestations"
    abbreviation = "_P2_plusManifestations"
    # Load the full TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description = "Basic Prompt but extracting according to semantic type categories."

# Folder that receives one Excel file per analyzed disease for this prompt.
# NOTE(review): the variable says "seventh" although it holds Prompt 2 results; the name is
# kept unchanged in case other cells read it.
result_path_seventh_prompt = os.path.join(path_to_general_results, model_name, prompt_name)

# Create the folder if it does not exist
os.makedirs(result_path_seventh_prompt, exist_ok=True)

second_prompt_text = "Extract and list all the terms and manifestations that can be classified into any of the categories in the \
“TUI code and Semantic type description text” and are related to the condition found in the “text to analyze”. \
Classify each extracted term into one of the categories in the “TUI code and Semantic type description text."

# Load the JSON output format instructions
with open(r"Prompt Engineering\JSON format output.txt", 'r', encoding='utf-8') as f:
    JSON_format_output = f.read()

# Assemble the final prompt (instruction + output format + entity dictionary) and save it
second_prompt_text = second_prompt_text + "\n" + JSON_format_output
second_prompt_text = second_prompt_text + "\n\n" + "TUI code and Semantic type description text:\n\n" + TUI_description_text
save_prompt(prompt_name, second_prompt_text, prompt_description=prompt_description)

# Parallel lists: one entry per disease whose analysis raised an exception
failed_texts_name = []
failed_texts = []
failed_texts_error_message = []

# Names of the diseases that already have a result file in the output folder
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_seventh_prompt) if file.endswith(".xlsx")]

for index, disease_text in enumerate(os.listdir(path_to_texts)):
    condition_name = disease_text.split(".")[0]
    # Skip the files that have already been analyzed
    if condition_name in analyzed_files:
        continue
    # Read the disease text and release the file handle before the (slow) API call
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:
        text = f.read()
    message_text = [{"role":"system","content":second_prompt_text},
                    {"role":"user", "content":"Text to analyze: \n\n" + text}]
    print(f"{index+1} - Analyzing disease: ", condition_name)
    # Reset the answer for every disease: if the API call itself fails, the handler below must
    # not log the previous disease's answer (or hit a NameError on the very first iteration).
    GPT_answer = ""
    try:
        GPT_answer = get_response_LLM(prompt=message_text)
        df = transform_GPT_output(GPT_answer)
        df.to_excel(os.path.join(result_path_seventh_prompt, condition_name + abbreviation + ".xlsx"), sheet_name=model_name, index=False)
        print("Disease analyzed and saved: ", condition_name)
    except Exception as e:
        # Best effort: record the failure and move on to the next disease
        failed_texts_name.append(condition_name)
        failed_texts.append(GPT_answer)
        failed_texts_error_message.append(str(e))
        print("Failed to analyze disease: ", condition_name)

# Save failed texts next to the per-prompt result folders
failed_df = pd.DataFrame({"Disease name failed": failed_texts_name, "GPT Output": failed_texts, "Error message": failed_texts_error_message})
failed_path = os.path.join(path_to_general_results, model_name)
failed_df.to_excel(os.path.join(failed_path, "Failed texts " + prompt_name + ".xlsx"), sheet_name=model_name, index=False)
# Helper defined elsewhere in this notebook; it receives the folder holding the failure files
generate_summary_failed_texts(failed_path)

1 - Analyzing disease:  Acute decompensated heart failure
Disease analyzed and saved:  Acute decompensated heart failure
2 - Analyzing disease:  Acute intermittent porphyria
Disease analyzed and saved:  Acute intermittent porphyria
3 - Analyzing disease:  Anthrax
Disease analyzed and saved:  Anthrax
4 - Analyzing disease:  Arterial embolism
Disease analyzed and saved:  Arterial embolism
5 - Analyzing disease:  Arteriovenous malformation
Disease analyzed and saved:  Arteriovenous malformation
6 - Analyzing disease:  Ascites
Disease analyzed and saved:  Ascites
7 - Analyzing disease:  Autonomic dysreflexia
Disease analyzed and saved:  Autonomic dysreflexia
8 - Analyzing disease:  Benzodiazepine withdrawal syndrome
Disease analyzed and saved:  Benzodiazepine withdrawal syndrome
9 - Analyzing disease:  Blastomycosis
Disease analyzed and saved:  Blastomycosis
10 - Analyzing disease:  Breast cancer
Disease analyzed and saved:  Breast cancer
11 - Analyzing disease:  Campylobacteriosis
Disease

Unnamed: 0,Prompt,Disease name failed,GPT Output,Error message,GPT Output Part A,GPT Output Part B


# Prompt 2: Zero-Shot Learning + Entity Dictionary optimized by ChatGPT4o

In [97]:
# Prompt 2 optimized by ChatGPT: the whole prompt is loaded from a text file and sent,
# together with the disease text, as a single user message.
prompt_name = "Prompt 2_optimizedByChatGPT"
abbreviation = "_P2_optimizedByChatGPT"
prompt_description = "Prompt 2 optimized by ChatGPT. Contains detailed definitions of categories, examples, steps of analysis, and the JSON format output."

# Folder that receives one Excel file per analyzed disease for this prompt.
# NOTE(review): the variable says "seventh" although it holds Prompt 2 results; the name is
# kept unchanged in case other cells read it.
result_path_seventh_prompt = os.path.join(path_to_general_results, model_name, prompt_name)

# Create the folder if it does not exist
os.makedirs(result_path_seventh_prompt, exist_ok=True)

# Load the optimized prompt by ChatGPT
path_to_optimized_prompt = r"Prompt Engineering\Prompt 2 Optimized by ChatGPT.txt"
with open(path_to_optimized_prompt, 'r', encoding='utf-8') as f:
    second_prompt_text = f.read()
save_prompt(prompt_name, second_prompt_text, prompt_description=prompt_description)

# Parallel lists: one entry per disease whose analysis raised an exception
failed_texts_name = []
failed_texts = []
failed_texts_error_message = []

# Names of the diseases that already have a result file in the output folder
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_seventh_prompt) if file.endswith(".xlsx")]

for index, disease_text in enumerate(os.listdir(path_to_texts)):
    condition_name = disease_text.split(".")[0]
    # Skip the files that have already been analyzed
    if condition_name in analyzed_files:
        continue
    # Read the disease text and release the file handle before the (slow) API call
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:
        text = f.read()
    # The disease text is appended directly to the prompt loaded from the file
    message_text = [{"role":"user", "content":second_prompt_text + text}]
    print(f"{index+1} - Analyzing disease: ", condition_name)
    # Reset the answer for every disease: if the API call itself fails, the handler below must
    # not log the previous disease's answer (or hit a NameError on the very first iteration).
    GPT_answer = ""
    try:
        GPT_answer = get_response_LLM(prompt=message_text)
        df = transform_GPT_output(GPT_answer)
        df.to_excel(os.path.join(result_path_seventh_prompt, condition_name + abbreviation + ".xlsx"), sheet_name=model_name, index=False)
        print("Disease analyzed and saved: ", condition_name)
    except Exception as e:
        # Best effort: record the failure and move on to the next disease
        failed_texts_name.append(condition_name)
        failed_texts.append(GPT_answer)
        failed_texts_error_message.append(str(e))
        print("Failed to analyze disease: ", condition_name)

# Save failed texts next to the per-prompt result folders
failed_df = pd.DataFrame({"Disease name failed": failed_texts_name, "GPT Output": failed_texts, "Error message": failed_texts_error_message})
failed_path = os.path.join(path_to_general_results, model_name)
failed_df.to_excel(os.path.join(failed_path, "Failed texts " + prompt_name + ".xlsx"), sheet_name=model_name, index=False)
# Helper defined elsewhere in this notebook; it receives the folder holding the failure files
generate_summary_failed_texts(failed_path)

Unnamed: 0,Prompt,Disease name failed,GPT Output,Error message,GPT Output Part A,GPT Output Part B


# Prompt 3: One-shot Learning + Entity Dictionary

In [98]:
# Prompt 3: one-shot learning + entity dictionary.
# The flag selects which dictionary text (simplified or full) is appended to the prompt.
if useSimplifiedEntityDescription:
    prompt_name = "Prompt 3_simplified"
    abbreviation = "_P3_simplified"
    # Load the simplified TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text_simplified.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description = "Basic Prompt but extracting according to simplified semantic type categories text with one-shot learning."
else:
    prompt_name = "Prompt 3"
    abbreviation = "_P3"
    # Load the full TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description = "Basic Prompt but extracting according to semantic type categories with one-shot learning."

# Folder that receives one Excel file per analyzed disease for this prompt
result_path_third_prompt = os.path.join(path_to_general_results, model_name, prompt_name)

# Create the folder if it does not exist
os.makedirs(result_path_third_prompt, exist_ok=True)

third_prompt_text = "Extract and list all the terms that can be classified into any of the categories in the \
“TUI code and Semantic type description text” and are related to the condition found in the “text to analyze”. \
Classify each extracted term into one of the categories in the “TUI code and Semantic type description text"


# Load the JSON output format instructions
with open(r"Prompt Engineering\JSON format output.txt", 'r', encoding='utf-8') as f:
    JSON_format_output = f.read()

# Assemble the final prompt (instruction + output format + entity dictionary) and save it
third_prompt_text = third_prompt_text + "\n" + JSON_format_output
third_prompt_text = third_prompt_text + "\n\nTUI code and Semantic type description text: \n\n" + TUI_description_text
save_prompt(prompt_name, third_prompt_text, prompt_description)

# One-shot example: the Anemia text that is shown to the model as a previous user turn
with open(r"Prompt Engineering\Texto Ejemplo - Anemia.txt", 'r', encoding='utf-8') as f:
    example_text = f.read()

# One-shot example: the expected answer without CoT, shown as the assistant turn
with open(r"Prompt Engineering\Texto Ejemplo Respuesta sin CoT - Anemia.txt", 'r', encoding='utf-8') as f:
    example_text_answer_sin_CoT = f.read()

# Parallel lists: one entry per disease whose analysis raised an exception
failed_texts_name = []
failed_texts = []
failed_texts_error_message = []

# Names of the diseases that already have a result file in the output folder
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_third_prompt) if file.endswith(".xlsx")]

for index, disease_text in enumerate(os.listdir(path_to_texts)):
    condition_name = disease_text.split(".")[0]
    # Skip the files that have already been analyzed
    if condition_name in analyzed_files:
        continue
    # Read the disease text and release the file handle before the (slow) API call
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:
        text = f.read()
    # system prompt -> example question -> example answer -> actual question
    message_text = [{"role":"system","content":third_prompt_text},
                    {"role":"user", "content":"Text to analyze: \n\n" + example_text},
                    {"role":"assistant","content":example_text_answer_sin_CoT},
                    {"role":"user","content":"Text to analyze: \n\n" + text}]
    print(f"{index+1} - Analyzing disease: ", condition_name)
    # Reset the answer for every disease: if the API call itself fails, the handler below must
    # not log the previous disease's answer (or hit a NameError on the very first iteration).
    GPT_answer = ""
    try:
        GPT_answer = get_response_LLM(prompt=message_text)
        df = transform_GPT_output(GPT_answer)
        df.to_excel(os.path.join(result_path_third_prompt, condition_name + abbreviation + ".xlsx"), sheet_name=model_name, index=False)
        print("Successfully analyzed: ", condition_name)
    except Exception as e:
        # Best effort: record the failure (with the exception message) and move on
        failed_texts_name.append(condition_name)
        failed_texts.append(GPT_answer)
        failed_texts_error_message.append(str(e))
        print("Failed to analyze: ", condition_name)

# Save failed texts next to the per-prompt result folders
failed_df = pd.DataFrame({"Disease name failed": failed_texts_name, "GPT Output": failed_texts, "Error message": failed_texts_error_message})
failed_path = os.path.join(path_to_general_results, model_name)
failed_df.to_excel(os.path.join(failed_path, "Failed texts " + prompt_name + ".xlsx"), sheet_name=model_name, index=False)
# Helper defined elsewhere in this notebook; it receives the folder holding the failure files
generate_summary_failed_texts(failed_path)

1 - Analyzing disease:  Acute decompensated heart failure
Successfully analyzed:  Acute decompensated heart failure
2 - Analyzing disease:  Acute intermittent porphyria
Successfully analyzed:  Acute intermittent porphyria
3 - Analyzing disease:  Anthrax
Successfully analyzed:  Anthrax
4 - Analyzing disease:  Arterial embolism
Successfully analyzed:  Arterial embolism
5 - Analyzing disease:  Arteriovenous malformation
Successfully analyzed:  Arteriovenous malformation
6 - Analyzing disease:  Ascites
Successfully analyzed:  Ascites
7 - Analyzing disease:  Autonomic dysreflexia
Successfully analyzed:  Autonomic dysreflexia
8 - Analyzing disease:  Benzodiazepine withdrawal syndrome
Successfully analyzed:  Benzodiazepine withdrawal syndrome
9 - Analyzing disease:  Blastomycosis
Successfully analyzed:  Blastomycosis
10 - Analyzing disease:  Breast cancer
Successfully analyzed:  Breast cancer
11 - Analyzing disease:  Campylobacteriosis
Successfully analyzed:  Campylobacteriosis
12 - Analyzing

Unnamed: 0,Prompt,Disease name failed,GPT Output,Error message,GPT Output Part A,GPT Output Part B


# Prompt 4: Zero-Shot Learning + Entity Dictionary + Self-Reflection (also called CoT: the model explains each extraction)

In [99]:
# Prompt 4: zero-shot learning + entity dictionary + explanation of each extraction (CoT).
# The flag selects which dictionary text (simplified or full) is appended to the prompt.
if useSimplifiedEntityDescription:
    prompt_name = "Prompt 4_simplified"
    abbreviation = "_P4_simplified"
    # Load the simplified TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text_simplified.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description = "Basic Prompt but extracting according to simplified semantic type categories text with explanation (CoT)."
else:
    prompt_name = "Prompt 4"
    abbreviation = "_P4"
    # Load the full TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description = "Basic Prompt but extracting according to semantic type categories with explanation (CoT)."

# Folder that receives one Excel file per analyzed disease for this prompt
result_path_fourth_prompt = os.path.join(path_to_general_results, model_name, prompt_name)

# Create the folder if it does not exist
os.makedirs(result_path_fourth_prompt, exist_ok=True)

fourth_prompt_text = "Extract and list all the terms that can be classified into any of the categories in the \
“TUI code and Semantic type description text” and are related to the condition found in the “text to analyze”. \
Classify each extracted term into one of the categories in the “TUI code and Semantic type description text”. \
Explain why the entity was extracted and why it was classified as such."


# Load the JSON output format instructions (variant that includes the explanation field)
with open(r"Prompt Engineering\JSON format output with Explanation.txt", 'r', encoding='utf-8') as f:
    JSON_format_output_with_explanation = f.read()

# Assemble the final prompt (instruction + output format + entity dictionary) and save it
fourth_prompt_text = fourth_prompt_text + "\n" + JSON_format_output_with_explanation
fourth_prompt_text = fourth_prompt_text + "\n\n" + "TUI code and Semantic type description text: \n\n" + TUI_description_text
save_prompt(prompt_name, fourth_prompt_text, prompt_description)

# Parallel lists: one entry per disease whose analysis raised an exception
failed_texts_name = []
failed_texts = []
failed_texts_error_message = []

# Names of the diseases that already have a result file in the output folder
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_fourth_prompt) if file.endswith(".xlsx")]

for index, disease_text in enumerate(os.listdir(path_to_texts)):
    condition_name = disease_text.split(".")[0]
    # Skip the files that have already been analyzed
    if condition_name in analyzed_files:
        continue
    # Read the disease text and release the file handle before the (slow) API call
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:
        text = f.read()
    message_text = [{"role":"system","content":fourth_prompt_text},
                    {"role":"user", "content":"Text to analyze: \n\n" + text}]
    print(f"{index + 1} - Analyzing:", condition_name)
    # Reset the answer for every disease: if the API call itself fails, the handler below must
    # not log the previous disease's answer (or hit a NameError on the very first iteration).
    GPT_answer = ""
    try:
        GPT_answer = get_response_LLM(prompt=message_text)
        df = transform_GPT_output(GPT_answer)
        df.to_excel(os.path.join(result_path_fourth_prompt, condition_name + abbreviation + ".xlsx"), sheet_name=model_name, index=False)
        print("Successfully analyzed: ", condition_name)
    except Exception as e:
        # Best effort: record the failure (with the exception message) and move on
        failed_texts_name.append(condition_name)
        failed_texts.append(GPT_answer)
        failed_texts_error_message.append(str(e))
        print("Failed to analyze: ", condition_name)

# Save failed texts after looping through all the disease texts
failed_df = pd.DataFrame({"Disease name failed": failed_texts_name, "GPT Output": failed_texts, "Error message": failed_texts_error_message})
failed_path = os.path.join(path_to_general_results, model_name)
failed_df.to_excel(os.path.join(failed_path, "Failed texts " + prompt_name + ".xlsx"), sheet_name=model_name, index=False)
# Helper defined elsewhere in this notebook; it receives the folder holding the failure files
generate_summary_failed_texts(failed_path)

1 - Analyzing: Acute decompensated heart failure
Successfully analyzed:  Acute decompensated heart failure
2 - Analyzing: Acute intermittent porphyria
Successfully analyzed:  Acute intermittent porphyria
3 - Analyzing: Anthrax
Successfully analyzed:  Anthrax
4 - Analyzing: Arterial embolism
Successfully analyzed:  Arterial embolism
5 - Analyzing: Arteriovenous malformation
Successfully analyzed:  Arteriovenous malformation
6 - Analyzing: Ascites
Successfully analyzed:  Ascites
7 - Analyzing: Autonomic dysreflexia
Successfully analyzed:  Autonomic dysreflexia
8 - Analyzing: Benzodiazepine withdrawal syndrome
Successfully analyzed:  Benzodiazepine withdrawal syndrome
9 - Analyzing: Blastomycosis
Successfully analyzed:  Blastomycosis
10 - Analyzing: Breast cancer
Successfully analyzed:  Breast cancer
11 - Analyzing: Campylobacteriosis
Successfully analyzed:  Campylobacteriosis
12 - Analyzing: Carciac myxoma
Successfully analyzed:  Carciac myxoma
13 - Analyzing: Carrion's disease
Successfu

Unnamed: 0,Prompt,Disease name failed,GPT Output,Error message,GPT Output Part A,GPT Output Part B


# Prompt 5: One-Shot Learning + Entity Dictionary + Self-Reflection (also called CoT: the model explains each extraction)

In [100]:
# Prompt 5: one-shot learning + entity dictionary + explanation of each extraction (CoT).
# The flag selects which dictionary text (simplified or full) is appended to the prompt.
if useSimplifiedEntityDescription:
    prompt_name = "Prompt 5_simplified"
    abbreviation = "_P5_simplified"
    # Load the simplified TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text_simplified.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description = "Basic Prompt but extracting according to simplified semantic type categories text with explanation (CoT) and one-shot learning."
else:
    prompt_name = "Prompt 5"
    abbreviation = "_P5"
    # Load the full TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description = "Basic Prompt but extracting according to semantic type categories with explanation (CoT) and one-shot learning."

# Folder that receives one Excel file per analyzed disease for this prompt
result_path_fifth_prompt = os.path.join(path_to_general_results, model_name, prompt_name)

# Create the folder if it does not exist
os.makedirs(result_path_fifth_prompt, exist_ok=True)

fifth_prompt_text = "Extract and list all the terms that can be classified into any of the categories in the \
“TUI code and Semantic type description text” and are related to the condition found in the “text to analyze”. \
Classify each extracted term into one of the categories in the “TUI code and Semantic type description text”. \
Explain why the entity was extracted and why it was classified as such."


# Load the JSON output format instructions (variant that includes the explanation field)
with open(r"Prompt Engineering\JSON format output with Explanation.txt", 'r', encoding='utf-8') as f:
    JSON_format_output_with_explanation = f.read()

# Assemble the final prompt (instruction + output format + entity dictionary) and save it
fifth_prompt_text = fifth_prompt_text + "\n" + JSON_format_output_with_explanation
fifth_prompt_text = fifth_prompt_text + "\n\n" + "TUI code and Semantic type description text: \n\n" + TUI_description_text
save_prompt(prompt_name, fifth_prompt_text, prompt_description=prompt_description)

# One-shot example: the Anemia text that is shown to the model as a previous user turn
with open(r"Prompt Engineering\Texto Ejemplo - Anemia.txt", 'r', encoding='utf-8') as f:
    example_text = f.read()
# One-shot example: the expected answer WITH CoT (explanations), shown as the assistant turn
with open(r"Prompt Engineering\Texto Ejemplo Respuesta con CoT - Anemia.txt", 'r', encoding='utf-8') as f:
    example_text_answer_con_CoT = f.read()

# Parallel lists: one entry per disease whose analysis raised an exception
failed_texts_name = []
failed_texts = []
failed_texts_error_message = []

# Names of the diseases that already have a result file in the output folder
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_fifth_prompt) if file.endswith(".xlsx")]

for index, disease_text in enumerate(os.listdir(path_to_texts)):
    condition_name = disease_text.split(".")[0]
    # Skip the files that have already been analyzed
    if condition_name in analyzed_files:
        continue
    # Read the disease text and release the file handle before the (slow) API call
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:
        text = f.read()
    # system prompt -> example question -> example answer -> actual question
    message_text = [{"role":"system","content":fifth_prompt_text},
                    {"role":"user", "content":"Text to analyze: \n\n" + example_text},
                    {"role":"assistant","content":example_text_answer_con_CoT},
                    {"role":"user","content":"Text to analyze: \n\n" + text}]
    print(f"{index+1} - Analyzing: ", condition_name)
    # Reset the answer for every disease: if the API call itself fails, the handler below must
    # not log the previous disease's answer (or hit a NameError on the very first iteration).
    GPT_answer = ""
    try:
        GPT_answer = get_response_LLM(prompt=message_text)
        df = transform_GPT_output(GPT_answer)
        df.to_excel(os.path.join(result_path_fifth_prompt, condition_name + abbreviation + ".xlsx"), sheet_name=model_name, index=False)
        print("Successfully analyzed: ", condition_name)
    except Exception as e:
        # Best effort: record the failure (with the exception message) and move on
        failed_texts_name.append(condition_name)
        failed_texts.append(GPT_answer)
        failed_texts_error_message.append(str(e))
        print("Failed to analyze: ", condition_name)

# Save failed texts next to the per-prompt result folders
failed_df = pd.DataFrame({"Disease name failed": failed_texts_name, "GPT Output": failed_texts, "Error message": failed_texts_error_message})
failed_path = os.path.join(path_to_general_results, model_name)
failed_df.to_excel(os.path.join(failed_path, "Failed texts " + prompt_name + ".xlsx"), sheet_name=model_name, index=False)
# Helper defined elsewhere in this notebook; it receives the folder holding the failure files
generate_summary_failed_texts(failed_path)

1 - Analyzing:  Acute decompensated heart failure
Successfully analyzed:  Acute decompensated heart failure
2 - Analyzing:  Acute intermittent porphyria
Successfully analyzed:  Acute intermittent porphyria
3 - Analyzing:  Anthrax
Successfully analyzed:  Anthrax
4 - Analyzing:  Arterial embolism
Successfully analyzed:  Arterial embolism
5 - Analyzing:  Arteriovenous malformation
Successfully analyzed:  Arteriovenous malformation
6 - Analyzing:  Ascites
Successfully analyzed:  Ascites
7 - Analyzing:  Autonomic dysreflexia
Successfully analyzed:  Autonomic dysreflexia
8 - Analyzing:  Benzodiazepine withdrawal syndrome
Successfully analyzed:  Benzodiazepine withdrawal syndrome
9 - Analyzing:  Blastomycosis
Successfully analyzed:  Blastomycosis
10 - Analyzing:  Breast cancer
Successfully analyzed:  Breast cancer
11 - Analyzing:  Campylobacteriosis
Successfully analyzed:  Campylobacteriosis
12 - Analyzing:  Carciac myxoma
Successfully analyzed:  Carciac myxoma
13 - Analyzing:  Carrion's dise

Unnamed: 0,Prompt,Disease name failed,GPT Output,Error message,GPT Output Part A,GPT Output Part B


# Prompt 6: Prompt Chaining
- 1. First extract all the possible terms related to the condition mentioned in the text
- 2. Classify each term according to the TUI Code and Semantic Categories and discard those that do not fit a category

In [101]:
# Prompt 6 (prompt chaining): Part A extracts the terms, Part B classifies them.
# Choose the entity dictionary that Part B will use, together with the run's name and file suffix.
if not useSimplifiedEntityDescription:
    prompt_name = "Prompt 6"
    abbreviation = "_P6"
    # Full TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description_PartB = "Basic Prompt but classifying each term into one of the categories in the TUI code and Semantic type description text."
else:
    prompt_name = "Prompt 6_simplified"
    abbreviation = "_P6_simplified"
    # Simplified TUI code and Semantic type description text
    with open(r"Prompt Engineering\TUI_description_text_simplified.txt", 'r', encoding='utf-8') as f:
        TUI_description_text = f.read()
    prompt_description_PartB = "Basic Prompt but classifying each term and manifestation into one of the simplified categories of TUI code and Semantic type description text."

# Output folder for this prompt; it is created on first use
result_path_sixth_prompt = os.path.join(path_to_general_results, model_name, prompt_name)
if not os.path.exists(result_path_sixth_prompt):
    os.makedirs(result_path_sixth_prompt)

# ---- Part A: extraction prompt ----
prompt_description_PartA = "Basic Prompt but extracting all terms related to the condition."
sixth_prompt_text_PartA = "Extract and list all the terms that are related to the condition found in the “text to analyze”."

# JSON output format that only asks for the entity
with open(r"Prompt Engineering\JSON format output with only Entity.txt", 'r', encoding='utf-8') as f:
    JSON_format_output_only_entity = f.read()

# Final Part A prompt = instruction + output format; store it
sixth_prompt_text_PartA = f"{sixth_prompt_text_PartA}\n{JSON_format_output_only_entity}"
save_prompt(prompt_name + "_PartA", sixth_prompt_text_PartA, prompt_description=prompt_description_PartA)

# ---- Part B: classification prompt ----
sixth_prompt_text_PartB = (
    "Classify each term of a list according to one of the categories in the “TUI code and Semantic type description text”. "
    "If a term cannot be classified into any of the categories in the “TUI code and Semantic type description text”, classify its TUI code and Semantic type as “Other”."
)

# JSON output format with the classification fields
with open(r"Prompt Engineering\JSON format output.txt", 'r', encoding='utf-8') as f:
    JSON_format_output = f.read()

# Final Part B prompt = instruction + output format + entity dictionary; store it
sixth_prompt_text_PartB = f"{sixth_prompt_text_PartB}\n{JSON_format_output}"
sixth_prompt_text_PartB += "\n\nTUI code and Semantic type description text: \n\n" + TUI_description_text
save_prompt(prompt_name + "_PartB", sixth_prompt_text_PartB, prompt_description=prompt_description_PartB)

# Failure bookkeeping: four parallel lists, one entry per failed disease
failed_texts_name = []
failed_texts_PartA = []
failed_texts_PartB = []
failed_texts_error_message = []

# Diseases that already have a result workbook in the output folder
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_sixth_prompt) if file.endswith(".xlsx")]

for i, disease_text in enumerate(os.listdir(path_to_texts)):
    # Results that already exist are not requested again
    if disease_text.split(".")[0] in analyzed_files:
        continue
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:
        text = f.read() # Disease text
    condition_name = disease_text.split(".")[0]

    # Placeholders, so the failure log shows how far the chain got for this disease
    GPT_answer_PartA = ""
    GPT_answer_PartB = "Not yet generated"

    # Part A request: extract the biomedical entities from the text
    message_text_PartA = [
        {"role":"system","content":sixth_prompt_text_PartA},
        {"role":"user","content":"Text to analyze: \n\n" + text},
    ]
    print(f"{i + 1} - Analyzing: ",condition_name)

    try:
        GPT_answer_PartA = get_response_LLM(prompt=message_text_PartA)
        df_PartA = transform_GPT_output(GPT_answer_PartA)
        print("\tPart A, entity extraction, success")

        # Part B request: the raw Part A answer is the user message to classify
        message_text_PartB = [
            {"role":"system","content":sixth_prompt_text_PartB},
            {"role":"user", "content":GPT_answer_PartA},
        ]
        GPT_answer_PartB = get_response_LLM(prompt=message_text_PartB)
        df_PartB = transform_GPT_output(GPT_answer_PartB)
        print("\tPart B, entity classification, success")

        # The workbook is created with the classification sheet first ...
        output_file = os.path.join(result_path_sixth_prompt, condition_name + abbreviation + ".xlsx")
        df_PartB.to_excel(output_file, sheet_name="PartB_Classification", index=False)

        # ... and the extraction sheet is then appended to the same workbook
        with pd.ExcelWriter(output_file, engine='openpyxl', mode='a') as writer:
            df_PartA.to_excel(writer, sheet_name="PartA_Extraction", index=False)

        # Every extracted term should have been classified: compare the row counts
        count_PartA = df_PartA.shape[0]
        count_PartB = df_PartB.shape[0]

        if count_PartA != count_PartB:
            # The workbook is already on disk at this point; the mismatch is reported as a failure
            print("\tThe number of rows in the DataFrames is different")
            raise Exception("The number of rows in the DataFrames is different")
        print("\tSuccessfully analyzed: ", condition_name)
    except Exception as e:
        # Keep whatever answers were obtained, plus the error text, and continue with the next disease
        failed_texts_name.append(condition_name)
        failed_texts_PartA.append(GPT_answer_PartA)
        failed_texts_PartB.append(GPT_answer_PartB)
        failed_texts_error_message.append(str(e))
        print("Failed to analyze: ", condition_name)

# Write the failure log for this prompt into the model's results folder
failed_df = pd.DataFrame({
    "Disease name failed": failed_texts_name,
    "GPT Output Part A": failed_texts_PartA,
    "GPT Output Part B": failed_texts_PartB,
    "Error message": failed_texts_error_message,
})
failed_path = os.path.join(path_to_general_results, model_name)
failed_df.to_excel(os.path.join(failed_path, "Failed texts " + prompt_name + ".xlsx"), sheet_name=model_name, index=False)
generate_summary_failed_texts(failed_path)
# Filter the results of Prompt 6, removing the entities classified as "Other" in the TUI code, and save the new files in a new folder
filter_prompt6_removeOther(result_path_sixth_prompt)

1 - Analyzing:  Acute decompensated heart failure
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Acute decompensated heart failure
2 - Analyzing:  Acute intermittent porphyria
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Acute intermittent porphyria
3 - Analyzing:  Anthrax
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Anthrax
4 - Analyzing:  Arterial embolism
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Arterial embolism
5 - Analyzing:  Arteriovenous malformation
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Arteriovenous malformation
6 - Analyzing:  Ascites
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Ascites
7 - Analyzing:  Autonomic dysreflexia
	Part A, ent

# Prompt 6: Prompt Chaining with manifestations
- 1. First extract all the possible terms and manifestations related to the condition mentioned in the text
- 2. Classify each term and manifestation according to the TUI Code and Semantic Categories and discard those that do not fit a category

In [102]:
# Choose the naming scheme and the TUI description file for this run. The
# simplified entity dictionary gets its own prompt name, so its results are
# stored in a separate folder from the full-dictionary run.
if useSimplifiedEntityDescription:
    prompt_name = "Prompt 6_simplified_plusManifestations"
    abbreviation = "_P6_simplified_plusManifestations"
    tui_description_file = r"Prompt Engineering\TUI_description_text_simplified.txt"
    prompt_description_PartB = "Basic Prompt but classifying each term and manifestation into one of the simplified categories of TUI code and Semantic type description text."
else:
    prompt_name = "Prompt 6_plusManifestations"
    abbreviation = "_P6_plusManifestations"
    tui_description_file = r"Prompt Engineering\TUI_description_text.txt"
    prompt_description_PartB = "Basic Prompt but classifying each term into one of the categories in the TUI code and Semantic type description text."

# Load the TUI code / Semantic type description text selected above
with open(tui_description_file, 'r', encoding='utf-8') as tui_file:
    TUI_description_text = tui_file.read()

# Folder where the per-disease results of this prompt are written
result_path_sixth_prompt = os.path.join(path_to_general_results, model_name, prompt_name)
if not os.path.exists(result_path_sixth_prompt):
    os.makedirs(result_path_sixth_prompt)

# ---- Part A: extraction prompt ----
prompt_description_PartA = "Basic Prompt but extracting all terms related to the condition."

# JSON output format that only asks for the entity name
with open(r"Prompt Engineering\JSON format output with only Entity.txt", 'r', encoding='utf-8') as format_file:
    JSON_format_output_only_entity = format_file.read()

# Instruction followed by the expected output format, then store the prompt
sixth_prompt_text_PartA = "\n".join([
    "Extract and list all the terms and manifestations that are related to the condition found in the “text to analyze”.",
    JSON_format_output_only_entity,
])
save_prompt(prompt_name + "_PartA", sixth_prompt_text_PartA, prompt_description=prompt_description_PartA)

# ---- Part B: classification prompt ----
# JSON output format including the classification fields
with open(r"Prompt Engineering\JSON format output.txt", 'r', encoding='utf-8') as format_file:
    JSON_format_output = format_file.read()

# Instruction + output format + the TUI description text, then store the prompt
sixth_prompt_text_PartB = (
    "Classify each term and manifestation of a list according to one of the categories in the “TUI code and Semantic type description text”. "
    "If a term cannot be classified into any of the categories in the “TUI code and Semantic type description text”, classify its TUI code and Semantic type as “Other”."
    + "\n" + JSON_format_output
    + "\n\n" + "TUI code and Semantic type description text: \n\n" + TUI_description_text
)
save_prompt(prompt_name + "_PartB", sixth_prompt_text_PartB, prompt_description=prompt_description_PartB)

# Accumulators describing every text that could not be analyzed
failed_texts_name, failed_texts_PartA, failed_texts_PartB, failed_texts_error_message = [], [], [], []

# Names of the texts that already have a result workbook in the output folder
analyzed_files = []
for result_file_name in os.listdir(result_path_sixth_prompt):
    if result_file_name.endswith(".xlsx"):
        analyzed_files.append(result_file_name.split(abbreviation)[0])

# Run the two-step chain (Part A extraction -> Part B classification) on every
# disease text that does not have a result workbook yet.
for file_number, disease_text in enumerate(os.listdir(path_to_texts), start=1):
    file_stem = disease_text.split(".")[0]
    # Resume support: texts with an existing result file are not sent again
    if file_stem in analyzed_files:
        continue

    # Read the disease text; the file is closed before any API call is made
    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as text_file:
        text = text_file.read()
    condition_name = file_stem

    # Reset the raw answers so the failure report never shows stale output
    GPT_answer_PartA = ""
    GPT_answer_PartB = "Not yet generated"

    # Part A messages: extraction instructions plus the text to analyze
    message_text_PartA = [
        {"role": "system", "content": sixth_prompt_text_PartA},
        {"role": "user", "content": "Text to analyze: \n\n" + text},
    ]
    print(f"{file_number} - Analyzing: ", condition_name)

    try:
        # Part A - extract the terms and manifestations from the text
        GPT_answer_PartA = get_response_LLM(prompt=message_text_PartA)
        df_PartA = transform_GPT_output(GPT_answer_PartA)
        print("\tPart A, entity extraction, success")

        # Part B - classify the raw Part A answer with the TUI description text
        message_text_PartB = [
            {"role": "system", "content": sixth_prompt_text_PartB},
            {"role": "user", "content": GPT_answer_PartA},
        ]
        GPT_answer_PartB = get_response_LLM(prompt=message_text_PartB)
        df_PartB = transform_GPT_output(GPT_answer_PartB)
        print("\tPart B, entity classification, success")

        # One workbook per condition: classification sheet first, then the
        # extraction sheet appended to the same file
        workbook_path = os.path.join(result_path_sixth_prompt, condition_name + abbreviation + ".xlsx")
        df_PartB.to_excel(workbook_path, sheet_name="PartB_Classification", index=False)
        with pd.ExcelWriter(workbook_path, engine='openpyxl', mode='a') as writer:
            df_PartA.to_excel(writer, sheet_name="PartA_Extraction", index=False)

        # Every extracted term is expected to come back classified.
        # NOTE(review): the workbook is written before this check, so a text
        # with mismatching counts is both saved and reported as failed, and it
        # is skipped on the next run - confirm this is intended.
        if df_PartA.shape[0] != df_PartB.shape[0]:
            print("\tThe number of rows in the DataFrames is different")
            raise Exception("The number of rows in the DataFrames is different")
        print("\tSuccessfully analyzed: ", condition_name)
    except Exception as error:
        # Record the failure together with whatever output was obtained so far
        failed_texts_name.append(condition_name)
        failed_texts_PartA.append(GPT_answer_PartA)
        failed_texts_PartB.append(GPT_answer_PartB)
        failed_texts_error_message.append(str(error))
        print("Failed to analyze: ", condition_name)

# Write the failure report for this prompt next to the per-prompt result folders
failed_path = os.path.join(path_to_general_results, model_name)
failed_report_columns = {
    "Disease name failed": failed_texts_name,
    "GPT Output Part A": failed_texts_PartA,
    "GPT Output Part B": failed_texts_PartB,
    "Error message": failed_texts_error_message,
}
failed_df = pd.DataFrame(failed_report_columns)
failed_report_file = os.path.join(failed_path, f"Failed texts {prompt_name}.xlsx")
failed_df.to_excel(failed_report_file, sheet_name=model_name, index=False)
generate_summary_failed_texts(failed_path)

# Filter the results of Prompt 6, removing the entities classified as "Other"
# in the TUI Code, and save the new files in a new folder
filter_prompt6_removeOther(result_path_sixth_prompt)

1 - Analyzing:  Acute decompensated heart failure
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Acute decompensated heart failure
2 - Analyzing:  Acute intermittent porphyria
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Acute intermittent porphyria
3 - Analyzing:  Anthrax
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Anthrax
4 - Analyzing:  Arterial embolism
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Arterial embolism
5 - Analyzing:  Arteriovenous malformation
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Arteriovenous malformation
6 - Analyzing:  Ascites
	Part A, entity extraction, success
	Part B, entity classification, success
	Successfully analyzed:  Ascites
7 - Analyzing:  Autonomic dysreflexia
	Part A, ent

# Prompt 7 - Entity One by One, doing batch processing

## Creating batch request

In [2]:
prompt_name = "Prompt 7_OneByOne"
abbreviation = "_P7_OneByOne"
prompt_description = "Prompt 7 extracts each type of identity one by one for each text."

# Folder where the batch files and per-condition results of this prompt are stored
result_path_seventh_prompt = os.path.join(path_to_general_results, model_name, prompt_name)
os.makedirs(result_path_seventh_prompt, exist_ok=True)

# Open the JSON format output
with open(r"Prompt Engineering\JSON format output.txt", 'r', encoding='utf-8') as f:
    JSON_format_output = f.read()

# Load the TUI code and Semantic type categories
path_to_TUI_seman_type_description = r"Prompt Engineering\TUI_Codes_Semantic_Types_to_look_for.xlsx"
TUI_semantic_type_description_df = pd.read_excel(path_to_TUI_seman_type_description)

# Generic template of the prompt, kept as the saved record of Prompt 7. The
# request-building loop further down in this cell overwrites
# seventh_prompt_text with a concrete prompt per (TUI code, Semantic type).
# The two sentences are joined by implicit literal concatenation: the previous
# raw triple-quoted string did not treat the trailing backslash as a line
# continuation, so a literal "\", a newline and the indentation ended up in
# the saved prompt text.
seventh_prompt_text = (
    'Extract and list all the {insert semantic type} that are related to the condition found in the "text to analyze". '
    'Classify each extracted term into TUI code {insert TUI code} and Semantic type {insert semantic type}.'
)
seventh_prompt_text = seventh_prompt_text + "\n" + JSON_format_output

save_prompt(prompt_name, seventh_prompt_text, prompt_description=prompt_description)

# Failure accumulators (not filled anywhere in this cell)
failed_texts_name = []
failed_texts = []
failed_texts_error_message = []

# Create a list of files for which the text has already been analyzed
analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_seventh_prompt) if file.endswith(".xlsx")]

# Requests are split over two batch files; token_counter is the running total
# used to decide which of the two a request goes to
batch_request_1 = []
batch_request_2 = []
token_counter = 0
def create_prompt_request(message_text, condition_name, TUI_Code, Semantic_Type):
    """
    Build one request entry for the batch file.

    The custom_id is "<prompt_name>_<condition_name>_<Semantic_Type>_<TUI_Code>",
    and the body is a chat completion request for the globally selected model
    with temperature 0 and JSON object output, using message_text as messages.
    """
    request_body = {
        "model": model_name,
        "temperature": 0,
        "response_format": {"type": "json_object"},
        "messages": message_text,
    }
    return {
        "custom_id": f"{prompt_name}_{condition_name}_{Semantic_Type}_{TUI_Code}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }

def build_one_by_one_prompt(TUI_Code, Semantic_Type):
    """
    Return the system prompt that asks for the entities of a single Semantic
    type, showing the expected JSON structure pre-filled with that TUI code
    and Semantic type.
    """
    return (
        f'Extract and list all the "{Semantic_Type}" that are related to the condition found in the "text to analyze". '
        f'Classify each extracted term as TUI code "{TUI_Code}" and Semantic type "{Semantic_Type}".\n'
        'Format the output as JSON in the following structure:\n\n'
        '[\n'
        '    {\n'
        '        "Entities": [\n'
        '            {\n'
        '                "Entity": "Entity Name 1",\n'
        f'                "TUI_Code": "{TUI_Code}",\n'
        f'                "Semantic_Type": "{Semantic_Type}"\n'
        '            },\n'
        '            {\n'
        '                "Entity": "Entity Name 2",\n'
        f'                "TUI_Code": "{TUI_Code}",\n'
        f'                "Semantic_Type": "{Semantic_Type}"\n'
        '            }\n'
        '        ]\n'
        '    }\n'
        ']\n\n'
        'Ensure each object in the "Entities" array contains the keys: "Entity", "TUI_Code", and "Semantic_Type".'
    )


# One request per (disease text, semantic category) pair
for disease_text in os.listdir(path_to_texts):
    file_stem = disease_text.split(".")[0]
    # Texts that already have a result workbook are not requested again
    if file_stem in analyzed_files:
        continue

    with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as text_file:
        text = text_file.read()  # Disease text
    condition_name = file_stem  # Name of the disease

    semantic_categories = zip(
        TUI_semantic_type_description_df["TUI Code"],
        TUI_semantic_type_description_df["Semantic Type"],
    )
    for TUI_Code, Semantic_Type in semantic_categories:
        seventh_prompt_text = build_one_by_one_prompt(TUI_Code, Semantic_Type)
        message_text = [
            {"role": "system", "content": seventh_prompt_text},
            {"role": "user", "content": "Text to analyze: \n\n" + text},
        ]
        prompt_request = create_prompt_request(message_text, condition_name, TUI_Code, Semantic_Type)
        token_counter += count_tokens_in_text(str(prompt_request))
        # The token limit is 2000000 so we split the requests in two batches:
        # everything after the running total reaches 1100000 goes to batch 2
        target_batch = batch_request_1 if token_counter < 1100000 else batch_request_2
        target_batch.append(prompt_request)

# Write each batch to its own JSONL file (one JSON-encoded request per line)
file_name_1 = f"batch_request_{prompt_name}_part1.jsonl"
file_name_2 = f"batch_request_{prompt_name}_part2.jsonl"
path_to_batch_request_part1 = os.path.join(result_path_seventh_prompt, file_name_1)
path_to_batch_request_part2 = os.path.join(result_path_seventh_prompt, file_name_2)

batch_files_to_write = (
    (path_to_batch_request_part1, batch_request_1),
    (path_to_batch_request_part2, batch_request_2),
)
for batch_file_path, batch_requests in batch_files_to_write:
    with open(batch_file_path, 'w') as jsonl_file:
        jsonl_file.writelines(json.dumps(record) + '\n' for record in batch_requests)

print("Batch 1 request saved in: ", path_to_batch_request_part1)
print("Batch 2 request saved in: ", path_to_batch_request_part2)
print("Total tokens in the batch request: ", token_counter)

Batch 1 request saved in:  Resultados de Prompts\gpt-4o-mini\Prompt 7_OneByOne\batch_request_Prompt 7_OneByOne_part1.jsonl
Batch 2 request saved in:  Resultados de Prompts\gpt-4o-mini\Prompt 7_OneByOne\batch_request_Prompt 7_OneByOne_part2.jsonl
Total tokens in the batch request:  2465676


## Sending Batch Request

In [16]:
### 1st batch request

# batch_input_file_part1 = openAI_client.files.create(
#   file=open(path_to_batch_request_part1, "rb"),
#   purpose="batch"
# )

# batch_input_file_id_part1 = batch_input_file_part1.id

# batch_info_part1 = openAI_client.batches.create(
#     input_file_id=batch_input_file_id_part1,
#     endpoint="/v1/chat/completions",
#     completion_window="24h",
#     metadata={
#       "description": "Batch request for Prompt 7_OneByOne - Part 1"
#     }
# )

### 2nd batch request

# batch_input_file_part2 = openAI_client.files.create(
#   file=open(path_to_batch_request_part2, "rb"),
#   purpose="batch"
# )

# batch_input_file_id_part2 = batch_input_file_part2.id

# batch_info_part2 = openAI_client.batches.create(
#     input_file_id=batch_input_file_id_part2,
#     endpoint="/v1/chat/completions",
#     completion_window="24h",
#     metadata={
#       "description": "Batch request for Prompt 7_OneByOne - Part 2 (take 2)"
#     }
# )

In [49]:
# Check the status of the batch requests (limit=2 on the list call)
status_of_all_batches = openAI_client.batches.list(limit=2)

# Keep only the fields of interest of each batch, as one tuple per batch
batch_info = [
    (batch.id, batch.metadata, batch.created_at, batch.errors, batch.status, batch.output_file_id)
    for batch in status_of_all_batches.data
]

# Print every field on its own line with a title; the labels follow the same
# order as the fields in the tuples above
field_titles = ("Batch ID: ", "Metadata: ", "Created at: ", "Errors: ", "Status: ", "Output file ID: ")
for batch_fields in batch_info:
    for title, value in zip(field_titles, batch_fields):
        print(title, value)
    print("\n")

Batch ID:  batch_66fee64f6808819096eefff5f6fae825
Metadata:  {'description': 'Batch request for Prompt 7_OneByOne - Part 2 (take 2)'}
Created at:  1727981135
Errors:  None
Status:  completed
Output file ID:  file-ar8HdU9P1bb1OT44oAihKFSA


Batch ID:  batch_66fece55199c8190ad396df6b2406744
Metadata:  {'description': 'Batch request for Prompt 7_OneByOne - Part 1'}
Created at:  1727974997
Errors:  None
Status:  completed
Output file ID:  file-KQJIDFd3znNkWthXD3GEdTJH




In [50]:
# Download the output file of each batch (IDs taken from the status check above)
file_response_part1 = openAI_client.files.content("file-KQJIDFd3znNkWthXD3GEdTJH")
file_response_part2 = openAI_client.files.content("file-ar8HdU9P1bb1OT44oAihKFSA")

# Destination JSONL files inside the Prompt 7 result folder
path_to_response_part1 = os.path.join(result_path_seventh_prompt, "response_part1.jsonl")
path_to_response_part2 = os.path.join(result_path_seventh_prompt, "response_part2.jsonl")

# Store the downloaded content as raw bytes
downloads = (
    (path_to_response_part1, file_response_part1),
    (path_to_response_part2, file_response_part2),
)
for response_path, file_response in downloads:
    with open(response_path, 'wb') as response_file:
        response_file.write(file_response.read())

In [52]:
# Parsing each jsonl file and saving the results in an Excel file
def process_batch_response(path_to_batch_file: str):
    """
    Parse a batch response JSONL file into a single DataFrame.

    Each non-blank line is one JSON response. The condition name is the third
    "_"-separated field of its "custom_id", and the message content of the
    first choice is converted with transform_GPT_output. The condition name is
    added to every row in a "Condition" column.

    Parameters:
        path_to_batch_file: path to the JSONL file with the batch responses.

    Returns:
        The concatenation of all per-response DataFrames, or an empty
        DataFrame with only the "Condition" column when the file holds no
        responses.
    """
    dfs = []
    # The file is stored as the raw bytes returned by the API; read it as
    # UTF-8 (the JSON standard encoding) instead of the platform default,
    # which would garble non-ASCII characters on e.g. Windows.
    with open(path_to_batch_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not responses
            if not line.strip():
                continue
            response = json.loads(line)
            condition_name = response["custom_id"].split("_")[2]
            GPT_answer = response['response']['body']['choices'][0]['message']['content']
            df = transform_GPT_output(GPT_answer)
            # Add the condition name as a column in the DataFrame
            df["Condition"] = condition_name
            dfs.append(df)
    # pd.concat raises ValueError on an empty list
    if not dfs:
        return pd.DataFrame(columns=["Condition"])
    return pd.concat(dfs, ignore_index=True)

# Parse both response files and stack them into a single DataFrame
accum_df1 = process_batch_response(path_to_response_part1)
accum_df2 = process_batch_response(path_to_response_part2)
accum_df = pd.concat([accum_df1, accum_df2], ignore_index=True)

# Write one workbook per condition, in order of first appearance, counting the
# duplicated rows that get dropped along the way
accum_duplicates = 0
for condition, condition_rows in accum_df.groupby("Condition", sort=False):
    # The condition is encoded in the file name, so the column is not stored
    condition_df = condition_rows.drop(columns=["Condition"])
    # Keep the first occurrence of each row; count the repeats being removed
    repeated_rows = condition_df.duplicated()
    accum_duplicates += repeated_rows.sum()
    condition_df = condition_df[~repeated_rows]
    condition_workbook = os.path.join(result_path_seventh_prompt, condition + abbreviation + ".xlsx")
    condition_df.to_excel(condition_workbook, sheet_name=model_name, index=False)
print("Total duplicates removed: ", accum_duplicates)

Total duplicates removed:  3


# Function to generate a folder with the complete prompt plus text

In [48]:
create_full_prompt_record_production()