{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setting up LLM model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using gpt-4o-mini model\n"
]
}
],
"source": [
"from openai import OpenAI\n",
"import pandas as pd\n",
"import json\n",
"import os\n",
"import re\n",
"import tiktoken\n",
"\n",
"# Establishing connection to OpenAI\n",
"openAI_client = OpenAI() # defaults to os.environ.get(\"OPENAI_API_KEY\")\n",
"model_name = \"gpt-4o-mini\" # CHANGE THIS TO USE A DIFFERENT MODEL, for example \"gpt-4o\"\n",
"\n",
"print(f\"Using {model_name} model\")\n",
"\n",
"def get_response_LLM(prompt):\n",
" \"\"\"\n",
" Get response from LLM model\n",
" \"\"\"\n",
" completion = openAI_client.chat.completions.create(\n",
" model = model_name,\n",
" response_format={ \"type\": \"json_object\" },\n",
" messages = prompt,\n",
" temperature = 0,\n",
" )\n",
" return completion.choices[0].message.content\n",
"\n",
"# Paths to the files\n",
"path_to_texts_old = r\"[TFM - MUCD] Textos de las enfermedades\\100 Texts LLM TFM\" # Old path to the texts without cleaning\n",
"path_to_texts = r\"[TFM - MUCD] Textos de las enfermedades\\100 Clean Disease Texts\" # Path to the texts after cleaning, these are actually used\n",
"failed_texts_path_to_store = r\"Resultados de Prompts\\Failed Texts from each Prompt\"\n",
"path_to_general_results = r\"Resultados de Prompts\"\n",
"\n",
"# Use a simplified entity dictionary starting from Prompt #2\n",
"useSimplifiedEntityDescription = False\n",
"\n",
"# Auxiliary function to transform ChatGPT output into JSON using the first key\n",
"def transform_GPT_output(answer_ChatGPT):\n",
" \n",
" answer_ChatGPT_json = json.loads(answer_ChatGPT)\n",
" first_key = list(answer_ChatGPT_json.keys())[0]\n",
" # if the the values of the first key are empty, then return a blank DataFrame\n",
" if not answer_ChatGPT_json[first_key]:\n",
" return pd.DataFrame()\n",
" filtered_json = answer_ChatGPT_json[first_key]\n",
" return pd.DataFrame(filtered_json)\n",
"\n",
"# Auxiliary function to rename files in a folder that have ÔÇô for a –\n",
"def fix_file_list(path: str):\n",
" \"\"\"\n",
" This function receives a path to a folder and changes the name of the files in the folder that have ÔÇô for a –.\n",
" It was only used once to fix the names of the files in the folder with the texts of the diseases.\n",
" \"\"\"\n",
" for filename in os.listdir(path):\n",
" if \"ÔÇô\" in filename:\n",
" new_filename = filename.replace(\"ÔÇô\", \"–\")\n",
" os.rename(os.path.join(path, filename), os.path.join(path, new_filename))\n",
"\n",
"def save_prompt(prompt_name, prompt_message, prompt_description):\n",
" \"\"\"\n",
" This function saves a prompt message to a file as txt file and updates the Prompt Summary Excel file with the new prompt\n",
" \"\"\"\n",
" path_to_save_prompt = r\"Prompt Engineering\\List of Prompts\"\n",
" with open(os.path.join(path_to_save_prompt, prompt_name + \".txt\"), \"w\") as file:\n",
" file.write(prompt_message)\n",
" # Transform the prompt message and name into a DataFrame\n",
" prompt_df = pd.DataFrame({\"Prompt\": [prompt_name], \"Prompt Message\": [prompt_message], \"Prompt Description\": [prompt_description]})\n",
" prompt_summary_path = r\"Prompt Engineering\\List of Prompts\\Prompt_Summary.xlsx\"\n",
" if os.path.exists(prompt_summary_path):\n",
" prompt_summary_df = pd.read_excel(prompt_summary_path)\n",
" else:\n",
" prompt_summary_df = pd.DataFrame(columns=[\"Prompt\", \"Prompt Message\", \"Prompt Description\"])\n",
" # Add the new prompt to the DataFrame\n",
" prompt_summary_df = pd.concat([prompt_summary_df, prompt_df], ignore_index=True)\n",
" # Remove the duplicates as per column \"Prompt\"\n",
" prompt_summary_df = prompt_summary_df.drop_duplicates(subset=[\"Prompt\"], keep=\"last\")\n",
" # Save the DataFrame to the Excel file\n",
" prompt_summary_df.to_excel(prompt_summary_path, index=False)\n",
" create_full_prompt_record_production()\n",
"\n",
"def filter_prompt6_removeOther(path_to_prompt_results):\n",
" \"\"\"\n",
" This function filters the results of Prompt 6 removing the entities classified as \"Other\" in the TUI Code.\n",
" They were classified as \"Other\" because they were not in the list of TUI Codes used in the prompt.\n",
" The new files are saved in a new folder with the suffix \"_OtherRemoved\"\n",
" \"\"\"\n",
" # Get the folder where the prompt results are stored\n",
" abbreviation = \"_OtherRemoved\"\n",
" folder_name = os.path.basename(path_to_prompt_results)\n",
" new_folder_name = folder_name + abbreviation\n",
" # Create the new folder if it does not exist\n",
" new_path_to_prompt_results = os.path.join(os.path.dirname(path_to_prompt_results), new_folder_name)\n",
" if not os.path.exists(new_path_to_prompt_results):\n",
" os.makedirs(new_path_to_prompt_results)\n",
" # Loop through the files in the folder\n",
" for file in os.listdir(path_to_prompt_results):\n",
" if file.endswith(\".xlsx\"):\n",
" file_name = file.split(\".\")[0]\n",
" new_file_name = file_name + abbreviation + \".xlsx\"\n",
" df_PartB = pd.read_excel(os.path.join(path_to_prompt_results, file), sheet_name=\"PartB_Classification\")\n",
" df_PartA = pd.read_excel(os.path.join(path_to_prompt_results, file), sheet_name=\"PartA_Extraction\")\n",
" df_PartB = df_PartB[df_PartB[\"TUI_Code\"] != \"Other\"]\n",
" # Save the DataFrames of part B into an Excel file in the new folder\n",
" df_PartB.to_excel(os.path.join(new_path_to_prompt_results, new_file_name), sheet_name=\"PartB_Classification\" ,index=False)\n",
" # Save the df_PartA in the same excel file but in a different sheet\n",
" with pd.ExcelWriter(os.path.join(new_path_to_prompt_results, new_file_name), engine='openpyxl', mode='a') as writer:\n",
" df_PartA.to_excel(writer, sheet_name=\"PartA_Extraction\", index=False)\n",
"\n",
"def count_tokens_in_text(text: str, model = 'gpt-3.5-turbo'):\n",
" \"\"\"\n",
" This function counts the number of tokens in a text for a given model.\n",
" The default model is GPT-3.5-turbo. Use 'gpt-4o' for Chat GPT-4o or any other model.\n",
" The function uses the tiktoken library to tokenize the text.\n",
" \"\"\"\n",
" # Initialize the tokenizer for the model\n",
" encoding = tiktoken.encoding_for_model(model)\n",
" # Tokenize the text\n",
" tokens = encoding.encode(text)\n",
" # Count the number of tokens and return it\n",
" return len(tokens)\n",
"\n",
"def transform_df_into_json(df):\n",
" \"\"\"\n",
" This function transforms a DataFrame into a JSON object\n",
" \"\"\"\n",
" return df.to_json(orient='records')\n",
"\n",
"def generate_summary_failed_texts(path_to_model_results: str):\n",
" df_failed_texts_summary = pd.DataFrame()\n",
" for file in os.listdir(path_to_model_results):\n",
" # if file contains \"Failed texts\" in the name\n",
" if \"Failed texts\" in file:\n",
" # Open the file\n",
" df = pd.read_excel(os.path.join(path_to_model_results, file), sheet_name=0)\n",
" prompt_name = re.search(r'Prompt .*?(?=\\.\\w+$)', file)\n",
" if prompt_name:\n",
" prompt_name = prompt_name.group()\n",
" else:\n",
" prompt_name = \"Error in parsing prompt_name\"\n",
" df[\"Prompt\"] = prompt_name\n",
" # Put the prompt column in the first position\n",
" columns = df.columns.tolist()\n",
" columns = columns[-1:] + columns[:-1]\n",
" df = df[columns]\n",
" df_failed_texts_summary = pd.concat([df_failed_texts_summary, df])\n",
" # Save the summary of the failed texts\n",
" df_failed_texts_summary.to_excel(os.path.join(path_to_model_results, \"Failed Texts Summary.xlsx\"), index=False)\n",
" return df_failed_texts_summary\n",
"\n",
"def create_full_prompt_record_production():\n",
" path_detailed_prompt_production_sink = r\"Prompt Engineering\\Detailed Prompt Production\"\n",
" path_to_prompts = r\"Prompt Engineering\\List of Prompts\"\n",
"\n",
" for prompt in os.listdir(path_to_prompts):\n",
" if prompt.endswith(\".txt\"):\n",
" with open(os.path.join(path_to_prompts, prompt), 'r', encoding='utf-8') as f:\n",
" prompt_text = f.read()\n",
" prompt_name = prompt.split(\".\")[0]\n",
" # Create the folder with the prompt name in the production sink path if it does not exist\n",
" path_to_production_prompt_sink = os.path.join(path_detailed_prompt_production_sink, prompt_name)\n",
" if not os.path.exists(path_to_production_prompt_sink):\n",
" os.makedirs(path_to_production_prompt_sink)\n",
"\n",
" for condition in os.listdir(path_to_texts):\n",
" condition_name = condition.split(\".\")[0]\n",
" with open(os.path.join(path_to_texts, condition), 'r', encoding='utf-8') as f:\n",
" text = f.read()\n",
" full_prompt = prompt_text + \"\\n\\n\" + \"text to analyze: \\n\\n\" + text\n",
" # Save the full_prompt in the production sink folder\n",
" with open(os.path.join(path_to_production_prompt_sink, condition_name + \"_\" + prompt_name + \".txt\"), 'w', encoding='utf-8') as f:\n",
" f.write(full_prompt)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Processing Texts, cleaning them (only do once)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"# def clean_text(text):\n",
"# # Remove [number] or [number, number] citations\n",
"# text = re.sub(r'\\[\\d+(,\\s*\\d+)*\\]', '', text)\n",
" \n",
"# # Replace all occurrences of '&/or' with 'and/or'\n",
"# text = re.sub(r'&/or', 'and/or', text)\n",
" \n",
"# # Remove incomplete bracketed numbers like [11 or similar cases\n",
"# text = re.sub(r'\\[\\d+', '', text)\n",
" \n",
"# # Remove non-numeric bracketed terms like [citation needed], [edit], ?, !, etc.\n",
"# text = re.sub(r'\\[[a-zA-Z\\s?.,!]+\\]', '', text)\n",
" \n",
"# # Add spaces around the & symbol if missing\n",
"# text = re.sub(r'\\s*&\\s*', ' & ', text)\n",
" \n",
"# # Remove any leftover unclosed brackets (e.g., [ or ] without a pair)\n",
"# text = text.replace('[', '').replace(']', '')\n",
" \n",
"# # Add a period after each paragraph if missing\n",
"# text = re.sub(r'([^\\.\\n])(\\n)', r'\\1.\\2', text)\n",
"\n",
"# # Ensure consistent spacing after periods and commas\n",
"# text = re.sub(r'([.,;:!?])([^\\s])', r'\\1 \\2', text)\n",
"\n",
"# # Replace multiple spaces with a single space within paragraphs\n",
"# text = re.sub(r'([^\\n\\S]+)', ' ', text)\n",
"\n",
"# # Correct capitalization after periods\n",
"# text = re.sub(r'(\\. )([a-z])', lambda m: m.group(1) + m.group(2).upper(), text)\n",
"\n",
"# # Handle percentage values (e.g., \"70 of cases\" -> \"70% of cases\")\n",
"# text = re.sub(r'(\\d+)\\s*of cases', r'\\1% of cases', text)\n",
" \n",
"# # Ensure correct abbreviation punctuation (e.g., \"e.g\" -> \"e.g.\")\n",
"# text = re.sub(r'\\b(e\\.g|i\\.e|etc)\\b', r'\\1.', text)\n",
" \n",
"# # Add degree symbol for temperatures (e.g., \"25 C\" -> \"25°C\")\n",
"# text = re.sub(r'(\\d+)\\s*C', r'\\1°C', text)\n",
" \n",
"# # Ensure proper formatting for numeric ranges (e.g., \"69-89\" -> \"69 – 89\")\n",
"# text = re.sub(r'(\\d+)-(\\d+)', r'\\1 – \\2', text)\n",
" \n",
"# # Remove extra punctuation like double commas or periods\n",
"# text = re.sub(r'\\s*[.,!?;:]{2,}\\s*', '. ', text)\n",
"\n",
"# # Remove spaces between numbers and commas (e.g., \"25, 000\" -> \"25,000\")\n",
"# text = re.sub(r'(\\d+),\\s+(\\d+)', r'\\1,\\2', text)\n",
"\n",
"# # Remove extra spaces before periods, commas, semicolons, etc.\n",
"# text = re.sub(r'\\s+([.,!?;:])', r'\\1', text)\n",
" \n",
"# # Replace double periods with a single period\n",
"# text = re.sub(r'\\.\\.', '.', text)\n",
" \n",
"# # Remove unnecessary spaces inside parentheses (e.g., \"i.e. ,\" -> \"i.e.,\")\n",
"# text = re.sub(r'\\(\\s*', '(', text)\n",
"# text = re.sub(r'\\s*\\)', ')', text)\n",
" \n",
"# # Remove spaces around equals signs and hyphens (e.g., \"-itis\" = inflammation -> \"-itis = inflammation\")\n",
"# text = re.sub(r'\\s*=\\s*', ' = ', text)\n",
"# text = re.sub(r'\\s*-\\s*', '-', text)\n",
" \n",
"# # Correct \"i. E.\" to \"i.e.\"\n",
"# text = re.sub(r'\\bi\\.\\s*E\\.', 'i.e.', text, flags=re.IGNORECASE)\n",
"\n",
"# # Correct \"i. e. \" to \"i.e.\"\n",
"# text = re.sub(r'\\bi\\.\\s*e\\.\\s*', 'i.e. ', text)\n",
"\n",
"# # Remove space between '=' and '>'\n",
"# text = re.sub(r'\\=\\s+\\>', '=>', text)\n",
"\n",
"# # Remove the period after a colon at the end of a paragraph\n",
"# text = re.sub(r'(:)\\.\\s*$', r'\\1', text, flags=re.MULTILINE)\n",
"\n",
"# # Remove space after a period when surrounded by numbers (e.g., \"99. 0\" -> \"99.0\")\n",
"# text = re.sub(r'(\\d+)\\.\\s+(\\d+)', r'\\1.\\2', text)\n",
" \n",
"# # Add a bullet point before the first occurrence after a period or a newline, followed by an '&'\n",
"# text = re.sub(r'(?<=[\\.\\n])\\s*(&)\\s*', r'\\n- ', text)\n",
" \n",
"# # Replace all remaining '&' symbols with bullet points\n",
"# text = re.sub(r'\\s*&\\s*', r'\\n- ', text)\n",
"\n",
"# # Find a sentence preceded by a line break and followed by a line break with a bullet point\n",
"# text = re.sub(r'(\\n)([^\\n]+)(\\n-)', r'\\1- \\2\\3', text)\n",
"\n",
"# # Replace any duplicate bullet points like '- -' with a single bullet point '-'\n",
"# text = re.sub(r'-\\s*-', '-', text)\n",
"\n",
"# return text\n",
"\n",
"# # Apply the cleaning to each text\n",
"# df_texts = pd.DataFrame()\n",
"# path_to_save_clean_texts = r\"[TFM - MUCD] Textos de las enfermedades\\100 Clean Disease Texts\"\n",
"# for index, disease_text in enumerate(os.listdir(path_to_texts_old)):\n",
"# with open(os.path.join(path_to_texts_old, disease_text), 'r', encoding='utf-8') as f:\n",
"# disease_title = disease_text.split(\".\")[0]\n",
"# text = f.read()\n",
"# cleanText = clean_text(text)\n",
"# # Save the cleanText to a text file\n",
"# with open(os.path.join(path_to_save_clean_texts, disease_text), 'w', encoding='utf-8') as file:\n",
"# file.write(cleanText)\n",
"# df_texts = pd.concat([df_texts, pd.DataFrame({\"Original Text\": [text], \"Clean Text\": [cleanText]})], ignore_index=True)\n",
"\n",
"# # Save the cleaned texts to a new Excel file\n",
"# df_texts.to_excel(r\"[TFM - MUCD] Textos de las enfermedades\\Textos Limpios.xlsx\", index=False)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Counting tokens of text and results - Only do once if needed"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"# # Function to count the number of tokens in the results of the prompts\n",
"\n",
"# path_to_results = r\"Resultados de Prompts\\GPT35turbo\"\n",
"# token_accumulator = 0\n",
"# for folder in os.listdir(path_to_results):\n",
"# path_to_folder = os.path.join(path_to_results, folder)\n",
"# # if folder is a folder and doesn't contain \"DON'T USE\" in the name\n",
"# if os.path.isdir(path_to_folder) and \"DON'T USE\" not in folder:\n",
"# for file in os.listdir(path_to_folder):\n",
"# if file.endswith(\".xlsx\"):\n",
"# path_to_file = os.path.join(path_to_folder, file)\n",
"# df = pd.read_excel(path_to_file)\n",
"# df_json = transform_df_into_json(df)\n",
"# token_prompt = count_tokens_in_text(df_json)\n",
"# token_accumulator += token_prompt\n",
"# print(token_accumulator)\n",
"# token_accumulator = 0\n",
"# df_store_size_of_texts = pd.DataFrame(columns=[\"Disease\", \"Number of Words\", \"Number of Tokens\"])\n",
"\n",
"# # Function to count the number of tokens in the texts, it's auxiliary\n",
"\n",
"# for file in os.listdir(path_to_texts):\n",
"# if file.endswith(\".txt\"):\n",
"# # Read the text file\n",
"# with open(os.path.join(path_to_texts, file), \"r\", encoding=\"utf-8\") as f:\n",
"# text = f.read()\n",
"# # Get the file name without the extension\n",
"# file_name = file.split(\".\")[0]\n",
"# # Count the number of words in the text\n",
"# num_words = len(text.split())\n",
"# # Count the number of tokens in the text\n",
"# num_tokens = count_tokens_in_text(text)\n",
"# # Append the results to the DataFrame\n",
"# df_store_size_of_texts = pd.concat([df_store_size_of_texts, pd.DataFrame({\"Disease\": [file_name], \"Number of Words\": [num_words], \"Number of Tokens\": [num_tokens]})], ignore_index=True)\n",
"# # Save the results to an Excel file\n",
"# path_to_save = r\"[TFM - MUCD] Textos de las enfermedades\"\n",
"# df_store_size_of_texts.to_excel(os.path.join(path_to_save, \"Texts Length and Number of Tokens.xlsx\"), index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 1: Zero-shot Learning"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Resultados de Prompts\\gpt-4o-mini\\Prompt 1\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Prompt | \n",
" Disease name failed | \n",
" GPT Output | \n",
" Error message | \n",
" GPT Output Part A | \n",
" GPT Output Part B | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Prompt, Disease name failed, GPT Output, Error message, GPT Output Part A, GPT Output Part B]\n",
"Index: []"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prompt_name = \"Prompt 1\"\n",
"abbreviation = \"_P1\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_primer_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"print(result_path_primer_prompt)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_primer_prompt):\n",
" os.makedirs(result_path_primer_prompt)\n",
"\n",
"# Primer prompt text\n",
"primer_prompt_text = \"Extract and list all the phenotypic manifestations of the condition found \\\n",
"in the “text to analyze”. Classify each extracted term according to its corresponding TUI code and semantic type.\"\n",
"\n",
"prompt_description = \"Basic Prompt\"\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output = f.read()\n",
"\n",
"primer_prompt_text = primer_prompt_text + \"\\n\" + JSON_format_output\n",
"save_prompt(prompt_name, primer_prompt_text, prompt_description=prompt_description)\n",
"\n",
"failed_texts_name = []\n",
"failed_texts = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_primer_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for index, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read()\n",
" condition_name = disease_text.split(\".\")[0]\n",
" message_text = [{\"role\":\"system\",\"content\":primer_prompt_text},\n",
" {\"role\":\"user\", \"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" print(f\"{index+1} - Analyzing disease: \", condition_name)\n",
" # break\n",
" try:\n",
" GPT_answer = get_response_LLM(prompt=message_text)\n",
" df = transform_GPT_output(GPT_answer)\n",
" df.to_excel(os.path.join(result_path_primer_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=model_name ,index=False)\n",
" print(\"Disease analyzed and saved: \", condition_name)\n",
" # break\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts.append(GPT_answer)\n",
" # Save the error message from the exception\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze disease: \", condition_name)\n",
" continue\n",
"\n",
"# Saving failed texts summary\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output\": failed_texts, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 1: Zero-shot Learning with guidance"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Prompt | \n",
" Disease name failed | \n",
" GPT Output | \n",
" Error message | \n",
" GPT Output Part A | \n",
" GPT Output Part B | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Prompt, Disease name failed, GPT Output, Error message, GPT Output Part A, GPT Output Part B]\n",
"Index: []"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prompt_name = \"Prompt 1_guided\"\n",
"abbreviation = \"_P1_guided\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_seven_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_seven_prompt):\n",
" os.makedirs(result_path_seven_prompt)\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output = f.read()\n",
"\n",
"# Creating and saving the prompt\n",
"primer_prompt_text = \"\"\"Extract and list the terms found in the “text to analyze” that are phenotypic manifestations of {condition_name}.\n",
"Classify each extracted term according to its corresponding TUI code and Semantic type.\"\"\"\n",
"primer_prompt_text = primer_prompt_text + \"\\n\" + JSON_format_output\n",
"prompt_description = \"Basic Prompt but saying in advance the condition to analyze in the text.\"\n",
"save_prompt(prompt_name, primer_prompt_text, prompt_description=prompt_description)\n",
"\n",
"failed_texts_name = []\n",
"failed_texts = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_seven_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for index, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read()\n",
" condition_name = disease_text.split(\".\")[0]\n",
" # Primer prompt text\n",
" primer_prompt_text = f\"\"\"Extract and list the terms found in the “text to analyze” that are phenotypic manifestations of {condition_name}.\n",
" Classify each extracted term according to its corresponding TUI code and semantic type.\"\"\"\n",
" primer_prompt_text = primer_prompt_text + \"\\n\" + JSON_format_output\n",
"\n",
" message_text = [{\"role\":\"system\",\"content\":primer_prompt_text},\n",
" {\"role\":\"user\", \"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" print(f\"{index+1} - Analyzing disease: \", condition_name)\n",
" # break\n",
" try:\n",
" GPT_answer = get_response_LLM(prompt=message_text)\n",
" df = transform_GPT_output(GPT_answer)\n",
" df.to_excel(os.path.join(result_path_seven_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=model_name ,index=False)\n",
" print(\"Disease analyzed and saved: \", condition_name)\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts.append(GPT_answer)\n",
" # Save the error message from the exception\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze disease: \", condition_name)\n",
" continue\n",
"\n",
"# Saving failed texts summary\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output\": failed_texts, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 2: Zero-Shot Learning + Entity Dictionary"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 - Analyzing disease: Acute decompensated heart failure\n",
"Disease analyzed and saved: Acute decompensated heart failure\n",
"2 - Analyzing disease: Acute intermittent porphyria\n",
"Disease analyzed and saved: Acute intermittent porphyria\n",
"3 - Analyzing disease: Anthrax\n",
"Disease analyzed and saved: Anthrax\n",
"4 - Analyzing disease: Arterial embolism\n",
"Disease analyzed and saved: Arterial embolism\n",
"5 - Analyzing disease: Arteriovenous malformation\n",
"Disease analyzed and saved: Arteriovenous malformation\n",
"6 - Analyzing disease: Ascites\n",
"Disease analyzed and saved: Ascites\n",
"7 - Analyzing disease: Autonomic dysreflexia\n",
"Disease analyzed and saved: Autonomic dysreflexia\n",
"8 - Analyzing disease: Benzodiazepine withdrawal syndrome\n",
"Disease analyzed and saved: Benzodiazepine withdrawal syndrome\n",
"9 - Analyzing disease: Blastomycosis\n",
"Disease analyzed and saved: Blastomycosis\n",
"10 - Analyzing disease: Breast cancer\n",
"Disease analyzed and saved: Breast cancer\n",
"11 - Analyzing disease: Campylobacteriosis\n",
"Disease analyzed and saved: Campylobacteriosis\n",
"12 - Analyzing disease: Carciac myxoma\n",
"Disease analyzed and saved: Carciac myxoma\n",
"13 - Analyzing disease: Carrion's disease\n",
"Disease analyzed and saved: Carrion's disease\n",
"14 - Analyzing disease: Cerebral salt-wasting syndrome\n",
"Disease analyzed and saved: Cerebral salt-wasting syndrome\n",
"15 - Analyzing disease: Cerebrovascular disease\n",
"Disease analyzed and saved: Cerebrovascular disease\n",
"16 - Analyzing disease: Chlamydia infection\n",
"Disease analyzed and saved: Chlamydia infection\n",
"17 - Analyzing disease: Cholesterol embolism\n",
"Disease analyzed and saved: Cholesterol embolism\n",
"18 - Analyzing disease: Coccidioidomycosis\n",
"Disease analyzed and saved: Coccidioidomycosis\n",
"19 - Analyzing disease: Conversion disorder\n",
"Disease analyzed and saved: Conversion disorder\n",
"20 - Analyzing disease: Cryoglobulinemia\n",
"Disease analyzed and saved: Cryoglobulinemia\n",
"21 - Analyzing disease: Diphtheria\n",
"Disease analyzed and saved: Diphtheria\n",
"22 - Analyzing disease: Erysipelas\n",
"Disease analyzed and saved: Erysipelas\n",
"23 - Analyzing disease: Erythema nodosum\n",
"Disease analyzed and saved: Erythema nodosum\n",
"24 - Analyzing disease: Ethylene glycol poisoning\n",
"Disease analyzed and saved: Ethylene glycol poisoning\n",
"25 - Analyzing disease: Felty's syndrome\n",
"Disease analyzed and saved: Felty's syndrome\n",
"26 - Analyzing disease: Food intolerance\n",
"Disease analyzed and saved: Food intolerance\n",
"27 - Analyzing disease: Gastroparesis\n",
"Disease analyzed and saved: Gastroparesis\n",
"28 - Analyzing disease: Generalized anxiety disorder\n",
"Disease analyzed and saved: Generalized anxiety disorder\n",
"29 - Analyzing disease: GM1 gangliosidoses\n",
"Disease analyzed and saved: GM1 gangliosidoses\n",
"30 - Analyzing disease: Helicobacter pylori\n",
"Disease analyzed and saved: Helicobacter pylori\n",
"31 - Analyzing disease: Hemolytic-uremic syndrome\n",
"Disease analyzed and saved: Hemolytic-uremic syndrome\n",
"32 - Analyzing disease: Herpes labialis\n",
"Disease analyzed and saved: Herpes labialis\n",
"33 - Analyzing disease: Hypercalcaemia\n",
"Disease analyzed and saved: Hypercalcaemia\n",
"34 - Analyzing disease: Hyperosmolar hyperglycemic state\n",
"Disease analyzed and saved: Hyperosmolar hyperglycemic state\n",
"35 - Analyzing disease: Hypervitaminosis A\n",
"Disease analyzed and saved: Hypervitaminosis A\n",
"36 - Analyzing disease: Hypocalcaemia\n",
"Disease analyzed and saved: Hypocalcaemia\n",
"37 - Analyzing disease: Hypomagnesemia\n",
"Disease analyzed and saved: Hypomagnesemia\n",
"38 - Analyzing disease: Hypovolemia\n",
"Disease analyzed and saved: Hypovolemia\n",
"39 - Analyzing disease: Inborn error of metabolism\n",
"Disease analyzed and saved: Inborn error of metabolism\n",
"40 - Analyzing disease: Influenza\n",
"Disease analyzed and saved: Influenza\n",
"41 - Analyzing disease: Intention tremor\n",
"Disease analyzed and saved: Intention tremor\n",
"42 - Analyzing disease: Intraparenchymal hemorrhage\n",
"Disease analyzed and saved: Intraparenchymal hemorrhage\n",
"43 - Analyzing disease: Itch\n",
"Disease analyzed and saved: Itch\n",
"44 - Analyzing disease: Juvenile dermatomyositis\n",
"Disease analyzed and saved: Juvenile dermatomyositis\n",
"45 - Analyzing disease: Kaposi's sarcoma\n",
"Disease analyzed and saved: Kaposi's sarcoma\n",
"46 - Analyzing disease: Lambert–Eaton myasthenic syndrome\n",
"Disease analyzed and saved: Lambert–Eaton myasthenic syndrome\n",
"47 - Analyzing disease: Laryngitis\n",
"Disease analyzed and saved: Laryngitis\n",
"48 - Analyzing disease: Lateral medullary syndrome\n",
"Disease analyzed and saved: Lateral medullary syndrome\n",
"49 - Analyzing disease: MERRF syndrome\n",
"Disease analyzed and saved: MERRF syndrome\n",
"50 - Analyzing disease: Metal fume fever\n",
"Disease analyzed and saved: Metal fume fever\n",
"51 - Analyzing disease: Morvan's syndrome\n",
"Disease analyzed and saved: Morvan's syndrome\n",
"52 - Analyzing disease: Myocarditis\n",
"Disease analyzed and saved: Myocarditis\n",
"53 - Analyzing disease: Necatoriasis\n",
"Disease analyzed and saved: Necatoriasis\n",
"54 - Analyzing disease: Nicotine poisoning\n",
"Disease analyzed and saved: Nicotine poisoning\n",
"55 - Analyzing disease: Non-alcoholic fatty liver disease\n",
"Disease analyzed and saved: Non-alcoholic fatty liver disease\n",
"56 - Analyzing disease: Non-small-cell lung carcinoma\n",
"Disease analyzed and saved: Non-small-cell lung carcinoma\n",
"57 - Analyzing disease: Normal pressure hydrocephalus\n",
"Disease analyzed and saved: Normal pressure hydrocephalus\n",
"58 - Analyzing disease: Obesity hypoventilation syndrome\n",
"Disease analyzed and saved: Obesity hypoventilation syndrome\n",
"59 - Analyzing disease: Opioid use disorder\n",
"Disease analyzed and saved: Opioid use disorder\n",
"60 - Analyzing disease: Optic neuritis\n",
"Disease analyzed and saved: Optic neuritis\n",
"61 - Analyzing disease: Orofacial granulomatosis\n",
"Disease analyzed and saved: Orofacial granulomatosis\n",
"62 - Analyzing disease: Orthostatic hypotension\n",
"Disease analyzed and saved: Orthostatic hypotension\n",
"63 - Analyzing disease: Pancreatic cancer\n",
"Disease analyzed and saved: Pancreatic cancer\n",
"64 - Analyzing disease: Panic attack\n",
"Disease analyzed and saved: Panic attack\n",
"65 - Analyzing disease: Paratyphoid fever\n",
"Disease analyzed and saved: Paratyphoid fever\n",
"66 - Analyzing disease: Parry–Romberg syndrome\n",
"Disease analyzed and saved: Parry–Romberg syndrome\n",
"67 - Analyzing disease: Pituitary apoplexy\n",
"Disease analyzed and saved: Pituitary apoplexy\n",
"68 - Analyzing disease: Polyarteritis nodosa\n",
"Disease analyzed and saved: Polyarteritis nodosa\n",
"69 - Analyzing disease: Porencephaly\n",
"Disease analyzed and saved: Porencephaly\n",
"70 - Analyzing disease: Prediabetes\n",
"Disease analyzed and saved: Prediabetes\n",
"71 - Analyzing disease: Pregnancy\n",
"Disease analyzed and saved: Pregnancy\n",
"72 - Analyzing disease: Premenstrual syndrome\n",
"Disease analyzed and saved: Premenstrual syndrome\n",
"73 - Analyzing disease: Primary hyperparathyroidism\n",
"Disease analyzed and saved: Primary hyperparathyroidism\n",
"74 - Analyzing disease: Primary sclerosing cholangitis\n",
"Disease analyzed and saved: Primary sclerosing cholangitis\n",
"75 - Analyzing disease: Reactive hypoglycemia\n",
"Disease analyzed and saved: Reactive hypoglycemia\n",
"76 - Analyzing disease: Reflex syncope\n",
"Disease analyzed and saved: Reflex syncope\n",
"77 - Analyzing disease: Ross River fever\n",
"Disease analyzed and saved: Ross River fever\n",
"78 - Analyzing disease: Rubella\n",
"Disease analyzed and saved: Rubella\n",
"79 - Analyzing disease: Scarlet fever\n",
"Disease analyzed and saved: Scarlet fever\n",
"80 - Analyzing disease: Scleroderma\n",
"Disease analyzed and saved: Scleroderma\n",
"81 - Analyzing disease: Snakebite\n",
"Disease analyzed and saved: Snakebite\n",
"82 - Analyzing disease: Soy allergy\n",
"Disease analyzed and saved: Soy allergy\n",
"83 - Analyzing disease: Streptococcal pharyngitis\n",
"Disease analyzed and saved: Streptococcal pharyngitis\n",
"84 - Analyzing disease: Subdural hematoma\n",
"Disease analyzed and saved: Subdural hematoma\n",
"85 - Analyzing disease: Superior mesenteric artery syndrome\n",
"Disease analyzed and saved: Superior mesenteric artery syndrome\n",
"86 - Analyzing disease: Taeniasis\n",
"Disease analyzed and saved: Taeniasis\n",
"87 - Analyzing disease: Tetanus\n",
"Disease analyzed and saved: Tetanus\n",
"88 - Analyzing disease: Tethered spinal cord syndrome\n",
"Disease analyzed and saved: Tethered spinal cord syndrome\n",
"89 - Analyzing disease: Thyroid storm\n",
"Disease analyzed and saved: Thyroid storm\n",
"90 - Analyzing disease: Thyrotoxic periodic paralysis\n",
"Disease analyzed and saved: Thyrotoxic periodic paralysis\n",
"91 - Analyzing disease: Tonsillitis\n",
"Disease analyzed and saved: Tonsillitis\n",
"92 - Analyzing disease: Trichinosis\n",
"Disease analyzed and saved: Trichinosis\n",
"93 - Analyzing disease: Tropical sprue\n",
"Disease analyzed and saved: Tropical sprue\n",
"94 - Analyzing disease: Typhoid fever\n",
"Disease analyzed and saved: Typhoid fever\n",
"95 - Analyzing disease: Upper respiratory tract infection\n",
"Disease analyzed and saved: Upper respiratory tract infection\n",
"96 - Analyzing disease: Vascular dementia\n",
"Disease analyzed and saved: Vascular dementia\n",
"97 - Analyzing disease: Waterhouse–Friderichsen syndrome\n",
"Disease analyzed and saved: Waterhouse–Friderichsen syndrome\n",
"98 - Analyzing disease: Weight loss\n",
"Disease analyzed and saved: Weight loss\n",
"99 - Analyzing disease: Whipple's disease\n",
"Disease analyzed and saved: Whipple's disease\n",
"100 - Analyzing disease: Xerostomia\n",
"Disease analyzed and saved: Xerostomia\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Prompt | \n",
" Disease name failed | \n",
" GPT Output | \n",
" Error message | \n",
" GPT Output Part A | \n",
" GPT Output Part B | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Prompt, Disease name failed, GPT Output, Error message, GPT Output Part A, GPT Output Part B]\n",
"Index: []"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"if useSimplifiedEntityDescription:\n",
" prompt_name = \"Prompt 2_simplified\"\n",
" abbreviation = \"_P2_simplified\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text_simplified.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description = \"Basic Prompt but extracting according to semantic type categories without extensive description, it's simplified.\"\n",
"else:\n",
" prompt_name = \"Prompt 2\"\n",
" abbreviation = \"_P2\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description = \"Basic Prompt but extracting according to semantic type categories.\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_seventh_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_seventh_prompt):\n",
" os.makedirs(result_path_seventh_prompt)\n",
"\n",
"second_prompt_text = \"Extract and list all the terms that can be classified into any of the categories in the \\\n",
"“TUI code and Semantic type description text” and are related to the condition found in the “text to analyze”. \\\n",
"Classify each extracted term into one of the categories in the “TUI code and Semantic type description text.\"\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output = f.read()\n",
"\n",
"# Creating and saving the prompt\n",
"second_prompt_text = second_prompt_text + \"\\n\" + JSON_format_output\n",
"second_prompt_text = second_prompt_text + \"\\n\\n\" + \"TUI code and Semantic type description text:\\n\\n\" + TUI_description_text\n",
"save_prompt(prompt_name, second_prompt_text, prompt_description=prompt_description)\n",
"\n",
"failed_texts_name = []\n",
"failed_texts = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_seventh_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for index, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read() # Disease text\n",
" condition_name = disease_text.split(\".\")[0]\n",
" message_text = [{\"role\":\"system\",\"content\":second_prompt_text},\n",
" {\"role\":\"user\", \"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" print(f\"{index+1} - Analyzing disease: \", condition_name)\n",
" try:\n",
" GPT_answer = get_response_LLM(prompt=message_text)\n",
" df = transform_GPT_output(GPT_answer)\n",
" df.to_excel(os.path.join(result_path_seventh_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=model_name ,index=False)\n",
" print(\"Disease analyzed and saved: \", condition_name)\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts.append(GPT_answer)\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze disease: \", condition_name)\n",
" continue\n",
"\n",
"# Save failed texts\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output\": failed_texts, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 2: Zero-Shot Learning + Entity Dictionary with guidance"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 - Analyzing disease: Breast cancer\n",
"Disease analyzed and saved: Breast cancer\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Prompt | \n",
" Disease name failed | \n",
" GPT Output | \n",
" Error message | \n",
" GPT Output Part A | \n",
" GPT Output Part B | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Prompt, Disease name failed, GPT Output, Error message, GPT Output Part A, GPT Output Part B]\n",
"Index: []"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"if useSimplifiedEntityDescription:\n",
" prompt_name = \"Prompt 2_simplified_guided\"\n",
" abbreviation = \"_P2_simplified_guided\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text_simplified.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
"else:\n",
" prompt_name = \"Prompt 2_guided\"\n",
" abbreviation = \"_P2_guided\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_second_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_second_prompt):\n",
" os.makedirs(result_path_second_prompt)\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output = f.read()\n",
"\n",
"# Creating and saving the prompt\n",
"second_prompt_text = \"\"\"Extract and list all the terms from the “text to analyze” that can be classified into any of the categories in the\n",
"“TUI code and Semantic type description text” and are related to {condition_name}. Classify each extracted \n",
"term into one of the categories in the “TUI code and Semantic type description text.\"\"\"\n",
"second_prompt_text = second_prompt_text + \"\\n\" + JSON_format_output\n",
"second_prompt_text = second_prompt_text + \"\\n\\n\" + \"TUI code and Semantic type description text:\\n\\n\" + TUI_description_text\n",
"prompt_description = \"Basic Prompt but saying in advance the condition to analyze in the text and extracting the terms that can be classified into one of the categories.\"\n",
"save_prompt(prompt_name, second_prompt_text, prompt_description=prompt_description) \n",
" \n",
"failed_texts_name = []\n",
"failed_texts = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_second_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for index, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read() # Disease text\n",
" condition_name = disease_text.split(\".\")[0]\n",
" second_prompt_text = f\"\"\"Extract and list all the terms from the “text to analyze” that can be classified into any of the categories in the\n",
" “TUI code and Semantic type description text” and are related to {condition_name}”. Classify each extracted \n",
" term into one of the categories in the “TUI code and Semantic type description text.\"\"\"\n",
" second_prompt_text = second_prompt_text + \"\\n\" + JSON_format_output\n",
" message_text = [{\"role\":\"system\",\"content\":second_prompt_text + \"\\n\\n\" + \"TUI code and Semantic type description text:\\n\\n\" + TUI_description_text},\n",
" {\"role\":\"user\", \"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" print(f\"{index+1} - Analyzing disease: \", condition_name)\n",
" try:\n",
" GPT_answer = get_response_LLM(prompt=message_text)\n",
" df = transform_GPT_output(GPT_answer)\n",
" df.to_excel(os.path.join(result_path_second_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=model_name ,index=False)\n",
" print(\"Disease analyzed and saved: \", condition_name)\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts.append(GPT_answer)\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze disease: \", condition_name)\n",
" continue\n",
"\n",
"# Save failed texts\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output\": failed_texts, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 2: Zero-Shot Learning + Entity Dictionary plus manifestations"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 - Analyzing disease: Acute decompensated heart failure\n",
"Disease analyzed and saved: Acute decompensated heart failure\n",
"2 - Analyzing disease: Acute intermittent porphyria\n",
"Disease analyzed and saved: Acute intermittent porphyria\n",
"3 - Analyzing disease: Anthrax\n",
"Disease analyzed and saved: Anthrax\n",
"4 - Analyzing disease: Arterial embolism\n",
"Disease analyzed and saved: Arterial embolism\n",
"5 - Analyzing disease: Arteriovenous malformation\n",
"Disease analyzed and saved: Arteriovenous malformation\n",
"6 - Analyzing disease: Ascites\n",
"Disease analyzed and saved: Ascites\n",
"7 - Analyzing disease: Autonomic dysreflexia\n",
"Disease analyzed and saved: Autonomic dysreflexia\n",
"8 - Analyzing disease: Benzodiazepine withdrawal syndrome\n",
"Disease analyzed and saved: Benzodiazepine withdrawal syndrome\n",
"9 - Analyzing disease: Blastomycosis\n",
"Disease analyzed and saved: Blastomycosis\n",
"10 - Analyzing disease: Breast cancer\n",
"Disease analyzed and saved: Breast cancer\n",
"11 - Analyzing disease: Campylobacteriosis\n",
"Disease analyzed and saved: Campylobacteriosis\n",
"12 - Analyzing disease: Carciac myxoma\n",
"Disease analyzed and saved: Carciac myxoma\n",
"13 - Analyzing disease: Carrion's disease\n",
"Disease analyzed and saved: Carrion's disease\n",
"14 - Analyzing disease: Cerebral salt-wasting syndrome\n",
"Disease analyzed and saved: Cerebral salt-wasting syndrome\n",
"15 - Analyzing disease: Cerebrovascular disease\n",
"Disease analyzed and saved: Cerebrovascular disease\n",
"16 - Analyzing disease: Chlamydia infection\n",
"Disease analyzed and saved: Chlamydia infection\n",
"17 - Analyzing disease: Cholesterol embolism\n",
"Disease analyzed and saved: Cholesterol embolism\n",
"18 - Analyzing disease: Coccidioidomycosis\n",
"Disease analyzed and saved: Coccidioidomycosis\n",
"19 - Analyzing disease: Conversion disorder\n",
"Disease analyzed and saved: Conversion disorder\n",
"20 - Analyzing disease: Cryoglobulinemia\n",
"Disease analyzed and saved: Cryoglobulinemia\n",
"21 - Analyzing disease: Diphtheria\n",
"Disease analyzed and saved: Diphtheria\n",
"22 - Analyzing disease: Erysipelas\n",
"Disease analyzed and saved: Erysipelas\n",
"23 - Analyzing disease: Erythema nodosum\n",
"Disease analyzed and saved: Erythema nodosum\n",
"24 - Analyzing disease: Ethylene glycol poisoning\n",
"Disease analyzed and saved: Ethylene glycol poisoning\n",
"25 - Analyzing disease: Felty's syndrome\n",
"Disease analyzed and saved: Felty's syndrome\n",
"26 - Analyzing disease: Food intolerance\n",
"Disease analyzed and saved: Food intolerance\n",
"27 - Analyzing disease: Gastroparesis\n",
"Disease analyzed and saved: Gastroparesis\n",
"28 - Analyzing disease: Generalized anxiety disorder\n",
"Disease analyzed and saved: Generalized anxiety disorder\n",
"29 - Analyzing disease: GM1 gangliosidoses\n",
"Disease analyzed and saved: GM1 gangliosidoses\n",
"30 - Analyzing disease: Helicobacter pylori\n",
"Disease analyzed and saved: Helicobacter pylori\n",
"31 - Analyzing disease: Hemolytic-uremic syndrome\n",
"Disease analyzed and saved: Hemolytic-uremic syndrome\n",
"32 - Analyzing disease: Herpes labialis\n",
"Disease analyzed and saved: Herpes labialis\n",
"33 - Analyzing disease: Hypercalcaemia\n",
"Disease analyzed and saved: Hypercalcaemia\n",
"34 - Analyzing disease: Hyperosmolar hyperglycemic state\n",
"Disease analyzed and saved: Hyperosmolar hyperglycemic state\n",
"35 - Analyzing disease: Hypervitaminosis A\n",
"Disease analyzed and saved: Hypervitaminosis A\n",
"36 - Analyzing disease: Hypocalcaemia\n",
"Disease analyzed and saved: Hypocalcaemia\n",
"37 - Analyzing disease: Hypomagnesemia\n",
"Disease analyzed and saved: Hypomagnesemia\n",
"38 - Analyzing disease: Hypovolemia\n",
"Disease analyzed and saved: Hypovolemia\n",
"39 - Analyzing disease: Inborn error of metabolism\n",
"Disease analyzed and saved: Inborn error of metabolism\n",
"40 - Analyzing disease: Influenza\n",
"Disease analyzed and saved: Influenza\n",
"41 - Analyzing disease: Intention tremor\n",
"Disease analyzed and saved: Intention tremor\n",
"42 - Analyzing disease: Intraparenchymal hemorrhage\n",
"Disease analyzed and saved: Intraparenchymal hemorrhage\n",
"43 - Analyzing disease: Itch\n",
"Disease analyzed and saved: Itch\n",
"44 - Analyzing disease: Juvenile dermatomyositis\n",
"Disease analyzed and saved: Juvenile dermatomyositis\n",
"45 - Analyzing disease: Kaposi's sarcoma\n",
"Disease analyzed and saved: Kaposi's sarcoma\n",
"46 - Analyzing disease: Lambert–Eaton myasthenic syndrome\n",
"Disease analyzed and saved: Lambert–Eaton myasthenic syndrome\n",
"47 - Analyzing disease: Laryngitis\n",
"Disease analyzed and saved: Laryngitis\n",
"48 - Analyzing disease: Lateral medullary syndrome\n",
"Disease analyzed and saved: Lateral medullary syndrome\n",
"49 - Analyzing disease: MERRF syndrome\n",
"Disease analyzed and saved: MERRF syndrome\n",
"50 - Analyzing disease: Metal fume fever\n",
"Disease analyzed and saved: Metal fume fever\n",
"51 - Analyzing disease: Morvan's syndrome\n",
"Disease analyzed and saved: Morvan's syndrome\n",
"52 - Analyzing disease: Myocarditis\n",
"Disease analyzed and saved: Myocarditis\n",
"53 - Analyzing disease: Necatoriasis\n",
"Disease analyzed and saved: Necatoriasis\n",
"54 - Analyzing disease: Nicotine poisoning\n",
"Disease analyzed and saved: Nicotine poisoning\n",
"55 - Analyzing disease: Non-alcoholic fatty liver disease\n",
"Disease analyzed and saved: Non-alcoholic fatty liver disease\n",
"56 - Analyzing disease: Non-small-cell lung carcinoma\n",
"Disease analyzed and saved: Non-small-cell lung carcinoma\n",
"57 - Analyzing disease: Normal pressure hydrocephalus\n",
"Disease analyzed and saved: Normal pressure hydrocephalus\n",
"58 - Analyzing disease: Obesity hypoventilation syndrome\n",
"Disease analyzed and saved: Obesity hypoventilation syndrome\n",
"59 - Analyzing disease: Opioid use disorder\n",
"Disease analyzed and saved: Opioid use disorder\n",
"60 - Analyzing disease: Optic neuritis\n",
"Disease analyzed and saved: Optic neuritis\n",
"61 - Analyzing disease: Orofacial granulomatosis\n",
"Disease analyzed and saved: Orofacial granulomatosis\n",
"62 - Analyzing disease: Orthostatic hypotension\n",
"Disease analyzed and saved: Orthostatic hypotension\n",
"63 - Analyzing disease: Pancreatic cancer\n",
"Disease analyzed and saved: Pancreatic cancer\n",
"64 - Analyzing disease: Panic attack\n",
"Disease analyzed and saved: Panic attack\n",
"65 - Analyzing disease: Paratyphoid fever\n",
"Disease analyzed and saved: Paratyphoid fever\n",
"66 - Analyzing disease: Parry–Romberg syndrome\n",
"Disease analyzed and saved: Parry–Romberg syndrome\n",
"67 - Analyzing disease: Pituitary apoplexy\n",
"Disease analyzed and saved: Pituitary apoplexy\n",
"68 - Analyzing disease: Polyarteritis nodosa\n",
"Disease analyzed and saved: Polyarteritis nodosa\n",
"69 - Analyzing disease: Porencephaly\n",
"Disease analyzed and saved: Porencephaly\n",
"70 - Analyzing disease: Prediabetes\n",
"Disease analyzed and saved: Prediabetes\n",
"71 - Analyzing disease: Pregnancy\n",
"Disease analyzed and saved: Pregnancy\n",
"72 - Analyzing disease: Premenstrual syndrome\n",
"Disease analyzed and saved: Premenstrual syndrome\n",
"73 - Analyzing disease: Primary hyperparathyroidism\n",
"Disease analyzed and saved: Primary hyperparathyroidism\n",
"74 - Analyzing disease: Primary sclerosing cholangitis\n",
"Disease analyzed and saved: Primary sclerosing cholangitis\n",
"75 - Analyzing disease: Reactive hypoglycemia\n",
"Disease analyzed and saved: Reactive hypoglycemia\n",
"76 - Analyzing disease: Reflex syncope\n",
"Disease analyzed and saved: Reflex syncope\n",
"77 - Analyzing disease: Ross River fever\n",
"Disease analyzed and saved: Ross River fever\n",
"78 - Analyzing disease: Rubella\n",
"Disease analyzed and saved: Rubella\n",
"79 - Analyzing disease: Scarlet fever\n",
"Disease analyzed and saved: Scarlet fever\n",
"80 - Analyzing disease: Scleroderma\n",
"Disease analyzed and saved: Scleroderma\n",
"81 - Analyzing disease: Snakebite\n",
"Disease analyzed and saved: Snakebite\n",
"82 - Analyzing disease: Soy allergy\n",
"Disease analyzed and saved: Soy allergy\n",
"83 - Analyzing disease: Streptococcal pharyngitis\n",
"Disease analyzed and saved: Streptococcal pharyngitis\n",
"84 - Analyzing disease: Subdural hematoma\n",
"Disease analyzed and saved: Subdural hematoma\n",
"85 - Analyzing disease: Superior mesenteric artery syndrome\n",
"Disease analyzed and saved: Superior mesenteric artery syndrome\n",
"86 - Analyzing disease: Taeniasis\n",
"Disease analyzed and saved: Taeniasis\n",
"87 - Analyzing disease: Tetanus\n",
"Disease analyzed and saved: Tetanus\n",
"88 - Analyzing disease: Tethered spinal cord syndrome\n",
"Disease analyzed and saved: Tethered spinal cord syndrome\n",
"89 - Analyzing disease: Thyroid storm\n",
"Disease analyzed and saved: Thyroid storm\n",
"90 - Analyzing disease: Thyrotoxic periodic paralysis\n",
"Disease analyzed and saved: Thyrotoxic periodic paralysis\n",
"91 - Analyzing disease: Tonsillitis\n",
"Disease analyzed and saved: Tonsillitis\n",
"92 - Analyzing disease: Trichinosis\n",
"Disease analyzed and saved: Trichinosis\n",
"93 - Analyzing disease: Tropical sprue\n",
"Disease analyzed and saved: Tropical sprue\n",
"94 - Analyzing disease: Typhoid fever\n",
"Disease analyzed and saved: Typhoid fever\n",
"95 - Analyzing disease: Upper respiratory tract infection\n",
"Disease analyzed and saved: Upper respiratory tract infection\n",
"96 - Analyzing disease: Vascular dementia\n",
"Disease analyzed and saved: Vascular dementia\n",
"97 - Analyzing disease: Waterhouse–Friderichsen syndrome\n",
"Disease analyzed and saved: Waterhouse–Friderichsen syndrome\n",
"98 - Analyzing disease: Weight loss\n",
"Disease analyzed and saved: Weight loss\n",
"99 - Analyzing disease: Whipple's disease\n",
"Disease analyzed and saved: Whipple's disease\n",
"100 - Analyzing disease: Xerostomia\n",
"Disease analyzed and saved: Xerostomia\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Prompt | \n",
" Disease name failed | \n",
" GPT Output | \n",
" Error message | \n",
" GPT Output Part A | \n",
" GPT Output Part B | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Prompt, Disease name failed, GPT Output, Error message, GPT Output Part A, GPT Output Part B]\n",
"Index: []"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"if useSimplifiedEntityDescription:\n",
" prompt_name = \"Prompt 2_simplified_plusManifestations\"\n",
" abbreviation = \"_P2_simplified_plusManifestations\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text_simplified.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description = \"Basic Prompt but extracting according to semantic type categories without extensive description, it's simplified.\"\n",
"else:\n",
" prompt_name = \"Prompt 2_plusManifestations\"\n",
" abbreviation = \"_P2_plusManifestations\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description = \"Basic Prompt but extracting according to semantic type categories.\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_seventh_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_seventh_prompt):\n",
" os.makedirs(result_path_seventh_prompt)\n",
"\n",
"second_prompt_text = \"Extract and list all the terms and manifestations that can be classified into any of the categories in the \\\n",
"“TUI code and Semantic type description text” and are related to the condition found in the “text to analyze”. \\\n",
"Classify each extracted term into one of the categories in the “TUI code and Semantic type description text.\"\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output = f.read()\n",
"\n",
"# Creating and saving the prompt\n",
"second_prompt_text = second_prompt_text + \"\\n\" + JSON_format_output\n",
"second_prompt_text = second_prompt_text + \"\\n\\n\" + \"TUI code and Semantic type description text:\\n\\n\" + TUI_description_text\n",
"save_prompt(prompt_name, second_prompt_text, prompt_description=prompt_description)\n",
"\n",
"failed_texts_name = []\n",
"failed_texts = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_seventh_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for index, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read() # Disease text\n",
" condition_name = disease_text.split(\".\")[0]\n",
" message_text = [{\"role\":\"system\",\"content\":second_prompt_text},\n",
" {\"role\":\"user\", \"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" print(f\"{index+1} - Analyzing disease: \", condition_name)\n",
" try:\n",
" GPT_answer = get_response_LLM(prompt=message_text)\n",
" df = transform_GPT_output(GPT_answer)\n",
" df.to_excel(os.path.join(result_path_seventh_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=model_name ,index=False)\n",
" print(\"Disease analyzed and saved: \", condition_name)\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts.append(GPT_answer)\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze disease: \", condition_name)\n",
" continue\n",
"\n",
"# Save failed texts\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output\": failed_texts, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)"
]
},
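{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, the cell below is a minimal sketch (not part of the original pipeline) that reads one of the saved result files back and counts the extracted terms per semantic-type category. It assumes the cell above has already run, so `result_path_seventh_prompt` is defined and at least one `.xlsx` file exists; the category column is guessed from the last column of the DataFrame, since the exact column names depend on the JSON format requested from the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch: inspect one saved result file from the previous cell\n",
"# Assumption: result_path_seventh_prompt exists and contains at least one .xlsx result\n",
"result_files = [f for f in os.listdir(result_path_seventh_prompt) if f.endswith(\".xlsx\")]\n",
"if result_files:\n",
"    example_df = pd.read_excel(os.path.join(result_path_seventh_prompt, result_files[0]))\n",
"    print(f\"Inspecting: {result_files[0]}\")\n",
"    print(example_df.head())\n",
"    if len(example_df.columns) > 0:\n",
"        # The category column name depends on the requested JSON format; the last column is used here as an assumption\n",
"        category_column = example_df.columns[-1]\n",
"        print(example_df[category_column].value_counts())\n",
"else:\n",
"    print(\"No result files found yet.\")"
]
},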
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 2: Zero-Shot Learning + Entity Dictionary optimized by ChatGPT4o"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Prompt | \n",
" Disease name failed | \n",
" GPT Output | \n",
" Error message | \n",
" GPT Output Part A | \n",
" GPT Output Part B | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Prompt, Disease name failed, GPT Output, Error message, GPT Output Part A, GPT Output Part B]\n",
"Index: []"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prompt_name = \"Prompt 2_optimizedByChatGPT\"\n",
"abbreviation = \"_P2_optimizedByChatGPT\"\n",
"prompt_description = \"Prompt 2 optimized by ChatGPT. Contains detailed definitions of categories, examples, steps of analysis, and the JSON format output.\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_seventh_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_seventh_prompt):\n",
" os.makedirs(result_path_seventh_prompt)\n",
"\n",
"# Load the optimized prompt by ChatGPT\n",
"path_to_optimized_prompt = r\"Prompt Engineering\\Prompt 2 Optimized by ChatGPT.txt\"\n",
"with open(path_to_optimized_prompt, 'r', encoding='utf-8') as f:\n",
" second_prompt_text = f.read()\n",
"save_prompt(prompt_name, second_prompt_text, prompt_description=prompt_description)\n",
"\n",
"failed_texts_name = []\n",
"failed_texts = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_seventh_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for index, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read() # Disease text\n",
" condition_name = disease_text.split(\".\")[0]\n",
" message_text = [{\"role\":\"user\", \"content\":second_prompt_text + text}]\n",
" print(f\"{index+1} - Analyzing disease: \", condition_name)\n",
" try:\n",
" GPT_answer = get_response_LLM(prompt=message_text)\n",
" df = transform_GPT_output(GPT_answer)\n",
" df.to_excel(os.path.join(result_path_seventh_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=model_name ,index=False)\n",
" print(\"Disease analyzed and saved: \", condition_name)\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts.append(GPT_answer)\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze disease: \", condition_name)\n",
" continue\n",
"\n",
"# Save failed texts\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output\": failed_texts, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 3: One-shot Learning + Entity Dictionary"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 - Analyzing disease: Acute decompensated heart failure\n",
"Successfully analyzed: Acute decompensated heart failure\n",
"2 - Analyzing disease: Acute intermittent porphyria\n",
"Successfully analyzed: Acute intermittent porphyria\n",
"3 - Analyzing disease: Anthrax\n",
"Successfully analyzed: Anthrax\n",
"4 - Analyzing disease: Arterial embolism\n",
"Successfully analyzed: Arterial embolism\n",
"5 - Analyzing disease: Arteriovenous malformation\n",
"Successfully analyzed: Arteriovenous malformation\n",
"6 - Analyzing disease: Ascites\n",
"Successfully analyzed: Ascites\n",
"7 - Analyzing disease: Autonomic dysreflexia\n",
"Successfully analyzed: Autonomic dysreflexia\n",
"8 - Analyzing disease: Benzodiazepine withdrawal syndrome\n",
"Successfully analyzed: Benzodiazepine withdrawal syndrome\n",
"9 - Analyzing disease: Blastomycosis\n",
"Successfully analyzed: Blastomycosis\n",
"10 - Analyzing disease: Breast cancer\n",
"Successfully analyzed: Breast cancer\n",
"11 - Analyzing disease: Campylobacteriosis\n",
"Successfully analyzed: Campylobacteriosis\n",
"12 - Analyzing disease: Carciac myxoma\n",
"Successfully analyzed: Carciac myxoma\n",
"13 - Analyzing disease: Carrion's disease\n",
"Successfully analyzed: Carrion's disease\n",
"14 - Analyzing disease: Cerebral salt-wasting syndrome\n",
"Successfully analyzed: Cerebral salt-wasting syndrome\n",
"15 - Analyzing disease: Cerebrovascular disease\n",
"Successfully analyzed: Cerebrovascular disease\n",
"16 - Analyzing disease: Chlamydia infection\n",
"Successfully analyzed: Chlamydia infection\n",
"17 - Analyzing disease: Cholesterol embolism\n",
"Successfully analyzed: Cholesterol embolism\n",
"18 - Analyzing disease: Coccidioidomycosis\n",
"Successfully analyzed: Coccidioidomycosis\n",
"19 - Analyzing disease: Conversion disorder\n",
"Successfully analyzed: Conversion disorder\n",
"20 - Analyzing disease: Cryoglobulinemia\n",
"Successfully analyzed: Cryoglobulinemia\n",
"21 - Analyzing disease: Diphtheria\n",
"Successfully analyzed: Diphtheria\n",
"22 - Analyzing disease: Erysipelas\n",
"Successfully analyzed: Erysipelas\n",
"23 - Analyzing disease: Erythema nodosum\n",
"Successfully analyzed: Erythema nodosum\n",
"24 - Analyzing disease: Ethylene glycol poisoning\n",
"Successfully analyzed: Ethylene glycol poisoning\n",
"25 - Analyzing disease: Felty's syndrome\n",
"Successfully analyzed: Felty's syndrome\n",
"26 - Analyzing disease: Food intolerance\n",
"Successfully analyzed: Food intolerance\n",
"27 - Analyzing disease: Gastroparesis\n",
"Successfully analyzed: Gastroparesis\n",
"28 - Analyzing disease: Generalized anxiety disorder\n",
"Successfully analyzed: Generalized anxiety disorder\n",
"29 - Analyzing disease: GM1 gangliosidoses\n",
"Successfully analyzed: GM1 gangliosidoses\n",
"30 - Analyzing disease: Helicobacter pylori\n",
"Successfully analyzed: Helicobacter pylori\n",
"31 - Analyzing disease: Hemolytic-uremic syndrome\n",
"Successfully analyzed: Hemolytic-uremic syndrome\n",
"32 - Analyzing disease: Herpes labialis\n",
"Successfully analyzed: Herpes labialis\n",
"33 - Analyzing disease: Hypercalcaemia\n",
"Successfully analyzed: Hypercalcaemia\n",
"34 - Analyzing disease: Hyperosmolar hyperglycemic state\n",
"Successfully analyzed: Hyperosmolar hyperglycemic state\n",
"35 - Analyzing disease: Hypervitaminosis A\n",
"Successfully analyzed: Hypervitaminosis A\n",
"36 - Analyzing disease: Hypocalcaemia\n",
"Successfully analyzed: Hypocalcaemia\n",
"37 - Analyzing disease: Hypomagnesemia\n",
"Successfully analyzed: Hypomagnesemia\n",
"38 - Analyzing disease: Hypovolemia\n",
"Successfully analyzed: Hypovolemia\n",
"39 - Analyzing disease: Inborn error of metabolism\n",
"Successfully analyzed: Inborn error of metabolism\n",
"40 - Analyzing disease: Influenza\n",
"Successfully analyzed: Influenza\n",
"41 - Analyzing disease: Intention tremor\n",
"Successfully analyzed: Intention tremor\n",
"42 - Analyzing disease: Intraparenchymal hemorrhage\n",
"Successfully analyzed: Intraparenchymal hemorrhage\n",
"43 - Analyzing disease: Itch\n",
"Successfully analyzed: Itch\n",
"44 - Analyzing disease: Juvenile dermatomyositis\n",
"Successfully analyzed: Juvenile dermatomyositis\n",
"45 - Analyzing disease: Kaposi's sarcoma\n",
"Successfully analyzed: Kaposi's sarcoma\n",
"46 - Analyzing disease: Lambert–Eaton myasthenic syndrome\n",
"Successfully analyzed: Lambert–Eaton myasthenic syndrome\n",
"47 - Analyzing disease: Laryngitis\n",
"Successfully analyzed: Laryngitis\n",
"48 - Analyzing disease: Lateral medullary syndrome\n",
"Successfully analyzed: Lateral medullary syndrome\n",
"49 - Analyzing disease: MERRF syndrome\n",
"Successfully analyzed: MERRF syndrome\n",
"50 - Analyzing disease: Metal fume fever\n",
"Successfully analyzed: Metal fume fever\n",
"51 - Analyzing disease: Morvan's syndrome\n",
"Successfully analyzed: Morvan's syndrome\n",
"52 - Analyzing disease: Myocarditis\n",
"Successfully analyzed: Myocarditis\n",
"53 - Analyzing disease: Necatoriasis\n",
"Successfully analyzed: Necatoriasis\n",
"54 - Analyzing disease: Nicotine poisoning\n",
"Successfully analyzed: Nicotine poisoning\n",
"55 - Analyzing disease: Non-alcoholic fatty liver disease\n",
"Successfully analyzed: Non-alcoholic fatty liver disease\n",
"56 - Analyzing disease: Non-small-cell lung carcinoma\n",
"Successfully analyzed: Non-small-cell lung carcinoma\n",
"57 - Analyzing disease: Normal pressure hydrocephalus\n",
"Successfully analyzed: Normal pressure hydrocephalus\n",
"58 - Analyzing disease: Obesity hypoventilation syndrome\n",
"Successfully analyzed: Obesity hypoventilation syndrome\n",
"59 - Analyzing disease: Opioid use disorder\n",
"Successfully analyzed: Opioid use disorder\n",
"60 - Analyzing disease: Optic neuritis\n",
"Successfully analyzed: Optic neuritis\n",
"61 - Analyzing disease: Orofacial granulomatosis\n",
"Successfully analyzed: Orofacial granulomatosis\n",
"62 - Analyzing disease: Orthostatic hypotension\n",
"Successfully analyzed: Orthostatic hypotension\n",
"63 - Analyzing disease: Pancreatic cancer\n",
"Successfully analyzed: Pancreatic cancer\n",
"64 - Analyzing disease: Panic attack\n",
"Successfully analyzed: Panic attack\n",
"65 - Analyzing disease: Paratyphoid fever\n",
"Successfully analyzed: Paratyphoid fever\n",
"66 - Analyzing disease: Parry–Romberg syndrome\n",
"Successfully analyzed: Parry–Romberg syndrome\n",
"67 - Analyzing disease: Pituitary apoplexy\n",
"Successfully analyzed: Pituitary apoplexy\n",
"68 - Analyzing disease: Polyarteritis nodosa\n",
"Successfully analyzed: Polyarteritis nodosa\n",
"69 - Analyzing disease: Porencephaly\n",
"Successfully analyzed: Porencephaly\n",
"70 - Analyzing disease: Prediabetes\n",
"Successfully analyzed: Prediabetes\n",
"71 - Analyzing disease: Pregnancy\n",
"Successfully analyzed: Pregnancy\n",
"72 - Analyzing disease: Premenstrual syndrome\n",
"Successfully analyzed: Premenstrual syndrome\n",
"73 - Analyzing disease: Primary hyperparathyroidism\n",
"Successfully analyzed: Primary hyperparathyroidism\n",
"74 - Analyzing disease: Primary sclerosing cholangitis\n",
"Successfully analyzed: Primary sclerosing cholangitis\n",
"75 - Analyzing disease: Reactive hypoglycemia\n",
"Successfully analyzed: Reactive hypoglycemia\n",
"76 - Analyzing disease: Reflex syncope\n",
"Successfully analyzed: Reflex syncope\n",
"77 - Analyzing disease: Ross River fever\n",
"Successfully analyzed: Ross River fever\n",
"78 - Analyzing disease: Rubella\n",
"Successfully analyzed: Rubella\n",
"79 - Analyzing disease: Scarlet fever\n",
"Successfully analyzed: Scarlet fever\n",
"80 - Analyzing disease: Scleroderma\n",
"Successfully analyzed: Scleroderma\n",
"81 - Analyzing disease: Snakebite\n",
"Successfully analyzed: Snakebite\n",
"82 - Analyzing disease: Soy allergy\n",
"Successfully analyzed: Soy allergy\n",
"83 - Analyzing disease: Streptococcal pharyngitis\n",
"Successfully analyzed: Streptococcal pharyngitis\n",
"84 - Analyzing disease: Subdural hematoma\n",
"Successfully analyzed: Subdural hematoma\n",
"85 - Analyzing disease: Superior mesenteric artery syndrome\n",
"Successfully analyzed: Superior mesenteric artery syndrome\n",
"86 - Analyzing disease: Taeniasis\n",
"Successfully analyzed: Taeniasis\n",
"87 - Analyzing disease: Tetanus\n",
"Successfully analyzed: Tetanus\n",
"88 - Analyzing disease: Tethered spinal cord syndrome\n",
"Successfully analyzed: Tethered spinal cord syndrome\n",
"89 - Analyzing disease: Thyroid storm\n",
"Successfully analyzed: Thyroid storm\n",
"90 - Analyzing disease: Thyrotoxic periodic paralysis\n",
"Successfully analyzed: Thyrotoxic periodic paralysis\n",
"91 - Analyzing disease: Tonsillitis\n",
"Successfully analyzed: Tonsillitis\n",
"92 - Analyzing disease: Trichinosis\n",
"Successfully analyzed: Trichinosis\n",
"93 - Analyzing disease: Tropical sprue\n",
"Successfully analyzed: Tropical sprue\n",
"94 - Analyzing disease: Typhoid fever\n",
"Successfully analyzed: Typhoid fever\n",
"95 - Analyzing disease: Upper respiratory tract infection\n",
"Successfully analyzed: Upper respiratory tract infection\n",
"96 - Analyzing disease: Vascular dementia\n",
"Successfully analyzed: Vascular dementia\n",
"97 - Analyzing disease: Waterhouse–Friderichsen syndrome\n",
"Successfully analyzed: Waterhouse–Friderichsen syndrome\n",
"98 - Analyzing disease: Weight loss\n",
"Successfully analyzed: Weight loss\n",
"99 - Analyzing disease: Whipple's disease\n",
"Successfully analyzed: Whipple's disease\n",
"100 - Analyzing disease: Xerostomia\n",
"Successfully analyzed: Xerostomia\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Prompt | \n",
" Disease name failed | \n",
" GPT Output | \n",
" Error message | \n",
" GPT Output Part A | \n",
" GPT Output Part B | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Prompt, Disease name failed, GPT Output, Error message, GPT Output Part A, GPT Output Part B]\n",
"Index: []"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"if useSimplifiedEntityDescription:\n",
" prompt_name = \"Prompt 3_simplified\"\n",
" abbreviation = \"_P3_simplified\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text_simplified.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description = \"Basic Prompt but extracting according to simplified semantic type categories text with one-shot learning.\"\n",
"else:\n",
" prompt_name = \"Prompt 3\"\n",
" abbreviation = \"_P3\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description = \"Basic Prompt but extracting according to semantic type categories with one-shot learning.\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_third_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_third_prompt):\n",
" os.makedirs(result_path_third_prompt)\n",
"\n",
"third_prompt_text = \"Extract and list all the terms that can be classified into any of the categories in the \\\n",
"“TUI code and Semantic type description text” and are related to the condition found in the “text to analyze”. \\\n",
"Classify each extracted term into one of the categories in the “TUI code and Semantic type description text\"\n",
"\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output = f.read()\n",
"\n",
"third_prompt_text = third_prompt_text + \"\\n\" + JSON_format_output\n",
"third_prompt_text = third_prompt_text + \"\\n\\nTUI code and Semantic type description text: \\n\\n\" + TUI_description_text\n",
"save_prompt(prompt_name, third_prompt_text, prompt_description)\n",
"\n",
"# Open Texto Ejemplo de Anemia\n",
"with open(r\"Prompt Engineering\\Texto Ejemplo - Anemia.txt\", 'r', encoding='utf-8') as f:\n",
" example_text = f.read()\n",
" \n",
"# Open Texto Ejemplo sin CoT\n",
"with open(r\"Prompt Engineering\\Texto Ejemplo Respuesta sin CoT - Anemia.txt\", 'r', encoding='utf-8') as f:\n",
" example_text_answer_sin_CoT = f.read()\n",
"\n",
"failed_texts_name = []\n",
"failed_texts = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_third_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for index, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read() # Disease text\n",
" condition_name = disease_text.split(\".\")[0]\n",
" message_text = [{\"role\":\"system\",\"content\":third_prompt_text},\n",
" {\"role\":\"user\", \"content\":\"Text to analyze: \\n\\n\" + example_text},\n",
" {\"role\":\"assistant\",\"content\":example_text_answer_sin_CoT},\n",
" {\"role\":\"user\",\"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" print(f\"{index+1} - Analyzing disease: \", condition_name)\n",
" try:\n",
" GPT_answer = get_response_LLM(prompt=message_text)\n",
" df = transform_GPT_output(GPT_answer)\n",
" df.to_excel(os.path.join(result_path_third_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=model_name ,index=False)\n",
" print(\"Successfully analyzed: \", condition_name)\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts.append(GPT_answer)\n",
" # Save the error message from the exception\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze: \", condition_name)\n",
" continue\n",
"\n",
"# Save failed texts\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output\": failed_texts, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)"
]
},
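{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because the one-shot prompt bundles the system prompt, the example text, and its expected answer, it can get fairly long. The cell below is a minimal sketch for estimating how many tokens the assembled one-shot messages consume with `tiktoken`; it reuses `model_name`, `third_prompt_text`, `example_text`, and `example_text_answer_sin_CoT` from the cells above, and the `o200k_base` fallback encoding is an assumption for gpt-4o-family models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch: estimate the token count of the one-shot prompt assembled above\n",
"import tiktoken\n",
"\n",
"try:\n",
"    encoding = tiktoken.encoding_for_model(model_name)\n",
"except KeyError:\n",
"    # Fallback if the model name is unknown to this tiktoken version (assumption: o200k_base suits gpt-4o-family models)\n",
"    encoding = tiktoken.get_encoding(\"o200k_base\")\n",
"\n",
"prompt_parts = [third_prompt_text, example_text, example_text_answer_sin_CoT]\n",
"total_tokens = sum(len(encoding.encode(part)) for part in prompt_parts)\n",
"print(f\"Approximate prompt tokens before adding the disease text: {total_tokens}\")"
]
},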
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 4: Zero-Shot Learning + Entity Dictionary + Self-Reflection (or called CoT, provide explanation)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 - Analyzing: Acute decompensated heart failure\n",
"Successfully analyzed: Acute decompensated heart failure\n",
"2 - Analyzing: Acute intermittent porphyria\n",
"Successfully analyzed: Acute intermittent porphyria\n",
"3 - Analyzing: Anthrax\n",
"Successfully analyzed: Anthrax\n",
"4 - Analyzing: Arterial embolism\n",
"Successfully analyzed: Arterial embolism\n",
"5 - Analyzing: Arteriovenous malformation\n",
"Successfully analyzed: Arteriovenous malformation\n",
"6 - Analyzing: Ascites\n",
"Successfully analyzed: Ascites\n",
"7 - Analyzing: Autonomic dysreflexia\n",
"Successfully analyzed: Autonomic dysreflexia\n",
"8 - Analyzing: Benzodiazepine withdrawal syndrome\n",
"Successfully analyzed: Benzodiazepine withdrawal syndrome\n",
"9 - Analyzing: Blastomycosis\n",
"Successfully analyzed: Blastomycosis\n",
"10 - Analyzing: Breast cancer\n",
"Successfully analyzed: Breast cancer\n",
"11 - Analyzing: Campylobacteriosis\n",
"Successfully analyzed: Campylobacteriosis\n",
"12 - Analyzing: Carciac myxoma\n",
"Successfully analyzed: Carciac myxoma\n",
"13 - Analyzing: Carrion's disease\n",
"Successfully analyzed: Carrion's disease\n",
"14 - Analyzing: Cerebral salt-wasting syndrome\n",
"Successfully analyzed: Cerebral salt-wasting syndrome\n",
"15 - Analyzing: Cerebrovascular disease\n",
"Successfully analyzed: Cerebrovascular disease\n",
"16 - Analyzing: Chlamydia infection\n",
"Successfully analyzed: Chlamydia infection\n",
"17 - Analyzing: Cholesterol embolism\n",
"Successfully analyzed: Cholesterol embolism\n",
"18 - Analyzing: Coccidioidomycosis\n",
"Successfully analyzed: Coccidioidomycosis\n",
"19 - Analyzing: Conversion disorder\n",
"Successfully analyzed: Conversion disorder\n",
"20 - Analyzing: Cryoglobulinemia\n",
"Successfully analyzed: Cryoglobulinemia\n",
"21 - Analyzing: Diphtheria\n",
"Successfully analyzed: Diphtheria\n",
"22 - Analyzing: Erysipelas\n",
"Successfully analyzed: Erysipelas\n",
"23 - Analyzing: Erythema nodosum\n",
"Successfully analyzed: Erythema nodosum\n",
"24 - Analyzing: Ethylene glycol poisoning\n",
"Successfully analyzed: Ethylene glycol poisoning\n",
"25 - Analyzing: Felty's syndrome\n",
"Successfully analyzed: Felty's syndrome\n",
"26 - Analyzing: Food intolerance\n",
"Successfully analyzed: Food intolerance\n",
"27 - Analyzing: Gastroparesis\n",
"Successfully analyzed: Gastroparesis\n",
"28 - Analyzing: Generalized anxiety disorder\n",
"Successfully analyzed: Generalized anxiety disorder\n",
"29 - Analyzing: GM1 gangliosidoses\n",
"Successfully analyzed: GM1 gangliosidoses\n",
"30 - Analyzing: Helicobacter pylori\n",
"Successfully analyzed: Helicobacter pylori\n",
"31 - Analyzing: Hemolytic-uremic syndrome\n",
"Successfully analyzed: Hemolytic-uremic syndrome\n",
"32 - Analyzing: Herpes labialis\n",
"Successfully analyzed: Herpes labialis\n",
"33 - Analyzing: Hypercalcaemia\n",
"Successfully analyzed: Hypercalcaemia\n",
"34 - Analyzing: Hyperosmolar hyperglycemic state\n",
"Successfully analyzed: Hyperosmolar hyperglycemic state\n",
"35 - Analyzing: Hypervitaminosis A\n",
"Successfully analyzed: Hypervitaminosis A\n",
"36 - Analyzing: Hypocalcaemia\n",
"Successfully analyzed: Hypocalcaemia\n",
"37 - Analyzing: Hypomagnesemia\n",
"Successfully analyzed: Hypomagnesemia\n",
"38 - Analyzing: Hypovolemia\n",
"Successfully analyzed: Hypovolemia\n",
"39 - Analyzing: Inborn error of metabolism\n",
"Successfully analyzed: Inborn error of metabolism\n",
"40 - Analyzing: Influenza\n",
"Successfully analyzed: Influenza\n",
"41 - Analyzing: Intention tremor\n",
"Successfully analyzed: Intention tremor\n",
"42 - Analyzing: Intraparenchymal hemorrhage\n",
"Successfully analyzed: Intraparenchymal hemorrhage\n",
"43 - Analyzing: Itch\n",
"Successfully analyzed: Itch\n",
"44 - Analyzing: Juvenile dermatomyositis\n",
"Successfully analyzed: Juvenile dermatomyositis\n",
"45 - Analyzing: Kaposi's sarcoma\n",
"Successfully analyzed: Kaposi's sarcoma\n",
"46 - Analyzing: Lambert–Eaton myasthenic syndrome\n",
"Successfully analyzed: Lambert–Eaton myasthenic syndrome\n",
"47 - Analyzing: Laryngitis\n",
"Successfully analyzed: Laryngitis\n",
"48 - Analyzing: Lateral medullary syndrome\n",
"Successfully analyzed: Lateral medullary syndrome\n",
"49 - Analyzing: MERRF syndrome\n",
"Successfully analyzed: MERRF syndrome\n",
"50 - Analyzing: Metal fume fever\n",
"Successfully analyzed: Metal fume fever\n",
"51 - Analyzing: Morvan's syndrome\n",
"Successfully analyzed: Morvan's syndrome\n",
"52 - Analyzing: Myocarditis\n",
"Successfully analyzed: Myocarditis\n",
"53 - Analyzing: Necatoriasis\n",
"Successfully analyzed: Necatoriasis\n",
"54 - Analyzing: Nicotine poisoning\n",
"Successfully analyzed: Nicotine poisoning\n",
"55 - Analyzing: Non-alcoholic fatty liver disease\n",
"Successfully analyzed: Non-alcoholic fatty liver disease\n",
"56 - Analyzing: Non-small-cell lung carcinoma\n",
"Successfully analyzed: Non-small-cell lung carcinoma\n",
"57 - Analyzing: Normal pressure hydrocephalus\n",
"Successfully analyzed: Normal pressure hydrocephalus\n",
"58 - Analyzing: Obesity hypoventilation syndrome\n",
"Successfully analyzed: Obesity hypoventilation syndrome\n",
"59 - Analyzing: Opioid use disorder\n",
"Successfully analyzed: Opioid use disorder\n",
"60 - Analyzing: Optic neuritis\n",
"Successfully analyzed: Optic neuritis\n",
"61 - Analyzing: Orofacial granulomatosis\n",
"Successfully analyzed: Orofacial granulomatosis\n",
"62 - Analyzing: Orthostatic hypotension\n",
"Successfully analyzed: Orthostatic hypotension\n",
"63 - Analyzing: Pancreatic cancer\n",
"Successfully analyzed: Pancreatic cancer\n",
"64 - Analyzing: Panic attack\n",
"Successfully analyzed: Panic attack\n",
"65 - Analyzing: Paratyphoid fever\n",
"Successfully analyzed: Paratyphoid fever\n",
"66 - Analyzing: Parry–Romberg syndrome\n",
"Successfully analyzed: Parry–Romberg syndrome\n",
"67 - Analyzing: Pituitary apoplexy\n",
"Successfully analyzed: Pituitary apoplexy\n",
"68 - Analyzing: Polyarteritis nodosa\n",
"Successfully analyzed: Polyarteritis nodosa\n",
"69 - Analyzing: Porencephaly\n",
"Successfully analyzed: Porencephaly\n",
"70 - Analyzing: Prediabetes\n",
"Successfully analyzed: Prediabetes\n",
"71 - Analyzing: Pregnancy\n",
"Successfully analyzed: Pregnancy\n",
"72 - Analyzing: Premenstrual syndrome\n",
"Successfully analyzed: Premenstrual syndrome\n",
"73 - Analyzing: Primary hyperparathyroidism\n",
"Successfully analyzed: Primary hyperparathyroidism\n",
"74 - Analyzing: Primary sclerosing cholangitis\n",
"Successfully analyzed: Primary sclerosing cholangitis\n",
"75 - Analyzing: Reactive hypoglycemia\n",
"Successfully analyzed: Reactive hypoglycemia\n",
"76 - Analyzing: Reflex syncope\n",
"Successfully analyzed: Reflex syncope\n",
"77 - Analyzing: Ross River fever\n",
"Successfully analyzed: Ross River fever\n",
"78 - Analyzing: Rubella\n",
"Successfully analyzed: Rubella\n",
"79 - Analyzing: Scarlet fever\n",
"Successfully analyzed: Scarlet fever\n",
"80 - Analyzing: Scleroderma\n",
"Successfully analyzed: Scleroderma\n",
"81 - Analyzing: Snakebite\n",
"Successfully analyzed: Snakebite\n",
"82 - Analyzing: Soy allergy\n",
"Successfully analyzed: Soy allergy\n",
"83 - Analyzing: Streptococcal pharyngitis\n",
"Successfully analyzed: Streptococcal pharyngitis\n",
"84 - Analyzing: Subdural hematoma\n",
"Successfully analyzed: Subdural hematoma\n",
"85 - Analyzing: Superior mesenteric artery syndrome\n",
"Successfully analyzed: Superior mesenteric artery syndrome\n",
"86 - Analyzing: Taeniasis\n",
"Successfully analyzed: Taeniasis\n",
"87 - Analyzing: Tetanus\n",
"Successfully analyzed: Tetanus\n",
"88 - Analyzing: Tethered spinal cord syndrome\n",
"Successfully analyzed: Tethered spinal cord syndrome\n",
"89 - Analyzing: Thyroid storm\n",
"Successfully analyzed: Thyroid storm\n",
"90 - Analyzing: Thyrotoxic periodic paralysis\n",
"Successfully analyzed: Thyrotoxic periodic paralysis\n",
"91 - Analyzing: Tonsillitis\n",
"Successfully analyzed: Tonsillitis\n",
"92 - Analyzing: Trichinosis\n",
"Successfully analyzed: Trichinosis\n",
"93 - Analyzing: Tropical sprue\n",
"Successfully analyzed: Tropical sprue\n",
"94 - Analyzing: Typhoid fever\n",
"Successfully analyzed: Typhoid fever\n",
"95 - Analyzing: Upper respiratory tract infection\n",
"Successfully analyzed: Upper respiratory tract infection\n",
"96 - Analyzing: Vascular dementia\n",
"Successfully analyzed: Vascular dementia\n",
"97 - Analyzing: Waterhouse–Friderichsen syndrome\n",
"Successfully analyzed: Waterhouse–Friderichsen syndrome\n",
"98 - Analyzing: Weight loss\n",
"Successfully analyzed: Weight loss\n",
"99 - Analyzing: Whipple's disease\n",
"Successfully analyzed: Whipple's disease\n",
"100 - Analyzing: Xerostomia\n",
"Successfully analyzed: Xerostomia\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Prompt | \n",
" Disease name failed | \n",
" GPT Output | \n",
" Error message | \n",
" GPT Output Part A | \n",
" GPT Output Part B | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Prompt, Disease name failed, GPT Output, Error message, GPT Output Part A, GPT Output Part B]\n",
"Index: []"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"if useSimplifiedEntityDescription:\n",
" prompt_name = \"Prompt 4_simplified\"\n",
" abbreviation = \"_P4_simplified\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text_simplified.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description = \"Basic Prompt but extracting according to simplified semantic type categories text with explanation (CoT).\"\n",
"else:\n",
" prompt_name = \"Prompt 4\"\n",
" abbreviation = \"_P4\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description = \"Basic Prompt but extracting according to semantic type categories with explanation (CoT).\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_fourth_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_fourth_prompt):\n",
" os.makedirs(result_path_fourth_prompt)\n",
"\n",
"fourth_prompt_text = \"Extract and list all the terms that can be classified into any of the categories in the \\\n",
"“TUI code and Semantic type description text” and are related to the condition found in the “text to analyze”. \\\n",
"Classify each extracted term into one of the categories in the “TUI code and Semantic type description text”. \\\n",
"Explain why the entity was extracted and why it was classified as such.\"\n",
"\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output with Explanation.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output_with_explanation = f.read()\n",
"\n",
"# Creating and saving the prompt\n",
"fourth_prompt_text = fourth_prompt_text + \"\\n\" + JSON_format_output_with_explanation\n",
"fourth_prompt_text = fourth_prompt_text + \"\\n\\n\" + \"TUI code and Semantic type description text: \\n\\n\" + TUI_description_text\n",
"save_prompt(prompt_name, fourth_prompt_text, prompt_description)\n",
"\n",
"failed_texts_name = []\n",
"failed_texts = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_fourth_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for index, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read() # Disease text\n",
" condition_name = disease_text.split(\".\")[0]\n",
" message_text = [{\"role\":\"system\",\"content\":fourth_prompt_text},\n",
" {\"role\":\"user\", \"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" print(f\"{index + 1} - Analyzing:\", condition_name)\n",
" try:\n",
" GPT_answer = get_response_LLM(prompt=message_text)\n",
" df = transform_GPT_output(GPT_answer)\n",
" df.to_excel(os.path.join(result_path_fourth_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=model_name ,index=False)\n",
" print(\"Successfully analyzed: \", condition_name)\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts.append(GPT_answer)\n",
" # Save the error message from the exception\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze: \", condition_name)\n",
"\n",
"# Save failed texts after looping through all the text diseases\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output\": failed_texts, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 5: One-shot learning + Entity Description + Self-Reflection (or called CoT, provide explanation)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 - Analyzing: Acute decompensated heart failure\n",
"Successfully analyzed: Acute decompensated heart failure\n",
"2 - Analyzing: Acute intermittent porphyria\n",
"Successfully analyzed: Acute intermittent porphyria\n",
"3 - Analyzing: Anthrax\n",
"Successfully analyzed: Anthrax\n",
"4 - Analyzing: Arterial embolism\n",
"Successfully analyzed: Arterial embolism\n",
"5 - Analyzing: Arteriovenous malformation\n",
"Successfully analyzed: Arteriovenous malformation\n",
"6 - Analyzing: Ascites\n",
"Successfully analyzed: Ascites\n",
"7 - Analyzing: Autonomic dysreflexia\n",
"Successfully analyzed: Autonomic dysreflexia\n",
"8 - Analyzing: Benzodiazepine withdrawal syndrome\n",
"Successfully analyzed: Benzodiazepine withdrawal syndrome\n",
"9 - Analyzing: Blastomycosis\n",
"Successfully analyzed: Blastomycosis\n",
"10 - Analyzing: Breast cancer\n",
"Successfully analyzed: Breast cancer\n",
"11 - Analyzing: Campylobacteriosis\n",
"Successfully analyzed: Campylobacteriosis\n",
"12 - Analyzing: Carciac myxoma\n",
"Successfully analyzed: Carciac myxoma\n",
"13 - Analyzing: Carrion's disease\n",
"Successfully analyzed: Carrion's disease\n",
"14 - Analyzing: Cerebral salt-wasting syndrome\n",
"Successfully analyzed: Cerebral salt-wasting syndrome\n",
"15 - Analyzing: Cerebrovascular disease\n",
"Successfully analyzed: Cerebrovascular disease\n",
"16 - Analyzing: Chlamydia infection\n",
"Successfully analyzed: Chlamydia infection\n",
"17 - Analyzing: Cholesterol embolism\n",
"Successfully analyzed: Cholesterol embolism\n",
"18 - Analyzing: Coccidioidomycosis\n",
"Successfully analyzed: Coccidioidomycosis\n",
"19 - Analyzing: Conversion disorder\n",
"Successfully analyzed: Conversion disorder\n",
"20 - Analyzing: Cryoglobulinemia\n",
"Successfully analyzed: Cryoglobulinemia\n",
"21 - Analyzing: Diphtheria\n",
"Successfully analyzed: Diphtheria\n",
"22 - Analyzing: Erysipelas\n",
"Successfully analyzed: Erysipelas\n",
"23 - Analyzing: Erythema nodosum\n",
"Successfully analyzed: Erythema nodosum\n",
"24 - Analyzing: Ethylene glycol poisoning\n",
"Successfully analyzed: Ethylene glycol poisoning\n",
"25 - Analyzing: Felty's syndrome\n",
"Successfully analyzed: Felty's syndrome\n",
"26 - Analyzing: Food intolerance\n",
"Successfully analyzed: Food intolerance\n",
"27 - Analyzing: Gastroparesis\n",
"Successfully analyzed: Gastroparesis\n",
"28 - Analyzing: Generalized anxiety disorder\n",
"Successfully analyzed: Generalized anxiety disorder\n",
"29 - Analyzing: GM1 gangliosidoses\n",
"Successfully analyzed: GM1 gangliosidoses\n",
"30 - Analyzing: Helicobacter pylori\n",
"Successfully analyzed: Helicobacter pylori\n",
"31 - Analyzing: Hemolytic-uremic syndrome\n",
"Successfully analyzed: Hemolytic-uremic syndrome\n",
"32 - Analyzing: Herpes labialis\n",
"Successfully analyzed: Herpes labialis\n",
"33 - Analyzing: Hypercalcaemia\n",
"Successfully analyzed: Hypercalcaemia\n",
"34 - Analyzing: Hyperosmolar hyperglycemic state\n",
"Successfully analyzed: Hyperosmolar hyperglycemic state\n",
"35 - Analyzing: Hypervitaminosis A\n",
"Successfully analyzed: Hypervitaminosis A\n",
"36 - Analyzing: Hypocalcaemia\n",
"Successfully analyzed: Hypocalcaemia\n",
"37 - Analyzing: Hypomagnesemia\n",
"Successfully analyzed: Hypomagnesemia\n",
"38 - Analyzing: Hypovolemia\n",
"Successfully analyzed: Hypovolemia\n",
"39 - Analyzing: Inborn error of metabolism\n",
"Successfully analyzed: Inborn error of metabolism\n",
"40 - Analyzing: Influenza\n",
"Successfully analyzed: Influenza\n",
"41 - Analyzing: Intention tremor\n",
"Successfully analyzed: Intention tremor\n",
"42 - Analyzing: Intraparenchymal hemorrhage\n",
"Successfully analyzed: Intraparenchymal hemorrhage\n",
"43 - Analyzing: Itch\n",
"Successfully analyzed: Itch\n",
"44 - Analyzing: Juvenile dermatomyositis\n",
"Successfully analyzed: Juvenile dermatomyositis\n",
"45 - Analyzing: Kaposi's sarcoma\n",
"Successfully analyzed: Kaposi's sarcoma\n",
"46 - Analyzing: Lambert–Eaton myasthenic syndrome\n",
"Successfully analyzed: Lambert–Eaton myasthenic syndrome\n",
"47 - Analyzing: Laryngitis\n",
"Successfully analyzed: Laryngitis\n",
"48 - Analyzing: Lateral medullary syndrome\n",
"Successfully analyzed: Lateral medullary syndrome\n",
"49 - Analyzing: MERRF syndrome\n",
"Successfully analyzed: MERRF syndrome\n",
"50 - Analyzing: Metal fume fever\n",
"Successfully analyzed: Metal fume fever\n",
"51 - Analyzing: Morvan's syndrome\n",
"Successfully analyzed: Morvan's syndrome\n",
"52 - Analyzing: Myocarditis\n",
"Successfully analyzed: Myocarditis\n",
"53 - Analyzing: Necatoriasis\n",
"Successfully analyzed: Necatoriasis\n",
"54 - Analyzing: Nicotine poisoning\n",
"Successfully analyzed: Nicotine poisoning\n",
"55 - Analyzing: Non-alcoholic fatty liver disease\n",
"Successfully analyzed: Non-alcoholic fatty liver disease\n",
"56 - Analyzing: Non-small-cell lung carcinoma\n",
"Successfully analyzed: Non-small-cell lung carcinoma\n",
"57 - Analyzing: Normal pressure hydrocephalus\n",
"Successfully analyzed: Normal pressure hydrocephalus\n",
"58 - Analyzing: Obesity hypoventilation syndrome\n",
"Successfully analyzed: Obesity hypoventilation syndrome\n",
"59 - Analyzing: Opioid use disorder\n",
"Successfully analyzed: Opioid use disorder\n",
"60 - Analyzing: Optic neuritis\n",
"Successfully analyzed: Optic neuritis\n",
"61 - Analyzing: Orofacial granulomatosis\n",
"Successfully analyzed: Orofacial granulomatosis\n",
"62 - Analyzing: Orthostatic hypotension\n",
"Successfully analyzed: Orthostatic hypotension\n",
"63 - Analyzing: Pancreatic cancer\n",
"Successfully analyzed: Pancreatic cancer\n",
"64 - Analyzing: Panic attack\n",
"Successfully analyzed: Panic attack\n",
"65 - Analyzing: Paratyphoid fever\n",
"Successfully analyzed: Paratyphoid fever\n",
"66 - Analyzing: Parry–Romberg syndrome\n",
"Successfully analyzed: Parry–Romberg syndrome\n",
"67 - Analyzing: Pituitary apoplexy\n",
"Successfully analyzed: Pituitary apoplexy\n",
"68 - Analyzing: Polyarteritis nodosa\n",
"Successfully analyzed: Polyarteritis nodosa\n",
"69 - Analyzing: Porencephaly\n",
"Successfully analyzed: Porencephaly\n",
"70 - Analyzing: Prediabetes\n",
"Successfully analyzed: Prediabetes\n",
"71 - Analyzing: Pregnancy\n",
"Successfully analyzed: Pregnancy\n",
"72 - Analyzing: Premenstrual syndrome\n",
"Successfully analyzed: Premenstrual syndrome\n",
"73 - Analyzing: Primary hyperparathyroidism\n",
"Successfully analyzed: Primary hyperparathyroidism\n",
"74 - Analyzing: Primary sclerosing cholangitis\n",
"Successfully analyzed: Primary sclerosing cholangitis\n",
"75 - Analyzing: Reactive hypoglycemia\n",
"Successfully analyzed: Reactive hypoglycemia\n",
"76 - Analyzing: Reflex syncope\n",
"Successfully analyzed: Reflex syncope\n",
"77 - Analyzing: Ross River fever\n",
"Successfully analyzed: Ross River fever\n",
"78 - Analyzing: Rubella\n",
"Successfully analyzed: Rubella\n",
"79 - Analyzing: Scarlet fever\n",
"Successfully analyzed: Scarlet fever\n",
"80 - Analyzing: Scleroderma\n",
"Successfully analyzed: Scleroderma\n",
"81 - Analyzing: Snakebite\n",
"Successfully analyzed: Snakebite\n",
"82 - Analyzing: Soy allergy\n",
"Successfully analyzed: Soy allergy\n",
"83 - Analyzing: Streptococcal pharyngitis\n",
"Successfully analyzed: Streptococcal pharyngitis\n",
"84 - Analyzing: Subdural hematoma\n",
"Successfully analyzed: Subdural hematoma\n",
"85 - Analyzing: Superior mesenteric artery syndrome\n",
"Successfully analyzed: Superior mesenteric artery syndrome\n",
"86 - Analyzing: Taeniasis\n",
"Successfully analyzed: Taeniasis\n",
"87 - Analyzing: Tetanus\n",
"Successfully analyzed: Tetanus\n",
"88 - Analyzing: Tethered spinal cord syndrome\n",
"Successfully analyzed: Tethered spinal cord syndrome\n",
"89 - Analyzing: Thyroid storm\n",
"Successfully analyzed: Thyroid storm\n",
"90 - Analyzing: Thyrotoxic periodic paralysis\n",
"Successfully analyzed: Thyrotoxic periodic paralysis\n",
"91 - Analyzing: Tonsillitis\n",
"Successfully analyzed: Tonsillitis\n",
"92 - Analyzing: Trichinosis\n",
"Successfully analyzed: Trichinosis\n",
"93 - Analyzing: Tropical sprue\n",
"Successfully analyzed: Tropical sprue\n",
"94 - Analyzing: Typhoid fever\n",
"Successfully analyzed: Typhoid fever\n",
"95 - Analyzing: Upper respiratory tract infection\n",
"Successfully analyzed: Upper respiratory tract infection\n",
"96 - Analyzing: Vascular dementia\n",
"Successfully analyzed: Vascular dementia\n",
"97 - Analyzing: Waterhouse–Friderichsen syndrome\n",
"Successfully analyzed: Waterhouse–Friderichsen syndrome\n",
"98 - Analyzing: Weight loss\n",
"Successfully analyzed: Weight loss\n",
"99 - Analyzing: Whipple's disease\n",
"Successfully analyzed: Whipple's disease\n",
"100 - Analyzing: Xerostomia\n",
"Successfully analyzed: Xerostomia\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Prompt | \n",
" Disease name failed | \n",
" GPT Output | \n",
" Error message | \n",
" GPT Output Part A | \n",
" GPT Output Part B | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Prompt, Disease name failed, GPT Output, Error message, GPT Output Part A, GPT Output Part B]\n",
"Index: []"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"if useSimplifiedEntityDescription:\n",
" prompt_name = \"Prompt 5_simplified\"\n",
" abbreviation = \"_P5_simplified\"\n",
" # Open Prompt Engineering\\TUI_description_text_simplified.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text_simplified.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description = \"Basic Prompt but extracting according to simplified semantic type categories text with explanation (CoT) and one-shot learning.\"\n",
"else:\n",
" prompt_name = \"Prompt 5\"\n",
" abbreviation = \"_P5\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description = \"Basic Prompt but extracting according to semantic type categories with explanation (CoT) and one-shot learning.\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_fifth_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_fifth_prompt):\n",
" os.makedirs(result_path_fifth_prompt)\n",
"\n",
"fifth_prompt_text = \"Extract and list all the terms that can be classified into any of the categories in the \\\n",
"“TUI code and Semantic type description text” and are related to the condition found in the “text to analyze”. \\\n",
"Classify each extracted term into one of the categories in the “TUI code and Semantic type description text”. \\\n",
"Explain why the entity was extracted and why it was classified as such.\"\n",
"\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output with Explanation.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output_with_explanation = f.read()\n",
"\n",
"# Creating and saving the prompt\n",
"fifth_prompt_text = fifth_prompt_text + \"\\n\" + JSON_format_output_with_explanation\n",
"fifth_prompt_text = fifth_prompt_text + \"\\n\\n\" + \"TUI code and Semantic type description text: \\n\\n\" + TUI_description_text\n",
"save_prompt(prompt_name, fifth_prompt_text, prompt_description=prompt_description)\n",
"\n",
"# Open Texto Ejemplo de Anemia\n",
"with open(r\"Prompt Engineering\\Texto Ejemplo - Anemia.txt\", 'r', encoding='utf-8') as f:\n",
" example_text = f.read()\n",
"# Open Texto Ejemplo sin CoT\n",
"with open(r\"Prompt Engineering\\Texto Ejemplo Respuesta con CoT - Anemia.txt\", 'r', encoding='utf-8') as f:\n",
" example_text_answer_con_CoT = f.read()\n",
"\n",
"failed_texts_name = []\n",
"failed_texts = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_fifth_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for index, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read() # Disease text\n",
" condition_name = disease_text.split(\".\")[0]\n",
" message_text = [{\"role\":\"system\",\"content\":fifth_prompt_text},\n",
" {\"role\":\"user\", \"content\":\"Text to analyze: \\n\\n\" + example_text},\n",
" {\"role\":\"assistant\",\"content\":example_text_answer_con_CoT},\n",
" {\"role\":\"user\",\"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" print(f\"{index+1} - Analyzing: \", condition_name)\n",
" try:\n",
" GPT_answer = get_response_LLM(prompt=message_text)\n",
" df = transform_GPT_output(GPT_answer)\n",
" df.to_excel(os.path.join(result_path_fifth_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=model_name ,index=False)\n",
" print(\"Successfully analyzed: \", condition_name)\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts.append(GPT_answer)\n",
" # Save the error message from the exception\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze: \", condition_name)\n",
"\n",
"# Save failed texts\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output\": failed_texts, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 6: Prompt Chaining\n",
"- 1. First extract all the possible terms related to the condition mentioned in the text\n",
"- 2. Classify each term according to the TUI Code and Semantic Categories and discard those that do not fit a category"
]
},
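{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is a minimal sketch of the two-call message flow described above, reusing `get_response_LLM` from the setup cell. `extraction_prompt_text` and `classification_prompt_text` are illustrative placeholders rather than the actual Prompt 6 texts; the full implementation with the real prompt files follows in the next cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch of the two-step prompt chaining (placeholder prompts; the real Prompt 6 texts are loaded in the next cell)\n",
"extraction_prompt_text = \"Extract all the terms related to the condition found in the text to analyze. Return the result as a JSON object.\"  # placeholder\n",
"classification_prompt_text = \"Classify each extracted term into one of the TUI semantic type categories and discard terms that fit none. Return the result as a JSON object.\"  # placeholder\n",
"\n",
"def chained_extraction(disease_text_content):\n",
"    # Part A: extract candidate terms from the disease text\n",
"    messages_a = [{\"role\": \"system\", \"content\": extraction_prompt_text},\n",
"                  {\"role\": \"user\", \"content\": \"Text to analyze: \\n\\n\" + disease_text_content}]\n",
"    answer_a = get_response_LLM(prompt=messages_a)\n",
"    # Part B: classify the extracted terms against the TUI categories\n",
"    messages_b = [{\"role\": \"system\", \"content\": classification_prompt_text},\n",
"                  {\"role\": \"user\", \"content\": \"Extracted terms: \\n\\n\" + answer_a}]\n",
"    answer_b = get_response_LLM(prompt=messages_b)\n",
"    return answer_a, answer_b"
]
},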
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 - Analyzing: Acute decompensated heart failure\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Acute decompensated heart failure\n",
"2 - Analyzing: Acute intermittent porphyria\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Acute intermittent porphyria\n",
"3 - Analyzing: Anthrax\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Anthrax\n",
"4 - Analyzing: Arterial embolism\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Arterial embolism\n",
"5 - Analyzing: Arteriovenous malformation\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Arteriovenous malformation\n",
"6 - Analyzing: Ascites\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Ascites\n",
"7 - Analyzing: Autonomic dysreflexia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Autonomic dysreflexia\n",
"8 - Analyzing: Benzodiazepine withdrawal syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Benzodiazepine withdrawal syndrome\n",
"9 - Analyzing: Blastomycosis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Blastomycosis\n",
"10 - Analyzing: Breast cancer\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Breast cancer\n",
"11 - Analyzing: Campylobacteriosis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Campylobacteriosis\n",
"12 - Analyzing: Carciac myxoma\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Carciac myxoma\n",
"13 - Analyzing: Carrion's disease\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Carrion's disease\n",
"14 - Analyzing: Cerebral salt-wasting syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Cerebral salt-wasting syndrome\n",
"15 - Analyzing: Cerebrovascular disease\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Cerebrovascular disease\n",
"16 - Analyzing: Chlamydia infection\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Chlamydia infection\n",
"17 - Analyzing: Cholesterol embolism\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Cholesterol embolism\n",
"18 - Analyzing: Coccidioidomycosis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Coccidioidomycosis\n",
"19 - Analyzing: Conversion disorder\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Conversion disorder\n",
"20 - Analyzing: Cryoglobulinemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Cryoglobulinemia\n",
"21 - Analyzing: Diphtheria\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Diphtheria\n",
"22 - Analyzing: Erysipelas\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Erysipelas\n",
"23 - Analyzing: Erythema nodosum\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Erythema nodosum\n",
"24 - Analyzing: Ethylene glycol poisoning\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Ethylene glycol poisoning\n",
"25 - Analyzing: Felty's syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Felty's syndrome\n",
"26 - Analyzing: Food intolerance\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Food intolerance\n",
"27 - Analyzing: Gastroparesis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Gastroparesis\n",
"28 - Analyzing: Generalized anxiety disorder\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Generalized anxiety disorder\n",
"29 - Analyzing: GM1 gangliosidoses\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: GM1 gangliosidoses\n",
"30 - Analyzing: Helicobacter pylori\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Helicobacter pylori\n",
"31 - Analyzing: Hemolytic-uremic syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hemolytic-uremic syndrome\n",
"32 - Analyzing: Herpes labialis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Herpes labialis\n",
"33 - Analyzing: Hypercalcaemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hypercalcaemia\n",
"34 - Analyzing: Hyperosmolar hyperglycemic state\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hyperosmolar hyperglycemic state\n",
"35 - Analyzing: Hypervitaminosis A\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hypervitaminosis A\n",
"36 - Analyzing: Hypocalcaemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hypocalcaemia\n",
"37 - Analyzing: Hypomagnesemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hypomagnesemia\n",
"38 - Analyzing: Hypovolemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hypovolemia\n",
"39 - Analyzing: Inborn error of metabolism\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Inborn error of metabolism\n",
"40 - Analyzing: Influenza\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Influenza\n",
"41 - Analyzing: Intention tremor\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Intention tremor\n",
"42 - Analyzing: Intraparenchymal hemorrhage\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Intraparenchymal hemorrhage\n",
"43 - Analyzing: Itch\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Itch\n",
"44 - Analyzing: Juvenile dermatomyositis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Juvenile dermatomyositis\n",
"45 - Analyzing: Kaposi's sarcoma\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Kaposi's sarcoma\n",
"46 - Analyzing: Lambert–Eaton myasthenic syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Lambert–Eaton myasthenic syndrome\n",
"47 - Analyzing: Laryngitis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Laryngitis\n",
"48 - Analyzing: Lateral medullary syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Lateral medullary syndrome\n",
"49 - Analyzing: MERRF syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: MERRF syndrome\n",
"50 - Analyzing: Metal fume fever\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Metal fume fever\n",
"51 - Analyzing: Morvan's syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Morvan's syndrome\n",
"52 - Analyzing: Myocarditis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Myocarditis\n",
"53 - Analyzing: Necatoriasis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Necatoriasis\n",
"54 - Analyzing: Nicotine poisoning\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Nicotine poisoning\n",
"55 - Analyzing: Non-alcoholic fatty liver disease\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Non-alcoholic fatty liver disease\n",
"56 - Analyzing: Non-small-cell lung carcinoma\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Non-small-cell lung carcinoma\n",
"57 - Analyzing: Normal pressure hydrocephalus\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Normal pressure hydrocephalus\n",
"58 - Analyzing: Obesity hypoventilation syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Obesity hypoventilation syndrome\n",
"59 - Analyzing: Opioid use disorder\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Opioid use disorder\n",
"60 - Analyzing: Optic neuritis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Optic neuritis\n",
"61 - Analyzing: Orofacial granulomatosis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Orofacial granulomatosis\n",
"62 - Analyzing: Orthostatic hypotension\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Orthostatic hypotension\n",
"63 - Analyzing: Pancreatic cancer\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Pancreatic cancer\n",
"64 - Analyzing: Panic attack\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Panic attack\n",
"65 - Analyzing: Paratyphoid fever\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Paratyphoid fever\n",
"66 - Analyzing: Parry–Romberg syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Parry–Romberg syndrome\n",
"67 - Analyzing: Pituitary apoplexy\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Pituitary apoplexy\n",
"68 - Analyzing: Polyarteritis nodosa\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Polyarteritis nodosa\n",
"69 - Analyzing: Porencephaly\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Porencephaly\n",
"70 - Analyzing: Prediabetes\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Prediabetes\n",
"71 - Analyzing: Pregnancy\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Pregnancy\n",
"72 - Analyzing: Premenstrual syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Premenstrual syndrome\n",
"73 - Analyzing: Primary hyperparathyroidism\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Primary hyperparathyroidism\n",
"74 - Analyzing: Primary sclerosing cholangitis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Primary sclerosing cholangitis\n",
"75 - Analyzing: Reactive hypoglycemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Reactive hypoglycemia\n",
"76 - Analyzing: Reflex syncope\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Reflex syncope\n",
"77 - Analyzing: Ross River fever\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Ross River fever\n",
"78 - Analyzing: Rubella\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Rubella\n",
"79 - Analyzing: Scarlet fever\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Scarlet fever\n",
"80 - Analyzing: Scleroderma\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Scleroderma\n",
"81 - Analyzing: Snakebite\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Snakebite\n",
"82 - Analyzing: Soy allergy\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Soy allergy\n",
"83 - Analyzing: Streptococcal pharyngitis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Streptococcal pharyngitis\n",
"84 - Analyzing: Subdural hematoma\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Subdural hematoma\n",
"85 - Analyzing: Superior mesenteric artery syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Superior mesenteric artery syndrome\n",
"86 - Analyzing: Taeniasis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Taeniasis\n",
"87 - Analyzing: Tetanus\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Tetanus\n",
"88 - Analyzing: Tethered spinal cord syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Tethered spinal cord syndrome\n",
"89 - Analyzing: Thyroid storm\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Thyroid storm\n",
"90 - Analyzing: Thyrotoxic periodic paralysis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Thyrotoxic periodic paralysis\n",
"91 - Analyzing: Tonsillitis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Tonsillitis\n",
"92 - Analyzing: Trichinosis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Trichinosis\n",
"93 - Analyzing: Tropical sprue\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Tropical sprue\n",
"94 - Analyzing: Typhoid fever\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Typhoid fever\n",
"95 - Analyzing: Upper respiratory tract infection\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Upper respiratory tract infection\n",
"96 - Analyzing: Vascular dementia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Vascular dementia\n",
"97 - Analyzing: Waterhouse–Friderichsen syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Waterhouse–Friderichsen syndrome\n",
"98 - Analyzing: Weight loss\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Weight loss\n",
"99 - Analyzing: Whipple's disease\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Whipple's disease\n",
"100 - Analyzing: Xerostomia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Xerostomia\n"
]
}
],
"source": [
"if useSimplifiedEntityDescription:\n",
" prompt_name = \"Prompt 6_simplified\"\n",
" abbreviation = \"_P6_simplified\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text_simplified.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description_PartB = \"Basic Prompt but classifying each term and manifestation into one of the simplified categories of TUI code and Semantic type description text.\"\n",
"\n",
"else:\n",
" prompt_name = \"Prompt 6\"\n",
" abbreviation = \"_P6\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description_PartB = \"Basic Prompt but classifying each term into one of the categories in the TUI code and Semantic type description text.\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_sixth_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_sixth_prompt):\n",
" os.makedirs(result_path_sixth_prompt)\n",
"\n",
"# Part A - Extraction prompt\n",
"sixth_prompt_text_PartA = \"Extract and list all the terms that are related to the condition found in the “text to analyze”.\"\n",
"prompt_description_PartA = \"Basic Prompt but extracting all terms related to the condition.\"\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output with only Entity.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output_only_entity = f.read()\n",
"\n",
"# Creating and saving the prompt A\n",
"sixth_prompt_text_PartA = sixth_prompt_text_PartA + \"\\n\" + JSON_format_output_only_entity\n",
"save_prompt(prompt_name + \"_PartA\", sixth_prompt_text_PartA, prompt_description=prompt_description_PartA)\n",
"\n",
"# Part B - Classification prompt\n",
"sixth_prompt_text_PartB = \"Classify each term of a list according to one of the categories in the “TUI code and Semantic type description text”. \\\n",
"If a term cannot be classified into any of the categories in the “TUI code and Semantic type description text”, classify its TUI code and Semantic type as “Other”.\"\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output = f.read()\n",
"\n",
"# Creating and saving the prompt B\n",
"sixth_prompt_text_PartB = sixth_prompt_text_PartB + \"\\n\" + JSON_format_output\n",
"sixth_prompt_text_PartB = sixth_prompt_text_PartB + \"\\n\\n\" + \"TUI code and Semantic type description text: \\n\\n\" + TUI_description_text\n",
"save_prompt(prompt_name + \"_PartB\", sixth_prompt_text_PartB, prompt_description=prompt_description_PartB)\n",
"\n",
"# Creating lists to store the failed texts\n",
"failed_texts_name = []\n",
"failed_texts_PartA = []\n",
"failed_texts_PartB = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_sixth_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for i, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read() # Disease text\n",
" condition_name = disease_text.split(\".\")[0]\n",
" # Restarting the GPT_answer for each part as a blank string\n",
" GPT_answer_PartA = \"\"\n",
" GPT_answer_PartB = \"Not yet generated\"\n",
"\n",
" # Part A - Extracting the biomedical entities from the text\n",
" message_text_PartA = [{\"role\":\"system\",\"content\":sixth_prompt_text_PartA},\n",
" {\"role\":\"user\",\"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" print(f\"{i + 1} - Analyzing: \",condition_name)\n",
"\n",
" try:\n",
" GPT_answer_PartA = get_response_LLM(prompt=message_text_PartA)\n",
" # Transform the output from Part A into a DataFrame\n",
" df_PartA = transform_GPT_output(GPT_answer_PartA)\n",
" print(\"\\tPart A, entity extraction, success\")\n",
" \n",
" # Part B - Classifying the biomedical entities according to the TUI code and Semantic type description text\n",
" message_text_PartB = [{\"role\":\"system\",\"content\":sixth_prompt_text_PartB},\n",
" {\"role\":\"user\", \"content\":GPT_answer_PartA}]\n",
" GPT_answer_PartB = get_response_LLM(prompt=message_text_PartB)\n",
"\n",
" # Transform the output from Part B into a DataFrame\n",
" df_PartB = transform_GPT_output(GPT_answer_PartB)\n",
" print(\"\\tPart B, entity classification, success\")\n",
"\n",
" # Save the DataFrames in an Excel file\n",
" df_PartB.to_excel(os.path.join(result_path_sixth_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=\"PartB_Classification\" ,index=False)\n",
" \n",
" # Save the df_PartA in the same excel file but in a different sheet\n",
" with pd.ExcelWriter(os.path.join(result_path_sixth_prompt, condition_name + abbreviation + \".xlsx\"), engine='openpyxl', mode='a') as writer:\n",
" df_PartA.to_excel(writer, sheet_name=\"PartA_Extraction\", index=False)\n",
"\n",
" # Count the number of rows in the DataFrames\n",
" count_PartA = df_PartA.shape[0]\n",
" count_PartB = df_PartB.shape[0]\n",
"\n",
" if count_PartA != count_PartB:\n",
" print(\"\\tThe number of rows in the DataFrames is different\")\n",
" raise Exception(\"The number of rows in the DataFrames is different\")\n",
" else:\n",
" print(\"\\tSuccessfully analyzed: \", condition_name)\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts_PartA.append(GPT_answer_PartA)\n",
" failed_texts_PartB.append(GPT_answer_PartB)\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze: \", condition_name)\n",
"\n",
"# Save failed texts\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output Part A\": failed_texts_PartA, \"GPT Output Part B\": failed_texts_PartB, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)\n",
"# Filter the results of Prompt 6 removing the entities classified as \"Other\" in the TUI Code ans save the new files in a new folder\n",
"filter_prompt6_removeOther(result_path_sixth_prompt)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 6: Prompt Chaining with manifestations\n",
"- 1. First extract all the possible terms and manifestations related to the condition mentioned in the text\n",
"- 2. Classify each term and manifestation according to the TUI Code and Semantic Categories and discard those that do not fit a category"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 - Analyzing: Acute decompensated heart failure\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Acute decompensated heart failure\n",
"2 - Analyzing: Acute intermittent porphyria\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Acute intermittent porphyria\n",
"3 - Analyzing: Anthrax\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Anthrax\n",
"4 - Analyzing: Arterial embolism\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Arterial embolism\n",
"5 - Analyzing: Arteriovenous malformation\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Arteriovenous malformation\n",
"6 - Analyzing: Ascites\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Ascites\n",
"7 - Analyzing: Autonomic dysreflexia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Autonomic dysreflexia\n",
"8 - Analyzing: Benzodiazepine withdrawal syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Benzodiazepine withdrawal syndrome\n",
"9 - Analyzing: Blastomycosis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Blastomycosis\n",
"10 - Analyzing: Breast cancer\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Breast cancer\n",
"11 - Analyzing: Campylobacteriosis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Campylobacteriosis\n",
"12 - Analyzing: Carciac myxoma\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Carciac myxoma\n",
"13 - Analyzing: Carrion's disease\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Carrion's disease\n",
"14 - Analyzing: Cerebral salt-wasting syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Cerebral salt-wasting syndrome\n",
"15 - Analyzing: Cerebrovascular disease\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Cerebrovascular disease\n",
"16 - Analyzing: Chlamydia infection\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Chlamydia infection\n",
"17 - Analyzing: Cholesterol embolism\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Cholesterol embolism\n",
"18 - Analyzing: Coccidioidomycosis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Coccidioidomycosis\n",
"19 - Analyzing: Conversion disorder\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Conversion disorder\n",
"20 - Analyzing: Cryoglobulinemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Cryoglobulinemia\n",
"21 - Analyzing: Diphtheria\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Diphtheria\n",
"22 - Analyzing: Erysipelas\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Erysipelas\n",
"23 - Analyzing: Erythema nodosum\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Erythema nodosum\n",
"24 - Analyzing: Ethylene glycol poisoning\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Ethylene glycol poisoning\n",
"25 - Analyzing: Felty's syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Felty's syndrome\n",
"26 - Analyzing: Food intolerance\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Food intolerance\n",
"27 - Analyzing: Gastroparesis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Gastroparesis\n",
"28 - Analyzing: Generalized anxiety disorder\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Generalized anxiety disorder\n",
"29 - Analyzing: GM1 gangliosidoses\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: GM1 gangliosidoses\n",
"30 - Analyzing: Helicobacter pylori\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Helicobacter pylori\n",
"31 - Analyzing: Hemolytic-uremic syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hemolytic-uremic syndrome\n",
"32 - Analyzing: Herpes labialis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Herpes labialis\n",
"33 - Analyzing: Hypercalcaemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hypercalcaemia\n",
"34 - Analyzing: Hyperosmolar hyperglycemic state\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hyperosmolar hyperglycemic state\n",
"35 - Analyzing: Hypervitaminosis A\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hypervitaminosis A\n",
"36 - Analyzing: Hypocalcaemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hypocalcaemia\n",
"37 - Analyzing: Hypomagnesemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hypomagnesemia\n",
"38 - Analyzing: Hypovolemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Hypovolemia\n",
"39 - Analyzing: Inborn error of metabolism\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Inborn error of metabolism\n",
"40 - Analyzing: Influenza\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Influenza\n",
"41 - Analyzing: Intention tremor\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Intention tremor\n",
"42 - Analyzing: Intraparenchymal hemorrhage\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Intraparenchymal hemorrhage\n",
"43 - Analyzing: Itch\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Itch\n",
"44 - Analyzing: Juvenile dermatomyositis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Juvenile dermatomyositis\n",
"45 - Analyzing: Kaposi's sarcoma\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Kaposi's sarcoma\n",
"46 - Analyzing: Lambert–Eaton myasthenic syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Lambert–Eaton myasthenic syndrome\n",
"47 - Analyzing: Laryngitis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Laryngitis\n",
"48 - Analyzing: Lateral medullary syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Lateral medullary syndrome\n",
"49 - Analyzing: MERRF syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: MERRF syndrome\n",
"50 - Analyzing: Metal fume fever\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Metal fume fever\n",
"51 - Analyzing: Morvan's syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Morvan's syndrome\n",
"52 - Analyzing: Myocarditis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Myocarditis\n",
"53 - Analyzing: Necatoriasis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Necatoriasis\n",
"54 - Analyzing: Nicotine poisoning\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Nicotine poisoning\n",
"55 - Analyzing: Non-alcoholic fatty liver disease\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Non-alcoholic fatty liver disease\n",
"56 - Analyzing: Non-small-cell lung carcinoma\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Non-small-cell lung carcinoma\n",
"57 - Analyzing: Normal pressure hydrocephalus\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Normal pressure hydrocephalus\n",
"58 - Analyzing: Obesity hypoventilation syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Obesity hypoventilation syndrome\n",
"59 - Analyzing: Opioid use disorder\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Opioid use disorder\n",
"60 - Analyzing: Optic neuritis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Optic neuritis\n",
"61 - Analyzing: Orofacial granulomatosis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Orofacial granulomatosis\n",
"62 - Analyzing: Orthostatic hypotension\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Orthostatic hypotension\n",
"63 - Analyzing: Pancreatic cancer\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Pancreatic cancer\n",
"64 - Analyzing: Panic attack\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Panic attack\n",
"65 - Analyzing: Paratyphoid fever\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Paratyphoid fever\n",
"66 - Analyzing: Parry–Romberg syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Parry–Romberg syndrome\n",
"67 - Analyzing: Pituitary apoplexy\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Pituitary apoplexy\n",
"68 - Analyzing: Polyarteritis nodosa\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Polyarteritis nodosa\n",
"69 - Analyzing: Porencephaly\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Porencephaly\n",
"70 - Analyzing: Prediabetes\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Prediabetes\n",
"71 - Analyzing: Pregnancy\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Pregnancy\n",
"72 - Analyzing: Premenstrual syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Premenstrual syndrome\n",
"73 - Analyzing: Primary hyperparathyroidism\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Primary hyperparathyroidism\n",
"74 - Analyzing: Primary sclerosing cholangitis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Primary sclerosing cholangitis\n",
"75 - Analyzing: Reactive hypoglycemia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Reactive hypoglycemia\n",
"76 - Analyzing: Reflex syncope\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Reflex syncope\n",
"77 - Analyzing: Ross River fever\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Ross River fever\n",
"78 - Analyzing: Rubella\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Rubella\n",
"79 - Analyzing: Scarlet fever\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Scarlet fever\n",
"80 - Analyzing: Scleroderma\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Scleroderma\n",
"81 - Analyzing: Snakebite\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Snakebite\n",
"82 - Analyzing: Soy allergy\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Soy allergy\n",
"83 - Analyzing: Streptococcal pharyngitis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Streptococcal pharyngitis\n",
"84 - Analyzing: Subdural hematoma\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Subdural hematoma\n",
"85 - Analyzing: Superior mesenteric artery syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Superior mesenteric artery syndrome\n",
"86 - Analyzing: Taeniasis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Taeniasis\n",
"87 - Analyzing: Tetanus\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Tetanus\n",
"88 - Analyzing: Tethered spinal cord syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Tethered spinal cord syndrome\n",
"89 - Analyzing: Thyroid storm\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Thyroid storm\n",
"90 - Analyzing: Thyrotoxic periodic paralysis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Thyrotoxic periodic paralysis\n",
"91 - Analyzing: Tonsillitis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Tonsillitis\n",
"92 - Analyzing: Trichinosis\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Trichinosis\n",
"93 - Analyzing: Tropical sprue\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Tropical sprue\n",
"94 - Analyzing: Typhoid fever\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Typhoid fever\n",
"95 - Analyzing: Upper respiratory tract infection\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Upper respiratory tract infection\n",
"96 - Analyzing: Vascular dementia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Vascular dementia\n",
"97 - Analyzing: Waterhouse–Friderichsen syndrome\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Waterhouse–Friderichsen syndrome\n",
"98 - Analyzing: Weight loss\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Weight loss\n",
"99 - Analyzing: Whipple's disease\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Whipple's disease\n",
"100 - Analyzing: Xerostomia\n",
"\tPart A, entity extraction, success\n",
"\tPart B, entity classification, success\n",
"\tSuccessfully analyzed: Xerostomia\n"
]
}
],
"source": [
"if useSimplifiedEntityDescription:\n",
" prompt_name = \"Prompt 6_simplified_plusManifestations\"\n",
" abbreviation = \"_P6_simplified_plusManifestations\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text_simplified.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description_PartB = \"Basic Prompt but classifying each term and manifestation into one of the simplified categories of TUI code and Semantic type description text.\"\n",
"\n",
"else:\n",
" prompt_name = \"Prompt 6_plusManifestations\"\n",
" abbreviation = \"_P6_plusManifestations\"\n",
" # Open Prompt Engineering\\TUI_description_text.txt to get the TUI code and Semantic type description text\n",
" with open(r\"Prompt Engineering\\TUI_description_text.txt\", 'r', encoding='utf-8') as f:\n",
" TUI_description_text = f.read()\n",
" prompt_description_PartB = \"Basic Prompt but classifying each term into one of the categories in the TUI code and Semantic type description text.\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_sixth_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_sixth_prompt):\n",
" os.makedirs(result_path_sixth_prompt)\n",
"\n",
"# Part A - Extraction prompt\n",
"sixth_prompt_text_PartA = \"Extract and list all the terms and manifestations that are related to the condition found in the “text to analyze”.\"\n",
"prompt_description_PartA = \"Basic Prompt but extracting all terms related to the condition.\"\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output with only Entity.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output_only_entity = f.read()\n",
"\n",
"# Creating and saving the prompt A\n",
"sixth_prompt_text_PartA = sixth_prompt_text_PartA + \"\\n\" + JSON_format_output_only_entity\n",
"save_prompt(prompt_name + \"_PartA\", sixth_prompt_text_PartA, prompt_description=prompt_description_PartA)\n",
"\n",
"# Part B - Classification prompt\n",
"sixth_prompt_text_PartB = \"Classify each term and manifestation of a list according to one of the categories in the “TUI code and Semantic type description text”. \\\n",
"If a term cannot be classified into any of the categories in the “TUI code and Semantic type description text”, classify its TUI code and Semantic type as “Other”.\"\n",
"\n",
"# Open the JSON format otuput\n",
"with open(r\"Prompt Engineering\\JSON format output.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output = f.read()\n",
"\n",
"# Creating and saving the prompt B\n",
"sixth_prompt_text_PartB = sixth_prompt_text_PartB + \"\\n\" + JSON_format_output\n",
"sixth_prompt_text_PartB = sixth_prompt_text_PartB + \"\\n\\n\" + \"TUI code and Semantic type description text: \\n\\n\" + TUI_description_text\n",
"save_prompt(prompt_name + \"_PartB\", sixth_prompt_text_PartB, prompt_description=prompt_description_PartB)\n",
"\n",
"# Creating lists to store the failed texts\n",
"failed_texts_name = []\n",
"failed_texts_PartA = []\n",
"failed_texts_PartB = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_sixth_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"for i, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read() # Disease text\n",
" condition_name = disease_text.split(\".\")[0]\n",
" # Restarting the GPT_answer for each part as a blank string\n",
" GPT_answer_PartA = \"\"\n",
" GPT_answer_PartB = \"Not yet generated\"\n",
"\n",
" # Part A - Extracting the biomedical entities from the text\n",
" message_text_PartA = [{\"role\":\"system\",\"content\":sixth_prompt_text_PartA},\n",
" {\"role\":\"user\",\"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" print(f\"{i + 1} - Analyzing: \",condition_name)\n",
"\n",
" try:\n",
" GPT_answer_PartA = get_response_LLM(prompt=message_text_PartA)\n",
" # Transform the output from Part A into a DataFrame\n",
" df_PartA = transform_GPT_output(GPT_answer_PartA)\n",
" print(\"\\tPart A, entity extraction, success\")\n",
" \n",
" # Part B - Classifying the biomedical entities according to the TUI code and Semantic type description text\n",
" message_text_PartB = [{\"role\":\"system\",\"content\":sixth_prompt_text_PartB},\n",
" {\"role\":\"user\", \"content\":GPT_answer_PartA}]\n",
" GPT_answer_PartB = get_response_LLM(prompt=message_text_PartB)\n",
"\n",
" # Transform the output from Part B into a DataFrame\n",
" df_PartB = transform_GPT_output(GPT_answer_PartB)\n",
" print(\"\\tPart B, entity classification, success\")\n",
"\n",
" # Save the DataFrames in an Excel file\n",
" df_PartB.to_excel(os.path.join(result_path_sixth_prompt, condition_name + abbreviation + \".xlsx\"), sheet_name=\"PartB_Classification\" ,index=False)\n",
" \n",
" # Save the df_PartA in the same excel file but in a different sheet\n",
" with pd.ExcelWriter(os.path.join(result_path_sixth_prompt, condition_name + abbreviation + \".xlsx\"), engine='openpyxl', mode='a') as writer:\n",
" df_PartA.to_excel(writer, sheet_name=\"PartA_Extraction\", index=False)\n",
"\n",
" # Count the number of rows in the DataFrames\n",
" count_PartA = df_PartA.shape[0]\n",
" count_PartB = df_PartB.shape[0]\n",
"\n",
" if count_PartA != count_PartB:\n",
" print(\"\\tThe number of rows in the DataFrames is different\")\n",
" raise Exception(\"The number of rows in the DataFrames is different\")\n",
" else:\n",
" print(\"\\tSuccessfully analyzed: \", condition_name)\n",
" except Exception as e:\n",
" failed_texts_name.append(condition_name)\n",
" failed_texts_PartA.append(GPT_answer_PartA)\n",
" failed_texts_PartB.append(GPT_answer_PartB)\n",
" failed_texts_error_message.append(str(e))\n",
" print(\"Failed to analyze: \", condition_name)\n",
"\n",
"# Save failed texts\n",
"failed_df = pd.DataFrame({\"Disease name failed\": failed_texts_name, \"GPT Output Part A\": failed_texts_PartA, \"GPT Output Part B\": failed_texts_PartB, \"Error message\": failed_texts_error_message})\n",
"failed_path = os.path.join(path_to_general_results, model_name)\n",
"failed_df.to_excel(os.path.join(failed_path, \"Failed texts \" + prompt_name + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"generate_summary_failed_texts(failed_path)\n",
"\n",
"# Filter the results of Prompt 6 removing the entities classified as \"Other\" in the TUI Code ans save the new files in a new folder\n",
"filter_prompt6_removeOther(result_path_sixth_prompt)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prompt 7 - Entity One by One, doing batch processing"
]
},
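{
"cell_type": "markdown",
"metadata": {},
"source": [
"Prompt 7 sends one request per disease text and per TUI code / Semantic type pair, so each semantic type is extracted separately. The requests are written to JSONL files, submitted through the OpenAI Batch API, and the downloaded responses are parsed back into one Excel file per disease."
]
},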
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating batch request"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Batch 1 request saved in: Resultados de Prompts\\gpt-4o-mini\\Prompt 7_OneByOne\\batch_request_Prompt 7_OneByOne_part1.jsonl\n",
"Batch 2 request saved in: Resultados de Prompts\\gpt-4o-mini\\Prompt 7_OneByOne\\batch_request_Prompt 7_OneByOne_part2.jsonl\n",
"Total tokens in the batch request: 2465676\n"
]
}
],
"source": [
"prompt_name = \"Prompt 7_OneByOne\"\n",
"abbreviation = \"_P7_OneByOne\"\n",
"prompt_description = \"Prompt 7 extracts each type of identity one by one for each text.\"\n",
"\n",
"# Create the path for the results of the prompt\n",
"result_path_seventh_prompt = os.path.join(path_to_general_results, model_name, prompt_name)\n",
"\n",
"# Create the folder if it does not exist\n",
"if not os.path.exists(result_path_seventh_prompt):\n",
" os.makedirs(result_path_seventh_prompt)\n",
"\n",
"# Open the JSON format output\n",
"with open(r\"Prompt Engineering\\JSON format output.txt\", 'r', encoding='utf-8') as f:\n",
" JSON_format_output = f.read()\n",
"\n",
"# Load the TUI code and Semantic type categories\n",
"path_to_TUI_seman_type_description = r\"Prompt Engineering\\TUI_Codes_Semantic_Types_to_look_for.xlsx\"\n",
"TUI_semantic_type_description_df = pd.read_excel(path_to_TUI_seman_type_description)\n",
"\n",
"seventh_prompt_text = r\"\"\"Extract and list all the {insert semantic type} that are related to the condition found in the \"text to analyze\". \\\n",
" Classify each extracted term into TUI code {insert TUI code} and Semantic type {insert semantic type}.\"\"\"\n",
"seventh_prompt_text = seventh_prompt_text + \"\\n\" + JSON_format_output\n",
"\n",
"save_prompt(prompt_name, seventh_prompt_text, prompt_description=prompt_description)\n",
"\n",
"failed_texts_name = []\n",
"failed_texts = []\n",
"failed_texts_error_message = []\n",
"\n",
"# Create a list of files for which the text has already been analyzed\n",
"analyzed_files = [file.split(abbreviation)[0] for file in os.listdir(result_path_seventh_prompt) if file.endswith(\".xlsx\")]\n",
"\n",
"batch_request_1 = []\n",
"batch_request_2 = []\n",
"token_counter = 0\n",
"def create_prompt_request(message_text, condition_name, TUI_Code, Semantic_Type):\n",
" request_ID = f\"{prompt_name}_{condition_name}_{Semantic_Type}_{TUI_Code}\"\n",
" return {\"custom_id\": request_ID, \"method\": \"POST\", \"url\": \"/v1/chat/completions\", \"body\": {\"model\": model_name, \"temperature\": 0, \"response_format\":{ \"type\": \"json_object\" }, \"messages\": message_text}}\n",
"\n",
"for index, disease_text in enumerate(os.listdir(path_to_texts)):\n",
" # Skip the files that have already been analyzed\n",
" if disease_text.split(\".\")[0] in analyzed_files:\n",
" continue\n",
" with open(os.path.join(path_to_texts, disease_text), 'r', encoding='utf-8') as f:\n",
" text = f.read() # Disease text\n",
" condition_name = disease_text.split(\".\")[0] # Name of the disease\n",
" for TUI_Code, Semantic_Type in zip(TUI_semantic_type_description_df[\"TUI Code\"], TUI_semantic_type_description_df[\"Semantic Type\"]):\n",
" seventh_prompt_text = (\n",
" f'Extract and list all the \"{Semantic_Type}\" that are related to the condition found in the \"text to analyze\". '\n",
" f'Classify each extracted term as TUI code \"{TUI_Code}\" and Semantic type \"{Semantic_Type}\".\\n'\n",
" 'Format the output as JSON in the following structure:\\n\\n'\n",
" '[\\n'\n",
" ' {\\n'\n",
" ' \"Entities\": [\\n'\n",
" ' {\\n'\n",
" f' \"Entity\": \"Entity Name 1\",\\n'\n",
" f' \"TUI_Code\": \"{TUI_Code}\",\\n'\n",
" f' \"Semantic_Type\": \"{Semantic_Type}\"\\n'\n",
" ' },\\n'\n",
" ' {\\n'\n",
" f' \"Entity\": \"Entity Name 2\",\\n'\n",
" f' \"TUI_Code\": \"{TUI_Code}\",\\n'\n",
" f' \"Semantic_Type\": \"{Semantic_Type}\"\\n'\n",
" ' }\\n'\n",
" ' ]\\n'\n",
" ' }\\n'\n",
" ']\\n\\n'\n",
" 'Ensure each object in the \"Entities\" array contains the keys: \"Entity\", \"TUI_Code\", and \"Semantic_Type\".'\n",
" )\n",
" message_text = [{\"role\":\"system\",\"content\":seventh_prompt_text},\n",
" {\"role\":\"user\", \"content\":\"Text to analyze: \\n\\n\" + text}]\n",
" prompt_request = create_prompt_request(message_text, condition_name, TUI_Code, Semantic_Type)\n",
" token_counter += count_tokens_in_text(prompt_request.__str__())\n",
" # The token limit is 2000000 so we split the requests in two batches\n",
" if token_counter < 1100000:\n",
" batch_request_1.append(prompt_request)\n",
" else:\n",
" batch_request_2.append(prompt_request)\n",
"\n",
"# Save the batch request in a JSONL file\n",
"file_name_1 = f\"batch_request_{prompt_name}_part1.jsonl\"\n",
"file_name_2 = f\"batch_request_{prompt_name}_part2.jsonl\"\n",
"path_to_batch_request_part1 = os.path.join(result_path_seventh_prompt, file_name_1)\n",
"path_to_batch_request_part2 = os.path.join(result_path_seventh_prompt, file_name_2)\n",
"with open(path_to_batch_request_part1, 'w') as jsonl_file:\n",
" for record in batch_request_1:\n",
" jsonl_file.write(json.dumps(record) + '\\n') # Convert each dict to a JSON string and add a newline\n",
"with open(path_to_batch_request_part2, 'w') as jsonl_file:\n",
" for record in batch_request_2:\n",
" jsonl_file.write(json.dumps(record) + '\\n') # Convert each dict to a JSON string and add a newline\n",
"print(\"Batch 1 request saved in: \", path_to_batch_request_part1)\n",
"print(\"Batch 2 request saved in: \", path_to_batch_request_part2)\n",
"print(\"Total tokens in the batch request: \", token_counter)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sending Batch Request"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"### 1st batch request\n",
"\n",
"# batch_input_file_part1 = openAI_client.files.create(\n",
"# file=open(path_to_batch_request_part1, \"rb\"),\n",
"# purpose=\"batch\"\n",
"# )\n",
"\n",
"# batch_input_file_id_part1 = batch_input_file_part1.id\n",
"\n",
"# batch_info_part1 = openAI_client.batches.create(\n",
"# input_file_id=batch_input_file_id_part1,\n",
"# endpoint=\"/v1/chat/completions\",\n",
"# completion_window=\"24h\",\n",
"# metadata={\n",
"# \"description\": \"Batch request for Prompt 7_OneByOne - Part 1\"\n",
"# }\n",
"# )\n",
"\n",
"### 2nd batch request\n",
"\n",
"# batch_input_file_part2 = openAI_client.files.create(\n",
"# file=open(path_to_batch_request_part2, \"rb\"),\n",
"# purpose=\"batch\"\n",
"# )\n",
"\n",
"# batch_input_file_id_part2 = batch_input_file_part2.id\n",
"\n",
"# batch_info_part2 = openAI_client.batches.create(\n",
"# input_file_id=batch_input_file_id_part2,\n",
"# endpoint=\"/v1/chat/completions\",\n",
"# completion_window=\"24h\",\n",
"# metadata={\n",
"# \"description\": \"Batch request for Prompt 7_OneByOne - Part 2 (take 2)\"\n",
"# }\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Batch ID: batch_66fee64f6808819096eefff5f6fae825\n",
"Metadata: {'description': 'Batch request for Prompt 7_OneByOne - Part 2 (take 2)'}\n",
"Created at: 1727981135\n",
"Errors: None\n",
"Status: completed\n",
"Output file ID: file-ar8HdU9P1bb1OT44oAihKFSA\n",
"\n",
"\n",
"Batch ID: batch_66fece55199c8190ad396df6b2406744\n",
"Metadata: {'description': 'Batch request for Prompt 7_OneByOne - Part 1'}\n",
"Created at: 1727974997\n",
"Errors: None\n",
"Status: completed\n",
"Output file ID: file-KQJIDFd3znNkWthXD3GEdTJH\n",
"\n",
"\n"
]
}
],
"source": [
"# Checking the status of the batch request\n",
"\n",
"status_of_all_batches = openAI_client.batches.list(limit=2)\n",
"\n",
"# Extract batch_id and status from the response\n",
"batch_info = [(batch.id, batch.metadata, batch.created_at, batch.errors, batch.status, batch.output_file_id) for batch in status_of_all_batches.data]\n",
"batch_info\n",
"# Print the extracted info from the batch line by line with titles\n",
"for batch in batch_info:\n",
" print(\"Batch ID: \", batch[0])\n",
" print(\"Metadata: \", batch[1])\n",
" print(\"Created at: \", batch[2])\n",
" print(\"Errors: \", batch[3])\n",
" print(\"Status: \", batch[4])\n",
" print(\"Output file ID: \", batch[5])\n",
" print(\"\\n\")"
]
},
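{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional check (not part of the original run): a single batch can also be retrieved directly by its ID instead of listing the most recent batches. The ID used below is one of the Batch IDs printed above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch, not used in the original run: retrieve one batch directly by its ID\n",
"batch_id_to_check = \"batch_66fece55199c8190ad396df6b2406744\"  # Batch ID taken from the output above\n",
"single_batch = openAI_client.batches.retrieve(batch_id_to_check)\n",
"print(\"Status: \", single_batch.status)\n",
"print(\"Output file ID: \", single_batch.output_file_id)"
]
},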
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Retrieving the files and saving them\n",
"file_response_part1 = openAI_client.files.content(\"file-KQJIDFd3znNkWthXD3GEdTJH\")\n",
"file_response_part2 = openAI_client.files.content(\"file-ar8HdU9P1bb1OT44oAihKFSA\")\n",
"\n",
"# Save the response in a JSONL file\n",
"path_to_response_part1 = os.path.join(result_path_seventh_prompt, \"response_part1.jsonl\")\n",
"path_to_response_part2 = os.path.join(result_path_seventh_prompt, \"response_part2.jsonl\")\n",
"\n",
"with open(path_to_response_part1, 'wb') as f:\n",
" f.write(file_response_part1.read())\n",
"with open(path_to_response_part2, 'wb') as f:\n",
" f.write(file_response_part2.read())"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total duplicates removed: 3\n"
]
}
],
"source": [
"# Parsing each jsonl file and saving the results in an Excel file\n",
"def process_batch_response(path_to_batch_file: str):\n",
" dfs = []\n",
" with (open(path_to_batch_file, 'r')) as f:\n",
" for line in f:\n",
" response = json.loads(line)\n",
" condition_name = response[\"custom_id\"].split(\"_\")[2]\n",
" GPT_answer = response['response']['body']['choices'][0]['message']['content']\n",
" df = transform_GPT_output(GPT_answer)\n",
" # Add the condition name as a column in the DataFrame\n",
" df[\"Condition\"] = condition_name\n",
" dfs.append(df)\n",
" return pd.concat(dfs, ignore_index=True)\n",
"\n",
"accum_df1 = process_batch_response(path_to_response_part1)\n",
"accum_df2 = process_batch_response(path_to_response_part2)\n",
"\n",
"# Concatenate the two DataFrames\n",
"accum_df = pd.concat([accum_df1, accum_df2], ignore_index=True)\n",
"accum_duplicates = 0\n",
"for condition in accum_df[\"Condition\"].unique():\n",
" # Filter the DataFrame by condition and drop the \"Condition\" column\n",
" condition_df = accum_df[accum_df[\"Condition\"] == condition].drop(columns=[\"Condition\"])\n",
" # Remove duplicates and count how many were removed\n",
" accum_duplicates += condition_df.duplicated().sum()\n",
" condition_df = condition_df.drop_duplicates()\n",
" condition_df.to_excel(os.path.join(result_path_seventh_prompt, condition + abbreviation + \".xlsx\"), sheet_name=model_name ,index=False)\n",
"print(\"Total duplicates removed: \", accum_duplicates)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Function to generate a folder with the complete prompt plus text"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"create_full_prompt_record_production()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}