import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def filter_cd_zscore(pnd_df, zscore_threshold=-0.15, distance_quantile=0.75):
    """
    Select the 'unknown' treatment rows whose 'Closest distance' does not exceed
    a quantile of the known treatments and whose 'Dc_zscore' is below a threshold.

    The distance cutoff is the `distance_quantile` quantile (Q3 by default) of
    'Closest distance' computed only over rows where 'Treatment' == 'yes'.

    Input Parameters:
    - pnd_df: DataFrame with the columns 'Treatment', 'Closest distance' and 'Dc_zscore'.
    - zscore_threshold: Rows are kept only when 'Dc_zscore' is strictly below this value
      (default -0.15).
    - distance_quantile: Quantile of the known treatments' 'Closest distance' used as the
      inclusive upper bound (default 0.75, i.e. Q3).

    Returns:
    - filtered_df: DataFrame containing only the rows of pnd_df where 'Treatment' is
      'unknown', 'Dc_zscore' < zscore_threshold and 'Closest distance' <= the cutoff.
      If pnd_df has no 'yes' rows the cutoff is NaN, every comparison is False and
      the result is empty.
    """
    # The cutoff is derived from the known treatments only.
    known_treatments = pnd_df[pnd_df['Treatment'] == 'yes']
    distance_cutoff = known_treatments['Closest distance'].quantile(distance_quantile)

    keep = (
        (pnd_df['Dc_zscore'] < zscore_threshold)
        & (pnd_df['Closest distance'] <= distance_cutoff)
        & (pnd_df['Treatment'] == 'unknown')
    )
    filtered_df = pnd_df[keep]

    return filtered_df


def rep_pnd(pnd_df, filtered_pnd, output_path='../results/pnd_closest_distance_boxplot_filtered.png'):
    """
    Generates side-by-side boxplots to compare 'Closest distance' and 'Personalized Network Distance (PND)' metrics
    across three different treatment status groups.

    The three groups are the rows of pnd_df with 'Treatment' == 'yes' (labelled 'Treatment'),
    the rows with 'Treatment' == 'unknown' (labelled 'All unknown') and every row of
    filtered_pnd (labelled 'Filtered unknown for DR').

    Input Parameters:
    - pnd_df (pd.DataFrame): DataFrame containing drug data with columns:
      - 'Treatment': The treatment status of the drugs ('yes' or 'unknown').
      - 'Closest distance': A numeric measure of the closest distance metric for the drugs.
      - 'PND': A numeric measure of Personalized Network Distance for the drugs.
    - filtered_pnd (pd.DataFrame): A subset of pnd_df filtered for a specific condition, with the same columns as pnd_df.
    - output_path (str): Where the PNG is written. Defaults to the path used historically by this function.

    Returns:
    - None: The function saves a boxplot comparison as a PNG file and displays the plot.
    """
    # One colour per group label, shared by both panels.
    palette = {
        'Treatment': '#FF7A7A',
        'All unknown': '#79C4FF',
        'Filtered unknown for DR': '#CAF0F8',
    }

    # Relabel each subset so the 'Treatment' column carries the group name shown on the x axis.
    combined_data = pd.concat([
        pnd_df[pnd_df['Treatment'] == 'yes'].assign(Treatment='Treatment'),
        pnd_df[pnd_df['Treatment'] == 'unknown'].assign(Treatment='All unknown'),
        filtered_pnd.assign(Treatment='Filtered unknown for DR'),
    ])

    # (column to plot, y-axis label) for the left and right panel respectively.
    # The PND label is a raw string: the mathtext command starts with a backslash,
    # which is an invalid escape sequence in a normal string literal.
    panels = (
        ('Closest distance', 'Closest distance'),
        ('PND', r'Personalized Network Distance ($\mathregular{PND}$)'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))  # 1 row, 2 columns
    for ax, (column, y_label) in zip(axes, panels):
        sns.boxplot(
            x='Treatment',
            y=column,
            data=combined_data,
            hue='Treatment',
            ax=ax,
            palette=palette,
            dodge=False,
            medianprops=dict(linewidth=2),
        )
        ax.set_ylabel(y_label, fontsize=12)
        ax.set_xlabel('')
        for label in ax.get_xticklabels():
            label.set_fontsize(12)
        # hue duplicates the x axis, so replace the redundant legend with an empty one.
        ax.legend([], [], frameon=False)

    fig.tight_layout()  # Adjust the layout of the subplot to avoid overlap

    fig.savefig(output_path, dpi=300)
    plt.show()


def get_gsm_ids_for_lowest_pnd_drugs(pnd_df, drugs_list, num_gsm=3):
    """
    For each drug in drugs_list, select the num_gsm rows of pnd_df with the lowest PND values.

    Input Parameters:
    - pnd_df (pd.DataFrame): DataFrame containing drug data. This function reads the columns:
      - 'Drugs': The drug identifier matched against the entries of drugs_list.
      - 'PND': A numeric measure of Personalized Network Distance used for ranking.
    - drugs_list: Iterable of drug identifiers to extract rows for.
    - num_gsm: Maximum number of rows to keep for each drug (default 3).

    Returns:
    - gsm_df: DataFrame with all columns of pnd_df, holding for each drug (in the order
      given by drugs_list) its rows sorted by ascending 'PND', truncated to num_gsm rows.
      Drugs absent from pnd_df contribute no rows. An empty drugs_list yields an
      empty DataFrame with the same columns as pnd_df.
    """
    # Per drug: restrict to its rows, rank by PND ascending, keep the best num_gsm.
    per_drug_rows = [
        pnd_df[pnd_df['Drugs'] == drug].sort_values(by='PND').head(num_gsm)
        for drug in drugs_list
    ]

    # pd.concat raises ValueError on an empty list, so return an empty
    # frame that still carries pnd_df's columns.
    if not per_drug_rows:
        return pnd_df.iloc[0:0]

    gsm_df = pd.concat(per_drug_rows)

    return gsm_df