#!/usr/bin/env python3
"""
BOB is Bayesian Optimization for Black-box optimization.

License: MIT
Author: Ivan Stepanov
"""
from typing import Optional, Dict, List, Tuple, Any, Union
import hashlib
import numpy as np
import os
import argparse
import re
import subprocess
import random
import time
from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical, Dimension
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.style as mplstyle
import ast
import json
import bert_score
import language_tool_python
from collections import Counter
import glob
from skopt.benchmarks import branin as branin
from skopt.benchmarks import hart6 as hart6_
from matplotlib.colors import LogNorm
from functools import partial
from skopt.plots import plot_convergence, plot_gaussian_process, plot_regret, plot_objective, plot_evaluations, plot_objective_2D, plot_histogram
from skopt import gp_minimize, forest_minimize, dummy_minimize, expected_minimum, expected_minimum_random_sampling
from transformers import logging

logging.set_verbosity_error()

tool = language_tool_python.LanguageTool('ru-RU')

mpl.use('Qt5Agg')  # Or export MPLBACKEND=Qt5Agg

random.seed(1)

threads = 8
ENT_PATH = '/p/o/ent_random_sequence_tester/src/ent'
SKIP_SUBJECTIVE = True
AUTO_SKIP_VALID_SUBJECTIVE = True
BERTSCORE_MODEL = 'microsoft/deberta-v3-base'

# Each model will be optimized independently
models_filepath = [
    "models/7B/ggml-model-q4_0.bin",
    # "models/13B/ggml-model-q4_0.bin",
    # "models/7B/ggml-model-q4_1.bin",  # slower than 13B-q4_0
    # "models/13B/ggml-model-q4_1.bin",  # too many major faults
    # "models/gpt4all-7B/ggml-model-uncensured-q4_0.bin",
]

# Default parameters, used when a benchmark case does not provide them
probe_default_args = {
    "top_k": 40,
    "top_p": 0.9,
    "repeat_last_n": 64,
    "repeat_penalty": 1.1,
    "temp": 0.8,
    "batch_size": 8,
    "n_predict": 128,
    "ctx_size": 512,
    "keep": 0,
    "ignore_eos": False,
    "seed": [1],  # Seeds to try before computing the objective score. The mean score is used.
}

# Search space to optimize; these dimensions override the defaults above
search_space = {
    "top_k": Integer(1, 32000),
    "top_p": Real(0, 1),
    "repeat_last_n": Integer(0, 1024),
    "repeat_penalty": Real(1, 1.5),
    "temp": Real(0, 2),
}
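# The bounds above are intentionally wide. A sketch of how they could be narrowed
# (the values below are purely illustrative, not recommendations); any key used
# here must also exist in probe_default_args and be forwarded to ./main in
# probe_model() below:
#
# search_space["temp"] = Real(0.2, 1.2)      # tighter temperature range
# search_space["top_k"] = Integer(1, 1000)   # smaller top_k range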
# Benchmarks to try before computing the objective score. The mean score is used.
benchmarks = [
    # The purpose of this benchmark is to optimize the model's ability to understand the essence
    # of a dialogue and to conclude it logically. Performance is measured by BERTScore recall.
    # You can add more benchmarks, but keep in mind that you will probably need to change the
    # objective metric calculation.
    {
        "prompt": open("prompts/chernobaev.ru.txt", 'r', encoding='utf-8', errors='ignore').read(),
        "gta": open("prompts/gta/chernobaev.ru.txt", 'r', encoding='utf-8', errors='ignore').read(),
        "ctx_size": 1024,
        "n_predict": 512,
        "ignore_eos": True,
        "seed": [1, 2, 3, 4, 5],
    },
    # {
    #     "prompt": "Википедия:\nГенетический алгоритм - это эвристический",
    #     "gta": "алгоритм поиска, используемый для решения задач оптимизации и моделирования путём случайного подбора, комбинирования и вариации искомых параметров с использованием механизмов, аналогичных естественному отбору в природе.",
    #     "ctx_size": 128,
    #     "n_predict": 32,
    #     "seed": [1, 2],
    # }
]


def plot_iterations(result):
    search_space = result.space
    x_iters = result.x_iters
    func_vals = result.func_vals
    search_space_names = [dim.name for dim in search_space]
    opts = search_space_names + ["objective_r"]
    num_params = len(opts) + 1
    fig, axs = plt.subplots(num_params, figsize=(8, num_params * 8), sharex=True)
    iterations = list(range(1, len(x_iters) + 1))
    for i, param in enumerate(opts):
        if param == "objective_r":
            param_values = func_vals
        else:
            param_index = search_space_names.index(param)
            param_values = [x[param_index] for x in x_iters]
        axs[i].scatter(iterations, param_values)
        axs[i].set_xlabel("Iteration")
        axs[i].set_ylabel(param)
        # axs[i].set_title(f"Iteration vs {param}")
    plot_convergence(result, true_minimum=0, ax=axs[-1])
    return axs


def get_metrics(metrics_filepath, prompt, gta, prediction) -> Dict[str, Any]:
    """
    Gets metrics for a given prompt and prediction. Caches results to a file.

    @param metrics_filepath: Path to a file to cache results to.
    @param prompt: Prompt.
    @param gta: Ground truth answer.
    @param prediction: Model prediction (output with the prompt stripped).
    @return: Dict of metrics.
    """
    print(f"Calculating metrics: {metrics_filepath}")
    metrics = {
        'bertscore_model': None,
        'entropy': None,
        'chi_squared': None,
        'mean': None,
        'subjective': None,
        'bertscore_P': None,
        'bertscore_R': None,
        'bertscore_F1': None,
        'p_bertscore_P': None,
        'p_bertscore_R': None,
        'p_bertscore_F1': None,
        'p_grammar': None,
        'p_repeats': None,
    }

    # Load cached scores
    try:
        with open(metrics_filepath, 'r', encoding='utf-8') as f:
            metrics.update(json.load(f))
    except FileNotFoundError:
        pass

    output = prompt + prediction

    # Score: Entropy
    if not metrics['entropy'] or not metrics['chi_squared'] or not metrics['mean']:
        try:
            ent_output = subprocess.check_output([ENT_PATH], input=output.encode('utf-8'), stderr=subprocess.STDOUT).decode('utf-8')
            metrics['entropy'] = float(ent_output.split('Entropy = ')[1].split(' bits per byte.')[0])
            metrics['chi_squared'] = float(ent_output.split('Chi square distribution for ')[1].split(' samples is ')[1].split(', and randomly')[0])
            metrics['mean'] = float(ent_output.split('Arithmetic mean value of data bytes is ')[1].split(' ')[0])
        except:
            pass

    # # Score: BERTScore
    # if not metrics["bertscore_P"] or not metrics["bertscore_R"] or not metrics["bertscore_F1"] or metrics["bertscore_model"] != BERTSCORE_MODEL:
    #     score = bert_score.score([output], [prompt + gta], model_type=BERTSCORE_MODEL)
    #     metrics["bertscore_P"], metrics["bertscore_R"], metrics["bertscore_F1"] = score[0].item(), score[1].item(), score[2].item()

    # Score: BERTScore of prediction
    if not metrics["p_bertscore_P"] or not metrics["p_bertscore_R"] or not metrics["p_bertscore_F1"] or metrics["bertscore_model"] != BERTSCORE_MODEL:
        metrics["bertscore_model"] = BERTSCORE_MODEL
        score = bert_score.score([prediction], [gta], model_type=BERTSCORE_MODEL)
        metrics["p_bertscore_P"], metrics["p_bertscore_R"], metrics["p_bertscore_F1"] = score[0].item(), score[1].item(), score[2].item()
    # TODO: Should be measured as sequence repeats.
    # # Score: Repeats (count repeated words)
    # prediction_word_map = {}
    # for word in prediction.split():
    #     if word not in prediction_word_map:
    #         prediction_word_map[word] = 0
    #     prediction_word_map[word] += 1
    # metrics["p_repeats"] = sum([
    #     len(word) * (count)
    #     for word, count in prediction_word_map.items() if count > 1
    # ]) / len(prediction)

    # Score: Grammar
    if not metrics["p_grammar"]:
        matches = tool.check(prediction)
        metrics["p_grammar"] = len(matches) / len(prediction)

    # Prompt the user for a subjective rating
    if not SKIP_SUBJECTIVE:
        if not AUTO_SKIP_VALID_SUBJECTIVE or metrics["subjective"] is None or not (0 <= metrics["subjective"] <= 10):
            print()
            print(f'Prompt: {prompt}')
            print(f'GTA: {gta}')
            print(f'Prediction: {prediction}')
            while True:
                try:
                    metrics["subjective"] = int(input(f'Subjective score (0-10, default {metrics["subjective"]}): '))
                    if metrics["subjective"] is not None and 0 <= metrics["subjective"] <= 10:
                        break
                except ValueError:
                    pass

    # Score: Objective score to minimize.
    # Less is better.
    metrics["objective_r"] = -metrics["p_bertscore_R"]

    # Write the score data to a file
    metrics_str = json.dumps(metrics, indent=2, ensure_ascii=False).encode('utf-8')
    with open(metrics_filepath, 'wb') as f:
        f.write(metrics_str)

    return metrics


def probe_model(
    output_path,
    cold: bool = False,
    **probe_args,
) -> float:
    """
    Evaluates the model with the given parameters and returns the objective score.

    @param output_path: The path to the output directory where the cache is stored.
    @return: The objective score.
    """
    probe_raw = json.dumps(probe_args, indent=2, ensure_ascii=False, sort_keys=True).encode('utf-8')

    # Create output filepaths
    output_basename = hashlib.md5(probe_raw).hexdigest()
    probes_path = f"{output_path}/probes/{output_basename}"
    probe_filepath = f"{probes_path}/probe.json"
    out_filepath = f"{probes_path}/out.txt"
    err_filepath = f"{probes_path}/err.txt"
    metrics_filepath = f"{probes_path}/metrics.json"

    # Try to load cached metrics
    is_cached = False
    try:
        output = open(out_filepath, 'r', encoding='utf-8', errors='ignore').read()
        try:
            prediction = output.split(probe_args["prompt"])[1]
        except:
            raise Exception(f'Could not find prompt in output: {output}')
        metrics = get_metrics(metrics_filepath, probe_args["prompt"], probe_args["gta"], prediction)
        if metrics is not None and metrics["objective_r"] is not None:
            is_cached = True
    except FileNotFoundError:
        pass

    # Human readable symlinks
    os.makedirs(f"{output_path}/human", exist_ok=True)
    human_probe_path = f"{output_path}/human/{probe_args['model_filepath'].split('/')[-2]}-" + \
        f"{hashlib.md5(probe_args['prompt'].encode('utf-8')).hexdigest()[:6]}-" + \
        "-".join([f"{k}={probe_args[k]}" for k in probe_args.keys() if k not in ['model_filepath', 'prompt', 'gta']])
    if is_cached or not cold:
        try:
            os.symlink(f"../probes/{output_basename}", human_probe_path, target_is_directory=True)
        except FileExistsError:
            pass

    if is_cached:
        return metrics["objective_r"]
    if cold:
        return None

    os.makedirs(probes_path, exist_ok=True)
    with open(probe_filepath, 'wb') as f:
        f.write(probe_raw)

    print("Running probe:", probe_filepath, human_probe_path)
    arguments = [
        "./main",
        "--seed", str(probe_args['seed']),
        "--threads", str(threads),
        "--prompt", str(probe_args['prompt']),
        "--n_predict", str(probe_args['n_predict']),
        "--top_k", str(probe_args['top_k']),
        "--top_p", str(probe_args['top_p']),
        "--repeat_last_n", str(probe_args['repeat_last_n']),
        "--repeat_penalty", str(probe_args['repeat_penalty']),
        "--ctx_size", str(probe_args['ctx_size']),
        "--batch_size", str(probe_args['batch_size']),
        "--temp", str(probe_args['temp']),
        "--keep", str(probe_args['keep']),
        "--model", probe_args['model_filepath'],
        *(["--ignore-eos"] if probe_args['ignore_eos'] else []),
    ]
    process = subprocess.run(arguments, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    with open(err_filepath, "wb") as f:
        f.write(process.stderr)
    if process.returncode == 0:
        with open(out_filepath, "wb") as f:
            f.write(process.stdout)
    else:
        raise Exception(f"Error running '{arguments}':\n{process.stderr}")

    output = process.stdout.decode('utf-8', errors='ignore')
    try:
        prediction = output.split(probe_args["prompt"])[1]
    except:
        raise Exception(f'Could not find prompt in output: {output}')

    metrics = get_metrics(metrics_filepath, probe_args["prompt"], probe_args["gta"], prediction)
    if metrics is not None and metrics["objective_r"] is not None:
        return metrics["objective_r"]
    raise Exception(f"Could not get metrics for file {out_filepath}")


def evaluate_model(search_space_points: list, cold=False):
    probe_args = probe_default_args.copy()

    # Merge search_space_points
    for k, v in zip(search_space.keys(), search_space_points):
        # skopt provides a list of numpy values; convert them to the corresponding Python type
        probe_args[k] = ast.literal_eval(str(v))

    scores = []
    for case in benchmarks:
        probe_args_for_case = probe_args.copy()
        # Merge case
        for key, value in case.items():
            probe_args_for_case[key] = value
        probe_args_for_case["model_filepath"] = model_filepath
        seeds = probe_args_for_case["seed"]
        for seed in seeds:
            probe_args_for_case["seed"] = seed
            score = probe_model(args.output_path, cold, **probe_args_for_case)
            if cold and score is None:
                return None
            scores.append(score)

    objective = np.mean(scores)
    return objective
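# A sketch of what the objective of evaluate_model() above works out to for the single
# benchmark defined at the top (prompts/chernobaev.ru.txt with seeds [1, 2, 3, 4, 5]):
# probe_model() is called once per seed and the returned objective is
#     mean(objective_r) = mean(-p_bertscore_R)  over the 5 seeds,
# so gp_minimize() below, by minimizing this value, maximizes the mean BERTScore recall
# of the prediction against the ground-truth answer.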
def model_optimize(model_filepath, args):
    # Load cached probes as starting points
    DEBUG = True
    x0 = []
    probe_files = glob.glob(f"{args.output_path}/probes/**/probe.json")
    probe_files.sort(key=os.path.getmtime)
    for probe_filepath in probe_files:
        probe_args = json.loads(open(probe_filepath, 'r', encoding='utf-8', errors="ignore").read())
        if probe_args['model_filepath'] != model_filepath:
            DEBUG and print("~ Model mismatch")
            continue
        if not all([probe_args[key] >= search_space[key].low and probe_args[key] <= search_space[key].high for key in search_space.keys()]):
            DEBUG and print("~ Not in search space")
            continue
        search_space_points = [probe_args[key] for key in search_space.keys()]
        if search_space_points not in x0:
            x0 += [search_space_points]

    # Remove points that are not cached by the model. Skips points that were started but not finished.
    x0 = [x for x in x0 if evaluate_model(x, cold=True) is not None]
    print(f"Found {len(x0)} cached points.")
    if len(x0) == 0:
        x0 = None

    # Run the optimization
    result = gp_minimize(
        func=evaluate_model,
        dimensions=list(search_space.values()),
        n_calls=(args.n_calls + len(x0)) if x0 else args.n_calls,
        n_initial_points=args.n_initial_points,
        # initial_point_generator="hammersly",
        initial_point_generator="lhs",
        x0=x0,
        random_state=0,
        n_jobs=-1,
        verbose=True,
    )

    # print("-" * 80)
    # optimal_params = dict(zip(search_space.keys(), result.x))
    # for key in params:
    #     if not isinstance(params[key], Dimension):
    #         optimal_params[key] = params[key]
    # print(f"Optimal parameters (score: {result.fun}): {optimal_params}")

    print("-" * 80)
    print("Optimization finished.")
    print(f"Best evaluation: {(dict(zip(search_space.keys(), result.x)), result.fun)}")
    min_x, min_fun = expected_minimum(result)
    print(f"Expected minimum: {(dict(zip(search_space.keys(), min_x)), min_fun)}")
    print("-" * 80)

    if args.plot_iterations:
        plot_iterations(result)
    if args.plot_objective:
        plot_objective(result)
    if args.plot_evaluations:
        plot_evaluations(result)
    # if args.plot_gaussian_process:
    #     plot_gaussian_process(result, objective=evaluate_model)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='BOB is Bayesian Optimization for Black-box optimization.')
    parser.add_argument('output_path', type=str, help='Path to the directory where output and scores are stored or will be stored.')
    parser.add_argument('-n', '--n_calls', type=int, metavar='N', default=100, help='Number of calls to the black-box function. Set to 0 for a cold run.')
    parser.add_argument('-i', '--n_initial_points', type=int, metavar='N', default=10, help='Number of initial calls to the black-box function, minus any points already cached in output_path.')
    parser.add_argument('-1', '--plot_iterations', action='store_true', help='Display the iteration-versus-parameter plots plus the convergence plot.')
    parser.add_argument('-2', '--plot_objective', action='store_true', help='Display the objective plot. n_samples = 40.')
    parser.add_argument('-3', '--plot_evaluations', action='store_true', help='Display histograms of the samples and scatter plots of samples against each other; more precise than plot_objective.')
    parser.add_argument('-4', '--plot_gaussian_process', action='store_true', help='Display the Gaussian process plots.')
    args = parser.parse_args()

    # Assign a name to each dimension in the search space
    for key, value in search_space.items():
        if isinstance(value, Dimension):
            value.name = key

    # Optimize each model
    for model_filepath in models_filepath:
        model_optimize(model_filepath, args)

    plt.show()
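# Example invocation (a sketch; "bob.py" is an assumed filename, and the script is assumed
# to sit next to the llama.cpp ./main binary and the models/ and prompts/ directories):
#   python3 bob.py runs/7B-q4_0 -n 100 -i 10 -1 -2
# Per the --n_calls help above, -n 0 requests a cold run that only reuses cached probes:
#   python3 bob.py runs/7B-q4_0 -n 0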