#!/usr/bin/env python3
"""
BOB is Bayesian Optimization for Black-box optimization.

License: MIT
Author: Ivan Stepanov
"""
from typing import Optional, Dict, List, Tuple, Any, Union
import hashlib
import numpy as np
import os
import argparse
import re
import subprocess
import random
import time
from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical, Dimension
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.style as mplstyle
import ast
import json
import bert_score
import language_tool_python
from collections import Counter
import glob
from skopt.benchmarks import branin as branin
from skopt.benchmarks import hart6 as hart6_
from matplotlib.colors import LogNorm
from functools import partial
from skopt.plots import plot_convergence, plot_gaussian_process, plot_regret, plot_objective, plot_evaluations, plot_objective_2D, plot_histogram
from skopt import gp_minimize, forest_minimize, dummy_minimize, expected_minimum, expected_minimum_random_sampling
from transformers import logging

logging.set_verbosity_error()

tool = language_tool_python.LanguageTool('ru-RU')

mpl.use('Qt5Agg')  # Or export MPLBACKEND=Qt5Agg

random.seed(1)

threads = 8
ENT_PATH = '/p/o/ent_random_sequence_tester/src/ent'
SKIP_SUBJECTIVE = True
AUTO_SKIP_VALID_SUBJECTIVE = True
BERTSCORE_MODEL = 'microsoft/deberta-v3-base'

# Each model will be optimized independently
models_filepath = [
    "models/7B/ggml-model-q4_0.bin",
    # "models/13B/ggml-model-q4_0.bin",
    # "models/7B/ggml-model-q4_1.bin",  # slower than 13B-q4_0
    # "models/13B/ggml-model-q4_1.bin",  # too many major faults
    # "models/gpt4all-7B/ggml-model-uncensured-q4_0.bin",
]

# Default parameters, used when a benchmark case does not provide them
probe_default_args = {
    "top_k": 40,
    "top_p": 0.9,
    "repeat_last_n": 64,
    "repeat_penalty": 1.1,
    "temp": 0.8,
    "batch_size": 8,
    "n_predict": 128,
    "ctx_size": 512,
    "keep": 0,
    "ignore_eos": False,
    "seed": [1],  # Seeds to try before computing the objective score. The mean score is used.
}

# Search space to optimize; these dimensions override the defaults above
search_space = {
    "top_k": Integer(1, 32000),
    "top_p": Real(0, 1),
    "repeat_last_n": Integer(0, 1024),
    "repeat_penalty": Real(1, 1.5),
    "temp": Real(0, 2),
}
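# The bounds above are intentionally wide. A sketch of how they could be narrowed
# (the values below are purely illustrative, not recommendations); any key used
# here must also exist in probe_default_args and be forwarded to ./main in
# probe_model() below:
#
# search_space["temp"] = Real(0.2, 1.2)      # tighter temperature range
# search_space["top_k"] = Integer(1, 1000)   # smaller top_k range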
# Benchmarks to try before computing the objective score. The mean score is used.
benchmarks = [
    # The purpose of this benchmark is to optimize the model's ability to understand the essence
    # of a dialogue and to conclude it logically. Performance is measured by BERTScore recall.
    # You can add more benchmarks, but keep in mind that you will probably need to change the
    # objective metric calculation.
    {
        "prompt": open("prompts/chernobaev.ru.txt", 'r', encoding='utf-8', errors='ignore').read(),
        "gta": open("prompts/gta/chernobaev.ru.txt", 'r', encoding='utf-8', errors='ignore').read(),
        "ctx_size": 1024,
        "n_predict": 512,
        "ignore_eos": True,
        "seed": [1, 2, 3, 4, 5],
    },
    # {
    #     "prompt": "Википедия:\nГенетический алгоритм - это эвристический",
    #     "gta": "алгоритм поиска, используемый для решения задач оптимизации и моделирования путём случайного подбора, комбинирования и вариации искомых параметров с использованием механизмов, аналогичных естественному отбору в природе.",
    #     "ctx_size": 128,
    #     "n_predict": 32,
    #     "seed": [1, 2],
    # }
]


def plot_iterations(result):
    search_space = result.space
    x_iters = result.x_iters
    func_vals = result.func_vals
    search_space_names = [dim.name for dim in search_space]
    opts = search_space_names + ["objective_r"]
    num_params = len(opts) + 1
    fig, axs = plt.subplots(num_params, figsize=(8, num_params * 8), sharex=True)
    iterations = list(range(1, len(x_iters) + 1))
    for i, param in enumerate(opts):
        if param == "objective_r":
            param_values = func_vals
        else:
            param_index = search_space_names.index(param)
            param_values = [x[param_index] for x in x_iters]
        axs[i].scatter(iterations, param_values)
        axs[i].set_xlabel("Iteration")
        axs[i].set_ylabel(param)
        # axs[i].set_title(f"Iteration vs {param}")
    plot_convergence(result, true_minimum=0, ax=axs[-1])
    return axs


def get_metrics(metrics_filepath, prompt, gta, prediction) -> Dict[str, Any]:
    """
    Gets metrics for a given prompt and prediction. Caches results to a file.

    @param metrics_filepath: Path to a file to cache results to.
    @param prompt: Prompt.
    @param gta: Ground truth answer.
    @param prediction: Model prediction (output with the prompt stripped).
    @return: Dict of metrics.
    """
    print(f"Calculating metrics: {metrics_filepath}")
    metrics = {
        'bertscore_model': None,
        'entropy': None,
        'chi_squared': None,
        'mean': None,
        'subjective': None,
        'bertscore_P': None,
        'bertscore_R': None,
        'bertscore_F1': None,
        'p_bertscore_P': None,
        'p_bertscore_R': None,
        'p_bertscore_F1': None,
        'p_grammar': None,
        'p_repeats': None,
    }

    # Load cached scores
    try:
        with open(metrics_filepath, 'r', encoding='utf-8') as f:
            metrics.update(json.load(f))
    except FileNotFoundError:
        pass

    output = prompt + prediction

    # Score: Entropy
    if not metrics['entropy'] or not metrics['chi_squared'] or not metrics['mean']:
        try:
            ent_output = subprocess.check_output([ENT_PATH], input=output.encode('utf-8'), stderr=subprocess.STDOUT).decode('utf-8')
            metrics['entropy'] = float(ent_output.split('Entropy = ')[1].split(' bits per byte.')[0])
            metrics['chi_squared'] = float(ent_output.split('Chi square distribution for ')[1].split(' samples is ')[1].split(', and randomly')[0])
            metrics['mean'] = float(ent_output.split('Arithmetic mean value of data bytes is ')[1].split(' ')[0])
        except:
            pass

    # # Score: BERTScore
    # if not metrics["bertscore_P"] or not metrics["bertscore_R"] or not metrics["bertscore_F1"] or metrics["bertscore_model"] != BERTSCORE_MODEL:
    #     score = bert_score.score([output], [prompt + gta], model_type=BERTSCORE_MODEL)
    #     metrics["bertscore_P"], metrics["bertscore_R"], metrics["bertscore_F1"] = score[0].item(), score[1].item(), score[2].item()

    # Score: BERTScore of prediction
    if not metrics["p_bertscore_P"] or not metrics["p_bertscore_R"] or not metrics["p_bertscore_F1"] or metrics["bertscore_model"] != BERTSCORE_MODEL:
        metrics["bertscore_model"] = BERTSCORE_MODEL
        score = bert_score.score([prediction], [gta], model_type=BERTSCORE_MODEL)
        metrics["p_bertscore_P"], metrics["p_bertscore_R"], metrics["p_bertscore_F1"] = score[0].item(), score[1].item(), score[2].item()
    # TODO: Should be measured as sequence repeats.
    # # Score: Repeats (count repeated words)
    # prediction_word_map = {}
    # for word in prediction.split():
    #     if word not in prediction_word_map:
    #         prediction_word_map[word] = 0
    #     prediction_word_map[word] += 1
    # metrics["p_repeats"] = sum([
    #     len(word) * (count)
    #     for word, count in prediction_word_map.items() if count > 1
    # ]) / len(prediction)

    # Score: Grammar
    if not metrics["p_grammar"]:
        matches = tool.check(prediction)
        metrics["p_grammar"] = len(matches) / len(prediction)

    # Prompt the user for a subjective rating
    if not SKIP_SUBJECTIVE:
        if not AUTO_SKIP_VALID_SUBJECTIVE or metrics["subjective"] is None or not (0 <= metrics["subjective"] <= 10):
            print()
            print(f'Prompt: {prompt}')
            print(f'GTA: {gta}')
            print(f'Prediction: {prediction}')
            while True:
                try:
                    metrics["subjective"] = int(input(f'Subjective score (0-10, default {metrics["subjective"]}): '))
                    if metrics["subjective"] is not None and 0 <= metrics["subjective"] <= 10:
                        break
                except ValueError:
                    pass

    # Score: Objective score to minimize.
    # Less is better.
    metrics["objective_r"] = -metrics["p_bertscore_R"]

    # Write the score data to a file
    metrics_str = json.dumps(metrics, indent=2, ensure_ascii=False).encode('utf-8')
    with open(metrics_filepath, 'wb') as f:
        f.write(metrics_str)

    return metrics


def probe_model(
    output_path,
    cold: bool = False,
    **probe_args,
) -> float:
    """
    Evaluates the model with the given parameters and returns the objective score.

    @param output_path: The path to the output directory where the cache is stored.
    @return: The objective score.
    """
    probe_raw = json.dumps(probe_args, indent=2, ensure_ascii=False, sort_keys=True).encode('utf-8')

    # Create output filepaths
    output_basename = hashlib.md5(probe_raw).hexdigest()
    probes_path = f"{output_path}/probes/{output_basename}"
    probe_filepath = f"{probes_path}/probe.json"
    out_filepath = f"{probes_path}/out.txt"
    err_filepath = f"{probes_path}/err.txt"
    metrics_filepath = f"{probes_path}/metrics.json"

    # Try to load cached metrics
    is_cached = False
    try:
        output = open(out_filepath, 'r', encoding='utf-8', errors='ignore').read()
        try:
            prediction = output.split(probe_args["prompt"])[1]
        except:
            raise Exception(f'Could not find prompt in output: {output}')
        metrics = get_metrics(metrics_filepath, probe_args["prompt"], probe_args["gta"], prediction)
        if metrics is not None and metrics["objective_r"] is not None:
            is_cached = True
    except FileNotFoundError:
        pass

    # Human readable symlinks
    os.makedirs(f"{output_path}/human", exist_ok=True)
    human_probe_path = f"{output_path}/human/{probe_args['model_filepath'].split('/')[-2]}-" + \
        f"{hashlib.md5(probe_args['prompt'].encode('utf-8')).hexdigest()[:6]}-" + \
        "-".join([f"{k}={probe_args[k]}" for k in probe_args.keys() if k not in ['model_filepath', 'prompt', 'gta']])
    if is_cached or not cold:
        try:
            os.symlink(f"../probes/{output_basename}", human_probe_path, target_is_directory=True)
        except FileExistsError:
            pass

    if is_cached:
        return metrics["objective_r"]
    if cold:
        return None

    os.makedirs(probes_path, exist_ok=True)
    with open(probe_filepath, 'wb') as f:
        f.write(probe_raw)

    print("Running probe:", probe_filepath, human_probe_path)
    arguments = [
        "./main",
        "--seed", str(probe_args['seed']),
        "--threads", str(threads),
        "--prompt", str(probe_args['prompt']),
        "--n_predict", str(probe_args['n_predict']),
        "--top_k", str(probe_args['top_k']),
        "--top_p", str(probe_args['top_p']),
        "--repeat_last_n", str(probe_args['repeat_last_n']),
        "--repeat_penalty", str(probe_args['repeat_penalty']),
        "--ctx_size", str(probe_args['ctx_size']),
        "--batch_size", str(probe_args['batch_size']),
        "--temp", str(probe_args['temp']),
        "--keep", str(probe_args['keep']),
        "--model", probe_args['model_filepath'],
        *(["--ignore-eos"] if probe_args['ignore_eos'] else []),
    ]
    process = subprocess.run(arguments, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    with open(err_filepath, "wb") as f:
        f.write(process.stderr)
    if process.returncode == 0:
        with open(out_filepath, "wb") as f:
            f.write(process.stdout)
    else:
        raise Exception(f"Error running '{arguments}':\n{process.stderr}")

    output = process.stdout.decode('utf-8', errors='ignore')
    try:
        prediction = output.split(probe_args["prompt"])[1]
    except:
        raise Exception(f'Could not find prompt in output: {output}')

    metrics = get_metrics(metrics_filepath, probe_args["prompt"], probe_args["gta"], prediction)
    if metrics is not None and metrics["objective_r"] is not None:
        return metrics["objective_r"]
    raise Exception(f"Could not get metrics for file {out_filepath}")


def evaluate_model(search_space_points: list, cold=False):
    probe_args = probe_default_args.copy()

    # Merge search_space_points
    for k, v in zip(search_space.keys(), search_space_points):
        # skopt provides a list of numpy values; convert them to the corresponding Python type
        probe_args[k] = ast.literal_eval(str(v))

    scores = []
    for case in benchmarks:
        probe_args_for_case = probe_args.copy()
        # Merge case
        for key, value in case.items():
            probe_args_for_case[key] = value
        probe_args_for_case["model_filepath"] = model_filepath
        seeds = probe_args_for_case["seed"]
        for seed in seeds:
            probe_args_for_case["seed"] = seed
            score = probe_model(args.output_path, cold, **probe_args_for_case)
            if cold and score is None:
                return None
            scores.append(score)

    objective = np.mean(scores)
    return objective
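# A sketch of what the objective of evaluate_model() above works out to for the single
# benchmark defined at the top (prompts/chernobaev.ru.txt with seeds [1, 2, 3, 4, 5]):
# probe_model() is called once per seed and the returned objective is
#     mean(objective_r) = mean(-p_bertscore_R)  over the 5 seeds,
# so gp_minimize() below, by minimizing this value, maximizes the mean BERTScore recall
# of the prediction against the ground-truth answer.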
def model_optimize(model_filepath, args):
    # Load cached probes as starting points
    DEBUG = True
    x0 = []
    probe_files = glob.glob(f"{args.output_path}/probes/**/probe.json")
    probe_files.sort(key=os.path.getmtime)
    for probe_filepath in probe_files:
        probe_args = json.loads(open(probe_filepath, 'r', encoding='utf-8', errors="ignore").read())
        if probe_args['model_filepath'] != model_filepath:
            DEBUG and print("~ Model mismatch")
            continue
        if not all([probe_args[key] >= search_space[key].low and probe_args[key] <= search_space[key].high for key in search_space.keys()]):
            DEBUG and print("~ Not in search space")
            continue
        search_space_points = [probe_args[key] for key in search_space.keys()]
        if search_space_points not in x0:
            x0 += [search_space_points]

    # Remove points that are not cached by the model. Skips points that were started but not finished.
    x0 = [x for x in x0 if evaluate_model(x, cold=True) is not None]
    print(f"Found {len(x0)} cached points.")
    if len(x0) == 0:
        x0 = None

    # Run the optimization
    result = gp_minimize(
        func=evaluate_model,
        dimensions=list(search_space.values()),
        n_calls=(args.n_calls + len(x0)) if x0 else args.n_calls,
        n_initial_points=args.n_initial_points,
        # initial_point_generator="hammersly",
        initial_point_generator="lhs",
        x0=x0,
        random_state=0,
        n_jobs=-1,
        verbose=True,
    )

    # print("-" * 80)
    # optimal_params = dict(zip(search_space.keys(), result.x))
    # for key in params:
    #     if not isinstance(params[key], Dimension):
    #         optimal_params[key] = params[key]
    # print(f"Optimal parameters (score: {result.fun}): {optimal_params}")

    print("-" * 80)
    print("Optimization finished.")
    print(f"Best evaluation: {(dict(zip(search_space.keys(), result.x)), result.fun)}")
    min_x, min_fun = expected_minimum(result)
    print(f"Expected minimum: {(dict(zip(search_space.keys(), min_x)), min_fun)}")
    print("-" * 80)

    if args.plot_iterations:
        plot_iterations(result)
    if args.plot_objective:
        plot_objective(result)
    if args.plot_evaluations:
        plot_evaluations(result)
    # if args.plot_gaussian_process:
    #     plot_gaussian_process(result, objective=evaluate_model)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='BOB is Bayesian Optimization for Black-box optimization.')
    parser.add_argument('output_path', type=str, help='Path to the directory where output and scores are stored or will be stored.')
    parser.add_argument('-n', '--n_calls', type=int, metavar='N', default=100, help='Number of calls to the black-box function. Set to 0 for a cold run.')
    parser.add_argument('-i', '--n_initial_points', type=int, metavar='N', default=10, help='Number of initial calls to the black-box function, minus any points already cached in output_path.')
    parser.add_argument('-1', '--plot_iterations', action='store_true', help='Display the iteration-versus-parameter plots plus the convergence plot.')
    parser.add_argument('-2', '--plot_objective', action='store_true', help='Display the objective plot. n_samples = 40.')
    parser.add_argument('-3', '--plot_evaluations', action='store_true', help='Display histograms of the samples and scatter plots of samples against each other; more precise than plot_objective.')
    parser.add_argument('-4', '--plot_gaussian_process', action='store_true', help='Display the Gaussian process plots.')
    args = parser.parse_args()

    # Assign a name to each dimension in the search space
    for key, value in search_space.items():
        if isinstance(value, Dimension):
            value.name = key

    # Optimize each model
    for model_filepath in models_filepath:
        model_optimize(model_filepath, args)

    plt.show()
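# Example invocation (a sketch; "bob.py" is an assumed filename, and the script is assumed
# to sit next to the llama.cpp ./main binary and the models/ and prompts/ directories):
#   python3 bob.py runs/7B-q4_0 -n 100 -i 10 -1 -2
# Per the --n_calls help above, -n 0 requests a cold run that only reuses cached probes:
#   python3 bob.py runs/7B-q4_0 -n 0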