diff --git a/tests/helpers.py b/tests/helpers.py index 37471f9..747e7c2 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -16,7 +16,7 @@ ERROR_CIRCUIT_RELAXED = 0.1 -def data_to_file(data_path: Path, data: list[torch.Tensor]) -> dict[str, list]: +def data_to_json_file(data_path: Path, data: list[torch.Tensor]) -> dict[str, list]: column_names = [f"columns_{i}" for i in range(len(data))] column_to_data = { column: d.tolist() @@ -45,7 +45,7 @@ def compute( data_path = basepath / "data.json" data_commitment_path = basepath / "commitments.json" - column_to_data = data_to_file(data_path, data) + column_to_data = data_to_json_file(data_path, data) # If selected_columns_params is None, select all columns if selected_columns_params is None: selected_columns = list(column_to_data.keys()) diff --git a/tests/test_core.py b/tests/test_core.py index e130deb..95e3c7d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2,10 +2,10 @@ import torch -from zkstats.core import generate_data_commitment +from zkstats.core import generate_data_commitment, prover_gen_settings, _preprocess_data_file_to_json, verifier_define_calculation from zkstats.computation import computation_to_model -from .helpers import data_to_file, compute +from .helpers import data_to_json_file, compute def test_get_data_commitment_maps(tmp_path, column_0, column_1, scales): @@ -16,7 +16,7 @@ def test_get_data_commitment_maps(tmp_path, column_0, column_1, scales): # "columns_0": [1, 2, 3, 4, 5], # "columns_1": [6, 7, 8, 9, 10], # } - data_json = data_to_file(data_path, [column_0, column_1]) + data_json = data_to_json_file(data_path, [column_0, column_1]) # data_commitment is a mapping[scale -> mapping[column_name, commitment_hex]] # { # scale_0: { @@ -51,7 +51,7 @@ def test_get_data_commitment_maps_hardcoded(tmp_path): data_commitment_path = tmp_path / "commitments.json" column_0 = torch.tensor([3.0, 4.5, 1.0, 2.0, 7.5, 6.4, 5.5]) column_1 = torch.tensor([2.7, 3.3, 1.1, 2.2, 3.8, 8.2, 4.4]) - data_to_file(data_path, [column_0, column_1]) + data_to_json_file(data_path, [column_0, column_1]) scales = [2, 3] generate_data_commitment(data_path, scales, data_commitment_path) with open(data_commitment_path, "r") as f: @@ -63,7 +63,7 @@ def test_get_data_commitment_maps_hardcoded(tmp_path): def test_integration_select_partial_columns(tmp_path, column_0, column_1, error, scales): data_path = tmp_path / "data.json" - data_json = data_to_file(data_path, [column_0, column_1]) + data_json = data_to_json_file(data_path, [column_0, column_1]) columns = list(data_json.keys()) assert len(columns) == 2 # Select only the first column from two columns @@ -75,3 +75,83 @@ def simple_computation(state, x): _, model = computation_to_model(simple_computation, error) # gen settings, setup, prove, verify compute(tmp_path, [column_0, column_1], model, scales, selected_columns) + + +def test_csv_data(tmp_path, column_0, column_1, error, scales): + data_json_path = tmp_path / "data.csv" + data_csv_path = tmp_path / "data.csv" + data_json = data_to_json_file(data_json_path, [column_0, column_1]) + json_file_to_csv(data_json_path, data_csv_path) + + selected_columns = list(data_json.keys()) + + def simple_computation(state, x): + return state.mean(x[0]) + + sel_data_path = tmp_path / "comb_data.json" + model_path = tmp_path / "model.onnx" + settings_path = tmp_path / "settings.json" + data_commitment_path = tmp_path / "commitments.json" + + # Test: `generate_data_commitment` works with csv + generate_data_commitment(data_csv_path, scales, data_commitment_path) + + # Test: `prover_gen_settings` works with csv + _, model_for_proving = computation_to_model(simple_computation, error) + prover_gen_settings( + data_path=data_csv_path, + selected_columns=selected_columns, + sel_data_path=str(sel_data_path), + prover_model=model_for_proving, + prover_model_path=str(model_path), + scale=scales, + mode="resources", + settings_path=str(settings_path), + ) + + # Test: `prover_gen_settings` works with csv + # Instantiate the model for verification since the state of `model_for_proving` is changed after `prover_gen_settings` + _, model_for_verification = computation_to_model(simple_computation, error) + verifier_define_calculation(data_csv_path, selected_columns, str(sel_data_path), model_for_verification, str(model_path)) + + +def json_file_to_csv(data_json_path, data_csv_path): + with open(data_json_path, "r") as f: + data_from_json = json.load(f) + # Generate csv file from json + column_names = list(data_from_json.keys()) + len_columns = len(data_from_json[column_names[0]]) + for column in column_names: + assert len(data_from_json[column]) == len_columns, "All columns should have the same length" + rows = [ + [str(data_from_json[column][i]) for column in column_names] + for i in range(len_columns) + ] + with open(data_csv_path, "w") as f: + f.write(",".join(column_names) + "\n") + for row in rows: + f.write(",".join(row) + "\n") + + +def test__preprocess_data_file_to_json(tmp_path, column_0, column_1): + data_json_path = tmp_path / "data.json" + data_from_json = data_to_json_file(data_json_path, [column_0, column_1]) + + # Test: csv can be converted to json + # 1. Generate a csv file from json + data_csv_path = tmp_path / "data.csv" + json_file_to_csv(data_json_path, data_csv_path) + # 2. Convert csv to json + data_from_csv_json_path = tmp_path / "data_from_csv.json" + _preprocess_data_file_to_json(data_csv_path, data_from_csv_json_path) + with open(data_from_csv_json_path, "r") as f: + data_from_csv = json.load(f) + # 3. Compare the two json files + assert data_from_csv == data_from_json + + # Test: this function can also handle json format by just copying the file + new_data_json_path = tmp_path / "new_data.json" + _preprocess_data_file_to_json(data_json_path, new_data_json_path) + with open(new_data_json_path, "r") as f: + new_data_from_json = json.load(f) + assert new_data_from_json == data_from_json diff --git a/zkstats/core.py b/zkstats/core.py index 4b1a04e..edab408 100644 --- a/zkstats/core.py +++ b/zkstats/core.py @@ -1,11 +1,15 @@ -from typing import Type, Sequence, Mapping, Union, Literal -import torch -import ezkl +import csv +from pathlib import Path +from typing import Type, Sequence, Mapping, Union, Literal, Callable +from enum import Enum import os import numpy as np import json import time +import torch +import ezkl + from zkstats.computation import IModel @@ -40,7 +44,11 @@ def create_dummy(data_path: str, dummy_data_path: str) -> None: """ Create a dummy data file with randomized data based on the shape of the original data. """ - data = json.loads(open(data_path, "r").read()) + # Convert data file to json under the same directory but with suffix .json + data_path: Path = Path(data_path) + data_json_path = Path(data_path).with_suffix(DataExtension.JSON.value) + + data = json.loads(open(data_json_path, "r").read()) # assume all columns have same number of rows dummy_data ={} for col in data: @@ -270,16 +278,17 @@ def generate_data_commitment(data_path: str, scales: Sequence[int], data_commitm Generate and store data commitment maps for different scales so that verifiers can verify proofs with different scales. - :param data_path: path to the data file. The data file should be a JSON file with the following format: - { - "column_0": [number_0, number_1, ...], - "column_1": [number_0, number_1, ...], - } + :param data_path: data file path. The format must be anything defined in `DataExtension` :param scales: a list of scales to use for the commitments :param data_commitment_path: path to store the generated data commitment maps """ - with open(data_path) as f: + # Convert `data_path` to json file `data_json_path` + data_path: Path = Path(data_path) + data_json_path = Path(data_path).with_suffix(DataExtension.JSON.value) + _preprocess_data_file_to_json(data_path, data_json_path) + + with open(data_json_path) as f: data_json = json.load(f) data_commitments = { str(scale): { @@ -367,14 +376,62 @@ def _gen_settings( print("setting: ", f_setting.read()) +def _csv_file_to_json(old_file_path: Union[Path, str], out_data_json_path: Union[Path, str], *, delimiter: str = ",") -> None: + data_csv_path = Path(old_file_path) + with open(data_csv_path, 'r') as f_csv: + reader = csv.reader(f_csv, delimiter=delimiter, strict=True) + # Read all data from the reader to `rows` + rows_with_column_name = tuple(reader) + if len(rows_with_column_name) < 1: + raise ValueError("No column names in the CSV file") + if len(rows_with_column_name) < 2: + raise ValueError("No data in the CSV file") + column_names = rows_with_column_name[0] + rows = rows_with_column_name[1:] + + columns = [ + [ + float(rows[j][i]) + for j in range(len(rows)) + ] + for i in range(len(rows[0])) + ] + data = { + column_name: column_data + for column_name, column_data in zip(column_names, columns) + } + with open(out_data_json_path, "w") as f_json: + json.dump(data, f_json) + + +class DataExtension(Enum): + CSV = ".csv" + JSON = ".json" + + +DATA_FORMAT_PREPROCESSING_FUNCTION: dict[DataExtension, Callable[[Union[Path, str], Path], None]] = { + DataExtension.CSV: _csv_file_to_json, + DataExtension.JSON: lambda old_file_path, out_data_json_path: Path(out_data_json_path).write_text(Path(old_file_path).read_text()) +} + +def _preprocess_data_file_to_json(data_path: Union[Path, str], out_data_json_path: Path): + data_file_extension = DataExtension(data_path.suffix) + preprocess_function = DATA_FORMAT_PREPROCESSING_FUNCTION[data_file_extension] + preprocess_function(data_path, out_data_json_path) + + def _process_data( - data_path: str, + data_path: Union[str | Path], col_array: list[str], sel_data_path: list[str], ) -> list[torch.Tensor]: data_tensor_array=[] sel_data = [] - data_onefile = json.loads(open(data_path, "r").read()) + data_path: Path = Path(data_path) + # Convert data file to json under the same directory but with suffix .json + data_json_path = Path(data_path).with_suffix(DataExtension.JSON.value) + _preprocess_data_file_to_json(data_path, data_json_path) + data_onefile = json.loads(open(data_json_path, "r").read()) for col in col_array: data = data_onefile[col] @@ -394,4 +451,4 @@ def _get_commitment_for_column(column: list[float], scale: int) -> str: res_poseidon_hash = ezkl.poseidon_hash(serialized_data)[0] # res_hex = ezkl.vecu64_to_felt(res_poseidon_hash[0]) - return res_poseidon_hash \ No newline at end of file + return res_poseidon_hash