Skip to content

Commit

Permalink
Merge pull request #31 from ZKStats/feat/csv-inputs
Browse files Browse the repository at this point in the history
csv as input data
  • Loading branch information
mhchia authored May 7, 2024
2 parents a35c0af + ae6277a commit 09db78e
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 20 deletions.
4 changes: 2 additions & 2 deletions tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
ERROR_CIRCUIT_RELAXED = 0.1


def data_to_file(data_path: Path, data: list[torch.Tensor]) -> dict[str, list]:
def data_to_json_file(data_path: Path, data: list[torch.Tensor]) -> dict[str, list]:
column_names = [f"columns_{i}" for i in range(len(data))]
column_to_data = {
column: d.tolist()
Expand Down Expand Up @@ -45,7 +45,7 @@ def compute(
data_path = basepath / "data.json"
data_commitment_path = basepath / "commitments.json"

column_to_data = data_to_file(data_path, data)
column_to_data = data_to_json_file(data_path, data)
# If selected_columns_params is None, select all columns
if selected_columns_params is None:
selected_columns = list(column_to_data.keys())
Expand Down
90 changes: 85 additions & 5 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

import torch

from zkstats.core import generate_data_commitment
from zkstats.core import generate_data_commitment, prover_gen_settings, _preprocess_data_file_to_json, verifier_define_calculation
from zkstats.computation import computation_to_model

from .helpers import data_to_file, compute
from .helpers import data_to_json_file, compute


def test_get_data_commitment_maps(tmp_path, column_0, column_1, scales):
Expand All @@ -16,7 +16,7 @@ def test_get_data_commitment_maps(tmp_path, column_0, column_1, scales):
# "columns_0": [1, 2, 3, 4, 5],
# "columns_1": [6, 7, 8, 9, 10],
# }
data_json = data_to_file(data_path, [column_0, column_1])
data_json = data_to_json_file(data_path, [column_0, column_1])
# data_commitment is a mapping[scale -> mapping[column_name, commitment_hex]]
# {
# scale_0: {
Expand Down Expand Up @@ -51,7 +51,7 @@ def test_get_data_commitment_maps_hardcoded(tmp_path):
data_commitment_path = tmp_path / "commitments.json"
column_0 = torch.tensor([3.0, 4.5, 1.0, 2.0, 7.5, 6.4, 5.5])
column_1 = torch.tensor([2.7, 3.3, 1.1, 2.2, 3.8, 8.2, 4.4])
data_to_file(data_path, [column_0, column_1])
data_to_json_file(data_path, [column_0, column_1])
scales = [2, 3]
generate_data_commitment(data_path, scales, data_commitment_path)
with open(data_commitment_path, "r") as f:
Expand All @@ -63,7 +63,7 @@ def test_get_data_commitment_maps_hardcoded(tmp_path):

def test_integration_select_partial_columns(tmp_path, column_0, column_1, error, scales):
data_path = tmp_path / "data.json"
data_json = data_to_file(data_path, [column_0, column_1])
data_json = data_to_json_file(data_path, [column_0, column_1])
columns = list(data_json.keys())
assert len(columns) == 2
# Select only the first column from two columns
Expand All @@ -75,3 +75,83 @@ def simple_computation(state, x):
_, model = computation_to_model(simple_computation, error)
# gen settings, setup, prove, verify
compute(tmp_path, [column_0, column_1], model, scales, selected_columns)


def test_csv_data(tmp_path, column_0, column_1, error, scales):
    """
    Check that the core entry points accept a CSV data file.

    A JSON fixture is written first, a CSV file is derived from it with
    `json_file_to_csv`, and the CSV path is then passed to
    `generate_data_commitment`, `prover_gen_settings` and
    `verifier_define_calculation`. The test passes if none of them raises.
    """
    # The JSON fixture and the derived CSV must be two distinct files
    data_json_path = tmp_path / "data.json"
    data_csv_path = tmp_path / "data.csv"
    data_json = data_to_json_file(data_json_path, [column_0, column_1])
    json_file_to_csv(data_json_path, data_csv_path)

    selected_columns = list(data_json.keys())

    def simple_computation(state, x):
        return state.mean(x[0])

    sel_data_path = tmp_path / "comb_data.json"
    model_path = tmp_path / "model.onnx"
    settings_path = tmp_path / "settings.json"
    data_commitment_path = tmp_path / "commitments.json"

    # Test: `generate_data_commitment` works with csv
    generate_data_commitment(data_csv_path, scales, data_commitment_path)

    # Test: `prover_gen_settings` works with csv
    _, model_for_proving = computation_to_model(simple_computation, error)
    prover_gen_settings(
        data_path=data_csv_path,
        selected_columns=selected_columns,
        sel_data_path=str(sel_data_path),
        prover_model=model_for_proving,
        prover_model_path=str(model_path),
        scale=scales,
        mode="resources",
        settings_path=str(settings_path),
    )

    # Test: `verifier_define_calculation` works with csv
    # Instantiate the model for verification since the state of `model_for_proving` is changed after `prover_gen_settings`
    _, model_for_verification = computation_to_model(simple_computation, error)
    verifier_define_calculation(
        data_csv_path,
        selected_columns,
        str(sel_data_path),
        model_for_verification,
        str(model_path),
    )


def json_file_to_csv(data_json_path, data_csv_path):
    """
    Write a CSV version of a column-oriented JSON data file.

    The JSON file maps each column name to a list of values. The CSV output
    has the column names as its first line, followed by one comma-separated
    line per row, each value rendered with `str`.

    :param data_json_path: path of the JSON file to read
    :param data_csv_path: path of the CSV file to write
    """
    with open(data_json_path, "r") as json_file:
        table = json.load(json_file)

    header = list(table.keys())
    # Every column must be as long as the first one, otherwise rows are undefined
    expected_length = len(table[header[0]])
    for name in header:
        assert len(table[name]) == expected_length, "All columns should have the same length"

    # Transpose the columns into rows and render each row as one CSV line
    lines = [",".join(header)]
    for values in zip(*(table[name] for name in header)):
        lines.append(",".join(str(value) for value in values))

    with open(data_csv_path, "w") as csv_file:
        csv_file.write("".join(line + "\n" for line in lines))


def test__preprocess_data_file_to_json(tmp_path, column_0, column_1):
    """
    Check `_preprocess_data_file_to_json` for both CSV and JSON inputs.

    In both cases the JSON file it produces must load to the same mapping
    that `data_to_json_file` returned for the original fixture.
    """
    json_source = tmp_path / "data.json"
    expected = data_to_json_file(json_source, [column_0, column_1])

    # CSV input: derive a CSV file from the JSON fixture, convert it back to
    # JSON, and compare the result with the original data
    csv_source = tmp_path / "data.csv"
    json_file_to_csv(json_source, csv_source)
    converted_from_csv = tmp_path / "data_from_csv.json"
    _preprocess_data_file_to_json(csv_source, converted_from_csv)
    assert json.loads(converted_from_csv.read_text()) == expected

    # JSON input: the output must hold the same data as the input
    converted_from_json = tmp_path / "new_data.json"
    _preprocess_data_file_to_json(json_source, converted_from_json)
    assert json.loads(converted_from_json.read_text()) == expected
83 changes: 70 additions & 13 deletions zkstats/core.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from typing import Type, Sequence, Mapping, Union, Literal
import torch
import ezkl
import csv
from pathlib import Path
from typing import Type, Sequence, Mapping, Union, Literal, Callable
from enum import Enum
import os
import numpy as np
import json
import time

import torch
import ezkl

from zkstats.computation import IModel


Expand Down Expand Up @@ -40,7 +44,11 @@ def create_dummy(data_path: str, dummy_data_path: str) -> None:
"""
Create a dummy data file with randomized data based on the shape of the original data.
"""
data = json.loads(open(data_path, "r").read())
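# Locate the JSON counterpart of the data file: same directory and name, but with the .json suffix.
# NOTE(review): the file is only read here, not generated - confirm that it has already been created before this is called.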
data_path: Path = Path(data_path)
data_json_path = Path(data_path).with_suffix(DataExtension.JSON.value)

data = json.loads(open(data_json_path, "r").read())
# assume all columns have same number of rows
dummy_data ={}
for col in data:
Expand Down Expand Up @@ -270,16 +278,17 @@ def generate_data_commitment(data_path: str, scales: Sequence[int], data_commitm
Generate and store data commitment maps for different scales so that verifiers can verify
proofs with different scales.
:param data_path: path to the data file. The data file should be a JSON file with the following format:
{
"column_0": [number_0, number_1, ...],
"column_1": [number_0, number_1, ...],
}
:param data_path: path to the data file. Its extension must be one of the formats defined in `DataExtension`
:param scales: a list of scales to use for the commitments
:param data_commitment_path: path to store the generated data commitment maps
"""

with open(data_path) as f:
# Convert `data_path` to json file `data_json_path`
data_path: Path = Path(data_path)
data_json_path = Path(data_path).with_suffix(DataExtension.JSON.value)
_preprocess_data_file_to_json(data_path, data_json_path)

with open(data_json_path) as f:
data_json = json.load(f)
data_commitments = {
str(scale): {
Expand Down Expand Up @@ -367,14 +376,62 @@ def _gen_settings(
print("setting: ", f_setting.read())


def _csv_file_to_json(old_file_path: Union[Path, str], out_data_json_path: Union[Path, str], *, delimiter: str = ",") -> None:
data_csv_path = Path(old_file_path)
with open(data_csv_path, 'r') as f_csv:
reader = csv.reader(f_csv, delimiter=delimiter, strict=True)
# Read all data from the reader to `rows`
rows_with_column_name = tuple(reader)
if len(rows_with_column_name) < 1:
raise ValueError("No column names in the CSV file")
if len(rows_with_column_name) < 2:
raise ValueError("No data in the CSV file")
column_names = rows_with_column_name[0]
rows = rows_with_column_name[1:]

columns = [
[
float(rows[j][i])
for j in range(len(rows))
]
for i in range(len(rows[0]))
]
data = {
column_name: column_data
for column_name, column_data in zip(column_names, columns)
}
with open(out_data_json_path, "w") as f_json:
json.dump(data, f_json)


class DataExtension(Enum):
    """
    File extensions of the supported data file formats.

    Each member's value is the extension including the leading dot, so a
    member can be looked up directly from `Path.suffix`.
    """

    # Comma-separated values file
    CSV = ".csv"
    # JSON file
    JSON = ".json"


def _copy_json_file(old_file_path: Union[Path, str], out_data_json_path: Union[Path, str]) -> None:
    """
    Copy a JSON data file to `out_data_json_path` unchanged.

    The content is copied as raw bytes, so it does not depend on the
    platform's default text encoding or newline translation.

    :param old_file_path: path of the JSON file to read
    :param out_data_json_path: path of the JSON file to write
    """
    source = Path(old_file_path)
    destination = Path(out_data_json_path)
    # Read first so that a missing input still raises, even when nothing is written
    content = source.read_bytes()
    # When both paths point to the same file there is nothing to copy, and
    # skipping the write avoids rewriting the input file in place
    if source.resolve() == destination.resolve():
        return
    destination.write_bytes(content)


# Maps each supported data file format to the function that converts a file of
# that format into the JSON data file. Each function takes the input file path
# and the output JSON path.
DATA_FORMAT_PREPROCESSING_FUNCTION: dict[DataExtension, Callable[[Union[Path, str], Path], None]] = {
    DataExtension.CSV: _csv_file_to_json,
    DataExtension.JSON: _copy_json_file,
}

def _preprocess_data_file_to_json(data_path: Union[Path, str], out_data_json_path: Path) -> None:
    """
    Convert a data file in any supported format into a JSON data file.

    The input format is determined from the extension of `data_path`, and the
    matching function from `DATA_FORMAT_PREPROCESSING_FUNCTION` is called to
    produce `out_data_json_path`.

    :param data_path: path of the input data file; its extension must be one
        of the values of `DataExtension`
    :param out_data_json_path: path of the JSON file to write
    :raises ValueError: if the extension of `data_path` is not a value of
        `DataExtension`
    """
    # Wrap in `Path` so that plain strings, which the signature allows, work
    # too: `str` has no `.suffix` attribute
    data_file_extension = DataExtension(Path(data_path).suffix)
    preprocess_function = DATA_FORMAT_PREPROCESSING_FUNCTION[data_file_extension]
    preprocess_function(data_path, out_data_json_path)


def _process_data(
data_path: str,
data_path: Union[str | Path],
col_array: list[str],
sel_data_path: list[str],
) -> list[torch.Tensor]:
data_tensor_array=[]
sel_data = []
data_onefile = json.loads(open(data_path, "r").read())
data_path: Path = Path(data_path)
# Convert data file to json under the same directory but with suffix .json
data_json_path = Path(data_path).with_suffix(DataExtension.JSON.value)
_preprocess_data_file_to_json(data_path, data_json_path)
data_onefile = json.loads(open(data_json_path, "r").read())

for col in col_array:
data = data_onefile[col]
Expand All @@ -394,4 +451,4 @@ def _get_commitment_for_column(column: list[float], scale: int) -> str:
res_poseidon_hash = ezkl.poseidon_hash(serialized_data)[0]
# res_hex = ezkl.vecu64_to_felt(res_poseidon_hash[0])

return res_poseidon_hash
return res_poseidon_hash

0 comments on commit 09db78e

Please sign in to comment.