Merge pull request karpathy#28 from klei22/csv_preparation
Add csv preparation scripts
gkielian authored Nov 7, 2023
2 parents f5ead2e + e58ac5f commit f08d2e0
Showing 8 changed files with 379 additions and 0 deletions.
3 changes: 3 additions & 0 deletions data/csv_data/.gitignore
@@ -0,0 +1,3 @@
*.csv
*.bin
*.pkl
79 changes: 79 additions & 0 deletions data/csv_data/README.md
@@ -0,0 +1,79 @@
# Data-Shuffler for ML Permutation Invariance

These Python scripts process time-series data from a CSV file to add permutation
invariance to the CSV's data fields.

Each row in the CSV becomes a single line in the text file, with each cell
represented by a unique lowercase letter (starting from 'a') followed by the
value from the cell.
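
For example, a two-column row (values illustrative) maps to:

```
input CSV row:   1.23e+00,4.56e-01
output line:     a1.23e+00b4.56e-01
```

With `--shuffle`, the same line may instead appear as `b4.56e-01a1.23e+00`.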

A command-line flag optionally shuffles the letter-value pairs within each line.

Training on this data with the shuffle option enabled creates a form of
in-frame permutation invariance.

This gives data the freedom -- during inference -- to move around, unlocking
special capabilities otherwise unavailable to fixed-frame trained networks.

For example, one can run a beam search over each of the labels to determine
which letter-value pair the model is most certain of in the current frame, and
build the next frame up incrementally using this technique.
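
As a rough illustration, here is a greedy variant of that idea in Python (a
sketch, not part of this commit; `score_candidate` is a hypothetical stand-in
for a model call that returns a certainty score, e.g. mean log-probability,
for a candidate continuation):

```python
# Build the next frame label by label, always committing the letter-value
# pair the model is currently most certain about.
def build_next_frame(prefix, labels, candidate_values, score_candidate):
    frame = ""
    remaining = set(labels)
    while remaining:
        # Score every remaining (label, value) candidate in context.
        best = max(
            ((lab, val) for lab in remaining for val in candidate_values[lab]),
            key=lambda lv: score_candidate(prefix + frame + lv[0] + lv[1]),
        )
        frame += best[0] + best[1]
        remaining.remove(best[0])
    return frame
```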

## Getting Started

### Prerequisites

- Python (3.6 or above)
- `numpy` and `pandas` (used by `prepare.py` and `sine_noise_generator.py`)

## Usage

1. Separate your file into one with timestamp columns and one with data columns.
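
For example, if the timestamps occupy the first column, `cut` can split the
file (a sketch; `full_data.csv` is a placeholder name):

```sh
cut -d, -f1 full_data.csv > time_columns.csv
cut -d, -f2- full_data.csv > data_columns.csv
```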

2. Navigate to the directory where the script is located and run
`process_csv.py` on your data-column file:

```sh
python3 process_csv.py <data_column_file> <processed_data_file> --shuffle --exclude e
```

3. Recombine the output file from `process_csv.py` with the time-column data:


```sh
python3 combine_csvs.py <time_column_file> <processed_data_file> <processed_csv>
```

4. Prepare the processed data file for training:

```sh
python3 prepare.py -i <processed_data_file>
```

5. `cd` to the `explorations` folder and use the provided script to run training:


```sh
cd ../../explorations
bash run_csv_data_training.sh
```

6. [Optional] Create an exploration script to test training and inference with
and without shuffling, as sketched below.
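
A minimal sketch of such a script (assuming the placeholder `data_columns.csv`
from step 1):

```sh
# Train on the shuffled variant of the dataset...
python3 process_csv.py data_columns.csv processed_shuffled.csv --shuffle --exclude e
python3 prepare.py -i processed_shuffled.csv
(cd ../../explorations && bash run_csv_data_training.sh)

# ...then on the ordered variant, and compare the two runs.
python3 process_csv.py data_columns.csv processed_ordered.csv --exclude e
python3 prepare.py -i processed_ordered.csv
(cd ../../explorations && bash run_csv_data_training.sh)
```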

### Arguments

- `input_file`: The path to the input CSV file containing time-series data.
- `output_file`: The path to the output text file.
- `--shuffle`: Optional flag to shuffle the order of letter-value pairs in each line.
- `--exclude`: Optional list of letters to skip when labeling the fields (e.g.
  `e`, which appears in scientific notation)

### Example

For a full example, see the `main.sh` script, which runs the whole pipeline on generated sine-plus-noise data.
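
It can be run end to end from this directory:

```sh
bash main.sh
```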

## License

This project is licensed under the MIT License.

30 changes: 30 additions & 0 deletions data/csv_data/combine_csvs.py
@@ -0,0 +1,30 @@
import csv
import argparse

def combine_csv_columns(file_path1, file_path2, output_file_path):
    with open(file_path1, 'r') as file1, open(file_path2, 'r') as file2, open(output_file_path, 'w', newline='') as outfile:
        reader1 = csv.reader(file1)
        reader2 = csv.reader(file2)

        for row1, row2 in zip(reader1, reader2):
            # Combine the rows from both CSVs
            combined_row = row1 + row2

            # Join the row with no delimiter and write it to the file
            outfile.write(''.join(combined_row))

            # Write a newline character after each row
            outfile.write('\n')

def main(args):
    combine_csv_columns(args.file1, args.file2, args.output)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Combine columns of two CSV files.')
    parser.add_argument('file1', type=str, help='Path to the first input CSV file.')
    parser.add_argument('file2', type=str, help='Path to the second input CSV file.')
    parser.add_argument('output', type=str, help='Path to the output CSV file.')

    args = parser.parse_args()
    main(args)
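
For instance, a time row `1.00e+00` joined with a processed data row
`b4.56e-01a1.23e+00` yields the single output line (values illustrative):

```
1.00e+00b4.56e-01a1.23e+00
```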

42 changes: 42 additions & 0 deletions data/csv_data/main.sh
@@ -0,0 +1,42 @@
#!/bin/bash
# This is an example using generated data to demonstrate:
# 1. labelling of different time series data
# 2. shuffling of different fields with their label

set -x

# CREATE SYNTHETIC DATA -- Skip if using your own data
# This generator creates two CSVs (if `--split` is specified):
# 1. time data called: `time_filename.csv`
# 2. signal data called: `data_filename.csv`
python3 sine_noise_generator.py --noise_level 0.3 --filename sine_data.csv --scientific --precision 2 --modulo 1000 --points 1000000 --split

set +x
echo -e "\nPreview: Generated Times"
head time_sine_data.csv

echo -e "\n\nPreview: Generated Data"
head data_sine_data.csv
echo -e "\n\n"
set -x

# Use a data-only CSV (no timestamps), in this case `data_sine_data.csv`
# This script does two things:
# 1. _prepend_ labels to the data
# 2. (optionally) shuffle the data
# Since 'e' is used for scientific notation, skip this letter when labelling
python3 process_csv.py data_sine_data.csv sine_noise_sn_shuffled.csv --shuffle --exclude e

# preview the result
set +x
echo -e "\nPreview: Shuffled Data"
head sine_noise_sn_shuffled.csv
echo -e "\n\n"
set -x

# recombine
python3 combine_csvs.py time_sine_data.csv sine_noise_sn_shuffled.csv processed_sine_data.csv

set +x
echo -e "\nPreview: Timestamps with Shuffled Data"
head processed_sine_data.csv
69 changes: 69 additions & 0 deletions data/csv_data/prepare.py
@@ -0,0 +1,69 @@
import os
import pickle
import numpy as np
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", required=True)
    parser.add_argument("-t", "--token_file", default=None)
    return parser.parse_args()


def process_data(input_file, token_file):
    with open(input_file, "r") as f:
        data = f.read()

    # Build the vocabulary from a separate token file if one is given,
    # otherwise derive it from the input data itself.
    if token_file is not None:
        with open(token_file, "r") as f:
            token_data = f.read()
    else:
        token_data = data

    print(f"Length of dataset: {len(data):,}")

    chars = sorted(list(set(token_data)))
    vocab_size = len(chars)

    print(f"Unique chars: {''.join(chars)}")
    print(f"Vocab size: {vocab_size:,}")

    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}

    def encode(s):
        return [stoi[c] for c in s]

    def decode(l):
        return "".join([itos[i] for i in l])

    # 90/10 train/val split
    n = len(data)
    train_data = data[: int(n * 0.9)]
    val_data = data[int(n * 0.9) :]

    train_ids = encode(train_data)
    val_ids = encode(val_data)

    print(f"Train tokens: {len(train_ids):,}")
    print(f"Val tokens: {len(val_ids):,}")

    return train_ids, val_ids, stoi, itos


def save_data(train_ids, val_ids, stoi, itos, output_dir):
    train_ids = np.array(train_ids, dtype=np.uint16)
    val_ids = np.array(val_ids, dtype=np.uint16)

    train_ids.tofile(os.path.join(output_dir, "train.bin"))
    val_ids.tofile(os.path.join(output_dir, "val.bin"))

    meta = {"vocab_size": len(stoi), "itos": itos, "stoi": stoi}
    with open(os.path.join(output_dir, "meta.pkl"), "wb") as f:
        pickle.dump(meta, f)


if __name__ == "__main__":
    args = parse_args()
    train_ids, val_ids, stoi, itos = process_data(args.input_file, args.token_file)
    save_data(train_ids, val_ids, stoi, itos, ".")
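
As a quick sanity check, the artifacts written by `prepare.py` can be read back
as follows (a sketch, not part of this commit):

```python
import pickle
import numpy as np

# meta.pkl holds the vocab size and the stoi/itos mappings
with open("meta.pkl", "rb") as f:
    meta = pickle.load(f)

# train.bin stores tokens as uint16, matching save_data()
train_ids = np.fromfile("train.bin", dtype=np.uint16)

print(f"vocab size: {meta['vocab_size']}, train tokens: {len(train_ids):,}")
print("preview:", "".join(meta["itos"][int(i)] for i in train_ids[:80]))
```
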
49 changes: 49 additions & 0 deletions data/csv_data/process_csv.py
@@ -0,0 +1,49 @@
import csv
import argparse
import random

def create_letter_mapping(exclude: list) -> dict:
    # Create a mapping of indices to letters, skipping excluded letters.
    allowed_letters = [chr(i) for i in range(ord('a'), ord('z') + 1) if chr(i) not in exclude]
    return {i: letter for i, letter in enumerate(allowed_letters)}

def process_csv(input_file: str, output_file: str, shuffle: bool, exclude: list) -> None:
    # Create the letter mapping
    letter_mapping = create_letter_mapping(exclude)

    with open(input_file, mode="r") as csv_file, open(output_file, mode="w") as txt_file:
        csv_reader = csv.reader(csv_file)

        for row in csv_reader:
            # Use the letter mapping to assign letters to values
            letter_value_pairs = [
                f"{letter_mapping[i]}{val}" for i, val in enumerate(row) if i in letter_mapping
            ]

            if shuffle:
                random.shuffle(letter_value_pairs)

            # Join the letter-value pairs with no spaces and write to the output file.
            txt_file.write("".join(letter_value_pairs) + "\n")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process a time-series CSV and convert it to a custom text format while excluding certain letters."
    )
    parser.add_argument("input_file", type=str, help="Path to the input CSV file.")
    parser.add_argument("output_file", type=str, help="Path to the output text file.")
    parser.add_argument(
        "--shuffle",
        action="store_true",
        help="Whether to shuffle the order of letter and value pairs.",
    )
    parser.add_argument(
        "--exclude",
        nargs="*",
        default=[],
        help="A list of letters to exclude from the letter labeling.",
    )

    args = parser.parse_args()
    process_csv(args.input_file, args.output_file, args.shuffle, args.exclude)

98 changes: 98 additions & 0 deletions data/csv_data/sine_noise_generator.py
@@ -0,0 +1,98 @@
import argparse
import numpy as np
import pandas as pd

# Function to generate sine wave with noise
def generate_sine_wave_with_noise(freq, sample_rate, num_points, noise_level):
    t = np.arange(num_points)  # Time axis with integer values starting from 0
    sine_wave = np.sin(2 * np.pi * freq * t / sample_rate)
    noise = noise_level * np.random.normal(size=num_points)
    combined_signal = sine_wave + noise
    return t, sine_wave, noise, combined_signal

# Function to format data as scientific notation if required
def format_data_as_scientific(df, precision):
    return df.applymap(lambda x: f"{x:.{precision}e}")

# Function to save the data to CSV
def save_to_csv(time, signal, noise, combined_signal, filename, scientific, precision, split_files, modulo):
    # Apply modulo if necessary
    if modulo is not None:
        time = time % modulo

    # Create the DataFrame
    data = {
        'signal': signal,
        'noise': noise,
        'signal_plus_noise': combined_signal
    }

    if split_files:
        # Save time data to a separate CSV
        time_df = pd.DataFrame({'seconds_from_start': time})
        if scientific:
            time_df = format_data_as_scientific(time_df, precision)
        time_df.to_csv(f"time_{filename}", header=False, index=False)

        # Save data to a separate CSV
        data_df = pd.DataFrame(data)
        if scientific:
            data_df = format_data_as_scientific(data_df, precision)
        data_df.to_csv(f"data_{filename}", header=False, index=False)
    else:
        # Combine time and data for a single CSV
        df = pd.DataFrame({'seconds_from_start': time, **data})
        if scientific:
            df = format_data_as_scientific(df, precision)
        df.to_csv(filename, header=False, index=False)

# Parse command-line arguments
def parse_arguments():
    parser = argparse.ArgumentParser(description='Generate a sine wave with noise and export to CSV.')
    parser.add_argument('-n', '--noise_level', type=float, default=0.5,
                        help='Level of noise relative to the sine wave (0-1).')
    parser.add_argument('-f', '--filename', type=str, default='sine_wave.csv',
                        help='Name of the output CSV file.')
    parser.add_argument('--scientific', action='store_true',
                        help='Output numbers in scientific notation.')
    parser.add_argument('--precision', type=int, default=2,
                        help='Number of digits past the decimal point in scientific notation.')
    parser.add_argument('--points', type=int, default=5000,
                        help='Total number of data points to be created.')
    parser.add_argument('--split', action='store_true',
                        help='Save time data and signal data in separate CSV files.')
    parser.add_argument('--modulo', type=int,
                        help='Modulo value to apply to the time data.')
    args = parser.parse_args()
    if not (0 <= args.noise_level <= 1):
        raise ValueError('Noise level must be between 0 and 1.')
    if args.precision < 0:
        raise ValueError('Precision must be a non-negative integer.')
    if args.points <= 0:
        raise ValueError('Number of data points must be a positive integer.')
    return args

def main():
    args = parse_arguments()

    # Parameters for sine wave generation
    frequency = 5  # Frequency in Hz
    sample_rate = 500  # Sample rate in Hz
    num_points = args.points  # Total number of data points

    # Generate the sine wave with noise
    time, sine_wave, noise, combined_signal = generate_sine_wave_with_noise(
        frequency, sample_rate, num_points, args.noise_level
    )

    # Save to CSV file(s)
    save_to_csv(time, sine_wave, noise, combined_signal, args.filename, args.scientific, args.precision, args.split, args.modulo)
    if args.split:
        print(f"Time data saved to time_{args.filename}")
        print(f"Signal data saved to data_{args.filename}")
    else:
        print(f"Sine wave data with noise saved to {args.filename}")

if __name__ == '__main__':
    main()

9 changes: 9 additions & 0 deletions explorations/run_csv_data_training.sh
@@ -0,0 +1,9 @@
#!/bin/bash

cd ../
python3 train.py \
--max_iters 3000 \
--dataset csv_data \
--tensorboard_project csv_data \
--tensorboard_run_name csv_data
