# create_dataset.py
# Forked from Harry24k/adversarial-attacks-pytorch.
import os
import shutil
import random
import re
from PIL import Image
from tqdm import tqdm
# Root directories of the three input datasets.
clean_path = '/home/elios/lazzaroni/adv/datasets/clean'
pixle_path = '/home/elios/lazzaroni/adv/datasets/pixle'
poltergeist_path = '/home/elios/lazzaroni/adv/datasets/poltergeist'

# Root directory of the dataset that this script assembles.
new_dataset_path = '/home/elios/lazzaroni/adv/datasets/FC_5K_3K_3K'

# Per-split cap on how many clean base names are sampled.
num_images = {'train': 5000, 'val': 3000, 'test': 3000}

# Split names (in the insertion order of num_images) and class names.
splits = list(num_images)
classes = ['clean', 'pixle', 'poltergeist']
# Create new dataset directory structure
def create_directory_structure(base_path, splits, classes):
    """Create ``<base_path>/<split>/<class>`` for every split/class pair.

    Args:
        base_path: Root directory under which the tree is created.
        splits: Iterable of split names (first directory level).
        classes: Iterable of class names (second directory level). May be a
            one-shot iterable such as a generator.

    Directories that already exist are left untouched (``exist_ok=True``);
    missing intermediate directories are created as needed.
    """
    # Materialize once: the class names are walked again for every split, so a
    # generator would otherwise be exhausted after the first split.
    classes = tuple(classes)
    for split in splits:
        for cls in classes:
            os.makedirs(os.path.join(base_path, split, cls), exist_ok=True)
# Create <new_dataset_path>/<split>/<class> for every split and class;
# directories that already exist are left as they are.
create_directory_structure(new_dataset_path, splits, classes)
# Function to get base filenames from a dataset directory for a split
def get_base_filenames(path, split, subfolder='images'):
    """Return the extension-less names of the image files of one split.

    The files are read from ``<path>/<split>/<subfolder>`` when that path
    exists, otherwise from ``<path>/<split>`` itself. An entry counts as an
    image when its name ends, case-insensitively, in ``.png``, ``.jpg``,
    ``.jpeg``, ``.bmp`` or ``.gif``.

    Args:
        path: Root directory of the dataset.
        split: Name of the split directory below ``path``.
        subfolder: Optional nested directory that is preferred when present.

    Returns:
        A set with each matching file name stripped of its final extension.

    Raises:
        OSError: Propagated from ``os.listdir`` when the chosen directory
            cannot be listed (for example, when it does not exist).
    """
    image_dir = os.path.join(path, split)
    nested_dir = os.path.join(image_dir, subfolder)
    if os.path.exists(nested_dir):
        image_dir = nested_dir

    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    return {
        os.path.splitext(entry)[0]
        for entry in os.listdir(image_dir)
        if entry.lower().endswith(image_suffixes)
    }
# Function to process and copy images
def process_and_copy_images(split, basenames, source_paths, dest_path):
    """Re-save, as PNG, every source image whose name starts with a base name.

    For each class in ``source_paths`` the images are read from
    ``<source_path>/<split>`` (for the ``'clean'`` class, from its ``images``
    subdirectory when that exists) and written to
    ``<dest_path>/<split>/<class>/<stem>.png``.

    Args:
        split: Name of the split directory to read from and write to.
        basenames: Iterable of base names; a source file is selected when its
            extension-less name starts with one of them.
        source_paths: Mapping of class name to that class's dataset root.
        dest_path: Root directory of the dataset being assembled.

    Classes whose split directory is missing are skipped. A file that cannot
    be opened or saved is reported on stdout and skipped; it does not abort
    the run.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    # List every class directory once up front instead of once per base name;
    # the listing does not depend on the base name being processed.
    candidates = {}
    for cls, source_path in source_paths.items():
        split_source_path = os.path.join(source_path, split)
        nested_path = os.path.join(split_source_path, 'images')
        if cls == 'clean' and os.path.isdir(nested_path):
            split_source_path = nested_path
        if not os.path.isdir(split_source_path):
            continue
        dest_dir = os.path.join(dest_path, split, cls)
        # Make the function usable even if the tree was not created beforehand.
        os.makedirs(dest_dir, exist_ok=True)
        stems_and_names = [
            (os.path.splitext(fname)[0], fname)
            for fname in os.listdir(split_source_path)
            if fname.lower().endswith(image_suffixes)
        ]
        candidates[cls] = (split_source_path, dest_dir, stems_and_names)

    # The progress label uses the split name: the loop variable is not bound
    # yet while the iterable expression of the ``for`` is being evaluated.
    for base_name in tqdm(basenames, desc=f"Processing {split}"):
        for cls, (split_source_path, dest_dir, stems_and_names) in candidates.items():
            # NOTE(review): this is a plain prefix match, so base name "img_1"
            # also selects "img_10..." — confirm against the naming scheme.
            matched_files = [
                fname for stem, fname in stems_and_names
                if stem.startswith(base_name)
            ]
            # Only the order in which the files are copied is randomized.
            random.shuffle(matched_files)
            for fname in tqdm(matched_files, desc=f"Copying files for {base_name} - {cls}", leave=False):
                src_file = os.path.join(split_source_path, fname)
                # Sources that differ only in extension map to the same target.
                dest_file = os.path.join(dest_dir, os.path.splitext(fname)[0] + '.png')
                try:
                    with Image.open(src_file) as img:
                        img.save(dest_file, 'PNG')
                except Exception as e:
                    # Deliberate best effort: report the file and keep going.
                    print(f"Error processing file {src_file}: {e}")
# Class name -> root directory of the dataset that class is read from.
source_paths = {'clean': clean_path, 'pixle': pixle_path, 'poltergeist': poltergeist_path}

# For every split: draw a random sample of clean base names, capped at the
# number configured for that split, then copy the matching images of each class.
for split in splits:
    clean_basenames = get_base_filenames(clean_path, split)
    sample_size = min(num_images[split], len(clean_basenames))
    selected_basenames = random.sample(list(clean_basenames), sample_size)
    process_and_copy_images(
        split,
        selected_basenames,
        source_paths,
        new_dataset_path,
    )