-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcaptions_copy_from_reference_path.py
79 lines (64 loc) · 3.02 KB
/
captions_copy_from_reference_path.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import shutil
from collections import defaultdict
# Function to scan a directory for specific file types
def scan_directory(path, extensions):
    """Recursively collect files under *path* whose names end with *extensions*.

    The comparison is case-insensitive on both sides: the file name and the
    requested extensions are lower-cased before matching.

    Args:
        path: Root directory handed to ``os.walk``.
        extensions: One extension string (e.g. ``".txt"``) or any iterable of
            extension strings (tuple, list, set, ...).

    Returns:
        A list of ``os.path.join(root, filename)`` paths, in ``os.walk`` order.
        An empty *extensions* iterable yields an empty list.
    """
    # str.endswith accepts only a str or a tuple of str, so normalise every
    # other iterable to a tuple. A bare string is wrapped first; iterating it
    # directly would split it into single characters.
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Lower-case the suffixes so that e.g. ".JPG" can match the lower-cased
    # file name used in the test below.
    suffixes = tuple(ext.lower() for ext in extensions)

    files = []
    for root, _, filenames in os.walk(path):
        files.extend(
            os.path.join(root, filename)
            for filename in filenames
            if filename.lower().endswith(suffixes)
        )
    return files
# Function to extract base name without extension and optional " (copy X)" part
def clean_basename(file_name):
    """Return *file_name* without its extension and without a " (copy ..." tail.

    Only the last extension is removed. If the remaining stem contains the
    marker " (copy", everything from the last occurrence of that marker to the
    end is dropped; otherwise the stem is returned unchanged.
    """
    stem, _extension = os.path.splitext(file_name)
    # rpartition splits on the LAST occurrence; an empty separator in the
    # result means the marker was not present at all.
    head, marker, _tail = stem.rpartition(" (copy")
    return head if marker else stem
# Main function to process images and match TXT files
def process_files(work_path, reference_path):
    """Copy a matching caption (.txt) next to every image under *work_path*.

    Captions are collected from *reference_path* and indexed by file name
    without the ``.txt`` extension. For each image the following keys are
    tried in order until one has captions:

    1. the image's exact stem (``"foo (copy 1)"`` for ``foo (copy 1).jpg``),
    2. the stem cleaned by ``clean_basename`` (``"foo"``),
    3. the full image file name (matches captions such as ``foo.jpg.txt``).

    When several captions share the key, one whose parent folder name equals
    the image's parent folder name is preferred; otherwise the first one found
    is used. The caption is copied beside the image as ``<image stem>.txt``
    (``shutil.copy`` overwrites an existing destination file).

    Images without a caption are reported on stdout, followed by a summary of
    matched and missing counts.

    Args:
        work_path: Directory tree containing the images.
        reference_path: Directory tree containing the reference TXT files.
    """
    # Supported image extensions
    image_extensions = (".jpg", ".jpeg", ".png", ".webp")
    text_extension = ".txt"

    # Scan directories
    image_files = scan_directory(work_path, image_extensions)
    text_files = scan_directory(reference_path, (text_extension,))

    # Index captions: stem -> [(caption path, parent folder name), ...]
    text_files_dict = defaultdict(list)
    for text_file in text_files:
        folder_name = os.path.basename(os.path.dirname(text_file))
        base_name = os.path.splitext(os.path.basename(text_file))[0]
        text_files_dict[base_name].append((text_file, folder_name))

    matched_count = 0
    missing_count = 0

    for image_file in image_files:
        image_dir = os.path.dirname(image_file)
        image_folder = os.path.basename(image_dir)
        image_name = os.path.basename(image_file)
        image_stem = os.path.splitext(image_name)[0]

        # Caption keys never carry the ".txt" extension, so the exact lookup
        # must use the image stem. The full file name is kept as a last
        # resort for captions named like "foo.jpg.txt".
        candidate_keys = (image_stem, clean_basename(image_name), image_name)
        potential_matches = []
        for key in candidate_keys:
            # .get() avoids inserting empty lists into the defaultdict.
            potential_matches = text_files_dict.get(key, [])
            if potential_matches:
                break

        if not potential_matches:
            print(f"No matching TXT file found for image: {image_file}")
            missing_count += 1
            continue

        # Prefer matches in the same-named folder as the image
        preferred_matches = [m for m in potential_matches if m[1] == image_folder]
        source_txt_path = (preferred_matches or potential_matches)[0][0]
        dest_txt_path = os.path.join(image_dir, f"{image_stem}{text_extension}")

        try:
            shutil.copy(source_txt_path, dest_txt_path)
        except shutil.SameFileError:
            # Work and reference trees overlap and the caption already sits
            # next to the image: nothing to copy, but it still is a match.
            pass
        matched_count += 1

    # Print final report
    print(f"Matched TXT files: {matched_count}")
    print(f"Missing TXT files: {missing_count}")
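# Command-line entry point:
#   python captions_copy_from_reference_path.py --work WORK_DIR --references REFERENCE_DIR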
if __name__ == "__main__":
    # argparse is only needed when the file is run as a script.
    import argparse

    cli = argparse.ArgumentParser(description="Match images with TXT files.")
    # Both directories are mandatory; argparse exits with a usage message if
    # either one is missing.
    cli.add_argument(
        "--work",
        required=True,
        help="Path to the work directory containing images.",
    )
    cli.add_argument(
        "--references",
        required=True,
        help="Path to the references directory containing TXT files.",
    )
    options = cli.parse_args()

    process_files(options.work, options.references)