forked from marian42/butterflies
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_metadata.py
82 lines (69 loc) · 2.13 KB
/
create_metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from tqdm import tqdm
import csv
from config import *
class DataProperty:
    """Describe one metadata column and accumulate its values.

    Attributes:
        column: Index of the field to read from each source CSV row.
        name: Label of the column; used as the header when the collected
            values are written out.
        values: Collected cell values, appended to by the code that scans
            the source file. Starts out empty.
        type: Python type associated with the values (``str`` by default).
            It is only recorded here; this class performs no conversion.
    """

    def __init__(self, column: int, name: str, type: type = str) -> None:
        # NOTE: ``type`` shadows the builtin inside this method, but it is
        # part of the public signature (callers may pass it by keyword), so
        # the name is kept.
        self.column = column
        self.name = name
        self.values = []  # a fresh list per instance, never shared
        self.type = type
# Fields extracted from each occurrence row. ``column`` is the index into the
# row read from data/occurrence.csv and ``name`` becomes the header of the
# corresponding column in the written metadata file. The order of this list
# is the column order of the output.
# NOTE: the ``type`` argument is stored on the property but nothing in this
# script converts values with it; every value is kept as a stripped string.
columns = [
    # Taxonomy
    DataProperty(column=17, name='Family'),
    DataProperty(column=20, name='Genus'),
    DataProperty(column=62, name='Species'),
    DataProperty(column=31, name='Subspecies'),
    DataProperty(column=25, name='Higher Classification'),
    DataProperty(column=61, name='Sex'),
    # Location
    DataProperty(column=10, name='Latitude', type=float),
    DataProperty(column=11, name='Longitude', type=float),
    DataProperty(column=8, name='Country'),
    # Naming
    DataProperty(column=59, name='Name'),
    DataProperty(column=60, name='Name Author'),
    # Date
    DataProperty(column=9, name='Day'),
    DataProperty(column=50, name='Month'),
    DataProperty(column=70, name='Year'),
    # Identifiers
    DataProperty(column=0, name='id', type=int),
    DataProperty(column=51, name='Occurence ID'),
]
# Pass 1: scan data/occurrence.csv and, for every row whose column 25
# contains "lepidoptera" (case-insensitive), append the selected fields to the
# matching ``DataProperty.values`` list in ``columns``.
row_by_id = {}  # int(row[0]) of an accepted row -> its index in the values lists
row_index = 0   # number of rows accepted so far

# The csv module asks for files to be opened with newline='' so that newlines
# embedded in quoted fields are handled by the reader itself. The ``with``
# blocks make sure the file handle and the progress bar are both closed.
with open('data/occurrence.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    reader_iterator = iter(reader)
    column_names = next(reader_iterator)  # header row, consumed and skipped

    # ``total`` is a hard-coded row count used only to scale the progress bar.
    with tqdm(total=1039840, desc='Reading occurence.csv') as progress:
        for row in reader_iterator:
            progress.update()
            if 'lepidoptera' not in row[25].lower():
                continue

            for data_property in columns:
                data_property.values.append(row[data_property.column].strip())

            # The id is parsed only for rows that are kept. If an id occurs
            # more than once, the last accepted row wins in ``row_by_id``.
            row_by_id[int(row[0])] = row_index
            row_index += 1
# Not used anywhere in this script; kept so the module's globals are unchanged.
strings = []
name_ids = {}

# Pass 2: scan data/multimedia.csv and collect (image, id) pairs for JPEG
# entries that belong to an occurrence accepted in pass 1 (``row_by_id``).
image_ids = []

# newline='' is what the csv module asks for on files handed to csv.reader;
# the ``with`` blocks close the file handle and the progress bar.
with open('data/multimedia.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    reader_iterator = iter(reader)
    column_names = next(reader_iterator)  # header row, consumed and skipped

    # ``total`` is a hard-coded row count used only to scale the progress bar.
    with tqdm(total=2126980, desc='Reading multimedia.csv') as progress:
        for row in reader_iterator:
            progress.update()

            # Filters run before any parsing, so a malformed row that would
            # be discarded anyway cannot abort the whole run.
            if '_label_' in row[2]:
                continue
            if row[3] != 'image/jpeg':
                continue
            occurrence_id = int(row[0])
            if occurrence_id not in row_by_id:
                continue

            # The image key is the third-from-last '/'-separated segment of
            # column 5 (presumably a URL or path; confirm against the data).
            image = row[5].split('/')[-3]
            image_ids.append((image, occurrence_id))
# Write the metadata table: one header row, then one row per collected image.
# Each row holds the occurrence fields (in ``columns`` order) followed by the
# image key. newline='' stops the csv writer's own line terminator from being
# translated again by the text layer, which would yield blank rows on Windows.
with open(METADATA_FILE_NAME, 'w', newline='') as file:
    csv_writer = csv.writer(file, delimiter=',')
    csv_writer.writerow([c.name for c in columns] + ['image'])
    for image, occurrence_id in tqdm(image_ids, desc='Writing metadata.csv'):
        # Position of this occurrence inside every DataProperty.values list.
        row_index = row_by_id[occurrence_id]
        csv_writer.writerow([c.values[row_index] for c in columns] + [image])