src_model/MapYourCity_ExampleDataLoader.py

#!/usr/bin/env python
# coding: utf-8

# Example DataLoader for the MapYourCity dataset      

"""
ABOUT SCRIPT: 
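This file creates example PyTorch DataLoaders for the MapYourCity dataset: a training loader and a held-out (validation) loader, both drawn from the train folder; the test-set csv file is only loaded 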
This code is generated by Nikolaos Dionelis @ESA 
LAST EDITED: 05/02/2024 
"""

# Python library imports      
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
import rasterio 
import os  
import cv2
# We use PyTorch 
import torch

class Dataset(torch.utils.data.Dataset):
    """
    This class defines the data with all the 3 modalities

    Every sample is a folder named after its ID that holds the files
    street.jpg, orthophoto.tif, s2_l2a.tif and label.txt
    """
    # Target size handed to cv2.resize for the street image and the orthophoto
    IMAGE_SIZE = (256, 256)

    def __init__(self, list_IDs, data_path=None):
        """
        This function initializes the data class - constructor function
        :param list_IDs: the PID numbers - (i.e. the pid), one folder name per sample
        :param data_path: directory that contains the per-sample folders; when None
                          (the default) the module-level train_path is used, looked
                          up at access time exactly as before
        """
        self.list_IDs = list_IDs
        self.data_path = data_path

    def __len__(self):
        """Return the number of samples (length of list_IDs)"""
        return len(self.list_IDs)

    def _read_resized(self, path):
        """Read an image with cv2 and resize it to IMAGE_SIZE, failing loudly if unreadable"""
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals a missing/undecodable file by returning None;
            # without this check the failure shows up as an obscure cv2.resize error
            raise OSError("cv2.imread could not read image: " + str(path))
        return cv2.resize(image, self.IMAGE_SIZE)

    def __getitem__(self, index):
        """
        Load one sample from disk
        :param index: position of the sample inside list_IDs
        :return: tuple (X, X2, X3, y) = (resized street image, resized orthophoto,
                 s2_l2a raster with the first axis moved last, integer label)
        :raises OSError: if the street image or the orthophoto cannot be read
        :raises ValueError: if label.txt does not contain an integer
        """
        ID = self.list_IDs[index]
        root = train_path if self.data_path is None else self.data_path
        # os.path.join gives the same path as the former string concatenation when
        # the root ends with "/", and also works when it does not
        sample_dir = os.path.join(root, ID)

        X = self._read_resized(os.path.join(sample_dir, 'street.jpg'))
        X2 = self._read_resized(os.path.join(sample_dir, 'orthophoto.tif'))

        # Context managers close the raster dataset and the label file right away,
        # instead of leaving open handles behind for every sample that is read
        with rasterio.open(os.path.join(sample_dir, 's2_l2a.tif')) as raster:
            X3 = raster.read()
        X3 = np.transpose(X3, [1, 2, 0])

        with open(os.path.join(sample_dir, 'label.txt'), "r") as label_file:
            y = int(label_file.read())
        return X, X2, X3, y

# Number of samples per mini-batch (256 is the alternative value that was tried)
BATCH_SIZE = 32

# Root directory with the MapYourCity image files
# This line has to be modified/ changed to point to your own copy of the data
# It must end with "/" because the sub-paths are appended by plain concatenation
input_path = "/Data/ndionelis/building-age-dataset/"

# Folders that hold the per-sample data of the train and the test split
train_path = f"{input_path}train/data/"
test_path = f"{input_path}test/data/"

# Load the test-set and train-set csv files into DataFrames
# The former bare `.head()` statements were dropped: outside a notebook cell
# their result is discarded, so they neither displayed nor changed anything
test_df = pd.read_csv(input_path + "test/test-set.csv")
train_df = pd.read_csv(input_path + "train/train-set.csv")

# Split the sample folders found under train_path into a training part and a held-out part
# Only the folder names are kept here, so that not all data is loaded in a single tensor
# NOTE(review): os.listdir returns the entries in arbitrary order and the saved
# indices below are positions into this listing - confirm that the order matches
# the one in effect when 'indexForTrainVal.pt' was written
names_data = np.array(os.listdir(train_path))
length_names = len(names_data)

# perm is only consumed by the (commented-out) recipe that presumably created the index file:
#   idx = perm[:round(0.8*length_names)]  # draw round(0.8*length_names) samples
#   torch.save(idx, 'indexForTrainVal.pt')
perm = torch.randperm(length_names)

# Re-use the stored selection, converted to a NumPy index array
idx = torch.load('indexForTrainVal.pt').numpy()

# For the training data: the names picked by idx
training_data = names_data[idx]

# For the held-out data: every name that idx did not pick
# (a boolean mask is used because ~idx on integer indices would not give the complement)
mask = np.full(names_data.shape, True, dtype=bool)
mask[idx] = False
test_data = names_data[mask]

# Training loader - shuffle=True, so the sample order is redrawn for every pass
# train_dataloader is the name used further below; train_loader is kept as an alias
train_set = Dataset(training_data.tolist())
train_dataloader = train_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
# Quick manual check of one batch:
#   train_loader_iter_next = next(iter(train_loader))

# Held-out loader - built from the names in test_data, which were left out of the
# training part above; it is served in fixed order (no shuffling) and is exposed
# as valid_dataloader, with test_loader kept as an alias
test_set = Dataset(test_data.tolist())
valid_dataloader = test_loader = torch.utils.data.DataLoader(
    dataset=test_set,
    batch_size=BATCH_SIZE,
)
# Quick manual check of one batch:
#   test_loader_iter_next = next(iter(test_loader))

# Print the .shape of the four entries of one training batch, in the order
# returned by Dataset.__getitem__: X, X2, X3, y
# The sizes depend on the BATCH_SIZE
# One batch is fetched and re-used for all four prints: calling
# next(iter(train_dataloader)) per print would build a new iterator each time
# and read a whole (reshuffled) batch from disk four times
_first_batch = next(iter(train_dataloader))
for _batch_entry in _first_batch[:4]:
    print(_batch_entry.shape)

# To run this script: python MapYourCity_ExampleDataLoader.py   
# Alternatively, see the commented-out `if __name__ == '__main__':` variant below 

# if __name__ == '__main__':
#     BATCH_SIZE = 32 
#     input_path = "/Data/ndionelis/building-age-dataset/" # This line has to be modified/ changed  
#     train_path = input_path + "train/data/"
#     train_df = pd.read_csv(input_path + "train/train-set.csv")
#     train_df.head()
#     names_data = os.listdir(train_path) # to not load all data in a single tensor, load only the names                      
#     train_set = Dataset(training_data) 
#     train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)  
#     print(next(iter(train_dataloader))[0].shape) 
#     print(next(iter(train_dataloader))[1].shape)
#     print(next(iter(train_dataloader))[2].shape)
#     print(next(iter(train_dataloader))[3].shape)