linear_regression.Rmd

---
title: "Machine Learning I: Group Assignment 2: Supervised"
author: "GMBD: 2020 Intake: Group E"
date: "11/JUL/2020"
output:
  html_document:
    code_folding: hide
    toc: true
    toc_float:
      collapsed: true
---

```{css, echo=FALSE}
/* Centre HTML widgets horizontally inside their container. */
.html-widget {
    margin: auto;
}
/* Force the main content container to a fixed 1280px width. */
body .main-container {
  max-width: 1280px !important;
  width: 1280px !important;
}
body {
  max-width: 1280px !important;
}
/* Class selector: a bare `toc-content` only matches a <toc-content> element,
   which the rendered page does not contain, so the rule never applied. */
.toc-content {
    max-width: 1280px !important;
}
```

```{r setup, include=FALSE, eval=TRUE}
# Chunk defaults for the whole report: show the code, hide messages and warnings.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
```

```{r, echo=FALSE}
library(tidyverse)
library(readxl)
library(skimr)
library(MASS)
library(leaflet)
library(PerformanceAnalytics)
library(gmodels)
library(htmltools)
library(flipPlots)
library(viridisLite)
library(ggridges)
library(scales)
library(stringr)
library(fitdistrplus)
```

# Reading data

The source data is read from the original *Excel* file.

```{r}
# Load the raw listings from the assignment workbook.
source_file <- file.path('data', 'Houses for rent in madrid_assignment 2020.xlsx')
data <- read_excel(source_file, sheet = 'Houses_for_rent_madrid_assignme')
# skim(data)
```

# Data preparation

Some data preparation was done to the RAW data.\
Including:\
* Normalization of the address\
* Extraction of property type from the description\
* Completion of data\
  * For the **Chalet**: Variables Floor, Outer and Elevator were revised\
  * For **Estudio**: Variable Bedrooms was revised\
  * For **Chalet pareado**: Variables Floor and Outer were revised\
  * For **Chalet adosado**: Variables Floor and Outer were revised\
\
A summary of the data is presented.

```{r}
# Recode a 0/1 indicator as a No/Yes factor.
as_yes_no <- function(flag) {
  factor(flag, levels = c(0, 1), labels = c('No', 'Yes'))
}

# Property types whose missing Outer flag is completed with 0.
chalet_types <- c('Chalet', 'Chalet pareado', 'Chalet adosado')

data1 <- data %>%
  mutate(
    # Property type, read from the leading words of the listing address.
    # The first matching pattern wins; anything unmatched becomes 'Other'.
    Type = factor(case_when(
      grepl('^Piso\\sen', Address) ~ 'Piso',
      grepl('^Ático\\sen', Address) ~ 'Ático',
      grepl('^Dúplex\\sen', Address) ~ 'Dúplex',
      grepl('^Estudio\\sen', Address) ~ 'Estudio',
      grepl('^Chalet\\spareado\\sen', Address) ~ 'Chalet pareado',
      grepl('^Chalet\\sadosado\\sen', Address) ~ 'Chalet adosado',
      grepl('^Caserón\\sen', Address) ~ 'Chalet',
      grepl('^Casa\\so\\schalet\\sindependiente\\sen', Address) ~ 'Chalet',
      grepl('^Chalet\\sen', Address) ~ 'Chalet',
      TRUE ~ 'Other'
    )),
    # Normalise the Area text.
    Area = Area %>%
      gsub('\\s-\\s', '-', .) %>%
      gsub('^en\\s', '', .) %>%
      gsub('chalet independiente en Nueva España', 'Nueva España', .),
    # Floor: always 0 for 'Chalet adosado' (data error in the source);
    # for 'Chalet' and 'Chalet pareado' only missing values become 0.
    Floor = ifelse(
      Type == 'Chalet adosado' |
        (is.na(Floor) & Type %in% c('Chalet', 'Chalet pareado')),
      0, Floor
    ),
    # Outer: missing values become 0 for every chalet variant.
    Outer = ifelse(is.na(Outer) & Type %in% chalet_types, 0, Outer),
    # Elevator: missing values become 0 for 'Chalet'.
    Elevator = ifelse(is.na(Elevator) & Type == 'Chalet', 0, Elevator),
    # Bedrooms: missing values become 0 for 'Estudio'.
    Bedrooms = ifelse(is.na(Bedrooms) & Type == 'Estudio', 0, Bedrooms),
    # Indicator columns as No/Yes factors.
    Outer = as_yes_no(Outer),
    Elevator = as_yes_no(Elevator),
    # Type indicators; these are dropped later in favour of 'Type'.
    Penthouse = as_yes_no(Penthouse), # Ático
    Cottage = as_yes_no(Cottage), # Casa o chalet independiente, Caserón, Chalet, Chalet adosado, Chalet pareado
    Duplex = as_yes_no(Duplex), # Dúplex
    Semidetached = as_yes_no(Semidetached), # Chalet, Chalet adosado
    # Title-cased address strings: street + number (1 when missing) + area,
    # and area only.
    AddressComplete = str_to_title(paste(
      gsub(".*\\sen\\s", "", Address),
      ifelse(is.na(Number), 1, Number),
      Area, 'Madrid', 'Spain',
      sep = ', '
    )),
    AreaComplete = str_to_title(paste(Area, 'Madrid', 'Spain', sep = ', '))
  )

skim(data1)
```

# Feature engineering
## Adding geolocalization

Using the project http://nominatim.openstreetmap.org, we converted the property addresses to latitude and longitude.\
This allows a better representation of the properties on a map, and also improves the **regression model**.


```{r}
# https://datascienceplus.com/osm-nominatim-with-r-getting-locations-geo-coordinates-by-its-address/
## geocoding function using OSM Nominatim API
## details: http://wiki.openstreetmap.org/wiki/Nominatim
## made by: D.Kisler 

# Function to geolocate
# Geocode one free-text address with the OSM Nominatim search API.
#
# address: character string to look up; NULL yields an empty result.
# Returns a data.frame with columns `address`, `lon` and `lat`, or an empty
# data.frame() when the address is NULL, the request / JSON parsing fails,
# or the service returns no match.
nominatim_osm <- function(address = NULL)
{
  if (is.null(address))
    return(data.frame())
  # Replace whitespace runs with %20 so the address fits in the URL path.
  url <- gsub('\\@addr\\@', gsub('\\s+', '\\%20', address),
              'http://nominatim.openstreetmap.org/search/@addr@?format=json&addressdetails=0&limit=1')
  # Assign the tryCatch() value: a return() inside the handler only leaves the
  # handler, so `d` must receive a value on failure as well.
  d <- tryCatch(
    jsonlite::fromJSON(url),
    error = function(e) NULL
  )
  if (length(d) == 0) return(data.frame())
  data.frame(address = address, lon = as.numeric(d$lon), lat = as.numeric(d$lat))
}

# Group full address
# Distinct full addresses, with the number of listings at each.
data_3_temp <- count(data1, AddressComplete)

# Geocoding by full address. The lookup was run once and cached on disk:
# system.time({
#   data_geopos_address <- lapply(data_3_temp$AddressComplete, nominatim_osm) %>%
#     bind_rows()
# })
# saveRDS(object = data_geopos_address, file = file.path('storage', 'data_geopos_address.RData'))
data_geopos_address <- readRDS(file.path('storage', 'data_geopos_address.RData'))

# Distinct areas of the listings whose full address was not geocoded.
data_3_temp <- data1 %>%
  filter(!AddressComplete %in% data_geopos_address$address) %>%
  count(AreaComplete)

# Geocoding by area. Also run once and cached on disk:
# system.time({
#   data_geopos_area <- lapply(data_3_temp$AreaComplete, nominatim_osm) %>%
#     bind_rows()
# })
# saveRDS(object = data_geopos_area, file = file.path('storage', 'data_geopos_area.RData'))
data_geopos_area <- readRDS(file.path('storage', 'data_geopos_area.RData'))

# Attach both sets of coordinates; the address-level position (.x) is used
# when present, otherwise the area-level one (.y).
data_3_temp <- data1 %>%
  left_join(data_geopos_address, by = c("AddressComplete" = "address")) %>%
  left_join(data_geopos_area, by = c("AreaComplete" = "address")) %>%
  mutate(
    Longitude = coalesce(lon.x, lon.y),
    Latitude = coalesce(lat.x, lat.y)
  ) %>%
  dplyr::select(-c(lon.x, lat.x, lon.y, lat.y))

```
Only `r data_3_temp %>% filter_at(vars(Outer, Elevator, Bedrooms, Floor), all_vars(!is.na(.))) %>% summarise(sum(is.na(Latitude))) %>% as.numeric` addresses were not possible to map.\

## Data selection

Because of the new latitude and longitude data, we are able to remove **Address**, **Number**, **Area** and **District** from the input dataset.\
\
The following summary is a representation of the **data input** for the **regression model**.\
\
We also removed `r data_3_temp %>% filter_at(vars(Outer, Elevator, Bedrooms, Floor), any_vars(is.na(.))) %>% nrow()` properties because at least one of the relevant variables (**Outer**, **Elevator**, **Bedrooms**, **Floor**) had no value available.

```{r}
# Columns that must be present for a listing to be kept.
required_columns <- c('Outer', 'Elevator', 'Bedrooms', 'Floor', 'Latitude', 'Longitude')

# data4: drop the type indicator columns ('Type' is more comprehensive) and
# every row with a missing value in a required column.
data4 <- data_3_temp %>%
  dplyr::select(-c(Penthouse, Cottage, Duplex, Semidetached)) %>%
  drop_na(all_of(required_columns))

# data5: model input. The location text is replaced by lat and lon, and the
# identifier and helper address columns are removed.
data5 <- dplyr::select(
  data4,
  -c(Address, Number, Area, District, Id, AddressComplete, AreaComplete)
)

skim(data5)
```

The variables **Outer**, **Elevator** and **Type** will be used as binary variables.\
The other variables (**Bedrooms**, **Sq.Mt**, **Floor**, **Longitude** and **Latitude**) will be used as continuous variables.\
In particular, **Bedrooms** and **Floor** are treated as continuous variables so that the regression yields a single coefficient for each, describing how "one unit of that variable affects the **Rent**".

# Frequency tables

The *frequency tables* represent the number of elements per combination of -for example-, having or not **Elevator** and type of property like **Ático**.\

```{r, eval=FALSE}
# Reference: https://dabblingwithdata.wordpress.com/2017/12/20/my-favourite-r-package-for-frequency-tables/
# This chunk is not evaluated (eval=FALSE): its output is not shown because of
# format and display problems in the R Markdown compilation.
# Each call cross-tabulates one column of data4 (rows) against the property
# Type (columns), with expected counts, table proportions and chi-square
# contributions switched off.

ct_elevator <- CrossTable(data4$Elevator, data4$Type, expected = FALSE, prop.t = FALSE, prop.chisq = FALSE)
ct_outer <- CrossTable(data4$Outer, data4$Type, expected = FALSE, prop.t = FALSE, prop.chisq = FALSE)
ct_bedrooms <- CrossTable(data4$Bedrooms, data4$Type, expected = FALSE, prop.t = FALSE, prop.chisq = FALSE)
ct_floor <- CrossTable(data4$Floor, data4$Type, expected = FALSE, prop.t = FALSE, prop.chisq = FALSE)
```

Frequency table between property **Type** and **Elevator**. Measured in number of properties.

```{r}
# Count of properties for each Elevator (rows) x Type (columns) combination.
knitr::kable(table(data4$Elevator, data4$Type))
```

Frequency table between property **Type** and **Outer**. Measured in number of properties.

```{r}
# Count of properties for each Outer (rows) x Type (columns) combination.
# This table previously repeated the Elevator table under the Outer heading.
table(data4$Outer, data4$Type) %>% 
  knitr::kable()
```

# Sankey

A visual way to represent the distribution of elements, is using the *sankey* plot.\
In particular, we are presenting 2 *interactive* representations.

```{r}
# https://www.displayr.com/how-to-create-sankey-diagrams-from-tables-using-r/
# install.packages("devtools")
# library(devtools)
# install_github("Displayr/flipPlots")
# Flow of properties across Type, Outer, Elevator, Bedrooms and Floor;
# links take the colour of their source node.
sankey_features <- dplyr::select(data1, Type, Outer, Elevator, Bedrooms, Floor)
SankeyDiagram(sankey_features, link.color = "Source")

# Flow from District to Type; links take the colour of their target node.
sankey_districts <- dplyr::select(data1, District, Type)
SankeyDiagram(sankey_districts, link.color = "Target", max.categories = 300)
```

The first plot shows that "Piso" is the predominant property **Type**.\
The second plot shows no predominant distribution of **Type** between the **Districts**.

# Densities

A density plot shows the distribution of a numeric variable.

```{r}
# data4 %>% 
#   ggplot(aes(x = Sq.Mt, color = Type)) +
#   # geom_histogram(aes(y=..density..)) +
#   geom_density(alpha=.2) +
#   labs(x = 'Log10(Sq.Mt)', y = 'Density')
# 
# data4 %>%
#   ggplot(aes(x = Bedrooms, color = Type)) +
#   # geom_histogram(aes(y=..density..)) +
#   geom_density(alpha=.2) +
#   labs(x = 'Bedrooms', y = 'Density')

# http://www.sthda.com/english/articles/32-r-graphics-essentials/133-plot-one-variable-frequency-graph-density-distribution-and-more/
theme_set(theme_ridges())

# Ridge plot of one numeric column of data4, with one ridge per property Type.
# column: name of the data4 column on the x axis (also used as the x label).
# plot_title: title of the plot.
ridges_by_type <- function(column, plot_title) {
  ggplot(data4, aes(x = .data[[column]], y = Type)) +
    geom_density_ridges(aes(fill = Type), alpha = .2) +
    labs(x = column, y = 'Density', title = plot_title) +
    theme(legend.position = "none")
}

ridges_by_type('Sq.Mt', 'Density distribution of the Sq.Mt per Type')

ridges_by_type('Bedrooms', 'Density distribution of the number of Bedrooms per Type')

ridges_by_type('Floor', 'Density distribution of the number of Floors per Type')
```

By definition, the **Estudios** have only one room, usually including both a sleeping area and a kitchen.\
The distribution of number of **Floors** is very sparse between the **Type** of properties.\

# Rent summary per factor

The following analysis presents the **Rent** distribution per different factors.

```{r}
# Rent statistics (mean, SD, count and quartiles) for each group of an
# already grouped table; the grouping is dropped from the result.
rent_stats <- function(grouped) {
  summarise(
    grouped,
    Mean = mean(Rent),
    SD = sd(Rent),
    N = n(),
    p0.25 = quantile(Rent, probs = 0.25),
    p0.50 = quantile(Rent, probs = 0.50),
    p0.75 = quantile(Rent, probs = 0.75),
    .groups = 'drop'
  )
}

# Rent statistics of data4 per level of the column named by `factor_`.
summary_factor <- function(factor_) {
  data4 %>%
    group_by(across(all_of(factor_))) %>%
    rent_stats()
}

summary_outer <- summary_factor('Outer')
summary_elevator <- summary_factor('Elevator')
summary_type <- summary_factor('Type')
summary_bedrooms <- summary_factor('Bedrooms')
summary_floor <- summary_factor('Floor')

# Same statistics with Sq.Mt cut into 6 equal-width ranges.
summary_sqmt <- data4 %>%
  group_by(Sq.MtRanges = cut(Sq.Mt, breaks = 6)) %>%
  rent_stats()

knitr::kable(summary_outer)
knitr::kable(summary_elevator)
knitr::kable(summary_type)
knitr::kable(summary_bedrooms)
knitr::kable(summary_sqmt)
```

A property that has **Outer** access/view, has a mean price decrease of `r scales::percent(filter(summary_outer, Outer == 'Yes')$Mean/filter(summary_outer, Outer == 'No')$Mean-1)`.\
There aren't any significant changes in the mean price by having **Elevator**.\
\
The **Floor** number of the property changes the price.\
For example, the floor 0 has a price of `r scales::percent(filter(summary_floor, Floor == 0)$Mean)`, in comparison with the floor 1, where the price increases `r scales::percent(filter(summary_floor, Floor == 1)$Mean/filter(summary_floor, Floor == 0)$Mean-1)`. This could be because of the presence of a patio.\
\
For simplicity, we also created some density plots to confirm the previous observations.\
We added a plot of the **Rent/Sq.Mt**, considering those two have the higher linear correlation.


```{r}
# data4 with Sq.Mt cut into 6 equal-width ranges, shared by both plots.
rent_by_size <- mutate(data4, Sq.MtRanges = cut(Sq.Mt, breaks = 6))

# Distribution of the rent per square metre within each size range.
ggplot(rent_by_size, aes(x = Rent/Sq.Mt, y = Sq.MtRanges)) +
  geom_density_ridges(aes(fill = Sq.MtRanges), alpha = .2) +
  labs(x = 'Rent/Sq.Mt', y = 'Sq.Mt range', title = 'Density distribution of Rent per Rent/Sq.Mt') +
  theme(legend.position = "none")

# Distribution of the total rent within each size range.
ggplot(rent_by_size, aes(x = Rent, y = Sq.MtRanges)) +
  geom_density_ridges(aes(fill = Sq.MtRanges), alpha = .2) +
  labs(x = 'Rent', y = 'Sq.Mt range', title = 'Density distribution of Rent') +
  theme(legend.position = "none")
```

From the plot, it can be observed that the bigger the area of the property, the lower the **Rent/Sq.Mt**.

# Regression

## Regression model

A linear model using *ordinary least squares* methods was used.\
To *fit* the model, we used `r dim(data5)[1]` rows and `r dim(data5)[2]-1` features. Three of those were categorical, so *R* converts them into factors and represents the encoded version.

```{r}
# Fit full model
# model_full <- glm(Rent ~ ., data = data1)
# https://stats.stackexchange.com/questions/181113/is-there-any-difference-between-lm-and-glm-for-the-gaussian-family-of-glm
# full.model1 <- lm(Rent ~ ., data = data1)
# Full model: Rent explained by every other column of data5, fitted as a
# Gaussian GLM with identity link.
model_full <- glm(
  formula = Rent ~ .,
  family = gaussian(link = "identity"),
  data = data5
)
summary(model_full)
```
The significance of all the variables is high, except for **Bedrooms** and **Outer**.\
\
We also added a *Stepwise Algorithm* to reduce the number of variables in the model, optimizing the *Akaike information criterion (AIC)*.\
The following is the summary of the final model where the Stepwise Algorithm has removed the variable *Bedrooms*.\


```{r}
# Stepwise selection by AIC starting from the full model; the per-step trace
# is not printed.
model_step <- stepAIC(model_full, trace = FALSE)
summary(model_step)
```

There is a small change in the model after the optimization.\
The features **Bedrooms** and **OuterYes** were removed from the final model. Let's not forget there is a high linear correlation between the **Sq.Mt** and the **Bedrooms**.\
\
The $R^{2}$ (computed as 1 - deviance / null deviance) is `r round(with(summary(model_step), 1 - deviance/null.deviance), 3)`. That $R^{2}$ value may be interpreted as the proportion of the variability / variation of Y (rent) explained by our regression.

### Checking main statistical assumptions

The basic linear regression model is built under the following statistical assumptions.

#### Homoscedasticity in residuals

The following plot shows there is no apparent *Heteroscedasticity*. Only a few datapoints are outside the range.

```{r}
# First diagnostic plot of the stepwise model.
plot(model_step, which = 1L)
```

#### Correlation

There is a high linear correlation between the **Rent** price and **Sq.Mt** (flat dimension of the property in square meters).\
Also between the **Rent** and the number of **Bedrooms**.\
The number of **Floors** of the property and the **Rent** has a small correlation, and also has less significance.\
\
There is only a visible correlation between the *Explanatory/Exogenous* variables **Bedrooms** and **Sq.Mt** ($R^{2} = 0.73$, with a high significance). As mentioned before, the *Stepwise Algorithm* has removed **Bedrooms**.

```{r}
# Correlation chart (with histograms) of the numeric columns of data5.
# Explicit alternative: dplyr::select(Bedrooms, Sq.Mt, Floor, Longitude, Latitude)
numeric_columns <- dplyr::select(data5, where(is.numeric))
chart.Correlation(numeric_columns, histogram = TRUE)
```

#### Correlation between residual observations

Lack of correlation between residual observations.\
The values are *scaled* and *centered*.


```{r}
# t0: difference between predicted and observed Rent, passed through scale().
# `t-1`: t0 shifted by one position with lag(), pairing each value with the
# previous one.
# NOTE(review): lag() presumably resolves to dplyr::lag because tidyverse is
# attached; confirm.
temp_lag <- tibble(t0 = scale(predict(model_step, data4) - data4$Rent),
                   `t-1` = lag(t0, 1))
# Scatter plot of the two columns, both axes limited to [-2, 2].
plot(temp_lag, main = 'Correlation between log residual observations', ylim=c(-2,2), xlim=c(-2,2))
```

#### Normality in residuals

We cannot confirm the normality in the residuals.\
The reason could be the omission of relevant variables.

```{r}
# Residuals are taken throughout this chunk as predicted minus observed Rent.
# Histogram of those residuals.
hist(predict(model_step, data4) - data4$Rent, main = 'Histogram of the residuals')

# descdist() comes from fitdistrplus (presumably draws a Cullen and Frey
# graph; confirm). Its return value is kept in `temp`.
temp <- descdist(predict(model_step, data4) - data4$Rent)

# Shapiro-Wilk normality test on the same residuals.
shapiro.test(predict(model_step, data4) - data4$Rent)
```

From the output of the **Shapiro-Wilk normality test**, the *p-value < 0.05* implying that the distribution of the data is significantly different from *normal distribution*. In other words, we cannot assume the normality.\
More variables should be included in the model to improve the capabilities.

## Impact of removing one feature at a time

The following is an exercise of removing one feature at a time and comparing the *linear regression* coefficients.

```{r}
# Refit the Gaussian GLM once with every feature and once per feature left
# out, then tabulate the coefficients (one row per fit).
# Row labels and fitted formulas both come from the same `features` vector,
# so a label cannot point at the wrong model if the columns of data5 change
# or are reordered.
response <- 'Rent'
features <- setdiff(colnames(data5), response)

# Coefficients of the GLM fitted on all features except `dropped`
# (NULL keeps every feature).
fit_without <- function(dropped = NULL) {
  kept <- setdiff(features, dropped)
  coef(glm(reformulate(kept, response = response),
           data = data5, family = gaussian(link = "identity")))
}

# First row: full model; following rows: one model per removed feature.
coefficient_rows <- c(list(fit_without()), lapply(features, fit_without))

bind_cols(
  FeatureRemoved = c('None', features),
  do.call(bind_rows, coefficient_rows)
) %>% 
  knitr::kable()
```

The removal of **Sq.Mt** makes a big change in the coefficients.\
Also, the coefficient of **Sq.Mt**, `r coef(glm(Rent ~ ., data = data5, family = gaussian(link = "identity")))[['Sq.Mt']]`, stays almost unchanged when the other features are removed.
This number represents the change in **Rent** produced by a change of 1 unit of **Sq.Mt**, as long as the other variables remain constant.\
That means 1 extra square meter of property will increase the rent price by `r coef(glm(Rent ~ ., data = data5, family = gaussian(link = "identity")))[['Sq.Mt']]` EUR, as long as the other variables remain constant.

## Model summary table: Scaled

Scaling the variables, allows the reader to compare the different variables in the model.

```{r}
# Refit the stepwise formula on data5 with every numeric column passed
# through scale().
data_7_temp <- mutate(data5, across(where(is.numeric), scale))
model_step_scaled <- glm(
  formula = formula(model_step),
  family = gaussian(link = "identity"),
  data = data_7_temp
)
summary(model_step_scaled)
```

As a summary, the coefficients of the model are the following.\
This analysis, compares the measured and unitless (*scaled*) variables.

```{r}
# First version of the summary: coefficient tables of the unit-scale and the
# scaled model side by side.

# Coefficient matrix of a fitted model as a data frame; the term names move
# into a `rowname` column.
coefficient_frame <- function(model) {
  rownames_to_column(data.frame(summary(model)$coefficients))
}

summary_unit <- coefficient_frame(model_step)
summary_scaled <- coefficient_frame(model_step_scaled) %>%
  rename_with(~ paste(.x, 'scaled', sep = '_'))

summary_unit %>%
  bind_cols(summary_scaled[c('Estimate_scaled', 'Std..Error_scaled', 't.value_scaled')]) %>%
  magrittr::set_colnames(c('Feature', 'Estimate', 'Std. Error', 't value', 'Pr(>|t|)', 'Estimate scaled', 'Std. Error scaled', 't value scaled')) %>%
  knitr::kable()

# Second version of the summary.
# x: model on the original scale; x_scaled: the same model on scaled data;
# level: confidence level of the intervals.
# Returns the coefficient matrix of `x` followed by the scaled estimates
# (column "Estimate Std.") and the confidence limits of `x`.
regression_table <- function(x, x_scaled, level = 0.95) {
  unit_coefficients <- summary(x)$coefficients
  scaled_estimates <- summary(x_scaled)$coefficients[, 'Estimate']
  intervals <- confint.default(x, level = level)
  combined <- cbind(unit_coefficients, scaled_estimates, intervals)
  colnames(combined)[5] <- "Estimate Std."
  combined
}

#regression_table(model_step, model_step_scaled) %>% 
#  knitr::kable()
```

The *scaling* allows us to check the importance of a variable; for example, **TypeChalet adosado** is the one that produces the highest variation of the **Rent**, and also has a *negative* relation (as long as the other variables remain constant). See column *Estimate scaled*.\
The second one is **Sq.Mt**, affecting rent in a positive way (an increase in $m^2$ leads to an increase in rent).

```{r}
# Estimates of the scaled model, ordered by absolute size; Variable becomes
# a factor in that order so the bars are drawn sorted.
standardized <- summary(model_step_scaled)$coefficients[, 'Estimate']
data_8_temp <- data.frame(
  Variable = names(standardized),
  StandardizedCoefficient = as.numeric(standardized)
) %>%
  arrange(abs(StandardizedCoefficient)) %>%
  mutate(Variable = factor(Variable, levels = Variable))

# Horizontal bar chart of the standardized coefficients.
ggplot(data_8_temp, aes(x = Variable, y = StandardizedCoefficient)) +
  geom_col() +
  coord_flip()
```
The previous plot shows the importance per variable in the same unit, where **Type Chalet adosado** has the biggest impact on **Rent**, as long as the other variables remain constant.\
The sign tells whether the variable affects the **Rent** in a positive or negative way, that is, whether an increase of one unit of the variable increases or decreases the **Rent**.

# Good opportunities

Following, there is a list of the top 15 properties with *Real* price below their theoretical *Predicted* price.\
The **Ratio** captures how many times better the *Real* price is compared to the *Predicted*.\
The **Rent** is in *EUR/month*.

```{r}
# Good opportunities: listings whose asking rent is below the rent predicted
# by the stepwise model.

# Prediction: model estimate; Ratio: predicted rent relative to asking rent,
# minus 1 (positive when the asking rent is below the prediction).
data_6_temp <- data4
data_6_temp[['Prediction']] <- predict(model_step, newdata = data_6_temp)
data_6_temp[['Ratio']] <- data_6_temp[['Prediction']] / data_6_temp[['Rent']] - 1

# The 15 listings with the largest positive Ratio, best first.
data_6_temp %>%
  filter(Ratio > 0) %>%
  top_n(n = 15, wt = Ratio) %>%
  transmute(Id, Type, AddressComplete, Rent, Ratio = round(Ratio, 2)) %>%
  arrange(desc(Ratio)) %>%
  knitr::kable()
```

## Map

The most relevant renting opportunities per **Type** are presented below.

```{r}
# Labels for the map ----
# One HTML snippet per listing: type, location, asking and predicted rent, id.
label_text <- with(data_6_temp, paste(
  '<strong>', Type, '</strong>', '<br/>',
  'Location:', AddressComplete, '<br/>',
  'Rent price:', round(Rent, 1), 'EUR/month', '<br/>',
  'Theoretical rent price:', round(Prediction, 1), 'EUR/month', '<br/>',
  'Id:', Id
))
data_6_temp$Label <- lapply(label_text, HTML)
```

### Piso
Interesting opportunities for **Piso** or *Flat*.\
Only showing the ones we were able to geolocate (assign latitude and longitude), and also where the ratio between estimated rent price and real price is greater than 1x.

```{r}
# 'Piso' listings whose predicted rent is more than double the asking rent
# (Ratio > 1).
data_7_temp <- filter(data_6_temp, Ratio > 1, Type == 'Piso')

# Colour scale: 20 magma colours spanning the observed Ratio range.
# https://cran.r-project.org/web/packages/viridis/vignettes/intro-to-viridis.html
pallete_ <- colorNumeric(palette = magma(20), domain = range(data_7_temp$Ratio))

leaflet(data_7_temp) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = ~Longitude, lat = ~Latitude,
    color = ~pallete_(Ratio),
    label = ~Type,
    popup = ~Label
  )
```

### Ático
Interesting opportunities for **Ático**.\
Additionally, we are adding *Penthouse* opportunities.

```{r}
# 'Ático' listings whose predicted rent is more than double the asking rent
# (Ratio > 1).
data_7_temp <- filter(data_6_temp, Ratio > 1, Type == 'Ático')

# Colour scale: 20 magma colours spanning the observed Ratio range.
# https://cran.r-project.org/web/packages/viridis/vignettes/intro-to-viridis.html
pallete_ <- colorNumeric(palette = magma(20), domain = range(data_7_temp$Ratio))

leaflet(data_7_temp) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = ~Longitude, lat = ~Latitude,
    color = ~pallete_(Ratio),
    label = ~Type,
    popup = ~Label
  )
```

### Estudio
Interesting opportunities for **Estudio**.\
These properties are single-storey, meaning the room is on one floor only.

```{r}
# 'Estudio' listings whose predicted rent is more than double the asking rent
# (Ratio > 1).
data_7_temp <- filter(data_6_temp, Ratio > 1, Type == 'Estudio')

# Colour scale: 20 magma colours spanning the observed Ratio range.
# https://cran.r-project.org/web/packages/viridis/vignettes/intro-to-viridis.html
pallete_ <- colorNumeric(palette = magma(20), domain = range(data_7_temp$Ratio))

leaflet(data_7_temp) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = ~Longitude, lat = ~Latitude,
    color = ~pallete_(Ratio),
    label = ~Type,
    popup = ~Label
  )
```