-
Notifications
You must be signed in to change notification settings - Fork 0
/
historial.Rhistory
107 lines (107 loc) · 5.33 KB
/
historial.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Read the raw listings export; the file is semicolon-delimited.
airbnb <- read.csv("airbnb-listings.csv", sep = ";")

# Set the repr.plot.* display options (height, width, resolution).
options(repr.plot.height = 4, repr.plot.width = 6, repr.plot.res = 300)
library(dplyr)
head(airbnb)

# Keep only "Entire home/apt" listings in Madrid that have a non-empty
# neighbourhood, restricted to the columns used by the rest of the script.
df_madrid <- airbnb[
  with(
    airbnb,
    Room.Type == "Entire home/apt" & Neighbourhood != "" & City == "Madrid"
  ),
  c(
    "Neighbourhood", "Accommodates", "Bathrooms", "Bedrooms", "Beds",
    "Price", "Square.Feet", "Guests.Included", "Extra.People",
    "Review.Scores.Rating", "Latitude", "Longitude"
  )
]
head(df_madrid)

# Derive Square.Meters from Square.Feet (conversion factor 0.092903).
df_madrid[["Square.Meters"]] <- 0.092903 * df_madrid[["Square.Feet"]]
df_madrid[, c("Square.Feet", "Square.Meters")]
head(df_madrid)
# Percentage of rows in df_madrid whose Square.Meters is missing (NA).
entire_home_NA <- sum(is.na(df_madrid$Square.Meters))
porcent <- (entire_home_NA / nrow(df_madrid)) * 100
cat(porcent)

# Percentage of zero-area rows among the rows that DO report an area.
# The denominator is the count of non-NA values; it was previously set to
# the NA count, which made the ratio meaningless.
entire_home_real <- sum(!is.na(df_madrid$Square.Meters))
# na.rm = TRUE skips the NA comparisons, so only true zeros are counted.
home_cero <- sum(df_madrid$Square.Meters == 0, na.rm = TRUE)
porcent_real <- (home_cero / entire_home_real) * 100
cat(porcent_real)
# Mark empty-string or zero areas as missing. which() keeps only the
# positions where the comparison is TRUE, so existing NA entries are
# simply left as they are.
df_madrid$Square.Meters <- replace(
  df_madrid$Square.Meters,
  which(df_madrid$Square.Meters == "" | df_madrid$Square.Meters == 0),
  NA
)
cat(df_madrid$Square.Meters)

# Histogram of the remaining areas, 10-unit bins.
library(ggplot2)
ggplot(data = df_madrid, mapping = aes(x = Square.Meters)) +
  geom_histogram(binwidth = 10, fill = "red", color = "white") +
  labs(
    title = "Histograma de Metros Cuadrados",
    x = "Metros Cuadrados",
    y = ""
  )

# Areas below 20 are also set to missing.
df_madrid$Square.Meters <- replace(
  df_madrid$Square.Meters,
  which(df_madrid$Square.Meters < 20),
  NA
)
cat(df_madrid$Square.Meters)
library(dplyr)

# Distinct neighbourhood values present in df_madrid.
df_madrid_Neighbourhood <- unique(df_madrid$Neighbourhood)

# For each neighbourhood, TRUE when every Square.Meters value selected for
# it is NA (i.e. the neighbourhood has no usable area at all).
sin_metros <- vapply(
  df_madrid_Neighbourhood,
  function(barrio) {
    all(is.na(df_madrid$Square.Meters[df_madrid$Neighbourhood == barrio]))
  },
  logical(1)
)

# Neighbourhoods to discard, then drop their rows from df_madrid.
Neighbourhood_0 <- df_madrid_Neighbourhood[sin_metros]
df_madrid <- df_madrid[!(df_madrid$Neighbourhood %in% Neighbourhood_0), ]
summary(df_madrid)
# Tukey HSD on a one-way aov of Square.Meters by Neighbourhood; the
# per-pair results are kept as a data frame.
tky <- TukeyHSD(aov(formula = Square.Meters ~ Neighbourhood, data = df_madrid))
tky.result <- data.frame(tky$Neighbourhood)

# Square matrix of adjusted p-values, rows/columns labelled with the
# sorted neighbourhood names.
cn <- sort(unique(df_madrid$Neighbourhood))
resm <- matrix(
  NA,
  nrow = length(cn),
  ncol = length(cn),
  dimnames = list(cn, cn)
)

# Fill the lower triangle with the rounded p-values, mirror it into the
# upper triangle, and set the diagonal to 1.
# NOTE(review): this assumes the TukeyHSD pairs come in the same order in
# which lower.tri() enumerates cells -- confirm.
resm[lower.tri(resm)] <- round(tky.result$p.adj, digits = 4)
resm[upper.tri(resm)] <- t(resm)[upper.tri(resm)]
diag(resm) <- 1

# Heatmap of the p-value matrix (long format via reshape2::melt).
library(ggplot2)
library(reshape2)
dfResm <- melt(resm)
ggplot(dfResm, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(colour = "black") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  xlab("Neighbourhood") +
  ylab("Neighbourhood") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1)
  )
library(factoextra)

# Use 1 - p-value as the dissimilarity between neighbourhoods and build a
# complete-linkage hierarchical clustering on it.
distancia <- as.dist(1 - resm)
clust_madrid <- hclust(distancia, method = "complete")
dendrogram <- as.dendrogram(clust_madrid)

# Number of clusters, shared by the dendrogram plot and the tree cut so
# the two can never disagree.
n_clusters <- 38

fviz_dend(
  clust_madrid,
  cex = 0.8, lwd = 0.8, k = n_clusters,
  rect = TRUE,
  k_colors = "jco",
  rect_border = "jco",
  rect_fill = TRUE,
  ggtheme = theme_gray()
)

# Cut the tree into cluster ids. cutree() names its result with the tree
# labels (the row names of resm), so the neighbourhood for each id is taken
# from names(cluster). Pairing the ids with unique(df_madrid$Neighbourhood)
# instead would mix them up, because that vector is in order of appearance
# while the labels are sorted.
cluster <- cutree(clust_madrid, k = n_clusters)
df_madrid_cluster <- data.frame(
  Neighbourhood = names(cluster),
  neighb_id = unname(cluster)
)

# Attach the cluster id to every listing, keeping all rows of df_madrid.
df_madrid <- merge(df_madrid, df_madrid_cluster, by = "Neighbourhood", all.x = TRUE)
head(df_madrid)
# Seeded random split: 70% of the row indices go to training, the rest to
# testing.
set.seed(42)
idx <- sample(seq_len(nrow(df_madrid)), size = nrow(df_madrid) * 0.7)

# Build both partitions and drop the Neighbourhood and Square.Feet columns
# from each.
train.df <- select(df_madrid[idx, ], !c(Neighbourhood, Square.Feet))
test.df <- select(df_madrid[-idx, ], !c(Neighbourhood, Square.Feet))

cat("Número de valores:", nrow(train.df), "\n")
cat("Número de valores de datos de prueba:", nrow(test.df), "\n")
summary(train.df)

# Pairwise correlations between the numeric columns, using every pair of
# observations that is complete for the two columns involved.
cor(
  train.df[, c(
    "Accommodates", "Square.Meters", "Bathrooms", "Bedrooms", "Beds",
    "Price", "Guests.Included", "Extra.People"
  )],
  use = "pairwise.complete.obs"
)
# Full linear model for Square.Meters, fitted on the training partition.
# (The original passed `data = df_train`, an object that is never created;
# the training data frame is `train.df`.)
modelo1 <- lm(
  Square.Meters ~ Accommodates + Bathrooms + Bedrooms + Beds + Price +
    Guests.Included + Extra.People + Review.Scores.Rating +
    Latitude + Longitude,
  data = train.df
)
summary(modelo1)

# Reduced model with three predictors, fitted on the same training data.
modelo2 <- lm(Square.Meters ~ Accommodates + Bedrooms + Price, data = train.df)
summary(modelo2)

# Residuals on the TEST partition: observed minus predicted.
# residuals() has no `newdata` argument and silently ignores it, returning
# the training residuals, so the test residuals are computed explicitly.
# Rows with a missing response or predictor yield NA residuals.
residuo1 <- test.df$Square.Meters - predict(modelo1, newdata = test.df)
residuo2 <- test.df$Square.Meters - predict(modelo2, newdata = test.df)

# Histograms of the test residuals of each model.
library(ggplot2)
ggplot(data.frame(residuo = residuo1), aes(x = residuo)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.5) +
  labs(title = "Histograma de residuos de modelo1", x = "Residuos", y = "")
ggplot(data.frame(residuo = residuo2), aes(x = residuo)) +
  geom_histogram(binwidth = 1, fill = "darkgreen", color = "black", alpha = 0.5) +
  labs(title = "Histograma de residuos de modelo2", x = "Residuos", y = "")
# Single hypothetical listing to predict the area for. Latitude and
# Longitude are set to the mean coordinates of df_madrid.
# The original negated the mean longitude, which flips the sign of the
# coordinate and moves the listing away from the data the model was
# fitted on; the mean is used as-is here.
df_anuncio <- data.frame(
  Accommodates = 6, Bathrooms = 1, Bedrooms = 3, Beds = 3,
  Price = 80, Guests.Included = 3, Review.Scores.Rating = 80,
  Extra.People = 1,
  Latitude = mean(df_madrid$Latitude, na.rm = TRUE),
  Longitude = mean(df_madrid$Longitude, na.rm = TRUE),
  neighb_id = "1"
)

# Prediction and per-bedroom coefficient from modelo1.
# (The messages previously referenced `metros_estimados`, which is never
# defined; each one now prints the prediction of its own model.)
metros_estimados_mod1 <- predict(modelo1, df_anuncio)
paste("Metros cuadrados apartamento con modelo1:", metros_estimados_mod1)
metros_habitacion_adicional <- (modelo1$coefficients["Bedrooms"])
paste("Metros por habitación adicionalcon modelo1:", metros_habitacion_adicional)

# Prediction and per-bedroom coefficient from modelo2.
metros_estimados_mod2 <- predict(modelo2, df_anuncio)
paste("Metros cuadrados apartamento con modelo2:", metros_estimados_mod2)
metros_habitacion_adicional <- (modelo2$coefficients["Bedrooms"])
paste("Metros por habitación adicionalcon modelo2:", metros_habitacion_adicional)
# Copy of df_madrid in which every missing Square.Meters is replaced by
# the modelo1 prediction for that row.
df_madrid_nuevo <- df_madrid
filas_sin_metros <- is.na(df_madrid$Square.Meters)
df_madrid_nuevo$Square.Meters[filas_sin_metros] <- predict(
  modelo1,
  df_madrid[filas_sin_metros, ]
)
head(df_madrid_nuevo)

# Save the interactive session history to disk.
savehistory("~/Documents/Documentos - MacBook Pro de Isabel (2)/Keepcoding/Cursos/0. Curso Big Data/12. data-mining/practica/Practica.Rhistory")