2020-08-29 16:24:51 +02:00

932 lines
16 KiB
Plaintext

---
title: "Statistical Learning Project"
output:
html_document:
df_print: paged
toc: true
toc_float:
collapsed: false
smooth_scroll: false
pdf_document:
keep_tex: yes
latex_engine: xelatex
toc: true
---
\ \
<strong > Author: Andrea Ierardi </strong >
\ \
<strong > Repository </strong > : Code [link](https://github.com/Andreaierardi/Statistical-Learning-Project)
\ \
### Assignment
The exam consists in two assignments, one on the first part(regression, tree, neural nets) and the second part (unsupervised learning). For both you must prepare a writing report using one or more techniques and comparing their performance on one or more data set chosen by the student.
\ \
# Libraries
```{r}
library(knitr)
library(ggplot2)
library(plotly)
library(tidyr)
library(dplyr)
library(png)
library(ggpubr)
library(tidyverse)
library(caTools)
library(caret)
library(tree)
library(MASS)
library(randomForest)
library(ranger)
library(tuneRanger)
library(keras)
library(kableExtra)
library(clustMixType)
library(cluster)
library(dendextend)
library(readr)
library(factoextra)
library(FactoMineR)
library(PCAmixdata)
```
\ \
# Dataset
Link of the [dataset](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data)
```{r}
ds = read.csv("AB_NYC_2019.csv")
```
\ \
# Data Inspection
```{r}
head(ds)
summary(ds)
```
```{r}
cat("Shape of the dataset:" ,dim(ds))
```
```{r}
img <- readPNG("map.png")
map_ds = ggplot() + background_image(img)+ geom_point(data = ds, aes(y=latitude,x = longitude, color = price))
map_ds
```
\ \
# Data cleaning
\ \
## Check for NA and NULL values
```{r}
apply(ds,2,function(x) sum(is.na(x)))
```
\ \
## Variable selection
```{r}
dataset = ds %>% dplyr::select(neighbourhood_group,latitude, longitude, room_type,price)
head(dataset)
```
\ \
## Variable scaling
```{r}
scale_data = function(df)
{
df = df %>%filter( price >= 15 & price <= 500)
numerical = c("price")
numerical2 = c("latitude","longitude")
categorical = c("room_type", "neighbourhood_group")
for( cat in categorical )
{
df[cat] = factor(df[[cat]],
level = unique(df[[cat]]),
labels = c(1:length(unique(df[[cat]])) ))
}
df[numerical] = as.numeric(scale(df[numerical]))
df2 = df
df2[numerical2] = as.numeric(scale(df2[numerical2]))
df3 = list()
df3$df2 = df2
df3$df = df
return(df3)
}
dataframe = scale_data(dataset)
dataset = dataframe$df
data = dataframe$df2
head(dataset)
summary(dataset)
```
\ \
## Data visualisation after the scaling
```{r}
mappa = ggplot() + background_image(img)+ geom_point(data = dataset, aes(y=latitude,x = longitude, color = price))
mappa
```
\ \
# Data split
## Split data in subsets for each neighbourhood_group and room_type
```{r}
neighbourhoods = unique(dataset$neighbourhood_group)
rooms = unique(dataset$room_type)
clust_data = vector("list")
lis_n = vector("list")
for (n in neighbourhoods)
{
tmp = dataset %>%filter( neighbourhood_group == n)
lis_n[[n]] = tmp[-1]
tmp2 = data %>%filter( neighbourhood_group == n)
clust_data[[n]] = tmp2[-1]
clust_data[[n]]$room_type = factor(clust_data[[n]]$room_type , level = unique(clust_data[[n]]$room_type) , labels= unique(ds$room_type))
}
lis_r_n= vector("list")
for (n in neighbourhoods)
{
for(r in rooms)
{
tmp = dataset %>%
filter( room_type == r & neighbourhood_group == n)
lis_r_n[[paste0("n",n,"-","r",r)]]= tmp[-1][-3]
tmp2 = data %>%
filter( room_type == r & neighbourhood_group == n)
clust_data[[paste0("n",n,"-","r",r)]]= tmp2[-1][-3]
}
}
data$neighbourhood_group = factor(data$neighbourhood_group , level = unique(data$neighbourhood_group ) , labels= unique(ds$neighbourhood_group))
data$room_type = factor(data$room_type , level = unique(data$room_type ) , labels= unique(ds$room_type))
clust_data[["all"]] = data
```
\ \
## Split in train and test for each subset
```{r}
trains = vector("list")
tests = vector("list")
datas = vector("list")
for (i in names(lis_n))
{
sample = sample.split(lis_n[[i]], SplitRatio = .75)
train = subset(lis_n[[i]], sample == TRUE)
test = subset(lis_n[[i]], sample == FALSE)
trains[[i]]= train
tests[[i]] = test
datas[[i]] = lis_n[[i]]
}
for (i in names(lis_r_n))
{
sample = sample.split(lis_r_n[[i]], SplitRatio = .75)
train = subset(lis_r_n[[i]], sample == TRUE)
test = subset(lis_r_n[[i]], sample == FALSE)
trains[[i]]= train
tests[[i]] = test
datas[[i]] = lis_r_n[[i]]
}
sample = sample.split(dataset, SplitRatio = .75)
train = subset(dataset, sample == TRUE)
test = subset(dataset, sample == FALSE)
trains[["all"]] = train
tests[["all"]] = test
datas[["all"]] = dataset
```
\ \
# MODELS TRAIN
\ \
```{r}
model_lis = vector("list")
```
\ \
## LINEAR REGRESSION
```{r}
lin_reg = vector("list")
for (sub in names(trains))
{
lin_reg[[sub]]$fit =lm.fit = lm(price~., data = trains[[sub]])
lin_reg[[sub]]$summary = summary(lm.fit)
lin_reg[[sub]]$pred = pr.lm = predict(lm.fit,tests[[sub]])
lin_reg[[sub]]$MSE = sum((pr.lm - tests[[sub]]$price)^2)/nrow(tests[[sub]])
print(paste0("========== ",sub, " =========="))
print(summary(lm.fit))
cat("\n\n")
}
lin_reg$name = "Linear Regression"
model_lis$linear_regression= lin_reg
```
\ \
## DECISION TREE
```{r}
dec_tree = vector("list")
for (sub in names(trains))
{
dec_tree[[sub]]$fit = tree_res=tree(price~., data = trains[[sub]])
dec_tree[[sub]]$summary = sum = summary(tree_res)
print(paste0("========== ",sub, " =========="))
print(sum)
if(sum$size > 1 )
{
plot(tree_res)
text(tree_res,pretty=0)
title(paste0("Tree of: ",sub))
}
else
{
cat("Not possible to plot tree: ", sub)
}
dec_tree[[sub]]$pred = pred = predict(tree_res,tests[[sub]])
dec_tree[[sub]]$MSE = mse = sum((pred - tests[[sub]]$price)^2)/nrow(tests[[sub]])
print(mse)
cat("\n\n")
}
dec_tree$name = "Decision Tree"
model_lis$decision_tree = dec_tree
```
\ \
## RANDOM FOREST
```{r}
rf = vector("list")
for (sub in names(trains))
{
rf[[sub]]$fit = res = randomForest( price ~ . , data=trains[[sub]])
rf[[sub]]$pred = predt = predict(res,tests[[sub]])
print(paste0("========== ",sub, " =========="))
print(res)
rf[[sub]]$MSE = mse = sum((predt - tests[[sub]]$price)^2)/nrow(tests[[sub]])
print(paste0("MSE: ",mse))
cat("\n\n")
}
rf$name = "Random Forest"
model_lis$random_forest = rf
```
\ \
## RANGER RANDOM FOREST
```{r}
ranger_rf = vector("list")
for (sub in names(trains))
{
print(paste0("========== ",sub, " =========="))
ranger_rf[[sub]]$fit = res = ranger( price~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
ranger_rf[[sub]]$pred = predt = predict(res,tests[[sub]])
ranger_rf[[sub]]$MSE = mse = sum((predt$predictions - tests[[sub]]$price)^2)/nrow(tests[[sub]])
print(res)
print(paste0("MSE: ",mse))
cat("\n\n")
}
ranger_rf$name = "Ranger Random Forest"
model_lis$ranger = ranger_rf
```
\ \
## NEURAL NETWORKS
```{r}
build_model <- function(dimension) {
model <- keras::keras_model_sequential() %>%
layer_dense(units = 32, activation = "relu",
input_shape = dimension) %>%
layer_dense(units = 16, activation = "relu") %>%
layer_dense(units = 1,activation="linear")
model %>% compile(
loss = "mse",
optimizer = optimizer_rmsprop(),
metrics = list("mean_absolute_error")
)
return(model)
}
print_dot_callback <- callback_lambda(
on_epoch_end = function(epoch, logs) {
if (epoch %% 1 == 0)
cat(".")
}
)
nn = vector("list")
for (sub in names(trains))
{
d = trains[[sub]]
d2 = tests[[sub]]
len = length(d)
len2 = length(d2)
if(!is.null(d$room_type))
{
d$room_type = keras::to_categorical(d$room_type)
d2$room_type = keras::to_categorical(d2$room_type)
}
if(!is.null(d$neighbourhood_group))
{
d$neighbourhood_group = keras::to_categorical(d$neighbourhood_group)
d2$neighbourhood_group = keras::to_categorical(d2$neighbourhood_group)
}
target = as.vector(d$price)
features = as.matrix(as_tibble(d[-len]))
target_test = as.vector(d2$price)
features_test = as.matrix(as_tibble(d2[-len]))
nn[[sub]]$epochs = epochs = 30
nn[[sub]]$model = model = build_model(dim(features)[2])
nn[[sub]]$summary = model %>% summary()
nn[[sub]]$history = hist = model %>% fit(
x = features,
y = target,
epochs = epochs,
validation_split = 0.2,
verbose = 0,
callbacks = list(print_dot_callback)
)
eva = model %>% evaluate(features_test,target_test, verbose = 0)
nn[[sub]]$mae = eva[1]
nn[[sub]]$loss = eva[2]
nn[[sub]]$pred = pred = model %>% predict(features_test)
nn[[sub]]$MSE = sum((pred - target_test)^2)/length(target_test)
}
nn$name = "Neural Networks"
model_lis$neural_networks = nn
```
\ \
## NN plots
```{r}
for (sub in names(nn))
{
if(sub != "name")
{
m = nn[[sub]]
hist = m$history
print(paste0("========== ",sub, " =========="))
str = paste0("MAE: ",round(m$mae,3)," --- Loss: ",round(m$loss,3))
p = plot(hist, y ~ x) + theme_bw(base_size = 12) +ggtitle(paste0("NN of: ",sub))+ labs(caption= str)
print(p)
plot(p)
cat("MAE: ",m$mae)
cat("\nLoss: ",m$loss,"\n\n")
}
}
```
\ \
# Comparison between the models
\ \
```{r}
conc = c()
for (n in unique(ds$neighbourhood_group))
{
conc = c(conc,n)
}
for (n in unique(ds$neighbourhood_group))
{
for(r in unique(ds$room_type))
{
conc = c(conc, paste(n,"-",r))
}
}
conc = c(conc,"All")
n = names(nn)
cols = vector("list")
cols[["Subset"]] = conc
for( m in model_lis)
{
col = c()
for( nam in n)
{
if( nam != "name")
{
col = c(col,m[[nam]]$MSE)
}
}
cols[[m$name]] = col
}
mse_df = as.data.frame(cols)
best = c()
for (i in 1:length(cols$Subset))
{
m = min(mse_df[i,2:6])
col = which(mse_df[i,1:6 ] == m)
nam = names(mse_df)[col]
best = c(best, nam)
}
mse_df$Best = best
#kableExtra::kable(mse_df)%>%
# kable_styling(bootstrap_options = "striped", full_width = F,font_size = 20) %>%
## row_spec(0, bold = T, color = "white", background = "#D7261E")%>%
## column_spec(1, bold = T, border_right = T,color = "white", background = "#191970")%>%
# column_spec(2:6,extra_css="text-align:Center")
mse_df
```
\ \
\ \
# Clustering and Groups
\ \
## Clustering for Mixed type of data
```{r}
clust_num = 5
get_clusters = function(dts, num, dim_plot,name)
{
l = list()
if(is.null(dts$room_type))
{
l$cl = kmeans(dts,num)
}
else
{
l$cl = kproto(dts,num)
}
clust = list()
for (i in 1:num)
{
indexes = l$cl$cluster == i
clust[[i]] = dts[indexes,]
}
min_lat = dim_plot[1]
max_lat = dim_plot[2]
min_long = dim_plot[3]
max_long= dim_plot[4]
myplot= ggplot() + background_image(img)+ xlab('Longitude') + ylab('Latitude')+ theme(plot.margin = unit(c(1,1,1,1),"cm"),legend.title = element_text(colour="blue", size=10, face="bold")) + xlim(min_long, max_long) + ylim(min_lat,max_lat) +ggtitle(paste0("Clusters of ", name ))
count = 1
l$clust_plot = vector("list")
for(el in clust)
{
myplot = myplot+ geom_point(data = el, aes(y = latitude, x = longitude),color= count)
p =ggplot()+ background_image(img)+geom_point(data = el, aes(y = latitude, x = longitude),color= count) + xlim(min_long, max_long) + ylim(min_lat,max_lat) + ggtitle(paste0("Cluster: ", count," of ", name ))
l$clust_plot[[as.character(count)]] = p
l$summary[[as.character(count)]] = summary(el)
count= count+1
}
l$myplot = myplot
return(l)
}
get_all_cluster = function(clust_data, clust_num, dim)
{
lis = vector("list")
plots = vector("list")
cnt= 1
for(sub in names(clust_data))
{
lis[[sub]] = get_clusters(clust_data[[sub]], clust_num, dim, sub)
plots[[cnt]] = lis[[sub]]$myplot
cnt = cnt+1
}
lis[["all_plots"]]= plots
return(lis)
}
borders = c(min(clust_data$all$latitude) ,max(clust_data$all$latitude), min(clust_data$all$longitude), max(clust_data$all$longitude))
all_cluster = get_all_cluster(clust_data,5, borders)
```
```{r}
multiplots <- function(plotlist, file=NULL,cols = 2, layout = NULL) {
require(grid)
plots <- c(plotlist)
numPlots = length(plots)
if (is.null(layout)) {
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols),byrow =T)
}
if (numPlots == 1) {
print(plots[[1]])
} else {
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
all_cluster$all_plots[[21]]
all_cluster$all$summary
```
```{r}
multiplots(all_cluster$all_plot[1:5], cols=3)
```
```{r}
multiplots(all_cluster$all_plot[6:11], cols=3)
```
```{r}
multiplots(all_cluster$all_plot[12:17], cols=3)
```
```{r}
multiplots(all_cluster$all_plot[c(18:20,22,23)], cols=3)
```
\ \
## Hierarchical Cluster Analysis
```{r}
agg = aggregate(price ~neighbourhood_group+room_type, clust_data$all , mean)
name_hc = c()
for (n1 in substr(unique(agg$neighbourhood_group),1,5))
{
for(n2 in substr(unique(agg$room_type),1,3))
{
name_hc = c(name_hc, paste0(n1,"/",n2))
}
}
rownames(agg) = name_hc
agg
gower <- daisy(agg, metric = "gower")
hc1 <- hclust(gower, method = "complete" )
plot(hc1, cex = 0.6, hang = -1)
```
```{r}
avg_dend_obj <- as.dendrogram(hc1)
avg_col_dend <- color_branches(avg_dend_obj, h = 0.6)
plot(avg_col_dend)
```
```{r}
agg = aggregate(price ~neighbourhood_group, clust_data$all , mean)
agg
rownames(agg) = c("Brooklyn","Manhattan",
"Queens","Staten Island", "Bronx")
agg$neighbourhood_group = NULL
gower <- daisy(agg, metric = "gower")
hc1 <- hclust(gower, method = "complete" )
plot(hc1, cex = 0.6, hang = -1)
```
\ \
# Principal Component Analysis
\ \
## PCAmixdata
```{r}
## Split mixed dataset into quantitative and qualitative variables
#split <- splitmix(dataset[1:5])
split = splitmix(clust_data$all)
## PCA
res.pcamix <- PCAmix(X.quanti=split$X.quanti,
X.quali=split$X.quali,
rename.level=TRUE)
res.pcamix
```
```{r}
## Inspect principal components
res.pcamix$eig
```
```{r}
res.pcamix$quanti.cor
res.pcamix$quali.eta2
```
\ \
## Factor Analysis of Mixed Data (FAMD)
FAMD (base, ncp = 5, sup.var = NULL, ind.sup = NULL, graph = TRUE)
- base : a data frame with n rows (individuals) and p columns (variables).
- ncp: the number of dimensions kept in the results (by default 5)
- sup.var: a vector indicating the indexes of the supplementary variables.
- ind.sup: a vector indicating the indexes of the supplementary individuals.
- graph : a logical value. If TRUE a graph is displayed.
```{r}
res.famd <- FAMD(clust_data$all, graph = F, ncp = 5)
print(res.famd)
```
```{r}
eig.val <- get_eigenvalue(res.famd)
head(eig.val)
fviz_screeplot(res.famd)
```
```{r}
quanti.var <- get_famd_var(res.famd, "quanti.var")
quanti.var
fviz_famd_var(res.famd, "quanti.var", col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
```
```{r}
quali.var <- get_famd_var(res.famd, "quali.var")
quali.var
fviz_famd_var(res.famd, "quali.var", col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)
```
```{r}
var <- get_famd_var(res.famd)
var
```
```{r}
# Coordinates of variables
head(var$coord)
# Cos2: quality of representation on the factore map
head(var$cos2)
# Contributions to the dimensions
head(var$contrib)
```
```{r}
# Plot of variables
fviz_famd_var(res.famd, repel = TRUE)
# Contribution to the first dimension
fviz_contrib(res.famd, "var", axes = 1)
# Contribution to the second dimension
fviz_contrib(res.famd, "var", axes = 2)
# Contribution to the third dimension
fviz_contrib(res.famd, "var", axes = 3)
# Contribution to the forth dimension
fviz_contrib(res.famd, "var", axes = 4)
# Contribution to the fifth dimension
fviz_contrib(res.famd, "var", axes = 5)
```