mirror of
https://github.com/Andreaierardi/Master-DataScience-Notes.git
synced 2025-01-23 17:57:44 +01:00
upload SL project
This commit is contained in:
parent
b84558bea3
commit
fd4b696502
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,20 @@
|
||||
# Statistical-Learning-Project
|
||||
|
||||
# Exam
|
||||
The exam consists in two assignments, one on the first part(regression, tree, neural nets) and the second part (unsupervised learning). For both you must prepare a writing report using one or more techniques and comparing their performance on one or more data set chosen by the student. A brief oral presentation of the reports will be asked.
|
||||
|
||||
Each report must contain:
|
||||
|
||||
- short **abstract**: what are your going to present in the report
|
||||
- **statement** of the problem/**goal** of the analysis and description of the **data set**(s)
|
||||
- list of three to five **findings/keypoints**
|
||||
- the analysis with **wise** commentary
|
||||
- (optional) theoretical background of the used methods
|
||||
- **conclusions**(should include the findings/keypoints)
|
||||
- the **Appendix**, containing all the R code
|
||||
|
||||
Notice:
|
||||
- The **paper length** is irrelevant provided that the content is correct.
|
||||
- **No R code in the main text**. The R code must be confined to the appendix
|
||||
- The report should be prepared in **PDF** only
|
||||
|
@ -0,0 +1,932 @@
|
||||
---
|
||||
title: "Statistical Learning Project"
|
||||
output:
|
||||
html_document:
|
||||
df_print: paged
|
||||
toc: true
|
||||
toc_float:
|
||||
collapsed: false
|
||||
smooth_scroll: false
|
||||
pdf_document:
|
||||
keep_tex: yes
|
||||
latex_engine: xelatex
|
||||
toc: true
|
||||
---
|
||||
\ \
|
||||
|
||||
<strong > Author: Andrea Ierardi </strong >
|
||||
|
||||
\ \
|
||||
|
||||
<strong > Repository </strong > : Code [link](https://github.com/Andreaierardi/Statistical-Learning-Project)
|
||||
|
||||
\ \
|
||||
|
||||
### Assignment
|
||||
|
||||
The exam consists in two assignments, one on the first part(regression, tree, neural nets) and the second part (unsupervised learning). For both you must prepare a writing report using one or more techniques and comparing their performance on one or more data set chosen by the student.
|
||||
|
||||
\ \
|
||||
|
||||
# Libraries
|
||||
```{r}
|
||||
|
||||
library(knitr)
|
||||
library(ggplot2)
|
||||
library(plotly)
|
||||
|
||||
library(tidyr)
|
||||
library(dplyr)
|
||||
library(png)
|
||||
library(ggpubr)
|
||||
library(tidyverse)
|
||||
library(caTools)
|
||||
library(caret)
|
||||
library(tree)
|
||||
library(MASS)
|
||||
|
||||
library(randomForest)
|
||||
library(ranger)
|
||||
library(tuneRanger)
|
||||
|
||||
|
||||
library(keras)
|
||||
|
||||
library(kableExtra)
|
||||
|
||||
library(clustMixType)
|
||||
library(cluster)
|
||||
library(dendextend)
|
||||
library(readr)
|
||||
|
||||
library(factoextra)
|
||||
library(FactoMineR)
|
||||
|
||||
|
||||
library(PCAmixdata)
|
||||
|
||||
|
||||
```
|
||||
\ \
|
||||
|
||||
# Dataset
|
||||
Link of the [dataset](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data)
|
||||
|
||||
|
||||
```{r}
|
||||
ds = read.csv("AB_NYC_2019.csv")
|
||||
```
|
||||
\ \
|
||||
|
||||
# Data Inspection
|
||||
|
||||
```{r}
|
||||
head(ds)
|
||||
summary(ds)
|
||||
|
||||
```
|
||||
```{r}
|
||||
cat("Shape of the dataset:" ,dim(ds))
|
||||
|
||||
```
|
||||
```{r}
|
||||
img <- readPNG("map.png")
|
||||
|
||||
|
||||
map_ds = ggplot() + background_image(img)+ geom_point(data = ds, aes(y=latitude,x = longitude, color = price))
|
||||
map_ds
|
||||
|
||||
```
|
||||
|
||||
\ \
|
||||
|
||||
# Data cleaning
|
||||
|
||||
\ \
|
||||
|
||||
## Check for NA and NULL values
|
||||
```{r}
|
||||
|
||||
apply(ds,2,function(x) sum(is.na(x)))
|
||||
|
||||
```
|
||||
\ \
|
||||
|
||||
## Variable selection
|
||||
|
||||
```{r}
|
||||
dataset = ds %>% dplyr::select(neighbourhood_group,latitude, longitude, room_type,price)
|
||||
|
||||
head(dataset)
|
||||
```
|
||||
|
||||
\ \
|
||||
|
||||
## Variable scaling
|
||||
```{r}
|
||||
|
||||
scale_data = function(df)
|
||||
{
|
||||
df = df %>%filter( price >= 15 & price <= 500)
|
||||
|
||||
numerical = c("price")
|
||||
numerical2 = c("latitude","longitude")
|
||||
categorical = c("room_type", "neighbourhood_group")
|
||||
|
||||
for( cat in categorical )
|
||||
{
|
||||
df[cat] = factor(df[[cat]],
|
||||
level = unique(df[[cat]]),
|
||||
labels = c(1:length(unique(df[[cat]])) ))
|
||||
}
|
||||
|
||||
df[numerical] = as.numeric(scale(df[numerical]))
|
||||
|
||||
|
||||
df2 = df
|
||||
df2[numerical2] = as.numeric(scale(df2[numerical2]))
|
||||
df3 = list()
|
||||
df3$df2 = df2
|
||||
df3$df = df
|
||||
|
||||
|
||||
return(df3)
|
||||
|
||||
|
||||
}
|
||||
dataframe = scale_data(dataset)
|
||||
|
||||
dataset = dataframe$df
|
||||
|
||||
data = dataframe$df2
|
||||
|
||||
head(dataset)
|
||||
summary(dataset)
|
||||
```
|
||||
\ \
|
||||
|
||||
## Data visualisation after the scaling
|
||||
```{r}
|
||||
|
||||
mappa = ggplot() + background_image(img)+ geom_point(data = dataset, aes(y=latitude,x = longitude, color = price))
|
||||
mappa
|
||||
```
|
||||
\ \
|
||||
|
||||
# Data split
|
||||
|
||||
## Split data in subsets for each neighbourhood_group and room_type
|
||||
```{r}
|
||||
neighbourhoods = unique(dataset$neighbourhood_group)
|
||||
|
||||
rooms = unique(dataset$room_type)
|
||||
|
||||
clust_data = vector("list")
|
||||
|
||||
lis_n = vector("list")
|
||||
|
||||
for (n in neighbourhoods)
|
||||
{
|
||||
tmp = dataset %>%filter( neighbourhood_group == n)
|
||||
lis_n[[n]] = tmp[-1]
|
||||
|
||||
tmp2 = data %>%filter( neighbourhood_group == n)
|
||||
clust_data[[n]] = tmp2[-1]
|
||||
clust_data[[n]]$room_type = factor(clust_data[[n]]$room_type , level = unique(clust_data[[n]]$room_type) , labels= unique(ds$room_type))
|
||||
}
|
||||
|
||||
|
||||
|
||||
lis_r_n= vector("list")
|
||||
for (n in neighbourhoods)
|
||||
{
|
||||
for(r in rooms)
|
||||
{
|
||||
tmp = dataset %>%
|
||||
filter( room_type == r & neighbourhood_group == n)
|
||||
lis_r_n[[paste0("n",n,"-","r",r)]]= tmp[-1][-3]
|
||||
|
||||
|
||||
tmp2 = data %>%
|
||||
filter( room_type == r & neighbourhood_group == n)
|
||||
clust_data[[paste0("n",n,"-","r",r)]]= tmp2[-1][-3]
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
data$neighbourhood_group = factor(data$neighbourhood_group , level = unique(data$neighbourhood_group ) , labels= unique(ds$neighbourhood_group))
|
||||
data$room_type = factor(data$room_type , level = unique(data$room_type ) , labels= unique(ds$room_type))
|
||||
|
||||
clust_data[["all"]] = data
|
||||
|
||||
```
|
||||
|
||||
\ \
|
||||
|
||||
## Split in train and test for each subset
|
||||
```{r}
|
||||
|
||||
trains = vector("list")
|
||||
tests = vector("list")
|
||||
datas = vector("list")
|
||||
|
||||
for (i in names(lis_n))
|
||||
{
|
||||
sample = sample.split(lis_n[[i]], SplitRatio = .75)
|
||||
train = subset(lis_n[[i]], sample == TRUE)
|
||||
test = subset(lis_n[[i]], sample == FALSE)
|
||||
trains[[i]]= train
|
||||
tests[[i]] = test
|
||||
datas[[i]] = lis_n[[i]]
|
||||
}
|
||||
|
||||
for (i in names(lis_r_n))
|
||||
{
|
||||
sample = sample.split(lis_r_n[[i]], SplitRatio = .75)
|
||||
train = subset(lis_r_n[[i]], sample == TRUE)
|
||||
test = subset(lis_r_n[[i]], sample == FALSE)
|
||||
trains[[i]]= train
|
||||
tests[[i]] = test
|
||||
datas[[i]] = lis_r_n[[i]]
|
||||
|
||||
}
|
||||
|
||||
sample = sample.split(dataset, SplitRatio = .75)
|
||||
train = subset(dataset, sample == TRUE)
|
||||
test = subset(dataset, sample == FALSE)
|
||||
|
||||
trains[["all"]] = train
|
||||
tests[["all"]] = test
|
||||
datas[["all"]] = dataset
|
||||
|
||||
```
|
||||
|
||||
|
||||
\ \
|
||||
|
||||
# MODELS TRAIN
|
||||
|
||||
\ \
|
||||
```{r}
|
||||
model_lis = vector("list")
|
||||
```
|
||||
|
||||
\ \
|
||||
|
||||
## LINEAR REGRESSION
|
||||
|
||||
```{r}
|
||||
|
||||
lin_reg = vector("list")
|
||||
|
||||
for (sub in names(trains))
|
||||
{
|
||||
lin_reg[[sub]]$fit =lm.fit = lm(price~., data = trains[[sub]])
|
||||
lin_reg[[sub]]$summary = summary(lm.fit)
|
||||
|
||||
lin_reg[[sub]]$pred = pr.lm = predict(lm.fit,tests[[sub]])
|
||||
|
||||
lin_reg[[sub]]$MSE = sum((pr.lm - tests[[sub]]$price)^2)/nrow(tests[[sub]])
|
||||
print(paste0("========== ",sub, " =========="))
|
||||
print(summary(lm.fit))
|
||||
cat("\n\n")
|
||||
}
|
||||
|
||||
lin_reg$name = "Linear Regression"
|
||||
model_lis$linear_regression= lin_reg
|
||||
```
|
||||
\ \
|
||||
|
||||
## DECISION TREE
|
||||
|
||||
```{r}
|
||||
|
||||
dec_tree = vector("list")
|
||||
|
||||
for (sub in names(trains))
|
||||
{
|
||||
dec_tree[[sub]]$fit = tree_res=tree(price~., data = trains[[sub]])
|
||||
dec_tree[[sub]]$summary = sum = summary(tree_res)
|
||||
|
||||
print(paste0("========== ",sub, " =========="))
|
||||
|
||||
print(sum)
|
||||
|
||||
if(sum$size > 1 )
|
||||
{
|
||||
plot(tree_res)
|
||||
text(tree_res,pretty=0)
|
||||
title(paste0("Tree of: ",sub))
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
cat("Not possible to plot tree: ", sub)
|
||||
}
|
||||
|
||||
dec_tree[[sub]]$pred = pred = predict(tree_res,tests[[sub]])
|
||||
|
||||
dec_tree[[sub]]$MSE = mse = sum((pred - tests[[sub]]$price)^2)/nrow(tests[[sub]])
|
||||
print(mse)
|
||||
cat("\n\n")
|
||||
}
|
||||
|
||||
dec_tree$name = "Decision Tree"
|
||||
model_lis$decision_tree = dec_tree
|
||||
```
|
||||
|
||||
|
||||
\ \
|
||||
|
||||
## RANDOM FOREST
|
||||
|
||||
```{r}
|
||||
|
||||
rf = vector("list")
|
||||
|
||||
for (sub in names(trains))
|
||||
{
|
||||
rf[[sub]]$fit = res = randomForest( price ~ . , data=trains[[sub]])
|
||||
|
||||
rf[[sub]]$pred = predt = predict(res,tests[[sub]])
|
||||
|
||||
print(paste0("========== ",sub, " =========="))
|
||||
|
||||
print(res)
|
||||
|
||||
rf[[sub]]$MSE = mse = sum((predt - tests[[sub]]$price)^2)/nrow(tests[[sub]])
|
||||
print(paste0("MSE: ",mse))
|
||||
cat("\n\n")
|
||||
}
|
||||
|
||||
rf$name = "Random Forest"
|
||||
model_lis$random_forest = rf
|
||||
|
||||
```
|
||||
|
||||
\ \
|
||||
|
||||
## RANGER RANDOM FOREST
|
||||
|
||||
```{r}
|
||||
|
||||
ranger_rf = vector("list")
|
||||
|
||||
for (sub in names(trains))
|
||||
{
|
||||
|
||||
print(paste0("========== ",sub, " =========="))
|
||||
|
||||
ranger_rf[[sub]]$fit = res = ranger( price~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
|
||||
|
||||
ranger_rf[[sub]]$pred = predt = predict(res,tests[[sub]])
|
||||
ranger_rf[[sub]]$MSE = mse = sum((predt$predictions - tests[[sub]]$price)^2)/nrow(tests[[sub]])
|
||||
print(res)
|
||||
print(paste0("MSE: ",mse))
|
||||
cat("\n\n")
|
||||
}
|
||||
|
||||
ranger_rf$name = "Ranger Random Forest"
|
||||
model_lis$ranger = ranger_rf
|
||||
|
||||
|
||||
```
|
||||
|
||||
\ \
|
||||
|
||||
## NEURAL NETWORKS
|
||||
|
||||
```{r}
|
||||
build_model <- function(dimension) {
|
||||
|
||||
model <- keras::keras_model_sequential() %>%
|
||||
layer_dense(units = 32, activation = "relu",
|
||||
input_shape = dimension) %>%
|
||||
layer_dense(units = 16, activation = "relu") %>%
|
||||
layer_dense(units = 1,activation="linear")
|
||||
|
||||
model %>% compile(
|
||||
loss = "mse",
|
||||
optimizer = optimizer_rmsprop(),
|
||||
metrics = list("mean_absolute_error")
|
||||
)
|
||||
|
||||
return(model)
|
||||
}
|
||||
|
||||
|
||||
print_dot_callback <- callback_lambda(
|
||||
on_epoch_end = function(epoch, logs) {
|
||||
if (epoch %% 1 == 0)
|
||||
cat(".")
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
nn = vector("list")
|
||||
|
||||
for (sub in names(trains))
|
||||
{
|
||||
d = trains[[sub]]
|
||||
d2 = tests[[sub]]
|
||||
len = length(d)
|
||||
len2 = length(d2)
|
||||
|
||||
if(!is.null(d$room_type))
|
||||
{
|
||||
d$room_type = keras::to_categorical(d$room_type)
|
||||
d2$room_type = keras::to_categorical(d2$room_type)
|
||||
|
||||
}
|
||||
|
||||
if(!is.null(d$neighbourhood_group))
|
||||
{
|
||||
d$neighbourhood_group = keras::to_categorical(d$neighbourhood_group)
|
||||
d2$neighbourhood_group = keras::to_categorical(d2$neighbourhood_group)
|
||||
}
|
||||
|
||||
|
||||
target = as.vector(d$price)
|
||||
features = as.matrix(as_tibble(d[-len]))
|
||||
|
||||
target_test = as.vector(d2$price)
|
||||
features_test = as.matrix(as_tibble(d2[-len]))
|
||||
|
||||
|
||||
nn[[sub]]$epochs = epochs = 30
|
||||
|
||||
nn[[sub]]$model = model = build_model(dim(features)[2])
|
||||
|
||||
nn[[sub]]$summary = model %>% summary()
|
||||
nn[[sub]]$history = hist = model %>% fit(
|
||||
x = features,
|
||||
y = target,
|
||||
epochs = epochs,
|
||||
validation_split = 0.2,
|
||||
verbose = 0,
|
||||
callbacks = list(print_dot_callback)
|
||||
)
|
||||
eva = model %>% evaluate(features_test,target_test, verbose = 0)
|
||||
|
||||
|
||||
nn[[sub]]$mae = eva[1]
|
||||
nn[[sub]]$loss = eva[2]
|
||||
|
||||
nn[[sub]]$pred = pred = model %>% predict(features_test)
|
||||
nn[[sub]]$MSE = sum((pred - target_test)^2)/length(target_test)
|
||||
|
||||
}
|
||||
|
||||
nn$name = "Neural Networks"
|
||||
model_lis$neural_networks = nn
|
||||
|
||||
|
||||
```
|
||||
|
||||
\ \
|
||||
|
||||
## NN plots
|
||||
```{r}
|
||||
|
||||
for (sub in names(nn))
|
||||
{
|
||||
if(sub != "name")
|
||||
{
|
||||
m = nn[[sub]]
|
||||
hist = m$history
|
||||
print(paste0("========== ",sub, " =========="))
|
||||
str = paste0("MAE: ",round(m$mae,3)," --- Loss: ",round(m$loss,3))
|
||||
p = plot(hist, y ~ x) + theme_bw(base_size = 12) +ggtitle(paste0("NN of: ",sub))+ labs(caption= str)
|
||||
|
||||
print(p)
|
||||
plot(p)
|
||||
cat("MAE: ",m$mae)
|
||||
cat("\nLoss: ",m$loss,"\n\n")
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
\ \
|
||||
|
||||
# Comparison between the models
|
||||
|
||||
\ \
|
||||
```{r}
|
||||
conc = c()
|
||||
|
||||
for (n in unique(ds$neighbourhood_group))
|
||||
{
|
||||
conc = c(conc,n)
|
||||
}
|
||||
for (n in unique(ds$neighbourhood_group))
|
||||
{
|
||||
for(r in unique(ds$room_type))
|
||||
{
|
||||
conc = c(conc, paste(n,"-",r))
|
||||
|
||||
}
|
||||
}
|
||||
conc = c(conc,"All")
|
||||
|
||||
n = names(nn)
|
||||
|
||||
|
||||
|
||||
cols = vector("list")
|
||||
cols[["Subset"]] = conc
|
||||
for( m in model_lis)
|
||||
{
|
||||
col = c()
|
||||
for( nam in n)
|
||||
{
|
||||
|
||||
if( nam != "name")
|
||||
{
|
||||
col = c(col,m[[nam]]$MSE)
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
cols[[m$name]] = col
|
||||
|
||||
}
|
||||
|
||||
|
||||
mse_df = as.data.frame(cols)
|
||||
best = c()
|
||||
for (i in 1:length(cols$Subset))
|
||||
{
|
||||
m = min(mse_df[i,2:6])
|
||||
col = which(mse_df[i,1:6 ] == m)
|
||||
nam = names(mse_df)[col]
|
||||
best = c(best, nam)
|
||||
}
|
||||
mse_df$Best = best
|
||||
#kableExtra::kable(mse_df)%>%
|
||||
# kable_styling(bootstrap_options = "striped", full_width = F,font_size = 20) %>%
|
||||
## row_spec(0, bold = T, color = "white", background = "#D7261E")%>%
|
||||
## column_spec(1, bold = T, border_right = T,color = "white", background = "#191970")%>%
|
||||
# column_spec(2:6,extra_css="text-align:Center")
|
||||
|
||||
mse_df
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
\ \
|
||||
|
||||
|
||||
\ \
|
||||
|
||||
# Clustering and Groups
|
||||
|
||||
\ \
|
||||
|
||||
## Clustering for Mixed type of data
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
|
||||
|
||||
clust_num = 5
|
||||
|
||||
get_clusters = function(dts, num, dim_plot,name)
|
||||
{
|
||||
l = list()
|
||||
if(is.null(dts$room_type))
|
||||
{
|
||||
l$cl = kmeans(dts,num)
|
||||
}
|
||||
else
|
||||
{
|
||||
l$cl = kproto(dts,num)
|
||||
|
||||
}
|
||||
|
||||
clust = list()
|
||||
|
||||
for (i in 1:num)
|
||||
{
|
||||
indexes = l$cl$cluster == i
|
||||
clust[[i]] = dts[indexes,]
|
||||
}
|
||||
|
||||
min_lat = dim_plot[1]
|
||||
max_lat = dim_plot[2]
|
||||
min_long = dim_plot[3]
|
||||
max_long= dim_plot[4]
|
||||
|
||||
|
||||
myplot= ggplot() + background_image(img)+ xlab('Longitude') + ylab('Latitude')+ theme(plot.margin = unit(c(1,1,1,1),"cm"),legend.title = element_text(colour="blue", size=10, face="bold")) + xlim(min_long, max_long) + ylim(min_lat,max_lat) +ggtitle(paste0("Clusters of ", name ))
|
||||
|
||||
|
||||
count = 1
|
||||
l$clust_plot = vector("list")
|
||||
for(el in clust)
|
||||
{
|
||||
myplot = myplot+ geom_point(data = el, aes(y = latitude, x = longitude),color= count)
|
||||
|
||||
p =ggplot()+ background_image(img)+geom_point(data = el, aes(y = latitude, x = longitude),color= count) + xlim(min_long, max_long) + ylim(min_lat,max_lat) + ggtitle(paste0("Cluster: ", count," of ", name ))
|
||||
|
||||
l$clust_plot[[as.character(count)]] = p
|
||||
|
||||
l$summary[[as.character(count)]] = summary(el)
|
||||
|
||||
count= count+1
|
||||
}
|
||||
l$myplot = myplot
|
||||
return(l)
|
||||
}
|
||||
|
||||
get_all_cluster = function(clust_data, clust_num, dim)
|
||||
{
|
||||
lis = vector("list")
|
||||
plots = vector("list")
|
||||
|
||||
cnt= 1
|
||||
for(sub in names(clust_data))
|
||||
{
|
||||
|
||||
lis[[sub]] = get_clusters(clust_data[[sub]], clust_num, dim, sub)
|
||||
plots[[cnt]] = lis[[sub]]$myplot
|
||||
cnt = cnt+1
|
||||
|
||||
}
|
||||
lis[["all_plots"]]= plots
|
||||
return(lis)
|
||||
}
|
||||
|
||||
borders = c(min(clust_data$all$latitude) ,max(clust_data$all$latitude), min(clust_data$all$longitude), max(clust_data$all$longitude))
|
||||
all_cluster = get_all_cluster(clust_data,5, borders)
|
||||
|
||||
|
||||
|
||||
```
|
||||
|
||||
```{r}
|
||||
|
||||
|
||||
multiplots <- function(plotlist, file=NULL,cols = 2, layout = NULL) {
|
||||
require(grid)
|
||||
|
||||
plots <- c(plotlist)
|
||||
|
||||
numPlots = length(plots)
|
||||
|
||||
if (is.null(layout)) {
|
||||
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
|
||||
ncol = cols, nrow = ceiling(numPlots/cols),byrow =T)
|
||||
}
|
||||
|
||||
if (numPlots == 1) {
|
||||
print(plots[[1]])
|
||||
|
||||
} else {
|
||||
grid.newpage()
|
||||
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
|
||||
|
||||
for (i in 1:numPlots) {
|
||||
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
|
||||
|
||||
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
|
||||
layout.pos.col = matchidx$col))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
all_cluster$all_plots[[21]]
|
||||
|
||||
all_cluster$all$summary
|
||||
```
|
||||
|
||||
```{r}
|
||||
multiplots(all_cluster$all_plot[1:5], cols=3)
|
||||
```
|
||||
|
||||
```{r}
|
||||
|
||||
multiplots(all_cluster$all_plot[6:11], cols=3)
|
||||
|
||||
|
||||
```
|
||||
|
||||
```{r}
|
||||
|
||||
multiplots(all_cluster$all_plot[12:17], cols=3)
|
||||
|
||||
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
|
||||
multiplots(all_cluster$all_plot[c(18:20,22,23)], cols=3)
|
||||
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
\ \
|
||||
|
||||
## Hierarchical Cluster Analysis
|
||||
|
||||
```{r}
|
||||
|
||||
agg = aggregate(price ~neighbourhood_group+room_type, clust_data$all , mean)
|
||||
|
||||
|
||||
name_hc = c()
|
||||
for (n1 in substr(unique(agg$neighbourhood_group),1,5))
|
||||
{
|
||||
for(n2 in substr(unique(agg$room_type),1,3))
|
||||
{
|
||||
name_hc = c(name_hc, paste0(n1,"/",n2))
|
||||
}
|
||||
}
|
||||
rownames(agg) = name_hc
|
||||
|
||||
agg
|
||||
gower <- daisy(agg, metric = "gower")
|
||||
hc1 <- hclust(gower, method = "complete" )
|
||||
|
||||
plot(hc1, cex = 0.6, hang = -1)
|
||||
|
||||
|
||||
```
|
||||
|
||||
```{r}
|
||||
avg_dend_obj <- as.dendrogram(hc1)
|
||||
avg_col_dend <- color_branches(avg_dend_obj, h = 0.6)
|
||||
plot(avg_col_dend)
|
||||
|
||||
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
agg = aggregate(price ~neighbourhood_group, clust_data$all , mean)
|
||||
agg
|
||||
|
||||
rownames(agg) = c("Brooklyn","Manhattan",
|
||||
"Queens","Staten Island", "Bronx")
|
||||
agg$neighbourhood_group = NULL
|
||||
gower <- daisy(agg, metric = "gower")
|
||||
hc1 <- hclust(gower, method = "complete" )
|
||||
|
||||
plot(hc1, cex = 0.6, hang = -1)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
```
|
||||
|
||||
|
||||
\ \
|
||||
|
||||
# Principal Component Analysis
|
||||
|
||||
|
||||
\ \
|
||||
|
||||
## PCAmixdata
|
||||
|
||||
|
||||
```{r}
|
||||
|
||||
## Split mixed dataset into quantitative and qualitative variables
|
||||
#split <- splitmix(dataset[1:5])
|
||||
|
||||
split = splitmix(clust_data$all)
|
||||
## PCA
|
||||
res.pcamix <- PCAmix(X.quanti=split$X.quanti,
|
||||
X.quali=split$X.quali,
|
||||
rename.level=TRUE)
|
||||
|
||||
res.pcamix
|
||||
|
||||
|
||||
```
|
||||
|
||||
```{r}
|
||||
|
||||
## Inspect principal components
|
||||
res.pcamix$eig
|
||||
|
||||
|
||||
```
|
||||
|
||||
```{r}
|
||||
|
||||
res.pcamix$quanti.cor
|
||||
res.pcamix$quali.eta2
|
||||
|
||||
```
|
||||
|
||||
|
||||
\ \
|
||||
|
||||
|
||||
## Factor Analysis of Mixed Data (FAMD)
|
||||
|
||||
|
||||
FAMD (base, ncp = 5, sup.var = NULL, ind.sup = NULL, graph = TRUE)
|
||||
- base : a data frame with n rows (individuals) and p columns (variables).
|
||||
- ncp: the number of dimensions kept in the results (by default 5)
|
||||
- sup.var: a vector indicating the indexes of the supplementary variables.
|
||||
- ind.sup: a vector indicating the indexes of the supplementary individuals.
|
||||
- graph : a logical value. If TRUE a graph is displayed.
|
||||
|
||||
```{r}
|
||||
|
||||
res.famd <- FAMD(clust_data$all, graph = F, ncp = 5)
|
||||
print(res.famd)
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
eig.val <- get_eigenvalue(res.famd)
|
||||
head(eig.val)
|
||||
|
||||
fviz_screeplot(res.famd)
|
||||
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
quanti.var <- get_famd_var(res.famd, "quanti.var")
|
||||
quanti.var
|
||||
fviz_famd_var(res.famd, "quanti.var", col.var = "contrib",
|
||||
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
|
||||
repel = TRUE)
|
||||
```
|
||||
```{r}
|
||||
quali.var <- get_famd_var(res.famd, "quali.var")
|
||||
quali.var
|
||||
fviz_famd_var(res.famd, "quali.var", col.var = "contrib",
|
||||
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
var <- get_famd_var(res.famd)
|
||||
var
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Coordinates of variables
|
||||
head(var$coord)
|
||||
# Cos2: quality of representation on the factore map
|
||||
head(var$cos2)
|
||||
# Contributions to the dimensions
|
||||
head(var$contrib)
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Plot of variables
|
||||
|
||||
fviz_famd_var(res.famd, repel = TRUE)
|
||||
# Contribution to the first dimension
|
||||
fviz_contrib(res.famd, "var", axes = 1)
|
||||
# Contribution to the second dimension
|
||||
fviz_contrib(res.famd, "var", axes = 2)
|
||||
# Contribution to the third dimension
|
||||
fviz_contrib(res.famd, "var", axes = 3)
|
||||
# Contribution to the forth dimension
|
||||
fviz_contrib(res.famd, "var", axes = 4)
|
||||
# Contribution to the fifth dimension
|
||||
fviz_contrib(res.famd, "var", axes = 5)
|
||||
```
|
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user