--- title: "Statistical Learning Project" output: html_document: df_print: paged toc: true toc_float: collapsed: false smooth_scroll: false pdf_document: keep_tex: yes latex_engine: xelatex toc: true --- \ \ Author: Andrea Ierardi \ \ Repository : Code [link](https://github.com/Andreaierardi/Statistical-Learning-Project) \ \ ### Assignment The exam consists in two assignments, one on the first part(regression, tree, neural nets) and the second part (unsupervised learning). For both you must prepare a writing report using one or more techniques and comparing their performance on one or more data set chosen by the student. \ \ # Libraries ```{r} library(knitr) library(ggplot2) library(plotly) library(tidyr) library(dplyr) library(png) library(ggpubr) library(tidyverse) library(caTools) library(caret) library(tree) library(MASS) library(randomForest) library(ranger) library(tuneRanger) library(keras) library(kableExtra) library(clustMixType) library(cluster) library(dendextend) library(readr) library(factoextra) library(FactoMineR) library(PCAmixdata) ``` \ \ # Dataset Link of the [dataset](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data) ```{r} ds = read.csv("AB_NYC_2019.csv") ``` \ \ # Data Inspection ```{r} head(ds) summary(ds) ``` ```{r} cat("Shape of the dataset:" ,dim(ds)) ``` ```{r} img <- readPNG("map.png") map_ds = ggplot() + background_image(img)+ geom_point(data = ds, aes(y=latitude,x = longitude, color = price)) map_ds ``` \ \ # Data cleaning \ \ ## Check for NA and NULL values ```{r} apply(ds,2,function(x) sum(is.na(x))) ``` \ \ ## Variable selection ```{r} dataset = ds %>% dplyr::select(neighbourhood_group,latitude, longitude, room_type,price) head(dataset) ``` \ \ ## Variable scaling ```{r} scale_data = function(df) { df = df %>%filter( price >= 15 & price <= 500) numerical = c("price") numerical2 = c("latitude","longitude") categorical = c("room_type", "neighbourhood_group") for( cat in categorical ) { df[cat] = factor(df[[cat]], level = unique(df[[cat]]), labels = c(1:length(unique(df[[cat]])) )) } df[numerical] = as.numeric(scale(df[numerical])) df2 = df df2[numerical2] = as.numeric(scale(df2[numerical2])) df3 = list() df3$df2 = df2 df3$df = df return(df3) } dataframe = scale_data(dataset) dataset = dataframe$df data = dataframe$df2 head(dataset) summary(dataset) ``` \ \ ## Data visualisation after the scaling ```{r} mappa = ggplot() + background_image(img)+ geom_point(data = dataset, aes(y=latitude,x = longitude, color = price)) mappa ``` \ \ # Data split ## Split data in subsets for each neighbourhood_group and room_type ```{r} neighbourhoods = unique(dataset$neighbourhood_group) rooms = unique(dataset$room_type) clust_data = vector("list") lis_n = vector("list") for (n in neighbourhoods) { tmp = dataset %>%filter( neighbourhood_group == n) lis_n[[n]] = tmp[-1] tmp2 = data %>%filter( neighbourhood_group == n) clust_data[[n]] = tmp2[-1] clust_data[[n]]$room_type = factor(clust_data[[n]]$room_type , level = unique(clust_data[[n]]$room_type) , labels= unique(ds$room_type)) } lis_r_n= vector("list") for (n in neighbourhoods) { for(r in rooms) { tmp = dataset %>% filter( room_type == r & neighbourhood_group == n) lis_r_n[[paste0("n",n,"-","r",r)]]= tmp[-1][-3] tmp2 = data %>% filter( room_type == r & neighbourhood_group == n) clust_data[[paste0("n",n,"-","r",r)]]= tmp2[-1][-3] } } data$neighbourhood_group = factor(data$neighbourhood_group , level = unique(data$neighbourhood_group ) , labels= unique(ds$neighbourhood_group)) data$room_type = factor(data$room_type , level = unique(data$room_type ) , labels= unique(ds$room_type)) clust_data[["all"]] = data ``` \ \ ## Split in train and test for each subset ```{r} trains = vector("list") tests = vector("list") datas = vector("list") for (i in names(lis_n)) { sample = sample.split(lis_n[[i]], SplitRatio = .75) train = subset(lis_n[[i]], sample == TRUE) test = subset(lis_n[[i]], sample == FALSE) trains[[i]]= train tests[[i]] = test datas[[i]] = lis_n[[i]] } for (i in names(lis_r_n)) { sample = sample.split(lis_r_n[[i]], SplitRatio = .75) train = subset(lis_r_n[[i]], sample == TRUE) test = subset(lis_r_n[[i]], sample == FALSE) trains[[i]]= train tests[[i]] = test datas[[i]] = lis_r_n[[i]] } sample = sample.split(dataset, SplitRatio = .75) train = subset(dataset, sample == TRUE) test = subset(dataset, sample == FALSE) trains[["all"]] = train tests[["all"]] = test datas[["all"]] = dataset ``` \ \ # MODELS TRAIN \ \ ```{r} model_lis = vector("list") ``` \ \ ## LINEAR REGRESSION ```{r} lin_reg = vector("list") for (sub in names(trains)) { lin_reg[[sub]]$fit =lm.fit = lm(price~., data = trains[[sub]]) lin_reg[[sub]]$summary = summary(lm.fit) lin_reg[[sub]]$pred = pr.lm = predict(lm.fit,tests[[sub]]) lin_reg[[sub]]$MSE = sum((pr.lm - tests[[sub]]$price)^2)/nrow(tests[[sub]]) print(paste0("========== ",sub, " ==========")) print(summary(lm.fit)) cat("\n\n") } lin_reg$name = "Linear Regression" model_lis$linear_regression= lin_reg ``` \ \ ## DECISION TREE ```{r} dec_tree = vector("list") for (sub in names(trains)) { dec_tree[[sub]]$fit = tree_res=tree(price~., data = trains[[sub]]) dec_tree[[sub]]$summary = sum = summary(tree_res) print(paste0("========== ",sub, " ==========")) print(sum) if(sum$size > 1 ) { plot(tree_res) text(tree_res,pretty=0) title(paste0("Tree of: ",sub)) } else { cat("Not possible to plot tree: ", sub) } dec_tree[[sub]]$pred = pred = predict(tree_res,tests[[sub]]) dec_tree[[sub]]$MSE = mse = sum((pred - tests[[sub]]$price)^2)/nrow(tests[[sub]]) print(mse) cat("\n\n") } dec_tree$name = "Decision Tree" model_lis$decision_tree = dec_tree ``` \ \ ## RANDOM FOREST ```{r} rf = vector("list") for (sub in names(trains)) { rf[[sub]]$fit = res = randomForest( price ~ . , data=trains[[sub]]) rf[[sub]]$pred = predt = predict(res,tests[[sub]]) print(paste0("========== ",sub, " ==========")) print(res) rf[[sub]]$MSE = mse = sum((predt - tests[[sub]]$price)^2)/nrow(tests[[sub]]) print(paste0("MSE: ",mse)) cat("\n\n") } rf$name = "Random Forest" model_lis$random_forest = rf ``` \ \ ## RANGER RANDOM FOREST ```{r} ranger_rf = vector("list") for (sub in names(trains)) { print(paste0("========== ",sub, " ==========")) ranger_rf[[sub]]$fit = res = ranger( price~ ., data = trains[[sub]], write.forest = TRUE, classification = F) ranger_rf[[sub]]$pred = predt = predict(res,tests[[sub]]) ranger_rf[[sub]]$MSE = mse = sum((predt$predictions - tests[[sub]]$price)^2)/nrow(tests[[sub]]) print(res) print(paste0("MSE: ",mse)) cat("\n\n") } ranger_rf$name = "Ranger Random Forest" model_lis$ranger = ranger_rf ``` \ \ ## NEURAL NETWORKS ```{r} build_model <- function(dimension) { model <- keras::keras_model_sequential() %>% layer_dense(units = 32, activation = "relu", input_shape = dimension) %>% layer_dense(units = 16, activation = "relu") %>% layer_dense(units = 1,activation="linear") model %>% compile( loss = "mse", optimizer = optimizer_rmsprop(), metrics = list("mean_absolute_error") ) return(model) } print_dot_callback <- callback_lambda( on_epoch_end = function(epoch, logs) { if (epoch %% 1 == 0) cat(".") } ) nn = vector("list") for (sub in names(trains)) { d = trains[[sub]] d2 = tests[[sub]] len = length(d) len2 = length(d2) if(!is.null(d$room_type)) { d$room_type = keras::to_categorical(d$room_type) d2$room_type = keras::to_categorical(d2$room_type) } if(!is.null(d$neighbourhood_group)) { d$neighbourhood_group = keras::to_categorical(d$neighbourhood_group) d2$neighbourhood_group = keras::to_categorical(d2$neighbourhood_group) } target = as.vector(d$price) features = as.matrix(as_tibble(d[-len])) target_test = as.vector(d2$price) features_test = as.matrix(as_tibble(d2[-len])) nn[[sub]]$epochs = epochs = 30 nn[[sub]]$model = model = build_model(dim(features)[2]) nn[[sub]]$summary = model %>% summary() nn[[sub]]$history = hist = model %>% fit( x = features, y = target, epochs = epochs, validation_split = 0.2, verbose = 0, callbacks = list(print_dot_callback) ) eva = model %>% evaluate(features_test,target_test, verbose = 0) nn[[sub]]$mae = eva[1] nn[[sub]]$loss = eva[2] nn[[sub]]$pred = pred = model %>% predict(features_test) nn[[sub]]$MSE = sum((pred - target_test)^2)/length(target_test) } nn$name = "Neural Networks" model_lis$neural_networks = nn ``` \ \ ## NN plots ```{r} for (sub in names(nn)) { if(sub != "name") { m = nn[[sub]] hist = m$history print(paste0("========== ",sub, " ==========")) str = paste0("MAE: ",round(m$mae,3)," --- Loss: ",round(m$loss,3)) p = plot(hist, y ~ x) + theme_bw(base_size = 12) +ggtitle(paste0("NN of: ",sub))+ labs(caption= str) print(p) plot(p) cat("MAE: ",m$mae) cat("\nLoss: ",m$loss,"\n\n") } } ``` \ \ # Comparison between the models \ \ ```{r} conc = c() for (n in unique(ds$neighbourhood_group)) { conc = c(conc,n) } for (n in unique(ds$neighbourhood_group)) { for(r in unique(ds$room_type)) { conc = c(conc, paste(n,"-",r)) } } conc = c(conc,"All") n = names(nn) cols = vector("list") cols[["Subset"]] = conc for( m in model_lis) { col = c() for( nam in n) { if( nam != "name") { col = c(col,m[[nam]]$MSE) } } cols[[m$name]] = col } mse_df = as.data.frame(cols) best = c() for (i in 1:length(cols$Subset)) { m = min(mse_df[i,2:6]) col = which(mse_df[i,1:6 ] == m) nam = names(mse_df)[col] best = c(best, nam) } mse_df$Best = best #kableExtra::kable(mse_df)%>% # kable_styling(bootstrap_options = "striped", full_width = F,font_size = 20) %>% ## row_spec(0, bold = T, color = "white", background = "#D7261E")%>% ## column_spec(1, bold = T, border_right = T,color = "white", background = "#191970")%>% # column_spec(2:6,extra_css="text-align:Center") mse_df ``` \ \ \ \ # Clustering and Groups \ \ ## Clustering for Mixed type of data ```{r} clust_num = 5 get_clusters = function(dts, num, dim_plot,name) { l = list() if(is.null(dts$room_type)) { l$cl = kmeans(dts,num) } else { l$cl = kproto(dts,num) } clust = list() for (i in 1:num) { indexes = l$cl$cluster == i clust[[i]] = dts[indexes,] } min_lat = dim_plot[1] max_lat = dim_plot[2] min_long = dim_plot[3] max_long= dim_plot[4] myplot= ggplot() + background_image(img)+ xlab('Longitude') + ylab('Latitude')+ theme(plot.margin = unit(c(1,1,1,1),"cm"),legend.title = element_text(colour="blue", size=10, face="bold")) + xlim(min_long, max_long) + ylim(min_lat,max_lat) +ggtitle(paste0("Clusters of ", name )) count = 1 l$clust_plot = vector("list") for(el in clust) { myplot = myplot+ geom_point(data = el, aes(y = latitude, x = longitude),color= count) p =ggplot()+ background_image(img)+geom_point(data = el, aes(y = latitude, x = longitude),color= count) + xlim(min_long, max_long) + ylim(min_lat,max_lat) + ggtitle(paste0("Cluster: ", count," of ", name )) l$clust_plot[[as.character(count)]] = p l$summary[[as.character(count)]] = summary(el) count= count+1 } l$myplot = myplot return(l) } get_all_cluster = function(clust_data, clust_num, dim) { lis = vector("list") plots = vector("list") cnt= 1 for(sub in names(clust_data)) { lis[[sub]] = get_clusters(clust_data[[sub]], clust_num, dim, sub) plots[[cnt]] = lis[[sub]]$myplot cnt = cnt+1 } lis[["all_plots"]]= plots return(lis) } borders = c(min(clust_data$all$latitude) ,max(clust_data$all$latitude), min(clust_data$all$longitude), max(clust_data$all$longitude)) all_cluster = get_all_cluster(clust_data,5, borders) ``` ```{r} multiplots <- function(plotlist, file=NULL,cols = 2, layout = NULL) { require(grid) plots <- c(plotlist) numPlots = length(plots) if (is.null(layout)) { layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), ncol = cols, nrow = ceiling(numPlots/cols),byrow =T) } if (numPlots == 1) { print(plots[[1]]) } else { grid.newpage() pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) for (i in 1:numPlots) { matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col)) } } } all_cluster$all_plots[[21]] all_cluster$all$summary ``` ```{r} multiplots(all_cluster$all_plot[1:5], cols=3) ``` ```{r} multiplots(all_cluster$all_plot[6:11], cols=3) ``` ```{r} multiplots(all_cluster$all_plot[12:17], cols=3) ``` ```{r} multiplots(all_cluster$all_plot[c(18:20,22,23)], cols=3) ``` \ \ ## Hierarchical Cluster Analysis ```{r} agg = aggregate(price ~neighbourhood_group+room_type, clust_data$all , mean) name_hc = c() for (n1 in substr(unique(agg$neighbourhood_group),1,5)) { for(n2 in substr(unique(agg$room_type),1,3)) { name_hc = c(name_hc, paste0(n1,"/",n2)) } } rownames(agg) = name_hc agg gower <- daisy(agg, metric = "gower") hc1 <- hclust(gower, method = "complete" ) plot(hc1, cex = 0.6, hang = -1) ``` ```{r} avg_dend_obj <- as.dendrogram(hc1) avg_col_dend <- color_branches(avg_dend_obj, h = 0.6) plot(avg_col_dend) ``` ```{r} agg = aggregate(price ~neighbourhood_group, clust_data$all , mean) agg rownames(agg) = c("Brooklyn","Manhattan", "Queens","Staten Island", "Bronx") agg$neighbourhood_group = NULL gower <- daisy(agg, metric = "gower") hc1 <- hclust(gower, method = "complete" ) plot(hc1, cex = 0.6, hang = -1) ``` \ \ # Principal Component Analysis \ \ ## PCAmixdata ```{r} ## Split mixed dataset into quantitative and qualitative variables #split <- splitmix(dataset[1:5]) split = splitmix(clust_data$all) ## PCA res.pcamix <- PCAmix(X.quanti=split$X.quanti, X.quali=split$X.quali, rename.level=TRUE) res.pcamix ``` ```{r} ## Inspect principal components res.pcamix$eig ``` ```{r} res.pcamix$quanti.cor res.pcamix$quali.eta2 ``` \ \ ## Factor Analysis of Mixed Data (FAMD) FAMD (base, ncp = 5, sup.var = NULL, ind.sup = NULL, graph = TRUE) - base : a data frame with n rows (individuals) and p columns (variables). - ncp: the number of dimensions kept in the results (by default 5) - sup.var: a vector indicating the indexes of the supplementary variables. - ind.sup: a vector indicating the indexes of the supplementary individuals. - graph : a logical value. If TRUE a graph is displayed. ```{r} res.famd <- FAMD(clust_data$all, graph = F, ncp = 5) print(res.famd) ``` ```{r} eig.val <- get_eigenvalue(res.famd) head(eig.val) fviz_screeplot(res.famd) ``` ```{r} quanti.var <- get_famd_var(res.famd, "quanti.var") quanti.var fviz_famd_var(res.famd, "quanti.var", col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE) ``` ```{r} quali.var <- get_famd_var(res.famd, "quali.var") quali.var fviz_famd_var(res.famd, "quali.var", col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07") ) ``` ```{r} var <- get_famd_var(res.famd) var ``` ```{r} # Coordinates of variables head(var$coord) # Cos2: quality of representation on the factore map head(var$cos2) # Contributions to the dimensions head(var$contrib) ``` ```{r} # Plot of variables fviz_famd_var(res.famd, repel = TRUE) # Contribution to the first dimension fviz_contrib(res.famd, "var", axes = 1) # Contribution to the second dimension fviz_contrib(res.famd, "var", axes = 2) # Contribution to the third dimension fviz_contrib(res.famd, "var", axes = 3) # Contribution to the forth dimension fviz_contrib(res.famd, "var", axes = 4) # Contribution to the fifth dimension fviz_contrib(res.famd, "var", axes = 5) ```