upload SL project

2025-02-18 22:16:48 +01:00 · 2020-08-29 16:24:51 +02:00 · 2020-08-29 16:24:51 +02:00 · fd4b696502
commit fd4b696502
parent b84558bea3
5 changed files with 50033 additions and 0 deletions
--- a/Intelligence/Project/AB_NYC_2019.csv
+++ b/Intelligence/Project/AB_NYC_2019.csv
--- a/Intelligence/Project/README.md
+++ b/Intelligence/Project/README.md
@ -0,0 +1,20 @@
 # Statistical-Learning-Project
 # Exam 
 The exam consists in two assignments, one on the first part(regression, tree, neural nets) and the second part (unsupervised learning). For both you must prepare a writing report using one or more techniques and comparing their performance on one or more data set chosen by the student. A brief oral presentation of the reports will be asked. 
 Each report must contain:
 - short **abstract**: what are your going to present in the report
 - **statement** of the problem/**goal** of the analysis and description of the **data set**(s)
 - list of three to five **findings/keypoints**
 - the analysis with **wise** commentary
 - (optional) theoretical background of the used methods
 - **conclusions**(should include the findings/keypoints)
 - the **Appendix**, containing all the R code
 Notice:
 - The **paper length** is irrelevant provided that the content is correct.
 - **No R code in the main text**. The R code must be confined to the appendix
 - The report should be prepared in **PDF** only
--- a/Intelligence/Project/project-code.Rmd
+++ b/Intelligence/Project/project-code.Rmd
@ -0,0 +1,932 @@
 ---
 title: "Statistical Learning Project"
 output:
  html_document:
    df_print: paged
    toc: true
    toc_float: 
      collapsed: false
      smooth_scroll: false
  pdf_document:
    keep_tex: yes
    latex_engine: xelatex
    toc: true
 ---
 \ \
 <strong > Author: Andrea Ierardi </strong >
 \ \
 <strong > Repository </strong > : Code [link](https://github.com/Andreaierardi/Statistical-Learning-Project)
 \ \
 ### Assignment
 The exam consists in two assignments, one on the first part(regression, tree, neural nets) and the second part (unsupervised learning). For both you must prepare a writing report using one or more techniques and comparing their performance on one or more data set chosen by the student. 
 \ \
 # Libraries
 ```{r}
 library(knitr)
 library(ggplot2)
 library(plotly)
 library(tidyr)
 library(dplyr)
 library(png)
 library(ggpubr)
 library(tidyverse)
 library(caTools)
 library(caret)
 library(tree)
 library(MASS)
 library(randomForest)
 library(ranger)
 library(tuneRanger)
 library(keras)
 library(kableExtra)
 library(clustMixType)
 library(cluster)
 library(dendextend) 
 library(readr)
 library(factoextra)
 library(FactoMineR)
 library(PCAmixdata)
 ```
 \ \
 # Dataset 
 Link of the [dataset](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data)
 ```{r}
 ds = read.csv("AB_NYC_2019.csv")
 ```
 \ \
 # Data Inspection
 ```{r}
 head(ds)
 summary(ds)
 ```
 ```{r}
 cat("Shape of the dataset:" ,dim(ds))
 ```
 ```{r}
 img <- readPNG("map.png")
 map_ds = ggplot() + background_image(img)+ geom_point(data = ds,  aes(y=latitude,x = longitude, color = price)) 
 map_ds
 ```
 \ \ 
 # Data cleaning 
 \ \
 ## Check for NA and NULL values
 ```{r}
 apply(ds,2,function(x) sum(is.na(x)))
 ```
 \ \
 ## Variable selection
 ```{r}
 dataset = ds  %>% dplyr::select(neighbourhood_group,latitude, longitude, room_type,price)
 head(dataset)
 ```
 \ \
 ## Variable scaling
 ```{r}
 scale_data = function(df)
 {
  df = df %>%filter( price >= 15  & price <= 500)
  numerical = c("price")
  numerical2 = c("latitude","longitude")
  categorical = c("room_type", "neighbourhood_group")
  for( cat in categorical )
  {
    df[cat] = factor(df[[cat]], 
                     level = unique(df[[cat]]), 
                     labels = c(1:length(unique(df[[cat]])) ))
  }
  df[numerical] = as.numeric(scale(df[numerical]))
  df2 = df
  df2[numerical2] = as.numeric(scale(df2[numerical2]))
  df3 = list()
  df3$df2 = df2
  df3$df = df
  return(df3)
 }
 dataframe = scale_data(dataset)
 dataset = dataframe$df
 data = dataframe$df2
 head(dataset)
 summary(dataset)
 ```
 \ \
 ## Data visualisation after the scaling
 ```{r}
 mappa = ggplot() + background_image(img)+ geom_point(data = dataset,  aes(y=latitude,x = longitude, color = price)) 
 mappa
 ```
 \ \
 # Data split 
 ## Split data in subsets for each neighbourhood_group and room_type
 ```{r}
 neighbourhoods = unique(dataset$neighbourhood_group)
 rooms = unique(dataset$room_type)
 clust_data = vector("list")
 lis_n = vector("list")
 for (n in neighbourhoods)
 {
  tmp = dataset %>%filter( neighbourhood_group == n) 
  lis_n[[n]] = tmp[-1]
  tmp2 = data %>%filter( neighbourhood_group == n) 
  clust_data[[n]] = tmp2[-1]
  clust_data[[n]]$room_type = factor(clust_data[[n]]$room_type , level = unique(clust_data[[n]]$room_type) , labels= unique(ds$room_type))
 }
 lis_r_n= vector("list")
 for (n in neighbourhoods)
 {
  for(r in rooms)
  {
    tmp = dataset %>%
      filter( room_type == r  & neighbourhood_group == n) 
    lis_r_n[[paste0("n",n,"-","r",r)]]= tmp[-1][-3]
    tmp2 = data %>%
      filter( room_type == r  & neighbourhood_group == n) 
    clust_data[[paste0("n",n,"-","r",r)]]= tmp2[-1][-3]
  }
 }
 data$neighbourhood_group = factor(data$neighbourhood_group  , level = unique(data$neighbourhood_group ) , labels= unique(ds$neighbourhood_group)) 
 data$room_type = factor(data$room_type  , level = unique(data$room_type ) , labels= unique(ds$room_type)) 
 clust_data[["all"]] = data
 ```
 \ \
 ## Split in train and test for each subset
 ```{r}
 trains = vector("list")
 tests = vector("list")
 datas = vector("list")
 for (i in names(lis_n))
 {
  sample = sample.split(lis_n[[i]], SplitRatio = .75)
  train = subset(lis_n[[i]], sample == TRUE)
  test  = subset(lis_n[[i]], sample == FALSE)
  trains[[i]]=  train
  tests[[i]] = test
  datas[[i]] = lis_n[[i]]
 }
 for (i in names(lis_r_n))
 {
  sample = sample.split(lis_r_n[[i]], SplitRatio = .75)
  train = subset(lis_r_n[[i]], sample == TRUE)
  test  = subset(lis_r_n[[i]], sample == FALSE)
  trains[[i]]=  train
  tests[[i]] = test
  datas[[i]] = lis_r_n[[i]]
 }
 sample = sample.split(dataset, SplitRatio = .75)
 train = subset(dataset, sample == TRUE)
 test  = subset(dataset, sample == FALSE)
 trains[["all"]] = train
 tests[["all"]] = test
 datas[["all"]] = dataset
 ```
 \  \
 # MODELS TRAIN
 \ \
 ```{r}
 model_lis = vector("list")
 ```
 \ \
 ## LINEAR REGRESSION
 ```{r}
 lin_reg = vector("list")
 for (sub in names(trains))
 {
  lin_reg[[sub]]$fit =lm.fit = lm(price~., data = trains[[sub]])
  lin_reg[[sub]]$summary = summary(lm.fit)
  lin_reg[[sub]]$pred  = pr.lm = predict(lm.fit,tests[[sub]])
  lin_reg[[sub]]$MSE = sum((pr.lm - tests[[sub]]$price)^2)/nrow(tests[[sub]])
  print(paste0("========== ",sub, " =========="))
  print(summary(lm.fit))
  cat("\n\n")
 }
 lin_reg$name = "Linear Regression"
 model_lis$linear_regression=  lin_reg
 ```
 \ \
 ## DECISION  TREE
 ```{r}
 dec_tree = vector("list")
 for (sub in names(trains))
 {
  dec_tree[[sub]]$fit = tree_res=tree(price~., data = trains[[sub]])
  dec_tree[[sub]]$summary = sum = summary(tree_res)
  print(paste0("========== ",sub, " =========="))
  print(sum)
  if(sum$size > 1 )
  {
    plot(tree_res)
    text(tree_res,pretty=0)
    title(paste0("Tree of: ",sub))
  }
  else 
  {
    cat("Not possible to plot tree: ", sub)
  }
  dec_tree[[sub]]$pred  = pred = predict(tree_res,tests[[sub]])
  dec_tree[[sub]]$MSE = mse =  sum((pred - tests[[sub]]$price)^2)/nrow(tests[[sub]])
  print(mse)
  cat("\n\n")
 }
 dec_tree$name = "Decision Tree"
 model_lis$decision_tree = dec_tree
 ```
 \ \
 ## RANDOM FOREST
 ```{r}
 rf = vector("list")
 for (sub in names(trains))
 {
  rf[[sub]]$fit = res =  randomForest(  price ~ . , data=trains[[sub]])
  rf[[sub]]$pred = predt = predict(res,tests[[sub]])
  print(paste0("========== ",sub, " =========="))
  print(res)
  rf[[sub]]$MSE = mse = sum((predt - tests[[sub]]$price)^2)/nrow(tests[[sub]])
  print(paste0("MSE: ",mse))
  cat("\n\n")
 }
 rf$name = "Random Forest"
 model_lis$random_forest = rf
 ```
 \ \
 ## RANGER RANDOM FOREST
 ```{r}
 ranger_rf = vector("list")
 for (sub in names(trains))
 {
   print(paste0("========== ",sub, " =========="))
  ranger_rf[[sub]]$fit = res = ranger( price~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
  ranger_rf[[sub]]$pred = predt = predict(res,tests[[sub]])
  ranger_rf[[sub]]$MSE = mse =  sum((predt$predictions - tests[[sub]]$price)^2)/nrow(tests[[sub]])
  print(res)
  print(paste0("MSE: ",mse))
  cat("\n\n")
 }
 ranger_rf$name = "Ranger Random Forest"
 model_lis$ranger = ranger_rf
 ```
 \ \
 ## NEURAL NETWORKS
 ```{r}
 build_model <- function(dimension) {
  model <- keras::keras_model_sequential() %>%
    layer_dense(units = 32, activation = "relu",
                input_shape = dimension) %>%
    layer_dense(units = 16, activation = "relu") %>%
    layer_dense(units = 1,activation="linear")
  model %>% compile(
    loss = "mse",
    optimizer = optimizer_rmsprop(),
    metrics = list("mean_absolute_error")
  )
  return(model)
 }
 print_dot_callback <- callback_lambda(
  on_epoch_end = function(epoch, logs) {
    if (epoch %% 1 == 0) 
      cat(".")
  }
 )    
 nn = vector("list")
 for (sub in names(trains))
 {
  d = trains[[sub]]
  d2 = tests[[sub]]
  len = length(d)
  len2 = length(d2)
  if(!is.null(d$room_type))
  {
    d$room_type = keras::to_categorical(d$room_type)
    d2$room_type = keras::to_categorical(d2$room_type)
  }
  if(!is.null(d$neighbourhood_group)) 
  {
    d$neighbourhood_group = keras::to_categorical(d$neighbourhood_group)
    d2$neighbourhood_group = keras::to_categorical(d2$neighbourhood_group)
  }
  target = as.vector(d$price)
  features = as.matrix(as_tibble(d[-len]))
  target_test = as.vector(d2$price)
  features_test = as.matrix(as_tibble(d2[-len]))
  nn[[sub]]$epochs = epochs =  30
  nn[[sub]]$model = model =  build_model(dim(features)[2])
  nn[[sub]]$summary = model %>% summary()
  nn[[sub]]$history = hist =  model %>% fit(
    x = features,
    y = target,
    epochs = epochs,
    validation_split = 0.2,
    verbose = 0,
    callbacks = list(print_dot_callback)
  )
  eva = model %>% evaluate(features_test,target_test, verbose = 0)
  nn[[sub]]$mae = eva[1]
  nn[[sub]]$loss = eva[2]
  nn[[sub]]$pred = pred = model %>% predict(features_test)
  nn[[sub]]$MSE = sum((pred - target_test)^2)/length(target_test)
 }
 nn$name = "Neural Networks"
 model_lis$neural_networks = nn
 ```
 \ \
 ## NN plots
 ```{r}
 for (sub in names(nn))
 {
  if(sub != "name")
  {
    m = nn[[sub]]
    hist = m$history
    print(paste0("========== ",sub, " =========="))
    str = paste0("MAE: ",round(m$mae,3)," --- Loss: ",round(m$loss,3))
    p = plot(hist, y ~ x) +  theme_bw(base_size = 12)  +ggtitle(paste0("NN of: ",sub))+ labs(caption= str)
    print(p)
    plot(p)
    cat("MAE: ",m$mae)
    cat("\nLoss: ",m$loss,"\n\n")
  }
 }
 ```
 \  \
 # Comparison between the models
 \ \
 ```{r}
 conc = c()
 for (n in unique(ds$neighbourhood_group))
 {
  conc = c(conc,n) 
 }
 for (n in unique(ds$neighbourhood_group))
 {
  for(r in unique(ds$room_type))
  {
    conc = c(conc, paste(n,"-",r))
  }
 }
 conc = c(conc,"All")
 n = names(nn)
 cols = vector("list")
 cols[["Subset"]] = conc
 for( m in model_lis)
 {
  col = c()
  for( nam in n)
  {
    if( nam != "name")
    { 
      col = c(col,m[[nam]]$MSE)  
    }
  }
  cols[[m$name]] = col
 }
 mse_df = as.data.frame(cols)
 best = c()
 for (i in 1:length(cols$Subset))
 {
  m = min(mse_df[i,2:6])
  col = which(mse_df[i,1:6 ] == m)
  nam = names(mse_df)[col]
  best = c(best, nam)
 }
 mse_df$Best = best
 #kableExtra::kable(mse_df)%>%
 #  kable_styling(bootstrap_options = "striped", full_width = F,font_size = 20) %>%
 ##  row_spec(0, bold = T, color = "white", background = "#D7261E")%>%
 ##  column_spec(1, bold = T, border_right = T,color = "white", background = "#191970")%>%
 #  column_spec(2:6,extra_css="text-align:Center")
 mse_df
 ```
 \ \ 
 \ \
 # Clustering and Groups
 \ \
 ## Clustering for Mixed type of data
 ```{r}
 clust_num = 5
 get_clusters = function(dts, num, dim_plot,name)
 {
  l = list()
  if(is.null(dts$room_type))
  {
    l$cl = kmeans(dts,num)
  }
  else
  {
    l$cl = kproto(dts,num)
  }
  clust = list()
  for (i in 1:num)
  {
    indexes = l$cl$cluster == i
    clust[[i]] = dts[indexes,]
  }
  min_lat =  dim_plot[1]
  max_lat = dim_plot[2]
  min_long = dim_plot[3]
  max_long= dim_plot[4]
  myplot= ggplot() +  background_image(img)+  xlab('Longitude') +  ylab('Latitude')+ theme(plot.margin = unit(c(1,1,1,1),"cm"),legend.title = element_text(colour="blue", size=10, face="bold")) + xlim(min_long, max_long) + ylim(min_lat,max_lat) +ggtitle(paste0("Clusters of ", name  ))
  count = 1
  l$clust_plot = vector("list")
  for(el in clust)
  {
    myplot = myplot+ geom_point(data = el, aes(y = latitude, x = longitude),color= count)
    p =ggplot()+ background_image(img)+geom_point(data = el, aes(y = latitude, x = longitude),color= count) + xlim(min_long, max_long) + ylim(min_lat,max_lat) + ggtitle(paste0("Cluster: ", count," of ", name  ))
    l$clust_plot[[as.character(count)]] = p
    l$summary[[as.character(count)]] = summary(el)
    count= count+1
  }
  l$myplot = myplot
  return(l)
 }
 get_all_cluster = function(clust_data, clust_num, dim)
 {
  lis = vector("list")
  plots = vector("list")
  cnt= 1
  for(sub in names(clust_data))
  {
    lis[[sub]] = get_clusters(clust_data[[sub]], clust_num, dim, sub)
    plots[[cnt]] = lis[[sub]]$myplot
    cnt = cnt+1
  }
  lis[["all_plots"]]= plots
  return(lis)  
 }
 borders = c(min(clust_data$all$latitude) ,max(clust_data$all$latitude), min(clust_data$all$longitude), max(clust_data$all$longitude))
 all_cluster = get_all_cluster(clust_data,5, borders)
 ```
 ```{r}
 multiplots <- function(plotlist, file=NULL,cols = 2, layout = NULL) {
  require(grid)
  plots <- c(plotlist)
  numPlots = length(plots)
  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols),byrow =T)
  }
  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in 1:numPlots) {
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
 }
 all_cluster$all_plots[[21]]
 all_cluster$all$summary
 ```
 ```{r}
 multiplots(all_cluster$all_plot[1:5], cols=3)
 ```
 ```{r}
 multiplots(all_cluster$all_plot[6:11], cols=3)
 ```
 ```{r}
 multiplots(all_cluster$all_plot[12:17], cols=3)
 ```
 ```{r}
 multiplots(all_cluster$all_plot[c(18:20,22,23)], cols=3)
 ```
 \ \
 ## Hierarchical Cluster Analysis
 ```{r}
 agg = aggregate(price ~neighbourhood_group+room_type, clust_data$all , mean)
 name_hc = c()
 for (n1 in substr(unique(agg$neighbourhood_group),1,5))
 {
  for(n2 in substr(unique(agg$room_type),1,3))
  {
    name_hc = c(name_hc, paste0(n1,"/",n2))
  }
 }
 rownames(agg) = name_hc
 agg
 gower <- daisy(agg, metric = "gower")
 hc1 <- hclust(gower, method = "complete" )
 plot(hc1, cex = 0.6, hang = -1)
 ```
 ```{r}
 avg_dend_obj <- as.dendrogram(hc1)
 avg_col_dend <- color_branches(avg_dend_obj, h = 0.6)
 plot(avg_col_dend)
 ```
 ```{r}
 agg = aggregate(price ~neighbourhood_group, clust_data$all , mean)
 agg
 rownames(agg) = c("Brooklyn","Manhattan",
                  "Queens","Staten Island", "Bronx")
 agg$neighbourhood_group = NULL
 gower <- daisy(agg, metric = "gower")
 hc1 <- hclust(gower, method = "complete" )
 plot(hc1, cex = 0.6, hang = -1)
 ```
 \    \
 # Principal Component Analysis
 \ \
 ## PCAmixdata
 ```{r}
 ## Split mixed dataset into quantitative and qualitative variables
 #split <- splitmix(dataset[1:5])
 split = splitmix(clust_data$all)
 ## PCA
 res.pcamix <- PCAmix(X.quanti=split$X.quanti,  
                     X.quali=split$X.quali, 
                     rename.level=TRUE)
 res.pcamix
 ```
 ```{r}
 ## Inspect principal components
 res.pcamix$eig
 ```
 ```{r}
 res.pcamix$quanti.cor
 res.pcamix$quali.eta2
 ```
 \ \
 ##  Factor Analysis of Mixed Data (FAMD)
 FAMD (base, ncp = 5, sup.var = NULL, ind.sup = NULL, graph = TRUE)
 - base : a data frame with n rows (individuals) and p columns (variables).
 - ncp: the number of dimensions kept in the results (by default 5)
 - sup.var: a vector indicating the indexes of the supplementary variables.
 - ind.sup: a vector indicating the indexes of the supplementary individuals.
 - graph : a logical value. If TRUE a graph is displayed.
 ```{r}
 res.famd <- FAMD(clust_data$all, graph = F, ncp = 5)
 print(res.famd)
 ```
 ```{r}
 eig.val <- get_eigenvalue(res.famd)
 head(eig.val)
 fviz_screeplot(res.famd)
 ```
 ```{r}
 quanti.var <- get_famd_var(res.famd, "quanti.var")
 quanti.var
 fviz_famd_var(res.famd, "quanti.var", col.var = "contrib", 
              gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
              repel = TRUE)
 ```
 ```{r}
 quali.var <- get_famd_var(res.famd, "quali.var")
 quali.var 
 fviz_famd_var(res.famd, "quali.var", col.var = "contrib", 
              gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
 )
 ```
 ```{r}
 var <- get_famd_var(res.famd)
 var
 ```
 ```{r}
 # Coordinates of variables
 head(var$coord)
 # Cos2: quality of representation on the factore map
 head(var$cos2)
 # Contributions to the  dimensions
 head(var$contrib)
 ```
 ```{r}
 # Plot of variables
 fviz_famd_var(res.famd, repel = TRUE)
 # Contribution to the first dimension
 fviz_contrib(res.famd, "var", axes = 1)
 # Contribution to the second dimension
 fviz_contrib(res.famd, "var", axes = 2)
 # Contribution to the third dimension
 fviz_contrib(res.famd, "var", axes = 3)
 # Contribution to the forth dimension
 fviz_contrib(res.famd, "var", axes = 4)
 # Contribution to the fifth dimension
 fviz_contrib(res.famd, "var", axes = 5)
 ```
--- a/Intelligence/Project/report_andreaierardi.pdf
+++ b/Intelligence/Project/report_andreaierardi.pdf
--- a/Intelligence/Project/repository_link
+++ b/Intelligence/Project/repository_link