source("https://raw.githubusercontent.com/MatteoFasulo/Rare-Earth/main/R/util/coreFunctions.R")
loadPackages(c('sn','tidyverse','psych','RColorBrewer','stargazer','mclust','ContaminatedMixt',
'plotly','ggplot2','ggdendro','teigen','tclust','HDMD','caTools','clustvarsel',
'vscc','sparcl','pgmm','caret','glmnet','MLmetrics'))
load("Z:\\DesktopC\\LUMSA\\2\\Data Mining\\Finite Mixture\\FiniteMixtureL31.RData")
#load("H:\\smoxy\\Downloads\\FiniteMixtureL31.RData")
rm(CO2data)
rm(NOdata)
rm(tonedata)
type <- wine$Type
rm(wine)
Data on 27 chemical/physical characteristics of three different types of wine (Barolo, Grignolino, Barbera) from Piedmont. A data set with 178 observations and 28 variables (the first of which records the wine type). In order:
data(wines)
wines
Through the original study it was possible to trace the year of observation of each wine. The observations of the three wine types over the years are reported below:
year <- as.numeric(substr(rownames(wines), 6, 7))
table(wines$wine, year)
year
70 71 72 73 74 75 76 78 79
Barolo 0 19 0 20 20 0 0 0 0
Grignolino 9 9 7 9 16 9 12 0 0
Barbera 0 0 0 0 9 0 5 29 5
#wines[,'wine'] <- type
We immediately notice that Barbera is concentrated mainly in the later years (76, 78, 79), while Barolo appears in 71, 73 and 74. As for the percentage of each class:
wines %>%
count(wine = factor(wine)) %>%
mutate(pct = prop.table(n)) %>%
ggplot(aes(x = wine, y = pct, fill = wine, label = scales::percent(pct))) +
geom_col(position = 'dodge') +
geom_text(position = position_dodge(width = .9),
vjust = -0.5,
size = 3) +
scale_y_continuous(name = "Percentage")+
scale_x_discrete(name = "Wine Name")+
scale_fill_hue(l=40, c=35)+
theme(legend.position = "none")
Grignolino is clearly the most numerous (39.9%), followed by Barolo (33.1%) and Barbera (27.0%).
To represent the dispersion of the data we used a scatterplot matrix slightly different from the usual one. The upper triangle shows the distribution of the data, while the lower triangle reports the Pearson correlation between the variables.
<- c("#00AFBB", "#E7B800", "#FC4E07")
my_cols <- function(x, y){
panel.cor <- par("usr"); on.exit(par(usr))
usr par(usr = c(0, 1, 0, 1))
<- round(cor(x, y), digits=2)
r <- paste0("R = ", r)
txt text(0.5, 0.5, txt, cex = 1)
}<-function(x, y){
upper.panelpoints(x,y, pch = 19, col = my_cols[wines$wine],cex=.5)
}pairs(wines[,2:8],
lower.panel = panel.cor,
upper.panel = upper.panel)
From the first 7 variables we can note a correlation of 0.69 between acidity and malic and, unsurprisingly, an inversely proportional relationship between pH and acidity.
To display the descriptive statistics (mean and standard deviation) we found it appropriate to split them by class:
printMeanAndSdByGroup <- function(variables, groupvariable)
{
  variablenames <- c(names(groupvariable), names(as.data.frame(variables)))
  groupvariable <- groupvariable[,1]
  means <- aggregate(as.matrix(variables) ~ groupvariable, FUN = mean)
  names(means) <- variablenames
  print(paste("Means:"))
  print(means)
  sds <- aggregate(as.matrix(variables) ~ groupvariable, FUN = sd)
  names(sds) <- variablenames
  print(paste("Standard deviations:"))
  print(sds)
}
printMeanAndSdByGroup(wines[2:28], wines[1])
[1] "Means:"
[1] "Standard deviations:"
Some remarks:
We computed the within-group variance between a feature and the wine types:
calcWithinGroupsVariance <- function(variable, groupvariable)
{
  groupvariable2 <- as.factor(groupvariable[[1]])
  levels <- levels(groupvariable2)
  numlevels <- length(levels)
  numtotal <- 0
  denomtotal <- 0
  for (i in 1:numlevels)
  {
    leveli <- levels[i]
    levelidata <- variable[groupvariable == leveli,]
    levelilength <- length(levelidata)
    sdi <- sd(levelidata)
    # accumulate (n_g - 1) * s_g^2 and the group sizes
    numi <- (levelilength - 1) * (sdi * sdi)
    denomi <- levelilength
    numtotal <- numtotal + numi
    denomtotal <- denomtotal + denomi
  }
  # pooled within-group variance
  Vw <- numtotal / (denomtotal - numlevels)
  return(Vw)
}
calcWithinGroupsVariance(wines["flavanoids"], wines[1])
[1] 0.2747075
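As a quick sanity check, the pooled within-group variance can also be computed directly with base R. A minimal sketch of our own (it assumes the wines data frame loaded above and is not part of the original code):

vals <- wines$flavanoids
grp <- factor(wines$wine)
# pooled within-group variance: sum of (n_g - 1) * var_g over groups, divided by N - k
ssw <- sum(tapply(vals, grp, function(x) (length(x) - 1) * var(x)))
ssw / (length(vals) - nlevels(grp)) # should reproduce the 0.2747 above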
The same applies to the between-group variance between a feature and the wines:
calcBetweenGroupsVariance <- function(variable, groupvariable)
{
  groupvariable2 <- as.factor(groupvariable[[1]])
  levels <- levels(groupvariable2)
  numlevels <- length(levels)
  grandmean <- sapply(variable, mean)
  numtotal <- 0
  denomtotal <- 0
  for (i in 1:numlevels)
  {
    leveli <- levels[i]
    levelidata <- variable[groupvariable == leveli,]
    levelilength <- length(levelidata)
    meani <- mean(levelidata)
    sdi <- sd(levelidata)
    # accumulate n_g * (mean_g - grand mean)^2 and the group sizes
    numi <- levelilength * ((meani - grandmean)^2)
    denomi <- levelilength
    numtotal <- numtotal + numi
    denomtotal <- denomtotal + denomi
  }
  # between-group variance
  Vb <- numtotal / (numlevels - 1)
  Vb <- Vb[[1]]
  return(Vb)
}
calcBetweenGroupsVariance(wines["flavanoids"], wines[1])
[1] 64.2612
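The between-group variance admits the same kind of base R spot check; again a sketch of our own, not the original code:

grand <- mean(wines$flavanoids)
# weighted sum of squared deviations of the group means from the grand mean, over k - 1
ssb <- sum(tapply(wines$flavanoids, factor(wines$wine),
                  function(x) length(x) * (mean(x) - grand)^2))
ssb / (nlevels(factor(wines$wine)) - 1) # should reproduce the 64.26 above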
To see which variables yield the greatest separation (the ratio of between-group to within-group variance), we wrote a dedicated function to compute this value for every feature.
calcSeparations <- function(variables, groupvariable)
{
  # find out how many variables we have
  variables <- as.data.frame(variables)
  numvariables <- length(variables)
  # find the variable names
  variablenames <- colnames(variables)
  # calculate the separation for each variable
  Vw <- NULL
  Vb <- NULL
  sep <- NULL
  for (i in 1:numvariables)
  {
    variablei <- variables[i]
    variablename <- variablenames[i]
    Vw[i] <- calcWithinGroupsVariance(variablei, groupvariable)
    Vb[i] <- calcBetweenGroupsVariance(variablei, groupvariable)
    sep[i] <- Vb[i]/Vw[i]
  }
  result <- data.frame('Within' = Vw, 'Between' = Vb, 'Sep' = sep,
                       row.names = as.vector(colnames(wines[,-1])),
                       stringsAsFactors = F)
  result[order(result$Sep, decreasing = T),]
}
calcSeparations(wines[2:28], wines[1])
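For a single variable, the ratio Vb/Vw defined above coincides with the F statistic of a one-way ANOVA, which offers an independent cross-check. A minimal sketch (our addition, not from the original study):

# the F value should match the 'Sep' entry for flavanoids (64.2612 / 0.2747075, about 234)
summary(aov(flavanoids ~ wine, data = wines))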
For some of the subsequent analyses we changed the task of the study, attempting to use a model as a classifier and to measure its classification performance. To this end, we split the original dataset into two subsets:
require(caTools)
sample = sample.split(wines[,1], SplitRatio = .50)
train = subset(wines, sample == TRUE)
trainTestNames <- train$wine
print(paste("Train Obs:", nrow(train)))
[1] "Train Obs: 90"
#train$wine <- as.numeric(train$wine)
test = subset(wines, sample == FALSE)
wineTestNames <- test$wine
print(paste("Test Obs:", nrow(test)))
[1] "Test Obs: 88"
#test$wine <- as.numeric(test$wine)
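Since sample.split draws the (stratified) split at random, the two subsets change from run to run; fixing a seed makes them reproducible. A minimal sketch, where the seed value 42 is arbitrary and was not part of the original run:

set.seed(42) # arbitrary seed, only for reproducibility
sample <- sample.split(wines[,1], SplitRatio = .50)
table(wines$wine, sample) # each wine type is split roughly in half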
For the clustering step we decided to apply:
dissMatrix <- pairwise.mahalanobis(wines[,-1],
                                   grouping = c(1:nrow(wines)),
                                   cov = cov(wines[,-1]))$distance
dissMatrix <- sqrt(dissMatrix)
dissMatrix <- as.dist(dissMatrix)
combDist <- function(distance, methods, df, dt, dissMatrix) {
  c <- 0
  results <- list()
  for (i in 1:length(distance)){
    # pick the dissimilarity: Minkowski with p = 4, the precomputed
    # Mahalanobis matrix, or a standard dist() call
    ifelse(distance[i] == "minkowski",
           dist <- dist(df, method = distance[i], p = 4),
           ifelse(distance[i] == "mahalanobis",
                  dist <- dissMatrix,
                  dist <- dist(df, method = distance[i])))
    for (j in 1:length(methods)){
      dendo = hclust(dist, method = methods[j])
      dendo.x = ggdendrogram(dendo, rotate = F, size = 2, leaf_labels = T, labels = F) +
        ggtitle(paste(distance[i], ' ', methods[j], sep = ''))
      for (elem in 2:4){
        cluster = cutree(dendo, k = elem)
        c <- c + 1
        results[[c]] <- list(distance = distance[i],
                             method = methods[j],
                             groups = elem,
                             table = table(dt, cluster),
                             dendo = dendo.x,
                             AdjustedRandIndex = adjustedRandIndex(dt, cluster),
                             cluster = cluster)
      }
    }
  }
  return(results)
}
results <- combDist(c("euclidean", "manhattan", "minkowski", "mahalanobis"),
                    c("single", "complete", "average", "ward.D"),
                    scale(wines[,-1]), wines[,1], dissMatrix)
optimal <- function(results){
  best_randIndex.eu = 0
  best_randIndex.ma = 0
  best_randIndex.mi = 0
  best_randIndex.maha = 0
  best_model.eu = integer()
  best_model.ma = integer()
  best_model.mi = integer()
  best_model.maha = integer()
  for (i in 1:length(results)){
    current_randIndex = results[[i]]$AdjustedRandIndex
    if (results[[i]]$distance == "euclidean"){
      if (current_randIndex > best_randIndex.eu) {
        best_randIndex.eu = current_randIndex
        best_model.eu = i
      }
    } else if (results[[i]]$distance == "manhattan"){
      if (current_randIndex > best_randIndex.ma) {
        best_randIndex.ma = current_randIndex
        best_model.ma = i
      }
    } else if (results[[i]]$distance == "minkowski"){
      if (current_randIndex > best_randIndex.mi) {
        best_randIndex.mi = current_randIndex
        best_model.mi = i
      }
    } else if (results[[i]]$distance == "mahalanobis"){
      if (current_randIndex > best_randIndex.maha) {
        best_randIndex.maha = current_randIndex
        best_model.maha = i
      }
    }
  }
  #print(list(euclidean = list(model.number = best_model.eu,
  #                            cluster = results[[best_model.eu]]$groups,
  #                            AdjustedRandIndex = best_randIndex.eu),
  #           manhattan = list(model.number = best_model.ma,
  #                            cluster = results[[best_model.ma]]$groups,
  #                            AdjustedRandIndex = best_randIndex.ma),
  #           minkowski = list(model.number = best_model.mi,
  #                            cluster = results[[best_model.mi]]$groups,
  #                            AdjustedRandIndex = best_randIndex.mi),
  #           mahalanobis = list(model.number = best_model.maha,
  #                              cluster = results[[best_model.maha]]$groups,
  #                              AdjustedRandIndex = best_randIndex.maha))
  #      )
  return(list(euclidean = results[[best_model.eu]],
              manhattan = results[[best_model.ma]],
              minkowski = results[[best_model.mi]],
              mahalanobis = results[[best_model.maha]]))
}
best_dist_model = optimal(results)
ggplotly(best_dist_model$euclidean$dendo)
print(best_dist_model$euclidean$table)
cluster
dt 1 2 3
Barolo 55 4 0
Grignolino 6 61 4
Barbera 0 0 48
print(paste("AdjustedRandIndex:",round(best_dist_model$euclidean$AdjustedRandIndex,3)))
[1] "AdjustedRandIndex: 0.769"
ggplotly(best_dist_model$manhattan$dendo)
print(best_dist_model$manhattan$table)
cluster
dt 1 2 3
Barolo 59 0 0
Grignolino 3 68 0
Barbera 0 0 48
print(paste("AdjustedRandIndex:",round(best_dist_model$manhattan$AdjustedRandIndex,3)))
[1] "AdjustedRandIndex: 0.946"
ggplotly(best_dist_model$minkowski$dendo)
print(best_dist_model$minkowski$table)
cluster
dt 1 2 3 4
Barolo 55 2 2 0
Grignolino 8 1 59 3
Barbera 2 0 0 46
print(paste("AdjustedRandIndex:",round(best_dist_model$minkowski$AdjustedRandIndex,3)))
[1] "AdjustedRandIndex: 0.73"
ggplotly(best_dist_model$mahalanobis$dendo)
print(best_dist_model$mahalanobis$table)
cluster
dt 1 2
Barolo 52 7
Grignolino 57 14
Barbera 5 43
print(paste("AdjustedRandIndex:",round(best_dist_model$mahalanobis$AdjustedRandIndex,3)))
[1] "AdjustedRandIndex: 0.27"
require(cluster)
k.means.3 <- kmeans(scale(wines[,-1]), centers = 3, nstart = 50, iter.max = 100)
table(wines[,1], k.means.3$cluster)
1 2 3
Barolo 58 1 0
Grignolino 2 68 1
Barbera 0 0 48
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(k.means.3$cluster, wines[,1]),3)))
[1] "AdjustedRandIndex: 0.93"
require(cluster)
k.means.4 <- kmeans(scale(wines[,-1]), centers = 4, nstart = 50, iter.max = 100)
table(wines[,1], k.means.4$cluster)
1 2 3 4
Barolo 0 0 59 0
Grignolino 0 38 5 28
Barbera 47 0 0 1
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(k.means.4$cluster, wines[,1]),3)))
[1] "AdjustedRandIndex: 0.736"
require(cluster)
PAM.3 <- pam(wines[,-1], k = 3,
             metric = "euclidean",
             nstart = 50,
             stand = TRUE)
table(wines[,1], PAM.3$clustering)
1 2 3
Barolo 50 9 0
Grignolino 3 67 1
Barbera 1 1 46
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(PAM.3$clustering, wines[,1]),3)))
[1] "AdjustedRandIndex: 0.754"
require(cluster)
PAM.4 <- pam(wines[,-1], k = 4,
             metric = "euclidean",
             nstart = 50,
             stand = TRUE)
table(wines[,1], PAM.4$clustering)
1 2 3 4
Barolo 28 27 4 0
Grignolino 3 0 67 1
Barbera 1 0 1 46
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(PAM.4$clustering, wines[,1]),3)))
[1] "AdjustedRandIndex: 0.728"
Novacula Occami: frustra fit per plura quod potest fieri per pauciora (Occam's razor: it is futile to do with more what can be done with less). This methodological principle is considered a foundation of modern scientific thought.
1 2 3 4
Barolo 47 10 2 0
Grignolino 6 3 61 1
Barbera 0 0 1 47
[1] "AdjustedRandIndex: 0.734"
1 2 3 4
Barolo 47 10 2 0
Grignolino 6 3 61 1
Barbera 0 0 1 47
[1] "AdjustedRandIndex: 0.734"
table(wines[,1], vscc.mclust$initialrun$classification) #Clustering results on full data set
1 2 3
Barolo 58 1 0
Grignolino 1 70 0
Barbera 0 2 46
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(vscc.mclust$initialrun$classification, wines[,1]),3)))
[1] "AdjustedRandIndex: 0.931"
table(wines[,1], vscc.mclust$bestmodel$classification) #Clustering results on reduced data set
1 2 3
Barolo 59 0 0
Grignolino 3 66 2
Barbera 0 0 48
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(vscc.mclust$bestmodel$classification, wines[,1]),3)))
[1] "AdjustedRandIndex: 0.913"
1 2 3
Barolo 46 13 0
Grignolino 1 20 50
Barbera 0 29 19
[1] "AdjustedRandIndex: 0.371"
1 2 3 4
Barolo 30 23 6 0
Grignolino 4 0 22 45
Barbera 5 0 31 12
[1] "AdjustedRandIndex: 0.303"
The variables selected by the clustvarsel function are:
The variables selected by the vscc function are:
table(wines[,1],mixt.selected.wines$classification)
1 2 3 4
Barolo 59 0 0 0
Grignolino 8 50 11 2
Barbera 0 0 0 48
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(wines[,1],mixt.selected.wines$classification),3)))
[1] "AdjustedRandIndex: 0.745"
table(wines[,1],getBestModel(cn.wines.mixt, criterion = 'ICL')$models[[1]]$group)
1 2 3
Barolo 0 0 59
Grignolino 3 63 5
Barbera 48 0 0
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(wines[,1],getBestModel(cn.wines.mixt, criterion = 'ICL')$models[[1]]$group),3)))
[1] "AdjustedRandIndex: 0.864"
table(wines[,1],getBestModel(cn.wines.kmeans, criterion = 'ICL')$models[[1]]$group)
1 2 3
Barolo 0 0 59
Grignolino 63 3 5
Barbera 0 48 0
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(wines[,1],getBestModel(cn.wines.kmeans, criterion = 'ICL')$models[[1]]$group),3)))
[1] "AdjustedRandIndex: 0.864"
table(wines[,1],getBestModel(cn.wines.rpost, criterion = 'ICL')$models[[1]]$group)
1 2 3
Barolo 59 0 0
Grignolino 5 3 63
Barbera 0 48 0
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(wines[,1],getBestModel(cn.wines.rpost, criterion = 'ICL')$models[[1]]$group),3)))
[1] "AdjustedRandIndex: 0.864"
table(wines[,1],getBestModel(cn.wines.rclass, criterion = 'ICL')$models[[1]]$group)
1 2 3
Barolo 0 0 59
Grignolino 62 3 6
Barbera 0 48 0
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(wines[,1],getBestModel(cn.wines.rclass, criterion = 'ICL')$models[[1]]$group),3)))
[1] "AdjustedRandIndex: 0.847"
table(wines[,1],teigen.kmeans$classification)
1 2 3 4
Barolo 58 0 1 0
Grignolino 1 1 69 0
Barbera 0 25 0 23
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(wines[,1],teigen.kmeans$classification),3)))
[1] "AdjustedRandIndex: 0.865"
table(wines[,1],teigen.hard$classification)
1 2 3 4
Barolo 1 0 0 58
Grignolino 69 1 0 1
Barbera 0 25 23 0
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(wines[,1],teigen.hard$classification),3)))
[1] "AdjustedRandIndex: 0.865"
table(wines[,1],teigen.soft$classification)
1 2 3 4
Barolo 58 0 1 0
Grignolino 1 1 69 0
Barbera 0 25 0 23
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(wines[,1],teigen.soft$classification),3)))
[1] "AdjustedRandIndex: 0.865"
table(test[,1],teigen.pre.kmeans$classification)
1 2 3
Barolo 28 1 0
Grignolino 3 31 1
Barbera 0 0 24
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(test[,1],teigen.pre.kmeans$classification),3)))
[1] "AdjustedRandIndex: 0.827"
teigen.classifier.uniform <- teigen(train[,-1], Gs=3:4, init = 'uniform', scale = T, known = train[,1], verbose = F)
teigen.pre.uniform = predict(teigen.classifier.uniform, test[,-1])
table(test[,1],teigen.pre.uniform$classification)
1 2 3
Barolo 28 1 0
Grignolino 3 31 1
Barbera 0 0 24
print(paste("AdjustedRandIndex:",round(adjustedRandIndex(test[,1],teigen.pre.uniform$classification),3)))
[1] "AdjustedRandIndex: 0.827"
For our analysis we wanted to assess the effectiveness of three well-known regularization models through the caret package:
lambda <- 10^seq(0, -2, length = 250)
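The grid contains 250 log-spaced values running from \(10^{0} = 1\) down to \(10^{-2} = 0.01\); a quick check of its endpoints:

length(lambda) # 250
range(lambda) # 0.01 1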
The Ridge model shrinks the coefficients, so that the variables contributing less to the outcome have coefficients close to zero. Instead of forcing them to be exactly zero (as in the Lasso), we penalize them with a term called the L2 norm, thereby forcing them to be small. In this way we reduce the complexity of the model, without eliminating any variable, through a penalty constant called lambda (\(\lambda\)): \[ L_{ridge}(\hat{\beta}) = \sum_{i = 1}^{n}{(y_i - x_i\hat{\beta})^2} + \lambda\sum_{k = 1}^{K}{\hat{\beta}_k^2} \]
# Build the model
set.seed(123)
ridge <- caret::train(
x = train[,-1],
y = factor(train[,1]),
method = "glmnet",
trControl = trainControl("cv", number = 10, classProbs = TRUE, summaryFunction = multiClassSummary),
tuneGrid = expand.grid(alpha = 0, lambda=lambda),
metric="Accuracy")
# Model coefficients
coef(ridge$finalModel, ridge$bestTune$lambda)
$Barolo
28 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) -5.773974e+00
alcohol 1.601221e-01
sugar 2.847696e-02
acidity -2.776960e-03
tartaric -1.009434e-01
malic -1.734279e-02
uronic -1.523725e-01
pH 6.403133e-02
ash 1.990303e-01
alcal_ash -3.323135e-02
potassium 2.699287e-04
calcium -1.600088e-03
magnesium 5.805483e-03
phosphate 9.690038e-04
chloride 1.129685e-04
phenols 1.263709e-01
flavanoids 1.177827e-01
nonflavanoids -5.550151e-01
proanthocyanins 1.462032e-01
colour 1.013349e-02
hue 2.626132e-01
OD_dw 1.388700e-01
OD_fl 7.422181e-02
glycerol 5.042171e-02
butanediol -2.132007e-05
nitrogen 8.420899e-04
proline 4.925977e-04
methanol 4.285249e-04
$Grignolino
28 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 6.787280e+00
alcohol -1.890888e-01
sugar -3.224553e-02
acidity -1.772993e-03
tartaric -5.621554e-02
malic -7.762842e-02
uronic -2.237885e-01
pH 1.351516e-01
ash -3.611891e-01
alcal_ash 8.905963e-03
potassium -4.815379e-04
calcium 3.291712e-03
magnesium -7.495180e-03
phosphate -8.798840e-04
chloride 4.059303e-04
phenols 3.362762e-03
flavanoids 1.053689e-03
nonflavanoids 6.151024e-02
proanthocyanins -4.543428e-02
colour -5.651463e-02
hue 2.889793e-01
OD_dw 2.124811e-02
OD_fl 5.477169e-02
glycerol -5.802764e-02
butanediol -3.782010e-04
nitrogen 9.400765e-07
proline -4.114483e-04
methanol -1.608830e-03
$Barbera
28 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) -1.013307e+00
alcohol 2.896670e-02
sugar 3.768566e-03
acidity 4.549953e-03
tartaric 1.571589e-01
malic 9.497122e-02
uronic 3.761610e-01
pH -1.991830e-01
ash 1.621588e-01
alcal_ash 2.432538e-02
potassium 2.116092e-04
calcium -1.691623e-03
magnesium 1.689697e-03
phosphate -8.911989e-05
chloride -5.188988e-04
phenols -1.297336e-01
flavanoids -1.188364e-01
nonflavanoids 4.935049e-01
proanthocyanins -1.007690e-01
colour 4.638115e-02
hue -5.515925e-01
OD_dw -1.601181e-01
OD_fl -1.289935e-01
glycerol 7.605930e-03
butanediol 3.995211e-04
nitrogen -8.430300e-04
proline -8.114934e-05
methanol 1.180305e-03
# Make predictions
predictions.ridge <- ridge %>% predict(test)
# Model prediction performance
tibble(
trueValue = wineTestNames,
predictedValue = predictions.ridge)
Ridge consists of the residual sum of squares plus a penalty, defined by the letter lambda, which multiplies the sum of the squared coefficients \(\beta\). When \(\lambda = 0\), the penalty term has no effect and ridge yields the classical least-squares coefficients. As \(\lambda\) grows toward infinity, however, the impact of the penalty increases and the coefficients approach zero. Ridge is particularly suitable for multivariate data where the number of features exceeds the number of observations. The drawback is that it keeps all the features in the final model, unlike feature-selection methods, which generally retain a reduced subset of the available variables.
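To visualize this shrinkage, the same multinomial ridge can be fitted directly with glmnet and its coefficient paths plotted against lambda. A minimal sketch (a companion fit of our own, not the caret model above):

ridge.path <- glmnet(as.matrix(train[,-1]), factor(train[,1]),
                     alpha = 0, family = "multinomial")
plot(ridge.path, xvar = "lambda") # all coefficients shrink toward 0 as lambda grows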
caret::confusionMatrix(predictions.ridge, test$wine)
Confusion Matrix and Statistics
Reference
Prediction Barolo Grignolino Barbera
Barolo 29 2 0
Grignolino 0 33 2
Barbera 0 0 22
Overall Statistics
Accuracy : 0.9545
95% CI : (0.8877, 0.9875)
No Information Rate : 0.3977
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9309
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Barolo Class: Grignolino Class: Barbera
Sensitivity 1.0000 0.9429 0.9167
Specificity 0.9661 0.9623 1.0000
Pos Pred Value 0.9355 0.9429 1.0000
Neg Pred Value 1.0000 0.9623 0.9697
Prevalence 0.3295 0.3977 0.2727
Detection Rate 0.3295 0.3750 0.2500
Detection Prevalence 0.3523 0.3977 0.2500
Balanced Accuracy 0.9831 0.9526 0.9583
The Least Absolute Shrinkage and Selection Operator (LASSO) shrinks the coefficients toward zero by penalizing the model with a term called the L1 norm, the sum of the absolute values of the coefficients: \[ L_{lasso}(\hat{\beta}) = \sum_{i = 1}^{n}{(y_i - x_i\hat{\beta})^2} + \lambda\sum_{k = 1}^{K}{|\hat{\beta}_k|} \]
# Build the model
set.seed(123)
lasso <- caret::train(
x = train[,-1],
y = factor(train[,1]),
method = "glmnet",
trControl = trainControl("cv", number = 10, classProbs = TRUE, summaryFunction = multiClassSummary),
tuneGrid = expand.grid(alpha = 1, lambda=lambda),
metric="Accuracy")
# Model coefficients
coef(lasso$finalModel, lasso$bestTune$lambda)
$Barolo
28 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) -11.849841488
alcohol .
sugar .
acidity .
tartaric .
malic .
uronic .
pH .
ash .
alcal_ash .
potassium .
calcium .
magnesium .
phosphate .
chloride .
phenols .
flavanoids 0.174755104
nonflavanoids .
proanthocyanins .
colour .
hue .
OD_dw 0.729463141
OD_fl .
glycerol .
butanediol .
nitrogen .
proline 0.004795062
methanol .
$Grignolino
28 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 1.461681e+01
alcohol -9.204137e-01
sugar .
acidity .
tartaric .
malic -6.597026e-02
uronic .
pH .
ash -1.578711e+00
alcal_ash .
potassium -1.607104e-06
calcium .
magnesium .
phosphate .
chloride .
phenols .
flavanoids .
nonflavanoids .
proanthocyanins .
colour -4.840949e-01
hue .
OD_dw .
OD_fl .
glycerol -2.330578e-01
butanediol .
nitrogen .
proline .
methanol .
$Barbera
28 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) -2.76696471
alcohol .
sugar .
acidity .
tartaric .
malic 0.19468148
uronic 0.97246611
pH .
ash .
alcal_ash .
potassium .
calcium .
magnesium .
phosphate .
chloride .
phenols .
flavanoids -0.80529767
nonflavanoids .
proanthocyanins .
colour 0.02919194
hue -3.08136966
OD_dw .
OD_fl -0.25692597
glycerol .
butanediol .
nitrogen .
proline .
methanol .
# Make predictions
predictions.lasso <- lasso %>% predict(test)
# Model prediction performance
tibble(
trueValue = wineTestNames,
predictedValue = predictions.lasso)
In this case the penalty has the effect of forcing some of the coefficient estimates, those contributing less to the model, to be exactly zero. The lasso can therefore also be seen as an alternative to feature-selection methods, performing variable selection in order to reduce the complexity of the model.
As with ridge, it is crucial to select a good value of \(\lambda\).
When lambda is small, the result is very close to the least-squares estimate. As lambda increases, shrinkage occurs, so that the variables whose coefficients are at zero can be discarded.
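This growing sparsity is easy to inspect with a direct glmnet fit, whose printout reports the number of non-zero coefficients (Df) at each value of lambda along the path. A minimal sketch (again a companion fit of our own, not the caret model):

lasso.path <- glmnet(as.matrix(train[,-1]), factor(train[,1]),
                     alpha = 1, family = "multinomial")
print(lasso.path) # columns: Df, %Dev, Lambda; Df falls as Lambda grows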
caret::confusionMatrix(predictions.lasso, test$wine)
Confusion Matrix and Statistics
Reference
Prediction Barolo Grignolino Barbera
Barolo 28 1 0
Grignolino 1 33 1
Barbera 0 1 23
Overall Statistics
Accuracy : 0.9545
95% CI : (0.8877, 0.9875)
No Information Rate : 0.3977
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.931
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Barolo Class: Grignolino Class: Barbera
Sensitivity 0.9655 0.9429 0.9583
Specificity 0.9831 0.9623 0.9844
Pos Pred Value 0.9655 0.9429 0.9583
Neg Pred Value 0.9831 0.9623 0.9844
Prevalence 0.3295 0.3977 0.2727
Detection Rate 0.3182 0.3750 0.2614
Detection Prevalence 0.3295 0.3977 0.2727
Balanced Accuracy 0.9743 0.9526 0.9714
Elastic Net combines the properties of Ridge and Lasso, penalizing the model with both the L2 and the L1 norm: \[ L_{ElasticNet}(\hat{\beta}) = \frac{\sum_{i = 1}^{n}{(y_i - x_i\hat{\beta})^2}}{2n} + \lambda(\frac{1-\alpha}{2}\sum_{k = 1}^{K}{\hat{\beta}_k^2} + \alpha\sum_{k = 1}^{K}{|\hat{\beta}_k|}) \]
set.seed(123)
elastic <- caret::train(
x = train[,-1],
y = factor(train[,1]),
method = "glmnet",
trControl = trainControl("cv", number = 10, classProbs = TRUE, summaryFunction = multiClassSummary),
metric="Accuracy")
# Model coefficients
coef(elastic$finalModel, elastic$bestTune$lambda)
$Barolo
28 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) -9.730569698
alcohol 0.274590456
sugar .
acidity .
tartaric .
malic .
uronic .
pH .
ash .
alcal_ash -0.039923198
potassium .
calcium .
magnesium .
phosphate 0.001283703
chloride .
phenols .
flavanoids 0.299534487
nonflavanoids .
proanthocyanins .
colour .
hue .
OD_dw 0.303221659
OD_fl .
glycerol 0.025831137
butanediol .
nitrogen .
proline 0.002009706
methanol .
$Grignolino
28 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 11.2568155702
alcohol -0.5859864115
sugar .
acidity .
tartaric .
malic -0.1291909425
uronic .
pH .
ash -1.0082315724
alcal_ash .
potassium -0.0003227710
calcium .
magnesium -0.0062108242
phosphate .
chloride .
phenols .
flavanoids .
nonflavanoids .
proanthocyanins .
colour -0.1938906837
hue .
OD_dw .
OD_fl .
glycerol -0.1620487622
butanediol .
nitrogen .
proline -0.0009097107
methanol .
$Barbera
28 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) -1.52624587
alcohol .
sugar .
acidity .
tartaric 0.21065074
malic 0.20195569
uronic 0.75211696
pH .
ash .
alcal_ash .
potassium .
calcium .
magnesium .
phosphate .
chloride .
phenols .
flavanoids -0.39354493
nonflavanoids .
proanthocyanins .
colour 0.08931841
hue -1.73977955
OD_dw -0.36134581
OD_fl -0.28785551
glycerol .
butanediol .
nitrogen .
proline .
methanol .
# Make predictions
predictions.enet <- elastic %>% predict(test)
# Model prediction performance
tibble(
trueValue = wineTestNames,
predictedValue = predictions.enet)
Besides setting and choosing a lambda value, the elastic net also lets us tune the alpha parameter, where \(\alpha = 0\) corresponds to ridge and \(\alpha = 1\) to the lasso.
We can therefore choose an \(\alpha\) value between 0 and 1 to optimize the elastic net. For \(\alpha\) strictly inside this interval we get shrinkage with some coefficients driven to \(0\).
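Since caret cross-validates both hyperparameters for us, the selected pair can be read off the fitted object:

elastic$bestTune # the (alpha, lambda) pair chosen by cross-validation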
caret::confusionMatrix(predictions.enet, test$wine)
Confusion Matrix and Statistics
Reference
Prediction Barolo Grignolino Barbera
Barolo 29 1 0
Grignolino 0 33 1
Barbera 0 1 23
Overall Statistics
Accuracy : 0.9659
95% CI : (0.9036, 0.9929)
No Information Rate : 0.3977
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9483
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Barolo Class: Grignolino Class: Barbera
Sensitivity 1.0000 0.9429 0.9583
Specificity 0.9831 0.9811 0.9844
Pos Pred Value 0.9667 0.9706 0.9583
Neg Pred Value 1.0000 0.9630 0.9844
Prevalence 0.3295 0.3977 0.2727
Detection Rate 0.3295 0.3750 0.2614
Detection Prevalence 0.3409 0.3864 0.2727
Balanced Accuracy 0.9915 0.9620 0.9714
models <- list(ridge = caret::confusionMatrix(predictions.ridge, test$wine)$overall[1],
               lasso = caret::confusionMatrix(predictions.lasso, test$wine)$overall[1],
               elastic = caret::confusionMatrix(predictions.enet, test$wine)$overall[1])
for(j in 1:length(models)){
print(paste("Accuracy of",names(models[j]),"=",round(models[[j]],3)))
}
[1] "Accuracy of ridge = 0.955"
[1] "Accuracy of lasso = 0.955"
[1] "Accuracy of elastic = 0.966"
dist <- dist(wines[,-1], method = "manhattan")
dendo <- hclust(dist, method = "ward.D")
custom <- list()
for (i in 3:4){
  custom[[i]] = cutree(dendo, k = i)
}

models_to_run <- c("UCU", "UCC", "CUU", "CUC", "CCU", "CCC")

# The relaxation given by (p-q)^2 > p+q is satisfied for 1 <= q <= 20
pg.kmeans <- pgmmEM(scale(wines[,-1]), rG=3:4, rq=1:7, icl=T, zstart=2, seed=1234, modelSubset=models_to_run)
Based on k-means starting values, the best model (ICL) for the range of factors and components used is a CUU model with q = 6 and G = 3.
The ICL for this model is -11480.51.
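The comment in the code above refers to the identifiability constraint of the factor-analytic covariance structure, \((p - q)^2 > p + q\) with \(p = 27\) variables. A quick check of which values of q satisfy it (our arithmetic, matching the range 1 <= q <= 20 noted there):

p <- 27
q <- 1:26
q[(p - q)^2 > p + q] # 1 2 ... 20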
table(wines[,1],pg.kmeans$map)
1 2 3
Barolo 0 59 0
Grignolino 67 3 1
Barbera 0 0 48
adjustedRandIndex(wines[,1],pg.kmeans$map)
[1] 0.9294895
pg.manhattan <- pgmmEM(scale(wines[,-1]), rG=3:4, rq=1:7, icl=T, zstart = 3, seed = 1234, zlist = custom, modelSubset=models_to_run)
Based on custom starting values, the best model (ICL) for the range of factors and components used is a CUU model with q = 6 and G = 3.
The ICL for this model is -11480.51.
table(wines[,1],pg.manhattan$map)
1 2 3
Barolo 59 0 0
Grignolino 3 1 67
Barbera 0 48 0
adjustedRandIndex(wines[,1],pg.manhattan$map)
[1] 0.9294895
summaryOfModels <- function(df, models){
  nModel <- length(models)
  ari <- NULL
  bic <- NULL
  icl <- NULL
  G <- NULL
  for (i in 1:nModel){
    if (class(models[[i]]) == 'kmeans'){
      ari[i] <- adjustedRandIndex(df, models[[i]]$cluster)
      bic[i] <- NA
      icl[i] <- NA
      G[i] <- length(unique(models[[i]]$cluster))
    } else if (class(models[[i]]) == 'pam') {
      ari[i] <- adjustedRandIndex(df, models[[i]]$clustering)
      bic[i] <- NA
      icl[i] <- NA
      G[i] <- length(models[[i]]$id.med)
    } else if (class(models[[i]]) == 'KMeansSparseCluster'){
      modelName <- models[[i]]
      ari[i] <- adjustedRandIndex(df, modelName[[1]]$Cs)
      bic[i] <- NA
      icl[i] <- NA
      G[i] <- modelName[[1]]$wbound
    } else if (class(models[[i]]) == 'Mclust'){
      ari[i] <- adjustedRandIndex(df, models[[i]]$classification)
      bic[i] <- models[[i]]$bic
      icl[i] <- models[[i]]$icl
      G[i] <- models[[i]]$G
    } else if (class(models[[i]]) == 'ContaminatedMixt'){
      bestModel <- getBestModel(models[[i]], criterion = 'ICL')$models[[1]]
      ari[i] <- adjustedRandIndex(df, bestModel$group)
      bic[i] <- bestModel$IC$BIC
      icl[i] <- bestModel$IC$ICL
      G[i] <- bestModel$G
    } else if (class(models[[i]]) == 'teigen'){
      ari[i] <- adjustedRandIndex(df, models[[i]]$iclresults$classification)
      bic[i] <- models[[i]]$bic
      icl[i] <- models[[i]]$iclresults$icl
      G[i] <- models[[i]]$G
    } else if (class(models[[i]]) == 'pgmm'){
      ari[i] <- adjustedRandIndex(df, models[[i]]$map)
      G[i] <- models[[i]]$g
      ifelse(is.null(models[[i]]$bic[1]) == TRUE,
             bic[i] <- NA,
             bic[i] <- as.double(models[[i]]$bic[1]))
      ifelse(is.null(models[[i]]$icl[1]) == TRUE,
             icl[i] <- NA,
             icl[i] <- as.double(models[[i]]$icl[1]))
    } else if (class(models[[i]]) == 'tkmeans'){
      ari[i] <- adjustedRandIndex(df, models[[i]]$cluster)
      G[i] <- models[[i]]$k
      bic[i] <- NA
      icl[i] <- NA
    }
  }
  outputDF <- data.frame('AdjustedRandIndex' = ari,
                         'BIC' = bic,
                         'ICL' = icl,
                         'G' = as.integer(G),
                         row.names = c('KMeans3',
                                       'KMeans4',
                                       'PAM3',
                                       'PAM4',
                                       'VS-Greedy',
                                       'VS-Headlong',
                                       'VSCC',
                                       'KMeansSparse3',
                                       'KMeansSparse4',
                                       'VS-MVN',
                                       'VS-Contaminated Mixt',
                                       'VS-Contaminated KMeans',
                                       'VS-Contaminated RPost',
                                       'VS-Contaminated RClass',
                                       'VS-TEigen KMeans',
                                       'VS-TEigen Hard',
                                       'VS-TEigen Soft'),
                         stringsAsFactors = F)
  return(outputDF)
}
test <- summaryOfModels(wines[,1], list(
  k.means.3,
  k.means.4,
  PAM.3,
  PAM.4,
  subset.greedy$model,
  subset.headlong$model,
  vscc.mclust$bestmodel,
  km.sparse.3,
  km.sparse.4,
  mixt.selected.wines,
  cn.wines.mixt,
  cn.wines.kmeans,
  cn.wines.rpost,
  cn.wines.rclass,
  teigen.kmeans,
  teigen.hard,
  teigen.soft))
print(test, n=20)
# best 3 for separation
plot3d(wines$flavanoids, wines$OD_dw, wines$proline, type='s', size=2, col=as.numeric(wines$wine))
# worst 3 for separation
plot3d(wines$methanol, wines$potassium, wines$pH, type='s', size=2, col=as.numeric(wines$wine))