1 Introduction

1.1 Overview

In this project, We will use a public dataset from UCI in order to explore the benefits of an unsupervised machine learning technique.

1.2 Purpose

The main purpose of this project is to provide a better understanding to the businesses about the use of the clustering technique to gain insights than can improve customer loyalty, sales and profits.

2 Data Understanding

2.1 Setting libraries

Loading libraries for data manipulation & visualization…

library(factoextra)
library(NbClust)
library(cluster)
library(factoextra)
library(ggplot2)
library(corrplot)
library(animation)
library(readxl)
library(factoextra)
library(NbClust)
library(cluster)
library(factoextra)
library(ggplot2)
library(animation)
library(scales)
library(ExPanDaR)
library(knitr)
library(tidyverse)
library(lubridate)
library(lattice)
library(e1071)
library(scales)
library(caret)
library(rattle)
library(qwraps2)
library(ROCR)
library(countrycode)
library(tidyr)
library(DataExplorer)
library(corrplot)
library(corrr)
library(imputeTS)
library(fpp)
library(mice)
library(ggplot2)
library(highcharter)
library(gapminder)
library(magrittr)    
library(viridisLite)
library(countrycode)
library(DT)
library(cluster)
library(shiny)
library(car)
library(rgl)
library(gridExtra)
library(grid)

2.2 Data Loading

Loading UCI dataset…

# 1.1 Setting my working directory in R
setwd("C:/Saul/Portfolio/K-Means/KMeans") 
# 1.2 Reading the dataset
data.trx <-  data.frame(read_excel("Retail Transactions.xlsx", sheet="Transactions"))

2.3 Data Exploring

# 2.1 Data structure review
#str(data.trx)
# 2.2 Data preview

Checking a sample data…

datatable(data.trx[(1:50),], filter = 'top', options = list(
  pageLength = 25, scrollX = TRUE, scrollY = "300px", autoWidth = TRUE))

Checking missing values…

prepare_missing_values_graph(data.trx, ts_id = "Country")

plot_missing(data.trx)

Checking data through a world map

countries <- data.trx %>% 
filter(!(Country %in% c("EIRE", "Unspecified","Channel Islands","European Community","RSA")))  %>% 
group_by(Country) %>% 
dplyr::summarise(total = n())
  

names(countries) <- c("country", "total")
countries$iso3 <-  countrycode(countries$country, origin = "country.name", destination = "iso3c")



data(worldgeojson, package = "highcharter")
dshmstops <- data.frame(q = c(0, exp(1:5)/exp(5)),
                        c = substring(viridis(5 + 1, option = "D"), 0, 7)) %>%  list_parse2()

highchart() %>% 
  hc_add_series_map(worldgeojson, countries, value = "total", joinBy = "iso3") %>% 
  hc_legend(enabled = TRUE) %>% 
  hc_add_theme(hc_theme_google()) %>% 
  hc_mapNavigation(enabled = TRUE) %>%
  hc_title(text = "Transactions per Country")  %>%
  hc_colorAxis(minColor = "#bed2e7", maxColor = "#003366") %>%
hc_tooltip(useHTML = TRUE, headerFormat = "", pointFormat = "{point.country}: {point.total} transactions")

Let’s take a quick review using Tableau

3 Data Preparation

3.1 Scope

  • Include only UK
  • Remove guest customers
  • Remove cancelled invoices
  • Standarize data
  • Use the RFM Model

3.2 Processes

Performing cleansing, formatting, normalization,…

3.3 Output

3.3.1 Money Spent by Customers

df <- data.frame(customer.data)
ggplot(df, aes(x = money.spent.z)) +
  geom_histogram(bins = 20, fill="#08519C", alpha=1) +
  geom_vline(aes(xintercept=mean(money.spent.z)), ## straight line for the mean
             colour = "#ADFF2F", size=1.5, alpha=0.5) + 
  geom_vline(aes(xintercept=median(money.spent.z)), ## dashed line for the median
             colour = "#ADFF2F", linetype="dashed", size=1.5, alpha=0.5)

3.3.2 Days Since Last Purchase

df <- data.frame(customer.data)
ggplot(df, aes(x = days.sl.pur.z)) +
  geom_histogram(bins = 20, fill="#08519C", alpha=1) +
  geom_vline(aes(xintercept=mean(days.sl.pur.z)), ## straight line for the mean
             colour = "#ADFF2F", size=1.5, alpha=0.5) + 
  geom_vline(aes(xintercept=median(days.sl.pur.z)), ## dashed line for the median
             colour = "#ADFF2F", linetype="dashed", size=1.5, alpha=0.5)

3.3.3 Number of Purchases

df <- data.frame(customer.data)
ggplot(df, aes(x = number.pur.z)) +
  geom_histogram(bins = 20, fill="#08519C", alpha=1) +
  geom_vline(aes(xintercept=mean(number.pur.z)), ## straight line for the mean
             colour = "#ADFF2F", size=1.5, alpha=0.5) + 
  geom_vline(aes(xintercept=median(number.pur.z)), ## dashed line for the median
             colour = "#ADFF2F", linetype="dashed", size=1.5, alpha=0.5)

4 Data Modeling

4.1 K-Means

Basically, it works like this…

4.2 Cluster Definition

Let’s use some of the different techniques to find a optimal number of clusters.

# 4.1 Elbow method

set.seed(123)
# Compute and plot wss for k = 2 to k = 15
k.max <- 15 # Maximal number of clusters
data <- customer.data[,2:4]
wss <- sapply(1:k.max,
              function(k){kmeans(data, k, nstart=10 )$tot.withinss})
plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE,
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")
abline(v = 4, lty =2,col="royalblue")

# 4.2 Average Silhouette method

library(cluster)
k.max <- 10
data <- customer.data[,2:4]
sil <- rep(0, k.max)
# Compute the average silhouette width for
# k = 2 to k = 15
for(i in 2:k.max){
  km.res <- kmeans(data, centers = i, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(data))
  sil[i] <- mean(ss[, 3])
}

# Plot the  average silhouette width
plot(1:k.max, sil, type = "b", pch = 19,
     frame = FALSE, xlab = "Number of clusters k")
abline(v = which.max(sil), lty = 2)

# 
# # 4.3 Gap Statistic method
# set.seed(123)
# data <- customer.data[,2:4]
# gap_stat <- clusGap(data, FUN = kmeans, nstart = 25,K.max = 10, B = 50)
# fviz_gap_stat(gap_stat)
# 
# 
# # 4.4 Using 30 different indexes
# set.seed(123)
# data <- customer.data[,2:4]
# res <- NbClust(data, diss=NULL, distance = "euclidean", min.nc=2, max.nc=10, method = "kmeans" , index = "all")
# fviz_nbclust(res) + theme_minimal()

4.3 Processing

There is no a consensus regarding the number of clusters. Let’s consider 5 clusters and process our Kmeans model.

# 
# # 5. K-MEAN CLUSTERING
# 
# 5.1 fit the model and get cluster means
set.seed(123)
data <- customer.data[,6:8]
fit <- kmeans(data, 5, nstart = 20) 
aggregate(customer.data[,2:4],by=list(fit$cluster),FUN=mean) 
##   Group.1 days.sl.pur number.pur money.spent
## 1       1    13.52546  16.289352   9342.4472
## 2       2    91.87245   3.599143   1532.8870
## 3       3    13.55618   4.556180   1429.7539
## 4       4   214.45803   1.214508    283.6120
## 5       5    38.07020   1.376847    334.9072
# 5.2 Display cluster centers and give a count of data points in each cluster
#fit$centers 
table(fit$cluster) 
## 
##   1   2   3   4   5 
## 432 933 712 965 812

5 Results

5.1 2D plot

Let’s visualize the results when using 5 clusters

# 5.3 Plot the model
#clusplot(data, fit$cluster, color=TRUE, shade=TRUE, Expllabels=2, lines=0)

fviz_cluster(list(data = data, cluster = fit$cluster),
             frame.type = "norm", geom = "point", stand = FALSE)

fviz_cluster(fit, data = data)

# Add cluster membership to customers dataset
customer.data$cluster <- fit$cluster
customer.data$cluster  <- factor(customer.data$cluster, levels = c(1:5))





##2D plot
# 
# #######################
# # Plot clusters in 3D #
# #######################
# 
# colors <- c('red','orange','green3','deepskyblue','blue','darkorchid4','violet','pink1','tan3','black')
# scatter3d(x = customer.data$number.pur.z, 
#           y = customer.data$money.spent.z,
#           z = customer.data$days.sl.pur.z, 
#           groups = customer.data$cluster,
#           xlab = "Frequency (z)", 
#           ylab = "Monetary Value (z)",
#           zlab = "Recency (z)",
#           surface.col = colors,
#           box=FALSE,angle =80,
#           axis.scales = FALSE,
#           surface = TRUE, # produces the horizonal planes through the graph at each level of monetary value
#           fit = "smooth",
#           #     ellipsoid = TRUE, # to graph ellipses uses this command and set "surface = " to FALSE
#           grid = TRUE,
#           axis.col = c("black", "black", "black"))
# 
# 
# 
# 
# 
# scatter3d(x = customer.data$money.spent.z, 
#           y = customer.data$number.pur.z,
#           z = customer.data$days.sl.pur.z,
#           groups = customer.data$cluster,
#           xlab = "Monetary Value (z)", 
#           ylab = "Frequency (z)",
#           zlab = "Recency (z)",
#           surface.col = colors,
#           axis.scales = FALSE,
#           surface = TRUE, # produces the horizonal planes through the graph at each level of monetary value
#           fit = "smooth",
#           #     ellipsoid = TRUE, # to graph ellipses uses this command and set "surface = " to FALSE
#           grid = TRUE,
#           axis.col = c("black", "black", "black"))
# 
# <center><img src="3d1.png" width="70%" and height="70%"  align="middle"></center>

5.2 Labeling

We will add the labels to the customers.

count_cluster <- as.data.frame(table(fit$cluster))
colnames (count_cluster ) <- c("Group.1","Customers")
df <- aggregate(customer.data[,2:4],by=list(fit$cluster),FUN=mean) 
df <- merge (df, count_cluster, by = "Group.1")


colnames (df) <- c("Cluster","Days since last purchase","Number of purchases","Money spent","Customers")
df$`Days since last purchase` <- round(df$`Days since last purchase`)
df$`Number of purchases` <- round(df$`Number of purchases` )
df$`Money spent` <- round(df$`Money spent`)

d <- head(df)

tt3 <- ttheme_minimal(
  core=list(bg_params = list(fill = blues9[1:4], col=NA),
            fg_params=list(fontface=3)),
  colhead=list(fg_params=list(col="navyblue", fontface=4L)),
  rowhead=list(fg_params=list(col="orange", fontface=3L)))


grid.arrange(tableGrob(d, theme=tt3), nrow=1)

5.3 Findings

Based on the results, below strategies should be applied to leverage the recent clustering process.

# test

6 What is Next

  • Explore more variables: first day of purchase, spent per category…
  • Inventory segmentation
  • Qualify the Suppliers
  • Employee profiles

7 References