In this project, we will use a public dataset from UCI to explore the benefits of an unsupervised machine learning technique.
The main purpose of this project is to give businesses a better understanding of how the clustering technique can be used to gain insights that can improve customer loyalty, sales and profits.
Loading libraries for data manipulation & visualization…
library(factoextra)
library(NbClust)
library(cluster)
library(factoextra)
library(ggplot2)
library(corrplot)
library(animation)
library(readxl)
library(factoextra)
library(NbClust)
library(cluster)
library(factoextra)
library(ggplot2)
library(animation)
library(scales)
library(ExPanDaR)
library(knitr)
library(tidyverse)
library(lubridate)
library(lattice)
library(e1071)
library(scales)
library(caret)
library(rattle)
library(qwraps2)
library(ROCR)
library(countrycode)
library(tidyr)
library(DataExplorer)
library(corrplot)
library(corrr)
library(imputeTS)
library(fpp)
library(mice)
library(ggplot2)
library(highcharter)
library(gapminder)
library(magrittr)
library(viridisLite)
library(countrycode)
library(DT)
library(cluster)
library(shiny)
library(car)
library(rgl)
library(gridExtra)
library(grid)
Loading UCI dataset…
# 1.1 Make the project folder the working directory, so the relative
#     workbook path below resolves against it
setwd("C:/Saul/Portfolio/K-Means/KMeans")
# 1.2 Read the "Transactions" sheet of the workbook and convert the result
#     to a base data.frame
data.trx <- read_excel("Retail Transactions.xlsx", sheet = "Transactions") %>%
  data.frame()
# 2.1 Structure review (left disabled)
#str(data.trx)
# 2.2 Data preview
Checking a sample of the data…
# Interactive preview of the first rows of the transactions, with per-column
# filters at the top. head() replaces data.trx[1:50, ]: indexing past the last
# row pads the result with all-NA rows, whereas head() simply returns what
# is available when there are fewer than 50 rows.
datatable(
  head(data.trx, 50),
  filter = "top",
  options = list(
    pageLength = 25,
    scrollX = TRUE,
    scrollY = "300px",
    autoWidth = TRUE
  )
)
# Missing-value overviews: the first call uses Country as its ts_id,
# the second is run on the whole data frame.
prepare_missing_values_graph(data.trx, ts_id = "Country")
plot_missing(data.trx)
Checking data through a world map
# Count the transactions per country, leaving out the labels listed below,
# and rename the two resulting columns to "country" and "total".
countries <- data.trx %>%
  filter(
    !(Country %in% c(
      "EIRE", "Unspecified", "Channel Islands", "European Community", "RSA"
    ))
  ) %>%
  group_by(Country) %>%
  dplyr::summarise(total = n()) %>%
  setNames(c("country", "total"))

# Attach the ISO3 code of each country; it is the key used to join with the map.
countries[["iso3"]] <- countrycode(
  sourcevar = countries[["country"]],
  origin = "country.name",
  destination = "iso3c"
)

# World map geometry shipped with highcharter.
data(worldgeojson, package = "highcharter")

# Six viridis colours (alpha suffix cut off by substring) paired with
# exponentially spaced positions in [0, 1], converted with list_parse2().
dshmstops <- list_parse2(
  data.frame(
    q = c(0, exp(1:5) / exp(5)),
    c = substring(viridis(6, option = "D"), 0, 7)
  )
)

# Choropleth of the transaction counts, joined to the map on "iso3".
highchart() %>%
  hc_add_series_map(
    map = worldgeojson,
    df = countries,
    value = "total",
    joinBy = "iso3"
  ) %>%
  hc_legend(enabled = TRUE) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_mapNavigation(enabled = TRUE) %>%
  hc_title(text = "Transactions per Country") %>%
  hc_colorAxis(minColor = "#bed2e7", maxColor = "#003366") %>%
  hc_tooltip(
    useHTML = TRUE,
    headerFormat = "",
    pointFormat = "{point.country}: {point.total} transactions"
  )
Let’s take a quick look using Tableau
Performing cleansing, formatting, normalization…
# Histogram of the money.spent.z column with its mean (solid line) and
# median (dashed line) marked.
df <- data.frame(customer.data)
ggplot(df, aes(x = money.spent.z)) +
  geom_histogram(bins = 20, fill = "#08519C", alpha = 1) +
  # The reference positions are passed as fixed parameters rather than mapped
  # inside aes(): a mapped xintercept is evaluated against every row of df,
  # so ggplot2 drew one identical line per row and the overplotting cancelled
  # alpha = 0.5. na.rm = TRUE keeps the lines when the column has NAs.
  geom_vline(xintercept = mean(df$money.spent.z, na.rm = TRUE), ## straight line for the mean
             colour = "#ADFF2F", size = 1.5, alpha = 0.5) +
  geom_vline(xintercept = median(df$money.spent.z, na.rm = TRUE), ## dashed line for the median
             colour = "#ADFF2F", linetype = "dashed", size = 1.5, alpha = 0.5)
# Histogram of the days.sl.pur.z column with its mean (solid line) and
# median (dashed line) marked.
df <- data.frame(customer.data)
ggplot(df, aes(x = days.sl.pur.z)) +
  geom_histogram(bins = 20, fill = "#08519C", alpha = 1) +
  # The reference positions are passed as fixed parameters rather than mapped
  # inside aes(): a mapped xintercept is evaluated against every row of df,
  # so ggplot2 drew one identical line per row and the overplotting cancelled
  # alpha = 0.5. na.rm = TRUE keeps the lines when the column has NAs.
  geom_vline(xintercept = mean(df$days.sl.pur.z, na.rm = TRUE), ## straight line for the mean
             colour = "#ADFF2F", size = 1.5, alpha = 0.5) +
  geom_vline(xintercept = median(df$days.sl.pur.z, na.rm = TRUE), ## dashed line for the median
             colour = "#ADFF2F", linetype = "dashed", size = 1.5, alpha = 0.5)
# Histogram of the number.pur.z column with its mean (solid line) and
# median (dashed line) marked.
df <- data.frame(customer.data)
ggplot(df, aes(x = number.pur.z)) +
  geom_histogram(bins = 20, fill = "#08519C", alpha = 1) +
  # The reference positions are passed as fixed parameters rather than mapped
  # inside aes(): a mapped xintercept is evaluated against every row of df,
  # so ggplot2 drew one identical line per row and the overplotting cancelled
  # alpha = 0.5. na.rm = TRUE keeps the lines when the column has NAs.
  geom_vline(xintercept = mean(df$number.pur.z, na.rm = TRUE), ## straight line for the mean
             colour = "#ADFF2F", size = 1.5, alpha = 0.5) +
  geom_vline(xintercept = median(df$number.pur.z, na.rm = TRUE), ## dashed line for the median
             colour = "#ADFF2F", linetype = "dashed", size = 1.5, alpha = 0.5)