library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(plyr)
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::src() masks Hmisc::src()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize(), Hmisc::summarize()
library(stringr)
library(readxl)
library(writexl)
library(ggplot2)
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
sales_data = read_xlsx("D:\\Drive E\\Software\\R\\UCI Online Retail\\sales_data.xlsx")
date = read_xlsx("D:\\Drive E\\Software\\R\\UCI Online Retail\\date.xlsx")
date=date%>%distinct()
write.csv(date,file="Date_distinct")
head(date)
## # A tibble: 6 x 6
## timestamp date day_name day_of_month month_of_year
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2010-12-01 08:26:00 2010-12-01 00:00:00 Wednesd~ 01 December
## 2 2010-12-01 08:28:00 2010-12-01 00:00:00 Wednesd~ 01 December
## 3 2010-12-01 08:34:00 2010-12-01 00:00:00 Wednesd~ 01 December
## 4 2010-12-01 08:35:00 2010-12-01 00:00:00 Wednesd~ 01 December
## 5 2010-12-01 08:45:00 2010-12-01 00:00:00 Wednesd~ 01 December
## 6 2010-12-01 09:00:00 2010-12-01 00:00:00 Wednesd~ 01 December
## # ... with 1 more variable: `time_of_day(hh:mm:ss)` <chr>
Sales_Data_Date=sales_data%>%left_join(date,by=c("transaction timestamp"="timestamp")) #left join sales data with date
write_xlsx(Sales_Data_Date,"D:\\Drive E\\Software\\R\\Sales_Data_Date.xlsx")
write_xlsx(date,"D:\\Drive E\\Software\\R\\Date_Distinct.xlsx")
Dup=Sales_Data_Date[duplicated(Sales_Data_Date),]
#duplicated(Sales_Data_Date)
#head(Dup)
SalesDD_Dist= Sales_Data_Date%>%distinct()
SalesDD_Dist%>%count(distinct(SalesDD_Dist,)) #Create Distinct Sales_Data_Date Dataset
## # A tibble: 536,641 x 14
## `transaction id` `product id` `product descri~ `quantity sold`
## <chr> <chr> <chr> <dbl>
## 1 536365 21730 GLASS STAR FROS~ 6
## 2 536365 22752 SET 7 BABUSHKA ~ 2
## 3 536365 71053 WHITE METAL LAN~ 6
## 4 536365 84029E RED WOOLLY HOTT~ 6
## 5 536365 84029G KNITTED UNION F~ 6
## 6 536365 84406B CREAM CUPID HEA~ 8
## 7 536365 85123A WHITE HANGING H~ 6
## 8 536366 22632 HAND WARMER RED~ 6
## 9 536366 22633 HAND WARMER UNI~ 6
## 10 536367 21754 HOME BUILDING B~ 3
## # ... with 536,631 more rows, and 10 more variables: `transaction
## # timestamp` <dttm>, `unit price` <dbl>, `customer id` <dbl>, `transaction
## # country` <chr>, date <dttm>, day_name <chr>, day_of_month <chr>,
## # month_of_year <chr>, `time_of_day(hh:mm:ss)` <chr>, n <int>
summary(SalesDD_Dist)
## transaction id product id product description quantity sold
## Length:536641 Length:536641 Length:536641 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 9.62
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## transaction timestamp unit price customer id
## Min. :2010-12-01 08:26:00 Min. :-11062.06 Min. :12346
## 1st Qu.:2011-03-28 10:52:00 1st Qu.: 1.25 1st Qu.:13939
## Median :2011-07-19 14:04:00 Median : 2.08 Median :15145
## Mean :2011-07-04 08:57:06 Mean : 4.63 Mean :15281
## 3rd Qu.:2011-10-18 17:05:00 3rd Qu.: 4.13 3rd Qu.:16784
## Max. :2011-12-09 12:50:00 Max. : 38970.00 Max. :18287
## NA's :135037
## transaction country date day_name
## Length:536641 Min. :2010-12-01 00:00:00 Length:536641
## Class :character 1st Qu.:2011-03-28 00:00:00 Class :character
## Mode :character Median :2011-07-19 00:00:00 Mode :character
## Mean :2011-07-03 19:22:27
## 3rd Qu.:2011-10-18 00:00:00
## Max. :2011-12-09 00:00:00
##
## day_of_month month_of_year time_of_day(hh:mm:ss)
## Length:536641 Length:536641 Length:536641
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
describe(SalesDD_Dist) #To get description of data stored for each column
## SalesDD_Dist
##
## 13 Variables 536641 Observations
## --------------------------------------------------------------------------------
## transaction id
## n missing distinct
## 536641 0 25900
##
## lowest : 536365 536366 536367 536368 536369
## highest: C581484 C581490 C581499 C581568 C581569
## --------------------------------------------------------------------------------
## product id
## n missing distinct
## 536641 0 4070
##
## lowest : 10002 10080 10120 10123C 10123G, highest: m M PADS POST S
## --------------------------------------------------------------------------------
## product description
## n missing distinct
## 535187 1454 4211
##
## lowest : *Boombox Ipod Classic *USB Office Mirror Ball ? ? sold as sets? ??
## highest: ZINC T-LIGHT HOLDER STARS SMALL ZINC TOP 2 DOOR WOODEN SHELF ZINC WILLIE WINKIE CANDLE STICK ZINC WIRE KITCHEN ORGANISER ZINC WIRE SWEETHEART LETTER TRAY
## --------------------------------------------------------------------------------
## quantity sold
## n missing distinct Info Mean Gmd .05 .10
## 536641 0 722 0.974 9.62 16.67 1 1
## .25 .50 .75 .90 .95
## 1 3 10 24 30
##
## lowest : -80995 -74215 -9600 -9360 -9058, highest: 4800 5568 12540 74215 80995
##
## Value -80000 -74000 -10000 -6000 -4000 -2000 0 2000 4000
## Frequency 1 1 4 1 5 46 536455 116 8
## Proportion 0 0 0 0 0 0 1 0 0
##
## Value 6000 12000 74000 80000
## Frequency 1 1 1 1
## Proportion 0 0 0 0
##
## For the frequency table, variable is rounded to the nearest 2000
## --------------------------------------------------------------------------------
## transaction timestamp
## n missing distinct Info
## 536641 0 23260 1
## Mean Gmd .05 .10
## 2011-07-04 08:57:06 11438790 2010-12-13 09:35:00 2011-01-12 11:21:00
## .25 .50 .75 .90
## 2011-03-28 10:52:00 2011-07-19 14:04:00 2011-10-18 17:05:00 2011-11-21 15:18:00
## .95
## 2011-11-30 14:40:00
##
## lowest : 2010-12-01 08:26:00 2010-12-01 08:28:00 2010-12-01 08:34:00 2010-12-01 08:35:00 2010-12-01 08:45:00
## highest: 2011-12-09 12:23:00 2011-12-09 12:25:00 2011-12-09 12:31:00 2011-12-09 12:49:00 2011-12-09 12:50:00
## --------------------------------------------------------------------------------
## unit price
## n missing distinct Info Mean Gmd .05 .10
## 536641 0 1630 0.998 4.633 6.044 0.42 0.64
## .25 .50 .75 .90 .95
## 1.25 2.08 4.13 7.95 9.95
##
## lowest : -11062.060 0.000 0.001 0.010 0.030
## highest: 13541.330 16453.710 16888.020 17836.460 38970.000
## --------------------------------------------------------------------------------
## customer id
## n missing distinct Info Mean Gmd .05 .10
## 401604 135037 4372 1 15281 1978 12621 12867
## .25 .50 .75 .90 .95
## 13939 15145 16784 17716 17898
##
## lowest : 12346 12347 12348 12349 12350, highest: 18280 18281 18282 18283 18287
## --------------------------------------------------------------------------------
## transaction country
## n missing distinct
## 536641 0 38
##
## lowest : Australia Austria Bahrain Belgium Brazil
## highest: Switzerland United Arab Emirates United Kingdom Unspecified USA
## --------------------------------------------------------------------------------
## date
## n missing distinct Info Mean Gmd .05
## 536641 0 305 1 2011-07-04 11438597 2010-12-13
## .10 .25 .50 .75 .90 .95
## 2011-01-12 2011-03-28 2011-07-19 2011-10-18 2011-11-21 2011-11-30
##
## lowest : 2010-12-01 2010-12-02 2010-12-03 2010-12-05 2010-12-06
## highest: 2011-12-05 2011-12-06 2011-12-07 2011-12-08 2011-12-09
## --------------------------------------------------------------------------------
## day_name
## n missing distinct
## 536641 0 6
##
## lowest : Friday Monday Sunday Thursday Tuesday
## highest: Monday Sunday Thursday Tuesday Wednesday
##
## Value Friday Monday Sunday Thursday Tuesday Wednesday
## Frequency 81565 94435 62806 103056 101064 93715
## Proportion 0.152 0.176 0.117 0.192 0.188 0.175
## --------------------------------------------------------------------------------
## day_of_month
## n missing distinct
## 536641 0 31
##
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month_of_year
## n missing distinct
## 536641 0 12
##
## lowest : April August December February January
## highest: March May November October September
##
## Value April August December February January July
## Frequency 29701 35064 67227 27479 34900 39267
## Proportion 0.055 0.065 0.125 0.051 0.065 0.073
##
## Value June March May November October September
## Frequency 36609 36439 36782 83343 59969 49861
## Proportion 0.068 0.068 0.069 0.155 0.112 0.093
## --------------------------------------------------------------------------------
## time_of_day(hh:mm:ss)
## n missing distinct
## 536641 0 774
##
## lowest : 06:10:00 06:12:00 06:13:00 06:14:00 06:15:00
## highest: 20:28:00 20:32:00 20:33:00 20:36:00 20:38:00
## --------------------------------------------------------------------------------
str(SalesDD_Dist)#get structure
## tibble [536,641 x 13] (S3: tbl_df/tbl/data.frame)
## $ transaction id : chr [1:536641] "536365" "536365" "536365" "536365" ...
## $ product id : chr [1:536641] "85123A" "71053" "84406B" "84029G" ...
## $ product description : chr [1:536641] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ quantity sold : num [1:536641] 6 6 8 6 6 2 6 6 6 32 ...
## $ transaction timestamp: POSIXct[1:536641], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ unit price : num [1:536641] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ customer id : num [1:536641] 17850 17850 17850 17850 17850 ...
## $ transaction country : chr [1:536641] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ date : POSIXct[1:536641], format: "2010-12-01" "2010-12-01" ...
## $ day_name : chr [1:536641] "Wednesday" "Wednesday" "Wednesday" "Wednesday" ...
## $ day_of_month : chr [1:536641] "01" "01" "01" "01" ...
## $ month_of_year : chr [1:536641] "December" "December" "December" "December" ...
## $ time_of_day(hh:mm:ss): chr [1:536641] "08:26:00" "08:26:00" "08:26:00" "08:26:00" ...
str(SalesDD_Dist)#get structure
## tibble [536,641 x 13] (S3: tbl_df/tbl/data.frame)
## $ transaction id : chr [1:536641] "536365" "536365" "536365" "536365" ...
## $ product id : chr [1:536641] "85123A" "71053" "84406B" "84029G" ...
## $ product description : chr [1:536641] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ quantity sold : num [1:536641] 6 6 8 6 6 2 6 6 6 32 ...
## $ transaction timestamp: POSIXct[1:536641], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ unit price : num [1:536641] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ customer id : num [1:536641] 17850 17850 17850 17850 17850 ...
## $ transaction country : chr [1:536641] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ date : POSIXct[1:536641], format: "2010-12-01" "2010-12-01" ...
## $ day_name : chr [1:536641] "Wednesday" "Wednesday" "Wednesday" "Wednesday" ...
## $ day_of_month : chr [1:536641] "01" "01" "01" "01" ...
## $ month_of_year : chr [1:536641] "December" "December" "December" "December" ...
## $ time_of_day(hh:mm:ss): chr [1:536641] "08:26:00" "08:26:00" "08:26:00" "08:26:00" ...
#create dataframe to modify
SalesDD_Mod = SalesDD_Dist
SalesDD_Mod$`transaction id`=as.factor(SalesDD_Dist$`transaction id`)
str(SalesDD_Mod)
## tibble [536,641 x 13] (S3: tbl_df/tbl/data.frame)
## $ transaction id : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ...
## $ product id : chr [1:536641] "85123A" "71053" "84406B" "84029G" ...
## $ product description : chr [1:536641] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ quantity sold : num [1:536641] 6 6 8 6 6 2 6 6 6 32 ...
## $ transaction timestamp: POSIXct[1:536641], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ unit price : num [1:536641] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ customer id : num [1:536641] 17850 17850 17850 17850 17850 ...
## $ transaction country : chr [1:536641] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ date : POSIXct[1:536641], format: "2010-12-01" "2010-12-01" ...
## $ day_name : chr [1:536641] "Wednesday" "Wednesday" "Wednesday" "Wednesday" ...
## $ day_of_month : chr [1:536641] "01" "01" "01" "01" ...
## $ month_of_year : chr [1:536641] "December" "December" "December" "December" ...
## $ time_of_day(hh:mm:ss): chr [1:536641] "08:26:00" "08:26:00" "08:26:00" "08:26:00" ...
Cancelled = SalesDD_Mod%>%filter(str_detect(`transaction id`,"^C"))
#Imputation : Remove 0 unit price
zero_price=SalesDD_Mod%>%filter(`unit price`==0)
SalesDD_Mod=SalesDD_Mod%>%filter(`unit price`!=0)
#Order with ProductIDs having no integer but only letters are additional fees not actual products. E.g. Post, AmazonFee, Manual etc.
#Removing product IDs with only character names i.e filtering for Product IDs having digits
#Check the data with product id not having any digits
#13 Prod.IDs are invalid except Prod. IDs with DCGSSBOY & DCGSSGIRL are valid product types
#ProductID of data still has contents with ID starting with "gift_" and no customer ID, which indicates gift vouchers are distributed
#Removing invalid Product IDs:
sales_clean=SalesDD_Mod %>% filter(str_detect(`product id`,"\\d")|str_detect(`product id`,"^DCGSS"))%>%filter(!str_detect(`product id`,"^gift"))
describe(sales_clean)
## sales_clean
##
## 13 Variables 531348 Observations
## --------------------------------------------------------------------------------
## transaction id
## n missing distinct
## 531348 0 23198
##
## lowest : 536365 536366 536367 536368 536369
## highest: C581470 C581484 C581490 C581568 C581569
## --------------------------------------------------------------------------------
## product id
## n missing distinct
## 531348 0 3922
##
## lowest : 10002 10080 10120 10123C 10124A
## highest: DCGS0069 DCGS0070 DCGS0076 DCGSSBOY DCGSSGIRL
## --------------------------------------------------------------------------------
## product description
## n missing distinct
## 531348 0 4016
##
## lowest : *Boombox Ipod Classic *USB Office Mirror Ball 10 COLOUR SPACEBOY PEN 12 COLOURED PARTY BALLOONS 12 DAISY PEGS IN WOOD BOX
## highest: ZINC T-LIGHT HOLDER STARS SMALL ZINC TOP 2 DOOR WOODEN SHELF ZINC WILLIE WINKIE CANDLE STICK ZINC WIRE KITCHEN ORGANISER ZINC WIRE SWEETHEART LETTER TRAY
## --------------------------------------------------------------------------------
## quantity sold
## n missing distinct Info Mean Gmd .05 .10
## 531348 0 507 0.974 9.959 15.72 1 1
## .25 .50 .75 .90 .95
## 1 3 10 24 30
##
## lowest : -80995 -74215 -9360 -3114 -2000, highest: 3906 4300 4800 74215 80995
##
## Value -80000 -74000 -10000 -4000 -2000 0 2000 4000 74000
## Frequency 1 1 1 1 6 531224 106 6 1
## Proportion 0 0 0 0 0 1 0 0 0
##
## Value 80000
## Frequency 1
## Proportion 0
##
## For the frequency table, variable is rounded to the nearest 2000
## --------------------------------------------------------------------------------
## transaction timestamp
## n missing distinct Info
## 531348 0 21311 1
## Mean Gmd .05 .10
## 2011-07-04 13:31:05 11440399 2010-12-13 09:35:00 2011-01-12 11:52:00
## .25 .50 .75 .90
## 2011-03-28 11:51:00 2011-07-20 09:11:00 2011-10-19 10:04:00 2011-11-21 15:57:00
## .95
## 2011-11-30 14:46:00
##
## lowest : 2010-12-01 08:26:00 2010-12-01 08:28:00 2010-12-01 08:34:00 2010-12-01 08:35:00 2010-12-01 08:45:00
## highest: 2011-12-09 12:23:00 2011-12-09 12:25:00 2011-12-09 12:31:00 2011-12-09 12:49:00 2011-12-09 12:50:00
## --------------------------------------------------------------------------------
## unit price
## n missing distinct Info Mean Gmd .05 .10
## 531348 0 517 0.998 3.305 3.22 0.42 0.65
## .25 .50 .75 .90 .95
## 1.25 2.08 4.13 7.90 9.95
##
## lowest : 0.03 0.04 0.06 0.07 0.08, highest: 175.00 195.00 265.50 295.00 649.50
## --------------------------------------------------------------------------------
## customer id
## n missing distinct Info Mean Gmd .05 .10
## 399790 131558 4362 1 15289 1974 12627 12901
## .25 .50 .75 .90 .95
## 13959 15152 16791 17719 17900
##
## lowest : 12346 12347 12348 12349 12350, highest: 18280 18281 18282 18283 18287
## --------------------------------------------------------------------------------
## transaction country
## n missing distinct
## 531348 0 38
##
## lowest : Australia Austria Bahrain Belgium Brazil
## highest: Switzerland United Arab Emirates United Kingdom Unspecified USA
## --------------------------------------------------------------------------------
## date
## n missing distinct Info Mean Gmd .05
## 531348 0 305 1 2011-07-04 11440201 2010-12-13
## .10 .25 .50 .75 .90 .95
## 2011-01-12 2011-03-28 2011-07-20 2011-10-19 2011-11-21 2011-11-30
##
## lowest : 2010-12-01 2010-12-02 2010-12-03 2010-12-05 2010-12-06
## highest: 2011-12-05 2011-12-06 2011-12-07 2011-12-08 2011-12-09
## --------------------------------------------------------------------------------
## day_name
## n missing distinct
## 531348 0 6
##
## lowest : Friday Monday Sunday Thursday Tuesday
## highest: Monday Sunday Thursday Tuesday Wednesday
##
## Value Friday Monday Sunday Thursday Tuesday Wednesday
## Frequency 80527 93484 62667 102039 99892 92739
## Proportion 0.152 0.176 0.118 0.192 0.188 0.175
## --------------------------------------------------------------------------------
## day_of_month
## n missing distinct
## 531348 0 31
##
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month_of_year
## n missing distinct
## 531348 0 12
##
## lowest : April August December February January
## highest: March May November October September
##
## Value April August December February January July
## Frequency 29280 34731 66629 27160 34585 38845
## Proportion 0.055 0.065 0.125 0.051 0.065 0.073
##
## Value June March May November October September
## Frequency 36192 35955 36299 82763 59440 49469
## Proportion 0.068 0.068 0.068 0.156 0.112 0.093
## --------------------------------------------------------------------------------
## time_of_day(hh:mm:ss)
## n missing distinct
## 531348 0 770
##
## lowest : 06:10:00 06:14:00 06:15:00 06:19:00 06:20:00
## highest: 20:18:00 20:28:00 20:33:00 20:36:00 20:38:00
## --------------------------------------------------------------------------------
#Prod ID with C2 are carriage thus needs to be removed
sales_clean=sales_clean%>% filter(!str_detect(`product id`,"C2"))
#Cancelled Order Check
cancelled = sales_clean%>%filter(str_detect(`transaction id`,"^C"))
describe(cancelled)
## cancelled
##
## 13 Variables 8668 Observations
## --------------------------------------------------------------------------------
## transaction id
## n missing distinct
## 8668 0 3422
##
## lowest : C536383 C536391 C536506 C536543 C536548
## highest: C581470 C581484 C581490 C581568 C581569
## --------------------------------------------------------------------------------
## product id
## n missing distinct
## 8668 0 1938
##
## lowest : 10133 10135 11001 15034 15036 , highest: 90204 90208 90209A 90209C 90211B
## --------------------------------------------------------------------------------
## product description
## n missing distinct
## 8668 0 1962
##
## lowest : 10 COLOUR SPACEBOY PEN 12 COLOURED PARTY BALLOONS 12 EGG HOUSE PAINTED WOOD 12 IVORY ROSE PEG PLACE SETTINGS 12 MESSAGE CARDS WITH ENVELOPES
## highest: ZINC SWEETHEART WIRE LETTER RACK ZINC T-LIGHT HOLDER STAR LARGE ZINC T-LIGHT HOLDER STARS SMALL ZINC TOP 2 DOOR WOODEN SHELF ZINC WILLIE WINKIE CANDLE STICK
## --------------------------------------------------------------------------------
## quantity sold
## n missing distinct Info Mean Gmd .05 .10
## 8668 0 133 0.926 -31.15 57.48 -36 -20
## .25 .50 .75 .90 .95
## -6 -2 -1 -1 -1
##
## lowest : -80995 -74215 -9360 -3114 -2000, highest: -5 -4 -3 -2 -1
##
## Value -81000 -74000 -9000 -3000 -2000 -1000 0
## Frequency 1 1 1 1 3 18 8643
## Proportion 0.000 0.000 0.000 0.000 0.000 0.002 0.997
##
## For the frequency table, variable is rounded to the nearest 1000
## --------------------------------------------------------------------------------
## transaction timestamp
## n missing distinct Info
## 8668 0 3363 1
## Mean Gmd .05 .10
## 2011-06-26 09:26:07 11273694 2010-12-14 11:12:00 2011-01-09 14:17:00
## .25 .50 .75 .90
## 2011-03-21 16:15:00 2011-07-08 13:39:00 2011-10-06 20:36:00 2011-11-14 15:10:00
## .95
## 2011-11-29 11:15:09
##
## lowest : 2010-12-01 09:49:00 2010-12-01 10:24:00 2010-12-01 12:38:00 2010-12-01 14:30:00 2010-12-01 14:33:00
## highest: 2011-12-08 19:28:00 2011-12-09 09:27:00 2011-12-09 09:57:00 2011-12-09 11:57:00 2011-12-09 11:58:00
## --------------------------------------------------------------------------------
## unit price
## n missing distinct Info Mean Gmd .05 .10
## 8668 0 157 0.998 4.416 4.618 0.55 0.85
## .25 .50 .75 .90 .95
## 1.45 2.55 4.95 9.95 12.75
##
## lowest : 0.03 0.06 0.07 0.08 0.11, highest: 165.00 175.00 195.00 265.50 295.00
##
## Value 0 5 10 15 20 25 30 35 40 50 60
## Frequency 4040 3132 1027 381 21 11 5 6 13 6 4
## Proportion 0.466 0.361 0.118 0.044 0.002 0.001 0.001 0.001 0.001 0.001 0.000
##
## Value 65 85 125 165 175 195 265 295
## Frequency 2 2 10 2 1 2 1 2
## Proportion 0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000
##
## For the frequency table, variable is rounded to the nearest 5
## --------------------------------------------------------------------------------
## customer id
## n missing distinct Info Mean Gmd .05 .10
## 8506 162 1534 1 14995 1961 12505 12709
## .25 .50 .75 .90 .95
## 13523 14901 16393 17548 17841
##
## lowest : 12346 12352 12359 12362 12375, highest: 18272 18274 18276 18277 18282
## --------------------------------------------------------------------------------
## transaction country
## n missing distinct
## 8668 0 28
##
## lowest : Australia Austria Bahrain Belgium Channel Islands
## highest: Spain Sweden Switzerland United Kingdom USA
## --------------------------------------------------------------------------------
## date
## n missing distinct Info Mean Gmd .05
## 8668 0 302 1 2011-06-26 11273971 2010-12-14
## .10 .25 .50 .75 .90 .95
## 2011-01-09 2011-03-21 2011-07-08 2011-10-06 2011-11-14 2011-11-29
##
## lowest : 2010-12-01 2010-12-02 2010-12-03 2010-12-05 2010-12-06
## highest: 2011-12-05 2011-12-06 2011-12-07 2011-12-08 2011-12-09
## --------------------------------------------------------------------------------
## day_name
## n missing distinct
## 8668 0 6
##
## lowest : Friday Monday Sunday Thursday Tuesday
## highest: Monday Sunday Thursday Tuesday Wednesday
##
## Value Friday Monday Sunday Thursday Tuesday Wednesday
## Frequency 1276 1429 449 2241 1625 1648
## Proportion 0.147 0.165 0.052 0.259 0.187 0.190
## --------------------------------------------------------------------------------
## day_of_month
## n missing distinct
## 8668 0 31
##
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month_of_year
## n missing distinct
## 8668 0 12
##
## lowest : April August December February January
## highest: March May November October September
##
## Value April August December February January July
## Frequency 530 616 1036 414 678 641
## Proportion 0.061 0.071 0.120 0.048 0.078 0.074
##
## Value June March May November October September
## Frequency 667 639 558 1013 1120 756
## Proportion 0.077 0.074 0.064 0.117 0.129 0.087
## --------------------------------------------------------------------------------
## time_of_day(hh:mm:ss)
## n missing distinct
## 8668 0 686
##
## lowest : 06:10:00 06:14:00 06:15:00 06:19:00 06:21:00
## highest: 20:18:00 20:28:00 20:33:00 20:36:00 20:38:00
## --------------------------------------------------------------------------------
##SALES PERFORMANCE
#create required columns
sales=sales_clean%>%mutate(sale_value=`quantity sold`*`unit price`)
sales=sales %>% mutate(year=format(`transaction timestamp`,"%Y"))
sales=sales %>% mutate(month_number=format(`transaction timestamp`,"%m"))
#plot sales by month and year
df <- sales %>% filter(date<"2011-12-01")%>%
mutate(month_year=(paste(year,month_number,sep="-")))%>%
group_by(month_year)%>%
summarise(Total_sales=sum(sale_value))%>%
arrange(match(month_year, month.name))
## `summarise()` ungrouping output (override with `.groups` argument)
#sales %>%
#mutate(month_year=(paste(year,month_number,sep="-")))%>%
#group_by(month_year)%>%
#summarise(Total_sales=sum(sale_value))%>%
#arrange(match(month_year, month.name))
#Line plot
ggplot(df,aes(x=ordered(month_year), y=Total_sales, group =1))+ggtitle("Monthly Sales of the Baseline Year")+
geom_line(colour = "red", size = 1.2)+
geom_point(colour="darkblue")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
ylab("Total Sales") + xlab("Year-Month")
#LinearRegression for daily sales
daily_df=sales%>%group_by(date)%>%summarise(daily_sales=sum(sale_value))%>%arrange(date)
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(daily_df,aes(x=date,y=daily_sales))+geom_point()+geom_smooth(method="lm")+
ggtitle("Linear Model")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
ylab("Sales") + xlab("Date")
## `geom_smooth()` using formula 'y ~ x'
lm_sales=lm(daily_sales ~ date, data = daily_df)
summary(lm_sales)
##
## Call:
## lm(formula = daily_sales ~ date, data = daily_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30574 -10092 -2444 7574 69768
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.362e+05 1.292e+05 -7.246 3.57e-12 ***
## date 7.404e-04 9.879e-05 7.495 7.37e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15970 on 303 degrees of freedom
## Multiple R-squared: 0.1564, Adjusted R-squared: 0.1536
## F-statistic: 56.17 on 1 and 303 DF, p-value: 7.369e-13
##SALES INSIGHTS
#Sales by Country
country_df=sales%>%group_by(`transaction country`)%>%
summarise(country_sales=sum(sale_value),transaction_count=n())%>%
mutate(avg_product_cost=country_sales/transaction_count)%>%
arrange(desc(country_sales))%>%head(10)
## `summarise()` ungrouping output (override with `.groups` argument)
country_df
## # A tibble: 10 x 4
## `transaction country` country_sales transaction_count avg_product_cost
## <chr> <dbl> <int> <dbl>
## 1 United Kingdom 8281390. 486200 17.0
## 2 Netherlands 283480. 2326 122.
## 3 EIRE 259380. 8059 32.2
## 4 Germany 200620. 9080 22.1
## 5 France 182077. 8218 22.2
## 6 Australia 136922. 1253 109.
## 7 Switzerland 52483. 1960 26.8
## 8 Spain 51747. 2462 21.0
## 9 Belgium 36663. 1971 18.6
## 10 Japan 35420. 355 99.8
#plot of country sales
top_n(country_df,10,country_sales)%>%
ggplot(aes(x=reorder(`transaction country`,-country_sales),y=country_sales))+
ggtitle("Top 10 Countries by Sales")+ labs(color = "Average Cost of Product")+
geom_point(stat='identity',aes(size=avg_product_cost),color='Blue')+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
ylab("Sales") + xlab("Country")
#Sales by Day
sales%>%group_by(day_name)%>%summarise(day_sales=sum(sale_value))%>%arrange(day_sales)%>%
ggplot(aes(x=reorder(day_name,-day_sales),y=day_sales))+
geom_col(fill='forest green')+
ggtitle(" Sales by Weekday")+
geom_text(aes(label=day_sales), position=position_dodge(width=0.9), vjust=-0.25)+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
ylab("Sales") + xlab("Weekday")
## `summarise()` ungrouping output (override with `.groups` argument)
#Sales by time
sales$hour=format(sales$`transaction timestamp`,"%H")
sales%>%group_by(hour)%>%summarise(hourly_sales=sum(sale_value),total_quantity=sum(`quantity sold`))%>%arrange(hour)%>%
ggplot(aes(x=hour,y=hourly_sales))+
geom_line(aes(group=1),color="pink",size=1.5)+geom_point(colour='maroon',size=1.5)+
ggtitle(" Sales by Hours")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
ylab("Sales") + xlab("Hours of the Day")
## `summarise()` ungrouping output (override with `.groups` argument)
#Sales by product
product_df=sales%>%group_by(`product description`)%>%summarise(prod_sales=sum(sale_value),total_quantity=sum(`quantity sold`))%>%arrange(prod_sales)
## `summarise()` ungrouping output (override with `.groups` argument)
top_n(product_df,10,prod_sales)%>%
ggplot(aes(x=reorder(`product description`,-prod_sales),y=prod_sales))+
geom_col(aes(fill=total_quantity))+
scale_fill_gradient(low="yellow", high="red")+
ggtitle(" Top 10 Sales by Product")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
ylab("Sales") +
xlab("Product")
##CUSTOMER ANALYSIS: CUSTOMER ACQUISITION AND LOYALTY
###Customer Behaviour
#Remove customer IDs with NA
sales_customer=sales%>%filter(!is.na(`customer id`))
describe(sales_customer)
## sales_customer
##
## 17 Variables 399656 Observations
## --------------------------------------------------------------------------------
## transaction id
## n missing distinct
## 399656 0 21785
##
## lowest : 536365 536366 536367 536368 536369
## highest: C581470 C581484 C581490 C581568 C581569
## --------------------------------------------------------------------------------
## product id
## n missing distinct
## 399656 0 3676
##
## lowest : 10002 10080 10120 10123C 10124A, highest: 90214U 90214V 90214W 90214Y 90214Z
## --------------------------------------------------------------------------------
## product description
## n missing distinct
## 399656 0 3877
##
## lowest : 10 COLOUR SPACEBOY PEN 12 COLOURED PARTY BALLOONS 12 DAISY PEGS IN WOOD BOX 12 EGG HOUSE PAINTED WOOD 12 HANGING EGGS HAND PAINTED
## highest: ZINC T-LIGHT HOLDER STARS SMALL ZINC TOP 2 DOOR WOODEN SHELF ZINC WILLIE WINKIE CANDLE STICK ZINC WIRE KITCHEN ORGANISER ZINC WIRE SWEETHEART LETTER TRAY
## --------------------------------------------------------------------------------
## quantity sold
## n missing distinct Info Mean Gmd .05 .10
## 399656 0 431 0.987 12.2 18.88 1 1
## .25 .50 .75 .90 .95
## 2 5 12 24 36
##
## lowest : -80995 -74215 -9360 -3114 -2000, highest: 3906 4300 4800 74215 80995
##
## Value -80000 -74000 -10000 -4000 -2000 0 2000 4000 74000
## Frequency 1 1 1 1 6 399534 104 6 1
## Proportion 0 0 0 0 0 1 0 0 0
##
## Value 80000
## Frequency 1
## Proportion 0
##
## For the frequency table, variable is rounded to the nearest 2000
## --------------------------------------------------------------------------------
## transaction timestamp
## n missing distinct Info
## 399656 0 20132 1
## Mean Gmd .05 .10
## 2011-07-10 12:35:18 11094688 2010-12-14 15:02:00 2011-01-21 13:47:00
## .25 .50 .75 .90
## 2011-04-06 15:07:00 2011-07-29 15:51:00 2011-10-20 12:03:00 2011-11-20 15:00:00
## .95
## 2011-11-29 15:23:00
##
## lowest : 2010-12-01 08:26:00 2010-12-01 08:28:00 2010-12-01 08:34:00 2010-12-01 08:35:00 2010-12-01 08:45:00
## highest: 2011-12-09 12:23:00 2011-12-09 12:25:00 2011-12-09 12:31:00 2011-12-09 12:49:00 2011-12-09 12:50:00
## --------------------------------------------------------------------------------
## unit price
## n missing distinct Info Mean Gmd .05 .10
## 399656 0 372 0.996 2.908 2.811 0.42 0.55
## .25 .50 .75 .90 .95
## 1.25 1.95 3.75 6.25 8.50
##
## lowest : 0.03 0.04 0.06 0.07 0.08, highest: 175.00 195.00 265.50 295.00 649.50
## --------------------------------------------------------------------------------
## customer id
## n missing distinct Info Mean Gmd .05 .10
## 399656 0 4362 1 15289 1974 12627 12901
## .25 .50 .75 .90 .95
## 13959 15152 16791 17719 17901
##
## lowest : 12346 12347 12348 12349 12350, highest: 18280 18281 18282 18283 18287
## --------------------------------------------------------------------------------
## transaction country
## n missing distinct
## 399656 0 37
##
## lowest : Australia Austria Bahrain Belgium Brazil
## highest: Switzerland United Arab Emirates United Kingdom Unspecified USA
## --------------------------------------------------------------------------------
## date
## n missing distinct Info Mean Gmd .05
## 399656 0 305 1 2011-07-10 11094244 2010-12-14
## .10 .25 .50 .75 .90 .95
## 2011-01-21 2011-04-06 2011-07-29 2011-10-20 2011-11-20 2011-11-29
##
## lowest : 2010-12-01 2010-12-02 2010-12-03 2010-12-05 2010-12-06
## highest: 2011-12-05 2011-12-06 2011-12-07 2011-12-08 2011-12-09
## --------------------------------------------------------------------------------
## day_name
## n missing distinct
## 399656 0 6
##
## lowest : Friday Monday Sunday Thursday Tuesday
## highest: Monday Sunday Thursday Tuesday Wednesday
##
## Value Friday Monday Sunday Thursday Tuesday Wednesday
## Frequency 55163 65364 61530 81134 67052 69413
## Proportion 0.138 0.164 0.154 0.203 0.168 0.174
## --------------------------------------------------------------------------------
## day_of_month
## n missing distinct
## 399656 0 31
##
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month_of_year
## n missing distinct
## 399656 0 12
##
## lowest : April August December February January
## highest: March May November October September
##
## Value April August December February January July
## Frequency 22874 27296 43545 20040 21562 27111
## Proportion 0.057 0.068 0.109 0.050 0.054 0.068
##
## Value June March May November October September
## Frequency 27448 27358 28512 63938 49708 40264
## Proportion 0.069 0.068 0.071 0.160 0.124 0.101
## --------------------------------------------------------------------------------
## time_of_day(hh:mm:ss)
## n missing distinct
## 399656 0 770
##
## lowest : 06:10:00 06:14:00 06:15:00 06:19:00 06:20:00
## highest: 20:18:00 20:28:00 20:33:00 20:36:00 20:38:00
## --------------------------------------------------------------------------------
## sale_value
## n missing distinct Info Mean Gmd .05 .10
## 399656 0 3755 1 20.68 30.35 0.85 1.65
## .25 .50 .75 .90 .95
## 4.25 11.58 19.50 35.40 66.40
##
## lowest : -168469.60 -77183.60 -6539.40 -4921.50 -4522.50
## highest: 6539.40 7144.72 38970.00 77183.60 168469.60
##
## Value -170000 -75000 -5000 0 5000 40000 75000 170000
## Frequency 1 1 6 399591 54 1 1 1
## Proportion 0 0 0 1 0 0 0 0
##
## For the frequency table, variable is rounded to the nearest 5000
## --------------------------------------------------------------------------------
## year
## n missing distinct
## 399656 0 2
##
## Value 2010 2011
## Frequency 26235 373421
## Proportion 0.066 0.934
## --------------------------------------------------------------------------------
## month_number
## n missing distinct
## 399656 0 12
##
## lowest : 01 02 03 04 05, highest: 08 09 10 11 12
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 21562 20040 27358 22874 28512 27448 27111 27296 40264 49708 63938
## Proportion 0.054 0.050 0.068 0.057 0.071 0.069 0.068 0.068 0.101 0.124 0.160
##
## Value 12
## Frequency 43545
## Proportion 0.109
## --------------------------------------------------------------------------------
## hour
## n missing distinct
## 399656 0 15
##
## lowest : 06 07 08 09 10, highest: 16 17 18 19 20
##
## Value 6 7 8 9 10 11 12 13 14 15 16
## Frequency 39 379 8722 22286 38468 49304 71907 63794 53945 45438 24497
## Proportion 0.000 0.001 0.022 0.056 0.096 0.123 0.180 0.160 0.135 0.114 0.061
##
## Value 17 18 19 20
## Frequency 13531 3088 3413 845
## Proportion 0.034 0.008 0.009 0.002
## --------------------------------------------------------------------------------
#Update the product quantity by removing the effect of cancelled transaction
#Cancelled Order Check
cancelled = sales_customer%>%filter(str_detect(`transaction id`,"^C"))
describe(cancelled)
## cancelled
##
## 17 Variables 8506 Observations
## --------------------------------------------------------------------------------
## transaction id
## n missing distinct
## 8506 0 3383
##
## lowest : C536383 C536391 C536506 C536543 C536548
## highest: C581470 C581484 C581490 C581568 C581569
## --------------------------------------------------------------------------------
## product id
## n missing distinct
## 8506 0 1915
##
## lowest : 10133 10135 11001 15034 15036 , highest: 90204 90208 90209A 90209C 90211B
## --------------------------------------------------------------------------------
## product description
## n missing distinct
## 8506 0 1939
##
## lowest : 10 COLOUR SPACEBOY PEN 12 COLOURED PARTY BALLOONS 12 EGG HOUSE PAINTED WOOD 12 IVORY ROSE PEG PLACE SETTINGS 12 PENCIL SMALL TUBE WOODLAND
## highest: ZINC SWEETHEART WIRE LETTER RACK ZINC T-LIGHT HOLDER STAR LARGE ZINC T-LIGHT HOLDER STARS SMALL ZINC TOP 2 DOOR WOODEN SHELF ZINC WILLIE WINKIE CANDLE STICK
## --------------------------------------------------------------------------------
## quantity sold
## n missing distinct Info Mean Gmd .05 .10
## 8506 0 132 0.924 -31.47 58.2 -36 -20
## .25 .50 .75 .90 .95
## -6 -2 -1 -1 -1
##
## lowest : -80995 -74215 -9360 -3114 -2000, highest: -5 -4 -3 -2 -1
##
## Value -81000 -74000 -9000 -3000 -2000 -1000 0
## Frequency 1 1 1 1 3 18 8481
## Proportion 0.000 0.000 0.000 0.000 0.000 0.002 0.997
##
## For the frequency table, variable is rounded to the nearest 1000
## --------------------------------------------------------------------------------
## transaction timestamp
## n missing distinct Info
## 8506 0 3324 1
## Mean Gmd .05 .10
## 2011-06-27 00:56:51 11263978 2010-12-14 11:42:00 2011-01-10 10:59:00
## .25 .50 .75 .90
## 2011-03-21 16:27:30 2011-07-11 13:07:00 2011-10-06 20:38:00 2011-11-14 15:26:30
## .95
## 2011-11-29 13:21:45
##
## lowest : 2010-12-01 09:49:00 2010-12-01 10:24:00 2010-12-01 12:38:00 2010-12-01 14:30:00 2010-12-01 14:33:00
## highest: 2011-12-08 19:28:00 2011-12-09 09:27:00 2011-12-09 09:57:00 2011-12-09 11:57:00 2011-12-09 11:58:00
## --------------------------------------------------------------------------------
## unit price
## n missing distinct Info Mean Gmd .05 .10
## 8506 0 152 0.998 4.44 4.641 0.55 0.85
## .25 .50 .75 .90 .95
## 1.45 2.55 4.95 9.95 12.75
##
## lowest : 0.03 0.06 0.07 0.08 0.11, highest: 165.00 175.00 195.00 265.50 295.00
##
## Value 0 5 10 15 20 25 30 35 40 50 60
## Frequency 3944 3080 1020 376 21 10 5 5 13 6 4
## Proportion 0.464 0.362 0.120 0.044 0.002 0.001 0.001 0.001 0.002 0.001 0.000
##
## Value 65 85 125 165 175 195 265 295
## Frequency 2 2 10 2 1 2 1 2
## Proportion 0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000
##
## For the frequency table, variable is rounded to the nearest 5
## --------------------------------------------------------------------------------
## customer id
## n missing distinct Info Mean Gmd .05 .10
## 8506 0 1534 1 14995 1961 12505 12709
## .25 .50 .75 .90 .95
## 13523 14901 16393 17548 17841
##
## lowest : 12346 12352 12359 12362 12375, highest: 18272 18274 18276 18277 18282
## --------------------------------------------------------------------------------
## transaction country
## n missing distinct
## 8506 0 26
##
## lowest : Australia Austria Belgium Channel Islands Cyprus
## highest: Spain Sweden Switzerland United Kingdom USA
## --------------------------------------------------------------------------------
## date
## n missing distinct Info Mean Gmd .05
## 8506 0 302 1 2011-06-26 11264336 2010-12-14
## .10 .25 .50 .75 .90 .95
## 2011-01-10 2011-03-21 2011-07-11 2011-10-06 2011-11-14 2011-11-29
##
## lowest : 2010-12-01 2010-12-02 2010-12-03 2010-12-05 2010-12-06
## highest: 2011-12-05 2011-12-06 2011-12-07 2011-12-08 2011-12-09
## --------------------------------------------------------------------------------
## day_name
## n missing distinct
## 8506 0 6
##
## lowest : Friday Monday Sunday Thursday Tuesday
## highest: Monday Sunday Thursday Tuesday Wednesday
##
## Value Friday Monday Sunday Thursday Tuesday Wednesday
## Frequency 1224 1401 446 2230 1561 1644
## Proportion 0.144 0.165 0.052 0.262 0.184 0.193
## --------------------------------------------------------------------------------
## day_of_month
## n missing distinct
## 8506 0 31
##
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month_of_year
## n missing distinct
## 8506 0 12
##
## lowest : April August December February January
## highest: March May November October September
##
## Value April August December February January July
## Frequency 529 613 1012 410 666 637
## Proportion 0.062 0.072 0.119 0.048 0.078 0.075
##
## Value June March May November October September
## Frequency 628 608 552 1000 1097 754
## Proportion 0.074 0.071 0.065 0.118 0.129 0.089
## --------------------------------------------------------------------------------
## time_of_day(hh:mm:ss)
## n missing distinct
## 8506 0 685
##
## lowest : 06:10:00 06:14:00 06:15:00 06:19:00 06:21:00
## highest: 20:18:00 20:28:00 20:33:00 20:36:00 20:38:00
## --------------------------------------------------------------------------------
## sale_value
## n missing distinct Info Mean Gmd .05 .10
## 8506 0 900 1 -55.46 97.92 -81.12 -37.50
## .25 .50 .75 .90 .95
## -16.60 -8.30 -3.30 -1.65 -1.25
##
## lowest : -168469.60 -77183.60 -6539.40 -4921.50 -4522.50
## highest: -0.39 -0.29 -0.21 -0.19 -0.12
##
## Value -168000 -78000 -6000 -4000 -2000 0
## Frequency 1 1 1 5 15 8483
## Proportion 0.000 0.000 0.000 0.001 0.002 0.997
##
## For the frequency table, variable is rounded to the nearest 2000
## --------------------------------------------------------------------------------
## year
## n missing distinct
## 8506 0 2
##
## Value 2010 2011
## Frequency 666 7840
## Proportion 0.078 0.922
## --------------------------------------------------------------------------------
## month_number
## n missing distinct
## 8506 0 12
##
## lowest : 01 02 03 04 05, highest: 08 09 10 11 12
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 666 410 608 529 552 628 637 613 754 1097 1000
## Proportion 0.078 0.048 0.071 0.062 0.065 0.074 0.075 0.072 0.089 0.129 0.118
##
## Value 12
## Frequency 1012
## Proportion 0.119
## --------------------------------------------------------------------------------
## hour
## n missing distinct
## 8506 0 15
##
## lowest : 06 07 08 09 10, highest: 16 17 18 19 20
##
## Value 6 7 8 9 10 11 12 13 14 15 16
## Frequency 38 4 98 499 910 1095 1191 994 893 823 871
## Proportion 0.004 0.000 0.012 0.059 0.107 0.129 0.140 0.117 0.105 0.097 0.102
##
## Value 17 18 19 20
## Frequency 634 204 184 68
## Proportion 0.075 0.024 0.022 0.008
## --------------------------------------------------------------------------------
#Create a list of cancelled transactions to join with the products of similar value
cancelled_join=cancelled%>%
mutate(`quantity sold`=abs(`quantity sold`),sale_value=abs(sale_value))%>%
select(`transaction id`,`product id`,`product description`,`quantity sold`,`unit price`,sale_value,`customer id`,`transaction timestamp`)
#Join with sales data to identify the combination of product id,customer, sale_value to remove
cancelled_id_df=sales_customer%>%left_join(cancelled_join,by=c("customer id","sale_value","product id","product description","quantity sold","unit price"))%>%
filter(`transaction timestamp.x`<`transaction timestamp.y`)%>%
arrange(`customer id`,desc(`transaction timestamp.x`))%>%
group_by(`product id`,`customer id`,sale_value,`transaction id.y`)%>%slice(1)%>%ungroup()%>%
mutate(`transaction timestamp`=`transaction timestamp.x`,`transaction id`=`transaction id.x`)%>%
select("transaction id","customer id","sale_value","product id","product description","quantity sold","unit price","transaction timestamp","transaction id.y")
sales_customer=sales_customer%>%
left_join(cancelled_id_df, by=c("transaction id","customer id","sale_value","product id","product description","quantity sold","unit price","transaction timestamp"))%>%
filter(is.na(`transaction id.y.y`))%>%
filter((!str_detect(`transaction id`,"^C")))
#Prepare final sales_customer data frame
sales_customer=sales_customer%>%select(-c(`transaction id.y.y`))
#Customer Behaviour: types of customer that buy
#Create customer level df
cust_df=sales_customer%>%group_by(`customer id`,`transaction country`)%>%
summarise(customer_sales=sum(sale_value),total_quantity=sum(`quantity sold`),total_transaction=length(unique(as.character(`transaction id`))))
## `summarise()` regrouping output by 'customer id' (override with `.groups` argument)
#Top5 Customers
cust_df%>%arrange(desc(customer_sales))%>%head(5)
## # A tibble: 5 x 5
## # Groups: customer id [5]
## `customer id` `transaction cou~ customer_sales total_quantity total_transacti~
## <dbl> <chr> <dbl> <dbl> <int>
## 1 14646 Netherlands 278778. 196556 72
## 2 18102 United Kingdom 259657. 64124 60
## 3 17450 United Kingdom 189608. 69031 45
## 4 14911 EIRE 129691. 77154 195
## 5 12415 Australia 123807. 77056 19
#Customers by country
cust_df%>%group_by(`transaction country`)%>%
summarise(count_cust=length(`customer id`))%>%
arrange(desc(count_cust))%>%head(5)%>%
ggplot(aes(x=reorder(`transaction country`,-count_cust),y=count_cust))+
geom_col(fill="Light Blue")+ geom_text(aes(label=count_cust), position=position_dodge(width=0.9), vjust=-0.25)+
ggtitle(" Top 5 Countries by Customers")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
ylab("Number of Customers") +
xlab("Country")
## `summarise()` ungrouping output (override with `.groups` argument)
###Customer Purchase Behaviour
#Average transaction, median transaction
summary(cust_df$total_transaction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 4.222 5.000 203.000
#average quantity
sum(cust_df$total_quantity)/sum(cust_df$total_transaction)
## [1] 268.9532
#median
summary(cust_df$total_quantity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 158.0 373.5 1135.4 982.2 196556.0
#average purchase amount
sum(cust_df$customer_sales)/sum(cust_df$total_transaction)
## [1] 456.6943
#median
summary(cust_df$customer_sales)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.9 301.2 655.1 1928.0 1613.1 278778.0
#Most bought products
sales_customer%>%group_by(`product description`)%>%summarise(count=n())%>%ungroup()%>%arrange(desc(count))%>%head(10)%>%
ggplot(aes(x=count,y=reorder(`product description`,count)))+
geom_col(fill="green")+ geom_text(aes(label=count), position=position_dodge(width=0), hjust=1.1)+
ggtitle(" Most Bought Products")+
theme(axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
ylab("Product Description") +
xlab("Count")
## `summarise()` ungrouping output (override with `.groups` argument)
###Customer Loyalty and Retention
#Customers on monthly basis
sales_customer%>%filter(date<'2011-12-01')%>%
mutate(year_month=format(date,"%Y-%m"))%>%
group_by(year_month)%>%
summarise(trans_count=length(unique(`transaction id`)),distinct_cust=length(unique(`customer id`)))%>%
arrange(year_month)%>%
ggplot(aes(x=ordered(year_month),group=1))+
geom_line(aes(y=distinct_cust,colour="Customer Count"),size=1)+ geom_point(aes(y=distinct_cust),colour="RED",size=2)+
geom_line(aes(y=trans_count,colour="Transaction Count"),size=1)+ geom_point(aes(y=trans_count),colour="blue",size=2)+
scale_colour_manual("",values=c("red","blue"))+
ggtitle("Monthly Customers and Transactions")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
ylab("Count") +
xlab("Year-Month")
## `summarise()` ungrouping output (override with `.groups` argument)
#Customer Retention
sales_customer%>%select(`transaction id`,`customer id`,date)%>%
filter(date<'2011-12-01')%>%
mutate(year_month=format(date,"%Y-%m"))%>%
group_by(year_month)%>%
summarise(trans_count=length(unique(`transaction id`)),distinct_cust=length(unique(`customer id`)))%>%
arrange(year_month)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 13 x 3
## year_month trans_count distinct_cust
## <chr> <int> <int>
## 1 2010-12 1387 883
## 2 2011-01 975 737
## 3 2011-02 989 756
## 4 2011-03 1304 970
## 5 2011-04 1130 849
## 6 2011-05 1528 1051
## 7 2011-06 1377 986
## 8 2011-07 1314 945
## 9 2011-08 1264 933
## 10 2011-09 1729 1254
## 11 2011-10 1887 1356
## 12 2011-11 2631 1657
## 13 2011-12 118 111
#Cohort Analysis
cohort=sales_customer%>%select(`transaction id`,`customer id`,date)%>%
filter(date<'2011-12-01')%>%
mutate(year_month=format(date,"%Y-%m"))%>%
group_by(year_month)%>%select(`customer id`,year_month)%>%distinct(`customer id`,year_month)
cohort= cohort %>% # store in cohort table, get from dbdata
group_by(`customer id`) %>% # group all users together
mutate(first = min(year_month)) %>% # for every user, find the first period
group_by(first, year_month) %>% # group by this first period + the other periods
summarise(users = n()) %>% # for each combination, count the number of users
spread(year_month, users) # and make columns with period names
## `summarise()` regrouping output by 'first' (override with `.groups` argument)
shiftrow <- function(v) {
# put a vector in, strip off leading NA values, and place that amount at the end
first_na_index <- min( which(!is.na(v)) )
# return that bit to the end, and pad with NAs.
c(v[first_na_index:length(v)], rep(NA, first_na_index-1))
}
# create a new dataframe, with shifted rows (and keep the first one)
shifted <- data.frame(
cohort = cohort$first,
t(apply( select(as.data.frame(cohort), 2:ncol(cohort)), # 2nd column to the end
1, shiftrow ))) # for every row
# and make column names readable
# first should be "cohort" and the rest week.<number>, (padded)
colnames(shifted) <- c("cohort", sub("","Month.", str_pad(1:(ncol(shifted)-1),2,pad = "0")))
# percentages
shifted_pct <- data.frame(
cohort = shifted$cohort, # first column
shifted[,1:nrow(shifted)+1] / shifted[["Month.01"]]) # rest: divide by month.01
shifted_pct
## cohort Month.01 Month.02 Month.03 Month.04 Month.05 Month.06
## 1 2010-12 1 0.36579841 0.32389581 0.38165345 0.35900340 0.398640997
## 2 2011-01 1 0.21497585 0.26811594 0.22946860 0.31884058 0.287439614
## 3 2011-02 1 0.18635171 0.18635171 0.28346457 0.27034121 0.244094488
## 4 2011-03 1 0.14634146 0.24833703 0.19512195 0.22394678 0.168514412
## 5 2011-04 1 0.21333333 0.20333333 0.21000000 0.19666667 0.230000000
## 6 2011-05 1 0.19434629 0.17314488 0.17314488 0.20848057 0.229681979
## 7 2011-06 1 0.17355372 0.15702479 0.26446281 0.23140496 0.330578512
## 8 2011-07 1 0.17647059 0.20320856 0.22459893 0.27272727 0.005347594
## 9 2011-08 1 0.20118343 0.23668639 0.24260355 0.01183432 NA
## 10 2011-09 1 0.23648649 0.30405405 0.01351351 NA NA
## 11 2011-10 1 0.23595506 0.02247191 NA NA NA
## 12 2011-11 1 0.02173913 NA NA NA NA
## 13 2011-12 1 NA NA NA NA NA
## Month.07 Month.08 Month.09 Month.10 Month.11 Month.12 Month.13
## 1 0.36013590 0.34767837 0.35334088 0.39297848 0.37372593 0.50169875 0.05662514
## 2 0.24879227 0.24154589 0.29710145 0.32850242 0.36714976 0.02415459 NA
## 3 0.25459318 0.27296588 0.24409449 0.30183727 0.01312336 NA NA
## 4 0.26607539 0.23059867 0.27937916 0.01330377 NA NA NA
## 5 0.21333333 0.26000000 0.01666667 NA NA NA NA
## 6 0.26501767 0.02120141 NA NA NA NA NA
## 7 0.01652893 NA NA NA NA NA NA
## 8 NA NA NA NA NA NA NA
## 9 NA NA NA NA NA NA NA
## 10 NA NA NA NA NA NA NA
## 11 NA NA NA NA NA NA NA
## 12 NA NA NA NA NA NA NA
## 13 NA NA NA NA NA NA NA
##MARKET BASKET ANALYSIS
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
#Prepare df in basket format
basket_df=sales_customer[order(sales_customer$`transaction id`),]
basket_df$`transaction id`=as.numeric(as.character(basket_df$`transaction id`))
basket_df= ddply(basket_df,c("`transaction id`","date"),function(df1)paste(df1$`product description`,collapse = ","))
basket_df$`transaction id`=NULL
basket_df$date=NULL
colnames(basket_df)=c("items")
txn = read.transactions("marketbasket.csv", format="basket",sep=",")
#Apply association rules
asso_rules=apriori(txn,parameter = list(supp=0.001,conf=0.8,maxlen=4))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 4 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 18
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7708 item(s), 18289 transaction(s)] done [0.14s].
## sorting and recoding items ... [2431 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4
## Warning in apriori(txn, parameter = list(supp = 0.001, conf = 0.8, maxlen = 4)):
## Mining stopped (maxlen reached). Only patterns up to a length of 4 returned!
## done [0.24s].
## writing ... [14060 rule(s)] done [0.02s].
## creating S4 object ... done [0.02s].
summary(is.redundant(asso_rules))
## Mode FALSE TRUE
## logical 12168 1892
asso_rules=asso_rules[!is.redundant(asso_rules)]
asso_rules=sort(asso_rules,by="support", decreasing = T)
summary(asso_rules)
## set of 12168 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 113 2813 9242
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 4.00 4.00 3.75 4.00 4.00
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001039 Min. :0.8000 Min. :0.001039 Min. : 8.361
## 1st Qu.:0.001148 1st Qu.:0.8246 1st Qu.:0.001367 1st Qu.: 20.761
## Median :0.001367 Median :0.8571 Median :0.001640 Median : 32.819
## Mean :0.001707 Mean :0.8668 Mean :0.001973 Mean : 63.321
## 3rd Qu.:0.001859 3rd Qu.:0.9048 3rd Qu.:0.002132 3rd Qu.: 78.193
## Max. :0.022363 Max. :1.0000 Max. :0.027394 Max. :589.968
## count
## Min. : 19.00
## 1st Qu.: 21.00
## Median : 25.00
## Mean : 31.22
## 3rd Qu.: 34.00
## Max. :409.00
##
## mining info:
## data ntransactions support confidence
## txn 18289 0.001 0.8
inspect(asso_rules[1:20])
## lhs rhs support confidence coverage lift count
## [1] {PINK REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.022363169 0.8163673 0.027393515 24.08152 409
## [2] {GREEN REGENCY TEACUP AND SAUCER,
## PINK REGENCY TEACUP AND SAUCER} => {ROSES REGENCY TEACUP AND SAUCER} 0.018645087 0.8337408 0.022363169 21.81443 341
## [3] {PINK REGENCY TEACUP AND SAUCER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.018645087 0.8903394 0.020941550 26.26358 341
## [4] {GREEN REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {ROSES REGENCY TEACUP AND SAUCER} 0.014708295 0.8176292 0.017988955 21.39288 269
## [5] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {GREEN REGENCY TEACUP AND SAUCER} 0.012849254 0.8703704 0.014762972 25.67452 235
## [6] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {ROSES REGENCY TEACUP AND SAUCER} 0.012411832 0.8407407 0.014762972 21.99758 227
## [7] {SET 3 RETROSPOT TEA} => {SUGAR} 0.011919733 1.0000000 0.011919733 83.89450 218
## [8] {SUGAR} => {SET 3 RETROSPOT TEA} 0.011919733 1.0000000 0.011919733 83.89450 218
## [9] {SET 3 RETROSPOT TEA} => {COFFEE} 0.011919733 1.0000000 0.011919733 63.50347 218
## [10] {SUGAR} => {COFFEE} 0.011919733 1.0000000 0.011919733 63.50347 218
## [11] {SET/6 RED SPOTTY PAPER CUPS} => {SET/6 RED SPOTTY PAPER PLATES} 0.011701022 0.8230769 0.014216196 50.85559 214
## [12] {SHED} => {KEY FOB} 0.011372956 1.0000000 0.011372956 60.16118 208
## [13] {REGENCY TEA PLATE GREEN} => {REGENCY TEA PLATE ROSES} 0.011099568 0.8388430 0.013231997 52.72027 203
## [14] {GREEN REGENCY TEACUP AND SAUCER,
## PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {ROSES REGENCY TEACUP AND SAUCER} 0.011099568 0.8638298 0.012849254 22.60169 203
## [15] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.011099568 0.8942731 0.012411832 26.37961 203
## [16] {BACK DOOR} => {KEY FOB} 0.010552791 1.0000000 0.010552791 60.16118 193
## [17] {WOODEN TREE CHRISTMAS SCANDINAVIAN} => {WOODEN STAR CHRISTMAS SCANDINAVIAN} 0.009896659 0.8153153 0.012138444 43.34681 181
## [18] {REGENCY TEA PLATE PINK} => {REGENCY TEA PLATE GREEN} 0.009787304 0.8905473 0.010990213 67.30256 179
## [19] {REGENCY TEA PLATE PINK} => {REGENCY TEA PLATE ROSES} 0.009568593 0.8706468 0.010990213 54.71910 175
## [20] {REGENCY TEA PLATE GREEN,
## REGENCY TEA PLATE PINK} => {REGENCY TEA PLATE ROSES} 0.008912461 0.9106145 0.009787304 57.23103 163
#To find rule for specific product
specific_asso_rules=apriori(txn,parameter = list(supp=0.001,conf=0.8),appearance = list(default="lhs",rhs="SUGAR"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 18
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[7708 item(s), 18289 transaction(s)] done [0.14s].
## sorting and recoding items ... [2431 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(txn, parameter = list(supp = 0.001, conf = 0.8), appearance
## = list(default = "lhs", : Mining stopped (maxlen reached). Only patterns up to a
## length of 10 returned!
## done [0.36s].
## writing ... [116 rule(s)] done [0.04s].
## creating S4 object ... done [0.01s].
inspect(specific_asso_rules[1:10])
## lhs rhs support confidence coverage lift count
## [1] {SET 3 RETROSPOT TEA} => {SUGAR} 0.011919733 1.0000000 0.011919733 83.89450 218
## [2] {RED RETROSPOT MUG,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.001093554 1.0000000 0.001093554 83.89450 20
## [3] {COFFEE,
## RED RETROSPOT MUG} => {SUGAR} 0.001093554 0.9523810 0.001148231 79.89952 20
## [4] {RED RETROSPOT SUGAR JAM BOWL,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.001804363 1.0000000 0.001804363 83.89450 33
## [5] {COFFEE,
## RED RETROSPOT SUGAR JAM BOWL} => {SUGAR} 0.001804363 0.9705882 0.001859041 81.42701 33
## [6] {RED RETROSPOT BUTTER DISH,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.002023074 1.0000000 0.002023074 83.89450 37
## [7] {COFFEE,
## RED RETROSPOT BUTTER DISH} => {SUGAR} 0.002023074 0.9250000 0.002187107 77.60241 37
## [8] {BREAD BIN DINER STYLE RED,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.001038876 1.0000000 0.001038876 83.89450 19
## [9] {BREAD BIN DINER STYLE RED,
## COFFEE} => {SUGAR} 0.001038876 0.9047619 0.001148231 75.90454 19
## [10] {RED SPOTTY BISCUIT TIN,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.003499371 1.0000000 0.003499371 83.89450 64
summary(specific_asso_rules)
## set of 116 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5
## 1 71 42 2
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 3.000 3.388 4.000 5.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001039 Min. :0.8000 Min. :0.001039 Min. :67.12
## 1st Qu.:0.001094 1st Qu.:0.9931 1st Qu.:0.001203 1st Qu.:83.31
## Median :0.001422 Median :1.0000 Median :0.001504 Median :83.89
## Mean :0.001733 Mean :0.9773 Mean :0.001769 Mean :81.99
## 3rd Qu.:0.001763 3rd Qu.:1.0000 3rd Qu.:0.001804 3rd Qu.:83.89
## Max. :0.011920 Max. :1.0000 Max. :0.011920 Max. :83.89
## count
## Min. : 19.00
## 1st Qu.: 20.00
## Median : 26.00
## Mean : 31.69
## 3rd Qu.: 32.25
## Max. :218.00
##
## mining info:
## data ntransactions support confidence
## txn 18289 0.001 0.8
#Filter rules
filtered_rules <- subset(asso_rules, subset = lift > 20 & support>0.005)
inspect(filtered_rules[1:10])
## lhs rhs support confidence coverage lift count
## [1] {PINK REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.02236317 0.8163673 0.02739352 24.08152 409
## [2] {GREEN REGENCY TEACUP AND SAUCER,
## PINK REGENCY TEACUP AND SAUCER} => {ROSES REGENCY TEACUP AND SAUCER} 0.01864509 0.8337408 0.02236317 21.81443 341
## [3] {PINK REGENCY TEACUP AND SAUCER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.01864509 0.8903394 0.02094155 26.26358 341
## [4] {GREEN REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {ROSES REGENCY TEACUP AND SAUCER} 0.01470829 0.8176292 0.01798896 21.39288 269
## [5] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {GREEN REGENCY TEACUP AND SAUCER} 0.01284925 0.8703704 0.01476297 25.67452 235
## [6] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {ROSES REGENCY TEACUP AND SAUCER} 0.01241183 0.8407407 0.01476297 21.99758 227
## [7] {SET 3 RETROSPOT TEA} => {SUGAR} 0.01191973 1.0000000 0.01191973 83.89450 218
## [8] {SUGAR} => {SET 3 RETROSPOT TEA} 0.01191973 1.0000000 0.01191973 83.89450 218
## [9] {SET 3 RETROSPOT TEA} => {COFFEE} 0.01191973 1.0000000 0.01191973 63.50347 218
## [10] {SUGAR} => {COFFEE} 0.01191973 1.0000000 0.01191973 63.50347 218
#Visualize market basket
plot(asso_rules)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(filtered_rules,method = "graph",engine = "htmlwidget")
## Warning: Too many rules supplied. Only plotting the best 100 rules using lift
## (change control parameter max if needed)
plot(filtered_rules)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(filtered_rules,method = "graph")
## Warning: plot: Too many rules supplied. Only plotting the best 100 rules using
## 'support' (change control parameter max if needed)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
kmean_df=sales_customer%>%
group_by(`customer id`)%>%
summarise(unique_items=n_distinct(`product id`),transaction_count=n_distinct(`transaction id`),quantity_total=sum(`quantity sold`),total_sales=sum(sale_value))
## `summarise()` ungrouping output (override with `.groups` argument)
kmean_df$`customer id`=NULL
#Function to normalize the data
normalize=function(x){(x-min(x))/(max(x)-min(x))}
kmean_df=as.data.frame(lapply(kmean_df, normalize))
summary(kmean_df)
## unique_items transaction_count quantity_total total_sales
## Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000
## 1st Qu.:0.008547 1st Qu.:0.000000 1st Qu.:0.0007988 1st Qu.:0.001070
## Median :0.019373 Median :0.004951 Median :0.0019002 Median :0.002344
## Mean :0.034330 Mean :0.015987 Mean :0.0057822 Mean :0.006918
## 3rd Qu.:0.043305 3rd Qu.:0.019802 3rd Qu.:0.0049961 3rd Qu.:0.005786
## Max. :1.000000 Max. :1.000000 Max. :1.0000000 Max. :1.000000
#Dimesionality Check (Multicollinearity Check)
cor(kmean_df)
## unique_items transaction_count quantity_total total_sales
## unique_items 1.0000000 0.6908534 0.4367447 0.4021930
## transaction_count 0.6908534 1.0000000 0.5856440 0.5739230
## quantity_total 0.4367447 0.5856440 1.0000000 0.9194646
## total_sales 0.4021930 0.5739230 0.9194646 1.0000000
#Quantity total and total sales have high collinearity, hence dropping one
kmean_df$total_sales=NULL
set.seed(123)
# function to compute total within-cluster sum of square
wss=function(k){
kmeans(kmean_df,k,nstart=25)$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15
k.values=1:10
# extract wss for 2-15 clusters
wss_values=sapply(k.values,wss)
warnings()
#Plot to visualize and select k value based on elbow method
plot(k.values,wss_values, xlab="Number of clusters k", ylab="Total within-clusters sum of squares")
#K=5 seems optimum from above graph
#Alternate way to visualize the optimum k value
set.seed(123)
fviz_nbclust(kmean_df, kmeans, method = "wss")
#Generating K Means with K=5
set.seed(123)
kmeans_final=kmeans(kmean_df,5,nstart = 25)
#Plots
fviz_cluster(kmeans_final, data = kmean_df)