library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(plyr)
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange()   masks plyr::arrange()
## x purrr::compact()   masks plyr::compact()
## x dplyr::count()     masks plyr::count()
## x dplyr::failwith()  masks plyr::failwith()
## x dplyr::filter()    masks stats::filter()
## x dplyr::id()        masks plyr::id()
## x dplyr::lag()       masks stats::lag()
## x dplyr::mutate()    masks plyr::mutate()
## x dplyr::rename()    masks plyr::rename()
## x dplyr::src()       masks Hmisc::src()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize(), Hmisc::summarize()
library(stringr)
library(readxl)
library(writexl)
library(ggplot2)
library(dplyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
sales_data = read_xlsx("D:\\Drive E\\Software\\R\\UCI Online Retail\\sales_data.xlsx")
date = read_xlsx("D:\\Drive E\\Software\\R\\UCI Online Retail\\date.xlsx")

date=date%>%distinct()
write.csv(date,file="Date_distinct")
head(date)
## # A tibble: 6 x 6
##   timestamp           date                day_name day_of_month month_of_year
##   <dttm>              <dttm>              <chr>    <chr>        <chr>        
## 1 2010-12-01 08:26:00 2010-12-01 00:00:00 Wednesd~ 01           December     
## 2 2010-12-01 08:28:00 2010-12-01 00:00:00 Wednesd~ 01           December     
## 3 2010-12-01 08:34:00 2010-12-01 00:00:00 Wednesd~ 01           December     
## 4 2010-12-01 08:35:00 2010-12-01 00:00:00 Wednesd~ 01           December     
## 5 2010-12-01 08:45:00 2010-12-01 00:00:00 Wednesd~ 01           December     
## 6 2010-12-01 09:00:00 2010-12-01 00:00:00 Wednesd~ 01           December     
## # ... with 1 more variable: `time_of_day(hh:mm:ss)` <chr>
Sales_Data_Date=sales_data%>%left_join(date,by=c("transaction timestamp"="timestamp")) #left join sales data with date


write_xlsx(Sales_Data_Date,"D:\\Drive E\\Software\\R\\Sales_Data_Date.xlsx")
write_xlsx(date,"D:\\Drive E\\Software\\R\\Date_Distinct.xlsx")
Dup=Sales_Data_Date[duplicated(Sales_Data_Date),]
#duplicated(Sales_Data_Date)
#head(Dup)
SalesDD_Dist= Sales_Data_Date%>%distinct()
SalesDD_Dist%>%count(distinct(SalesDD_Dist,)) #Create Distinct Sales_Data_Date Dataset
## # A tibble: 536,641 x 14
##    `transaction id` `product id` `product descri~ `quantity sold`
##    <chr>            <chr>        <chr>                      <dbl>
##  1 536365           21730        GLASS STAR FROS~               6
##  2 536365           22752        SET 7 BABUSHKA ~               2
##  3 536365           71053        WHITE METAL LAN~               6
##  4 536365           84029E       RED WOOLLY HOTT~               6
##  5 536365           84029G       KNITTED UNION F~               6
##  6 536365           84406B       CREAM CUPID HEA~               8
##  7 536365           85123A       WHITE HANGING H~               6
##  8 536366           22632        HAND WARMER RED~               6
##  9 536366           22633        HAND WARMER UNI~               6
## 10 536367           21754        HOME BUILDING B~               3
## # ... with 536,631 more rows, and 10 more variables: `transaction
## #   timestamp` <dttm>, `unit price` <dbl>, `customer id` <dbl>, `transaction
## #   country` <chr>, date <dttm>, day_name <chr>, day_of_month <chr>,
## #   month_of_year <chr>, `time_of_day(hh:mm:ss)` <chr>, n <int>
summary(SalesDD_Dist)
##  transaction id      product id        product description quantity sold      
##  Length:536641      Length:536641      Length:536641       Min.   :-80995.00  
##  Class :character   Class :character   Class :character    1st Qu.:     1.00  
##  Mode  :character   Mode  :character   Mode  :character    Median :     3.00  
##                                                            Mean   :     9.62  
##                                                            3rd Qu.:    10.00  
##                                                            Max.   : 80995.00  
##                                                                               
##  transaction timestamp           unit price         customer id    
##  Min.   :2010-12-01 08:26:00   Min.   :-11062.06   Min.   :12346   
##  1st Qu.:2011-03-28 10:52:00   1st Qu.:     1.25   1st Qu.:13939   
##  Median :2011-07-19 14:04:00   Median :     2.08   Median :15145   
##  Mean   :2011-07-04 08:57:06   Mean   :     4.63   Mean   :15281   
##  3rd Qu.:2011-10-18 17:05:00   3rd Qu.:     4.13   3rd Qu.:16784   
##  Max.   :2011-12-09 12:50:00   Max.   : 38970.00   Max.   :18287   
##                                                    NA's   :135037  
##  transaction country      date                       day_name        
##  Length:536641       Min.   :2010-12-01 00:00:00   Length:536641     
##  Class :character    1st Qu.:2011-03-28 00:00:00   Class :character  
##  Mode  :character    Median :2011-07-19 00:00:00   Mode  :character  
##                      Mean   :2011-07-03 19:22:27                     
##                      3rd Qu.:2011-10-18 00:00:00                     
##                      Max.   :2011-12-09 00:00:00                     
##                                                                      
##  day_of_month       month_of_year      time_of_day(hh:mm:ss)
##  Length:536641      Length:536641      Length:536641        
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character     
##                                                             
##                                                             
##                                                             
## 
describe(SalesDD_Dist) #To get description of data stored for each column
## SalesDD_Dist 
## 
##  13  Variables      536641  Observations
## --------------------------------------------------------------------------------
## transaction id 
##        n  missing distinct 
##   536641        0    25900 
## 
## lowest : 536365  536366  536367  536368  536369 
## highest: C581484 C581490 C581499 C581568 C581569
## --------------------------------------------------------------------------------
## product id 
##        n  missing distinct 
##   536641        0     4070 
## 
## lowest : 10002  10080  10120  10123C 10123G, highest: m      M      PADS   POST   S     
## --------------------------------------------------------------------------------
## product description 
##        n  missing distinct 
##   535187     1454     4211 
## 
## lowest : *Boombox Ipod Classic            *USB Office Mirror Ball          ?                                ? sold as sets?                  ??                              
## highest: ZINC T-LIGHT HOLDER STARS SMALL  ZINC TOP  2 DOOR WOODEN SHELF    ZINC WILLIE WINKIE  CANDLE STICK ZINC WIRE KITCHEN ORGANISER      ZINC WIRE SWEETHEART LETTER TRAY
## --------------------------------------------------------------------------------
## quantity sold 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   536641        0      722    0.974     9.62    16.67        1        1 
##      .25      .50      .75      .90      .95 
##        1        3       10       24       30 
## 
## lowest : -80995 -74215  -9600  -9360  -9058, highest:   4800   5568  12540  74215  80995
##                                                                          
## Value      -80000 -74000 -10000  -6000  -4000  -2000      0   2000   4000
## Frequency       1      1      4      1      5     46 536455    116      8
## Proportion      0      0      0      0      0      0      1      0      0
##                                       
## Value        6000  12000  74000  80000
## Frequency       1      1      1      1
## Proportion      0      0      0      0
## 
## For the frequency table, variable is rounded to the nearest 2000
## --------------------------------------------------------------------------------
## transaction timestamp 
##                   n             missing            distinct                Info 
##              536641                   0               23260                   1 
##                Mean                 Gmd                 .05                 .10 
## 2011-07-04 08:57:06            11438790 2010-12-13 09:35:00 2011-01-12 11:21:00 
##                 .25                 .50                 .75                 .90 
## 2011-03-28 10:52:00 2011-07-19 14:04:00 2011-10-18 17:05:00 2011-11-21 15:18:00 
##                 .95 
## 2011-11-30 14:40:00 
## 
## lowest : 2010-12-01 08:26:00 2010-12-01 08:28:00 2010-12-01 08:34:00 2010-12-01 08:35:00 2010-12-01 08:45:00
## highest: 2011-12-09 12:23:00 2011-12-09 12:25:00 2011-12-09 12:31:00 2011-12-09 12:49:00 2011-12-09 12:50:00
## --------------------------------------------------------------------------------
## unit price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   536641        0     1630    0.998    4.633    6.044     0.42     0.64 
##      .25      .50      .75      .90      .95 
##     1.25     2.08     4.13     7.95     9.95 
## 
## lowest : -11062.060      0.000      0.001      0.010      0.030
## highest:  13541.330  16453.710  16888.020  17836.460  38970.000
## --------------------------------------------------------------------------------
## customer id 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   401604   135037     4372        1    15281     1978    12621    12867 
##      .25      .50      .75      .90      .95 
##    13939    15145    16784    17716    17898 
## 
## lowest : 12346 12347 12348 12349 12350, highest: 18280 18281 18282 18283 18287
## --------------------------------------------------------------------------------
## transaction country 
##        n  missing distinct 
##   536641        0       38 
## 
## lowest : Australia            Austria              Bahrain              Belgium              Brazil              
## highest: Switzerland          United Arab Emirates United Kingdom       Unspecified          USA                 
## --------------------------------------------------------------------------------
## date 
##          n    missing   distinct       Info       Mean        Gmd        .05 
##     536641          0        305          1 2011-07-04   11438597 2010-12-13 
##        .10        .25        .50        .75        .90        .95 
## 2011-01-12 2011-03-28 2011-07-19 2011-10-18 2011-11-21 2011-11-30 
## 
## lowest : 2010-12-01 2010-12-02 2010-12-03 2010-12-05 2010-12-06
## highest: 2011-12-05 2011-12-06 2011-12-07 2011-12-08 2011-12-09
## --------------------------------------------------------------------------------
## day_name 
##        n  missing distinct 
##   536641        0        6 
## 
## lowest : Friday    Monday    Sunday    Thursday  Tuesday  
## highest: Monday    Sunday    Thursday  Tuesday   Wednesday
##                                                                       
## Value         Friday    Monday    Sunday  Thursday   Tuesday Wednesday
## Frequency      81565     94435     62806    103056    101064     93715
## Proportion     0.152     0.176     0.117     0.192     0.188     0.175
## --------------------------------------------------------------------------------
## day_of_month 
##        n  missing distinct 
##   536641        0       31 
## 
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month_of_year 
##        n  missing distinct 
##   536641        0       12 
## 
## lowest : April     August    December  February  January  
## highest: March     May       November  October   September
##                                                                       
## Value          April    August  December  February   January      July
## Frequency      29701     35064     67227     27479     34900     39267
## Proportion     0.055     0.065     0.125     0.051     0.065     0.073
##                                                                       
## Value           June     March       May  November   October September
## Frequency      36609     36439     36782     83343     59969     49861
## Proportion     0.068     0.068     0.069     0.155     0.112     0.093
## --------------------------------------------------------------------------------
## time_of_day(hh:mm:ss) 
##        n  missing distinct 
##   536641        0      774 
## 
## lowest : 06:10:00 06:12:00 06:13:00 06:14:00 06:15:00
## highest: 20:28:00 20:32:00 20:33:00 20:36:00 20:38:00
## --------------------------------------------------------------------------------
str(SalesDD_Dist)#get structure
## tibble [536,641 x 13] (S3: tbl_df/tbl/data.frame)
##  $ transaction id       : chr [1:536641] "536365" "536365" "536365" "536365" ...
##  $ product id           : chr [1:536641] "85123A" "71053" "84406B" "84029G" ...
##  $ product description  : chr [1:536641] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ quantity sold        : num [1:536641] 6 6 8 6 6 2 6 6 6 32 ...
##  $ transaction timestamp: POSIXct[1:536641], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
##  $ unit price           : num [1:536641] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ customer id          : num [1:536641] 17850 17850 17850 17850 17850 ...
##  $ transaction country  : chr [1:536641] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  $ date                 : POSIXct[1:536641], format: "2010-12-01" "2010-12-01" ...
##  $ day_name             : chr [1:536641] "Wednesday" "Wednesday" "Wednesday" "Wednesday" ...
##  $ day_of_month         : chr [1:536641] "01" "01" "01" "01" ...
##  $ month_of_year        : chr [1:536641] "December" "December" "December" "December" ...
##  $ time_of_day(hh:mm:ss): chr [1:536641] "08:26:00" "08:26:00" "08:26:00" "08:26:00" ...
str(SalesDD_Dist)#get structure
## tibble [536,641 x 13] (S3: tbl_df/tbl/data.frame)
##  $ transaction id       : chr [1:536641] "536365" "536365" "536365" "536365" ...
##  $ product id           : chr [1:536641] "85123A" "71053" "84406B" "84029G" ...
##  $ product description  : chr [1:536641] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ quantity sold        : num [1:536641] 6 6 8 6 6 2 6 6 6 32 ...
##  $ transaction timestamp: POSIXct[1:536641], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
##  $ unit price           : num [1:536641] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ customer id          : num [1:536641] 17850 17850 17850 17850 17850 ...
##  $ transaction country  : chr [1:536641] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  $ date                 : POSIXct[1:536641], format: "2010-12-01" "2010-12-01" ...
##  $ day_name             : chr [1:536641] "Wednesday" "Wednesday" "Wednesday" "Wednesday" ...
##  $ day_of_month         : chr [1:536641] "01" "01" "01" "01" ...
##  $ month_of_year        : chr [1:536641] "December" "December" "December" "December" ...
##  $ time_of_day(hh:mm:ss): chr [1:536641] "08:26:00" "08:26:00" "08:26:00" "08:26:00" ...
#create dataframe to modify
SalesDD_Mod = SalesDD_Dist 
SalesDD_Mod$`transaction id`=as.factor(SalesDD_Dist$`transaction id`)
str(SalesDD_Mod)
## tibble [536,641 x 13] (S3: tbl_df/tbl/data.frame)
##  $ transaction id       : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ...
##  $ product id           : chr [1:536641] "85123A" "71053" "84406B" "84029G" ...
##  $ product description  : chr [1:536641] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ quantity sold        : num [1:536641] 6 6 8 6 6 2 6 6 6 32 ...
##  $ transaction timestamp: POSIXct[1:536641], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
##  $ unit price           : num [1:536641] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ customer id          : num [1:536641] 17850 17850 17850 17850 17850 ...
##  $ transaction country  : chr [1:536641] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  $ date                 : POSIXct[1:536641], format: "2010-12-01" "2010-12-01" ...
##  $ day_name             : chr [1:536641] "Wednesday" "Wednesday" "Wednesday" "Wednesday" ...
##  $ day_of_month         : chr [1:536641] "01" "01" "01" "01" ...
##  $ month_of_year        : chr [1:536641] "December" "December" "December" "December" ...
##  $ time_of_day(hh:mm:ss): chr [1:536641] "08:26:00" "08:26:00" "08:26:00" "08:26:00" ...
Cancelled = SalesDD_Mod%>%filter(str_detect(`transaction id`,"^C"))

#Imputation : Remove 0 unit price
zero_price=SalesDD_Mod%>%filter(`unit price`==0)
SalesDD_Mod=SalesDD_Mod%>%filter(`unit price`!=0)
#Order with ProductIDs having no integer but only letters are additional fees not actual products. E.g. Post, AmazonFee, Manual etc.
#Removing product IDs with only character names i.e filtering for Product IDs having digits


#Check the data with product id not having any digits
#13 Prod.IDs are invalid except Prod. IDs with DCGSSBOY & DCGSSGIRL are valid product types
#ProductID of data still has contents with ID starting with "gift_" and no customer ID, which indicates gift vouchers are distributed 
#Removing invalid Product IDs:

sales_clean=SalesDD_Mod %>% filter(str_detect(`product id`,"\\d")|str_detect(`product id`,"^DCGSS"))%>%filter(!str_detect(`product id`,"^gift"))
describe(sales_clean)
## sales_clean 
## 
##  13  Variables      531348  Observations
## --------------------------------------------------------------------------------
## transaction id 
##        n  missing distinct 
##   531348        0    23198 
## 
## lowest : 536365  536366  536367  536368  536369 
## highest: C581470 C581484 C581490 C581568 C581569
## --------------------------------------------------------------------------------
## product id 
##        n  missing distinct 
##   531348        0     3922 
## 
## lowest : 10002     10080     10120     10123C    10124A   
## highest: DCGS0069  DCGS0070  DCGS0076  DCGSSBOY  DCGSSGIRL
## --------------------------------------------------------------------------------
## product description 
##        n  missing distinct 
##   531348        0     4016 
## 
## lowest : *Boombox Ipod Classic            *USB Office Mirror Ball          10 COLOUR SPACEBOY PEN           12 COLOURED PARTY BALLOONS       12 DAISY PEGS IN WOOD BOX       
## highest: ZINC T-LIGHT HOLDER STARS SMALL  ZINC TOP  2 DOOR WOODEN SHELF    ZINC WILLIE WINKIE  CANDLE STICK ZINC WIRE KITCHEN ORGANISER      ZINC WIRE SWEETHEART LETTER TRAY
## --------------------------------------------------------------------------------
## quantity sold 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   531348        0      507    0.974    9.959    15.72        1        1 
##      .25      .50      .75      .90      .95 
##        1        3       10       24       30 
## 
## lowest : -80995 -74215  -9360  -3114  -2000, highest:   3906   4300   4800  74215  80995
##                                                                          
## Value      -80000 -74000 -10000  -4000  -2000      0   2000   4000  74000
## Frequency       1      1      1      1      6 531224    106      6      1
## Proportion      0      0      0      0      0      1      0      0      0
##                  
## Value       80000
## Frequency       1
## Proportion      0
## 
## For the frequency table, variable is rounded to the nearest 2000
## --------------------------------------------------------------------------------
## transaction timestamp 
##                   n             missing            distinct                Info 
##              531348                   0               21311                   1 
##                Mean                 Gmd                 .05                 .10 
## 2011-07-04 13:31:05            11440399 2010-12-13 09:35:00 2011-01-12 11:52:00 
##                 .25                 .50                 .75                 .90 
## 2011-03-28 11:51:00 2011-07-20 09:11:00 2011-10-19 10:04:00 2011-11-21 15:57:00 
##                 .95 
## 2011-11-30 14:46:00 
## 
## lowest : 2010-12-01 08:26:00 2010-12-01 08:28:00 2010-12-01 08:34:00 2010-12-01 08:35:00 2010-12-01 08:45:00
## highest: 2011-12-09 12:23:00 2011-12-09 12:25:00 2011-12-09 12:31:00 2011-12-09 12:49:00 2011-12-09 12:50:00
## --------------------------------------------------------------------------------
## unit price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   531348        0      517    0.998    3.305     3.22     0.42     0.65 
##      .25      .50      .75      .90      .95 
##     1.25     2.08     4.13     7.90     9.95 
## 
## lowest :   0.03   0.04   0.06   0.07   0.08, highest: 175.00 195.00 265.50 295.00 649.50
## --------------------------------------------------------------------------------
## customer id 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   399790   131558     4362        1    15289     1974    12627    12901 
##      .25      .50      .75      .90      .95 
##    13959    15152    16791    17719    17900 
## 
## lowest : 12346 12347 12348 12349 12350, highest: 18280 18281 18282 18283 18287
## --------------------------------------------------------------------------------
## transaction country 
##        n  missing distinct 
##   531348        0       38 
## 
## lowest : Australia            Austria              Bahrain              Belgium              Brazil              
## highest: Switzerland          United Arab Emirates United Kingdom       Unspecified          USA                 
## --------------------------------------------------------------------------------
## date 
##          n    missing   distinct       Info       Mean        Gmd        .05 
##     531348          0        305          1 2011-07-04   11440201 2010-12-13 
##        .10        .25        .50        .75        .90        .95 
## 2011-01-12 2011-03-28 2011-07-20 2011-10-19 2011-11-21 2011-11-30 
## 
## lowest : 2010-12-01 2010-12-02 2010-12-03 2010-12-05 2010-12-06
## highest: 2011-12-05 2011-12-06 2011-12-07 2011-12-08 2011-12-09
## --------------------------------------------------------------------------------
## day_name 
##        n  missing distinct 
##   531348        0        6 
## 
## lowest : Friday    Monday    Sunday    Thursday  Tuesday  
## highest: Monday    Sunday    Thursday  Tuesday   Wednesday
##                                                                       
## Value         Friday    Monday    Sunday  Thursday   Tuesday Wednesday
## Frequency      80527     93484     62667    102039     99892     92739
## Proportion     0.152     0.176     0.118     0.192     0.188     0.175
## --------------------------------------------------------------------------------
## day_of_month 
##        n  missing distinct 
##   531348        0       31 
## 
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month_of_year 
##        n  missing distinct 
##   531348        0       12 
## 
## lowest : April     August    December  February  January  
## highest: March     May       November  October   September
##                                                                       
## Value          April    August  December  February   January      July
## Frequency      29280     34731     66629     27160     34585     38845
## Proportion     0.055     0.065     0.125     0.051     0.065     0.073
##                                                                       
## Value           June     March       May  November   October September
## Frequency      36192     35955     36299     82763     59440     49469
## Proportion     0.068     0.068     0.068     0.156     0.112     0.093
## --------------------------------------------------------------------------------
## time_of_day(hh:mm:ss) 
##        n  missing distinct 
##   531348        0      770 
## 
## lowest : 06:10:00 06:14:00 06:15:00 06:19:00 06:20:00
## highest: 20:18:00 20:28:00 20:33:00 20:36:00 20:38:00
## --------------------------------------------------------------------------------
#Prod ID with C2 are carriage thus needs to be removed
sales_clean=sales_clean%>% filter(!str_detect(`product id`,"C2"))

#Cancelled Order Check
cancelled = sales_clean%>%filter(str_detect(`transaction id`,"^C"))
describe(cancelled)
## cancelled 
## 
##  13  Variables      8668  Observations
## --------------------------------------------------------------------------------
## transaction id 
##        n  missing distinct 
##     8668        0     3422 
## 
## lowest : C536383 C536391 C536506 C536543 C536548
## highest: C581470 C581484 C581490 C581568 C581569
## --------------------------------------------------------------------------------
## product id 
##        n  missing distinct 
##     8668        0     1938 
## 
## lowest : 10133  10135  11001  15034  15036 , highest: 90204  90208  90209A 90209C 90211B
## --------------------------------------------------------------------------------
## product description 
##        n  missing distinct 
##     8668        0     1962 
## 
## lowest : 10 COLOUR SPACEBOY PEN           12 COLOURED PARTY BALLOONS       12 EGG HOUSE PAINTED WOOD        12 IVORY ROSE PEG PLACE SETTINGS 12 MESSAGE CARDS WITH ENVELOPES 
## highest: ZINC SWEETHEART WIRE LETTER RACK ZINC T-LIGHT HOLDER STAR LARGE   ZINC T-LIGHT HOLDER STARS SMALL  ZINC TOP  2 DOOR WOODEN SHELF    ZINC WILLIE WINKIE  CANDLE STICK
## --------------------------------------------------------------------------------
## quantity sold 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8668        0      133    0.926   -31.15    57.48      -36      -20 
##      .25      .50      .75      .90      .95 
##       -6       -2       -1       -1       -1 
## 
## lowest : -80995 -74215  -9360  -3114  -2000, highest:     -5     -4     -3     -2     -1
##                                                            
## Value      -81000 -74000  -9000  -3000  -2000  -1000      0
## Frequency       1      1      1      1      3     18   8643
## Proportion  0.000  0.000  0.000  0.000  0.000  0.002  0.997
## 
## For the frequency table, variable is rounded to the nearest 1000
## --------------------------------------------------------------------------------
## transaction timestamp 
##                   n             missing            distinct                Info 
##                8668                   0                3363                   1 
##                Mean                 Gmd                 .05                 .10 
## 2011-06-26 09:26:07            11273694 2010-12-14 11:12:00 2011-01-09 14:17:00 
##                 .25                 .50                 .75                 .90 
## 2011-03-21 16:15:00 2011-07-08 13:39:00 2011-10-06 20:36:00 2011-11-14 15:10:00 
##                 .95 
## 2011-11-29 11:15:09 
## 
## lowest : 2010-12-01 09:49:00 2010-12-01 10:24:00 2010-12-01 12:38:00 2010-12-01 14:30:00 2010-12-01 14:33:00
## highest: 2011-12-08 19:28:00 2011-12-09 09:27:00 2011-12-09 09:57:00 2011-12-09 11:57:00 2011-12-09 11:58:00
## --------------------------------------------------------------------------------
## unit price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8668        0      157    0.998    4.416    4.618     0.55     0.85 
##      .25      .50      .75      .90      .95 
##     1.45     2.55     4.95     9.95    12.75 
## 
## lowest :   0.03   0.06   0.07   0.08   0.11, highest: 165.00 175.00 195.00 265.50 295.00
##                                                                             
## Value          0     5    10    15    20    25    30    35    40    50    60
## Frequency   4040  3132  1027   381    21    11     5     6    13     6     4
## Proportion 0.466 0.361 0.118 0.044 0.002 0.001 0.001 0.001 0.001 0.001 0.000
##                                                           
## Value         65    85   125   165   175   195   265   295
## Frequency      2     2    10     2     1     2     1     2
## Proportion 0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000
## 
## For the frequency table, variable is rounded to the nearest 5
## --------------------------------------------------------------------------------
## customer id 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8506      162     1534        1    14995     1961    12505    12709 
##      .25      .50      .75      .90      .95 
##    13523    14901    16393    17548    17841 
## 
## lowest : 12346 12352 12359 12362 12375, highest: 18272 18274 18276 18277 18282
## --------------------------------------------------------------------------------
## transaction country 
##        n  missing distinct 
##     8668        0       28 
## 
## lowest : Australia       Austria         Bahrain         Belgium         Channel Islands
## highest: Spain           Sweden          Switzerland     United Kingdom  USA            
## --------------------------------------------------------------------------------
## date 
##          n    missing   distinct       Info       Mean        Gmd        .05 
##       8668          0        302          1 2011-06-26   11273971 2010-12-14 
##        .10        .25        .50        .75        .90        .95 
## 2011-01-09 2011-03-21 2011-07-08 2011-10-06 2011-11-14 2011-11-29 
## 
## lowest : 2010-12-01 2010-12-02 2010-12-03 2010-12-05 2010-12-06
## highest: 2011-12-05 2011-12-06 2011-12-07 2011-12-08 2011-12-09
## --------------------------------------------------------------------------------
## day_name 
##        n  missing distinct 
##     8668        0        6 
## 
## lowest : Friday    Monday    Sunday    Thursday  Tuesday  
## highest: Monday    Sunday    Thursday  Tuesday   Wednesday
##                                                                       
## Value         Friday    Monday    Sunday  Thursday   Tuesday Wednesday
## Frequency       1276      1429       449      2241      1625      1648
## Proportion     0.147     0.165     0.052     0.259     0.187     0.190
## --------------------------------------------------------------------------------
## day_of_month 
##        n  missing distinct 
##     8668        0       31 
## 
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month_of_year 
##        n  missing distinct 
##     8668        0       12 
## 
## lowest : April     August    December  February  January  
## highest: March     May       November  October   September
##                                                                       
## Value          April    August  December  February   January      July
## Frequency        530       616      1036       414       678       641
## Proportion     0.061     0.071     0.120     0.048     0.078     0.074
##                                                                       
## Value           June     March       May  November   October September
## Frequency        667       639       558      1013      1120       756
## Proportion     0.077     0.074     0.064     0.117     0.129     0.087
## --------------------------------------------------------------------------------
## time_of_day(hh:mm:ss) 
##        n  missing distinct 
##     8668        0      686 
## 
## lowest : 06:10:00 06:14:00 06:15:00 06:19:00 06:21:00
## highest: 20:18:00 20:28:00 20:33:00 20:36:00 20:38:00
## --------------------------------------------------------------------------------

##SALES PERFORMANCE

#create required columns
sales=sales_clean%>%mutate(sale_value=`quantity sold`*`unit price`)
sales=sales %>% mutate(year=format(`transaction timestamp`,"%Y"))
sales=sales %>% mutate(month_number=format(`transaction timestamp`,"%m"))
#plot sales by month and year

df <- sales %>% filter(date<"2011-12-01")%>%
  mutate(month_year=(paste(year,month_number,sep="-")))%>%
  group_by(month_year)%>%
  summarise(Total_sales=sum(sale_value))%>%
  arrange(match(month_year, month.name))
## `summarise()` ungrouping output (override with `.groups` argument)
#sales %>% 
  #mutate(month_year=(paste(year,month_number,sep="-")))%>%
  #group_by(month_year)%>%
  #summarise(Total_sales=sum(sale_value))%>%
  #arrange(match(month_year, month.name))
#Line plot
ggplot(df,aes(x=ordered(month_year), y=Total_sales, group =1))+ggtitle("Monthly Sales of the Baseline Year")+
  geom_line(colour = "red", size = 1.2)+
  geom_point(colour="darkblue")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
        panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
        panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
  ylab("Total Sales") + xlab("Year-Month")

#LinearRegression for daily sales

daily_df=sales%>%group_by(date)%>%summarise(daily_sales=sum(sale_value))%>%arrange(date)
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(daily_df,aes(x=date,y=daily_sales))+geom_point()+geom_smooth(method="lm")+
  ggtitle("Linear Model")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
        panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
        panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
  ylab("Sales") + xlab("Date")
## `geom_smooth()` using formula 'y ~ x'

lm_sales=lm(daily_sales ~ date, data = daily_df)
summary(lm_sales)
## 
## Call:
## lm(formula = daily_sales ~ date, data = daily_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30574 -10092  -2444   7574  69768 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.362e+05  1.292e+05  -7.246 3.57e-12 ***
## date         7.404e-04  9.879e-05   7.495 7.37e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15970 on 303 degrees of freedom
## Multiple R-squared:  0.1564, Adjusted R-squared:  0.1536 
## F-statistic: 56.17 on 1 and 303 DF,  p-value: 7.369e-13

##SALES INSIGHTS

#Sales by Country
country_df=sales%>%group_by(`transaction country`)%>%
  summarise(country_sales=sum(sale_value),transaction_count=n())%>%
  mutate(avg_product_cost=country_sales/transaction_count)%>%
  arrange(desc(country_sales))%>%head(10)
## `summarise()` ungrouping output (override with `.groups` argument)
country_df
## # A tibble: 10 x 4
##    `transaction country` country_sales transaction_count avg_product_cost
##    <chr>                         <dbl>             <int>            <dbl>
##  1 United Kingdom             8281390.            486200             17.0
##  2 Netherlands                 283480.              2326            122. 
##  3 EIRE                        259380.              8059             32.2
##  4 Germany                     200620.              9080             22.1
##  5 France                      182077.              8218             22.2
##  6 Australia                   136922.              1253            109. 
##  7 Switzerland                  52483.              1960             26.8
##  8 Spain                        51747.              2462             21.0
##  9 Belgium                      36663.              1971             18.6
## 10 Japan                        35420.               355             99.8
#plot of country sales
top_n(country_df,10,country_sales)%>%
  ggplot(aes(x=reorder(`transaction country`,-country_sales),y=country_sales))+
  ggtitle("Top 10 Countries by Sales")+ labs(color = "Average Cost of Product")+
  geom_point(stat='identity',aes(size=avg_product_cost),color='Blue')+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
        panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
        panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
  ylab("Sales") + xlab("Country")

#Sales by Day
sales%>%group_by(day_name)%>%summarise(day_sales=sum(sale_value))%>%arrange(day_sales)%>%
  ggplot(aes(x=reorder(day_name,-day_sales),y=day_sales))+
  geom_col(fill='forest green')+
  ggtitle(" Sales by Weekday")+
  geom_text(aes(label=day_sales), position=position_dodge(width=0.9), vjust=-0.25)+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
        panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
        panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
  ylab("Sales") + xlab("Weekday")
## `summarise()` ungrouping output (override with `.groups` argument)

#Sales by time
sales$hour=format(sales$`transaction timestamp`,"%H")

sales%>%group_by(hour)%>%summarise(hourly_sales=sum(sale_value),total_quantity=sum(`quantity sold`))%>%arrange(hour)%>%
  ggplot(aes(x=hour,y=hourly_sales))+
  geom_line(aes(group=1),color="pink",size=1.5)+geom_point(colour='maroon',size=1.5)+
  ggtitle(" Sales by Hours")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
        panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
        panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
  ylab("Sales") + xlab("Hours of the Day")
## `summarise()` ungrouping output (override with `.groups` argument)

#Sales by product

product_df=sales%>%group_by(`product description`)%>%summarise(prod_sales=sum(sale_value),total_quantity=sum(`quantity sold`))%>%arrange(prod_sales)
## `summarise()` ungrouping output (override with `.groups` argument)
top_n(product_df,10,prod_sales)%>%
ggplot(aes(x=reorder(`product description`,-prod_sales),y=prod_sales))+
  geom_col(aes(fill=total_quantity))+ 
  scale_fill_gradient(low="yellow", high="red")+
  ggtitle(" Top 10 Sales by Product")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
        panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
        panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
  ylab("Sales") + 
  xlab("Product")

##CUSTOMER ANALYSIS: CUSTOMER ACQUISITION AND LOYALTY

###Customer Behaviour

#Remove customer IDs with NA
sales_customer=sales%>%filter(!is.na(`customer id`))
describe(sales_customer)
## sales_customer 
## 
##  17  Variables      399656  Observations
## --------------------------------------------------------------------------------
## transaction id 
##        n  missing distinct 
##   399656        0    21785 
## 
## lowest : 536365  536366  536367  536368  536369 
## highest: C581470 C581484 C581490 C581568 C581569
## --------------------------------------------------------------------------------
## product id 
##        n  missing distinct 
##   399656        0     3676 
## 
## lowest : 10002  10080  10120  10123C 10124A, highest: 90214U 90214V 90214W 90214Y 90214Z
## --------------------------------------------------------------------------------
## product description 
##        n  missing distinct 
##   399656        0     3877 
## 
## lowest : 10 COLOUR SPACEBOY PEN           12 COLOURED PARTY BALLOONS       12 DAISY PEGS IN WOOD BOX        12 EGG HOUSE PAINTED WOOD        12 HANGING EGGS HAND PAINTED    
## highest: ZINC T-LIGHT HOLDER STARS SMALL  ZINC TOP  2 DOOR WOODEN SHELF    ZINC WILLIE WINKIE  CANDLE STICK ZINC WIRE KITCHEN ORGANISER      ZINC WIRE SWEETHEART LETTER TRAY
## --------------------------------------------------------------------------------
## quantity sold 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   399656        0      431    0.987     12.2    18.88        1        1 
##      .25      .50      .75      .90      .95 
##        2        5       12       24       36 
## 
## lowest : -80995 -74215  -9360  -3114  -2000, highest:   3906   4300   4800  74215  80995
##                                                                          
## Value      -80000 -74000 -10000  -4000  -2000      0   2000   4000  74000
## Frequency       1      1      1      1      6 399534    104      6      1
## Proportion      0      0      0      0      0      1      0      0      0
##                  
## Value       80000
## Frequency       1
## Proportion      0
## 
## For the frequency table, variable is rounded to the nearest 2000
## --------------------------------------------------------------------------------
## transaction timestamp 
##                   n             missing            distinct                Info 
##              399656                   0               20132                   1 
##                Mean                 Gmd                 .05                 .10 
## 2011-07-10 12:35:18            11094688 2010-12-14 15:02:00 2011-01-21 13:47:00 
##                 .25                 .50                 .75                 .90 
## 2011-04-06 15:07:00 2011-07-29 15:51:00 2011-10-20 12:03:00 2011-11-20 15:00:00 
##                 .95 
## 2011-11-29 15:23:00 
## 
## lowest : 2010-12-01 08:26:00 2010-12-01 08:28:00 2010-12-01 08:34:00 2010-12-01 08:35:00 2010-12-01 08:45:00
## highest: 2011-12-09 12:23:00 2011-12-09 12:25:00 2011-12-09 12:31:00 2011-12-09 12:49:00 2011-12-09 12:50:00
## --------------------------------------------------------------------------------
## unit price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   399656        0      372    0.996    2.908    2.811     0.42     0.55 
##      .25      .50      .75      .90      .95 
##     1.25     1.95     3.75     6.25     8.50 
## 
## lowest :   0.03   0.04   0.06   0.07   0.08, highest: 175.00 195.00 265.50 295.00 649.50
## --------------------------------------------------------------------------------
## customer id 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   399656        0     4362        1    15289     1974    12627    12901 
##      .25      .50      .75      .90      .95 
##    13959    15152    16791    17719    17901 
## 
## lowest : 12346 12347 12348 12349 12350, highest: 18280 18281 18282 18283 18287
## --------------------------------------------------------------------------------
## transaction country 
##        n  missing distinct 
##   399656        0       37 
## 
## lowest : Australia            Austria              Bahrain              Belgium              Brazil              
## highest: Switzerland          United Arab Emirates United Kingdom       Unspecified          USA                 
## --------------------------------------------------------------------------------
## date 
##          n    missing   distinct       Info       Mean        Gmd        .05 
##     399656          0        305          1 2011-07-10   11094244 2010-12-14 
##        .10        .25        .50        .75        .90        .95 
## 2011-01-21 2011-04-06 2011-07-29 2011-10-20 2011-11-20 2011-11-29 
## 
## lowest : 2010-12-01 2010-12-02 2010-12-03 2010-12-05 2010-12-06
## highest: 2011-12-05 2011-12-06 2011-12-07 2011-12-08 2011-12-09
## --------------------------------------------------------------------------------
## day_name 
##        n  missing distinct 
##   399656        0        6 
## 
## lowest : Friday    Monday    Sunday    Thursday  Tuesday  
## highest: Monday    Sunday    Thursday  Tuesday   Wednesday
##                                                                       
## Value         Friday    Monday    Sunday  Thursday   Tuesday Wednesday
## Frequency      55163     65364     61530     81134     67052     69413
## Proportion     0.138     0.164     0.154     0.203     0.168     0.174
## --------------------------------------------------------------------------------
## day_of_month 
##        n  missing distinct 
##   399656        0       31 
## 
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month_of_year 
##        n  missing distinct 
##   399656        0       12 
## 
## lowest : April     August    December  February  January  
## highest: March     May       November  October   September
##                                                                       
## Value          April    August  December  February   January      July
## Frequency      22874     27296     43545     20040     21562     27111
## Proportion     0.057     0.068     0.109     0.050     0.054     0.068
##                                                                       
## Value           June     March       May  November   October September
## Frequency      27448     27358     28512     63938     49708     40264
## Proportion     0.069     0.068     0.071     0.160     0.124     0.101
## --------------------------------------------------------------------------------
## time_of_day(hh:mm:ss) 
##        n  missing distinct 
##   399656        0      770 
## 
## lowest : 06:10:00 06:14:00 06:15:00 06:19:00 06:20:00
## highest: 20:18:00 20:28:00 20:33:00 20:36:00 20:38:00
## --------------------------------------------------------------------------------
## sale_value 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   399656        0     3755        1    20.68    30.35     0.85     1.65 
##      .25      .50      .75      .90      .95 
##     4.25    11.58    19.50    35.40    66.40 
## 
## lowest : -168469.60  -77183.60   -6539.40   -4921.50   -4522.50
## highest:    6539.40    7144.72   38970.00   77183.60  168469.60
##                                                                           
## Value      -170000  -75000   -5000       0    5000   40000   75000  170000
## Frequency        1       1       6  399591      54       1       1       1
## Proportion       0       0       0       1       0       0       0       0
## 
## For the frequency table, variable is rounded to the nearest 5000
## --------------------------------------------------------------------------------
## year 
##        n  missing distinct 
##   399656        0        2 
##                         
## Value        2010   2011
## Frequency   26235 373421
## Proportion  0.066  0.934
## --------------------------------------------------------------------------------
## month_number 
##        n  missing distinct 
##   399656        0       12 
## 
## lowest : 01 02 03 04 05, highest: 08 09 10 11 12
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency  21562 20040 27358 22874 28512 27448 27111 27296 40264 49708 63938
## Proportion 0.054 0.050 0.068 0.057 0.071 0.069 0.068 0.068 0.101 0.124 0.160
##                 
## Value         12
## Frequency  43545
## Proportion 0.109
## --------------------------------------------------------------------------------
## hour 
##        n  missing distinct 
##   399656        0       15 
## 
## lowest : 06 07 08 09 10, highest: 16 17 18 19 20
##                                                                             
## Value          6     7     8     9    10    11    12    13    14    15    16
## Frequency     39   379  8722 22286 38468 49304 71907 63794 53945 45438 24497
## Proportion 0.000 0.001 0.022 0.056 0.096 0.123 0.180 0.160 0.135 0.114 0.061
##                                   
## Value         17    18    19    20
## Frequency  13531  3088  3413   845
## Proportion 0.034 0.008 0.009 0.002
## --------------------------------------------------------------------------------
#Update the product quantity by removing the effect of cancelled transaction

#Cancelled Order Check
cancelled = sales_customer%>%filter(str_detect(`transaction id`,"^C"))
describe(cancelled)
## cancelled 
## 
##  17  Variables      8506  Observations
## --------------------------------------------------------------------------------
## transaction id 
##        n  missing distinct 
##     8506        0     3383 
## 
## lowest : C536383 C536391 C536506 C536543 C536548
## highest: C581470 C581484 C581490 C581568 C581569
## --------------------------------------------------------------------------------
## product id 
##        n  missing distinct 
##     8506        0     1915 
## 
## lowest : 10133  10135  11001  15034  15036 , highest: 90204  90208  90209A 90209C 90211B
## --------------------------------------------------------------------------------
## product description 
##        n  missing distinct 
##     8506        0     1939 
## 
## lowest : 10 COLOUR SPACEBOY PEN           12 COLOURED PARTY BALLOONS       12 EGG HOUSE PAINTED WOOD        12 IVORY ROSE PEG PLACE SETTINGS 12 PENCIL SMALL TUBE WOODLAND   
## highest: ZINC SWEETHEART WIRE LETTER RACK ZINC T-LIGHT HOLDER STAR LARGE   ZINC T-LIGHT HOLDER STARS SMALL  ZINC TOP  2 DOOR WOODEN SHELF    ZINC WILLIE WINKIE  CANDLE STICK
## --------------------------------------------------------------------------------
## quantity sold 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8506        0      132    0.924   -31.47     58.2      -36      -20 
##      .25      .50      .75      .90      .95 
##       -6       -2       -1       -1       -1 
## 
## lowest : -80995 -74215  -9360  -3114  -2000, highest:     -5     -4     -3     -2     -1
##                                                            
## Value      -81000 -74000  -9000  -3000  -2000  -1000      0
## Frequency       1      1      1      1      3     18   8481
## Proportion  0.000  0.000  0.000  0.000  0.000  0.002  0.997
## 
## For the frequency table, variable is rounded to the nearest 1000
## --------------------------------------------------------------------------------
## transaction timestamp 
##                   n             missing            distinct                Info 
##                8506                   0                3324                   1 
##                Mean                 Gmd                 .05                 .10 
## 2011-06-27 00:56:51            11263978 2010-12-14 11:42:00 2011-01-10 10:59:00 
##                 .25                 .50                 .75                 .90 
## 2011-03-21 16:27:30 2011-07-11 13:07:00 2011-10-06 20:38:00 2011-11-14 15:26:30 
##                 .95 
## 2011-11-29 13:21:45 
## 
## lowest : 2010-12-01 09:49:00 2010-12-01 10:24:00 2010-12-01 12:38:00 2010-12-01 14:30:00 2010-12-01 14:33:00
## highest: 2011-12-08 19:28:00 2011-12-09 09:27:00 2011-12-09 09:57:00 2011-12-09 11:57:00 2011-12-09 11:58:00
## --------------------------------------------------------------------------------
## unit price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8506        0      152    0.998     4.44    4.641     0.55     0.85 
##      .25      .50      .75      .90      .95 
##     1.45     2.55     4.95     9.95    12.75 
## 
## lowest :   0.03   0.06   0.07   0.08   0.11, highest: 165.00 175.00 195.00 265.50 295.00
##                                                                             
## Value          0     5    10    15    20    25    30    35    40    50    60
## Frequency   3944  3080  1020   376    21    10     5     5    13     6     4
## Proportion 0.464 0.362 0.120 0.044 0.002 0.001 0.001 0.001 0.002 0.001 0.000
##                                                           
## Value         65    85   125   165   175   195   265   295
## Frequency      2     2    10     2     1     2     1     2
## Proportion 0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000
## 
## For the frequency table, variable is rounded to the nearest 5
## --------------------------------------------------------------------------------
## customer id 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8506        0     1534        1    14995     1961    12505    12709 
##      .25      .50      .75      .90      .95 
##    13523    14901    16393    17548    17841 
## 
## lowest : 12346 12352 12359 12362 12375, highest: 18272 18274 18276 18277 18282
## --------------------------------------------------------------------------------
## transaction country 
##        n  missing distinct 
##     8506        0       26 
## 
## lowest : Australia       Austria         Belgium         Channel Islands Cyprus         
## highest: Spain           Sweden          Switzerland     United Kingdom  USA            
## --------------------------------------------------------------------------------
## date 
##          n    missing   distinct       Info       Mean        Gmd        .05 
##       8506          0        302          1 2011-06-26   11264336 2010-12-14 
##        .10        .25        .50        .75        .90        .95 
## 2011-01-10 2011-03-21 2011-07-11 2011-10-06 2011-11-14 2011-11-29 
## 
## lowest : 2010-12-01 2010-12-02 2010-12-03 2010-12-05 2010-12-06
## highest: 2011-12-05 2011-12-06 2011-12-07 2011-12-08 2011-12-09
## --------------------------------------------------------------------------------
## day_name 
##        n  missing distinct 
##     8506        0        6 
## 
## lowest : Friday    Monday    Sunday    Thursday  Tuesday  
## highest: Monday    Sunday    Thursday  Tuesday   Wednesday
##                                                                       
## Value         Friday    Monday    Sunday  Thursday   Tuesday Wednesday
## Frequency       1224      1401       446      2230      1561      1644
## Proportion     0.144     0.165     0.052     0.262     0.184     0.193
## --------------------------------------------------------------------------------
## day_of_month 
##        n  missing distinct 
##     8506        0       31 
## 
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month_of_year 
##        n  missing distinct 
##     8506        0       12 
## 
## lowest : April     August    December  February  January  
## highest: March     May       November  October   September
##                                                                       
## Value          April    August  December  February   January      July
## Frequency        529       613      1012       410       666       637
## Proportion     0.062     0.072     0.119     0.048     0.078     0.075
##                                                                       
## Value           June     March       May  November   October September
## Frequency        628       608       552      1000      1097       754
## Proportion     0.074     0.071     0.065     0.118     0.129     0.089
## --------------------------------------------------------------------------------
## time_of_day(hh:mm:ss) 
##        n  missing distinct 
##     8506        0      685 
## 
## lowest : 06:10:00 06:14:00 06:15:00 06:19:00 06:21:00
## highest: 20:18:00 20:28:00 20:33:00 20:36:00 20:38:00
## --------------------------------------------------------------------------------
## sale_value 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8506        0      900        1   -55.46    97.92   -81.12   -37.50 
##      .25      .50      .75      .90      .95 
##   -16.60    -8.30    -3.30    -1.65    -1.25 
## 
## lowest : -168469.60  -77183.60   -6539.40   -4921.50   -4522.50
## highest:      -0.39      -0.29      -0.21      -0.19      -0.12
##                                                           
## Value      -168000  -78000   -6000   -4000   -2000       0
## Frequency        1       1       1       5      15    8483
## Proportion   0.000   0.000   0.000   0.001   0.002   0.997
## 
## For the frequency table, variable is rounded to the nearest 2000
## --------------------------------------------------------------------------------
## year 
##        n  missing distinct 
##     8506        0        2 
##                       
## Value       2010  2011
## Frequency    666  7840
## Proportion 0.078 0.922
## --------------------------------------------------------------------------------
## month_number 
##        n  missing distinct 
##     8506        0       12 
## 
## lowest : 01 02 03 04 05, highest: 08 09 10 11 12
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency    666   410   608   529   552   628   637   613   754  1097  1000
## Proportion 0.078 0.048 0.071 0.062 0.065 0.074 0.075 0.072 0.089 0.129 0.118
##                 
## Value         12
## Frequency   1012
## Proportion 0.119
## --------------------------------------------------------------------------------
## hour 
##        n  missing distinct 
##     8506        0       15 
## 
## lowest : 06 07 08 09 10, highest: 16 17 18 19 20
##                                                                             
## Value          6     7     8     9    10    11    12    13    14    15    16
## Frequency     38     4    98   499   910  1095  1191   994   893   823   871
## Proportion 0.004 0.000 0.012 0.059 0.107 0.129 0.140 0.117 0.105 0.097 0.102
##                                   
## Value         17    18    19    20
## Frequency    634   204   184    68
## Proportion 0.075 0.024 0.022 0.008
## --------------------------------------------------------------------------------
#Create a list of cancelled transactions to join with the products of similar value
cancelled_join=cancelled%>%
  mutate(`quantity sold`=abs(`quantity sold`),sale_value=abs(sale_value))%>%
  select(`transaction id`,`product id`,`product description`,`quantity sold`,`unit price`,sale_value,`customer id`,`transaction timestamp`)

#Join with sales data to identify the combination of product id,customer, sale_value to remove
cancelled_id_df=sales_customer%>%left_join(cancelled_join,by=c("customer id","sale_value","product id","product description","quantity sold","unit price"))%>%
  filter(`transaction timestamp.x`<`transaction timestamp.y`)%>%
  arrange(`customer id`,desc(`transaction timestamp.x`))%>%
  group_by(`product id`,`customer id`,sale_value,`transaction id.y`)%>%slice(1)%>%ungroup()%>%
  mutate(`transaction timestamp`=`transaction timestamp.x`,`transaction id`=`transaction id.x`)%>%
  select("transaction id","customer id","sale_value","product id","product description","quantity sold","unit price","transaction timestamp","transaction id.y")

sales_customer=sales_customer%>%
  left_join(cancelled_id_df, by=c("transaction id","customer id","sale_value","product id","product description","quantity sold","unit price","transaction timestamp"))%>%
  filter(is.na(`transaction id.y.y`))%>%
  filter((!str_detect(`transaction id`,"^C")))  

#Prepare final sales_customer data frame
sales_customer=sales_customer%>%select(-c(`transaction id.y.y`))
#Customer Behaviour: types of customer that buy

#Create customer level df
cust_df=sales_customer%>%group_by(`customer id`,`transaction country`)%>%
  summarise(customer_sales=sum(sale_value),total_quantity=sum(`quantity sold`),total_transaction=length(unique(as.character(`transaction id`))))
## `summarise()` regrouping output by 'customer id' (override with `.groups` argument)
#Top5 Customers
cust_df%>%arrange(desc(customer_sales))%>%head(5)
## # A tibble: 5 x 5
## # Groups:   customer id [5]
##   `customer id` `transaction cou~ customer_sales total_quantity total_transacti~
##           <dbl> <chr>                      <dbl>          <dbl>            <int>
## 1         14646 Netherlands              278778.         196556               72
## 2         18102 United Kingdom           259657.          64124               60
## 3         17450 United Kingdom           189608.          69031               45
## 4         14911 EIRE                     129691.          77154              195
## 5         12415 Australia                123807.          77056               19
#Customers by country
cust_df%>%group_by(`transaction country`)%>%
  summarise(count_cust=length(`customer id`))%>%
  arrange(desc(count_cust))%>%head(5)%>%
  ggplot(aes(x=reorder(`transaction country`,-count_cust),y=count_cust))+
  geom_col(fill="Light Blue")+ geom_text(aes(label=count_cust), position=position_dodge(width=0.9), vjust=-0.25)+
  ggtitle(" Top 5 Countries by Customers")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
        panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
        panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
  ylab("Number of Customers") + 
  xlab("Country")
## `summarise()` ungrouping output (override with `.groups` argument)

###Customer Purchase Behaviour

#Average transaction, median transaction
summary(cust_df$total_transaction)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   4.222   5.000 203.000
#average quantity
sum(cust_df$total_quantity)/sum(cust_df$total_transaction)
## [1] 268.9532
#median
summary(cust_df$total_quantity)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      1.0    158.0    373.5   1135.4    982.2 196556.0
#average purchase amount
sum(cust_df$customer_sales)/sum(cust_df$total_transaction)
## [1] 456.6943
#median
summary(cust_df$customer_sales)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      2.9    301.2    655.1   1928.0   1613.1 278778.0
#Most bought products
sales_customer%>%group_by(`product description`)%>%summarise(count=n())%>%ungroup()%>%arrange(desc(count))%>%head(10)%>%
  ggplot(aes(x=count,y=reorder(`product description`,count)))+
  geom_col(fill="green")+ geom_text(aes(label=count), position=position_dodge(width=0), hjust=1.1)+
  ggtitle(" Most Bought Products")+
  theme(axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
        panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
        panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
  ylab("Product Description") + 
  xlab("Count")
## `summarise()` ungrouping output (override with `.groups` argument)

###Customer Loyalty and Retention

#Customers on monthly basis
sales_customer%>%filter(date<'2011-12-01')%>%
  mutate(year_month=format(date,"%Y-%m"))%>%
  group_by(year_month)%>%
  summarise(trans_count=length(unique(`transaction id`)),distinct_cust=length(unique(`customer id`)))%>%
  arrange(year_month)%>%
  ggplot(aes(x=ordered(year_month),group=1))+
  geom_line(aes(y=distinct_cust,colour="Customer Count"),size=1)+ geom_point(aes(y=distinct_cust),colour="RED",size=2)+
  geom_line(aes(y=trans_count,colour="Transaction Count"),size=1)+ geom_point(aes(y=trans_count),colour="blue",size=2)+
  scale_colour_manual("",values=c("red","blue"))+
  ggtitle("Monthly Customers and Transactions")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),plot.title = element_text(hjust=0.5, lineheight = 0.8, face = "bold"),
        panel.background = element_rect(fill = "White", colour = "Black", size = 1, linetype = "solid"),
        panel.grid.major = element_line(size = 0.5, linetype = 'solid',colour = "light grey"))+
  ylab("Count") + 
  xlab("Year-Month")
## `summarise()` ungrouping output (override with `.groups` argument)

#Customer Retention
sales_customer%>%select(`transaction id`,`customer id`,date)%>%
  filter(date<'2011-12-01')%>%
  mutate(year_month=format(date,"%Y-%m"))%>%
  group_by(year_month)%>%
  summarise(trans_count=length(unique(`transaction id`)),distinct_cust=length(unique(`customer id`)))%>%
  arrange(year_month)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 13 x 3
##    year_month trans_count distinct_cust
##    <chr>            <int>         <int>
##  1 2010-12           1387           883
##  2 2011-01            975           737
##  3 2011-02            989           756
##  4 2011-03           1304           970
##  5 2011-04           1130           849
##  6 2011-05           1528          1051
##  7 2011-06           1377           986
##  8 2011-07           1314           945
##  9 2011-08           1264           933
## 10 2011-09           1729          1254
## 11 2011-10           1887          1356
## 12 2011-11           2631          1657
## 13 2011-12            118           111
#Cohort Analysis
cohort=sales_customer%>%select(`transaction id`,`customer id`,date)%>%
  filter(date<'2011-12-01')%>%
  mutate(year_month=format(date,"%Y-%m"))%>%
  group_by(year_month)%>%select(`customer id`,year_month)%>%distinct(`customer id`,year_month)

cohort= cohort %>%           # store in cohort table, get from dbdata
  group_by(`customer id`) %>%        # group all users together
  mutate(first = min(year_month)) %>%  # for every user, find the first period
  group_by(first, year_month) %>%      # group by this first period + the other periods
  summarise(users = n()) %>%   # for each combination, count the number of users
  spread(year_month, users)    # and make columns with period names
## `summarise()` regrouping output by 'first' (override with `.groups` argument)
shiftrow <- function(v) {
  # put a vector in, strip off leading NA values, and place that amount at the end
  first_na_index <- min( which(!is.na(v)) )
  
  # return that bit to the end,  and pad with NAs.
  c(v[first_na_index:length(v)], rep(NA, first_na_index-1))
}

# create a new dataframe, with shifted rows (and keep the first one)
shifted <- data.frame(
  cohort = cohort$first,
  t(apply( select(as.data.frame(cohort), 2:ncol(cohort)), # 2nd column to the end
           1, shiftrow ))) # for every row

# and make column names readable
# first should be "cohort" and the rest week.<number>, (padded)
colnames(shifted) <- c("cohort", sub("","Month.", str_pad(1:(ncol(shifted)-1),2,pad = "0")))

# percentages
shifted_pct <- data.frame(
  cohort = shifted$cohort, # first column
  shifted[,1:nrow(shifted)+1] / shifted[["Month.01"]]) # rest: divide by month.01

shifted_pct
##     cohort Month.01   Month.02   Month.03   Month.04   Month.05    Month.06
## 1  2010-12        1 0.36579841 0.32389581 0.38165345 0.35900340 0.398640997
## 2  2011-01        1 0.21497585 0.26811594 0.22946860 0.31884058 0.287439614
## 3  2011-02        1 0.18635171 0.18635171 0.28346457 0.27034121 0.244094488
## 4  2011-03        1 0.14634146 0.24833703 0.19512195 0.22394678 0.168514412
## 5  2011-04        1 0.21333333 0.20333333 0.21000000 0.19666667 0.230000000
## 6  2011-05        1 0.19434629 0.17314488 0.17314488 0.20848057 0.229681979
## 7  2011-06        1 0.17355372 0.15702479 0.26446281 0.23140496 0.330578512
## 8  2011-07        1 0.17647059 0.20320856 0.22459893 0.27272727 0.005347594
## 9  2011-08        1 0.20118343 0.23668639 0.24260355 0.01183432          NA
## 10 2011-09        1 0.23648649 0.30405405 0.01351351         NA          NA
## 11 2011-10        1 0.23595506 0.02247191         NA         NA          NA
## 12 2011-11        1 0.02173913         NA         NA         NA          NA
## 13 2011-12        1         NA         NA         NA         NA          NA
##      Month.07   Month.08   Month.09   Month.10   Month.11   Month.12   Month.13
## 1  0.36013590 0.34767837 0.35334088 0.39297848 0.37372593 0.50169875 0.05662514
## 2  0.24879227 0.24154589 0.29710145 0.32850242 0.36714976 0.02415459         NA
## 3  0.25459318 0.27296588 0.24409449 0.30183727 0.01312336         NA         NA
## 4  0.26607539 0.23059867 0.27937916 0.01330377         NA         NA         NA
## 5  0.21333333 0.26000000 0.01666667         NA         NA         NA         NA
## 6  0.26501767 0.02120141         NA         NA         NA         NA         NA
## 7  0.01652893         NA         NA         NA         NA         NA         NA
## 8          NA         NA         NA         NA         NA         NA         NA
## 9          NA         NA         NA         NA         NA         NA         NA
## 10         NA         NA         NA         NA         NA         NA         NA
## 11         NA         NA         NA         NA         NA         NA         NA
## 12         NA         NA         NA         NA         NA         NA         NA
## 13         NA         NA         NA         NA         NA         NA         NA

##MARKET BASKET ANALYSIS

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
#Prepare df in basket format
basket_df=sales_customer[order(sales_customer$`transaction id`),]
basket_df$`transaction id`=as.numeric(as.character(basket_df$`transaction id`))
basket_df= ddply(basket_df,c("`transaction id`","date"),function(df1)paste(df1$`product description`,collapse = ","))

basket_df$`transaction id`=NULL
basket_df$date=NULL
colnames(basket_df)=c("items")


txn = read.transactions("marketbasket.csv", format="basket",sep=",")
#Apply association rules
asso_rules=apriori(txn,parameter = list(supp=0.001,conf=0.8,maxlen=4))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##       4  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 18 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7708 item(s), 18289 transaction(s)] done [0.14s].
## sorting and recoding items ... [2431 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4
## Warning in apriori(txn, parameter = list(supp = 0.001, conf = 0.8, maxlen = 4)):
## Mining stopped (maxlen reached). Only patterns up to a length of 4 returned!
##  done [0.24s].
## writing ... [14060 rule(s)] done [0.02s].
## creating S4 object  ... done [0.02s].
summary(is.redundant(asso_rules))
##    Mode   FALSE    TRUE 
## logical   12168    1892
asso_rules=asso_rules[!is.redundant(asso_rules)]
asso_rules=sort(asso_rules,by="support", decreasing = T)
summary(asso_rules)
## set of 12168 rules
## 
## rule length distribution (lhs + rhs):sizes
##    2    3    4 
##  113 2813 9242 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    4.00    4.00    3.75    4.00    4.00 
## 
## summary of quality measures:
##     support           confidence        coverage             lift        
##  Min.   :0.001039   Min.   :0.8000   Min.   :0.001039   Min.   :  8.361  
##  1st Qu.:0.001148   1st Qu.:0.8246   1st Qu.:0.001367   1st Qu.: 20.761  
##  Median :0.001367   Median :0.8571   Median :0.001640   Median : 32.819  
##  Mean   :0.001707   Mean   :0.8668   Mean   :0.001973   Mean   : 63.321  
##  3rd Qu.:0.001859   3rd Qu.:0.9048   3rd Qu.:0.002132   3rd Qu.: 78.193  
##  Max.   :0.022363   Max.   :1.0000   Max.   :0.027394   Max.   :589.968  
##      count       
##  Min.   : 19.00  
##  1st Qu.: 21.00  
##  Median : 25.00  
##  Mean   : 31.22  
##  3rd Qu.: 34.00  
##  Max.   :409.00  
## 
## mining info:
##  data ntransactions support confidence
##   txn         18289   0.001        0.8
inspect(asso_rules[1:20])
##      lhs                                     rhs                                      support confidence    coverage     lift count
## [1]  {PINK REGENCY TEACUP AND SAUCER}     => {GREEN REGENCY TEACUP AND SAUCER}    0.022363169  0.8163673 0.027393515 24.08152   409
## [2]  {GREEN REGENCY TEACUP AND SAUCER,                                                                                             
##       PINK REGENCY TEACUP AND SAUCER}     => {ROSES REGENCY TEACUP AND SAUCER}    0.018645087  0.8337408 0.022363169 21.81443   341
## [3]  {PINK REGENCY TEACUP AND SAUCER,                                                                                              
##       ROSES REGENCY TEACUP AND SAUCER}    => {GREEN REGENCY TEACUP AND SAUCER}    0.018645087  0.8903394 0.020941550 26.26358   341
## [4]  {GREEN REGENCY TEACUP AND SAUCER,                                                                                             
##       REGENCY CAKESTAND 3 TIER}           => {ROSES REGENCY TEACUP AND SAUCER}    0.014708295  0.8176292 0.017988955 21.39288   269
## [5]  {PINK REGENCY TEACUP AND SAUCER,                                                                                              
##       REGENCY CAKESTAND 3 TIER}           => {GREEN REGENCY TEACUP AND SAUCER}    0.012849254  0.8703704 0.014762972 25.67452   235
## [6]  {PINK REGENCY TEACUP AND SAUCER,                                                                                              
##       REGENCY CAKESTAND 3 TIER}           => {ROSES REGENCY TEACUP AND SAUCER}    0.012411832  0.8407407 0.014762972 21.99758   227
## [7]  {SET 3 RETROSPOT TEA}                => {SUGAR}                              0.011919733  1.0000000 0.011919733 83.89450   218
## [8]  {SUGAR}                              => {SET 3 RETROSPOT TEA}                0.011919733  1.0000000 0.011919733 83.89450   218
## [9]  {SET 3 RETROSPOT TEA}                => {COFFEE}                             0.011919733  1.0000000 0.011919733 63.50347   218
## [10] {SUGAR}                              => {COFFEE}                             0.011919733  1.0000000 0.011919733 63.50347   218
## [11] {SET/6 RED SPOTTY PAPER CUPS}        => {SET/6 RED SPOTTY PAPER PLATES}      0.011701022  0.8230769 0.014216196 50.85559   214
## [12] {SHED}                               => {KEY FOB}                            0.011372956  1.0000000 0.011372956 60.16118   208
## [13] {REGENCY TEA PLATE GREEN}            => {REGENCY TEA PLATE ROSES}            0.011099568  0.8388430 0.013231997 52.72027   203
## [14] {GREEN REGENCY TEACUP AND SAUCER,                                                                                             
##       PINK REGENCY TEACUP AND SAUCER,                                                                                              
##       REGENCY CAKESTAND 3 TIER}           => {ROSES REGENCY TEACUP AND SAUCER}    0.011099568  0.8638298 0.012849254 22.60169   203
## [15] {PINK REGENCY TEACUP AND SAUCER,                                                                                              
##       REGENCY CAKESTAND 3 TIER,                                                                                                    
##       ROSES REGENCY TEACUP AND SAUCER}    => {GREEN REGENCY TEACUP AND SAUCER}    0.011099568  0.8942731 0.012411832 26.37961   203
## [16] {BACK DOOR}                          => {KEY FOB}                            0.010552791  1.0000000 0.010552791 60.16118   193
## [17] {WOODEN TREE CHRISTMAS SCANDINAVIAN} => {WOODEN STAR CHRISTMAS SCANDINAVIAN} 0.009896659  0.8153153 0.012138444 43.34681   181
## [18] {REGENCY TEA PLATE PINK}             => {REGENCY TEA PLATE GREEN}            0.009787304  0.8905473 0.010990213 67.30256   179
## [19] {REGENCY TEA PLATE PINK}             => {REGENCY TEA PLATE ROSES}            0.009568593  0.8706468 0.010990213 54.71910   175
## [20] {REGENCY TEA PLATE GREEN,                                                                                                     
##       REGENCY TEA PLATE PINK}             => {REGENCY TEA PLATE ROSES}            0.008912461  0.9106145 0.009787304 57.23103   163
#To find rule for specific product
specific_asso_rules=apriori(txn,parameter = list(supp=0.001,conf=0.8),appearance = list(default="lhs",rhs="SUGAR"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 18 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[7708 item(s), 18289 transaction(s)] done [0.14s].
## sorting and recoding items ... [2431 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(txn, parameter = list(supp = 0.001, conf = 0.8), appearance
## = list(default = "lhs", : Mining stopped (maxlen reached). Only patterns up to a
## length of 10 returned!
##  done [0.36s].
## writing ... [116 rule(s)] done [0.04s].
## creating S4 object  ... done [0.01s].
inspect(specific_asso_rules[1:10])
##      lhs                               rhs         support confidence    coverage     lift count
## [1]  {SET 3 RETROSPOT TEA}          => {SUGAR} 0.011919733  1.0000000 0.011919733 83.89450   218
## [2]  {RED RETROSPOT MUG,                                                                        
##       SET 3 RETROSPOT TEA}          => {SUGAR} 0.001093554  1.0000000 0.001093554 83.89450    20
## [3]  {COFFEE,                                                                                   
##       RED RETROSPOT MUG}            => {SUGAR} 0.001093554  0.9523810 0.001148231 79.89952    20
## [4]  {RED RETROSPOT SUGAR JAM BOWL,                                                             
##       SET 3 RETROSPOT TEA}          => {SUGAR} 0.001804363  1.0000000 0.001804363 83.89450    33
## [5]  {COFFEE,                                                                                   
##       RED RETROSPOT SUGAR JAM BOWL} => {SUGAR} 0.001804363  0.9705882 0.001859041 81.42701    33
## [6]  {RED RETROSPOT BUTTER DISH,                                                                
##       SET 3 RETROSPOT TEA}          => {SUGAR} 0.002023074  1.0000000 0.002023074 83.89450    37
## [7]  {COFFEE,                                                                                   
##       RED RETROSPOT BUTTER DISH}    => {SUGAR} 0.002023074  0.9250000 0.002187107 77.60241    37
## [8]  {BREAD BIN DINER STYLE RED,                                                                
##       SET 3 RETROSPOT TEA}          => {SUGAR} 0.001038876  1.0000000 0.001038876 83.89450    19
## [9]  {BREAD BIN DINER STYLE RED,                                                                
##       COFFEE}                       => {SUGAR} 0.001038876  0.9047619 0.001148231 75.90454    19
## [10] {RED SPOTTY BISCUIT TIN,                                                                   
##       SET 3 RETROSPOT TEA}          => {SUGAR} 0.003499371  1.0000000 0.003499371 83.89450    64
summary(specific_asso_rules)
## set of 116 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4  5 
##  1 71 42  2 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   3.000   3.388   4.000   5.000 
## 
## summary of quality measures:
##     support           confidence        coverage             lift      
##  Min.   :0.001039   Min.   :0.8000   Min.   :0.001039   Min.   :67.12  
##  1st Qu.:0.001094   1st Qu.:0.9931   1st Qu.:0.001203   1st Qu.:83.31  
##  Median :0.001422   Median :1.0000   Median :0.001504   Median :83.89  
##  Mean   :0.001733   Mean   :0.9773   Mean   :0.001769   Mean   :81.99  
##  3rd Qu.:0.001763   3rd Qu.:1.0000   3rd Qu.:0.001804   3rd Qu.:83.89  
##  Max.   :0.011920   Max.   :1.0000   Max.   :0.011920   Max.   :83.89  
##      count       
##  Min.   : 19.00  
##  1st Qu.: 20.00  
##  Median : 26.00  
##  Mean   : 31.69  
##  3rd Qu.: 32.25  
##  Max.   :218.00  
## 
## mining info:
##  data ntransactions support confidence
##   txn         18289   0.001        0.8
#Filter rules
filtered_rules <- subset(asso_rules, subset = lift > 20 & support>0.005)
inspect(filtered_rules[1:10])
##      lhs                                  rhs                                  support confidence   coverage     lift count
## [1]  {PINK REGENCY TEACUP AND SAUCER}  => {GREEN REGENCY TEACUP AND SAUCER} 0.02236317  0.8163673 0.02739352 24.08152   409
## [2]  {GREEN REGENCY TEACUP AND SAUCER,                                                                                     
##       PINK REGENCY TEACUP AND SAUCER}  => {ROSES REGENCY TEACUP AND SAUCER} 0.01864509  0.8337408 0.02236317 21.81443   341
## [3]  {PINK REGENCY TEACUP AND SAUCER,                                                                                      
##       ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.01864509  0.8903394 0.02094155 26.26358   341
## [4]  {GREEN REGENCY TEACUP AND SAUCER,                                                                                     
##       REGENCY CAKESTAND 3 TIER}        => {ROSES REGENCY TEACUP AND SAUCER} 0.01470829  0.8176292 0.01798896 21.39288   269
## [5]  {PINK REGENCY TEACUP AND SAUCER,                                                                                      
##       REGENCY CAKESTAND 3 TIER}        => {GREEN REGENCY TEACUP AND SAUCER} 0.01284925  0.8703704 0.01476297 25.67452   235
## [6]  {PINK REGENCY TEACUP AND SAUCER,                                                                                      
##       REGENCY CAKESTAND 3 TIER}        => {ROSES REGENCY TEACUP AND SAUCER} 0.01241183  0.8407407 0.01476297 21.99758   227
## [7]  {SET 3 RETROSPOT TEA}             => {SUGAR}                           0.01191973  1.0000000 0.01191973 83.89450   218
## [8]  {SUGAR}                           => {SET 3 RETROSPOT TEA}             0.01191973  1.0000000 0.01191973 83.89450   218
## [9]  {SET 3 RETROSPOT TEA}             => {COFFEE}                          0.01191973  1.0000000 0.01191973 63.50347   218
## [10] {SUGAR}                           => {COFFEE}                          0.01191973  1.0000000 0.01191973 63.50347   218
#Visualize market basket
plot(asso_rules)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

plot(filtered_rules,method = "graph",engine = "htmlwidget")
## Warning: Too many rules supplied. Only plotting the best 100 rules using lift
## (change control parameter max if needed)

plot(filtered_rules)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

plot(filtered_rules,method = "graph")
## Warning: plot: Too many rules supplied. Only plotting the best 100 rules using
## 'support' (change control parameter max if needed)

K-MEANS

library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
kmean_df=sales_customer%>%
  group_by(`customer id`)%>%
  summarise(unique_items=n_distinct(`product id`),transaction_count=n_distinct(`transaction id`),quantity_total=sum(`quantity sold`),total_sales=sum(sale_value))
## `summarise()` ungrouping output (override with `.groups` argument)
kmean_df$`customer id`=NULL

#Function to normalize the data
normalize=function(x){(x-min(x))/(max(x)-min(x))}

kmean_df=as.data.frame(lapply(kmean_df, normalize))
summary(kmean_df)
##   unique_items      transaction_count  quantity_total       total_sales      
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000  
##  1st Qu.:0.008547   1st Qu.:0.000000   1st Qu.:0.0007988   1st Qu.:0.001070  
##  Median :0.019373   Median :0.004951   Median :0.0019002   Median :0.002344  
##  Mean   :0.034330   Mean   :0.015987   Mean   :0.0057822   Mean   :0.006918  
##  3rd Qu.:0.043305   3rd Qu.:0.019802   3rd Qu.:0.0049961   3rd Qu.:0.005786  
##  Max.   :1.000000   Max.   :1.000000   Max.   :1.0000000   Max.   :1.000000
#Dimesionality Check (Multicollinearity Check)
cor(kmean_df)
##                   unique_items transaction_count quantity_total total_sales
## unique_items         1.0000000         0.6908534      0.4367447   0.4021930
## transaction_count    0.6908534         1.0000000      0.5856440   0.5739230
## quantity_total       0.4367447         0.5856440      1.0000000   0.9194646
## total_sales          0.4021930         0.5739230      0.9194646   1.0000000
#Quantity total and total sales have high collinearity, hence dropping one
kmean_df$total_sales=NULL

set.seed(123)

# function to compute total within-cluster sum of square 
wss=function(k){
  kmeans(kmean_df,k,nstart=25)$tot.withinss
  }

# Compute and plot wss for k = 1 to k = 15
k.values=1:10

# extract wss for 2-15 clusters
wss_values=sapply(k.values,wss)
warnings()

#Plot to visualize and select k value based on elbow method
plot(k.values,wss_values, xlab="Number of clusters k", ylab="Total within-clusters sum of squares")

#K=5 seems optimum from above graph

#Alternate way to visualize the optimum k value
set.seed(123)

fviz_nbclust(kmean_df, kmeans, method = "wss")

#Generating K Means with K=5
set.seed(123)
kmeans_final=kmeans(kmean_df,5,nstart = 25)

#Plots
fviz_cluster(kmeans_final, data = kmean_df)