library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read the Cookie Cats CSV into `df` and take a first look at it.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# the script only runs where this exact file exists.
data.path <- "D:\\Drive E\\MSBA UCin\\Course\\Spring Sem\\BANA 7031 - Probability\\Project\\cookie_cats.csv"
df <- read.csv(file = data.path)

# First rows: userid, version, sum_gamerounds, retention_1, retention_7.
head(df)
##   userid version sum_gamerounds retention_1 retention_7
## 1    116 gate_30              3       FALSE       FALSE
## 2    337 gate_30             38        TRUE       FALSE
## 3    377 gate_40            165        TRUE       FALSE
## 4    483 gate_40              1       FALSE       FALSE
## 5    488 gate_40            179        TRUE        TRUE
## 6    540 gate_40            187        TRUE        TRUE
# Per-column summary (quartiles for numeric columns, TRUE/FALSE counts for
# the logical retention flags).
summary(object = df)
##      userid          version          sum_gamerounds     retention_1    
##  Min.   :    116   Length:90189       Min.   :    0.00   Mode :logical  
##  1st Qu.:2512230   Class :character   1st Qu.:    5.00   FALSE:50036    
##  Median :4995815   Mode  :character   Median :   16.00   TRUE :40153    
##  Mean   :4998412                      Mean   :   51.87                  
##  3rd Qu.:7496452                      3rd Qu.:   51.00                  
##  Max.   :9999861                      Max.   :49854.00                  
##  retention_7    
##  Mode :logical  
##  FALSE:73408    
##  TRUE :16781    
##                 
##                 
## 
# Histogram of rounds played, keeping only users with more than 0 and fewer
# than 3000 rounds (the maximum in the summary above is 49854; presumably the
# cut-off is there to keep that outlier from stretching the x axis).
# The subset is passed inline on purpose: hist() derives its default title and
# x-axis label from the text of this argument expression.
hist(df$sum_gamerounds[df$sum_gamerounds<3000&df$sum_gamerounds>0])

# Label round counts with their 10-wide bin: "0-10", "11-20", "21-30", ...
# The lower edge is taken from the upper edge (upper - 9) instead of
# floor(rounds / 10 - 0.1): that subtraction is inexact in floating point
# (41 / 10 - 0.1 evaluates to 3.9999999999999996), which labelled 41 rounds as
# "31-50" and split it off from the rest of the 41-50 bin.
bin_rounds <- function(rounds) {
  upper <- ceiling(rounds / 10) * 10
  ifelse(rounds <= 10, "0-10", paste0(upper - 9, "-", upper))
}

# Count users per bin of sum_gamerounds and keep the `n_bins` most populated
# bins, largest first. `data` must contain a sum_gamerounds column.
top_bins <- function(data, n_bins = 10) {
  data %>%
    mutate(bin_grp = bin_rounds(sum_gamerounds)) %>%
    group_by(bin_grp) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) %>%
    head(n_bins)
}

# Bar chart of bin counts with the count printed above each bar.
# `bins` is the output of top_bins(); `fill` is the bar colour and `y_label`
# the y-axis title.
plot_bins <- function(bins, fill, y_label) {
  ggplot(bins, aes(bin_grp, n)) +
    geom_bar(stat = "identity", fill = fill) +
    theme_light() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
    geom_text(aes(label = n), vjust = -0.2) +
    xlab("Level Range") +
    ylab(y_label)
}

# All users.
test <- top_bins(df)

plot_bins(test, fill = "sky blue", y_label = "Count")

# gate_30 users only.
df %>%
  filter(version == "gate_30") %>%
  top_bins() %>%
  plot_bins(fill = "dark green", y_label = "Count of Gate 30 Version Users")

# gate_40 users only.
df %>%
  filter(version == "gate_40") %>%
  top_bins() %>%
  plot_bins(fill = "dark red", y_label = "Count of Gate 40 Version Users")

One Day Retention

# Overall one-day retention: share of all users with retention_1 == TRUE.
one_day <- sum(df$retention_1) / nrow(df)
print(one_day)
## [1] 0.4452095
# One-day retention per version: ratio, number of retained users and group
# size. The original divided by `(n=n())`, an accidental assignment inside
# summarise(); plain n() yields the same numbers without the side effect.
df %>%
  group_by(version) %>%
  summarise(
    one_day_ratio = sum(retention_1) / n(),
    one_day_count = sum(retention_1),
    Total = n()
  )
## # A tibble: 2 x 4
##   version one_day_ratio one_day_count Total
##   <chr>           <dbl>         <int> <int>
## 1 gate_30         0.448         20034 44700
## 2 gate_40         0.442         20119 45489

Seven Day Retention

# Overall seven-day retention: share of all users with retention_7 == TRUE.
seven_day <- sum(df$retention_7) / nrow(df)
print(seven_day)
## [1] 0.1860648
# Seven-day retention per version: ratio, number of retained users and group
# size. The original divided by `(n=n())`, an accidental assignment inside
# summarise(); plain n() yields the same numbers without the side effect.
df %>%
  group_by(version) %>%
  summarise(
    seven_day_ratio = sum(retention_7) / n(),
    seven_day_count = sum(retention_7),
    count = n()
  )
## # A tibble: 2 x 4
##   version seven_day_ratio seven_day_count count
##   <chr>             <dbl>           <int> <int>
## 1 gate_30           0.190            8502 44700
## 2 gate_40           0.182            8279 45489
# Summarise the version column. It is a character vector, so summary() only
# reports length, class and mode (see the output below), not per-version
# counts. NOTE(review): table(df$version) may have been intended.
summary(df[["version"]])
##    Length     Class      Mode 
##     90189 character character
# Bootstrap the one-day retention rate separately for gate_30 and gate_40:
# 2000 times, resample each version's retention_1 values with replacement
# (same size as the original group) and store the resampled mean.
#
# Seeded like the seven-day bootstrap further down so reruns are reproducible.
# NOTE: the console output printed below this block came from an unseeded run,
# so regenerated figures will differ from it in the last digits.
set.seed(123)
n.boot <- 2000

# Loop-invariant: each version's retention flags, extracted once.
ret1.30 <- df$retention_1[df$version == "gate_30"]
ret1.40 <- df$retention_1[df$version == "gate_40"]

# Preallocate instead of growing the result vectors from NULL.
mu.hat.set.30.1 <- numeric(n.boot)
mu.hat.set.40.1 <- numeric(n.boot)

for (k in seq_len(n.boot)) {
  bootstrap.30 <- sample(ret1.30, size = length(ret1.30), replace = TRUE)
  bootstrap.40 <- sample(ret1.40, size = length(ret1.40), replace = TRUE)
  mu.hat.set.30.1[k] <- mean(bootstrap.30)
  mu.hat.set.40.1[k] <- mean(bootstrap.40)
}

# Spread of the bootstrapped one-day retention means (bootstrap standard
# error) for gate_30, then gate_40.
sd(x = mu.hat.set.30.1)
## [1] 0.002304545
sd(x = mu.hat.set.40.1)
## [1] 0.002282777
# 95% percentile intervals of the bootstrapped means, gate_30 then gate_40.
ci.probs <- c(0.025, 0.975)
quantile(mu.hat.set.30.1, probs = ci.probs)
##      2.5%     97.5% 
## 0.4435112 0.4526622
quantile(mu.hat.set.40.1, probs = ci.probs)
##      2.5%     97.5% 
## 0.4377536 0.4469218
# Overlaid densities of the bootstrapped means: gate_30 in red, gate_40 in
# blue.
ggplot() +
  geom_density(aes(x = mu.hat.set.30.1), color = "red", size = 1.5) +
  geom_density(aes(x = mu.hat.set.40.1), color = "blue", size = 1.5) +
  xlab("Mean value distribution") +
  theme_light()

# Lift (one-day retention): percentage by which the gate_30 rate exceeds the
# gate_40 rate, expressed relative to the gate_30 rate and computed from the
# raw sample means.
# NOTE(review): the seven-day lift later in this file divides by the gate_40
# rate and uses bootstrap means instead; confirm which baseline is intended
# before comparing the two figures.
rate.30.1d <- mean(df$retention_1[df$version == "gate_30"])
rate.40.1d <- mean(df$retention_1[df$version == "gate_40"])
lift.1d <- (rate.30.1d - rate.40.1d) * 100 / rate.30.1d
# Disabled plot kept from the original; lift.1d is a single number, so a
# density of it has nothing to show.
#ggplot()+geom_density(aes(x=lift.1d))+xlab("Mean value distribution")
print(lift.1d)
## [1] 1.317566
# Two-sample test that one-day retention is the same in both versions.
# xtabs() cross-tabulates version by retention_1; `[, 2:1]` swaps the columns
# so TRUE comes first, because prop.test() reads a two-column table as
# (successes, failures). "prop 1" in the output is therefore the gate_30
# retention rate and "prop 2" the gate_40 rate.
# The argument is kept as one inline expression: prop.test() echoes its text
# on the "data:" line of the printed result.
prop.test(xtabs(~version+retention_1,data=df)[,2:1])
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  xtabs(~version + retention_1, data = df)[, 2:1]
## X-squared = 3.1591, df = 1, p-value = 0.0755
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.0006042772  0.0124146168
## sample estimates:
##    prop 1    prop 2 
## 0.4481879 0.4422827
# Bootstrap the seven-day retention rate separately for gate_30 and gate_40:
# 2000 times, resample each version's retention_7 values with replacement
# (same size as the original group) and store the resampled mean.
set.seed(123)
n.boot <- 2000

# Loop-invariant: each version's retention flags, extracted once instead of
# being re-subset from df on every iteration.
ret7.30 <- df$retention_7[df$version == "gate_30"]
ret7.40 <- df$retention_7[df$version == "gate_40"]

# Preallocate instead of growing the result vectors from NULL.
mu.hat.set.30.7 <- numeric(n.boot)
mu.hat.set.40.7 <- numeric(n.boot)

# gate_30 is sampled before gate_40 within each iteration, exactly as before,
# so the random-number stream (and every stored mean) is unchanged.
for (k in seq_len(n.boot)) {
  bootstrap.30 <- sample(ret7.30, size = length(ret7.30), replace = TRUE)
  bootstrap.40 <- sample(ret7.40, size = length(ret7.40), replace = TRUE)
  mu.hat.set.30.7[k] <- mean(bootstrap.30)
  mu.hat.set.40.7[k] <- mean(bootstrap.40)
}

# Spread of the bootstrapped seven-day retention means (bootstrap standard
# error) for gate_30, then gate_40.
sd(x = mu.hat.set.30.7)
## [1] 0.001878546
sd(x = mu.hat.set.40.7)
## [1] 0.001818584
# 95% percentile intervals of the bootstrapped means, gate_30 then gate_40.
ci.probs <- c(0.025, 0.975)
quantile(mu.hat.set.30.7, probs = ci.probs)
##      2.5%     97.5% 
## 0.1864871 0.1939161
quantile(mu.hat.set.40.7, probs = ci.probs)
##      2.5%     97.5% 
## 0.1785706 0.1857812
# Overlaid densities of the bootstrapped means: gate_30 in red, gate_40 in
# blue.
ggplot() +
  geom_density(aes(x = mu.hat.set.30.7), color = "red", size = 1.5) +
  geom_density(aes(x = mu.hat.set.40.7), color = "blue", size = 1.5) +
  xlab("Mean value distribution") +
  theme_light()

# Lift (seven-day retention): percentage by which the gate_30 rate exceeds the
# gate_40 rate, expressed relative to the gate_40 rate and computed from the
# averages of the bootstrapped means.
# NOTE(review): the one-day lift earlier in this file divides by the gate_30
# rate and uses the raw sample means instead; confirm which baseline is
# intended before comparing the two figures.
boot.mean.30.7d <- mean(mu.hat.set.30.7)
boot.mean.40.7d <- mean(mu.hat.set.40.7)
lift.7d <- (boot.mean.30.7d - boot.mean.40.7d) * 100 / boot.mean.40.7d
# Disabled plot kept from the original; lift.7d is a single number, so a
# density of it has nothing to show.
#ggplot()+geom_density(aes(x=lift.7d))+xlab("Mean value distribution")
print(lift.7d)
## [1] 4.496012
# Two-sample test that seven-day retention is the same in both versions.
# xtabs() cross-tabulates version by retention_7; `[, 2:1]` swaps the columns
# so TRUE comes first, because prop.test() reads a two-column table as
# (successes, failures). "prop 1" in the output is therefore the gate_30
# retention rate and "prop 2" the gate_40 rate.
# The argument is kept as one inline expression: prop.test() echoes its text
# on the "data:" line of the printed result.
prop.test(xtabs(~version+retention_7,data=df)[,2:1])
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  xtabs(~version + retention_7, data = df)[, 2:1]
## X-squared = 9.9591, df = 1, p-value = 0.001601
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.003098867 0.013303730
## sample estimates:
##    prop 1    prop 2 
## 0.1902013 0.1820000