library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read cookie_cats.csv into `df` and show its first six rows.
df <- read.csv(
  file = "D:\\Drive E\\MSBA UCin\\Course\\Spring Sem\\BANA 7031 - Probability\\Project\\cookie_cats.csv"
)
head(df, n = 6L)
## userid version sum_gamerounds retention_1 retention_7
## 1 116 gate_30 3 FALSE FALSE
## 2 337 gate_30 38 TRUE FALSE
## 3 377 gate_40 165 TRUE FALSE
## 4 483 gate_40 1 FALSE FALSE
## 5 488 gate_40 179 TRUE TRUE
## 6 540 gate_40 187 TRUE TRUE
# Per-column summary of the loaded data (ranges for the numeric columns,
# TRUE/FALSE counts for the two logical retention flags).
summary(df)
## userid version sum_gamerounds retention_1
## Min. : 116 Length:90189 Min. : 0.00 Mode :logical
## 1st Qu.:2512230 Class :character 1st Qu.: 5.00 FALSE:50036
## Median :4995815 Mode :character Median : 16.00 TRUE :40153
## Mean :4998412 Mean : 51.87
## 3rd Qu.:7496452 3rd Qu.: 51.00
## Max. :9999861 Max. :49854.00
## retention_7
## Mode :logical
## FALSE:73408
## TRUE :16781
##
##
##
# Histogram of rounds played, restricted to users with between 1 and 2999
# rounds (zero-round users and counts of 3000 or more are left out).
hist(df$sum_gamerounds[df$sum_gamerounds<3000&df$sum_gamerounds>0])
# Label each round count with its 10-round bucket: "0-10" for counts up to 10,
# then "11-20", "21-30", and so on.
# Integer arithmetic is deliberate: a floor(x / 10 - 0.1) formulation is
# subject to floating-point rounding (4.1 - 0.1 evaluates just below 4), which
# would label 41 rounds as "31-50" instead of "41-50".
level_bin <- function(rounds) {
  lower <- ((rounds - 1L) %/% 10L) * 10L + 1L
  ifelse(rounds <= 10, "0-10", paste0(lower, "-", lower + 9L))
}

# Count users per bucket and keep the `n_top` most populated buckets,
# ordered from most to least populated.
top_level_bins <- function(data, n_top = 10) {
  data %>%
    mutate(bin_grp = level_bin(sum_gamerounds)) %>%
    group_by(bin_grp) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) %>%
    head(n_top)
}

# Bar chart of bucket counts, with the count printed above each bar.
plot_level_bins <- function(bins, fill, y_label) {
  ggplot(bins, aes(bin_grp, n)) +
    geom_bar(stat = "identity", fill = fill) +
    theme_light() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
    geom_text(aes(label = n), vjust = -0.2) +
    xlab("Level Range") +
    ylab(y_label)
}

# All users.
test <- top_level_bins(df)
plot_level_bins(test, fill = "sky blue", y_label = "Count")

# gate_30 users only.
df %>%
  filter(version == "gate_30") %>%
  top_level_bins() %>%
  plot_level_bins(fill = "dark green", y_label = "Count of Gate 30 Version Users")

# gate_40 users only.
df %>%
  filter(version == "gate_40") %>%
  top_level_bins() %>%
  plot_level_bins(fill = "dark red", y_label = "Count of Gate 40 Version Users")
One Day Retention
# Overall 1-day retention rate: share of all rows with retention_1 == TRUE.
one_day <- sum(df$retention_1) / nrow(df)
one_day
## [1] 0.4452095
# 1-day retention per version: retained share, retained count and group size.
# n() is called directly; no intermediate variable is assigned inside
# summarise(), so nothing shadows dplyr::n.
df %>%
  group_by(version) %>%
  summarise(
    one_day_ratio = sum(retention_1) / n(),
    one_day_count = sum(retention_1),
    Total = n()
  )
## # A tibble: 2 x 4
## version one_day_ratio one_day_count Total
## <chr> <dbl> <int> <int>
## 1 gate_30 0.448 20034 44700
## 2 gate_40 0.442 20119 45489
Seven Day Retention
# Overall 7-day retention rate: share of all rows with retention_7 == TRUE.
seven_day <- sum(df$retention_7) / nrow(df)
seven_day
## [1] 0.1860648
# 7-day retention per version: retained share, retained count and group size.
# n() is called directly; no intermediate variable is assigned inside
# summarise(), so nothing shadows dplyr::n.
df %>%
  group_by(version) %>%
  summarise(
    seven_day_ratio = sum(retention_7) / n(),
    seven_day_count = sum(retention_7),
    count = n()
  )
## # A tibble: 2 x 4
## version seven_day_ratio seven_day_count count
## <chr> <dbl> <int> <int>
## 1 gate_30 0.190 8502 44700
## 2 gate_40 0.182 8279 45489
# `version` is a character column, so summary() reports only its length, class
# and mode rather than a count per version.
summary(df$version)
## Length Class Mode
## 90189 character character
# Bootstrap the 1-day retention rate separately for each version: resample each
# group with replacement at its own size and record the resampled mean.
# Results: mu.hat.set.30.1 / mu.hat.set.40.1, one bootstrap mean per replicate.
set.seed(123) # fixed seed so the bootstrap summaries below are reproducible
n_boot_1d <- 2000

# The per-version outcome vectors do not change between replicates, so they are
# extracted once instead of on every iteration.
ret_1d_gate_30 <- df$retention_1[df$version == "gate_30"]
ret_1d_gate_40 <- df$retention_1[df$version == "gate_40"]

# Preallocate the result vectors rather than growing them from NULL.
mu.hat.set.30.1 <- numeric(n_boot_1d)
mu.hat.set.40.1 <- numeric(n_boot_1d)

for (k in seq_len(n_boot_1d)) {
  bootstrap.30 <- sample(ret_1d_gate_30, size = length(ret_1d_gate_30), replace = TRUE)
  bootstrap.40 <- sample(ret_1d_gate_40, size = length(ret_1d_gate_40), replace = TRUE)
  mu.hat.30 <- mean(bootstrap.30)
  mu.hat.40 <- mean(bootstrap.40)
  mu.hat.set.30.1[k] <- mu.hat.30
  mu.hat.set.40.1[k] <- mu.hat.40
}
# Bootstrap standard error of each version's 1-day retention rate: the standard
# deviation of the resampled means.
sd(mu.hat.set.30.1)
## [1] 0.002304545
sd(mu.hat.set.40.1)
## [1] 0.002282777
# 95% percentile bootstrap interval per version: the 2.5% and 97.5% quantiles
# of the resampled means.
quantile(mu.hat.set.30.1,probs = c(0.025,0.975))
## 2.5% 97.5%
## 0.4435112 0.4526622
quantile(mu.hat.set.40.1,probs = c(0.025,0.975))
## 2.5% 97.5%
## 0.4377536 0.4469218
# Overlay the densities of the bootstrapped 1-day retention means:
# gate_30 in red, gate_40 in blue.
ggplot() +
  geom_density(aes(x = mu.hat.set.30.1), colour = "red", size = 1.5) +
  geom_density(aes(x = mu.hat.set.40.1), colour = "blue", size = 1.5) +
  theme_light() +
  xlab("Mean value distribution")
# Lift
# Relative 1-day retention lift of gate_30 over gate_40, in percent.
# gate_40 is the baseline (denominator), the same convention the lift.7d
# calculation uses, so the two lifts are directly comparable.
rate_1d_gate_30 <- mean(df$retention_1[df$version == "gate_30"])
rate_1d_gate_40 <- mean(df$retention_1[df$version == "gate_40"])
lift.1d <- (rate_1d_gate_30 - rate_1d_gate_40) * 100 / rate_1d_gate_40
#ggplot()+geom_density(aes(x=lift.1d))+xlab("Mean value distribution")
print(lift.1d)
## [1] 1.317566
# Two-sample test for equal 1-day retention proportions across versions.
# xtabs() builds the version x retention_1 count table; [, 2:1] swaps its
# columns so the TRUE counts come first, which prop.test() reads as successes.
prop.test(xtabs(~version+retention_1,data=df)[,2:1])
##
## 2-sample test for equality of proportions with continuity correction
##
## data: xtabs(~version + retention_1, data = df)[, 2:1]
## X-squared = 3.1591, df = 1, p-value = 0.0755
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.0006042772 0.0124146168
## sample estimates:
## prop 1 prop 2
## 0.4481879 0.4422827
# Bootstrap the 7-day retention rate separately for each version: resample each
# group with replacement at its own size and record the resampled mean.
# Results: mu.hat.set.30.7 / mu.hat.set.40.7, one bootstrap mean per replicate.
n_boot_7d <- 2000

# The per-version outcome vectors do not change between replicates, so they are
# extracted once instead of on every iteration.
ret_7d_gate_30 <- df$retention_7[df$version == "gate_30"]
ret_7d_gate_40 <- df$retention_7[df$version == "gate_40"]

# Preallocate the result vectors rather than growing them from NULL.
mu.hat.set.30.7 <- numeric(n_boot_7d)
mu.hat.set.40.7 <- numeric(n_boot_7d)

# Seed set immediately before the loop; the order of the sample() calls is
# gate_30 then gate_40 within each replicate.
set.seed(123)
for (k in seq_len(n_boot_7d)) {
  bootstrap.30 <- sample(ret_7d_gate_30, size = length(ret_7d_gate_30), replace = TRUE)
  bootstrap.40 <- sample(ret_7d_gate_40, size = length(ret_7d_gate_40), replace = TRUE)
  mu.hat.30 <- mean(bootstrap.30)
  mu.hat.40 <- mean(bootstrap.40)
  mu.hat.set.30.7[k] <- mu.hat.30
  mu.hat.set.40.7[k] <- mu.hat.40
}
# Bootstrap standard error of each version's 7-day retention rate: the standard
# deviation of the resampled means.
sd(mu.hat.set.30.7)
## [1] 0.001878546
sd(mu.hat.set.40.7)
## [1] 0.001818584
# 95% percentile bootstrap interval per version: the 2.5% and 97.5% quantiles
# of the resampled means.
quantile(mu.hat.set.30.7,probs = c(0.025,0.975))
## 2.5% 97.5%
## 0.1864871 0.1939161
quantile(mu.hat.set.40.7,probs = c(0.025,0.975))
## 2.5% 97.5%
## 0.1785706 0.1857812
# Overlay the densities of the bootstrapped 7-day retention means:
# gate_30 in red, gate_40 in blue.
ggplot() +
  geom_density(aes(x = mu.hat.set.30.7), colour = "red", size = 1.5) +
  geom_density(aes(x = mu.hat.set.40.7), colour = "blue", size = 1.5) +
  theme_light() +
  xlab("Mean value distribution")
# Lift
# Relative 7-day retention lift of gate_30 over gate_40, in percent, computed
# from the averages of the bootstrapped means with gate_40 as the baseline.
boot_mean_7d_gate_30 <- mean(mu.hat.set.30.7)
boot_mean_7d_gate_40 <- mean(mu.hat.set.40.7)
lift.7d <- (boot_mean_7d_gate_30 - boot_mean_7d_gate_40) * 100 / boot_mean_7d_gate_40
#ggplot()+geom_density(aes(x=lift.7d))+xlab("Mean value distribution")
print(lift.7d)
## [1] 4.496012
# Two-sample test for equal 7-day retention proportions across versions.
# xtabs() builds the version x retention_7 count table; [, 2:1] swaps its
# columns so the TRUE counts come first, which prop.test() reads as successes.
prop.test(xtabs(~version+retention_7,data=df)[,2:1])
##
## 2-sample test for equality of proportions with continuity correction
##
## data: xtabs(~version + retention_7, data = df)[, 2:1]
## X-squared = 9.9591, df = 1, p-value = 0.001601
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.003098867 0.013303730
## sample estimates:
## prop 1 prop 2
## 0.1902013 0.1820000