# For access to more scripts, see the repo here: https://github.com/blakeshurtz/multilevel_marketing
## Loading required package: StanHeaders
## Loading required package: ggplot2
## rstan (Version 2.19.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## For improved execution time, we recommend calling
## Sys.setenv(LOCAL_CPPFLAGS = '-march=native')
## although this causes Stan to throw an error on a few processors.
## Loading required package: Rcpp
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## rstanarm (Version 2.19.2, packaged: 2019-10-01 20:20:33 UTC)
## - Do not expect the default priors to remain the same in future rstanarm versions.
## Thus, R scripts should specify priors explicitly, even if they are just the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores())
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
##
## Attaching package: 'rstanarm'
## The following object is masked from 'package:rstan':
##
## loo
## This is loo version 2.2.0
## - Online documentation and vignettes at mc-stan.org/loo
## - As of v2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session.
## - Windows 10 users: loo may be very slow if 'mc.cores' is set in your .Rprofile file (see https://github.com/stan-dev/loo/issues/94).
##
## Attaching package: 'loo'
## The following object is masked from 'package:rstan':
##
## loo
## -- Attaching packages --- tidyverse 1.3.0 --
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## v purrr 0.3.3
## -- Conflicts ------ tidyverse_conflicts() --
## x tidyr::extract() masks rstan::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## This is bayesplot version 1.7.1
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
## Loading required package: shiny
##
## This is shinystan version 2.5.0
### Setup ----
# Load modeling and plotting packages. The startup messages captured in the
# comments above come from these packages, but the explicit library() calls
# were missing from the script, so functions used below (rstan_options(),
# stan_glmer(), waic(), bind_cols(), mcmc_intervals(), ...) would not be found.
library(rstan)      # rstan_options(), extract()
library(rstanarm)   # stan_glmer(), stan_glm(), posterior_predict()
library(loo)        # waic(), loo_compare()
library(tidyverse)  # bind_cols(), dplyr verbs, ggplot2
library(bayesplot)  # mcmc_* plotting helpers, color_scheme_set()
library(shinystan)  # interactive diagnostics (loads shiny)
library(tictoc)     # tic()/toc() timing (elapsed-time output appears below)
library(readr)
rstan_options(auto_write = TRUE) # cache compiled Stan models on disk
options(mc.cores = parallel::detectCores()) # run chains in parallel
### Simulating data ----
set.seed(849107) # for reproducibility

### Define multilevel design: four grouping factors, each 5 groups x 300 obs.
ng_marketing   <- 5   # number of marketing sources
nj_marketing   <- 300 # marketing cluster size (TODO: make unbalanced)
ng_year        <- 5   # years of data
nj_year        <- 300 # cluster size (TODO: make unbalanced)
ng_city        <- 5   # number of cities
nj_city        <- 300 # cluster size (TODO: make unbalanced)
ng_salesperson <- 5   # number of sales professionals
nj_salesperson <- 300 # cluster size (TODO: make unbalanced)

# Total observations. The original recomputed (and overwrote) n once per
# factor; all four products are identical (5 * 300 = 1500), so compute once.
n <- ng_marketing * nj_marketing
### Creating the cluster (grouping) variables. gl(ng, k = nj) builds a
### balanced factor: ng levels, each repeated in a run of nj consecutive
### values. sample(..., replace = FALSE) then permutes all n values so group
### membership appears in random order. NOTE: every sample() call advances
### the RNG stream, so the statement order here matters for reproducibility
### under set.seed() above.
#marketing: lead source, 5 levels x 300
lead_source <- gl(ng_marketing, k = nj_marketing,
labels = c("radio", "tv", "newspaper",
"internet", "vehicle"
)
)
lead_source <- sample(lead_source, n, replace = FALSE)
#year: 5 calendar years x 300
year <- gl(ng_year, k = nj_year,
labels = c("2015", "2016", "2017",
"2018", "2019"
)
)
year <- sample(year, n, replace = FALSE)
#city: 5 cities x 300
city <- gl(ng_city, k = nj_city,
labels = c("davis", "vacaville", "fairfield",
"woodland", "suisun"
)
)
city <- sample(city, n, replace = FALSE)
#sales_professional: 5 salespeople x 300
sales_professional <- gl(ng_salesperson, k = nj_salesperson,
labels = c("John", "Susan", "Peter",
"Steven", "Mary"
)
)
sales_professional <- sample(sales_professional, n, replace = FALSE)
### Standard (single-level, non-clustered) covariates. As above, each
### sample()/rpois() call consumes the RNG stream, so order is significant.
#product: three levels with unequal counts (750 HVAC / 500 DHW / 250 SOLAR),
#then shuffled across the n observations
product <- factor(c(rep("HVAC", 750),
rep("DHW", 500),
rep("SOLAR", 250))
)
product <- sample(product, n, replace = FALSE)
#n_service: Poisson(2) counts, centered and scaled.
#NOTE(review): scale() returns a 1-column matrix, not a plain vector.
n_service <- scale(rpois(1500, 2))
#prior_revenue: binary indicator (1000 zeros, 500 ones), shuffled
prior_revenue <- c(rep(0, 1000), rep(1, 500))
prior_revenue <- sample(prior_revenue, n, replace = FALSE)
#call_to_estimate: scaled Poisson(2); wrapping in c() drops the matrix
#attributes that scale() attaches
call_to_estimate <- c(scale(rpois(1500, 2)))
### Lookup tables mapping each factor label to the centered numeric value it
### contributes in the simulated regression below (levels are coded 1..k by
### as.numeric(), then shifted so the middle level is 0).
### The original wrapped each table in data.frame(bind_cols(...)); bind_cols
### added nothing (and silently required dplyr), so it is dropped and the
### repeated pattern is factored into one helper.

# Build a label/value lookup for a factor.
#   f      : a factor
#   offset : value subtracted from the 1-based level codes to center them
# Returns a data.frame with one row per distinct (label, value) pair.
make_level_table <- function(f, offset) {
  unique(data.frame(name = f, value = as.numeric(f) - offset))
}

marketing_table   <- make_level_table(lead_source, 3)
year_table        <- make_level_table(year, 3)
city_table        <- make_level_table(city, 3)
salesperson_table <- make_level_table(sales_professional, 3)
product_table     <- make_level_table(product, 2)
# sd of the simulated noise; in estimating coefficients, sigma has an
# inverse relationship with n (smaller sigma -> tighter estimates)
sigma <- 0.01

### Simulate the linear predictor: each grouping factor contributes its
### centered level code (see the *_table lookups above) with a unit slope,
### plus the single-level covariates and Gaussian noise.
close <- as.numeric(lead_source) - 3 +
  as.numeric(year) - 3 +
  as.numeric(city) - 3 +
  as.numeric(sales_professional) - 3 +
  as.numeric(product) - 2 +
  n_service +
  prior_revenue +
  call_to_estimate +
  rnorm(n, 0, sigma)

### Convert to a binary outcome via the logistic link
pr <- 1 / (1 + exp(-close)) # inverse-logit: per-observation success probability
range(pr)
## [1] 2.843567e-05 9.999784e-01
# Draw the observed 0/1 outcome. This step was missing from the script:
# without it `close` stays continuous and the binomial-family models below
# fail. The table() output preserved below shows the intended 0/1 split.
close <- rbinom(n, 1, pr)
table(close)
## close
## 0 1
## 724 776
# Assemble the analysis data set referenced by the model calls (data = dat),
# which was otherwise never created.
dat <- data.frame(close, lead_source, year, city, sales_professional,
                  product, n_service, prior_revenue, call_to_estimate)
### Multilevel (partial-pooling) logistic model: random intercepts for each
### of the four grouping factors; product, n_service, prior_revenue and
### call_to_estimate enter as population-level terms. The trailing `- 1`
### removes the global intercept so product levels are coded directly.
### Flat-ish priors (autoscale = FALSE) keep the fit comparable to the
### single-level model below.
### NOTE(review): warmup = 150 of iter = 1000 is short for NUTS — confirm
### the R-hat/ESS diagnostics further down before trusting the fit.
mlm_sim <- stan_glmer(formula = close ~ (1|lead_source) +
(1|year) +
(1|city) +
(1|sales_professional) +
product +
n_service +
prior_revenue +
call_to_estimate - 1,
prior = normal(location = 0, scale = 1, autoscale = FALSE),
prior_intercept = cauchy(location = 0, scale = 2.5, autoscale = FALSE),
prior_aux = cauchy(location = 0, scale = 25, autoscale = FALSE),
data = dat,
family = binomial,
cores = 3,
chains = 4,
iter = 1000,
warmup = 150,
seed = 1,
refresh = 50)
### Single-level (no-pooling) comparison model: the same predictors, but the
### grouping factors enter as fixed effects (one coefficient per level)
### instead of random intercepts. Same priors and sampler settings as
### mlm_sim (different seed) so the WAIC comparison below isolates the
### effect of partial pooling.
glm_sim <- stan_glm(formula = close ~ lead_source +
year +
city +
sales_professional +
product +
n_service +
prior_revenue +
call_to_estimate - 1,
prior = normal(location = 0, scale = 1, autoscale = FALSE),
prior_intercept = cauchy(location = 0, scale = 2.5, autoscale = FALSE),
prior_aux = cauchy(location = 0, scale = 25, autoscale = FALSE),
data = dat,
family = binomial,
cores = 3,
chains = 4,
iter = 1000,
warmup = 150,
seed = 9,
refresh = 50)
### WAIC: Widely Applicable Information Criterion for each fitted model
mlm_waic <- waic(mlm_sim, cores = 4)
glm_waic <- waic(glm_sim, cores = 4)
### WAIC comparison: the best model is listed first with elpd_diff = 0;
### a negative elpd_diff means lower expected predictive accuracy. Here the
### multilevel model beats the fixed-effects model by ~37 elpd (se 7.4) —
### about 5 standard errors, a clear win for partial pooling.
loo_compare(mlm_waic, glm_waic)
## elpd_diff se_diff
## mlm_sim 0.0 0.0
## glm_sim -37.3 7.4
## 2656.27 sec elapsed
## 2773.01 sec elapsed
## elpd_diff se_diff
## mlm_sim 0.0 0.0
## glm_sim -37.3 7.4
## Priors for model 'mlm_sim'
## ------
##
## Coefficients
## ~ normal(location = [0,0,0,...], scale = [1,1,1,...])
##
## Covariance
## ~ decov(reg. = 1, conc. = 1, shape = 1, scale = 1)
## ------
## See help('prior_summary.stanreg') for more details
# Construct the generative (prior-predictive) model using the same formula
# and priors as mlm_sim. prior_PD = TRUE tells rstanarm to sample from the
# priors only — the likelihood (and hence the data) is ignored, so draws
# from this fit characterize what the priors alone imply about the outcome.
mlm_sim_prior <- stan_glmer(formula = close ~ (1|lead_source) +
(1|year) +
(1|city) +
(1|sales_professional) +
product +
n_service +
prior_revenue +
call_to_estimate - 1,
prior = normal(location = 0, scale = 1, autoscale = FALSE),
prior_intercept = cauchy(location = 0, scale = 2.5, autoscale = FALSE),
prior_aux = cauchy(location = 0, scale = 25, autoscale = FALSE),
prior_PD = TRUE,
data = dat,
family = binomial,
cores = 3,
chains = 4,
iter = 1000,
warmup = 150,
seed = 1,
refresh = 50)
# Sample the response from the prior predictive distribution.
# posterior_predict() is the usual API, but because the model was fit with
# prior_PD = TRUE the data never entered, so these are prior draws.
y_rep <- posterior_predict(mlm_sim_prior)
# Rows are posterior iterations; columns are observations — one per row of
# `dat` (n = 1500), which is why there are 1500 columns. Plot the density
# of the per-observation mean predicted outcome.
plot(density(apply(y_rep, 2, mean)))
### Sampler diagnostics: parallel-coordinates plot across all coefficients
l_post <- log_posterior(mlm_sim) # per-draw log posterior (kept for later use)
nuts_p <- nuts_params(mlm_sim)   # NUTS diagnostics (divergences, energy, ...)
pars <- names(mlm_sim$coefficients)
color_scheme_set("darkgray")
# One line per posterior draw across all parameters; np = nuts_p highlights
# divergent transitions. The original applied two theme() calls in which the
# first blanked axis.text.x and the second immediately overrode it with
# rotated labels; later theme() calls win element-by-element, so the merged
# theme below preserves the effective result (no x title, 90-degree labels).
mcmc_parcoord(mlm_sim, np = nuts_p, pars = pars) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### MCMC diagnostics ----
### R-hat (potential scale reduction): values near 1 indicate the chains
### mixed; tidy the fit summary into a tibble with a `variable` column
m_summary <- summary(mlm_sim) %>%
as.data.frame() %>%
mutate(variable = rownames(.)) %>%
select(variable, everything()) %>%
as_tibble()
# distribution of Rhat across all parameters
m_summary %>%
ggplot(aes(Rhat)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
color_scheme_set("brightblue") # see help("color_scheme_set")
rhats <- rhat(mlm_sim, pars = pars) #works as a CDF too
mcmc_rhat(rhats) + yaxis_text(hjust = 1)
### Effective Sample Size: ratio of effective to total posterior draws
### (low ratios indicate high autocorrelation in the chains)
ratios_cp <- neff_ratio(mlm_sim, pars = pars)
#print(ratios_cp)
mcmc_neff(ratios_cp, size = 2)
# total iterations per chain pulled from the underlying stanfit object
iterations = mlm_sim[["stanfit"]]@sim[["iter"]]
m_summary %>%
ggplot(aes(n_eff/iterations)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### Autocorrelation of the chains for the first three coefficients.
# NOTE(review): this overwrites the full `pars` vector used by the
# diagnostics above — rerun those before reusing `pars`.
pars = names(mlm_sim$coefficients[1:3])
mcmc_acf(mlm_sim, pars = pars, lags = 10)
## $lead_source
## (Intercept)
## radio -1.89454310
## tv -1.09452622
## newspaper 0.05502839
## internet 0.80787828
## vehicle 1.96022543
##
## $year
## (Intercept)
## 2015 -1.9540767
## 2016 -1.0766000
## 2017 -0.2532088
## 2018 1.1979384
## 2019 1.8545443
##
## $city
## (Intercept)
## davis -1.8081953
## vacaville -1.2131092
## fairfield -0.3171668
## woodland 1.0816224
## suisun 2.1825517
##
## $sales_professional
## (Intercept)
## John -2.32908962
## Susan -0.85040569
## Peter -0.02412893
## Steven 0.98825503
## Mary 2.04118571
##
## with conditional variances for "lead_source" "year" "city" "sales_professional"
### Plotting posterior draws for the lead_source group intercepts ----
posterior <- as.matrix(mlm_sim)
# Columns 7:11 hold the lead_source intercepts; rename for display.
# NOTE(review): positional indexing is fragile — confirm the column order
# against colnames(posterior) if the model formula changes.
colnames(posterior)[7:11] <- c("Internet", "Newspaper", "Radio", "TV", "Vehicle")
mcmc_intervals(
  exp(posterior)/(1 + exp(posterior)), # inverse-logit: logit -> probability
  pars = colnames(posterior)[7:11],
  prob = 0.8,        # inner 80% intervals
  prob_outer = 0.95, # outer 95% intervals; the mcmc_intervals default is 0.9,
                     # which contradicted the subtitle's "95%" claim
  point_est = "mean"
) +
  labs( title = "Vehicle & Internet marketing campaigns most successful",
        subtitle = "95% and 80% probability intervals, with mean point",
        caption = "x-axis transformed from logit to probability scale",
        x = "Value-Add (added probability of successful sale)",
        y = "") +
  theme(text = element_text(size=16))