library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
I recently came across an interesting probability puzzle — the German tank problem — that has its origins in WW2. The Allied forces were trying to figure out how many tanks the Germans had produced by analyzing the sequentially numbered serial numbers of the tanks that they could find.
The problem can be generalized as trying to guess the maximum number in a given sequence given random numbers from that sequence.
When I first heard about this, I wanted to see if I could come up with my own solution before looking anything up. With some thought, I came up with a simple formula: take the larger of double the sample mean and the maximum number from the sample. Formalized here:
\[ N \approx \max(2\mu,\ m) \\ N = \text{actual maximum} \\ m = \text{maximum from sample} \\ \mu = \text{sample average} \] I set out to test it with a simulation.
set.seed(345)

n_trials <- 1000
max_pop_size <- 3001

# Preallocate the result vectors instead of growing them with c() inside
# the loop (growing copies the whole vector on every append).
real <- numeric(n_trials)
my_estimate <- numeric(n_trials)

for (i in seq_len(n_trials)) {
  # True population maximum for this trial: an integer in [1, 3000].
  # (The RNG call sequence matches the original code exactly, so the
  # simulated numbers shown below are unchanged.)
  pop_max <- floor(runif(1, 1, max_pop_size))
  # Sample without replacement; the sample size is random, capped at 150
  # and at the population size itself.
  draw <- sample(seq_len(pop_max),
                 floor(runif(1, 1, min(150, pop_max))),
                 replace = FALSE)
  real[i] <- pop_max
  # My estimator: the larger of twice the sample mean and the sample max.
  my_estimate[i] <- round(max(max(draw), mean(draw) * 2))
}
# Collect the simulation results, deriving the error columns with
# mutate() so each new column can build on the previous one.
results <- tibble(real = real, my_estimate = my_estimate) %>%
  mutate(
    off_by = abs(real - my_estimate),
    off_perc = (off_by / real) * 100
  )
# Preview the first ten simulation results.
head(results, 10)
## # A tibble: 10 × 4
## real my_estimate off_by off_perc
## <dbl> <dbl> <dbl> <dbl>
## 1 649 764 115 17.7
## 2 2605 2597 8 0.307
## 3 1037 1121 84 8.10
## 4 2734 3495 761 27.8
## 5 1637 1634 3 0.183
## 6 481 479 2 0.416
## 7 2446 2435 11 0.450
## 8 2875 2851 24 0.835
## 9 1948 1941 7 0.359
## 10 317 317 0 0
# Average percentage error across all trials.
mean(results$off_perc)
## [1] 5.05941
Looking at this sample of the data frame, we see that the estimate is occasionally very far off but mostly very accurate, with what I think is a pretty good average error.
It’s at this point that I decided to look up other potential solutions. I decided to test three of them: one frequentist and two Bayesian.
\[ \text{Frequentist minimum-variance unbiased estimator} \\ N \approx m + \frac{m}{k} - 1 \\ \\ \text{Bayesian median} \\ N \approx m + \frac{m\ln(2)}{k-1} \\ \\ \text{Bayesian mean} \\ N \approx (m-1)\frac{k-1}{k-2} \\ \\ \text{where:} \\ N = \text{actual maximum} \\ m = \text{sample maximum} \\ k = \text{sample size} \]
Here I simulate each method and measure its effectiveness against the others for each sample size. (Note: the sample size being set here is really an upper bound — because we don’t know the actual size of the population, the sample actually drawn may be smaller when the population itself is smaller than the cap.)
set.seed(345)

max_pop_size <- 3001
n_trials <- 1000       # simulated populations per sample-size cap
sample_caps <- 3:1000  # cap on the sample size for each run

# Preallocate one slot per cap instead of growing vectors with c() in
# the loop (growing copies the whole vector on every append).
min_sample     <- integer(length(sample_caps))
my_error_rate  <- numeric(length(sample_caps))
fr_error_rate  <- numeric(length(sample_caps))
bmd_error_rate <- numeric(length(sample_caps))
bmn_error_rate <- numeric(length(sample_caps))

# Iterating through the maximum sample taken. (The RNG call sequence is
# identical to the original code, so the simulated numbers shown below
# are unchanged.)
for (idx in seq_along(sample_caps)) {
  j <- sample_caps[idx]

  # Per-trial percentage errors for this cap. The original built a
  # 13-column tibble each pass only to take four column means, so the
  # means are computed from these vectors directly instead.
  my_off_perc  <- numeric(n_trials)
  fr_off_perc  <- numeric(n_trials)
  bmd_off_perc <- numeric(n_trials)
  bmn_off_perc <- numeric(n_trials)

  for (i in seq_len(n_trials)) {
    # True population maximum for this trial: an integer in [3, 3000].
    pop_max <- floor(runif(1, min = 3, max = max_pop_size))
    # Sample without replacement. The size is at least 3 — so k - 1 and
    # k - 2 below are positive and no estimate can be Inf, which is why
    # the old Inf-debugging branch was removed — and at most
    # min(j, pop_max).
    draw <- sample(seq_len(pop_max),
                   floor(runif(1, min = 3, max = min(j, pop_max))),
                   replace = FALSE)
    m <- max(draw)     # sample maximum (avoid shadowing base::max)
    k <- length(draw)  # sample size

    # My estimator: the larger of twice the sample mean and the maximum.
    my_est <- round(max(m, mean(draw) * 2))
    # Frequentist minimum-variance unbiased estimator.
    fr_est <- m + (m / k) - 1
    # Bayesian median estimator.
    bmd_est <- m + ((m * log(2)) / (k - 1))
    # Bayesian mean estimator.
    bmn_est <- (m - 1) * ((k - 1) / (k - 2))

    # Absolute percentage error of each estimator for this trial.
    my_off_perc[i]  <- abs(pop_max - my_est) / pop_max * 100
    fr_off_perc[i]  <- abs(pop_max - fr_est) / pop_max * 100
    bmd_off_perc[i] <- abs(pop_max - bmd_est) / pop_max * 100
    bmn_off_perc[i] <- abs(pop_max - bmn_est) / pop_max * 100
  }

  min_sample[idx]     <- j
  my_error_rate[idx]  <- mean(my_off_perc)
  fr_error_rate[idx]  <- mean(fr_off_perc)
  bmd_error_rate[idx] <- mean(bmd_off_perc)
  bmn_error_rate[idx] <- mean(bmn_off_perc)
}
# Assemble the per-cap error rates. tibble() auto-names each column
# after the vector passed in, so no explicit `name =` pairs are needed.
error_set <- tibble(
  min_sample,
  my_error_rate,
  fr_error_rate,
  bmd_error_rate,
  bmn_error_rate
)
# Show a middle slice of the error table (rows 47 through 60).
error_set %>% slice(47:60)
## # A tibble: 14 × 5
## min_sample my_error_rate fr_error_rate bmd_error_rate bmn_error_rate
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 49 8.38 4.35 4.17 5.62
## 2 50 8.75 4.72 4.65 5.92
## 3 51 8.46 4.76 4.61 6.57
## 4 52 8.17 4.58 4.36 6.48
## 5 53 7.81 4.22 4.04 5.59
## 6 54 8.08 4.48 4.39 5.83
## 7 55 7.55 3.84 3.68 4.99
## 8 56 7.84 4.24 4.05 5.60
## 9 57 7.70 4.24 4.12 5.60
## 10 58 7.26 3.97 3.76 5.44
## 11 59 7.89 4.13 3.88 5.30
## 12 60 7.64 3.76 3.62 5.40
## 13 61 7.18 3.74 3.59 5.20
## 14 62 7.21 3.68 3.55 5.03
Here’s a middle segment of the results. Showing the whole data frame might result in an unenviable level of scrolling. And it might be better visualized with a graph.
# One color per estimator; names match the error_set columns.
method_colors <- c(
  my_error_rate  = "purple",
  fr_error_rate  = "blue",
  bmd_error_rate = "orange",
  bmn_error_rate = "red"
)

# Build the raw-trace and smoothed layers from the same column/color
# pairs. lapply() with a function (rather than a for loop) is used so
# each layer's closure captures its own column name; the .data pronoun
# looks the column up at plot-build time.
line_layers <- lapply(names(method_colors), function(v) {
  geom_line(aes(y = .data[[v]]), color = method_colors[[v]],
            linewidth = 0.25)
})
smooth_layers <- lapply(names(method_colors), function(v) {
  geom_smooth(aes(y = .data[[v]]), color = method_colors[[v]],
              linewidth = 0.5)
})

# All raw lines are drawn first, then all smooths — the same layer
# order as before, so the rendered plot is unchanged.
ggplot(error_set, aes(x = min_sample)) +
  line_layers +
  smooth_layers +
  scale_y_log10() +
  labs(x = "Minimum Sample",
       y = "Error Rate %") +
  theme_minimal()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Purple = my error rate, Blue = frequentist minimum-variance unbiased
estimator error rate, Orange = Bayes median error rate, Red = Bayes
mean error rate.
It seems like the frequentist approach is the most effective, with the Bayes median being a very close second, and my method a distant last.
This didn’t make complete sense to me at first, but on reflection it kind of does. The chance that your sample actually contains the true maximum is low, and the gap between the sample maximum and the true maximum tends to grow as the population gets larger relative to your sample. So it probably makes sense to just assume the maximum is higher than anything you observed, and estimate by how much. Having said that, it’s not as if my method is “bad” — it’s still able to come pretty close. Which makes me wonder if we could further optimize by taking combinations of values, testing for bias, shifting between them, etc., but that seems like a project for another day.