# Create example spam detection data ----
#
# Simulates a labelled e-mail data set with three features and a binary label.
# The spam rows come first, followed by the non-spam rows.
#
# Columns of `spam_data`:
#   exclamation_marks  Poisson count (mean 5 for spam, 0.5 for non-spam)
#   contains_free      0/1 Bernoulli flag (p = 0.8 for spam, 0.1 for non-spam)
#   length             normal draw (mean 200, sd 50 for spam;
#                      mean 500, sd 100 for non-spam)
#   is_spam            label: 1 = spam, 0 = non-spam
set.seed(123)  # fixed seed so the simulated data are reproducible

n_emails <- 1000
n_spam <- 100                # number of spam rows
n_ham <- n_emails - n_spam   # remaining rows are non-spam
stopifnot(n_spam >= 0, n_ham >= 0)

# NOTE: the random draws are kept in their original order (one column at a
# time, spam before non-spam) so the generated values match the original.
spam_data <- data.frame(
  exclamation_marks = c(rpois(n_spam, 5), rpois(n_ham, 0.5)),  # Spam has more !
  contains_free = c(rbinom(n_spam, 1, 0.8), rbinom(n_ham, 1, 0.1)),  # Spam mentions "free"
  length = c(rnorm(n_spam, 200, 50), rnorm(n_ham, 500, 100)),  # Spam is shorter
  is_spam = rep(c(1, 0), times = c(n_spam, n_ham))
)
# Look at the data: print the first six rows of `spam_data`.
head(spam_data)
# Console output recorded with the script (kept as comments so the file
# still parses as R):
#>   exclamation_marks contains_free    length is_spam
#> 1                 4             1 150.21006       1
#> 2                 7             1 148.00225       1
#> 3                 4             1 199.10099       1
#> 4                 8             0 193.39124       1
#> 5                 9             0  72.53286       1
#> 6                 2             1 252.02867       1





