Machine Learning algorithm with low recall
Machine Learning algorithm with low recall
I'm trying to create a model based on the likelihood of an event, based on a behaviour of 7 variables.
The problem is that happening of the event - a user buying something - is approximately 1 % of all contacts. So even if the models have a massive accuracy of 99%, they are pretty bad to detect the 1% of that accuracy.
In the best of words, would be generating a likelihood of withrev column/variable being one from 0 to 1.
My code is the following, I added 110 rows.
library(caret)
library(pROC)
library(RANN)
train = structure(list(id = c(380858, 369968, 280867, 22590, 427261,
35899, 416468, 382882, 452351, 242913, 117705, 198652, 69613,
128245, 413836, 428429, 369245, 452288, 365766, 366455, 62413,
247529, 238390, 423846, 423924, 413305, 24969, 397607, 390359,
124830, 22340, 31907, 237708, 400852, 433123, 23434, 332540,
466145, 315170, 271449, 343925, 317769, 16094, 436306, 133180,
335606, 353099, 337339, 298883, 200758, 14239, 263128, 3304,
439342, 328788, 375375, 204907, 285234, 277300, 283870, 397150,
328893, 302016, 216045, 424227, 396024, 341372, 323088, 189579,
141107, 328222, 329196, 3012, 264832, 14965, 341169, 102881,
352503, 115213, 455984, 97339, 92839, 267388, 378203, 199180,
344888, 94432, 399611, 371614, 165821, 127668, 466233, 192372,
114379, 17741, 291622, 202945, 229930, 41095, 152952, 309194,
8866, 169539, 97375, 434700, 52413, 6882, 18985, 1102, 206164
), sends = c(1L, 1L, 3L, 22L, 1L, 20L, 1L, 1L, 1L, 5L, 10L, 7L,
13L, 9L, 1L, 1L, 1L, 1L, 1L, 1L, 14L, 5L, 6L, 1L, 1L, 1L, 22L,
1L, 1L, 10L, 22L, 21L, 6L, 1L, 1L, 22L, 2L, 1L, 2L, 3L, 2L, 2L,
22L, 1L, 9L, 2L, 2L, 2L, 3L, 7L, 23L, 4L, 33L, 1L, 2L, 1L, 7L,
3L, 3L, 3L, 1L, 2L, 3L, 6L, 1L, 1L, 2L, 2L, 7L, 9L, 2L, 2L, 33L,
4L, 23L, 2L, 11L, 2L, 10L, 1L, 11L, 11L, 4L, 1L, 7L, 2L, 11L,
1L, 1L, 8L, 9L, 1L, 7L, 10L, 22L, 3L, 7L, 6L, 17L, 9L, 2L, 26L,
8L, 11L, 1L, 15L, 29L, 22L, 37L, 7L), unique_opens = c(0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 8L, 7L, 4L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
4L, 3L, 25L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 20L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 15L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L),
unique_clicks = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 14L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), avg_duration_time = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.22076, 0, 0, 0, 0, 0, 0, 0, 0, 0.19864, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.17882, 0,
0.40527, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.30109, 0, 0, 0, 0, 0.27801, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.1908, 0, 0, 0, 0.19413, 0), jl_ord = c(0L, 0L, 6L, 4L,
6L, 5L, 2L, 5L, 1L, 4L, 4L, 6L, 6L, 4L, 6L, 2L, 6L, 5L, 2L,
5L, 4L, 6L, 5L, 0L, 1L, 6L, 0L, 2L, 6L, 5L, 0L, 0L, 1L, 2L,
6L, 0L, 0L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 0L, 6L, 6L, 6L,
6L, 5L, 5L, 1L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 1L, 6L, 6L, 6L,
0L, 0L, 4L, 0L, 5L, 6L, 0L, 6L, 2L, 1L, 2L, 1L, 6L, 6L, 0L,
2L, 5L, 5L, 0L, 6L, 3L, 6L, 0L, 2L, 6L, 0L, 6L, 6L, 0L, 5L,
0L, 1L, 5L, 1L, 3L, 5L, 6L, 5L, 6L, 4L, 3L, 6L, 6L, 0L, 5L,
6L), completeness = c(0.333, 0.333, 0.333, 0.667, 0.333,
0.5, 0.667, 0.5, 0.5, 0.833, 0.667, 0.333, 0.333, 0.667,
0.333, 0.5, 0.333, 0.5, 0.5, 0.5, 0.833, 0.667, 0.5, 0.167,
0.5, 0.667, 0.167, 0.5, 0.667, 0.5, 0.167, 0.333, 0.667,
0.5, 0.667, 0.167, 0.333, 0.5, 0.5, 0.5, 0.333, 0.5, 0.5,
0.5, 0.667, 0.333, 0.5, 0.333, 0.5, 0.5, 0.5, 0.667, 0.833,
0.333, 0.5, 0.667, 0.667, 0.333, 0.333, 0.333, 0.5, 0.5,
0.5, 0.5, 0.167, 0.167, 0.5, 0.333, 0.5, 0.667, 0.333, 0.5,
0.667, 0.5, 0.5, 0.5, 0.833, 0.667, 0.333, 0.5, 0.667, 0.667,
0.5, 0.667, 0.5, 0.5, 0.333, 0.5, 0.333, 0.333, 0.667, 0.667,
0.333, 0.5, 0.333, 0.5, 0.5, 0.5, 0.667, 0.667, 0.5, 0.5,
0.667, 0.667, 0.5, 0.667, 0.667, 0.333, 0.667, 0.5), withrev = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(380858L,
369968L, 280867L, 22590L, 427261L, 35899L, 416468L, 382882L,
452351L, 242913L, 117705L, 198652L, 69613L, 128245L, 413836L,
428429L, 369245L, 452288L, 365766L, 366455L, 62413L, 247529L,
238390L, 423846L, 423924L, 413305L, 24969L, 397607L, 390359L,
124830L, 22340L, 31907L, 237708L, 400852L, 433123L, 23434L, 332540L,
466145L, 315170L, 271449L, 343925L, 317769L, 16094L, 436306L,
133180L, 335606L, 353099L, 337339L, 298883L, 200758L, 14239L,
263128L, 3304L, 439342L, 328788L, 375375L, 204907L, 285234L,
277300L, 283870L, 397150L, 328893L, 302016L, 216045L, 424227L,
396024L, 341372L, 323088L, 189579L, 141107L, 328222L, 329196L,
3012L, 264832L, 14965L, 341169L, 102881L, 352503L, 115213L, 455984L,
97339L, 92839L, 267388L, 378203L, 199180L, 344888L, 94432L, 399611L,
371614L, 165821L, 127668L, 466233L, 192372L, 114379L, 17741L,
291622L, 202945L, 229930L, 41095L, 152952L, 309194L, 8866L, 169539L,
97375L, 434700L, 52413L, 6882L, 18985L, 1102L, 206164L))
train$withrev = factor(train$withrev)
id<-train$id
train$id = NULL
train_transformed =train
train_transformed$withrev<-as.factor(train_transformed$withrev)
index <- createDataPartition(train_transformed$withrev, p=0.5, list=FALSE)
trainSet <- train_transformed[ index,]
testSet <- train_transformed[-index,]
control <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
repeats = 3,
verbose = T)
outcomeName<-'withrev'
predictors<-names(trainSet)[!names(trainSet) %in% outcomeName ]
Loan_Pred_Profile <- rfe(trainSet[,predictors], trainSet[,outcomeName],
rfeControl = control)
fitControl <- trainControl(
method = "repeatedcv",
number = 5,
repeats = 5)
model_gbm<-train(trainSet[,predictors],trainSet[,outcomeName],method='gbm',trControl=fitControl,tuneLength=5)
predictions = predict(model_gbm,testSet)
mperf = confusionMatrix(predictions,testSet$withrev)
auc <- roc(as.numeric(testSet[,outcomeName]), as.numeric(predictions))
print(auc$auc)
The AUC would be around 0.5, in the example is more than that because the data is very limited.
It is possible that the model is not feasible, due to the non-existent or very low correlation between variables.
Regards,
Google "class imbalance" - a huge sub-topic with its own dedicated techniques & methodologies...
– desertnaut
Aug 20 at 18:40
By clicking "Post Your Answer", you acknowledge that you have read our updated terms of service, privacy policy and cookie policy, and that your continued use of the website is subject to these policies.
This would be a much better fit at datascience.stackexchange or stats.stackexchange. It's not really a programming question. You might want to do some searching/reading about "imbalanced data" - there are probably many pertinent questions and answers on both those sites already, for example this one and this one
– Gregor
Aug 20 at 17:54