[R] Improve training of predictive neural networks

Wed Jan 13 23:55:57 CET 2016

Dear all,
I am trying to train a predictive neural network to guess the
positivity/negativity of some tests I have done. I am using the
library pnn and I have empirically identified the best sigma to use in
the algorithm but the results I got are affected by false positive
results.
The actual dataset I am using consists of 14,000 values, which I have
subdivided into training and query subsets. Since the dataset is too
large to use in the present example, suffice it to say that the values
with y less than 0.035 are negative in the training subset. However,
in the query subset many values with y < 0.035 and x between about 10
and 30 are given as positive.
Since the only parameter that I can change in pnn is essentially sigma
and the training subset is properly dividing the positive and negative
populations, how can I further train the system to reduce the false
positive rate?
In the example I am using here, which is just an approximation of the
real thing, red and black circles indicate the positive and negative
values of the training subset, respectively. Squares come from the
query subset, red/black = positive/negative by pnn. The line indicates
the cut-off that I was expecting the model to guess.
Many thanks
Luigi

>>>
# Training subset ----
# Three parallel vectors, one element per training observation; they are
# combined into the data frame `t.df` at the end of this section.
# NOTE: `x` and `y` are reassigned further down for the query subset;
# `t.df` keeps its own copy of the training values.

# First predictor (horizontal axis of the plot).
x <- c(3.15,    2.97,    3.21,    45,    2,    2.47,    2.97,    2.6,
  7.35,    4.11,    37.12,    2.73,    36.36,    2.4,    2.74,    45,
  2.47,    37.4,    45,    2.97,    2,    2,    2.55,    2.51,
2.68,    2.31,    2.6,    2,    2.57,    37.05,    13.84,    19.18,
21.94,    28.61,    38.01,    38.24,    38.33,    29.01,    24.64,
10.03,    10.12,    10.29,    10.32,    10.39,    10.41,    10.44,
10.51,    10.64,    10.65,    10.67,    10.83,    10.85,    10.97,
11.24,    11.43,    11.85,    11.87,    12.02,    12.03,    12.05,
12.12,    12.22,    12.29,    12.3,    12.33,    12.62,    12.62,
12.64,    12.69)
# Second predictor (vertical axis of the plot); the expected cut-off drawn
# at the end of the script is y = 0.035.
y <- c(0.014,    0.008,    0.008,    0.001,    0.002,    0,    0.013,
  0.008,    0.001,    0.011,    0.076,    0.005,    0.045,    0.002,
 0.016,    0.001,    0.002,    0.086,    0.002,    0.019,    0,
0.002,    0.024,    0.015,    0.009,    0.013,    0.017,    0.009,
0.012,    0.088,    0.129,    0.097,    0.085,    0.096,    0.087,
0.103,    0.066,    0.11,    0.11, 0.001,    0.002,    0.002,
0.104,    0,    0.003,    0.116,    0.001,    0.002,    -0.001,
0.116,    0.124,    0.004,    0.116,    0.124,    0.119,    0.003,
0.112,    0.003,    0.002,    0.092,    0.118,    0.108,    0.104,
0,    0.112,    0.131,    0.001,    0.125,    0.005)
# Class label: 1 = positive, 0 = negative (the script later splits the
# training set on z == 1 / z == 0).
z <- c(0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,
  0,    1,    0,    0,    0,    0,    1,    0,    0,    0,    0,    0,
   0,    0,    0,    0,    0,    0,    1,    1,    1,    1,    1,
1,    1,    1,    1,    1, 0,    0,    0,    1,    0,    0,    1,
0,    0,    0,    1,    1,    0,    1,    1,    1,    0,    1,    0,
 0,    1,    1,    1,    1,    0,    1,    1,    0,    1,    0)
# Label goes first so that columns 1:3 are (z, x, y), which is the slice
# later handed to the pnn training call.
t.df <- data.frame(z, x, y)

# Plot the training subset ----
# Split by class label, then draw positives as red circles and negatives as
# default (black) circles. Axis limits span the whole training set so that
# both classes fit in the same frame.
t.pos <- subset(t.df, z == 1)
t.neg <- subset(t.df, z == 0)
plot(
  t.pos$x, t.pos$y,
  col = "red",
  xlab = "x", ylab = "y",
  xlim = range(t.df$x),
  ylim = range(t.df$y)
)
with(t.neg, points(x, y))

# Query subset ----
# Unlabelled observations to be classified. `x` and `y` are overwritten
# here; the training values are unaffected because `t.df` already holds
# its own copy.
# NOTE(review): the last four points (x = 12, 20, 25, 30) look like
# hand-added probes in the region the post describes as problematic
# (y < 0.035, x between about 10 and 30) -- confirm with the author.

# First predictor of the query observations.
x <- c(38.01,    2.7,    4.89,    2.76,    2.96,    2.91,    38.61,
2.89,    2.07,    2.72,    2.77,    4.49,    3.06,    3.1,    2,
2.95,    3.37,    4.7,    2.98,    2.89,    44.6,    3.09,    28.05,
 2.8,    4.76,    4.91,    3.04,    2.79,    3.1,    37.62,    5.49,
 3.17,    4.53,    2.77,    2.87,    4.91,    3.08,    3.04,    3.03,
  3.09,    4.74,    2.74,    4.25,    3.31,    28.22,    3.05,
4.68,    4.8,    3.12,    2.65,    2.62,    2.91,    38.32,    2.86,
 2.96,    2.95,    16.24,    3.01,    3.25,    2.93,    2.92,    2.93,
   2.99,    4.79,    3.13,    3.01,    3.29,    2.76,    3.44,
4.91,    3.14,    2.9,    3.03,    2.51,    2.91,    2.52,    45,
3.15,    44.48,    3.03,    2.76,    4.81,    14.97,    2.8, 12, 20,
25, 30)
# Second predictor of the query observations.
y <- c(0.082,    0.007,    0.034,    0.007,    0.027,    0.009,
0.057,    0.028,    0.007,    0.02,    0.022,    0.021,    0.012,
0.018,    0,    0.021,    0.041,    0.021,    0.021,    0.011,
0.025,    0.011,    0.102,    0.016,    0.035,    0.015,    0.008,
0.017,    0.028,    0.084,    0.013,    0.032,    0.004,    0.006,
0.025,    0.019,    0.006,    0.018,    0.019,    0.02,    0.021,
0.009,    0.015,    0.023,    0.089,    0.023,    0.025,    0.034,
0.035,    0.009,    0.006,    0.007,    0.056,    0.025,    0.016,
0.012,    0.101,    0.019,    0.017,    0.031,    0.019,    0.014,
0.044,    0.02,    0.018,    0.017,    0.018,    0.008,    0.02,
0.017,    0.016,    0.021,    0.02,    0.009,    0.019,    0.006,
0.002,    0.012,    0.016,    0.013,    0.016,    0.013,    0.101,
0.027, 0.01, 0.02, 0.03, 0.025)
# Columns are (x, y) only -- there is no label column for the query set.
q.df <- data.frame(x, y)
# Overlay the raw query points in green on the existing training plot.
points(q.df$x, q.df$y, col="green")

# Train, classify and plot with pnn ----
library(pnn)

# Train on the (z, x, y) columns of the training set.
# NOTE(review): learn() is assumed to take the category from the first
# column by default (here `z`) -- confirm against the pnn documentation.
train <- learn(t.df[, 1:3])

# Namespace-qualified so it cannot be confused with stats::smooth().
# NOTE(review): no sigma is passed here, so pnn is presumably left to choose
# the smoothing parameter itself; pass the empirically chosen value
# explicitly if that is what should be evaluated -- confirm.
fit <- pnn::smooth(train)

# Classify every query row. Q starts as NA so that a row which cannot be
# classified stays missing instead of carrying a made-up label.
n <- nrow(q.df)
Q <- rep(NA, n)
for (i in seq_len(n)) {  # seq_len(): a zero-row q.df skips the loop entirely
  res <- guess(fit, as.matrix(q.df[i, 1:2]))
  # NOTE(review): guess() may return a bare NA rather than a list when no
  # category can be computed -- unconfirmed; the guard avoids `$` on a
  # non-list in that case.
  if (is.list(res)) {
    Q[i] <- res$category
  }
}
R <- data.frame(q.df, Q, stringsAsFactors = FALSE)

# Overlay the predictions: red = predicted positive, black = predicted
# negative. Rows with a missing prediction are dropped by subset().
# pch = 5 is the open diamond plotting symbol.
P <- subset(R, Q == 1)
N <- subset(R, Q == 0)
points(P$x, P$y, pch = 5, col = "red")
points(N$x, N$y, pch = 5, col = "black")

# Cut-off the model was expected to recover.
abline(h = 0.035)