4.2.5 Building an Expectation Maximization Model

The ore.odmEM function, available beginning with Oracle Database 12c Release 2 (12.2), creates a model that uses the Oracle Data Mining Expectation Maximization algorithm.

Expectation Maximization (EM) is a density estimation algorithm that performs probabilistic clustering. In density estimation, the goal is to construct a density function that captures how a given population is distributed. The density estimate is based on observed data that represents a sample of the population.

For information on the ore.odmEM function arguments, invoke help(ore.odmEM).

Example 4-12 Using the ore.odmEM Function

## Synthetic 2-dimensional data set: two groups of 50 points each, drawn from
## normal distributions with sd = 0.3, one with mean 4 and one with mean 2 in
## both coordinates. The fixed seed makes the random draws repeatable.
set.seed(7654)

x <- rbind(matrix(rnorm(100, mean = 4, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 2, sd = 0.3), ncol = 2))
colnames(x) <- c("x", "y")

## Prepend an ID column (1..100), hand the data frame to ore.push, and use the
## ID values as the row names of the resulting object.
X <- ore.push (data.frame(ID=1:100,x))
rownames(X) <- X$ID

## Build the Expectation Maximization model from every column of X (formula ~.)
## with num.centers = 2L.
## NOTE(review): ~. also brings ID into the model -- the summary in the listing
## reports MEAN.ID among the centers and the rules contain ID predicates.
## Confirm that using the row identifier as a model attribute is intended.
## NOTE(review): the NULL assignment is overwritten by the very next statement.
em.mod <- NULL
em.mod <- ore.odmEM(~., X, num.centers = 2L)

## Inspect the fitted model: settings and centers, the per-cluster rules, and
## the per-cluster histogram bins. histogram() prints nothing to the console in
## the listing -- presumably it draws a plot.
summary(em.mod)
rules(em.mod)
clusterhists(em.mod)
histogram(em.mod)

## Score X asking only for the cluster assignment (type="class") and carry the
## x and y input columns along, then pull the result into a local object so it
## can be plotted with base graphics.
em.res <- predict(em.mod, X, type="class", supplemental.cols=c("x", "y"))
head(em.res)
em.res.local <- ore.pull(em.res)
## Scatter plot of the points, colored by assigned CLUSTER_ID.
plot(data.frame(x=em.res.local$x, y=em.res.local$y), col=em.res.local$CLUSTER_ID)
## Overlay the values in em.mod$centers2 as large star markers (pch=8, cex=2),
## colored by their row names.
## NOTE(review): the layout of centers2 is not shown in this example -- verify
## that the columns points() picks up are the x and y coordinates.
points(em.mod$centers2, col = rownames(em.mod$centers2), pch=8, cex=2)

## Variations of predict(), per the listing output:
##  - default and type=c("class","raw"): per-cluster columns ('2', '3';
##    presumably cluster probabilities) plus CLUSTER_ID
##  - type="raw": only the per-cluster columns
##  - supplemental.cols: additionally returns the named input columns
head(predict(em.mod,X))
head(predict(em.mod,X,type=c("class","raw")))
head(predict(em.mod,X,type=c("class","raw"),supplemental.cols=c("x","y")))
head(predict(em.mod,X,type="raw",supplemental.cols=c("x","y")))

Listing for This Example

R> ## Synthetic 2-dimensional data set
R> 
R> set.seed(7654)
R>
R> x <- rbind(matrix(rnorm(100, mean = 4, sd = 0.3), ncol = 2),
+             matrix(rnorm(100, mean = 2, sd = 0.3), ncol = 2))
R> colnames(x) <- c("x", "y")
R>
R> X <- ore.push (data.frame(ID=1:100,x))
R> rownames(X) <- X$ID
R> 
R> em.mod <- NULL
R> em.mod <- ore.odmEM(~., X, num.centers = 2L)
R> 
R> summary(em.mod)

Call:
ore.odmEM(formula = ~., data = X, num.centers = 2L)

Settings: 
                                               value
clus.num.clusters                                  2
cluster.components               cluster.comp.enable
cluster.statistics                 clus.stats.enable
cluster.thresh                                     2
linkage.function                      linkage.single
loglike.improvement                             .001
max.num.attr.2d                                   50
min.pct.attr.support                              .1
model.search                    model.search.disable
num.components                                    20
num.distribution                    num.distr.system
num.equiwidth.bins                                11
num.iterations                                   100
num.projections                                   50
random.seed                                        0
remove.components                remove.comps.enable
odms.missing.value.treatment odms.missing.value.auto
odms.sampling                  odms.sampling.disable
prep.auto                                         ON

Centers: 
  MEAN.ID MEAN.x MEAN.y
2    25.5   4.03   3.96
3    75.5   1.93   1.99

R> rules(em.mod)
   cluster.id rhs.support rhs.conf lhr.support lhs.conf lhs.var lhs.var.support lhs.var.conf   predicate
1           1         100      1.0         100     1.00      ID             100       0.0000   ID <= 100
2           1         100      1.0         100     1.00      ID             100       0.0000     ID >= 1
3           1         100      1.0         100     1.00       x             100       0.2500 x <= 4.6298
4           1         100      1.0         100     1.00       x             100       0.2500 x >= 1.3987
5           1         100      1.0         100     1.00       y             100       0.3000 y <= 4.5846
6           1         100      1.0         100     1.00       y             100       0.3000 y >= 1.3546
7           2          50      0.5          50     1.00      ID              50       0.0937  ID <= 50.5
8           2          50      0.5          50     1.00      ID              50       0.0937     ID >= 1
9           2          50      0.5          50     1.00       x              50       0.0937 x <= 4.6298
10          2          50      0.5          50     1.00       x              50       0.0937  x > 3.3374
11          2          50      0.5          50     1.00       y              50       0.0937 y <= 4.5846
12          2          50      0.5          50     1.00       y              50       0.0937  y > 2.9696
13          3          50      0.5          50     0.98      ID              49       0.0937   ID <= 100
14          3          50      0.5          50     0.98      ID              49       0.0937   ID > 50.5
15          3          50      0.5          49     0.98       x              49       0.0937  x <= 2.368
16          3          50      0.5          49     0.98       x              49       0.0937 x >= 1.3987
17          3          50      0.5          49     0.98       y              49       0.0937 y <= 2.6466
18          3          50      0.5          49     0.98       y              49       0.0937 y >= 1.3546
R> clusterhists(em.mod)
   cluster.id variable bin.id lower.bound upper.bound       label count
1           1       ID      1        1.00       10.90      1:10.9    10
2           1       ID      2       10.90       20.80   10.9:20.8    10
3           1       ID      3       20.80       30.70   20.8:30.7    10
4           1       ID      4       30.70       40.60   30.7:40.6    10
5           1       ID      5       40.60       50.50   40.6:50.5    10
6           1       ID      6       50.50       60.40   50.5:60.4    10
7           1       ID      7       60.40       70.30   60.4:70.3    10
8           1       ID      8       70.30       80.20   70.3:80.2    10
9           1       ID      9       80.20       90.10   80.2:90.1    10
10          1       ID     10       90.10      100.00    90.1:100    10
11          1       ID     11          NA          NA           :     0
12          1        x      1        1.40        1.72 1.399:1.722    11
13          1        x      2        1.72        2.04 1.722:2.045    22
14          1        x      3        2.04        2.37 2.045:2.368    16
15          1        x      4        2.37        2.69 2.368:2.691     1
16          1        x      5        2.69        3.01 2.691:3.014     0
17          1        x      6        3.01        3.34 3.014:3.337     0
18          1        x      7        3.34        3.66  3.337:3.66     4
19          1        x      8        3.66        3.98  3.66:3.984    18
20          1        x      9        3.98        4.31 3.984:4.307    22
21          1        x     10        4.31        4.63  4.307:4.63     6
22          1        x     11          NA          NA           :     0
23          1        y      1        1.35        1.68 1.355:1.678     7
24          1        y      2        1.68        2.00 1.678:2.001    18
25          1        y      3        2.00        2.32 2.001:2.324    18
26          1        y      4        2.32        2.65 2.324:2.647     6
27          1        y      5        2.65        2.97  2.647:2.97     1
28          1        y      6        2.97        3.29  2.97:3.293     4
29          1        y      7        3.29        3.62 3.293:3.616     3
30          1        y      8        3.62        3.94 3.616:3.939    16
31          1        y      9        3.94        4.26 3.939:4.262    16
32          1        y     10        4.26        4.58 4.262:4.585    11
33          1        y     11          NA          NA           :     0
34          2       ID      1        1.00       10.90      1:10.9    10
35          2       ID      2       10.90       20.80   10.9:20.8    10
36          2       ID      3       20.80       30.70   20.8:30.7    10
37          2       ID      4       30.70       40.60   30.7:40.6    10
38          2       ID      5       40.60       50.50   40.6:50.5    10
39          2       ID      6       50.50       60.40   50.5:60.4     0
40          2       ID      7       60.40       70.30   60.4:70.3     0
41          2       ID      8       70.30       80.20   70.3:80.2     0
42          2       ID      9       80.20       90.10   80.2:90.1     0
43          2       ID     10       90.10      100.00    90.1:100     0
44          2       ID     11          NA          NA           :     0
45          2        x      1        1.40        1.72 1.399:1.722     0
46          2        x      2        1.72        2.04 1.722:2.045     0
47          2        x      3        2.04        2.37 2.045:2.368     0
48          2        x      4        2.37        2.69 2.368:2.691     0
49          2        x      5        2.69        3.01 2.691:3.014     0
50          2        x      6        3.01        3.34 3.014:3.337     0
51          2        x      7        3.34        3.66  3.337:3.66     4
52          2        x      8        3.66        3.98  3.66:3.984    18
53          2        x      9        3.98        4.31 3.984:4.307    22
54          2        x     10        4.31        4.63  4.307:4.63     6
55          2        x     11          NA          NA           :     0
56          2        y      1        1.35        1.68 1.355:1.678     0
57          2        y      2        1.68        2.00 1.678:2.001     0
58          2        y      3        2.00        2.32 2.001:2.324     0
59          2        y      4        2.32        2.65 2.324:2.647     0
60          2        y      5        2.65        2.97  2.647:2.97     0
61          2        y      6        2.97        3.29  2.97:3.293     4
62          2        y      7        3.29        3.62 3.293:3.616     3
63          2        y      8        3.62        3.94 3.616:3.939    16
64          2        y      9        3.94        4.26 3.939:4.262    16
65          2        y     10        4.26        4.58 4.262:4.585    11
66          2        y     11          NA          NA           :     0
67          3       ID      1        1.00       10.90      1:10.9     0
68          3       ID      2       10.90       20.80   10.9:20.8     0
69          3       ID      3       20.80       30.70   20.8:30.7     0
70          3       ID      4       30.70       40.60   30.7:40.6     0
71          3       ID      5       40.60       50.50   40.6:50.5     0
72          3       ID      6       50.50       60.40   50.5:60.4    10
73          3       ID      7       60.40       70.30   60.4:70.3    10
74          3       ID      8       70.30       80.20   70.3:80.2    10
75          3       ID      9       80.20       90.10   80.2:90.1    10
76          3       ID     10       90.10      100.00    90.1:100    10
77          3       ID     11          NA          NA           :     0
78          3        x      1        1.40        1.72 1.399:1.722    11
79          3        x      2        1.72        2.04 1.722:2.045    22
80          3        x      3        2.04        2.37 2.045:2.368    16
81          3        x      4        2.37        2.69 2.368:2.691     1
82          3        x      5        2.69        3.01 2.691:3.014     0
83          3        x      6        3.01        3.34 3.014:3.337     0
84          3        x      7        3.34        3.66  3.337:3.66     0
85          3        x      8        3.66        3.98  3.66:3.984     0
86          3        x      9        3.98        4.31 3.984:4.307     0
87          3        x     10        4.31        4.63  4.307:4.63     0
88          3        x     11          NA          NA           :     0
89          3        y      1        1.35        1.68 1.355:1.678     7
90          3        y      2        1.68        2.00 1.678:2.001    18
91          3        y      3        2.00        2.32 2.001:2.324    18
92          3        y      4        2.32        2.65 2.324:2.647     6
93          3        y      5        2.65        2.97  2.647:2.97     1
94          3        y      6        2.97        3.29  2.97:3.293     0
95          3        y      7        3.29        3.62 3.293:3.616     0
96          3        y      8        3.62        3.94 3.616:3.939     0
97          3        y      9        3.94        4.26 3.939:4.262     0
98          3        y     10        4.26        4.58 4.262:4.585     0
99          3        y     11          NA          NA           :     0
R> histogram(em.mod)
R>
R> em.res <- predict(em.mod, X, type="class", supplemental.cols=c("x", "y"))
R> head(em.res)
     x    y CLUSTER_ID
1 4.15 3.63          2
2 3.88 4.13          2
3 3.72 4.10          2
4 3.78 4.14          2
5 4.22 4.35          2
6 4.07 3.62          2
R> em.res.local <- ore.pull(em.res)
R> plot(data.frame(x=em.res.local$x, y=em.res.local$y), col=em.res.local$CLUSTER_ID)
R> points(em.mod$centers2, col = rownames(em.mod$centers2), pch=8, cex=2)
R>
R> head(predict(em.mod,X))
  '2'      '3' CLUSTER_ID
1   1 1.14e-54          2
2   1 1.63e-55          2
3   1 1.10e-51          2
4   1 1.53e-52          2
5   1 9.02e-62          2
6   1 3.20e-49          2
R> head(predict(em.mod,X,type=c("class","raw")))
  '2'      '3' CLUSTER_ID
1   1 1.14e-54          2
2   1 1.63e-55          2
3   1 1.10e-51          2
4   1 1.53e-52          2
5   1 9.02e-62          2
6   1 3.20e-49          2
R> head(predict(em.mod,X,type=c("class","raw"),supplemental.cols=c("x","y")))
  '2'      '3'    x    y CLUSTER_ID
1   1 1.14e-54 4.15 3.63          2
2   1 1.63e-55 3.88 4.13          2
3   1 1.10e-51 3.72 4.10          2
4   1 1.53e-52 3.78 4.14          2
5   1 9.02e-62 4.22 4.35          2
6   1 3.20e-49 4.07 3.62          2
R> head(predict(em.mod,X,type="raw",supplemental.cols=c("x","y")))
     x    y '2'      '3'
1 4.15 3.63   1 1.14e-54
2 3.88 4.13   1 1.63e-55
3 3.72 4.10   1 1.10e-51
4 3.78 4.14   1 1.53e-52
5 4.22 4.35   1 9.02e-62
6 4.07 3.62   1 3.20e-49