6.2.5.2 Partitioning on Multiple Columns

The ore.groupApply function takes a single column or multiple columns as the INDEX argument.

Example 6-14 uses data from the CHURN_TRAIN data set to build an rpart model that produces rules on the partitions of data specified, which are the voice_mail_plan and international_plan columns. The example uses the R table function to show the number of rows to expect in each partition.

The example next invokes the ore.scriptDrop function to ensure that no script by the specified name exists in the Oracle Database R script repository. It then uses the ore.scriptCreate function to define a script named my_rpartFunction and to store it in the repository. The stored script defines a function that takes a data source and a prefix to use for naming Oracle R Enterprise datastore objects. Each invocation of the function my_rpartFunction receives data from one of the partitions identified by the values in the voice_mail_plan and international_plan columns. Because the source partition columns are constants, the function sets them to NULL. It converts the character vectors to factors, builds a model to predict churn, and saves it in an appropriately named datastore. The function creates a list to return the specific partition column values, the distribution of churn values, and the model itself.

The example then loads the rpart library, sets the datastore prefix, and invokes ore.groupApply using the values from the voice_mail_plan and international_plan columns as the INDEX argument and my_rpartFunction as the value of the FUN.NAME argument to invoke the user-defined function stored in the R script repository. The ore.groupApply function uses an optional argument to pass the datastorePrefix variable to the user-defined function. It uses the optional argument ore.connect to connect to the database when executing the user-defined function. The ore.groupApply function returns an ore.list object as the variable res.

The example displays the first entry in the list returned. It then invokes the ore.load function to load the model for the case where the customer has both the voice mail plan and the international plan.

Example 6-14 Using ore.groupApply for Partitioning Data on Multiple Columns

library(C50)
data(churn)
ore.drop("CHURN_TRAIN") 
ore.create(churnTrain, "CHURN_TRAIN")
 
table(CHURN_TRAIN$international_plan, CHURN_TRAIN$voice_mail_plan)

options(width = 80)
head(CHURN_TRAIN, 3)

ore.scriptDrop("my_rpartFunction")
ore.scriptCreate("my_rpartFunction",
  function(dat, datastorePrefix) {
    library(rpart)
    vmp <- dat[1, "voice_mail_plan"]
    ip <- dat[1, "international_plan"]
    datastoreName <- paste(datastorePrefix, vmp, ip, sep = "_")
    dat$voice_mail_plan <- NULL
    dat$international_plan <- NULL
    dat$state <- as.factor(dat$state)
    dat$churn <- as.factor(dat$churn)
    dat$area_code <- as.factor(dat$area_code)
    mod <- rpart(churn ~ ., data = dat)
    ore.save(mod, name = datastoreName, overwrite = TRUE)
    list(voice_mail_plan = vmp,
        international_plan = ip,
        churn.table = table(dat$churn),
        rpart.model = mod)
  })

library(rpart)
datastorePrefix = "my.rpartModel"
 
res <- ore.groupApply(CHURN_TRAIN,
      INDEX = CHURN_TRAIN[, c("voice_mail_plan", "international_plan")],
      FUN.NAME = "my_rpartFunction",
      datastorePrefix = datastorePrefix,
      ore.connect = TRUE)
res[[1]]
ore.load(name=paste(datastorePrefix, "yes", "yes", sep = "_"))
mod

Listing for Example 6-14

R> library(C50)
R> data(churn)
R> ore.drop("CHURN_TRAIN") 
R> ore.create(churnTrain, "CHURN_TRAIN")
R>  
R> table(CHURN_TRAIN$international_plan, CHURN_TRAIN$voice_mail_plan)
     
        no  yes
  no  2180  830
  yes  231   92
R>
R> options(width = 80)
R> head(CHURN_TRAIN, 3)
  state account_length     area_code international_plan voice_mail_plan
1    KS            128 area_code_415                 no             yes
2    OH            107 area_code_415                 no             yes
3    NJ            137 area_code_415                 no              no
  number_vmail_messages total_day_minutes total_day_calls total_day_charge
1                    25             265.1             110            45.07
2                    26             161.6             123            27.47
3                     0             243.4             114            41.38
  total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
1             197.4              99            16.78               244.7
2             195.5             103            16.62               254.4
3             121.2             110            10.30               162.6
  total_night_calls total_night_charge total_intl_minutes total_intl_calls
1                91              11.01               10.0                3
2               103              11.45               13.7                3
3               104               7.32               12.2                5
  total_intl_charge number_customer_service_calls churn
1              2.70                             1    no
2              3.70                             1    no
3              3.29                             0    no
Warning messages:
1: ORE object has no unique key - using random order
2: ORE object has no unique key - using random order
R> 
R> ore.scriptDrop("my_rpartFunction")
R> ore.scriptCreate("my_rpartFunction",
+   function(dat, datastorePrefix) {
+     library(rpart)
+     vmp <- dat[1, "voice_mail_plan"]
+     ip <- dat[1, "international_plan"]
+     datastoreName <- paste(datastorePrefix, vmp, ip, sep = "_")
+     dat$voice_mail_plan <- NULL
+     dat$international_plan <- NULL
+     dat$state <- as.factor(dat$state)
+     dat$churn <- as.factor(dat$churn)
+     dat$area_code <- as.factor(dat$area_code)
+     mod <- rpart(churn ~ ., data = dat)
+     ore.save(mod, name = datastoreName, overwrite = TRUE)
+     list(voice_mail_plan = vmp,
+         international_plan = ip,
+         churn.table = table(dat$churn),
+         rpart.model = mod)
+   })
R> 
R> library(rpart)
R> datastorePrefix = "my.rpartModel"
R> 
R> res <- ore.groupApply(CHURN_TRAIN,
+       INDEX = CHURN_TRAIN[, c("voice_mail_plan", "international_plan")],
+       FUN.NAME = "my_rpartFunction",
+       datastorePrefix = datastorePrefix,
+       ore.connect = TRUE)
R> res[[1]]
$voice_mail_plan
[1] "no"
 
$international_plan
[1] "no"
 
$churn.table
 
  no  yes 
1878  302 
 
$rpart.model
n= 2180 
 
node), split, n, loss, yval, (yprob)
      * denotes terminal node
 
 1) root 2180 302 no (0.86146789 0.13853211)  
   2) total_day_minutes< 263.55 2040 192 no (0.90588235 0.09411765)  
     4) number_customer_service_calls< 3.5 1876 108 no (0.94243070 0.05756930)  
       8) total_day_minutes< 223.25 1599  44 no (0.97248280 0.02751720) *
       9) total_day_minutes>=223.25 277  64 no (0.76895307 0.23104693)  
        18) total_eve_minutes< 242.35 210  18 no (0.91428571 0.08571429) *
        19) total_eve_minutes>=242.35 67  21 yes (0.31343284 0.68656716)  
          38) total_night_minutes< 174.2 17   4 no (0.76470588 0.23529412) *
          39) total_night_minutes>=174.2 50   8 yes (0.16000000 0.84000000) *
     5) number_customer_service_calls>=3.5 164  80 yes (0.48780488 0.51219512)  
      10) total_day_minutes>=160.2 95  22 no (0.76842105 0.23157895)  
        20) state=AL,AZ,CA,CO,DC,DE,FL,HI,KS,KY,MA,MD,ME,MI,NC,ND,NE,NH,NM,OK,OR,SC,TN,VA,VT,WY 56   2 no (0.96428571 0.03571429) *
        21) state=AK,AR,CT,GA,IA,ID,MN,MO,NJ,NV,NY,OH,RI,TX,UT,WA,WV 39  19 yes (0.48717949 0.51282051)  
          42) total_day_minutes>=182.3 21   5 no (0.76190476 0.23809524) *
          43) total_day_minutes< 182.3 18   3 yes (0.16666667 0.83333333) *
      11) total_day_minutes< 160.2 69   7 yes (0.10144928 0.89855072) *
   3) total_day_minutes>=263.55 140  30 yes (0.21428571 0.78571429)  
     6) total_eve_minutes< 167.3 29   7 no (0.75862069 0.24137931)  
      12) state=AK,AR,AZ,CO,CT,FL,HI,IN,KS,LA,MD,ND,NM,NY,OH,UT,WA,WV 21   0 no (1.00000000 0.00000000) *
      13) state=IA,MA,MN,PA,SD,TX,WI 8   1 yes (0.12500000 0.87500000) *
     7) total_eve_minutes>=167.3 111   8 yes (0.07207207 0.92792793) *
 
R> ore.load(name = paste(datastorePrefix, "yes", "yes", sep = "_"))
[1] "mod"
R> mod
n= 92 
 
node), split, n, loss, yval, (yprob)
      * denotes terminal node
 
1) root 92 36 no (0.60869565 0.39130435)  
  2) total_intl_minutes< 13.1 71 15 no (0.78873239 0.21126761)  
    4) total_intl_calls>=2.5 60  4 no (0.93333333 0.06666667)  
      8) state=AK,AR,AZ,CO,CT,DC,DE,FL,GA,HI,ID,IL,IN,KS,MD,MI,MO,MS,MT,NC,ND,NE,NH,NJ,OH,SC,SD,UT,VA,WA,WV,WY 53  0 no (1.00000000 0.00000000) *
      9) state=ME,NM,VT,WI 7  3 yes (0.42857143 0.57142857) *
    5) total_intl_calls< 2.5 11  0 yes (0.00000000 1.00000000) *
  3) total_intl_minutes>=13.1 21  0 yes (0.00000000 1.00000000) *