The ore.odmESA function, available beginning with Oracle Database 12c, Release 2 (12.2), creates a model that uses the Oracle Data Mining Explicit Semantic Analysis (ESA) algorithm.
ESA is an unsupervised algorithm used by Oracle Data Mining for feature extraction. ESA does not discover latent features but instead uses explicit features based on an existing knowledge base.
Explicit knowledge often exists in text form. Multiple knowledge bases are available as collections of text documents. These knowledge bases can be generic, for example, Wikipedia, or domain-specific. Data preparation transforms the text into vectors that capture attribute-concept associations.
For information on the ore.odmESA function arguments, invoke help(ore.odmESA).
Example 4-13 Using the ore.odmESA Function
# Seven short news headlines used as the text documents for the example.
title <- c(
  "Aids in Africa: Planning for a long war",
  "Mars rover maneuvers for rim shot",
  "Mars express confirms presence of water at Mars south pole",
  "NASA announces major Mars rover finding",
  "Drug access, Asia threat in focus at AIDS summit",
  "NASA Mars Odyssey THEMIS image: typical crater",
  "Road blocks for Aids"
)
# TEXT contents in a character column.
# CUST_ID numbers the titles 1..n; seq_along() yields the same IDs as
# seq(length(title)) but returns an empty sequence (rather than c(1, 0))
# if title is ever empty.
df <- data.frame(CUST_ID = seq_along(title), TITLE = title)
ESA_TEXT <- ore.push(df)

# TEXT contents in a CLOB column: tag TITLE with ora.type "clob" before
# pushing the same data frame a second time.
attr(df$TITLE, "ora.type") <- "clob"
ESA_TEXT_CLOB <- ore.push(df)
# Create the text policy that the models refer to by name
# (CTXSYS.CTX_DDL privilege is required).
ore.exec("Begin ctx_ddl.create_policy('ESA_TXTPOL'); End;")

# Build the first ESA model on the CLOB data. The TEXT POLICY_NAME,
# MIN_DOCUMENTS, MAX_FEATURES and the ESA algorithm settings are all
# supplied through odm.settings.
esa.mod <- ore.odmESA(
  ~ TITLE,
  data = ESA_TEXT_CLOB,
  odm.settings = list(
    case_id_column_name = "CUST_ID",
    ODMS_TEXT_POLICY_NAME = "ESA_TXTPOL",
    ODMS_TEXT_MIN_DOCUMENTS = 1,
    ODMS_TEXT_MAX_FEATURES = 3,
    ESAS_MIN_ITEMS = 1,
    ESAS_VALUE_THRESHOLD = 0.0001,
    ESAS_TOPN_FEATURES = 3
  )
)

# Inspect the model: class, summary, settings and features.
class(esa.mod)
summary(esa.mod)
settings(esa.mod)
features(esa.mod)

# Score the character-column data, keeping TITLE as a supplemental column.
predict(esa.mod, ESA_TEXT, type = "class", supplemental.cols = "TITLE")
# Build a second model on the character-column data. Here ctx.settings
# declares TITLE as TEXT and carries the policy name, TOKEN_TYPE,
# MIN_DOCUMENTS and MAX_FEATURES; odm.settings keeps only the case ID
# column and ESAS_MIN_ITEMS.
esa.mod2 <- ore.odmESA(
  ~ TITLE,
  data = ESA_TEXT,
  odm.settings = list(case_id_column_name = "CUST_ID", ESAS_MIN_ITEMS = 1),
  ctx.settings = list(
    TITLE =
      "TEXT(POLICY_NAME:ESA_TXTPOL)(TOKEN_TYPE:STEM)(MIN_DOCUMENTS:1)(MAX_FEATURES:3)"
  )
)

# Inspect the second model: summary, settings and features.
summary(esa.mod2)
settings(esa.mod2)
features(esa.mod2)

# Score the CLOB data, keeping TITLE as a supplemental column.
predict(esa.mod2, ESA_TEXT_CLOB, type = "class", supplemental.cols = "TITLE")

# Drop the text policy created for this example.
ore.exec("Begin ctx_ddl.drop_policy('ESA_TXTPOL'); End;")
Listing for This Example
R> title <- c('Aids in Africa: Planning for a long war',
+ 'Mars rover maneuvers for rim shot',
+ 'Mars express confirms presence of water at Mars south pole',
+ 'NASA announces major Mars rover finding',
+ 'Drug access, Asia threat in focus at AIDS summit',
+ 'NASA Mars Odyssey THEMIS image: typical crater',
+ 'Road blocks for Aids')
R>
R> # TEXT contents in character column
R> df <- data.frame(CUST_ID = seq(length(title)), TITLE = title)
R> ESA_TEXT <- ore.push(df)
R>
R> # TEXT contents in clob column
R> attr(df$TITLE, "ora.type") <- "clob"
R> ESA_TEXT_CLOB <- ore.push(df)
R>
R> # Create a text policy (CTXSYS.CTX_DDL privilege is required)
R> ore.exec("Begin ctx_ddl.create_policy('ESA_TXTPOL'); End;")
R>
R> # Specify TEXT POLICY_NAME, MIN_DOCUMENTS, MAX_FEATURES and
R> # ESA algorithm settings in odm.settings
R> esa.mod <- ore.odmESA(~ TITLE, data = ESA_TEXT_CLOB,
+ odm.settings = list(case_id_column_name = "CUST_ID",
+ ODMS_TEXT_POLICY_NAME = "ESA_TXTPOL",
+ ODMS_TEXT_MIN_DOCUMENTS = 1,
+ ODMS_TEXT_MAX_FEATURES = 3,
+ ESAS_MIN_ITEMS = 1,
+ ESAS_VALUE_THRESHOLD = 0.0001,
+ ESAS_TOPN_FEATURES = 3))
R> class(esa.mod)
[1] "ore.odmESA" "ore.model"
R> summary(esa.mod)
Call:
ore.odmESA(formula = ~TITLE, data = ESA_TEXT_CLOB, odm.settings = list(case_id_column_name = "CUST_ID",
ODMS_TEXT_POLICY_NAME = "ESA_TXTPOL", ODMS_TEXT_MIN_DOCUMENTS = 1,
ODMS_TEXT_MAX_FEATURES = 3, ESAS_MIN_ITEMS = 1, ESAS_VALUE_THRESHOLD = 1e-04,
ESAS_TOPN_FEATURES = 3))
Settings:
value
min.items 1
topn.features 3
value.threshold 1e-04
odms.missing.value.treatment odms.missing.value.auto
odms.sampling odms.sampling.disable
odms.text.max.features 3
odms.text.min.documents 1
odms.text.policy.name ESA_TXTPOL
prep.auto ON
Features:
FEATURE_ID ATTRIBUTE_NAME ATTRIBUTE_VALUE COEFFICIENT
1 1 TITLE.AIDS <NA> 1.0000000
2 2 TITLE.MARS <NA> 0.4078615
3 2 TITLE.ROVER <NA> 0.9130438
4 3 TITLE.MARS <NA> 1.0000000
5 4 TITLE.NASA <NA> 0.6742695
6 4 TITLE.ROVER <NA> 0.6742695
7 5 TITLE.AIDS <NA> 1.0000000
8 6 TITLE.MARS <NA> 0.4078615
9 6 TITLE.NASA <NA> 0.9130438
10 7 TITLE.AIDS <NA> 1.0000000
R> settings(esa.mod)
SETTING_NAME SETTING_VALUE SETTING_TYPE
1 ALGO_NAME ALGO_EXPLICIT_SEMANTIC_ANALYS INPUT
2 ESAS_MIN_ITEMS 1 INPUT
3 ESAS_TOPN_FEATURES 3 INPUT
4 ESAS_VALUE_THRESHOLD 1e-04 INPUT
5 ODMS_MISSING_VALUE_TREATMENT ODMS_MISSING_VALUE_AUTO DEFAULT
6 ODMS_SAMPLING ODMS_SAMPLING_DISABLE DEFAULT
7 ODMS_TEXT_MAX_FEATURES 3 INPUT
8 ODMS_TEXT_MIN_DOCUMENTS 1 INPUT
9 ODMS_TEXT_POLICY_NAME ESA_TXTPOL INPUT
10 PREP_AUTO ON INPUT
R> features(esa.mod)
FEATURE_ID ATTRIBUTE_NAME ATTRIBUTE_VALUE COEFFICIENT
1 1 TITLE.AIDS <NA> 1.0000000
2 2 TITLE.MARS <NA> 0.4078615
3 2 TITLE.ROVER <NA> 0.9130438
4 3 TITLE.MARS <NA> 1.0000000
5 4 TITLE.NASA <NA> 0.6742695
6 4 TITLE.ROVER <NA> 0.6742695
7 5 TITLE.AIDS <NA> 1.0000000
8 6 TITLE.MARS <NA> 0.4078615
9 6 TITLE.NASA <NA> 0.9130438
10 7 TITLE.AIDS <NA> 1.0000000
R> predict(esa.mod, ESA_TEXT, type = "class", supplemental.cols = "TITLE")
TITLE FEATURE_ID
1 Aids in Africa: Planning for a long war 1
2 Mars rover maneuvers for rim shot 2
3 Mars express confirms presence of water at Mars south pole 3
4 NASA announces major Mars rover finding 4
5 Drug access, Asia threat in focus at AIDS summit 1
6 NASA Mars Odyssey THEMIS image: typical crater 6
7 Road blocks for Aids 1
R>
R> # Use ctx.settings to specify a character column as TEXT and
R> # the same settings as above as well as TOKEN_TYPE
R> esa.mod2 <- ore.odmESA(~ TITLE, data = ESA_TEXT,
+ odm.settings = list(case_id_column_name = "CUST_ID", ESAS_MIN_ITEMS = 1),
+ ctx.settings = list(TITLE =
+ "TEXT(POLICY_NAME:ESA_TXTPOL)(TOKEN_TYPE:STEM)(MIN_DOCUMENTS:1)(MAX_FEATURES:3)"))
R> summary(esa.mod2)
Call:
ore.odmESA(formula = ~TITLE, data = ESA_TEXT, odm.settings = list(case_id_column_name = "CUST_ID",
ESAS_MIN_ITEMS = 1), ctx.settings = list(TITLE = "TEXT(POLICY_NAME:ESA_TXTPOL)(TOKEN_TYPE:STEM)(MIN_DOCUMENTS:1)(MAX_FEATURES:3)"))
Settings:
value
min.items 1
topn.features 1000
value.threshold .00000001
odms.missing.value.treatment odms.missing.value.auto
odms.sampling odms.sampling.disable
odms.text.max.features 300000
odms.text.min.documents 3
prep.auto ON
Features:
FEATURE_ID ATTRIBUTE_NAME ATTRIBUTE_VALUE COEFFICIENT
1 1 TITLE.AIDS <NA> 1.0000000
2 2 TITLE.MARS <NA> 0.4078615
3 2 TITLE.ROVER <NA> 0.9130438
4 3 TITLE.MARS <NA> 1.0000000
5 4 TITLE.MARS <NA> 0.3011997
6 4 TITLE.NASA <NA> 0.6742695
7 4 TITLE.ROVER <NA> 0.6742695
8 5 TITLE.AIDS <NA> 1.0000000
9 6 TITLE.MARS <NA> 0.4078615
10 6 TITLE.NASA <NA> 0.9130438
11 7 TITLE.AIDS <NA> 1.0000000
R> settings(esa.mod2)
SETTING_NAME SETTING_VALUE SETTING_TYPE
1 ALGO_NAME ALGO_EXPLICIT_SEMANTIC_ANALYS INPUT
2 ESAS_MIN_ITEMS 1 INPUT
3 ESAS_TOPN_FEATURES 1000 DEFAULT
4 ESAS_VALUE_THRESHOLD .00000001 DEFAULT
5 ODMS_MISSING_VALUE_TREATMENT ODMS_MISSING_VALUE_AUTO DEFAULT
6 ODMS_SAMPLING ODMS_SAMPLING_DISABLE DEFAULT
7 ODMS_TEXT_MAX_FEATURES 300000 DEFAULT
8 ODMS_TEXT_MIN_DOCUMENTS 3 DEFAULT
9 PREP_AUTO ON INPUT
R> features(esa.mod2)
FEATURE_ID ATTRIBUTE_NAME ATTRIBUTE_VALUE COEFFICIENT
1 1 TITLE.AIDS <NA> 1.0000000
2 2 TITLE.MARS <NA> 0.4078615
3 2 TITLE.ROVER <NA> 0.9130438
4 3 TITLE.MARS <NA> 1.0000000
5 4 TITLE.MARS <NA> 0.3011997
6 4 TITLE.NASA <NA> 0.6742695
7 4 TITLE.ROVER <NA> 0.6742695
8 5 TITLE.AIDS <NA> 1.0000000
9 6 TITLE.MARS <NA> 0.4078615
10 6 TITLE.NASA <NA> 0.9130438
11 7 TITLE.AIDS <NA> 1.0000000
R> predict(esa.mod2, ESA_TEXT_CLOB, type = "class", supplemental.cols = "TITLE")
TITLE FEATURE_ID
1 Aids in Africa: Planning for a long war 1
2 Mars rover maneuvers for rim shot 2
3 Mars express confirms presence of water at Mars south pole 3
4 NASA announces major Mars rover finding 4
5 Drug access, Asia threat in focus at AIDS summit 1
6 NASA Mars Odyssey THEMIS image: typical crater 6
7 Road blocks for Aids 1
R>
R> ore.exec("Begin ctx_ddl.drop_policy('ESA_TXTPOL'); End;")