Oracle Database 12cリリース2(12.2)以降では、ore.odmESA関数は、Oracle Data Miningの明示的セマンティック分析(ESA)アルゴリズムを使用するモデルを作成します。
ESAは、Oracle Data Miningで特徴抽出に使用される教師なしアルゴリズムです。ESAは、潜在的な特徴を検出するのではなく、既存のナレッジ・ベースに基づく明示的な特徴を使用します。
明示的ナレッジは、多くの場合テキスト形式で存在します。複数のナレッジ・ベースを、テキスト・ドキュメントのコレクションとして使用できます。これらのナレッジ・ベースは、Wikipediaなどの汎用的なものの場合も、ドメイン固有のものの場合もあります。データ準備では、テキストを、属性と概念の関連性を表すベクトルに変換します。
ore.odmESA関数の引数の詳細は、help(ore.odmESA)を呼び出して確認してください。
例4-13 ore.odmESA関数の使用方法
# Example 4-13: build Explicit Semantic Analysis (ESA) models with ore.odmESA
# on a small set of news titles, once from a CLOB column with the text
# settings passed in odm.settings, and once from a character column with the
# text settings passed in ctx.settings.
# NOTE(review): assumes an Oracle R Enterprise session is already connected
# and the ORE package is attached - confirm before running.

# Sample documents: one title per case.
title <- c('Aids in Africa: Planning for a long war',
           'Mars rover maneuvers for rim shot',
           'Mars express confirms presence of water at Mars south pole',
           'NASA announces major Mars rover finding',
           'Drug access, Asia threat in focus at AIDS summit',
           'NASA Mars Odyssey THEMIS image: typical crater',
           'Road blocks for Aids')

# TEXT contents in character column
df <- data.frame(CUST_ID = seq(length(title)), TITLE = title)
ESA_TEXT <- ore.push(df)

# TEXT contents in clob column
# (the "ora.type" attribute is set before the second push so that the same
# data frame is pushed again with TITLE marked as "clob")
attr(df$TITLE, "ora.type") <- "clob"
ESA_TEXT_CLOB <- ore.push(df)

# Create text policy (CTXSYS.CTX_DDL privilege is required)
ore.exec("Begin ctx_ddl.create_policy('ESA_TXTPOL'); End;")

# Specify TEXT POLICY_NAME, MIN_DOCUMENTS, MAX_FEATURES and
# ESA algorithm settings in odm.settings
esa.mod <- ore.odmESA(~ TITLE, data = ESA_TEXT_CLOB,
  odm.settings = list(case_id_column_name = "CUST_ID",
                      ODMS_TEXT_POLICY_NAME = "ESA_TXTPOL",
                      ODMS_TEXT_MIN_DOCUMENTS = 1,
                      ODMS_TEXT_MAX_FEATURES = 3,
                      ESAS_MIN_ITEMS = 1,
                      ESAS_VALUE_THRESHOLD = 0.0001,
                      ESAS_TOPN_FEATURES = 3))

# Inspect the first model, then score the character-column data with it.
class(esa.mod)
summary(esa.mod)
settings(esa.mod)
features(esa.mod)
predict(esa.mod, ESA_TEXT, type = "class", supplemental.cols = "TITLE")

# Use ctx.settings to specify a character column as TEXT and
# the same settings as above as well as TOKEN_TYPE
esa.mod2 <- ore.odmESA(~ TITLE, data = ESA_TEXT,
  odm.settings = list(case_id_column_name = "CUST_ID", ESAS_MIN_ITEMS = 1),
  ctx.settings = list(TITLE =
    "TEXT(POLICY_NAME:ESA_TXTPOL)(TOKEN_TYPE:STEM)(MIN_DOCUMENTS:1)(MAX_FEATURES:3)"))

# Inspect the second model, then score the CLOB-column data with it.
summary(esa.mod2)
settings(esa.mod2)
features(esa.mod2)
predict(esa.mod2, ESA_TEXT_CLOB, type = "class", supplemental.cols = "TITLE")

# Clean up: drop the text policy created above.
ore.exec("Begin ctx_ddl.drop_policy('ESA_TXTPOL'); End;")
この例のリスト
R> title <- c('Aids in Africa: Planning for a long war',
+            'Mars rover maneuvers for rim shot',
+            'Mars express confirms presence of water at Mars south pole',
+            'NASA announces major Mars rover finding',
+            'Drug access, Asia threat in focus at AIDS summit',
+            'NASA Mars Odyssey THEMIS image: typical crater',
+            'Road blocks for Aids')
R>
R> # TEXT contents in character column
R> df <- data.frame(CUST_ID = seq(length(title)), TITLE = title)
R> ESA_TEXT <- ore.push(df)
R>
R> # TEXT contents in clob column
R> attr(df$TITLE, "ora.type") <- "clob"
R> ESA_TEXT_CLOB <- ore.push(df)
R>
R> # Create a text policy (CTXSYS.CTX_DDL privilege is required)
R> ore.exec("Begin ctx_ddl.create_policy('ESA_TXTPOL'); End;")
R>
R> # Specify TEXT POLICY_NAME, MIN_DOCUMENTS, MAX_FEATURES and
R> # ESA algorithm settings in odm.settings
R> esa.mod <- ore.odmESA(~ TITLE, data = ESA_TEXT_CLOB,
+   odm.settings = list(case_id_column_name = "CUST_ID",
+                       ODMS_TEXT_POLICY_NAME = "ESA_TXTPOL",
+                       ODMS_TEXT_MIN_DOCUMENTS = 1,
+                       ODMS_TEXT_MAX_FEATURES = 3,
+                       ESAS_MIN_ITEMS = 1,
+                       ESAS_VALUE_THRESHOLD = 0.0001,
+                       ESAS_TOPN_FEATURES = 3))
R> class(esa.mod)
[1] "ore.odmESA" "ore.model"
R> summary(esa.mod)

Call:
ore.odmESA(formula = ~TITLE, data = ESA_TEXT_CLOB, odm.settings = list(case_id_column_name = "CUST_ID",
    ODMS_TEXT_POLICY_NAME = "ESA_TXTPOL", ODMS_TEXT_MIN_DOCUMENTS = 1,
    ODMS_TEXT_MAX_FEATURES = 3, ESAS_MIN_ITEMS = 1, ESAS_VALUE_THRESHOLD = 1e-04,
    ESAS_TOPN_FEATURES = 3))

Settings:
                                               value
min.items                                          1
topn.features                                      3
value.threshold                                1e-04
odms.missing.value.treatment odms.missing.value.auto
odms.sampling                  odms.sampling.disable
odms.text.max.features                             3
odms.text.min.documents                            1
odms.text.policy.name                     ESA_TXTPOL
prep.auto                                         ON

Features:
   FEATURE_ID ATTRIBUTE_NAME ATTRIBUTE_VALUE COEFFICIENT
1           1     TITLE.AIDS            <NA>   1.0000000
2           2     TITLE.MARS            <NA>   0.4078615
3           2    TITLE.ROVER            <NA>   0.9130438
4           3     TITLE.MARS            <NA>   1.0000000
5           4     TITLE.NASA            <NA>   0.6742695
6           4    TITLE.ROVER            <NA>   0.6742695
7           5     TITLE.AIDS            <NA>   1.0000000
8           6     TITLE.MARS            <NA>   0.4078615
9           6     TITLE.NASA            <NA>   0.9130438
10          7     TITLE.AIDS            <NA>   1.0000000
R> settings(esa.mod)
                   SETTING_NAME                 SETTING_VALUE SETTING_TYPE
1                     ALGO_NAME ALGO_EXPLICIT_SEMANTIC_ANALYS        INPUT
2                ESAS_MIN_ITEMS                             1        INPUT
3            ESAS_TOPN_FEATURES                             3        INPUT
4          ESAS_VALUE_THRESHOLD                         1e-04        INPUT
5  ODMS_MISSING_VALUE_TREATMENT       ODMS_MISSING_VALUE_AUTO      DEFAULT
6                 ODMS_SAMPLING         ODMS_SAMPLING_DISABLE      DEFAULT
7        ODMS_TEXT_MAX_FEATURES                             3        INPUT
8       ODMS_TEXT_MIN_DOCUMENTS                             1        INPUT
9         ODMS_TEXT_POLICY_NAME                    ESA_TXTPOL        INPUT
10                    PREP_AUTO                            ON        INPUT
R> features(esa.mod)
   FEATURE_ID ATTRIBUTE_NAME ATTRIBUTE_VALUE COEFFICIENT
1           1     TITLE.AIDS            <NA>   1.0000000
2           2     TITLE.MARS            <NA>   0.4078615
3           2    TITLE.ROVER            <NA>   0.9130438
4           3     TITLE.MARS            <NA>   1.0000000
5           4     TITLE.NASA            <NA>   0.6742695
6           4    TITLE.ROVER            <NA>   0.6742695
7           5     TITLE.AIDS            <NA>   1.0000000
8           6     TITLE.MARS            <NA>   0.4078615
9           6     TITLE.NASA            <NA>   0.9130438
10          7     TITLE.AIDS            <NA>   1.0000000
R> predict(esa.mod, ESA_TEXT, type = "class", supplemental.cols = "TITLE")
                                                       TITLE FEATURE_ID
1                    Aids in Africa: Planning for a long war          1
2                          Mars rover maneuvers for rim shot          2
3 Mars express confirms presence of water at Mars south pole          3
4                    NASA announces major Mars rover finding          4
5           Drug access, Asia threat in focus at AIDS summit          1
6             NASA Mars Odyssey THEMIS image: typical crater          6
7                                       Road blocks for Aids          1
R>
R> # Use ctx.settings to specify a character column as TEXT and
R> # the same settings as above as well as TOKEN_TYPE
R> esa.mod2 <- ore.odmESA(~ TITLE, data = ESA_TEXT,
+   odm.settings = list(case_id_column_name = "CUST_ID", ESAS_MIN_ITEMS = 1),
+   ctx.settings = list(TITLE =
+   "TEXT(POLICY_NAME:ESA_TXTPOL)(TOKEN_TYPE:STEM)(MIN_DOCUMENTS:1)(MAX_FEATURES:3)"))
R> summary(esa.mod2)

Call:
ore.odmESA(formula = ~TITLE, data = ESA_TEXT, odm.settings = list(case_id_column_name = "CUST_ID",
    ESAS_MIN_ITEMS = 1), ctx.settings = list(TITLE = "TEXT(POLICY_NAME:ESA_TXTPOL)(TOKEN_TYPE:STEM)(MIN_DOCUMENTS:1)(MAX_FEATURES:3)"))

Settings:
                                               value
min.items                                          1
topn.features                                   1000
value.threshold                            .00000001
odms.missing.value.treatment odms.missing.value.auto
odms.sampling                  odms.sampling.disable
odms.text.max.features                        300000
odms.text.min.documents                            3
prep.auto                                         ON

Features:
   FEATURE_ID ATTRIBUTE_NAME ATTRIBUTE_VALUE COEFFICIENT
1           1     TITLE.AIDS            <NA>   1.0000000
2           2     TITLE.MARS            <NA>   0.4078615
3           2    TITLE.ROVER            <NA>   0.9130438
4           3     TITLE.MARS            <NA>   1.0000000
5           4     TITLE.MARS            <NA>   0.3011997
6           4     TITLE.NASA            <NA>   0.6742695
7           4    TITLE.ROVER            <NA>   0.6742695
8           5     TITLE.AIDS            <NA>   1.0000000
9           6     TITLE.MARS            <NA>   0.4078615
10          6     TITLE.NASA            <NA>   0.9130438
11          7     TITLE.AIDS            <NA>   1.0000000
R> settings(esa.mod2)
                  SETTING_NAME                 SETTING_VALUE SETTING_TYPE
1                    ALGO_NAME ALGO_EXPLICIT_SEMANTIC_ANALYS        INPUT
2               ESAS_MIN_ITEMS                             1        INPUT
3           ESAS_TOPN_FEATURES                          1000      DEFAULT
4         ESAS_VALUE_THRESHOLD                     .00000001      DEFAULT
5 ODMS_MISSING_VALUE_TREATMENT       ODMS_MISSING_VALUE_AUTO      DEFAULT
6                ODMS_SAMPLING         ODMS_SAMPLING_DISABLE      DEFAULT
7       ODMS_TEXT_MAX_FEATURES                        300000      DEFAULT
8      ODMS_TEXT_MIN_DOCUMENTS                             3      DEFAULT
9                    PREP_AUTO                            ON        INPUT
R> features(esa.mod2)
   FEATURE_ID ATTRIBUTE_NAME ATTRIBUTE_VALUE COEFFICIENT
1           1     TITLE.AIDS            <NA>   1.0000000
2           2     TITLE.MARS            <NA>   0.4078615
3           2    TITLE.ROVER            <NA>   0.9130438
4           3     TITLE.MARS            <NA>   1.0000000
5           4     TITLE.MARS            <NA>   0.3011997
6           4     TITLE.NASA            <NA>   0.6742695
7           4    TITLE.ROVER            <NA>   0.6742695
8           5     TITLE.AIDS            <NA>   1.0000000
9           6     TITLE.MARS            <NA>   0.4078615
10          6     TITLE.NASA            <NA>   0.9130438
11          7     TITLE.AIDS            <NA>   1.0000000
R> predict(esa.mod2, ESA_TEXT_CLOB, type = "class", supplemental.cols = "TITLE")
                                                       TITLE FEATURE_ID
1                    Aids in Africa: Planning for a long war          1
2                          Mars rover maneuvers for rim shot          2
3 Mars express confirms presence of water at Mars south pole          3
4                    NASA announces major Mars rover finding          4
5           Drug access, Asia threat in focus at AIDS summit          1
6             NASA Mars Odyssey THEMIS image: typical crater          6
7                                       Road blocks for Aids          1
R>
R> ore.exec("Begin ctx_ddl.drop_policy('ESA_TXTPOL'); End;")