This topic provides an example CAS crawl script with a crawl that
is configured to write to record file output. To create a similar CAS crawl
script in your application, add code to
DataIngest.xml
that specifies the CAS crawl to run,
locks the crawl (to wait for any running crawls to complete), runs the crawl,
and releases the lock. Depending on your environment, you may need a script
that runs a full CAS crawl and a script that runs an incremental CAS crawl.
This example
DataIngest.xml
code runs a
full crawl that writes to record file output:
<!-- ########################################################################
     # full crawl script
     # -->
<script id="MyCrawl_fullCrawl">
  <log-dir>./logs/provisioned_scripts</log-dir>
  <provisioned-script-command>./control/runcommand.bat MyCrawl_fullCrawl run</provisioned-script-command>
  <bean-shell-script>
    <![CDATA[
      crawlName = "MyCrawl";
      log.info("Starting full CAS crawl '" + crawlName + "'.");

      // Validate the crawl's output type BEFORE taking the lock so that a
      // configuration error cannot leave the crawl lock permanently held.
      if (!CAS.isCrawlFileOutput(crawlName)) {
          throw new UnsupportedOperationException("The crawl " + crawlName + " does not have a File System output type. The only supported output type for this script is File System.");
      }

      // obtain lock (waits for any running crawl that holds it to complete)
      if (LockManager.acquireLock("crawl_lock_" + crawlName)) {
          try {
              // Remove all files from the crawl's output directory
              CAS.cleanOutputDir(crawlName);
              CAS.runBaselineCasCrawl(crawlName);

              // Rename the output files to include the crawl name
              // so they do not collide with the output from other crawls
              CAS.renameBaselineCrawlOutput(crawlName);

              destDir = PathUtils.getAbsolutePath(CAS.getWorkingDir(),
                  CAS.getCasCrawlFullOutputDestDir());

              // create the target dir, if it doesn't already exist
              mkDirUtil = new CreateDirUtility(CAS.getAppName(),
                  CAS.getEacHost(), CAS.getEacPort(), CAS.isSslEnabled());
              mkDirUtil.init(CAS.getCasCrawlOutputDestHost(), destDir, CAS.getWorkingDir());
              mkDirUtil.run();

              // clear the destination dir of full crawl output from previous crawls
              CAS.clearFullCrawlOutputFromDestinationDir(crawlName);

              // remove previously collected incremental crawl files,
              // which are expected to be incorporated in this full crawl
              CAS.clearIncrementalCrawlOutputFromDestinationDir(crawlName);

              // copy the full crawl output to destination directory
              CAS.copyBaselineCrawlOutputToDestinationDir(crawlName);

              log.info("Finished full CAS crawl '" + crawlName + "'.");
          } finally {
              // always release the lock, even if the crawl or copy steps fail
              LockManager.releaseLock("crawl_lock_" + crawlName);
          }
      } else {
          log.warning("Failed to obtain lock.");
      }
    ]]>
  </bean-shell-script>
</script>
This example
DataIngest.xml
code runs an
incremental crawl that writes to record file output:
<!-- ########################################################################
     # incremental crawl script
     # -->
<script id="MyCrawl_IncrementalCrawl">
  <log-dir>./logs/provisioned_scripts</log-dir>
  <provisioned-script-command>./control/runcommand.bat MyCrawl_IncrementalCrawl run</provisioned-script-command>
  <bean-shell-script>
    <![CDATA[
      crawlName = "MyCrawl";
      log.info("Starting incremental CAS crawl '" + crawlName + "'.");

      // Validate the crawl's output type BEFORE taking the lock so that a
      // configuration error cannot leave the crawl lock permanently held.
      if (!CAS.isCrawlFileOutput(crawlName)) {
          throw new UnsupportedOperationException("The crawl " + crawlName + " does not have a File System output type. The only supported output type for this script is File System.");
      }

      // obtain lock (waits for any running crawl that holds it to complete)
      if (LockManager.acquireLock("crawl_lock_" + crawlName)) {
          try {
              // Remove all files from the crawl's output directory
              CAS.cleanOutputDir(crawlName);
              CAS.runIncrementalCasCrawl(crawlName);

              // Timestamp and rename the output files to include the
              // crawl name so they do not collide with the output from
              // previous incremental output from this crawl or incremental
              // output from other crawls
              CAS.renameIncrementalCrawlOutput(crawlName);

              destDir = PathUtils.getAbsolutePath(CAS.getWorkingDir(),
                  CAS.getCasCrawlIncrementalOutputDestDir());

              // create the target dir, if it doesn't already exist
              mkDirUtil = new CreateDirUtility(CAS.getAppName(),
                  CAS.getEacHost(), CAS.getEacPort(), CAS.isSslEnabled());
              mkDirUtil.init(CAS.getCasCrawlOutputDestHost(), destDir, CAS.getWorkingDir());
              mkDirUtil.run();

              // copy crawl output to destination directory
              // Note: We assume a downstream process removes incremental crawl output
              // from this directory that has already been processed.
              CAS.copyIncrementalCrawlOutputToDestinationDir(crawlName);

              log.info("Finished incremental CAS crawl '" + crawlName + "'.");
          } finally {
              // always release the lock, even if the crawl or copy steps fail
              LockManager.releaseLock("crawl_lock_" + crawlName);
          }
      } else {
          log.warning("Failed to obtain lock.");
      }
    ]]>
  </bean-shell-script>
</script>