This topic provides an example CAS crawl script with a crawl that
is configured to write to record file output. To create a similar CAS crawl
script in your application, add code to
DataIngest.xml
that specifies the CAS crawl to run,
locks the crawl (to wait for any running crawls to complete), runs the crawl,
and releases the lock. Depending on your environment, you may need a script
that runs a full CAS crawl and a script that runs an incremental CAS crawl.
This example
DataIngest.xml
code runs a
full crawl that writes to record file output:
<!-- ########################################################################
     # full crawl script
     # -->
<script id="MyCrawl_fullCrawl">
  <log-dir>./logs/provisioned_scripts</log-dir>
  <provisioned-script-command>./control/runcommand.bat MyCrawl_fullCrawl run</provisioned-script-command>
  <bean-shell-script>
    <![CDATA[
      crawlName = "MyCrawl";
      log.info("Starting full CAS crawl '" + crawlName + "'.");

      // Validate the crawl's output type BEFORE taking the lock so that a
      // configuration error cannot leave the crawl lock permanently held.
      if (!CAS.isCrawlFileOutput(crawlName)) {
          throw new UnsupportedOperationException("The crawl " + crawlName + " does not have a File System output type. The only supported output type for this script is File System.");
      }

      // obtain lock (waits for any running crawl that holds it to complete)
      if (LockManager.acquireLock("crawl_lock_" + crawlName)) {
          try {
              // Remove all files from the crawl's output directory
              CAS.cleanOutputDir(crawlName);
              CAS.runBaselineCasCrawl(crawlName);

              // Rename the output files to include the crawl name
              // so they do not collide with the output from other crawls
              CAS.renameBaselineCrawlOutput(crawlName);

              destDir = PathUtils.getAbsolutePath(CAS.getWorkingDir(),
                  CAS.getCasCrawlFullOutputDestDir());

              // create the target dir, if it doesn't already exist
              mkDirUtil = new CreateDirUtility(CAS.getAppName(),
                  CAS.getEacHost(), CAS.getEacPort(), CAS.isSslEnabled());
              mkDirUtil.init(CAS.getCasCrawlOutputDestHost(), destDir, CAS.getWorkingDir());
              mkDirUtil.run();

              // clear the destination dir of full crawl output from previous crawls
              CAS.clearFullCrawlOutputFromDestinationDir(crawlName);

              // remove previously collected incremental crawl files,
              // which are expected to be incorporated in this full crawl
              CAS.clearIncrementalCrawlOutputFromDestinationDir(crawlName);

              // copy the full crawl output to destination directory
              CAS.copyBaselineCrawlOutputToDestinationDir(crawlName);

              log.info("Finished full CAS crawl '" + crawlName + "'.");
          } finally {
              // always release the lock, even if the crawl or copy steps fail
              LockManager.releaseLock("crawl_lock_" + crawlName);
          }
      } else {
          log.warning("Failed to obtain lock.");
      }
    ]]>
  </bean-shell-script>
</script>
This example
DataIngest.xml
code runs an
incremental crawl that writes to record file output:
<!-- ########################################################################
     # incremental crawl script
     # -->
<script id="MyCrawl_IncrementalCrawl">
  <log-dir>./logs/provisioned_scripts</log-dir>
  <provisioned-script-command>./control/runcommand.bat MyCrawl_IncrementalCrawl run</provisioned-script-command>
  <bean-shell-script>
    <![CDATA[
      crawlName = "MyCrawl";
      log.info("Starting incremental CAS crawl '" + crawlName + "'.");

      // Validate the crawl's output type BEFORE taking the lock so that a
      // configuration error cannot leave the crawl lock permanently held.
      if (!CAS.isCrawlFileOutput(crawlName)) {
          throw new UnsupportedOperationException("The crawl " + crawlName + " does not have a File System output type. The only supported output type for this script is File System.");
      }

      // obtain lock (waits for any running crawl that holds it to complete)
      if (LockManager.acquireLock("crawl_lock_" + crawlName)) {
          try {
              // Remove all files from the crawl's output directory
              CAS.cleanOutputDir(crawlName);
              CAS.runIncrementalCasCrawl(crawlName);

              // Timestamp and rename the output files to include the
              // crawl name so they do not collide with the output from
              // previous incremental output from this crawl or incremental
              // output from other crawls
              CAS.renameIncrementalCrawlOutput(crawlName);

              destDir = PathUtils.getAbsolutePath(CAS.getWorkingDir(),
                  CAS.getCasCrawlIncrementalOutputDestDir());

              // create the target dir, if it doesn't already exist
              mkDirUtil = new CreateDirUtility(CAS.getAppName(),
                  CAS.getEacHost(), CAS.getEacPort(), CAS.isSslEnabled());
              mkDirUtil.init(CAS.getCasCrawlOutputDestHost(), destDir, CAS.getWorkingDir());
              mkDirUtil.run();

              // copy crawl output to destination directory
              // Note: We assume a downstream process removes incremental crawl output
              // from this directory that has already been processed.
              CAS.copyIncrementalCrawlOutputToDestinationDir(crawlName);

              log.info("Finished incremental CAS crawl '" + crawlName + "'.");
          } finally {
              // always release the lock, even if the crawl or copy steps fail
              LockManager.releaseLock("crawl_lock_" + crawlName);
          }
      } else {
          log.warning("Failed to obtain lock.");
      }
    ]]>
  </bean-shell-script>
</script>