A.26 Enable Additional Spark or PySpark interpreter

An additional Spark or PySpark interpreter is required to connect to two different external clusters at the same time.

To set up an additional Spark or PySpark interpreter, follow these steps:
  1. Create a start-script for the second Spark interpreter.

    Note:

    This is an optional step.
    1. Navigate to the <COMPLIANCE_STUDIO_INSTALLATION_PATH>/deployed/interpreters/bin directory and create a new start-script called start-spark2-interpreter.sh using the following command:
      cp start-spark-interpreter.sh start-spark2-interpreter.sh
    2. Edit the start-spark2-interpreter.sh file in the <COMPLIANCE_STUDIO_INSTALLATION_PATH>/deployed/interpreters/bin/ directory to update the following:
      1. Change the port number to a new port that is not in use (for example, 7030).
      2. Rename the log file: search for the text .log and give the log a new name (for example, change spark.log to spark2.log).
    3. Edit the start-all-interpreters.sh file in the <COMPLIANCE_STUDIO_INSTALLATION_PATH>/interpreters/bin/ directory as follows:
      1. Search for the text sh "$DEPLOY_APP_HOME"/interpreters/bin/start-spark-interpreter.sh &
      2. Add an additional entry with sh "$DEPLOY_APP_HOME"/interpreters/bin/start-spark2-interpreter.sh & (see the sketch after the note below).

      Note:

      For the second Spark interpreter variant, use start-spark2-interpreter.sh; when configuring a third variant, use start-spark3-interpreter.sh, and so on.
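      The commands below sketch the edits in this step. They are illustrative only: the port (7030), the log-file name (spark2.log), and the sed expression are example values, and the exact text to change depends on the start-script shipped with your installation.

        # Illustrative sketch of step 1 (adjust paths, port, and log name to your installation)
        cd <COMPLIANCE_STUDIO_INSTALLATION_PATH>/deployed/interpreters/bin

        # Create the second start-script from the existing one
        cp start-spark-interpreter.sh start-spark2-interpreter.sh

        # Point the copy at an unused port (for example, 7030) and a new log file;
        # this sed only renames the log file -- change the port by editing the script directly
        sed -i 's/spark\.log/spark2.log/g' start-spark2-interpreter.sh

        # Register the new interpreter in start-all-interpreters.sh by adding, next to the
        # existing start-spark-interpreter.sh entry:
        #   sh "$DEPLOY_APP_HOME"/interpreters/bin/start-spark2-interpreter.sh &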
  2. Create the interpreter JSON for the additional Spark interpreter.
    1. Navigate to the <COMPLIANCE_STUDIO_INSTALLATION_PATH>/deployed/interpreters/conf directory and create the new interpreter JSON called spark2.json using the following command:
      cp spark.json spark2.json
    2. Edit the spark2.json file in the <COMPLIANCE_STUDIO_INSTALLATION_PATH>/deployed/interpreters/conf/ directory as follows:
      1. Update the following parameter values:
        group: <new-spark-interpreter-name>,
        name: <new-spark-interpreter-name>,
        groupSettings.initialCodeCapability: <new-spark-interpreter-name>,
        port: 7030 (the port chosen in step 1),
        capabilities.name: <new-spark-interpreter-name>,
        capabilities.button.label: <new-spark-interpreter-name>,
  3. After the update, the file will look like the following. Note that the listing below shows the values copied from spark.json; the parameters listed in the previous step must be changed to your new interpreter name and port (an illustrative sketch of these updates follows the listing).
    [
      {
        "group": "spark",
        "name": "spark",
        "className": "org.apache.zeppelin.spark.SparkInterpreter",
        "groupSettings": {
          "initialCode": "1+1",
          "initialCodeCapability": "spark"
        },
        "host": "localhost",
        "port": 7017,
        "capabilities": [
          {
            "name": "spark",
            "highlightLanguage": "scala",
            "formEscapeCharacter": "@",
            "button": {
              "defaultCode": "println(\"Hello, world\")",
              "icon": "fa fa-fw fa-building-o",
              "label": "Spark"
            }
          }
        ],
        "defaultInterpreter": true,
        "properties": {
          "spark.executor.memory": {
            "envName": null,
            "propertyName": "spark.executor.memory",
            "defaultValue": "",
            "description": "Executor memory per worker instance. ex) 512m, 32g",
            "type": "string"
          },
          "args": {
            "envName": null,
            "propertyName": null,
            "defaultValue": "",
            "description": "spark commandline args",
            "type": "textarea"
          },
          "zeppelin.spark.useHiveContext": {
            "envName": "ZEPPELIN_SPARK_USEHIVECONTEXT",
            "propertyName": "zeppelin.spark.useHiveContext",
            "defaultValue": true,
            "description": "Use HiveContext instead of SQLContext if it is true.",
            "type": "checkbox"
          },
          "spark.app.name": {
            "envName": "SPARK_APP_NAME",
            "propertyName": "spark.app.name",
            "defaultValue": "Zeppelin",
            "description": "The name of spark application.",
            "type": "string"
          },
          "spark.pyspark.python": {
            "envName": null,
            "propertyName": "spark.pyspark.python",
            "defaultValue": "python3",
            "description": "Python command to run pyspark workers with",
            "type": "string"
          },
          "zeppelin.spark.printREPLOutput": {
            "envName": null,
            "propertyName": "zeppelin.spark.printREPLOutput",
            "defaultValue": true,
            "description": "Print REPL output",
            "type": "checkbox"
          },
          "spark.cores.max": {
            "envName": null,
            "propertyName": "spark.cores.max",
            "defaultValue": "",
            "description": "Total number of cores to use. Empty value uses all available core.",
            "type": "number"
          },
          "zeppelin.spark.maxResult": {
            "envName": "ZEPPELIN_SPARK_MAXRESULT",
            "propertyName": "zeppelin.spark.maxResult",
            "defaultValue": "1000",
            "description": "Max number of Spark SQL result to display.",
            "type": "number"
          },
          "spark.master": {
            "envName": "MASTER",
            "propertyName": "spark.master",
            "defaultValue": "yarn",
            "description": "Spark master uri. ex) spark://masterhost:7077",
            "type": "string"
          },
          "spark.yarn.archive": {
            "envName": null,
            "propertyName": "spark.yarn.archive",
            "defaultValue": "",
            "description": "An archive containing needed Spark jars for distribution to the YARN cache",
            "type": "string"
          },
          "spark.driver.bindAddress": {
            "envName": "DRIVER_BIND_ADDRESS",
            "propertyName": "spark.driver.bindAddress",
            "defaultValue": "0.0.0.0",
            "description": "Hostname or IP address where to bind listening sockets.",
            "type": "string"
          },
          "zeppelin.spark.enableSupportedVersionCheck": {
            "envName": null,
            "propertyName": "zeppelin.spark.enableSupportedVersionCheck",
            "defaultValue": true,
            "description": "Do not change - developer only setting, not for production use",
            "type": "checkbox"
          },
          "zeppelin.spark.uiWebUrl": {
            "envName": null,
            "propertyName": "zeppelin.spark.uiWebUrl",
            "defaultValue": "",
            "description": "Override Spark UI default URL",
            "type": "string"
          },
          "zeppelin.spark.useNew": {
            "envName": null,
            "propertyName": "zeppelin.spark.useNew",
            "defaultValue": true,
            "description": "Whether use new spark interpreter implementation",
            "type": "checkbox"
          },
          "zeppelin.spark.ui.hidden": {
            "envName": null,
            "propertyName": "zeppelin.spark.ui.hidden",
            "defaultValue": false,
            "description": "Whether to hide spark ui in zeppelin ui",
            "type": "checkbox"
          },
          "zeppelin.interpreter.output.limit": {
            "envName": null,
            "propertyName": "zeppelin.interpreter.output.limit",
            "defaultValue": "102400",
            "description": "Output message from interpreter exceeding the limit will be truncated",
            "type": "number"
          }
        },
        "initialCode": [],
        "editor": {
          "language": "scala",
          "editOnDblClick": false
        }
      }
    ]
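    For example, assuming the new interpreter is named spark2 (an example name; use whatever name you chose) and listens on port 7030, the parameter updates listed in the previous step could be applied with a jq command such as the following. jq is not part of Compliance Studio, so edit the file by hand if it is not available.

      # Illustrative only: apply the spark2.json updates listed above (example name "spark2", port 7030)
      cd <COMPLIANCE_STUDIO_INSTALLATION_PATH>/deployed/interpreters/conf
      jq '.[0].group = "spark2"
          | .[0].name = "spark2"
          | .[0].groupSettings.initialCodeCapability = "spark2"
          | .[0].port = 7030
          | .[0].capabilities[0].name = "spark2"
          | .[0].capabilities[0].button.label = "spark2"' \
        spark2.json > spark2.json.tmp && mv spark2.json.tmp spark2.json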
  4. Create the interpreter JSON for the second PySpark interpreter.
    1. Navigate to the <COMPLIANCE_STUDIO_INSTALLATION_PATH>/deployed/interpreters/conf directory and create the new interpreter JSON called pyspark2.json using the following command:
      cp pyspark.json pyspark2.json
    2. Edit the pyspark2.json file in the <COMPLIANCE_STUDIO_INSTALLATION_PATH>/deployed/interpreters/conf/ directory as follows:
      1. Update the following parameter values:
        group: <new-spark-interpreter-name>,
        name: <new-spark-interpreter-name>,
        groupSettings.initialCodeCapability: <new-spark-interpreter-name>,
        port: 7030 (the port chosen in step 1),
        capabilities.name: <new-spark-interpreter-name>,
        capabilities.button.label: <new-spark-interpreter-name>,
  5. After the update, the file will look like the following. As before, the listing shows the values copied from pyspark.json; the parameters listed in the previous step must be changed accordingly (an illustrative sketch follows the listing).
    [
      {
        "group": "spark",
        "name": "pyspark",
        "className": "org.apache.zeppelin.spark.PySparkInterpreter",
        "host": "localhost",
        "port": 7017,
        "capabilities": [
          {
            "name": "pyspark",
            "highlightLanguage": "python",
            "button": {
              "defaultCode": "print('Hello World')",
              "icon": "icon-python",
              "label": "PySpark"
            },
            "formEscapeCharacter": "$"
          }
        ],
        "properties": {
          "zeppelin.pyspark.python": {
            "envName": "PYSPARK_PYTHON",
            "propertyName": null,
            "defaultValue": "python3",
            "description": "Python executable to run pyspark with",
            "type": "string"
          },
          "zeppelin.pyspark.useIPython": {
            "envName": null,
            "propertyName": "zeppelin.pyspark.useIPython",
            "defaultValue": false,
            "description": "whether use IPython when it is available",
            "type": "checkbox"
          },
          "zeppelin.interpreter.output.limit": {
            "envName": null,
            "propertyName": "zeppelin.interpreter.output.limit",
            "defaultValue": "102400",
            "description": "Output message from interpreter exceeding the limit will be truncated",
            "type": "number"
          }
        },
        "initialCode": []
      }
    ]
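    Similarly, assuming the same example name spark2 and port 7030, the pyspark2.json updates could be applied as sketched below. The values follow the parameter list in the previous step; the groupSettings.initialCodeCapability update is omitted here because the pyspark.json shown above has no groupSettings block. Edit the file by hand if jq is not available.

      # Illustrative only: apply the pyspark2.json updates listed above (example name "spark2", port 7030)
      cd <COMPLIANCE_STUDIO_INSTALLATION_PATH>/deployed/interpreters/conf
      jq '.[0].group = "spark2"
          | .[0].name = "spark2"
          | .[0].port = 7030
          | .[0].capabilities[0].name = "spark2"
          | .[0].capabilities[0].button.label = "spark2"' \
        pyspark2.json > pyspark2.json.tmp && mv pyspark2.json.tmp pyspark2.json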

    Note:

    If you connect the two interpreters to different external clusters and set the SPARK_HOME and HADOOP_CONF_DIR environment variables as part of providing custom Spark libraries in YARN mode, ensure that you append these environment variables to the respective Spark interpreter start-scripts, as sketched below.
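    For example, when each interpreter must point at its own cluster, the exports could be appended to (or set inside) the respective start-scripts. The paths below are placeholders for your two cluster client installations:

      # In start-spark-interpreter.sh (first cluster) -- placeholder paths
      export SPARK_HOME=</path/to/cluster1/spark-client>
      export HADOOP_CONF_DIR=</path/to/cluster1/hadoop-conf>

      # In start-spark2-interpreter.sh (second cluster) -- placeholder paths
      export SPARK_HOME=</path/to/cluster2/spark-client>
      export HADOOP_CONF_DIR=</path/to/cluster2/hadoop-conf>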
  6. Restart Compliance Studio. To do this, navigate to the <COMPLIANCE_STUDIO_INSTALLATION_PATH>/bin/ directory and run the ./compliance-studio.sh --restart or ./compliance-studio.sh -r script.