Environment Configuration

%python

import sys, os

jdk_path = "/usr/java/jdk1.8.0_401" # Hadoop supports Java 1.8 and Java 11; refer to the compatibility matrix for exact versions. The cluster and the client must run the same Java version.

spark_path = "<Path of spark installed location in MMG Server>" # Spark 3.5.5 built with Hadoop 3.

# Ensure that the Spark version above matches the PySpark version installed in this Python runtime (see the sanity check below).

# Also ensure that the Hadoop version above matches the Hadoop version on the cluster.
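
# Sanity check (a minimal sketch): verify that the PySpark installed in this
# Python runtime matches the Spark build configured above. The "3.5.5" literal
# is taken from the Spark build noted above; adjust it to your distribution.
import pyspark
expected_spark_version = "3.5.5"
if pyspark.__version__ != expected_spark_version:
    raise RuntimeError(f"PySpark {pyspark.__version__} does not match Spark {expected_spark_version}")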

hadoop_conf_dir = "<Path to Hadoop xml directory which is copied local to MMG Server>" # Local path containing the Hadoop XMLs. Ensure that yarn-site.xml has the correct ResourceManager addresses.

krb5_conf = "<Path_To_krb5.conf>krb5.conf" # Kerberos configuration file, required for Hive connectivity

default_service_principal = "<service>/<fully-qualified-hostname>@<KERBEROS_REALM>" # Service principal, not the user principal; needed when impersonating users.

default_keytab_file = "<Path_To_Keytab file copied local to the MMG server>" # Keytab file for initializing the connection

default_user_principal = "<service>/<fully-qualified-hostname>@<KERBEROS_REALM>" # User principal used to obtain the initial Kerberos ticket
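
# Fail fast (a minimal sketch, assuming the placeholders above have been filled
# in with real local paths): verify that every configured path exists before
# exporting the environment variables.
for p in (jdk_path, spark_path, hadoop_conf_dir, krb5_conf, default_keytab_file):
    if not os.path.exists(p):
        raise FileNotFoundError(f"Configured path not found: {p}")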

"""Setup environment variables for Spark and Hadoop."""

os.environ["JAVA_HOME"] = jdk_path

os.environ["SPARK_HOME"] = spark_path

os.environ["PYSPARK_PYTHON"] = sys.executable

os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

os.environ["KRB5_CONFIG"] = krb5_conf

os.environ["HADOOP_CONF_DIR"] = hadoop_conf_dir

os.environ["YARN_CONF_DIR"] = hadoop_conf_dir

os.environ["PATH"] = f"{jdk_path}/bin:{spark_path}/bin:" + os.environ["PATH"]

os.environ["SPARK_SUBMIT_OPTS"] = f"-Djava.security.krb5.conf={krb5_conf}" # If we still get "KrbException: Cannot locate default realm", then this property needs to be added in .profile

print("Environment Configured")