Environment Configuration
%python
import sys, os
jdk_path = "/usr/java/jdk1.8.0_401" # Hadoop supports Java 1.8 and Java 11. Refer
matrix for actual versions. Both cluster and client must be in same
version
spark_path = "<Path of spark installed location in MMG Server>" # This is
spark3.5.5 + hadoop3.
# Ensure that spark version from the above matches with the pyspark installed in
this python runtime.
# Also ensure that hadoop version from the above matches with the hadoop version
of cluster
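# Quick sanity check (a sketch): confirm that the pyspark installed in this
# runtime reports the same version as the Spark installation above (3.5.5).
import pyspark
print("pyspark version:", pyspark.__version__)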
# Local path containing the Hadoop XMLs copied from the cluster. Ensure that
# yarn-site.xml carries the correct ResourceManager addresses.
hadoop_conf_dir = "<Path to Hadoop xml directory which is copied local to MMG Server>"
krb5_conf = "<Path_To_krb5.conf>krb5.conf" # Conf file for hive
connectivity
# Service principal (not the user principal); required when performing impersonation.
default_service_principal = "<service>/<fully-qualified-hostname>@<KERBEROS_REALM>"
# Keytab file used to initialize the Kerberos connection.
default_keytab_file = "<Path_To_Keytab file copied local to the MMG server>"
default_user_principal = "<service>/<fully-qualified-hostname>@<KERBEROS_REALM>"
"""Setup environment variables for Spark and Hadoop."""
os.environ["JAVA_HOME"] = jdk_path
os.environ["SPARK_HOME"] = spark_path
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
os.environ["KRB5_CONFIG"] = krb5_conf
os.environ["HADOOP_CONF_DIR"] = hadoop_conf_dir
os.environ["YARN_CONF_DIR"] = hadoop_conf_dir
os.environ["PATH"] = f"{jdk_path}/bin:{spark_path}/bin:" +
os.environ["PATH"]
os.environ["SPARK_SUBMIT_OPTS"] = f"-Djava.security.krb5.conf={krb5_conf}" # If
we still get "KrbException: Cannot locate default realm", then this property needs
to be added in .profile
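# Optional (a sketch): obtain an initial Kerberos ticket from the keytab. This
# assumes the MIT Kerberos client tools (kinit) are installed on the MMG server.
import subprocess
subprocess.run(["kinit", "-kt", default_keytab_file, default_user_principal], check=True)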
print("Environment Configured")