# Top-level settings of a model-serving configuration.
# NOTE(review): the program that reads this file is not visible from here;
# notes below marked "presumably" are inferred from key names - confirm them.
# Model repository location; the trailing slash is part of the value.
model_repo_path: "/model-store/"
# NOTE(review): this flag is false while base_model_id and
# pipeline.model_name are both "ensemble" - confirm that is intended.
use_ensemble: false
# NOTE(review): upper-case "LLAMA" here vs. lower-case "llama" in
# trt_llm.model_type - confirm the consumer expects the different casing.
model_type: "LLAMA"
# Value matches the name of the `trt_llm` section further down; presumably
# it selects that section - verify against the consumer.
backend: "trt_llm"
base_model_id: "ensemble"
# Unit is not stated in this file - presumably seconds; TODO confirm.
prompt_timer: 60
# Despite the key name, the value is a hostname-style string, not an IP
# literal.
gateway_ip: "gateway-api"
server_port_internal: 9009
# Unit (entries vs. bytes) is not stated in this file; TODO confirm.
customization_cache_capacity: 10000
logging_level: "INFO"
enable_chat: true
# Settings for the `pipeline` entry.
# NOTE(review): semantics are inferred from key names only; confirm against
# the program that consumes this file.
pipeline:
  # Same value as the top-level base_model_id ("ensemble").
  model_name: "ensemble"
  # A single instance is configured.
  num_instances: 1
# Settings for the `trt_llm` entry; the top-level `backend` key carries the
# same name.
# NOTE(review): semantics marked "presumably" are inferred from key names
# only; confirm against the program that consumes this file.
trt_llm:
  # This section is switched on.
  use: true
  # Checkpoint format identifier; "hf" presumably means Hugging Face -
  # TODO confirm.
  ckpt_type: "hf"
  model_name: "trt_llm"
  # NOTE(review): differs from the top-level backend ("trt_llm"); the two
  # keys presumably describe different layers - verify.
  backend: "python"
  num_gpus: 1
  # Absolute filesystem path, written as a quoted string like the other
  # string values in this file.
  model_path: "/engine_dir"
  # Per the key name the unit is microseconds, so 10000 is 10 ms.
  max_queue_delay_microseconds: 10000
  # NOTE(review): lower-case here vs. upper-case "LLAMA" at the top level.
  model_type: "llama"
  max_batch_size: 1
  # Input and output lengths share the same limit; unit is presumably
  # tokens - TODO confirm.
  max_input_len: 256
  max_output_len: 256
  max_beam_width: 1
  # Both parallelism sizes are 1, alongside num_gpus: 1 above.
  tensor_para_size: 1
  pipeline_para_size: 1
  data_type: "float16"
  # The next two keys are integers (0 here), whereas per_column_scaling
  # uses a YAML boolean; presumably 0 means disabled - TODO confirm.
  int8_mode: 0
  enable_custom_all_reduce: 0
  per_column_scaling: false