#!/usr/bin/env bash

################################################################################
# Check if hwloc commands exist
################################################################################
declare -i HPCBIND_HAS_HWLOC=1

# Every hwloc utility listed here must be resolvable by `type`; a single
# missing one clears the flag for the whole script.
for hwloc_tool in hwloc-bind hwloc-distrib hwloc-ls hwloc-calc hwloc-ps; do
  if ! type "${hwloc_tool}" >/dev/null 2>&1; then
    HPCBIND_HAS_HWLOC=0
  fi
done
unset hwloc_tool

if (( HPCBIND_HAS_HWLOC == 0 )); then
  echo "hwloc not found, no process binding will occur"
fi

# Get parent cpuset

# hpcbind_lookup_cpuset PID
#   Print the cpuset that `hwloc-ps -a --cpuset` lists for exactly PID, or
#   nothing when PID is not listed.
#   The PID is compared against the whole first tab-separated field, so a PID
#   such as 123 no longer also selects process 1234 or a line whose command
#   name happens to contain "123". At most one line is printed.
#   NOTE(review): assumes the PID is field 1 and the cpuset field 2 of the
#   tab-separated hwloc-ps output - confirm against the installed hwloc.
function hpcbind_lookup_cpuset {
  hwloc-ps -a --cpuset \
    | awk -F '\t' -v pid="$1" '$1 == pid { print $2; exit }'
}

HPCBIND_HWLOC_PARENT_CPUSET=""
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
  # Second space-separated word of the `hwloc-ls --version` output.
  HPCBIND_HWLOC_VERSION="$(hwloc-ls --version | cut -d ' ' -f 2)"
  MY_PID="$BASHPID"
  HPCBIND_HWLOC_PARENT_CPUSET="$(hpcbind_lookup_cpuset "${MY_PID}")"
fi

################################################################################
# Check if nvidia-smi exists
################################################################################
# 1 when `type` can resolve nvidia-smi, 0 otherwise.
declare -i HPCBIND_HAS_NVIDIA=0
if type nvidia-smi >/dev/null 2>&1; then
  HPCBIND_HAS_NVIDIA=1
fi


################################################################################
# Get visible GPUs
################################################################################
declare -i NUM_GPUS=0
declare -i HPCBIND_ENABLE_GPU_MAPPING=0
HPCBIND_VISIBLE_GPUS=""

# hpcbind_detect_gpus
#   Count the GPUs reported by `nvidia-smi -L` and build the default list of
#   visible GPU ids.
#   Reads:  HPCBIND_HAS_NVIDIA, CUDA_VISIBLE_DEVICES
#   Writes: NUM_GPUS, GPU_LIST, HPCBIND_VISIBLE_GPUS (whitespace separated),
#           HPCBIND_ENABLE_GPU_MAPPING, and HPCBIND_HAS_NVIDIA (cleared when
#           nvidia-smi itself fails).
function hpcbind_detect_gpus {
  local gpu_listing=""
  local -i gpu_id

  NUM_GPUS=0
  HPCBIND_VISIBLE_GPUS=""
  if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
    # Capture the listing on its own so the status tested is nvidia-smi's,
    # not that of the last stage of a pipeline.
    if gpu_listing="$(nvidia-smi -L)"; then
      # Count only the "GPU <n>: ..." lines so that any other line in the
      # listing is not mistaken for a device.
      NUM_GPUS=$(grep -c '^GPU ' <<< "${gpu_listing}")
      GPU_LIST=""
      for ((gpu_id = 0; gpu_id < NUM_GPUS; gpu_id++)); do
        GPU_LIST+="${GPU_LIST:+ }${gpu_id}"
      done
      HPCBIND_VISIBLE_GPUS="${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}"
      # CUDA_VISIBLE_DEVICES is comma separated; the list is consumed by
      # word splitting, so turn the commas into spaces.
      HPCBIND_VISIBLE_GPUS="${HPCBIND_VISIBLE_GPUS//,/ }"
    else
      HPCBIND_HAS_NVIDIA=0
    fi
  fi
  HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
}
hpcbind_detect_gpus


################################################################################
# Get queue id
# supports mpich, openmpi, mvapich2, sbatch (slurm), aprun and bsub
################################################################################
HPCBIND_QUEUE_NAME=""
declare -i HPCBIND_QUEUE_RANK=0
declare -i HPCBIND_QUEUE_SIZE=0
declare -i HPCBIND_QUEUE_MAPPING=0

# hpcbind_detect_queue
#   Identify the launcher from its rank environment variable. The first
#   variable that is set and non-empty wins, in the order of the branches
#   below.
#   Writes: HPCBIND_QUEUE_NAME    launcher name, empty when none was found
#           HPCBIND_QUEUE_RANK    rank reported by that launcher
#           HPCBIND_QUEUE_SIZE    job size; stays 0 for aprun and bsub, for
#                                 which no size variable is read
#           HPCBIND_QUEUE_MAPPING 1 when a launcher was found, else 0
function hpcbind_detect_queue {
  HPCBIND_QUEUE_NAME=""
  HPCBIND_QUEUE_RANK=0
  HPCBIND_QUEUE_SIZE=0
  HPCBIND_QUEUE_MAPPING=0

  if [[ -n "${PMI_RANK}" ]]; then
    HPCBIND_QUEUE_NAME="mpich"
    HPCBIND_QUEUE_RANK=${PMI_RANK}
    HPCBIND_QUEUE_SIZE=${PMI_SIZE}
  elif [[ -n "${OMPI_COMM_WORLD_RANK}" ]]; then
    HPCBIND_QUEUE_NAME="openmpi"
    HPCBIND_QUEUE_RANK=${OMPI_COMM_WORLD_RANK}
    HPCBIND_QUEUE_SIZE=${OMPI_COMM_WORLD_SIZE}
  elif [[ -n "${MV2_COMM_WORLD_RANK}" ]]; then
    HPCBIND_QUEUE_NAME="mvapich2"
    HPCBIND_QUEUE_RANK=${MV2_COMM_WORLD_RANK}
    HPCBIND_QUEUE_SIZE=${MV2_COMM_WORLD_SIZE}
  elif [[ -n "${SLURM_LOCAL_ID}" ]]; then
    # Detection keys on SLURM_LOCAL_ID; the rank itself is SLURM_PROCID.
    HPCBIND_QUEUE_NAME="slurm"
    HPCBIND_QUEUE_RANK=${SLURM_PROCID}
    HPCBIND_QUEUE_SIZE=${SLURM_NPROCS}
  elif [[ -n "${ALPS_APP_PE}" ]]; then
    HPCBIND_QUEUE_NAME="aprun"
    HPCBIND_QUEUE_RANK=${ALPS_APP_PE}
  elif [[ -n "${LSB_JOBINDEX:-${LBS_JOBINDEX}}" ]]; then
    # LSF exports LSB_JOBINDEX. The historical misspelling LBS_JOBINDEX is
    # still honoured as a fallback so existing wrappers keep working.
    HPCBIND_QUEUE_NAME="bsub"
    HPCBIND_QUEUE_RANK=${LSB_JOBINDEX:-${LBS_JOBINDEX}}
  fi

  if [[ -n "${HPCBIND_QUEUE_NAME}" ]]; then
    HPCBIND_QUEUE_MAPPING=1
  fi
}
hpcbind_detect_queue

################################################################################
# Show help
################################################################################
# show_help
#   Print the usage text to stdout. The program name shown is the last path
#   component of $0.
function show_help {
  local cmd="${0##*/}"
  cat <<EOF
Usage: ${cmd} <options> -- command ...
  Set the process mask, OMP environment variables and CUDA environment
  variables to sane values if possible. Uses hwloc and nvidia-smi if
  available.  Will preserve the current process binding, so it is safe
  to use with a queuing system or mpiexec.

Options:
  --no-hwloc-bind       Disable binding
  --proc-bind=<LOC>     Set the initial process mask for the script
                        LOC can be any valid location argument for
                        hwloc-calc  Default: all
  --whole-system        ${cmd} will ignore its parent process binding
  --distribute=N        Distribute the current cpuset into N partitions
  --distribute-partition=I
                        Use the i'th partition (zero based)
  --visible-gpus=<L>    Comma separated list of gpu ids
                        Default: CUDA_VISIBLE_DEVICES or all gpus in
                        sequential order
  --ignore-queue        Ignore queue job id when choosing visible GPU and partition
  --no-gpu-mapping      Do not set CUDA_VISIBLE_DEVICES
  --openmp=M.m          Set env variables for the given OpenMP version
                        Default: 4.0
  --openmp-ratio=N/D    Ratio of the cpuset to use for OpenMP
                        Default: 1
  --openmp-places=<Op>  Op=threads|cores|sockets.
                        Default: OMP_PLACES or threads
  --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES
  --force-openmp-num-threads=N
                        Override logic for selecting OMP_NUM_THREADS
  --force-openmp-proc-bind=<OP>
                        Override logic for selecting OMP_PROC_BIND
  --no-openmp-nested    Set OMP_NESTED to false
  --output-prefix=<P>   Save the output to files of the form
                        P.hpcbind.N, P.stdout.N and P.stderr.N where P is
                        the prefix and N is the rank (no spaces)
  --output-mode=<Op>    How console output should be handled.
                        Options are all, rank0, and none.  Default: rank0
  --lstopo              Show bindings in lstopo
  -v|--verbose          Print bindings and relevant environment variables
  -h|--help             Show this message

Sample Usage:

  Split the current process cpuset into 4 and use the 3rd partition
    ${cmd} --distribute=4 --distribute-partition=2 -v -- command ...

  Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus
  and save the output to rank specific files
    mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\
      --distribute=4 -v --output-prefix=output  -- command ...

  Bind the process to all even cores
    ${cmd} --proc-bind=core:even -v -- command ...

  Bind the even cores of socket 0 and the odd cores of socket 1
    ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ...

  Skip GPU 0 when mapping visible devices
    ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ...

  Display the current bindings
    ${cmd} --proc-bind=numa:0 -v -- command

  Display the current bindings using lstopo
    ${cmd} --proc-bind=numa:0.core:odd --lstopo

EOF
}


################################################################################
# Parse command line arguments
################################################################################
# Show help if no command line arguments given
if (( $# == 0 )); then
  # Nothing to parse and nothing to run: print the usage text and stop.
  show_help
  exit 0
fi

# Default values for everything the option parser may override.

# Arguments that matched no known option.
declare -a UNKNOWN_ARGS=()

# Set to 1 once a "--" separator has been seen.
declare -i HPCBIND_HAS_COMMAND=0

# Process binding and partitioning; binding defaults to on only when hwloc
# was found.
declare -i HPCBIND_ENABLE_HWLOC_BIND="${HPCBIND_HAS_HWLOC}"
HPCBIND_PROC_BIND="all"
declare -i HPCBIND_DISTRIBUTE=1
declare -i HPCBIND_PARTITION=-1

# OpenMP settings.
HPCBIND_OPENMP_VERSION="4.0"
HPCBIND_OPENMP_PLACES="${OMP_PLACES:-threads}"
declare -i HPCBIND_OPENMP_RATIO_NUMERATOR=1
declare -i HPCBIND_OPENMP_RATIO_DENOMINATOR=1
declare -i HPCBIND_OPENMP_PROC_BIND=1
declare -i HPCBIND_OPENMP_NESTED=1
HPCBIND_OPENMP_FORCE_NUM_THREADS=""
HPCBIND_OPENMP_FORCE_PROC_BIND=""

# Reporting and output handling.
declare -i HPCBIND_VERBOSE=0
declare -i HPCBIND_LSTOPO=0
HPCBIND_OUTPUT_PREFIX=""
HPCBIND_OUTPUT_MODE="rank0"

# Consume options from the front of the positional parameters. Parsing stops
# at "--" (which is dropped) or when the arguments run out; whatever remains
# in "$@" afterwards is the command to run.
while [[ $# -gt 0 ]]; do
  opt="$1"
  # Text after the first "=" (only meaningful for --name=value options).
  optval="${opt#*=}"
  case "${opt}" in
    --no-hwloc-bind)
      HPCBIND_ENABLE_HWLOC_BIND=0
      ;;
    --proc-bind=*)
      HPCBIND_PROC_BIND="${optval}"
      ;;
    --whole-system)
      HPCBIND_HWLOC_PARENT_CPUSET=""
      ;;
    # number of partitions to create
    --distribute=*)
      HPCBIND_DISTRIBUTE="${optval}"
      if (( HPCBIND_DISTRIBUTE <= 0 )); then
        HPCBIND_DISTRIBUTE=1
      fi
      ;;
    # which partition to use
    --distribute-partition=*)
      HPCBIND_PARTITION="${optval}"
      ;;
    --visible-gpus=*)
      HPCBIND_VISIBLE_GPUS=$(echo "${optval}" | tr ',' ' ')
      ;;
    --ignore-queue)
      HPCBIND_QUEUE_MAPPING=0
      ;;
    --no-gpu-mapping)
      HPCBIND_ENABLE_GPU_MAPPING=0
      ;;
    --openmp=*)
      HPCBIND_OPENMP_VERSION="${optval}"
      ;;
    --openmp-ratio=*)
      IFS=/ read HPCBIND_OPENMP_RATIO_NUMERATOR HPCBIND_OPENMP_RATIO_DENOMINATOR <<< "${optval}"
      # Non-positive parts fall back to 1; a ratio above 1 resets to 1/1.
      if (( HPCBIND_OPENMP_RATIO_NUMERATOR <= 0 )); then
        HPCBIND_OPENMP_RATIO_NUMERATOR=1
      fi
      if (( HPCBIND_OPENMP_RATIO_DENOMINATOR <= 0 )); then
        HPCBIND_OPENMP_RATIO_DENOMINATOR=1
      fi
      if (( HPCBIND_OPENMP_RATIO_NUMERATOR > HPCBIND_OPENMP_RATIO_DENOMINATOR )); then
        HPCBIND_OPENMP_RATIO_NUMERATOR=1
        HPCBIND_OPENMP_RATIO_DENOMINATOR=1
      fi
      ;;
    --openmp-places=*)
      HPCBIND_OPENMP_PLACES="${optval}"
      ;;
    --no-openmp-proc-bind)
      HPCBIND_OPENMP_PROC_BIND=0
      ;;
    --force-openmp-proc-bind=*)
      HPCBIND_OPENMP_FORCE_PROC_BIND="${optval}"
      ;;
    --force-openmp-num-threads=*)
      HPCBIND_OPENMP_FORCE_NUM_THREADS="${optval}"
      ;;
    --no-openmp-nested)
      HPCBIND_OPENMP_NESTED=0
      ;;
    --output-prefix=*)
      HPCBIND_OUTPUT_PREFIX="${optval}"
      ;;
    --output-mode=*)
      # stored in lower case
      HPCBIND_OUTPUT_MODE="${optval,,}"
      ;;
    --lstopo)
      HPCBIND_VERBOSE=1
      HPCBIND_LSTOPO=1
      ;;
    -v|--verbose)
      HPCBIND_VERBOSE=1
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # ignore remaining arguments
    --)
      HPCBIND_HAS_COMMAND=1
      shift
      break
      ;;
    # unknown option
    *)
      UNKNOWN_ARGS+=("${opt}")
      ;;
  esac
  shift
done

################################################################################
# Check output mode
################################################################################
# HPCBIND_TEE is 1 when this process should also print to the console.
declare -i HPCBIND_TEE=0

case "${HPCBIND_OUTPUT_MODE}" in
  none)
    HPCBIND_TEE=0
    ;;
  all)
    HPCBIND_TEE=1
    ;;
  *)
    # every other mode behaves like rank0: only rank 0 prints to screen
    if (( HPCBIND_QUEUE_RANK == 0 )); then
      HPCBIND_TEE=1
    fi
    ;;
esac


# hpcbind_setup_output_files
#   Choose the log, stdout and stderr file names.
#   Without an output prefix all three point at /dev/null. With a prefix they
#   become <prefix>.hpcbind.<rank>, <prefix>.stderr.<rank> and
#   <prefix>.stdout.<rank>; when the queue size is known the rank is
#   zero-padded to the number of digits in that size. The log file is
#   truncated here.
#   Reads:  HPCBIND_OUTPUT_PREFIX, HPCBIND_QUEUE_RANK, HPCBIND_QUEUE_SIZE
#   Writes: HPCBIND_LOG, HPCBIND_ERR, HPCBIND_OUT and, when the size is
#           known, HPCBIND_STR_QUEUE_SIZE and HPCBIND_STR_QUEUE_RANK
function hpcbind_setup_output_files {
  local rank_suffix="${HPCBIND_QUEUE_RANK}"

  if [[ -z "${HPCBIND_OUTPUT_PREFIX}" ]]; then
    HPCBIND_LOG=/dev/null
    HPCBIND_ERR=/dev/null
    HPCBIND_OUT=/dev/null
    return 0
  fi

  if [[ ${HPCBIND_QUEUE_SIZE} -gt 0 ]]; then
    HPCBIND_STR_QUEUE_SIZE="${HPCBIND_QUEUE_SIZE}"
    printf -v HPCBIND_STR_QUEUE_RANK '%0*d' \
      "${#HPCBIND_STR_QUEUE_SIZE}" "${HPCBIND_QUEUE_RANK}"
    rank_suffix="${HPCBIND_STR_QUEUE_RANK}"
  fi

  HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${rank_suffix}"
  HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${rank_suffix}"
  HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${rank_suffix}"
  # Quoted so that a prefix containing whitespace is not rejected as an
  # ambiguous redirect.
  : > "${HPCBIND_LOG}"
}
hpcbind_setup_output_files


################################################################################
# Check unknown arguments
################################################################################
# hpcbind_reject_unknown_args
#   When UNKNOWN_ARGS is non-empty, print the offending options to stdout,
#   append the same line to the log file and exit with status 1.
#   The message is piped through tee (rather than sent to an asynchronous
#   process substitution) so it has been written before the shell exits.
function hpcbind_reject_unknown_args {
  if (( ${#UNKNOWN_ARGS[@]} > 0 )); then
    echo "HPCBIND Unknown options: ${UNKNOWN_ARGS[*]}" \
      | tee -a "${HPCBIND_LOG:-/dev/null}"
    exit 1
  fi
}
hpcbind_reject_unknown_args

################################################################################
# Check that visible gpus are valid
################################################################################
# hpcbind_validate_visible_gpus
#   Turn HPCBIND_VISIBLE_GPUS into an array of GPU ids and sanitise it.
#   - Ids may be separated by whitespace or commas.
#   - While GPU mapping is enabled, an id outside [0, NUM_GPUS) is reported
#     and replaced by 0, and NUM_GPUS becomes the number of listed ids.
#   - An empty list turns GPU mapping off, so that nothing later divides by a
#     GPU count of zero.
#   Reads:  HPCBIND_ENABLE_GPU_MAPPING, NUM_GPUS, HPCBIND_LOG
#   Writes: HPCBIND_VISIBLE_GPUS, NUM_GPUS, HPCBIND_ENABLE_GPU_MAPPING
function hpcbind_validate_visible_gpus {
  local -i idx

  # Word splitting is intended here: it is what breaks the list into ids.
  # shellcheck disable=SC2206
  HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS[*]//,/ })

  if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
    for ((idx = 0; idx < ${#HPCBIND_VISIBLE_GPUS[@]}; idx++)); do
      if [[ ${HPCBIND_VISIBLE_GPUS[idx]} -ge ${NUM_GPUS} ||
        ${HPCBIND_VISIBLE_GPUS[idx]} -lt 0 ]]; then
        echo "HPCBIND Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[idx]} (setting to 0)" \
          | tee -a "${HPCBIND_LOG:-/dev/null}"
        HPCBIND_VISIBLE_GPUS[idx]=0
      fi
    done
    NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
    if (( NUM_GPUS == 0 )); then
      echo "HPCBIND No visible GPUs listed (GPU mapping disabled)" \
        | tee -a "${HPCBIND_LOG:-/dev/null}"
      HPCBIND_ENABLE_GPU_MAPPING=0
    fi
  fi
}
hpcbind_validate_visible_gpus


################################################################################
#choose the correct partition
################################################################################
# A negative partition means none was requested: use the queue rank when
# queue mapping is on, otherwise partition 0.
if (( HPCBIND_PARTITION < 0 )); then
  if (( HPCBIND_QUEUE_MAPPING == 1 )); then
    HPCBIND_PARTITION=${HPCBIND_QUEUE_RANK}
  else
    HPCBIND_PARTITION=0
  fi
fi

# Wrap partitions at or beyond the partition count back into range.
if (( HPCBIND_PARTITION >= HPCBIND_DISTRIBUTE )); then
  HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE))
fi

################################################################################
# Find cpuset and num threads
################################################################################
HPCBIND_HWLOC_CPUSET=""
declare -i HPCBIND_NUM_PUS=0

# hpcbind_count_objects TYPE
#   Print the number of TYPE objects (pu, core, numa, socket) that
#   `hwloc-calc -q -N TYPE` reports for HPCBIND_HWLOC_CPUSET. Prints 1 when
#   hwloc-calc fails or when there is no cpuset to count in.
function hpcbind_count_objects {
  local count=""
  if [[ -n "${HPCBIND_HWLOC_CPUSET}" ]] &&
    count="$(hwloc-calc -q -N "$1" "${HPCBIND_HWLOC_CPUSET}")"; then
    echo "${count}"
  else
    echo 1
  fi
}

# hpcbind_count_online_pus
#   Print the processor count used when hwloc binding is disabled: the number
#   of lines starting with "processor" in /proc/cpuinfo, else the value of
#   `getconf _NPROCESSORS_ONLN`, else 1. The pattern is anchored so that other
#   lines which merely contain the word "processor" are not counted.
function hpcbind_count_online_pus {
  local count=""
  if [[ -r /proc/cpuinfo ]]; then
    count="$(grep -c '^processor' /proc/cpuinfo)"
  fi
  if [[ ! "${count}" =~ ^[1-9][0-9]*$ ]]; then
    count="$(getconf _NPROCESSORS_ONLN 2>/dev/null)"
  fi
  if [[ ! "${count}" =~ ^[1-9][0-9]*$ ]]; then
    count=1
  fi
  echo "${count}"
}

if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
  # HPCBIND_PROC_BIND may hold several locations separated by spaces, so it
  # is deliberately left unquoted.
  if [[ -z "${HPCBIND_HWLOC_PARENT_CPUSET}" ]]; then
    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]})
  else
    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]})
  fi

  if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then
    # One cpuset per partition; keep the one belonging to this partition.
    CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
    HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}"
  else
    HPCBIND_HWLOC_CPUSET="${BINDING}"
  fi

  HPCBIND_NUM_PUS=$(hpcbind_count_objects pu)
  HPCBIND_NUM_CORES=$(hpcbind_count_objects core)
  HPCBIND_NUM_NUMAS=$(hpcbind_count_objects numa)
  HPCBIND_NUM_SOCKETS=$(hpcbind_count_objects socket)
else
  HPCBIND_NUM_PUS=$(hpcbind_count_online_pus)
  HPCBIND_NUM_CORES=${HPCBIND_NUM_PUS}
  HPCBIND_NUM_NUMAS=1
  HPCBIND_NUM_SOCKETS=1
fi


# Thread count: a forced value is taken verbatim; otherwise it is the PU
# count scaled by the OpenMP ratio and clamped to [1, HPCBIND_NUM_PUS].
if [[ -n "${HPCBIND_OPENMP_FORCE_NUM_THREADS}" ]]; then
  HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS}
else
  declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_RATIO_NUMERATOR / HPCBIND_OPENMP_RATIO_DENOMINATOR))

  if (( HPCBIND_OPENMP_NUM_THREADS < 1 )); then
    HPCBIND_OPENMP_NUM_THREADS=1
  elif (( HPCBIND_OPENMP_NUM_THREADS > HPCBIND_NUM_PUS )); then
    HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS}
  fi
fi

################################################################################
# Set OpenMP environment variables
################################################################################

# set OMP_NUM_THREADS; with nesting enabled a second level of 1 thread is
# appended
OMP_NUM_THREADS="${HPCBIND_OPENMP_NUM_THREADS}"
if (( HPCBIND_OPENMP_NESTED == 1 )); then
  OMP_NUM_THREADS+=",1"
fi
export OMP_NUM_THREADS

# set OMP_PROC_BIND and OMP_PLACES

# hpcbind_openmp_supports_places VERSION
#   Succeed when the major component of VERSION (the text before the first
#   ".") is a number >= 4. Comparing the major number numerically, instead of
#   comparing the version strings, also classifies "4" and "10.0" as 4.0 or
#   newer.
function hpcbind_openmp_supports_places {
  local major="${1%%.*}"
  [[ "${major}" =~ ^[0-9]+$ ]] && (( 10#${major} >= 4 ))
}

if [[ ${HPCBIND_OPENMP_PROC_BIND} -ne 1 ]]; then
  # no openmp proc bind
  unset OMP_PLACES
  unset OMP_PROC_BIND
elif [[ -n "${HPCBIND_OPENMP_FORCE_PROC_BIND}" ]]; then
  # force proc bind
  export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
  export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}"
elif hpcbind_openmp_supports_places "${HPCBIND_OPENMP_VERSION}"; then
  # default proc bind logic for OpenMP 4.0 and newer
  export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
  if [[ ${HPCBIND_OPENMP_NESTED} -eq 1 ]]; then
    export OMP_PROC_BIND="spread,spread"
  else
    export OMP_PROC_BIND="spread"
  fi
else
  # older OpenMP versions: plain binding, no places
  export OMP_PROC_BIND="true"
  unset OMP_PLACES
fi

# set OMP_NESTED; nested runs also allow two active levels and set up hot
# teams (intel specific)
if [[ ${HPCBIND_OPENMP_NESTED} -ne 1 ]]; then
  export OMP_NESTED="false"
else
  export OMP_NESTED="true"
  export OMP_MAX_ACTIVE_LEVELS=2
  export KMP_HOT_TEAMS=1
  export KMP_HOT_TEAMS_MAX_LEVEL=2
fi

################################################################################
# Set CUDA environment variables
################################################################################

# hpcbind_export_cuda_device
#   Export CUDA_VISIBLE_DEVICES for this process by picking one entry of
#   HPCBIND_VISIBLE_GPUS round-robin:
#     - queue mapping off: index = partition % NUM_GPUS
#     - queue mapping on:  index = (rank * partitions + partition) % NUM_GPUS
#   Does nothing when GPU mapping is disabled or when NUM_GPUS is not
#   positive, which would otherwise be a modulo by zero.
#   Writes: GPU_ID, MY_TASK_ID (queue mapping only), CUDA_VISIBLE_DEVICES
function hpcbind_export_cuda_device {
  if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -ne 1 ]] || (( NUM_GPUS <= 0 )); then
    return 0
  fi

  if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
    GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
  else
    MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
    GPU_ID=$((MY_TASK_ID % NUM_GPUS))
  fi
  export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
}
hpcbind_export_cuda_device

################################################################################
# Set hpcbind environment variables
################################################################################
# Variables exported under their own name with their current value (an unset
# variable is exported as an empty string).
for hpcbind_var in \
  HPCBIND_HWLOC_VERSION HPCBIND_HAS_HWLOC HPCBIND_HAS_NVIDIA \
  HPCBIND_NUM_PUS HPCBIND_NUM_CORES HPCBIND_NUM_NUMAS HPCBIND_NUM_SOCKETS \
  HPCBIND_HWLOC_CPUSET HPCBIND_OPENMP_VERSION; do
  export "${hpcbind_var}=${!hpcbind_var}"
done

# Variables exported under a different name than the one used internally.
export HPCBIND_HWLOC_DISTRIBUTE="${HPCBIND_DISTRIBUTE}"
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION="${HPCBIND_PARTITION}"
export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING="${HPCBIND_ENABLE_GPU_MAPPING}"
export HPCBIND_OPENMP_RATIO="${HPCBIND_OPENMP_RATIO_NUMERATOR}/${HPCBIND_OPENMP_RATIO_DENOMINATOR}"

# An empty parent cpuset is reported as "all".
export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET:-all}"

# The visible GPU array is exported as a comma separated list.
hpcbind_gpu_csv="${HPCBIND_VISIBLE_GPUS[*]}"
export HPCBIND_NVIDIA_VISIBLE_GPUS="${hpcbind_gpu_csv// /,}"

# Queue details are exported only when a queue was detected.
if [[ -n "${HPCBIND_QUEUE_NAME}" ]]; then
  for hpcbind_var in \
    HPCBIND_QUEUE_RANK HPCBIND_QUEUE_SIZE HPCBIND_QUEUE_NAME \
    HPCBIND_QUEUE_MAPPING; do
    export "${hpcbind_var}=${!hpcbind_var}"
  done
fi
unset hpcbind_var hpcbind_gpu_csv


################################################################################
# Print verbose
################################################################################

TMP_ENV=$(env | sort)

# hpcbind_print_report [HWLOC_LS_OPTION...]
#   Write the binding report to stdout: host name, the HPCBIND_, CUDA_, OMP_,
#   GOMP_, KMP_ and XLSMPOPTS entries of the TMP_ENV snapshot and, when hwloc
#   is available, the `hwloc-ls` view restricted to HPCBIND_HWLOC_CPUSET.
#   Extra arguments are passed on to hwloc-ls.
#   The whole report is produced by one function so that a caller can send it
#   through a single redirection or pipe, which keeps the lines in order.
function hpcbind_print_report {
  echo "[HOST]"
  hostname -s
  echo "[HPCBIND]"
  grep -E "^HPCBIND_" <<< "${TMP_ENV}"
  echo "[CUDA]"
  grep -E "^CUDA_" <<< "${TMP_ENV}"
  echo "[OPENMP]"
  grep -E "^OMP_" <<< "${TMP_ENV}"
  echo "[GOMP] (gcc, g++, and gfortran)"
  grep -E "^GOMP_" <<< "${TMP_ENV}"
  echo "[KMP] (icc, icpc, and ifort)"
  grep -E "^KMP_" <<< "${TMP_ENV}"
  echo "[XLSMPOPTS] (xlc, xlc++, and xlf)"
  grep -E "^XLSMPOPTS" <<< "${TMP_ENV}"

  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
    echo "[BINDINGS]"
    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" "$@"
  else
    echo "Unable to show bindings, hwloc not available."
  fi
}

if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
  # quiet: the report goes to the log file only
  hpcbind_print_report >> "${HPCBIND_LOG:-/dev/null}"
else
  # verbose on a printing rank: show the report and append it to the log
  hpcbind_print_report --no-io --no-bridges | tee -a "${HPCBIND_LOG:-/dev/null}"
fi

################################################################################
# Run command
################################################################################

# must be the last executed command so that the return value is correct
# hpcbind_run_command COMMAND [ARG...]
#   Run COMMAND, bound to HPCBIND_HWLOC_CPUSET through hwloc-bind when binding
#   is enabled, with stdout/stderr captured in HPCBIND_OUT/HPCBIND_ERR (both
#   truncated first). When HPCBIND_TEE is non-zero the output is also copied
#   to the console.
#   The arguments are passed on exactly as received ("$@"), so arguments that
#   contain whitespace or shell metacharacters reach COMMAND unchanged in
#   both branches.
#   NOTE(review): the unbound branch used to go through `eval`, which
#   re-parsed the command line; a caller that passed a whole shell snippet as
#   one argument must now wrap it as `sh -c '...'`.
#   Returns: the exit status of the last command run, i.e. COMMAND or
#            hwloc-bind.
function hpcbind_run_command {
  # clear output files
  : > "${HPCBIND_ERR}"
  : > "${HPCBIND_OUT}"
  if [[ ${HPCBIND_TEE} -eq 0 ]]; then
    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > "${HPCBIND_OUT}" 2> "${HPCBIND_ERR}"
    else
      "$@" > "${HPCBIND_OUT}" 2> "${HPCBIND_ERR}"
    fi
  else
    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > >(tee "${HPCBIND_OUT}") 2> >(tee "${HPCBIND_ERR}" >&2)
    else
      "$@" > >(tee "${HPCBIND_OUT}") 2> >(tee "${HPCBIND_ERR}" >&2)
    fi
  fi
}

if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && -n "${DISPLAY}" ]]; then
  # --lstopo only opens the viewer when binding is active and DISPLAY is set
  hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0
elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
  hpcbind_run_command "$@"
fi
