File: //proc/self/root/usr/local/ucloud/set_irq_affinity.sh
#!/bin/bash
#########################################################################
# version 0.28
# work around irq vector exhaustion on mlx.
# support more than 32 cpus on mlx.
# version 0.27
# update iommu state judgement in different kernel version.
# cnat2 support tcp/udp traffic through reserved queue and core.
# version 0.26:
# fix bug: prevent return an error in vm with multi queues.
# version 0.25:
# work around mlx driver (4.4-2.0.7) failing to rss gre traffic.
# version 0.24:
# ukernel-4.14 support mlx and irqbalance disable
# version 0.23:
# add igw and natgw special version, support i40e and mlx driver
# version 0.22:
# fix bug: set_rps_rfs use wrong virtual_machines_flag value
# version 0.21:
# fix some bugs: 1. killall is not exist always 2. there isn't rps_gre_enable (kernel >= 4.1.0)
# 3. kernel version < 2.6.32-279.19.17 no rps_gre_enable,no set rps
# version 0.20:
# remove reliance on ethx nic name
# version 0.19:
# open Current hardware settings Combined to support CentOS 7.x and Ubuntu 14.04
# version 0.18:
# set rps in virtual machine when cpu_numbers is more than queues
# version 0.17:
# setting up irq affinity according to /proc/irq/*/${nic_name}*
# version 0.16:
# the number of virtual machine CPU is more than 16,binding 8 CPU.
# version 0.15:
# if /proc/net/rps_gre_enable exists,set rps_gre_enable to 1
# version 0.14:
# if the virtual machine CPU number is 1, don't set rps/rfs
# version 0.13:
# setting up irq affinity according to /proc/irq/*/virtio* in VM
# version 0.12:
# support network device named nicx
# version 0.11:
# remove bc from script
# version 0.10:
# Added kernel version and root user judgment
# version 0.9:
# Added rps_gre_enable support
# version 0.8:
# if virtual_machines determine if or not mutiqueue with /sys/class/net/${nic_name}0/queues
# version 0.7:
# mod MAX_CPU == 1,MAX_CPU is negative number,processor name change
# version 0.6:
# if MAX_CPU < 8 Soft interrupt average assigned to each CPU
# version 0.5:
# Added if mutiqueue disable rps/rfs
# version 0.4:
# Added rps/rfs support
# version 0.3:
# Don't kill irqbalance if there is no mutiqueue netcard
#
# version 0.2:
# Added chelsio netcard support
#
# version 0.1:
# Initial version, a best effort to set cpu affinity for both intel and bnx2 drivers
#
###########################################################################
# Release number of this script; printed by the --version / -V option.
version="0.28"
# Xen dom0 marker: stays 0 unless the peth1 probe later in this file sets it to 1.
pethDEV="0"
###################################################
# set igw and natgw cpu affinity and net2 rps
###################################################
#######################################
# Pin the mlx5 completion-queue IRQs of $ifname to the non-reserved cores.
# Queue N goes to CPU (N % (core_num - reserved_cores)) via set_affinity_list.
# On platform "cnat2" the IRQ of the highest-numbered queue is afterwards
# re-pinned to the last core (core_num - 1).
# Globals read:    ifname, core_num, reserved_cores, platform, last_queue_id
# Globals written: pci_id, cores, last_irq, last_queue_id, CPUID, IRQ, DEV
# Returns:         1 if /proc/irq cannot be entered
#######################################
mlx_set_irq() {
  local irq queue
  cd /proc/irq || return 1
  pci_id=$(ethtool -i "$ifname" | grep bus-info | cut -d " " -f 2)
  cores=$((core_num - reserved_cores))
  last_irq="0"
  # Process substitution keeps the loop in the current shell (it updates
  # globals) and avoids a fixed-name scratch file in /tmp.
  while read -r irq queue; do
    [ -n "${queue}" ] || continue
    # Remember the IRQ that belongs to the highest queue id seen so far.
    if [ "$last_queue_id" -lt "${queue}" ]; then
      last_queue_id=${queue}
      last_irq=$irq
    fi
    CPUID=$((queue % cores))
    IRQ=$irq
    DEV="${ifname}"
    set_affinity_list
  done < <(ls -d */*$pci_id* 2> /dev/null \
    | grep mlx5_comp \
    | sed -e 's/mlx5_comp//g' -e 's/@.*//g' -e 's/[^0-9][^0-9]*/ /g')
  # bind last nic queue to reserved core on cnat2.
  # last_irq is still "0" when no queue above 0 was found; skip then so that
  # IRQ 0 is never touched by accident.
  if [ "$platform" = "cnat2" ] && [ "$last_irq" != "0" ]; then
    CPUID=$((core_num - 1))
    IRQ=$last_irq
    DEV="${ifname}"
    set_affinity_list
  fi
}
#######################################
# Pin the per-queue IRQs of an Intel NIC ($ifname) round-robin over core_num
# CPUs: queue N gets the single-bit mask for CPU (N % core_num), written via
# set_affinity. On platform "cnat2" the IRQ of the highest-numbered queue is
# afterwards re-pinned to the last core (core_num - 1).
# Globals read:    ifname, core_num, platform, last_queue_id
# Globals written: last_irq, last_queue_id, MASK, IRQ, DEV
# Returns:         1 if /proc/irq cannot be entered
#######################################
intel_set_irq() {
  local irq eth queue
  cd /proc/irq || return 1
  last_irq="0"
  # Each listing entry is reduced to its digit groups and read as
  # "<irq> <number> <queue>"; entries without a third number are skipped.
  # Process substitution keeps the loop in the current shell (it updates
  # globals) and avoids a fixed-name scratch file in /tmp.
  while read -r irq eth queue; do
    [ -n "${queue}" ] || continue
    if [ "$last_queue_id" -lt "${queue}" ]; then
      last_queue_id=${queue}
      last_irq=$irq
    fi
    queue=$((queue % core_num))
    MASK=$((1 << queue))
    IRQ=$irq
    DEV="${ifname}"
    set_affinity
  done < <(ls -d */*${ifname}* 2> /dev/null \
    | sed "s/\/.*${ifname}/-${ifname}/g" \
    | sed 's/[^0-9][^0-9]*/ /g')
  # bind last nic queue to reserved core on cnat2.
  # last_irq is still "0" when no queue above 0 was found; skip then so that
  # IRQ 0 is never touched by accident.
  if [ "$platform" = "cnat2" ] && [ "$last_irq" != "0" ]; then
    MASK=$((1 << (core_num - 1)))
    IRQ=$last_irq
    DEV="${ifname}"
    set_affinity
  fi
}
# Turn RPS off on every rx queue of $ifname by writing 0 to its rps_cpus
# file; the equivalent shell command is echoed for each file.
gw_disable_rps() {
  local rps_path
  for rps_path in $(ls /sys/class/net/${ifname}/queues/rx-*/rps_cpus); do
    echo "echo 0 > ${rps_path} "
    echo 0 > "${rps_path}"
  done
}
# Write the CPU mask held in $rpsno (decimal) as lower-case hex into the
# rps_cpus file of every rx queue of $ifname; each write is echoed.
# The hex string is left in the global rpsnof.
gw_enable_rps() {
  local rps_path
  rpsnof=$(printf "%x" $rpsno | tr '[A-Z]' '[a-z]')
  for rps_path in $(ls /sys/class/net/${ifname}/queues/rx-*/rps_cpus); do
    echo "echo ${rpsnof} > ${rps_path} "
    echo "${rpsnof}" > "${rps_path}"
  done
}
# i40e: pin the queue IRQs, then switch RPS off on all rx queues.
update_i40e_settings()
{
  intel_set_irq
  gw_disable_rps
}
# ixgbe: pin the queue IRQs, then enable RPS on the non-reserved cores.
# rpsno becomes a mask with the low (core_num - reserved_cores) bits set.
update_ixgbe_settings()
{
  intel_set_irq
  rpsno=$((2 ** ($core_num - $reserved_cores) - 1))
  gw_enable_rps
}
# ixgbevf: pin the queue IRQs, then enable RPS with the same mask as ixgbe
# minus 3, i.e. with the bits for CPU0 and CPU1 cleared.
update_ixgbevf_settings()
{
  intel_set_irq
  rpsno=$((2 ** ($core_num - $reserved_cores) - 1 - 3))
  gw_enable_rps
}
#######################################
# mlx5: pin the completion-queue IRQs, then configure RPS depending on the
# driver version reported by "ethtool -i": version 4.4-2.0.7 gets RPS enabled
# on the non-reserved cores, every other version (including an unreadable
# one) gets RPS disabled.
# Globals read:    ifname, core_num, reserved_cores
# Globals written: nic_drv_ver, rpsno
#######################################
update_mlx_settings() {
  mlx_set_irq
  nic_drv_ver=$(ethtool -i "$ifname" | grep version | head -1 | cut -d " " -f 2)
  # Quoted on purpose: an empty version must simply fall through to the
  # "disable" branch instead of raising a test syntax error.
  if [ "$nic_drv_ver" = "4.4-2.0.7" ]; then
    rpsno=$((2 ** (core_num - reserved_cores) - 1))
    gw_enable_rps
  else
    gw_disable_rps
  fi
}
# Gateway entry point for ixgbevf NICs; delegates to update_ixgbevf_settings.
gw_set_ixgbevf()
{
  update_ixgbevf_settings
}
# Gateway setup for ixgbe NICs.
# Sets the combined channel count of $ifname to core_num on "cnat2" or to
# (core_num - reserved_cores) otherwise, applies the ixgbe IRQ/RPS settings
# and, on "cnat2", clears rps_cpus of rx queue $last_queue_id.
gw_set_ixgbe()
{
  case "$platform" in
    cnat2) queue=$core_num ;;
    *) queue=$(($core_num - $reserved_cores)) ;;
  esac
  ethtool -L $ifname combined $queue
  update_ixgbe_settings
  if [ "$platform" = "cnat2" ]; then
    echo 0 > "/sys/class/net/${ifname}/queues/rx-${last_queue_id}/rps_cpus"
  fi
}
# Gateway setup for i40e NICs.
# Sets the combined channel count of $ifname to core_num on "cnat2" or to
# (core_num - reserved_cores) otherwise, then applies the i40e settings.
gw_set_i40e()
{
  case "$platform" in
    cnat2) queue=$core_num ;;
    *) queue=$(($core_num - $reserved_cores)) ;;
  esac
  ethtool -L $ifname combined $queue
  update_i40e_settings
}
# Gateway entry point for mlx5 NICs; delegates to update_mlx_settings.
gw_set_mlx()
{
  update_mlx_settings
}
#######################################
# Steer TCP and UDP traffic addressed to the host itself to the last NIC
# queue and spread everything else (RSS) over the queues below it.
# Globals read:    ifname, last_queue_id
# Globals written: ruleID, hostIP
# Outputs:         an error line when last_queue_id <= 0 or no IPv4 address
#                  is configured on $ifname
# Exits:           1 when $ifname has no IPv4 address
#######################################
nic_set_tcp_udp() {
  local rule
  if [ "$last_queue_id" -le "0" ]; then
    echo "Error: please check nic queue, last_queue_id:$last_queue_id."
    return
  fi
  # delete old rules if exist
  ruleID=$(ethtool -n "${ifname}" | grep Filter | cut -d ' ' -f2)
  for rule in $ruleID; do
    ethtool -N "${ifname}" delete "$rule"
  done
  # Use only the first IPv4 address: matching the "inet" field exactly keeps
  # inet6 lines and secondary addresses from turning hostIP into several
  # words, which would corrupt the ethtool rules below.
  hostIP=$(ip a show "${ifname}" \
    | awk '$1 == "inet" { split($2, parts, "/"); print parts[1]; exit }')
  if [ -z "$hostIP" ]; then
    echo "Error: no IP configured on $ifname, FDIR disabled."
    exit 1
  fi
  # enable flow director
  ethtool -K "${ifname}" ntuple on
  ethtool -N "${ifname}" flow-type tcp4 dst-ip "${hostIP}" action "$last_queue_id"
  ethtool -N "${ifname}" flow-type udp4 dst-ip "${hostIP}" action "$last_queue_id"
  # set RSS queues except the last one
  ethtool -X "${ifname}" equal "$last_queue_id"
}
#######################################
# Inspect /proc/cmdline for the intel_iommu parameter.
# Kernels 4.1.x with release >= 18 and kernels 4.14.x:
#   returns 1 when "intel_iommu=on" is absent, 0 when it is present.
# All other kernels:
#   returns 1 when "intel_iommu=off" is present, 0 when it is absent.
# NOTE(review): despite the name, the caller in this file treats a return of
# 0 as the failing case.
# Globals read:    major_version, minor_version, release_version
# Globals written: is_intel_iommuoff, is_intel_iommuon
#######################################
is_iommu_off() {
  local cmdline keyed_on_explicit_on=0
  cmdline=$(cat /proc/cmdline)
  is_intel_iommuoff=$(echo "$cmdline" | grep 'intel_iommu=off')
  is_intel_iommuon=$(echo "$cmdline" | grep 'intel_iommu=on')

  # Decide which of the two parameters is decisive for this kernel.
  if [[ $major_version -eq 4 && $minor_version -eq 1 ]]; then
    if [ $release_version -ge 18 ]; then
      keyed_on_explicit_on=1
    fi
  elif [[ $major_version -eq 4 && $minor_version -eq 14 ]]; then
    keyed_on_explicit_on=1
  fi

  if [[ $keyed_on_explicit_on -eq 1 ]]; then
    if [ -z "$is_intel_iommuon" ]; then
      return 1
    fi
    return 0
  fi
  if [ -n "$is_intel_iommuoff" ]; then
    return 1
  fi
  return 0
}
# Spread the interrupts of the other physical nics over the cpus so that
# setting the irq affinity of net2 does not fail on some cpu because its
# interrupt vectors are exhausted.
# Every net* interface except net2 is brought up and, for the ixgbe / igb /
# i40e / mlx5_core drivers, has its queue IRQs pinned. Other drivers are
# ignored. Overwrites the globals ifname, last_queue_id and nic_type.
walk_around_irq_vector_exhausted()
{
  cd /sys/class/net
  for nic in $(ls -d net*); do
    ifname=$nic
    if [ "$ifname" == "net2" ]; then
      continue
    fi
    ip l set $ifname up
    last_queue_id="0"
    nic_type=$(ethtool -i $ifname | head -1 | cut -d " " -f 2)
    case $nic_type in
      "ixgbe" | "igb" | "i40e")
        intel_set_irq
        ;;
      "mlx5_core")
        mlx_set_irq
        ;;
    esac
  done
}
#######################################
# Gateway mode ("gw" / "cnat2") entry point.
# Steps: abort when is_iommu_off returns 0, count the CPUs (abort above 32),
# choose the number of reserved cores (1 on cnat2, otherwise 2), pin the IRQs
# of the other net* NICs, then configure net2 according to its driver and,
# on cnat2, install the TCP/UDP steering rules.
# Globals read:    platform
# Globals written: core_num, reserved_cores, last_queue_id, ifname, nic_type
# Exits:           1 on any unsupported condition
#######################################
set_gateway_affinity()
{
  if is_iommu_off; then
    echo "WARN: kernel parameter intel_iommuoff must be off! "
    exit 1
  fi

  core_num=$(grep 'processor' /proc/cpuinfo | wc -l)
  echo "CPU Cores on this machine: $core_num"
  if [ "$core_num" -gt "32" ]; then
    echo "Not support cpu $core_num machine, must be less than 32!!"
    exit 1
  fi

  reserved_cores=2
  if [ "$platform" = "cnat2" ]; then
    reserved_cores=1
  fi

  walk_around_irq_vector_exhausted

  last_queue_id="0"
  ifname="net2"
  if [[ ! -d /sys/class/net/$ifname ]]; then
    echo "Error: please check iface name, ifname:$ifname not exist!!"
    exit 1
  fi
  echo "ifname is $ifname, ok continue "

  nic_type=$(ethtool -i $ifname | head -1 | cut -d " " -f 2)
  case $nic_type in
    "ixgbe")
      gw_set_ixgbe
      ;;
    "ixgbevf")
      gw_set_ixgbevf
      ;;
    "i40e")
      gw_set_i40e
      ;;
    "mlx5_core")
      gw_set_mlx
      ;;
    *)
      echo "nic_type $nic_type not support!"
      exit 1
      ;;
  esac

  if [ "$platform" = "cnat2" ]; then
    nic_set_tcp_udp
  fi
  echo "Success!"
}
# Pick the interface name prefix used by the non-gateway code paths:
# "net" if any net* interface exists, else "eth" if any eth* exists, else the
# letters of the four-character names that end in a digit (lo excluded).
if [ "$(ls -1 /sys/class/net/ | grep -c ^net)" -ne 0 ]; then
  nic_name="net"
elif [ "$(ls -1 /sys/class/net/ | grep -c ^eth)" -ne 0 ]; then
  nic_name="eth"
else
  nic_name=$(ls -1 /sys/class/net/ | grep -v lo | grep "^...[0-9]$" | sed 's/[0-9]//g' | uniq)
fi
# Read by the main flow to decide how RPS for GRE traffic is handled.
rps_gre_enable_file="/proc/net/rps_gre_enable"
# Switch RPS and RFS off: write 0 to rps_sock_flow_entries and to the
# rps_cpus and rps_flow_cnt file of every rx queue of every ${nic_name}*
# interface. The rps_cpus writes are echoed.
disable_rps_rfs() {
  local rps_rfs_zero=0
  local entry
  echo "$rps_rfs_zero" > /proc/sys/net/core/rps_sock_flow_entries
  for entry in $(ls /sys/class/net/${nic_name}*/queues/rx-*/rps_cpus); do
    echo "$rps_rfs_zero" > "$entry"
    echo "echo $rps_rfs_zero > $entry "
  done
  for entry in $(ls /sys/class/net/${nic_name}*/queues/rx-*/rps_flow_cnt); do
    echo "$rps_rfs_zero" > "$entry"
  done
}
# Print the rps_cpus bitmap that selects CPUs 0..($1 - 1): lower-case hex,
# split into comma-separated 32-bit words with the most significant first.
_rps_cpu_mask() {
  local cpus=$1 full_words rest mask="" i
  full_words=$((cpus / 32))
  rest=$((cpus % 32))
  for ((i = 0; i < full_words; i++)); do
    mask="${mask:+${mask},}ffffffff"
  done
  # Partial top word; also emitted alone when fewer than 32 CPUs are selected.
  if ((rest > 0 || full_words == 0)); then
    mask="$(printf '%x' $(((1 << rest) - 1)))${mask:+,${mask}}"
  fi
  printf '%s' "$mask"
}

# Write $rpsnof into the rps_cpus file of every rx queue of every
# ${nic_name}* interface and echo each write.
_rps_write_cpu_mask() {
  local rps_path
  for rps_path in $(ls /sys/class/net/${nic_name}*/queues/rx-*/rps_cpus); do
    echo "$rpsnof" > "$rps_path"
    printf 'echo %s > %s \n' "$rpsnof" "$rps_path"
  done
}

#######################################
# Configure RPS (and reset the RFS tables) for all ${nic_name}* interfaces.
# - If an i40e NIC, or an mlx5_core NIC whose driver version is not
#   4.4-2.0.7, is present, RPS/RFS is disabled instead (an mlx5_core NIC with
#   version 4.4-2.0.7 stops the scan and keeps RPS enabled).
# - Number of CPUs put into the mask:
#     physical host (virtual_machines_flag == 0): n/2-2 when n > 10, else n
#     VM: nothing is done for 1 CPU; with a single queue and n > 8 it is n-2,
#         otherwise n
# - Physical hosts also get rps_sock_flow_entries and rps_flow_cnt set to 0.
# - VMs with multiple queues only get the mask when CPUs >= queues.
# Globals read:    nic_name, virtual_machines_flag, mutiqueue
# Globals written: nic_type, nic_drv_ver, rpsno, rpsnof, rfc, rsfe
#######################################
set_rps_rfs() {
  local need_disable_rps_rfs=0
  local nic flow_cnt_path MAX_CPU
  cd /sys/class/net/ || return 1
  for nic in $(ls -d ${nic_name}*); do
    nic_type=$(ethtool -i "$nic" | head -1 | cut -d " " -f 2)
    if [ "$nic_type" = "i40e" ]; then
      need_disable_rps_rfs=1
    elif [ "$nic_type" = "mlx5_core" ]; then
      nic_drv_ver=$(ethtool -i "$nic" | grep version | head -1 | cut -d " " -f 2)
      if [ "$nic_drv_ver" = "4.4-2.0.7" ]; then
        need_disable_rps_rfs=0
        break
      else
        need_disable_rps_rfs=1
      fi
    fi
  done
  if [ "$need_disable_rps_rfs" == 1 ]; then
    disable_rps_rfs
    return
  fi

  MAX_CPU=$(grep -c "^processor" /proc/cpuinfo)
  if [ "$virtual_machines_flag" == "0" ]; then
    if [[ $MAX_CPU -gt 10 ]]; then
      MAX_CPU=$((MAX_CPU / 2 - 2))
    fi
  else
    if [[ $MAX_CPU -eq 1 ]]; then
      return
    fi
    if ! [ "$mutiqueue" -gt "1" ]; then
      if [[ $MAX_CPU -gt 8 ]]; then
        MAX_CPU=$((MAX_CPU - 2))
      fi
    fi
  fi

  # Build the mask per 32-bit word so that 32 or more CPUs neither overflow
  # shell arithmetic nor lose the upper word.
  rpsno=$(((1 << (MAX_CPU % 32)) - 1))
  rpsnof=$(_rps_cpu_mask "$MAX_CPU")

  rfc=0
  rsfe=0
  if [ "$virtual_machines_flag" == 0 ]; then
    echo "$rsfe" > /proc/sys/net/core/rps_sock_flow_entries
    for flow_cnt_path in $(ls /sys/class/net/${nic_name}*/queues/rx-*/rps_flow_cnt); do
      echo "$rfc" > "$flow_cnt_path"
    done
    _rps_write_cpu_mask
  else
    if [ "$mutiqueue" -gt "1" ]; then
      if [ "$MAX_CPU" -ge "$mutiqueue" ]; then
        #sysctl -w net.core.rps_sock_flow_entries=$rsfe
        _rps_write_cpu_mask
        return
      fi
    else
      #sysctl -w net.core.rps_sock_flow_entries=$rsfe
      _rps_write_cpu_mask
    fi
  fi
}
#######################################
# Pin the virtio input/output queue IRQs inside a VM.
# Queue N is bound to CPU (N % MAX_CPU), where MAX_CPU is the number of
# processors listed in /proc/cpuinfo. With 16 or more CPUs the bit four
# positions higher is set in the mask as well.
# Globals written: MASK, IRQ, DEV
# Returns:         1 if /proc/irq cannot be entered
#######################################
set_virtual_machines_affinity() {
  local MAX_CPU irq virtio queue
  MAX_CPU=$(grep -c "^processor" /proc/cpuinfo)
  cd /proc/irq || return 1
  # Each entry is reduced to "<irq> <virtio index> <queue>". Process
  # substitution avoids a fixed-name scratch file in /tmp while keeping the
  # loop in the current shell.
  while read -r irq virtio queue; do
    [ -n "${queue}" ] || continue
    queue=$((queue % MAX_CPU))
    MASK=$((1 << queue))
    if [[ $MAX_CPU -ge 16 ]]; then
      MASK=$((MASK | (1 << (queue + 4))))
    fi
    IRQ=$irq
    DEV="virtio${virtio}"
    set_affinity
  done < <(ls -d */*virtio*put* | sed 's/[^0-9][^0-9]*/ /g')
}
#######################################
# Write the CPU bitmask $MASK to /proc/irq/$IRQ/smp_affinity and log it.
# Masks wider than 32 bits are written as comma-separated 32-bit hex words
# (most significant first); masks that fit in 32 bits are written as a single
# upper-case hex word.
# Globals read: DEV (used in the log line only), MASK, IRQ
# Outputs:      one log line on stdout
#######################################
set_affinity()
{
  local high low mask_text
  # Masking the shifted value keeps bit 63 from being sign-extended.
  high=$(((MASK >> 32) & 0xffffffff))
  low=$((MASK & 0xffffffff))
  if ((high > 0)); then
    printf -v mask_text '%X,%08X' "$high" "$low"
  else
    printf -v mask_text '%X' "$low"
  fi
  printf "%s mask=%s for /proc/irq/%d/smp_affinity\n" "$DEV" "$mask_text" "$IRQ"
  printf '%s' "$mask_text" > "/proc/irq/$IRQ/smp_affinity"
}
# Bind IRQ $IRQ to the single CPU $CPUID by writing the CPU number to
# /proc/irq/$IRQ/smp_affinity_list; logs one line using $DEV as the label.
set_affinity_list() {
  local target="/proc/irq/$IRQ/smp_affinity_list"
  printf "%s cpuid=%d for /proc/irq/%d/smp_affinity_list\n" "$DEV" "$CPUID" "$IRQ"
  printf "%d" "$CPUID" > "$target"
}
#######################################
# Pin the mlx5 completion-queue IRQs of $ifname (non-gateway mode).
# core_num < 32:  queue N gets the single-bit mask for CPU (N % core_num).
# otherwise:      queue N is bound to CPU ((N + 11) % defaule_cpu) through
#                 smp_affinity_list.
# Globals read:    ifname, core_num, defaule_cpu
# Globals written: pci_id, MASK, CPUID, IRQ, DEV
# Returns:         1 if /proc/irq cannot be entered
#######################################
set_mlx() {
  local irq queue
  cd /proc/irq || return 1
  pci_id=$(ethtool -i "$ifname" | grep bus-info | cut -d " " -f 2)
  # Process substitution avoids a fixed-name scratch file in /tmp while
  # keeping the loop in the current shell.
  while read -r irq queue; do
    [ -n "${queue}" ] || continue
    IRQ=$irq
    DEV="${ifname}"
    if [ "$core_num" -lt 32 ]; then
      queue=$((queue % core_num))
      MASK=$((1 << queue))
      set_affinity
    else
      # kuaijie bind queue 0-59 to cpu 11-70
      queue=$((queue + 11))
      CPUID=$((queue % defaule_cpu))
      set_affinity_list
    fi
  done < <(ls -d */*$pci_id* 2> /dev/null \
    | grep mlx5_comp \
    | sed -e 's/mlx5_comp//g' -e 's/@.*//g' -e 's/[^0-9][^0-9]*/ /g')
}
#######################################
# Pin the per-queue IRQs of an Intel NIC ($ifname, non-gateway mode):
# queue N gets the single-bit mask for CPU (N % core_num).
# Globals read:    ifname, core_num
# Globals written: MASK, IRQ, DEV
# Returns:         1 if /proc/irq cannot be entered
#######################################
set_intel() {
  local irq eth queue
  cd /proc/irq || return 1
  # Each entry is reduced to its digit groups and read as
  # "<irq> <number> <queue>"; entries without a third number are skipped.
  # Process substitution avoids a fixed-name scratch file in /tmp while
  # keeping the loop in the current shell.
  while read -r irq eth queue; do
    [ -n "${queue}" ] || continue
    queue=$((queue % core_num))
    MASK=$((1 << queue))
    IRQ=$irq
    DEV="${ifname}"
    set_affinity
  done < <(ls -d */*${ifname}* 2> /dev/null \
    | sed "s/\/.*${ifname}/-${ifname}/g" \
    | sed 's/[^0-9][^0-9]*/ /g')
}
#######################################
# Pin queue IRQs for every ${nic_name}* interface on a physical host.
#
# Naming schemes the original author had in mind (queue N -> CPU N):
#   Intel, two RX and two TX queues:
#     eth0-rx-0 -> CPU0, eth0-rx-1 -> CPU1, eth0-tx-0 -> CPU0, eth0-tx-1 -> CPU1
#   Broadcom bnx2:   eth0-0 eth0-1 eth0-2 eth0-3 eth0-4
#   Broadcom bnx2x:  eth0-rx-0 eth0-rx-1 ... / eth0-tx-0 eth0-tx-1 ...
#
# With n processors in /proc/cpuinfo, core_num becomes n/2-2 when that value
# is at least 3, otherwise n. defaule_cpu always holds n.
# Only ixgbe, igb, i40e (set_intel) and mlx5_core (set_mlx) are handled; any
# other driver just produces a "not support" line.
# Globals read:    nic_name
# Globals written: defaule_cpu, core_num, ifname, nic_type
#######################################
set_intel_broadcom() {
  local MAX_CPU=$(( $(grep "^processor" /proc/cpuinfo | wc -l) ))
  local reduced
  defaule_cpu=$((MAX_CPU))
  reduced=$((defaule_cpu / 2 - 2))
  if [[ $reduced -ge 3 ]]; then
    MAX_CPU=$reduced
  else
    MAX_CPU=$defaule_cpu
  fi
  core_num=$MAX_CPU

  cd /sys/class/net/
  for nic in $(ls -d ${nic_name}*); do
    ifname=$nic
    nic_type=$(ethtool -i $ifname | head -1 | cut -d " " -f 2)
    case $nic_type in
      "ixgbe" | "igb" | "i40e")
        set_intel
        ;;
      "mlx5_core")
        set_mlx
        ;;
      *)
        echo "nic_type $nic_type not support!"
        ;;
    esac
  done
}
#######################################
# Pin IRQs of Chelsio adapters (interfaces whose ifconfig line contains
# 00:07:43). For each such device, every /proc/interrupts entry matching
# "rdma", "ofld" or the device name gets a single-CPU mask; the mask moves one
# CPU up per IRQ and wraps to CPU0 after the last CPU. GRO is then switched
# off on the device. When pethDEV is 1, only devices whose name contains
# "peth" are processed.
# Globals read:    pethDEV
# Globals written: cpu, DEV, IRQ, MASK
#######################################
set_chelsio() {
  local MAX_CPU=$(( $(grep "^processor" /proc/cpuinfo | wc -l)-1 ))
  local highest_mask=$((1 << MAX_CPU))
  cpu=1
  for DEV in $(ifconfig -a | grep 00:07:43 | awk '{print $1}'); do
    ##### compatible with xen0 #######
    if [ "$pethDEV" -eq 1 ] && [ "${DEV/peth/}" = "$DEV" ]; then
      continue
    fi
    for IRQ in $(egrep "rdma|ofld|${DEV}" /proc/interrupts | awk '{printf "%d\n",$1}'); do
      MASK=$cpu
      set_affinity
      cpu=$((cpu << 1))
      # Past the last CPU: start over at CPU0.
      if [[ $cpu -gt $highest_mask ]]; then
        cpu=1
      fi
    done
    ethtool -K $DEV gro off
  done
}
# Succeeds for uid 0. For any other user it prints a hint and terminates the
# script (with exit status 0).
is_root() {
  if [ "$(id -u)" -eq 0 ]; then
    return 0
  fi
  echo "Non root user. Please run as root."
  exit 0
}
# Running kernel release split into its numeric parts, e.g. for
# "3.10.0-957.el7": major 3, minor 10, release 0.
kernel_version=$(uname -r)
major_version=$(echo "$kernel_version" | awk -F. '{print $1}')
minor_version=$(echo "$kernel_version" | awk -F. '{print $2}')
release_version=$(echo "$kernel_version" | awk -F'[.-]+' '{print $3}')
# Terminate the script (exit status 0) with a notice when the kernel is
# older than 2.6.32 or is one of the two listed 2.6.32 builds.
# Reads major_version, minor_version, release_version and kernel_version.
check_kernel_version() {
  local too_old=0
  if [[ $major_version -eq 2 && $minor_version -eq 6 && $release_version -lt 32 ]]; then
    too_old=1
  else
    case "$kernel_version" in
      "2.6.32-5-amd64" | "2.6.32-38-server")
        too_old=1
        ;;
    esac
  fi
  if [[ $too_old -eq 1 ]]; then
    echo "kernel version is too old to support rps"
    exit 0
  fi
}
# Set the "Combined" channel count of ${nic_name}0 to 4, unless the first and
# last "Combined" values printed by "ethtool -l" are already equal or the
# last one is already 4.
# Globals read: nic_name. Globals written: pre_set, cur_set.
ethtool_nic0() {
  local ethtool=`which ethtool`
  pre_set=$($ethtool -l ${nic_name}0 | grep "Combined" | head -n1 | awk '{print $2}')
  cur_set=$($ethtool -l ${nic_name}0 | grep "Combined" | tail -n1 | awk '{print $2}')
  if [[ $pre_set -eq $cur_set || $cur_set -eq 4 ]]; then
    return
  fi
  $ethtool -L ${nic_name}0 combined 4
}
# Version comparison helpers built on "sort -V". Each takes two version
# strings and succeeds when the first is >, <=, < or >= the second.
version_gt() {
  local lowest
  lowest=$(echo "$@" | tr " " "\n" | sort -V | head -n 1)
  [ "$lowest" != "$1" ]
}
version_le() {
  local lowest
  lowest=$(echo "$@" | tr " " "\n" | sort -V | head -n 1)
  [ "$lowest" == "$1" ]
}
version_lt() {
  local highest
  highest=$(echo "$@" | tr " " "\n" | sort -rV | head -n 1)
  [ "$highest" != "$1" ]
}
version_ge() {
  local highest
  highest=$(echo "$@" | tr " " "\n" | sort -rV | head -n 1)
  [ "$highest" == "$1" ]
}
# Preconditions first, then dispatch on the first command-line argument:
#   --version | -V   print the script version
#   gw | cnat2       run the gateway setup for that platform
#   (no argument)    fall through to the default setup below
#   anything else    print the description / usage text
is_root && check_kernel_version
case "$1" in
  --version | -V)
    echo "version: $version"
    exit 0
    ;;
  gw | cnat2)
    platform="$1"
    set_gateway_affinity
    exit 0
    ;;
  "")
    ;;
  *)
    echo "Description:"
    echo " This script attempts to bind each queue of a multi-queue NIC"
    echo " to the same numbered core, ie tx0|rx0 --> cpu0, tx1|rx1 --> cpu1"
    echo "usage:"
    echo " $0 "
    exit 0
    ;;
esac
# A peth1 interface marks a Xen dom0; remember that for set_chelsio.
if ifconfig peth1 > /dev/null 2>&1; then
  pethDEV=1
  echo "Set IRQ affinity for xen0:"
fi
########### Set up the desired devices. ##################################

# Stop irqbalance: through systemd when systemctl is available, then by
# killing any process that is still running (the printed warning explains
# that irqbalance would otherwise override the affinities set here).
_stop_irqbalance() {
  # check for irqbalance running
  res=`which systemctl`
  if [ $? == 0 ]; then
    systemctl stop irqbalance
    systemctl disable irqbalance
  fi
  IRQBALANCE_ON=`ps ax | grep -v grep | grep -q irqbalance; echo $?`
  if [ "$IRQBALANCE_ON" == "0" ]; then
    echo " WARNING: irqbalance is running and will"
    echo " likely override this script's affinitization."
    echo " So I stopped the irqbalance service by"
    echo " 'kill -9 irqbalance'"
    ps aux | grep irqbalance | grep -v grep | awk '{print $2}' | xargs kill -9
  fi
}

# Any virtio line in /proc/interrupts means we are running inside a VM.
virtual_machines_flag=`egrep -i "virtio" /proc/interrupts | wc -l`

if [ "$virtual_machines_flag" != "0" ]; then
  # ---- virtual machine ----
  ethtool_nic0 > /dev/null 2>&1
  mutiqueue=`ls -1 /sys/class/net/${nic_name}*/queues/ | grep -c tx-*`
  if [ "$mutiqueue" -gt "1" ]; then
    _stop_irqbalance
    disable_rps_rfs
    set_rps_rfs
    set_virtual_machines_affinity
  else
    set_rps_rfs
  fi
  exit 0
fi

# ---- physical host ----
mutiqueue=`ls -d /sys/class/net/${nic_name}*/queues/rx-* | grep ${nic_name}.*-.*1`
if [ -z "${mutiqueue}" ]; then
  # No multi-queue NIC: only RPS is configured.
  set_rps_rfs
  exit 0
fi

_stop_irqbalance

result=$(ifconfig -a | grep "00:07:43")
if [ -n "${result}" ]; then
  set_chelsio
  exit 0
fi

if [ -f "$rps_gre_enable_file" ]; then
  # Make sure rps_gre_enable is 1 before configuring RPS.
  rps_gre_enable_value=`cat $rps_gre_enable_file`
  if [ "$rps_gre_enable_value" != "1" ]; then
    echo 1 > $rps_gre_enable_file
  fi
  set_rps_rfs
else
  # Without rps_gre_enable, RPS is only enabled on kernels newer than 2.6.32.
  cur_kernel_version=`uname -r | awk -F\- '{print $1}'`
  if version_gt $cur_kernel_version '2.6.32'; then
    set_rps_rfs
  else
    disable_rps_rfs
  fi
fi
set_intel_broadcom
exit 0