Kubernetes NFS Troubleshooting

`kubectl describe pod` on the stuck pod shows an event like this:

```
(combined from similar events): MountVolume.SetUp failed for volume "kubeflow-mnist" : mount failed: exit status 32
Mounting command: systemd-run
Mounting arguments: --description=Kubernetes transient mount for /var/lib/kubelet/pods/7b62b7c1-8745-4b53-9895-77dbab9d1715/volumes/kubernetes.io~nfs/kubeflow-mnist --scope -- mount -t nfs xxx.xxx.xx.218:/data/nfs/mnist /var/lib/kubelet/pods/7b62b7c1-8745-4b53-9895-77dbab9d1715/volumes/kubernetes.io~nfs/kubeflow-mnist
Output: Running scope as unit run-8571.scope.
mount.nfs: mounting xxx.xxx.xx.218:/data/nfs/mnist failed, reason given by server: No such file or directory
```
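A "reason given by server: No such file or directory" usually means the server rejected the path: the directory is either missing on the NFS server or not exported. A quick check, assuming `showmount` (from nfs-utils) is available:

```bash
# List what the server from the error message actually exports
showmount -e xxx.xxx.xx.218
# On the server itself, confirm the directory really exists
ls -ld /data/nfs/mnist
```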


Troubleshooting steps

  • Confirm the exported path actually exists on the NFS server and that the PV/PVC definitions are correct (see the kubectl sketch after this list)

  • Find the IP of the node the pod is scheduled on:

    ```bash
    kubectl get pod <podname> -o wide -n <namespace>
    ```
  • Confirm the node can reach the NFS server:

    ```bash
    telnet <nfs-server> <port>   # NFS is typically 2049; rpcbind is 111
    ```
  • Run `dmesg` on the node to look for mount-related kernel errors

  • Try running the failing mount command from the event by hand on the node:

    ```bash
    mount -t nfs xxx.xxx.xx.218:/data/nfs/mnist /var/lib/kubelet/pods/7b62b7c1-8745-4b53-9895-77dbab9d1715/volumes/kubernetes.io~nfs/kubeflow-mnist
    ```
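A minimal sketch of the PV/PVC check from the first step, using the object names from the manifests below:

```bash
# Is the PVC Bound, and which PV does it point at?
kubectl get pvc mnist-pvc -n kserve-test
# Inspect the PV's nfs.server / nfs.path; that path must exist on the server
kubectl describe pv pvc-9ff7e03f-0ae9-4ede-aa06-2a12b53ba652
```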

For reference, the NFS-backed PV/PVC pair looked like this:

```yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pvc-9ff7e03f-0ae9-4ede-aa06-2a12b53ba652
spec:
  accessModes:
  - ReadWriteOnce
  capacity:
    storage: 2Gi
  claimRef:
    apiVersion: v1
    kind: PersistentVolumeClaim
    name: mnist-pvc
    namespace: kserve-test
  nfs:
    path: /home/share/kserve-test-mnist-pvc-pvc-9ff7e03f-0ae9-4ede-aa06-2a12b53ba652
    server: xxx.xxx.xx.215
  persistentVolumeReclaimPolicy: Delete
  storageClassName: nfs-client
  volumeMode: Filesystem
---

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: mnist-pvc
  namespace: kserve-test
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 2Gi
  storageClassName: nfs-client
  volumeMode: Filesystem
  volumeName: pvc-9ff7e03f-0ae9-4ede-aa06-2a12b53ba652
status:
  accessModes:
  - ReadWriteOnce
  capacity:
    storage: 2Gi
```
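If a statically defined PV like the one above points at a directory that no longer exists, the mount fails exactly as shown earlier; recreating the directory on the server is one fix. A hedged sketch:

```bash
# On the NFS server from the PV spec (xxx.xxx.xx.215)
mkdir -p /home/share/kserve-test-mnist-pvc-pvc-9ff7e03f-0ae9-4ede-aa06-2a12b53ba652
exportfs -v   # confirm /home/share (or a parent) is still exported
```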
The script below installs and configures an NFS server or client on CentOS from a local RPM directory (`-s` for the server, `-c` for clients):

```bash
#!/usr/bin/env bash
set -e
CWD=$(dirname "$(readlink -f "$0")")
echo $CWD
WORKER_DIR=$CWD/nfs
HOSTS=("10.81.29.85" "10.81.29.92" "10.81.28.136" "10.81.29.172" "10.81.28.201")



# Build a local yum repo from the RPMs shipped alongside the script
function pre_rpm() {
    RPM_DIR=$WORKER_DIR/nfs-PRM
    cd $RPM_DIR
    createrepo ./
    echo -e "[nfs]
name=nfs
baseurl=file://$RPM_DIR
enabled=1
gpgcheck=0" > /etc/yum.repos.d/nfs.repo
    yum -y --disablerepo="*" --enablerepo="nfs" update
}


# Server setup: create /data/nfs and export it to every host in HOSTS
function install_nfs_server() {
  pre_rpm
  systemctl stop firewalld
  systemctl disable firewalld
  mkdir -p /data/nfs
  chmod -R 755 /data/nfs
  chown -R nfsnobody:nfsnobody /data/nfs
  yum -y install nfs-utils rpcbind --disablerepo="*" --enablerepo=nfs
  nfs_str="/data/nfs"
  for host in ${HOSTS[@]}
  do
    s="$host(rw,no_root_squash,no_all_squash,sync)"
    nfs_str="$nfs_str $s"
  done
  echo $nfs_str > /etc/exports
  systemctl enable rpcbind && systemctl restart rpcbind
  systemctl enable nfs && systemctl restart nfs

}
# Client setup: install nfs-utils/rpcbind and prepare the mount point
function install_nfs_client() {
  pre_rpm
  systemctl stop firewalld
  systemctl disable firewalld
  mkdir -p /data/nfsdata
  yum -y install nfs-utils rpcbind --disablerepo="*" --enablerepo=nfs
  systemctl enable rpcbind && systemctl restart rpcbind
  systemctl enable nfs && systemctl restart nfs

}


while getopts ":sch" optname
do
    case "$optname" in
      "s")
        install_nfs_server
        ;;
      "c")
        install_nfs_client
        ;;
      "h")
        echo "get option -h,eg:./install_nfs -s nfs_client_ip [-c]"
        ;;
      ":")
        echo "No argument value for option $OPTARG"
        ;;
      "?")
        echo "Unknown option $OPTARG"
        ;;
      *)
        echo "Unknown error while processing options"
        ;;
    esac
    #echo "option index is $OPTIND"
done
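A hedged usage sketch (the file name `install_nfs.sh` is assumed):

```bash
# On the NFS server (put the offline RPMs under ./nfs/nfs-PRM first)
sudo bash install_nfs.sh -s
# On each client listed in HOSTS
sudo bash install_nfs.sh -c
```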

| Option | Meaning |
| --- | --- |
| `rw` / `ro` | Export read-write / read-only |
| `root_squash` | Requests from the client's root user are mapped to the NFS anonymous user (`nfsnobody`) on the server |
| `no_root_squash` | Requests from the client's root user stay root on the server |
| `no_all_squash` | Non-root client users keep their own identity on the exported directory (no squashing) |
| `all_squash` | All client users, root included, are mapped to the anonymous user |
| `sync` / `async` | `sync` (default): writes are committed to memory and disk before the server replies; `async`: writes may sit in memory and reach disk later |
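To see which of these options an export is actually using, `exportfs -v` on the server prints each export with its effective flags; an illustrative (abridged) example:

```bash
exportfs -v
# /data/nfs   10.81.29.85(sync,wdelay,hide,no_subtree_check,rw,secure,no_root_squash,no_all_squash)
```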

Configuration

```bash
echo '/data/nfs 10.xx.xx.92(rw,sync,all_squash) xxx.xxx.xx.93(rw,sync,all_squash)' >> /etc/exports

# Re-export the entries in /etc/exports; after editing /etc/exports there is
# no need to restart the NFS service (add -v for verbose output)
exportfs -r
```

Verification

```bash
# Run on the client
mount -t nfs 10.81.28.201:/data/nfs /data/nfsdata
# Check the mount
df -Th
# A line like `10.81.28.201:/data/nfs  nfs4  29G  7.6G  22G  27% /data/nfsdata` means success

# Create a file on the client; it should show up on the server side
cd /data/nfsdata
touch aa


# Unmount; run this from outside the mounted directory, otherwise you get
# "device is busy"
umount /data/nfsdata
```
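To make the client mount survive reboots, an `/etc/fstab` entry is one option (a sketch; `_netdev` delays the mount until the network is up):

```
# /etc/fstab entry (one line); apply with `mount -a`
10.81.28.201:/data/nfs  /data/nfsdata  nfs  defaults,_netdev  0 0
```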