使用者命名空間：如何僅為給定程序安裝文件夾

May 20, 2022

我想在沒有 root 訪問權限的非 FHS 系統 (NixOs) 上偽造 FHS 系統。為此，我需要使用使用者名空間（我沒有看到任何其他解決方案）在根目錄下掛載一些文件夾（比如掛載/tmp/mylib到/lib）。

不幸的是，我找不到如何使它工作：我嘗試按照本教程進行操作，但是當我複制程式碼時它失敗了（我什至無法啟動 bash）：

$ gcc userns_child_exec.c -lcap -o userns_child_exec
$ id
uid=1000(myname) gid=100(users) groups=100(users),1(wheel),17(audio),20(lp),57(networkmanager),59(scanner),131(docker),998(vboxusers),999(adbusers)

$ ./userns_child_exec -U -M '0 1000 1' -G '0 100 1' bash
write /proc/535313/gid_map: Operation not permitted
bash: initialize_job_control: no job control in background: Bad file descriptor

[nix-shell:~/Documents/Logiciels/Nix_bidouille/2022_04_26_-_nix_fake_FHS_user_namespace/demo]$ 
[root@bestos:~/Documents/Logiciels/Nix_bidouille/2022_04_26_-_nix_fake_FHS_user_namespace/demo]# 
exit

（注意會顯示 bash 的提示，但是後來我什麼也輸入不了，直接退出了）

知道如何使它工作嗎？

程式碼：

/* userns_child_exec.c

  Copyright 2013, Michael Kerrisk
  Licensed under GNU General Public License v2 or later

  Create a child process that executes a shell command in new
  namespace(s); allow UID and GID mappings to be specified when
  creating a user namespace.
*/
#define _GNU_SOURCE
#include &lt;sched.h&gt;
#include &lt;unistd.h&gt;
#include &lt;stdlib.h&gt;
#include &lt;sys/wait.h&gt;
#include &lt;signal.h&gt;
#include &lt;fcntl.h&gt;
#include &lt;stdio.h&gt;
#include &lt;string.h&gt;
#include &lt;limits.h&gt;
#include &lt;errno.h&gt;

/* A simple error-handling function: print an error message based
  on the value in 'errno' and terminate the calling process */

#define errExit(msg)    do { perror(msg); exit(EXIT_FAILURE); \
                       } while (0)

struct child_args {
   char **argv;        /* Command to be executed by child, with arguments */
   int    pipe_fd[2];  /* Pipe used to synchronize parent and child */
};

static int verbose;

static void
usage(char *pname)
{
   fprintf(stderr, "Usage: %s [options] cmd [arg...]\n\n", pname);
   fprintf(stderr, "Create a child process that executes a shell command "
           "in a new user namespace,\n"
           "and possibly also other new namespace(s).\n\n");
   fprintf(stderr, "Options can be:\n\n");
#define fpe(str) fprintf(stderr, "    %s", str);
   fpe("-i          New IPC namespace\n");
   fpe("-m          New mount namespace\n");
   fpe("-n          New network namespace\n");
   fpe("-p          New PID namespace\n");
   fpe("-u          New UTS namespace\n");
   fpe("-U          New user namespace\n");
   fpe("-M uid_map  Specify UID map for user namespace\n");
   fpe("-G gid_map  Specify GID map for user namespace\n");
   fpe("            If -M or -G is specified, -U is required\n");
   fpe("-v          Display verbose messages\n");
   fpe("\n");
   fpe("Map strings for -M and -G consist of records of the form:\n");
   fpe("\n");
   fpe("    ID-inside-ns   ID-outside-ns   len\n");
   fpe("\n");
   fpe("A map string can contain multiple records, separated by commas;\n");
   fpe("the commas are replaced by newlines before writing to map files.\n");

   exit(EXIT_FAILURE);
}

/* Update the mapping file 'map_file', with the value provided in
  'mapping', a string that defines a UID or GID mapping. A UID or
  GID mapping consists of one or more newline-delimited records
  of the form:

      ID_inside-ns    ID-outside-ns   length

  Requiring the user to supply a string that contains newlines is
  of course inconvenient for command-line use. Thus, we permit the
  use of commas to delimit records in this string, and replace them
  with newlines before writing the string to the file. */

static void
update_map(char *mapping, char *map_file)
{
   int fd, j;
   size_t map_len;     /* Length of 'mapping' */

   /* Replace commas in mapping string with newlines */

   map_len = strlen(mapping);
   for (j = 0; j &lt; map_len; j++)
       if (mapping[j] == ',')
           mapping[j] = '\n';

   fd = open(map_file, O_RDWR);
   if (fd == -1) {
       fprintf(stderr, "open %s: %s\n", map_file, strerror(errno));
       exit(EXIT_FAILURE);
   }

   if (write(fd, mapping, map_len) != map_len) {
       fprintf(stderr, "write %s: %s\n", map_file, strerror(errno));
       exit(EXIT_FAILURE);
   }

   close(fd);
}

static int              /* Start function for cloned child */
childFunc(void *arg)
{
   struct child_args *args = (struct child_args *) arg;
   char ch;

   /* Wait until the parent has updated the UID and GID mappings. See
      the comment in main(). We wait for end of file on a pipe that will
      be closed by the parent process once it has updated the mappings. */

   close(args-&gt;pipe_fd[1]);    /* Close our descriptor for the write end
                                  of the pipe so that we see EOF when
                                  parent closes its descriptor */
   if (read(args-&gt;pipe_fd[0], &ch, 1) != 0) {
       fprintf(stderr, "Failure in child: read from pipe returned != 0\n");
       exit(EXIT_FAILURE);
   }

   /* Execute a shell command */

   execvp(args-&gt;argv[0], args-&gt;argv);
   errExit("execvp");
}

#define STACK_SIZE (1024 * 1024)

static char child_stack[STACK_SIZE];    /* Space for child's stack */

int
main(int argc, char *argv[])
{
   int flags, opt;
   pid_t child_pid;
   struct child_args args;
   char *uid_map, *gid_map;
   char map_path[PATH_MAX];

   /* Parse command-line options. The initial '+' character in
      the final getopt() argument prevents GNU-style permutation
      of command-line options. That's useful, since sometimes
      the 'command' to be executed by this program itself
      has command-line options. We don't want getopt() to treat
      those as options to this program. */

   flags = 0;
   verbose = 0;
   gid_map = NULL;
   uid_map = NULL;
   while ((opt = getopt(argc, argv, "+imnpuUM:G:v")) != -1) {
       switch (opt) {
       case 'i': flags |= CLONE_NEWIPC;        break;
       case 'm': flags |= CLONE_NEWNS;         break;
       case 'n': flags |= CLONE_NEWNET;        break;
       case 'p': flags |= CLONE_NEWPID;        break;
       case 'u': flags |= CLONE_NEWUTS;        break;
       case 'v': verbose = 1;                  break;
       case 'M': uid_map = optarg;             break;
       case 'G': gid_map = optarg;             break;
       case 'U': flags |= CLONE_NEWUSER;       break;
       default:  usage(argv[0]);
       }
   }

   /* -M or -G without -U is nonsensical */

   if ((uid_map != NULL || gid_map != NULL) &&
           !(flags & CLONE_NEWUSER))
       usage(argv[0]);

   args.argv = &argv[optind];

   /* We use a pipe to synchronize the parent and child, in order to
      ensure that the parent sets the UID and GID maps before the child
      calls execve(). This ensures that the child maintains its
      capabilities during the execve() in the common case where we
      want to map the child's effective user ID to 0 in the new user
      namespace. Without this synchronization, the child would lose
      its capabilities if it performed an execve() with nonzero
      user IDs (see the capabilities(7) man page for details of the
      transformation of a process's capabilities during execve()). */

   if (pipe(args.pipe_fd) == -1)
       errExit("pipe");

   /* Create the child in new namespace(s) */

   child_pid = clone(childFunc, child_stack + STACK_SIZE,
                     flags | SIGCHLD, &args);
   if (child_pid == -1)
       errExit("clone");

   /* Parent falls through to here */

   if (verbose)
       printf("%s: PID of child created by clone() is %ld\n",
               argv[0], (long) child_pid);

   /* Update the UID and GID maps in the child */

   if (uid_map != NULL) {
       snprintf(map_path, PATH_MAX, "/proc/%ld/uid_map",
               (long) child_pid);
       update_map(uid_map, map_path);
   }
   if (gid_map != NULL) {
       snprintf(map_path, PATH_MAX, "/proc/%ld/gid_map",
               (long) child_pid);
       update_map(gid_map, map_path);
   }

   /* Close the write end of the pipe, to signal to the child that we
      have updated the UID and GID maps */

   close(args.pipe_fd[1]);

   if (waitpid(child_pid, NULL, 0) == -1)      /* Wait for child */
       errExit("waitpid");

   if (verbose)
       printf("%s: terminating\n", argv[0]);

   exit(EXIT_SUCCESS);
}

編輯

實際上，這很奇怪：編寫組時出現錯誤，但它確實適用於 uid：

[leo@bestos:~]$ cat /proc/582197/gid_map 

[leo@bestos:~]$ cat /proc/582197/uid_map 
        0       1000          1

[leo@bestos:~]$ ll /proc/582197/gid_map 
-rw-r--r-- 1 leo users 0 mai   18 09:09 /proc/582197/gid_map

[leo@bestos:~]$ ll /proc/582197/uid_map 
-rw-r--r-- 1 leo users 0 mai   18 09:09 /proc/582197/uid_map

您正在閱讀的教程創建於 2013 年，之後在 2015 年為處理核心 3.19 中的GID映射添加了一個重要的附加限制。根據man user_namespaces：
將“拒絕”寫入/ proc/$$ pid $$寫入 /proc/之前的/setgroups文件*$$ pid $$/gid_map* 將永久禁用使用者命名空間中的 setgroups(2) 並允許寫入 /proc/$$ pid $$/gid_map 在父使用者命名空間中沒有 CAP_SETGID 功能。
/ proc/$$ pid $$/setgroups文件是在 Linux 3.19 中添加的，但由於它解決了安全問題，因此被向後移植到許多早期的穩定核心系列。該問題涉及具有“rwx—rwx”等權限的文件。此類文件授予“組”的權限少於授予“其他”的權限。這意味著使用 setgroups(2) 刪除組可能會允許它以前沒有的程序文件訪問權限。在使用者命名空間存在之前，這不是問題
$$ … $$然後，這允許以前沒有特權的使用者刪除組，從而獲得他們以前沒有的文件訪問權限。$$ … $$
因此，您必須添加程式碼以將單詞寫入deny名稱snprintf(map_path, PATH_MAX, "/proc/%ld/setgroups", (long) child_pid);正確的文件，然後再寫入gid_map.
整個程式碼可以用這個無處不在的命令替換：
unshare --user --map-root-user --mount -- bash
（其中有一個隱含的--setgroups=deny）
同樣沒有權限只能映射一個 uid/gid。因此，一旦安裝完成，唯一可能的選擇是模擬原始使用者，儘管不完全是映射回原始使用者，這可以通過使用unshare剛剛未共享的第二個級聯使用者命名空間的 too 的最新版本來完成：
# unshare --user --map-user=1000 --map-group=100 -- bash
那麼這個命名空間中就會有一個 uid。甚至 root 不再存在（並且被視為映射nobody為任何其他未映射的 uid）。
筆記
與其他命名空間和功能還有其他互動，這是一個範例：
CAP_SYS_ADMIN在擁有程序的 PID 命名空間的使用者命名空間中保持允許（從 Linux 3.8 開始）該程序掛載*/proc* 文件系統。
因此，添加--pid --fork以遵守上述限制允許在/proc以後需要時安裝現有的，但通常只有在首先使用時才需要--pid（這也可以通過添加方便地完成--mount-proc）。
同樣--net需要掛載/sys，因為它與網路命名空間的互動。
將所有這些放在一起以替換為OP 範例/lib的內容：/tmp/o
unshare --user --map-root-user --mount -- \
   sh -c 'mount --bind /tmp/o /lib; exec unshare --user --map-user=1000 --map-group=100 -- bash'
注意：一旦完成第一次映射，就不可能再正確使用大多數特權命令：使用者命名空間中存在可用的單個 UID 0，或者下一個（嵌套）使用者命名空間中可用的單個 UID 1000。由於特權命令處理兩個 UID（其中一個通常是 root）和一個不可用之間的轉換，它通常會在某些帶有 EINVAL 的系統呼叫中失敗。
為了做得更好，首先需要特權命令的幫助，以及 root 訪問權限來配置額外的權限。例如 setuid root 命令newuidmap，newgidmap通常需要這些命令從沒有特權的使用者那裡引導一個完整的容器。

引用自：https://unix.stackexchange.com/questions/702980

使用者命名空間：如何僅為給定程序安裝文件夾

相關問答

為什麼我不能在使用者命名空間中綁定掛載“/”？

性能測試出錯，dd 命令在 /dev/mapper/device 上創建了 13TB 的數據。為什麼系統沒有崩潰？硬碟-250GB

tmpfs 上的 tmp：fstab 與 tmp.mount 與 systemd

根本無法更改已安裝硬碟的權限

掛載 SMB 共享而不在遠端路徑中指定共享目錄

/etc/mtab 的歷史是什麼？它更新了什麼？