Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the error of runc doesn't work with go1.22 #4193

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-20.04, ubuntu-22.04, actuated-arm64-6cpu-8gb]
go-version: [1.20.x, 1.21.x]
go-version: [1.20.x, 1.21.x, 1.22.x]
rootless: ["rootless", ""]
race: ["-race", ""]
criu: ["", "criu-dev"]
Expand Down Expand Up @@ -219,7 +219,7 @@ jobs:
- name: install go
uses: actions/setup-go@v5
with:
go-version: 1.21.x # TODO: switch to 1.x (latest stable) once Go 1.22 vs glibc issue is fixed.
go-version: 1.x # Latest stable

- name: unit test
env:
Expand Down
2 changes: 0 additions & 2 deletions contrib/completions/bash/runc
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,6 @@ _runc_run() {
--help
--detatch
-d
--no-subreaper
--no-pivot
--no-new-keyring
"
Expand Down Expand Up @@ -623,7 +622,6 @@ _runc_restore() {
--file-locks
--detach
-d
--no-subreaper
--no-pivot
--auto-dedup
--lazy-pages
Expand Down
21 changes: 10 additions & 11 deletions exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,17 +181,16 @@ func execProcess(context *cli.Context) (int, error) {
}

r := &runner{
enableSubreaper: false,
shouldDestroy: false,
container: container,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
action: CT_ACT_RUN,
init: false,
preserveFDs: context.Int("preserve-fds"),
subCgroupPaths: cgPaths,
shouldDestroy: false,
container: container,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
action: CT_ACT_RUN,
init: false,
preserveFDs: context.Int("preserve-fds"),
subCgroupPaths: cgPaths,
}
return r.run(p)
}
Expand Down
6 changes: 6 additions & 0 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,12 @@ func (c *Container) start(process *Process) (retErr error) {
if err := utils.CloseExecFrom(3); err != nil {
return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
}

// Set us as the subreaper to let the grandchild process reparent to us.
if err := system.SetSubreaper(1); err != nil {
return fmt.Errorf("unable to set subreaper: %w", err)
}

if err := parent.start(); err != nil {
return fmt.Errorf("unable to start container process: %w", err)
}
Expand Down
15 changes: 0 additions & 15 deletions libcontainer/nsenter/nsenter_go122.go

This file was deleted.

66 changes: 14 additions & 52 deletions libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,20 +52,6 @@ enum sync_t {
/* Stores the current stage of nsexec. */
int current_stage = STAGE_SETUP;

/* Assume the stack grows down, so arguments should be above it. */
struct clone_t {
/*
* Reserve some space for clone() to locate arguments
* and retcode in this place
*/
char stack[4096] __attribute__((aligned(16)));
char stack_ptr[0];

/* There's two children. This is used to execute the different code. */
jmp_buf *env;
int jmpval;
};

struct nlconfig_t {
char *data;

Expand Down Expand Up @@ -303,23 +289,15 @@ static void update_oom_score_adj(char *data, size_t len)
bail("failed to update /proc/self/oom_score_adj");
}

/* A dummy function that just jumps to the given jumpval. */
static int child_func(void *arg) __attribute__((noinline));
static int child_func(void *arg)
static int fork_and_run(jmp_buf *env, int jmpval)
{
struct clone_t *ca = (struct clone_t *)arg;
longjmp(*ca->env, ca->jmpval);
}

static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
struct clone_t ca = {
.env = env,
.jmpval = jmpval,
};

return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
pid_t pid = fork();
if (pid < 0)
bail("failed to fork");
if (pid > 0)
return pid;
/* Jump back to the big switch in nsexec... */
longjmp(*env, jmpval);
}

/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
Expand Down Expand Up @@ -644,7 +622,7 @@ void nsexec(void)
*
* Unfortunately, it's not as simple as that. We have to fork to enter the
* PID namespace (the PID namespace only applies to children). Since we'll
* have to double-fork, this clone_parent() call won't be able to get the
* have to double-fork, this fork() call won't be able to get the
* PID of the _actual_ init process (without doing more synchronisation than
* I can deal with at the moment). So we'll just get the parent to send it
* for us, the only job of this process is to update
Expand All @@ -655,15 +633,6 @@ void nsexec(void)
* will be in that namespace (and it will not be able to give us a PID value
* that makes sense without resorting to sending things with cmsg).
*
* This also deals with an older issue caused by dumping cloneflags into
* clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
* we have to unshare(2) before clone(2) in order to do this. This was fixed
* in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
* introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
* aware, the last mainline kernel which had this bug was Linux 3.12.
* However, we cannot comment on which kernels the broken patch was
* backported to.
*
* -- Aleksa "what has my life come to?" Sarai
*/

Expand All @@ -687,7 +656,7 @@ void nsexec(void)

/* Start the process of getting a container. */
write_log(DEBUG, "spawn stage-1");
stage1_pid = clone_parent(&env, STAGE_CHILD);
stage1_pid = fork_and_run(&env, STAGE_CHILD);
if (stage1_pid < 0)
bail("unable to spawn stage-1");

Expand Down Expand Up @@ -755,9 +724,7 @@ void nsexec(void)
/*
* Send both the stage-1 and stage-2 pids back to runc.
* runc needs the stage-2 to continue process management,
* but because stage-1 was spawned with CLONE_PARENT we
* cannot reap it within stage-0 and thus we need to ask
* runc to reap the zombie for us.
* and ask runc to reap the zombie for us.
*/
write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
stage1_pid, stage2_pid);
Expand Down Expand Up @@ -892,9 +859,8 @@ void nsexec(void)
}

/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal stage-0 to do the mapping for
* us.
* We don't have the privileges to do any mapping here.
* So signal stage-0 to do the mapping for us.
*/
write_log(DEBUG, "request stage-0 to map user namespace");
s = SYNC_USERMAP_PLS;
Expand Down Expand Up @@ -925,10 +891,6 @@ void nsexec(void)
* ordering might break in the future (especially with rootless
* containers). But for now, it's not possible to split this into
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
*
* Note that we don't merge this with clone() because there were
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
try_unshare(config.cloneflags, "remaining namespaces");

Expand All @@ -955,7 +917,7 @@ void nsexec(void)
* to actually enter the new PID namespace.
*/
write_log(DEBUG, "spawn stage-2");
stage2_pid = clone_parent(&env, STAGE_INIT);
stage2_pid = fork_and_run(&env, STAGE_INIT);
if (stage2_pid < 0)
bail("unable to spawn stage-2");

Expand Down
3 changes: 0 additions & 3 deletions man/runc-restore.8.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,6 @@ image files directory.
**--pid-file** _path_
: Specify the file to write the initial container process' PID to.

**--no-subreaper**
: Disable the use of the subreaper used to reap reparented processes.

**--no-pivot**
: Do not use pivot root to jail process inside rootfs. This should not be used
except in exceptional circumstances, and may be unsafe from the security
Expand Down
3 changes: 0 additions & 3 deletions man/runc-run.8.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@ referencing the master end of the console's pseudoterminal. See
**--pid-file** _path_
: Specify the file to write the initial container process' PID to.

**--no-subreaper**
: Disable the use of the subreaper used to reap reparented processes.

**--no-pivot**
: Do not use pivot root to jail process inside rootfs. This should not be used
except in exceptional circumstances, and may be unsafe from the security
Expand Down
2 changes: 1 addition & 1 deletion restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ using the runc checkpoint command.`,
},
cli.BoolFlag{
Name: "no-subreaper",
Usage: "disable the use of the subreaper used to reap reparented processes",
Usage: "(ignored) disable the use of the subreaper used to reap reparented processes. This flag has been ignored by runc, and will be removed in 1.3.0",
},
cli.BoolFlag{
Name: "no-pivot",
Expand Down
2 changes: 1 addition & 1 deletion run.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ command(s) that get executed on start, edit the args parameter of the spec. See
},
cli.BoolFlag{
Name: "no-subreaper",
Usage: "disable the use of the subreaper used to reap reparented processes",
Usage: "(ignored) disable the use of the subreaper used to reap reparented processes. This flag has been ignored by runc, and will be removed in 1.3.0",
},
cli.BoolFlag{
Name: "no-pivot",
Expand Down
9 changes: 1 addition & 8 deletions signals.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"os/signal"

"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"

"github.com/sirupsen/logrus"
Expand All @@ -18,13 +17,7 @@ const signalBufferSize = 2048
// while still forwarding all other signals to the process.
// If notifySocket is present, use it to read systemd notifications from the container and
// forward them to notifySocketHost.
func newSignalHandler(enableSubreaper bool, notifySocket *notifySocket) *signalHandler {
if enableSubreaper {
// set us as the subreaper before registering the signal handler for the container
if err := system.SetSubreaper(1); err != nil {
logrus.Warn(err)
}
}
func newSignalHandler(notifySocket *notifySocket) *signalHandler {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As @kolyshkin pointed out in #4278 (comment)
This PR has done such things. But there is still one thing I can't determine whether we should do or not:

If we use PR_SET_CHILD_SUBREAPER and fork(2) to replace clone(CLONE_PARENT), I think we should move this signal handler to libcontainer. Or else someone use libcontainer directly in the code will have to write a signal handler by themselves.

// ensure that we have a large buffer size so that we do not miss any signals
// in case we are not processing them fast enough.
s := make(chan os.Signal, signalBufferSize)
Expand Down
55 changes: 27 additions & 28 deletions utils_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,20 +190,19 @@ func createContainer(context *cli.Context, id string, spec *specs.Spec) (*libcon
}

type runner struct {
init bool
enableSubreaper bool
shouldDestroy bool
detach bool
listenFDs []*os.File
preserveFDs int
pidFile string
consoleSocket string
pidfdSocket string
container *libcontainer.Container
action CtAct
notifySocket *notifySocket
criuOpts *libcontainer.CriuOpts
subCgroupPaths map[string]string
init bool
shouldDestroy bool
detach bool
listenFDs []*os.File
preserveFDs int
pidFile string
consoleSocket string
pidfdSocket string
container *libcontainer.Container
action CtAct
notifySocket *notifySocket
criuOpts *libcontainer.CriuOpts
subCgroupPaths map[string]string
}

func (r *runner) run(config *specs.Process) (int, error) {
Expand Down Expand Up @@ -247,10 +246,11 @@ func (r *runner) run(config *specs.Process) (int, error) {
return -1, err
}
detach := r.detach || (r.action == CT_ACT_CREATE)

// Setting up IO is a two stage process. We need to modify process to deal
// with detaching containers, and then we get a tty after the container has
// started.
handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
handler := newSignalHandler(r.notifySocket)
tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
if err != nil {
return -1, err
Expand Down Expand Up @@ -396,19 +396,18 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C
}

r := &runner{
enableSubreaper: !context.Bool("no-subreaper"),
shouldDestroy: !context.Bool("keep"),
container: container,
listenFDs: listenFDs,
notifySocket: notifySocket,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
preserveFDs: context.Int("preserve-fds"),
action: action,
criuOpts: criuOpts,
init: true,
shouldDestroy: !context.Bool("keep"),
container: container,
listenFDs: listenFDs,
notifySocket: notifySocket,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
preserveFDs: context.Int("preserve-fds"),
action: action,
criuOpts: criuOpts,
init: true,
}
return r.run(spec.Process)
}
Expand Down