/*
 * newns -- A tool to create and fiddle with namespaces
 * Copyright (C) 2012 Andrew Lutomirski
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

#define _GNU_SOURCE

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/prctl.h>
#include <sys/mman.h>
#include <sched.h>

/*
 * prctl(2) option numbers for the no_new_privs flag.  They are spelled
 * out here presumably because some system headers predate them --
 * TODO confirm.  Only PR_SET_NO_NEW_PRIVS is used in this file.
 */
#define PR_SET_NO_NEW_PRIVS 36
#define PR_GET_NO_NEW_PRIVS 37

/* Name this program was invoked as; set by main() and printed by usage(). */
static const char *argv0;

/*
 * Print the command-line synopsis and option summary to stderr, then
 * terminate the process with exit status 1.  This function does not
 * return.
 *
 * Declared with an explicit (void) parameter list so the compiler
 * rejects accidental calls that pass arguments.
 */
static void usage(void)
{
	fprintf(stderr, "Usage: %s [OPTS...] PROGRAM [ARGS]...\n"
		"Run PROGRAM in a new namespace.\n"
		"\n"
		"  --nnp           Set no_new_privs (may be required unless privileged)\n"
		"  --ipc           Create a new IPC namespace\n"
		"  --uts           Create a new UTS namespace\n"
		"  --mount         Create a new mount namespace\n"
		"  --pid           Create a new pid namespace\n"
		"  --init          Create a new pid namespace and act as init\n"
		"  --net           Create a new network namespace\n"
		"  --kill          Try to kill child when killed\n"
		"  --copy-ret      Copy child return code (otherwise return 0 if exec succeeds)\n",
		argv0);
	exit(1);
}

/* CLONE_NEW* namespace flags accumulated from the command line and passed to clone(). */
static int clone_flags = 0;
/* --init: the cloned child forks the program and stays behind to reap children. */
static bool act_as_init = false;
/* --nnp: set PR_SET_NO_NEW_PRIVS in this process before cloning. */
static bool nnp = false;
/* --kill: the child asks for SIGKILL via PR_SET_PDEATHSIG. */
static bool set_pdeathsig = false;
/* --copy-ret: exit with the child's status instead of 0. */
static bool copy_ret = false;

/*
 * Cursor over the command line: number of arguments not yet consumed and
 * a pointer to the next one.  Once option parsing is done, nextarg is the
 * argv-style vector (program name first) handed to execvp().
 */
static int argsleft;
static char **nextarg;

/*
 * Failure report passed from the cloned child back to the parent.  main()
 * points exec_result at a MAP_SHARED | MAP_ANONYMOUS mapping before calling
 * clone(), so writes made by the child in clone_fn() are visible to the
 * parent.
 *
 * Subtlety here: we need this even in the act_as_init case -- this is
 * how we exit with code 1 when init fails to exec the command.
 */
static struct exec_result {
	bool exec_failed;	/* set by the child when setup, wait or exec fails */
	int exec_errno;		/* errno recorded on exec failure; not read in this file */
} *exec_result;

/*
 * Entry point of the process created by clone() in main().
 *
 * Optionally arranges for SIGKILL on parent death (--kill), optionally
 * forks and stays behind as a reaper (--init), and finally execs the
 * program described by nextarg.  Every failure is reported on stderr and
 * recorded in the shared exec_result before exiting with status 1.
 *
 * arg is unused; it exists only to match the callback signature that
 * clone() expects.
 *
 * This function never returns: every path ends in execvp() or _exit().
 */
int clone_fn(void *arg)
{
	(void)arg;

	if (set_pdeathsig) {
		if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0) != 0) {
			perror("PR_SET_PDEATHSIG");
			exec_result->exec_failed = true;
			_exit(1);
		}
	}

	if (act_as_init) {
		pid_t pid = fork();

		if (pid == -1) {
			/*
			 * Without this check a failed fork would fall into the
			 * wait loop, see ECHILD at once and exit 0 as though
			 * the program had run successfully.
			 */
			perror("[namespace init] fork");
			exec_result->exec_failed = true;
			_exit(1);
		}

		if (pid != 0) {
			/* We stay behind as the reaper; the forked child execs. */
			prctl(PR_SET_NAME, "[newns init]", 0, 0, 0);

			/* TODO: Detect and handle exec failure. */

			/* Reap children until wait() fails for a reason other than EINTR. */
			while (wait(NULL) != -1 || errno == EINTR)
				;

			if (errno == ECHILD)
				_exit(0);  /* All children are gone. */

			perror("[namespace init] wait");
			exec_result->exec_failed = true;
			_exit(1);  /* Well, crud. */
		}

		/* pid == 0: the forked child carries on to exec the program. */
	}

	execvp(nextarg[0], nextarg);

	/*
	 * execvp() only returns on failure.  Save errno first: perror() is
	 * allowed to modify it, and the value is stored for the parent.
	 */
	int saved_errno = errno;

	perror(nextarg[0]);
	exec_result->exec_failed = true;
	exec_result->exec_errno = saved_errno;
	_exit(1);
}

/*
 * Consume the leading options of the command line and record them in the
 * file-level option variables.  On return, nextarg/argsleft describe what
 * follows the options (the program and its arguments).
 *
 * Parsing stops at the first argument that does not start with '-', or
 * right after a "--" terminator.  The terminator itself is consumed so
 * that it is never mistaken for the program name.  An unknown option
 * prints a message and exits through usage().
 */
static void parse_options(int argc, char **argv)
{
	argsleft = argc - 1;
	nextarg = argv + 1;

	while (argsleft && nextarg[0][0] == '-') {
		const char *opt = *nextarg;

		/* Every option, including "--", is consumed. */
		--argsleft;
		++nextarg;

		if (!strcmp(opt, "--")) {
			break;  /* end of options */
		} else if (!strcmp(opt, "--nnp")) {
			nnp = true;
		} else if (!strcmp(opt, "--ipc")) {
			clone_flags |= CLONE_NEWIPC;
		} else if (!strcmp(opt, "--uts")) {
			clone_flags |= CLONE_NEWUTS;
		} else if (!strcmp(opt, "--mount")) {
			clone_flags |= CLONE_NEWNS;
		} else if (!strcmp(opt, "--pid")) {
			clone_flags |= CLONE_NEWPID;
		} else if (!strcmp(opt, "--init")) {
			clone_flags |= CLONE_NEWPID;
			act_as_init = true;
		} else if (!strcmp(opt, "--net")) {
			clone_flags |= CLONE_NEWNET;
		} else if (!strcmp(opt, "--kill")) {
			set_pdeathsig = true;
		} else if (!strcmp(opt, "--copy-ret")) {
			copy_ret = true;
		} else {
			fprintf(stderr, "Unknown option %s\n", opt);
			usage();
		}
	}
}

/*
 * Wait until the process with id child_pid has been reaped and return the
 * exit code this program should finish with:
 *
 *   2                 wait() failed for a reason other than EINTR
 *   1                 --init was given and the child flagged a failure
 *   0                 --copy-ret was not given
 *   128 + signal      --copy-ret and the child was killed by a signal
 *   child's status    --copy-ret and the child exited normally
 *
 * Other children that happen to be reaped are ignored.
 */
static int wait_for_child(pid_t child_pid)
{
	for (;;) {
		int status;
		pid_t reaped_pid = wait(&status);

		if (reaped_pid == -1) {
			if (errno != EINTR) {
				perror("wait");
				return 2;
			}
			continue;
		}

		if (reaped_pid != child_pid)
			continue;

		/* exec_result is still mapped here only in the --init case. */
		if (act_as_init && exec_result->exec_failed)
			return 1;

		if (!copy_ret)
			return 0;

		/*
		 * WEXITSTATUS() is only meaningful for a normal exit.  Report
		 * death by signal the way shells do instead of as success.
		 */
		if (WIFSIGNALED(status))
			return 128 + WTERMSIG(status);

		return WEXITSTATUS(status);
	}
}

/*
 * Program entry point: parse the options, optionally set no_new_privs,
 * clone a child with the requested namespace flags, and wait for it.
 *
 * Exit status: 1 on usage errors, setup failures or exec failure; 2 if
 * waiting for the child fails; otherwise as described for
 * wait_for_child().
 */
int main(int argc, char **argv)
{
	/* argv[0] may be NULL when argc is 0; keep usage() printable. */
	argv0 = (argc > 0 && argv[0]) ? argv[0] : "newns";

	if (argc < 2)
		usage();

	parse_options(argc, argv);

	if (argsleft == 0) {
		fprintf(stderr, "No program specified.\n");
		usage();
	}

	if (copy_ret && act_as_init) {
		fprintf(stderr, "--init --copy-ret is not supported\n");
		usage();
	}

	if (nnp && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
		perror("PR_SET_NO_NEW_PRIVS");
		return 1;
	}

	/* Shared page through which the child reports failures back to us. */
	exec_result = mmap(NULL, sizeof(*exec_result),
			   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
			   -1, 0);
	if (exec_result == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Force the default SIGCHLD disposition; presumably this guards
	 * against inheriting SIG_IGN, which would stop wait() from
	 * reporting the child -- TODO confirm.
	 */
	struct sigaction sa_chld;
	memset(&sa_chld, 0, sizeof(sa_chld));
	sa_chld.sa_handler = SIG_DFL;
	if (sigaction(SIGCHLD, &sa_chld, NULL) != 0) {
		perror("sigaction");
		return 1;
	}

	/* TODO: With --kill, set up as many signal handlers as possible
	   to try to kill child in case child unsets pdeathsig */

	/*
	 * Outside --init mode, CLONE_VFORK suspends us until the child has
	 * exec'd or exited, so exec_result can be inspected right after
	 * clone() returns.
	 */
	if (!act_as_init)
		clone_flags |= CLONE_VFORK;

	/*
	 * Child stack can overlap parent stack, since CLONE_VM is not set.
	 *
	 * NOTE(review): this passes the address of a one-byte local rather
	 * than the top of a dedicated stack region, and relies on the child
	 * having its own copy of the address space.  Revisit if the child
	 * ever misbehaves early on.
	 */
	char child_stack[1];
	pid_t child_pid = clone(clone_fn, child_stack,
				clone_flags | SIGCHLD,
				NULL);
	if (child_pid == -1) {
		perror("clone");
		return 1;
	}

	if (!act_as_init) {
		/*
		 * At this point, the child has either failed or completed
		 * its execve.
		 */
		if (exec_result->exec_failed)
			return 1;

		/* Might as well free the extra page. */
		munmap(exec_result, sizeof(*exec_result));
	}

	return wait_for_child(child_pid);
}
