diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..c8a8d32e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,62 @@
+# Convenience targets for building and running PhOS inside a Docker container.
+
+# Directory make runs in; bind-mounted at /root in every container below.
+SRC_DIR := $(CURDIR)
+
+IMAGE_NAME := phos-base-113
+DOCKERFILE := $(SRC_DIR)/dockerfiles/build_113.Dockerfile
+
+# Arguments forwarded to scripts/build_scripts/build.sh by the build target.
+BUILD_ARGS ?= -i -3 -u -p=false
+# Command launched by the client-run target.
+CLIENT_RUN_CMD ?= "python"
+# Value of the "proxy" build argument passed to docker build; override as needed.
+PROXY ?= http://ipads:ipads123@127.0.0.1:11235
+# Value of CUDA_VISIBLE_DEVICES for the daemon started by server-run.
+SERVER_GPUS ?= 2
+
+# Options shared by every docker run invocation below.
+DOCKER_RUN_OPTS := --rm --gpus all \
+    -v $(SRC_DIR):/root \
+    --privileged --network=host --ipc=host
+
+# None of these targets produce a file of the same name.
+.PHONY: build-image build server-run client-run clean exec
+
+# Build the Docker image from DOCKERFILE and tag it IMAGE_NAME.
+build-image:
+	docker build \
+		--build-arg proxy=$(PROXY) \
+		--progress=plain -f $(DOCKERFILE) -t $(IMAGE_NAME) .
+
+# Run build.sh with BUILD_ARGS inside the container.
+build:
+	docker run $(DOCKER_RUN_OPTS) \
+		$(IMAGE_NAME) \
+		bash -c "cd /root/scripts/build_scripts/ && bash build.sh $(BUILD_ARGS)"
+
+# Start the PhOS daemon via pos_cli in an interactive container.
+server-run:
+	docker run $(DOCKER_RUN_OPTS) -it \
+		$(IMAGE_NAME) \
+		bash -c "CUDA_VISIBLE_DEVICES=$(SERVER_GPUS) pos_cli --start --target daemon"
+
+# Run CLIENT_RUN_CMD with libxpuclient.so preloaded.
+# The dollar signs are doubled for make and backslash-escaped for the host
+# shell, so both path variables are expanded by the shell in the container.
+client-run:
+	docker run $(DOCKER_RUN_OPTS) \
+		$(IMAGE_NAME) \
+		bash -c "cd /root && export LD_LIBRARY_PATH=/root/lib:\$$LD_LIBRARY_PATH && export LIBRARY_PATH=/root/lib:\$$LIBRARY_PATH && LD_PRELOAD=/root/lib/libxpuclient.so RUST_LOG=error $(CLIENT_RUN_CMD)"
+
+# Run build.sh -c -3 inside the container.
+clean:
+	docker run $(DOCKER_RUN_OPTS) \
+		$(IMAGE_NAME) \
+		bash -c "cd /root/scripts/build_scripts/ && bash build.sh -c -3"
+
+# Open an interactive bash shell in the container.
+exec:
+	docker run $(DOCKER_RUN_OPTS) -it \
+		$(IMAGE_NAME) \
+		bash
diff --git a/README.md b/README.md
index 6c07c581..4cc434ee 100755
--- a/README.md
+++ b/README.md
@@ -63,121 +63,10 @@
-## I. Build and Install PhOS +## I. Quick build -### 💡 Option 1: Build and Install From Source - -1. **[Clone Repository]** - First of all, clone this repository **recursively**: - - ```bash - git clone --recursive https://github.com/SJTU-IPADS/PhoenixOS.git - ``` - -2. **[Start Container]** - PhOS can be built and installed on official vendor image. - - > NOTE: PhOS require libc6 >= 2.29 for compiling CRIU from source. - - For example, for running PhOS for CUDA 11.3, - one can build on official CUDA images - (e.g., [`nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04`](https://hub.docker.com/layers/nvidia/cuda/11.3.1-cudnn8-devel-ubuntu20.04/images/sha256-459c130c94363099b02706b9b25d9fe5822ea233203ce9fbf8dfd276a55e7e95)): - - - ```bash - # enter repository - cd PhoenixOS/scripts/docker - - # start and enter container with id 1 - bash run_torch_cu113.sh -s 1 - - # enter / close container (no need to execute here, just listed) - bash run_torch_cu113.sh -e 1 # enter container - bash run_torch_cu113.sh -c 1 # close container - ``` - - Note that it's important to execute docker container with root privilege, as CRIU needs the permission to C/R kernel-space memory pages. - -3. **[Downloading Necesssary Assets]** - PhOS relies on some assets to build and test, - please download these assets by simply running following commands: - - ```bash - # inside container - - # download assets - cd /root/scripts/build_scripts - bash download_assets.sh - ``` - -4. **[Build]** - Building PhOS is simple! - - PhOS provides a convinient build system, which covers compiling, linking and installing all PhOS components: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ComponentDescription
phos-autogenAutogen Engine for generating most of Parser and Worker code for specific hardware platform, based on lightwight notation.
phosdPhOS Daemon, which continuously run at the background, taking over the control of all GPU devices on the node.
libphos.soPhOS Hijacker, which hijacks all GPU API calls on the client-side and forward to PhOS Daemon.
libpccl.soPhOS Checkpoint Communication Library (PCCL), which provide highly-optimized device-to-device state migration. Note that this library is not included in current release.
unit-testingUnit Tests for PhOS, which is based on GoogleTest.
phos-cliCommand Line Interface (CLI) for interacting with PhOS.
phos-remotingRemoting Framework, which provide highly-optimized GPU API remoting performance. See more details at SJTU-IPADS/PhoenixOS-Remoting.
- - To build and install all above components and other dependencies, simply run the build script in the container would works: - - ```bash - # inside container - cd /root/scripts/build_scripts - - # clear old build cache - # -c: clear previous build - # -3: the clean process involves all third-parties - bash build.sh -c -3 - - # start building - # -3: the build process involves all third-parties - # -i: install after successful building - # -u: build PhOS with unit test enable - bash build.sh -i -3 -u - ``` - - For customizing build options, please refers to and modify avaiable options under `scripts/build_scripts/build_config.yaml`. - - If you encounter any build issues, you're able to see building logs under `build_log`. Please open a new issue if things are stuck :-| - -### 💡 Option 2: Install From Pre-built Binaries - - Will soon be updated, stay tuned :) - - -
+Currently, we don't have pre-built binaries. +Please check [build from Source](docs/docs/getting_started/build_from_source.md) for how to build and run from source! ## II. Usage @@ -194,9 +83,17 @@ Once successfully installed PhOS, you can now try run your program with PhOS sup 1. Start the PhOS daemon (`phosd`), which takes over all GPU reousces on the node: ```bash + ## If built in an interactive container (or host) pos_cli --start --target daemon ``` + or + + ```bash + ## If built with our container + make server-run + ``` + 2. To run your program with PhOS support, one need to put a `yaml` configure file under the directory which your program would regard as `$PWD`. This file contains all necessary informations for PhOS to hijack your program. An example file looks like: diff --git a/dockerfiles/build_113.Dockerfile b/dockerfiles/build_113.Dockerfile new file mode 100644 index 00000000..03c3e4ba --- /dev/null +++ b/dockerfiles/build_113.Dockerfile @@ -0,0 +1,53 @@ +FROM phoenixos/pytorch:11.3-ubuntu20.04 as base + +ARG DEBIAN_FRONTEND=noninteractive +ARG proxy + +RUN apt update +RUN apt-get install -y libibverbs-dev libboost-all-dev net-tools \ + git-lfs pkg-config python3-pip libelf-dev libssl-dev libgl1-mesa-dev \ + libvdpau-dev iputils-ping wget gdb vim nsight-compute-2023.1.1 curl + +RUN apt-get update && \ + apt-get install -y software-properties-common && \ + add-apt-repository -y ppa:ubuntu-toolchain-r/test && \ + apt-get update + +RUN apt-get install -y g++-9 +RUN apt-get install -y g++-13 + +RUN pip3 install meson -i https://mirrors.aliyun.com/pypi/simple/ + +RUN ln -s /opt/nvidia/nsight-compute/2023.1.1/target/linux-desktop-glibc_2_11_3-x64/ncu /usr/local/bin/ncu + +RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ + +# Copy build scripts from the project root +COPY scripts/ /scripts +COPY third_party/go1.23.2.linux-amd64.tar.gz /third_party/go1.23.2.linux-amd64.tar.gz + + +ENV 
RUSTUP_UPDATE_ROOT=https://mirrors.tuna.tsinghua.edu.cn/rustup/rustup +ENV RUSTUP_DIST_SERVER=https://mirrors.tuna.tsinghua.edu.cn/rustup +RUN mkdir -p /opt/rust + +ENV CARGO_HOME=/opt/rust/.cargo +ENV RUSTUP_HOME=/opt/rust/.rustup + +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path +ENV PATH="/opt/rust/.cargo/bin:${PATH}" +RUN . /opt/rust/.cargo/env + +RUN rustup install nightly +RUN rustup default nightly + + +# Make scripts executable and run download_assets.sh +RUN chmod +x /scripts/build_scripts/*.sh +RUN cd /scripts/build_scripts && bash build.sh -p -b=false -3=true + +ENV PATH="/root/bin:${PATH}" +ENV LD_LIBRARY_PATH="/root/lib:${LD_LIBRARY_PATH}" + +WORKDIR /root + diff --git a/docs/docs/getting_started/build_from_source.md b/docs/docs/getting_started/build_from_source.md new file mode 100644 index 00000000..c614b382 --- /dev/null +++ b/docs/docs/getting_started/build_from_source.md @@ -0,0 +1,129 @@ +# Quick start + +This guide will help you build and run PhOS from source. +PhOS provides two options, and you can choose **either one** to build PhOS. + +## Overview of the build + +PhOS provides a convenient build system, which covers compiling, linking and installing all PhOS components: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Component | Description |
+| --- | --- |
+| `phos-autogen` | **Autogen Engine** for generating most of the Parser and Worker code for a specific hardware platform, based on lightweight notation. |
+| `phosd` | **PhOS Daemon**, which continuously runs in the background, taking over the control of all GPU devices on the node. |
+| `libphos.so` | **PhOS Hijacker**, which hijacks all GPU API calls on the client side and forwards them to the PhOS Daemon. |
+| `libpccl.so` | **PhOS Checkpoint Communication Library** (PCCL), which provides highly-optimized device-to-device state migration. Note that this library is not included in the current release. |
+| `unit-testing` | **Unit Tests** for PhOS, which are based on GoogleTest. |
+| `phos-cli` | **Command Line Interface** (CLI) for interacting with PhOS. |
+| `phos-remoting` | **Remoting Framework**, which provides highly-optimized GPU API remoting performance. See more details at SJTU-IPADS/PhoenixOS-Remoting. |
+ + +1. **[Clone Repository]** + First of all, clone this repository **recursively**: + + ```bash + git clone --recursive https://github.com/SJTU-IPADS/PhoenixOS.git + ``` + +2. **[Downloading Necessary (third-party) Assets]** + PhOS relies on some assets to build and test, + please download these assets by simply running the following commands: + + ```bash + # download assets + cd path/to/phos/scripts/build_scripts + bash download_assets.sh + ``` + +3. **(Option #1) [Build with our image]** + First, build our pre-released image (if `phos-base-113` is not available on the hub): + (This option only works for CUDA 11.3 for now) + + ```bash + make build-image + ``` + + Second, use the image whenever you build PhOS: + + ```bash + make build BUILD_ARGS="-i -3 -p=false" + ``` + + Use the following to list the available build options: + + ```bash + make build BUILD_ARGS="-help" + ``` + +3. **(Option #2) [Start an interactive container]** + PhOS can be built and installed on an official vendor image (or on the host) + if you don't want to use our pre-built image. + + > NOTE: PhOS has some minimal requirements, e.g., it requires libc6 >= 2.29 for compiling CRIU from source. Thus, we strongly recommend using our base image as an interactive building environment.
+ + For example, for running PhOS for CUDA 11.3, + one can build on official CUDA images + (e.g., [`nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04`](https://hub.docker.com/layers/nvidia/cuda/11.3.1-cudnn8-devel-ubuntu20.04/images/sha256-459c130c94363099b02706b9b25d9fe5822ea233203ce9fbf8dfd276a55e7e95)): + + + ```bash + # enter repository + cd PhoenixOS/scripts/docker + + # start and enter container with id 1 + bash run_torch_cu113.sh -s 1 + + # enter / close container (no need to execute here, just listed) + bash run_torch_cu113.sh -e 1 # enter container + bash run_torch_cu113.sh -c 1 # close container + ``` + + > Note that it's important to run the Docker container with root privileges, as CRIU needs the permission to C/R kernel-space memory pages. + + To build and install all of the above components and other dependencies, simply run the build script in the container: + + ```bash + # inside container + cd /root/scripts/build_scripts + + # clear old build cache + # -c: clear previous build + # -3: the clean process involves all third-parties + bash build.sh -c -3 + + # start building + # -3: the build process involves all third-parties + # -i: install after successful building + # -u: build PhOS with unit tests enabled + bash build.sh -i -3 -u + ``` + +4. **Build configuration and troubleshooting** + For customizing build options, please refer to and modify the available options under `scripts/build_scripts/build_config.yaml`. + + If you encounter any build issues, you can find the build logs under `build_log`. Please open a new issue if things are stuck :-| The logs are typically self-explanatory. \ No newline at end of file diff --git a/scripts/build_scripts/build.sh b/scripts/build_scripts/build.sh index d661ed10..4750d805 100755 --- a/scripts/build_scripts/build.sh +++ b/scripts/build_scripts/build.sh @@ -23,15 +23,19 @@ source $script_dir/../common.sh # ================== program starts here ================== + check_and_install_go if [ $?
-ne 0 ]; then error "failed to install golang" fi cd $script_dir + +go env -w GOPROXY=https://goproxy.cn,direct go build -o pos_build + if [ $? -ne 0 ]; then - error "faile to build PhOS's build system" + error "failed to build PhOS's build system" fi if [ ! -e $script_dir/pos_build ]; then error "no building binary was built" diff --git a/scripts/build_scripts/common.go b/scripts/build_scripts/common.go index e3b3c91a..332d74d2 100755 --- a/scripts/build_scripts/common.go +++ b/scripts/build_scripts/common.go @@ -63,6 +63,7 @@ type CmdOptions struct { WithUnitTest bool Target string PrintHelp bool + DoPackage bool DoBuild bool DoInstall bool DoClean bool @@ -75,6 +76,7 @@ func (cmdOpt *CmdOptions) print(logger *log.Logger) { - WithThirdParty: %v - Target: %v - PrintHelp: %v + - DoPackage: %v - DoClean: %v - DoInstall: %v - WithUnitTest: %v @@ -83,6 +85,7 @@ func (cmdOpt *CmdOptions) print(logger *log.Logger) { cmdOpt.WithThirdParty, cmdOpt.Target, cmdOpt.PrintHelp, + cmdOpt.DoPackage, cmdOpt.DoClean, cmdOpt.DoInstall, cmdOpt.WithUnitTest, diff --git a/scripts/build_scripts/cuda.go b/scripts/build_scripts/cuda.go index 8d354768..f058c0a3 100755 --- a/scripts/build_scripts/cuda.go +++ b/scripts/build_scripts/cuda.go @@ -130,7 +130,7 @@ func CRIB_PhOS_CUDA_KernelPatcher(cmdOpt CmdOptions, buildConf BuildConfigs, log } func CRIB_PhOS_CUDA(cmdOpt CmdOptions, buildConf BuildConfigs, logger *log.Logger) { - if cmdOpt.DoBuild { + if cmdOpt.DoPackage { // ==================== Prepare ==================== logger.Infof("pre-build check...") utils.CheckAndInstallPackage("git", "git", nil, nil, logger) @@ -194,9 +194,23 @@ func CRIB_PhOS_CUDA(cmdOpt CmdOptions, buildConf BuildConfigs, logger *log.Logge ) os.Exit(0) return nil + } + utils.CheckAndInstallPackage("cargo", "", install_cargo, post_install_cargo, logger) + + // XD: fixme: currently only tested on A800 machines with cuda 11.3 + install_nccl := func() error { + _, err := utils.BashScriptGetOutput(` + #!/bin/bash + set 
-e + apt-get install -y libnccl2 libnccl-dev --allow-change-held-packages + `, + false, logger, + ) + return err } - utils.CheckAndInstallPackage("cargo", "", install_cargo, post_install_cargo, logger) - } + utils.CheckAndInstallPackage("nccl", "", install_nccl, nil, logger) + } + // ==================== CRIB Dependencies ==================== if cmdOpt.WithThirdParty { diff --git a/scripts/build_scripts/main.go b/scripts/build_scripts/main.go index 8dca6784..a9d12036 100755 --- a/scripts/build_scripts/main.go +++ b/scripts/build_scripts/main.go @@ -46,6 +46,8 @@ func main() { var __PrintHelp *bool = flag.Bool("h", false, "Print help message") var __WithThirdParty *bool = flag.Bool("3", false, "Build/clean with 3rd parties") + var __DoPackage *bool = flag.Bool("p", true, "Build/clean with pre-built packages") + var __DoBuild *bool = flag.Bool("b", true, "Build/clean PhOS.") var __DoInstall *bool = flag.Bool("i", false, "Do installation") var __DoClean *bool = flag.Bool("c", false, "Do cleanning") var __WithUnitTest *bool = flag.Bool("u", false, "Do unit-testing after build") @@ -58,6 +60,8 @@ func main() { cmdOpt := CmdOptions{ PrintHelp: *__PrintHelp, WithThirdParty: *__WithThirdParty, + DoBuild: *__DoBuild, + DoPackage: *__DoPackage, DoInstall: *__DoInstall, DoClean: *__DoClean, WithUnitTest: *__WithUnitTest, @@ -91,10 +95,11 @@ func main() { // make sure we won't build/install when clean if cmdOpt.DoClean { + cmdOpt.DoPackage = false cmdOpt.DoBuild = false cmdOpt.DoInstall = false } else { - cmdOpt.DoBuild = true + // cmdOpt.DoBuild = true } CRIB_PhOS(cmdOpt, buildConf, logger) diff --git a/scripts/build_scripts/pos.go b/scripts/build_scripts/pos.go index a79895b8..638576b6 100755 --- a/scripts/build_scripts/pos.go +++ b/scripts/build_scripts/pos.go @@ -29,9 +29,14 @@ const ( func CRIB_PhOS_Remoting(cmdOpt CmdOptions, buildConf BuildConfigs, logger *log.Logger) { if cmdOpt.DoBuild { // TODO(zhuobin): we need to install NCCL version according to CUDA version - 
utils.CheckAndInstallMultiPackagesViaOsPkgManager([]string{
-			"libnccl2=2.26.5-1+cuda12.9", "libnccl-dev=2.26.5-1+cuda12.9",
-		}, logger)
+		//utils.CheckAndInstallMultiPackagesViaOsPkgManager([]string{
+		//	"libnccl2=2.26.5-1+cuda12.9", "libnccl-dev=2.26.5-1+cuda12.9",
+		//}, logger)
+		//if !utils.CheckPackageViaOsPkgManager("libnccl-dev", logger) {
+		//	utils.UnInstallPackageViaOsPkgManager("libnccl2", logger)
+		//}
+		//utils.CheckAndInstallPackageViaOsPkgManager("libnccl2=2.26.5-1+cuda12.9", logger)
+		//utils.CheckAndInstallPackageViaOsPkgManager("libnccl-dev=2.26.5-1+cuda12.9", logger)
 		utils.CheckAndInstallPackageViaOsPkgManager("clang", logger)
 		utils.CheckAndInstallPackageViaOsPkgManager("cmake", logger)
 	}
diff --git a/scripts/utils/dependencies.sh b/scripts/utils/dependencies.sh
index 7d072230..18829100 100755
--- a/scripts/utils/dependencies.sh
+++ b/scripts/utils/dependencies.sh
@@ -59,7 +59,6 @@ util_install_common () {
     fi
 }
 
-
 check_and_install_go() {
     if [[ ! -x "$(command -v go)" ]]; then
         warn "no go installed, installing from assets..."
diff --git a/scripts/utils/get_root_dir.sh b/scripts/utils/get_root_dir.sh
index e01c7ca6..5d8bf6c7 100755
--- a/scripts/utils/get_root_dir.sh
+++ b/scripts/utils/get_root_dir.sh
@@ -14,4 +14,12 @@
 
 #!/bin/bash
 
-git rev-parse --show-toplevel
+# Print the top-level directory of the enclosing git repository on stdout.
+# Outside a repository, report on stderr and exit non-zero so that a caller
+# capturing stdout never receives the error text in place of a path.
+if git rev-parse --is-inside-work-tree > /dev/null 2>&1; then
+    repo_root=$(git rev-parse --show-toplevel)
+    echo "$repo_root"
+else
+    echo "Not in a git repo." >&2
+    exit 1
+fi