diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..c8a8d32e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,51 @@
+SRC_DIR := $(shell pwd)
+
+IMAGE_NAME := phos-base-113
+DOCKERFILE := $(SRC_DIR)/dockerfiles/build_113.Dockerfile
+
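+# Overridable variables:
+#   BUILD_ARGS     - arguments forwarded to scripts/build_scripts/build.sh by the `build` target
+#   CLIENT_RUN_CMD - command launched inside the container by the `client-run` target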
+BUILD_ARGS ?= -i -3 -u -p=false
+CLIENT_RUN_CMD ?= python
+
+.PHONY: build-image build server-run client-run clean exec
+
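+# Build the base image used by the targets below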
+build-image:
+ docker build \
+ --build-arg proxy=http://ipads:ipads123@127.0.0.1:11235 \
+ --progress=plain -f $(DOCKERFILE) -t $(IMAGE_NAME) .
+
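+# Build PhOS inside the container, forwarding $(BUILD_ARGS) to build.sh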
+build:
+ docker run --rm --gpus all \
+ -v $(SRC_DIR):/root \
+ --privileged --network=host --ipc=host \
+ $(IMAGE_NAME) \
+ bash -c "cd /root/scripts/build_scripts/ && bash build.sh $(BUILD_ARGS)"
+
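+# Start the PhOS daemon inside the container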
+server-run:
+ docker run --rm --gpus all -it \
+ -v $(SRC_DIR):/root \
+ --privileged --network=host --ipc=host \
+ $(IMAGE_NAME) \
+ bash -c "CUDA_VISIBLE_DEVICES=2 pos_cli --start --target daemon"
+
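+# Run $(CLIENT_RUN_CMD) inside the container with libxpuclient.so preloaded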
+client-run:
+ docker run --rm --gpus all \
+ -v $(SRC_DIR):/root \
+ --privileged --network=host --ipc=host \
+ $(IMAGE_NAME) \
+	bash -c "cd /root && export LD_LIBRARY_PATH=/root/lib:$$LD_LIBRARY_PATH && export LIBRARY_PATH=/root/lib:$$LIBRARY_PATH && LD_PRELOAD=/root/lib/libxpuclient.so RUST_LOG=error $(CLIENT_RUN_CMD)"
+
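+# Remove previous build artifacts, including third-parties (build.sh -c -3)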
+clean:
+ docker run --rm --gpus all \
+ -v $(SRC_DIR):/root \
+ --privileged --network=host --ipc=host \
+ $(IMAGE_NAME) \
+ bash -c "cd /root/scripts/build_scripts/ && bash build.sh -c -3"
+
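+# Open an interactive shell in a fresh container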
+exec:
+ docker run --rm --gpus all -it \
+ -v $(SRC_DIR):/root \
+ --privileged --network=host --ipc=host \
+ $(IMAGE_NAME) \
+ bash
+
+
diff --git a/README.md b/README.md
index 6c07c581..4cc434ee 100755
--- a/README.md
+++ b/README.md
@@ -63,121 +63,10 @@
-## I. Build and Install PhOS
+## I. Quick Build
-### 💡 Option 1: Build and Install From Source
-
-1. **[Clone Repository]**
- First of all, clone this repository **recursively**:
-
- ```bash
- git clone --recursive https://github.com/SJTU-IPADS/PhoenixOS.git
- ```
-
-2. **[Start Container]**
- PhOS can be built and installed on official vendor image.
-
- > NOTE: PhOS require libc6 >= 2.29 for compiling CRIU from source.
-
- For example, for running PhOS for CUDA 11.3,
- one can build on official CUDA images
- (e.g., [`nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04`](https://hub.docker.com/layers/nvidia/cuda/11.3.1-cudnn8-devel-ubuntu20.04/images/sha256-459c130c94363099b02706b9b25d9fe5822ea233203ce9fbf8dfd276a55e7e95)):
-
-
- ```bash
- # enter repository
- cd PhoenixOS/scripts/docker
-
- # start and enter container with id 1
- bash run_torch_cu113.sh -s 1
-
- # enter / close container (no need to execute here, just listed)
- bash run_torch_cu113.sh -e 1 # enter container
- bash run_torch_cu113.sh -c 1 # close container
- ```
-
- Note that it's important to execute docker container with root privilege, as CRIU needs the permission to C/R kernel-space memory pages.
-
-3. **[Downloading Necesssary Assets]**
- PhOS relies on some assets to build and test,
- please download these assets by simply running following commands:
-
- ```bash
- # inside container
-
- # download assets
- cd /root/scripts/build_scripts
- bash download_assets.sh
- ```
-
-4. **[Build]**
- Building PhOS is simple!
-
- PhOS provides a convinient build system, which covers compiling, linking and installing all PhOS components:
-
-
-
-   | Component | Description |
-   | --- | --- |
-   | phos-autogen | Autogen Engine for generating most of Parser and Worker code for specific hardware platform, based on lightwight notation. |
-   | phosd | PhOS Daemon, which continuously run at the background, taking over the control of all GPU devices on the node. |
-   | libphos.so | PhOS Hijacker, which hijacks all GPU API calls on the client-side and forward to PhOS Daemon. |
-   | libpccl.so | PhOS Checkpoint Communication Library (PCCL), which provide highly-optimized device-to-device state migration. Note that this library is not included in current release. |
-   | unit-testing | Unit Tests for PhOS, which is based on GoogleTest. |
-   | phos-cli | Command Line Interface (CLI) for interacting with PhOS. |
-   | phos-remoting | Remoting Framework, which provide highly-optimized GPU API remoting performance. See more details at SJTU-IPADS/PhoenixOS-Remoting. |
-
-
-
- To build and install all above components and other dependencies, simply run the build script in the container would works:
-
- ```bash
- # inside container
- cd /root/scripts/build_scripts
-
- # clear old build cache
- # -c: clear previous build
- # -3: the clean process involves all third-parties
- bash build.sh -c -3
-
- # start building
- # -3: the build process involves all third-parties
- # -i: install after successful building
- # -u: build PhOS with unit test enable
- bash build.sh -i -3 -u
- ```
-
- For customizing build options, please refers to and modify avaiable options under `scripts/build_scripts/build_config.yaml`.
-
- If you encounter any build issues, you're able to see building logs under `build_log`. Please open a new issue if things are stuck :-|
-
-### 💡 Option 2: Install From Pre-built Binaries
-
- Will soon be updated, stay tuned :)
-
-
-
+Currently, we don't provide pre-built binaries.
+Please check [Build from Source](docs/docs/getting_started/build_from_source.md) for how to build and run PhOS from source!
## II. Usage
@@ -194,9 +83,17 @@ Once successfully installed PhOS, you can now try run your program with PhOS sup
1. Start the PhOS daemon (`phosd`), which takes over all GPU reousces on the node:
```bash
+ ## If built in an interactive container (or host)
pos_cli --start --target daemon
```
+ or
+
+ ```bash
+ ## If built with our container
+ make server-run
+ ```
+
2. To run your program with PhOS support, one need to put a `yaml` configure file under the directory which your program would regard as `$PWD`.
This file contains all necessary informations for PhOS to hijack your program. An example file looks like:
diff --git a/dockerfiles/build_113.Dockerfile b/dockerfiles/build_113.Dockerfile
new file mode 100644
index 00000000..03c3e4ba
--- /dev/null
+++ b/dockerfiles/build_113.Dockerfile
@@ -0,0 +1,53 @@
+FROM phoenixos/pytorch:11.3-ubuntu20.04 as base
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG proxy
+
+RUN apt-get update && apt-get install -y libibverbs-dev libboost-all-dev net-tools \
+    git-lfs pkg-config python3-pip libelf-dev libssl-dev libgl1-mesa-dev \
+    libvdpau-dev iputils-ping wget gdb vim nsight-compute-2023.1.1 curl
+
+RUN apt-get update && \
+ apt-get install -y software-properties-common && \
+ add-apt-repository -y ppa:ubuntu-toolchain-r/test && \
+ apt-get update
+
+RUN apt-get install -y g++-9
+RUN apt-get install -y g++-13
+
+RUN pip3 install meson -i https://mirrors.aliyun.com/pypi/simple/
+
+RUN ln -s /opt/nvidia/nsight-compute/2023.1.1/target/linux-desktop-glibc_2_11_3-x64/ncu /usr/local/bin/ncu
+
+RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
+
+# Copy build scripts from the project root
+COPY scripts/ /scripts
+COPY third_party/go1.23.2.linux-amd64.tar.gz /third_party/go1.23.2.linux-amd64.tar.gz
+
+
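+# Install the Rust nightly toolchain under /opt/rust via rustup (using TUNA mirrors)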
+ENV RUSTUP_UPDATE_ROOT=https://mirrors.tuna.tsinghua.edu.cn/rustup/rustup
+ENV RUSTUP_DIST_SERVER=https://mirrors.tuna.tsinghua.edu.cn/rustup
+RUN mkdir -p /opt/rust
+
+ENV CARGO_HOME=/opt/rust/.cargo
+ENV RUSTUP_HOME=/opt/rust/.rustup
+
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path
+ENV PATH="/opt/rust/.cargo/bin:${PATH}"
+RUN . /opt/rust/.cargo/env
+
+RUN rustup install nightly
+RUN rustup default nightly
+
+
+# Make scripts executable and pre-install third-party dependencies into the image (no PhOS build yet)
+RUN chmod +x /scripts/build_scripts/*.sh
+RUN cd /scripts/build_scripts && bash build.sh -p -b=false -3=true
+
+ENV PATH="/root/bin:${PATH}"
+ENV LD_LIBRARY_PATH="/root/lib:${LD_LIBRARY_PATH}"
+
+WORKDIR /root
+
diff --git a/docs/docs/getting_started/build_from_source.md b/docs/docs/getting_started/build_from_source.md
new file mode 100644
index 00000000..c614b382
--- /dev/null
+++ b/docs/docs/getting_started/build_from_source.md
@@ -0,0 +1,129 @@
+# Quick start
+
+This guide will help you build and run PhOS from source.
+PhOS provides two build options; you can choose **either one**.
+
+## Overview of the build
+
+PhOS provides a convenient build system, which covers compiling, linking and installing all PhOS components:
+
+
+
+| Component | Description |
+| --- | --- |
+| phos-autogen | Autogen Engine for generating most of the Parser and Worker code for a specific hardware platform, based on lightweight notation. |
+| phosd | PhOS Daemon, which continuously runs in the background, taking over control of all GPU devices on the node. |
+| libphos.so | PhOS Hijacker, which hijacks all GPU API calls on the client side and forwards them to the PhOS Daemon. |
+| libpccl.so | PhOS Checkpoint Communication Library (PCCL), which provides highly-optimized device-to-device state migration. Note that this library is not included in the current release. |
+| unit-testing | Unit Tests for PhOS, based on GoogleTest. |
+| phos-cli | Command Line Interface (CLI) for interacting with PhOS. |
+| phos-remoting | Remoting Framework, which provides highly-optimized GPU API remoting performance. See more details at SJTU-IPADS/PhoenixOS-Remoting. |
+
+
+
+
+1. **[Clone Repository]**
+ First of all, clone this repository **recursively**:
+
+ ```bash
+ git clone --recursive https://github.com/SJTU-IPADS/PhoenixOS.git
+ ```
+
+2. **[Download Necessary (Third-Party) Assets]**
+   PhOS relies on some assets to build and test;
+   please download them by running the following commands:
+
+ ```bash
+ # download assets
+ cd path/to/phos/scripts/build_scripts
+ bash download_assets.sh
+ ```
+
+3. **(Option 1) [Build with our image]**
+   (This option only works for CUDA 11.3 for now.)
+
+   First, build our pre-released image (if `phos-base-113` is not found on the hub):
+
+ ```bash
+ make build-image
+ ```
+
+   Second, use the image to build PhOS (this command can be reused for every subsequent build):
+
+ ```bash
+ make build BUILD_ARGS="-i -3 -p=false"
+ ```
+
+   Use the following to check the available build options:
+
+ ```bash
+   make build BUILD_ARGS="-h"
+ ```
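+
+   After a successful build, the same image can also drive the runtime targets defined in the top-level `Makefile`. A minimal sketch (here `your_script.py` is just a placeholder for your own program, which additionally needs the `yaml` configure file described in the Usage section of the README):
+
+   ```bash
+   # start the PhOS daemon in a container
+   make server-run
+
+   # run a client program with libxpuclient.so preloaded
+   make client-run CLIENT_RUN_CMD="python your_script.py"
+   ```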
+
+3. **(Option 2) [Start an interactive container]**
+   If you don't want to use our pre-built image, PhOS can also be built
+   and installed on an official vendor image (or on the host).
+
+   > NOTE: PhOS has some minimal requirements, e.g., it requires libc6 >= 2.29 for compiling CRIU from source. Thus, we strongly recommend using our base image as the interactive build environment.
+
+   For example, to run PhOS with CUDA 11.3,
+   one can build on the official CUDA image
+ (e.g., [`nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04`](https://hub.docker.com/layers/nvidia/cuda/11.3.1-cudnn8-devel-ubuntu20.04/images/sha256-459c130c94363099b02706b9b25d9fe5822ea233203ce9fbf8dfd276a55e7e95)):
+
+
+ ```bash
+ # enter repository
+ cd PhoenixOS/scripts/docker
+
+ # start and enter container with id 1
+ bash run_torch_cu113.sh -s 1
+
+ # enter / close container (no need to execute here, just listed)
+ bash run_torch_cu113.sh -e 1 # enter container
+ bash run_torch_cu113.sh -c 1 # close container
+ ```
+
+   > Note that it's important to run the Docker container with root privilege, as CRIU needs permission to C/R kernel-space memory pages.
+
+   To build and install all the above components and other dependencies, simply run the build script inside the container:
+
+ ```bash
+ # inside container
+ cd /root/scripts/build_scripts
+
+ # clear old build cache
+ # -c: clear previous build
+ # -3: the clean process involves all third-parties
+ bash build.sh -c -3
+
+ # start building
+ # -3: the build process involves all third-parties
+ # -i: install after successful building
+   # -u: build PhOS with unit tests enabled
+ bash build.sh -i -3 -u
+ ```
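+
+   If you want to check every flag the build system accepts (e.g., `-p` and `-b` in addition to those above), print its help message:
+
+   ```bash
+   # print all available build flags
+   bash build.sh -h
+   ```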
+
+4. **[Build Configuration and Troubleshooting]**
+   To customize build options, please refer to and modify the available options in `scripts/build_scripts/build_config.yaml`.
+
+   If you encounter any build issues, you can find the build logs under `build_log`; they are typically quite self-explanatory. Please open a new issue if things are stuck :-|
\ No newline at end of file
diff --git a/scripts/build_scripts/build.sh b/scripts/build_scripts/build.sh
index d661ed10..4750d805 100755
--- a/scripts/build_scripts/build.sh
+++ b/scripts/build_scripts/build.sh
@@ -23,15 +23,19 @@ source $script_dir/../common.sh
# ================== program starts here ==================
+
check_and_install_go
if [ $? -ne 0 ]; then
error "failed to install golang"
fi
cd $script_dir
+
+go env -w GOPROXY=https://goproxy.cn,direct
go build -o pos_build
+
if [ $? -ne 0 ]; then
- error "faile to build PhOS's build system"
+ error "failed to build PhOS's build system"
fi
if [ ! -e $script_dir/pos_build ]; then
error "no building binary was built"
diff --git a/scripts/build_scripts/common.go b/scripts/build_scripts/common.go
index e3b3c91a..332d74d2 100755
--- a/scripts/build_scripts/common.go
+++ b/scripts/build_scripts/common.go
@@ -63,6 +63,7 @@ type CmdOptions struct {
WithUnitTest bool
Target string
PrintHelp bool
+ DoPackage bool
DoBuild bool
DoInstall bool
DoClean bool
@@ -75,6 +76,7 @@ func (cmdOpt *CmdOptions) print(logger *log.Logger) {
- WithThirdParty: %v
- Target: %v
- PrintHelp: %v
+ - DoPackage: %v
- DoClean: %v
- DoInstall: %v
- WithUnitTest: %v
@@ -83,6 +85,7 @@ func (cmdOpt *CmdOptions) print(logger *log.Logger) {
cmdOpt.WithThirdParty,
cmdOpt.Target,
cmdOpt.PrintHelp,
+ cmdOpt.DoPackage,
cmdOpt.DoClean,
cmdOpt.DoInstall,
cmdOpt.WithUnitTest,
diff --git a/scripts/build_scripts/cuda.go b/scripts/build_scripts/cuda.go
index 8d354768..f058c0a3 100755
--- a/scripts/build_scripts/cuda.go
+++ b/scripts/build_scripts/cuda.go
@@ -130,7 +130,7 @@ func CRIB_PhOS_CUDA_KernelPatcher(cmdOpt CmdOptions, buildConf BuildConfigs, log
}
func CRIB_PhOS_CUDA(cmdOpt CmdOptions, buildConf BuildConfigs, logger *log.Logger) {
- if cmdOpt.DoBuild {
+ if cmdOpt.DoPackage {
// ==================== Prepare ====================
logger.Infof("pre-build check...")
utils.CheckAndInstallPackage("git", "git", nil, nil, logger)
@@ -194,9 +194,23 @@ func CRIB_PhOS_CUDA(cmdOpt CmdOptions, buildConf BuildConfigs, logger *log.Logge
)
os.Exit(0)
return nil
+ }
+ utils.CheckAndInstallPackage("cargo", "", install_cargo, post_install_cargo, logger)
+
+ // XD: fixme: currently only tested on A800 machines with cuda 11.3
+ install_nccl := func() error {
+ _, err := utils.BashScriptGetOutput(`
+ #!/bin/bash
+ set -e
+ apt-get install -y libnccl2 libnccl-dev --allow-change-held-packages
+ `,
+ false, logger,
+ )
+ return err
}
- utils.CheckAndInstallPackage("cargo", "", install_cargo, post_install_cargo, logger)
- }
+ utils.CheckAndInstallPackage("nccl", "", install_nccl, nil, logger)
+ }
+
// ==================== CRIB Dependencies ====================
if cmdOpt.WithThirdParty {
diff --git a/scripts/build_scripts/main.go b/scripts/build_scripts/main.go
index 8dca6784..a9d12036 100755
--- a/scripts/build_scripts/main.go
+++ b/scripts/build_scripts/main.go
@@ -46,6 +46,8 @@ func main() {
var __PrintHelp *bool = flag.Bool("h", false, "Print help message")
var __WithThirdParty *bool = flag.Bool("3", false, "Build/clean with 3rd parties")
+	var __DoPackage *bool = flag.Bool("p", true, "Install pre-built dependency packages (e.g., cargo, NCCL)")
+	var __DoBuild *bool = flag.Bool("b", true, "Build/clean PhOS")
var __DoInstall *bool = flag.Bool("i", false, "Do installation")
var __DoClean *bool = flag.Bool("c", false, "Do cleanning")
var __WithUnitTest *bool = flag.Bool("u", false, "Do unit-testing after build")
@@ -58,6 +60,8 @@ func main() {
cmdOpt := CmdOptions{
PrintHelp: *__PrintHelp,
WithThirdParty: *__WithThirdParty,
+ DoBuild: *__DoBuild,
+ DoPackage: *__DoPackage,
DoInstall: *__DoInstall,
DoClean: *__DoClean,
WithUnitTest: *__WithUnitTest,
@@ -91,10 +95,11 @@ func main() {
// make sure we won't build/install when clean
if cmdOpt.DoClean {
+ cmdOpt.DoPackage = false
cmdOpt.DoBuild = false
cmdOpt.DoInstall = false
} else {
- cmdOpt.DoBuild = true
+ // cmdOpt.DoBuild = true
}
CRIB_PhOS(cmdOpt, buildConf, logger)
diff --git a/scripts/build_scripts/pos.go b/scripts/build_scripts/pos.go
index a79895b8..638576b6 100755
--- a/scripts/build_scripts/pos.go
+++ b/scripts/build_scripts/pos.go
@@ -29,9 +29,14 @@ const (
func CRIB_PhOS_Remoting(cmdOpt CmdOptions, buildConf BuildConfigs, logger *log.Logger) {
if cmdOpt.DoBuild {
// TODO(zhuobin): we need to install NCCL version according to CUDA version
- utils.CheckAndInstallMultiPackagesViaOsPkgManager([]string{
- "libnccl2=2.26.5-1+cuda12.9", "libnccl-dev=2.26.5-1+cuda12.9",
- }, logger)
+ //utils.CheckAndInstallMultiPackagesViaOsPkgManager([]string{
+ // "libnccl2=2.26.5-1+cuda12.9", "libnccl-dev=2.26.5-1+cuda12.9",
+ //}, logger)
+ //if !utils.CheckPackageViaOsPkgManager("libnccl-dev", logger) {
+ // utils.UnInstallPackageViaOsPkgManager("libnccl2", logger)
+ //}
+ //utils.CheckAndInstallPackageViaOsPkgManager("libnccl2=2.26.5-1+cuda12.9", logger)
+ //utils.CheckAndInstallPackageViaOsPkgManager("libnccl-dev=2.26.5-1+cuda12.9", logger)
utils.CheckAndInstallPackageViaOsPkgManager("clang", logger)
utils.CheckAndInstallPackageViaOsPkgManager("cmake", logger)
}
diff --git a/scripts/utils/dependencies.sh b/scripts/utils/dependencies.sh
index 7d072230..18829100 100755
--- a/scripts/utils/dependencies.sh
+++ b/scripts/utils/dependencies.sh
@@ -59,7 +59,6 @@ util_install_common () {
fi
}
-
check_and_install_go() {
if [[ ! -x "$(command -v go)" ]]; then
warn "no go installed, installing from assets..."
diff --git a/scripts/utils/get_root_dir.sh b/scripts/utils/get_root_dir.sh
index e01c7ca6..5d8bf6c7 100755
--- a/scripts/utils/get_root_dir.sh
+++ b/scripts/utils/get_root_dir.sh
@@ -14,4 +14,9 @@
#!/bin/bash
-git rev-parse --show-toplevel
+if git rev-parse --is-inside-work-tree > /dev/null 2>&1; then
+    git rev-parse --show-toplevel
+else
+    echo "Not in a git repo." >&2
+    exit 1
+fi
\ No newline at end of file