# Feat: Add metadata validation workflow and scripts (#5)

# Workflow header: name, shared environment variables, and trigger.
# Runs on pull requests that touch component sources or the validation scripts.
name: validate-metadata-schema

env:
  PYTHON_VERSION: "3.11"
  # NOTE(review): TARGET_DIR does not appear to be referenced by any step in
  # this workflow; kept for backward compatibility - confirm before removing.
  TARGET_DIR: "component/pipelines"
  # Comma-separated component type directories expected under components/.
  COMPONENT_TYPES: "data_processing,deployment,evaluation,training"

on:
  pull_request:
    paths:
      - 'component/pipelines/**'
      # The job diffs and validates paths under components/, so changes there
      # must trigger the workflow as well.
      - 'components/**'
      - 'scripts/**'
jobs:
  # Validates the metadata of components affected by the pull request.
  # If the validation scripts themselves changed, every existing component is
  # validated; otherwise only components that received newly added files are.
  validate-component-metadata-schema:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
        with:
          # Fetch full history so the git diff range below can be resolved.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          # Single source of truth: the workflow-level PYTHON_VERSION variable.
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Test dependencies
        run: |
          pip install -r scripts/validate_metadata/requirements.txt

      - name: Retrieve new components
        id: get-new-components
        # Output `new_components_list`: comma-separated `<type>/<name>` paths
        # relative to components/. Empty when there is nothing to validate.
        env:
          BASE_COMMIT: ${{ github.event.pull_request.base.sha }}
          HEAD_COMMIT: ${{ github.event.pull_request.head.sha }}
        run: |
          # --- Variables ---
          # NOTE(review): BASE_COMMIT/HEAD_COMMIT are only logged. The diff
          # range is HEAD~1..HEAD, which presumably relies on checkout placing
          # HEAD on the pull request merge commit - confirm.
          COMMIT_RANGE="HEAD~1..HEAD"
          SCRIPT_DIR="scripts/validate_metadata/"
          echo "Using BASE_COMMIT: $BASE_COMMIT"
          echo "Using HEAD_COMMIT: $HEAD_COMMIT"
          echo "Run COMMIT_RANGE=$COMMIT_RANGE"

          # If changes are detected in scripts/validate_metadata, validate all
          # components. Any kind of change counts (added, modified, deleted),
          # so no --diff-filter is applied here.
          SCRIPT_DIFF=$(git diff --name-only "$COMMIT_RANGE" -- "$SCRIPT_DIR")
          if [[ -n "$SCRIPT_DIFF" ]]; then
            echo "Changes detected in critical script directory: $SCRIPT_DIR"
            # Every directory exactly two levels below components/ is treated
            # as one component (<type>/<name>).
            ALL_COMPONENT_FILES=$(find components -mindepth 2 -maxdepth 2 -type d | \
              sed 's/^components\///' | \
              sort -u | \
              tr '\n' ',' | \
              sed 's/,$//')
            echo "Changes detected in scripts/validate_metadata. All existing components will be validated: $ALL_COMPONENT_FILES"
            echo "new_components_list=$ALL_COMPONENT_FILES" >> "$GITHUB_OUTPUT"
          else
            # 1. Find all newly added files under the components directory.
            ALL_NEW_PATHS=$(git diff --name-only --diff-filter=A "$COMMIT_RANGE" -- "components")

            # 2. Reduce the file list to unique <type>/<name> component
            #    directories, restricted to the known component types.
            #    "a,b,c" becomes the regex alternation "a|b|c".
            TYPE_PATTERN="${COMPONENT_TYPES//,/|}"
            # grep exits non-zero when nothing matches; that is not an error.
            NEW_COMPONENTS=$(echo "$ALL_NEW_PATHS" | \
              { grep -E "^components/(${TYPE_PATTERN})/[^/]+/" || true; } | \
              cut -d/ -f2,3 | \
              sort -u | \
              tr '\n' ',' | \
              sed 's/,$//')

            # --- Output ---
            echo "The following new components were found and will be validated: $NEW_COMPONENTS"
            echo "new_components_list=$NEW_COMPONENTS" >> "$GITHUB_OUTPUT"
          fi

      - name: Validate new components
        if: ${{ steps.get-new-components.outputs.new_components_list != '' }}
        env:
          # Passed through the environment instead of being interpolated into
          # the script: the list is derived from file names in the pull
          # request and must not be evaluated as shell code.
          NEW_COMPONENTS: ${{ steps.get-new-components.outputs.new_components_list }}
        run: |
          SCRIPT_PATH="$GITHUB_WORKSPACE/scripts/validate_metadata/validate_metadata.py"
          # Split on commas for this one read only; the global IFS is untouched.
          IFS=',' read -ra COMPONENTS <<< "$NEW_COMPONENTS"
          for component in "${COMPONENTS[@]}"; do
            COMPONENT_PATH="$GITHUB_WORKSPACE/components/$component"
            echo "Processing component: $component"
            python "$SCRIPT_PATH" --component "$COMPONENT_PATH"
          done