Files
2026-03-22 23:21:49 +02:00

364 lines
8.8 KiB
Bash
Executable File

#!/bin/bash
#
# grade-output.sh - Interactive grading checklist for skill outputs
#
# Usage: ./grade-output.sh <path/to/skill-directory>
#
# This script provides a structured checklist for evaluating skill test outputs.
#
# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ANSI color sequences. They are stored as literal backslash text and are
# only turned into terminal escapes when printed through `echo -e`.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
CYAN="\033[0;36m"
NC="\033[0m" # resets the color

# Skill directory taken from the command line; empty until one is parsed.
SKILL_DIR=
# Print the usage text (synopsis plus two example invocations) to stdout and
# terminate the script with exit status 1.
usage() {
  printf '%s\n' \
    "Usage: $0 <path/to/skill-directory>" \
    "" \
    "Examples:" \
    " $0 ~/.config/opencode/skills/my-skill" \
    " $0 ./my-skill"
  exit 1
}
# Parse command line arguments
# Flags are matched first; any other word is taken as the skill directory.
# Every error path ends in usage(), which exits, so nothing falls through.
until [[ $# -eq 0 ]]; do
  case "$1" in
    -h | --help)
      usage
      ;;
    -*)
      echo -e "${RED}Error: Unknown option $1${NC}"
      usage
      ;;
    *)
      # Only a single directory is accepted.
      if [[ -n "$SKILL_DIR" ]]; then
        echo -e "${RED}Error: Multiple skill directories provided${NC}"
        usage
      fi
      SKILL_DIR="$1"
      shift
      ;;
  esac
done

# Validate skill directory
[[ -n "$SKILL_DIR" ]] || {
  echo -e "${RED}Error: Skill directory is required${NC}"
  usage
}
# Resolve path
# The absolute path is captured in a separate variable first. Assigning the
# command substitution straight to SKILL_DIR would store its empty output
# when `cd` fails, so the error message below would print no path at all.
# CDPATH is cleared for the `cd` so a relative argument is resolved against
# the current directory only and `cd` prints nothing extra into the capture.
RESOLVED_SKILL_DIR="$(CDPATH='' cd -- "$SKILL_DIR" 2>/dev/null && pwd)" || {
  echo -e "${RED}Error: Cannot access directory: $SKILL_DIR${NC}"
  exit 1
}
SKILL_DIR="$RESOLVED_SKILL_DIR"

# Get skill name from directory
SKILL_NAME="$(basename "$SKILL_DIR")"
# Banner naming the skill being graded.
printf '%s\n' \
  "========================================" \
  "Grading Output for: $SKILL_NAME" \
  "========================================" \
  ""

# Check for test results
# Purely informational: the file is only tested for existence, never read.
if [[ -f "$SKILL_DIR/evals/test-results.json" ]]; then
  echo -e "${BLUE}Found previous test results${NC}"
  echo
fi
# Initialize grading data
# Each checklist entry has the form "<Category>: <description>"; the grading
# loop splits an entry at its first colon to recover the two parts.
CRITERIA=()

# Correctness
CRITERIA+=(
  "Correctness: Output matches expected result"
  "Correctness: No factual errors"
  "Correctness: Logic is sound"
  "Correctness: Edge cases handled appropriately"
)

# Completeness
CRITERIA+=(
  "Completeness: All requested tasks completed"
  "Completeness: No steps skipped"
  "Completeness: Appropriate level of detail"
  "Completeness: Relevant context included"
)

# Format
CRITERIA+=(
  "Format: Output follows specified format"
  "Format: Consistent with examples in skill"
  "Format: Easy to read and understand"
)

# Triggering
CRITERIA+=(
  "Triggering: Skill activated when appropriate"
  "Triggering: Did not activate when inappropriate"
)

# Efficiency
CRITERIA+=(
  "Efficiency: No unnecessary steps"
  "Efficiency: Reasonable response length"
  "Efficiency: Not overly verbose"
)

# Filled in while grading: one "<criterion>: PASS|FAIL" entry per criterion,
# and one entry per failure for which the user typed a description.
declare -a GRADES=()
declare -a ISSUES=()
# Function to ask yes/no question
#
# Shows "<prompt> (y/n): " and reads one keypress at a time until the answer
# is y/Y or n/N.
#
# Arguments: $1 - question text (without the "(y/n)" suffix)
# Outputs:   a newline after each keypress; "Please enter y or n" on stdout
#            after an invalid key; an error on stderr if input ends
# Returns:   0 for yes, 1 for no
# Exits:     status 1 if input reaches EOF before a valid answer is read
ask_yes_no() {
  local prompt="$1"
  local reply

  while true; do
    # This function is called as an `if` condition, where errexit is
    # suspended. A failed read (EOF, e.g. Ctrl-D or exhausted piped input)
    # would otherwise leave the reply empty and repeat this loop forever,
    # so it is checked explicitly.
    if ! read -r -n 1 -p "$prompt (y/n): " reply; then
      echo
      echo "Error: input ended before a y/n answer was given" >&2
      exit 1
    fi
    echo
    case "$reply" in
      [Yy]) return 0 ;;
      [Nn]) return 1 ;;
      *) echo "Please enter y or n" ;;
    esac
  done
}
# Explain what is about to happen, then wait for Enter before starting.
printf '%s\n' \
  "This checklist will help you systematically evaluate the skill output." \
  "Answer each question based on the test results you observed." \
  ""
read -p "Press Enter to begin grading..."
echo
# Grade each criterion
printf '%s\n' "========================================"
echo -e "${CYAN}Grading Criteria${NC}"
printf '%s\n' "========================================" ""

# Ask one yes/no question per checklist entry. Every answer is recorded in
# GRADES; a failure additionally asks for a one-line description, which is
# stored in ISSUES unless it was left empty.
for criterion in "${CRITERIA[@]}"; do
  category="${criterion%%:*}"     # text before the first colon
  description="${criterion#*: }"  # text after the first ": "
  echo -e "${BLUE}[$category]${NC} $description"

  if ! ask_yes_no " Does it meet this criterion"; then
    GRADES+=("$criterion: FAIL")
    echo -e " ${RED}✗ Fail${NC}"
    # Ask for issue description
    printf '%s\n' " Briefly describe the issue:"
    read -r issue
    [[ -z "$issue" ]] || ISSUES+=("[$category] $description: $issue")
  else
    GRADES+=("$criterion: PASS")
    echo -e " ${GREEN}✓ Pass${NC}"
  fi
  echo
done
# Overall assessment
printf '%s\n' "========================================"
echo -e "${CYAN}Overall Assessment${NC}"
printf '%s\n' \
  "========================================" \
  "" \
  "Overall Result:" \
  " [p] Pass - All or most criteria met" \
  " [f] Fail - Significant issues found" \
  " [i] Incomplete - Needs more testing" \
  ""

# Keep prompting until one of p/f/i (either case) has been entered; the
# loop ends as soon as OVERALL_RESULT receives a value.
OVERALL_RESULT=""
while [[ -z "$OVERALL_RESULT" ]]; do
  read -r -n 1 -p "Overall result (p/f/i): "
  echo
  case "$REPLY" in
    [Pp]) OVERALL_RESULT="pass" ;;
    [Ff]) OVERALL_RESULT="fail" ;;
    [Ii]) OVERALL_RESULT="incomplete" ;;
    *) echo "Please enter p, f, or i" ;;
  esac
done
echo
# Priority assessment
printf '%s\n' \
  "Priority of fixes needed:" \
  " [h] High - Critical issues, skill not usable" \
  " [m] Medium - Important issues, skill partially works" \
  " [l] Low - Minor issues, skill mostly works" \
  ""

# Keep prompting until one of h/m/l (either case) has been entered; the
# loop ends as soon as PRIORITY receives a value.
PRIORITY=""
while [[ -z "$PRIORITY" ]]; do
  read -r -n 1 -p "Priority (h/m/l): "
  echo
  case "$REPLY" in
    [Hh]) PRIORITY="high" ;;
    [Mm]) PRIORITY="medium" ;;
    [Ll]) PRIORITY="low" ;;
    *) echo "Please enter h, m, or l" ;;
  esac
done
echo
# Suggested fixes
# One line of free text; pressing Enter alone leaves SUGGESTED_FIXES empty.
echo -e "${BLUE}Suggested Improvements (optional):${NC}"
printf '%s\n' "Describe what changes would address the issues:"
read -r SUGGESTED_FIXES
echo
# Pattern analysis
printf '%s\n' "========================================"
echo -e "${CYAN}Pattern Analysis${NC}"
printf '%s\n' "========================================" ""

# Record in PATTERN whether the user saw the same issue in several test
# cases ("systemic") or not ("isolated"), and print matching guidance.
if ! ask_yes_no "Did the same issue appear in multiple test cases"; then
  printf '%s\n' "Issues appear to be isolated to specific cases."
  PATTERN="isolated"
else
  printf '%s\n' \
    "This suggests a systemic problem. Consider:" \
    " - Fixing the root cause rather than symptoms" \
    " - Adding a helper script for repeated tasks" \
    " - Clarifying instructions in SKILL.md"
  PATTERN="systemic"
fi
echo
# Extract to script recommendation
# EXTRACT_SCRIPT holds the bare words true/false because it is later written
# into the report as an unquoted JSON boolean.
EXTRACT_SCRIPT="false"
if ask_yes_no "Should any repeated work be extracted to a script"; then
  printf '%s\n' "Consider creating a script in scripts/ directory."
  EXTRACT_SCRIPT="true"
fi
echo
# Generate grading report
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

# Escape a string for use inside a JSON string literal.
# Arguments: $1 - raw text
# Outputs:   the text with backslash, double quote, newline, carriage return
#            and tab replaced by their JSON escapes (no surrounding quotes).
#            Other control characters are passed through unchanged.
json_escape() {
  local text="$1"
  # Backslashes go first so the escapes added below are not doubled again.
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

# Render the arguments as a JSON array of strings.
# Arguments: $@ - zero or more raw strings
# Outputs:   e.g. ["a","b"]; prints [] when called without arguments
json_string_array() {
  local item joined=""
  for item in "$@"; do
    joined+="${joined:+,}\"$(json_escape "$item")\""
  done
  printf '[%s]' "$joined"
}

# Issue descriptions are typed by the user and may contain quotes or
# backslashes, so every element is escaped before being embedded.
# The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
# older bash releases.
# Build issues array
ISSUES_JSON=$(json_string_array ${ISSUES[@]+"${ISSUES[@]}"})
# Build grades array
GRADES_JSON=$(json_string_array ${GRADES[@]+"${GRADES[@]}"})
# Calculate pass rate

# Count how many grade entries end in "PASS".
# Arguments: $@ - grade strings of the form "<criterion>: PASS|FAIL"
# Outputs:   the count on stdout
count_passed() {
  local entry passed=0
  for entry in "$@"; do
    if [[ "$entry" == *"PASS" ]]; then
      # A plain assignment is used on purpose: `((passed++))` has exit
      # status 1 when the value before the increment is 0, which makes the
      # script abort under `set -e` on the very first PASS.
      passed=$((passed + 1))
    fi
  done
  printf '%s\n' "$passed"
}

TOTAL_CRITERIA=${#CRITERIA[@]}
PASSED_COUNT=$(count_passed ${GRADES[@]+"${GRADES[@]}"})

# Integer percentage, rounded down; an empty checklist yields 0 rather than
# a division-by-zero error.
if ((TOTAL_CRITERIA > 0)); then
  PASS_RATE=$((PASSED_COUNT * 100 / TOTAL_CRITERIA))
else
  PASS_RATE=0
fi
# Write the grading report to <skill>/evals/grading-report.json.

# Nothing earlier in the script creates evals/, so make sure it exists;
# otherwise the redirection below fails for a skill without that directory.
mkdir -p "$SKILL_DIR/evals"

# The skill name and the suggested-fixes text are free-form, so backslashes,
# double quotes and tabs are escaped to keep the report valid JSON.
# Backslashes are handled first so the added escapes are not doubled.
SKILL_NAME_JSON=${SKILL_NAME//\\/\\\\}
SKILL_NAME_JSON=${SKILL_NAME_JSON//\"/\\\"}
SKILL_NAME_JSON=${SKILL_NAME_JSON//$'\t'/\\t}
SUGGESTED_FIXES_JSON=${SUGGESTED_FIXES//\\/\\\\}
SUGGESTED_FIXES_JSON=${SUGGESTED_FIXES_JSON//\"/\\\"}
SUGGESTED_FIXES_JSON=${SUGGESTED_FIXES_JSON//$'\t'/\\t}

cat > "$SKILL_DIR/evals/grading-report.json" << EOF
{
"skill_name": "$SKILL_NAME_JSON",
"timestamp": "$TIMESTAMP",
"overall_result": "$OVERALL_RESULT",
"priority": "$PRIORITY",
"pass_rate": $PASS_RATE,
"criteria_passed": $PASSED_COUNT,
"criteria_total": $TOTAL_CRITERIA,
"pattern_analysis": "$PATTERN",
"extract_script_recommended": $EXTRACT_SCRIPT,
"detailed_grades": $GRADES_JSON,
"issues": $ISSUES_JSON,
"suggested_fixes": "$SUGGESTED_FIXES_JSON"
}
EOF
# Confirm that the report was written and recap the headline numbers.
printf '%s\n' "========================================"
echo -e "${GREEN}Grading Report Generated${NC}"
printf '%s\n' \
  "========================================" \
  "" \
  "Saved to: evals/grading-report.json" \
  "" \
  "Summary:" \
  " Overall: $OVERALL_RESULT" \
  " Priority: $PRIORITY" \
  " Pass Rate: $PASS_RATE% ($PASSED_COUNT/$TOTAL_CRITERIA criteria)" \
  " Pattern: $PATTERN issues" \
  ""

# List the recorded issue descriptions, one bullet per entry.
if ((${#ISSUES[@]} > 0)); then
  echo -e "${YELLOW}Issues Found:${NC}"
  printf ' - %s\n' "${ISSUES[@]}"
  echo
fi
# Next steps
printf '%s\n' \
  "========================================" \
  "Next Steps" \
  "========================================" \
  ""

# Any result other than "pass" gets the improvement checklist; the helper
# script step is shown only when extraction was recommended earlier.
if [[ "$OVERALL_RESULT" != "pass" ]]; then
  printf '%s\n' \
    "To improve the skill:" \
    "" \
    "1. Review grading-report.json for details" \
    "2. Update SKILL.md based on the issues found" \
    ""
  if [[ "$EXTRACT_SCRIPT" == "true" ]]; then
    printf '%s\n' \
      "3. Create helper scripts for repeated tasks" \
      " - Place scripts in scripts/ directory" \
      " - Update SKILL.md to reference them" \
      ""
  fi
  printf '%s\n' \
    "4. Re-run tests to verify improvements:" \
    " ~/.config/opencode/skills/skill-builder/scripts/run-tests.sh $SKILL_DIR"
else
  echo -e "${GREEN}✓ Skill is working well!${NC}"
  printf '%s\n' \
    "" \
    "Consider:" \
    " - Adding more edge case tests" \
    " - Optimizing the description" \
    " - Documenting the skill"
fi
echo
echo -e "${GREEN}Done!${NC}"