#!/bin/bash
#
# grade-output.sh - Interactive grading checklist for skill outputs
#
# Usage: ./grade-output.sh <path/to/skill-directory>
#
# This script provides a structured checklist for evaluating skill test outputs.
# It asks a fixed series of yes/no questions, collects an overall result and a
# fix priority, and writes the answers to
# <skill-directory>/evals/grading-report.json.
#

# Abort on unhandled command failures, unset variables, and failures in any
# stage of a pipeline.
set -euo pipefail

# Colors for output
#
# ANSI escape sequences stored in their backslash form; they only become real
# escape characters when printed with `echo -e` or `printf '%b'`. Marked
# readonly because nothing in the script is meant to reassign them.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m' # No Color

# Parse arguments
SKILL_DIR=""

#######################################
# Print the usage text and terminate the script.
# Arguments:
#   $1 - exit status to terminate with (optional, defaults to 1)
# Outputs:
#   Usage text on stdout when the status is 0 (help was requested),
#   otherwise on stderr so it does not mix with regular output.
# Returns:
#   Never returns; always exits with the given status.
#######################################
usage() {
  local status="${1:-1}"
  local fd=2

  if [[ "$status" == "0" ]]; then
    fd=1
  fi

  {
    echo "Usage: $0 <path/to/skill-directory>"
    echo ""
    echo "Examples:"
    echo " $0 ~/.config/opencode/skills/my-skill"
    echo " $0 ./my-skill"
  } >&"$fd"

  exit "$status"
}

# Parse command line arguments

#######################################
# Interpret the script's command line.
# Globals:
#   SKILL_DIR (read and written) - receives the single positional argument
#   RED, NC (read)               - colour codes for error messages
# Arguments:
#   The script's positional parameters.
# Outputs:
#   Error messages on stderr.
# Returns:
#   0 once every argument is consumed; calls usage (which terminates the
#   script) for -h/--help, an unknown option, or a second positional argument.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        # Help is not an error: ask usage for a zero exit status.
        usage 0
        ;;
      -*)
        echo -e "${RED}Error: Unknown option $1${NC}" >&2
        usage
        ;;
      *)
        if [[ -n "$SKILL_DIR" ]]; then
          echo -e "${RED}Error: Multiple skill directories provided${NC}" >&2
          usage
        fi
        SKILL_DIR="$1"
        shift
        ;;
    esac
  done
}

parse_args "$@"

# Validate skill directory
if [[ -z "$SKILL_DIR" ]]; then
  echo -e "${RED}Error: Skill directory is required${NC}" >&2
  usage
fi

# Resolve path
#
# The absolute path is captured in a scratch variable first. Assigning the
# command substitution straight to SKILL_DIR would overwrite it with an empty
# string when cd fails, so the error message could no longer name the path the
# user typed. CDPATH is cleared for the cd so it cannot print an extra line
# into the captured output.
if ! resolved_dir="$(CDPATH='' cd -- "$SKILL_DIR" 2>/dev/null && pwd)"; then
  echo -e "${RED}Error: Cannot access directory: $SKILL_DIR${NC}" >&2
  exit 1
fi
SKILL_DIR="$resolved_dir"

# Get skill name from directory
SKILL_NAME="$(basename "$SKILL_DIR")"

# Title banner naming the skill that is about to be graded.
printf '%s\n' \
  "========================================" \
  "Grading Output for: $SKILL_NAME" \
  "========================================" \
  ""

# Check for test results
# Only announces that a results file exists; its contents are not read here.
if [[ -f "$SKILL_DIR/evals/test-results.json" ]]; then
  printf '%b\n' "${BLUE}Found previous test results${NC}"
  printf '\n'
fi

# Initialize grading data
#
# Each entry has the form "<Category>: <description>". The grading loop splits
# an entry at the first colon to obtain the category label and the question
# text, so new entries must keep that shape.
declare -a CRITERIA=(
  "Correctness: Output matches expected result"
  "Correctness: No factual errors"
  "Correctness: Logic is sound"
  "Correctness: Edge cases handled appropriately"
  "Completeness: All requested tasks completed"
  "Completeness: No steps skipped"
  "Completeness: Appropriate level of detail"
  "Completeness: Relevant context included"
  "Format: Output follows specified format"
  "Format: Consistent with examples in skill"
  "Format: Easy to read and understand"
  "Triggering: Skill activated when appropriate"
  "Triggering: Did not activate when inappropriate"
  "Efficiency: No unnecessary steps"
  "Efficiency: Reasonable response length"
  "Efficiency: Not overly verbose"
)
# The checklist is fixed for the lifetime of the script.
readonly CRITERIA

# One "<criterion>: PASS|FAIL" entry is appended per graded criterion.
GRADES=()
# Free-text descriptions collected for failed criteria.
ISSUES=()

# Function to ask yes/no question
#######################################
# Ask a question until the user answers with a single y or n keystroke.
# Arguments:
#   $1 - question text; " (y/n): " is appended to form the prompt
# Outputs:
#   A newline after each keystroke and a reminder after an invalid one on
#   stdout; an error message on stderr if input ends.
# Returns:
#   0 for y/Y, 1 for n/N. Exits the script with status 1 when stdin reaches
#   end-of-file.
#######################################
ask_yes_no() {
  local prompt="$1"
  local answer

  while true; do
    # This function is called from `if`, where `set -e` is suppressed, so a
    # failing read (EOF) must be handled here or the loop would spin forever.
    if ! read -p "$prompt (y/n): " -n 1 -r answer; then
      echo "Error: unexpected end of input" >&2
      exit 1
    fi
    echo
    case "$answer" in
      [Yy])
        return 0
        ;;
      [Nn])
        return 1
        ;;
      *)
        echo "Please enter y or n"
        ;;
    esac
  done
}

echo "This checklist will help you systematically evaluate the skill output."
echo "Answer each question based on the test results you observed."
echo ""
# Pause until the user presses Enter. -r stops a trailing backslash from being
# read as a line continuation; whatever was typed is discarded.
read -r -p "Press Enter to begin grading..." _
echo ""

# Grade each criterion
printf '%s\n' "========================================"
printf '%b\n' "${CYAN}Grading Criteria${NC}"
printf '%s\n' "========================================" ""

for entry in "${CRITERIA[@]}"; do
  # Entries look like "<Category>: <description>"; split at the first colon.
  group="${entry%%:*}"
  detail="${entry#*: }"

  printf '%b %s\n' "${BLUE}[$group]${NC}" "$detail"

  if ask_yes_no " Does it meet this criterion"; then
    GRADES+=("$entry: PASS")
    printf '%b\n' " ${GREEN}✓ Pass${NC}"
    printf '\n'
    continue
  fi

  GRADES+=("$entry: FAIL")
  printf '%b\n' " ${RED}✗ Fail${NC}"

  # Ask for issue description
  printf '%s\n' " Briefly describe the issue:"
  read -r note
  # An empty answer is allowed and simply records no issue text.
  if [[ -n "$note" ]]; then
    ISSUES+=("[$group] $detail: $note")
  fi
  printf '\n'
done

# Overall assessment

#######################################
# Ask for a single keystroke until it matches one of the offered options, then
# store that option's value in the named variable.
# Arguments:
#   $1  - name of the variable that receives the chosen value
#   $2  - prompt text, passed to read as-is
#   $3  - list of valid keys shown in the "Please enter ..." reminder
#   $4+ - options of the form KEYS=VALUE, where KEYS holds every accepted
#         character (e.g. "Pp=pass")
# Outputs:
#   A newline after each keystroke and a reminder after an invalid one on
#   stdout; an error message on stderr if input ends.
# Returns:
#   0 once a valid key was entered. Exits the script with status 1 when stdin
#   reaches end-of-file.
#######################################
ask_choice() {
  local target="$1"
  local prompt="$2"
  local hint="$3"
  shift 3
  local option

  while true; do
    if ! read -p "$prompt" -n 1 -r; then
      echo "Error: unexpected end of input" >&2
      exit 1
    fi
    echo
    for option in "$@"; do
      # REPLY holds at most one character; an empty REPLY (plain Enter) must
      # not match, hence the explicit non-empty check.
      if [[ -n "$REPLY" && "${option%%=*}" == *"$REPLY"* ]]; then
        printf -v "$target" '%s' "${option#*=}"
        return 0
      fi
    done
    echo "Please enter $hint"
  done
}

echo "========================================"
echo -e "${CYAN}Overall Assessment${NC}"
echo "========================================"
echo ""

echo "Overall Result:"
echo " [p] Pass - All or most criteria met"
echo " [f] Fail - Significant issues found"
echo " [i] Incomplete - Needs more testing"
echo ""

ask_choice OVERALL_RESULT "Overall result (p/f/i): " "p, f, or i" \
  "Pp=pass" "Ff=fail" "Ii=incomplete"

echo ""

# Priority assessment
echo "Priority of fixes needed:"
echo " [h] High - Critical issues, skill not usable"
echo " [m] Medium - Important issues, skill partially works"
echo " [l] Low - Minor issues, skill mostly works"
echo ""

ask_choice PRIORITY "Priority (h/m/l): " "h, m, or l" \
  "Hh=high" "Mm=medium" "Ll=low"

echo ""

# Suggested fixes
# One free-text line; pressing Enter alone leaves SUGGESTED_FIXES empty.
printf '%b\n' "${BLUE}Suggested Improvements (optional):${NC}"
printf '%s\n' "Describe what changes would address the issues:"
read -r SUGGESTED_FIXES

printf '\n'

# Pattern analysis
printf '%s\n' "========================================"
printf '%b\n' "${CYAN}Pattern Analysis${NC}"
printf '%s\n' "========================================" ""

# PATTERN records whether the grader saw the same problem across test cases.
if ask_yes_no "Did the same issue appear in multiple test cases"; then
  printf '%s\n' \
    "This suggests a systemic problem. Consider:" \
    " - Fixing the root cause rather than symptoms" \
    " - Adding a helper script for repeated tasks" \
    " - Clarifying instructions in SKILL.md"
  PATTERN="systemic"
else
  printf '%s\n' "Issues appear to be isolated to specific cases."
  PATTERN="isolated"
fi

printf '\n'

# Extract to script recommendation
# Stored as the bare words true/false because the value is written into the
# report as a JSON boolean.
EXTRACT_SCRIPT="false"
if ask_yes_no "Should any repeated work be extracted to a script"; then
  printf '%s\n' "Consider creating a script in scripts/ directory."
  EXTRACT_SCRIPT="true"
fi

printf '\n'

# Generate grading report
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

#######################################
# Escape a string so it can be placed between double quotes in JSON.
# Arguments:
#   $1 - raw text
# Outputs:
#   The text with backslash, double quote, tab, newline and carriage return
#   replaced by their JSON escape sequences (no trailing newline).
#######################################
json_escape() {
  local text="$1"
  # Backslashes first, otherwise the escapes added below get doubled.
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\t'/\\t}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  printf '%s' "$text"
}

#######################################
# Render the arguments as a JSON array of strings.
# Arguments:
#   Zero or more strings.
# Outputs:
#   e.g. ["a","b"]; [] when called without arguments.
#######################################
json_string_array() {
  local out="["
  local sep=""
  local item
  for item in "$@"; do
    out+="${sep}\"$(json_escape "$item")\""
    sep=","
  done
  printf '%s' "${out}]"
}

#######################################
# Count how many grade entries end in PASS.
# Arguments:
#   Zero or more "<criterion>: PASS|FAIL" strings.
# Outputs:
#   The count as a decimal number.
#######################################
count_passed() {
  local total=0
  local entry
  for entry in "$@"; do
    if [[ "$entry" == *"PASS" ]]; then
      # Plain assignment on purpose: ((total++)) returns status 1 when the old
      # value is 0, which aborts the script under `set -e`.
      total=$((total + 1))
    fi
  done
  printf '%s' "$total"
}

# The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array instead
# of tripping `set -u` on older bash releases.

# Build issues array
ISSUES_JSON="$(json_string_array ${ISSUES[@]+"${ISSUES[@]}"})"

# Build grades array
GRADES_JSON="$(json_string_array ${GRADES[@]+"${GRADES[@]}"})"

# Calculate pass rate
TOTAL_CRITERIA=${#CRITERIA[@]}
PASSED_COUNT="$(count_passed ${GRADES[@]+"${GRADES[@]}"})"

# Integer percentage, rounded down; guarded so an empty criteria list cannot
# cause a division by zero.
if [[ "$TOTAL_CRITERIA" -gt 0 ]]; then
  PASS_RATE=$((PASSED_COUNT * 100 / TOTAL_CRITERIA))
else
  PASS_RATE=0
fi

# Write the report to <skill-directory>/evals/grading-report.json.

# The evals directory may not exist yet; without it the redirection below
# fails and every answer collected so far is lost.
if ! mkdir -p -- "$SKILL_DIR/evals"; then
  echo -e "${RED}Error: Cannot create directory: $SKILL_DIR/evals${NC}" >&2
  exit 1
fi

# Free-form values are escaped before being embedded in JSON string literals:
# backslashes first, then double quotes, then tabs (which a typed line may
# contain and JSON does not allow raw).
skill_name_json=${SKILL_NAME//\\/\\\\}
skill_name_json=${skill_name_json//\"/\\\"}
skill_name_json=${skill_name_json//$'\t'/\\t}
suggested_fixes_json=${SUGGESTED_FIXES//\\/\\\\}
suggested_fixes_json=${suggested_fixes_json//\"/\\\"}
suggested_fixes_json=${suggested_fixes_json//$'\t'/\\t}

# pass_rate, the two counters, extract_script_recommended and the two arrays
# are written unquoted because they are already JSON numbers, a boolean, and
# array literals respectively.
cat > "$SKILL_DIR/evals/grading-report.json" << EOF
{
  "skill_name": "$skill_name_json",
  "timestamp": "$TIMESTAMP",
  "overall_result": "$OVERALL_RESULT",
  "priority": "$PRIORITY",
  "pass_rate": $PASS_RATE,
  "criteria_passed": $PASSED_COUNT,
  "criteria_total": $TOTAL_CRITERIA,
  "pattern_analysis": "$PATTERN",
  "extract_script_recommended": $EXTRACT_SCRIPT,
  "detailed_grades": $GRADES_JSON,
  "issues": $ISSUES_JSON,
  "suggested_fixes": "$suggested_fixes_json"
}
EOF

# Confirmation banner followed by a recap of the recorded answers.
printf '%s\n' "========================================"
printf '%b\n' "${GREEN}Grading Report Generated${NC}"
printf '%s\n' "========================================" ""
printf '%s\n' "Saved to: evals/grading-report.json" ""
printf '%s\n' \
  "Summary:" \
  " Overall: $OVERALL_RESULT" \
  " Priority: $PRIORITY" \
  " Pass Rate: ${PASS_RATE}% (${PASSED_COUNT}/${TOTAL_CRITERIA} criteria)" \
  " Pattern: $PATTERN issues" \
  ""

# List the recorded issue descriptions, one per line; skipped entirely when
# nothing was recorded.
if (( ${#ISSUES[@]} > 0 )); then
  printf '%b\n' "${YELLOW}Issues Found:${NC}"
  printf ' - %s\n' "${ISSUES[@]}"
  printf '\n'
fi

# Next steps

#######################################
# Print follow-up advice that matches the grading outcome.
# Globals:
#   GREEN, NC (read) - colour codes
# Arguments:
#   $1 - overall result; "pass" selects the success advice, anything else the
#        improvement steps
#   $2 - "true" to include the helper-script step
#   $3 - skill directory shown in the re-run command
# Outputs:
#   The advice on stdout. Improvement steps are numbered consecutively, so the
#   list has no gap when the helper-script step is left out.
#######################################
print_next_steps() {
  local result="$1"
  local extract="$2"
  local skill_dir="$3"
  local step=1

  echo "========================================"
  echo "Next Steps"
  echo "========================================"
  echo ""

  if [[ "$result" == "pass" ]]; then
    echo -e "${GREEN}✓ Skill is working well!${NC}"
    echo ""
    echo "Consider:"
    echo " - Adding more edge case tests"
    echo " - Optimizing the description"
    echo " - Documenting the skill"
    return 0
  fi

  echo "To improve the skill:"
  echo ""
  echo "${step}. Review grading-report.json for details"
  step=$((step + 1))
  echo "${step}. Update SKILL.md based on the issues found"
  step=$((step + 1))
  echo ""

  if [[ "$extract" == "true" ]]; then
    echo "${step}. Create helper scripts for repeated tasks"
    step=$((step + 1))
    echo " - Place scripts in scripts/ directory"
    echo " - Update SKILL.md to reference them"
    echo ""
  fi

  echo "${step}. Re-run tests to verify improvements:"
  echo " ~/.config/opencode/skills/skill-builder/scripts/run-tests.sh $skill_dir"
}

print_next_steps "$OVERALL_RESULT" "$EXTRACT_SCRIPT" "$SKILL_DIR"

echo ""
echo -e "${GREEN}Done!${NC}"