#!/bin/bash
#
# grade-output.sh - Interactive grading checklist for skill outputs
#
# Usage: ./grade-output.sh SKILL_DIR
#
# This script provides a structured checklist for evaluating skill test
# outputs. It walks through a fixed list of criteria, asks for an overall
# result, a fix priority and optional notes, then writes a JSON report to
# SKILL_DIR/evals/grading-report.json and prints a summary.
#

set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Parse arguments
SKILL_DIR=""

# Print usage information and exit with status 1.
usage() {
  echo "Usage: $0 SKILL_DIR"
  echo ""
  echo "Examples:"
  echo " $0 ~/.config/opencode/skills/my-skill"
  echo " $0 ./my-skill"
  exit 1
}

# Escape a string so it can be embedded between double quotes in JSON.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no surrounding quotes, no newline)
# Handles backslash, double quote, newline, carriage return and tab.
json_escape() {
  local text=$1
  # Backslashes first, otherwise the escapes added below get doubled.
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

# Render the arguments as a JSON array of strings.
# Arguments: zero or more raw strings
# Outputs:   e.g. ["a","b"] on stdout; [] when called without arguments
json_array() {
  local out="[" sep="" item
  for item in "$@"; do
    out+="${sep}\"$(json_escape "$item")\""
    sep=","
  done
  printf '%s]' "$out"
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      usage
      ;;
    -*)
      echo -e "${RED}Error: Unknown option $1${NC}"
      usage
      ;;
    *)
      if [[ -z "$SKILL_DIR" ]]; then
        SKILL_DIR="$1"
      else
        echo -e "${RED}Error: Multiple skill directories provided${NC}"
        usage
      fi
      shift
      ;;
  esac
done

# Validate skill directory
if [[ -z "$SKILL_DIR" ]]; then
  echo -e "${RED}Error: Skill directory is required${NC}"
  usage
fi

# Resolve path. The result goes into a separate variable so that the
# original argument is still available for the error message: assigning a
# failed command substitution directly to SKILL_DIR would blank it.
# CDPATH is cleared so cd neither searches elsewhere nor prints the target.
if ! RESOLVED_DIR="$(CDPATH='' cd -- "$SKILL_DIR" 2>/dev/null && pwd)"; then
  echo -e "${RED}Error: Cannot access directory: $SKILL_DIR${NC}"
  exit 1
fi
SKILL_DIR="$RESOLVED_DIR"

# Get skill name from directory
SKILL_NAME="$(basename "$SKILL_DIR")"

echo "========================================"
echo "Grading Output for: $SKILL_NAME"
echo "========================================"
echo ""

# Check for test results
if [[ -f "$SKILL_DIR/evals/test-results.json" ]]; then
  echo -e "${BLUE}Found previous test results${NC}"
  echo ""
fi

# Initialize grading data. Each entry is "Category: description".
declare -a CRITERIA=(
  "Correctness: Output matches expected result"
  "Correctness: No factual errors"
  "Correctness: Logic is sound"
  "Correctness: Edge cases handled appropriately"
  "Completeness: All requested tasks completed"
  "Completeness: No steps skipped"
  "Completeness: Appropriate level of detail"
  "Completeness: Relevant context included"
  "Format: Output follows specified format"
  "Format: Consistent with examples in skill"
  "Format: Easy to read and understand"
  "Triggering: Skill activated when appropriate"
  "Triggering: Did not activate when inappropriate"
  "Efficiency: No unnecessary steps"
  "Efficiency: Reasonable response length"
  "Efficiency: Not overly verbose"
)

GRADES=()
ISSUES=()

# Ask a yes/no question, re-prompting until a valid key is pressed.
# Arguments: $1 - prompt text (" (y/n): " is appended)
# Returns:   0 for y/Y, 1 for n/N
# Exits the script with status 1 if stdin reaches end of input. This
# function is only called from `if` conditions, where errexit is
# suppressed, so without the explicit check the loop would spin forever.
ask_yes_no() {
  local prompt="$1"
  while true; do
    if ! read -r -n 1 -p "$prompt (y/n): "; then
      echo
      echo -e "${RED}Error: Unexpected end of input${NC}" >&2
      exit 1
    fi
    echo
    case "$REPLY" in
      [Yy]) return 0 ;;
      [Nn]) return 1 ;;
      *) echo "Please enter y or n" ;;
    esac
  done
}

echo "This checklist will help you systematically evaluate the skill output."
echo "Answer each question based on the test results you observed."
echo ""
read -r -p "Press Enter to begin grading..."
echo ""

# Grade each criterion
echo "========================================"
echo -e "${CYAN}Grading Criteria${NC}"
echo "========================================"
echo ""

for criterion in "${CRITERIA[@]}"; do
  category="${criterion%%:*}"
  description="${criterion#*: }"

  echo -e "${BLUE}[$category]${NC} $description"

  if ask_yes_no " Does it meet this criterion"; then
    GRADES+=("$criterion: PASS")
    echo -e " ${GREEN}✓ Pass${NC}"
  else
    GRADES+=("$criterion: FAIL")
    echo -e " ${RED}✗ Fail${NC}"

    # Ask for issue description
    echo " Briefly describe the issue:"
    read -r issue
    if [[ -n "$issue" ]]; then
      ISSUES+=("[$category] $description: $issue")
    fi
  fi
  echo ""
done

# Overall assessment
echo "========================================"
echo -e "${CYAN}Overall Assessment${NC}"
echo "========================================"
echo ""

echo "Overall Result:"
echo " [p] Pass - All or most criteria met"
echo " [f] Fail - Significant issues found"
echo " [i] Incomplete - Needs more testing"
echo ""

while true; do
  read -r -n 1 -p "Overall result (p/f/i): "
  echo
  case "$REPLY" in
    [Pp])
      OVERALL_RESULT="pass"
      break
      ;;
    [Ff])
      OVERALL_RESULT="fail"
      break
      ;;
    [Ii])
      OVERALL_RESULT="incomplete"
      break
      ;;
    *)
      echo "Please enter p, f, or i"
      ;;
  esac
done

echo ""

# Priority assessment
echo "Priority of fixes needed:"
echo " [h] High - Critical issues, skill not usable"
echo " [m] Medium - Important issues, skill partially works"
echo " [l] Low - Minor issues, skill mostly works"
echo ""

while true; do
  read -r -n 1 -p "Priority (h/m/l): "
  echo
  case "$REPLY" in
    [Hh])
      PRIORITY="high"
      break
      ;;
    [Mm])
      PRIORITY="medium"
      break
      ;;
    [Ll])
      PRIORITY="low"
      break
      ;;
    *)
      echo "Please enter h, m, or l"
      ;;
  esac
done

echo ""

# Suggested fixes
echo -e "${BLUE}Suggested Improvements (optional):${NC}"
echo "Describe what changes would address the issues:"
read -r SUGGESTED_FIXES

echo ""

# Pattern analysis
echo "========================================"
echo -e "${CYAN}Pattern Analysis${NC}"
echo "========================================"
echo ""

if ask_yes_no "Did the same issue appear in multiple test cases"; then
  echo "This suggests a systemic problem. Consider:"
  echo " - Fixing the root cause rather than symptoms"
  echo " - Adding a helper script for repeated tasks"
  echo " - Clarifying instructions in SKILL.md"
  PATTERN="systemic"
else
  echo "Issues appear to be isolated to specific cases."
  PATTERN="isolated"
fi

echo ""

# Extract to script recommendation
if ask_yes_no "Should any repeated work be extracted to a script"; then
  echo "Consider creating a script in scripts/ directory."
  EXTRACT_SCRIPT="true"
else
  EXTRACT_SCRIPT="false"
fi

echo ""

# Generate grading report
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

# Build JSON fragments. Every free-text value is escaped so that quotes or
# backslashes typed by the grader cannot produce an invalid report.
# The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
# older bash versions.
ISSUES_JSON="$(json_array ${ISSUES[@]+"${ISSUES[@]}"})"
GRADES_JSON="$(json_array ${GRADES[@]+"${GRADES[@]}"})"
SKILL_NAME_JSON="$(json_escape "$SKILL_NAME")"
SUGGESTED_FIXES_JSON="$(json_escape "$SUGGESTED_FIXES")"

# Calculate pass rate
TOTAL_CRITERIA=${#CRITERIA[@]}
PASSED_COUNT=0
for grade in "${GRADES[@]}"; do
  if [[ "$grade" == *"PASS" ]]; then
    # Plain assignment rather than ((PASSED_COUNT++)): the latter returns
    # status 1 when the old value is 0, which aborts under `set -e`.
    PASSED_COUNT=$((PASSED_COUNT + 1))
  fi
done
PASS_RATE=$((PASSED_COUNT * 100 / TOTAL_CRITERIA))

# The evals directory may not exist yet for a skill never graded before.
mkdir -p -- "$SKILL_DIR/evals"

cat > "$SKILL_DIR/evals/grading-report.json" << EOF
{
  "skill_name": "$SKILL_NAME_JSON",
  "timestamp": "$TIMESTAMP",
  "overall_result": "$OVERALL_RESULT",
  "priority": "$PRIORITY",
  "pass_rate": $PASS_RATE,
  "criteria_passed": $PASSED_COUNT,
  "criteria_total": $TOTAL_CRITERIA,
  "pattern_analysis": "$PATTERN",
  "extract_script_recommended": $EXTRACT_SCRIPT,
  "detailed_grades": $GRADES_JSON,
  "issues": $ISSUES_JSON,
  "suggested_fixes": "$SUGGESTED_FIXES_JSON"
}
EOF

echo "========================================"
echo -e "${GREEN}Grading Report Generated${NC}"
echo "========================================"
echo ""
echo "Saved to: evals/grading-report.json"
echo ""
echo "Summary:"
echo " Overall: $OVERALL_RESULT"
echo " Priority: $PRIORITY"
echo " Pass Rate: $PASS_RATE% ($PASSED_COUNT/$TOTAL_CRITERIA criteria)"
echo " Pattern: $PATTERN issues"
echo ""

if [[ ${#ISSUES[@]} -gt 0 ]]; then
  echo -e "${YELLOW}Issues Found:${NC}"
  for issue in "${ISSUES[@]}"; do
    echo " - $issue"
  done
  echo ""
fi

# Next steps
echo "========================================"
echo "Next Steps"
echo "========================================"
echo ""

if [[ "$OVERALL_RESULT" == "pass" ]]; then
  echo -e "${GREEN}✓ Skill is working well!${NC}"
  echo ""
  echo "Consider:"
  echo " - Adding more edge case tests"
  echo " - Optimizing the description"
  echo " - Documenting the skill"
else
  echo "To improve the skill:"
  echo ""
  echo "1. Review grading-report.json for details"
  echo "2. Update SKILL.md based on the issues found"
  echo ""
  if [[ "$EXTRACT_SCRIPT" == "true" ]]; then
    echo "3. Create helper scripts for repeated tasks"
    echo " - Place scripts in scripts/ directory"
    echo " - Update SKILL.md to reference them"
    echo ""
  fi
  echo "4. Re-run tests to verify improvements:"
  echo " ~/.config/opencode/skills/skill-builder/scripts/run-tests.sh $SKILL_DIR"
fi

echo ""
echo -e "${GREEN}Done!${NC}"