Spaces:

burtenshaw
/

code_quiz

Running

File size: 9,641 Bytes

6f46aeb

import os
import random
import re
from datetime import datetime

import gradio as gr
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import whoami, InferenceClient

# Quiz configuration. Values coming from the environment are strings while the
# fallbacks are numeric, so any consumer has to cast before doing arithmetic.
# NOTE(review): EXAM_MAX_QUESTIONS is not referenced anywhere else in the
# visible code - confirm whether it was meant to cap the number of questions.
EXAM_DATASET_ID = "agents-course/dummy-code-quiz"
EXAM_MAX_QUESTIONS = os.environ.get("EXAM_MAX_QUESTIONS") or 5
EXAM_PASSING_SCORE = os.environ.get("EXAM_PASSING_SCORE") or 0.0

# Inference client; its API key is read from the HF_API_KEY environment variable.
client = InferenceClient(api_key=os.environ.get("HF_API_KEY"))

# Load the "train" split once at import time and randomise the question order.
ds = load_dataset(EXAM_DATASET_ID, split="train")
quiz_data = ds.to_list()
random.shuffle(quiz_data)

def _split_verdict(response):
    """Split a grader reply into an ``(is_correct, explanation)`` pair."""
    # Models often wrap the verdict in markdown or quotes (e.g. **CORRECT**),
    # so strip that decoration before looking at the first word.
    decoration = "*_`\"'#> \t\r\n"
    text = response.strip().lstrip(decoration)
    # "INCORRECT" is tested first only for clarity; neither word is a prefix
    # of the other, so the order does not affect the result.
    for verdict, is_correct in (("INCORRECT", False), ("CORRECT", True)):
        if text[: len(verdict)].upper() == verdict:
            # Whatever follows the verdict word is the explanation, whether it
            # is on the same line or on the following lines.
            explanation = text[len(verdict):].lstrip(decoration + ":.,;!-")
            return is_correct, explanation
    # Unrecognised reply: treat it as incorrect and surface the full text.
    return False, text


def check_code(user_code, solution, challenge):
    """
    Use LLM to evaluate if the user's code solution is correct.

    Args:
        user_code: The code submitted by the student.
        solution: The reference solution for the challenge.
        challenge: The challenge description shown to the student.

    Returns:
        True if the grader's reply starts with "CORRECT", False otherwise.
        If the request fails, a warning is shown and the result falls back to
        an exact (whitespace-stripped) comparison of user_code and solution.
    """
    # NOTE(security): user_code is untrusted and is interpolated verbatim, so a
    # submission can contain text that tries to instruct the grader to answer
    # "CORRECT". The verdict should not be relied on for anything sensitive.
    prompt = f"""You are an expert Python programming instructor evaluating a student's code solution.

    Challenge:
    {challenge}

    Reference Solution:
    {solution}
    
    Student's Solution:
    {user_code}
    
    Evaluate if the student's solution is functionally equivalent to the reference solution.
    Consider:
    1. Does it solve the problem correctly?
    2. Does it handle edge cases appropriately?
    3. Does it follow the requirements of the challenge?

    Respond with ONLY "CORRECT" or "INCORRECT" followed by a brief explanation.
    """

    messages = [{"role": "user", "content": prompt}]

    # Keep the try body limited to the remote call and reading its reply.
    try:
        completion = client.chat.completions.create(
            model="Qwen/Qwen2.5-Coder-32B-Instruct",
            messages=messages,
            max_tokens=500,
        )
        response = completion.choices[0].message.content.strip()
    except Exception as e:
        gr.Warning(f"Error checking code: {str(e)}")
        # Fall back to simple string comparison if LLM fails
        return user_code.strip() == solution.strip()

    is_correct, explanation = _split_verdict(response)

    # Only raise a toast when there is something to show.
    if explanation:
        gr.Info(explanation)

    return is_correct


def on_user_logged_in(token: gr.OAuthToken | None):
    """Return visibility updates for the login and start buttons.

    The first update is visible only when no token is present; the second is
    visible only when a token is present.
    """
    has_token = token is not None
    return gr.update(visible=not has_token), gr.update(visible=has_token)


def push_results_to_hub(
    user_answers: list, token: gr.OAuthToken | None, signed_in_message: str
):
    """Grade the recorded answers and push them to the responses dataset.

    Args:
        user_answers: Answer records; each one must contain an "is_correct" key.
        token: OAuth token of the signed-in user, or None when not logged in.
        signed_in_message: Only printed for debugging; not otherwise used.

    Returns:
        A status message: nothing to submit, not logged in, score below the
        passing threshold, or a success message with the final grade.
    """

    print(f"signed_in_message: {signed_in_message}")

    if not user_answers:  # Check if there are any answers to submit
        gr.Warning("No answers to submit!")
        return "No answers to submit!"

    if token is None:
        gr.Warning("Please log in to Hugging Face before pushing!")
        return "Please log in to Hugging Face before pushing!"

    # Calculate grade (user_answers is known to be non-empty at this point)
    correct_count = sum(1 for answer in user_answers if answer["is_correct"])
    total_questions = len(user_answers)
    grade = correct_count / total_questions if total_questions > 0 else 0

    # The threshold may be a string taken from the environment, hence the cast.
    passing_score = float(EXAM_PASSING_SCORE)
    if grade < passing_score:
        gr.Warning(
            f"Score {grade:.1%} below passing threshold of {passing_score:.1%}"
        )
        return f"You scored {grade:.1%}. Please try again to achieve at least {passing_score:.1%}"

    gr.Info("Submitting answers to the Hub. Please wait...", duration=2)

    user_info = whoami(token=token.token)
    username = user_info["name"]
    repo_id = f"{EXAM_DATASET_ID}_responses"
    submission_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create a dataset with the user's answers and metadata
    submission_data = [
        {
            "username": username,
            "datetime": submission_time,
            "grade": grade,
            **answer,  # Include all answer data
        }
        for answer in user_answers
    ]

    try:
        # Try to load existing dataset
        existing_ds = load_dataset(repo_id)
        # Convert to DatasetDict if it isn't already
        if not isinstance(existing_ds, dict):
            existing_ds = DatasetDict({"default": existing_ds})
    except Exception:
        # If dataset doesn't exist, create empty DatasetDict
        # NOTE(review): any load failure (not only "missing repo") lands here.
        existing_ds = DatasetDict()

    # Create new dataset from submission
    new_ds = Dataset.from_list(submission_data)

    # Split names are validated on push and must match ^\w+(\.\w+)*$, but a
    # username can contain other characters such as "-". Map those to "_" so
    # the push is not rejected; the unmodified name stays in the "username"
    # column of every row.
    split_name = re.sub(r"\W", "_", username)

    # Add or update the split for this user
    existing_ds[split_name] = new_ds

    # Push the updated dataset to the Hub
    existing_ds.push_to_hub(
        repo_id,
        private=True,  # Make it private by default since it contains student submissions
    )

    return f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"


def _format_results(user_answers, grade):
    """Render the end-of-quiz summary (score plus each submission) as markdown."""
    header = (
        f"**Quiz Complete!**\n\n"
        f"Your score: {grade:.1%}\n"
        f"Passing score: {float(EXAM_PASSING_SCORE):.1%}\n\n"
        f"Your answers:\n\n"
    )
    per_answer = [
        f"Question {idx + 1}: {'✅' if answer['is_correct'] else '❌'}\n"
        f"Your code:\n```python\n{answer['submitted_code']}\n```\n\n"
        for idx, answer in enumerate(user_answers)
    ]
    return header + "".join(per_answer)


def handle_quiz(question_idx, user_answers, submitted_code, is_start):
    """Handle quiz state and progression.

    Args:
        question_idx: Index of the question currently shown.
        user_answers: List of answer records collected so far.
        submitted_code: Code typed for the current question (may be empty or None).
        is_start: True when triggered to begin the quiz, False when advancing.

    Returns:
        A 7-tuple, in order: question markdown, code input update, status
        text, new question index, answer list, start button update, and
        final results markdown update.
    """
    if is_start:
        # Hide the start button once the first question is shown
        start_btn_update = gr.update(visible=False)
        question_idx = 0
        # Starting begins at question 0, so drop anything recorded earlier;
        # otherwise those answers would be counted twice.
        user_answers = []
    else:
        # An empty update leaves the start button untouched (returning None
        # would instead write None as the button's value).
        start_btn_update = gr.update()

        # The code component can yield None when nothing was entered yet.
        has_code = bool((submitted_code or "").strip())
        if question_idx < len(quiz_data) and has_code:
            current_q = quiz_data[question_idx]
            is_correct = check_code(
                submitted_code, current_q["solution"], current_q["challenge"]
            )
            user_answers.append(
                {
                    "challenge": current_q["challenge"],
                    "submitted_code": submitted_code,
                    "correct_solution": current_q["solution"],
                    "is_correct": is_correct,
                }
            )
        # Blank submissions are skipped without being recorded.
        question_idx += 1

    # If we've reached the end, show final results
    if question_idx >= len(quiz_data):
        correct_count = sum(1 for answer in user_answers if answer["is_correct"])
        # Every question may have been skipped; avoid dividing by zero.
        grade = correct_count / len(user_answers) if user_answers else 0.0
        passed = grade >= float(EXAM_PASSING_SCORE)

        return (
            "",  # question_text becomes blank
            gr.update(value="", visible=False),  # clear and hide code input
            "✅ Passed!" if passed else "❌ Did not pass",
            question_idx,
            user_answers,
            start_btn_update,
            gr.update(
                value=_format_results(user_answers, grade), visible=True
            ),  # show final_markdown
        )

    # Show the next question
    q = quiz_data[question_idx]
    challenge_text = f"## Question {question_idx + 1} \n### {q['challenge']}"
    return (
        challenge_text,
        gr.update(value=q["placeholder"], visible=True),
        "Submit your code solution and click 'Next' to continue.",
        question_idx,
        user_answers,
        start_btn_update,
        gr.update(visible=False),  # Hide final_markdown
    )


with gr.Blocks() as demo:
    demo.title = f"Coding Quiz: {EXAM_DATASET_ID}"

    # Per-session state: current question index and the answers collected so far.
    question_idx = gr.State(value=0)
    user_answers = gr.State(value=[])

    # Header and instructions.
    with gr.Row(variant="compact"):
        gr.Markdown(f"## Welcome to the {EXAM_DATASET_ID} Quiz")
    with gr.Row(variant="compact"):
        gr.Markdown(
            "Log in first, then click 'Start' to begin. Complete each coding challenge, click 'Next', "
            "and finally click 'Submit' to publish your results to the Hugging Face Hub."
        )

    # Question text next to the code editor (editor hidden until a question is shown).
    with gr.Row(variant="panel"):
        question_text = gr.Markdown("")
        code_input = gr.Code(language="python", label="Your Solution", visible=False)

    with gr.Row(variant="compact"):
        status_text = gr.Markdown("")

    # Final results (hidden until the quiz ends) plus the navigation buttons.
    with gr.Row(variant="compact"):
        final_markdown = gr.Markdown("", visible=False)

        next_btn = gr.Button("Next ⏭️")
        submit_btn = gr.Button("Submit ✅")

    with gr.Row(variant="compact"):
        login_btn = gr.LoginButton()
        start_btn = gr.Button("Start", visible=False)

    # Clicking the login button re-evaluates which of login/start is visible.
    login_btn.click(fn=on_user_logged_in, inputs=None, outputs=[login_btn, start_btn])

    # Start and Next both go through handle_quiz and update the same components;
    # only the trailing is_start flag differs.
    quiz_outputs = [
        question_text,
        code_input,
        status_text,
        question_idx,
        user_answers,
        start_btn,
        final_markdown,
    ]

    start_btn.click(
        fn=handle_quiz,
        inputs=[question_idx, user_answers, code_input, gr.State(True)],
        outputs=quiz_outputs,
    )

    next_btn.click(
        fn=handle_quiz,
        inputs=[question_idx, user_answers, code_input, gr.State(False)],
        outputs=quiz_outputs,
    )

    # Publish the collected answers; the outcome message goes to status_text.
    submit_btn.click(
        fn=push_results_to_hub,
        inputs=[user_answers, login_btn],
        outputs=status_text,
    )


if __name__ == "__main__":
    demo.launch()