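"""Gradio coding quiz app.

Loads coding challenges from a Hugging Face dataset, grades each submission
with an LLM via the Inference API (falling back to a plain string comparison),
and pushes every user's results to a private dataset on the Hub.
"""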
import os
from datetime import datetime
import random
import gradio as gr
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import whoami, InferenceClient
# Initialize the inference client
client = InferenceClient(
api_key=os.getenv("HF_API_KEY"), # Make sure to set this environment variable
)
# Load questions from Hugging Face dataset
EXAM_MAX_QUESTIONS = int(os.getenv("EXAM_MAX_QUESTIONS", 5))  # The dataset has 5 questions total
EXAM_PASSING_SCORE = float(os.getenv("EXAM_PASSING_SCORE", 0.7))
EXAM_DATASET_ID = "burtenshaw/dummy-code-quiz"
# Prepare the dataset for the quiz: shuffle, then cap at EXAM_MAX_QUESTIONS
ds = load_dataset(EXAM_DATASET_ID, split="train")
quiz_data = ds.to_list()
random.shuffle(quiz_data)
quiz_data = quiz_data[:EXAM_MAX_QUESTIONS]
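# Each row is expected to provide "challenge", "solution", and "placeholder"
# fields, which the quiz logic below relies on.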
def check_code(user_code, solution, challenge):
"""
Use LLM to evaluate if the user's code solution is correct.
Returns True if the solution is correct, False otherwise.
"""
prompt = f"""You are an expert Python programming instructor evaluating a student's code solution.
Challenge:
{challenge}
Reference Solution:
{solution}
Student's Solution:
{user_code}
Evaluate if the student's solution is functionally equivalent to the reference solution.
Consider:
1. Does it solve the problem correctly?
2. Does it handle edge cases appropriately?
3. Does it follow the requirements of the challenge?
Respond with ONLY "CORRECT" or "INCORRECT" followed by a brief explanation.
"""
messages = [{"role": "user", "content": prompt}]
try:
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-Coder-32B-Instruct",
messages=messages,
max_tokens=500,
)
response = completion.choices[0].message.content.strip()
# Extract the verdict from the response
is_correct = response.upper().startswith("CORRECT")
# Add the explanation to the status text with emoji
explanation = response.split("\n", 1)[1] if "\n" in response else ""
status = "βœ… Correct!" if is_correct else "❌ Incorrect!"
gr.Info(f"{status}\n\n{explanation}")
return is_correct
except Exception as e:
gr.Warning(f"Error checking code: {str(e)}")
# Fall back to simple string comparison if LLM fails
is_correct = user_code.strip() == solution.strip()
status = "βœ… Correct!" if is_correct else "❌ Incorrect!"
gr.Info(f"{status} (Fallback comparison)")
return is_correct
def on_user_logged_in(token: gr.OAuthToken | None):
"""Handle user login state"""
if token is not None:
return gr.update(visible=False), gr.update(visible=True)
else:
return gr.update(visible=True), gr.update(visible=False)
def push_results_to_hub(
user_answers: list, token: gr.OAuthToken | None, signed_in_message: str
):
"""Push results to Hugging Face Hub."""
print(f"signed_in_message: {signed_in_message}")
if not user_answers: # Check if there are any answers to submit
gr.Warning("No answers to submit!")
return "No answers to submit!"
if token is None:
gr.Warning("Please log in to Hugging Face before pushing!")
return "Please log in to Hugging Face before pushing!"
# Calculate grade
correct_count = sum(1 for answer in user_answers if answer["is_correct"])
total_questions = len(user_answers)
grade = correct_count / total_questions if total_questions > 0 else 0
    if grade < EXAM_PASSING_SCORE:
        gr.Warning(
            f"Score {grade:.1%} below passing threshold of {EXAM_PASSING_SCORE:.1%}"
        )
        return f"You scored {grade:.1%}. Please try again to achieve at least {EXAM_PASSING_SCORE:.1%}"
gr.Info("Submitting answers to the Hub. Please wait...", duration=2)
user_info = whoami(token=token.token)
username = user_info["name"]
repo_id = f"{EXAM_DATASET_ID}_responses"
submission_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Create a dataset with the user's answers and metadata
submission_data = [
{
"username": username,
"datetime": submission_time,
"grade": grade,
**answer, # Include all answer data
}
for answer in user_answers
]
try:
# Try to load existing dataset
existing_ds = load_dataset(repo_id)
# Convert to DatasetDict if it isn't already
if not isinstance(existing_ds, dict):
existing_ds = DatasetDict({"default": existing_ds})
except Exception:
# If dataset doesn't exist, create empty DatasetDict
existing_ds = DatasetDict()
# Create new dataset from submission
new_ds = Dataset.from_list(submission_data)
# Add or update the split for this user
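    # (a re-submission replaces that user's previous split rather than appending)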
existing_ds[username] = new_ds
# Push the updated dataset to the Hub
existing_ds.push_to_hub(
repo_id,
private=True, # Make it private by default since it contains student submissions
)
return f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
def handle_quiz(question_idx, user_answers, submitted_code, is_start):
"""Handle quiz state and progression"""
# Hide the start button once the first question is shown
    start_btn_update = gr.update(visible=False) if is_start else gr.update()  # gr.update() is a no-op
# If this is the first time (start=True), begin at question_idx=0
if is_start:
question_idx = 0
else:
# If not the first question and there's a submission, store the user's last submission
        if (
            question_idx < len(quiz_data) and submitted_code and submitted_code.strip()
        ):  # Only grade when there is actually a submission
current_q = quiz_data[question_idx]
is_correct = check_code(
submitted_code, current_q["solution"], current_q["challenge"]
)
user_answers.append(
{
"challenge": current_q["challenge"],
"submitted_code": submitted_code,
"correct_solution": current_q["solution"],
"is_correct": is_correct,
}
)
question_idx += 1
# If we've reached the end, show final results
if question_idx >= len(quiz_data):
correct_count = sum(1 for answer in user_answers if answer["is_correct"])
        grade = correct_count / len(user_answers) if user_answers else 0
results_text = (
f"**Quiz Complete!**\n\n"
f"Your score: {grade:.1%}\n"
f"Passing score: {float(EXAM_PASSING_SCORE):.1%}\n\n"
f"Your answers:\n\n"
)
for idx, answer in enumerate(user_answers):
results_text += (
f"Question {idx + 1}: {'βœ…' if answer['is_correct'] else '❌'}\n"
)
results_text += (
f"Your code:\n```python\n{answer['submitted_code']}\n```\n\n"
)
return (
"", # question_text becomes blank
gr.update(value="", visible=False), # clear and hide code input
f"{'βœ… Passed!' if grade >= float(EXAM_PASSING_SCORE) else '❌ Did not pass'}",
question_idx,
user_answers,
start_btn_update,
gr.update(value=results_text, visible=True), # show final_markdown
)
else:
# Show the next question
q = quiz_data[question_idx]
challenge_text = f"## Question {question_idx + 1} \n### {q['challenge']}"
return (
challenge_text,
gr.update(value=q["placeholder"], visible=True),
"Submit your code solution and click 'Next' to continue.",
question_idx,
user_answers,
start_btn_update,
gr.update(visible=False), # Hide final_markdown
)
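# Note: handle_quiz returns a 7-tuple whose order must match the `outputs`
# lists wired to the Start and Next buttons below.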
with gr.Blocks(title=f"Coding Quiz: {EXAM_DATASET_ID}") as demo:
# State variables
question_idx = gr.State(value=0)
user_answers = gr.State(value=[])
with gr.Row(variant="compact"):
gr.Markdown(f"## Welcome to the {EXAM_DATASET_ID} Quiz")
with gr.Row(variant="compact"):
gr.Markdown(
"Log in first, then click 'Start' to begin. Complete each coding challenge, click 'Next', "
"and finally click 'Submit' to publish your results to the Hugging Face Hub."
)
with gr.Row(variant="panel"):
question_text = gr.Markdown("")
code_input = gr.Code(language="python", label="Your Solution", visible=False)
with gr.Row(variant="compact"):
status_text = gr.Markdown("")
with gr.Row(variant="compact"):
final_markdown = gr.Markdown("", visible=False)
next_btn = gr.Button("Next ⏭️")
submit_btn = gr.Button("Submit βœ…")
with gr.Row(variant="compact"):
login_btn = gr.LoginButton()
start_btn = gr.Button("Start", visible=False)
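    # Gradio injects gr.OAuthToken into on_user_logged_in from the session
    # based on its type hint, so no explicit inputs are needed here.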
login_btn.click(fn=on_user_logged_in, inputs=None, outputs=[login_btn, start_btn])
start_btn.click(
fn=handle_quiz,
inputs=[question_idx, user_answers, code_input, gr.State(True)],
outputs=[
question_text,
code_input,
status_text,
question_idx,
user_answers,
start_btn,
final_markdown,
],
)
next_btn.click(
fn=handle_quiz,
inputs=[question_idx, user_answers, code_input, gr.State(False)],
outputs=[
question_text,
code_input,
status_text,
question_idx,
user_answers,
start_btn,
final_markdown,
],
)
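    # The LoginButton's value (the signed-in message) feeds `signed_in_message`;
    # the OAuth token is again injected automatically via the type hint.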
submit_btn.click(
fn=push_results_to_hub,
inputs=[user_answers, login_btn],
outputs=status_text,
)
if __name__ == "__main__":
demo.launch()