Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -2,6 +2,7 @@ import torch
 import numpy as np
 import gradio as gr
 import spaces
+import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModel
 import time
 import re
@@ -57,13 +58,56 @@ def format_chat_history(history):
 
     return messages
 
+def add_gumbel_noise(logits, temperature):
+    '''
+    The Gumbel max is a method for sampling categorical distributions.
+    According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality.
+    Thus, we use float64.
+    '''
+    if temperature <= 0:
+        return logits
+
+    logits = logits.to(torch.float64)
+    noise = torch.rand_like(logits, dtype=torch.float64)
+    gumbel_noise = (- torch.log(noise)) ** temperature
+    return logits.exp() / gumbel_noise
+
+def get_num_transfer_tokens(mask_index, steps):
+    '''
+    In the reverse process, the interval [0, 1] is uniformly discretized into steps intervals.
+    Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)),
+    the expected number of tokens transitioned at each step should be consistent.
+
+    This function is designed to precompute the number of tokens that need to be transitioned at each step.
+    '''
+    mask_num = mask_index.sum(dim=1, keepdim=True)
+
+    base = mask_num // steps
+    remainder = mask_num % steps
+
+    num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base
+
+    for i in range(mask_num.size(0)):
+        num_transfer_tokens[i, :remainder[i]] += 1
+
+    return num_transfer_tokens
+
 @spaces.GPU
-def generate_response_with_visualization(model, tokenizer, device, messages, gen_length=64, steps=32,
+def generate_response_with_visualization(model, tokenizer, device, messages, gen_length=64, steps=32,
+                                         constraints=None, temperature=0.0, cfg_scale=0.0, block_length=32,
+                                         remasking='low_confidence'):
     """
-    Generate text with LLaDA model with visualization
+    Generate text with LLaDA model with visualization using the same sampling as in generate.py
 
     Args:
        messages: List of message dictionaries with 'role' and 'content'
+       gen_length: Length of text to generate
+       steps: Number of denoising steps
+       constraints: Dictionary mapping positions to words
+       temperature: Sampling temperature
+       cfg_scale: Classifier-free guidance scale
+       block_length: Block length for semi-autoregressive generation
+       remasking: Remasking strategy ('low_confidence' or 'random')
 
     Returns:
        List of visualization states showing the progression and final text
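A quick standalone sketch (torch only, not part of the commit) of what the two new helpers compute: get_num_transfer_tokens spreads the masked positions evenly over the denoising steps, and add_gumbel_noise at temperature 1.0 turns argmax decoding into Gumbel-max sampling from softmax(logits), falling back to plain greedy argmax at temperature 0.

import torch

# Token-transfer schedule: 10 masked positions over 4 steps -> [3, 3, 2, 2]
mask_index = torch.tensor([[True] * 10 + [False] * 6])   # (batch=1, length=16)
steps = 4
mask_num = mask_index.sum(dim=1, keepdim=True)           # tensor([[10]])
base, remainder = mask_num // steps, mask_num % steps    # 2 and 2
schedule = torch.zeros(1, steps, dtype=torch.int64) + base
schedule[0, :int(remainder)] += 1
print(schedule.tolist())                                  # [[3, 3, 2, 2]]

# Gumbel-max draw: argmax(exp(logits) / (-log u)) == argmax(logits + Gumbel noise)
logits = torch.tensor([2.0, 1.0, 0.5], dtype=torch.float64)
u = torch.rand_like(logits)
sampled = torch.argmax(logits.exp() / (-torch.log(u)))    # stochastic draw from softmax(logits)
greedy = torch.argmax(logits)                             # what temperature <= 0 reduces to
print(int(sampled), int(greedy))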
@@ -92,10 +136,10 @@ def generate_response_with_visualization(model, tokenizer, device, messages, gen
     x = torch.full((1, prompt_length + gen_length), MASK_ID, dtype=torch.long).to(device)
     x[:, :prompt_length] = input_ids.clone()
 
-    # Initialize visualization states for
+    # Initialize visualization states for the response part
     visualization_states = []
 
-    # Add initial state (all masked)
+    # Add initial state (all masked)
     initial_state = [(MASK_TOKEN, "#444444") for _ in range(gen_length)]
     visualization_states.append(initial_state)
 
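For orientation (illustrative values, not taken from the commit): each entry appended to visualization_states is a list of (token, color) pairs, one per response position, and is what the output_vis HighlightedText component displays, one state per denoising step. Using the color scheme defined in the generation loop below, one intermediate state might look like:

MASK_TOKEN = "[MASK]"            # stand-in for the app's mask display token
state = [
    ("Once", "#66CC66"),         # newly revealed, high confidence (light green)
    ("upon", "#FFAA33"),         # newly revealed, medium confidence (orange)
    ("a", "#FF6666"),            # newly revealed, low confidence (light red)
    (MASK_TOKEN, "#444444"),     # still masked (dark gray)
    ("time", "#6699CC"),         # revealed in an earlier step (light blue)
]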
@@ -105,114 +149,144 @@ def generate_response_with_visualization(model, tokenizer, device, messages, gen
         if absolute_pos < x.shape[1]:
             x[:, absolute_pos] = token_id
 
-    probs = torch.softmax(logits, dim=-1)
-    top_probs = torch.max(probs, dim=-1)[0]
-
-    # Apply the predictions where we have masks
-    x_old = x.clone()
-    x = torch.where(mask_indices, x0, x)
-
-    current_t_value = float(t)
-    next_t_value = float(s)
-
-    # Create visualization state ONLY for the response part
-    current_state = []
-
-    # Update which tokens are newly revealed in this step
-    for i in range(gen_length):
-        pos = prompt_length + i  # Absolute position in the sequence
-        elif confidence < 0.7:
-            color = "#FFAA33"  # Orange
-        else:
-            color = "#66CC66"  # Light green
+    # Mark prompt positions to exclude them from masking during classifier-free guidance
+    prompt_index = (x != MASK_ID)
+
+    # Ensure block_length is valid
+    if block_length > gen_length:
+        block_length = gen_length
+
+    # Calculate number of blocks
+    num_blocks = gen_length // block_length
+    if gen_length % block_length != 0:
+        num_blocks += 1
+
+    # Adjust steps per block
+    steps_per_block = steps // num_blocks
+    if steps_per_block < 1:
+        steps_per_block = 1
+
+    # Track the current state of x for visualization
+    current_x = x.clone()
+
+    # Process each block
+    for num_block in range(num_blocks):
+        # Calculate the start and end indices for the current block
+        block_start = prompt_length + num_block * block_length
+        block_end = min(prompt_length + (num_block + 1) * block_length, x.shape[1])
+
+        # Get mask indices for the current block
+        block_mask_index = (x[:, block_start:block_end] == MASK_ID)
+
+        # Skip if no masks in this block
+        if not block_mask_index.any():
+            continue
+
+        # Calculate number of tokens to unmask at each step
+        num_transfer_tokens = get_num_transfer_tokens(block_mask_index, steps_per_block)
+
+        # Process each step
+        for i in range(steps_per_block):
+            # Get all mask positions in the current sequence
+            mask_index = (x == MASK_ID)
+
+            # Skip if no masks
+            if not mask_index.any():
+                break
+
+            # Apply classifier-free guidance if enabled
+            if cfg_scale > 0.0:
+                un_x = x.clone()
+                un_x[prompt_index] = MASK_ID
+                x_ = torch.cat([x, un_x], dim=0)
+                logits = model(x_).logits
+                logits, un_logits = torch.chunk(logits, 2, dim=0)
+                logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
+            else:
+                logits = model(x).logits
+
+            # Apply Gumbel noise for sampling
+            logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
+            x0 = torch.argmax(logits_with_noise, dim=-1)
+
+            # Calculate confidence scores for remasking
+            if remasking == 'low_confidence':
+                p = F.softmax(logits.to(torch.float64), dim=-1)
+                x0_p = torch.squeeze(
+                    torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1)  # b, l
+            elif remasking == 'random':
+                x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
+            else:
+                raise NotImplementedError(f"Remasking strategy '{remasking}' not implemented")
+
+            # Don't consider positions beyond the current block
+            x0_p[:, block_end:] = -float('inf')
+
+            # Apply predictions where we have masks
+            old_x = x.clone()
+            x0 = torch.where(mask_index, x0, x)
+            confidence = torch.where(mask_index, x0_p, -float('inf'))
+
+            # Select tokens to unmask based on confidence
+            transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
+            for j in range(confidence.shape[0]):
+                # Only consider positions within the current block for unmasking
+                block_confidence = confidence[j, block_start:block_end]
+                if i < steps_per_block - 1:  # Not the last step
+                    # Take top-k confidences
+                    _, select_indices = torch.topk(block_confidence,
+                                                   k=min(num_transfer_tokens[j, i].item(),
+                                                         block_confidence.numel()))
+                    # Adjust indices to global positions
+                    select_indices = select_indices + block_start
+                    transfer_index[j, select_indices] = True
+                else:  # Last step - unmask everything remaining
+                    transfer_index[j, block_start:block_end] = mask_index[j, block_start:block_end]
+
+            # Apply the selected tokens
+            x = torch.where(transfer_index, x0, x)
+
+            # Ensure constraints are maintained
+            for pos, token_id in processed_constraints.items():
+                absolute_pos = prompt_length + pos
+                if absolute_pos < x.shape[1]:
+                    x[:, absolute_pos] = token_id
+
+            # Create visualization state only for the response part
+            current_state = []
+            for i in range(gen_length):
+                pos = prompt_length + i  # Absolute position in the sequence
+
+                if x[0, pos] == MASK_ID:
+                    # Still masked
+                    current_state.append((MASK_TOKEN, "#444444"))  # Dark gray for masks
+
+                elif old_x[0, pos] == MASK_ID:
+                    # Newly revealed in this step
+                    token = tokenizer.decode([x[0, pos].item()], skip_special_tokens=True)
+                    # Color based on confidence
+                    confidence = float(x0_p[0, pos].cpu())
+                    if confidence < 0.3:
+                        color = "#FF6666"  # Light red
+                    elif confidence < 0.7:
+                        color = "#FFAA33"  # Orange
+                    else:
+                        color = "#66CC66"  # Light green
+
+                    current_state.append((token, color))
+
+                else:
+                    # Previously revealed
+                    token = tokenizer.decode([x[0, pos].item()], skip_special_tokens=True)
+                    current_state.append((token, "#6699CC"))  # Light blue
+
+            visualization_states.append(current_state)
 
     # Extract final text (just the assistant's response)
     response_tokens = x[0, prompt_length:]
-    response_text = tokenizer.decode(response_tokens, skip_special_tokens=True)
-
-    # Clean the response text
     final_text = tokenizer.decode(response_tokens,
+                                  skip_special_tokens=True,
+                                  clean_up_tokenization_spaces=True)
 
     return visualization_states, final_text
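With the defaults in this commit (gen_length=64, block_length=32, steps=32), the loop above runs num_blocks = 2 blocks with steps_per_block = 16 steps each. Within a step the core operations are classifier-free guidance mixing, an argmax prediction, and confidence-based unmasking. A toy, self-contained sketch of one such step (random tensors stand in for the model; names mirror the loop above, sizes are made up):

import torch
import torch.nn.functional as F

MASK_ID = 0
gen_len, vocab = 8, 16
x = torch.full((1, gen_len), MASK_ID)            # all positions start masked
logits = torch.randn(1, gen_len, vocab)          # stand-in for model(x).logits

# Classifier-free guidance: push the conditional logits away from an unconditional pass
cfg_scale = 1.0
un_logits = torch.randn(1, gen_len, vocab)       # stand-in for model(un_x).logits
guided = un_logits + (cfg_scale + 1) * (logits - un_logits)

# Predicted tokens and their probability under the guided distribution
x0 = torch.argmax(guided, dim=-1)
p = F.softmax(guided.to(torch.float64), dim=-1)
x0_p = torch.gather(p, -1, x0.unsqueeze(-1)).squeeze(-1)

# Low-confidence remasking: reveal only the k most confident predictions this step
mask_index = (x == MASK_ID)
confidence = x0_p.masked_fill(~mask_index, -float('inf'))
k = 3                                            # plays the role of num_transfer_tokens[j, i]
_, keep = torch.topk(confidence[0], k)
transfer = torch.zeros_like(x, dtype=torch.bool)
transfer[0, keep] = True
x = torch.where(transfer, x0, x)                 # k positions revealed, the rest stay MASK_ID
print(x)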
@@ -222,15 +296,13 @@ button{height: 60px}
 '''
 def create_chatbot_demo():
     with gr.Blocks(css=css) as demo:
-        gr.Markdown("# LLaDA - Large Language Diffusion Model
+        gr.Markdown("# LLaDA - Large Language Diffusion Model Demo")
         gr.Markdown("[model](https://huggingface.co/GSAI-ML/LLaDA-8B-Instruct), [project page](https://ml-gsai.github.io/LLaDA-demo/)")
 
-        # STATE MANAGEMENT
-        # We use a dedicated state to track the full conversation history
+        # STATE MANAGEMENT
         chat_history = gr.State([])
 
         # UI COMPONENTS
-        # Chatbot for displaying messages
         with gr.Row():
             with gr.Column(scale=3):
                 chatbot_ui = gr.Chatbot(label="Conversation", height=500)
@@ -257,7 +329,8 @@ def create_chatbot_demo():
                 combine_adjacent=False,
                 show_legend=True,
             )
+
+            # Advanced generation settings
             with gr.Accordion("Generation Settings", open=False):
                 with gr.Row():
                     gen_length = gr.Slider(
@@ -268,14 +341,32 @@
                         minimum=8, maximum=64, value=32, step=4,
                         label="Denoising Steps"
                     )
+                with gr.Row():
+                    temperature = gr.Slider(
+                        minimum=0.0, maximum=1.0, value=0.0, step=0.1,
+                        label="Temperature"
+                    )
+                    cfg_scale = gr.Slider(
+                        minimum=0.0, maximum=2.0, value=0.0, step=0.1,
+                        label="CFG Scale"
+                    )
+                with gr.Row():
+                    block_length = gr.Slider(
+                        minimum=8, maximum=128, value=32, step=8,
+                        label="Block Length"
+                    )
+                    remasking_strategy = gr.Radio(
+                        choices=["low_confidence", "random"],
+                        value="low_confidence",
+                        label="Remasking Strategy"
+                    )
+                with gr.Row():
+                    visualization_delay = gr.Slider(
+                        minimum=0.0, maximum=1.0, value=0.1, step=0.1,
+                        label="Visualization Delay (seconds)"
+                    )
 
-            # Current response text box
+            # Current response text box (hidden)
             current_response = gr.Textbox(
                 label="Current Response",
                 placeholder="The assistant's response will appear here...",
@@ -313,7 +404,7 @@
             # Return immediately to update UI with user message
             return history, history_for_display, message_out, [], ""
 
-        def bot_response(history, gen_length, steps, constraints, delay):
+        def bot_response(history, gen_length, steps, constraints, delay, temperature, cfg_scale, block_length, remasking):
            """Generate bot response for the latest message"""
            if not history:
                return history, [], ""
@@ -337,7 +428,11 @@
                messages,
                gen_length=gen_length,
                steps=steps,
-               constraints=parsed_constraints
+               constraints=parsed_constraints,
+               temperature=temperature,
+               cfg_scale=cfg_scale,
+               block_length=block_length,
+               remasking=remasking
            )
 
            # Update history with the assistant's response
@@ -393,13 +488,21 @@
        # This happens after the user message is displayed
        msg_submit.then(
            fn=bot_response,
-           inputs=[
+           inputs=[
+               chat_history, gen_length, steps, constraints_input,
+               visualization_delay, temperature, cfg_scale, block_length,
+               remasking_strategy
+           ],
            outputs=[chatbot_ui, output_vis, current_response]
        )
 
        send_click.then(
            fn=bot_response,
-           inputs=[
+           inputs=[
+               chat_history, gen_length, steps, constraints_input,
+               visualization_delay, temperature, cfg_scale, block_length,
+               remasking_strategy
+           ],
            outputs=[chatbot_ui, output_vis, current_response]
        )
 
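For reference, a hypothetical direct call showing how the new sliders map onto the sampler's keyword arguments (model, tokenizer and device are the objects the app loads at startup; the values are placeholders):

vis_states, reply = generate_response_with_visualization(
    model, tokenizer, device,
    messages=[{"role": "user", "content": "Hello!"}],
    gen_length=64, steps=32, constraints=None,
    temperature=0.2, cfg_scale=0.0,
    block_length=32, remasking='low_confidence',
)
# vis_states: one list of (token, color) pairs per denoising step, replayed in the UI
#             with the chosen visualization delay
# reply: the decoded assistant text that bot_response appends to chat_history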