"""This space is taken and modified from https://huggingface.co/spaces/merve/compare_clip_siglip"""
import torch
from transformers import AutoModel, AutoProcessor
import gradio as gr

################################################################################
# Load the models
################################################################################
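# Both checkpoints are loaded on CPU and put in eval mode; inference below runs under
# torch.no_grad(). Change device_map (e.g. to "auto") if a GPU is available.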
sg1_ckpt = "google/siglip-so400m-patch14-384"
siglip1_model = AutoModel.from_pretrained(sg1_ckpt, device_map="cpu").eval()
siglip1_processor = AutoProcessor.from_pretrained(sg1_ckpt)

sg2_ckpt = "google/siglip2-so400m-patch14-384"
siglip2_model = AutoModel.from_pretrained(sg2_ckpt, device_map="cpu").eval()
siglip2_processor = AutoProcessor.from_pretrained(sg2_ckpt)


################################################################################
# Utilities
################################################################################
def postprocess_siglip(sg1_probs, sg2_probs, labels):
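    # Build {label: probability} dicts so each gr.Label component can rank the candidates.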
    sg1_output = {labels[i]: sg1_probs[0][i] for i in range(len(labels))}
    sg2_output = {labels[i]: sg2_probs[0][i] for i in range(len(labels))}
    return sg1_output, sg2_output


def siglip_detector(image, texts):
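    # SigLIP text inputs should be padded to a fixed length of 64 tokens with
    # padding="max_length", matching how the models were trained.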
    sg1_inputs = siglip1_processor(
        text=texts, images=image, return_tensors="pt", padding="max_length", max_length=64
    ).to("cpu")
    sg2_inputs = siglip2_processor(
        text=texts, images=image, return_tensors="pt", padding="max_length", max_length=64
    ).to("cpu")
    with torch.no_grad():
        sg1_outputs = siglip1_model(**sg1_inputs)
        sg2_outputs = siglip2_model(**sg2_inputs)
        sg1_logits_per_image = sg1_outputs.logits_per_image
        sg2_logits_per_image = sg2_outputs.logits_per_image
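        # SigLIP is trained with a sigmoid (not softmax) loss, so each label gets an
        # independent probability rather than a distribution over the candidate set.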
        sg1_probs = torch.sigmoid(sg1_logits_per_image)
        sg2_probs = torch.sigmoid(sg2_logits_per_image)
    return sg1_probs, sg2_probs


def infer(image, candidate_labels):
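    # The textbox holds a single comma-separated string; split it into individual labels.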
    candidate_labels = [label.strip() for label in candidate_labels.split(",")]
    sg1_probs, sg2_probs = siglip_detector(image, candidate_labels)
    return postprocess_siglip(sg1_probs, sg2_probs, labels=candidate_labels)


with gr.Blocks() as demo:
    gr.Markdown("# Compare SigLIP 1 and SigLIP 2")
    gr.Markdown(
        "Compare the performance of SigLIP 1 and SigLIP 2 on zero-shot classification in this Space :point_down:"
    )
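    # Two-column layout: inputs (image, label textbox, run button) on the left,
    # one ranked gr.Label per model on the right.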
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil")
            text_input = gr.Textbox(label="Input a list of labels (comma-separated)")
            run_button = gr.Button("Run", visible=True)
        with gr.Column():
            siglip1_output = gr.Label(label="SigLIP 1 Output", num_top_classes=3)
            siglip2_output = gr.Label(label="SigLIP 2 Output", num_top_classes=3)
    examples = [
        ["./baklava.jpg", "dessert on a plate, a serving of baklava, a plate and spoon"],
        ["./cat.jpg", "a cat, two cats, three cats"],
        ["./cat.jpg", "two sleeping cats, two cats playing, three cats laying down"],
    ]
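    # Example image/label pairs shown under the inputs; selecting one fills the input components.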
    gr.Examples(
        examples=examples,
        inputs=[image_input, text_input],
        outputs=[siglip1_output, siglip2_output],
        fn=infer,
    )
    run_button.click(fn=infer, inputs=[image_input, text_input], outputs=[siglip1_output, siglip2_output])
demo.launch()