Upload model

- README.md +26 -60
- config.json +7 -3
- tf_model.h5 +3 -0

README.md
CHANGED
@@ -1,81 +1,47 @@
 ---
-widget:
-- src: http://images.cocodataset.org/val2017/000000039769.jpg
-  candidate_labels: 고양이, 강아지, 토끼
-  example_title: cat and remote
-language: ko
 license: mit
+tags:
+- generated_from_keras_callback
+model-index:
+- name: clip-vit-base-patch32-ko
+  results: []
 ---
 
-Korean CLIP model trained by [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813)
-
-Training code: <https://github.com/Bing-su/KoCLIP_training_code>
-
-## How to Use
-
-```python
-import requests
-import torch
-from PIL import Image
-from transformers import AutoModel, AutoProcessor
-
-model = AutoModel.from_pretrained(repo)
-processor = AutoProcessor.from_pretrained(repo)
-
-image = Image.open(requests.get(url, stream=True).raw)
-inputs = processor(text=["고양이 두 마리", "개 두 마리"], images=image, return_tensors="pt", padding=True)
-with torch.inference_mode():
-    outputs = model(**inputs)
-logits_per_image = outputs.logits_per_image
-probs = logits_per_image.softmax(dim=1)
-```
-
-```python
->>> probs
-tensor([[0.9926, 0.0074]])
-```
-
-```python
-from transformers import pipeline
-
-pipe = pipeline("zero-shot-image-classification", model=repo)
-
->>> result
-[{'score': 0.9456236958503723, 'label': '분홍색 소파에 드러누운 고양이 친구들'},
- {'score': 0.05315302312374115, 'label': '고양이 두 마리'},
- {'score': 0.0012233294546604156, 'label': '고양이 한 마리'}]
-```
-
-## Tokenizer
-
-The tokenizer was trained from the original CLIP tokenizer with `.train_new_from_iterator`, using a 7:3 mixture of Korean and English data.
-
-```python
-# text_embeds.shape = [batch_size, sequence_length, transformer.width]
-# take features from the eot embedding (eot_token is the highest number in each sequence)
-# casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
-pooled_output = last_hidden_state[
-    torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
-]
-```
+<!-- This model card has been generated automatically according to the information Keras had access to. You should
+probably proofread and complete it, then remove this comment. -->
+
+# clip-vit-base-patch32-ko
+
+This model is a fine-tuned version of [Bingsu/clip-vit-base-patch32-ko](https://huggingface.co/Bingsu/clip-vit-base-patch32-ko) on an unknown dataset.
+It achieves the following results on the evaluation set:
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- optimizer: None
+- training_precision: float32
+
+### Training results
+
+### Framework versions
+
+- Transformers 4.23.1
+- TensorFlow 2.9.2
+- Tokenizers 0.13.1
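
The removed card's `AutoModel`/`AutoProcessor` example references `repo` and `url` variables whose assignments are not preserved in the diff above. A minimal runnable sketch, assuming the repository id `Bingsu/clip-vit-base-patch32-ko` and the COCO image listed in the removed widget metadata:

```python
import requests
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

repo = "Bingsu/clip-vit-base-patch32-ko"  # assumed: this repository's id
model = AutoModel.from_pretrained(repo)
processor = AutoProcessor.from_pretrained(repo)

# Assumed: the same COCO image referenced in the removed widget metadata.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Score the image against two Korean captions ("two cats" vs. "two dogs").
inputs = processor(text=["고양이 두 마리", "개 두 마리"], images=image, return_tensors="pt", padding=True)
with torch.inference_mode():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)  # the removed card reports tensor([[0.9926, 0.0074]])
```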
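
The removed zero-shot classification example likewise loses the actual `pipe(...)` call. A sketch of how the quoted `result` might be reproduced; the candidate labels are inferred from the printed output, and the exact original call (including any hypothesis template) is an assumption:

```python
from transformers import pipeline

pipe = pipeline("zero-shot-image-classification", model="Bingsu/clip-vit-base-patch32-ko")

# Assumed inputs: the same COCO image and the labels that appear in the printed result.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
result = pipe(
    url,
    candidate_labels=["분홍색 소파에 드러누운 고양이 친구들", "고양이 두 마리", "고양이 한 마리"],
)
# result: list of {"score": ..., "label": ...} dicts sorted by descending score
```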
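
The Tokenizer note in the removed card says the vocabulary was retrained from the original CLIP tokenizer via `.train_new_from_iterator` on a 7:3 Korean/English mix; the quoted pooling snippet shows how the text features are then pooled at the end-of-text token position. A minimal sketch of the retraining step, with a toy corpus standing in for the real training data:

```python
from transformers import AutoTokenizer

# Toy stand-in corpus; the real training reportedly used a 7:3 Korean/English text mix.
corpus = ["분홍색 소파에 고양이 두 마리가 앉아 있다.", "Two cats are sitting on a pink sofa."] * 500

def batch_iterator(batch_size=64):
    for i in range(0, len(corpus), batch_size):
        yield corpus[i : i + batch_size]

# Start from the original CLIP tokenizer and learn a new vocabulary of the
# same size (49408, matching config.json) with the same tokenization algorithm.
old_tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
new_tokenizer = old_tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=49408)
```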
config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_commit_hash":
+  "_commit_hash": "6f381bab5397bf31910ecd753491b53c84383811",
   "_name_or_path": "Bingsu/clip-vit-base-patch32-ko",
   "architectures": [
     "CLIPModel"
@@ -14,6 +14,7 @@
   "architectures": null,
   "attention_dropout": 0.0,
   "bad_words_ids": null,
+  "begin_suppress_tokens": null,
   "bos_token_id": 0,
   "chunk_size_feed_forward": 0,
   "cross_attention_hidden_size": null,
@@ -67,6 +68,7 @@
   "return_dict": true,
   "return_dict_in_generate": false,
   "sep_token_id": null,
+  "suppress_tokens": null,
   "task_specific_params": null,
   "temperature": 1.0,
   "tf_legacy_loss": false,
@@ -77,7 +79,7 @@
   "top_p": 1.0,
   "torch_dtype": null,
   "torchscript": false,
-  "transformers_version": "4.
+  "transformers_version": "4.23.1",
   "typical_p": 1.0,
   "use_bfloat16": false,
   "vocab_size": 49408
@@ -91,6 +93,7 @@
   "architectures": null,
   "attention_dropout": 0.0,
   "bad_words_ids": null,
+  "begin_suppress_tokens": null,
   "bos_token_id": null,
   "chunk_size_feed_forward": 0,
   "cross_attention_hidden_size": null,
@@ -146,6 +149,7 @@
   "return_dict": true,
   "return_dict_in_generate": false,
   "sep_token_id": null,
+  "suppress_tokens": null,
   "task_specific_params": null,
   "temperature": 1.0,
   "tf_legacy_loss": false,
@@ -156,7 +160,7 @@
   "top_p": 1.0,
   "torch_dtype": null,
   "torchscript": false,
-  "transformers_version": "4.
+  "transformers_version": "4.23.1",
   "typical_p": 1.0,
   "use_bfloat16": false
   },
tf_model.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea376ac0b923856e999412382f09b8aab4401a99d6ceabd2cba7ac2d1b75ddd1
+size 605559544
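
This commit adds TensorFlow weights for the model (`tf_model.h5`, roughly 605 MB per the LFS pointer above). A minimal loading sketch, assuming the standard `transformers` TensorFlow auto class:

```python
from transformers import TFAutoModel

# Resolves to the TensorFlow CLIP implementation for this config and loads the
# newly added tf_model.h5 instead of the PyTorch checkpoint.
model = TFAutoModel.from_pretrained("Bingsu/clip-vit-base-patch32-ko")
```

PyTorch users are unaffected; `AutoModel.from_pretrained` continues to load the existing PyTorch weights.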