Heron
Collection
Heron: Japanese Vision Language Models
•
11 items
•
Updated
Heron GIT Japanese StableLM Base 7B is a vision-language model that can converse about input images.
This model was trained using the heron library. Please refer to the code for details.
Follow the installation guide.
import torch
from heron.models.git_llm.git_japanese_stablelm_alpha import GitJapaneseStableLMAlphaForCausalLM
from transformers import AutoProcessor, LlamaTokenizer
device_id = 0
device = f"cuda:{device_id}"
MODEL_NAME = "turing-motors/heron-chat-git-ja-stablelm-base-7b-v1"
model = GitJapaneseStableLMAlphaForCausalLM.from_pretrained(
MODEL_NAME, torch_dtype=torch.float16, ignore_mismatched_sizes=True
)
model.eval()
model.to(device)
# prepare a processor
processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = LlamaTokenizer.from_pretrained(
"novelai/nerdstash-tokenizer-v1",
padding_side="right",
additional_special_tokens=["▁▁"],
)
processor.tokenizer = tokenizer
import requests
from PIL import Image
# prepare inputs
url = "https://www.barnorama.com/wp-content/uploads/2016/12/03-Confusing-Pictures.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = f"##human: この画像の面白い点は何ですか?\n##gpt: "
# do preprocessing
inputs = processor(
text=text,
images=image,
return_tensors="pt",
truncation=True,
)
inputs = {k: v.to(device) for k, v in inputs.items()}
# do inference
with torch.no_grad():
out = model.generate(**inputs, max_length=256, do_sample=False, temperature=0., no_repeat_ngram_size=2)
# print result
print(processor.tokenizer.batch_decode(out))
This model is intended for use in chat-like applications and for research purposes.
The model may produce inaccurate or false information, and its accuracy is not guaranteed. It is still in the research and development stage.
@misc{inoue2024heronbench,
title={Heron-Bench: A Benchmark for Evaluating Vision Language Models in Japanese},
author={Yuichi Inoue and Kento Sasaki and Yuma Ochi and Kazuki Fujii and Kotaro Tanahashi and Yu Yamaguchi},
year={2024},
eprint={2404.07824},
archivePrefix={arXiv},
primaryClass={cs.CV}
}