paligemma_2 / app.py
tjw's picture
.
36654ae
import os
import subprocess
import sys

# Install the bundled transformers dev wheel before `transformers` is imported
# further down. pip is run through the current interpreter with an argument
# list, so no shell is involved and the package lands in this environment.
_wheel_install = subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "./transformers-4.47.0.dev0-py3-none-any.whl",
    ],
    check=False,
)
if _wheel_install.returncode != 0:
    # Stay best-effort (an already-installed transformers may still work),
    # but make the failure visible instead of ignoring it silently.
    print(
        f"pip install of the transformers wheel exited with code {_wheel_install.returncode}",
        file=sys.stderr,
    )
import gradio as gr
import PIL.Image
import transformers
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
import torch
import string
import functools
import re
import numpy as np
import spaces
# Checkpoint selection. Alternatives that were previously tried here:
#   adapter_id = "merve/paligemma2-3b-vqav2"
#   adapter_id = "google/paligemma2-10b-pt-448"
#   model_id = "google/paligemma2-10b-pt-448"
# adapter_id is the checkpoint the model weights are loaded from;
# model_id is the checkpoint the processor is loaded from.
adapter_id = "google/paligemma2-3b-ft-docci-448"
model_id = "google/paligemma2-3b-ft-docci-448"

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16

# Load the weights onto the same device that infer() moves its inputs to,
# instead of hard-coding 'cuda' (which contradicted the CPU fallback above).
model = PaliGemmaForConditionalGeneration.from_pretrained(
    adapter_id,
    device_map=device,
    torch_dtype=dtype,
).eval()
processor = PaliGemmaProcessor.from_pretrained(model_id)
###### Transformers Inference
@spaces.GPU
def infer(
    text,
    image: PIL.Image.Image,
    max_new_tokens: int
) -> str:
    """Generate the model's answer for a question about an image.

    Args:
        text: Question entered by the user. It is prefixed with
            ``"answer en "`` to form the prompt.
        image: Input picture as a PIL image.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation produced by the model, without the prompt
        and without leading newlines.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        # Surface a readable message in the UI instead of a processor traceback.
        raise gr.Error("Please provide an input image.")

    prompt = "answer en " + (text or "")
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(
        device=device, dtype=dtype
    )
    # Number of prompt positions (image + text tokens) fed to the model.
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),  # UI sliders may deliver floats
            do_sample=False,
        )

    # Drop the prompt at token level; slicing the decoded string by
    # len(prompt) breaks whenever decoding does not reproduce the prompt exactly.
    new_token_ids = generated_ids[0][prompt_len:]
    answer = processor.decode(new_token_ids, skip_special_tokens=True)
    return answer.lstrip("\n")
######## Demo
# Markdown shown at the top of the demo page (passed to gr.Markdown below).
# The string is not raw, so each "\n\n" escape becomes two real newlines.
# NOTE(review): the text describes a LoRA fine-tuned on VQAv2, while the
# checkpoint configured above is "google/paligemma2-3b-ft-docci-448" - confirm
# which description is intended.
INTRO_TEXT = """## PaliGemma 2 demo\n\n
| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
| [Blogpost](https://huggingface.co/blog/paligemma)
| [Fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
|\n\n
PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
answering, text reading, object detection and object segmentation.
\n\n
This space includes a model LoRA fine-tuned by the team at Hugging Face on VQAv2, inferred using transformers.
See the [Blogpost](https://huggingface.co/blog/paligemma2), the project
[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) and the
[fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
for detailed information about how to use and fine-tune PaliGemma and PaliGemma 2 models.
\n\n
**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
"""
# Demo page: intro text, input widgets, submit button, answer box and examples.
# NOTE(review): the nesting below was reconstructed from un-indented source;
# all widgets are assumed to live inside the single Column - confirm.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(INTRO_TEXT)

    with gr.Column():
        # Input widgets, created in the order they appear on the page.
        image = gr.Image(label="Input Image", type="pil", height=400)
        question = gr.Text(label="Question")
        tokens = gr.Slider(
            minimum=20,
            maximum=1600,
            value=256,
            step=10,
            label="Max New Tokens",
            info="Set to larger for longer generation.",
        )
        caption_btn = gr.Button(value="Submit")
        text_output = gr.Text(label="Text Output")

        # Order matches the positional parameters of infer(text, image, max_new_tokens).
        caption_inputs = [question, image, tokens]
        caption_outputs = [text_output]
        caption_btn.click(fn=infer, inputs=caption_inputs, outputs=caption_outputs)

        # Each example row is [question, image path, max new tokens].
        examples = [
            ["What is the graphic about?", "./howto.jpg", 60],
            ["What is the password", "./password.jpg", 20],
            ["Who is in this image?", "./examples_bowie.jpg", 80],
        ]
        gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
        gr.Examples(examples=examples, inputs=caption_inputs)
#########
if __name__ == "__main__":
    # Enable the request queue (max_size=10), then start the app in debug mode.
    queued_demo = demo.queue(max_size=10)
    queued_demo.launch(debug=True)