Overview
The CLIP-ViT-B-32-laion2B-s34B-b79K model excels at understanding relationships between text and images. Its training set is the English subset of LAION-5B (LAION-2B-en). It's particularly well suited for tasks like:
- Zero-Shot Image Classification: Classify images against text descriptions without further training (see the sketch after this list).
- Image-Text Retrieval: Search for similar images or text descriptions within a dataset.
- Image Segmentation: Identify and segment objects within an image based on their semantic meaning.
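For example, zero-shot classification works by embedding an image and a set of candidate captions into the same space, then picking the caption whose embedding is most similar to the image. Below is a minimal sketch using the Hugging Face transformers CLIPModel; the labels and image path are placeholders to replace with your own.

from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
model = CLIPModel.from_pretrained(model_id)
processor = CLIPProcessor.from_pretrained(model_id)

# Hypothetical candidate labels and query image
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]
image = Image.open("example.jpg")

inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# logits_per_image holds image-text similarity scores; softmax turns them into label probabilities
probs = outputs.logits_per_image.softmax(dim=-1)
print(labels[probs.argmax().item()])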
Key benefits for developers:
- Open-source and accessible: Train and fine-tune the model for your specific needs.
- Great performance: Can achieve high accuracy on various text-image tasks.
- Versatile: Applicable to diverse applications, from image search to image generation.
Using the model
For best results, use a Jupyter notebook to work through the example below.
Installation:
!pip install pinecone datasets transformers torch pillow
Create Index
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="API_KEY")

index_name = "clip-vit-b-32-laion2b-s34b-b79k"

# The pooler output of the ViT-B/32 vision encoder has 768 dimensions,
# so the index dimension must be 768.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

index = pc.Index(index_name)
Embed & Upsert
from datasets import load_dataset

data = load_dataset(
    "jamescalam/image-text-demo",
    split="train"
)
from transformers import CLIPProcessor, CLIPVisionModel
import torch

model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPVisionModel.from_pretrained(model_id)

# Move the model to a GPU if one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
def create_image_embeddings(image):
    # Preprocess the image and pass it through the vision encoder
    with torch.no_grad():
        vals = processor(
            text=None,
            images=image,
            return_tensors='pt').to(device)
        image_embedding = model(**vals).pooler_output
    # Return the 768-dimensional vector, moved to the CPU so it can be stored in the dataset
    return image_embedding[0].cpu()
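As an optional sanity check, the vector length should match the 768-dimension index created above:

# Embed one image and confirm the vector length matches the index dimension
sample_embedding = create_image_embeddings(data[0]["image"])
print(sample_embedding.shape)  # expected: torch.Size([768])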
def apply_vectorization(data):
    data["image_embeddings"] = create_image_embeddings(data["image"])
    return data

# Add an "image_embeddings" column to every row of the dataset
data = data.map(apply_vectorization)
ids = [str(i) for i in range(0, data.num_rows)]
data = data.add_column("id", ids)

# Build the records to upsert: id, embedding values, and the caption as metadata
vectors = []
for i in range(0, data.num_rows):
    d = data[i]
    vectors.append({
        "id": d["id"],
        "values": d["image_embeddings"],
        "metadata": {"caption": d["text"]}
    })
index.upsert(
    vectors=vectors,
    namespace="ns1"
)
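You can optionally confirm that the records landed in the namespace with describe_index_stats:

# Should report one vector per image in the "ns1" namespace
print(index.describe_index_stats())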
Query
def id_to_image_helper(id, data):
    # Look up a record by id, print its caption, and return a resized copy of the image
    image = data[int(id)]
    print(image["text"])
    return image["image"].resize((500, 500))

# Use the second image in the dataset as the query
query = data[1]["image"]
x = create_image_embeddings(query).tolist()
results = index.query(
    namespace="ns1",
    vector=x,
    top_k=3,
    include_values=False,
    include_metadata=True
)
print(results)
id_to_image_helper(results["matches"][0]["id"], data)
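The same pattern works for images that are not in the dataset: embed the new image with create_image_embeddings and query the index with the resulting vector. A minimal sketch, where the file path is a placeholder:

from PIL import Image as PILImage

# Hypothetical query image; replace the path with your own file
new_image = PILImage.open("my_query.jpg")
new_vector = create_image_embeddings(new_image).tolist()

results = index.query(
    namespace="ns1",
    vector=new_vector,
    top_k=3,
    include_metadata=True
)
for match in results["matches"]:
    print(match["score"], match["metadata"]["caption"])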