# Embed data
# We'll use an example dataset of images of animals and cities:
from datasets import load_dataset
data = load_dataset(
    "jamescalam/image-text-demo",
    split="train"
)
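# A quick sanity check on the dataset; each record should contain an "image" and a
# "text" caption (this inspection is optional, not required for the rest of the walkthrough):
print(data)
print(data[0]["text"])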
from transformers import CLIPProcessor, CLIPModel
import torch
model_id = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)
# move model to device if possible
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
# CLIP allows for both text and image embeddings
def create_text_embeddings(text):
    # tokenize the text and move the tensors to the model's device
    text_inputs = processor(
        text=text,
        padding=True,
        images=None,
        return_tensors='pt'
    ).to(device)
    # no gradients are needed for inference
    with torch.no_grad():
        text_emb = model.get_text_features(**text_inputs)
    return text_emb[0]
def create_image_embeddings(image):
    # preprocess the image into pixel values on the model's device
    pixel_values = processor(
        text=None,
        images=image,
        return_tensors='pt'
    )['pixel_values'].to(device)
    with torch.no_grad():
        image_embedding = model.get_image_features(pixel_values=pixel_values)
    return image_embedding[0]
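# A quick sketch of how the helpers are used; the 512-dimensional output is the
# projection size of clip-vit-base-patch32:
sample_emb = create_image_embeddings(data[0]["image"])
print(sample_emb.shape)  # torch.Size([512])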
# We will embed the images and search with text
from IPython.display import Image
def apply_vectorization(example):
    # store the embedding as a plain list of floats so it can be saved in the dataset
    # (and later sent to Pinecone, which expects lists rather than tensors)
    example["image_embeddings"] = create_image_embeddings(example["image"]).tolist()
    return example
data = data.map(apply_vectorization)
# add an id column for easy indexing later
ids = [str(i) for i in range(0, data.num_rows)]
data = data.add_column("id", ids)
vectors = []
for i in range(data.num_rows):
    d = data[i]
    vectors.append({
        "id": d["id"],
        "values": d["image_embeddings"],
        "metadata": {"caption": d["text"]}
    })
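# Upsert the vectors; `index` is assumed to be the Pinecone index object created
# earlier in the walkthrough.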
index.upsert(
    vectors=vectors,
    namespace="ns1"
)
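# With the images indexed, a text query can be embedded with the same CLIP model
# and used to search. A minimal sketch, assuming the same `index` and namespace as
# above; the query string is only an illustration, and depending on the Pinecone
# client version you may need attribute access (e.g. results.matches):
query_emb = create_text_embeddings("a city skyline at night")
results = index.query(
    vector=query_emb.tolist(),
    top_k=3,
    include_metadata=True,
    namespace="ns1"
)
for match in results["matches"]:
    print(match["score"], match["metadata"]["caption"])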