Overview
CLIP (Contrastive Language-Image Pre-training) builds on a large body of work on zero-shot transfer, natural language supervision, and multimodal learning. It learns from unfiltered, highly varied, and highly noisy data, and is intended to be used in a zero-shot manner. CLIP struggles on more abstract or systematic tasks such as counting the number of objects in an image, and on more complex tasks such as predicting how close the nearest car is in a photo.
The model lets users design their own classifiers simply by naming the classes in natural language, removing the need for task-specific training data; a minimal sketch of this zero-shot classification is shown below.
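The following is only an illustrative sketch using the Hugging Face transformers implementation of CLIP; the labels and the COCO example image URL are arbitrary choices and not part of the Pinecone walkthrough that follows.
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel

model_id = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)

# Any set of natural-language labels defines the classifier
labels = ["a photo of a cat", "a photo of a dog", "a photo of a city"]
url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # example image (two cats)
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=1)  # one probability per label
print(dict(zip(labels, probs[0].tolist())))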
Using the model
For best results, use a Jupyter notebook to work through the example below.
Installation:
!pip install pinecone datasets transformers torch Pillow
Create Index
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="API_KEY")  # replace with your Pinecone API key

index_name = "clip-vit-base-patch32"

# Create a 512-dimensional serverless index (CLIP ViT-B/32 embeddings) if it doesn't exist yet
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=512,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

index = pc.Index(index_name)
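Optionally, the new index can be inspected before any data is written; at this point describe_index_stats should report dimension 512 and no vectors. This check is an addition for convenience, not part of the original steps.
# Optional sanity check: the index exists and is still empty
print(pc.describe_index(index_name))
print(index.describe_index_stats())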
Embed & Upsert
from datasets import load_dataset

# Small demo dataset of image-caption pairs
data = load_dataset(
    "jamescalam/image-text-demo",
    split="train"
)

from transformers import CLIPProcessor, CLIPModel
import torch

# Load CLIP ViT-B/32 and its processor, then move the model to GPU if available
model_id = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
def create_text_embeddings(text):
    # Tokenize the text and project it into CLIP's shared embedding space
    tokens = processor(text=text,
                       padding=True,
                       images=None,
                       return_tensors='pt').to(device)
    with torch.no_grad():
        text_emb = model.get_text_features(**tokens)
    return text_emb[0]

def create_image_embeddings(image):
    # Preprocess the image and project it into the same embedding space
    pixel_values = processor(
        text=None,
        images=image,
        return_tensors='pt')['pixel_values'].to(device)
    with torch.no_grad():
        image_emb = model.get_image_features(pixel_values)
    return image_emb[0]
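As an optional check (not part of the original walkthrough), a single record can be embedded to confirm the vector length matches the 512-dimensional index:
# Optional: embed one image and confirm it matches the index dimension
sample_emb = create_image_embeddings(data[0]["image"])
print(len(sample_emb))  # expected: 512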
def apply_vectorization(data):
    # Store the embedding as a plain list of floats so the dataset can serialize it
    data["image_embeddings"] = create_image_embeddings(data["image"]).cpu().tolist()
    return data

data = data.map(apply_vectorization)

# Give each record a string ID, as required by Pinecone
ids = [str(i) for i in range(data.num_rows)]
data = data.add_column("id", ids)
# Collect (id, values, metadata) records and upsert them into the index
vectors = []
for i in range(data.num_rows):
    d = data[i]
    vectors.append({
        "id": d["id"],
        "values": d["image_embeddings"],
        "metadata": {"caption": d["text"]}
    })

index.upsert(
    vectors=vectors,
    namespace="ns1"
)
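Optionally, the upsert can be verified with an index stats call; upserts are eventually consistent, so the count may take a few seconds to update (the sleep duration below is an arbitrary choice).
import time

time.sleep(5)  # counts may lag briefly after an upsert
print(index.describe_index_stats())  # namespace "ns1" should report data.num_rows vectors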
Query
query = "Show me a photo of a city"
x = create_text_embeddings(query).tolist()
results = index.query(
namespace="ns1",
vector=x,
top_k=3,
include_values=False,
include_metadata=True
)
print(results)
def id_to_image_helper(id, data):
    # Look up the record by ID, print its caption, and return a resized copy of the image
    image = data[int(id)]
    print(image["text"])
    return image["image"].resize((500, 500))

id_to_image_helper(results["matches"][0]["id"], data)
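To inspect all returned matches rather than only the best one, a short sketch:
# Print similarity score and caption for every match, best first
for match in results["matches"]:
    print(match["score"], match["metadata"]["caption"])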