Lorem Ipsum
CLIP
CLIP (Contrastive Language–Image Pre-training) builds on a large body of work on zero-shot transfer, natural language supervision, and multimodal learning.
Was this page helpful?
⌘I
Documentation Index
Fetch the complete documentation index at: /llms.txt
Use this file to discover all available pages before exploring further.
🎉 New: Standard and Enterprise orgs get a one-time $250 bulk import credit (1 TB), through July 31, 2026. See details
CLIP (Contrastive Language–Image Pre-training) builds on a large body of work on zero-shot transfer, natural language supervision, and multimodal learning.
!pip install pinecone datasets transformers
from pinecone import Pinecone, ServerlessSpec
# Connect to Pinecone. "API_KEY" is a placeholder to be replaced with a real key.
pc = Pinecone(api_key="API_KEY")

# Create Index
# The index is created only when no index with this name exists yet, so the
# cell can be re-run safely.
index_name = "clip-vit-base-patch32"
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # NOTE(review): 512 is expected to equal the length of the CLIP
        # embeddings upserted later -- confirm if the model is changed.
        dimension=512,
        metric="cosine",
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

# Handle to the index, used by the upsert and query calls.
index = pc.Index(index_name)
# Embed data
# We'll use an example dataset of images of animals and cities:
from datasets import load_dataset
# Load the "train" split of the demo image/caption dataset.
data = load_dataset("jamescalam/image-text-demo", split="train")
from transformers import CLIPProcessor, CLIPModel
import torch
# The processor and the model are both loaded from the same checkpoint id.
model_id = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)

# move model to device if possible: use CUDA when available, else the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)
# CLIP allows for both text and image embeddings
def create_text_embeddings(text):
    """Embed text with the CLIP text encoder.

    Args:
        text: Value forwarded to the processor as its ``text`` argument.

    Returns:
        Row 0 of the tensor returned by ``model.get_text_features``. Only the
        first row is returned, even if the processor produced several.
    """
    text_inputs = processor(
        text=text,
        padding=True,
        images=None,
        return_tensors='pt',
    ).to(device)
    # Inference only: disable autograd so no computation graph is built and
    # kept alive by the returned tensor.
    with torch.no_grad():
        text_emb = model.get_text_features(**text_inputs)
    return text_emb[0]
def create_image_embeddings(image):
    """Embed an image with the CLIP image encoder.

    Args:
        image: Value forwarded to the processor as its ``images`` argument.

    Returns:
        Row 0 of the tensor returned by ``model.get_image_features``. Only
        the first row is returned, even if the processor produced several.
    """
    processed = processor(
        text=None,
        images=image,
        return_tensors='pt',
    )
    # Only the pixel tensor is needed by the image encoder.
    pixel_values = processed['pixel_values'].to(device)
    # Inference only: disable autograd so no computation graph is built for
    # every image that gets embedded.
    with torch.no_grad():
        image_embedding = model.get_image_features(pixel_values=pixel_values)
    return image_embedding[0]
# We will embed the images and search with text
from IPython.display import Image
def apply_vectorization(data):
    """Attach an image embedding to one record.

    Reads ``data["image"]``, stores the result of ``create_image_embeddings``
    under ``data["image_embeddings"]`` on the same object, and returns it.
    """
    embedding = create_image_embeddings(data["image"])
    data["image_embeddings"] = embedding
    return data
# Run the embedding function over the dataset.
data = data.map(apply_vectorization)

# add an id column for easy indexing later
# ids are the row positions rendered as strings: "0", "1", ...
ids = list(map(str, range(data.num_rows)))
data = data.add_column("id", ids)

# One record per row: its id, its embedding, and its caption as metadata.
vectors = [
    {
        "id": row["id"],
        "values": row["image_embeddings"],
        "metadata": {"caption": row["text"]},
    }
    for row in data
]

# Send all records to the "ns1" namespace in a single request.
index.upsert(vectors=vectors, namespace="ns1")
# Embed the text query and convert the embedding to a plain list for the
# request.
query = "Show me a photo of a city"
x = create_text_embeddings(query).tolist()

# Ask for the 3 closest records in the namespace the images were upserted to;
# return metadata (captions) but not the stored vectors.
results = index.query(
    vector=x,
    top_k=3,
    namespace="ns1",
    include_metadata=True,
    include_values=False,
)
print(results)
def id_to_image_helper(id, data):
    """Print the caption of a record and return its image resized.

    Args:
        id: Record id; converted with ``int()`` and used as an index
            into ``data``.
        data: Indexable collection of records with ``"text"`` and
            ``"image"`` entries.

    Returns:
        The result of ``record["image"].resize((500, 500))``.
    """
    record = data[int(id)]
    caption = record["text"]
    print(caption)
    # resizes in order to speed up showing the image
    target_size = (500, 500)
    return record["image"].resize(target_size)
# view a specific result using the helper: pass the id of the first entry
# (index 0) in results["matches"] together with the dataset
id_to_image_helper(results["matches"][0]["id"], data)
Was this page helpful?