CLIP | OpenAI

METRIC: cosine, dot product

DIMENSION: 512, 768, 2048

MAX INPUT TOKENS: 77

TASK: embedding

Overview

CLIP (Contrastive Language–Image Pre-training) builds on a large body of work on zero-shot transfer, natural language supervision, and multimodal learning. It learns from unfiltered, highly varied, and highly noisy data, and is intended to be used in a zero-shot manner. CLIP struggles on more abstract or systematic tasks such as counting the number of objects in an image and on more complex tasks such as predicting how close the nearest car is in a photo.

Because classes are specified as natural-language prompts, CLIP lets people design their own classifiers at inference time and removes the need for task-specific training data.
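For example, a zero-shot classifier can be built purely from text prompts. The sketch below uses the Hugging Face transformers CLIP API; the labels and the image URL are arbitrary placeholders for illustration, not part of the guide that follows.

from PIL import Image
import requests
import torch
from transformers import CLIPProcessor, CLIPModel

model_id = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)

# any image will do; this URL is just a placeholder example
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# the "classifier" is defined entirely by natural-language prompts
labels = ["a photo of a cat", "a photo of a dog", "a photo of a city"]

inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
  outputs = model(**inputs)

# logits_per_image holds image-text similarity scores; softmax turns them into label probabilities
probs = outputs.logits_per_image.softmax(dim=1)
print(dict(zip(labels, probs[0].tolist())))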

Using the model

For best results, work through this guide in a Jupyter Notebook.

Installation:

!pip install pinecone datasets transformers torch pillow

Create Index

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="API_KEY")

# Create Index
index_name = "clip-vit-base-patch32"

if not pc.has_index(index_name):
  pc.create_index(
      name=index_name,
      dimension=512,
      metric="cosine",
      spec=ServerlessSpec(
          cloud='aws',
          region='us-east-1'
      )
  )

index = pc.Index(index_name)
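Optionally, before upserting any vectors, you can confirm the index is reachable and reports the expected dimension. This check isn't required for the rest of the guide.

# optional: stats should show dimension 512 and an empty index at this point
print(index.describe_index_stats())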

Embed & Upsert


# Embed data
# We'll use an example dataset of images of animals and cities:

from datasets import load_dataset


data = load_dataset(
    "jamescalam/image-text-demo",
    split="train"
)

from transformers import CLIPProcessor, CLIPModel
import torch

model_id = "openai/clip-vit-base-patch32"

processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)

# move model to device if possible
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)


# CLIP produces both text and image embeddings

def create_text_embeddings(text):
  # tokenize the text and move it to the same device as the model
  inputs = processor(
      text=text,
      padding=True,
      images=None,
      return_tensors='pt').to(device)
  # compute the text embedding without tracking gradients
  with torch.no_grad():
    text_emb = model.get_text_features(**inputs)
  return text_emb[0].cpu()

def create_image_embeddings(image):
  # preprocess the image and move the pixel values to the device
  pixel_values = processor(
      text=None,
      images=image,
      return_tensors='pt')['pixel_values'].to(device)
  # compute the image embedding without tracking gradients
  with torch.no_grad():
    image_emb = model.get_image_features(pixel_values)
  return image_emb[0].cpu()
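As an optional sanity check (using the dataset loaded above), you can embed a single image and confirm the vector has 512 dimensions, matching the index created earlier.

# clip-vit-base-patch32 should yield a 512-dimensional vector
example_emb = create_image_embeddings(data[0]["image"])
print(example_emb.shape)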


# We will embed the images and search with text

from IPython.display import Image 

def apply_vectorization(batch):
  # attach a CLIP image embedding to each record
  batch["image_embeddings"] = create_image_embeddings(batch["image"])
  return batch

data = data.map(apply_vectorization)
# add an id column for easy indexing later
ids = [str(i) for i in range(0, data.num_rows)]
data = data.add_column("id", ids)


# build the list of vectors to upsert, storing each caption as metadata
vectors = []
for i in range(0, data.num_rows):
  d = data[i]
  vectors.append({
      "id": d["id"],
      "values": d["image_embeddings"],
      "metadata": {"caption": d["text"]}
  })

index.upsert(
    vectors=vectors,
    namespace="ns1"
)
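This example dataset is small enough to upsert in a single request. For larger collections, a batched upsert is the usual pattern; the sketch below uses an arbitrary batch size of 100.

# upsert in batches to keep each request within Pinecone's payload limits
batch_size = 100
for start in range(0, len(vectors), batch_size):
  index.upsert(
      vectors=vectors[start:start + batch_size],
      namespace="ns1"
  )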


Query

query = "Show me a photo of a city"

x = create_text_embeddings(query).tolist()

results = index.query(
    namespace="ns1",
    vector=x,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)


def id_to_image_helper(id, data):
  # given an id, print the caption and return a resized copy of the image
  # (resizing keeps display quick in a notebook)
  record = data[int(id)]
  print(record["text"])
  return record["image"].resize((500, 500))


# view the top result using the helper
id_to_image_helper(results["matches"][0]["id"], data)
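Because CLIP places text and images in the same vector space, you can also query with an image embedding instead of text. A minimal sketch, reusing create_image_embeddings with an arbitrary image from the dataset as the query:

# search the index with an image instead of a text prompt
query_image = data[0]["image"]
image_vector = create_image_embeddings(query_image).tolist()

image_results = index.query(
    namespace="ns1",
    vector=image_vector,
    top_k=3,
    include_metadata=True
)
print(image_results)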



