jina-clip-v2 | Jina AI

METRIC: cosine
DIMENSION: 1024
TASK: embedding

Jina CLIP v2 is a state-of-the-art multilingual and multimodal (text-image) embedding model. It excels at both cross-modal (text-to-image, image-to-text) and unimodal (text-to-text) retrieval within a single vector space. It supports 100 languages, with a focus on 30 (including English, Spanish, Chinese, and Arabic), and offers flexible embedding generation through Matryoshka Representation Learning (MRL), which lets you shorten vectors via the dimensions parameter (illustrated after the get_embeddings helper below).

Installation

pip install pinecone requests

Create Index

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="API_KEY")

JINA_API_KEY = ""  # Replace with your Jina API key
dimension = 1024  # Specify the desired embedding dimension

index_name = "jina-clip-v2"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'  # Replace with your preferred region
        )
    )

index = pc.Index(index_name)
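
Before upserting, you can optionally confirm the index is ready to accept writes. This is a minimal check using the Pinecone SDK's describe_index call; the exact contents of the status field may vary by SDK version.

# Optional: confirm the index is ready before writing to it.
desc = pc.describe_index(index_name)
print(desc.status)  # e.g. {'ready': True, 'state': 'Ready'}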

Embed & Upsert

from typing import List
import requests

def get_embeddings(
    inputs: List[str],       # List of text strings or image URLs
    dimensions: int = 1024,  # Output embedding size (MRL truncation)
    task: str = None         # Set to 'retrieval.query' when embedding text queries
):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {JINA_API_KEY}'
    }
    data = {
        'input': inputs,
        'model': 'jina-clip-v2',
        'dimensions': dimensions,
    }
    if task:
        data['task'] = task  # Optimize text query embeddings for retrieval

    response = requests.post('https://api.jina.ai/v1/embeddings', headers=headers, json=data)
    response.raise_for_status()
    return response.json()
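
To see the MRL behavior mentioned above, you can call the helper with two different dimensions values; the shorter embedding is effectively a truncated (re-normalized) prefix of the longer one. The specific sizes here are just examples.

# Illustration: MRL lets you request shorter vectors from the same model.
for dims in (1024, 512):
    out = get_embeddings(["A red apple on a table."], dimensions=dims)
    print(dims, len(out["data"][0]["embedding"]))  # -> 1024, then 512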

# Example data with image and text
data = [
    {"id": "img1", "modality": "image", "content": "https://example.com/image1.jpg"},
    {"id": "txt1", "modality": "text", "content": "A red apple on a table."},
    {"id": "img2", "modality": "image", "content": "https://example.com/image2.png"},
    {"id": "txt2", "modality": "text", "content": "A basket of green apples."},
]

vectors = []
for item in data:
    embeddings = get_embeddings([item["content"]], dimensions=dimension)
    embedding = embeddings["data"][0]["embedding"]
    vectors.append({
        "id": item['id'],
        "values": embedding,
        "metadata": {'content': item['content'], 'modality': item['modality']}
    })

index.upsert(
    vectors=vectors,
    namespace="ns1"  # optionally specify a namespace
)
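
To verify the upsert, you can check the vector count for the namespace with describe_index_stats; on serverless indexes the count may take a few seconds to reflect new writes.

# Optional: confirm the vectors were written (stats can lag briefly after an upsert).
stats = index.describe_index_stats()
print(stats.namespaces.get("ns1"))  # e.g. {'vector_count': 4}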

Query

query = "Santa With Glasses"  # Text query

embeddings = get_embeddings([query], dimensions=dimension, task='retrieval.query')
query_embedding = embeddings["data"][0]["embedding"]

results = index.query(
    namespace="ns1",
    vector=query_embedding,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)
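
Because text and images share one vector space, you can also query with an image. This sketch reuses the get_embeddings helper with an image URL as input (the URL below is a hypothetical placeholder), matching how images were embedded in the upsert step; no task parameter is passed, since retrieval.query applies to text queries.

# Cross-modal query: use an image URL instead of text.
image_query = "https://example.com/query-image.jpg"  # hypothetical placeholder URL

embeddings = get_embeddings([image_query], dimensions=dimension)
image_query_embedding = embeddings["data"][0]["embedding"]

results = index.query(
    namespace="ns1",
    vector=image_query_embedding,
    top_k=3,
    include_metadata=True
)

print(results)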
