Jina Embeddings v3 is the latest iteration in Jina AI's text embedding model series, building upon Jina Embeddings v2. Key features include multilingual support for over 30 languages, a task-oriented design using Low-Rank Adaptation (LoRA) instruction adapters, and Matryoshka Representation Learning (MRL) for flexible embedding generation. The model is built on a custom backbone based on XLM-RoBERTa with extended training and Rotary Position Embeddings (RoPE), allowing it to support up to 8,192 input tokens.
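Because the model is trained with MRL, the same embedding can be requested at (or cut down to) a smaller size when storage or latency matters. The snippet below is a minimal, stdlib-only sketch of that idea, with a dummy vector standing in for a real embedding; the walkthrough that follows simply requests the full 1024 dimensions through the API's dimensions parameter.
import math

# Illustrative MRL truncation: keep only the leading dimensions of a dummy 1024-dim vector
full = [0.1] * 1024                          # stand-in for a real jina-embeddings-v3 vector
short = full[:256]                           # keep only the first 256 dimensions
norm = math.sqrt(sum(v * v for v in short))
short = [v / norm for v in short]            # re-normalize (optional when using cosine similarity)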
Installation
pip install pinecone requests
Create Index
from pinecone import Pinecone, ServerlessSpec

# Replace with your Pinecone and Jina AI API keys
pc = Pinecone(api_key="API_KEY")
JINA_API_KEY = ""

dimension = 1024
index_name = "jina-embeddings-v3"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

index = pc.Index(index_name)
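Serverless index creation is usually quick, but if you want to block until the index is ready before upserting, one option (a small sketch using the same pc client as above) is:
import time

# Poll the index description until Pinecone reports it as ready
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)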
Embed & Upsert
from typing import Literal, List
import requests

def get_embeddings(
    texts: List[str],
    dimensions: int,
    task: Literal['text-matching', 'separation', 'classification', 'retrieval.query', 'retrieval.passage']):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {JINA_API_KEY}'
    }
    data = {
        'input': texts,
        'model': 'jina-embeddings-v3',
        'dimensions': dimensions,
        'task': task
    }
    response = requests.post('https://api.jina.ai/v1/embeddings', headers=headers, json=data)
    return response.json()
# Data to index
data = [
    {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
    {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
    {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
    {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
    {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
]
embeddings = get_embeddings([d["text"] for d in data], dimensions=dimension, task='retrieval.passage')
embeddings = [e["embedding"] for e in embeddings["data"]]

vectors = []
for d, e in zip(data, embeddings):
    vectors.append({
        "id": d['id'],
        "values": e,
        "metadata": {'text': d['text']}
    })

index.upsert(
    vectors=vectors,
    namespace="ns1"
)
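Upserted vectors may take a moment to be reflected in the index statistics, but you can confirm the namespace now holds the five vectors:
# Optional check: per-namespace vector counts and the index dimension
print(index.describe_index_stats())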
Query
query = "Tell me about the tech company known as Apple"
# Remember to keep query and document embeddings at the same dimensionality
x = get_embeddings([query], dimensions=dimension, task='retrieval.query')["data"][0]["embedding"]

results = index.query(
    namespace="ns1",
    vector=x,
    top_k=3,
    include_values=False,
    include_metadata=True
)
print(results)
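If you only care about the matched text and scores rather than the full response object, a small loop over the matches works (field access follows the standard Pinecone query response):
# Print similarity score and original text for each match
for match in results.matches:
    print(f"{match.score:.3f}  {match.metadata['text']}")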