# Sample documents to embed: each record pairs a vector id with its raw text.
data = [
    {"id": vec_id, "text": text}
    for vec_id, text in (
        ("vec1", "Apple is a popular fruit known for its sweetness and crisp texture."),
        ("vec2", "The tech company Apple is known for its innovative products like the iPhone."),
        ("vec3", "Many people enjoy eating apples as a healthy snack."),
        ("vec4", "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."),
        ("vec5", "An apple a day keeps the doctor away, as the saying goes."),
    )
]
import torch
from torch.nn.functional import normalize
from transformers import AutoModel, AutoTokenizer
# Run on the GPU when one is visible to torch; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Hugging Face checkpoint used for both the tokenizer and the encoder.
model_id = "intfloat/e5-large-v2"

# Load the tokenizer and the encoder, then move the encoder to the chosen device.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
model = model.to(device)
# Switch to evaluation mode: this script only runs inference.
model.eval()
def embed(docs: list[str]) -> list[list[float]]:
    """Return one mean-pooled embedding per input string.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Inputs are
    padded to a common length and truncated to at most 512 tokens. Padding
    positions are zeroed out and excluded from the average, so only real
    tokens contribute to each embedding.

    Args:
        docs: The strings to embed.

    Returns:
        A list with one embedding (a list of floats) per entry in ``docs``.
    """
    encoded = tokenizer(
        docs,
        padding=True,
        max_length=512,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    attention = encoded["attention_mask"]
    # Trailing axis lets the mask broadcast against the hidden states.
    is_padding = ~attention.unsqueeze(-1).bool()

    with torch.no_grad():
        # Token-level embeddings from the encoder.
        hidden_states = model(**encoded).last_hidden_state
        # Zero the padding positions so they add nothing to the sum below.
        hidden_states = hidden_states.masked_fill(is_padding, 0.0)
        # Mean pooling: summed token vectors divided by the real-token count.
        token_counts = attention.sum(dim=1).unsqueeze(-1)
        pooled = hidden_states.sum(dim=1) / token_counts

    return pooled.cpu().numpy().tolist()
# When encoding documents / passages that will be upserted, prefix each
# document's text with "passage: " before embedding it.
# Only the "text" field is embedded; interpolating the whole record would
# embed the dict's repr (ids, braces and quotes included).
embeddings = embed([f"passage: {doc['text']}" for doc in data])

# Pair every record with its embedding. `embed` returns one plain list of
# floats per input, so the embedding itself is the vector's "values".
vectors = [
    {
        "id": doc["id"],
        "values": values,
        "metadata": {"text": doc["text"]},
    }
    for doc, values in zip(data, embeddings)
]

# Write the vectors to the index under the "ns1" namespace.
index.upsert(
    vectors=vectors,
    namespace="ns1"
)