Skip to main content

Overview

Ideal model for high performance while keeping with open source. Works well on messy data. Good for short queries expected to return medium-length passages of text (1-2 paragraphs).Must prefix passages/documents with “passage: ” and queries with “query: ”. See here for an example.

Using the model

Installation:

pip install -qU transformers==4.35.0 pinecone

Create Index

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="API_KEY")

# Create Index
index_name = "e5-large-v2"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

index = pc.Index(index_name)

Embed & Upsert

# Embed data
data = [
    {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
    {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
    {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
    {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
    {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
]

import torch
from torch.nn.functional import normalize
from transformers import AutoModel, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "intfloat/e5-large-v2"

# initialize tokenizer and model

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

def embed(docs: list[str]) -> list[list[float]]:
    # tokenize
    tokens = tokenizer(
            docs, padding=True, max_length=512, truncation=True, return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        # process with model for token-level embeddings
        out = model(**tokens)
        # mask padding tokens
        last_hidden = out.last_hidden_state.masked_fill(
                ~tokens["attention_mask"][..., None].bool(), 0.0
        )
        # create mean pooled embeddings
        doc_embeds = last_hidden.sum(dim=1) / \
            tokens["attention_mask"].sum(dim=1)[..., None]
    return doc_embeds.cpu().numpy().tolist()

# when encoding documents / passages


# Add "passage" to the documents being embedded for upserted documents
embeddings = embed([f"passage: {d}" for d in data])

vectors = []
for d, e in zip(data, embeddings):
    vectors.append({
        "id": d['id'],
        "values": e['values'],
        "metadata": {'text': d['text']}
    })

index.upsert(
    vectors=vectors,
    namespace="ns1"
)

Query

query = "Tell me about the tech company known as Apple"

# add "query" to queries being encoded
x = embed([f"query: {query_str}"])

results = index.query(
    namespace="ns1",
    vector=x[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)
Lorem Ipsum