> ## Documentation Index
> Fetch the complete documentation index at: https://docs.pinecone.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Semantic search

> Sample Next.js app that performs semantic search over PDF documents using Pinecone and Voyage AI's voyage-law-2 embeddings.

export const ArrowNE = () => {
  return <svg xmlns="http://www.w3.org/2000/svg" width="21" height="21" viewBox="0 0 21 21" fill="none">
            <path d="M4.92299 17.1668L3.8335 16.0773L14.5209 5.38992H4.71547V3.8335H17.1668V16.2849H15.6104V6.47941L4.92299 17.1668Z" fill="var(--text-primary)" />
        </svg>;
};

export const GithubIcon = () => {
  return <svg xmlns="http://www.w3.org/2000/svg" width="21" height="21" viewBox="0 0 21 21" fill="none">
            <path fill-rule="evenodd" clip-rule="evenodd" d="M9.70928 4.05007C8.31793 4.21017 6.96422 4.84637 5.94277 5.82016C4.24747 7.43636 3.52414 9.77037 4.00655 12.0678C4.46605 14.2561 6.02227 16.0844 8.11272 16.892C8.44405 17.0199 8.60214 17.0277 8.72515 16.9219L8.81336 16.8461L8.82292 16.1332L8.83247 15.4202L8.73955 15.4406C8.50614 15.4919 8.10282 15.5127 7.86284 15.4858C7.40903 15.435 7.01153 15.2388 6.79946 14.9611C6.73983 14.883 6.61481 14.6617 6.52166 14.4695C6.33116 14.0762 6.15107 13.8536 5.86758 13.6611C5.5353 13.4355 5.54995 13.2709 5.90191 13.2757C6.03736 13.2776 6.13771 13.3031 6.28618 13.3737C6.55388 13.5008 6.72474 13.6554 6.91734 13.9445C7.19338 14.3588 7.39605 14.5261 7.74282 14.6258C7.98116 14.6943 8.38716 14.6732 8.66027 14.5782L8.85297 14.5111L8.88852 14.332C8.93357 14.1048 9.04413 13.8664 9.17182 13.721C9.2859 13.591 9.28559 13.5908 8.97604 13.5556C8.70198 13.5243 8.21426 13.399 7.95469 13.2931C7.5852 13.1423 7.38567 13.008 7.08345 12.7068C6.57847 12.2034 6.3554 11.644 6.28281 10.6989C6.22063 9.88945 6.37347 9.29167 6.78619 8.73013L6.93636 8.5258L6.87447 8.29003C6.79346 7.98149 6.80996 7.41637 6.91027 7.06404C6.99339 6.77209 7.03036 6.74053 7.2572 6.76816C7.62556 6.81299 8.05296 6.98479 8.54616 7.28629L8.81336 7.44963L9.14344 7.38346C9.73562 7.26472 10.0045 7.24354 10.6681 7.26331C11.2515 7.28069 11.4102 7.29927 11.9297 7.4109L12.1225 7.45233L12.3541 7.30892C12.8263 7.01647 13.2498 6.83572 13.5901 6.78137C13.9054 6.73098 13.9339 6.74657 14.0112 7.01148C14.1466 7.47534 14.1575 8.05323 14.0375 8.40003C13.9956 8.52102 13.9958 8.52159 14.1525 8.72787C14.3401 8.97477 14.5238 9.35728 14.5959 9.65114C14.7536 10.2933 14.6496 11.354 14.3681 11.9748C14.0175 12.7481 13.4125 13.2118 12.4285 13.4617C12.2729 13.5012 12.0607 13.5432 11.9569 13.555C11.6405 13.5909 11.6372 13.5933 11.7462 13.708C11.848 13.8152 11.973 14.0466 12.0373 14.2471C12.0657 14.336 12.0829 14.7354 12.0959 15.6095L12.1141 16.8461L12.2023 16.9219C12.3253 17.0277 12.4834 17.0199 12.8148 
16.892C15.1496 15.99 16.7734 13.8552 17.0364 11.3417C17.0826 10.9001 17.0576 9.99571 16.9878 9.5789C16.6755 7.71598 15.6173 6.10421 14.016 5.05281C13.2185 4.52921 12.1702 4.15856 11.171 4.0469C10.8383 4.00971 10.0449 4.01144 9.70928 4.05007Z" fill="var(--text-primary)" />
        </svg>;
};

export const InlineCode = ({copyCode, displayCode, copy}) => {
  const copyToClipboard = async e => {
    await navigator.clipboard.writeText(copyCode);
    const button = e.target.closest("button");
    button.innerHTML = `<svg  width="16" height="11" viewBox="0 0 16 11" fill="none" xmlns="http://www.w3.org/2000/svg" style="transform: translateY(-3px)"><path d="M14.7813 1.21873C15.0751 1.51248 15.0751 1.98748 14.7813 2.2781L6.53135 10.5312C6.2376 10.825 5.7626 10.825 5.47197 10.5312L1.21885 6.28123C0.925098 5.98748 0.925098 5.51248 1.21885 5.22185C1.5126 4.93123 1.9876 4.9281 2.27822 5.22185L5.99697 8.9406L13.7188 1.21873C14.0126 0.924976 14.4876 0.924976 14.7782 1.21873H14.7813Z" fill="var(--brand-blue)"></path></svg>`;
    setTimeout(() => {
      button.innerHTML = `<svg xmlns="http://www.w3.org/2000/svg" width="21" height="20" viewBox="0 0 21 20" fill="none">
      <path d="M14.2502 0.833496H4.25016C3.3335 0.833496 2.5835 1.5835 2.5835 2.50016V14.1668H4.25016V2.50016H14.2502V0.833496ZM16.7502 4.16683H7.5835C6.66683 4.16683 5.91683 4.91683 5.91683 5.8335V17.5002C5.91683 18.4168 6.66683 19.1668 7.5835 19.1668H16.7502C17.6668 19.1668 18.4168 18.4168 18.4168 17.5002V5.8335C18.4168 4.91683 17.6668 4.16683 16.7502 4.16683ZM16.7502 17.5002H7.5835V5.8335H16.7502V17.5002Z" fill="var(--brand-blue)" fill-opacity="0.38"/>
      </svg>`;
    }, 2000);
  };
  return <div className="relative">
      <code className="inline-flex gap-2 items-center py-2 pl-3 pr-10 custom-code">
        {displayCode}
      </code>

      {copy && <div className="absolute group right-3 top-1/2 -translate-y-1/2" style={{
    width: "1.375rem",
    height: "1.375rem"
  }}>
          <button onClick={e => copyToClipboard(e)}>
            <svg xmlns="http://www.w3.org/2000/svg" width="21" height="20" viewBox="0 0 21 20" fill="none">
              <path d="M14.2502 0.833496H4.25016C3.3335 0.833496 2.5835 1.5835 2.5835 2.50016V14.1668H4.25016V2.50016H14.2502V0.833496ZM16.7502 4.16683H7.5835C6.66683 4.16683 5.91683 4.91683 5.91683 5.8335V17.5002C5.91683 18.4168 6.66683 19.1668 7.5835 19.1668H16.7502C17.6668 19.1668 18.4168 18.4168 18.4168 17.5002V5.8335C18.4168 4.91683 17.6668 4.16683 16.7502 4.16683ZM16.7502 17.5002H7.5835V5.8335H16.7502V17.5002Z" fill="var(--brand-blue)" className="opacity-40 group-hover:opacity-100 transition-opacity" />
            </svg>
          </button>
        </div>}
    </div>;
};

<div className="sample-app">
  <div className="sample-app-heading">
    <span className="eyebrow">SAMPLE APP</span>

    # Semantic search

    A sample app that performs semantic search over PDF documents

    <InlineCode copyCode="npx create-pinecone-app@latest --template legal-semantic-search" displayCode={<span><span style={{color: "#215CCE"}}>$</span> npx create-pinecone-app@latest --template legal-semantic-search</span>} copy />
  </div>

  <div className="w-full h-fit rounded-lg " style={{background: "#121142", margin: "4rem 0 3rem"}}>
    <div className="container py-50">
      <iframe id="sample-app-iframe" className=" border-[0.5px] border-gray-300 rounded-lg shadow-xl" src="https://legal-semantic-search.vercel.app/" width="100%" height="700px" allow="clipboard-write" allowTransparency="true" />
    </div>
  </div>

  <div className="sample-app-split">
    <div className="content no-margin">
      The Legal Semantic Search app demonstrates how to programmatically bootstrap a custom knowledge base based on a Pinecone vector database with arbitrary PDF files included in the codebase.
      This app is focused on semantic search over legal documents, but this exact same technique and code can be applied to any content stored locally.
    </div>

    <div className="sidebar">
      <InlineCode copyCode="npx create-pinecone-app@latest --template legal-semantic-search" displayCode={<span><span style={{color: "#215CCE"}}>$</span> npx create-pinecone-app@latest --template legal-semantic-search</span>} copy />

      <a href="https://github.com/pinecone-io/sample-apps/tree/main/legal-semantic-search" target="_blank" className="flex items-center gap-2 no-underline mt-6"><GithubIcon /> GitHub</a>
      <a href="https://legal-semantic-search.vercel.app/" target="_blank" className="flex items-center gap-2 no-underline mt-4"><ArrowNE /> Open in a new window</a>
    </div>
  </div>

  ***

  <div className="sample-app-split">
    <div className="content relative mt-8 prose prose-gray dark:prose-invert">
      ## Built with

      * Pinecone Serverless
      * Voyage Embeddings
      * LangChain
      * Next.js + Tailwind CSS
      * Node version 20 or higher

      ***

      ## Run the sample app

      The fastest way to get started is to use the `create-pinecone-app` CLI tool to get up and running:

      ```bash theme={null}
      npx -y create-pinecone-app@latest --template legal-semantic-search 
      ```

      ### Get your API key

      You need an API key to make API calls to your Pinecone project:

      <div style={{minWidth: '450px', minHeight:'152px'}}>
        <div id="pinecone-connect-widget">
          <div className="connect-widget-skeleton">
            <div className="skeleton-content" />
          </div>
        </div>
      </div>

      Then copy your generated key:

      ```
      PINECONE_API_KEY="{{YOUR_API_KEY}}"

      # This API key has ReadWrite access to all indexes in your project.
      ```

      Alternatively, follow [these steps](/guides/projects/manage-api-keys#create-an-api-key):

      1. Open the [Pinecone console](https://app.pinecone.io/).
      2. Select your project.
      3. Go to **API Keys**.
      4. Create an API key.
      5. Copy your API key.

      ### Get your Voyage AI API key

      1. Create a new [Voyage AI](https://www.voyageai.com/) account [here](https://dash.voyageai.com/).
      2. Create a [new API key](https://dashboard.voyageai.com/organization/api-keys).
      3. Add your billing information to your Voyage AI account [here](https://dashboard.voyageai.com/organization/billing). This is required even to use the free tier.
      4. Copy your API key.

      ### Create a Pinecone serverless index

      Create a Pinecone index for this project.
      The index should have the following properties:

      * **dimension**: `1024`
        The Voyage `voyage-law-2` embeddings model has 1024 dimensions.
      * **metric**: `cosine`
      * **region**: `us-east-1`

      You can create the index [in the console](https://app.pinecone.io/organizations/-/projects/-/create-index/serverless),
      or by following the instructions [here](https://docs.pinecone.io/guides/get-started/quickstart#4-create-a-serverless-index).

      ### Start the project

      **Requires Node version 20+**

      #### Dependency installation

      From the project root directory, run the following command.

      ```bash theme={null}
      cd legal-semantic-search && npm install 
      ```

      Make sure you have populated the client `.env` file with the relevant keys.

      ```bash theme={null}
      # You must first activate a Billing Account here: https://www.voyageai.com/ 
      # Then get your Voyage API Key here: https://dash.voyageai.com/
      VOYAGE_API_KEY="your-api-key-here"

      # Get your Pinecone API key here: https://app.pinecone.io/
      PINECONE_API_KEY="your-api-key-here"
      PINECONE_INDEX="legal-semantic-search"
      ```

      Start the app.

      ```bash theme={null}
      npm run dev
      ```

      ## Project structure

      In this example we opted to use a standard Next.js application structure.

      **Frontend Client**

      The frontend uses Next.js, Tailwind CSS, and custom React components to power the search experience. It also leverages API routes to make calls to the server to initiate bootstrapping of the Pinecone vector database as a knowledge store, and to fetch relevant document chunks for the UI.

      **Backend Server**

      This project uses Next.js API routes to handle tasks such as file chunking, upsertion, and context provision. Learn more about the implementation details below.

      ***

      ### Simple semantic search

      This project uses a basic semantic search architecture that achieves low-latency natural-language search across all embedded documents. When the app is loaded, it performs background checks to determine if the Pinecone vector database needs to be created and populated.

      **Componentized suggested search interface**

      To make it easier for you to clone this app as a starting point and quickly adapt it to your own purposes, we've
      built the search interface as a component that accepts a list of suggested searches and renders them as a dropdown, helping the
      user find things.

      You can define your suggested searches in your parent component:

      ```typescript theme={null}
      // For the purposes of our legal semantic search example, we pre-define some queries 
      // that we know will pull back interesting results for the user
      const suggestedSearches = [
        'Cases about personal freedoms being violated',
        'Cases involving a US President',
        'Cases involving guns',
        'Cases where Nixon was the defendant',
        'How much power does the commerce clause give Congress?',
        'Cases about personal rights or congressional overreach?',
        'Cases involving the ability to pay for an attorney',
        ...
      ];

      // Then, we pass them into our SearchForm component: 
      <SearchForm
        suggestedSearches={suggestedSearches}
        onSearch={(query: string) => {
          handleSearch(query, setResults, setIsSearching);
          setQuery(query);
        }}
      />

      ```

      This means you can pass in any suggested searches you wish given your specific use case.

      The SearchForm component is exported from `src/components/SearchForm.tsx`. It handles:

      * Displaying suggested searches
      * Allowing the user to search, or clear the input
      * Providing visual feedback to the user that the search is in progress

      **Local document processing via a bootstrapping service**

      We store several landmark legal cases as PDFs in the codebase, so that developers cloning and running the app locally can immediately build off the same experience being demonstrated by the legal semantic search app running on our Docs site.

      We use LangChain to parse the PDFs, convert them into chunks, and embed them. We store the resulting vectors in the Pinecone vector database.

      **Knowledge base bootstrapping**

      This project demonstrates how to programmatically bootstrap a knowledge base backed by a Pinecone vector database using arbitrary PDF files
      that are included in the codebase.

      The sample app use case is focused on semantic search over legal documents, but this exact same technique and code can be applied to any content stored locally.

      ```typescript theme={null}
      /**
       * Bootstraps the knowledge base: makes sure the target Pinecone index exists
       * and, if it holds no vectors yet, loads the bundled PDFs, chunks and embeds
       * them, and upserts the resulting vectors.
       *
       * @param targetIndex - Name of the Pinecone index to create (if needed) and populate.
       * @returns A JSON `NextResponse` with status 200 when the index is already
       *   populated or bootstrapping completes; `true` if an error was caught and logged.
       */
      export const handleBootstrapping = async (targetIndex: string) => {
        try {
          console.log(`Running bootstrapping procedure against Pinecone index: ${targetIndex}`);

          // If a Pinecone index with the target name doesn't exist, create it
          // If it does exist, return while suppressing conflict errors
          await createIndexIfNecessary(targetIndex);

          // Short-circuit early if the index already exists and has vectors in it
          const hasVectors = await pineconeIndexHasVectors(targetIndex);
          if (hasVectors) {
            console.log('Pinecone index already exists and has vectors in it - returning early without bootstrapping');
            return NextResponse.json({ success: true }, { status: 200 });
          }

          console.log('Pinecone index does not exist or has no vectors in it - bootstrapping');

          // Load metadata from db.json
          const metadata = await readMetadata();

          // Form the local path to the PDFs documents
          const docsPath = path.resolve(process.cwd(), 'docs/');

          const loader = new DirectoryLoader(docsPath, {
            '.pdf': (filePath: string) => new PDFLoader(filePath),
          });

          // Load all PDFs within the specified directory
          const documents = await loader.load();

          // Merge extracted metadata with documents based on filename
          for (const doc of documents) {
            const fileMetadata = metadata.find(meta => meta.filename === path.basename(doc.metadata.source));
            if (fileMetadata) {
              doc.metadata = { ...doc.metadata, ...fileMetadata, pageContent: doc.pageContent };
            } else {
              console.warn(`No metadata found for ${doc.metadata.source}`);
            }
          }

          // Split text into chunks
          const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });
          const splits = await splitter.splitDocuments(documents);

          // Assign unique IDs to each split and flatten metadata
          const castedSplits: Document[] = splits.map(split => ({
            pageContent: split.pageContent,
            metadata: {
              ...flattenMetadata(split.metadata as Document['metadata']),
              id: uuidv4(),
              pageContent: split.pageContent, // Ensure pageContent is included in metadata
            },
          }));

          // Extract page contents
          const pageContents = castedSplits.map(split => split.pageContent);

          // Generate embeddings for each chunk
          const voyageEmbeddings = new VoyageEmbeddings({
            apiKey: process.env.VOYAGE_API_KEY,
            inputType: "document",
            modelName: "voyage-law-2",
          });

          const embeddings = await voyageEmbeddings.embedDocuments(pageContents);

          // Combine embeddings with metadata and ensure IDs are defined
          const vectors = castedSplits.map((split, index) => {
            if (!split.metadata.id) {
              throw new Error('Document chunk is missing an ID');
            }
            return {
              id: split.metadata.id,
              values: embeddings[index],
              metadata: split.metadata,
            };
          });

          // Upsert into the same index that was created and checked above
          const pc = new Pinecone();
          const index = pc.Index(targetIndex);

          // Batch upserts to stay within the 2MB request size limit
          await batchUpserts(index, vectors);

          console.log('Bootstrap procedure completed.');
          return NextResponse.json({ success: true }, { status: 200 });

        } catch (error) {
          // Errors are logged rather than rethrown, so execution falls through
          // to the final return below.
          console.error('Error during bootstrap procedure:', error);
        }

        return true;
      }
      ```

      When a user accesses the app, it runs a check to determine if the bootstrapping procedure needs to be run.

      If the Pinecone index does not already exist, or if it exists but does not yet contain vectors, the bootstrapping procedure is run.

      The bootstrapping procedure:

      * Creates the Pinecone index specified by the `PINECONE_INDEX` environment variable
      * Loads metadata from the `docs/db.json` file
      * Loads all PDFs in the `docs` directory
      * Merges extracted metadata with documents based on filename
      * Splits text into chunks
      * Assigns unique IDs to each split and flattens metadata
      * Generates an embedding for each chunk with the `voyage-law-2` model
      * Upserts each chunk to the Pinecone vector database, in batches

      **Domain-specific embeddings model**

      This app uses Voyage AI's embeddings model, `voyage-law-2`, which is purpose-built for use with legal text. This app includes a small handful of landmark U.S. cases from Justia.

      During the bootstrapping phase, the case documents are chunked and passed to Voyage's embeddings model for embedding:

      ```typescript theme={null}
      // Generate embeddings for each chunk
      const voyageEmbeddings = new VoyageEmbeddings({
        apiKey: process.env.VOYAGE_API_KEY,
        inputType: "document",
        modelName: "voyage-law-2",
      });
      ```

      When the user executes a search, their query is sent to the `/api/search` route, which also uses
      Voyage's embeddings model to convert the user's query into query vectors:

      ```typescript theme={null}

      // Initialize VoyageEmbeddings for search: the text being embedded here is
      // the user's query, so it is sent with the "query" input type rather than
      // the "document" type used while bootstrapping.
      const voyageEmbeddings = new VoyageEmbeddings({
        apiKey: process.env.VOYAGE_API_KEY,
        inputType: "query",
        modelName: "voyage-law-2",
      });

      // Initialize PineconeVectorStore
      const vectorStore = new PineconeStore(voyageEmbeddings, {
        pineconeIndex: pc.Index(process.env.PINECONE_INDEX as string),
      });

      const retrieved = await vectorStore.maxMarginalRelevanceSearch(query, { k: 20 });
      ```

      ***

      ## Troubleshooting

      Experiencing any issues with the sample app?
      [Submit an issue, create a PR](https://github.com/pinecone-io/sample-apps/), or post in our [community forum](https://community.pinecone.io)!
    </div>

    <div className="sidebar toc" />
  </div>
</div>
