feat(community): add elasticsearch hybrid search (#9385)

margaretjgu · hntrl · web-flow · commit 163614e3c11b · 2025-12-07T12:16:15.000-08:00
Co-authored-by: Hunter Lovell <40191806+hntrl@users.noreply.github.com>
diff --git a/.changeset/empty-ligers-live.md b/.changeset/empty-ligers-live.md
@@ -0,0 +1,5 @@
+---
+"@langchain/community": patch
+---
+
+add elasticsearch hybrid search
diff --git a/examples/src/langchain-classic/indexes/vector_stores/elasticsearch/elasticsearch_hybrid.ts b/examples/src/langchain-classic/indexes/vector_stores/elasticsearch/elasticsearch_hybrid.ts
@@ -0,0 +1,125 @@
+import { Client, ClientOptions } from "@elastic/elasticsearch";
+import { OpenAIEmbeddings } from "@langchain/openai";
+import {
+  ElasticClientArgs,
+  ElasticVectorSearch,
+  HybridRetrievalStrategy,
+} from "@langchain/community/vectorstores/elasticsearch";
+import { Document } from "@langchain/core/documents";
+
+/**
+ * Walks through hybrid search on Elasticsearch, where results from
+ * - vector (semantic) search over embeddings and
+ * - BM25 (lexical) full-text search
+ * are merged with Reciprocal Rank Fusion (RRF).
+ *
+ * Prerequisites:
+ * - Elasticsearch 8.9+ (for RRF support)
+ * - Start a cluster: docker-compose up -d --build (in elasticsearch directory)
+ * - Export ELASTIC_URL, ELASTIC_API_KEY (or ELASTIC_USERNAME/ELASTIC_PASSWORD)
+ */
+export async function run() {
+  const env = process.env;
+
+  // API-key auth wins; basic auth is used only when both parts are present.
+  let auth: ClientOptions["auth"];
+  if (env.ELASTIC_API_KEY) {
+    auth = { apiKey: env.ELASTIC_API_KEY };
+  } else if (env.ELASTIC_USERNAME && env.ELASTIC_PASSWORD) {
+    auth = {
+      username: env.ELASTIC_USERNAME,
+      password: env.ELASTIC_PASSWORD,
+    };
+  }
+  const config: ClientOptions = {
+    node: env.ELASTIC_URL ?? "http://127.0.0.1:9200",
+    ...(auth ? { auth } : {}),
+  };
+
+  const embeddings = new OpenAIEmbeddings();
+
+  const strategy = new HybridRetrievalStrategy({
+    rankWindowSize: 100,
+    rankConstant: 60,
+    textField: "text",
+  });
+  const clientArgs: ElasticClientArgs = {
+    client: new Client(config),
+    indexName: env.ELASTIC_INDEX ?? "test_hybrid_search",
+    strategy,
+  };
+
+  const vectorStore = new ElasticVectorSearch(embeddings, clientArgs);
+
+  // Start from a clean index so reruns do not accumulate documents.
+  await vectorStore.deleteIfExists();
+
+  // Prints one numbered line per hit, followed by that hit's metadata.
+  const printHits = (
+    hits: Awaited<ReturnType<typeof vectorStore.similaritySearchWithScore>>
+  ): void => {
+    let rank = 0;
+    for (const [doc, score] of hits) {
+      rank += 1;
+      console.log(`${rank}. [Score: ${score.toFixed(4)}] ${doc.pageContent}`);
+      console.log(`   Metadata: ${JSON.stringify(doc.metadata)}\n`);
+    }
+  };
+
+  // Sample corpus as [pageContent, category, topic] rows.
+  const samples: Array<[string, string, string]> = [
+    [
+      "Running helps build cardiovascular endurance and strengthens leg muscles.",
+      "fitness",
+      "running",
+    ],
+    [
+      "Marathon training requires consistent mileage and proper recovery.",
+      "fitness",
+      "running",
+    ],
+    [
+      "Muscle soreness after exercise is caused by microscopic damage to muscle fibers.",
+      "health",
+      "recovery",
+    ],
+    [
+      "Stretching and foam rolling can help prevent post-workout muscle pain.",
+      "health",
+      "recovery",
+    ],
+    [
+      "Python is a popular programming language for data science and machine learning.",
+      "technology",
+      "programming",
+    ],
+  ];
+  const docs = samples.map(
+    ([pageContent, category, topic]) =>
+      new Document({ pageContent, metadata: { category, topic } })
+  );
+
+  console.log("Adding documents to Elasticsearch...");
+  await vectorStore.addDocuments(docs);
+  console.log("Documents added successfully!\n");
+
+  // Example 1: a query that benefits from both semantic and keyword matching
+  console.log("=== Example 1: Hybrid Search ===");
+  const query1 = "How to avoid muscle soreness while running?";
+  console.log(`Query: "${query1}"\n`);
+  printHits(await vectorStore.similaritySearchWithScore(query1, 3));
+
+  // Example 2: a conceptual query where semantic similarity does the work
+  console.log("\n=== Example 2: Semantic Query ===");
+  const query2 = "tips for preventing pain after workouts";
+  console.log(`Query: "${query2}"\n`);
+  printHits(await vectorStore.similaritySearchWithScore(query2, 2));
+
+  // Example 3: the same kind of search restricted by a metadata filter
+  console.log("\n=== Example 3: Hybrid Search with Filters ===");
+  const query3 = "fitness advice";
+  console.log(`Query: "${query3}"`);
+  console.log(`Filter: category = "fitness"\n`);
+  printHits(
+    await vectorStore.similaritySearchWithScore(query3, 3, {
+      category: "fitness",
+    })
+  );
+
+  // Remove the demo index again.
+  console.log("\n=== Cleanup ===");
+  await vectorStore.deleteIfExists();
+  console.log("Index deleted.");
+}
diff --git a/libs/langchain-community/src/vectorstores/elasticsearch.ts b/libs/langchain-community/src/vectorstores/elasticsearch.ts
@@ -3,6 +3,7 @@ import { Client, estypes } from "@elastic/elasticsearch";
 import type { EmbeddingsInterface } from "@langchain/core/embeddings";
 import { VectorStore } from "@langchain/core/vectorstores";
 import { Document } from "@langchain/core/documents";
+import type { Callbacks } from "@langchain/core/callbacks/manager";
 /**
  * Type representing the k-nearest neighbors (k-NN) engine used in
  * Elasticsearch.
@@ -24,6 +25,30 @@ interface VectorSearchOptions {
   readonly candidates?: number;
 }
 
+/**
+ * Configuration options for hybrid retrieval strategy.
+ *
+ * Every field is optional; the `HybridRetrievalStrategy` constructor fills in
+ * a default for any field that is left unset.
+ */
+export interface HybridRetrievalStrategyConfig {
+  /** Used for the RRF retriever's `rank_window_size`. Defaults to 100. */
+  rankWindowSize?: number;
+  /** Used for the RRF retriever's `rank_constant`. Defaults to 60. */
+  rankConstant?: number;
+  /** Index field targeted by the text `match` query. Defaults to "text". */
+  textField?: string;
+}
+
+/**
+ * Settings holder for hybrid retrieval, i.e. vector search fused with BM25
+ * text search through RRF.
+ *
+ * Any option omitted from `config` falls back to its default:
+ * `rankWindowSize` 100, `rankConstant` 60, `textField` "text".
+ */
+export class HybridRetrievalStrategy {
+  public readonly rankWindowSize: number;
+
+  public readonly rankConstant: number;
+
+  public readonly textField: string;
+
+  constructor(config: HybridRetrievalStrategyConfig = {}) {
+    const { rankWindowSize, rankConstant, textField } = config;
+
+    // `??` keeps explicit falsy values such as 0 or "" and only replaces
+    // null/undefined.
+    this.rankWindowSize = rankWindowSize ?? 100;
+    this.rankConstant = rankConstant ?? 60;
+    this.textField = textField ?? "text";
+  }
+}
+
 /**
  * Interface defining the arguments required to create an Elasticsearch
  * client.
@@ -32,6 +57,7 @@ export interface ElasticClientArgs {
   readonly client: Client;
   readonly indexName?: string;
   readonly vectorSearchOptions?: VectorSearchOptions;
+  readonly strategy?: HybridRetrievalStrategy;
 }
 
 /**
@@ -51,10 +77,23 @@ type ElasticMetadataTerms = {
 };
 
 /**
- * Class for interacting with an Elasticsearch database. It extends the
- * VectorStore base class and provides methods for adding documents and
- * vectors to the Elasticsearch database, performing similarity searches,
- * deleting documents, and more.
+ * Elasticsearch vector store supporting vector and hybrid search.
+ *
+ * Hybrid search combines kNN vector search with BM25 full-text search
+ * using RRF. Enable by passing a `HybridRetrievalStrategy` to the constructor.
+ *
+ * @example
+ * ```typescript
+ * // Vector search (default)
+ * const vectorStore = new ElasticVectorSearch(embeddings, { client, indexName });
+ *
+ * // Hybrid search
+ * const hybridStore = new ElasticVectorSearch(embeddings, {
+ *   client,
+ *   indexName,
+ *   strategy: new HybridRetrievalStrategy()
+ * });
+ * ```
  */
 export class ElasticVectorSearch extends VectorStore {
   declare FilterType: ElasticFilter;
@@ -73,6 +112,10 @@ export class ElasticVectorSearch extends VectorStore {
 
   private readonly candidates: number;
 
+  private readonly strategy?: HybridRetrievalStrategy;
+
+  private lastQueryText?: string;
+
   _vectorstoreType(): string {
     return "elasticsearch";
   }
@@ -85,9 +128,14 @@ export class ElasticVectorSearch extends VectorStore {
     this.m = args.vectorSearchOptions?.m ?? 16;
     this.efConstruction = args.vectorSearchOptions?.efConstruction ?? 100;
     this.candidates = args.vectorSearchOptions?.candidates ?? 200;
+    this.strategy = args.strategy;
+
+    const userAgent = this.strategy
+      ? "langchain-js-vs-hybrid/0.0.1"
+      : "langchain-js-vs/0.0.1";
 
     this.client = args.client.child({
-      headers: { "user-agent": "langchain-js-vs/0.0.1" },
+      headers: { "user-agent": userAgent },
     });
     this.indexName = args.indexName ?? "documents";
   }
@@ -155,6 +203,16 @@ export class ElasticVectorSearch extends VectorStore {
     return documentIds;
   }
 
+  /**
+   * Text-query search returning the `k` best-matching documents.
+   *
+   * Without a hybrid strategy (or with an empty query) this defers to the
+   * base `VectorStore` implementation. With a strategy, the query is embedded
+   * here and the text and vector are passed to the hybrid search as
+   * arguments. The text is deliberately NOT stashed on the instance: shared
+   * mutable state could be overwritten by a concurrent search while the
+   * embedding call is awaited, and would linger to affect later vector-only
+   * calls.
+   *
+   * @param query Natural-language query text.
+   * @param k Maximum number of documents to return. Defaults to 4.
+   * @param filter Optional metadata filter.
+   * @param _callbacks Forwarded to the base implementation on the
+   *   vector-only path.
+   * @returns The matching documents, without scores.
+   */
+  async similaritySearch(
+    query: string,
+    k = 4,
+    filter?: ElasticFilter,
+    _callbacks?: Callbacks
+  ): Promise<Document[]> {
+    if (!this.strategy || !query) {
+      return super.similaritySearch(query, k, filter, _callbacks);
+    }
+
+    const queryVector = await this.embeddings.embedQuery(query);
+    const scored = await this.hybridSearchVectorWithScore(
+      query,
+      queryVector,
+      k,
+      filter
+    );
+    return scored.map(([doc]) => doc);
+  }
+
   /**
    * Method to perform a similarity search in the Elasticsearch database
    * using a vector. It returns the k most similar documents along with
@@ -169,6 +227,15 @@ export class ElasticVectorSearch extends VectorStore {
     k: number,
     filter?: ElasticFilter
   ): Promise<[Document, number][]> {
+    if (this.strategy && this.lastQueryText) {
+      return this.hybridSearchVectorWithScore(
+        this.lastQueryText,
+        query,
+        k,
+        filter
+      );
+    }
+
     const result = await this.client.search({
       index: this.indexName,
       size: k,
@@ -191,6 +258,59 @@ export class ElasticVectorSearch extends VectorStore {
     ]);
   }
 
+  /**
+   * Runs one hybrid search: a BM25 `match` retriever and a kNN retriever
+   * whose rankings are fused by Elasticsearch's RRF retriever.
+   *
+   * @param queryText Text matched against the strategy's `textField`.
+   * @param queryVector Embedding of the query, searched against the
+   *   "embedding" field.
+   * @param k Number of hits to return.
+   * @param filter Optional metadata filter applied to both retrievers.
+   * @returns `[document, score]` pairs in the order Elasticsearch returned
+   *   them.
+   * @throws Error if the store was created without a hybrid strategy.
+   */
+  private async hybridSearchVectorWithScore(
+    queryText: string,
+    queryVector: number[],
+    k: number,
+    filter?: ElasticFilter
+  ): Promise<[Document, number][]> {
+    const { strategy } = this;
+    if (!strategy) {
+      throw new Error(
+        "Hybrid search requires a HybridRetrievalStrategy to be configured."
+      );
+    }
+
+    const metadataTerms = this.buildMetadataTerms(filter);
+    const hasFilter =
+      metadataTerms.must.length > 0 || metadataTerms.must_not.length > 0;
+    // A search request that uses `retriever` may not also carry a top-level
+    // `query`, so the metadata filter is attached to each child retriever.
+    const filterClause = hasFilter ? { filter: { bool: metadataTerms } } : {};
+
+    const result = await this.client.search({
+      index: this.indexName,
+      size: k,
+      retriever: {
+        rrf: {
+          retrievers: [
+            {
+              standard: {
+                query: {
+                  match: {
+                    [strategy.textField]: queryText,
+                  },
+                },
+                ...filterClause,
+              },
+            },
+            {
+              knn: {
+                field: "embedding",
+                query_vector: queryVector,
+                k,
+                num_candidates: this.candidates,
+                ...filterClause,
+              },
+            },
+          ],
+          // Elasticsearch requires rank_window_size >= size.
+          rank_window_size: Math.max(strategy.rankWindowSize, k),
+          rank_constant: strategy.rankConstant,
+        },
+      },
+    });
+
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    return result.hits.hits.map((hit: any) => [
+      new Document({
+        pageContent: hit._source.text,
+        metadata: hit._source.metadata,
+      }),
+      hit._score,
+    ]);
+  }
+
   /**
    * Method to delete documents from the Elasticsearch database.
    * @param params Object containing the IDs of the documents to delete.
diff --git a/libs/langchain-community/src/vectorstores/tests/elasticsearch.int.test.ts b/libs/langchain-community/src/vectorstores/tests/elasticsearch.int.test.ts

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"@langchain/community": patch
 +---
++
 +add elasticsearch hybrid search