Skip to content

Commit 163614e

Browse files
margaretjgu and hntrl authored
feat(community): add elasticsearch hybrid search (#9385)
Co-authored-by: Hunter Lovell <[email protected]>
1 parent 5b27f38 commit 163614e

File tree

4 files changed

+516
-6
lines changed

4 files changed

+516
-6
lines changed

.changeset/empty-ligers-live.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@langchain/community": patch
3+
---
4+
5+
add elasticsearch hybrid search
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import { Client, ClientOptions } from "@elastic/elasticsearch";
2+
import { OpenAIEmbeddings } from "@langchain/openai";
3+
import {
4+
ElasticClientArgs,
5+
ElasticVectorSearch,
6+
HybridRetrievalStrategy,
7+
} from "@langchain/community/vectorstores/elasticsearch";
8+
import { Document } from "@langchain/core/documents";
9+
10+
/**
11+
* Demonstrates hybrid search with Elasticsearch, combining:
12+
* - Vector (semantic) search using embeddings
13+
* - BM25 (lexical) full-text search
14+
* - Reciprocal Rank Fusion (RRF) for result merging
15+
*
16+
* Requirements:
17+
* - Elasticsearch 8.9+ (for RRF support)
18+
* - Run: docker-compose up -d --build (in elasticsearch directory)
19+
* - Set ELASTIC_URL, ELASTIC_API_KEY (or ELASTIC_USERNAME/ELASTIC_PASSWORD)
20+
*/
21+
export async function run() {
22+
const config: ClientOptions = {
23+
node: process.env.ELASTIC_URL ?? "http://127.0.0.1:9200",
24+
};
25+
if (process.env.ELASTIC_API_KEY) {
26+
config.auth = {
27+
apiKey: process.env.ELASTIC_API_KEY,
28+
};
29+
} else if (process.env.ELASTIC_USERNAME && process.env.ELASTIC_PASSWORD) {
30+
config.auth = {
31+
username: process.env.ELASTIC_USERNAME,
32+
password: process.env.ELASTIC_PASSWORD,
33+
};
34+
}
35+
36+
const embeddings = new OpenAIEmbeddings();
37+
38+
const clientArgs: ElasticClientArgs = {
39+
client: new Client(config),
40+
indexName: process.env.ELASTIC_INDEX ?? "test_hybrid_search",
41+
strategy: new HybridRetrievalStrategy({
42+
rankWindowSize: 100,
43+
rankConstant: 60,
44+
textField: "text",
45+
}),
46+
};
47+
48+
const vectorStore = new ElasticVectorSearch(embeddings, clientArgs);
49+
50+
await vectorStore.deleteIfExists();
51+
52+
// Add sample documents
53+
const docs = [
54+
new Document({
55+
pageContent:
56+
"Running helps build cardiovascular endurance and strengthens leg muscles.",
57+
metadata: { category: "fitness", topic: "running" },
58+
}),
59+
new Document({
60+
pageContent:
61+
"Marathon training requires consistent mileage and proper recovery.",
62+
metadata: { category: "fitness", topic: "running" },
63+
}),
64+
new Document({
65+
pageContent:
66+
"Muscle soreness after exercise is caused by microscopic damage to muscle fibers.",
67+
metadata: { category: "health", topic: "recovery" },
68+
}),
69+
new Document({
70+
pageContent:
71+
"Stretching and foam rolling can help prevent post-workout muscle pain.",
72+
metadata: { category: "health", topic: "recovery" },
73+
}),
74+
new Document({
75+
pageContent:
76+
"Python is a popular programming language for data science and machine learning.",
77+
metadata: { category: "technology", topic: "programming" },
78+
}),
79+
];
80+
81+
console.log("Adding documents to Elasticsearch...");
82+
await vectorStore.addDocuments(docs);
83+
console.log("Documents added successfully!\n");
84+
85+
// Example 1: Hybrid search combines semantic + keyword matching
86+
console.log("=== Example 1: Hybrid Search ===");
87+
const query1 = "How to avoid muscle soreness while running?";
88+
console.log(`Query: "${query1}"\n`);
89+
90+
const results1 = await vectorStore.similaritySearchWithScore(query1, 3);
91+
results1.forEach(([doc, score], i) => {
92+
console.log(`${i + 1}. [Score: ${score.toFixed(4)}] ${doc.pageContent}`);
93+
console.log(` Metadata: ${JSON.stringify(doc.metadata)}\n`);
94+
});
95+
96+
// Example 2: Semantic search works well for conceptual queries
97+
console.log("\n=== Example 2: Semantic Query ===");
98+
const query2 = "tips for preventing pain after workouts";
99+
console.log(`Query: "${query2}"\n`);
100+
101+
const results2 = await vectorStore.similaritySearchWithScore(query2, 2);
102+
results2.forEach(([doc, score], i) => {
103+
console.log(`${i + 1}. [Score: ${score.toFixed(4)}] ${doc.pageContent}`);
104+
console.log(` Metadata: ${JSON.stringify(doc.metadata)}\n`);
105+
});
106+
107+
// Example 3: With metadata filters
108+
console.log("\n=== Example 3: Hybrid Search with Filters ===");
109+
const query3 = "fitness advice";
110+
console.log(`Query: "${query3}"`);
111+
console.log(`Filter: category = "fitness"\n`);
112+
113+
const results3 = await vectorStore.similaritySearchWithScore(query3, 3, {
114+
category: "fitness",
115+
});
116+
results3.forEach(([doc, score], i) => {
117+
console.log(`${i + 1}. [Score: ${score.toFixed(4)}] ${doc.pageContent}`);
118+
console.log(` Metadata: ${JSON.stringify(doc.metadata)}\n`);
119+
});
120+
121+
// Clean up
122+
console.log("\n=== Cleanup ===");
123+
await vectorStore.deleteIfExists();
124+
console.log("Index deleted.");
125+
}

libs/langchain-community/src/vectorstores/elasticsearch.ts

Lines changed: 125 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { Client, estypes } from "@elastic/elasticsearch";
33
import type { EmbeddingsInterface } from "@langchain/core/embeddings";
44
import { VectorStore } from "@langchain/core/vectorstores";
55
import { Document } from "@langchain/core/documents";
6+
import type { Callbacks } from "@langchain/core/callbacks/manager";
67
/**
78
* Type representing the k-nearest neighbors (k-NN) engine used in
89
* Elasticsearch.
@@ -24,6 +25,30 @@ interface VectorSearchOptions {
2425
readonly candidates?: number;
2526
}
2627

28+
/**
29+
* Configuration options for hybrid retrieval strategy.
30+
*/
31+
export interface HybridRetrievalStrategyConfig {
32+
rankWindowSize?: number;
33+
rankConstant?: number;
34+
textField?: string;
35+
}
36+
37+
/**
38+
* Hybrid search strategy combining vector and BM25 search using RRF.
39+
*/
40+
export class HybridRetrievalStrategy {
41+
public readonly rankWindowSize: number;
42+
public readonly rankConstant: number;
43+
public readonly textField: string;
44+
45+
constructor(config: HybridRetrievalStrategyConfig = {}) {
46+
this.rankWindowSize = config.rankWindowSize ?? 100;
47+
this.rankConstant = config.rankConstant ?? 60;
48+
this.textField = config.textField ?? "text";
49+
}
50+
}
51+
2752
/**
2853
* Interface defining the arguments required to create an Elasticsearch
2954
* client.
@@ -32,6 +57,7 @@ export interface ElasticClientArgs {
3257
readonly client: Client;
3358
readonly indexName?: string;
3459
readonly vectorSearchOptions?: VectorSearchOptions;
60+
readonly strategy?: HybridRetrievalStrategy;
3561
}
3662

3763
/**
@@ -51,10 +77,23 @@ type ElasticMetadataTerms = {
5177
};
5278

5379
/**
54-
* Class for interacting with an Elasticsearch database. It extends the
55-
* VectorStore base class and provides methods for adding documents and
56-
* vectors to the Elasticsearch database, performing similarity searches,
57-
* deleting documents, and more.
80+
* Elasticsearch vector store supporting vector and hybrid search.
81+
*
82+
* Hybrid search combines kNN vector search with BM25 full-text search
83+
* using RRF. Enable by passing a `HybridRetrievalStrategy` to the constructor.
84+
*
85+
* @example
86+
* ```typescript
87+
* // Vector search (default)
88+
* const vectorStore = new ElasticVectorSearch(embeddings, { client, indexName });
89+
*
90+
* // Hybrid search
91+
* const hybridStore = new ElasticVectorSearch(embeddings, {
92+
* client,
93+
* indexName,
94+
* strategy: new HybridRetrievalStrategy()
95+
* });
96+
* ```
5897
*/
5998
export class ElasticVectorSearch extends VectorStore {
6099
declare FilterType: ElasticFilter;
@@ -73,6 +112,10 @@ export class ElasticVectorSearch extends VectorStore {
73112

74113
private readonly candidates: number;
75114

115+
private readonly strategy?: HybridRetrievalStrategy;
116+
117+
private lastQueryText?: string;
118+
76119
_vectorstoreType(): string {
77120
return "elasticsearch";
78121
}
@@ -85,9 +128,14 @@ export class ElasticVectorSearch extends VectorStore {
85128
this.m = args.vectorSearchOptions?.m ?? 16;
86129
this.efConstruction = args.vectorSearchOptions?.efConstruction ?? 100;
87130
this.candidates = args.vectorSearchOptions?.candidates ?? 200;
131+
this.strategy = args.strategy;
132+
133+
const userAgent = this.strategy
134+
? "langchain-js-vs-hybrid/0.0.1"
135+
: "langchain-js-vs/0.0.1";
88136

89137
this.client = args.client.child({
90-
headers: { "user-agent": "langchain-js-vs/0.0.1" },
138+
headers: { "user-agent": userAgent },
91139
});
92140
this.indexName = args.indexName ?? "documents";
93141
}
@@ -155,6 +203,16 @@ export class ElasticVectorSearch extends VectorStore {
155203
return documentIds;
156204
}
157205

206+
/**
 * Text-query search returning only the documents.
 *
 * Without a hybrid strategy this defers to the base implementation. With a
 * strategy it goes through `similaritySearchWithScore` so the query text is
 * handed to the hybrid search as an argument rather than through shared
 * instance state (which could mix up text and vectors of concurrent
 * searches, or leak a stale text into later vector-only searches).
 *
 * @param query Natural-language query text.
 * @param k Number of documents to return. Defaults to 4.
 * @param filter Optional metadata filter.
 * @param _callbacks Optional callbacks, forwarded unchanged.
 * @returns The matching documents, best match first.
 */
async similaritySearch(
  query: string,
  k = 4,
  filter?: ElasticFilter,
  _callbacks?: Callbacks
): Promise<Document[]> {
  if (!this.strategy) {
    return super.similaritySearch(query, k, filter, _callbacks);
  }
  const scored = await this.similaritySearchWithScore(
    query,
    k,
    filter,
    _callbacks
  );
  return scored.map(([doc]) => doc);
}

/**
 * Text-query search returning documents together with their scores.
 *
 * Without a hybrid strategy this defers to the base implementation. With a
 * strategy the query is embedded here and both the text and its vector are
 * passed directly to the hybrid (full-text + kNN, RRF-merged) search.
 *
 * @param query Natural-language query text.
 * @param k Number of documents to return. Defaults to 4.
 * @param filter Optional metadata filter.
 * @param _callbacks Optional callbacks, forwarded to the base implementation.
 * @returns Document/score pairs, best match first.
 */
async similaritySearchWithScore(
  query: string,
  k = 4,
  filter?: ElasticFilter,
  _callbacks?: Callbacks
): Promise<[Document, number][]> {
  if (!this.strategy) {
    return super.similaritySearchWithScore(query, k, filter, _callbacks);
  }
  const queryVector = await this.embeddings.embedQuery(query);
  return this.hybridSearchVectorWithScore(query, queryVector, k, filter);
}
215+
158216
/**
159217
* Method to perform a similarity search in the Elasticsearch database
160218
* using a vector. It returns the k most similar documents along with
@@ -169,6 +227,15 @@ export class ElasticVectorSearch extends VectorStore {
169227
k: number,
170228
filter?: ElasticFilter
171229
): Promise<[Document, number][]> {
230+
if (this.strategy && this.lastQueryText) {
231+
return this.hybridSearchVectorWithScore(
232+
this.lastQueryText,
233+
query,
234+
k,
235+
filter
236+
);
237+
}
238+
172239
const result = await this.client.search({
173240
index: this.indexName,
174241
size: k,
@@ -191,6 +258,59 @@ export class ElasticVectorSearch extends VectorStore {
191258
]);
192259
}
193260

261+
/**
 * Runs a hybrid search: a full-text `match` retriever on the strategy's
 * text field and a kNN retriever on the `embedding` field, merged by an RRF
 * retriever.
 *
 * Metadata filters are attached to each child retriever. The search API does
 * not accept a top-level `query` together with `retriever`, so the filter
 * cannot be sent alongside the retriever as a separate query.
 *
 * @param queryText Text used for the full-text retriever.
 * @param queryVector Embedding used for the kNN retriever.
 * @param k Number of hits to return.
 * @param filter Optional metadata filter applied to both retrievers.
 * @returns Document/score pairs in the order returned by Elasticsearch.
 * @throws Error if the store was created without a hybrid strategy.
 */
private async hybridSearchVectorWithScore(
  queryText: string,
  queryVector: number[],
  k: number,
  filter?: ElasticFilter
): Promise<[Document, number][]> {
  // Narrow once instead of using non-null assertions at every use site.
  const strategy = this.strategy;
  if (!strategy) {
    throw new Error(
      "Hybrid search requires a HybridRetrievalStrategy to be configured."
    );
  }

  const metadataTerms = this.buildMetadataTerms(filter);
  const hasFilter =
    metadataTerms.must.length > 0 || metadataTerms.must_not.length > 0;
  // Spread into both retrievers; stays empty when there is nothing to filter.
  const retrieverFilter = hasFilter ? { filter: { bool: metadataTerms } } : {};

  const result = await this.client.search({
    index: this.indexName,
    size: k,
    retriever: {
      rrf: {
        retrievers: [
          {
            standard: {
              query: {
                match: {
                  [strategy.textField]: queryText,
                },
              },
              ...retrieverFilter,
            },
          },
          {
            knn: {
              field: "embedding",
              query_vector: queryVector,
              k,
              num_candidates: this.candidates,
              ...retrieverFilter,
            },
          },
        ],
        // Keep the ranking window at least as large as the requested size.
        rank_window_size: Math.max(strategy.rankWindowSize, k),
        rank_constant: strategy.rankConstant,
      },
    },
  });

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  return result.hits.hits.map((hit: any) => [
    new Document({
      pageContent: hit._source.text,
      metadata: hit._source.metadata,
    }),
    hit._score,
  ]);
}
313+
194314
/**
195315
* Method to delete documents from the Elasticsearch database.
196316
* @param params Object containing the IDs of the documents to delete.

0 commit comments

Comments
 (0)