Update testing suite (#73)

ssh-esh · ssh-esh · web-flow · commit fb9de59de2f9 · 2025-09-19T13:41:08.000-04:00
* update python versions for testing

* create testing matrix for ES versions

* update python versions for _lint workflow

* update python versions for unit test workflow

* set fail-fast to false in integration and unit test workflows

* make fake_embeddings less uniform for quantization

* introduce stable hash embeddings

* allow test_similarity_search_without_metadata to use stable hash embeddings

* use ruff to fix lint errors

* fix some more linting errors

* use stable hash embeddings on test_add_embeddings

* make relevance score tests use stable hash embeddings

* create a tolerance on score assertion for test_elasticsearch_with_relevance_score

* fix linting errors

* use StableHashEmbeddings on test_similarity_search_approx_by_vector

* create tolerance on score assertion for test_similarity_search_approx_by_vector

* debug: rank_window_size breaking change

* decide window_key based on ES version for async rrf hybrid search

* decide window_key based on ES version for sync rrf hybrid search

* fix lint errors

* Add comment for StableHashEmbeddings

* improve code comments

* fix lint

* add type annotation for stable hash embeddings

* update comments

* review comment: update consistent embeddings to hash based

* fix lint

* review comment: change to 16-dim vector, round to 2 decimal places, and assert full query body

* review comment: min ES version is 8.15 in test matrix

* review comment: update rank_window_size

* linting

* review comments: update comments

---------

Co-authored-by: ssh-esh <eyoeshetu@mac.home>
diff --git a/.github/workflows/_integration_test.yml b/.github/workflows/_integration_test.yml
@@ -25,14 +25,21 @@ jobs:
         working-directory: ${{ inputs.working-directory }}
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         python-version:
           - "3.9"
           - "3.10"
           - "3.11"
+          - "3.12"
+          - "3.13"
+        elasticsearch-version:
+          - "8.15.0"
+          - "8.19.0"
+          - "9.1.2"
     services:
       elasticsearch:
-        image: elasticsearch:8.13.0
+        image: elasticsearch:${{ matrix.elasticsearch-version }}
         env:
           discovery.type: single-node
           xpack.license.self_generated.type: trial
diff --git a/.github/workflows/_lint.yml b/.github/workflows/_lint.yml
@@ -29,8 +29,8 @@ jobs:
         # Starting new jobs is also relatively slow,
         # so linting on fewer versions makes CI faster.
         python-version:
-          - "3.8"
-          - "3.11"
+          - "3.9"
+          - "3.13"
     steps:
       - uses: actions/checkout@v4
 
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
@@ -18,12 +18,14 @@ jobs:
         working-directory: ${{ inputs.working-directory }}
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         python-version:
-          - "3.8"
           - "3.9"
           - "3.10"
           - "3.11"
+          - "3.12"
+          - "3.13"
     name: "make test #${{ matrix.python-version }}"
     steps:
       - uses: actions/checkout@v4
diff --git a/libs/elasticsearch/tests/_async/fake_embeddings.py b/libs/elasticsearch/tests/_async/fake_embeddings.py
@@ -1,5 +1,6 @@
 """Fake Embedding class for testing purposes."""
 
+import hashlib
 from typing import List
 
 from langchain_core.embeddings import Embeddings
@@ -24,26 +25,33 @@ async def aembed_query(self, text: str) -> List[float]:
 
 
 class AsyncConsistentFakeEmbeddings(AsyncFakeEmbeddings):
-    """Fake embeddings which remember all the texts seen so far to return consistent
-    vectors for the same texts."""
-
-    def __init__(self, dimensionality: int = 10) -> None:
-        self.known_texts: List[str] = []
-        self.dimensionality = dimensionality
+    """Deterministic hash-based embeddings for robust testing (async version).
+
+    Why:
+    - Elasticsearch 8.14+ indexes dense vectors with int8_hnsw by default.
+      Quantization (int8) + HNSW ANN can slightly disturb scores/ranking
+      especially when vectors are nearly identical.
+    - Tests need deterministic separation so small quantization/ANN
+      effects do not flip top-1 results or break strict assertions.
+
+    What:
+    - Produce a 16-dim vector from md5(text), convert to floats, then L1-normalize
+      and round each value to 2 decimal places for precision stability (after
+      rounding the values sum to approximately, not exactly, 1.0). This gives
+      stable, well-separated, deterministic vectors that work across ES versions.
+    """
+
+    @staticmethod
+    def _encode(text: str) -> List[float]:
+        digest = hashlib.md5(text.encode("utf-8")).digest()
+        total = sum(digest)
+        # Round to 2 decimal places to avoid precision issues
+        return [round(float(v) / float(total), 2) for v in digest]
 
     async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
-        """Return consistent embeddings for each text seen so far."""
-        out_vectors = []
-        for text in texts:
-            if text not in self.known_texts:
-                self.known_texts.append(text)
-            vector = [float(1.0)] * (self.dimensionality - 1) + [
-                float(self.known_texts.index(text))
-            ]
-            out_vectors.append(vector)
-        return out_vectors
+        """Return stable hash-based embeddings for each text."""
+        return [self._encode(text) for text in texts]
 
     async def aembed_query(self, text: str) -> List[float]:
-        """Return consistent embeddings for the text, if seen before, or a constant
-        one if the text is unknown."""
-        return (await self.aembed_documents([text]))[0]
+        """Return stable hash-based embeddings for the text."""
+        return self._encode(text)
diff --git a/libs/elasticsearch/tests/_sync/fake_embeddings.py b/libs/elasticsearch/tests/_sync/fake_embeddings.py
@@ -1,5 +1,6 @@
 """Fake Embedding class for testing purposes."""
 
+import hashlib
 from typing import List
 
 from langchain_core.embeddings import Embeddings
@@ -24,26 +25,33 @@ def embed_query(self, text: str) -> List[float]:
 
 
 class ConsistentFakeEmbeddings(FakeEmbeddings):
-    """Fake embeddings which remember all the texts seen so far to return consistent
-    vectors for the same texts."""
-
-    def __init__(self, dimensionality: int = 10) -> None:
-        self.known_texts: List[str] = []
-        self.dimensionality = dimensionality
+    """Deterministic hash-based embeddings for robust testing (sync version).
+
+    Why:
+    - Elasticsearch 8.14+ indexes dense vectors with int8_hnsw by default.
+      Quantization (int8) + HNSW ANN can slightly disturb scores/ranking
+      especially when vectors are nearly identical.
+    - Tests need deterministic separation so small quantization/ANN
+      effects do not flip top-1 results or break strict assertions.
+
+    What:
+    - Produce a 16-dim vector from md5(text), convert to floats, then L1-normalize
+      and round each value to 2 decimal places for precision stability (after
+      rounding the values sum to approximately, not exactly, 1.0). This gives
+      stable, well-separated, deterministic vectors that work across ES versions.
+    """
+
+    @staticmethod
+    def _encode(text: str) -> List[float]:
+        digest = hashlib.md5(text.encode("utf-8")).digest()
+        total = sum(digest)
+        # Round to 2 decimal places to avoid precision issues
+        return [round(float(v) / float(total), 2) for v in digest]
 
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
-        """Return consistent embeddings for each text seen so far."""
-        out_vectors = []
-        for text in texts:
-            if text not in self.known_texts:
-                self.known_texts.append(text)
-            vector = [float(1.0)] * (self.dimensionality - 1) + [
-                float(self.known_texts.index(text))
-            ]
-            out_vectors.append(vector)
-        return out_vectors
+        """Return stable hash-based embeddings for each text."""
+        return [self._encode(text) for text in texts]
 
     def embed_query(self, text: str) -> List[float]:
-        """Return consistent embeddings for the text, if seen before, or a constant
-        one if the text is unknown."""
-        return (self.embed_documents([text]))[0]
+        """Return stable hash-based embeddings for the text."""
+        return self._encode(text)
diff --git a/libs/elasticsearch/tests/fake_embeddings.py b/libs/elasticsearch/tests/fake_embeddings.py
@@ -5,7 +5,9 @@
 from ._async.fake_embeddings import (
     AsyncConsistentFakeEmbeddings as _AsyncConsistentFakeEmbeddings,
 )
-from ._async.fake_embeddings import AsyncFakeEmbeddings as _AsyncFakeEmbeddings
+from ._async.fake_embeddings import (
+    AsyncFakeEmbeddings as _AsyncFakeEmbeddings,
+)
 from ._sync.fake_embeddings import (  # noqa: F401
     ConsistentFakeEmbeddings,
     FakeEmbeddings,
diff --git a/libs/elasticsearch/tests/integration_tests/_async/test_vectorstores.py b/libs/elasticsearch/tests/integration_tests/_async/test_vectorstores.py
@@ -10,7 +10,10 @@
 
 from langchain_elasticsearch.vectorstores import AsyncElasticsearchStore
 
-from ...fake_embeddings import AsyncConsistentFakeEmbeddings, AsyncFakeEmbeddings
+from ...fake_embeddings import (
+    AsyncConsistentFakeEmbeddings,
+    AsyncFakeEmbeddings,
+)
 from ._test_utilities import clear_test_indices, create_es_client, read_env
 
 logging.basicConfig(level=logging.DEBUG)
@@ -172,15 +175,32 @@ def assert_query(
                     "filter": [],
                     "k": 1,
                     "num_candidates": 50,
-                    "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
+                    "query_vector": [
+                        0.06,
+                        0.07,
+                        0.01,
+                        0.08,
+                        0.03,
+                        0.07,
+                        0.09,
+                        0.03,
+                        0.09,
+                        0.09,
+                        0.04,
+                        0.03,
+                        0.08,
+                        0.07,
+                        0.06,
+                        0.08,
+                    ],
                 }
             }
             return query_body
 
         texts = ["foo", "bar", "baz"]
         docsearch = await AsyncElasticsearchStore.afrom_texts(
             texts,
-            AsyncFakeEmbeddings(),
+            AsyncConsistentFakeEmbeddings(),
             **es_params,
             index_name=index_name,
         )
@@ -597,7 +617,10 @@ def assert_query(
             k=1,
             custom_query=assert_query,
         )
-        assert output == [(Document(page_content="foo"), 1.0)]
+        doc, score = output[0]
+
+        assert doc == Document(page_content="foo")
+        assert score == pytest.approx(1.0, rel=0.05)
 
     @pytest.mark.asyncio
     async def test_similarity_search_approx_with_hybrid_search_rrf(
@@ -610,7 +633,7 @@ async def test_similarity_search_approx_with_hybrid_search_rrf(
         rrf_test_cases: List[Optional[Union[dict, bool]]] = [
             True,
             False,
-            {"rank_constant": 1, "window_size": 5},
+            {"rank_constant": 1, "rank_window_size": 5},
         ]
         for rrf_test_case in rrf_test_cases:
             texts = ["foo", "bar", "baz"]
@@ -687,7 +710,7 @@ def assert_query(
                 "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
             },
             size=3,
-            rank={"rrf": {"rank_constant": 1, "window_size": 5}},
+            rank={"rrf": {"rank_constant": 1, "rank_window_size": 5}},
         )
 
         assert [o.page_content for o in output] == [
@@ -748,7 +771,7 @@ async def test_deployed_model_check_fails_approx(
         with pytest.raises(NotFoundError):
             await AsyncElasticsearchStore.afrom_texts(
                 texts=["foo", "bar", "baz"],
-                embedding=AsyncConsistentFakeEmbeddings(10),
+                embedding=AsyncConsistentFakeEmbeddings(),
                 **es_params,
                 index_name=index_name,
                 strategy=AsyncElasticsearchStore.ApproxRetrievalStrategy(
@@ -778,7 +801,7 @@ async def test_elasticsearch_with_relevance_score(
         """Test to make sure the relevance score is scaled to 0-1."""
         texts = ["foo", "bar", "baz"]
         metadatas = [{"page": str(i)} for i in range(len(texts))]
-        embeddings = AsyncFakeEmbeddings()
+        embeddings = AsyncConsistentFakeEmbeddings()
 
         docsearch = await AsyncElasticsearchStore.afrom_texts(
             index_name=index_name,
@@ -792,7 +815,10 @@ async def test_elasticsearch_with_relevance_score(
         output = await docsearch.asimilarity_search_by_vector_with_relevance_scores(
             embedding=embedded_query, k=1
         )
-        assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)]
+        doc, score = output[0]
+
+        assert doc == Document(page_content="foo", metadata={"page": "0"})
+        assert score == pytest.approx(1.0, rel=0.05)
 
     @pytest.mark.asyncio
     async def test_similarity_search_bm25_search(
diff --git a/libs/elasticsearch/tests/integration_tests/_sync/test_vectorstores.py b/libs/elasticsearch/tests/integration_tests/_sync/test_vectorstores.py
@@ -10,7 +10,10 @@
 
 from langchain_elasticsearch.vectorstores import ElasticsearchStore
 
-from ...fake_embeddings import ConsistentFakeEmbeddings, FakeEmbeddings
+from ...fake_embeddings import (
+    ConsistentFakeEmbeddings,
+    FakeEmbeddings,
+)
 from ._test_utilities import clear_test_indices, create_es_client, read_env
 
 logging.basicConfig(level=logging.DEBUG)
@@ -172,15 +175,32 @@ def assert_query(
                     "filter": [],
                     "k": 1,
                     "num_candidates": 50,
-                    "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
+                    "query_vector": [
+                        0.06,
+                        0.07,
+                        0.01,
+                        0.08,
+                        0.03,
+                        0.07,
+                        0.09,
+                        0.03,
+                        0.09,
+                        0.09,
+                        0.04,
+                        0.03,
+                        0.08,
+                        0.07,
+                        0.06,
+                        0.08,
+                    ],
                 }
             }
             return query_body
 
         texts = ["foo", "bar", "baz"]
         docsearch = ElasticsearchStore.from_texts(
             texts,
-            FakeEmbeddings(),
+            ConsistentFakeEmbeddings(),
             **es_params,
             index_name=index_name,
         )
@@ -581,7 +601,10 @@ def assert_query(
             k=1,
             custom_query=assert_query,
         )
-        assert output == [(Document(page_content="foo"), 1.0)]
+        doc, score = output[0]
+
+        assert doc == Document(page_content="foo")
+        assert score == pytest.approx(1.0, rel=0.05)
 
     @pytest.mark.sync
     def test_similarity_search_approx_with_hybrid_search_rrf(
@@ -594,7 +617,7 @@ def test_similarity_search_approx_with_hybrid_search_rrf(
         rrf_test_cases: List[Optional[Union[dict, bool]]] = [
             True,
             False,
-            {"rank_constant": 1, "window_size": 5},
+            {"rank_constant": 1, "rank_window_size": 5},
         ]
         for rrf_test_case in rrf_test_cases:
             texts = ["foo", "bar", "baz"]
@@ -671,7 +694,7 @@ def assert_query(
                 "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
             },
             size=3,
-            rank={"rrf": {"rank_constant": 1, "window_size": 5}},
+            rank={"rrf": {"rank_constant": 1, "rank_window_size": 5}},
         )
 
         assert [o.page_content for o in output] == [
@@ -730,7 +753,7 @@ def test_deployed_model_check_fails_approx(
         with pytest.raises(NotFoundError):
             ElasticsearchStore.from_texts(
                 texts=["foo", "bar", "baz"],
-                embedding=ConsistentFakeEmbeddings(10),
+                embedding=ConsistentFakeEmbeddings(),
                 **es_params,
                 index_name=index_name,
                 strategy=ElasticsearchStore.ApproxRetrievalStrategy(
@@ -760,7 +783,7 @@ def test_elasticsearch_with_relevance_score(
         """Test to make sure the relevance score is scaled to 0-1."""
         texts = ["foo", "bar", "baz"]
         metadatas = [{"page": str(i)} for i in range(len(texts))]
-        embeddings = FakeEmbeddings()
+        embeddings = ConsistentFakeEmbeddings()
 
         docsearch = ElasticsearchStore.from_texts(
             index_name=index_name,
@@ -774,7 +797,10 @@ def test_elasticsearch_with_relevance_score(
         output = docsearch.similarity_search_by_vector_with_relevance_scores(
             embedding=embedded_query, k=1
         )
-        assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)]
+        doc, score = output[0]
+
+        assert doc == Document(page_content="foo", metadata={"page": "0"})
+        assert score == pytest.approx(1.0, rel=0.05)
 
     @pytest.mark.sync
     def test_similarity_search_bm25_search(