Skip to content

Commit 7f64241

Browse files
lucabellosinapah
andauthored
feat: send only one GrafanaSource to grafana (#379)
* feat: send only one GrafanaSource to grafana * fix tests * chore: add comment and add itest * chore: lint * fix: static * fix: itest * fix: comment * test: add traefik to itest * test: enhance docstring * fix: remove try except * fix: index * fix: lint * fix: test --------- Co-authored-by: Sina P <[email protected]> Co-authored-by: Sina <[email protected]>
1 parent 56a6e26 commit 7f64241

File tree

4 files changed

+145
-7
lines changed

4 files changed

+145
-7
lines changed

src/charm.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959

6060
logger = logging.getLogger(__name__)
6161

62+
6263
@dataclass
6364
class TLSConfig:
6465
"""TLS configuration received by the charm over the `certificates` relation."""
@@ -67,6 +68,7 @@ class TLSConfig:
6768
ca_cert: str
6869
private_key: str
6970

71+
7072
@trace_charm(
7173
tracing_endpoint="_charm_tracing_endpoint",
7274
server_cert="_charm_tracing_ca_cert",
@@ -143,7 +145,8 @@ def __init__(self, *args):
143145
self.grafana_source_provider = GrafanaSourceProvider(
144146
charm=self,
145147
source_type="alertmanager",
146-
source_url=self._external_url,
148+
source_url=self.ingress.url or self._service_url,
149+
is_ingress_per_app=True, # We want only one alertmanager datasource (unit) to be listed in grafana.
147150
refresh_event=[
148151
self.ingress.on.ready,
149152
self.ingress.on.revoked,
@@ -280,9 +283,7 @@ def set_ports(self):
280283

281284
@property
282285
def _catalogue_item(self) -> CatalogueItem:
283-
api_endpoints = {
284-
"Alerts": "/api/v2/alerts"
285-
}
286+
api_endpoints = {"Alerts": "/api/v2/alerts"}
286287

287288
return CatalogueItem(
288289
name="Alertmanager",
@@ -294,7 +295,9 @@ def _catalogue_item(self) -> CatalogueItem:
294295
"the configured receiver(s)."
295296
),
296297
api_docs="https://github.com/prometheus/alertmanager/blob/main/api/v2/openapi.yaml",
297-
api_endpoints={key: f"{self._external_url}{path}" for key, path in api_endpoints.items()},
298+
api_endpoints={
299+
key: f"{self._external_url}{path}" for key, path in api_endpoints.items()
300+
},
298301
)
299302

300303
@property
@@ -570,8 +573,7 @@ def _on_update_status(self, _):
570573
try:
571574
status = self.alertmanager_workload.api.status()
572575
logger.info(
573-
"alertmanager %s is up and running (uptime: %s); "
574-
"cluster mode: %s, with %d peers",
576+
"alertmanager %s is up and running (uptime: %s); cluster mode: %s, with %d peers",
575577
status["versionInfo"]["version"],
576578
status["uptime"],
577579
status["cluster"]["status"],
@@ -641,6 +643,19 @@ def _internal_url(self) -> str:
641643
"""Return the fqdn dns-based in-cluster (private) address of the alertmanager api server."""
642644
return f"{self._scheme}://{self._fqdn}:{self._ports.api}"
643645

646+
@property
647+
def _service_url(self) -> str:
648+
"""Return the FQDN DNS-based in-cluster (private) address of the service for Alertmanager.
649+
650+
Since our goal is to ensure that we only send one datasource to Grafana when we have multiple units, we can't use the socket FQDN because that would include the AM unit e.g. `http://am-0.am-endpoints.otel.svc.cluster.local:9093`.
651+
The service URL as defined will remove the pod unit so (when ingress missing) the request goes to the K8s service at: http://am-endpoints.otel.svc.cluster.local:9093
652+
The service will then load balance between the units.
653+
This assumes that the FQDN is the interal FQDN for the socket and that the pod unit is always on the left side of the first ".". If those change, this code will need to be updated.
654+
"""
655+
fqdn = self._fqdn.split(".", 1)[-1]
656+
657+
return f"{self._scheme}://{fqdn}:{self._ports.api}"
658+
644659
@property
645660
def _external_url(self) -> str:
646661
"""Return the externally-reachable (public) address of the alertmanager api server."""

tests/integration/conftest.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from datetime import datetime
1111
from pathlib import Path
1212

13+
import juju.utils
1314
import pytest
1415
from pytest_operator.plugin import OpsTest
1516

@@ -94,3 +95,13 @@ async def setup_env(ops_test: OpsTest):
9495
@pytest.fixture(scope="module")
9596
def temp_dir(tmp_path_factory):
9697
return tmp_path_factory.mktemp("data")
98+
99+
@pytest.fixture(scope="module", autouse=True)
100+
def patch_pylibjuju_series_2404():
101+
juju.utils.ALL_SERIES_VERSIONS["noble"] = "24.04"
102+
juju.utils.UBUNTU_SERIES["noble"] = "24.04"
103+
104+
yield
105+
106+
del juju.utils.ALL_SERIES_VERSIONS["noble"]
107+
del juju.utils.UBUNTU_SERIES["noble"]

tests/integration/helpers.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
from typing import Dict, Optional, Tuple
1212
from urllib.parse import urlparse
1313

14+
import requests
15+
from juju.unit import Unit
1416
from pytest_operator.plugin import OpsTest
17+
from requests.auth import HTTPBasicAuth
1518

1619
logger = logging.getLogger(__name__)
1720

@@ -171,3 +174,54 @@ async def curl(ops_test: OpsTest, *, cert_dir: str, cert_path: str, ip_addr: str
171174
"non-zero return code means curl encountered a >= 400 HTTP code"
172175
)
173176
return stdout
177+
178+
async def grafana_password(ops_test: OpsTest, app_name: str) -> str:
179+
"""Get the admin password. Memoize it to reduce turnaround time.
180+
181+
Args:
182+
ops_test: pytest-operator plugin
183+
app_name: string name of application
184+
185+
Returns:
186+
admin password as a string
187+
"""
188+
leader: Optional[Unit] = None
189+
for unit in ops_test.model.applications[app_name].units: # type: ignore
190+
is_leader = await unit.is_leader_from_status()
191+
if is_leader:
192+
leader = unit
193+
break
194+
195+
assert leader
196+
action = await leader.run_action("get-admin-password")
197+
action = await action.wait()
198+
return action.results["admin-password"]
199+
200+
async def grafana_datasources(ops_test: OpsTest, app_name: str) -> "list[dict]":
201+
"""Get the datasources configured in Grafana.
202+
203+
A sample response from Grafana's /api/datasources endpoint is a list of datasources, similar to below.
204+
205+
[{"id":1,"uid":"ABC","orgId":1,"name":"<name>",
206+
"type":"alertmanager","typeName":"Alertmanager",
207+
"typeLogoUrl":"public/app/plugins/datasource/alertmanager/img/logo.svg","access":"proxy",
208+
"url":"<AM-address>","user":"","database":"","basicAuth":false,"isDefault":false,
209+
"jsonData":{"implementation":"prometheus","timeout":300},"readOnly":true}}, ...]
210+
211+
Args:
212+
ops_test: pytest-operator plugin
213+
app_name: string name of application
214+
Returns:
215+
number of datasources as an integer
216+
"""
217+
address = await get_unit_address(ops_test, app_name, 0)
218+
url = f"http://{address}:3000/api/datasources"
219+
220+
admin_password = await grafana_password(ops_test, app_name)
221+
response = requests.get(
222+
url,
223+
auth=HTTPBasicAuth("admin", admin_password),
224+
)
225+
response.raise_for_status()
226+
datasources = response.json()
227+
return datasources
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import asyncio
2+
import logging
3+
from pathlib import Path
4+
5+
import pytest
6+
import yaml
7+
from helpers import grafana_datasources
8+
from pytest_operator.plugin import OpsTest
9+
from tenacity import retry, stop_after_attempt, wait_fixed
10+
11+
# pyright: reportAttributeAccessIssue = false
12+
# pyright: reportOptionalMemberAccess = false
13+
14+
logger = logging.getLogger(__name__)
15+
16+
METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text())
17+
app_name = METADATA["name"]
18+
resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]}
19+
20+
"""We need to ensure that, even if there are multiple units for Alertmanager, only one is shown as a datasouce in Grafana.
21+
We use this test to simulate multiple units of Alertmanager, and then check that only the leader has the key `grafana_source_host` written to relation data with Grafana.
22+
"""
23+
24+
@pytest.mark.abort_on_fail
25+
async def test_build_and_deploy(ops_test: OpsTest, charm_under_test):
26+
"""Build the charm-under-test, deploy the charm from charmhub, and upgrade from path."""
27+
await asyncio.gather(
28+
ops_test.model.deploy(charm_under_test, "am", resources=resources, trust=True, num_units=2),
29+
ops_test.model.deploy("grafana-k8s", "grafana", channel="2/edge", trust=True),
30+
)
31+
32+
await ops_test.model.add_relation("grafana:grafana-source", "am")
33+
await ops_test.model.wait_for_idle(apps=["am", "grafana"], status="active")
34+
35+
@retry(wait=wait_fixed(10), stop=stop_after_attempt(6))
36+
async def test_grafana_datasources(ops_test: OpsTest):
37+
# We have 2 units of Alertmanager, but only one datasource should be shown as a Grafana source.
38+
datasources = await grafana_datasources(ops_test, "grafana")
39+
assert len(datasources) == 1
40+
41+
# The datasource URL should point to the service, not to a specific pod unit.
42+
# This check is safe, because we name the application `am` and we're not using TLS, so the service will always start with `http://am-endpoints`.
43+
assert datasources[0]["url"].startswith("http://am-endpoints")
44+
45+
@pytest.mark.abort_on_fail
46+
async def test_deploy_and_integrate_traefik(ops_test: OpsTest):
47+
"""Build the charm-under-test, deploy the charm from charmhub, and upgrade from path."""
48+
await ops_test.model.deploy("traefik-k8s", "traefik", channel="edge", trust=True)
49+
50+
await ops_test.model.add_relation("traefik:ingress", "am")
51+
await ops_test.model.wait_for_idle(apps=["am", "grafana", "traefik"], status="active")
52+
53+
async def test_grafana_datasources_when_ingress_available(ops_test: OpsTest):
54+
# We have 2 units of Alertmanager, but only one datasource should be shown as a Grafana source.
55+
datasources = await grafana_datasources(ops_test, "grafana")
56+
assert len(datasources) == 1
57+
58+
assert "am-endpoints" not in datasources[0]["url"]

0 commit comments

Comments
 (0)