
Commit 83d03c9

lots of claude work to be reviewed
1 parent e0cf2f6 commit 83d03c9

18 files changed, +1421 −1360 lines


docs/content/reference/migration/migration-0-28.md

Lines changed: 60 additions & 0 deletions

@@ -230,3 +230,63 @@ The `Schema` class and related column descriptor/selector types have moved from

| `from rerun.dataframe import IndexColumnSelector` | `from rerun.catalog import IndexColumnSelector` |

The previous import paths are still supported but will be removed in a future release.

## Python SDK: new `DatasetView` API for filtering datasets

A new `DatasetView` class has been introduced for filtering and reading from datasets. It provides a cleaner, lazily evaluated API for working with subsets of a dataset's data.

### Creating a DatasetView

Use `filter_segments()` or `filter_contents()` on a `DatasetEntry` to create a `DatasetView`:

```python
from rerun.catalog import CatalogClient

client = CatalogClient("rerun+http://localhost:51234")
dataset = client.get_dataset(name="my_dataset")

# Filter to specific segments
view = dataset.filter_segments(["recording_0", "recording_1"])

# Filter to specific entity paths
view = dataset.filter_contents(["/points/**"])

# Chain filters
view = dataset.filter_segments(["recording_0"]).filter_contents(["/points/**", "-/text/**"])
```

### Reading data

Use `reader()` to get a DataFusion DataFrame:

```python
df = view.reader(index="timeline")
```
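
The result is an ordinary DataFusion DataFrame, so the usual DataFusion calls apply once you have it. A minimal sketch, assuming the same illustrative `timeline` index as above:

```python
# The reader returns a datafusion.DataFrame, so standard DataFusion methods apply.
df = view.reader(index="timeline")

# Materialize a preview as Arrow record batches or as a pandas DataFrame.
batches = df.limit(100).collect()
pandas_df = df.limit(100).to_pandas()
```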

### Available methods

| Method | Description |
|--------|-------------|
| `filter_segments(segment_ids)` | Filter to specific segment IDs (list or DataFrame with `rerun_segment_id` column) |
| `filter_contents(exprs)` | Filter to specific entity paths (supports wildcards like `/points/**`) |
| `segment_ids()` | Get the list of segment IDs in this view |
| `segment_table()` | Get segment metadata as a DataFusion DataFrame |
| `schema()` | Get the filtered schema |
| `arrow_schema()` | Get the filtered Arrow schema |
| `reader(index=...)` | Create a DataFusion DataFrame reader |
| `get_index_ranges(index)` | Get min/max values per segment for an index |
| `download_segment(segment_id)` | Download a specific segment as a Recording |
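
As an illustration, the sketch below inspects a view before reading any data. It only uses the methods listed above; the `"timeline"` index name is an assumption for the example:

```python
view = dataset.filter_contents(["/points/**"])

# Inspect the view without reading any column data.
print(view.segment_ids())    # segment IDs remaining after filtering
print(view.schema())         # filtered schema
print(view.arrow_schema())   # filtered Arrow schema

# Per-segment min/max index values for an assumed "timeline" index.
ranges = view.get_index_ranges("timeline")

# Download one segment as a Recording for local use.
recording = view.download_segment(view.segment_ids()[0])
```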

### Deprecation of `dataframe_query_view()`

The `DatasetEntry.dataframe_query_view()` method is deprecated. Use the new `DatasetView` API instead:

```python
# Before (deprecated)
view = dataset.dataframe_query_view(index="timeline", contents={"/points": ["Position2D"]})
df = view.df()

# After
view = dataset.filter_contents(["/points/**"])
df = view.reader(index="timeline")
```

rerun_py/docs/gen_common_index.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -422,8 +422,8 @@ class Section:
         show_submodules=True,
         class_list=[
             "AlreadyExistsError",
-            "DataframeQueryView",
             "DatasetEntry",
+            "DatasetView",
             "CatalogClient",
             "Entry",
             "EntryId",
```

rerun_py/rerun_bindings/rerun_bindings.pyi

Lines changed: 34 additions & 177 deletions

```diff
@@ -1,10 +1,10 @@
 from __future__ import annotations

 import os
-from collections.abc import Callable, Iterable, Sequence
+from collections.abc import Callable, Sequence
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Self
+from typing import TYPE_CHECKING, Any

 import datafusion as dfn
 import numpy as np
```

```diff
@@ -1285,14 +1285,6 @@ class DatasetEntryInternal:
     # ---

     def download_segment(self, segment_id: str) -> Recording: ...
-    def dataframe_query_view(
-        self,
-        *,
-        index: str | None,
-        contents: Any,
-        include_semantically_empty_columns: bool = False,
-        include_tombstone_columns: bool = False,
-    ) -> DataframeQueryView: ...

     # ---
```

```diff
@@ -1341,6 +1333,38 @@ class DatasetEntryInternal:
         unsafe_allow_recent_cleanup: bool = False,
     ) -> None: ...

+    # --- DatasetView filter methods ---
+
+    def filter_segments(self, segment_ids: list[str]) -> DatasetViewInternal: ...
+    def filter_contents(self, exprs: list[str]) -> DatasetViewInternal: ...
+
+class DatasetViewInternal:
+    """Internal Rust implementation of DatasetView."""
+
+    # Properties
+    @property
+    def dataset(self) -> DatasetEntryInternal: ...
+    @property
+    def filtered_segment_ids(self) -> set[str] | None: ...
+    @property
+    def content_filters(self) -> list[str]: ...
+
+    # Methods
+    def schema(self) -> SchemaInternal: ...
+    def arrow_schema(self) -> pa.Schema: ...
+    def segment_ids(self) -> list[str]: ...
+    def reader(
+        self,
+        *,
+        index: str | None,
+        include_semantically_empty_columns: bool = False,
+        include_tombstone_columns: bool = False,
+        fill_latest_at: bool = False,
+        using_index_values: dict[str, Any] | None = None,
+    ) -> dfn.DataFrame: ...
+    def filter_segments(self, segment_ids: list[str]) -> DatasetViewInternal: ...
+    def filter_contents(self, exprs: list[str]) -> DatasetViewInternal: ...
+
 class TableEntryInternal:
     def catalog(self) -> CatalogClientInternal: ...
     def delete(self) -> None: ...
```
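
The internal `reader()` stub above accepts `fill_latest_at` and `using_index_values`, mirroring the old view's `fill_latest_at()` and `using_index_values()` methods. A hedged sketch of how that might look through the public `DatasetView`, assuming the wrapper forwards these keyword arguments unchanged and that the index is named `timeline` (both assumptions, not confirmed by this commit):

```python
view = dataset.filter_contents(["/points/**"])

# Assumption: the public reader() forwards these kwargs to the internal reader().
df = view.reader(
    index="timeline",       # assumed index name
    fill_latest_at=True,    # fill nulls with the latest value along the index
)
```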

```diff
@@ -1388,173 +1412,6 @@ class _IndexValuesLikeInternal:
     def to_index_values(self) -> npt.NDArray[np.int64]: ...
     def len(self) -> int: ...

-class DataframeQueryView:
-    """View into a remote dataset acting as DataFusion table provider."""
-
-    def filter_segment_id(self, segment_id: str, *args: Iterable[str]) -> Self:
-        """Filter by one or more segment ids. All segment ids are included if not specified."""
-
-    def filter_range_sequence(self, start: int, end: int) -> Self:
-        """
-        Filter the view to only include data between the given index sequence numbers.
-
-        This range is inclusive and will contain both the value at the start and the value at the end.
-
-        The view must be of a sequential index type to use this method.
-
-        Parameters
-        ----------
-        start : int
-            The inclusive start of the range.
-        end : int
-            The inclusive end of the range.
-
-        Returns
-        -------
-        RecordingView
-            A new view containing only the data within the specified range.
-
-            The original view will not be modified.
-
-        """
-
-    def filter_range_secs(self, start: float, end: float) -> Self:
-        """
-        Filter the view to only include data between the given index values expressed as seconds.
-
-        This range is inclusive and will contain both the value at the start and the value at the end.
-
-        The view must be of a temporal index type to use this method.
-
-        Parameters
-        ----------
-        start : int
-            The inclusive start of the range.
-        end : int
-            The inclusive end of the range.
-
-        Returns
-        -------
-        RecordingView
-            A new view containing only the data within the specified range.
-
-            The original view will not be modified.
-
-        """
-
-    def filter_range_nanos(self, start: int, end: int) -> Self:
-        """
-        Filter the view to only include data between the given index values expressed as nanoseconds.
-
-        This range is inclusive and will contain both the value at the start and the value at the end.
-
-        The view must be of a temporal index type to use this method.
-
-        Parameters
-        ----------
-        start : int
-            The inclusive start of the range.
-        end : int
-            The inclusive end of the range.
-
-        Returns
-        -------
-        RecordingView
-            A new view containing only the data within the specified range.
-
-            The original view will not be modified.
-
-        """
-
-    def filter_index_values(self, values: IndexValuesLike) -> Self:
-        """
-        Filter the view to only include data at the provided index values.
-
-        The index values returned will be the intersection between the provided values and the
-        original index values.
-
-        This requires index values to be a precise match. Index values in Rerun are
-        represented as i64 sequence counts or nanoseconds. This API does not expose an interface
-        in floating point seconds, as the numerical conversion would risk false mismatches.
-
-        Parameters
-        ----------
-        values : IndexValuesLike
-            The index values to filter by.
-
-        Returns
-        -------
-        RecordingView
-            A new view containing only the data at the specified index values.
-
-            The original view will not be modified.
-
-        """
-
-    def filter_is_not_null(self, column: AnyComponentColumn) -> Self:
-        """
-        Filter the view to only include rows where the given component column is not null.
-
-        This corresponds to rows for index values where this component was provided to Rerun explicitly
-        via `.log()` or `.send_columns()`.
-
-        Parameters
-        ----------
-        column : AnyComponentColumn
-            The component column to filter by.
-
-        Returns
-        -------
-        RecordingView
-            A new view containing only the data where the specified component column is not null.
-
-            The original view will not be modified.
-
-        """
-
-    def using_index_values(self, values: IndexValuesLike) -> Self:
-        """
-        Create a new view that contains the provided index values.
-
-        If they exist in the original data they are selected, otherwise empty rows are added to the view.
-
-        The output view will always have the same number of rows as the provided values, even if
-        those rows are empty. Use with [`.fill_latest_at()`][rerun.dataframe.RecordingView.fill_latest_at]
-        to populate these rows with the most recent data.
-
-        Parameters
-        ----------
-        values : IndexValuesLike
-            The index values to use.
-
-        Returns
-        -------
-        RecordingView
-            A new view containing the provided index values.
-
-            The original view will not be modified.
-
-        """
-
-    def fill_latest_at(self) -> Self:
-        """
-        Populate any null values in a row with the latest valid data according to the index.
-
-        Returns
-        -------
-        RecordingView
-            A new view with the null values filled in.
-
-            The original view will not be modified.
-
-        """
-
-    def df(self) -> dfn.DataFrame:
-        """Register this view to the global DataFusion context and return a DataFrame."""
-
-    def to_arrow_reader(self) -> pa.RecordBatchReader:
-        """Convert this view to a [`pyarrow.RecordBatchReader`][]."""
-
 class IndexProperties:
     """The properties and configuration of a user-defined index."""
```
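
The removed range and null filters have no direct counterpart on `DatasetView` in this commit. One plausible migration path, sketched here under the assumption of a sequence index column named `frame` and a Rerun-style component column name (both illustrative), is to apply the equivalent predicate to the DataFusion DataFrame returned by `reader()`:

```python
from datafusion import col, lit

view = dataset.filter_contents(["/points/**"])
df = view.reader(index="frame")

# Rough equivalent of the removed filter_range_sequence(100, 200):
# chained filters keep only rows whose index falls in the inclusive range [100, 200].
in_range = df.filter(col("frame") >= lit(100)).filter(col("frame") <= lit(200))

# Rough equivalent of the removed filter_is_not_null(column): drop rows where a
# component column is null (the column name here is hypothetical).
not_null = df.filter(col("/points:Position2D").is_not_null())
```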

rerun_py/rerun_sdk/rerun/catalog/__init__.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -4,7 +4,6 @@
     AlreadyExistsError as AlreadyExistsError,
     ComponentColumnDescriptor as ComponentColumnDescriptor,
     ComponentColumnSelector as ComponentColumnSelector,
-    DataframeQueryView as DataframeQueryView,
     DataFusionTable as DataFusionTable,
     EntryId as EntryId,
     EntryKind as EntryKind,
```

```diff
@@ -26,5 +25,5 @@
 )

 from ._catalog_client import CatalogClient as CatalogClient
-from ._entry import DatasetEntry as DatasetEntry, Entry as Entry, TableEntry as TableEntry
+from ._entry import DatasetEntry as DatasetEntry, DatasetView as DatasetView, Entry as Entry, TableEntry as TableEntry
 from ._schema import Schema as Schema
```
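
With `DatasetView` re-exported from `rerun.catalog`, user code can import it directly, for example as a type annotation. A small sketch (the server address and dataset name are illustrative):

```python
from rerun.catalog import CatalogClient, DatasetView

client = CatalogClient("rerun+http://localhost:51234")
dataset = client.get_dataset(name="my_dataset")

# filter_contents() now returns the publicly exported DatasetView type.
view: DatasetView = dataset.filter_contents(["/points/**"])
```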
