diff --git a/docs/RFC-9-OZX-Implementation.md b/docs/RFC-9-OZX-Implementation.md new file mode 100644 index 00000000..d3233796 --- /dev/null +++ b/docs/RFC-9-OZX-Implementation.md @@ -0,0 +1,413 @@ +# RFC-9 Zipped OME-Zarr (.ozx) Implementation + +**Date**: 2026-01-28 +**RFC Spec**: https://ngff.openmicroscopy.org/rfc/9/index.html +**OME-Zarr v0.5 Spec**: https://ngff.openmicroscopy.org/0.5/index.html + +## Overview + +This document describes the implementation of RFC-9 support for reading OME-Zarr data from ZIP archives (`.ozx` files) in Fileglancer. The implementation allows users to browse, preview, and access OME-Zarr imaging data stored in compressed ZIP archives without extracting them. + +**Important**: RFC-9 is designed specifically for OME-Zarr v0.5, which is built on **Zarr v3 only**. This implementation does not support Zarr v2 within OZX files. + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Frontend │ +│ ┌─────────────────┐ ┌──────────────┐ ┌─────────────────┐ │ +│ │ ozxDetection.ts │───▶│ OzxFetchStore│───▶│ zarrita/ome-zarr│ │ +│ │ (detection) │ │(custom store)│ │ (existing) │ │ +│ └─────────────────┘ └──────────────┘ └─────────────────┘ │ +│ │ │ +└───────────────────────────────│─────────────────────────────────┘ + │ HTTP + Range requests + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Backend │ +│ ┌──────────────────────┐ ┌─────────────────────────────┐ │ +│ │ /api/zip-content/ │ │ OZXReader (ozxzip.py) │ │ +│ │ /api/ozx-metadata/ │───▶│ - OME metadata parsing │ │ +│ │ /api/zip-list/ │ │ - jsonFirst optimization │ │ +│ └──────────────────────┘ ├─────────────────────────────┤ │ +│ │ ZipReader (zipread.py) │ │ +│ │ - ZIP64 support │ │ +│ │ - Partial CD parsing │ │ +│ │ - Range request streaming │ │ +│ └─────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Files Created/Modified + +### Backend (Python) + +| File | Action | Description | +| ------------------------ | ---------- | --------------------------------------------- | +| `fileglancer/zipread.py` | **CREATE** | Generic ZIP reader with streaming support | +| `fileglancer/ozxzip.py` | **CREATE** | RFC-9 OZX layer extending ZipReader | +| `fileglancer/app.py` | MODIFY | Add `/api/ozx-*` endpoints | +| `fileglancer/model.py` | MODIFY | Add OZX Pydantic models | +| `tests/test_zipread.py` | **CREATE** | Unit tests for generic ZipReader (27 tests) | +| `tests/test_ozxzip.py` | **CREATE** | Unit tests for OZXReader (31 tests) | + +### Frontend (TypeScript) + +| File | Action | Description | +| ------------------------------------------------------- | ---------- | -------------------------------------- | +| `frontend/src/utils/ozxDetection.ts` | **CREATE** | `.ozx` file detection utilities | +| `frontend/src/queries/ozxQueries.ts` | **CREATE** | TanStack Query hooks and OzxFetchStore | +| `frontend/src/queries/zarrQueries.ts` | MODIFY | OZX detection integration | +| `frontend/src/__tests__/unitTests/ozxDetection.test.ts` | **CREATE** | Frontend detection tests (20 tests) | + +## Backend Implementation Details + +The backend uses a two-layer architecture separating generic ZIP functionality from OZX-specific features. + +### ZipReader (`fileglancer/zipread.py`) + +Generic ZIP file reader providing: + +1. **EOCD Parsing**: Locates End of Central Directory record by scanning backwards from file end +2. **ZIP64 Support**: Handles large archives with ZIP64 extended fields +3. 
**Compression**: Supports STORE (uncompressed) and DEFLATE compression methods +4. **Range Streaming**: Efficient byte-range streaming for HTTP Range requests +5. **Flexible Parsing**: Supports `stop_condition` callback and `max_new_entries` limit + +Key classes and functions: + +- `ZipReader`: Generic ZIP reader with context manager support +- `ZipEntry`: Individual file entry from central directory +- `ZipReaderError`, `InvalidZipError`: Exception classes + +#### Central Directory Parsing API + +```python +def parse_central_directory( + self, + stop_condition: Optional[Callable[[ZipEntry, int], bool]] = None, + max_new_entries: Optional[int] = None +) -> Dict[str, ZipEntry]: + """ + Parse the central directory. + + Args: + stop_condition: Optional callback receiving (entry, index). + Returns True to stop parsing after the current entry. + max_new_entries: Optional maximum number of entries to parse. + + Returns: + Dictionary mapping filenames to ZipEntry objects + """ +``` + +**Examples**: + +```python +# Parse all entries +entries = reader.parse_central_directory() + +# Stop after 100 entries +entries = reader.parse_central_directory(max_new_entries=100) + +# Stop when finding a specific file +def stop_at_target(entry, index): + return entry.filename == "target.json" +entries = reader.parse_central_directory(stop_condition=stop_at_target) + +# Stop after processing 5 JSON files +json_count = [0] +def stop_after_5_json(entry, index): + if entry.filename.endswith('.json'): + json_count[0] += 1 + return json_count[0] >= 5 +entries = reader.parse_central_directory(stop_condition=stop_after_5_json) +``` + +### OZXReader (`fileglancer/ozxzip.py`) + +Extends `ZipReader` with RFC-9 OZX-specific functionality: + +1. **OME Metadata**: Parses ZIP comment for RFC-9 OME metadata JSON +2. **jsonFirst Optimization**: When `jsonFirst=true` in metadata, stops parsing central directory after last JSON metadata file +3. **Metadata File Detection**: Identifies `.json`, `.zattrs`, `.zarray`, `.zgroup` files + +Key classes and functions: + +- `OZXReader`: Extends ZipReader with OZX-specific methods +- `OZXMetadata`: Parsed OME metadata from ZIP comment +- `is_json_metadata_file()`: Check if filename is a JSON metadata file +- `is_ozx_file()`: Check if filename has `.ozx` extension + +#### jsonFirst Optimization + +```python +with OZXReader(path) as reader: + metadata = reader.get_ome_metadata() + + # Parse only JSON metadata files (efficient for large archives) + if metadata and metadata.json_first: + entries = reader.parse_central_directory(json_only=True) + else: + entries = reader.parse_central_directory() +``` + +### API Endpoints + +#### `GET /api/zip-content/{path_name:path}?subpath={internal_path}` + +Streams file content from within an OZX archive. Supports HTTP Range requests for efficient chunk access. + +**Response Headers**: + +- `Accept-Ranges: bytes` +- `Content-Length: {size}` +- `Content-Range: bytes {start}-{end}/{total}` (for 206 responses) + +#### `HEAD /api/zip-content/{path_name:path}?subpath={internal_path}` + +Returns file metadata without content body. + +#### `GET /api/ozx-metadata/{path_name:path}` + +Returns OZX archive metadata: + +```json +{ + "version": "0.5", + "json_first": true, + "file_count": 42, + "is_zip64": false +} +``` + +#### `GET /api/zip-list/{path_name:path}?prefix={optional_prefix}` + +Lists files in the OZX archive: + +```json +{ + "files": ["zarr.json", "0/zarr.json", "0/c/0/0/0", ...] 
+}
+```
+
+## Frontend Implementation Details
+
+### Detection Utilities (`ozxDetection.ts`)
+
+```typescript
+// Check if a file is an OZX file
+isOzxFile(file: FileOrFolder): boolean
+
+// Check filename extension
+isOzxFilename(filename: string): boolean
+
+// Check if array contains OZX files
+hasOzxFiles(files: FileOrFolder[]): boolean
+
+// Filter to get only OZX files
+getOzxFiles(files: FileOrFolder[]): FileOrFolder[]
+```
+
+### OzxFetchStore (`ozxQueries.ts`)
+
+A zarrita-compatible store that reads from OZX archives via the API:
+
+```typescript
+class OzxFetchStore {
+  constructor(fspName: string, ozxPath: string);
+
+  // Get full file content
+  async get(key: string): Promise<Uint8Array | undefined>;
+
+  // Get byte range (for efficient chunk access)
+  async getRange(
+    key: string,
+    offset: number,
+    length: number
+  ): Promise<Uint8Array | undefined>;
+
+  // Check if file exists
+  async has(key: string): Promise<boolean>;
+
+  // List files with optional prefix
+  async list(prefix?: string): Promise<string[]>;
+}
+```
+
+### Query Hooks
+
+```typescript
+// Fetch OZX archive metadata
+useOzxMetadataQuery(fspName, ozxFilePath, enabled?)
+
+// Fetch list of files in OZX
+useOzxFileListQuery(fspName, ozxFilePath, prefix?, enabled?)
+
+// Fetch Zarr v3 metadata from OZX file (RFC-9 requires Zarr v3)
+useOzxZarrMetadataQuery({ fspName, ozxFile })
+```
+
+### Zarr Version Detection
+
+```typescript
+// Detects Zarr v3 in OZX archives (RFC-9 requires Zarr v3 only)
+detectOzxZarrVersions(files: string[]): ('v3')[]
+```
+
+Note: Unlike regular Zarr directories, which can be v2 or v3, OZX files per RFC-9 support only Zarr v3 (OME-Zarr v0.5). The detection function looks only for `zarr.json` files and ignores Zarr v2 markers (`.zarray`, `.zattrs`, `.zgroup`).
+
+## Modular Architecture
+
+The implementation separates generic ZIP functionality from OZX-specific features:
+
+```
+┌──────────────────────────────────────┐
+│ OZXReader                            │
+│ - OME metadata parsing               │
+│ - jsonFirst optimization             │
+│ - is_json_metadata_file()            │
+├──────────────────────────────────────┤
+│ ZipReader                            │
+│ - EOCD/ZIP64 parsing                 │
+│ - Central directory parsing          │
+│ - stop_condition & max_new_entries   │
+│ - File streaming & range requests    │
+│ - STORE/DEFLATE compression          │
+└──────────────────────────────────────┘
+```
+
+**Benefits**:
+
+1. **Reusability**: `ZipReader` can be used for any ZIP file, not just OZX
+2. **Testability**: Each layer has focused unit tests
+3. **Extensibility**: New ZIP-based formats can extend `ZipReader`
+4. **Separation of Concerns**: Generic ZIP logic is decoupled from OME-specific features
+
+## RFC-9 ZIP Comment Format
+
+The OZX file's ZIP comment contains OME metadata:
+
+```json
+{
+  "ome": {
+    "version": "0.5",
+    "zipFile": {
+      "centralDirectory": {
+        "jsonFirst": true
+      }
+    }
+  }
+}
+```
+
+When `jsonFirst` is true, JSON metadata files (`.json`, `.zattrs`, `.zarray`, `.zgroup`) are sorted first in the central directory, allowing partial parsing for metadata discovery.
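+
+To illustrate the writer side (which is outside this read-only implementation), here is a
+minimal sketch that produces a jsonFirst-ordered archive and an RFC-9 comment using Python's
+standard `zipfile` module; the member names and contents are placeholders:
+
+```python
+import json
+import zipfile
+
+# Hypothetical member names; sort JSON metadata first so that readers
+# honoring jsonFirst can stop parsing the central directory early
+members = ["0/c/0/0/0", "zarr.json", "0/zarr.json"]
+members.sort(key=lambda name: not name.endswith(".json"))
+
+# STORE (no compression) keeps byte-range access to chunks cheap
+with zipfile.ZipFile("example.ozx", "w", compression=zipfile.ZIP_STORED) as zf:
+    for name in members:
+        zf.writestr(name, b"placeholder")
+    # RFC-9 puts the OME metadata in the ZIP comment
+    zf.comment = json.dumps({
+        "ome": {
+            "version": "0.5",
+            "zipFile": {"centralDirectory": {"jsonFirst": True}}
+        }
+    }).encode("utf-8")
+```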
+
+## Testing
+
+### Backend Tests
+
+```bash
+# Run all ZIP/OZX tests
+pixi run -e test pytest tests/test_zipread.py tests/test_ozxzip.py -v
+
+# Run only generic ZIP tests
+pixi run -e test pytest tests/test_zipread.py -v
+
+# Run only OZX-specific tests
+pixi run -e test pytest tests/test_ozxzip.py -v
+```
+
+#### Generic ZipReader Tests (`test_zipread.py`)
+
+Tests cover:
+
+- Basic reader operations (open, close, context manager)
+- Central directory parsing
+- `stop_condition` callback with index parameter
+- `max_new_entries` limit parameter
+- Combined `stop_condition` and `max_new_entries`
+- File reading and streaming
+- Range request streaming
+- DEFLATE compression
+- Edge cases (empty archive, unopened reader)
+
+#### OZX-Specific Tests (`test_ozxzip.py`)
+
+Tests cover:
+
+- OZX file detection utilities
+- OME metadata parsing (valid, missing, invalid JSON)
+- jsonFirst optimization
+- File reading (text, binary, compressed)
+- Range request streaming
+- Unicode filenames
+- Edge cases
+
+### Frontend Tests
+
+```bash
+pixi run test-frontend -- src/__tests__/unitTests/ozxDetection.test.ts
+```
+
+Tests cover:
+
+- File detection (extension matching, directories)
+- Array filtering functions
+- Path handling
+- Zarr version detection within OZX
+
+## Usage Example
+
+### Reading OZX in Frontend
+
+```typescript
+import { isOzxFile } from '@/utils/ozxDetection';
+import { useOzxZarrMetadataQuery } from '@/queries/zarrQueries';
+
+function ZarrViewer({ file, fspName }) {
+  // Hooks must be called unconditionally, so run the query first
+  // and gate on the result instead of wrapping the hook in an if
+  const { data, isLoading } = useOzxZarrMetadataQuery({
+    fspName,
+    ozxFile: file
+  });
+
+  if (isOzxFile(file) && data?.metadata) {
+    // Use data.metadata for display
+    // data.omeZarrUrl can be passed to viewers
+    // data.store provides the OzxFetchStore for chunk access
+  }
+}
+```
+
+### Direct API Access
+
+```bash
+# Get archive metadata
+curl http://localhost:7878/api/ozx-metadata/myFSP/path/to/data.ozx
+
+# List files
+curl http://localhost:7878/api/zip-list/myFSP/path/to/data.ozx
+
+# Get file content
+curl "http://localhost:7878/api/zip-content/myFSP/path/to/data.ozx?subpath=zarr.json"
+
+# Get range (for chunk access)
+curl -H "Range: bytes=0-1023" \
+  "http://localhost:7878/api/zip-content/myFSP/path/to/data.ozx?subpath=0/c/0/0/0"
+```
+
+## Future Enhancements
+
+1. **Write Support**: Currently read-only; could add the ability to update OZX files
+2. **Caching**: Add server-side caching of the central directory for frequently accessed archives
+3. **Thumbnail Generation**: Integrate with existing thumbnail generation for OZX OME-Zarr
+4. 
**Neuroglancer Integration**: Generate Neuroglancer URLs pointing to OZX content

## Related Documentation

- [RFC-9 Specification](https://ngff.openmicroscopy.org/rfc/9/index.html)
- [OME-NGFF Specification](https://ngff.openmicroscopy.org/)
- [Zarr v3 Specification](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html)
diff --git a/fileglancer/app.py b/fileglancer/app.py
index b346b005..a8c67010 100644
--- a/fileglancer/app.py
+++ b/fileglancer/app.py
@@ -34,6 +34,7 @@
 from fileglancer.utils import format_timestamp, guess_content_type, parse_range_header
 from fileglancer.user_context import UserContext, EffectiveUserContext, CurrentUserContext, UserContextConfigurationError
 from fileglancer.filestore import Filestore, RootCheckError
+from fileglancer.ozxzip import OZXReader, OZXReaderError, InvalidZipError, is_ozx_file, is_zip_file
 from fileglancer.log import AccessLogMiddleware
 
 from x2s3.utils import get_read_access_acl, get_nosuchbucket_response, get_error_response
@@ -1214,6 +1215,342 @@ async def get_file_content(request: Request, path_name: str, subpath: Optional[s
     )
 
 
+@app.head("/api/zip-content/{path_name:path}")
+async def head_zip_file_content(
+    path_name: str,
+    subpath: str = Query(..., description="Path within the ZIP file"),
+    username: str = Depends(get_current_user)
+):
+    """HEAD request for ZIP file content (returns size, supports Range)."""
+
+    filestore_name, _, zip_subpath = path_name.partition('/')
+
+    with _get_user_context(username):
+        filestore, error = _get_filestore(filestore_name)
+        if filestore is None:
+            raise HTTPException(status_code=404 if "not found" in error else 500, detail=error)
+
+        try:
+            zip_file_path = filestore._check_path_in_root(zip_subpath)
+        except RootCheckError as e:
+            raise HTTPException(status_code=400, detail=str(e))
+
+        if not is_zip_file(zip_file_path):
+            raise HTTPException(status_code=400, detail="Not a ZIP file")
+
+        try:
+            reader = OZXReader(zip_file_path)
+            reader.open()
+        except FileNotFoundError:
+            raise HTTPException(status_code=404, detail="ZIP file not found")
+        except (InvalidZipError, OZXReaderError) as e:
+            raise HTTPException(status_code=400, detail=f"Invalid ZIP file: {e}")
+
+    # Parse central directory and get entry (outside user context)
+    try:
+        reader.parse_central_directory()
+        entry = reader.get_entry(subpath)
+        if entry is None:
+            reader.close()
+            raise HTTPException(status_code=404, detail="File not found in ZIP archive")
+
+        file_size = entry.uncompressed_size
+        content_type = guess_content_type(subpath)
+        file_name = subpath.split('/')[-1] if subpath else ''
+
+        headers = {
+            'Accept-Ranges': 'bytes',
+            'Content-Length': str(file_size),
+        }
+
+        if content_type == 'application/octet-stream' and file_name:
+            headers['Content-Disposition'] = f'attachment; filename="{file_name}"'
+
+        reader.close()
+        return Response(status_code=200, headers=headers, media_type=content_type)
+
+    except HTTPException:
+        # Propagate HTTP errors (e.g. the 404 above) instead of rewrapping them as 500s
+        reader.close()
+        raise
+    except Exception as e:
+        reader.close()
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.get("/api/zip-content/{path_name:path}")
+async def get_zip_file_content(
+    request: Request,
+    path_name: str,
+    subpath: str = Query(..., description="Path within the ZIP file"),
+    username: str = Depends(get_current_user)
+):
+    """
+    Stream file content from within a ZIP archive.
+    Supports HTTP Range requests for efficient chunk access.
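+
+    Example:
+        GET /api/zip-content/myFSP/data.ozx?subpath=0/c/0/0/0
+        with header "Range: bytes=0-1023" streams the first 1 KiB of that
+        chunk (the FSP name and paths here are illustrative).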
+ """ + + filestore_name, _, zip_subpath = path_name.partition('/') + + with _get_user_context(username): + filestore, error = _get_filestore(filestore_name) + if filestore is None: + raise HTTPException(status_code=404 if "not found" in error else 500, detail=error) + + try: + zip_file_path = filestore._check_path_in_root(zip_subpath) + except RootCheckError as e: + raise HTTPException(status_code=400, detail=str(e)) + + if not is_zip_file(zip_file_path): + raise HTTPException(status_code=400, detail="Not a ZIP file") + + try: + reader = OZXReader(zip_file_path) + reader.open() + except FileNotFoundError: + raise HTTPException(status_code=404, detail="ZIP file not found") + except (InvalidZipError, OZXReaderError) as e: + raise HTTPException(status_code=400, detail=f"Invalid ZIP file: {e}") + + # Parse central directory and get entry (outside user context) + # The file handle retains access rights + try: + reader.parse_central_directory() + entry = reader.get_entry(subpath) + if entry is None: + reader.close() + raise HTTPException(status_code=404, detail="File not found in ZIP archive") + + content_type = guess_content_type(subpath) + file_size = entry.uncompressed_size + file_name = subpath.split('/')[-1] if subpath else '' + range_header = request.headers.get('Range') + + if range_header: + # Handle Range request (HTTP 206) + range_result = parse_range_header(range_header, file_size) + if range_result is None: + reader.close() + return Response(status_code=416, headers={'Content-Range': f'bytes */{file_size}'}) + + start, end = range_result + + async def stream_range(): + try: + for chunk in reader.stream_file_range(subpath, start, end): + yield chunk + finally: + reader.close() + + headers = { + 'Accept-Ranges': 'bytes', + 'Content-Length': str(end - start + 1), + 'Content-Range': f'bytes {start}-{end}/{file_size}', + } + + if content_type == 'application/octet-stream' and file_name: + headers['Content-Disposition'] = f'attachment; filename="{file_name}"' + + return StreamingResponse( + stream_range(), + status_code=206, + headers=headers, + media_type=content_type + ) + else: + # Full file (HTTP 200) + async def stream_full(): + try: + for chunk in reader.stream_file(subpath): + yield chunk + finally: + reader.close() + + headers = { + 'Accept-Ranges': 'bytes', + 'Content-Length': str(file_size), + } + + if content_type == 'application/octet-stream' and file_name: + headers['Content-Disposition'] = f'attachment; filename="{file_name}"' + + return StreamingResponse( + stream_full(), + status_code=200, + headers=headers, + media_type=content_type + ) + + except FileNotFoundError: + reader.close() + raise HTTPException(status_code=404, detail="File not found in ZIP archive") + except Exception as e: + reader.close() + logger.exception(f"Error reading ZIP content: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + + @app.get("/api/ozx-metadata/{path_name:path}") + async def get_ozx_metadata( + path_name: str, + username: str = Depends(get_current_user) + ): + """ + Get metadata about an OZX archive. + Returns OME version, jsonFirst flag, file count, and ZIP64 status. 
+ """ + + filestore_name, _, ozx_subpath = path_name.partition('/') + + with _get_user_context(username): + filestore, error = _get_filestore(filestore_name) + if filestore is None: + raise HTTPException(status_code=404 if "not found" in error else 500, detail=error) + + try: + ozx_file_path = filestore._check_path_in_root(ozx_subpath) + except RootCheckError as e: + raise HTTPException(status_code=400, detail=str(e)) + + if not is_zip_file(ozx_file_path): + raise HTTPException(status_code=400, detail="Not an OZX file") + + try: + reader = OZXReader(ozx_file_path) + reader.open() + except FileNotFoundError: + raise HTTPException(status_code=404, detail="OZX file not found") + except (InvalidZipError, OZXReaderError) as e: + raise HTTPException(status_code=400, detail=f"Invalid OZX file: {e}") + + # Get metadata outside user context + try: + metadata = reader.get_metadata() + entries = reader.parse_central_directory(json_only=metadata.json_first if metadata else False) + + result = { + "version": metadata.version if metadata else None, + "json_first": metadata.json_first if metadata else False, + "file_count": len(entries), + "is_zip64": reader.is_zip64 + } + + reader.close() + return result + + except Exception as e: + reader.close() + logger.exception(f"Error reading OZX metadata: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + + @app.get("/api/zip-list/{path_name:path}") + async def list_zip_files( + path_name: str, + prefix: str = Query('', description="Filter files by prefix"), + details: bool = Query(False, description="Include file details (size, compression)"), + offset: int = Query(0, ge=0, description="Number of entries to skip"), + limit: int = Query(100, ge=1, le=1000, description="Maximum entries to return"), + username: str = Depends(get_current_user) + ): + """ + List files in a ZIP archive with pagination support. + Optionally filter by path prefix. + If details=True, returns full file entry information including size. 
+ + Pagination: + - offset: Number of entries to skip (default 0) + - limit: Maximum entries to return (default 100, max 1000) + + Response includes: + - total_count: Total number of entries in the archive + - offset: Current offset + - limit: Current limit + - has_more: Whether more entries exist beyond this page + """ + + filestore_name, _, zip_subpath = path_name.partition('/') + + with _get_user_context(username): + filestore, error = _get_filestore(filestore_name) + if filestore is None: + raise HTTPException(status_code=404 if "not found" in error else 500, detail=error) + + try: + zip_file_path = filestore._check_path_in_root(zip_subpath) + except RootCheckError as e: + raise HTTPException(status_code=400, detail=str(e)) + + if not is_zip_file(zip_file_path): + raise HTTPException(status_code=400, detail="Not a ZIP file") + + try: + reader = OZXReader(zip_file_path) + reader.open() + except FileNotFoundError: + raise HTTPException(status_code=404, detail="ZIP file not found") + except (InvalidZipError, OZXReaderError) as e: + raise HTTPException(status_code=400, detail=f"Invalid ZIP file: {e}") + + # List files outside user context + try: + # Get total count from central directory metadata (available after open) + total_count = reader.cd_entries_count + + # Parse entries up to offset + limit + reader.parse_central_directory(max_new_entries=offset + limit) + + # Get all parsed entries as a list (preserves CD order) + all_entries = list(reader.entries.values()) + + # Apply offset and limit + paginated_entries = all_entries[offset:offset + limit] + + # Calculate has_more + has_more = offset + limit < total_count + + if details: + # Apply prefix filter if specified + if prefix: + paginated_entries = [e for e in paginated_entries if e.filename.startswith(prefix)] + + # Return full file entry details with pagination info + entries = [] + for entry in paginated_entries: + entries.append({ + "filename": entry.filename, + "compressed_size": entry.compressed_size, + "uncompressed_size": entry.uncompressed_size, + "compression_method": entry.compression_method, + "is_directory": entry.is_directory + }) + reader.close() + return { + "entries": entries, + "total_count": total_count, + "offset": offset, + "limit": limit, + "has_more": has_more + } + else: + # Apply prefix filter if specified + if prefix: + paginated_entries = [e for e in paginated_entries if e.filename.startswith(prefix)] + + # Return just filenames with pagination info + files = [e.filename for e in paginated_entries if not e.is_directory] + reader.close() + return { + "files": files, + "total_count": total_count, + "offset": offset, + "limit": limit, + "has_more": has_more + } + + except Exception as e: + reader.close() + logger.exception(f"Error listing ZIP files: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + @app.get("/api/files/{path_name}") async def get_file_metadata(path_name: str, subpath: Optional[str] = Query(''), username: str = Depends(get_current_user)): diff --git a/fileglancer/model.py b/fileglancer/model.py index ffe5330e..52b645ad 100644 --- a/fileglancer/model.py +++ b/fileglancer/model.py @@ -303,3 +303,41 @@ class NeuroglancerShortLinkResponse(BaseModel): links: List[NeuroglancerShortLink] = Field( description="A list of stored Neuroglancer short links" ) + + +class OZXFileEntry(BaseModel): + """A file entry within an OZX archive""" + filename: str = Field( + description="The path of the file within the OZX archive" + ) + compressed_size: int = Field( + description="The compressed size of 
the file in bytes" + ) + uncompressed_size: int = Field( + description="The uncompressed size of the file in bytes" + ) + compression_method: int = Field( + description="The compression method (0=STORE, 8=DEFLATE)" + ) + is_directory: bool = Field( + description="Whether this entry is a directory" + ) + + +class OZXMetadataResponse(BaseModel): + """Metadata about an OZX archive""" + version: Optional[str] = Field( + description="The OME version from the ZIP comment", + default=None + ) + json_first: bool = Field( + description="Whether JSON files are sorted first in the central directory", + default=False + ) + file_count: int = Field( + description="Number of files in the archive" + ) + is_zip64: bool = Field( + description="Whether the archive uses ZIP64 format", + default=False + ) diff --git a/fileglancer/ozxzip.py b/fileglancer/ozxzip.py new file mode 100644 index 00000000..1172b650 --- /dev/null +++ b/fileglancer/ozxzip.py @@ -0,0 +1,260 @@ +"""RFC-9 compliant reader for .ozx (Zipped OME-Zarr) files. + +RFC-9 Spec: https://ngff.openmicroscopy.org/rfc/9/index.html +OME-Zarr v0.5 Spec: https://ngff.openmicroscopy.org/0.5/index.html + +This module extends the generic ZipReader with OZX-specific functionality: +- OME metadata parsing from ZIP comment +- jsonFirst optimization for partial central directory parsing +""" + +import json +from dataclasses import dataclass +from typing import Optional, Dict, Callable + +from loguru import logger + +from .zipread import ZipReader, ZipEntry, ZipReaderError, InvalidZipError + + +@dataclass +class OZXMetadata: + """Parsed OME metadata from ZIP comment (RFC-9 format). + + RFC-9 defines the ZIP comment format as: + { + "ome": { + "version": "0.5", + "zipFile": { + "centralDirectory": { + "jsonFirst": true + } + } + } + } + """ + version: str + json_first: bool = False + raw_comment: Optional[str] = None + + +class OZXReaderError(ZipReaderError): + """Base exception for OZX reader errors.""" + pass + + +class InvalidOZXError(OZXReaderError): + """Raised when the file is not a valid OZX file.""" + pass + + +def is_json_metadata_file(filename: str) -> bool: + """Check if a filename is a JSON metadata file. + + Used for the jsonFirst optimization - these files are sorted + first in the central directory when jsonFirst=True. + + Args: + filename: The filename to check + + Returns: + True if this is a JSON metadata file + """ + name = filename.lower() + return (name.endswith('.json') or + name.endswith('.zattrs') or + name.endswith('.zarray') or + name.endswith('.zgroup')) + + +class OZXReader(ZipReader): + """ + RFC-9 compliant reader for .ozx (Zipped OME-Zarr) files. + + Extends ZipReader with OZX-specific functionality: + - Parses OME metadata from ZIP comment + - Supports jsonFirst optimization for partial central directory parsing + + Note: RFC-9 is for OME-Zarr v0.5 which requires Zarr v3 only. + + Usage: + with OZXReader('/path/to/file.ozx') as reader: + metadata = reader.get_ome_metadata() + if metadata and metadata.json_first: + entries = reader.parse_central_directory(json_only=True) + else: + entries = reader.parse_central_directory() + content = reader.read_file('zarr.json') + """ + + def __init__(self, file_path: str): + """Initialize the OZX reader. + + Args: + file_path: Path to the .ozx file + """ + super().__init__(file_path) + self._ome_metadata: Optional[OZXMetadata] = None + + def open(self) -> 'OZXReader': + """Open the file, parse EOCD, and extract OME metadata. 
+ + Returns: + Self for method chaining + + Raises: + FileNotFoundError: If the file doesn't exist + InvalidZipError: If the file is not a valid ZIP + """ + super().open() + # Parse OME metadata from ZIP comment + self._ome_metadata = self._parse_ome_comment(self.comment) + return self + + def get_ome_metadata(self) -> Optional[OZXMetadata]: + """Get parsed OME metadata from ZIP comment. + + Returns: + OZXMetadata if valid OME metadata found, None otherwise + """ + return self._ome_metadata + + # Alias for backward compatibility + def get_metadata(self) -> Optional[OZXMetadata]: + """Alias for get_ome_metadata() for backward compatibility.""" + return self.get_ome_metadata() + + def parse_central_directory( + self, + json_only: bool = False, + stop_condition: Optional[Callable[[ZipEntry, int], bool]] = None, + max_new_entries: Optional[int] = None + ) -> Dict[str, ZipEntry]: + """ + Parse the central directory with optional jsonFirst optimization. + + Args: + json_only: If True and jsonFirst=True in metadata, stop parsing + after the last JSON metadata file. This is the RFC-9 + optimization for efficient metadata discovery. + stop_condition: Optional callback (passed to parent). + max_new_entries: Optional maximum number of entries to parse (passed to parent). + + Returns: + Dictionary mapping filenames to ZipEntry objects + + Raises: + InvalidZipError: If central directory is corrupted + """ + if json_only and self._ome_metadata and self._ome_metadata.json_first: + # Use the stop condition to implement jsonFirst optimization + def stop_at_non_json(entry: ZipEntry, index: int) -> bool: + # Check user's stop condition first + if stop_condition and stop_condition(entry, index): + return True + + if entry.is_directory: + return False + return not is_json_metadata_file(entry.filename) + + return super().parse_central_directory(stop_condition=stop_at_non_json, max_new_entries=max_new_entries) + else: + return super().parse_central_directory(stop_condition=stop_condition, max_new_entries=max_new_entries) + + def _parse_ome_comment(self, comment: str) -> Optional[OZXMetadata]: + """Parse ZIP comment for RFC-9 OME metadata. + + RFC-9 comment format: + { + "ome": { + "version": "0.5", + "zipFile": { + "centralDirectory": { + "jsonFirst": true + } + } + } + } + + Args: + comment: ZIP file comment string + + Returns: + OZXMetadata if valid, None otherwise + """ + if not comment: + return None + + try: + data = json.loads(comment) + if not isinstance(data, dict) or 'ome' not in data: + logger.debug("ZIP comment is not OME metadata") + return None + + ome = data['ome'] + if not isinstance(ome, dict) or 'version' not in ome: + logger.debug("Invalid OME metadata structure") + return None + + version = str(ome['version']) + + # Check for jsonFirst flag + json_first = False + zip_file = ome.get('zipFile', {}) + if isinstance(zip_file, dict): + cd = zip_file.get('centralDirectory', {}) + if isinstance(cd, dict): + json_first = bool(cd.get('jsonFirst', False)) + + logger.debug(f"Parsed OZX metadata: version={version}, jsonFirst={json_first}") + return OZXMetadata( + version=version, + json_first=json_first, + raw_comment=comment + ) + + except json.JSONDecodeError as e: + logger.debug(f"Failed to parse ZIP comment as JSON: {e}") + return None + + +def is_ozx_file(filename: str) -> bool: + """Check if a filename has the .ozx extension. 
+ + Args: + filename: Filename to check + + Returns: + True if the file has a .ozx extension + """ + return filename.lower().endswith('.ozx') + + +def is_zip_file(filename: str) -> bool: + """Check if a filename has a .zip or .ozx extension. + + Args: + filename: Filename to check + + Returns: + True if the file has a .zip or .ozx extension + """ + name = filename.lower() + return name.endswith('.zip') or name.endswith('.ozx') + + +# Re-export commonly used items from zipread for convenience +__all__ = [ + 'OZXReader', + 'OZXMetadata', + 'OZXReaderError', + 'InvalidOZXError', + 'is_ozx_file', + 'is_zip_file', + 'is_json_metadata_file', + # Re-exports from zipread + 'ZipReader', + 'ZipEntry', + 'ZipReaderError', + 'InvalidZipError', +] diff --git a/fileglancer/zipread.py b/fileglancer/zipread.py new file mode 100644 index 00000000..c9ae5ece --- /dev/null +++ b/fileglancer/zipread.py @@ -0,0 +1,626 @@ +"""Generic ZIP file reader with streaming support. + +This module provides functionality to read ZIP archives with support for: +- ZIP64 format for large files +- STORE and DEFLATE compression methods +- Range request streaming for efficient chunk access +""" + +import struct +import zlib +from dataclasses import dataclass, field +from typing import Optional, Dict, Generator, BinaryIO, List, Callable +from io import BytesIO + +from loguru import logger + +# ZIP signatures +ZIP_LOCAL_HEADER_SIG = b'\x50\x4b\x03\x04' +ZIP_CD_SIG = b'\x50\x4b\x01\x02' +ZIP_EOCD_SIG = b'\x50\x4b\x05\x06' +ZIP_EOCD64_SIG = b'\x50\x4b\x06\x06' +ZIP_EOCD64_LOC_SIG = b'\x50\x4b\x06\x07' + +# Compression methods +COMPRESSION_STORED = 0 +COMPRESSION_DEFLATE = 8 + +# ZIP64 marker value +ZIP64_MARKER = 0xFFFFFFFF +ZIP64_MARKER_16 = 0xFFFF + +# Extra field header IDs +ZIP64_EXTRA_ID = 0x0001 + +# Default buffer size for streaming +DEFAULT_BUFFER_SIZE = 8192 + +# Maximum EOCD search size (65KB comment + 22 byte EOCD header) +MAX_EOCD_SEARCH_SIZE = 65536 + 22 + + +@dataclass +class ZipEntry: + """A file entry from the ZIP central directory.""" + filename: str + compressed_size: int + uncompressed_size: int + compression_method: int # 0=STORE, 8=DEFLATE + local_header_offset: int + crc32: int + extra_field: bytes = field(default_factory=bytes, repr=False) + + @property + def is_directory(self) -> bool: + """Check if this entry represents a directory.""" + return self.filename.endswith('/') + + +class ZipReaderError(Exception): + """Base exception for ZIP reader errors.""" + pass + + +class InvalidZipError(ZipReaderError): + """Raised when the ZIP file is invalid or corrupted.""" + pass + + +class ZipReader: + """ + Generic ZIP file reader with streaming support. + + Supports: + - ZIP64 format for large files + - STORE and DEFLATE compression + - Range requests for streaming chunks + - Custom comment parsing via callback + + Usage: + with ZipReader('/path/to/file.zip') as reader: + entries = reader.parse_central_directory() + content = reader.read_file('path/in/archive.txt') + """ + + def __init__(self, file_path: str): + """Initialize the ZIP reader. 
+ + Args: + file_path: Path to the ZIP file + """ + self.file_path = file_path + self._fh: Optional[BinaryIO] = None + self._file_size: int = 0 + self._comment: str = "" + self._entries: Dict[str, ZipEntry] = {} + self._cd_offset: int = 0 + self._cd_size: int = 0 + self._cd_entries_count: int = 0 + self._is_zip64: bool = False + self._cd_parsed: bool = False + self._cd_next_offset: int = 0 + self._cd_entries_read_count: int = 0 + + def open(self) -> 'ZipReader': + """Open the file and parse EOCD. + + Returns: + Self for method chaining + + Raises: + FileNotFoundError: If the file doesn't exist + InvalidZipError: If the file is not a valid ZIP + """ + import os + self._fh = open(self.file_path, 'rb') + self._file_size = os.fstat(self._fh.fileno()).st_size + self._parse_eocd() + return self + + def close(self): + """Close the file handle.""" + if self._fh: + self._fh.close() + self._fh = None + + def __enter__(self) -> 'ZipReader': + return self.open() + + def __exit__(self, *args): + self.close() + + @property + def file_size(self) -> int: + """Get the size of the ZIP file.""" + return self._file_size + + @property + def is_zip64(self) -> bool: + """Check if this is a ZIP64 format archive.""" + return self._is_zip64 + + @property + def comment(self) -> str: + """Get the ZIP file comment.""" + return self._comment + + @property + def entries(self) -> Dict[str, ZipEntry]: + """Get the parsed entries dictionary.""" + return self._entries + + @property + def cd_entries_count(self) -> int: + """Get the number of entries in the central directory.""" + return self._cd_entries_count + + def parse_central_directory( + self, + stop_condition: Optional[Callable[[ZipEntry, int], bool]] = None, + max_new_entries: Optional[int] = None + ) -> Dict[str, ZipEntry]: + """ + Parse the central directory. + + Supports partial parsing and resuming. If called multiple times, + it resumes from where it left off, unless already fully parsed. + + Args: + stop_condition: Optional callback that receives each ZipEntry and its + 0-based index. If it returns True, parsing stops early. + Useful for optimizations like stopping after metadata files. + max_new_entries: Optional maximum number of entries to parse in this call. + If specified, parsing stops after this many NEW entries are processed. 
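+
+        Example::
+
+            # Page through the central directory; repeated calls resume
+            # where the previous call stopped
+            reader.parse_central_directory(max_new_entries=100)
+            reader.parse_central_directory(max_new_entries=100)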
+
+        Returns:
+            Dictionary mapping filenames to ZipEntry objects (accumulated)
+
+        Raises:
+            InvalidZipError: If central directory is corrupted
+        """
+        if self._fh is None:
+            raise ZipReaderError("File not opened")
+
+        if self._cd_parsed:
+            return self._entries
+
+        self._fh.seek(self._cd_next_offset)
+
+        start_index = self._cd_entries_read_count
+        remaining_entries = self._cd_entries_count - start_index
+
+        entries_to_read = remaining_entries
+        if max_new_entries is not None:
+            entries_to_read = min(remaining_entries, max_new_entries)
+
+        entries_read_this_call = 0
+
+        while entries_read_this_call < entries_to_read:
+            i = start_index + entries_read_this_call
+
+            # Read CD file header (46 bytes minimum)
+            header = self._fh.read(46)
+            if len(header) < 46 or header[:4] != ZIP_CD_SIG:
+                raise InvalidZipError(f"Invalid central directory entry at index {i}")
+
+            # Parse header fields
+            (version_made, version_needed, flags, compression,
+             mod_time, mod_date, crc32, comp_size, uncomp_size,
+             name_len, extra_len, comment_len, disk_start,
+             internal_attr, external_attr, local_offset) = struct.unpack(
+                '<HHHHHHLLLHHHHHLL', header[4:46])
+
+            # Read the variable-length filename and extra field
+            filename = self._fh.read(name_len).decode('utf-8', errors='replace')
+            extra = self._fh.read(extra_len) if extra_len > 0 else b''
+
+            # Skip comment
+            if comment_len > 0:
+                self._fh.seek(comment_len, 1)
+
+            # Update next offset
+            self._cd_next_offset = self._fh.tell()
+
+            # Handle ZIP64 extra field if needed
+            if comp_size == ZIP64_MARKER or uncomp_size == ZIP64_MARKER or local_offset == ZIP64_MARKER:
+                comp_size, uncomp_size, local_offset = self._parse_zip64_extra(
+                    extra, comp_size, uncomp_size, local_offset)
+
+            entry = ZipEntry(
+                filename=filename,
+                compressed_size=comp_size,
+                uncompressed_size=uncomp_size,
+                compression_method=compression,
+                local_header_offset=local_offset,
+                crc32=crc32,
+                extra_field=extra
+            )
+
+            self._entries[filename] = entry
+            self._cd_entries_read_count += 1
+            entries_read_this_call += 1
+
+            # Check stop condition
+            if stop_condition and stop_condition(entry, i):
+                logger.debug(f"Stop condition met at index {i}, filename: {filename}")
+                break
+
+        if self._cd_entries_read_count >= self._cd_entries_count:
+            self._cd_parsed = True
+
+        return self._entries
+
+    def list_files(self, prefix: str = "") -> List[str]:
+        """List files in archive, optionally filtered by prefix.
+
+        Args:
+            prefix: Only return files starting with this prefix
+
+        Returns:
+            List of filenames matching the prefix
+        """
+        if not self._cd_parsed:
+            self.parse_central_directory()
+
+        if prefix:
+            return [name for name in self._entries.keys()
+                    if name.startswith(prefix) and not self._entries[name].is_directory]
+        return [name for name in self._entries.keys()
+                if not self._entries[name].is_directory]
+
+    def get_entry(self, path: str) -> Optional[ZipEntry]:
+        """Get info about a specific file in the archive.
+
+        Args:
+            path: Path within the archive
+
+        Returns:
+            ZipEntry if found, None otherwise
+        """
+        # Check if we already have it
+        if path in self._entries:
+            return self._entries[path]
+
+        # If fully parsed and not found, it doesn't exist
+        if self._cd_parsed:
+            return None
+
+        # Scan forward until we find it or finish
+        def stop_on_find(entry, idx):
+            return entry.filename == path
+
+        self.parse_central_directory(stop_condition=stop_on_find)
+
+        return self._entries.get(path)
+
+    def read_file(self, path: str) -> bytes:
+        """Read entire file from archive.
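+
+        Example::
+
+            data = reader.read_file('zarr.json')  # hypothetical member name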
+
+        Args:
+            path: Path within the archive
+
+        Returns:
+            File contents as bytes
+
+        Raises:
+            FileNotFoundError: If path not found in archive
+            InvalidZipError: If decompression fails
+        """
+        return b''.join(self.stream_file(path))
+
+    def stream_file(self, path: str, buffer_size: int = DEFAULT_BUFFER_SIZE) -> Generator[bytes, None, None]:
+        """Stream file content from archive.
+
+        Args:
+            path: Path within the archive
+            buffer_size: Size of chunks to yield
+
+        Yields:
+            Chunks of file content
+
+        Raises:
+            FileNotFoundError: If path not found in archive
+        """
+        if self._fh is None:
+            raise ZipReaderError("File not opened")
+
+        entry = self.get_entry(path)
+        if entry is None:
+            raise FileNotFoundError(f"File not found in archive: {path}")
+
+        # Seek to local file header and skip it
+        self._fh.seek(entry.local_header_offset)
+        local_header = self._fh.read(30)
+        if local_header[:4] != ZIP_LOCAL_HEADER_SIG:
+            raise InvalidZipError(f"Invalid local header for {path}")
+
+        # Get local header name and extra lengths
+        name_len, extra_len = struct.unpack('<HH', local_header[26:30])
+        # Skip the variable-length name and extra field to reach the data
+        self._fh.seek(name_len + extra_len, 1)
+
+        if entry.compression_method == COMPRESSION_STORED:
+            # Uncompressed - stream the stored bytes directly
+            remaining = entry.compressed_size
+            while remaining > 0:
+                chunk_size = min(buffer_size, remaining)
+                chunk = self._fh.read(chunk_size)
+                if not chunk:
+                    break
+                yield chunk
+                remaining -= len(chunk)
+
+        elif entry.compression_method == COMPRESSION_DEFLATE:
+            # Compressed - need to decompress
+            decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
+            remaining = entry.compressed_size
+
+            while remaining > 0:
+                chunk_size = min(buffer_size, remaining)
+                compressed_chunk = self._fh.read(chunk_size)
+                if not compressed_chunk:
+                    break
+                remaining -= len(compressed_chunk)
+
+                decompressed = decompressor.decompress(compressed_chunk)
+                if decompressed:
+                    yield decompressed
+
+            # Flush any remaining data
+            final = decompressor.flush()
+            if final:
+                yield final
+        else:
+            raise InvalidZipError(f"Unsupported compression method: {entry.compression_method}")
+
+    def stream_file_range(self, path: str, start: int, end: int,
+                          buffer_size: int = DEFAULT_BUFFER_SIZE) -> Generator[bytes, None, None]:
+        """Stream a byte range of uncompressed file content.
+
+        Note: For DEFLATE compressed files, this must decompress from the
+        beginning to reach the desired offset.
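+
+        Example::
+
+            # Bytes 1024-2047 of a chunk (hypothetical member name)
+            data = b''.join(reader.stream_file_range('0/c/0/0/0', 1024, 2047))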
+
+        Args:
+            path: Path within the archive
+            start: Start byte offset (inclusive)
+            end: End byte offset (inclusive)
+            buffer_size: Size of chunks to yield
+
+        Yields:
+            Chunks of file content within the specified range
+
+        Raises:
+            FileNotFoundError: If path not found in archive
+            ValueError: If range is invalid
+        """
+        if self._fh is None:
+            raise ZipReaderError("File not opened")
+
+        entry = self.get_entry(path)
+        if entry is None:
+            raise FileNotFoundError(f"File not found in archive: {path}")
+
+        if start < 0:
+            raise ValueError("Start position cannot be negative")
+        if end < start:
+            raise ValueError("End position cannot be less than start position")
+        if start >= entry.uncompressed_size:
+            return  # Nothing to return
+
+        # Clamp end to file size
+        end = min(end, entry.uncompressed_size - 1)
+        range_length = end - start + 1
+
+        # Seek to local file header and skip it
+        self._fh.seek(entry.local_header_offset)
+        local_header = self._fh.read(30)
+        if local_header[:4] != ZIP_LOCAL_HEADER_SIG:
+            raise InvalidZipError(f"Invalid local header for {path}")
+
+        name_len, extra_len = struct.unpack('<HH', local_header[26:30])
+        self._fh.seek(name_len + extra_len, 1)
+
+        if entry.compression_method == COMPRESSION_STORED:
+            # Uncompressed - seek straight to the requested offset and stream
+            self._fh.seek(start, 1)
+            remaining = range_length
+            while remaining > 0:
+                chunk_size = min(buffer_size, remaining)
+                chunk = self._fh.read(chunk_size)
+                if not chunk:
+                    break
+                yield chunk
+                remaining -= len(chunk)
+
+        elif entry.compression_method == COMPRESSION_DEFLATE:
+            # For compressed files, we need to decompress from the start
+            # and skip to the desired offset
+            decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
+            compressed_remaining = entry.compressed_size
+            decompressed_pos = 0
+            output_remaining = range_length
+
+            while compressed_remaining > 0 and output_remaining > 0:
+                chunk_size = min(buffer_size, compressed_remaining)
+                compressed_chunk = self._fh.read(chunk_size)
+                if not compressed_chunk:
+                    break
+                compressed_remaining -= len(compressed_chunk)
+
+                decompressed = decompressor.decompress(compressed_chunk)
+                if not decompressed:
+                    continue
+
+                # Handle the decompressed chunk
+                chunk_start = 0
+                chunk_len = len(decompressed)
+
+                # Skip data before our range
+                if decompressed_pos + chunk_len <= start:
+                    decompressed_pos += chunk_len
+                    continue
+
+                # Calculate how much of this chunk to skip
+                if decompressed_pos < start:
+                    chunk_start = start - decompressed_pos
+
+                # Calculate how much of this chunk to output
+                output_bytes = min(chunk_len - chunk_start, output_remaining)
+
+                if output_bytes > 0:
+                    yield decompressed[chunk_start:chunk_start + output_bytes]
+                    output_remaining -= output_bytes
+
+                decompressed_pos += chunk_len
+
+            # Flush and handle remaining
+            if output_remaining > 0:
+                final = decompressor.flush()
+                if final:
+                    # Apply same range logic to final chunk
+                    chunk_len = len(final)
+                    if decompressed_pos + chunk_len > start:
+                        chunk_start = max(0, start - decompressed_pos)
+                        output_bytes = min(chunk_len - chunk_start, output_remaining)
+                        if output_bytes > 0:
+                            yield final[chunk_start:chunk_start + output_bytes]
+        else:
+            raise InvalidZipError(f"Unsupported compression method: {entry.compression_method}")
+
+    def _parse_eocd(self):
+        """Parse End of Central Directory record.
+
+        Raises:
+            InvalidZipError: If EOCD not found or invalid
+        """
+        if self._fh is None:
+            raise ZipReaderError("File not opened")
+
+        # Search backwards from end of file for EOCD signature
+        search_size = min(MAX_EOCD_SEARCH_SIZE, self._file_size)
+        self._fh.seek(self._file_size - search_size)
+        data = self._fh.read(search_size)
+
+        # Find EOCD signature (searching from end)
+        eocd_pos = data.rfind(ZIP_EOCD_SIG)
+        if eocd_pos == -1:
+            raise InvalidZipError("End of Central Directory not found")
+
+        # Position in file
+        eocd_file_pos = self._file_size - search_size + eocd_pos
+
+        # Parse EOCD (22 bytes minimum)
+        eocd = data[eocd_pos:eocd_pos + 22]
+        if len(eocd) < 22:
+            raise InvalidZipError("Truncated EOCD record")
+
+        (disk_num, cd_disk, cd_entries_this_disk, cd_entries_total,
+         cd_size, cd_offset, comment_len) = struct.unpack('<HHHHLLH', eocd[4:22])
+
+        # Extract the ZIP comment if present
+        if comment_len > 0:
+            comment_data = data[eocd_pos + 22:eocd_pos + 22 + comment_len]
+            if len(comment_data) == comment_len:
+                self._comment = comment_data.decode('utf-8', errors='replace')
+
+        # Check for ZIP64
+        if (cd_offset == ZIP64_MARKER or cd_size == ZIP64_MARKER or
+                cd_entries_total == ZIP64_MARKER_16):
+            self._is_zip64 = True
+            self._parse_zip64_eocd(eocd_file_pos)
+        else:
+            self._cd_offset = cd_offset
+            self._cd_size = cd_size
+            self._cd_entries_count = cd_entries_total
+
+        # Initialize partial parsing state
+        self._cd_next_offset = self._cd_offset
+        self._cd_entries_read_count = 0
+
+    def _parse_zip64_eocd(self, eocd_pos: int):
+        """Parse ZIP64 End of Central Directory records.
+
+        Args:
+            eocd_pos: Position of standard EOCD in file
+
+        Raises:
+            InvalidZipError: If ZIP64 records not found or invalid
+        """
+        if self._fh is None:
+            raise ZipReaderError("File not opened")
+
+        # Look for ZIP64 EOCD Locator (20 bytes before EOCD)
+        loc_pos = eocd_pos - 20
+        if loc_pos < 0:
+            raise InvalidZipError("ZIP64 EOCD Locator not found")
+
+        self._fh.seek(loc_pos)
+        locator = self._fh.read(20)
+
+        if locator[:4] != ZIP_EOCD64_LOC_SIG:
+            raise InvalidZipError("Invalid ZIP64 EOCD Locator")
+
+        # Parse locator to get ZIP64 EOCD offset
+        (zip64_disk, zip64_eocd_offset, total_disks) = struct.unpack(
+            '<LQL', locator[4:20])
+
+        # Read and validate the ZIP64 EOCD record itself
+        self._fh.seek(zip64_eocd_offset)
+        eocd64 = self._fh.read(56)
+        if len(eocd64) < 56 or eocd64[:4] != ZIP_EOCD64_SIG:
+            raise InvalidZipError("Invalid ZIP64 EOCD record")
+
+        (eocd64_size, version_made, version_needed, disk_num, cd_disk,
+         cd_entries_this_disk, cd_entries_total, cd_size, cd_offset) = struct.unpack(
+            '<QHHLLQQQQ', eocd64[4:56])
+
+        self._cd_offset = cd_offset
+        self._cd_size = cd_size
+        self._cd_entries_count = cd_entries_total
+
+    def _parse_zip64_extra(self, extra: bytes, comp_size: int,
+                           uncomp_size: int, local_offset: int) -> tuple:
+        """Parse ZIP64 extra field to get actual values.
+
+        Args:
+            extra: Extra field data
+            comp_size: Compressed size from CD (may be 0xFFFFFFFF)
+            uncomp_size: Uncompressed size from CD (may be 0xFFFFFFFF)
+            local_offset: Local header offset from CD (may be 0xFFFFFFFF)
+
+        Returns:
+            Tuple of (actual_comp_size, actual_uncomp_size, actual_local_offset)
+        """
+        offset = 0
+        while offset + 4 <= len(extra):
+            header_id, data_size = struct.unpack('<HH', extra[offset:offset + 4])
+            if header_id == ZIP64_EXTRA_ID:
+                # The ZIP64 field stores an 8-byte value for each header field
+                # that was set to the 0xFFFFFFFF marker, in this fixed order:
+                # uncompressed size, compressed size, local header offset
+                data = extra[offset + 4:offset + 4 + data_size]
+                pos = 0
+                if uncomp_size == ZIP64_MARKER and pos + 8 <= len(data):
+                    uncomp_size = struct.unpack('<Q', data[pos:pos + 8])[0]
+                    pos += 8
+                if comp_size == ZIP64_MARKER and pos + 8 <= len(data):
+                    comp_size = struct.unpack('<Q', data[pos:pos + 8])[0]
+                    pos += 8
+                if local_offset == ZIP64_MARKER and pos + 8 <= len(data):
+                    local_offset = struct.unpack('<Q', data[pos:pos + 8])[0]
+                    pos += 8
+                break
+            offset += 4 + data_size
+
+        return comp_size, uncomp_size, local_offset
diff --git a/frontend/src/__tests__/unitTests/ozxDetection.test.ts b/frontend/src/__tests__/unitTests/ozxDetection.test.ts
new file mode 100644
--- /dev/null
+++ b/frontend/src/__tests__/unitTests/ozxDetection.test.ts
+import {
+  detectOzxZarrVersions,
+  getOzxFiles,
+  getZipFilePath,
+  hasOzxFiles,
+  isOzxFile,
+  isOzxFilename
+} from '@/utils/ozxDetection';
+import type { FileOrFolder } from '@/shared.types';
+
+const createFile = (name: string, path?: string): FileOrFolder => ({
+  name,
+  path: path ??
`/${name}`, + size: 0, + is_dir: true, + permissions: 'rwxr-xr-x', + owner: 'test', + group: 'test', + last_modified: Date.now() +}); + +describe('isOzxFile', () => { + it('should return true for files with .ozx extension', () => { + expect(isOzxFile(createFile('image.ozx'))).toBe(true); + expect(isOzxFile(createFile('data.OZX'))).toBe(true); + expect(isOzxFile(createFile('sample.Ozx'))).toBe(true); + }); + + it('should return false for non-ozx files', () => { + expect(isOzxFile(createFile('image.zarr'))).toBe(false); + expect(isOzxFile(createFile('data.zip'))).toBe(false); + expect(isOzxFile(createFile('file.txt'))).toBe(false); + expect(isOzxFile(createFile('ozx'))).toBe(false); + expect(isOzxFile(createFile('.ozx'))).toBe(true); // Hidden file with .ozx extension + }); + + it('should return false for directories', () => { + expect(isOzxFile(createDir('folder.ozx'))).toBe(false); + }); +}); + +describe('isOzxFilename', () => { + it('should return true for filenames with .ozx extension', () => { + expect(isOzxFilename('image.ozx')).toBe(true); + expect(isOzxFilename('data.OZX')).toBe(true); + expect(isOzxFilename('/path/to/file.ozx')).toBe(true); + }); + + it('should return false for non-ozx filenames', () => { + expect(isOzxFilename('image.zarr')).toBe(false); + expect(isOzxFilename('data.zip')).toBe(false); + }); +}); + +describe('hasOzxFiles', () => { + it('should return true if any file is an OZX file', () => { + const files = [ + createFile('image.zarr'), + createFile('data.ozx'), + createFile('text.txt') + ]; + expect(hasOzxFiles(files)).toBe(true); + }); + + it('should return false if no OZX files exist', () => { + const files = [ + createFile('image.zarr'), + createFile('data.zip'), + createFile('text.txt') + ]; + expect(hasOzxFiles(files)).toBe(false); + }); + + it('should return false for empty array', () => { + expect(hasOzxFiles([])).toBe(false); + }); +}); + +describe('getOzxFiles', () => { + it('should return only OZX files', () => { + const files = [ + createFile('image.zarr'), + createFile('data1.ozx'), + createFile('text.txt'), + createFile('data2.ozx') + ]; + const result = getOzxFiles(files); + expect(result).toHaveLength(2); + expect(result[0].name).toBe('data1.ozx'); + expect(result[1].name).toBe('data2.ozx'); + }); + + it('should return empty array if no OZX files', () => { + const files = [createFile('image.zarr'), createFile('text.txt')]; + expect(getOzxFiles(files)).toEqual([]); + }); +}); + +describe('getZipFilePath', () => { + it('should return path without leading slash', () => { + const file = createFile('data.ozx', '/path/to/data.ozx'); + expect(getZipFilePath(file)).toBe('path/to/data.ozx'); + }); + + it('should return path unchanged if no leading slash', () => { + const file = createFile('data.ozx', 'path/to/data.ozx'); + expect(getZipFilePath(file)).toBe('path/to/data.ozx'); + }); +}); + +describe('detectOzxZarrVersions', () => { + // RFC-9 OZX is for OME-Zarr v0.5 which requires Zarr v3 only + + it('should detect zarr v3 when zarr.json exists at root', () => { + const files = ['zarr.json', '0/zarr.json', '0/c/0/0/0']; + expect(detectOzxZarrVersions(files)).toEqual(['v3']); + }); + + it('should NOT detect zarr v2 - RFC-9 requires Zarr v3', () => { + // .zarray and .zattrs are Zarr v2 markers, not supported in RFC-9 OZX + const files = ['.zarray', '.zattrs', '0/0']; + expect(detectOzxZarrVersions(files)).toEqual([]); + }); + + it('should only detect v3 even when v2 markers also exist', () => { + // RFC-9 OZX is Zarr v3 only, so v2 markers are ignored + 
const files = ['zarr.json', '.zarray', '0/c/0/0/0']; + expect(detectOzxZarrVersions(files)).toEqual(['v3']); + }); + + it('should return empty array when no zarr.json files', () => { + const files = ['data.txt', 'image.png']; + expect(detectOzxZarrVersions(files)).toEqual([]); + }); + + it('should return empty array for empty file list', () => { + expect(detectOzxZarrVersions([])).toEqual([]); + }); + + it('should detect zarr.json from nested paths', () => { + const files = ['folder/zarr.json', 'folder/.zattrs']; + // Nested zarr.json is detected, .zattrs is ignored (v2 only) + const result = detectOzxZarrVersions(files); + expect(result).toEqual(['v3']); + }); + + it('should detect zarr.json from paths ending with /zarr.json', () => { + const files = ['root/zarr.json', 'root/.zattrs']; + // Only zarr.json is detected for RFC-9 OZX + const result = detectOzxZarrVersions(files); + expect(result).toEqual(['v3']); + }); +}); diff --git a/frontend/src/components/ui/BrowsePage/FileViewer.tsx b/frontend/src/components/ui/BrowsePage/FileViewer.tsx index 0fd1e82f..a0e48ee6 100644 --- a/frontend/src/components/ui/BrowsePage/FileViewer.tsx +++ b/frontend/src/components/ui/BrowsePage/FileViewer.tsx @@ -1,5 +1,11 @@ -import { useEffect, useState } from 'react'; -import { Switch, Typography } from '@material-tailwind/react'; +import { useEffect, useState, useMemo } from 'react'; +import { Switch, Typography, IconButton } from '@material-tailwind/react'; +import { + HiOutlineFolder, + HiOutlineDocument, + HiArrowLeft, + HiOutlineDownload +} from 'react-icons/hi'; import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter'; import { materialDark, @@ -10,11 +16,382 @@ import { useFileBrowserContext } from '@/contexts/FileBrowserContext'; import { formatFileSize, formatUnixTimestamp } from '@/utils'; import type { FileOrFolder } from '@/shared.types'; import { useFileContentQuery } from '@/queries/fileContentQueries'; +import { + useZipFileEntriesInfiniteQuery, + useZipFileContentQuery, + buildZipContentUrl +} from '@/queries/ozxQueries'; +import type { ZipFileEntry } from '@/queries/ozxQueries'; +import { isAnyZipFile, getZipFilePath } from '@/utils/ozxDetection'; type FileViewerProps = { readonly file: FileOrFolder; }; +const InternalFileViewer = ({ + fspName, + ozxPath, + internalPath, + onBack +}: { + readonly fspName: string; + readonly ozxPath: string; + readonly internalPath: string; + readonly onBack: () => void; +}) => { + const { data, isLoading, error } = useZipFileContentQuery( + fspName, + ozxPath, + internalPath + ); + const [isDarkMode, setIsDarkMode] = useState(false); + const [formatJson, setFormatJson] = useState(true); + + useEffect(() => { + const checkDarkMode = () => + setIsDarkMode(document.documentElement.classList.contains('dark')); + checkDarkMode(); + const observer = new MutationObserver(checkDarkMode); + observer.observe(document.documentElement, { + attributes: true, + attributeFilter: ['class'] + }); + return () => observer.disconnect(); + }, []); + + if (isLoading) { + return
+      (<div>Loading content...</div>);
+  }
+  if (error) {
+    return (<div>Error: {error.message}</div>);
+  }
+
+  const content = data ? new TextDecoder().decode(data) : '';
+  const language = getLanguageFromExtension(internalPath);
+  const isJsonFile = language === 'json';
+
+  // Format JSON if toggle is enabled and content is valid JSON
+  let displayContent = content;
+  if (isJsonFile && formatJson && content) {
+    try {
+      const parsed = JSON.parse(content);
+      displayContent = JSON.stringify(parsed, null, 2);
+    } catch {
+      // If JSON parsing fails, show original content
+      displayContent = content;
+    }
+  }
+
+  return (
+    <div>
+      <div>
+        <IconButton onClick={onBack} size="sm" variant="ghost">
+          <HiArrowLeft />
+        </IconButton>
+        <Typography>{internalPath}</Typography>
+        {isJsonFile ? (
+          <div>
+            <Typography>Format JSON</Typography>
+            <Switch
+              checked={formatJson}
+              onChange={() => setFormatJson(!formatJson)}
+            />
+          </div>
+        ) : null}
+      </div>
+      <div>
+        <SyntaxHighlighter
+          language={language}
+          style={isDarkMode ? materialDark : materialLight}
+        >
+          {displayContent}
+        </SyntaxHighlighter>
+      </div>
+    </div>
+  );
+};
+
+type ZipBrowserItem = {
+  name: string;
+  path: string;
+  isDir: boolean;
+  size: number;
+};
+
+const ZipBrowser = ({ file }: { readonly file: FileOrFolder }) => {
+  const { fspName } = useFileBrowserContext();
+  const ozxPath = getZipFilePath(file);
+  const {
+    data,
+    isLoading,
+    error,
+    fetchNextPage,
+    hasNextPage,
+    isFetchingNextPage
+  } = useZipFileEntriesInfiniteQuery(fspName, ozxPath, 100);
+  const [internalPath, setInternalPath] = useState('');
+  const [selectedFile, setSelectedFile] = useState<string | null>(null);
+
+  // Flatten all pages into a single array of entries
+  const allEntries = useMemo(() => {
+    if (!data?.pages) {
+      return [];
+    }
+    return data.pages.flatMap(page => page.entries);
+  }, [data]);
+
+  // Get total count from the first page (same across all pages)
+  const totalCount = data?.pages[0]?.total_count ?? 0;
+  const loadedCount = allEntries.length;
+
+  const items = useMemo(() => {
+    if (!allEntries.length) {
+      return [];
+    }
+
+    const folders = new Map<string, number>(); // path -> total size of contents
+    const files: ZipBrowserItem[] = [];
+
+    allEntries.forEach(entry => {
+      const filename = entry.filename;
+      if (!filename.startsWith(internalPath)) {
+        return;
+      }
+
+      const relative = filename.slice(internalPath.length);
+      const slashIndex = relative.indexOf('/');
+
+      if (slashIndex === -1) {
+        // Direct file in current directory
+        if (relative !== '' && !entry.is_directory) {
+          files.push({
+            name: relative,
+            path: filename,
+            isDir: false,
+            size: entry.uncompressed_size
+          });
+        }
+      } else {
+        // File in a subdirectory - track the folder
+        const folderPath = internalPath + relative.slice(0, slashIndex + 1);
+        const currentSize = folders.get(folderPath) || 0;
+        folders.set(folderPath, currentSize + entry.uncompressed_size);
+      }
+    });
+
+    const folderItems: ZipBrowserItem[] = Array.from(folders.entries())
+      .sort(([a], [b]) => a.localeCompare(b))
+      .map(([path, size]) => ({
+        name: path.slice(internalPath.length).replace(/\/$/, ''),
+        path,
+        isDir: true,
+        size
+      }));
+
+    const fileItems = files.sort((a, b) => a.name.localeCompare(b.name));
+
+    return [...folderItems, ...fileItems];
+  }, [allEntries, internalPath]);
+
+  if (isLoading) {
+    return (
+ + Loading archive contents... + +
+ ); + } + + if (error) { + return ( +
+ Error: {error.message} +
+    );
+  }
+
+  if (selectedFile && fspName) {
+    return (
+      <InternalFileViewer
+        fspName={fspName}
+        internalPath={selectedFile}
+        onBack={() => setSelectedFile(null)}
+        ozxPath={ozxPath}
+      />
+    );
+  }
+
+  const navigateUp = () => {
+    const parts = internalPath.split('/').filter(Boolean);
+    parts.pop();
+    setInternalPath(parts.length > 0 ? parts.join('/') + '/' : '');
+  };
+
+  const handleDownload = (itemPath: string, itemName: string) => {
+    if (!fspName) {
+      return;
+    }
+    const url = buildZipContentUrl(fspName, ozxPath, itemPath);
+    const link = document.createElement('a');
+    link.href = url;
+    link.download = itemName;
+    document.body.appendChild(link);
+    link.click();
+    document.body.removeChild(link);
+  };
+
+  return (
+ {/* Breadcrumb header with progress indicator */} +
+ {internalPath ? ( + + + + ) : null} + + {file.name}/{internalPath} + + {totalCount > 0 ? ( + + {loadedCount} of {totalCount} entries + + ) : null} +
+ + {/* Table view */} +
+ + + + + + + + + + + {items.map(item => ( + { + if (item.isDir) { + setInternalPath(item.path); + } else { + setSelectedFile(item.path); + } + }} + > + + + + + + ))} + {items.length === 0 && !hasNextPage ? ( + + + + ) : null} + +
+ Name + + Type + + Size + + Actions +
+
+ {item.isDir ? ( + + ) : ( + + )} + + {item.name} + +
+
+ + {item.isDir ? 'Folder' : 'File'} + + + + {formatFileSize(item.size)} + + + {!item.isDir ? ( + { + e.stopPropagation(); + handleDownload(item.path, item.name); + }} + size="sm" + variant="ghost" + > + + + ) : null} +
+ This folder is empty +
+ + {/* Load more button */} + {hasNextPage ? ( +
+ +
+ ) : null} +
+
+ ); +}; + // Map file extensions to syntax highlighter languages const getLanguageFromExtension = (filename: string): string => { const extension = filename.split('.').pop()?.toLowerCase() || ''; @@ -80,7 +457,8 @@ export default function FileViewer({ file }: FileViewerProps) { const [isDarkMode, setIsDarkMode] = useState(false); const [formatJson, setFormatJson] = useState(true); - const contentQuery = useFileContentQuery(fspName, file.path); + const isZip = isAnyZipFile(file); + const contentQuery = useFileContentQuery(fspName, file.path, !isZip); const language = getLanguageFromExtension(file.name); const isJsonFile = language === 'json'; @@ -101,6 +479,10 @@ export default function FileViewer({ file }: FileViewerProps) { }, []); const renderViewer = () => { + if (isAnyZipFile(file)) { + return ; + } + if (contentQuery.isLoading) { return (
diff --git a/frontend/src/components/ui/Table/TableCard.tsx b/frontend/src/components/ui/Table/TableCard.tsx index a367645a..cfbcd017 100644 --- a/frontend/src/components/ui/Table/TableCard.tsx +++ b/frontend/src/components/ui/Table/TableCard.tsx @@ -62,7 +62,7 @@ declare module '@tanstack/react-table' { data: CellContextMenuData ) => void; } - // eslint-disable-next-line @typescript-eslint/no-unused-vars + interface ColumnMeta { // Optional function to extract searchable values from a cell // Used by globalFilterFn to allow columns to define custom search behavior diff --git a/frontend/src/queries/fileContentQueries.ts b/frontend/src/queries/fileContentQueries.ts index 2e71fb7c..fb69b405 100644 --- a/frontend/src/queries/fileContentQueries.ts +++ b/frontend/src/queries/fileContentQueries.ts @@ -48,7 +48,8 @@ async function fetchFileWithTextDetection( export function useFileContentQuery( fspName: string | undefined, - filePath: string + filePath: string, + enabled: boolean = true ): UseQueryResult { return useQuery({ queryKey: fileContentQueryKeys.detail(fspName || '', filePath), @@ -58,7 +59,7 @@ export function useFileContentQuery( }); return content; }, - enabled: !!fspName && !!filePath, + enabled: enabled && !!fspName && !!filePath, retry: (failureCount, error) => { // Do not retry on permission errors if ( diff --git a/frontend/src/queries/ozxQueries.ts b/frontend/src/queries/ozxQueries.ts new file mode 100644 index 00000000..742a3a5b --- /dev/null +++ b/frontend/src/queries/ozxQueries.ts @@ -0,0 +1,542 @@ +/** + * OZX (Zipped OME-Zarr) query hooks and store implementation. + * + * RFC-9 Spec: https://ngff.openmicroscopy.org/rfc/9/index.html + */ + +import { useQuery, useInfiniteQuery } from '@tanstack/react-query'; +import type { + UseQueryResult, + UseInfiniteQueryResult, + InfiniteData +} from '@tanstack/react-query'; +import { default as log } from '@/logger'; +import { buildUrl, sendFetchRequest } from '@/utils'; +import { sendRequestAndThrowForNotOk } from './queryUtils'; + +/** + * Metadata response from the OZX metadata endpoint. + */ +export type OzxMetadataResponse = { + version: string | null; + json_first: boolean; + file_count: number; + is_zip64: boolean; +}; + +/** + * A file entry within a ZIP archive with full details. + */ +export type ZipFileEntry = { + filename: string; + compressed_size: number; + uncompressed_size: number; + compression_method: number; + is_directory: boolean; +}; + +/** + * Paginated response for file entries from a ZIP archive. + */ +export type ZipFileEntriesPage = { + entries: ZipFileEntry[]; + total_count: number; + offset: number; + limit: number; + has_more: boolean; +}; + +/** + * Build URL for accessing content within a ZIP file. + * + * @param fspName - The file share path name + * @param zipFilePath - Path to the ZIP file within the FSP + * @param internalPath - Path within the ZIP archive + * @returns Properly encoded URL for the ZIP content endpoint + */ +export function buildZipContentUrl( + fspName: string, + zipFilePath: string, + internalPath: string +): string { + // Build the path segment: fspName/zipFilePath + const pathSegment = `${fspName}/${zipFilePath}`; + return buildUrl('/api/zip-content/', pathSegment, { subpath: internalPath }); +} + +/** + * Build full URL for accessing content within a ZIP file. + * Returns absolute URL suitable for external use (e.g., zarrita stores). 
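+ *
+ * Example (illustrative values; exact query encoding depends on buildUrl):
+ *   getZipContentUrl('home', 'imgs/sample.ozx', 'zarr.json')
+ *   // => `${window.location.origin}/api/zip-content/home/imgs/sample.ozx?subpath=zarr.json`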
+ * + * @param fspName - The file share path name + * @param zipFilePath - Path to the ZIP file within the FSP + * @param internalPath - Path within the ZIP archive + * @returns Absolute URL + */ +export function getZipContentUrl( + fspName: string, + zipFilePath: string, + internalPath: string +): string { + const relativePath = buildZipContentUrl(fspName, zipFilePath, internalPath); + return new URL(relativePath, window.location.origin).href; +} + +/** + * Build URL for the OZX metadata endpoint. + */ +export function buildOzxMetadataUrl( + fspName: string, + ozxFilePath: string +): string { + const pathSegment = `${fspName}/${ozxFilePath}`; + return buildUrl('/api/ozx-metadata/', pathSegment, null); +} + +/** + * Build URL for listing files in a ZIP archive. + * + * @param fspName - The file share path name + * @param zipFilePath - Path to the ZIP file within the FSP + * @param prefix - Optional prefix to filter files + * @param details - If true, include full file entry details + * @param offset - Number of entries to skip (for pagination) + * @param limit - Maximum entries to return (for pagination) + */ +export function buildZipListUrl( + fspName: string, + zipFilePath: string, + prefix?: string, + details?: boolean, + offset?: number, + limit?: number +): string { + const pathSegment = `${fspName}/${zipFilePath}`; + const params: Record = {}; + if (prefix) { + params.prefix = prefix; + } + if (details) { + params.details = 'true'; + } + if (offset !== undefined) { + params.offset = String(offset); + } + if (limit !== undefined) { + params.limit = String(limit); + } + return buildUrl( + '/api/zip-list/', + pathSegment, + Object.keys(params).length > 0 ? params : null + ); +} + +/** + * Fetch OZX metadata from the backend. + */ +async function fetchOzxMetadata( + fspName: string, + ozxFilePath: string +): Promise { + const url = buildOzxMetadataUrl(fspName, ozxFilePath); + const response = (await sendRequestAndThrowForNotOk( + url, + 'GET' + )) as OzxMetadataResponse; + return response; +} + +/** + * Hook to fetch OZX archive metadata. + * + * @param fspName - The file share path name + * @param ozxFilePath - Path to the OZX file within the FSP + * @param enabled - Whether the query should be enabled + */ +export function useOzxMetadataQuery( + fspName: string | undefined, + ozxFilePath: string | undefined, + enabled: boolean = true +): UseQueryResult { + return useQuery({ + queryKey: ['ozx', 'metadata', fspName || '', ozxFilePath || ''], + queryFn: async () => { + if (!fspName || !ozxFilePath) { + throw new Error('fspName and ozxFilePath are required'); + } + return await fetchOzxMetadata(fspName, ozxFilePath); + }, + enabled: enabled && !!fspName && !!ozxFilePath, + staleTime: 5 * 60 * 1000 // 5 minutes - OZX metadata doesn't change often + }); +} + +/** + * Fetch list of files in a ZIP archive. + */ +async function fetchZipFileList( + fspName: string, + zipFilePath: string, + prefix?: string +): Promise { + const url = buildZipListUrl(fspName, zipFilePath, prefix); + const response = (await sendRequestAndThrowForNotOk(url, 'GET')) as { + files: string[]; + }; + return response.files; +} + +/** + * Fetch detailed file entries from a ZIP archive. 
+ */ +async function fetchZipFileEntries( + fspName: string, + zipFilePath: string, + prefix?: string +): Promise { + const url = buildZipListUrl(fspName, zipFilePath, prefix, true); + const response = (await sendRequestAndThrowForNotOk(url, 'GET')) as { + entries: ZipFileEntry[]; + }; + return response.entries; +} + +/** + * Hook to fetch list of files in a ZIP archive. + * + * @param fspName - The file share path name + * @param zipFilePath - Path to the ZIP file within the FSP + * @param prefix - Optional prefix to filter files + * @param enabled - Whether the query should be enabled + */ +export function useZipFileListQuery( + fspName: string | undefined, + zipFilePath: string | undefined, + prefix?: string, + enabled: boolean = true +): UseQueryResult { + return useQuery({ + queryKey: ['zip', 'files', fspName || '', zipFilePath || '', prefix || ''], + queryFn: async () => { + if (!fspName || !zipFilePath) { + throw new Error('fspName and zipFilePath are required'); + } + return await fetchZipFileList(fspName, zipFilePath, prefix); + }, + enabled: enabled && !!fspName && !!zipFilePath, + staleTime: 5 * 60 * 1000 + }); +} + +/** + * Hook to fetch detailed file entries from a ZIP archive. + * + * @param fspName - The file share path name + * @param zipFilePath - Path to the ZIP file within the FSP + * @param prefix - Optional prefix to filter files + * @param enabled - Whether the query should be enabled + */ +export function useZipFileEntriesQuery( + fspName: string | undefined, + zipFilePath: string | undefined, + prefix?: string, + enabled: boolean = true +): UseQueryResult { + return useQuery({ + queryKey: [ + 'zip', + 'entries', + fspName || '', + zipFilePath || '', + prefix || '' + ], + queryFn: async () => { + if (!fspName || !zipFilePath) { + throw new Error('fspName and zipFilePath are required'); + } + return await fetchZipFileEntries(fspName, zipFilePath, prefix); + }, + enabled: enabled && !!fspName && !!zipFilePath, + staleTime: 5 * 60 * 1000 + }); +} + +/** + * Fetch a page of detailed file entries from a ZIP archive. + */ +async function fetchZipFileEntriesPage( + fspName: string, + zipFilePath: string, + offset: number, + limit: number, + prefix?: string +): Promise { + const url = buildZipListUrl( + fspName, + zipFilePath, + prefix, + true, + offset, + limit + ); + const response = (await sendRequestAndThrowForNotOk( + url, + 'GET' + )) as ZipFileEntriesPage; + return response; +} + +/** + * Hook to fetch detailed file entries from a ZIP archive with infinite scrolling. + * Loads entries progressively as user requests more. 
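+ *
+ * Usage sketch (hook as defined below; values illustrative):
+ *   const q = useZipFileEntriesInfiniteQuery(fspName, 'imgs/sample.ozx', 100);
+ *   const entries = q.data?.pages.flatMap(p => p.entries) ?? [];
+ *   if (q.hasNextPage && !q.isFetchingNextPage) q.fetchNextPage();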
+ * + * @param fspName - The file share path name + * @param zipFilePath - Path to the ZIP file within the FSP + * @param pageSize - Number of entries per page (default 100) + * @param enabled - Whether the query should be enabled + */ +export function useZipFileEntriesInfiniteQuery( + fspName: string | undefined, + zipFilePath: string | undefined, + pageSize: number = 100, + enabled: boolean = true +): UseInfiniteQueryResult, Error> { + return useInfiniteQuery({ + queryKey: [ + 'zip', + 'entries-infinite', + fspName || '', + zipFilePath || '', + pageSize + ], + queryFn: async ({ pageParam = 0 }) => { + if (!fspName || !zipFilePath) { + throw new Error('fspName and zipFilePath are required'); + } + return await fetchZipFileEntriesPage( + fspName, + zipFilePath, + pageParam, + pageSize + ); + }, + initialPageParam: 0, + getNextPageParam: lastPage => { + if (lastPage.has_more) { + return lastPage.offset + lastPage.limit; + } + return undefined; + }, + enabled: enabled && !!fspName && !!zipFilePath, + staleTime: 5 * 60 * 1000 + }); +} + +/** + * Fetch content from within a ZIP file. + * Supports optional range requests. + */ +export async function fetchZipContent( + fspName: string, + zipFilePath: string, + internalPath: string, + options?: { + signal?: AbortSignal; + rangeStart?: number; + rangeEnd?: number; + } +): Promise { + const url = buildZipContentUrl(fspName, zipFilePath, internalPath); + + const headers: HeadersInit = {}; + if (options?.rangeStart !== undefined && options?.rangeEnd !== undefined) { + headers['Range'] = `bytes=${options.rangeStart}-${options.rangeEnd}`; + } + + const response = await fetch(url, { + method: 'GET', + credentials: 'include', + headers, + signal: options?.signal + }); + + if (!response.ok && response.status !== 206) { + throw new Error(`Failed to fetch ZIP content: ${response.status}`); + } + + return new Uint8Array(await response.arrayBuffer()); +} + +/** + * A store implementation compatible with zarrita that reads from OZX archives + * via the Fileglancer OZX API endpoints. + * + * This allows existing zarrita-based code to transparently read from OZX files. + */ +export class OzxFetchStore { + private fspName: string; + private ozxPath: string; + private baseUrl: string; + + /** + * Create a new OzxFetchStore. + * + * @param fspName - The file share path name + * @param ozxPath - Path to the OZX file within the FSP + */ + constructor(fspName: string, ozxPath: string) { + this.fspName = fspName; + this.ozxPath = ozxPath; + // Compute base URL for logging + this.baseUrl = getZipContentUrl(fspName, ozxPath, ''); + log.debug('Created OzxFetchStore for', this.baseUrl); + } + + /** + * Get full content of a file within the OZX archive. + * + * @param key - Path within the archive (e.g., "zarr.json", "0/c/0/0/0") + * @returns File content as Uint8Array, or undefined if not found + */ + async get(key: string): Promise { + try { + const url = buildZipContentUrl(this.fspName, this.ozxPath, key); + const response = await sendFetchRequest(url, 'GET'); + + if (!response.ok) { + if (response.status === 404) { + return undefined; + } + throw new Error(`Failed to fetch ${key}: ${response.status}`); + } + + return new Uint8Array(await response.arrayBuffer()); + } catch (error) { + log.debug(`OzxFetchStore.get(${key}) error:`, error); + return undefined; + } + } + + /** + * Get a byte range from a file within the OZX archive. + * This is the key method for efficient chunk access. 
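+ *
+ * Example (sketch): reading the first kilobyte of a chunk issues a
+ * `Range: bytes=0-1023` request:
+ *   const head = await store.getRange('0/c/0/0/0', 0, 1024);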
+ * + * @param key - Path within the archive + * @param offset - Starting byte offset + * @param length - Number of bytes to read + * @returns File content range as Uint8Array, or undefined if not found + */ + async getRange( + key: string, + offset: number, + length: number + ): Promise { + try { + const url = buildZipContentUrl(this.fspName, this.ozxPath, key); + const response = await fetch(url, { + method: 'GET', + credentials: 'include', + headers: { + Range: `bytes=${offset}-${offset + length - 1}` + } + }); + + if (!response.ok && response.status !== 206) { + if (response.status === 404) { + return undefined; + } + throw new Error( + `Failed to fetch range from ${key}: ${response.status}` + ); + } + + return new Uint8Array(await response.arrayBuffer()); + } catch (error) { + log.debug( + `OzxFetchStore.getRange(${key}, ${offset}, ${length}) error:`, + error + ); + return undefined; + } + } + + /** + * Check if a file exists in the OZX archive. + * + * @param key - Path within the archive + * @returns True if the file exists + */ + async has(key: string): Promise { + try { + const url = buildZipContentUrl(this.fspName, this.ozxPath, key); + const response = await fetch(url, { + method: 'HEAD', + credentials: 'include' + }); + return response.ok; + } catch { + return false; + } + } + + /** + * List files in the OZX archive with optional prefix filter. + * + * @param prefix - Optional prefix to filter files + * @returns Array of file paths + */ + async list(prefix?: string): Promise { + return await fetchZipFileList(this.fspName, this.ozxPath, prefix); + } + + /** + * Get the base URL for this store (for debugging/logging). + */ + getBaseUrl(): string { + return this.baseUrl; + } +} + +/** + * Hook to fetch content of a file within a ZIP archive. + */ +export function useZipFileContentQuery( + fspName: string | undefined, + zipFilePath: string | undefined, + internalPath: string | undefined, + enabled: boolean = true +): UseQueryResult { + return useQuery({ + queryKey: [ + 'zip', + 'content', + fspName || '', + zipFilePath || '', + internalPath || '' + ], + queryFn: async () => { + if (!fspName || !zipFilePath || !internalPath) { + throw new Error('fspName, zipFilePath, and internalPath are required'); + } + return await fetchZipContent(fspName, zipFilePath, internalPath); + }, + enabled: enabled && !!fspName && !!zipFilePath && !!internalPath, + staleTime: 5 * 60 * 1000 + }); +} + +/** + * Create an OzxFetchStore for the given file. + * This is a factory function for creating stores. 
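+ *
+ * Example (sketch, illustrative paths):
+ *   const store = createOzxStore('home', 'imgs/sample.ozx');
+ *   const zarrJson = await store.get('zarr.json'); // Uint8Array | undefined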
+ * + * @param fspName - The file share path name + * @param ozxFilePath - Path to the OZX file within the FSP + * @returns OzxFetchStore instance + */ +export function createOzxStore( + fspName: string, + ozxFilePath: string +): OzxFetchStore { + return new OzxFetchStore(fspName, ozxFilePath); +} diff --git a/frontend/src/queries/zarrQueries.ts b/frontend/src/queries/zarrQueries.ts index 67f71de2..a4d8960f 100644 --- a/frontend/src/queries/zarrQueries.ts +++ b/frontend/src/queries/zarrQueries.ts @@ -8,7 +8,13 @@ import { import type { Metadata } from '@/omezarr-helper'; import { getFileURL } from '@/utils'; import { fetchFileAsJson } from './queryUtils'; -import { FileOrFolder } from '@/shared.types'; +import { isOzxFile } from '@/utils/ozxDetection'; +import { + OzxFetchStore, + getZipContentUrl, + useZipFileListQuery +} from './ozxQueries'; +import type { FileOrFolder } from '@/shared.types'; export type OpenWithToolUrls = { copy: string; @@ -319,3 +325,198 @@ export function useOmeZarrThumbnailQuery( retry: false }); } + +// OZX (Zipped OME-Zarr) types +type OzxZarrMetadataQueryParams = { + fspName: string | undefined; + ozxFile: FileOrFolder | undefined | null; +}; + +type OzxZarrMetadataResult = { + metadata: ZarrMetadata; + omeZarrUrl: string | null; + availableVersions: ('v2' | 'v3')[]; + store: OzxFetchStore | null; +}; + +/** + * Detects if an OZX archive contains Zarr v3 data. + * RFC-9 OZX files are specifically for OME-Zarr v0.5 which requires Zarr v3. + * @param files - Array of file paths within the OZX archive + * @returns Array containing ['v3'] if zarr.json found, empty array otherwise + */ +export function detectOzxZarrVersions(files: string[]): 'v3'[] { + if (!files || files.length === 0) { + return []; + } + + // RFC-9 OZX is for OME-Zarr v0.5 which is Zarr v3 only + // Check for zarr.json at root or in subdirectories + const hasZarrJson = files.some( + f => f === 'zarr.json' || f.endsWith('/zarr.json') + ); + + return hasZarrJson ? ['v3'] : []; +} + +/** + * Fetches Zarr metadata from an OZX archive. + * Uses OzxFetchStore to read files from within the ZIP archive. 
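+ *
+ * Flow (summary of the implementation below): detect the Zarr version from
+ * the archive file list, read zarr.json through the store, then branch on
+ * node_type: arrays are opened directly, groups are checked for OME
+ * multiscales and optional labels.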
+ */ +async function fetchOzxZarrMetadata( + fspName: string, + ozxFilePath: string, + files: string[] +): Promise { + const store = new OzxFetchStore(fspName, ozxFilePath); + const availableVersions = detectOzxZarrVersions(files); + + // Get the base URL for OME-Zarr viewers (using empty internal path) + const baseUrl = getZipContentUrl(fspName, ozxFilePath, ''); + + // Default to Zarr v3 when available + if (availableVersions.includes('v3')) { + const zarrJsonContent = await store.get('zarr.json'); + if (!zarrJsonContent) { + log.warn('Could not read zarr.json from OZX'); + return { + metadata: null, + omeZarrUrl: null, + availableVersions, + store + }; + } + + const attrs = JSON.parse( + new TextDecoder().decode(zarrJsonContent) + ) as ZarrV3Attrs; + + if (attrs.node_type === 'array') { + log.info('Getting Zarr array from OZX with Zarr version 3'); + // For OZX arrays, we need a custom store - use baseUrl which routes through OZX API + const arr = await getZarrArray(baseUrl, 3); + const shapes = [arr.shape]; + return { + metadata: { + arr, + shapes, + multiscale: undefined, + scales: undefined, + omero: undefined, + labels: undefined, + zarrVersion: 3 + }, + omeZarrUrl: null, + availableVersions, + store + }; + } else if (attrs.node_type === 'group') { + if (attrs.attributes?.ome?.multiscales) { + log.info('Getting OME-Zarr metadata from OZX with Zarr version 3'); + // Use the OZX content URL as the base for OME-Zarr + const metadata = await getOmeZarrMetadata(baseUrl); + + // Check for labels + try { + const labelsContent = await store.get('labels/zarr.json'); + if (labelsContent) { + const labelsAttrs = JSON.parse( + new TextDecoder().decode(labelsContent) + ) as ZarrV3Attrs; + metadata.labels = labelsAttrs?.attributes?.ome?.labels; + if (metadata.labels) { + log.info('OME-Zarr Labels found in OZX: ', metadata.labels); + } + } + } catch (error) { + log.trace('Could not fetch labels attrs from OZX: ', error); + } + + return { + metadata, + omeZarrUrl: baseUrl, + availableVersions, + store + }; + } else { + log.info('OZX Zarrv3 group has no multiscales', attrs.attributes); + return { + metadata: null, + omeZarrUrl: null, + availableVersions, + store + }; + } + } else { + log.warn('Unknown OZX Zarrv3 node type', attrs.node_type); + return { + metadata: null, + omeZarrUrl: null, + availableVersions, + store + }; + } + } + + // RFC-9 OZX is for OME-Zarr v0.5 which requires Zarr v3 + // If we reach here, no valid zarr.json was found + log.debug('No Zarr v3 data detected in OZX (RFC-9 requires Zarr v3)'); + return { + metadata: null, + omeZarrUrl: null, + availableVersions: [], + store + }; +} + +/** + * Hook to fetch Zarr metadata from an OZX (Zipped OME-Zarr) file. + * This hook handles: + * 1. Listing files within the OZX archive + * 2. Detecting Zarr version + * 3. Reading metadata + * 4. 
Providing an OzxFetchStore for chunk access + */ +export function useOzxZarrMetadataQuery( + params: OzxZarrMetadataQueryParams +): UseQueryResult { + const { fspName, ozxFile } = params; + + // First, get the file list from the OZX + const fileListQuery = useZipFileListQuery( + fspName, + ozxFile?.path, + undefined, + !!fspName && !!ozxFile && isOzxFile(ozxFile) + ); + + return useQuery({ + queryKey: ['ozx', 'zarr', 'metadata', fspName || '', ozxFile?.path || ''], + queryFn: async () => { + if (!fspName || !ozxFile) { + throw new Error('fspName and ozxFile are required'); + } + if (!fileListQuery.data) { + throw new Error('File list not available'); + } + return await fetchOzxZarrMetadata( + fspName, + ozxFile.path, + fileListQuery.data + ); + }, + enabled: + !!fspName && + !!ozxFile && + isOzxFile(ozxFile) && + !!fileListQuery.data && + fileListQuery.data.length > 0 && + detectOzxZarrVersions(fileListQuery.data).length > 0, + staleTime: 5 * 60 * 1000, + retry: false + }); +} + +// Re-export OZX detection utilities for convenience +export { isOzxFile } from '@/utils/ozxDetection'; +export { OzxFetchStore, getZipContentUrl } from './ozxQueries'; diff --git a/frontend/src/utils/ozxDetection.ts b/frontend/src/utils/ozxDetection.ts new file mode 100644 index 00000000..dda44a21 --- /dev/null +++ b/frontend/src/utils/ozxDetection.ts @@ -0,0 +1,83 @@ +/** + * OZX (Zipped OME-Zarr) file detection utilities. + * + * RFC-9 Spec: https://ngff.openmicroscopy.org/rfc/9/index.html + */ + +import type { FileOrFolder } from '@/shared.types'; + +/** + * Check if a file is an OZX (Zipped OME-Zarr) file by extension. + * + * @param file - The file to check + * @returns True if the file has a .ozx extension + */ +export function isOzxFile(file: FileOrFolder): boolean { + return !file.is_dir && file.name.toLowerCase().endsWith('.ozx'); +} + +/** + * Check if a filename has the .ozx extension. + * + * @param filename - The filename to check + * @returns True if the filename ends with .ozx + */ +export function isOzxFilename(filename: string): boolean { + return filename.toLowerCase().endsWith('.ozx'); +} + +/** + * Check if a file is a regular ZIP file by extension. + * + * @param file - The file to check + * @returns True if the file has a .zip extension + */ +export function isZipFile(file: FileOrFolder): boolean { + return !file.is_dir && file.name.toLowerCase().endsWith('.zip'); +} + +/** + * Check if a file is either an OZX or a ZIP file. + * + * @param file - The file to check + * @returns True if the file is an OZX or a ZIP file + */ +export function isAnyZipFile(file: FileOrFolder): boolean { + return isOzxFile(file) || isZipFile(file); +} + +/** + * Check if a list of files contains any OZX files. + * + * @param files - Array of files to check + * @returns True if at least one file is an OZX file + */ +export function hasOzxFiles(files: FileOrFolder[]): boolean { + return files.some(isOzxFile); +} + +/** + * Get all OZX files from a list of files. + * + * @param files - Array of files to filter + * @returns Array containing only the OZX files + */ +export function getOzxFiles(files: FileOrFolder[]): FileOrFolder[] { + return files.filter(isOzxFile); +} + +/** + * Extract the path from a file for ZIP/OZX API calls. + * Removes leading slashes and normalizes the path. 
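+ *
+ * Example: a FileOrFolder whose path is '/groups/lab/sample.ozx' yields
+ * 'groups/lab/sample.ozx'.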
+ *
+ * @param file - The file to get the path from
+ * @returns Normalized path suitable for API calls
+ */
+export function getZipFilePath(file: FileOrFolder): string {
+  let path = file.path;
+  // Remove leading slash if present
+  if (path.startsWith('/')) {
+    path = path.slice(1);
+  }
+  return path;
+}
diff --git a/tests/test_ozxzip.py b/tests/test_ozxzip.py
new file mode 100644
index 00000000..fa135141
--- /dev/null
+++ b/tests/test_ozxzip.py
@@ -0,0 +1,573 @@
+"""Tests for the OZX ZIP reader module."""
+
+import os
+import io
+import struct
+import json
+import zlib
+import tempfile
+import pytest
+
+from fileglancer.ozxzip import (
+    OZXReader,
+    OZXMetadata,
+    OZXReaderError,
+    InvalidOZXError,
+    is_ozx_file,
+    is_json_metadata_file,
+)
+from fileglancer.zipread import (
+    ZipReader,
+    ZipEntry,
+    ZipReaderError,
+    InvalidZipError,
+    ZIP_LOCAL_HEADER_SIG,
+    ZIP_CD_SIG,
+    ZIP_EOCD_SIG,
+    ZIP_EOCD64_SIG,
+    ZIP_EOCD64_LOC_SIG,
+    COMPRESSION_STORED,
+    COMPRESSION_DEFLATE,
+)
+
+
+def create_zip_local_header(filename: bytes, data: bytes, compression: int = COMPRESSION_STORED) -> tuple:
+    """Create a ZIP local file header.
+
+    Returns a tuple of (header_and_data_bytes, crc, comp_size, uncomp_size).
+    """
+    crc = zlib.crc32(data) & 0xFFFFFFFF
+
+    if compression == COMPRESSION_DEFLATE:
+        compressor = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -zlib.MAX_WBITS)
+        compressed = compressor.compress(data) + compressor.flush()
+        comp_size = len(compressed)
+        data_to_write = compressed
+    else:
+        comp_size = len(data)
+        data_to_write = data
+
+    uncomp_size = len(data)
+
+    header = struct.pack(
+        '<4sHHHHHLLLHH',
+        ZIP_LOCAL_HEADER_SIG,
+        20,  # version needed
+        0,  # flags
+        compression,
+        0,  # mod time
+        0,  # mod date
+        crc,
+        comp_size,
+        uncomp_size,
+        len(filename),
+        0  # extra field length
+    )
+    return header + filename + data_to_write, crc, comp_size, uncomp_size
+
+
+def create_zip_cd_entry(filename: bytes, crc: int, comp_size: int, uncomp_size: int,
+                        local_offset: int, compression: int = COMPRESSION_STORED) -> bytes:
+    """Create a ZIP central directory entry."""
+    header = struct.pack(
+        '<4sHHHHHHLLLHHHHHLL',
+        ZIP_CD_SIG,
+        20,  # version made by
+        20,  # version needed
+        0,  # flags
+        compression,
+        0,  # mod time
+        0,  # mod date
+        crc,
+        comp_size,
+        uncomp_size,
+        len(filename),
+        0,  # extra field length
+        0,  # comment length
+        0,  # disk number start
+        0,  # internal attributes
+        0,  # external attributes
+        local_offset
+    )
+    return header + filename
+
+
+def create_zip_eocd(cd_entries: int, cd_size: int, cd_offset: int, comment: bytes = b'') -> bytes:
+    """Create a ZIP end of central directory record."""
+    return struct.pack(
+        '<4sHHHHLLH',
+        ZIP_EOCD_SIG,
+        0,  # disk number
+        0,  # disk with CD
+        cd_entries,
+        cd_entries,
+        cd_size,
+        cd_offset,
+        len(comment)
+    ) + comment
+
+
+def create_simple_ozx(files: dict, comment: str = None) -> bytes:
+    """Create a simple OZX (ZIP) file with the given files.
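+
+    Example (sketch):
+        zip_data = create_simple_ozx({'zarr.json': '{"zarr_format": 3}'})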
+ + Args: + files: Dictionary mapping filenames to file contents + comment: Optional ZIP comment (for OZX metadata) + + Returns: + bytes: Complete ZIP file data + """ + data = io.BytesIO() + cd_entries = [] + local_offsets = [] + + # Write local file headers and data + for filename, content in files.items(): + filename_bytes = filename.encode('utf-8') + offset = data.tell() + local_offsets.append(offset) + + local_data, crc, comp_size, uncomp_size = create_zip_local_header( + filename_bytes, content.encode('utf-8') if isinstance(content, str) else content + ) + data.write(local_data) + cd_entries.append((filename_bytes, crc, comp_size, uncomp_size, offset)) + + # Write central directory + cd_start = data.tell() + for filename_bytes, crc, comp_size, uncomp_size, offset in cd_entries: + cd_entry = create_zip_cd_entry(filename_bytes, crc, comp_size, uncomp_size, offset) + data.write(cd_entry) + cd_size = data.tell() - cd_start + + # Write EOCD + comment_bytes = comment.encode('utf-8') if comment else b'' + eocd = create_zip_eocd(len(files), cd_size, cd_start, comment_bytes) + data.write(eocd) + + return data.getvalue() + + +@pytest.fixture +def temp_ozx_file(): + """Create a temporary OZX file for testing.""" + files = { + 'zarr.json': '{"zarr_format": 3, "node_type": "group"}', + '0/zarr.json': '{"zarr_format": 3, "node_type": "array"}', + '0/c/0/0/0': b'\x00' * 100, # Binary chunk data + } + comment = json.dumps({ + "ome": { + "version": "0.5", + "zipFile": { + "centralDirectory": { + "jsonFirst": True + } + } + } + }) + + zip_data = create_simple_ozx(files, comment) + + with tempfile.NamedTemporaryFile(suffix='.ozx', delete=False) as f: + f.write(zip_data) + temp_path = f.name + + yield temp_path + + # Cleanup + if os.path.exists(temp_path): + os.unlink(temp_path) + + +@pytest.fixture +def temp_ozx_no_metadata(): + """Create a temporary OZX file without OME metadata.""" + files = { + 'data.txt': 'Hello, World!', + 'folder/nested.txt': 'Nested content', + } + + zip_data = create_simple_ozx(files) + + with tempfile.NamedTemporaryFile(suffix='.ozx', delete=False) as f: + f.write(zip_data) + temp_path = f.name + + yield temp_path + + if os.path.exists(temp_path): + os.unlink(temp_path) + + +class TestOZXReaderBasics: + """Test basic OZXReader functionality.""" + + def test_is_ozx_file(self): + """Test is_ozx_file helper function.""" + assert is_ozx_file('test.ozx') is True + assert is_ozx_file('test.OZX') is True + assert is_ozx_file('path/to/file.ozx') is True + assert is_ozx_file('test.zip') is False + assert is_ozx_file('test.zarr') is False + assert is_ozx_file('ozx') is False + + def test_open_close(self, temp_ozx_file): + """Test opening and closing OZX file.""" + reader = OZXReader(temp_ozx_file) + reader.open() + assert reader._fh is not None + assert reader.file_size > 0 + reader.close() + assert reader._fh is None + + def test_context_manager(self, temp_ozx_file): + """Test using OZXReader as context manager.""" + with OZXReader(temp_ozx_file) as reader: + assert reader._fh is not None + assert reader.file_size > 0 + assert reader._fh is None + + def test_file_not_found(self): + """Test opening non-existent file.""" + reader = OZXReader('/nonexistent/path/file.ozx') + with pytest.raises(FileNotFoundError): + reader.open() + + +class TestOZXMetadataParsing: + """Test OZX metadata parsing from ZIP comment.""" + + def test_parse_ome_metadata(self, temp_ozx_file): + """Test parsing OME metadata from ZIP comment.""" + with OZXReader(temp_ozx_file) as reader: + metadata = 
reader.get_metadata() + assert metadata is not None + assert metadata.version == "0.5" + assert metadata.json_first is True + + def test_no_metadata(self, temp_ozx_no_metadata): + """Test OZX file without OME metadata.""" + with OZXReader(temp_ozx_no_metadata) as reader: + metadata = reader.get_metadata() + assert metadata is None + + def test_invalid_json_comment(self): + """Test OZX file with invalid JSON comment.""" + files = {'test.txt': 'content'} + zip_data = create_simple_ozx(files, "not valid json") + + with tempfile.NamedTemporaryFile(suffix='.ozx', delete=False) as f: + f.write(zip_data) + temp_path = f.name + + try: + with OZXReader(temp_path) as reader: + metadata = reader.get_metadata() + assert metadata is None + finally: + os.unlink(temp_path) + + def test_json_without_ome_key(self): + """Test OZX file with JSON comment but no 'ome' key.""" + files = {'test.txt': 'content'} + comment = json.dumps({"other": "data"}) + zip_data = create_simple_ozx(files, comment) + + with tempfile.NamedTemporaryFile(suffix='.ozx', delete=False) as f: + f.write(zip_data) + temp_path = f.name + + try: + with OZXReader(temp_path) as reader: + metadata = reader.get_metadata() + assert metadata is None + finally: + os.unlink(temp_path) + + +class TestCentralDirectory: + """Test central directory parsing.""" + + def test_parse_central_directory(self, temp_ozx_file): + """Test parsing central directory.""" + with OZXReader(temp_ozx_file) as reader: + entries = reader.parse_central_directory() + assert 'zarr.json' in entries + assert '0/zarr.json' in entries + assert '0/c/0/0/0' in entries + + def test_entry_properties(self, temp_ozx_file): + """Test ZipEntry properties.""" + with OZXReader(temp_ozx_file) as reader: + entries = reader.parse_central_directory() + + json_entry = entries['zarr.json'] + assert is_json_metadata_file(json_entry.filename) is True + assert json_entry.is_directory is False + + chunk_entry = entries['0/c/0/0/0'] + assert is_json_metadata_file(chunk_entry.filename) is False + assert chunk_entry.uncompressed_size == 100 + + def test_json_first_optimization(self, temp_ozx_file): + """Test jsonFirst optimization stops at first non-JSON file.""" + with OZXReader(temp_ozx_file) as reader: + metadata = reader.get_metadata() + assert metadata.json_first is True + + # Parse with json_only=True + entries = reader.parse_central_directory(json_only=True) + + # Should have stopped before the binary chunk + # The exact behavior depends on the order in the central directory + assert 'zarr.json' in entries + + def test_list_files(self, temp_ozx_file): + """Test listing files in archive.""" + with OZXReader(temp_ozx_file) as reader: + files = reader.list_files() + assert 'zarr.json' in files + assert '0/zarr.json' in files + assert '0/c/0/0/0' in files + + def test_list_files_with_prefix(self, temp_ozx_file): + """Test listing files with prefix filter.""" + with OZXReader(temp_ozx_file) as reader: + files = reader.list_files(prefix='0/') + assert '0/zarr.json' in files + assert '0/c/0/0/0' in files + assert 'zarr.json' not in files + + def test_get_entry(self, temp_ozx_file): + """Test getting specific entry.""" + with OZXReader(temp_ozx_file) as reader: + entry = reader.get_entry('zarr.json') + assert entry is not None + assert entry.filename == 'zarr.json' + + missing = reader.get_entry('nonexistent.txt') + assert missing is None + + +class TestFileReading: + """Test reading files from archive.""" + + def test_read_file(self, temp_ozx_file): + """Test reading entire file.""" + with 
OZXReader(temp_ozx_file) as reader: + content = reader.read_file('zarr.json') + data = json.loads(content.decode('utf-8')) + assert data['zarr_format'] == 3 + assert data['node_type'] == 'group' + + def test_read_binary_file(self, temp_ozx_file): + """Test reading binary file.""" + with OZXReader(temp_ozx_file) as reader: + content = reader.read_file('0/c/0/0/0') + assert len(content) == 100 + assert content == b'\x00' * 100 + + def test_read_nonexistent_file(self, temp_ozx_file): + """Test reading nonexistent file.""" + with OZXReader(temp_ozx_file) as reader: + with pytest.raises(FileNotFoundError): + reader.read_file('nonexistent.txt') + + def test_stream_file(self, temp_ozx_file): + """Test streaming file content.""" + with OZXReader(temp_ozx_file) as reader: + chunks = list(reader.stream_file('zarr.json', buffer_size=10)) + content = b''.join(chunks) + data = json.loads(content.decode('utf-8')) + assert data['zarr_format'] == 3 + + +class TestRangeRequests: + """Test range request functionality.""" + + def test_stream_file_range(self, temp_ozx_no_metadata): + """Test streaming a range of file content.""" + with OZXReader(temp_ozx_no_metadata) as reader: + # "Hello, World!" = 13 bytes + # Get bytes 0-4 = "Hello" + content = b''.join(reader.stream_file_range('data.txt', 0, 4)) + assert content == b'Hello' + + def test_stream_file_range_middle(self, temp_ozx_no_metadata): + """Test streaming from middle of file.""" + with OZXReader(temp_ozx_no_metadata) as reader: + # Get bytes 7-11 = "World" + content = b''.join(reader.stream_file_range('data.txt', 7, 11)) + assert content == b'World' + + def test_stream_file_range_full(self, temp_ozx_no_metadata): + """Test streaming full file via range.""" + with OZXReader(temp_ozx_no_metadata) as reader: + content = b''.join(reader.stream_file_range('data.txt', 0, 12)) + assert content == b'Hello, World!' + + def test_stream_file_range_past_end(self, temp_ozx_no_metadata): + """Test range extending past end of file.""" + with OZXReader(temp_ozx_no_metadata) as reader: + # Request beyond file size - should clamp to file end + content = b''.join(reader.stream_file_range('data.txt', 7, 100)) + assert content == b'World!' + + def test_stream_file_range_invalid(self, temp_ozx_no_metadata): + """Test invalid range requests.""" + with OZXReader(temp_ozx_no_metadata) as reader: + with pytest.raises(ValueError): + list(reader.stream_file_range('data.txt', -1, 5)) + + with pytest.raises(ValueError): + list(reader.stream_file_range('data.txt', 10, 5)) + + +class TestCompression: + """Test handling of compressed files.""" + + def test_deflate_compression(self): + """Test reading DEFLATE compressed files.""" + # Create a ZIP with compressed content + content = b'Hello, this is some test content that should compress well. 
' * 10 + filename = b'compressed.txt' + + data = io.BytesIO() + + # Write local header with compression + local_data, crc, comp_size, uncomp_size = create_zip_local_header( + filename, content, compression=COMPRESSION_DEFLATE + ) + data.write(local_data) + + # Write central directory + cd_start = data.tell() + cd_entry = create_zip_cd_entry(filename, crc, comp_size, uncomp_size, 0, compression=COMPRESSION_DEFLATE) + data.write(cd_entry) + cd_size = data.tell() - cd_start + + # Write EOCD + eocd = create_zip_eocd(1, cd_size, cd_start) + data.write(eocd) + + with tempfile.NamedTemporaryFile(suffix='.ozx', delete=False) as f: + f.write(data.getvalue()) + temp_path = f.name + + try: + with OZXReader(temp_path) as reader: + read_content = reader.read_file('compressed.txt') + assert read_content == content + + # Test streaming too + streamed = b''.join(reader.stream_file('compressed.txt')) + assert streamed == content + finally: + os.unlink(temp_path) + + +class TestZipEntry: + """Test ZipEntry dataclass.""" + + def test_is_directory(self): + """Test is_directory property.""" + dir_entry = ZipEntry( + filename='folder/', + compressed_size=0, + uncompressed_size=0, + compression_method=0, + local_header_offset=0, + crc32=0 + ) + assert dir_entry.is_directory is True + + file_entry = ZipEntry( + filename='file.txt', + compressed_size=100, + uncompressed_size=100, + compression_method=0, + local_header_offset=0, + crc32=123456 + ) + assert file_entry.is_directory is False + + def test_is_json_metadata_file(self): + """Test is_json_metadata_file function.""" + test_cases = [ + ('zarr.json', True), + ('.zarray', True), + ('.zattrs', True), + ('.zgroup', True), + ('data/zarr.JSON', True), # case insensitive + ('data.txt', False), + ('image.png', False), + ('c/0/0/0', False), + ] + + for filename, expected in test_cases: + assert is_json_metadata_file(filename) is expected, f"Failed for {filename}" + + +class TestOZXMetadata: + """Test OZXMetadata dataclass.""" + + def test_metadata_creation(self): + """Test creating OZXMetadata.""" + metadata = OZXMetadata(version="0.5", json_first=True) + assert metadata.version == "0.5" + assert metadata.json_first is True + + def test_metadata_defaults(self): + """Test default values.""" + metadata = OZXMetadata(version="0.4") + assert metadata.json_first is False + assert metadata.raw_comment is None + + +class TestEdgeCases: + """Test edge cases and error handling.""" + + def test_empty_archive(self): + """Test handling of empty archive.""" + files = {} + zip_data = create_simple_ozx(files) + + with tempfile.NamedTemporaryFile(suffix='.ozx', delete=False) as f: + f.write(zip_data) + temp_path = f.name + + try: + with OZXReader(temp_path) as reader: + entries = reader.parse_central_directory() + assert len(entries) == 0 + files = reader.list_files() + assert len(files) == 0 + finally: + os.unlink(temp_path) + + def test_reader_not_opened(self): + """Test error when reader not opened.""" + reader = OZXReader('/some/path.ozx') + with pytest.raises(ZipReaderError): + reader.parse_central_directory() + + def test_unicode_filenames(self): + """Test handling of Unicode filenames.""" + files = { + 'data/日本語.txt': 'Japanese text', + 'data/emoji_🎉.txt': 'Party!', + } + zip_data = create_simple_ozx(files) + + with tempfile.NamedTemporaryFile(suffix='.ozx', delete=False) as f: + f.write(zip_data) + temp_path = f.name + + try: + with OZXReader(temp_path) as reader: + entries = reader.parse_central_directory() + assert 'data/日本語.txt' in entries + assert 'data/emoji_🎉.txt' in 
entries
+
+                content = reader.read_file('data/日本語.txt')
+                assert content.decode('utf-8') == 'Japanese text'
+        finally:
+            os.unlink(temp_path)
diff --git a/tests/test_zipread.py b/tests/test_zipread.py
new file mode 100644
index 00000000..1e8c8fe7
--- /dev/null
+++ b/tests/test_zipread.py
@@ -0,0 +1,451 @@
+"""Tests for the generic ZIP reader module."""
+
+import os
+import io
+import struct
+import zlib
+import tempfile
+import pytest
+
+from fileglancer.zipread import (
+    ZipReader,
+    ZipEntry,
+    ZipReaderError,
+    InvalidZipError,
+    ZIP_LOCAL_HEADER_SIG,
+    ZIP_CD_SIG,
+    ZIP_EOCD_SIG,
+    COMPRESSION_STORED,
+    COMPRESSION_DEFLATE,
+)
+
+
+def create_zip_local_header(filename: bytes, data: bytes, compression: int = COMPRESSION_STORED) -> tuple:
+    """Create a ZIP local file header.
+
+    Returns a tuple of (header_and_data_bytes, crc, comp_size, uncomp_size).
+    """
+    crc = zlib.crc32(data) & 0xFFFFFFFF
+
+    if compression == COMPRESSION_DEFLATE:
+        compressor = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -zlib.MAX_WBITS)
+        compressed = compressor.compress(data) + compressor.flush()
+        comp_size = len(compressed)
+        data_to_write = compressed
+    else:
+        comp_size = len(data)
+        data_to_write = data
+
+    uncomp_size = len(data)
+
+    header = struct.pack(
+        '<4sHHHHHLLLHH',
+        ZIP_LOCAL_HEADER_SIG,
+        20,  # version needed
+        0,  # flags
+        compression,
+        0,  # mod time
+        0,  # mod date
+        crc,
+        comp_size,
+        uncomp_size,
+        len(filename),
+        0  # extra field length
+    )
+    return header + filename + data_to_write, crc, comp_size, uncomp_size
+
+
+def create_zip_cd_entry(filename: bytes, crc: int, comp_size: int, uncomp_size: int,
+                        local_offset: int, compression: int = COMPRESSION_STORED) -> bytes:
+    """Create a ZIP central directory entry."""
+    header = struct.pack(
+        '<4sHHHHHHLLLHHHHHLL',
+        ZIP_CD_SIG,
+        20,  # version made by
+        20,  # version needed
+        0,  # flags
+        compression,
+        0,  # mod time
+        0,  # mod date
+        crc,
+        comp_size,
+        uncomp_size,
+        len(filename),
+        0,  # extra field length
+        0,  # comment length
+        0,  # disk number start
+        0,  # internal attributes
+        0,  # external attributes
+        local_offset
+    )
+    return header + filename
+
+
+def create_zip_eocd(cd_entries: int, cd_size: int, cd_offset: int, comment: bytes = b'') -> bytes:
+    """Create a ZIP end of central directory record."""
+    return struct.pack(
+        '<4sHHHHLLH',
+        ZIP_EOCD_SIG,
+        0,  # disk number
+        0,  # disk with CD
+        cd_entries,
+        cd_entries,
+        cd_size,
+        cd_offset,
+        len(comment)
+    ) + comment
+
+
+def create_simple_zip(files: dict, comment: str = None) -> bytes:
+    """Create a simple ZIP file with the given files.
+ + Args: + files: Dictionary mapping filenames to file contents + comment: Optional ZIP comment + + Returns: + bytes: Complete ZIP file data + """ + data = io.BytesIO() + cd_entries = [] + + # Write local file headers and data + for filename, content in files.items(): + filename_bytes = filename.encode('utf-8') + offset = data.tell() + + content_bytes = content.encode('utf-8') if isinstance(content, str) else content + local_data, crc, comp_size, uncomp_size = create_zip_local_header( + filename_bytes, content_bytes + ) + data.write(local_data) + cd_entries.append((filename_bytes, crc, comp_size, uncomp_size, offset)) + + # Write central directory + cd_start = data.tell() + for filename_bytes, crc, comp_size, uncomp_size, offset in cd_entries: + cd_entry = create_zip_cd_entry(filename_bytes, crc, comp_size, uncomp_size, offset) + data.write(cd_entry) + cd_size = data.tell() - cd_start + + # Write EOCD + comment_bytes = comment.encode('utf-8') if comment else b'' + eocd = create_zip_eocd(len(files), cd_size, cd_start, comment_bytes) + data.write(eocd) + + return data.getvalue() + + +@pytest.fixture +def temp_zip_file(): + """Create a temporary ZIP file for testing.""" + files = { + 'readme.txt': 'This is a test file.', + 'data/file1.txt': 'File 1 content', + 'data/file2.txt': 'File 2 content', + } + comment = "Test ZIP archive" + + zip_data = create_simple_zip(files, comment) + + with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as f: + f.write(zip_data) + temp_path = f.name + + yield temp_path + + if os.path.exists(temp_path): + os.unlink(temp_path) + + +class TestZipReaderBasics: + """Test basic ZipReader functionality.""" + + def test_open_close(self, temp_zip_file): + """Test opening and closing ZIP file.""" + reader = ZipReader(temp_zip_file) + reader.open() + assert reader._fh is not None + assert reader.file_size > 0 + reader.close() + assert reader._fh is None + + def test_context_manager(self, temp_zip_file): + """Test using ZipReader as context manager.""" + with ZipReader(temp_zip_file) as reader: + assert reader._fh is not None + assert reader.file_size > 0 + assert reader._fh is None + + def test_file_not_found(self): + """Test opening non-existent file.""" + reader = ZipReader('/nonexistent/path/file.zip') + with pytest.raises(FileNotFoundError): + reader.open() + + def test_comment_property(self, temp_zip_file): + """Test accessing ZIP comment.""" + with ZipReader(temp_zip_file) as reader: + assert reader.comment == "Test ZIP archive" + + +class TestCentralDirectory: + """Test central directory parsing.""" + + def test_parse_central_directory(self, temp_zip_file): + """Test parsing central directory.""" + with ZipReader(temp_zip_file) as reader: + entries = reader.parse_central_directory() + assert 'readme.txt' in entries + assert 'data/file1.txt' in entries + assert 'data/file2.txt' in entries + + def test_entry_properties(self, temp_zip_file): + """Test ZipEntry properties.""" + with ZipReader(temp_zip_file) as reader: + entries = reader.parse_central_directory() + + readme = entries['readme.txt'] + assert readme.is_directory is False + assert readme.uncompressed_size == len('This is a test file.') + + def test_list_files(self, temp_zip_file): + """Test listing files in archive.""" + with ZipReader(temp_zip_file) as reader: + files = reader.list_files() + assert 'readme.txt' in files + assert 'data/file1.txt' in files + assert 'data/file2.txt' in files + + def test_list_files_with_prefix(self, temp_zip_file): + """Test listing files with prefix filter.""" + 
with ZipReader(temp_zip_file) as reader: + files = reader.list_files(prefix='data/') + assert 'data/file1.txt' in files + assert 'data/file2.txt' in files + assert 'readme.txt' not in files + + def test_get_entry(self, temp_zip_file): + """Test getting specific entry.""" + with ZipReader(temp_zip_file) as reader: + entry = reader.get_entry('readme.txt') + assert entry is not None + assert entry.filename == 'readme.txt' + + missing = reader.get_entry('nonexistent.txt') + assert missing is None + + def test_stop_condition(self, temp_zip_file): + """Test parsing with stop condition.""" + with ZipReader(temp_zip_file) as reader: + # Stop after first file using index + # Note: stop condition is checked AFTER adding the entry, + # so returning True at index 0 stops after the first entry + def stop_after_one(entry, index): + return index >= 0 # Stop after processing first entry (index 0) + + entries = reader.parse_central_directory(stop_condition=stop_after_one) + # Should have stopped after first entry + assert len(entries) == 1 + + def test_stop_condition_with_index(self, temp_zip_file): + """Test that stop condition receives correct index.""" + with ZipReader(temp_zip_file) as reader: + indices_seen = [] + + def track_indices(entry, index): + indices_seen.append(index) + return False # Don't stop + + reader.parse_central_directory(stop_condition=track_indices) + assert indices_seen == [0, 1, 2] # 3 files in temp_zip_file + + def test_max_new_entries(self, temp_zip_file): + """Test limiting entries with max_new_entries parameter.""" + with ZipReader(temp_zip_file) as reader: + entries = reader.parse_central_directory(max_new_entries=2) + assert len(entries) == 2 + + def test_max_new_entries_zero(self, temp_zip_file): + """Test max_new_entries=0 returns no entries.""" + with ZipReader(temp_zip_file) as reader: + entries = reader.parse_central_directory(max_new_entries=0) + assert len(entries) == 0 + + def test_max_new_entries_exceeds_total(self, temp_zip_file): + """Test max_new_entries larger than total entries.""" + with ZipReader(temp_zip_file) as reader: + # Archive has 3 files, requesting 100 + entries = reader.parse_central_directory(max_new_entries=100) + assert len(entries) == 3 + + def test_max_new_entries_with_stop_condition(self, temp_zip_file): + """Test that stop_condition and max_new_entries work together.""" + with ZipReader(temp_zip_file) as reader: + # Stop condition would stop at index 2, but max_new_entries=1 should stop first + def stop_at_two(entry, index): + return index >= 2 + + entries = reader.parse_central_directory( + stop_condition=stop_at_two, + max_new_entries=1 + ) + assert len(entries) == 1 + + +class TestFileReading: + """Test reading files from archive.""" + + def test_read_file(self, temp_zip_file): + """Test reading entire file.""" + with ZipReader(temp_zip_file) as reader: + content = reader.read_file('readme.txt') + assert content == b'This is a test file.' + + def test_read_nonexistent_file(self, temp_zip_file): + """Test reading nonexistent file.""" + with ZipReader(temp_zip_file) as reader: + with pytest.raises(FileNotFoundError): + reader.read_file('nonexistent.txt') + + def test_stream_file(self, temp_zip_file): + """Test streaming file content.""" + with ZipReader(temp_zip_file) as reader: + chunks = list(reader.stream_file('readme.txt', buffer_size=5)) + content = b''.join(chunks) + assert content == b'This is a test file.' 
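+
+    def test_stream_file_to_buffer(self, temp_zip_file):
+        """Usage sketch (added example): stream a member into an in-memory
+        buffer in fixed-size chunks, using only the ZipReader API exercised
+        in the tests above."""
+        with ZipReader(temp_zip_file) as reader:
+            buf = io.BytesIO()
+            for chunk in reader.stream_file('data/file1.txt', buffer_size=4):
+                buf.write(chunk)
+            assert buf.getvalue() == b'File 1 content'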
+ + +class TestRangeRequests: + """Test range request functionality.""" + + def test_stream_file_range(self, temp_zip_file): + """Test streaming a range of file content.""" + with ZipReader(temp_zip_file) as reader: + # "This is a test file." - get bytes 0-3 = "This" + content = b''.join(reader.stream_file_range('readme.txt', 0, 3)) + assert content == b'This' + + def test_stream_file_range_middle(self, temp_zip_file): + """Test streaming from middle of file.""" + with ZipReader(temp_zip_file) as reader: + # Get "test" + content = b''.join(reader.stream_file_range('readme.txt', 10, 13)) + assert content == b'test' + + def test_stream_file_range_invalid(self, temp_zip_file): + """Test invalid range requests.""" + with ZipReader(temp_zip_file) as reader: + with pytest.raises(ValueError): + list(reader.stream_file_range('readme.txt', -1, 5)) + + with pytest.raises(ValueError): + list(reader.stream_file_range('readme.txt', 10, 5)) + + +class TestCompression: + """Test handling of compressed files.""" + + def test_deflate_compression(self): + """Test reading DEFLATE compressed files.""" + content = b'Hello, this is some test content that should compress well. ' * 10 + filename = b'compressed.txt' + + data = io.BytesIO() + + # Write local header with compression + local_data, crc, comp_size, uncomp_size = create_zip_local_header( + filename, content, compression=COMPRESSION_DEFLATE + ) + data.write(local_data) + + # Write central directory + cd_start = data.tell() + cd_entry = create_zip_cd_entry(filename, crc, comp_size, uncomp_size, 0, compression=COMPRESSION_DEFLATE) + data.write(cd_entry) + cd_size = data.tell() - cd_start + + # Write EOCD + eocd = create_zip_eocd(1, cd_size, cd_start) + data.write(eocd) + + with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as f: + f.write(data.getvalue()) + temp_path = f.name + + try: + with ZipReader(temp_path) as reader: + read_content = reader.read_file('compressed.txt') + assert read_content == content + + # Test streaming too + streamed = b''.join(reader.stream_file('compressed.txt')) + assert streamed == content + finally: + os.unlink(temp_path) + + +class TestZipEntry: + """Test ZipEntry dataclass.""" + + def test_is_directory(self): + """Test is_directory property.""" + dir_entry = ZipEntry( + filename='folder/', + compressed_size=0, + uncompressed_size=0, + compression_method=0, + local_header_offset=0, + crc32=0 + ) + assert dir_entry.is_directory is True + + file_entry = ZipEntry( + filename='file.txt', + compressed_size=100, + uncompressed_size=100, + compression_method=0, + local_header_offset=0, + crc32=123456 + ) + assert file_entry.is_directory is False + + +class TestEdgeCases: + """Test edge cases and error handling.""" + + def test_empty_archive(self): + """Test handling of empty archive.""" + files = {} + zip_data = create_simple_zip(files) + + with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as f: + f.write(zip_data) + temp_path = f.name + + try: + with ZipReader(temp_path) as reader: + entries = reader.parse_central_directory() + assert len(entries) == 0 + files = reader.list_files() + assert len(files) == 0 + finally: + os.unlink(temp_path) + + def test_reader_not_opened(self): + """Test error when reader not opened.""" + reader = ZipReader('/some/path.zip') + with pytest.raises(ZipReaderError): + reader.parse_central_directory() + + def test_entries_property(self, temp_zip_file): + """Test entries property.""" + with ZipReader(temp_zip_file) as reader: + # Before parsing + assert reader.entries == {} + + # 
After parsing + reader.parse_central_directory() + assert len(reader.entries) == 3 + + def test_cd_entries_count(self, temp_zip_file): + """Test cd_entries_count property.""" + with ZipReader(temp_zip_file) as reader: + assert reader.cd_entries_count == 3 diff --git a/tests/test_zipread_resume.py b/tests/test_zipread_resume.py new file mode 100644 index 00000000..e324eeb0 --- /dev/null +++ b/tests/test_zipread_resume.py @@ -0,0 +1,71 @@ +import zipfile +import pytest +from fileglancer.zipread import ZipReader + +@pytest.fixture +def sample_zip(tmp_path): + zip_path = tmp_path / "test.zip" + with zipfile.ZipFile(zip_path, 'w') as zf: + zf.writestr("file1.txt", "content1") + zf.writestr("file2.txt", "content2") + zf.writestr("file3.txt", "content3") + zf.writestr("file4.txt", "content4") + return str(zip_path) + +def test_resume_parsing(sample_zip): + with ZipReader(sample_zip) as reader: + # Read first 2 entries + entries = reader.parse_central_directory(max_new_entries=2) + assert len(entries) == 2 + assert "file1.txt" in entries + assert "file2.txt" in entries + assert "file3.txt" not in entries + + # Read next 1 entry + entries = reader.parse_central_directory(max_new_entries=1) + assert len(entries) == 3 + assert "file3.txt" in entries + + # Read rest + entries = reader.parse_central_directory() + assert len(entries) == 4 + assert "file4.txt" in entries + + # Check parsed flag + assert reader._cd_parsed + +def test_lazy_get_entry(sample_zip): + with ZipReader(sample_zip) as reader: + # We haven't parsed anything yet + assert len(reader._entries) == 0 + + # Request file3.txt + entry = reader.get_entry("file3.txt") + assert entry is not None + assert entry.filename == "file3.txt" + + # It should have parsed at least 3 entries + assert len(reader._entries) >= 3 + assert "file1.txt" in reader._entries + + # file4 shouldn't be parsed yet + assert "file4.txt" not in reader._entries + assert not reader._cd_parsed + +def test_resume_with_stop_condition(sample_zip): + with ZipReader(sample_zip) as reader: + # Stop at file2 + def stop_at_2(entry, idx): + return entry.filename == "file2.txt" + + entries = reader.parse_central_directory(stop_condition=stop_at_2) + assert "file2.txt" in entries + assert "file3.txt" not in entries + + # Resume and stop at file3 + def stop_at_3(entry, idx): + return entry.filename == "file3.txt" + + entries = reader.parse_central_directory(stop_condition=stop_at_3) + assert "file3.txt" in entries + assert "file4.txt" not in entries
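+
+# A sketch of driving the resume mechanism as a paginator, mirroring the
+# offset/limit pagination of the /api/zip-list endpoint: ask for two more
+# entries at a time until nothing new is parsed. It relies only on the
+# parse_central_directory semantics demonstrated above.
+def test_paginated_listing(sample_zip):
+    with ZipReader(sample_zip) as reader:
+        seen = 0
+        while True:
+            entries = reader.parse_central_directory(max_new_entries=2)
+            if len(entries) == seen:
+                break  # no new entries parsed; central directory exhausted
+            seen = len(entries)
+        assert seen == 4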