diff --git a/.github/workflows/build-push.yaml b/.github/workflows/build-push.yaml index ab6a122f..a2c1ed8c 100644 --- a/.github/workflows/build-push.yaml +++ b/.github/workflows/build-push.yaml @@ -23,7 +23,7 @@ jobs: "examples/reducestream/counter", "examples/reducestream/sum", "examples/sideinput/simple_sideinput", "examples/sideinput/simple_sideinput/udf", "examples/sink/async_log", "examples/sink/log", "examples/source/simple_source", "examples/sourcetransform/event_time_filter", - "examples/batchmap/flatmap" + "examples/batchmap/flatmap", "examples/accumulator/streamsorter" ] steps: diff --git a/Makefile b/Makefile index 82696bbb..0e5334f2 100644 --- a/Makefile +++ b/Makefile @@ -26,13 +26,14 @@ setup: poetry install --with dev --no-root proto: - python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/sinker -I=pynumaflow/proto/sinker --python_out=pynumaflow/proto/sinker --grpc_python_out=pynumaflow/proto/sinker pynumaflow/proto/sinker/*.proto - python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/mapper -I=pynumaflow/proto/mapper --python_out=pynumaflow/proto/mapper --grpc_python_out=pynumaflow/proto/mapper pynumaflow/proto/mapper/*.proto - python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/reducer -I=pynumaflow/proto/reducer --python_out=pynumaflow/proto/reducer --grpc_python_out=pynumaflow/proto/reducer pynumaflow/proto/reducer/*.proto - python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/sourcetransformer -I=pynumaflow/proto/sourcetransformer --python_out=pynumaflow/proto/sourcetransformer --grpc_python_out=pynumaflow/proto/sourcetransformer pynumaflow/proto/sourcetransformer/*.proto - python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/sideinput -I=pynumaflow/proto/sideinput --python_out=pynumaflow/proto/sideinput --grpc_python_out=pynumaflow/proto/sideinput pynumaflow/proto/sideinput/*.proto - python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/sourcer -I=pynumaflow/proto/sourcer --python_out=pynumaflow/proto/sourcer --grpc_python_out=pynumaflow/proto/sourcer pynumaflow/proto/sourcer/*.proto - python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/accumulator -I=pynumaflow/proto/accumulator --python_out=pynumaflow/proto/accumulator --grpc_python_out=pynumaflow/proto/accumulator pynumaflow/proto/accumulator/*.proto + poetry run python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/sinker -I=pynumaflow/proto/sinker --python_out=pynumaflow/proto/sinker --grpc_python_out=pynumaflow/proto/sinker pynumaflow/proto/sinker/*.proto + poetry run python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/mapper -I=pynumaflow/proto/mapper --python_out=pynumaflow/proto/mapper --grpc_python_out=pynumaflow/proto/mapper pynumaflow/proto/mapper/*.proto + poetry run python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/reducer -I=pynumaflow/proto/reducer --python_out=pynumaflow/proto/reducer --grpc_python_out=pynumaflow/proto/reducer pynumaflow/proto/reducer/*.proto + poetry run python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/sourcetransformer -I=pynumaflow/proto/sourcetransformer --python_out=pynumaflow/proto/sourcetransformer --grpc_python_out=pynumaflow/proto/sourcetransformer pynumaflow/proto/sourcetransformer/*.proto + poetry run python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/sideinput -I=pynumaflow/proto/sideinput --python_out=pynumaflow/proto/sideinput --grpc_python_out=pynumaflow/proto/sideinput pynumaflow/proto/sideinput/*.proto + poetry run python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/sourcer -I=pynumaflow/proto/sourcer 
--python_out=pynumaflow/proto/sourcer --grpc_python_out=pynumaflow/proto/sourcer pynumaflow/proto/sourcer/*.proto
+	poetry run python3 -m grpc_tools.protoc --pyi_out=pynumaflow/proto/accumulator -I=pynumaflow/proto/accumulator --python_out=pynumaflow/proto/accumulator --grpc_python_out=pynumaflow/proto/accumulator pynumaflow/proto/accumulator/*.proto
-	sed -i '' 's/^\(import.*_pb2\)/from . \1/' pynumaflow/proto/*/*.py
+	sed -i.bak -e 's/^\(import.*_pb2\)/from . \1/' pynumaflow/proto/*/*.py
+	rm pynumaflow/proto/*/*.py.bak
diff --git a/docs/DOCKER_OPTIMIZATION.md b/docs/DOCKER_OPTIMIZATION.md
new file mode 100644
index 00000000..cc36410f
--- /dev/null
+++ b/docs/DOCKER_OPTIMIZATION.md
@@ -0,0 +1,229 @@
+# Docker Build Optimization for Numaflow Python UDFs
+
+## Overview
+
+This document outlines strategies for reducing Docker build times for Numaflow Python UDFs from over 2 minutes to under 30 seconds on subsequent builds.
+
+## Current Issues
+
+1. **Redundant dependency installation**: each UDF rebuilds the entire pynumaflow package
+2. **No layer caching**: dependencies are reinstalled on every build
+3. **Copying the entire project**: `COPY ./ ./` copies everything, including unnecessary files
+4. **No shared base layers**: each UDF builds its own base environment
+
+## Optimization Strategy: Three-Stage Approach
+
+As suggested by @kohlisid, we implement a three-stage build approach:
+
+### Stage 1: Base Layer
+- Common Python environment and tools
+- System dependencies (curl, wget, build-essential, git)
+- Poetry installation
+- dumb-init binary
+
+### Stage 2: Environment Setup
+- pynumaflow package installation
+- Shared virtual environment creation
+- This layer is cached unless `pyproject.toml` or `poetry.lock` changes
+
+### Stage 3: Builder
+- UDF-specific code and dependencies
+- Reuses the pynumaflow installation from Stage 2
+- Minimal additional dependencies
+
+## Implementation Options
+
+### Option 1: Optimized Multi-Stage Build (Recommended)
+
+**File**: `examples/map/even_odd/Dockerfile.optimized`
+
+**Benefits**:
+- Better layer caching
+- Reduced build time by ~60-70%
+- No dependency on a pre-built base image
+
+**Usage**:
+```bash
+cd examples/map/even_odd
+make -f Makefile.optimized image
+```
+
+### Option 2: Shared Base Image (Fastest)
+
+**Files**:
+- `Dockerfile.base` (shared base image)
+- `examples/map/even_odd/Dockerfile.shared-base` (UDF-specific)
+
+**Benefits**:
+- Maximum caching efficiency
+- Build time reduced by ~80-90% for subsequent builds
+- Well suited to CI/CD pipelines
+
+**Usage**:
+```bash
+# Build base image once
+docker build -f Dockerfile.base -t numaflow-python-base .
+
+# Build UDF images (very fast)
+cd examples/map/even_odd
+make -f Makefile.optimized image-fast
+```
+
+## Performance Comparison
+
+| Approach | First Build | Subsequent Builds | Cache Efficiency |
+|----------|-------------|-------------------|------------------|
+| Current | ~2-3 minutes | ~2-3 minutes | Poor |
+| Optimized Multi-Stage | ~2-3 minutes | ~45-60 seconds | Good |
+| Shared Base Image | ~2-3 minutes | ~15-30 seconds | Excellent |
+
+## Implementation Steps
+
+### 1. Build Shared Base Image (One-time setup)
+
+```bash
+# From project root
+docker build -f Dockerfile.base -t numaflow-python-base .
+```
+
+### 2. Update UDF Dockerfiles
+
+Replace the current Dockerfile with the optimized version:
+
+```bash
+# For each UDF directory
+cp Dockerfile.optimized Dockerfile
+# or
+cp Dockerfile.shared-base Dockerfile
+```
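+
+`Dockerfile.base` itself is not part of this change, so the sketch below is only an illustration of what it might contain; it mirrors the `base-builder` stage used by the per-example Dockerfiles, and the file name and exact contents are assumptions:
+
+```dockerfile
+# Hypothetical Dockerfile.base: shared Stage 1 + Stage 2 layers for all Python UDFs
+FROM python:3.10-slim-bullseye
+
+ENV PYSETUP_PATH="/opt/pysetup"
+WORKDIR $PYSETUP_PATH
+
+# Stage 1: system dependencies, dumb-init, and poetry
+RUN apt-get update && apt-get install --no-install-recommends -y \
+    curl wget build-essential git \
+    && apt-get clean && rm -rf /var/lib/apt/lists/* \
+    && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \
+    && chmod +x /dumb-init \
+    && pip install poetry
+
+# Stage 2: pre-install pynumaflow so every UDF build can reuse this cached layer
+COPY pyproject.toml poetry.lock README.md ./
+COPY pynumaflow/ ./pynumaflow/
+RUN poetry install --no-root --no-interaction
+```
+
+### 3. Update Makefiles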
+
+Use the optimized Makefile:
+
+```bash
+# For each UDF directory
+cp Makefile.optimized Makefile
+```
+
+### 4. CI/CD Integration
+
+For CI/CD pipelines, add the base image build step:
+
+```yaml
+# Example GitHub Actions step
+- name: Build base image
+  run: docker build -f Dockerfile.base -t numaflow-python-base .
+
+- name: Build UDF images
+  run: |
+    cd examples/map/even_odd
+    make image-fast
+```
+
+## Advanced Optimizations
+
+### 1. Dependency Caching
+
+The optimized Dockerfiles implement smart dependency caching:
+- `pyproject.toml` and `poetry.lock` are copied first
+- pynumaflow installation is cached separately
+- UDF-specific dependencies are installed last
+
+### 2. Layer Optimization
+
+- Minimal system dependencies in runtime image
+- Separate build and runtime stages
+- Efficient file copying with specific paths
+
+### 3. Build Context Optimization
+
+- Copy only necessary files
+- Use `.dockerignore` to exclude unnecessary files
+- Minimize build context size
+
+## Migration Guide
+
+### For Existing UDFs
+
+1. **Backup current Dockerfile**:
+   ```bash
+   cp Dockerfile Dockerfile.backup
+   ```
+
+2. **Choose optimization approach**:
+   - For single UDF: Use `Dockerfile.optimized`
+   - For multiple UDFs: Use `Dockerfile.shared-base`
+
+3. **Update Makefile**:
+   ```bash
+   cp Makefile.optimized Makefile
+   ```
+
+4. **Test the build**:
+   ```bash
+   make image
+   # or
+   make image-fast
+   ```
+
+### For New UDFs
+
+1. **Use the optimized template**:
+   ```bash
+   cp examples/map/even_odd/Dockerfile.optimized your-udf/Dockerfile
+   cp examples/map/even_odd/Makefile.optimized your-udf/Makefile
+   ```
+
+2. **Update paths in Dockerfile**:
+   - Change `EXAMPLE_PATH` to your UDF path
+   - Update `COPY` commands accordingly
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Base image not found**:
+   ```bash
+   docker build -f Dockerfile.base -t numaflow-python-base .
+   ```
+
+2. **Permission issues**:
+   ```bash
+   chmod +x entry.sh
+   ```
+
+3. **Poetry cache issues**:
+   ```bash
+   poetry cache clear --all pypi
+   ```
+
+### Performance Monitoring
+
+Monitor build times:
+```bash
+time make image
+time make image-fast
+```
+
+## Future Enhancements
+
+1. **Registry-based base images**: Push base image to registry for team sharing
+2. **BuildKit optimizations**: Enable BuildKit for parallel layer building
+3. **Multi-platform builds**: Optimize for ARM64 and AMD64
+4. **Dependency analysis**: Automate dependency optimization
+
+## Contributing
+
+When adding new UDFs or modifying existing ones:
+
+1. Use the optimized Dockerfile templates
+2. Follow the three-stage approach
+3. Test build times before and after changes
+4. 
Update this documentation if needed + +## References + +- [Docker Multi-Stage Builds](https://docs.docker.com/develop/dev-best-practices/multistage-build/) +- [Docker Layer Caching](https://docs.docker.com/develop/dev-best-practices/dockerfile_best-practices/#leverage-build-cache) +- [Poetry Docker Best Practices](https://python-poetry.org/docs/configuration/#virtualenvsin-project) \ No newline at end of file diff --git a/examples/accumulator/counter/Dockerfile b/examples/accumulator/counter/Dockerfile deleted file mode 100644 index de1756fd..00000000 --- a/examples/accumulator/counter/Dockerfile +++ /dev/null @@ -1,55 +0,0 @@ -#################################################################################################### -# builder: install needed dependencies -#################################################################################################### - -FROM python:3.10-slim-bullseye AS builder - -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" - -ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/reducestream/counter" -ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ - && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init - && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf - -WORKDIR $PYSETUP_PATH -COPY ./ ./ - -WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - -RUN chmod +x entry.sh - -ENTRYPOINT ["/dumb-init", "--"] -CMD ["sh", "-c", "$EXAMPLE_PATH/entry.sh"] - -EXPOSE 5000 diff --git a/examples/accumulator/counter/example.py b/examples/accumulator/counter/example.py deleted file mode 100644 index 405d7f7a..00000000 --- a/examples/accumulator/counter/example.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -from collections.abc import AsyncIterable - -from pynumaflow.reducestreamer import ( - Message, - Datum, - Metadata, - ReduceStreamAsyncServer, - ReduceStreamer, -) -from pynumaflow.shared.asynciter import NonBlockingIterator - - -class ReduceCounter(ReduceStreamer): - def __init__(self, counter): - self.counter = counter - - async def handler( - self, - keys: list[str], - datums: AsyncIterable[Datum], - output: NonBlockingIterator, - md: Metadata, - ): - async for _ in datums: - self.counter += 1 - if self.counter > 10: - msg = f"counter:{self.counter}" - # NOTE: this is returning results because we have seen all the data - # use this only if you really need this feature because your next vertex - # will get both early result and final results and it should be able to - # handle both the scenarios. 
- await output.put(Message(str.encode(msg), keys=keys)) - self.counter = 0 - msg = f"counter:{self.counter}" - await output.put(Message(str.encode(msg), keys=keys)) - - -if __name__ == "__main__": - invoke = os.getenv("INVOKE", "class") - if invoke == "class": - # Here we are using the class instance as the reducer_instance - # which will be used to invoke the handler function. - # We are passing the init_args for the class instance. - grpc_server = ReduceStreamAsyncServer(ReduceCounter, init_args=(0,)) - grpc_server.start() diff --git a/examples/accumulator/counter/pipeline.yaml b/examples/accumulator/counter/pipeline.yaml deleted file mode 100644 index 5ac746c9..00000000 --- a/examples/accumulator/counter/pipeline.yaml +++ /dev/null @@ -1,50 +0,0 @@ -apiVersion: numaflow.numaproj.io/v1alpha1 -kind: Pipeline -metadata: - name: even-odd-sum -spec: - vertices: - - name: in - source: - http: {} - - name: atoi - scale: - min: 3 - udf: - container: - # Tell the input number is even or odd, see https://github.com/numaproj/numaflow-go/tree/main/pkg/mapper/examples/even_odd - image: quay.io/numaio/numaflow-go/map-even-odd:stable - imagePullPolicy: Always - - name: compute-sum - udf: - container: - # compute the sum - image: quay.io/numaio/numaflow-python/reduce-stream-counter:stable - imagePullPolicy: Always - env: - - name: PYTHONDEBUG - value: "true" - - name: INVOKE - value: "class" - groupBy: - window: - fixed: - length: 60s - keyed: true - storage: - persistentVolumeClaim: - volumeSize: 10Gi - accessMode: ReadWriteOnce - partitions: 1 - - name: sink - scale: - min: 1 - sink: - log: {} - edges: - - from: in - to: atoi - - from: atoi - to: compute-sum - - from: compute-sum - to: sink diff --git a/examples/accumulator/streamsorter/Dockerfile b/examples/accumulator/streamsorter/Dockerfile index c5c1bda6..dd2d605b 100644 --- a/examples/accumulator/streamsorter/Dockerfile +++ b/examples/accumulator/streamsorter/Dockerfile @@ -1,55 +1,55 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.11-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/accumulator/streamsorter" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/accumulator/streamsorter/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - 
POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/accumulator/streamsorter" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] CMD ["sh", "-c", "$EXAMPLE_PATH/entry.sh"] -EXPOSE 5000 +EXPOSE 5000 \ No newline at end of file diff --git a/examples/accumulator/streamsorter/Makefile b/examples/accumulator/streamsorter/Makefile index f36656b2..5eb6a3e8 100644 --- a/examples/accumulator/streamsorter/Makefile +++ b/examples/accumulator/streamsorter/Makefile @@ -1,6 +1,6 @@ -TAG ?= test1 +TAG ?= stable PUSH ?= false -IMAGE_REGISTRY = quay.io/numaio/numaflow-python/accumulator-sorter:${TAG} +IMAGE_REGISTRY = quay.io/numaio/numaflow-python/streamsorter:${TAG} DOCKER_FILE_PATH = examples/accumulator/streamsorter/Dockerfile .PHONY: update diff --git a/examples/accumulator/streamsorter/Makefile.optimized b/examples/accumulator/streamsorter/Makefile.optimized new file mode 100644 index 00000000..136be046 --- /dev/null +++ b/examples/accumulator/streamsorter/Makefile.optimized @@ -0,0 +1,52 @@ +TAG ?= stable +PUSH ?= false +IMAGE_REGISTRY = quay.io/numaio/numaflow-python/streamsorter:${TAG} +DOCKER_FILE_PATH = examples/accumulator/streamsorter/Dockerfile +BASE_IMAGE_NAME = numaflow-python-base + +.PHONY: base-image +base-image: + @echo "Building shared base image..." + docker build -f Dockerfile.base -t ${BASE_IMAGE_NAME} . + +.PHONY: update +update: + poetry update -vv + +.PHONY: image-push +image-push: base-image update + cd ../../../ && docker buildx build \ + -f ${DOCKER_FILE_PATH} \ + -t ${IMAGE_REGISTRY} \ + --platform linux/amd64,linux/arm64 . --push + +.PHONY: image +image: base-image update + cd ../../../ && docker build \ + -f ${DOCKER_FILE_PATH} \ + -t ${IMAGE_REGISTRY} . + @if [ "$(PUSH)" = "true" ]; then docker push ${IMAGE_REGISTRY}; fi + +.PHONY: image-fast +image-fast: update + @echo "Building with shared base image (fastest option)..." 
+	cd ../../../ && docker build \
+		-f examples/accumulator/streamsorter/Dockerfile.shared-base \
+		-t ${IMAGE_REGISTRY} .
+	@if [ "$(PUSH)" = "true" ]; then docker push ${IMAGE_REGISTRY}; fi
+
+.PHONY: clean
+clean:
+	docker rmi ${BASE_IMAGE_NAME} 2>/dev/null || true
+	docker rmi ${IMAGE_REGISTRY} 2>/dev/null || true
+
+.PHONY: help
+help:
+	@echo "Available targets:"
+	@echo "  base-image  - Build the shared base image with pynumaflow"
+	@echo "  image       - Build UDF image with optimized multi-stage build"
+	@echo "  image-fast  - Build UDF image using shared base (fastest)"
+	@echo "  image-push  - Build and push multi-platform image"
+	@echo "  update      - Update poetry dependencies"
+	@echo "  clean       - Remove built images"
+	@echo "  help        - Show this help message"
\ No newline at end of file
diff --git a/examples/accumulator/streamsorter/README.md b/examples/accumulator/streamsorter/README.md
new file mode 100644
index 00000000..19b8da6e
--- /dev/null
+++ b/examples/accumulator/streamsorter/README.md
@@ -0,0 +1,43 @@
+# Stream Sorter
+
+An example User Defined Function that sorts the incoming stream by event time.
+
+### Applying the Pipeline
+
+To apply the pipeline, use the following command:
+
+```shell
+  kubectl apply -f pipeline.yaml
+```
+
+### Publishing messages
+
+Port-forward the HTTP endpoints and make POST requests using curl. Remember to replace `xxxx` with the appropriate pod names.
+
+```shell
+  kubectl port-forward stream-sorter-http-one-0-xxxx 8444:8443
+
+  # Post data to the HTTP endpoint
+  curl -kq -X POST -d "101" https://localhost:8444/vertices/http-one -H "X-Numaflow-Event-Time: 60000"
+  curl -kq -X POST -d "102" https://localhost:8444/vertices/http-one -H "X-Numaflow-Event-Time: 61000"
+  curl -kq -X POST -d "103" https://localhost:8444/vertices/http-one -H "X-Numaflow-Event-Time: 62000"
+  curl -kq -X POST -d "104" https://localhost:8444/vertices/http-one -H "X-Numaflow-Event-Time: 63000"
+```
+
+```shell
+  kubectl port-forward stream-sorter-http-two-0-xxxx 8445:8443
+
+  # Post data to the HTTP endpoint
+  curl -kq -X POST -d "105" https://localhost:8445/vertices/http-two -H "X-Numaflow-Event-Time: 70000"
+  curl -kq -X POST -d "106" https://localhost:8445/vertices/http-two -H "X-Numaflow-Event-Time: 71000"
+  curl -kq -X POST -d "107" https://localhost:8445/vertices/http-two -H "X-Numaflow-Event-Time: 72000"
+  curl -kq -X POST -d "108" https://localhost:8445/vertices/http-two -H "X-Numaflow-Event-Time: 73000"
+```
+
+### Verifying the output
+
+```shell
+  kubectl logs -f stream-sorter-log-sink-0-xxxx
+```
+
+The output should be sorted by event time.
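+
+Given the event times posted above, the sink should log messages 101 through 108 in ascending event-time order (60000 through 73000) once the accumulator's 10s timeout fires, regardless of the order in which the two sources delivered them.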
\ No newline at end of file diff --git a/examples/accumulator/streamsorter/example.py b/examples/accumulator/streamsorter/example.py index dbab4953..8e0615ed 100644 --- a/examples/accumulator/streamsorter/example.py +++ b/examples/accumulator/streamsorter/example.py @@ -2,11 +2,10 @@ import os from collections.abc import AsyncIterable from datetime import datetime -from typing import List from pynumaflow import setup_logging from pynumaflow.accumulator import Accumulator, AccumulatorAsyncServer -from pynumaflow.reducestreamer import ( +from pynumaflow.accumulator import ( Message, Datum, ) @@ -19,23 +18,26 @@ class StreamSorter(Accumulator): def __init__(self): - _LOGGER.error("MEEEEE") + _LOGGER.info("StreamSorter initialized") self.latest_wm = datetime.fromtimestamp(-1) - self.sorted_buffer: List[Datum] = [] + self.sorted_buffer: list[Datum] = [] async def handler( self, datums: AsyncIterable[Datum], output: NonBlockingIterator, ): - _LOGGER.info("HEREEEEE") + _LOGGER.info("StreamSorter handler started") async for datum in datums: - _LOGGER.info(f"Received datum with event time: {datum.watermark}") - _LOGGER.info(f"Received datum with event time-2:{self.latest_wm}") + _LOGGER.info( + f"Received datum with event time: {datum.event_time}, " + f"Current latest watermark: {self.latest_wm}, " + f"Datum watermark: {datum.watermark}" + ) # If watermark has moved forward - if datum.watermark.ToDatetime() and datum.watermark.ToDatetime() > self.latest_wm: - self.latest_wm = datum.watermark.ToDatetime() + if datum.watermark and datum.watermark > self.latest_wm: + self.latest_wm = datum.watermark await self.flush_buffer(output) self.insert_sorted(datum) @@ -45,7 +47,7 @@ def insert_sorted(self, datum: Datum): left, right = 0, len(self.sorted_buffer) while left < right: mid = (left + right) // 2 - if self.sorted_buffer[mid].event_time.ToDatetime() > datum.event_time.ToDatetime(): + if self.sorted_buffer[mid].event_time > datum.event_time: right = mid else: left = mid + 1 @@ -58,18 +60,13 @@ async def flush_buffer(self, output: NonBlockingIterator): if datum.event_time > self.latest_wm: break await output.put(Message.from_datum(datum)) - logging.info(f"Sent datum with event time: {datum.watermark.ToDatetime()}") + _LOGGER.info(f"Sent datum with event time: {datum.event_time}") i += 1 # Remove flushed items self.sorted_buffer = self.sorted_buffer[i:] if __name__ == "__main__": - invoke = os.getenv("INVOKE", "class") grpc_server = None - if invoke == "class": - # Here we are using the class instance as the reducer_instance - # which will be used to invoke the handler function. - # We are passing the init_args for the class instance. 
- grpc_server = AccumulatorAsyncServer(StreamSorter) + grpc_server = AccumulatorAsyncServer(StreamSorter) grpc_server.start() diff --git a/examples/accumulator/streamsorter/pipeline.yaml b/examples/accumulator/streamsorter/pipeline.yaml index 604e6997..d4ccab96 100644 --- a/examples/accumulator/streamsorter/pipeline.yaml +++ b/examples/accumulator/streamsorter/pipeline.yaml @@ -1,51 +1,49 @@ apiVersion: numaflow.numaproj.io/v1alpha1 kind: Pipeline metadata: - name: even-odd-sum + name: stream-sorter spec: + limits: + readBatchSize: 1 vertices: - - name: in + - name: http-one + scale: + min: 1 + max: 1 source: http: {} - - name: atoi + - name: http-two scale: - min: 3 - udf: - container: - # Tell the input number is even or odd, see https://github.com/numaproj/numaflow-go/tree/main/pkg/mapper/examples/even_odd - image: quay.io/numaio/numaflow-go/map-even-odd:stable - imagePullPolicy: Always - - name: compute-sum + min: 1 + max: 1 + source: + http: {} + - name: py-accum udf: container: - # compute the sum - image: quay.io/numaio/numaflow-python/reduce-stream-sum:stable + image: quay.io/numaio/numaflow-python/streamsorter:stable imagePullPolicy: Always env: - name: PYTHONDEBUG value: "true" - - name: INVOKE - value: "class" groupBy: window: - fixed: - length: 60s - streaming: true + accumulator: + timeout: 10s keyed: true storage: persistentVolumeClaim: - volumeSize: 10Gi - accessMode: ReadWriteOnce - partitions: 1 - - name: sink + volumeSize: 1Gi + - name: py-sink scale: min: 1 + max: 1 sink: log: {} edges: - - from: in - to: atoi - - from: atoi - to: compute-sum - - from: compute-sum - to: sink + - from: http-one + to: py-accum + - from: http-two + to: py-accum + - from: py-accum + to: py-sink diff --git a/examples/accumulator/streamsorter/pyproject.toml b/examples/accumulator/streamsorter/pyproject.toml index 6557f78f..9397268d 100644 --- a/examples/accumulator/streamsorter/pyproject.toml +++ b/examples/accumulator/streamsorter/pyproject.toml @@ -5,11 +5,9 @@ description = "" authors = ["Numaflow developers"] [tool.poetry.dependencies] -python = ">=3.11,<3.13" +python = "~3.10" pynumaflow = { path = "../../../"} -[tool.poetry.dev-dependencies] - [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/examples/batchmap/flatmap/Dockerfile b/examples/batchmap/flatmap/Dockerfile index 20f1a820..99319c4a 100644 --- a/examples/batchmap/flatmap/Dockerfile +++ b/examples/batchmap/flatmap/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps 
+#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/batchmap/flatmap" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/batchmap/flatmap/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/batchmap/flatmap" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/map/even_odd/Dockerfile b/examples/map/even_odd/Dockerfile index a2da2f81..0e9be000 100644 --- a/examples/map/even_odd/Dockerfile +++ b/examples/map/even_odd/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps 
+#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/map/even_odd" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/map/even_odd/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/map/even_odd" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/map/even_odd/Makefile.optimized b/examples/map/even_odd/Makefile.optimized new file mode 100644 index 00000000..3e33cc03 --- /dev/null +++ b/examples/map/even_odd/Makefile.optimized @@ -0,0 +1,52 @@ +TAG ?= stable +PUSH ?= false +IMAGE_REGISTRY = quay.io/numaio/numaflow-python/even-odd:${TAG} +DOCKER_FILE_PATH = examples/map/even_odd/Dockerfile.optimized +BASE_IMAGE_NAME = numaflow-python-base + +.PHONY: base-image +base-image: + @echo "Building shared base image..." + docker build -f Dockerfile.base -t ${BASE_IMAGE_NAME} . + +.PHONY: update +update: + poetry update -vv + +.PHONY: image-push +image-push: base-image update + cd ../../../ && docker buildx build \ + -f ${DOCKER_FILE_PATH} \ + -t ${IMAGE_REGISTRY} \ + --platform linux/amd64,linux/arm64 . --push + +.PHONY: image +image: base-image update + cd ../../../ && docker build \ + -f ${DOCKER_FILE_PATH} \ + -t ${IMAGE_REGISTRY} . + @if [ "$(PUSH)" = "true" ]; then docker push ${IMAGE_REGISTRY}; fi + +.PHONY: image-fast +image-fast: update + @echo "Building with shared base image (fastest option)..." + cd ../../../ && docker build \ + -f examples/map/even_odd/Dockerfile.shared-base \ + -t ${IMAGE_REGISTRY} . 
+ @if [ "$(PUSH)" = "true" ]; then docker push ${IMAGE_REGISTRY}; fi + +.PHONY: clean +clean: + docker rmi ${BASE_IMAGE_NAME} 2>/dev/null || true + docker rmi ${IMAGE_REGISTRY} 2>/dev/null || true + +.PHONY: help +help: + @echo "Available targets:" + @echo " base-image - Build the shared base image with pynumaflow" + @echo " image - Build UDF image with optimized multi-stage build" + @echo " image-fast - Build UDF image using shared base (fastest)" + @echo " image-push - Build and push multi-platform image" + @echo " update - Update poetry dependencies" + @echo " clean - Remove built images" + @echo " help - Show this help message" \ No newline at end of file diff --git a/examples/map/flatmap/Dockerfile b/examples/map/flatmap/Dockerfile index d2ce662f..22d744c0 100644 --- a/examples/map/flatmap/Dockerfile +++ b/examples/map/flatmap/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/map/flatmap" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/map/flatmap/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/map/flatmap" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | 
python3 - - -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/map/forward_message/Dockerfile b/examples/map/forward_message/Dockerfile index 84b4bdff..464fc1fc 100644 --- a/examples/map/forward_message/Dockerfile +++ b/examples/map/forward_message/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/map/forward_message" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/map/forward_message/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/map/forward_message" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - 
- -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/map/multiproc_map/Dockerfile b/examples/map/multiproc_map/Dockerfile index 0928c03a..3c6e8205 100644 --- a/examples/map/multiproc_map/Dockerfile +++ b/examples/map/multiproc_map/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/map/multiproc_map" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/map/multiproc_map/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/map/multiproc_map" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - 
-#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/mapstream/flatmap_stream/Dockerfile b/examples/mapstream/flatmap_stream/Dockerfile index a7397526..e56d7fb5 100644 --- a/examples/mapstream/flatmap_stream/Dockerfile +++ b/examples/mapstream/flatmap_stream/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/mapstream/flatmap_stream" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/mapstream/flatmap_stream/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/mapstream/flatmap_stream" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL 
https://install.python-poetry.org | python3 - - -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/reduce/asyncio_reduce/Dockerfile b/examples/reduce/asyncio_reduce/Dockerfile index 32cb8500..e74b6036 100644 --- a/examples/reduce/asyncio_reduce/Dockerfile +++ b/examples/reduce/asyncio_reduce/Dockerfile @@ -1,54 +1,55 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder - -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" \ - VENV_PATH="/opt/pysetup/.venv" - -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init - && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - + && pip install poetry \ + && poetry install --no-root --no-interaction #################################################################################################### -# udf: used for running the udf vertices +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps #################################################################################################### -FROM builder AS udf +FROM base-builder AS udf-builder -WORKDIR $PYSETUP_PATH -COPY pyproject.toml ./ -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ +ENV EXAMPLE_PATH="/opt/pysetup/examples/reduce/asyncio_reduce" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true -ADD . 
/app -WORKDIR /app +WORKDIR $EXAMPLE_PATH +COPY examples/reduce/asyncio_reduce/ ./ +RUN poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf + +ENV PYSETUP_PATH="/opt/pysetup" +ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/reduce/asyncio_reduce" +ENV VENV_PATH="$EXAMPLE_PATH/.venv" +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ + && chmod +x /dumb-init + +WORKDIR $PYSETUP_PATH +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH +WORKDIR $EXAMPLE_PATH RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] -CMD ["/app/entry.sh"] +CMD ["sh", "-c", "$EXAMPLE_PATH/entry.sh"] EXPOSE 5000 diff --git a/examples/reduce/asyncio_reduce/pyproject.toml b/examples/reduce/asyncio_reduce/pyproject.toml index 31cce969..cac90449 100644 --- a/examples/reduce/asyncio_reduce/pyproject.toml +++ b/examples/reduce/asyncio_reduce/pyproject.toml @@ -7,7 +7,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" pynumaflow = "~0.6.0" -aiorun = "^2022.11.1" +aiorun = ">=2023.7,<2024.0" aiohttp = "~3.8.4" asyncio = "~3.4.3" diff --git a/examples/reduce/batchmap/flatmap/Dockerfile b/examples/reduce/batchmap/flatmap/Dockerfile new file mode 100644 index 00000000..a70d0d90 --- /dev/null +++ b/examples/reduce/batchmap/flatmap/Dockerfile @@ -0,0 +1,55 @@ +#################################################################################################### +# Stage 1: Base Builder - installs core dependencies using poetry +#################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder + +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/reduce/batchmap/flatmap" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/reduce/batchmap/flatmap/ ./ +RUN poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf + +ENV PYSETUP_PATH="/opt/pysetup" +ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/reduce/batchmap/flatmap" +ENV 
VENV_PATH="$EXAMPLE_PATH/.venv" +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ + && chmod +x /dumb-init + +WORKDIR $PYSETUP_PATH +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH + +WORKDIR $EXAMPLE_PATH +RUN chmod +x entry.sh + +ENTRYPOINT ["/dumb-init", "--"] +CMD ["sh", "-c", "$EXAMPLE_PATH/entry.sh"] + +EXPOSE 5000 \ No newline at end of file diff --git a/examples/reduce/counter/Dockerfile b/examples/reduce/counter/Dockerfile index a617b3fa..f25a9c46 100644 --- a/examples/reduce/counter/Dockerfile +++ b/examples/reduce/counter/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/reduce/counter" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/reduce/counter/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/reduce/counter" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - 
-#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/reducestream/counter/Dockerfile b/examples/reducestream/counter/Dockerfile index de1756fd..f26543d7 100644 --- a/examples/reducestream/counter/Dockerfile +++ b/examples/reducestream/counter/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/reducestream/counter" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/reducestream/counter/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/reducestream/counter" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - 
-#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/reducestream/sum/Dockerfile b/examples/reducestream/sum/Dockerfile index 1f715387..4b372b78 100644 --- a/examples/reducestream/sum/Dockerfile +++ b/examples/reducestream/sum/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/reducestream/sum" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/reducestream/sum/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/reducestream/sum" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - 
-#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/sideinput/simple_sideinput/Dockerfile b/examples/sideinput/simple_sideinput/Dockerfile index ab3e3355..47085100 100644 --- a/examples/sideinput/simple_sideinput/Dockerfile +++ b/examples/sideinput/simple_sideinput/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/sideinput/simple_sideinput" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/sideinput/simple_sideinput/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/sideinput/simple_sideinput" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL 
https://install.python-poetry.org | python3 - - -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/sideinput/simple_sideinput/udf/Dockerfile b/examples/sideinput/simple_sideinput/udf/Dockerfile index 3cbd912a..50cc8578 100644 --- a/examples/sideinput/simple_sideinput/udf/Dockerfile +++ b/examples/sideinput/simple_sideinput/udf/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/sideinput/simple_sideinput/udf" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/sideinput/simple_sideinput/udf/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/sideinput/simple_sideinput/udf" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init 
https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/sink/async_log/Dockerfile b/examples/sink/async_log/Dockerfile index 3739ba70..4448c3a8 100644 --- a/examples/sink/async_log/Dockerfile +++ b/examples/sink/async_log/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/sink/async_log" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/sink/async_log/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/sink/async_log" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init 
https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - -#################################################################################################### -# udsink: used for running the udsink vertices -#################################################################################################### -FROM builder AS udsink + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/sink/log/Dockerfile b/examples/sink/log/Dockerfile index 2b2a12aa..0c927395 100644 --- a/examples/sink/log/Dockerfile +++ b/examples/sink/log/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/sink/log" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/sink/log/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/sink/log" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init 
https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - -#################################################################################################### -# udsink: used for running the udsink vertices -#################################################################################################### -FROM builder AS udsink + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/source/simple_source/Dockerfile b/examples/source/simple_source/Dockerfile index d07c719f..ca33fee3 100644 --- a/examples/source/simple_source/Dockerfile +++ b/examples/source/simple_source/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/source/simple_source" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/source/simple_source/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/source/simple_source" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O 
/dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/examples/sourcetransform/async_event_time_filter/Dockerfile b/examples/sourcetransform/async_event_time_filter/Dockerfile new file mode 100644 index 00000000..26e66415 --- /dev/null +++ b/examples/sourcetransform/async_event_time_filter/Dockerfile @@ -0,0 +1,55 @@ +#################################################################################################### +# Stage 1: Base Builder - installs core dependencies using poetry +#################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder + +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/sourcetransform/async_event_time_filter" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH +COPY examples/sourcetransform/async_event_time_filter/ ./ +RUN poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf + +ENV PYSETUP_PATH="/opt/pysetup" +ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/sourcetransform/async_event_time_filter" +ENV VENV_PATH="$EXAMPLE_PATH/.venv" +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ + && chmod +x /dumb-init + +WORKDIR $PYSETUP_PATH +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH + +WORKDIR $EXAMPLE_PATH +RUN chmod +x entry.sh + +ENTRYPOINT ["/dumb-init", "--"] +CMD ["sh", "-c", "$EXAMPLE_PATH/entry.sh"] + +EXPOSE 5000 diff --git a/examples/accumulator/counter/Makefile b/examples/sourcetransform/async_event_time_filter/Makefile similarity index 59% rename from 
examples/accumulator/counter/Makefile
rename to examples/sourcetransform/async_event_time_filter/Makefile
index ba3e8793..5ad2dd38 100644
--- a/examples/accumulator/counter/Makefile
+++ b/examples/sourcetransform/async_event_time_filter/Makefile
@@ -1,7 +1,7 @@
 TAG ?= stable
 PUSH ?= false
-IMAGE_REGISTRY = quay.io/numaio/numaflow-python/reduce-stream-counter:${TAG}
-DOCKER_FILE_PATH = examples/reducestream/counter/Dockerfile
+IMAGE_REGISTRY = quay.io/numaio/numaflow-python/async-mapt-event-time-filter
+DOCKER_FILE_PATH = examples/sourcetransform/async_event_time_filter/Dockerfile
 .PHONY: update
 update:
@@ -19,4 +19,4 @@ image: update
 	cd ../../../ && docker build \
 		-f ${DOCKER_FILE_PATH} \
-		-t ${IMAGE_REGISTRY} .
-	@if [ "$(PUSH)" = "true" ]; then docker push ${IMAGE_REGISTRY}; fi
+		-t ${IMAGE_REGISTRY}:${TAG} .
+	@if [ "$(PUSH)" = "true" ]; then docker push ${IMAGE_REGISTRY}:${TAG}; fi
diff --git a/examples/accumulator/counter/entry.sh b/examples/sourcetransform/async_event_time_filter/entry.sh
similarity index 100%
rename from examples/accumulator/counter/entry.sh
rename to examples/sourcetransform/async_event_time_filter/entry.sh
diff --git a/examples/sourcetransform/async_event_time_filter/example.py b/examples/sourcetransform/async_event_time_filter/example.py
new file mode 100644
index 00000000..659b7e24
--- /dev/null
+++ b/examples/sourcetransform/async_event_time_filter/example.py
@@ -0,0 +1,48 @@
+import datetime
+import logging
+
+from pynumaflow.sourcetransformer import Messages, Message, Datum
+from pynumaflow.sourcetransformer import SourceTransformAsyncServer
+
+"""
+This is a simple User Defined Function example which receives a message, applies the following
+data transformation, and returns the message.
+If the message event time is before year 2022, drop the message with the event time unchanged.
+If it's within year 2022, update the tag to "within_year_2022" and
+update the message event time to Jan 1st 2022.
+Otherwise (exclusively after year 2022), update the tag to "after_year_2022" and update the
+message event time to Jan 1st 2023.
+""" + +january_first_2022 = datetime.datetime.fromtimestamp(1640995200) +january_first_2023 = datetime.datetime.fromtimestamp(1672531200) + + +async def my_handler(keys: list[str], datum: Datum) -> Messages: + val = datum.value + event_time = datum.event_time + messages = Messages() + + if event_time < january_first_2022: + logging.info("Got event time:%s, it is before 2022, so dropping", event_time) + messages.append(Message.to_drop(event_time)) + elif event_time < january_first_2023: + logging.info( + "Got event time:%s, it is within year 2022, so forwarding to within_year_2022", + event_time, + ) + messages.append( + Message(value=val, event_time=january_first_2022, tags=["within_year_2022"]) + ) + else: + logging.info( + "Got event time:%s, it is after year 2022, so forwarding to after_year_2022", event_time + ) + messages.append(Message(value=val, event_time=january_first_2023, tags=["after_year_2022"])) + + return messages + + +if __name__ == "__main__": + grpc_server = SourceTransformAsyncServer(my_handler) + grpc_server.start() diff --git a/examples/accumulator/counter/pyproject.toml b/examples/sourcetransform/async_event_time_filter/pyproject.toml similarity index 56% rename from examples/accumulator/counter/pyproject.toml rename to examples/sourcetransform/async_event_time_filter/pyproject.toml index aeeb4d30..7c5bf2b5 100644 --- a/examples/accumulator/counter/pyproject.toml +++ b/examples/sourcetransform/async_event_time_filter/pyproject.toml @@ -1,15 +1,15 @@ [tool.poetry] -name = "reduce-stream-counter" +name = "async-mapt-event-time-filter" version = "0.2.4" description = "" authors = ["Numaflow developers"] +readme = "README.md" +packages = [{include = "mapt_event_time_filter"}] [tool.poetry.dependencies] -python = "~3.10" +python = ">=3.9, <3.12" pynumaflow = { path = "../../../"} -[tool.poetry.dev-dependencies] - [build-system] -requires = ["poetry-core>=1.0.0"] +requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" diff --git a/examples/sourcetransform/event_time_filter/Dockerfile b/examples/sourcetransform/event_time_filter/Dockerfile index 3ed3480b..9e702ecf 100644 --- a/examples/sourcetransform/event_time_filter/Dockerfile +++ b/examples/sourcetransform/event_time_filter/Dockerfile @@ -1,52 +1,52 @@ #################################################################################################### -# builder: install needed dependencies +# Stage 1: Base Builder - installs core dependencies using poetry #################################################################################################### +FROM python:3.10-slim-bullseye AS base-builder -FROM python:3.10-slim-bullseye AS builder +ENV PYSETUP_PATH="/opt/pysetup" +WORKDIR $PYSETUP_PATH + +# Copy only core dependency files first for better caching +COPY pyproject.toml poetry.lock README.md ./ +COPY pynumaflow/ ./pynumaflow/ +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl wget build-essential git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* \ + && pip install poetry \ + && poetry install --no-root --no-interaction + +#################################################################################################### +# Stage 2: UDF Builder - adds UDF code and installs UDF-specific deps +#################################################################################################### +FROM base-builder AS udf-builder + +ENV EXAMPLE_PATH="/opt/pysetup/examples/sourcetransform/event_time_filter" +ENV POETRY_VIRTUALENVS_IN_PROJECT=true + +WORKDIR $EXAMPLE_PATH 
+COPY examples/sourcetransform/event_time_filter/ ./ +RUN poetry install --no-root --no-interaction -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=on \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - POETRY_VERSION=1.2.2 \ - POETRY_HOME="/opt/poetry" \ - POETRY_VIRTUALENVS_IN_PROJECT=true \ - POETRY_NO_INTERACTION=1 \ - PYSETUP_PATH="/opt/pysetup" +#################################################################################################### +# Stage 3: UDF Runtime - clean container with only needed stuff +#################################################################################################### +FROM python:3.10-slim-bullseye AS udf +ENV PYSETUP_PATH="/opt/pysetup" ENV EXAMPLE_PATH="$PYSETUP_PATH/examples/sourcetransform/event_time_filter" ENV VENV_PATH="$EXAMPLE_PATH/.venv" -ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - wget \ - # deps for building python deps - build-essential \ - && apt-get install -y git \ +ENV PATH="$VENV_PATH/bin:$PATH" + +RUN apt-get update && apt-get install --no-install-recommends -y wget \ && apt-get clean && rm -rf /var/lib/apt/lists/* \ - \ - # install dumb-init && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \ - && chmod +x /dumb-init \ - && curl -sSL https://install.python-poetry.org | python3 - - -#################################################################################################### -# udf: used for running the udf vertices -#################################################################################################### -FROM builder AS udf + && chmod +x /dumb-init WORKDIR $PYSETUP_PATH -COPY ./ ./ +COPY --from=udf-builder $VENV_PATH $VENV_PATH +COPY --from=udf-builder $EXAMPLE_PATH $EXAMPLE_PATH WORKDIR $EXAMPLE_PATH -RUN poetry lock -RUN poetry install --no-cache --no-root && \ - rm -rf ~/.cache/pypoetry/ - RUN chmod +x entry.sh ENTRYPOINT ["/dumb-init", "--"] diff --git a/poetry.lock b/poetry.lock index 53bad6b7..ab404238 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 
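One side effect of regenerating `poetry.lock` with Poetry 2.1 is visible throughout the hunks below: environment markers are now inlined into the extras strings (for example `aiohttp (>=3.7.4) ; sys_platform != "win32" or implementation_name != "pypy"`). If you ever need to evaluate such a marker by hand, the `packaging` library can do it — a minimal sketch, assuming `packaging` is available (it is not a dependency this change adds):

```python
from packaging.markers import Marker

# The same marker syntax Poetry 2.1 inlines into poetry.lock extras strings.
marker = Marker('sys_platform != "win32" or implementation_name != "pypy"')

# evaluate() checks the marker against the running interpreter by default;
# pass an explicit environment dict to test a different platform.
print(marker.evaluate())
print(marker.evaluate({"sys_platform": "win32", "implementation_name": "pypy"}))
```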
[[package]] name = "aiorun" @@ -58,7 +58,7 @@ typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} [package.extras] colorama = ["colorama (>=0.4.3)"] -d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] +d = ["aiohttp (>=3.7.4) ; sys_platform != \"win32\" or implementation_name != \"pypy\"", "aiohttp (>=3.7.4,!=3.9.0) ; sys_platform == \"win32\" and implementation_name == \"pypy\""] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] @@ -304,7 +304,7 @@ files = [ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} [package.extras] -toml = ["tomli"] +toml = ["tomli ; python_full_version <= \"3.11.0a6\""] [[package]] name = "distlib" @@ -349,7 +349,7 @@ files = [ [package.extras] docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] -typing = ["typing-extensions (>=4.12.2)"] +typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] [[package]] name = "google-api-core" @@ -372,7 +372,7 @@ requests = ">=2.18.0,<3.0.0.dev0" [package.extras] async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.dev0)"] -grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0)"] +grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""] grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] @@ -1016,13 +1016,13 @@ files = [ ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.8.0)"] -core = ["importlib_metadata (>=6)", "jaraco.collections", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] +core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.collections", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib_metadata (>=7.0.2)", 
"jaraco.develop (>=7.21)", "mypy (==1.14.*)", "pytest-mypy"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"] [[package]] name = "tomli" @@ -1093,7 +1093,7 @@ files = [ ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] @@ -1141,7 +1141,7 @@ files = [ [package.extras] docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] -test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0)", "aiohttp (>=3.8.1)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] +test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0) ; python_version >= \"3.12\"", "aiohttp (>=3.8.1) ; python_version < \"3.12\"", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] [[package]] name = "virtualenv" @@ -1162,7 +1162,7 @@ platformdirs = ">=3.9.1,<5" [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] -test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] [metadata] lock-version = "2.1" diff --git a/pynumaflow/accumulator/__init__.py b/pynumaflow/accumulator/__init__.py index 76690768..0d1368d8 100644 --- a/pynumaflow/accumulator/__init__.py +++ b/pynumaflow/accumulator/__init__.py @@ -2,10 +2,9 @@ Message, Datum, IntervalWindow, - Metadata, DROP, - Accumulator, KeyedWindow, + Accumulator, ) from pynumaflow.accumulator.async_server import AccumulatorAsyncServer @@ -13,9 +12,8 @@ "Message", "Datum", "IntervalWindow", - "Metadata", "DROP", "AccumulatorAsyncServer", - "Accumulator", "KeyedWindow", + "Accumulator", ] diff --git a/pynumaflow/accumulator/_dtypes.py b/pynumaflow/accumulator/_dtypes.py index 18a9b199..31a0d5fe 100644 --- 
a/pynumaflow/accumulator/_dtypes.py +++ b/pynumaflow/accumulator/_dtypes.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from datetime import datetime from enum import IntEnum -from typing import TypeVar, Callable, Union, Optional, Type +from typing import TypeVar, Callable, Union, Optional from collections.abc import AsyncIterable from pynumaflow.shared.asynciter import NonBlockingIterator @@ -19,7 +19,7 @@ class WindowOperation(IntEnum): OPEN = (0,) CLOSE = (1,) - APPEND = (4,) + APPEND = (2,) @dataclass(init=False) @@ -32,7 +32,7 @@ class Datum: event_time: the event time of the event. watermark: the watermark of the event. >>> # Example usage - >>> from pynumaflow.reducer import Datum + >>> from pynumaflow.accumulator import Datum >>> from datetime import datetime, timezone >>> payload = bytes("test_mock_message", encoding="utf-8") >>> t1 = datetime.fromtimestamp(1662998400, timezone.utc) @@ -77,32 +77,56 @@ def __init__( self._id = id_ def keys(self) -> list[str]: - """Returns the keys of the event""" + """Returns the keys of the event. + + Returns: + list[str]: A list of string keys associated with this event. + """ return self._keys @property def value(self) -> bytes: - """Returns the value of the event.""" + """Returns the value of the event. + + Returns: + bytes: The payload data of the event as bytes. + """ return self._value @property def event_time(self) -> datetime: - """Returns the event time of the event.""" + """Returns the event time of the event. + + Returns: + datetime: The timestamp when the event occurred. + """ return self._event_time @property def watermark(self) -> datetime: - """Returns the watermark of the event.""" + """Returns the watermark of the event. + + Returns: + datetime: The watermark timestamp indicating the progress of event time. + """ return self._watermark @property def headers(self) -> dict[str, str]: - """Returns the headers of the event.""" - return self._headers + """Returns the headers of the event. + + Returns: + dict[str, str]: A dictionary containing header key-value pairs for this event. + """ + return self._headers.copy() @property def id(self) -> str: - """Returns the id of the event.""" + """Returns the id of the event. + + Returns: + str: The unique identifier for this event. + """ return self._id @@ -120,13 +144,21 @@ def __init__(self, start: datetime, end: datetime): self._end = end @property - def start(self): - """Returns the start point of the interval window.""" + def start(self) -> datetime: + """Returns the start point of the interval window. + + Returns: + datetime: The start timestamp of the interval window. + """ return self._start @property - def end(self): - """Returns the end point of the interval window.""" + def end(self) -> datetime: + """Returns the end point of the interval window. + + Returns: + datetime: The end timestamp of the interval window. + """ return self._end @@ -149,46 +181,49 @@ def __init__(self, start: datetime, end: datetime, slot: str = "", keys: list[st self._keys = keys @property - def start(self): - """Returns the start point of the interval window.""" + def start(self) -> datetime: + """Returns the start point of the interval window. + + Returns: + datetime: The start timestamp of the interval window. + """ return self._window.start @property - def end(self): - """Returns the end point of the interval window.""" + def end(self) -> datetime: + """Returns the end point of the interval window. + + Returns: + datetime: The end timestamp of the interval window. 
+ """ return self._window.end @property - def slot(self): - """Returns the slot from the window""" + def slot(self) -> str: + """Returns the slot from the window. + + Returns: + str: The slot identifier for this window. + """ return self._slot @property - def window(self): - """Return the interval window""" + def window(self) -> IntervalWindow: + """Returns the interval window. + + Returns: + IntervalWindow: The underlying interval window object. + """ return self._window @property - def keys(self): - """Return the keys for window""" - return self._keys - - -@dataclass(init=False) -class Metadata: - """Defines the metadata for the event.""" - - __slots__ = ("_interval_window",) - - _interval_window: IntervalWindow - - def __init__(self, interval_window: IntervalWindow): - self._interval_window = interval_window + def keys(self) -> list[str]: + """Returns the keys for window. - @property - def interval_window(self): - """Returns the interval window for the event.""" - return self._interval_window + Returns: + list[str]: A list of keys associated with this window. + """ + return self._keys @dataclass @@ -212,37 +247,68 @@ class AccumulatorResult: _latest_watermark: datetime @property - def future(self): - """Returns the future result of computation.""" + def future(self) -> Task: + """Returns the future result of computation. + + Returns: + Task: The asyncio Task representing the computation future. + """ return self._future @property - def iterator(self): - """Returns the handle to the producer queue.""" + def iterator(self) -> NonBlockingIterator: + """Returns the handle to the producer queue. + + Returns: + NonBlockingIterator: The iterator for producing data to the queue. + """ return self._iterator @property def keys(self) -> list[str]: - """Returns the keys of the partition.""" + """Returns the keys of the partition. + + Returns: + list[str]: The keys associated with this partition. + """ return self._key @property - def result_queue(self): - """Returns the async queue used to write the output for the tasks""" + def result_queue(self) -> NonBlockingIterator: + """Returns the async queue used to write the output for the tasks. + + Returns: + NonBlockingIterator: The queue for writing task output. + """ return self._result_queue @property - def consumer_future(self): - """Returns the async consumer task for the result queue""" + def consumer_future(self) -> Task: + """Returns the async consumer task for the result queue. + + Returns: + Task: The asyncio Task for consuming from the result queue. + """ return self._consumer_future @property - def latest_watermark(self): - """Returns the latest watermark for task""" + def latest_watermark(self) -> datetime: + """Returns the latest watermark for task. + + Returns: + datetime: The latest watermark timestamp for this task. + """ return self._latest_watermark def update_watermark(self, new_watermark: datetime): - """Updates the latest watermark value.""" + """Updates the latest watermark value. + + Args: + new_watermark (datetime): The new watermark timestamp to set. + + Raises: + TypeError: If new_watermark is not a datetime object. 
+ """ if not isinstance(new_watermark, datetime): raise TypeError("new_watermark must be a datetime object") self._latest_watermark = new_watermark @@ -252,118 +318,185 @@ def update_watermark(self, new_watermark: datetime): class AccumulatorRequest: """Defines the object to hold a request for the accumulator operation.""" - __slots__ = ("_operation", "_windows", "_payload") + __slots__ = ("_operation", "_keyed_window", "_payload") _operation: WindowOperation - _windows: list[KeyedWindow] + _keyed_window: KeyedWindow _payload: Datum - def __init__(self, operation: WindowOperation, windows: list[KeyedWindow], payload: Datum): + def __init__(self, operation: WindowOperation, keyed_window: KeyedWindow, payload: Datum): self._operation = operation - self._windows = windows + self._keyed_window = keyed_window self._payload = payload @property def operation(self) -> WindowOperation: - """Returns the future result of computation.""" + """Returns the operation type. + + Returns: + WindowOperation: The type of window operation (OPEN, CLOSE, or APPEND). + """ return self._operation @property - def windows(self) -> list[KeyedWindow]: - """Returns the handle to the producer queue.""" - return self._windows + def keyed_window(self) -> KeyedWindow: + """Returns the keyed window. + + Returns: + KeyedWindow: The keyed window associated with this request. + """ + return self._keyed_window @property def payload(self) -> Datum: - """Returns the payload of the window.""" + """Returns the payload of the window. + + Returns: + Datum: The data payload for this accumulator request. + """ return self._payload @dataclass(init=False) class Message: """ - Represents a unit of data passed to the next vertex in the pipeline. + Basic datatype for data passing to the next vertex/vertices. + + Args: + value: data in bytes + keys: []string keys for vertex (optional) + tags: []string tags for conditional forwarding (optional) + watermark: watermark for this message (optional) + event_time: event time for this message (optional) + headers: headers for this message (optional) + id: message id (optional) """ - __slots__ = ( - "_value", "_keys", "_tags", "_event_time", "_watermark", "_id", "_headers" - ) + __slots__ = ("_value", "_keys", "_tags", "_watermark", "_event_time", "_headers", "_id") _value: bytes _keys: list[str] _tags: list[str] - _event_time: datetime _watermark: datetime - _id: str + _event_time: datetime _headers: dict[str, str] + _id: str def __init__( self, value: bytes, - keys: Optional[list[str]] = None, - tags: Optional[list[str]] = None, - event_time: Optional[datetime] = None, - watermark: Optional[datetime] = None, - id: Optional[str] = "", - headers: Optional[dict[str, str]] = None, + keys: list[str] = None, + tags: list[str] = None, + watermark: datetime = None, + event_time: datetime = None, + headers: dict[str, str] = None, + id: str = None, ): - self._value = value or b"" + """ + Creates a Message object to send value to a vertex. + """ self._keys = keys or [] self._tags = tags or [] - self._event_time = event_time or datetime.fromtimestamp(0) - self._watermark = watermark or datetime.fromtimestamp(0) - self._id = id or "" + self._value = value or b"" + self._watermark = watermark + self._event_time = event_time self._headers = headers or {} + self._id = id or "" @classmethod - def to_drop(cls: Type[M]) -> M: - return cls(b"", None, ["DROP"]) + def to_drop(cls: type[M]) -> M: + """Creates a Message instance that indicates the message should be dropped. 
- @classmethod - def from_datum(cls: Type[M], datum: "Datum") -> M: - """Creates a Message from a Datum.""" - return cls( - value=datum.value, - keys=datum.keys, - tags=[], - event_time=datum.event_time, - watermark=datum.watermark, - id=datum.id, - headers=datum.headers, - ) + Returns: + M: A Message instance with empty value and DROP tag indicating + the message should be dropped. + """ + return cls(b"", None, [DROP]) @property def value(self) -> bytes: + """Returns the message payload value. + + Returns: + bytes: The message payload data as bytes. + """ return self._value @property def keys(self) -> list[str]: + """Returns the message keys. + + Returns: + list[str]: A list of string keys associated with this message. + """ return self._keys @property def tags(self) -> list[str]: + """Returns the message tags for conditional forwarding. + + Returns: + list[str]: A list of string tags used for conditional forwarding. + """ return self._tags + @property + def watermark(self) -> datetime: + """Returns the watermark timestamp for this message. + + Returns: + datetime: The watermark timestamp, or None if not set. + """ + return self._watermark + @property def event_time(self) -> datetime: + """Returns the event time for this message. + + Returns: + datetime: The event time timestamp, or None if not set. + """ return self._event_time @property - def watermark(self) -> datetime: - return self._watermark + def headers(self) -> dict[str, str]: + """Returns the message headers. + + Returns: + dict[str, str]: A dictionary containing header key-value pairs for this message. + """ + return self._headers.copy() @property def id(self) -> str: + """Returns the message ID. + + Returns: + str: The unique identifier for this message. + """ return self._id - @property - def headers(self) -> dict[str, str]: - return self._headers + @classmethod + def from_datum(cls, datum: Datum): + """Create a Message instance from a Datum object. + + Args: + datum: The Datum object to convert + + Returns: + Message: A new Message instance with data from the datum + """ + return cls( + value=datum.value, + keys=datum.keys(), + watermark=datum.watermark, + event_time=datum.event_time, + headers=datum.headers, + id=datum.id, + ) -AccumulatorAsyncCallable = Callable[ - [list[str], AsyncIterable[Datum], NonBlockingIterator, Metadata], None -] +AccumulatorAsyncCallable = Callable[[list[str], AsyncIterable[Datum], NonBlockingIterator], None] class Accumulator(metaclass=ABCMeta): @@ -401,8 +534,8 @@ class _AccumulatorBuilderClass: Args: accumulator_class: the Accumulator class to be used for Accumulator UDF - args: the arguments to be passed to the reducer class - kwargs: the keyword arguments to be passed to the reducer class + args: the arguments to be passed to the accumulator class + kwargs: the keyword arguments to be passed to the accumulator class """ def __init__(self, accumulator_class: type[Accumulator], args: tuple, kwargs: dict): @@ -412,10 +545,10 @@ def __init__(self, accumulator_class: type[Accumulator], args: tuple, kwargs: di def create(self) -> Accumulator: """ - Create a new ReduceStreamer instance. + Create a new Accumulator instance. """ return self._accumulator_class(*self._args, **self._kwargs) -# AccumulatorStreamCallable is a callable which can be used as a handler for the Reduce UDF. +# AccumulatorStreamCallable is a callable which can be used as a handler for the Accumulator UDF. 
 AccumulatorStreamCallable = Union[AccumulatorAsyncCallable, type[Accumulator]]
diff --git a/pynumaflow/accumulator/async_server.py b/pynumaflow/accumulator/async_server.py
index 16569ad4..042359ca 100644
--- a/pynumaflow/accumulator/async_server.py
+++ b/pynumaflow/accumulator/async_server.py
@@ -62,8 +62,8 @@ class AccumulatorAsyncServer(NumaflowServer):
     A new servicer instance is created and attached to the server.
     The server instance is returned.
     Args:
-        accumulator_instance: The reducer instance to be used for
-            Reduce Streaming UDF
+        accumulator_instance: The accumulator instance to be used for
+            Accumulator UDF
         init_args: The arguments to be passed to the accumulator_handler
         init_kwargs: The keyword arguments to be passed to the accumulator_handler
@@ -75,55 +75,57 @@ class AccumulatorAsyncServer(NumaflowServer):
     Example invocation:
        import os
        from collections.abc import AsyncIterable
-       from pynumaflow.accumulator import Messages, Message, Datum, Metadata,
-       AccumulatorAsyncServer, Accumulator
+       from datetime import datetime

-       class ReduceCounter(Accumulator):
+       from pynumaflow.accumulator import Accumulator, AccumulatorAsyncServer
+       from pynumaflow.accumulator import (
+           Message,
+           Datum,
+       )
+       from pynumaflow.shared.asynciter import NonBlockingIterator
+
+       class StreamSorter(Accumulator):
-           def __init__(self, counter):
-               self.counter = counter
+           def __init__(self):
+               self.latest_wm = datetime.fromtimestamp(-1)
+               self.sorted_buffer: list[Datum] = []

            async def handler(
                self,
-               keys: list[str],
                datums: AsyncIterable[Datum],
                output: NonBlockingIterator,
-               md: Metadata,
            ):
-               async for _ in datums:
-                   self.counter += 1
-                   if self.counter > 20:
-                       msg = f"counter:{self.counter}"
-                       await output.put(Message(str.encode(msg), keys=keys))
-                       self.counter = 0
-                   msg = f"counter:{self.counter}"
-                   await output.put(Message(str.encode(msg), keys=keys))
-
-       async def reduce_handler(
-           keys: list[str],
-           datums: AsyncIterable[Datum],
-           output: NonBlockingIterator,
-           md: Metadata,
-       ):
-           counter = 0
-           async for _ in datums:
-               counter += 1
-               if counter > 20:
-                   msg = f"counter:{counter}"
-                   await output.put(Message(str.encode(msg), keys=keys))
-               counter = 0
-           msg = f"counter:{counter}"
-           await output.put(Message(str.encode(msg), keys=keys))
+               async for datum in datums:
+                   # Process the datums and send output
+                   if datum.watermark and datum.watermark > self.latest_wm:
+                       self.latest_wm = datum.watermark
+                       await self.flush_buffer(output)
+
+                   self.insert_sorted(datum)
+
+           def insert_sorted(self, datum: Datum):
+               # Binary insert to keep sorted buffer in order
+               left, right = 0, len(self.sorted_buffer)
+               while left < right:
+                   mid = (left + right) // 2
+                   if self.sorted_buffer[mid].event_time > datum.event_time:
+                       right = mid
+                   else:
+                       left = mid + 1
+               self.sorted_buffer.insert(left, datum)
+
+           async def flush_buffer(self, output: NonBlockingIterator):
+               i = 0
+               for datum in self.sorted_buffer:
+                   if datum.event_time > self.latest_wm:
+                       break
+                   await output.put(Message.from_datum(datum))
+                   i += 1
+               # Remove flushed items
+               self.sorted_buffer = self.sorted_buffer[i:]
+

        if __name__ == "__main__":
-           invoke = os.getenv("INVOKE", "func_handler")
-           if invoke == "class":
-               # Here we are using the class instance as the reducer_instance
-               # which will be used to invoke the handler function.
-               # We are passing the init_args for the class instance.
-               grpc_server = AccumulatorAsyncServer(ReduceCounter, init_args=(0,))
-           else:
-               # Here we are using the handler function directly as the reducer_instance.
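A side note on the `insert_sorted` helper in the StreamSorter example just above: on Python 3.10+ (which these images pin) the same right-biased binary insert is available from the standard library. A sketch under that assumption, with a minimal stand-in record instead of the real `Datum`:

```python
import bisect
from dataclasses import dataclass
from datetime import datetime, timezone


@dataclass
class FakeDatum:
    """Stand-in for pynumaflow's Datum; only event_time matters here."""
    event_time: datetime
    value: bytes = b""


def insert_sorted(buf: list[FakeDatum], d: FakeDatum) -> None:
    # bisect.insort (key= requires Python 3.10) performs the same binary
    # search as the manual loop above, and likewise inserts after equal
    # keys, so datums with identical event times keep their arrival order.
    bisect.insort(buf, d, key=lambda x: x.event_time)


buffer: list[FakeDatum] = []
for ts in (30, 10, 20):
    insert_sorted(buffer, FakeDatum(event_time=datetime.fromtimestamp(ts, tz=timezone.utc)))

assert [d.event_time for d in buffer] == sorted(d.event_time for d in buffer)
```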
-        grpc_server = AccumulatorAsyncServer(reduce_handler)
+        grpc_server = AccumulatorAsyncServer(StreamSorter)
        grpc_server.start()
    """
diff --git a/pynumaflow/accumulator/servicer/async_servicer.py b/pynumaflow/accumulator/servicer/async_servicer.py
index 726711a5..6eebdbeb 100644
--- a/pynumaflow/accumulator/servicer/async_servicer.py
+++ b/pynumaflow/accumulator/servicer/async_servicer.py
@@ -11,6 +11,7 @@
    AccumulatorAsyncCallable,
    _AccumulatorBuilderClass,
    AccumulatorRequest,
+    KeyedWindow,
)
from pynumaflow.accumulator.servicer.task_manager import TaskManager
from pynumaflow.shared.server import handle_async_error
@@ -22,23 +23,32 @@ async def datum_generator(
) -> AsyncIterable[AccumulatorRequest]:
-    """Generate a AccumulatorRequest from a AccumulatorRequest proto message."""
+    """Generate an AccumulatorRequest from an AccumulatorRequest proto message."""
    async for d in request_iterator:
-        reduce_request = AccumulatorRequest(
+        # Convert protobuf KeyedWindow to our KeyedWindow dataclass
+        keyed_window = KeyedWindow(
+            start=d.operation.keyedWindow.start.ToDatetime(),
+            end=d.operation.keyedWindow.end.ToDatetime(),
+            slot=d.operation.keyedWindow.slot,
+            keys=list(d.operation.keyedWindow.keys),
+        )
+
+        accumulator_request = AccumulatorRequest(
            operation=d.operation.event,
-            windows=d.operation.windows,
+            keyed_window=keyed_window,  # Use the new parameter name
            payload=Datum(
                keys=list(d.payload.keys),
                value=d.payload.value,
                event_time=d.payload.event_time.ToDatetime(),
                watermark=d.payload.watermark.ToDatetime(),
+                id_=d.payload.id,
                headers=dict(d.payload.headers),
            ),
        )
-        yield reduce_request
+        yield accumulator_request

class AsyncAccumulatorServicer(accumulator_pb2_grpc.AccumulatorServicer):
    """
-    This class is used to create a new grpc Reduce servicer instance.
+    This class is used to create a new grpc Accumulator servicer instance.
    Provides the functionality for the required rpc methods.
    """
@@ -69,28 +79,28 @@ async def AccumulateFn(
        consumer = task_manager.global_result_queue.read_iterator()

        # Create an async iterator from the request iterator
-        # datum_iterator = datum_generator(request_iterator=request_iterator)
+        datum_iterator = datum_generator(request_iterator=request_iterator)

        # Create a process_input_stream task in the task manager,
        # this would read from the datum iterator
        # and then create the required tasks to process the data requests
        # The results from these tasks are then sent to the result queue
-        producer = asyncio.create_task(task_manager.process_input_stream(request_iterator))
+        producer = asyncio.create_task(task_manager.process_input_stream(datum_iterator))

        # Start the consumer task where we read from the result queue
        # and send the results to the client
        # The task manager can write the following to the result queue:
-        # 1. A accumulator_pb2.ReduceResponse message
-        # This is the result of the reduce function, it contains the window and the
-        # result of the reduce function
-        # The result of the reduce function is a accumulator_pb2.ReduceResponse message and can be
-        # directly sent to the client
+        # 1. An accumulator_pb2.AccumulatorResponse message
+        # This is the result of the accumulator function; it contains the window and the
+        # result of the accumulator function
+        # The result of the accumulator function is an accumulator_pb2.AccumulatorResponse message
+        # and can be directly sent to the client
        #
        # 2. An Exception
-        # Any exceptions that occur during the processing reduce function tasks are
+        # Any exceptions that occur while processing the accumulator function tasks are
        # sent to the result queue. We then forward these exceptions to the client
        #
-        # 3. A accumulator_pb2.ReduceResponse message with EOF=True
+        # 3. An accumulator_pb2.AccumulatorResponse message with EOF=True
        # This is a special message that indicates the end of the processing for a window
        # When we get this message, we send an EOF message to the client
        try:
diff --git a/pynumaflow/accumulator/servicer/task_manager.py b/pynumaflow/accumulator/servicer/task_manager.py
index 6e9aab2b..a7c80968 100644
--- a/pynumaflow/accumulator/servicer/task_manager.py
+++ b/pynumaflow/accumulator/servicer/task_manager.py
@@ -3,6 +3,7 @@
from datetime import datetime
from typing import Union

+from google.protobuf import timestamp_pb2
from pynumaflow._constants import (
    STREAM_EOF,
    DELIMITER,
@@ -17,13 +18,12 @@
)
from pynumaflow.proto.accumulator import accumulator_pb2
from pynumaflow.shared.asynciter import NonBlockingIterator
-from google.protobuf import timestamp_pb2 as _timestamp_pb2

def build_unique_key_name(keys):
    """
    Builds a unique key name for the given keys and window.
-    The key name is used to identify the Reduce task.
-    The format is: start_time:end_time:key1:key2:...
+    The key name is used to identify the Accumulator task.
+    The format is: key1:key2:...
    """
    return f"{DELIMITER.join(keys)}"
@@ -32,21 +32,16 @@ def build_unique_key_name(keys):
def build_window_hash(window):
    """
    Builds a hash for the given window.
-    The hash is used to identify the Reduce Window
+    The hash is used to identify the Accumulator Window
    The format is: start_time:end_time
    """
    return f"{window.start.ToMilliseconds()}:{window.end.ToMilliseconds()}"

-def create_window_eof_response(window):
-    """Create a Reduce response with EOF=True for a given window"""
-    return accumulator_pb2.ReduceResponse(window=window, EOF=True)
-
-
class TaskManager:
    """
-    TaskManager is responsible for managing the Reduce tasks.
-    It is created whenever a new reduce operation is requested.
+    TaskManager is responsible for managing the Accumulator tasks.
+    It is created whenever a new accumulator operation is requested.
    """

    def __init__(self, handler: Union[AccumulatorAsyncCallable, _AccumulatorBuilderClass]):
@@ -56,13 +51,13 @@ def __init__(self, handler: Union[AccumulatorAsyncCallable, _AccumulatorBuilderC
        # Event loop only keeps a weak reference, which can cause it to
        # get lost during execution.
        self.background_tasks = set()
-        # Handler for the reduce operation
+        # Handler for the accumulator operation
        self.__accumulator_handler = handler
-        # Queue to store the results of the reduce operation
+        # Queue to store the results of the accumulator operation
        # This queue is used to send the results to the client
-        # once the reduce operation is completed.
+        # once the accumulator operation is completed.
        # This queue is also used to send the error/exceptions to the client
-        # if the reduce operation fails.
+        # if the accumulator operation fails.
        self.global_result_queue = NonBlockingIterator()

    def get_unique_windows(self):
@@ -82,29 +77,43 @@ def get_unique_windows(self):
    def get_tasks(self):
        """
-        Returns the list of reduce tasks that are
+        Returns the list of accumulator tasks that are
        currently being processed
        """
        return list(self.tasks.values())

    async def stream_send_eof(self):
        """
-        Sends EOF to input streams of all the Reduce
-        tasks that are currently being processed.
+        Signals to all processing tasks that no more requests are expected
+        by sending an EOF message to the local input stream of each task.
        This is called when the input grpc stream is closed.
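+        Note that the task entries are not removed here; process_input_stream
+        awaits each task's futures and clears the tracker once all results
+        have been drained into the global result queue.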
""" - for unified_key in self.tasks: + # Create a copy of the keys to avoid dictionary size change during iteration + task_keys = list(self.tasks.keys()) + for unified_key in task_keys: await self.tasks[unified_key].iterator.put(STREAM_EOF) - self.tasks.clear() async def close_task(self, req): + """ + Closes a running accumulator task for a given key. + Based on the request we compute the unique key, and then + signal the corresponding task for it to closure. + The steps involve + 1. Send a signal to the local request queue of the task to stop reading + 2. Wait for the user function to complete + 3. Wait for all the results from the task to be written to the global result queue + 4. Remove the task from the tracker + """ d = req.payload - keys = d.keys + keys = d.keys() unified_key = build_unique_key_name(keys) curr_task = self.tasks.get(unified_key, None) if curr_task: await self.tasks[unified_key].iterator.put(STREAM_EOF) + await curr_task.future + await curr_task.consumer_future self.tasks.pop(unified_key) else: _LOGGER.critical("accumulator task not found", exc_info=True) @@ -119,7 +128,7 @@ async def create_task(self, req): it creates a new task or appends the request to the existing task. """ d = req.payload - keys = d.keys + keys = d.keys() unified_key = build_unique_key_name(keys) curr_task = self.tasks.get(unified_key, None) @@ -129,7 +138,7 @@ async def create_task(self, req): riter = niter.read_iterator() # Create a new result queue for the current task # We create a new result queue for each task, so that - # the results of the reduce operation can be sent to the + # the results of the accumulator operation can be sent to the # the global result queue, which in turn sends the results # to the client. res_queue = NonBlockingIterator() @@ -146,7 +155,7 @@ async def create_task(self, req): consumer.add_done_callback(self.clean_background) # Create a new task for the accumulator operation, this will invoke the - # Reduce handler with the given keys, request iterator, and window. + # Accumulator handler with the given keys, request iterator, and window. task = asyncio.create_task(self.__invoke_accumulator(riter, res_queue)) # Save a reference to the result of this function, to avoid a # task disappearing mid-execution. @@ -158,7 +167,7 @@ async def create_task(self, req): task, niter, keys, res_queue, consumer, datetime.fromtimestamp(-1) ) - # Save the result of the reduce operation to the task list + # Save the result of the accumulator operation to the task list self.tasks[unified_key] = curr_task # Put the request in the iterator @@ -179,20 +188,20 @@ async def send_datum_to_task(self, req): await result.iterator.put(d) async def __invoke_accumulator( - self, - request_iterator: AsyncIterable[Datum], - output: NonBlockingIterator, + self, + request_iterator: AsyncIterable[Datum], + output: NonBlockingIterator, ): """ - Invokes the UDF reduce handler with the given keys, + Invokes the UDF accumulator handler with the given keys, request iterator, and window. Returns the result of the - reduce operation. + accumulator operation. """ new_instance = self.__accumulator_handler # If the accumulator handler is a class instance, create a new instance of it. 
# It is required for a new key to be processed by a - # new instance of the reducer for a given window + # new instance of the accumulator for a given window # Otherwise the function handler can be called directly if isinstance(self.__accumulator_handler, _AccumulatorBuilderClass): new_instance = self.__accumulator_handler.create() @@ -200,7 +209,7 @@ async def __invoke_accumulator( _ = await new_instance(request_iterator, output) # send EOF to the output stream await output.put(STREAM_EOF) - # If there is an error in the reduce operation, log and + # If there is an error in the accumulator operation, log and # then send the error to the result queue except BaseException as err: _LOGGER.critical("panic inside accumulator handle", exc_info=True) @@ -208,27 +217,30 @@ async def __invoke_accumulator( await self.global_result_queue.put(err) async def process_input_stream( - self, request_iterator: AsyncIterable[accumulator_pb2.AccumulatorRequest] + self, request_iterator: AsyncIterable[accumulator_pb2.AccumulatorRequest] ): # Start iterating through the request iterator and create tasks # based on the operation type received. try: + request_count = 0 async for request in request_iterator: - # print("IM HERE", request.payload.keys) + request_count += 1 # check whether the request is an open or append operation - if request.operation.event is int(WindowOperation.OPEN): - + if request.operation is int(WindowOperation.OPEN): # create a new task for the open operation and # put the request in the task iterator await self.create_task(request) - elif request.operation.event is int(WindowOperation.APPEND): + elif request.operation is int(WindowOperation.APPEND): # append the task data to the existing task # if the task does not exist, create a new task await self.send_datum_to_task(request) - elif request.operation.event is int(WindowOperation.CLOSE): + elif request.operation is int(WindowOperation.CLOSE): # close the current task for req await self.close_task(request) - # If there is an error in the reduce operation, log and + else: + _LOGGER.debug(f"No operation matched for request: {request}", exc_info=True) + + # If there is an error in the accumulator operation, log and # then send the error to the result queue except BaseException as e: err_msg = f"Accumulator Error: {repr(e)}" @@ -243,7 +255,7 @@ async def process_input_stream( # respective iterators. await self.stream_send_eof() - # get the list of reduce tasks that are currently being processed + # get the list of accumulator tasks that are currently being processed # iterate through the tasks and wait for them to complete for task in self.get_tasks(): # Once this is done, we know that the task has written all the results @@ -251,87 +263,81 @@ async def process_input_stream( fut = task.future await fut - # # Send an EOF message to the local result queue - # # This will signal that the task has completed processing - # await task.result_queue.put(STREAM_EOF) - # Wait for the local queue to write # all the results of this task to the global result queue con_future = task.consumer_future await con_future + self.tasks.clear() - # # Once all tasks are completed, send EOF to all windows that - # # were processed in the Task Manager. We send a single - # # EOF message per window. 
-            # current_windows = self.get_unique_windows()
-            # for window in current_windows.values():
-            #     # Send an EOF message to the global result queue
-            #     # This will signal that window has been processed
-            #     eof_window_msg = create_window_eof_response(window=window)
-            #     await self.global_result_queue.put(eof_window_msg)
-
-            # Once all tasks are completed, senf EOF the global result queue
+            # Now send STREAM_EOF to terminate the global result queue iterator
            await self.global_result_queue.put(STREAM_EOF)
        except BaseException as e:
-            err_msg = f"Reduce Streaming Error: {repr(e)}"
+            err_msg = f"Accumulator Streaming Error: {repr(e)}"
            _LOGGER.critical(err_msg, exc_info=True)
            await self.global_result_queue.put(e)

    async def write_to_global_queue(
-            self, input_queue: NonBlockingIterator, output_queue: NonBlockingIterator, unified_key: str
+        self, input_queue: NonBlockingIterator, output_queue: NonBlockingIterator, unified_key: str
    ):
        """
-        This task is for given Reduce task.
-        This would from the local result queue for the task and then write
-        to the global result queue
+        This function is used to route the messages from the
+        local result queue for a given task to the global result queue.
+        Once all messages are routed, it sends the EOF message for the window.
        """
        reader = input_queue.read_iterator()
        task = self.tasks[unified_key]
-        wm = task.latest_watermark
+        wm: datetime = task.latest_watermark
        async for msg in reader:
-            # Convert the window to a datetime object
-            if wm < msg.watermark:
+            # Only update watermark if msg.watermark is not None
+            if msg.watermark is not None and wm < msg.watermark:
                task.update_watermark(msg.watermark)
                self.tasks[unified_key] = task
                wm = msg.watermark
-            event_time_timestamp = _timestamp_pb2.Timestamp()
-            t = datetime.fromtimestamp(0)
-            event_time_timestamp.FromDatetime(dt=t)
+            # Convert datetime to protobuf timestamp
+            event_time_pb = timestamp_pb2.Timestamp()
+            if msg.event_time is not None:
+                event_time_pb.FromDatetime(msg.event_time)

-            event_time_timestamp_end = _timestamp_pb2.Timestamp()
-            event_time_timestamp_end.FromDatetime(dt=wm)
+            watermark_pb = timestamp_pb2.Timestamp()
+            if msg.watermark is not None:
+                watermark_pb.FromDatetime(msg.watermark)
+
+            start_dt_pb = timestamp_pb2.Timestamp()
+            start_dt_pb.FromDatetime(datetime.fromtimestamp(0))
+
+            end_dt_pb = timestamp_pb2.Timestamp()
+            end_dt_pb.FromDatetime(wm)

            res = accumulator_pb2.AccumulatorResponse(
                payload=accumulator_pb2.Payload(
                    keys=msg.keys,
                    value=msg.value,
-                    event_time=msg.event_time,
-                    watermark=msg.watermark,
+                    event_time=event_time_pb,
+                    watermark=watermark_pb,
                    headers=msg.headers,
                    id=msg.id,
                ),
                window=accumulator_pb2.KeyedWindow(
-                    start=event_time_timestamp, end=event_time_timestamp_end, slot="slot-0", keys=task.keys
+                    start=start_dt_pb, end=end_dt_pb, slot="slot-0", keys=task.keys
                ),
                EOF=False,
                tags=msg.tags,
            )
            await output_queue.put(res)

        # send EOF
-        event_time_timestamp = _timestamp_pb2.Timestamp()
-        t = datetime.fromtimestamp(0)
-        event_time_timestamp.FromDatetime(dt=t)
+        start_eof_pb = timestamp_pb2.Timestamp()
+        start_eof_pb.FromDatetime(datetime.fromtimestamp(0))

-        event_time_timestamp_end = _timestamp_pb2.Timestamp()
-        event_time_timestamp_end.FromDatetime(dt=wm)
+        end_eof_pb = timestamp_pb2.Timestamp()
+        end_eof_pb.FromDatetime(wm)

-        window = accumulator_pb2.KeyedWindow(
-            start=event_time_timestamp, end=event_time_timestamp_end, slot="slot-0", keys=task.keys
-        )
        res = accumulator_pb2.AccumulatorResponse(
-            window=window,
+            window=accumulator_pb2.KeyedWindow(
+                start=start_eof_pb,
end=end_eof_pb, slot="slot-0", keys=task.keys + ), EOF=True, ) await output_queue.put(res) diff --git a/pynumaflow/proto/accumulator/accumulator.proto b/pynumaflow/proto/accumulator/accumulator.proto index f25691c1..acde986b 100644 --- a/pynumaflow/proto/accumulator/accumulator.proto +++ b/pynumaflow/proto/accumulator/accumulator.proto @@ -1,6 +1,23 @@ +/* +Copyright 2022 The Numaproj Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + syntax = "proto3"; option go_package = "github.com/numaproj/numaflow-go/pkg/apis/proto/accumulator/v1"; +option java_package = "io.numaproj.numaflow.accumulator.v1"; import "google/protobuf/empty.proto"; import "google/protobuf/timestamp.proto"; @@ -45,7 +62,6 @@ message AccumulatorRequest { Payload payload = 1; WindowOperation operation = 2; - optional Handshake handshake = 3; } @@ -63,19 +79,12 @@ message AccumulatorResponse { // window represents a window to which the result belongs. KeyedWindow window = 2; repeated string tags = 3; - optional Handshake handshake = 4; // EOF represents the end of the response for a window. - bool EOF = 5; + bool EOF = 4; } // ReadyResponse is the health check result. message ReadyResponse { bool ready = 1; -} - -// Handshake message between client and server to indicate the start of transmission. -message Handshake { - // Required field indicating the start of transmission. 
- bool sot = 1; -} +} \ No newline at end of file diff --git a/pynumaflow/proto/accumulator/accumulator_pb2.py b/pynumaflow/proto/accumulator/accumulator_pb2.py index 422aacc1..f1e8ec8d 100644 --- a/pynumaflow/proto/accumulator/accumulator_pb2.py +++ b/pynumaflow/proto/accumulator/accumulator_pb2.py @@ -18,7 +18,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x11\x61\x63\x63umulator.proto\x12\x0e\x61\x63\x63umulator.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\xf8\x01\n\x07Payload\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\n\n\x02id\x18\x05 \x01(\t\x12\x35\n\x07headers\x18\x06 \x03(\x0b\x32$.accumulator.v1.Payload.HeadersEntry\x1a.\n\x0cHeadersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\xff\x02\n\x12\x41\x63\x63umulatorRequest\x12(\n\x07payload\x18\x01 \x01(\x0b\x32\x17.accumulator.v1.Payload\x12\x45\n\toperation\x18\x02 \x01(\x0b\x32\x32.accumulator.v1.AccumulatorRequest.WindowOperation\x12\x31\n\thandshake\x18\x03 \x01(\x0b\x32\x19.accumulator.v1.HandshakeH\x00\x88\x01\x01\x1a\xb6\x01\n\x0fWindowOperation\x12G\n\x05\x65vent\x18\x01 \x01(\x0e\x32\x38.accumulator.v1.AccumulatorRequest.WindowOperation.Event\x12\x30\n\x0bkeyedWindow\x18\x02 \x01(\x0b\x32\x1b.accumulator.v1.KeyedWindow"(\n\x05\x45vent\x12\x08\n\x04OPEN\x10\x00\x12\t\n\x05\x43LOSE\x10\x01\x12\n\n\x06\x41PPEND\x10\x02\x42\x0c\n\n_handshake"}\n\x0bKeyedWindow\x12)\n\x05start\x18\x01 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\'\n\x03\x65nd\x18\x02 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04slot\x18\x03 \x01(\t\x12\x0c\n\x04keys\x18\x04 \x03(\t"\xc8\x01\n\x13\x41\x63\x63umulatorResponse\x12(\n\x07payload\x18\x01 \x01(\x0b\x32\x17.accumulator.v1.Payload\x12+\n\x06window\x18\x02 \x01(\x0b\x32\x1b.accumulator.v1.KeyedWindow\x12\x0c\n\x04tags\x18\x03 \x03(\t\x12\x31\n\thandshake\x18\x04 \x01(\x0b\x32\x19.accumulator.v1.HandshakeH\x00\x88\x01\x01\x12\x0b\n\x03\x45OF\x18\x05 \x01(\x08\x42\x0c\n\n_handshake"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"\x18\n\tHandshake\x12\x0b\n\x03sot\x18\x01 \x01(\x08\x32\xac\x01\n\x0b\x41\x63\x63umulator\x12[\n\x0c\x41\x63\x63umulateFn\x12".accumulator.v1.AccumulatorRequest\x1a#.accumulator.v1.AccumulatorResponse(\x01\x30\x01\x12@\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1d.accumulator.v1.ReadyResponseB?Z=github.com/numaproj/numaflow-go/pkg/apis/proto/accumulator/v1b\x06proto3' + b'\n\x11\x61\x63\x63umulator.proto\x12\x0e\x61\x63\x63umulator.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\xf8\x01\n\x07Payload\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\n\n\x02id\x18\x05 \x01(\t\x12\x35\n\x07headers\x18\x06 \x03(\x0b\x32$.accumulator.v1.Payload.HeadersEntry\x1a.\n\x0cHeadersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\xbe\x02\n\x12\x41\x63\x63umulatorRequest\x12(\n\x07payload\x18\x01 \x01(\x0b\x32\x17.accumulator.v1.Payload\x12\x45\n\toperation\x18\x02 \x01(\x0b\x32\x32.accumulator.v1.AccumulatorRequest.WindowOperation\x1a\xb6\x01\n\x0fWindowOperation\x12G\n\x05\x65vent\x18\x01 
\x01(\x0e\x32\x38.accumulator.v1.AccumulatorRequest.WindowOperation.Event\x12\x30\n\x0bkeyedWindow\x18\x02 \x01(\x0b\x32\x1b.accumulator.v1.KeyedWindow"(\n\x05\x45vent\x12\x08\n\x04OPEN\x10\x00\x12\t\n\x05\x43LOSE\x10\x01\x12\n\n\x06\x41PPEND\x10\x02"}\n\x0bKeyedWindow\x12)\n\x05start\x18\x01 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\'\n\x03\x65nd\x18\x02 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04slot\x18\x03 \x01(\t\x12\x0c\n\x04keys\x18\x04 \x03(\t"\x87\x01\n\x13\x41\x63\x63umulatorResponse\x12(\n\x07payload\x18\x01 \x01(\x0b\x32\x17.accumulator.v1.Payload\x12+\n\x06window\x18\x02 \x01(\x0b\x32\x1b.accumulator.v1.KeyedWindow\x12\x0c\n\x04tags\x18\x03 \x03(\t\x12\x0b\n\x03\x45OF\x18\x04 \x01(\x08"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\xac\x01\n\x0b\x41\x63\x63umulator\x12[\n\x0c\x41\x63\x63umulateFn\x12".accumulator.v1.AccumulatorRequest\x1a#.accumulator.v1.AccumulatorResponse(\x01\x30\x01\x12@\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1d.accumulator.v1.ReadyResponseBd\n#io.numaproj.numaflow.accumulator.v1Z=github.com/numaproj/numaflow-go/pkg/apis/proto/accumulator/v1b\x06proto3' ) _globals = globals() @@ -28,7 +28,7 @@ _globals["DESCRIPTOR"]._options = None _globals[ "DESCRIPTOR" - ]._serialized_options = b"Z=github.com/numaproj/numaflow-go/pkg/apis/proto/accumulator/v1" + ]._serialized_options = b"\n#io.numaproj.numaflow.accumulator.v1Z=github.com/numaproj/numaflow-go/pkg/apis/proto/accumulator/v1" _globals["_PAYLOAD_HEADERSENTRY"]._options = None _globals["_PAYLOAD_HEADERSENTRY"]._serialized_options = b"8\001" _globals["_PAYLOAD"]._serialized_start = 100 @@ -36,19 +36,17 @@ _globals["_PAYLOAD_HEADERSENTRY"]._serialized_start = 302 _globals["_PAYLOAD_HEADERSENTRY"]._serialized_end = 348 _globals["_ACCUMULATORREQUEST"]._serialized_start = 351 - _globals["_ACCUMULATORREQUEST"]._serialized_end = 734 - _globals["_ACCUMULATORREQUEST_WINDOWOPERATION"]._serialized_start = 538 - _globals["_ACCUMULATORREQUEST_WINDOWOPERATION"]._serialized_end = 720 - _globals["_ACCUMULATORREQUEST_WINDOWOPERATION_EVENT"]._serialized_start = 680 - _globals["_ACCUMULATORREQUEST_WINDOWOPERATION_EVENT"]._serialized_end = 720 - _globals["_KEYEDWINDOW"]._serialized_start = 736 - _globals["_KEYEDWINDOW"]._serialized_end = 861 - _globals["_ACCUMULATORRESPONSE"]._serialized_start = 864 - _globals["_ACCUMULATORRESPONSE"]._serialized_end = 1064 - _globals["_READYRESPONSE"]._serialized_start = 1066 - _globals["_READYRESPONSE"]._serialized_end = 1096 - _globals["_HANDSHAKE"]._serialized_start = 1098 - _globals["_HANDSHAKE"]._serialized_end = 1122 - _globals["_ACCUMULATOR"]._serialized_start = 1125 - _globals["_ACCUMULATOR"]._serialized_end = 1297 + _globals["_ACCUMULATORREQUEST"]._serialized_end = 669 + _globals["_ACCUMULATORREQUEST_WINDOWOPERATION"]._serialized_start = 487 + _globals["_ACCUMULATORREQUEST_WINDOWOPERATION"]._serialized_end = 669 + _globals["_ACCUMULATORREQUEST_WINDOWOPERATION_EVENT"]._serialized_start = 629 + _globals["_ACCUMULATORREQUEST_WINDOWOPERATION_EVENT"]._serialized_end = 669 + _globals["_KEYEDWINDOW"]._serialized_start = 671 + _globals["_KEYEDWINDOW"]._serialized_end = 796 + _globals["_ACCUMULATORRESPONSE"]._serialized_start = 799 + _globals["_ACCUMULATORRESPONSE"]._serialized_end = 934 + _globals["_READYRESPONSE"]._serialized_start = 936 + _globals["_READYRESPONSE"]._serialized_end = 966 + _globals["_ACCUMULATOR"]._serialized_start = 969 + _globals["_ACCUMULATOR"]._serialized_end = 1141 # @@protoc_insertion_point(module_scope) diff --git 
a/pynumaflow/proto/accumulator/accumulator_pb2.pyi b/pynumaflow/proto/accumulator/accumulator_pb2.pyi index 5c893f90..d9f0f7a5 100644 --- a/pynumaflow/proto/accumulator/accumulator_pb2.pyi +++ b/pynumaflow/proto/accumulator/accumulator_pb2.pyi @@ -47,7 +47,7 @@ class Payload(_message.Message): ) -> None: ... class AccumulatorRequest(_message.Message): - __slots__ = ("payload", "operation", "handshake") + __slots__ = ("payload", "operation") class WindowOperation(_message.Message): __slots__ = ("event", "keyedWindow") @@ -71,15 +71,12 @@ class AccumulatorRequest(_message.Message): ) -> None: ... PAYLOAD_FIELD_NUMBER: _ClassVar[int] OPERATION_FIELD_NUMBER: _ClassVar[int] - HANDSHAKE_FIELD_NUMBER: _ClassVar[int] payload: Payload operation: AccumulatorRequest.WindowOperation - handshake: Handshake def __init__( self, payload: _Optional[_Union[Payload, _Mapping]] = ..., operation: _Optional[_Union[AccumulatorRequest.WindowOperation, _Mapping]] = ..., - handshake: _Optional[_Union[Handshake, _Mapping]] = ..., ) -> None: ... class KeyedWindow(_message.Message): @@ -101,23 +98,20 @@ class KeyedWindow(_message.Message): ) -> None: ... class AccumulatorResponse(_message.Message): - __slots__ = ("payload", "window", "tags", "handshake", "EOF") + __slots__ = ("payload", "window", "tags", "EOF") PAYLOAD_FIELD_NUMBER: _ClassVar[int] WINDOW_FIELD_NUMBER: _ClassVar[int] TAGS_FIELD_NUMBER: _ClassVar[int] - HANDSHAKE_FIELD_NUMBER: _ClassVar[int] EOF_FIELD_NUMBER: _ClassVar[int] payload: Payload window: KeyedWindow tags: _containers.RepeatedScalarFieldContainer[str] - handshake: Handshake EOF: bool def __init__( self, payload: _Optional[_Union[Payload, _Mapping]] = ..., window: _Optional[_Union[KeyedWindow, _Mapping]] = ..., tags: _Optional[_Iterable[str]] = ..., - handshake: _Optional[_Union[Handshake, _Mapping]] = ..., EOF: bool = ..., ) -> None: ... @@ -126,9 +120,3 @@ class ReadyResponse(_message.Message): READY_FIELD_NUMBER: _ClassVar[int] ready: bool def __init__(self, ready: bool = ...) -> None: ... - -class Handshake(_message.Message): - __slots__ = ("sot",) - SOT_FIELD_NUMBER: _ClassVar[int] - sot: bool - def __init__(self, sot: bool = ...) -> None: ... diff --git a/pynumaflow/sourcetransformer/__init__.py b/pynumaflow/sourcetransformer/__init__.py index 69f8018c..8eee3786 100644 --- a/pynumaflow/sourcetransformer/__init__.py +++ b/pynumaflow/sourcetransformer/__init__.py @@ -7,6 +7,7 @@ ) from pynumaflow.sourcetransformer.multiproc_server import SourceTransformMultiProcServer from pynumaflow.sourcetransformer.server import SourceTransformServer +from pynumaflow.sourcetransformer.async_server import SourceTransformAsyncServer __all__ = [ "Message", @@ -16,4 +17,5 @@ "SourceTransformServer", "SourceTransformer", "SourceTransformMultiProcServer", + "SourceTransformAsyncServer", ] diff --git a/pynumaflow/sourcetransformer/_dtypes.py b/pynumaflow/sourcetransformer/_dtypes.py index 13b2bce1..bc0ec7b5 100644 --- a/pynumaflow/sourcetransformer/_dtypes.py +++ b/pynumaflow/sourcetransformer/_dtypes.py @@ -3,6 +3,7 @@ from dataclasses import dataclass from datetime import datetime from typing import TypeVar, Callable, Union, Optional +from collections.abc import Awaitable from warnings import warn from pynumaflow._constants import DROP @@ -210,3 +211,9 @@ def handler(self, keys: list[str], datum: Datum) -> Messages: # SourceTransformCallable is the type of the handler function for the # Source Transformer UDFunction. 
SourceTransformCallable = Union[SourceTransformHandler, SourceTransformer]
+
+
+# SourceTransformAsyncCallable is a callable which can be used as a handler
+# for the Asynchronous Transformer UDF
+SourceTransformHandlerAsyncHandlerCallable = Callable[[list[str], Datum], Awaitable[Messages]]
+SourceTransformAsyncCallable = Union[SourceTransformer, SourceTransformHandlerAsyncHandlerCallable]
diff --git a/pynumaflow/sourcetransformer/async_server.py b/pynumaflow/sourcetransformer/async_server.py
new file mode 100644
index 00000000..0dc8add9
--- /dev/null
+++ b/pynumaflow/sourcetransformer/async_server.py
@@ -0,0 +1,157 @@
+import aiorun
+import grpc
+
+from pynumaflow._constants import (
+    NUM_THREADS_DEFAULT,
+    MAX_MESSAGE_SIZE,
+    MAX_NUM_THREADS,
+    SOURCE_TRANSFORMER_SOCK_PATH,
+    SOURCE_TRANSFORMER_SERVER_INFO_FILE_PATH,
+)
+from pynumaflow.info.types import (
+    ServerInfo,
+    MINIMUM_NUMAFLOW_VERSION,
+    ContainerType,
+)
+from pynumaflow.proto.sourcetransformer import transform_pb2_grpc
+from pynumaflow.shared.server import (
+    NumaflowServer,
+    start_async_server,
+)
+from pynumaflow.sourcetransformer._dtypes import SourceTransformAsyncCallable
+from pynumaflow.sourcetransformer.servicer._async_servicer import SourceTransformAsyncServicer
+
+
+class SourceTransformAsyncServer(NumaflowServer):
+    """
+    Create a new grpc Source Transformer Server instance.
+    A new servicer instance is created and attached to the server.
+    The server instance is returned.
+    Args:
+        source_transform_instance: The source transformer instance to be used for
+            Source Transformer UDF
+        sock_path: The UNIX socket path to be used for the server
+        max_message_size: The max message size in bytes the server can receive and send
+        max_threads: The max number of threads to be spawned;
+            defaults to 4 and max capped at 16
+
+    Example Invocation:
+
+    import datetime
+    import logging
+
+    from pynumaflow.sourcetransformer import Messages, Message, Datum, SourceTransformAsyncServer
+    # This is a simple User Defined Function example which receives a message,
+    # applies the following
+    # data transformation, and returns the message.
+    # If the message event time is before year 2022, drop the message with event time unchanged.
+    # If it's within year 2022, update the tag to "within_year_2022" and
+    # update the message event time to Jan 1st 2022.
+    # Otherwise (exclusively after year 2022), update the tag to
+    # "after_year_2022" and update the
+    # message event time to Jan 1st 2023.
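+    # 1640995200 and 1672531200 below are the Unix epoch seconds for
+    # 2022-01-01 and 2023-01-01 (UTC) respectively.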
+
+    january_first_2022 = datetime.datetime.fromtimestamp(1640995200)
+    january_first_2023 = datetime.datetime.fromtimestamp(1672531200)
+
+
+    async def my_handler(keys: list[str], datum: Datum) -> Messages:
+        val = datum.value
+        event_time = datum.event_time
+        messages = Messages()
+
+        if event_time < january_first_2022:
+            logging.info("Got event time:%s, it is before 2022, so dropping", event_time)
+            messages.append(Message.to_drop(event_time))
+        elif event_time < january_first_2023:
+            logging.info(
+                "Got event time:%s, it is within year 2022, so forwarding to within_year_2022",
+                event_time,
+            )
+            messages.append(
+                Message(value=val, event_time=january_first_2022,
+                        tags=["within_year_2022"])
+            )
+        else:
+            logging.info(
+                "Got event time:%s, it is after year 2022, so forwarding to after_year_2022",
+                event_time,
+            )
+            messages.append(Message(value=val, event_time=january_first_2023,
+                                    tags=["after_year_2022"]))
+
+        return messages
+
+
+    if __name__ == "__main__":
+        grpc_server = SourceTransformAsyncServer(my_handler)
+        grpc_server.start()
+    """
+
+    def __init__(
+        self,
+        source_transform_instance: SourceTransformAsyncCallable,
+        sock_path=SOURCE_TRANSFORMER_SOCK_PATH,
+        max_message_size=MAX_MESSAGE_SIZE,
+        max_threads=NUM_THREADS_DEFAULT,
+        server_info_file=SOURCE_TRANSFORMER_SERVER_INFO_FILE_PATH,
+    ):
+        """
+        Create a new grpc Asynchronous Source Transformer Server instance.
+        A new servicer instance is created and attached to the server.
+        The server instance is returned.
+        Args:
+            source_transform_instance: The source transformer instance to be used for
+                the Source Transformer UDF
+            sock_path: The UNIX socket path to be used for the server
+            max_message_size: The max message size in bytes the server can receive and send
+            max_threads: The max number of threads to be spawned;
+                defaults to 4 and max capped at 16
+        """
+        self.sock_path = f"unix://{sock_path}"
+        self.max_threads = min(max_threads, MAX_NUM_THREADS)
+        self.max_message_size = max_message_size
+        self.server_info_file = server_info_file
+
+        self.source_transform_instance = source_transform_instance
+
+        self._server_options = [
+            ("grpc.max_send_message_length", self.max_message_size),
+            ("grpc.max_receive_message_length", self.max_message_size),
+        ]
+        self.servicer = SourceTransformAsyncServicer(handler=source_transform_instance)
+
+    def start(self) -> None:
+        """
+        Starter function for the Async server class; needs a separate caller
+        so that all the async coroutines can be started from a single context
+        """
+        aiorun.run(self.aexec(), use_uvloop=True)
+
+    async def aexec(self) -> None:
+        """
+        Starts the Async gRPC server on the given UNIX socket with
+        given max threads.
+ """ + + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + + server_new = grpc.aio.server(options=self._server_options) + server_new.add_insecure_port(self.sock_path) + transform_pb2_grpc.add_SourceTransformServicer_to_server(self.servicer, server_new) + + serv_info = ServerInfo.get_default_server_info() + serv_info.minimum_numaflow_version = MINIMUM_NUMAFLOW_VERSION[ + ContainerType.Sourcetransformer + ] + + # Start the async server + await start_async_server( + server_async=server_new, + sock_path=self.sock_path, + max_threads=self.max_threads, + cleanup_coroutines=list(), + server_info_file=self.server_info_file, + server_info=serv_info, + ) diff --git a/pynumaflow/sourcetransformer/servicer/_async_servicer.py b/pynumaflow/sourcetransformer/servicer/_async_servicer.py new file mode 100644 index 00000000..b2e70799 --- /dev/null +++ b/pynumaflow/sourcetransformer/servicer/_async_servicer.py @@ -0,0 +1,142 @@ +import asyncio +from collections.abc import AsyncIterable + +from google.protobuf import empty_pb2 as _empty_pb2 +from google.protobuf import timestamp_pb2 as _timestamp_pb2 + +from pynumaflow._constants import _LOGGER, STREAM_EOF, ERR_UDF_EXCEPTION_STRING +from pynumaflow.proto.sourcetransformer import transform_pb2, transform_pb2_grpc +from pynumaflow.shared.asynciter import NonBlockingIterator +from pynumaflow.shared.server import handle_async_error +from pynumaflow.sourcetransformer import Datum +from pynumaflow.sourcetransformer._dtypes import SourceTransformAsyncCallable +from pynumaflow.types import NumaflowServicerContext + + +class SourceTransformAsyncServicer(transform_pb2_grpc.SourceTransformServicer): + """ + This class is used to create a new grpc SourceTransform Async Servicer instance. + It implements the SourceTransformServicer interface from the proto + transform_pb2_grpc.py file. + Provides the functionality for the required rpc methods. + """ + + def __init__( + self, + handler: SourceTransformAsyncCallable, + ): + self.background_tasks = set() + self.__transform_handler: SourceTransformAsyncCallable = handler + + async def SourceTransformFn( + self, + request_iterator: AsyncIterable[transform_pb2.SourceTransformRequest], + context: NumaflowServicerContext, + ) -> AsyncIterable[transform_pb2.SourceTransformResponse]: + """ + Applies a transform function to a SourceTransformRequest stream + The pascal case function name comes from the proto transform_pb2_grpc.py file. 
+ """ + try: + # The first message to be received should be a valid handshake + req = await request_iterator.__anext__() + # check if it is a valid handshake req + if not (req.handshake and req.handshake.sot): + raise Exception("SourceTransformFn: expected handshake message") + yield transform_pb2.SourceTransformResponse( + handshake=transform_pb2.Handshake(sot=True), + ) + + # result queue to stream messages from the user code back to the client + global_result_queue = NonBlockingIterator() + + # reader task to process the input task and invoke the required tasks + producer = asyncio.create_task( + self._process_inputs(request_iterator, global_result_queue) + ) + + # keep reading on result queue and send messages back + consumer = global_result_queue.read_iterator() + async for msg in consumer: + # If the message is an exception, we raise the exception + if isinstance(msg, BaseException): + await handle_async_error(context, msg, ERR_UDF_EXCEPTION_STRING) + return + # Send window response back to the client + else: + yield msg + # wait for the producer task to complete + await producer + except BaseException as e: + _LOGGER.critical("SourceTransformFnError, re-raising the error", exc_info=True) + await handle_async_error(context, e, ERR_UDF_EXCEPTION_STRING) + return + + async def _process_inputs( + self, + request_iterator: AsyncIterable[transform_pb2.SourceTransformRequest], + result_queue: NonBlockingIterator, + ): + """ + Utility function for processing incoming SourceTransformRequest + """ + try: + # for each incoming request, create a background task to execute the + # UDF code + async for req in request_iterator: + msg_task = asyncio.create_task(self._invoke_transform(req, result_queue)) + # save a reference to a set to store active tasks + self.background_tasks.add(msg_task) + msg_task.add_done_callback(self.background_tasks.discard) + + # Wait for all tasks to complete concurrently + await asyncio.gather(*self.background_tasks) + + # send an EOF to result queue to indicate that all tasks have completed + await result_queue.put(STREAM_EOF) + + except BaseException: + _LOGGER.critical("SourceTransformFnError Error, re-raising the error", exc_info=True) + + async def _invoke_transform( + self, request: transform_pb2.SourceTransformRequest, result_queue: NonBlockingIterator + ): + """ + Invokes the user defined function. + """ + try: + datum = Datum( + keys=list(request.request.keys), + value=request.request.value, + event_time=request.request.event_time.ToDatetime(), + watermark=request.request.watermark.ToDatetime(), + headers=dict(request.request.headers), + ) + msgs = await self.__transform_handler(list(request.request.keys), datum) + results = [] + for msg in msgs: + event_time_timestamp = _timestamp_pb2.Timestamp() + event_time_timestamp.FromDatetime(dt=msg.event_time) + results.append( + transform_pb2.SourceTransformResponse.Result( + keys=list(msg.keys), + value=msg.value, + tags=msg.tags, + event_time=event_time_timestamp, + ) + ) + await result_queue.put( + transform_pb2.SourceTransformResponse(results=results, id=request.request.id) + ) + except BaseException as err: + _LOGGER.critical("SourceTransformFnError handler error", exc_info=True) + await result_queue.put(err) + + async def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> transform_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto transform_pb2_grpc.py file. 
+ """ + return transform_pb2.ReadyResponse(ready=True) diff --git a/pyproject.toml b/pyproject.toml index 309a300c..41816b73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pynumaflow" -version = "0.9.1" +version = "0.10.0a0" description = "Provides the interfaces of writing Python User Defined Functions and Sinks for NumaFlow." authors = ["NumaFlow Developers"] readme = "README.md" @@ -80,7 +80,5 @@ extend-exclude = [ "*_pb2*.py", "*.pyi" ] -output-format = "full" -[tool.ruff.lint] select = ["E", "F", "UP"] diff --git a/tests/accumulator/__init__.py b/tests/accumulator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/accumulator/test_async_accumulator.py b/tests/accumulator/test_async_accumulator.py new file mode 100644 index 00000000..292e3687 --- /dev/null +++ b/tests/accumulator/test_async_accumulator.py @@ -0,0 +1,476 @@ +import asyncio +import logging +import threading +import unittest +from collections.abc import AsyncIterable + +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 +from grpc.aio._server import Server + +from pynumaflow import setup_logging +from pynumaflow.accumulator import ( + Message, + Datum, + AccumulatorAsyncServer, + Accumulator, +) +from pynumaflow.proto.accumulator import accumulator_pb2, accumulator_pb2_grpc +from pynumaflow.shared.asynciter import NonBlockingIterator +from tests.testing_utils import ( + mock_message, + mock_interval_window_start, + mock_interval_window_end, + get_time_args, +) + +LOGGER = setup_logging(__name__) + + +def request_generator(count, request, resetkey: bool = False, send_close: bool = False): + for i in range(count): + if resetkey: + # Clear previous keys and add new ones + del request.payload.keys[:] + request.payload.keys.extend([f"key-{i}"]) + + # Set operation based on index - first is OPEN, rest are APPEND + if i == 0: + request.operation.event = accumulator_pb2.AccumulatorRequest.WindowOperation.Event.OPEN + else: + request.operation.event = ( + accumulator_pb2.AccumulatorRequest.WindowOperation.Event.APPEND + ) + yield request + + if send_close: + # Send a close operation after all requests + request.operation.event = accumulator_pb2.AccumulatorRequest.WindowOperation.Event.CLOSE + yield request + + +def request_generator_append_only(count, request, resetkey: bool = False): + for i in range(count): + if resetkey: + # Clear previous keys and add new ones + del request.payload.keys[:] + request.payload.keys.extend([f"key-{i}"]) + + # Set operation to APPEND for all requests + request.operation.event = accumulator_pb2.AccumulatorRequest.WindowOperation.Event.APPEND + yield request + + +def request_generator_mixed(count, request, resetkey: bool = False): + for i in range(count): + if resetkey: + # Clear previous keys and add new ones + del request.payload.keys[:] + request.payload.keys.extend([f"key-{i}"]) + + if i % 2 == 0: + # Set operation to APPEND for even requests + request.operation.event = ( + accumulator_pb2.AccumulatorRequest.WindowOperation.Event.APPEND + ) + else: + # Set operation to CLOSE for odd requests + request.operation.event = accumulator_pb2.AccumulatorRequest.WindowOperation.Event.CLOSE + yield request + + +def start_request() -> accumulator_pb2.AccumulatorRequest: + event_time_timestamp, watermark_timestamp = get_time_args() + window = accumulator_pb2.KeyedWindow( + start=mock_interval_window_start(), + end=mock_interval_window_end(), + slot="slot-0", + keys=["test_key"], + ) + payload = accumulator_pb2.Payload( + 
keys=["test_key"], + value=mock_message(), + event_time=event_time_timestamp, + watermark=watermark_timestamp, + id="test_id", + ) + operation = accumulator_pb2.AccumulatorRequest.WindowOperation( + event=accumulator_pb2.AccumulatorRequest.WindowOperation.Event.OPEN, + keyedWindow=window, + ) + request = accumulator_pb2.AccumulatorRequest( + payload=payload, + operation=operation, + ) + return request + + +def start_request_without_open() -> accumulator_pb2.AccumulatorRequest: + event_time_timestamp, watermark_timestamp = get_time_args() + + payload = accumulator_pb2.Payload( + keys=["test_key"], + value=mock_message(), + event_time=event_time_timestamp, + watermark=watermark_timestamp, + id="test_id", + ) + + request = accumulator_pb2.AccumulatorRequest( + payload=payload, + ) + return request + + +_s: Server = None +_channel = grpc.insecure_channel("unix:///tmp/accumulator.sock") +_loop = None + + +def startup_callable(loop): + asyncio.set_event_loop(loop) + loop.run_forever() + + +class ExampleClass(Accumulator): + def __init__(self, counter): + self.counter = counter + + async def handler(self, datums: AsyncIterable[Datum], output: NonBlockingIterator): + async for datum in datums: + self.counter += 1 + msg = f"counter:{self.counter}" + await output.put(Message(str.encode(msg), keys=datum.keys(), tags=[])) + + +async def accumulator_handler_func(datums: AsyncIterable[Datum], output: NonBlockingIterator): + counter = 0 + async for datum in datums: + counter += 1 + msg = f"counter:{counter}" + await output.put(Message(str.encode(msg), keys=datum.keys(), tags=[])) + + +def NewAsyncAccumulator(): + server_instance = AccumulatorAsyncServer(ExampleClass, init_args=(0,)) + udfs = server_instance.servicer + return udfs + + +async def start_server(udfs): + server = grpc.aio.server() + accumulator_pb2_grpc.add_AccumulatorServicer_to_server(udfs, server) + listen_addr = "unix:///tmp/accumulator.sock" + server.add_insecure_port(listen_addr) + logging.info("Starting server on %s", listen_addr) + global _s + _s = server + await server.start() + await server.wait_for_termination() + + +class TestAsyncAccumulator(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + global _loop + loop = asyncio.new_event_loop() + _loop = loop + _thread = threading.Thread(target=startup_callable, args=(loop,), daemon=True) + _thread.start() + udfs = NewAsyncAccumulator() + asyncio.run_coroutine_threadsafe(start_server(udfs), loop=loop) + while True: + try: + with grpc.insecure_channel("unix:///tmp/accumulator.sock") as channel: + f = grpc.channel_ready_future(channel) + f.result(timeout=10) + if f.done(): + break + except grpc.FutureTimeoutError as e: + LOGGER.error("error trying to connect to grpc server") + LOGGER.error(e) + + @classmethod + def tearDownClass(cls) -> None: + try: + _loop.stop() + LOGGER.info("stopped the event loop") + except Exception as e: + LOGGER.error(e) + + def test_accumulate(self) -> None: + stub = self.__stub() + request = start_request() + generator_response = None + try: + generator_response = stub.AccumulateFn( + request_iterator=request_generator(count=5, request=request) + ) + except grpc.RpcError as e: + logging.error(e) + + # capture the output from the AccumulateFn generator and assert. 
+ count = 0 + eof_count = 0 + for r in generator_response: + if hasattr(r, "payload") and r.payload.value: + count += 1 + # Each datum should increment the counter + expected_msg = f"counter:{count}" + self.assertEqual( + bytes(expected_msg, encoding="utf-8"), + r.payload.value, + ) + self.assertEqual(r.EOF, False) + # Check that keys are preserved + self.assertEqual(list(r.payload.keys), ["test_key"]) + else: + self.assertEqual(r.EOF, True) + eof_count += 1 + + # We should have received 5 messages (one for each datum) + self.assertEqual(5, count) + self.assertEqual(1, eof_count) + + def test_accumulate_with_multiple_keys(self) -> None: + stub = self.__stub() + request = start_request() + generator_response = None + try: + generator_response = stub.AccumulateFn( + request_iterator=request_generator(count=10, request=request, resetkey=True), + ) + except grpc.RpcError as e: + LOGGER.error(e) + + count = 0 + eof_count = 0 + key_counts = {} + + # capture the output from the AccumulateFn generator and assert. + for r in generator_response: + # Check for responses with values + if r.payload.value: + count += 1 + # Track count per key + key = r.payload.keys[0] if r.payload.keys else "no_key" + key_counts[key] = key_counts.get(key, 0) + 1 + + # Each key should have its own counter starting from 1 + expected_msg = f"counter:{key_counts[key]}" + self.assertEqual( + bytes(expected_msg, encoding="utf-8"), + r.payload.value, + ) + self.assertEqual(r.EOF, False) + else: + eof_count += 1 + self.assertEqual(r.EOF, True) + + # We should have 10 messages (one for each key) + self.assertEqual(10, count) + self.assertEqual(10, eof_count) # Each key/task sends its own EOF + # Each key should appear once + self.assertEqual(len(key_counts), 10) + + def test_accumulate_with_close(self) -> None: + stub = self.__stub() + request = start_request() + generator_response = None + try: + generator_response = stub.AccumulateFn( + request_iterator=request_generator(count=5, request=request, send_close=True) + ) + except grpc.RpcError as e: + logging.error(e) + + # capture the output from the AccumulateFn generator and assert. + count = 0 + eof_count = 0 + for r in generator_response: + if hasattr(r, "payload") and r.payload.value: + count += 1 + # Each datum should increment the counter + expected_msg = f"counter:{count}" + self.assertEqual( + bytes(expected_msg, encoding="utf-8"), + r.payload.value, + ) + self.assertEqual(r.EOF, False) + # Check that keys are preserved + self.assertEqual(list(r.payload.keys), ["test_key"]) + else: + self.assertEqual(r.EOF, True) + eof_count += 1 + + # We should have received 5 messages (one for each datum) + self.assertEqual(5, count) + self.assertEqual(1, eof_count) + + def test_accumulate_append_without_open(self) -> None: + stub = self.__stub() + request = start_request_without_open() + generator_response = None + try: + generator_response = stub.AccumulateFn( + request_iterator=request_generator_append_only(count=5, request=request) + ) + except grpc.RpcError as e: + logging.error(e) + + # capture the output from the AccumulateFn generator and assert. 
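+        # APPEND without a prior OPEN lazily creates the task, so the
+        # output matches the OPEN path above.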
+        count = 0
+        eof_count = 0
+        for r in generator_response:
+            if hasattr(r, "payload") and r.payload.value:
+                count += 1
+                # Each datum should increment the counter
+                expected_msg = f"counter:{count}"
+                self.assertEqual(
+                    bytes(expected_msg, encoding="utf-8"),
+                    r.payload.value,
+                )
+                self.assertEqual(r.EOF, False)
+                # Check that keys are preserved
+                self.assertEqual(list(r.payload.keys), ["test_key"])
+            else:
+                self.assertEqual(r.EOF, True)
+                eof_count += 1
+
+        # We should have received 5 messages (one for each datum)
+        self.assertEqual(5, count)
+        self.assertEqual(1, eof_count)
+
+    def test_accumulate_append_mixed(self) -> None:
+        stub = self.__stub()
+        request = start_request()
+        generator_response = None
+        try:
+            generator_response = stub.AccumulateFn(
+                request_iterator=request_generator_mixed(count=5, request=request)
+            )
+        except grpc.RpcError as e:
+            logging.error(e)
+
+        # capture the output from the AccumulateFn generator and assert.
+        count = 0
+        eof_count = 0
+        for r in generator_response:
+            if hasattr(r, "payload") and r.payload.value:
+                count += 1
+                # Each APPEND opens a fresh task here, so every message is counter:1
+                expected_msg = "counter:1"
+                self.assertEqual(
+                    bytes(expected_msg, encoding="utf-8"),
+                    r.payload.value,
+                )
+                self.assertEqual(r.EOF, False)
+                # Check that keys are preserved
+                self.assertEqual(list(r.payload.keys), ["test_key"])
+            else:
+                self.assertEqual(r.EOF, True)
+                eof_count += 1
+
+        # Three APPENDs interleaved with CLOSEs yield 3 messages and 3 EOFs
+        self.assertEqual(3, count)
+        self.assertEqual(3, eof_count)
+
+    def test_is_ready(self) -> None:
+        with grpc.insecure_channel("unix:///tmp/accumulator.sock") as channel:
+            stub = accumulator_pb2_grpc.AccumulatorStub(channel)
+
+            request = _empty_pb2.Empty()
+            response = None
+            try:
+                response = stub.IsReady(request=request)
+            except grpc.RpcError as e:
+                logging.error(e)
+
+            self.assertTrue(response.ready)
+
+    def __stub(self):
+        return accumulator_pb2_grpc.AccumulatorStub(_channel)
+
+    def test_error_init(self):
+        # Check that accumulator_instance is required
+        with self.assertRaises(TypeError):
+            AccumulatorAsyncServer()
+        # Check that the init_args and init_kwargs are passed
+        # only with an Accumulator class
+        with self.assertRaises(TypeError):
+            AccumulatorAsyncServer(accumulator_handler_func, init_args=(0, 1))
+        # Check that an instance is not passed instead of the class signature
+        with self.assertRaises(TypeError):
+            AccumulatorAsyncServer(ExampleClass(0))
+
+        # Check that an invalid class is rejected
+        class ExampleBadClass:
+            pass
+
+        with self.assertRaises(TypeError):
+            AccumulatorAsyncServer(accumulator_instance=ExampleBadClass)
+
+    def test_max_threads(self):
+        # max cap at 16
+        server = AccumulatorAsyncServer(accumulator_instance=ExampleClass, max_threads=32)
+        self.assertEqual(server.max_threads, 16)
+
+        # use argument provided
+        server = AccumulatorAsyncServer(accumulator_instance=ExampleClass, max_threads=5)
+        self.assertEqual(server.max_threads, 5)
+
+        # defaults to 4
+        server = AccumulatorAsyncServer(accumulator_instance=ExampleClass)
+        self.assertEqual(server.max_threads, 4)
+
+        # zero threads
+        server = AccumulatorAsyncServer(ExampleClass, max_threads=0)
+        self.assertEqual(server.max_threads, 0)
+
+        # negative threads
+        server = AccumulatorAsyncServer(ExampleClass, max_threads=-5)
+        self.assertEqual(server.max_threads, -5)
+
+    def test_server_info_file_path_handling(self):
+        """Test AccumulatorAsyncServer with custom server info file path."""
+
+        server = AccumulatorAsyncServer(
+            ExampleClass, init_args=(0,),
server_info_file="/custom/path/server_info.json" + ) + + self.assertEqual(server.server_info_file, "/custom/path/server_info.json") + + def test_init_kwargs_none_handling(self): + """Test init_kwargs None handling in AccumulatorAsyncServer.""" + + server = AccumulatorAsyncServer( + ExampleClass, init_args=(0,), init_kwargs=None # This should be converted to {} + ) + + # Should not raise any errors and should work correctly + self.assertIsNotNone(server.accumulator_handler) + + def test_server_start_method_logging(self): + """Test server start method includes proper logging.""" + from unittest.mock import patch + + server = AccumulatorAsyncServer(ExampleClass) + + # Mock aiorun.run to prevent actual server startup + with patch("pynumaflow.accumulator.async_server.aiorun") as mock_aiorun, patch( + "pynumaflow.accumulator.async_server._LOGGER" + ) as mock_logger: + server.start() + + # Verify logging was called + mock_logger.info.assert_called_once_with("Starting Async Accumulator Server") + + # Verify aiorun.run was called with correct parameters + mock_aiorun.run.assert_called_once() + self.assertTrue(mock_aiorun.run.call_args[1]["use_uvloop"]) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + unittest.main() diff --git a/tests/accumulator/test_async_accumulator_err.py b/tests/accumulator/test_async_accumulator_err.py new file mode 100644 index 00000000..5b39174c --- /dev/null +++ b/tests/accumulator/test_async_accumulator_err.py @@ -0,0 +1,175 @@ +import asyncio +import logging +import threading +import unittest +from collections.abc import AsyncIterable +from unittest.mock import patch + +import grpc +from grpc.aio._server import Server + +from pynumaflow import setup_logging +from pynumaflow.accumulator import ( + Message, + Datum, + AccumulatorAsyncServer, + Accumulator, +) +from pynumaflow.proto.accumulator import accumulator_pb2, accumulator_pb2_grpc +from pynumaflow.shared.asynciter import NonBlockingIterator +from tests.testing_utils import ( + mock_message, + get_time_args, + mock_terminate_on_stop, +) + +LOGGER = setup_logging(__name__) + + +def request_generator(count, request): + for i in range(count): + yield request + + +def start_request() -> accumulator_pb2.AccumulatorRequest: + event_time_timestamp, watermark_timestamp = get_time_args() + window = accumulator_pb2.KeyedWindow( + start=event_time_timestamp, + end=watermark_timestamp, + slot="slot-0", + keys=["test_key"], + ) + payload = accumulator_pb2.Payload( + keys=["test_key"], + value=mock_message(), + event_time=event_time_timestamp, + watermark=watermark_timestamp, + id="test_id", + headers={"test_header_key": "test_header_value", "source": "test_source"}, + ) + operation = accumulator_pb2.AccumulatorRequest.WindowOperation( + event=accumulator_pb2.AccumulatorRequest.WindowOperation.Event.OPEN, + keyedWindow=window, + ) + request = accumulator_pb2.AccumulatorRequest( + payload=payload, + operation=operation, + ) + return request + + +_s: Server = None +_channel = grpc.insecure_channel("unix:///tmp/accumulator_err.sock") +_loop = None + + +def startup_callable(loop): + asyncio.set_event_loop(loop) + loop.run_forever() + + +class ExampleErrorClass(Accumulator): + def __init__(self, counter): + self.counter = counter + + async def handler(self, datums: AsyncIterable[Datum], output: NonBlockingIterator): + async for datum in datums: + self.counter += 1 + if self.counter == 2: + # Simulate an error on the second datum + raise RuntimeError("Simulated error in accumulator handler") + msg = 
f"counter:{self.counter}" + await output.put(Message(str.encode(msg), keys=datum.keys(), tags=[])) + + +async def error_accumulator_handler_func(datums: AsyncIterable[Datum], output: NonBlockingIterator): + counter = 0 + async for datum in datums: + counter += 1 + if counter == 2: + # Simulate an error on the second datum + raise RuntimeError("Simulated error in accumulator function") + msg = f"counter:{counter}" + await output.put(Message(str.encode(msg), keys=datum.keys(), tags=[])) + + +def NewAsyncAccumulatorError(): + server_instance = AccumulatorAsyncServer(ExampleErrorClass, init_args=(0,)) + udfs = server_instance.servicer + return udfs + + +@patch("psutil.Process.kill", mock_terminate_on_stop) +async def start_server(udfs): + server = grpc.aio.server() + accumulator_pb2_grpc.add_AccumulatorServicer_to_server(udfs, server) + listen_addr = "unix:///tmp/accumulator_err.sock" + server.add_insecure_port(listen_addr) + logging.info("Starting server on %s", listen_addr) + global _s + _s = server + await server.start() + await server.wait_for_termination() + + +@patch("psutil.Process.kill", mock_terminate_on_stop) +class TestAsyncAccumulatorError(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + global _loop + loop = asyncio.new_event_loop() + _loop = loop + _thread = threading.Thread(target=startup_callable, args=(loop,), daemon=True) + _thread.start() + udfs = NewAsyncAccumulatorError() + asyncio.run_coroutine_threadsafe(start_server(udfs), loop=loop) + while True: + try: + with grpc.insecure_channel("unix:///tmp/accumulator_err.sock") as channel: + f = grpc.channel_ready_future(channel) + f.result(timeout=10) + if f.done(): + break + except grpc.FutureTimeoutError as e: + LOGGER.error("error trying to connect to grpc server") + LOGGER.error(e) + + @classmethod + def tearDownClass(cls) -> None: + try: + _loop.stop() + LOGGER.info("stopped the event loop") + except Exception as e: + LOGGER.error(e) + + @patch("psutil.Process.kill", mock_terminate_on_stop) + def test_accumulate_partial_success(self) -> None: + """Test that the first datum is processed before error occurs""" + stub = self.__stub() + request = start_request() + + try: + generator_response = stub.AccumulateFn( + request_iterator=request_generator(count=5, request=request) + ) + + # Try to consume the generator + counter = 0 + for response in generator_response: + self.assertIsInstance(response, accumulator_pb2.AccumulatorResponse) + self.assertTrue(response.payload.value.startswith(b"counter:")) + counter += 1 + + self.assertEqual(counter, 1, "Expected only one successful response before error") + except BaseException as err: + self.assertTrue("Simulated error in accumulator handler" in str(err)) + return + self.fail("Expected an exception.") + + def __stub(self): + return accumulator_pb2_grpc.AccumulatorStub(_channel) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + unittest.main() diff --git a/tests/accumulator/test_datatypes.py b/tests/accumulator/test_datatypes.py new file mode 100644 index 00000000..a71f3452 --- /dev/null +++ b/tests/accumulator/test_datatypes.py @@ -0,0 +1,339 @@ +import unittest +from collections.abc import AsyncIterable +from datetime import datetime, timezone + +from google.protobuf import timestamp_pb2 as _timestamp_pb2 +from pynumaflow.accumulator import Accumulator + +from pynumaflow.accumulator._dtypes import ( + IntervalWindow, + KeyedWindow, + Datum, + AccumulatorResult, + AccumulatorRequest, + WindowOperation, + Message, +) +from 
pynumaflow.shared.asynciter import NonBlockingIterator +from tests.testing_utils import ( + mock_message, + mock_event_time, + mock_watermark, + mock_start_time, + mock_end_time, +) + +TEST_KEYS = ["test"] +TEST_ID = "test_id" +TEST_HEADERS = {"key1": "value1", "key2": "value2"} + + +class TestDatum(unittest.TestCase): + def test_err_event_time(self): + ts = _timestamp_pb2.Timestamp() + ts.GetCurrentTime() + headers = {"key1": "value1", "key2": "value2"} + with self.assertRaises(Exception) as context: + Datum( + keys=TEST_KEYS, + value=mock_message(), + event_time=ts, + watermark=mock_watermark(), + id_=TEST_ID, + headers=headers, + ) + self.assertEqual( + "Wrong data type: <class 'google.protobuf.timestamp_pb2.Timestamp'> " + "for Datum.event_time", + str(context.exception), + ) + + def test_err_watermark(self): + ts = _timestamp_pb2.Timestamp() + ts.GetCurrentTime() + headers = {"key1": "value1", "key2": "value2"} + with self.assertRaises(Exception) as context: + Datum( + keys=TEST_KEYS, + value=mock_message(), + event_time=mock_event_time(), + watermark=ts, + id_=TEST_ID, + headers=headers, + ) + self.assertEqual( + "Wrong data type: <class 'google.protobuf.timestamp_pb2.Timestamp'> " + "for Datum.watermark", + str(context.exception), + ) + + def test_properties(self): + d = Datum( + keys=TEST_KEYS, + value=mock_message(), + event_time=mock_event_time(), + watermark=mock_watermark(), + id_=TEST_ID, + headers=TEST_HEADERS, + ) + self.assertEqual(mock_message(), d.value) + self.assertEqual(TEST_KEYS, d.keys()) + self.assertEqual(mock_event_time(), d.event_time) + self.assertEqual(mock_watermark(), d.watermark) + self.assertEqual(TEST_HEADERS, d.headers) + self.assertEqual(TEST_ID, d.id) + + def test_default_values(self): + d = Datum( + keys=None, + value=None, + event_time=mock_event_time(), + watermark=mock_watermark(), + id_=TEST_ID, + ) + self.assertEqual([], d.keys()) + self.assertEqual(b"", d.value) + self.assertEqual({}, d.headers) + + +class TestIntervalWindow(unittest.TestCase): + def test_start(self): + i = IntervalWindow(start=mock_start_time(), end=mock_end_time()) + self.assertEqual(mock_start_time(), i.start) + + def test_end(self): + i = IntervalWindow(start=mock_start_time(), end=mock_end_time()) + self.assertEqual(mock_end_time(), i.end) + + +class TestKeyedWindow(unittest.TestCase): + def test_create_window(self): + kw = KeyedWindow( + start=mock_start_time(), end=mock_end_time(), slot="slot-0", keys=["key1", "key2"] + ) + self.assertEqual(kw.start, mock_start_time()) + self.assertEqual(kw.end, mock_end_time()) + self.assertEqual(kw.slot, "slot-0") + self.assertEqual(kw.keys, ["key1", "key2"]) + + def test_default_values(self): + kw = KeyedWindow(start=mock_start_time(), end=mock_end_time()) + self.assertEqual(kw.slot, "") + self.assertEqual(kw.keys, []) + + def test_window_property(self): + kw = KeyedWindow(start=mock_start_time(), end=mock_end_time()) + self.assertIsInstance(kw.window, IntervalWindow) + self.assertEqual(kw.window.start, mock_start_time()) + self.assertEqual(kw.window.end, mock_end_time()) + + +class TestAccumulatorResult(unittest.TestCase): + def test_create_result(self): + # Create mock objects + future = None # In real usage, this would be an asyncio.Task + iterator = NonBlockingIterator() + keys = ["key1", "key2"] + result_queue = NonBlockingIterator() + consumer_future = None # In real usage, this would be an asyncio.Task + watermark = datetime.fromtimestamp(1662998400, timezone.utc) + + result = AccumulatorResult(future, iterator, keys, result_queue, consumer_future, watermark) + + self.assertEqual(result.future, future) +
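# AccumulatorResult is per-window bookkeeping: the handler task, its input iterator, the window keys, the output queue, the consumer task, and the latest watermark seen +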
self.assertEqual(result.iterator, iterator) + self.assertEqual(result.keys, keys) + self.assertEqual(result.result_queue, result_queue) + self.assertEqual(result.consumer_future, consumer_future) + self.assertEqual(result.latest_watermark, watermark) + + def test_update_watermark(self): + result = AccumulatorResult( + None, None, [], None, None, datetime.fromtimestamp(1662998400, timezone.utc) + ) + new_watermark = datetime.fromtimestamp(1662998460, timezone.utc) + result.update_watermark(new_watermark) + self.assertEqual(result.latest_watermark, new_watermark) + + def test_update_watermark_invalid_type(self): + result = AccumulatorResult( + None, None, [], None, None, datetime.fromtimestamp(1662998400, timezone.utc) + ) + with self.assertRaises(TypeError): + result.update_watermark("not a datetime") + + +class TestAccumulatorRequest(unittest.TestCase): + def test_create_request(self): + operation = WindowOperation.OPEN + keyed_window = KeyedWindow(start=mock_start_time(), end=mock_end_time()) + payload = Datum( + keys=TEST_KEYS, + value=mock_message(), + event_time=mock_event_time(), + watermark=mock_watermark(), + id_=TEST_ID, + ) + + request = AccumulatorRequest(operation, keyed_window, payload) + self.assertEqual(request.operation, operation) + self.assertEqual(request.keyed_window, keyed_window) + self.assertEqual(request.payload, payload) + + +class TestWindowOperation(unittest.TestCase): + def test_enum_values(self): + self.assertEqual(WindowOperation.OPEN, 0) + self.assertEqual(WindowOperation.CLOSE, 1) + self.assertEqual(WindowOperation.APPEND, 2) + + +class TestMessage(unittest.TestCase): + def test_create_message(self): + value = b"test_value" + keys = ["key1", "key2"] + tags = ["tag1", "tag2"] + + msg = Message(value=value, keys=keys, tags=tags) + self.assertEqual(msg.value, value) + self.assertEqual(msg.keys, keys) + self.assertEqual(msg.tags, tags) + + def test_default_values(self): + msg = Message(value=b"test") + self.assertEqual(msg.keys, []) + self.assertEqual(msg.tags, []) + + def test_to_drop(self): + msg = Message.to_drop() + self.assertEqual(msg.value, b"") + self.assertEqual(msg.keys, []) + self.assertTrue("U+005C__DROP__" in msg.tags) + + def test_none_values(self): + msg = Message(value=None, keys=None, tags=None) + self.assertEqual(msg.value, b"") + self.assertEqual(msg.keys, []) + self.assertEqual(msg.tags, []) + + def test_from_datum(self): + """Test that Message.from_datum correctly creates a Message from a Datum""" + # Create a sample datum with all properties + test_keys = ["key1", "key2"] + test_value = b"test_message_value" + test_event_time = mock_event_time() + test_watermark = mock_watermark() + test_headers = {"header1": "value1", "header2": "value2"} + test_id = "test_datum_id" + + datum = Datum( + keys=test_keys, + value=test_value, + event_time=test_event_time, + watermark=test_watermark, + id_=test_id, + headers=test_headers, + ) + + # Create message from datum + message = Message.from_datum(datum) + + # Verify all properties are correctly transferred + self.assertEqual(message.value, test_value) + self.assertEqual(message.keys, test_keys) + self.assertEqual(message.event_time, test_event_time) + self.assertEqual(message.watermark, test_watermark) + self.assertEqual(message.headers, test_headers) + self.assertEqual(message.id, test_id) + + # Verify that tags are empty (default for Message) + self.assertEqual(message.tags, []) + + def test_from_datum_minimal(self): + """Test from_datum with minimal Datum (no headers)""" + test_keys = ["minimal_key"] 
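+ # Only the required Datum fields are supplied here; headers should fall back to {}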
+ test_value = b"minimal_value" + test_event_time = mock_event_time() + test_watermark = mock_watermark() + test_id = "minimal_id" + + datum = Datum( + keys=test_keys, + value=test_value, + event_time=test_event_time, + watermark=test_watermark, + id_=test_id, + # headers not provided (will default to {}) + ) + + message = Message.from_datum(datum) + + self.assertEqual(message.value, test_value) + self.assertEqual(message.keys, test_keys) + self.assertEqual(message.event_time, test_event_time) + self.assertEqual(message.watermark, test_watermark) + self.assertEqual(message.headers, {}) + self.assertEqual(message.id, test_id) + self.assertEqual(message.tags, []) + + def test_from_datum_empty_keys(self): + """Test from_datum with empty keys""" + datum = Datum( + keys=None, # Will default to [] + value=b"test_value", + event_time=mock_event_time(), + watermark=mock_watermark(), + id_="test_id", + ) + + message = Message.from_datum(datum) + + self.assertEqual(message.keys, []) + self.assertEqual(message.value, b"test_value") + self.assertEqual(message.id, "test_id") + + +class TestAccumulatorClass(unittest.TestCase): + class ExampleClass(Accumulator): + async def handler(self, datums: AsyncIterable[Datum], output: NonBlockingIterator): + pass + + def __init__(self, test1, test2): + self.test1 = test1 + self.test2 = test2 + self.test3 = self.test1 + + def test_init(self): + r = self.ExampleClass(test1=1, test2=2) + self.assertEqual(1, r.test1) + self.assertEqual(2, r.test2) + self.assertEqual(1, r.test3) + + def test_callable(self): + """Test that accumulator instances can be called directly""" + r = self.ExampleClass(test1=1, test2=2) + # The __call__ method should be callable and delegate to the handler method + self.assertTrue(callable(r)) + # __call__ should return the result of calling handler + # Since handler is an async method, __call__ should return a coroutine + import asyncio + from pynumaflow.shared.asynciter import NonBlockingIterator + + async def test_datums(): + yield Datum( + keys=["test"], + value=b"test", + event_time=mock_event_time(), + watermark=mock_watermark(), + id_="test", + ) + + output = NonBlockingIterator() + result = r(test_datums(), output) + self.assertTrue(asyncio.iscoroutine(result)) + # Clean up the coroutine + result.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/accumulator/utils.py b/tests/accumulator/utils.py new file mode 100644 index 00000000..d0c68fbb --- /dev/null +++ b/tests/accumulator/utils.py @@ -0,0 +1,23 @@ +from datetime import datetime, timezone +from pynumaflow.accumulator import Datum + + +def create_test_datum(keys, value, event_time=None, watermark=None, id_=None, headers=None): + """Create a test Datum object with default values""" + if event_time is None: + event_time = datetime.fromtimestamp(1662998400, timezone.utc) + if watermark is None: + watermark = datetime.fromtimestamp(1662998460, timezone.utc) + if id_ is None: + id_ = "test_id" + if headers is None: + headers = {} + + return Datum( + keys=keys, + value=value, + event_time=event_time, + watermark=watermark, + id_=id_, + headers=headers, + ) diff --git a/tests/sourcetransform/test_async.py b/tests/sourcetransform/test_async.py new file mode 100644 index 00000000..05f7f29d --- /dev/null +++ b/tests/sourcetransform/test_async.py @@ -0,0 +1,272 @@ +import asyncio +import logging +import threading +import unittest +from unittest.mock import patch +from google.protobuf import timestamp_pb2 as _timestamp_pb2 + +import grpc +from google.protobuf import 
empty_pb2 as _empty_pb2 +from grpc.aio._server import Server + +from pynumaflow import setup_logging +from pynumaflow._constants import MAX_MESSAGE_SIZE +from pynumaflow.proto.sourcetransformer import transform_pb2_grpc +from pynumaflow.sourcetransformer import Datum, Messages, Message, SourceTransformer +from pynumaflow.sourcetransformer.async_server import SourceTransformAsyncServer +from tests.sourcetransform.utils import get_test_datums +from tests.testing_utils import ( + mock_terminate_on_stop, + mock_new_event_time, +) + +LOGGER = setup_logging(__name__) + +# If set to true, the transform handler will raise a `ValueError` exception. +raise_error_from_st = False + + +class SimpleAsyncSourceTrn(SourceTransformer): + async def handler(self, keys: list[str], datum: Datum) -> Messages: + if raise_error_from_st: + raise ValueError("Exception thrown from transform") + val = datum.value + msg = "payload:{} event_time:{} ".format( + val.decode("utf-8"), + datum.event_time, + ) + val = bytes(msg, encoding="utf-8") + messages = Messages() + messages.append(Message(val, mock_new_event_time(), keys=keys)) + return messages + + +def request_generator(req): + yield from req + + +_s: Server = None +_channel = grpc.insecure_channel("unix:///tmp/async_st.sock") +_loop = None + + +def startup_callable(loop): + asyncio.set_event_loop(loop) + loop.run_forever() + + +def new_async_st(): + handle = SimpleAsyncSourceTrn() + server = SourceTransformAsyncServer(source_transform_instance=handle) + udfs = server.servicer + return udfs + + +async def start_server(udfs): + _server_options = [ + ("grpc.max_send_message_length", MAX_MESSAGE_SIZE), + ("grpc.max_receive_message_length", MAX_MESSAGE_SIZE), + ] + server = grpc.aio.server(options=_server_options) + transform_pb2_grpc.add_SourceTransformServicer_to_server(udfs, server) + listen_addr = "unix:///tmp/async_st.sock" + server.add_insecure_port(listen_addr) + logging.info("Starting server on %s", listen_addr) + global _s + _s = server + await server.start() + await server.wait_for_termination() + + +# psutil.Process.kill is mocked so the program does not exit during testing +@patch("psutil.Process.kill", mock_terminate_on_stop) +class TestAsyncTransformer(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + global _loop + loop = asyncio.new_event_loop() + _loop = loop + _thread = threading.Thread(target=startup_callable, args=(loop,), daemon=True) + _thread.start() + udfs = new_async_st() + asyncio.run_coroutine_threadsafe(start_server(udfs), loop=loop) + while True: + try: + with grpc.insecure_channel("unix:///tmp/async_st.sock") as channel: + f = grpc.channel_ready_future(channel) + f.result(timeout=10) + if f.done(): + break + except grpc.FutureTimeoutError as e: + LOGGER.error("error trying to connect to grpc server") + LOGGER.error(e) + + @classmethod + def tearDownClass(cls) -> None: + try: + _loop.stop() + LOGGER.info("stopped the event loop") + except Exception as e: + LOGGER.error(e) + + def test_run_server(self) -> None: + with grpc.insecure_channel("unix:///tmp/async_st.sock") as channel: + stub = transform_pb2_grpc.SourceTransformStub(channel) + request = get_test_datums() + generator_response = None + try: + generator_response = stub.SourceTransformFn( + request_iterator=request_generator(request) + ) + except grpc.RpcError as e: + logging.error(e) + + responses = [] + # capture the output from the SourceTransformFn generator and assert.
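+ # The first response is the handshake ack; the data responses follow in request order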
+ for r in generator_response: + responses.append(r) + + # 1 handshake + 3 data responses + self.assertEqual(4, len(responses)) + + self.assertTrue(responses[0].handshake.sot) + + idx = 1 + while idx < len(responses): + _id = "test-id-" + str(idx) + self.assertEqual(_id, responses[idx].id) + self.assertEqual( + bytes( + "payload:test_mock_message " "event_time:2022-09-12 16:00:00 ", + encoding="utf-8", + ), + responses[idx].results[0].value, + ) + self.assertEqual(1, len(responses[idx].results)) + idx += 1 + + LOGGER.info("Successfully validated the server") + + def test_async_source_transformer(self) -> None: + stub = transform_pb2_grpc.SourceTransformStub(_channel) + request = get_test_datums() + generator_response = None + try: + generator_response = stub.SourceTransformFn(request_iterator=request_generator(request)) + except grpc.RpcError as e: + logging.error(e) + + responses = [] + # capture the output from the SourceTransformFn generator and assert. + for r in generator_response: + responses.append(r) + + # 1 handshake + 3 data responses + self.assertEqual(4, len(responses)) + + self.assertTrue(responses[0].handshake.sot) + + idx = 1 + while idx < len(responses): + _id = "test-id-" + str(idx) + self.assertEqual(_id, responses[idx].id) + self.assertEqual( + bytes( + "payload:test_mock_message " "event_time:2022-09-12 16:00:00 ", + encoding="utf-8", + ), + responses[idx].results[0].value, + ) + self.assertEqual(1, len(responses[idx].results)) + idx += 1 + + # Verify the new event time gets assigned. + updated_event_time_timestamp = _timestamp_pb2.Timestamp() + updated_event_time_timestamp.FromDatetime(dt=mock_new_event_time()) + self.assertEqual( + updated_event_time_timestamp, + responses[1].results[0].event_time, + ) + + def test_async_source_transformer_grpc_error_no_handshake(self) -> None: + stub = transform_pb2_grpc.SourceTransformStub(_channel) + request = get_test_datums(handshake=False) + grpc_exception = None + + responses = [] + try: + generator_response = stub.SourceTransformFn(request_iterator=request_generator(request)) + # capture the output from the SourceTransformFn generator and assert. + for r in generator_response: + responses.append(r) + except grpc.RpcError as e: + logging.error(e) + grpc_exception = e + self.assertTrue("SourceTransformFn: expected handshake message" in e.__str__()) + + self.assertEqual(0, len(responses)) + self.assertIsNotNone(grpc_exception) + + def test_async_source_transformer_grpc_error(self) -> None: + stub = transform_pb2_grpc.SourceTransformStub(_channel) + request = get_test_datums() + grpc_exception = None + + responses = [] + try: + global raise_error_from_st + raise_error_from_st = True + generator_response = stub.SourceTransformFn(request_iterator=request_generator(request)) + # capture the output from the SourceTransformFn generator and assert.
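+ # Consuming the stream drives the handler; with raise_error_from_st set, it fails on the first datum after the handshake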
+ for r in generator_response: + responses.append(r) + except grpc.RpcError as e: + logging.error(e) + grpc_exception = e + self.assertEqual(grpc.StatusCode.INTERNAL, e.code()) + self.assertTrue("Exception thrown from transform" in e.__str__()) + finally: + raise_error_from_st = False + # 1 handshake + self.assertEqual(1, len(responses)) + self.assertIsNotNone(grpc_exception) + + def test_is_ready(self) -> None: + with grpc.insecure_channel("unix:///tmp/async_st.sock") as channel: + stub = transform_pb2_grpc.SourceTransformStub(channel) + + request = _empty_pb2.Empty() + response = None + try: + response = stub.IsReady(request=request) + except grpc.RpcError as e: + logging.error(e) + + self.assertTrue(response.ready) + + def test_invalid_input(self): + with self.assertRaises(TypeError): + SourceTransformAsyncServer() + + def __stub(self): + return transform_pb2_grpc.SourceTransformStub(_channel) + + def test_max_threads(self): + handle = SimpleAsyncSourceTrn() + # max cap at 16 + server = SourceTransformAsyncServer(source_transform_instance=handle, max_threads=32) + self.assertEqual(server.max_threads, 16) + + # use argument provided + server = SourceTransformAsyncServer(source_transform_instance=handle, max_threads=5) + self.assertEqual(server.max_threads, 5) + + # defaults to 4 + server = SourceTransformAsyncServer(source_transform_instance=handle) + self.assertEqual(server.max_threads, 4) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + unittest.main()
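For orientation, the sketch below shows the minimal shape of an accumulator UDF of the kind these tests exercise. It is illustrative only — the `Counter` class is hypothetical — and it assumes nothing beyond the public API already used in the tests above (`Accumulator`, `AccumulatorAsyncServer`, `Datum`, `Message`, and `NonBlockingIterator`).

```python
from collections.abc import AsyncIterable

from pynumaflow.accumulator import (
    Accumulator,
    AccumulatorAsyncServer,
    Datum,
    Message,
)
from pynumaflow.shared.asynciter import NonBlockingIterator


class Counter(Accumulator):
    """Counts datums on an ordered stream, mirroring ExampleClass in the tests."""

    def __init__(self, counter: int = 0):
        self.counter = counter

    async def handler(self, datums: AsyncIterable[Datum], output: NonBlockingIterator):
        # Consume the ordered input stream and emit one message per datum,
        # preserving the incoming keys.
        async for datum in datums:
            self.counter += 1
            msg = f"counter:{self.counter}"
            await output.put(Message(str.encode(msg), keys=datum.keys(), tags=[]))


if __name__ == "__main__":
    # The server takes the class plus constructor args (not an instance)
    # and serves the AccumulateFn gRPC endpoint.
    AccumulatorAsyncServer(Counter, init_args=(0,)).start()
```

A stub-driven test would then assert one `counter:N` payload per datum followed by an EOF marker per window, as the tests above do.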