# Source code for bentoml._internal.batch.spark

from __future__ import annotations

import logging
import os.path
import tempfile
import typing as t
from typing import TYPE_CHECKING

from bentoml._internal.utils import reserve_free_port

from ...bentos import import_bento
from ...bentos import serve
from ...client import Client
from ...client import HTTPClient
from ...exceptions import BentoMLException
from ...exceptions import MissingDependencyException
from ..bento import Bento
from ..service.loader import load_bento
from ..tag import Tag

try:
    from pyspark.files import SparkFiles
    from pyspark.sql.types import StructType
except ImportError:  # pragma: no cover (trivial error)
    raise MissingDependencyException(
        "'pyspark' is required to use module 'bentoml.spark'; install it with "
        "'pip install pyspark'. For more information, refer to "
        "https://spark.apache.org/docs/latest/api/python/"
    )

if TYPE_CHECKING:
    import pyspark.sql.dataframe
    import pyspark.sql.session

    RecordBatch: t.TypeAlias = t.Any  # pyarrow doesn't have type annotations


logger = logging.getLogger(__name__)


def _distribute_bento(spark: pyspark.sql.session.SparkSession, bento: Bento) -> str:
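    """
    Export the Bento to a local temporary directory and register the exported
    archive with the Spark context, making it available to workers through
    ``SparkFiles``. Returns the archive's file name, which follows the default
    export naming scheme (``{name}-{version}.bento``) that ``_load_bento_spark``
    depends on.
    """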
    temp_dir = tempfile.mkdtemp()
    export_path = bento.export(temp_dir)
    spark.sparkContext.addFile(export_path)
    return os.path.basename(export_path)


def _load_bento_spark(bento_tag: Tag):
    """
    Load a Bento from the local bento store, falling back to the SparkFiles
    directory (populated by ``_distribute_bento``) on Spark workers.
    """
    try:
        return load_bento(bento_tag)
    except Exception:
        # Fall back to the SparkFiles directory, using the default Bento export
        # file name ({name}-{version}.bento) produced by _distribute_bento.
        bento_path = SparkFiles.get(f"{bento_tag.name}-{bento_tag.version}.bento")
        if not os.path.isfile(bento_path):
            raise

        import_bento(bento_path)
        return load_bento(bento_tag)


def _get_process(
    bento_tag: Tag, api_name: str
) -> t.Callable[[t.Iterable[RecordBatch]], t.Generator[RecordBatch, None, None]]:
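    """
    Build the partition-processing function passed to ``mapInArrow``: on each
    Spark worker it loads the Bento, starts a local Bento server, and streams
    Arrow record batches through the named inference API over HTTP.
    """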
    def process(
        iterator: t.Iterable[RecordBatch],
    ) -> t.Generator[RecordBatch, None, None]:
        svc = _load_bento_spark(bento_tag)

        assert (
            api_name in svc.apis
        ), "An error occurred transferring the Bento to the Spark worker."
        inference_api = svc.apis[api_name]
        assert inference_api.func is not None, "Inference API function not defined"

        # Reserve a free port for the Bento server; the context manager
        # releases the socket on exit so the server can bind to the same port
        # immediately afterwards.
        with reserve_free_port() as port:
            pass

        # Start a local Bento server on the reserved port, wait for it to be
        # ready, then open an HTTP client against it.
        server = serve(bento_tag, port=port)
        Client.wait_until_server_ready("localhost", server.port, 30)
        client = HTTPClient(svc, f"http://localhost:{server.port}")

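        # For each Arrow record batch: convert to the API's input type, call
        # the inference API over HTTP, and convert the result back to Arrow.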
        for batch in iterator:
            func_input = inference_api.input.from_arrow(batch)
            func_output = client.call(api_name, func_input)
            yield inference_api.output.to_arrow(func_output)

    return process


def run_in_spark(
    bento: Bento,
    df: pyspark.sql.dataframe.DataFrame,
    spark: pyspark.sql.session.SparkSession,
    api_name: str | None = None,
    output_schema: StructType | None = None,
) -> pyspark.sql.dataframe.DataFrame:
    """
    Run a BentoService inference API in Spark.

    The API to run must accept batches as input and return batches as output.

    Args:
        bento: The Bento containing the inference API to run.
        df: The input DataFrame to run the inference API on.
        spark: The Spark session to use to run the inference API.
        api_name: The name of the inference API to run. If not provided, there
            must be only one API contained in the Bento; that API will be run.
        output_schema: The Spark schema of the output DataFrame. If not
            provided, BentoML will attempt to infer the schema from the output
            descriptor of the inference API.

    Returns:
        The result of the inference API run on the input ``df``.

    Examples
    --------

    .. code-block:: python

        >>> import bentoml
        >>> import pyspark
        >>> from pyspark.sql import SparkSession
        >>> from pyspark.sql.types import StructType, StructField, StringType, IntegerType

        >>> spark = SparkSession.builder.getOrCreate()
        >>> schema = StructType([
        ...     StructField("name", StringType(), True),
        ...     StructField("age", IntegerType(), True),
        ... ])
        >>> df = spark.createDataFrame([("John", 30), ("Mike", 25), ("Sally", 40)], schema)

        >>> bento = bentoml.get("my_service:latest")
        >>> results = bentoml.batch.run_in_spark(bento, df, spark)
        >>> results.show()
        +-----+---+
        | name|age|
        +-----+---+
        |John |30 |
        +-----+---+
    """
    svc = load_bento(bento)

    if api_name is None:
        if len(svc.apis) != 1:
            raise BentoMLException(
                f'Bento "{bento.tag}" has multiple APIs ({svc.apis.keys()}), '
                "specify which API should be run, e.g.: "
                'bentoml.batch.run_in_spark("my_service:latest", df, spark, api_name="predict")'
            )
        api_name = next(iter(svc.apis))
    else:
        if api_name not in svc.apis:
            raise BentoMLException(
                f"API name '{api_name}' not found in Bento '{bento.tag}', "
                f"available APIs are {svc.apis.keys()}"
            )

    api = svc.apis[api_name]

    # Ship the Bento archive to the Spark workers.
    _distribute_bento(spark, bento)

    process = _get_process(bento.tag, api_name)

    if output_schema is None:
        output_schema = api.output.spark_schema()

    return df.mapInArrow(process, output_schema)
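

# A minimal usage sketch, guarded so it never runs on import. Assumptions: a
# bento tagged "my_service:latest" exists in the local bento store, it exposes
# a batchable API named "predict", and the Parquet paths are hypothetical.
if __name__ == "__main__":  # pragma: no cover - illustrative only
    import bentoml
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet("input.parquet")  # hypothetical input path
    bento = bentoml.get("my_service:latest")  # assumed to exist locally
    results = run_in_spark(bento, df, spark, api_name="predict")  # assumed API name
    results.write.parquet("output.parquet")  # hypothetical output path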