
Python API Reference

This section documents the Python API for vector-inference.

Client Interface

vec_inf.client.api.VecInfClient

Client for interacting with Vector Inference programmatically.

This class provides methods for launching models, checking their status, retrieving metrics, and shutting down models using the Vector Inference infrastructure.

Examples:

>>> from vec_inf.api import VecInfClient
>>> client = VecInfClient()
>>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
>>> job_id = response.slurm_job_id
>>> status = client.get_status(job_id)
>>> if status.server_status == ModelStatus.READY:
...     print(f"Model is ready at {status.base_url}")
>>> client.shutdown_model(job_id)
Source code in vec_inf/client/api.py
class VecInfClient:
    """Client for interacting with Vector Inference programmatically.

    This class provides methods for launching models, checking their status,
    retrieving metrics, and shutting down models using the Vector Inference
    infrastructure.

    Examples
    --------
    >>> from vec_inf.api import VecInfClient
    >>> client = VecInfClient()
    >>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
    >>> job_id = response.slurm_job_id
    >>> status = client.get_status(job_id)
    >>> if status.server_status == ModelStatus.READY:
    ...     print(f"Model is ready at {status.base_url}")
    >>> client.shutdown_model(job_id)

    """

    def __init__(self) -> None:
        """Initialize the Vector Inference client."""
        pass

    def list_models(self) -> list[ModelInfo]:
        """List all available models.

        Returns
        -------
        list[ModelInfo]
            ModelInfo objects containing information about available models.
        """
        model_registry = ModelRegistry()
        return model_registry.get_all_models()

    def get_model_config(self, model_name: str) -> ModelConfig:
        """Get the configuration for a specific model.

        Parameters
        ----------
        model_name: str
            Name of the model to get configuration for.

        Returns
        -------
        ModelConfig
            Model configuration.
        """
        model_registry = ModelRegistry()
        return model_registry.get_single_model_config(model_name)

    def launch_model(
        self, model_name: str, options: Optional[LaunchOptions] = None
    ) -> LaunchResponse:
        """Launch a model on the cluster.

        Parameters
        ----------
        model_name: str
            Name of the model to launch.
        options: LaunchOptions, optional
            Launch options to override the default configuration.

        Returns
        -------
        LaunchResponse
            Information about the launched model.
        """
        # Convert LaunchOptions to dictionary if provided
        options_dict: dict[str, Any] = {}
        if options:
            options_dict = {k: v for k, v in vars(options).items() if v is not None}

        # Create and use the API Launch Helper
        model_launcher = ModelLauncher(model_name, options_dict)
        return model_launcher.launch()

    def get_status(
        self, slurm_job_id: int, log_dir: Optional[str] = None
    ) -> StatusResponse:
        """Get the status of a running model.

        Parameters
        ----------
        slurm_job_id: int
            The Slurm job ID to check.
        log_dir: str, optional
            Optional path to the Slurm log directory.

        Returns
        -------
        StatusResponse
            Model status information.
        """
        model_status_monitor = ModelStatusMonitor(slurm_job_id, log_dir)
        return model_status_monitor.process_model_status()

    def get_metrics(
        self, slurm_job_id: int, log_dir: Optional[str] = None
    ) -> MetricsResponse:
        """Get the performance metrics of a running model.

        Parameters
        ----------
        slurm_job_id : int
            The Slurm job ID to get metrics for.
        log_dir : str, optional
            Optional path to the Slurm log directory.

        Returns
        -------
        MetricsResponse
            Object containing the model's performance metrics.
        """
        performance_metrics_collector = PerformanceMetricsCollector(
            slurm_job_id, log_dir
        )

        metrics: Union[dict[str, float], str]
        if not performance_metrics_collector.metrics_url.startswith("http"):
            metrics = performance_metrics_collector.metrics_url
        else:
            metrics = performance_metrics_collector.fetch_metrics()

        return MetricsResponse(
            model_name=performance_metrics_collector.status_info.model_name,
            metrics=metrics,
            timestamp=time.time(),
        )

    def shutdown_model(self, slurm_job_id: int) -> bool:
        """Shutdown a running model.

        Parameters
        ----------
        slurm_job_id: int
            The Slurm job ID to shut down.

        Returns
        -------
        bool
            True if the model was successfully shut down.

        Raises
        ------
        SlurmJobError
            If there was an error shutting down the model.
        """
        shutdown_cmd = f"scancel {slurm_job_id}"
        _, stderr = run_bash_command(shutdown_cmd)
        if stderr:
            raise SlurmJobError(f"Failed to shutdown model: {stderr}")
        return True

    def wait_until_ready(
        self,
        slurm_job_id: int,
        timeout_seconds: int = 1800,
        poll_interval_seconds: int = 10,
        log_dir: Optional[str] = None,
    ) -> StatusResponse:
        """Wait until a model is ready or fails.

        Parameters
        ----------
        slurm_job_id: int
            The Slurm job ID to wait for.
        timeout_seconds: int
            Maximum time to wait in seconds (default: 30 mins).
        poll_interval_seconds: int
            How often to check status in seconds (default: 10s).
        log_dir: str, optional
            Optional path to the Slurm log directory.

        Returns
        -------
        StatusResponse
            Status information once the model is ready.

        Raises
        ------
        SlurmJobError
            If the specified job is not found or there's an error with the job.
        ServerError
            If the server fails to start within the timeout period.
        APIError
            If there was an error checking the status.

        """
        start_time = time.time()

        while True:
            status_info = self.get_status(slurm_job_id, log_dir)

            if status_info.server_status == ModelStatus.READY:
                return status_info

            if status_info.server_status == ModelStatus.FAILED:
                error_message = status_info.failed_reason or "Unknown error"
                raise ServerError(f"Model failed to start: {error_message}")

            if status_info.server_status == ModelStatus.SHUTDOWN:
                raise ServerError("Model was shutdown before it became ready")

            # Check timeout
            if time.time() - start_time > timeout_seconds:
                raise ServerError(
                    f"Timed out waiting for model to become ready after {timeout_seconds} seconds"
                )

            # Wait before checking again
            time.sleep(poll_interval_seconds)

__init__

__init__()

Initialize the Vector Inference client.

Source code in vec_inf/client/api.py
def __init__(self) -> None:
    """Initialize the Vector Inference client."""
    pass

list_models

list_models()

List all available models.

Returns:

    list[ModelInfo]: ModelInfo objects containing information about available models.

Source code in vec_inf/client/api.py
def list_models(self) -> list[ModelInfo]:
    """List all available models.

    Returns
    -------
    list[ModelInfo]
        ModelInfo objects containing information about available models.
    """
    model_registry = ModelRegistry()
    return model_registry.get_all_models()
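
For illustration, a minimal sketch that enumerates the registry, assuming the vec_inf.api import path used in the class example above; the ModelInfo fields referenced here are documented under "Data Models" below.

from vec_inf.api import VecInfClient

client = VecInfClient()
for model in client.list_models():
    # Each ModelInfo carries name, family, variant, type, and config
    print(f"{model.name} (family={model.family}, type={model.type.value})")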

get_model_config

get_model_config(model_name)

Get the configuration for a specific model.

Parameters:

    model_name (str, required): Name of the model to get configuration for.

Returns:

    ModelConfig: Model configuration.

Source code in vec_inf/client/api.py
def get_model_config(self, model_name: str) -> ModelConfig:
    """Get the configuration for a specific model.

    Parameters
    ----------
    model_name: str
        Name of the model to get configuration for.

    Returns
    -------
    ModelConfig
        Model configuration.
    """
    model_registry = ModelRegistry()
    return model_registry.get_single_model_config(model_name)
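
A short usage sketch, under the same import assumption as above; the attributes available on ModelConfig depend on your installed version, so this simply inspects the whole object.

from vec_inf.api import VecInfClient

client = VecInfClient()
config = client.get_model_config("Meta-Llama-3.1-8B-Instruct")
# ModelConfig fields vary by version; print the object to inspect them
print(config)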

launch_model

launch_model(model_name, options=None)

Launch a model on the cluster.

Parameters:

    model_name (str, required): Name of the model to launch.
    options (Optional[LaunchOptions], default None): Launch options to override the default configuration.

Returns:

    LaunchResponse: Information about the launched model.

Source code in vec_inf/client/api.py
def launch_model(
    self, model_name: str, options: Optional[LaunchOptions] = None
) -> LaunchResponse:
    """Launch a model on the cluster.

    Parameters
    ----------
    model_name: str
        Name of the model to launch.
    options: LaunchOptions, optional
        Launch options to override the default configuration.

    Returns
    -------
    LaunchResponse
        Information about the launched model.
    """
    # Convert LaunchOptions to dictionary if provided
    options_dict: dict[str, Any] = {}
    if options:
        options_dict = {k: v for k, v in vars(options).items() if v is not None}

    # Create and use the API Launch Helper
    model_launcher = ModelLauncher(model_name, options_dict)
    return model_launcher.launch()
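
A hedged sketch of launching with overrides. It assumes LaunchOptions can be imported from the vec_inf.client._models module documented below; only fields set to non-None values override the model's default configuration, mirroring the filtering in launch_model above.

from vec_inf.api import VecInfClient
from vec_inf.client._models import LaunchOptions

client = VecInfClient()
# Unset (None) fields fall back to the model's default configuration
options = LaunchOptions(num_nodes=1, gpus_per_node=2, max_model_len=8192)
response = client.launch_model("Meta-Llama-3.1-8B-Instruct", options)
print(f"Submitted Slurm job {response.slurm_job_id} for {response.model_name}")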

get_status

get_status(slurm_job_id, log_dir=None)

Get the status of a running model.

Parameters:

    slurm_job_id (int, required): The Slurm job ID to check.
    log_dir (Optional[str], default None): Path to the Slurm log directory.

Returns:

    StatusResponse: Model status information.

Source code in vec_inf/client/api.py
def get_status(
    self, slurm_job_id: int, log_dir: Optional[str] = None
) -> StatusResponse:
    """Get the status of a running model.

    Parameters
    ----------
    slurm_job_id: int
        The Slurm job ID to check.
    log_dir: str, optional
        Optional path to the Slurm log directory.

    Returns
    -------
    StatusResponse
        Model status information.
    """
    model_status_monitor = ModelStatusMonitor(slurm_job_id, log_dir)
    return model_status_monitor.process_model_status()
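
A polling sketch built on the StatusResponse fields documented below; the job ID is a hypothetical placeholder, and ModelStatus is assumed importable from vec_inf.client._models.

from vec_inf.api import VecInfClient
from vec_inf.client._models import ModelStatus

client = VecInfClient()
status = client.get_status(12345678)  # hypothetical Slurm job ID
if status.server_status == ModelStatus.READY:
    print(f"Server ready at {status.base_url}")
elif status.server_status == ModelStatus.PENDING:
    print(f"Job still pending: {status.pending_reason}")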

get_metrics

get_metrics(slurm_job_id, log_dir=None)

Get the performance metrics of a running model.

Parameters:

    slurm_job_id (int, required): The Slurm job ID to get metrics for.
    log_dir (Optional[str], default None): Path to the Slurm log directory.

Returns:

    MetricsResponse: Object containing the model's performance metrics.

Source code in vec_inf/client/api.py
def get_metrics(
    self, slurm_job_id: int, log_dir: Optional[str] = None
) -> MetricsResponse:
    """Get the performance metrics of a running model.

    Parameters
    ----------
    slurm_job_id : int
        The Slurm job ID to get metrics for.
    log_dir : str, optional
        Optional path to the Slurm log directory.

    Returns
    -------
    MetricsResponse
        Object containing the model's performance metrics.
    """
    performance_metrics_collector = PerformanceMetricsCollector(
        slurm_job_id, log_dir
    )

    metrics: Union[dict[str, float], str]
    if not performance_metrics_collector.metrics_url.startswith("http"):
        metrics = performance_metrics_collector.metrics_url
    else:
        metrics = performance_metrics_collector.fetch_metrics()

    return MetricsResponse(
        model_name=performance_metrics_collector.status_info.model_name,
        metrics=metrics,
        timestamp=time.time(),
    )
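
As the source above shows, MetricsResponse.metrics is a dict of metric names to values when the metrics endpoint could be scraped, and a plain string otherwise. A sketch that handles both cases, with a hypothetical job ID:

from vec_inf.api import VecInfClient

client = VecInfClient()
metrics_response = client.get_metrics(12345678)  # hypothetical Slurm job ID
if isinstance(metrics_response.metrics, dict):
    for name, value in metrics_response.metrics.items():
        print(f"{name}: {value}")
else:
    # A string here means the metrics endpoint was not reachable yet
    print(f"Metrics not available: {metrics_response.metrics}")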

shutdown_model

shutdown_model(slurm_job_id)

Shut down a running model.

Parameters:

    slurm_job_id (int, required): The Slurm job ID to shut down.

Returns:

    bool: True if the model was successfully shut down.

Raises:

    SlurmJobError: If there was an error shutting down the model.

Source code in vec_inf/client/api.py
def shutdown_model(self, slurm_job_id: int) -> bool:
    """Shutdown a running model.

    Parameters
    ----------
    slurm_job_id: int
        The Slurm job ID to shut down.

    Returns
    -------
    bool
        True if the model was successfully shut down.

    Raises
    ------
    SlurmJobError
        If there was an error shutting down the model.
    """
    shutdown_cmd = f"scancel {slurm_job_id}"
    _, stderr = run_bash_command(shutdown_cmd)
    if stderr:
        raise SlurmJobError(f"Failed to shutdown model: {stderr}")
    return True
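
A defensive usage sketch; the import path for SlurmJobError is an assumption here, since it is not documented in this section.

from vec_inf.api import VecInfClient
from vec_inf.client._exceptions import SlurmJobError  # assumed module path

client = VecInfClient()
try:
    client.shutdown_model(12345678)  # hypothetical Slurm job ID
    print("Shutdown request sent")
except SlurmJobError as err:
    print(f"Could not shut down model: {err}")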

wait_until_ready

wait_until_ready(
    slurm_job_id,
    timeout_seconds=1800,
    poll_interval_seconds=10,
    log_dir=None,
)

Wait until a model is ready or fails.

Parameters:

    slurm_job_id (int, required): The Slurm job ID to wait for.
    timeout_seconds (int, default 1800): Maximum time to wait, in seconds (default: 30 minutes).
    poll_interval_seconds (int, default 10): How often to check status, in seconds.
    log_dir (Optional[str], default None): Path to the Slurm log directory.

Returns:

    StatusResponse: Status information once the model is ready.

Raises:

    SlurmJobError: If the specified job is not found or there's an error with the job.
    ServerError: If the server fails to start within the timeout period.
    APIError: If there was an error checking the status.

Source code in vec_inf/client/api.py
def wait_until_ready(
    self,
    slurm_job_id: int,
    timeout_seconds: int = 1800,
    poll_interval_seconds: int = 10,
    log_dir: Optional[str] = None,
) -> StatusResponse:
    """Wait until a model is ready or fails.

    Parameters
    ----------
    slurm_job_id: int
        The Slurm job ID to wait for.
    timeout_seconds: int
        Maximum time to wait in seconds (default: 30 mins).
    poll_interval_seconds: int
        How often to check status in seconds (default: 10s).
    log_dir: str, optional
        Optional path to the Slurm log directory.

    Returns
    -------
    StatusResponse
        Status information once the model is ready.

    Raises
    ------
    SlurmJobError
        If the specified job is not found or there's an error with the job.
    ServerError
        If the server fails to start within the timeout period.
    APIError
        If there was an error checking the status.

    """
    start_time = time.time()

    while True:
        status_info = self.get_status(slurm_job_id, log_dir)

        if status_info.server_status == ModelStatus.READY:
            return status_info

        if status_info.server_status == ModelStatus.FAILED:
            error_message = status_info.failed_reason or "Unknown error"
            raise ServerError(f"Model failed to start: {error_message}")

        if status_info.server_status == ModelStatus.SHUTDOWN:
            raise ServerError("Model was shutdown before it became ready")

        # Check timeout
        if time.time() - start_time > timeout_seconds:
            raise ServerError(
                f"Timed out waiting for model to become ready after {timeout_seconds} seconds"
            )

        # Wait before checking again
        time.sleep(poll_interval_seconds)
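
Putting the pieces together, a hedged end-to-end sketch: launch, block until ready, then always cancel the Slurm job. The timeout and poll interval are illustrative values.

from vec_inf.api import VecInfClient

client = VecInfClient()
response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
try:
    # Poll every 30 seconds, giving up after one hour
    status = client.wait_until_ready(
        response.slurm_job_id, timeout_seconds=3600, poll_interval_seconds=30
    )
    print(f"Model ready at {status.base_url}")
finally:
    # Cancel the job even if the wait timed out or raised
    client.shutdown_model(response.slurm_job_id)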

Data Models

vec_inf.client._models

Data models for Vector Inference API.

This module contains the data model classes used by the Vector Inference API for both request parameters and response objects.

ModelStatus

Bases: str, Enum

Enum representing the possible status states of a model.

Source code in vec_inf/client/_models.py
class ModelStatus(str, Enum):
    """Enum representing the possible status states of a model."""

    PENDING = "PENDING"
    LAUNCHING = "LAUNCHING"
    READY = "READY"
    FAILED = "FAILED"
    SHUTDOWN = "SHUTDOWN"
    UNAVAILABLE = "UNAVAILABLE"

ModelType

Bases: str, Enum

Enum representing the possible model types.

Source code in vec_inf/client/_models.py
class ModelType(str, Enum):
    """Enum representing the possible model types."""

    LLM = "LLM"
    VLM = "VLM"
    TEXT_EMBEDDING = "Text_Embedding"
    REWARD_MODELING = "Reward_Modeling"

LaunchResponse dataclass

Response from launching a model.

Source code in vec_inf/client/_models.py
@dataclass
class LaunchResponse:
    """Response from launching a model."""

    slurm_job_id: int
    model_name: str
    config: dict[str, Any]
    raw_output: str = field(repr=False)

StatusResponse dataclass

Response from checking a model's status.

Source code in vec_inf/client/_models.py
@dataclass
class StatusResponse:
    """Response from checking a model's status."""

    model_name: str
    server_status: ModelStatus
    job_state: Union[str, ModelStatus]
    raw_output: str = field(repr=False)
    base_url: Optional[str] = None
    pending_reason: Optional[str] = None
    failed_reason: Optional[str] = None

MetricsResponse dataclass

Response from retrieving model metrics.

Source code in vec_inf/client/_models.py
@dataclass
class MetricsResponse:
    """Response from retrieving model metrics."""

    model_name: str
    metrics: Union[dict[str, float], str]
    timestamp: float

LaunchOptions dataclass

Options for launching a model.

Source code in vec_inf/client/_models.py
@dataclass
class LaunchOptions:
    """Options for launching a model."""

    model_family: Optional[str] = None
    model_variant: Optional[str] = None
    max_model_len: Optional[int] = None
    max_num_seqs: Optional[int] = None
    gpu_memory_utilization: Optional[float] = None
    enable_prefix_caching: Optional[bool] = None
    enable_chunked_prefill: Optional[bool] = None
    max_num_batched_tokens: Optional[int] = None
    partition: Optional[str] = None
    num_nodes: Optional[int] = None
    gpus_per_node: Optional[int] = None
    qos: Optional[str] = None
    time: Optional[str] = None
    vocab_size: Optional[int] = None
    data_type: Optional[str] = None
    venv: Optional[str] = None
    log_dir: Optional[str] = None
    model_weights_parent_dir: Optional[str] = None
    pipeline_parallelism: Optional[bool] = None
    compilation_config: Optional[str] = None
    enforce_eager: Optional[bool] = None
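
Every field defaults to None, and launch_model (above) forwards only the non-None fields as overrides. A small sketch of that conversion, mirroring the dictionary comprehension in launch_model:

from vec_inf.client._models import LaunchOptions

options = LaunchOptions(gpu_memory_utilization=0.9, enforce_eager=True)
# Same filtering as VecInfClient.launch_model: drop unset (None) fields
overrides = {k: v for k, v in vars(options).items() if v is not None}
print(overrides)  # {'gpu_memory_utilization': 0.9, 'enforce_eager': True}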

LaunchOptionsDict

Bases: TypedDict

TypedDict for LaunchOptions.

Source code in vec_inf/client/_models.py
class LaunchOptionsDict(TypedDict):
    """TypedDict for LaunchOptions."""

    model_family: NotRequired[Optional[str]]
    model_variant: NotRequired[Optional[str]]
    max_model_len: NotRequired[Optional[int]]
    max_num_seqs: NotRequired[Optional[int]]
    gpu_memory_utilization: NotRequired[Optional[float]]
    enable_prefix_caching: NotRequired[Optional[bool]]
    enable_chunked_prefill: NotRequired[Optional[bool]]
    max_num_batched_tokens: NotRequired[Optional[int]]
    partition: NotRequired[Optional[str]]
    num_nodes: NotRequired[Optional[int]]
    gpus_per_node: NotRequired[Optional[int]]
    qos: NotRequired[Optional[str]]
    time: NotRequired[Optional[str]]
    vocab_size: NotRequired[Optional[int]]
    data_type: NotRequired[Optional[str]]
    venv: NotRequired[Optional[str]]
    log_dir: NotRequired[Optional[str]]
    model_weights_parent_dir: NotRequired[Optional[str]]
    pipeline_parallelism: NotRequired[Optional[bool]]
    compilation_config: NotRequired[Optional[str]]
    enforce_eager: NotRequired[Optional[bool]]

ModelInfo dataclass

Information about an available model.

Source code in vec_inf/client/_models.py
@dataclass
class ModelInfo:
    """Information about an available model."""

    name: str
    family: str
    variant: Optional[str]
    type: ModelType
    config: dict[str, Any]
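
For example, ModelInfo can be combined with the ModelType enum above to filter the registry; the import paths follow the earlier examples and are assumptions about how your installed version exposes these names.

from vec_inf.api import VecInfClient
from vec_inf.client._models import ModelType

client = VecInfClient()
# Keep only the vision-language models in the registry
vlms = [m for m in client.list_models() if m.type == ModelType.VLM]
for model in vlms:
    print(model.name, model.variant)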