
Python API Reference

This section documents the Python API for vector-inference.

Client Interface

vec_inf.client.api.VecInfClient

Client for interacting with Vector Inference programmatically.

This class provides methods for launching models, checking their status, retrieving metrics, and shutting down models using the Vector Inference infrastructure.

Examples:

>>> from vec_inf.api import VecInfClient
>>> client = VecInfClient()
>>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
>>> job_id = response.slurm_job_id
>>> status = client.get_status(job_id)
>>> if status.server_status == ModelStatus.READY:
...     print(f"Model is ready at {status.base_url}")
>>> client.shutdown_model(job_id)
Source code in vec_inf/client/api.py
class VecInfClient:
    """Client for interacting with Vector Inference programmatically.

    This class provides methods for launching models, checking their status,
    retrieving metrics, and shutting down models using the Vector Inference
    infrastructure.

    Examples
    --------
    >>> from vec_inf.api import VecInfClient
    >>> client = VecInfClient()
    >>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
    >>> job_id = response.slurm_job_id
    >>> status = client.get_status(job_id)
    >>> if status.server_status == ModelStatus.READY:
    ...     print(f"Model is ready at {status.base_url}")
    >>> client.shutdown_model(job_id)

    """

    def __init__(self) -> None:
        """Initialize the Vector Inference client."""
        pass

    def list_models(self) -> list[ModelInfo]:
        """List all available models.

        Returns
        -------
        list[ModelInfo]
            ModelInfo objects containing information about available models.
        """
        model_registry = ModelRegistry()
        return model_registry.get_all_models()

    def get_model_config(self, model_name: str) -> ModelConfig:
        """Get the configuration for a specific model.

        Parameters
        ----------
        model_name: str
            Name of the model to get configuration for.

        Returns
        -------
        ModelConfig
            Model configuration.
        """
        model_registry = ModelRegistry()
        return model_registry.get_single_model_config(model_name)

    def launch_model(
        self, model_name: str, options: Optional[LaunchOptions] = None
    ) -> LaunchResponse:
        """Launch a model on the cluster.

        Parameters
        ----------
        model_name: str
            Name of the model to launch.
        options: LaunchOptions, optional
            Launch options to override the default configuration.

        Returns
        -------
        LaunchResponse
            Information about the launched model.
        """
        # Convert LaunchOptions to dictionary if provided
        options_dict: dict[str, Any] = {}
        if options:
            options_dict = {k: v for k, v in vars(options).items() if v is not None}

        # Create and use the API Launch Helper
        model_launcher = ModelLauncher(model_name, options_dict)
        return model_launcher.launch()

    def get_status(
        self, slurm_job_id: int, log_dir: Optional[str] = None
    ) -> StatusResponse:
        """Get the status of a running model.

        Parameters
        ----------
        slurm_job_id: int
            The Slurm job ID to check.
        log_dir: str, optional
            Optional path to the Slurm log directory.

        Returns
        -------
        StatusResponse
            Model status information.
        """
        model_status_monitor = ModelStatusMonitor(slurm_job_id, log_dir)
        return model_status_monitor.process_model_status()

    def get_metrics(
        self, slurm_job_id: int, log_dir: Optional[str] = None
    ) -> MetricsResponse:
        """Get the performance metrics of a running model.

        Parameters
        ----------
        slurm_job_id : int
            The Slurm job ID to get metrics for.
        log_dir : str, optional
            Optional path to the Slurm log directory.

        Returns
        -------
        MetricsResponse
            Object containing the model's performance metrics.
        """
        performance_metrics_collector = PerformanceMetricsCollector(
            slurm_job_id, log_dir
        )

        metrics: Union[dict[str, float], str]
        if not performance_metrics_collector.metrics_url.startswith("http"):
            metrics = performance_metrics_collector.metrics_url
        else:
            metrics = performance_metrics_collector.fetch_metrics()

        return MetricsResponse(
            model_name=performance_metrics_collector.status_info.model_name,
            metrics=metrics,
            timestamp=time.time(),
        )

    def shutdown_model(self, slurm_job_id: int) -> bool:
        """Shutdown a running model.

        Parameters
        ----------
        slurm_job_id: int
            The Slurm job ID to shut down.

        Returns
        -------
        bool
            True if the model was successfully shut down.

        Raises
        ------
        SlurmJobError
            If there was an error shutting down the model.
        """
        shutdown_cmd = f"scancel {slurm_job_id}"
        _, stderr = run_bash_command(shutdown_cmd)
        if stderr:
            raise SlurmJobError(f"Failed to shutdown model: {stderr}")
        return True

    def wait_until_ready(
        self,
        slurm_job_id: int,
        timeout_seconds: int = 1800,
        poll_interval_seconds: int = 10,
        log_dir: Optional[str] = None,
    ) -> StatusResponse:
        """Wait until a model is ready or fails.

        Parameters
        ----------
        slurm_job_id: int
            The Slurm job ID to wait for.
        timeout_seconds: int
            Maximum time to wait in seconds (default: 30 mins).
        poll_interval_seconds: int
            How often to check status in seconds (default: 10s).
        log_dir: str, optional
            Optional path to the Slurm log directory.

        Returns
        -------
        StatusResponse
            Status information once the model is ready.

        Raises
        ------
        SlurmJobError
            If the specified job is not found or there's an error with the job.
        ServerError
            If the server fails to start within the timeout period.
        APIError
            If there was an error checking the status.

        """
        start_time = time.time()

        while True:
            status_info = self.get_status(slurm_job_id, log_dir)

            if status_info.server_status == ModelStatus.READY:
                return status_info

            if status_info.server_status == ModelStatus.FAILED:
                error_message = status_info.failed_reason or "Unknown error"
                raise ServerError(f"Model failed to start: {error_message}")

            if status_info.server_status == ModelStatus.SHUTDOWN:
                raise ServerError("Model was shutdown before it became ready")

            # Check timeout
            if time.time() - start_time > timeout_seconds:
                raise ServerError(
                    f"Timed out waiting for model to become ready after {timeout_seconds} seconds"
                )

            # Wait before checking again
            time.sleep(poll_interval_seconds)

__init__

__init__()

Initialize the Vector Inference client.

Source code in vec_inf/client/api.py
def __init__(self) -> None:
    """Initialize the Vector Inference client."""
    pass

list_models

list_models()

List all available models.

Returns:

    list[ModelInfo]: ModelInfo objects containing information about available models.

Source code in vec_inf/client/api.py
def list_models(self) -> list[ModelInfo]:
    """List all available models.

    Returns
    -------
    list[ModelInfo]
        ModelInfo objects containing information about available models.
    """
    model_registry = ModelRegistry()
    return model_registry.get_all_models()
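
For illustration, a minimal sketch that enumerates the registry, assuming the vec_inf.api import path used in the class example above; the ModelInfo fields referenced here are documented under "Data Models" below.

from vec_inf.api import VecInfClient

client = VecInfClient()
for model in client.list_models():
    # Each ModelInfo carries name, family, variant, type, and config
    print(f"{model.name} (family={model.family}, type={model.type.value})")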

get_model_config

get_model_config(model_name)

Get the configuration for a specific model.

Parameters:

    model_name (str, required): Name of the model to get configuration for.

Returns:

    ModelConfig: Model configuration.

Source code in vec_inf/client/api.py
def get_model_config(self, model_name: str) -> ModelConfig:
    """Get the configuration for a specific model.

    Parameters
    ----------
    model_name: str
        Name of the model to get configuration for.

    Returns
    -------
    ModelConfig
        Model configuration.
    """
    model_registry = ModelRegistry()
    return model_registry.get_single_model_config(model_name)
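
A short usage sketch, under the same import assumption as above; the attributes available on ModelConfig depend on your installed version, so this simply inspects the whole object.

from vec_inf.api import VecInfClient

client = VecInfClient()
config = client.get_model_config("Meta-Llama-3.1-8B-Instruct")
# ModelConfig fields vary by version; print the object to inspect them
print(config)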

launch_model

launch_model(model_name, options=None)

Launch a model on the cluster.

Parameters:

    model_name (str, required): Name of the model to launch.
    options (Optional[LaunchOptions], default None): Launch options to override the default configuration.

Returns:

    LaunchResponse: Information about the launched model.

Source code in vec_inf/client/api.py
def launch_model(
    self, model_name: str, options: Optional[LaunchOptions] = None
) -> LaunchResponse:
    """Launch a model on the cluster.

    Parameters
    ----------
    model_name: str
        Name of the model to launch.
    options: LaunchOptions, optional
        Launch options to override the default configuration.

    Returns
    -------
    LaunchResponse
        Information about the launched model.
    """
    # Convert LaunchOptions to dictionary if provided
    options_dict: dict[str, Any] = {}
    if options:
        options_dict = {k: v for k, v in vars(options).items() if v is not None}

    # Create and use the API Launch Helper
    model_launcher = ModelLauncher(model_name, options_dict)
    return model_launcher.launch()
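
A hedged sketch of launching with overrides. It assumes LaunchOptions can be imported from the vec_inf.client._models module documented below; only fields set to non-None values override the model's default configuration, mirroring the filtering in launch_model above.

from vec_inf.api import VecInfClient
from vec_inf.client._models import LaunchOptions

client = VecInfClient()
# Unset (None) fields fall back to the model's default configuration
options = LaunchOptions(num_nodes=1, gpus_per_node=2, max_model_len=8192)
response = client.launch_model("Meta-Llama-3.1-8B-Instruct", options)
print(f"Submitted Slurm job {response.slurm_job_id} for {response.model_name}")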

get_status

get_status(slurm_job_id, log_dir=None)

Get the status of a running model.

Parameters:

    slurm_job_id (int, required): The Slurm job ID to check.
    log_dir (Optional[str], default None): Path to the Slurm log directory.

Returns:

    StatusResponse: Model status information.

Source code in vec_inf/client/api.py
def get_status(
    self, slurm_job_id: int, log_dir: Optional[str] = None
) -> StatusResponse:
    """Get the status of a running model.

    Parameters
    ----------
    slurm_job_id: int
        The Slurm job ID to check.
    log_dir: str, optional
        Optional path to the Slurm log directory.

    Returns
    -------
    StatusResponse
        Model status information.
    """
    model_status_monitor = ModelStatusMonitor(slurm_job_id, log_dir)
    return model_status_monitor.process_model_status()
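
A polling sketch built on the StatusResponse fields documented below; the job ID is a hypothetical placeholder, and ModelStatus is assumed importable from vec_inf.client._models.

from vec_inf.api import VecInfClient
from vec_inf.client._models import ModelStatus

client = VecInfClient()
status = client.get_status(12345678)  # hypothetical Slurm job ID
if status.server_status == ModelStatus.READY:
    print(f"Server ready at {status.base_url}")
elif status.server_status == ModelStatus.PENDING:
    print(f"Job still pending: {status.pending_reason}")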

get_metrics

get_metrics(slurm_job_id, log_dir=None)

Get the performance metrics of a running model.

Parameters:

    slurm_job_id (int, required): The Slurm job ID to get metrics for.
    log_dir (Optional[str], default None): Path to the Slurm log directory.

Returns:

    MetricsResponse: Object containing the model's performance metrics.

Source code in vec_inf/client/api.py
def get_metrics(
    self, slurm_job_id: int, log_dir: Optional[str] = None
) -> MetricsResponse:
    """Get the performance metrics of a running model.

    Parameters
    ----------
    slurm_job_id : int
        The Slurm job ID to get metrics for.
    log_dir : str, optional
        Optional path to the Slurm log directory.

    Returns
    -------
    MetricsResponse
        Object containing the model's performance metrics.
    """
    performance_metrics_collector = PerformanceMetricsCollector(
        slurm_job_id, log_dir
    )

    metrics: Union[dict[str, float], str]
    if not performance_metrics_collector.metrics_url.startswith("http"):
        metrics = performance_metrics_collector.metrics_url
    else:
        metrics = performance_metrics_collector.fetch_metrics()

    return MetricsResponse(
        model_name=performance_metrics_collector.status_info.model_name,
        metrics=metrics,
        timestamp=time.time(),
    )
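
As the source above shows, MetricsResponse.metrics is a dict of metric names to values when the metrics endpoint could be scraped, and a plain string otherwise. A sketch that handles both cases, with a hypothetical job ID:

from vec_inf.api import VecInfClient

client = VecInfClient()
metrics_response = client.get_metrics(12345678)  # hypothetical Slurm job ID
if isinstance(metrics_response.metrics, dict):
    for name, value in metrics_response.metrics.items():
        print(f"{name}: {value}")
else:
    # A string here means the metrics endpoint was not reachable yet
    print(f"Metrics not available: {metrics_response.metrics}")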

shutdown_model

shutdown_model(slurm_job_id)

Shut down a running model.

Parameters:

    slurm_job_id (int, required): The Slurm job ID to shut down.

Returns:

    bool: True if the model was successfully shut down.

Raises:

    SlurmJobError: If there was an error shutting down the model.

Source code in vec_inf/client/api.py
def shutdown_model(self, slurm_job_id: int) -> bool:
    """Shutdown a running model.

    Parameters
    ----------
    slurm_job_id: int
        The Slurm job ID to shut down.

    Returns
    -------
    bool
        True if the model was successfully shut down.

    Raises
    ------
    SlurmJobError
        If there was an error shutting down the model.
    """
    shutdown_cmd = f"scancel {slurm_job_id}"
    _, stderr = run_bash_command(shutdown_cmd)
    if stderr:
        raise SlurmJobError(f"Failed to shutdown model: {stderr}")
    return True
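
A defensive usage sketch; the import path for SlurmJobError is an assumption here, since it is not documented in this section.

from vec_inf.api import VecInfClient
from vec_inf.client._exceptions import SlurmJobError  # assumed module path

client = VecInfClient()
try:
    client.shutdown_model(12345678)  # hypothetical Slurm job ID
    print("Shutdown request sent")
except SlurmJobError as err:
    print(f"Could not shut down model: {err}")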

wait_until_ready

wait_until_ready(
    slurm_job_id,
    timeout_seconds=1800,
    poll_interval_seconds=10,
    log_dir=None,
)

Wait until a model is ready or fails.

Parameters:

    slurm_job_id (int, required): The Slurm job ID to wait for.
    timeout_seconds (int, default 1800): Maximum time to wait, in seconds (default: 30 minutes).
    poll_interval_seconds (int, default 10): How often to check status, in seconds.
    log_dir (Optional[str], default None): Path to the Slurm log directory.

Returns:

    StatusResponse: Status information once the model is ready.

Raises:

    SlurmJobError: If the specified job is not found or there's an error with the job.
    ServerError: If the server fails to start within the timeout period.
    APIError: If there was an error checking the status.

Source code in vec_inf/client/api.py
def wait_until_ready(
    self,
    slurm_job_id: int,
    timeout_seconds: int = 1800,
    poll_interval_seconds: int = 10,
    log_dir: Optional[str] = None,
) -> StatusResponse:
    """Wait until a model is ready or fails.

    Parameters
    ----------
    slurm_job_id: int
        The Slurm job ID to wait for.
    timeout_seconds: int
        Maximum time to wait in seconds (default: 30 mins).
    poll_interval_seconds: int
        How often to check status in seconds (default: 10s).
    log_dir: str, optional
        Optional path to the Slurm log directory.

    Returns
    -------
    StatusResponse
        Status information once the model is ready.

    Raises
    ------
    SlurmJobError
        If the specified job is not found or there's an error with the job.
    ServerError
        If the server fails to start within the timeout period.
    APIError
        If there was an error checking the status.

    """
    start_time = time.time()

    while True:
        status_info = self.get_status(slurm_job_id, log_dir)

        if status_info.server_status == ModelStatus.READY:
            return status_info

        if status_info.server_status == ModelStatus.FAILED:
            error_message = status_info.failed_reason or "Unknown error"
            raise ServerError(f"Model failed to start: {error_message}")

        if status_info.server_status == ModelStatus.SHUTDOWN:
            raise ServerError("Model was shutdown before it became ready")

        # Check timeout
        if time.time() - start_time > timeout_seconds:
            raise ServerError(
                f"Timed out waiting for model to become ready after {timeout_seconds} seconds"
            )

        # Wait before checking again
        time.sleep(poll_interval_seconds)
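
Putting the pieces together, a hedged end-to-end sketch: launch, block until ready, then always cancel the Slurm job. The timeout and poll interval are illustrative values.

from vec_inf.api import VecInfClient

client = VecInfClient()
response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
try:
    # Poll every 30 seconds, giving up after one hour
    status = client.wait_until_ready(
        response.slurm_job_id, timeout_seconds=3600, poll_interval_seconds=30
    )
    print(f"Model ready at {status.base_url}")
finally:
    # Cancel the job even if the wait timed out or raised
    client.shutdown_model(response.slurm_job_id)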

Data Models

vec_inf.client._models

Data models for Vector Inference API.

This module contains the data model classes used by the Vector Inference API for both request parameters and response objects.

ModelStatus

Bases: str, Enum

Enum representing the possible status states of a model.

Source code in vec_inf/client/_models.py
class ModelStatus(str, Enum):
    """Enum representing the possible status states of a model."""

    PENDING = "PENDING"
    LAUNCHING = "LAUNCHING"
    READY = "READY"
    FAILED = "FAILED"
    SHUTDOWN = "SHUTDOWN"
    UNAVAILABLE = "UNAVAILABLE"

ModelType

Bases: str, Enum

Enum representing the possible model types.

Source code in vec_inf/client/_models.py
class ModelType(str, Enum):
    """Enum representing the possible model types."""

    LLM = "LLM"
    VLM = "VLM"
    TEXT_EMBEDDING = "Text_Embedding"
    REWARD_MODELING = "Reward_Modeling"

LaunchResponse dataclass

Response from launching a model.

Source code in vec_inf/client/_models.py
@dataclass
class LaunchResponse:
    """Response from launching a model."""

    slurm_job_id: int
    model_name: str
    config: dict[str, Any]
    raw_output: str = field(repr=False)

StatusResponse dataclass

Response from checking a model's status.

Source code in vec_inf/client/_models.py
@dataclass
class StatusResponse:
    """Response from checking a model's status."""

    model_name: str
    server_status: ModelStatus
    job_state: Union[str, ModelStatus]
    raw_output: str = field(repr=False)
    base_url: Optional[str] = None
    pending_reason: Optional[str] = None
    failed_reason: Optional[str] = None

MetricsResponse dataclass

Response from retrieving model metrics.

Source code in vec_inf/client/_models.py
@dataclass
class MetricsResponse:
    """Response from retrieving model metrics."""

    model_name: str
    metrics: Union[dict[str, float], str]
    timestamp: float

LaunchOptions dataclass

Options for launching a model.

Source code in vec_inf/client/_models.py
@dataclass
class LaunchOptions:
    """Options for launching a model."""

    model_family: Optional[str] = None
    model_variant: Optional[str] = None
    max_model_len: Optional[int] = None
    max_num_seqs: Optional[int] = None
    gpu_memory_utilization: Optional[float] = None
    enable_prefix_caching: Optional[bool] = None
    enable_chunked_prefill: Optional[bool] = None
    max_num_batched_tokens: Optional[int] = None
    partition: Optional[str] = None
    num_nodes: Optional[int] = None
    gpus_per_node: Optional[int] = None
    qos: Optional[str] = None
    time: Optional[str] = None
    vocab_size: Optional[int] = None
    data_type: Optional[str] = None
    venv: Optional[str] = None
    log_dir: Optional[str] = None
    model_weights_parent_dir: Optional[str] = None
    pipeline_parallelism: Optional[bool] = None
    compilation_config: Optional[str] = None
    enforce_eager: Optional[bool] = None
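
Every field defaults to None, and launch_model (above) forwards only the non-None fields as overrides. A small sketch of that conversion, mirroring the dictionary comprehension in launch_model:

from vec_inf.client._models import LaunchOptions

options = LaunchOptions(gpu_memory_utilization=0.9, enforce_eager=True)
# Same filtering as VecInfClient.launch_model: drop unset (None) fields
overrides = {k: v for k, v in vars(options).items() if v is not None}
print(overrides)  # {'gpu_memory_utilization': 0.9, 'enforce_eager': True}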

LaunchOptionsDict

Bases: TypedDict

TypedDict for LaunchOptions.

Source code in vec_inf/client/_models.py
class LaunchOptionsDict(TypedDict):
    """TypedDict for LaunchOptions."""

    model_family: NotRequired[Optional[str]]
    model_variant: NotRequired[Optional[str]]
    max_model_len: NotRequired[Optional[int]]
    max_num_seqs: NotRequired[Optional[int]]
    gpu_memory_utilization: NotRequired[Optional[float]]
    enable_prefix_caching: NotRequired[Optional[bool]]
    enable_chunked_prefill: NotRequired[Optional[bool]]
    max_num_batched_tokens: NotRequired[Optional[int]]
    partition: NotRequired[Optional[str]]
    num_nodes: NotRequired[Optional[int]]
    gpus_per_node: NotRequired[Optional[int]]
    qos: NotRequired[Optional[str]]
    time: NotRequired[Optional[str]]
    vocab_size: NotRequired[Optional[int]]
    data_type: NotRequired[Optional[str]]
    venv: NotRequired[Optional[str]]
    log_dir: NotRequired[Optional[str]]
    model_weights_parent_dir: NotRequired[Optional[str]]
    pipeline_parallelism: NotRequired[Optional[bool]]
    compilation_config: NotRequired[Optional[str]]
    enforce_eager: NotRequired[Optional[bool]]

ModelInfo dataclass

Information about an available model.

Source code in vec_inf/client/_models.py
@dataclass
class ModelInfo:
    """Information about an available model."""

    name: str
    family: str
    variant: Optional[str]
    type: ModelType
    config: dict[str, Any]
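
For example, ModelInfo can be combined with the ModelType enum above to filter the registry; the import paths follow the earlier examples and are assumptions about how your installed version exposes these names.

from vec_inf.api import VecInfClient
from vec_inf.client._models import ModelType

client = VecInfClient()
# Keep only the vision-language models in the registry
vlms = [m for m in client.list_models() if m.type == ModelType.VLM]
for model in vlms:
    print(model.name, model.variant)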