Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 74 additions & 1 deletion libs/python/agent/agent/adapters/models/internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,16 @@ class InternVLModel:
"""

def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
"""Initialize the InternVL model with specified configuration.

Args:
model_name: The name or path of the InternVL model to load
device: Device to load the model on, defaults to "auto"
trust_remote_code: Whether to trust remote code when loading the model

Raises:
ImportError: If InternVL dependencies are not available
"""
if not HF_AVAILABLE:
raise ImportError(
"InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\""
Expand All @@ -37,6 +47,7 @@ def __init__(self, model_name: str, device: str = "auto", trust_remote_code: boo
self._load()

def _load(self) -> None:
"""Load the model and tokenizer from the specified model name."""
# Load model
self.model = AutoModel.from_pretrained(
self.model_name,
Expand All @@ -58,6 +69,15 @@ def _load(self) -> None:
IMAGENET_STD = (0.229, 0.224, 0.225)

def _build_transform(self, input_size: int) -> T.Compose:
"""Build image transformation pipeline for preprocessing.

Args:
input_size: Target size for image resizing

Returns:
Composed transformation pipeline that converts images to RGB, resizes,
converts to tensor, and normalizes with ImageNet statistics
"""
MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
Expand All @@ -68,6 +88,18 @@ def _build_transform(self, input_size: int) -> T.Compose:
return transform

def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int):
"""Find the target aspect ratio that best matches the input image.

Args:
aspect_ratio: Original aspect ratio of the image
target_ratios: List of possible target aspect ratios as (width, height) tuples
width: Original image width
height: Original image height
image_size: Base image size for calculations

Returns:
Best matching aspect ratio tuple from target_ratios
"""
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
Expand All @@ -83,6 +115,18 @@ def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tu
return best_ratio

def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]:
"""Preprocess image by splitting it into tiles based on aspect ratio.

Args:
image: Input PIL image to preprocess
min_num: Minimum number of tiles to generate
max_num: Maximum number of tiles to generate
image_size: Size of each tile
use_thumbnail: Whether to add a thumbnail version of the full image

Returns:
List of processed image tiles, optionally including a thumbnail
"""
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height

Expand Down Expand Up @@ -116,7 +160,17 @@ def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int
return processed_images

def _load_image_from_source(self, src: str) -> Image.Image:
"""Load PIL image from various sources: data URL, http(s), or local path."""
"""Load PIL image from various sources: data URL, http(s), or local path.

Args:
src: Image source - can be a data URL, HTTP(S) URL, or local file path

Returns:
PIL Image object converted to RGB format

Raises:
Various exceptions depending on source type (network errors, file errors, etc.)
"""
if src.startswith("data:image/"):
# data URL base64
header, b64data = src.split(",", 1)
Expand All @@ -130,6 +184,17 @@ def _load_image_from_source(self, src: str) -> Image.Image:
return Image.open(src).convert('RGB')

def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12):
"""Convert list of PIL images to tensor pixel values for model input.

Args:
images: List of PIL images to convert
input_size: Target size for image preprocessing
max_num: Maximum number of tiles per image

Returns:
Tuple of (pixel_values tensor, list of patch counts per image).
Returns (None, []) if no images provided.
"""
transform = self._build_transform(input_size=input_size)
pixel_values_list = []
num_patches_list: List[int] = []
Expand All @@ -151,6 +216,14 @@ def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) ->
This implementation constructs InternVL-compatible inputs and uses
`model.chat(tokenizer, pixel_values, question, history=...)` to avoid
relying on AutoProcessor (which fails for some tokenizers).

Args:
messages: List of message dictionaries with role and content fields.
Content can contain text and image items.
max_new_tokens: Maximum number of new tokens to generate

Returns:
Generated text response from the model, or empty string if generation fails
"""
assert self.model is not None and self.tokenizer is not None

Expand Down
28 changes: 28 additions & 0 deletions libs/python/agent/agent/adapters/models/opencua.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@ class OpenCUAModel:
"""OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""

def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
"""Initialize the OpenCUA model with specified configuration.

Args:
model_name: The name or path of the model to load
device: Device to run the model on, defaults to "auto"
trust_remote_code: Whether to trust remote code when loading the model

Raises:
ImportError: If OpenCUA requirements are not installed
"""
if not OPENCUA_AVAILABLE:
raise ImportError(
"OpenCUA requirements not found. Install with: pip install \"cua-agent[opencua-hf]\""
Expand All @@ -30,6 +40,7 @@ def __init__(self, model_name: str, device: str = "auto", trust_remote_code: boo
self._load()

def _load(self) -> None:
"""Load the tokenizer, model, and image processor from the specified model name."""
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name, trust_remote_code=self.trust_remote_code
)
Expand All @@ -46,6 +57,14 @@ def _load(self) -> None:

@staticmethod
def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str:
"""Extract the base64 encoded image data from the last image in the message list.

Args:
messages: List of message dictionaries in HF format with content items

Returns:
Base64 encoded image data string, or empty string if no image found
"""
# Expect HF-format messages with content items type: "image" with data URL
for msg in reversed(messages):
for item in reversed(msg.get("content", [])):
Expand All @@ -56,6 +75,15 @@ def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str:
return ""

def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
"""Generate text response from the model using the provided messages.

Args:
messages: List of message dictionaries containing conversation history
max_new_tokens: Maximum number of new tokens to generate

Returns:
Generated text response as a string
"""
assert self.model is not None and self.tokenizer is not None and self.image_processor is not None

# Tokenize text side using chat template
Expand Down
62 changes: 62 additions & 0 deletions libs/python/agent/agent/integrations/hud/proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li

Only a subset is supported: computer_call, assistant message (text), and reasoning.
Unknown types are ignored.

Args:
output_items: List of agent output items to convert

Returns:
List of OpenAI ResponseOutputItem objects
"""
blocks: List[ResponseOutputItem] = []
for item in output_items or []:
Expand Down Expand Up @@ -81,6 +87,14 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
return blocks

def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
"""Convert items to a list of plain dictionaries.

Args:
items: Items to convert, can be objects with model_dump method or dictionaries

Returns:
List of dictionaries representing the items
"""
out: List[Dict[str, Any]] = []
for it in list(items):
if hasattr(it, "model_dump"):
Expand All @@ -100,11 +114,23 @@ class FakeAsyncOpenAI:
"""

def __init__(self, computer_agent: BaseComputerAgent) -> None:
"""Initialize the fake OpenAI client.

Stores the given agent on the instance and creates the ``responses``
handler, passing this client to it as its parent.

Args:
computer_agent: The BaseComputerAgent instance kept as ``self._agent``;
presumably used by the responses handler to generate replies
(handler body is not fully visible here - TODO confirm)
"""
# Assigned before the handler is built; keep this order in case the
# handler's constructor reads the agent from its parent (not visible here).
self._agent = computer_agent
# The nested _Responses handler receives this client as its parent.
self.responses = self._Responses(self)

class _Responses:
"""Internal responses handler for the fake OpenAI client."""

def __init__(self, parent: "FakeAsyncOpenAI") -> None:
"""Initialize the responses handler.

Args:
parent: The parent FakeAsyncOpenAI instance
"""
# Caches for cross-call context when using previous_response_id
self.blocks_cache: Dict[str, ResponseInputParam | ResponseOutputItem] = {}
self.context_cache: Dict[str, List[str]] = {}
Expand All @@ -121,6 +147,23 @@ async def create(
max_retries: int = 5,
**_: Any,
) -> Any:
"""Create a response using the computer agent.

Args:
model: The model name to use
input: The input parameters for the response
tools: Optional list of tools to use
instructions: Optional instructions to prepend
previous_response_id: Optional ID of previous response for context
max_retries: Maximum number of retry attempts
**_: Additional keyword arguments (ignored)

Returns:
OpenAI Response object with agent output

Raises:
Exception: If all retry attempts fail
"""
for attempt in range(max_retries):
# Prepend cached blocks from previous_response_id to input
full_input = input
Expand Down Expand Up @@ -217,6 +260,25 @@ def __init__(
telemetry_enabled: bool | None = True,
**kwargs: Any,
) -> None:
"""Initialize the proxy operator agent.

Args:
model: Model name to use, defaults to "computer-use-preview"
allowed_tools: List of allowed tool names, defaults to ["openai_computer"]
trajectory_dir: Directory for storing trajectories
tools: Additional tools to include
custom_loop: Custom loop implementation
only_n_most_recent_images: Limit on recent images to keep
callbacks: List of callback functions
instructions: Instructions to prepend to prompts
verbosity: Logging verbosity level
max_retries: Maximum retry attempts
screenshot_delay: Delay between screenshots
use_prompt_caching: Whether to use prompt caching
max_trajectory_budget: Budget limit for trajectories
telemetry_enabled: Whether telemetry is enabled
**kwargs: Additional arguments passed to OperatorAgent
"""
model = model or "computer-use-preview"
allowed_tools = allowed_tools or ["openai_computer"]

Expand Down
Loading
Loading