trycua · onel · Sep 18, 2025 · Sep 18, 2025 · Sep 18, 2025 · Sep 18, 2025
diff --git a/libs/python/agent/agent/adapters/models/internvl.py b/libs/python/agent/agent/adapters/models/internvl.py
@@ -25,6 +25,16 @@ class InternVLModel:
     """
 
     def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        """Initialize the InternVL model with specified configuration.
+
+        Args:
+            model_name: The name or path of the InternVL model to load
+            device: Device to load the model on, defaults to "auto"
+            trust_remote_code: Whether to trust remote code when loading the model
+
+        Raises:
+            ImportError: If InternVL dependencies are not available
+        """
         if not HF_AVAILABLE:
             raise ImportError(
                 "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\""
@@ -37,6 +47,7 @@ def __init__(self, model_name: str, device: str = "auto", trust_remote_code: boo
         self._load()
 
     def _load(self) -> None:
+        """Load the model and tokenizer from the specified model name."""
         # Load model
         self.model = AutoModel.from_pretrained(
             self.model_name,
@@ -58,6 +69,15 @@ def _load(self) -> None:
     IMAGENET_STD = (0.229, 0.224, 0.225)
 
     def _build_transform(self, input_size: int) -> T.Compose:
+        """Build image transformation pipeline for preprocessing.
+
+        Args:
+            input_size: Target size for image resizing
+
+        Returns:
+            Composed transformation pipeline that converts images to RGB, resizes,
+            converts to tensor, and normalizes with ImageNet statistics
+        """
         MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
         transform = T.Compose([
             T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
@@ -68,6 +88,18 @@ def _build_transform(self, input_size: int) -> T.Compose:
         return transform
 
     def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int):
+        """Find the target aspect ratio that best matches the input image.
+
+        Args:
+            aspect_ratio: Original aspect ratio of the image
+            target_ratios: List of possible target aspect ratios as (width, height) tuples
+            width: Original image width
+            height: Original image height
+            image_size: Base image size for calculations
+
+        Returns:
+            Best matching aspect ratio tuple from target_ratios
+        """
         best_ratio_diff = float('inf')
         best_ratio = (1, 1)
         area = width * height
@@ -83,6 +115,18 @@ def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tu
         return best_ratio
 
     def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]:
+        """Preprocess image by splitting it into tiles based on aspect ratio.
+
+        Args:
+            image: Input PIL image to preprocess
+            min_num: Minimum number of tiles to generate
+            max_num: Maximum number of tiles to generate
+            image_size: Size of each tile
+            use_thumbnail: Whether to add a thumbnail version of the full image
+
+        Returns:
+            List of processed image tiles, optionally including a thumbnail
+        """
         orig_width, orig_height = image.size
         aspect_ratio = orig_width / orig_height
 
@@ -116,7 +160,17 @@ def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int
         return processed_images
 
     def _load_image_from_source(self, src: str) -> Image.Image:
-        """Load PIL image from various sources: data URL, http(s), or local path."""
+        """Load PIL image from various sources: data URL, http(s), or local path.
+
+        Args:
+            src: Image source - can be a data URL, HTTP(S) URL, or local file path
+
+        Returns:
+            PIL Image object converted to RGB format
+
+        Raises:
+            Exception: Depends on the source type (e.g. network errors, file errors)
+        """
         if src.startswith("data:image/"):
             # data URL base64
             header, b64data = src.split(",", 1)
@@ -130,6 +184,17 @@ def _load_image_from_source(self, src: str) -> Image.Image:
         return Image.open(src).convert('RGB')
 
     def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12):
+        """Convert list of PIL images to tensor pixel values for model input.
+
+        Args:
+            images: List of PIL images to convert
+            input_size: Target size for image preprocessing
+            max_num: Maximum number of tiles per image
+
+        Returns:
+            Tuple of (pixel_values tensor, list of patch counts per image).
+            Returns (None, []) if no images provided.
+        """
         transform = self._build_transform(input_size=input_size)
         pixel_values_list = []
         num_patches_list: List[int] = []
@@ -151,6 +216,14 @@ def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) ->
         This implementation constructs InternVL-compatible inputs and uses
         `model.chat(tokenizer, pixel_values, question, history=...)` to avoid
         relying on AutoProcessor (which fails for some tokenizers).
+
+        Args:
+            messages: List of message dictionaries with role and content fields.
+                     Content can contain text and image items.
+            max_new_tokens: Maximum number of new tokens to generate
+
+        Returns:
+            Generated text response from the model, or empty string if generation fails
         """
         assert self.model is not None and self.tokenizer is not None
 

diff --git a/libs/python/agent/agent/adapters/models/opencua.py b/libs/python/agent/agent/adapters/models/opencua.py
@@ -17,6 +17,16 @@ class OpenCUAModel:
     """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""
 
     def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        """Initialize the OpenCUA model with specified configuration.
+
+        Args:
+            model_name: The name or path of the model to load
+            device: Device to run the model on, defaults to "auto"
+            trust_remote_code: Whether to trust remote code when loading the model
+
+        Raises:
+            ImportError: If OpenCUA requirements are not installed
+        """
         if not OPENCUA_AVAILABLE:
             raise ImportError(
                 "OpenCUA requirements not found. Install with: pip install \"cua-agent[opencua-hf]\""
@@ -30,6 +40,7 @@ def __init__(self, model_name: str, device: str = "auto", trust_remote_code: boo
         self._load()
 
     def _load(self) -> None:
+        """Load the tokenizer, model, and image processor from the specified model name."""
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_name, trust_remote_code=self.trust_remote_code
         )
@@ -46,6 +57,14 @@ def _load(self) -> None:
 
     @staticmethod
     def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str:
+        """Extract the base64-encoded image data from the last image in the message list.
+
+        Args:
+            messages: List of message dictionaries in HF format with content items
+
+        Returns:
+            Base64-encoded image data string, or an empty string if no image is found
+        """
         # Expect HF-format messages with content items type: "image" with data URL
         for msg in reversed(messages):
             for item in reversed(msg.get("content", [])):
@@ -56,6 +75,15 @@ def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str:
         return ""
 
     def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
+        """Generate text response from the model using the provided messages.
+
+        Args:
+            messages: List of message dictionaries containing conversation history
+            max_new_tokens: Maximum number of new tokens to generate
+
+        Returns:
+            Generated text response as a string
+        """
         assert self.model is not None and self.tokenizer is not None and self.image_processor is not None
 
         # Tokenize text side using chat template

diff --git a/libs/python/agent/agent/integrations/hud/proxy.py b/libs/python/agent/agent/integrations/hud/proxy.py
@@ -35,6 +35,12 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
 
     Only a subset is supported: computer_call, assistant message (text), and reasoning.
     Unknown types are ignored.
+
+    Args:
+        output_items: List of agent output items to convert
+
+    Returns:
+        List of OpenAI ResponseOutputItem objects
     """
     blocks: List[ResponseOutputItem] = []
     for item in output_items or []:
@@ -81,6 +87,14 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
     return blocks
 
 def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
+    """Convert items to a list of plain dictionaries.
+
+    Args:
+        items: Items to convert, can be objects with model_dump method or dictionaries
+
+    Returns:
+        List of dictionaries representing the items
+    """
     out: List[Dict[str, Any]] = []
     for it in list(items):
         if hasattr(it, "model_dump"):
@@ -100,11 +114,23 @@ class FakeAsyncOpenAI:
     """
 
     def __init__(self, computer_agent: BaseComputerAgent) -> None:
+        """Initialize the fake OpenAI client.
+
+        Args:
+            computer_agent: The BaseComputerAgent instance to use for generating responses
+        """
         self._agent = computer_agent
         self.responses = self._Responses(self)
 
     class _Responses:
+        """Internal responses handler for the fake OpenAI client."""
+
         def __init__(self, parent: "FakeAsyncOpenAI") -> None:
+            """Initialize the responses handler.
+
+            Args:
+                parent: The parent FakeAsyncOpenAI instance
+            """
             # Caches for cross-call context when using previous_response_id
             self.blocks_cache: Dict[str, ResponseInputParam | ResponseOutputItem] = {}
             self.context_cache: Dict[str, List[str]] = {}
@@ -121,6 +147,23 @@ async def create(
             max_retries: int = 5,
             **_: Any,
         ) -> Any:
+            """Create a response using the computer agent.
+
+            Args:
+                model: The model name to use
+                input: The input parameters for the response
+                tools: Optional list of tools to use
+                instructions: Optional instructions to prepend
+                previous_response_id: Optional ID of previous response for context
+                max_retries: Maximum number of retry attempts
+                **_: Additional keyword arguments (ignored)
+
+            Returns:
+                OpenAI Response object with agent output
+
+            Raises:
+                Exception: If all retry attempts fail
+            """
             for attempt in range(max_retries):
                 # Prepend cached blocks from previous_response_id to input
                 full_input = input
@@ -217,6 +260,25 @@ def __init__(
         telemetry_enabled: bool | None = True,
         **kwargs: Any,
     ) -> None:
+        """Initialize the proxy operator agent.
+
+        Args:
+            model: Model name to use, defaults to "computer-use-preview"
+            allowed_tools: List of allowed tool names, defaults to ["openai_computer"]
+            trajectory_dir: Directory for storing trajectories
+            tools: Additional tools to include
+            custom_loop: Custom loop implementation
+            only_n_most_recent_images: Limit on recent images to keep
+            callbacks: List of callback functions
+            instructions: Instructions to prepend to prompts
+            verbosity: Logging verbosity level
+            max_retries: Maximum retry attempts
+            screenshot_delay: Delay between screenshots
+            use_prompt_caching: Whether to use prompt caching
+            max_trajectory_budget: Budget limit for trajectories
+            telemetry_enabled: Whether telemetry is enabled
+            **kwargs: Additional arguments passed to OperatorAgent
+        """
         model = model or "computer-use-preview"
         allowed_tools = allowed_tools or ["openai_computer"]