Skip to content

Conversation

@sarinali
Copy link
Contributor

I generated this summary from my coding session, but all the relevant documentation is linked below. I wanted to stress that the main issue was the input/output mismatch.

And to test this:

"""
Test script to run agent with LOCAL code changes.
This uses the local modules from libs/python/agent and libs/python/computer
"""

import asyncio
import os
import sys
from pathlib import Path
import logging

# Add local libs to Python path so we use LOCAL code, not installed packages
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root / "libs" / "python" / "agent"))
sys.path.insert(0, str(project_root / "libs" / "python" / "computer"))
sys.path.insert(0, str(project_root / "libs" / "python" / "core"))

# Banner so a reader of the console output can confirm which module
# copies were actually picked up.
print("=" * 60)
print("IMPORTANT: Using LOCAL modules from:")
print(f"  - {project_root / 'libs' / 'python' / 'agent'}")
print(f"  - {project_root / 'libs' / 'python' / 'computer'}")
print("=" * 60)

# NOTE: these imports intentionally come AFTER the sys.path mutation above
# (violating the usual imports-at-top convention) so that the local
# checkouts shadow any installed agent/computer packages.
from dotenv import load_dotenv
from agent import ComputerAgent
from computer import Computer

# Populate os.environ from a local .env file (CUA_*, OPENAI_API_KEY, ...).
load_dotenv()


def _print_output_item(item: dict) -> None:
    """Pretty-print one agent output item (message, call, or call output)."""
    item_type = item.get("type")
    if item_type == "message":
        # A message carries a list of content parts; only text parts are shown.
        for content_part in item.get("content", []):
            if content_part.get("text"):
                print(f"Agent: {content_part.get('text')}")
    elif item_type == "computer_call":
        action = item.get("action", {})
        action_type = action.get("type", "")
        print(f"Computer Action: {action_type}({action})")
    elif item_type == "computer_call_output":
        # The raw output is typically a screenshot; don't dump the payload.
        print("Computer Output: [Screenshot/Result]")


async def main():
    """Test the agent with local code changes.

    Verifies required environment variables, opens a cloud Computer
    session, and runs each example task through ComputerAgent while
    maintaining a message-based conversation history. Returns early
    (after printing an error) when configuration is incomplete.
    """
    # Verify environment variables before attempting any network work.
    container_name = os.getenv("CUA_CONTAINER_NAME")
    api_key = os.getenv("CUA_API_KEY")
    openai_key = os.getenv("OPENAI_API_KEY")

    if not container_name:
        print("ERROR: CUA_CONTAINER_NAME not set in .env file")
        return
    if not api_key:
        print("ERROR: CUA_API_KEY not set in .env file")
        return
    if not openai_key:
        print("ERROR: OPENAI_API_KEY not set in .env file")
        return

    # Only key prefixes are printed so full secrets never land in logs.
    print(f"\n✓ Container: {container_name}")
    print(f"✓ API Key: {api_key[:20]}...")
    print(f"✓ OpenAI Key: {openai_key[:20]}...\n")

    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=container_name,
        api_key=api_key,
    ) as computer:

        agent = ComputerAgent(
            model="omniparser+openai/gpt-4o",
            tools=[computer],
            only_n_most_recent_images=3,
            verbosity=logging.DEBUG,
            trajectory_dir="trajectories",
            use_prompt_caching=True,
            max_trajectory_budget=1.0,
        )

        # Example tasks to demonstrate the agent
        tasks = [
            "Click on the web browser icon",
            # "Visit https://form.jotform.com/252881246782264 and fill the form from the information in the pdf."
        ]

        # Use message-based conversation history
        history = []

        for i, task in enumerate(tasks):
            print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")

            # Add user message to history
            history.append({"role": "user", "content": task})

            # Run agent with conversation history
            async for result in agent.run(history, stream=False):
                # Fetch the output list once instead of re-reading the dict.
                output_items = result.get("output", [])

                # Agent outputs become part of the ongoing conversation.
                history += output_items

                # Print output for debugging
                for item in output_items:
                    _print_output_item(item)

            print(f"✅ Task {i+1}/{len(tasks)} completed: {task}")


# Script entry point: run the async workflow on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())

and for anthropic:

"""
Test script to run agent with LOCAL code changes.
This uses the local modules from libs/python/agent and libs/python/computer
"""
import logging
import asyncio
import os
import sys
from pathlib import Path

# Add local libs to Python path so we use LOCAL code, not installed packages
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root / "libs" / "python" / "agent"))
sys.path.insert(0, str(project_root / "libs" / "python" / "computer"))
sys.path.insert(0, str(project_root / "libs" / "python" / "core"))

# Banner so a reader of the console output can confirm which module
# copies were actually picked up.
print("=" * 60)
print("IMPORTANT: Using LOCAL modules from:")
print(f"  - {project_root / 'libs' / 'python' / 'agent'}")
print(f"  - {project_root / 'libs' / 'python' / 'computer'}")
print("=" * 60)

# NOTE: these imports intentionally come AFTER the sys.path mutation above
# (violating the usual imports-at-top convention) so that the local
# checkouts shadow any installed agent/computer packages.
from dotenv import load_dotenv
from agent import ComputerAgent
from computer import Computer

# Populate os.environ from a local .env file (CUA_*, ANTHROPIC_API_KEY, ...).
load_dotenv()


def _print_output_item(item: dict) -> None:
    """Pretty-print one agent output item (message, call, or call output)."""
    item_type = item.get("type")
    if item_type == "message":
        # A message carries a list of content parts; only text parts are shown.
        for content_part in item.get("content", []):
            if content_part.get("text"):
                print(f"Agent: {content_part.get('text')}")
    elif item_type == "computer_call":
        action = item.get("action", {})
        action_type = action.get("type", "")
        print(f"Computer Action: {action_type}({action})")
    elif item_type == "computer_call_output":
        # The raw output is typically a screenshot; don't dump the payload.
        print("Computer Output: [Screenshot/Result]")


async def main():
    """Test the agent with local code changes (Anthropic variant).

    Verifies required environment variables, opens a cloud Computer
    session, and runs each example task through ComputerAgent with the
    omniparser+claude model while maintaining a message-based
    conversation history. Returns early (after printing an error) when
    configuration is incomplete.
    """
    # Verify environment variables before attempting any network work.
    container_name = os.getenv("CUA_CONTAINER_NAME")
    api_key = os.getenv("CUA_API_KEY")
    openai_key = os.getenv("OPENAI_API_KEY")
    anthropic_key = os.getenv("ANTHROPIC_API_KEY")

    if not container_name:
        print("ERROR: CUA_CONTAINER_NAME not set in .env file")
        return
    if not api_key:
        print("ERROR: CUA_API_KEY not set in .env file")
        return
    if not openai_key:
        print("ERROR: OPENAI_API_KEY not set in .env file")
        return
    if not anthropic_key:
        print("ERROR: ANTHROPIC_API_KEY not set in .env file")
        return

    # Only key prefixes are printed so full secrets never land in logs.
    # The trailing "\n" lives on the LAST key line only; the original had
    # it on the OpenAI line too (copy-paste leftover), printing a double
    # blank line.
    print(f"\n✓ Container: {container_name}")
    print(f"✓ API Key: {api_key[:20]}...")
    print(f"✓ OpenAI Key: {openai_key[:20]}...")
    print(f"✓ Anthropic Key: {anthropic_key[:20]}...\n")

    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=container_name,
        api_key=api_key,
    ) as computer:

        agent = ComputerAgent(
            model="omniparser+anthropic/claude-sonnet-4-20250514",
            tools=[computer],
            only_n_most_recent_images=3,
            verbosity=logging.DEBUG,
            trajectory_dir="trajectories",
            use_prompt_caching=True,
            max_trajectory_budget=1.0,
        )

        # Example tasks to demonstrate the agent
        tasks = [
            "Click on the web browser icon"
        ]

        # Use message-based conversation history
        history = []

        for i, task in enumerate(tasks):
            print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")

            # Add user message to history
            history.append({"role": "user", "content": task})

            # Run agent with conversation history
            async for result in agent.run(history, stream=False):
                # Fetch the output list once instead of re-reading the dict.
                output_items = result.get("output", [])

                # Agent outputs become part of the ongoing conversation.
                history += output_items

                # Print output for debugging
                for item in output_items:
                    _print_output_item(item)

            print(f"✅ Task {i+1}/{len(tasks)} completed: {task}")


# Script entry point: run the async workflow on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())

OmniParser: Migration from aresponses to acompletion

Problem

OmniParser loop was using litellm.aresponses() which is buggy and poorly supported across providers, causing errors like:

BadRequestError: Unknown parameter: 'input[3].content'

Root Cause

Per LiteLLM documentation:

  • aresponses(): Uses non-standard input parameter, provider-specific format, minimal documentation
  • acompletion(): Uses standard messages parameter, universal provider support, extensive documentation

The moondream3 loop already uses acompletion() successfully.

Changes Applied

1. API Call Migration (lines 344, 330, 377)

# Before
response = await litellm.aresponses(input=messages, ...)

# After
response = await litellm.acompletion(messages=completion_messages, ...)

Why: Standard completion API is stable across all providers (OpenAI, Anthropic, Gemini, etc.)

2. Tool Schema Format (lines 23-85)

# Before (Responses API format)
{
    "type": "function",
    "name": "computer",
    "parameters": {...}
}

# After (Completion API format)
{
    "type": "function",
    "function": {
        "name": "computer",
        "parameters": {...}
    }
}

Why: Completion API requires schema wrapped in function key per Anthropic tool calling docs.

3. Message Format Conversion (lines 370-372, 407-411)

# Convert responses format → completion format
completion_messages = convert_responses_items_to_completion_messages(
    messages_with_element_ids,
    allow_images_in_tool_results=False
)

# Convert completion format → responses format
responses_items.extend(
    convert_completion_messages_to_responses_items([choice_message])
)

Why: Following moondream3 pattern (lines 398-441). Uses proven helpers from responses.py.

4. Coordinate Normalization (lines 305-333)

# Get screen dimensions from computer handler
width, height = await computer_handler.get_dimensions()

# Convert OmniParser normalized coords (0-1) to absolute pixels
pixel_x = int(norm_x * width)
pixel_y = int(norm_y * height)

Why: OmniParser returns normalized coordinates but computer handler expects pixels. Pattern from anthropic.py:72-77 and openai.py:20-25.

5. Annotated Image Injection (lines 335-337)

# Replace original screenshot with annotated image
annotated_image_url = f"data:image/png;base64,{result.annotated_image_base64}"
last_computer_call_output["output"]["image_url"] = annotated_image_url

Why: LLM must see numbered overlays to choose valid element IDs (1-58) instead of hallucinating IDs like 535.

6. Element ID Conversion (lines 422-445)

# Convert element_id → x,y after LLM response
if "element_id" in action:
    element_id = action["element_id"]
    if element_id in id2xy:
        x, y = id2xy[element_id]
        action["x"] = x
        action["y"] = y
        del action["element_id"]

Why: Mirrors moondream3's convert_computer_calls_desc2xy() pattern (responses.py:305-351).

7. Schema Required Fields (line 82)

"required": ["action", "element_id"]  # Added element_id

Why: Anthropic strictly follows JSON schema. Without element_id in required, Claude treats it as optional per Anthropic tool schema spec.

Results

  • ✅ Works with OpenAI (gpt-4o, gpt-4o-mini)
  • ✅ Works with Anthropic (claude-sonnet-4, claude-3.5-sonnet)
  • ✅ Works with any provider supporting acompletion
  • ✅ Coordinates properly converted (pixels, not normalized)
  • ✅ LLM sees annotated images with element IDs
  • ✅ Cross-provider compatibility maintained

References

@codecov-commenter
Copy link

codecov-commenter commented Oct 29, 2025

⚠️ Please install the Codecov GitHub app to ensure uploads and comments are reliably processed by Codecov.

Codecov Report

❌ Patch coverage is 1.92308% with 51 lines in your changes missing coverage. Please review.

Files with missing lines Patch % Lines
libs/python/agent/agent/loops/omniparser.py 1.92% 51 Missing ⚠️

📢 Thoughts on this report? Let us know!

@sarinali sarinali marked this pull request as ready for review October 29, 2025 18:20
@sarinali sarinali merged commit f91da5d into trycua:main Oct 29, 2025
9 of 12 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants