Skip to content

Conversation

@sarinali
Copy link
Contributor

I generated this summary from my coding session, but all the relevant documentation is linked below. I wanted to stress that the main issue was the input/output mismatch.

And to test this:

"""
Test script to run agent with LOCAL code changes.
This uses the local modules from libs/python/agent and libs/python/computer
"""

import asyncio
import os
import sys
from pathlib import Path
import logging

# Add local libs to Python path so we use LOCAL code, not installed packages
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root / "libs" / "python" / "agent"))
sys.path.insert(0, str(project_root / "libs" / "python" / "computer"))
sys.path.insert(0, str(project_root / "libs" / "python" / "core"))

# Banner so a reader of the console output can confirm which module
# copies were actually picked up.
print("=" * 60)
print("IMPORTANT: Using LOCAL modules from:")
print(f"  - {project_root / 'libs' / 'python' / 'agent'}")
print(f"  - {project_root / 'libs' / 'python' / 'computer'}")
print("=" * 60)

# NOTE: these imports intentionally come AFTER the sys.path mutation above
# (violating the usual imports-at-top convention) so that the local
# checkouts shadow any installed agent/computer packages.
from dotenv import load_dotenv
from agent import ComputerAgent
from computer import Computer

# Populate os.environ from a local .env file (CUA_*, OPENAI_API_KEY, ...).
load_dotenv()


def _print_output_item(item: dict) -> None:
    """Pretty-print one agent output item (message, call, or call output)."""
    item_type = item.get("type")
    if item_type == "message":
        # A message carries a list of content parts; only text parts are shown.
        for content_part in item.get("content", []):
            if content_part.get("text"):
                print(f"Agent: {content_part.get('text')}")
    elif item_type == "computer_call":
        action = item.get("action", {})
        action_type = action.get("type", "")
        print(f"Computer Action: {action_type}({action})")
    elif item_type == "computer_call_output":
        # The raw output is typically a screenshot; don't dump the payload.
        print("Computer Output: [Screenshot/Result]")


async def main():
    """Test the agent with local code changes.

    Verifies required environment variables, opens a cloud Computer
    session, and runs each example task through ComputerAgent while
    maintaining a message-based conversation history. Returns early
    (after printing an error) when configuration is incomplete.
    """
    # Verify environment variables before attempting any network work.
    container_name = os.getenv("CUA_CONTAINER_NAME")
    api_key = os.getenv("CUA_API_KEY")
    openai_key = os.getenv("OPENAI_API_KEY")

    if not container_name:
        print("ERROR: CUA_CONTAINER_NAME not set in .env file")
        return
    if not api_key:
        print("ERROR: CUA_API_KEY not set in .env file")
        return
    if not openai_key:
        print("ERROR: OPENAI_API_KEY not set in .env file")
        return

    # Only key prefixes are printed so full secrets never land in logs.
    print(f"\n✓ Container: {container_name}")
    print(f"✓ API Key: {api_key[:20]}...")
    print(f"✓ OpenAI Key: {openai_key[:20]}...\n")

    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=container_name,
        api_key=api_key,
    ) as computer:

        agent = ComputerAgent(
            model="omniparser+openai/gpt-4o",
            tools=[computer],
            only_n_most_recent_images=3,
            verbosity=logging.DEBUG,
            trajectory_dir="trajectories",
            use_prompt_caching=True,
            max_trajectory_budget=1.0,
        )

        # Example tasks to demonstrate the agent
        tasks = [
            "Click on the web browser icon",
            # "Visit https://form.jotform.com/252881246782264 and fill the form from the information in the pdf."
        ]

        # Use message-based conversation history
        history = []

        for i, task in enumerate(tasks):
            print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")

            # Add user message to history
            history.append({"role": "user", "content": task})

            # Run agent with conversation history
            async for result in agent.run(history, stream=False):
                # Fetch the output list once instead of re-reading the dict.
                output_items = result.get("output", [])

                # Agent outputs become part of the ongoing conversation.
                history += output_items

                # Print output for debugging
                for item in output_items:
                    _print_output_item(item)

            print(f"✅ Task {i+1}/{len(tasks)} completed: {task}")


# Script entry point: run the async workflow on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())

and for anthropic:

"""
Test script to run agent with LOCAL code changes.
This uses the local modules from libs/python/agent and libs/python/computer
"""
import logging
import asyncio
import os
import sys
from pathlib import Path

# Add local libs to Python path so we use LOCAL code, not installed packages
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root / "libs" / "python" / "agent"))
sys.path.insert(0, str(project_root / "libs" / "python" / "computer"))
sys.path.insert(0, str(project_root / "libs" / "python" / "core"))

# Banner so a reader of the console output can confirm which module
# copies were actually picked up.
print("=" * 60)
print("IMPORTANT: Using LOCAL modules from:")
print(f"  - {project_root / 'libs' / 'python' / 'agent'}")
print(f"  - {project_root / 'libs' / 'python' / 'computer'}")
print("=" * 60)

# NOTE: these imports intentionally come AFTER the sys.path mutation above
# (violating the usual imports-at-top convention) so that the local
# checkouts shadow any installed agent/computer packages.
from dotenv import load_dotenv
from agent import ComputerAgent
from computer import Computer

# Populate os.environ from a local .env file (CUA_*, ANTHROPIC_API_KEY, ...).
load_dotenv()


def _print_output_item(item: dict) -> None:
    """Pretty-print one agent output item (message, call, or call output)."""
    item_type = item.get("type")
    if item_type == "message":
        # A message carries a list of content parts; only text parts are shown.
        for content_part in item.get("content", []):
            if content_part.get("text"):
                print(f"Agent: {content_part.get('text')}")
    elif item_type == "computer_call":
        action = item.get("action", {})
        action_type = action.get("type", "")
        print(f"Computer Action: {action_type}({action})")
    elif item_type == "computer_call_output":
        # The raw output is typically a screenshot; don't dump the payload.
        print("Computer Output: [Screenshot/Result]")


async def main():
    """Test the agent with local code changes (Anthropic variant).

    Verifies required environment variables, opens a cloud Computer
    session, and runs each example task through ComputerAgent with the
    omniparser+claude model while maintaining a message-based
    conversation history. Returns early (after printing an error) when
    configuration is incomplete.
    """
    # Verify environment variables before attempting any network work.
    container_name = os.getenv("CUA_CONTAINER_NAME")
    api_key = os.getenv("CUA_API_KEY")
    openai_key = os.getenv("OPENAI_API_KEY")
    anthropic_key = os.getenv("ANTHROPIC_API_KEY")

    if not container_name:
        print("ERROR: CUA_CONTAINER_NAME not set in .env file")
        return
    if not api_key:
        print("ERROR: CUA_API_KEY not set in .env file")
        return
    if not openai_key:
        print("ERROR: OPENAI_API_KEY not set in .env file")
        return
    if not anthropic_key:
        print("ERROR: ANTHROPIC_API_KEY not set in .env file")
        return

    # Only key prefixes are printed so full secrets never land in logs.
    # The trailing "\n" lives on the LAST key line only; the original had
    # it on the OpenAI line too (copy-paste leftover), printing a double
    # blank line.
    print(f"\n✓ Container: {container_name}")
    print(f"✓ API Key: {api_key[:20]}...")
    print(f"✓ OpenAI Key: {openai_key[:20]}...")
    print(f"✓ Anthropic Key: {anthropic_key[:20]}...\n")

    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=container_name,
        api_key=api_key,
    ) as computer:

        agent = ComputerAgent(
            model="omniparser+anthropic/claude-sonnet-4-20250514",
            tools=[computer],
            only_n_most_recent_images=3,
            verbosity=logging.DEBUG,
            trajectory_dir="trajectories",
            use_prompt_caching=True,
            max_trajectory_budget=1.0,
        )

        # Example tasks to demonstrate the agent
        tasks = [
            "Click on the web browser icon"
        ]

        # Use message-based conversation history
        history = []

        for i, task in enumerate(tasks):
            print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")

            # Add user message to history
            history.append({"role": "user", "content": task})

            # Run agent with conversation history
            async for result in agent.run(history, stream=False):
                # Fetch the output list once instead of re-reading the dict.
                output_items = result.get("output", [])

                # Agent outputs become part of the ongoing conversation.
                history += output_items

                # Print output for debugging
                for item in output_items:
                    _print_output_item(item)

            print(f"✅ Task {i+1}/{len(tasks)} completed: {task}")


# Script entry point: run the async workflow on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())

OmniParser: Migration from aresponses to acompletion

Problem

OmniParser loop was using litellm.aresponses() which is buggy and poorly supported across providers, causing errors like:

BadRequestError: Unknown parameter: 'input[3].content'

Root Cause

Per LiteLLM documentation:

  • aresponses(): Uses non-standard input parameter, provider-specific format, minimal documentation
  • acompletion(): Uses standard messages parameter, universal provider support, extensive documentation

The moondream3 loop already uses acompletion() successfully.

Changes Applied

1. API Call Migration (lines 344, 330, 377)

# Before
response = await litellm.aresponses(input=messages, ...)

# After
response = await litellm.acompletion(messages=completion_messages, ...)

Why: Standard completion API is stable across all providers (OpenAI, Anthropic, Gemini, etc.)

2. Tool Schema Format (lines 23-85)

# Before (Responses API format)
{
    "type": "function",
    "name": "computer",
    "parameters": {...}
}

# After (Completion API format)
{
    "type": "function",
    "function": {
        "name": "computer",
        "parameters": {...}
    }
}

Why: Completion API requires schema wrapped in function key per Anthropic tool calling docs.

3. Message Format Conversion (lines 370-372, 407-411)

# Convert responses format → completion format
completion_messages = convert_responses_items_to_completion_messages(
    messages_with_element_ids,
    allow_images_in_tool_results=False
)

# Convert completion format → responses format
responses_items.extend(
    convert_completion_messages_to_responses_items([choice_message])
)

Why: Following moondream3 pattern (lines 398-441). Uses proven helpers from responses.py.

4. Coordinate Normalization (lines 305-333)

# Get screen dimensions from computer handler
width, height = await computer_handler.get_dimensions()

# Convert OmniParser normalized coords (0-1) to absolute pixels
pixel_x = int(norm_x * width)
pixel_y = int(norm_y * height)

Why: OmniParser returns normalized coordinates but computer handler expects pixels. Pattern from anthropic.py:72-77 and openai.py:20-25.

5. Annotated Image Injection (lines 335-337)

# Replace original screenshot with annotated image
annotated_image_url = f"data:image/png;base64,{result.annotated_image_base64}"
last_computer_call_output["output"]["image_url"] = annotated_image_url

Why: LLM must see numbered overlays to choose valid element IDs (1-58) instead of hallucinating IDs like 535.

6. Element ID Conversion (lines 422-445)

# Convert element_id → x,y after LLM response
if "element_id" in action:
    element_id = action["element_id"]
    if element_id in id2xy:
        x, y = id2xy[element_id]
        action["x"] = x
        action["y"] = y
        del action["element_id"]

Why: Mirrors moondream3's convert_computer_calls_desc2xy() pattern (responses.py:305-351).

7. Schema Required Fields (line 82)

"required": ["action", "element_id"]  # Added element_id

Why: Anthropic strictly follows JSON schema. Without element_id in required, Claude treats it as optional per Anthropic tool schema spec.

Results

  • ✅ Works with OpenAI (gpt-4o, gpt-4o-mini)
  • ✅ Works with Anthropic (claude-sonnet-4, claude-3.5-sonnet)
  • ✅ Works with any provider supporting acompletion
  • ✅ Coordinates properly converted (pixels, not normalized)
  • ✅ LLM sees annotated images with element IDs
  • ✅ Cross-provider compatibility maintained

References

@codecov-commenter
Copy link

codecov-commenter commented Oct 29, 2025

⚠️ Please install the Codecov GitHub app to ensure uploads and comments are reliably processed by Codecov.

Codecov Report

❌ Patch coverage is 1.92308% with 51 lines in your changes missing coverage. Please review.

Files with missing lines Patch % Lines
libs/python/agent/agent/loops/omniparser.py 1.92% 51 Missing ⚠️

📢 Thoughts on this report? Let us know!

@sarinali sarinali marked this pull request as ready for review October 29, 2025 18:20
@sarinali sarinali merged commit f91da5d into trycua:main Oct 29, 2025
9 of 12 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants