Skip to content

Conversation

@sarinali
Copy link
Contributor

@sarinali sarinali commented Dec 18, 2025

  • gemini 3 flash preview integration
  • gemini loop with serialization checking
  • expose browser tool execution in cua
  • add back and forth tools in browser
  • add custom tools instead and support gemini 3 pro and flash
  • merge main

Note: we're defining custom tools because, for some reason, I cannot get Gemini to correctly exclude the browser tools. We don't have good support for browser use in Cloud yet, so I just avoided that. Here is the issue I opened: googleapis/python-genai#1885.

Testing script:

"""Test script to verify ALL Gemini 3 custom function declarations work.

Tests each action type:
1. click_at - Click at coordinates
2. type_text_at - Type text at coordinates
3. hover_at - Move cursor without clicking
4. key_combination - Press key combinations
5. scroll_at - Scroll at specific coordinates
6. scroll_document - Scroll the entire page
7. drag_and_drop - Drag from one coordinate to another
8. wait_5_seconds - Wait for 5 seconds
"""

import asyncio
import logging
import os
import sys
import traceback
import signal
from pathlib import Path

# Put the in-repo packages ahead of any installed copies on the import path.
# Every insert targets index 0, so the last directory listed ends up first.
project_root = Path(__file__).parent
_local_libs = project_root / "libs" / "python"
for _pkg in ("agent", "computer", "core"):
    sys.path.insert(0, str(_local_libs / _pkg))

# Startup banner showing which local checkouts are being exercised.
_rule = "=" * 60
print(_rule)
print("TESTING ALL GEMINI 3 CUSTOM FUNCTION DECLARATIONS")
print(_rule)
print("Using LOCAL modules from:")
for _pkg in ("agent", "computer"):
    print(f"  - {_local_libs / _pkg}")
print(_rule)

from computer import Computer, VMProviderType
from agent import ComputerAgent
from utils import load_dotenv_files, handle_sigint

logger = logging.getLogger(__name__)


def validate_required_env_vars():
    """Validate that all required environment variables are present.

    ``CUA_API_KEY`` and ``CUA_SANDBOX_NAME`` are always required. Google
    credentials are satisfied either by ``GOOGLE_API_KEY`` on its own or by
    ``GOOGLE_CLOUD_PROJECT`` together with ``GOOGLE_APPLICATION_CREDENTIALS``.
    A variable that is set to an empty string counts as missing.

    Raises:
        ValueError: If any requirement is unmet. The message lists every
            missing item in the order described above.
    """
    missing_keys = [
        key for key in ("CUA_API_KEY", "CUA_SANDBOX_NAME") if not os.getenv(key)
    ]

    has_google_creds = bool(os.getenv("GOOGLE_API_KEY")) or (
        bool(os.getenv("GOOGLE_CLOUD_PROJECT"))
        and bool(os.getenv("GOOGLE_APPLICATION_CREDENTIALS"))
    )

    if not has_google_creds:
        # This entry is a description of the alternatives, not a variable
        # name, so it is recorded as missing directly instead of being
        # looked up in the environment.
        missing_keys.append(
            "GOOGLE_API_KEY or (GOOGLE_CLOUD_PROJECT + GOOGLE_APPLICATION_CREDENTIALS)"
        )

    if missing_keys:
        raise ValueError(
            f"Missing required environment variables: {', '.join(missing_keys)}. "
        )


# Test tasks designed to trigger specific actions.
#
# For every single-action test the table key doubles as the expected action
# and as the suffix of the display name, so only the prompt is spelled out
# here. Entry order is the order of tests 1-8 listed in the module docstring.
TEST_TASKS = {
    action: {
        "name": f"Test {action}",
        "task": prompt,
        "expected_action": action,
    }
    for action, prompt in {
        # Test 1: simple click
        "click_at": "Click on the Firefox icon in the taskbar to open the browser.",
        # Test 2: type text at a location
        "type_text_at": "Open the terminal application, then type 'echo hello world' and press Enter.",
        # Test 3: move the cursor without clicking
        "hover_at": "Hover over the Applications menu in the top left corner to see the dropdown, but don't click it yet.",
        # Test 4: keyboard shortcut
        "key_combination": "Press Ctrl+Alt+T to open a terminal window.",
        # Test 5: scroll at a specific location
        "scroll_at": "Open Firefox, go to wikipedia.org, and scroll down on the page to see more content.",
        # Test 6: scroll the whole page
        "scroll_document": "In the browser, scroll down the entire page to see the footer.",
        # Test 7: drag operation
        "drag_and_drop": "Open the file manager, then drag a file from one location to another folder.",
        # Test 8: wait action
        "wait_5_seconds": "Open Firefox and navigate to a website. Wait for the page to fully load before clicking anything.",
    }.items()
}

# Combined test: several actions in sequence, so it does not follow the
# key-equals-expected-action pattern used above.
TEST_TASKS["combined"] = {
    "name": "Test Combined Actions",
    "task": (
        "Do the following steps:\n"
        "1. Open the terminal (use keyboard shortcut Ctrl+Alt+T)\n"
        "2. Wait for the terminal to open\n"
        "3. Type 'ls -la' and press Enter\n"
        "4. Scroll down if the output is long\n"
        "5. Type 'pwd' and press Enter"
    ),
    "expected_action": "multiple",
}


async def run_single_test(test_key: str, computer: Computer, model: str):
    """Run one entry of ``TEST_TASKS`` through a fresh agent and report on it.

    Args:
        test_key: Key into ``TEST_TASKS`` selecting the task to run.
        computer: Computer instance handed to the agent as its only tool.
        model: Model identifier passed to ``ComputerAgent``.

    Returns:
        ``{"test", "success": True, "actions"}`` when the run finishes, where
        ``actions`` is the list of action types seen in ``computer_call``
        items; ``{"test", "success": False, "error"}`` if the run raised.
    """
    spec = TEST_TASKS[test_key]
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"RUNNING: {spec['name']}")
    print(f"Expected action: {spec['expected_action']}")
    print(rule)
    print(f"Task: {spec['task']}\n")

    agent = ComputerAgent(
        model=model,
        tools=[computer],
        only_n_most_recent_images=3,
        trajectory_dir="trajectories",
        use_prompt_caching=False,
        max_trajectory_budget=5.0,
        computer_environment="unspecified",
    )

    performed = []
    conversation = [{"role": "user", "content": spec["task"]}]

    try:
        async for chunk in agent.run(conversation, stream=False):
            for entry in chunk.get("output", []):
                kind = entry.get("type")

                if kind == "computer_call_output":
                    print("<<< OUTPUT: [Screenshot/Result received]")
                    continue

                if kind == "computer_call":
                    call = entry.get("action", {})
                    call_type = call.get("type", "")
                    performed.append(call_type)
                    print(f">>> ACTION: {call_type} - {call}")
                    continue

                if kind != "message":
                    continue

                # Echo only the message parts that carry non-empty text.
                for part in entry.get("content", []):
                    text = part.get("text")
                    if text:
                        print(f"Agent: {text}")

        print(f"\nActions used: {performed}")
        return {"test": test_key, "success": True, "actions": performed}

    except Exception as exc:
        # A failed run is reported as a result rather than propagated.
        print(f"ERROR: {exc}")
        traceback.print_exc()
        return {"test": test_key, "success": False, "error": str(exc)}


async def run_all_tests():
    """Run the selected single-action tests in order and print a summary.

    One ``Computer`` is created from ``CUA_API_KEY`` / ``CUA_SANDBOX_NAME``
    and shared by every test. Between tests the user is asked whether to
    carry on; answering ``n`` (or end-of-input on stdin) stops early. No
    question is asked after the final test. The summary covers every test
    that was actually run.
    """
    print("\n=== Testing ALL Gemini 3 Custom Function Declarations ===\n")

    computer = Computer(
        os_type="linux",
        api_key=os.getenv("CUA_API_KEY"),
        name=os.getenv("CUA_SANDBOX_NAME") or "",
        provider_type=VMProviderType.CLOUD,
    )

    # Use Gemini 3 Pro Preview or Flash Preview
    # model = "gemini-3-pro-preview"
    model = "gemini-3-flash-preview"
    # model = "gemini-2.0-flash"

    results = []

    # Run selected tests
    tests_to_run = [
        "click_at",
        "type_text_at",
        "key_combination",
        # "hover_at",  # Less common, may not be triggered
        # "scroll_at",
        # "scroll_document",
        # "drag_and_drop",
        # "wait_5_seconds",
        # "combined",
    ]

    last_index = len(tests_to_run) - 1
    for index, test_key in enumerate(tests_to_run):
        results.append(await run_single_test(test_key, computer, model))

        if index == last_index:
            # Nothing left to continue to, so don't ask.
            break

        print("\n" + "-" * 40)
        try:
            answer = input("Continue to next test? (y/n): ").strip().lower()
        except EOFError:
            # stdin is closed: stop here but still print the summary below.
            break
        if answer == "n":
            break

    # Print summary
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    for r in results:
        if r["success"]:
            detail = "Actions: " + ", ".join(r.get("actions", []))
            status = "PASS"
        else:
            # Failures carry an error message, not a list of actions.
            detail = "Error: " + r.get("error", "Unknown error")
            status = "FAIL"
        print(f"  {r['test']}: {status} - {detail}")


# Ordered (category, substrings) rules used to bucket raw action type names.
# The first rule with a matching substring wins.
_ACTION_CATEGORY_RULES = (
    ("click", ("click",)),
    ("type", ("type",)),
    ("keypress", ("key",)),
    ("scroll", ("scroll",)),
    ("wait", ("wait",)),
    ("move", ("move", "hover")),
)


def _categorize_action(action_type):
    """Return the coarse category for ``action_type``, or ``None`` if no rule matches."""
    lowered = action_type.lower()
    for category, needles in _ACTION_CATEGORY_RULES:
        if any(needle in lowered for needle in needles):
            return category
    return None


async def run_quick_combined_test():
    """Run a quick combined test that should trigger multiple actions.

    A single multi-step task is sent to a fresh agent. Every
    ``computer_call`` action type is recorded and printed as it arrives.
    Afterwards the recorded types are bucketed into coarse categories, and
    both the categories seen and the expected categories that never
    appeared are printed.
    """
    print("\n=== Quick Combined Test for Gemini 3 Actions ===\n")

    computer = Computer(
        os_type="linux",
        api_key=os.getenv("CUA_API_KEY"),
        name=os.getenv("CUA_SANDBOX_NAME") or "",
        provider_type=VMProviderType.CLOUD,
    )

    model = "gemini-3-flash-preview"

    agent = ComputerAgent(
        model=model,
        tools=[computer],
        only_n_most_recent_images=3,
        trajectory_dir="trajectories",
        use_prompt_caching=False,
        max_trajectory_budget=10.0,
        computer_environment="unspecified",
    )

    # This task should trigger multiple action types
    task = """Please do the following:
1. Press Ctrl+Alt+T to open a terminal
2. Wait for it to open
3. Type 'echo "Hello from Gemini 3!"' and press Enter
4. Type 'ls -la' and press Enter
5. If the output is long, scroll down to see more
6. Type 'pwd' and press Enter
7. Now open Firefox by clicking on the Firefox icon in the taskbar or applications menu
8. Wait for Firefox to open, then navigate to https://www.github.com
9. Scroll down the GitHub homepage to see more content
10. Hover over the search bar at the top of the page
11. Type 'computer use agent' in the search bar and press Enter
12. Scroll through the search results to find interesting repositories
13. Click on one of the repository links to open it
14. Scroll through the repository page to see the README and code
"""

    print(f"Task: {task}\n")
    print("This should trigger: key_combination, wait_5_seconds, type_text_at, scroll_at/scroll_document, click_at, hover_at, navigate\n")

    actions_seen = []

    async for result in agent.run([{"role": "user", "content": task}], stream=False):
        for item in result.get("output", []):
            item_type = item.get("type")
            if item_type == "message":
                # Echo only the message parts that carry non-empty text.
                for content_part in item.get("content", []):
                    text = content_part.get("text")
                    if text:
                        print(f"Agent: {text}")
            elif item_type == "computer_call":
                action = item.get("action", {})
                action_type = action.get("type", "")
                actions_seen.append(action_type)
                print(f">>> ACTION: {action_type}")
                print(f"    Details: {action}")
            elif item_type == "computer_call_output":
                print("<<< OUTPUT: [Screenshot received]")

    print("\n" + "=" * 60)
    print("ACTIONS TRIGGERED:")
    print("=" * 60)
    for i, action in enumerate(actions_seen, 1):
        print(f"  {i}. {action}")

    # Check which expected categories were (and were not) exercised.
    expected = {category for category, _ in _ACTION_CATEGORY_RULES}
    seen_types = {
        category
        for category in map(_categorize_action, actions_seen)
        if category is not None
    }

    print(f"\nAction types seen: {seen_types}")

    missing = sorted(expected - seen_types)
    print(f"Expected action types not seen: {', '.join(missing) if missing else 'none'}")


def main():
    """Interactive entry point: prepare the environment, then run one test mode.

    Calls ``load_dotenv_files()`` and ``validate_required_env_vars()``,
    installs ``handle_sigint`` for SIGINT, and asks which mode to run.
    Entering ``2`` runs the individual action tests; any other input runs
    the quick combined test. Any ``Exception`` raised along the way is
    printed with its traceback instead of propagating.
    """
    try:
        load_dotenv_files()
        validate_required_env_vars()

        signal.signal(signal.SIGINT, handle_sigint)

        menu = (
            "\nSelect test mode:",
            "  1. Run quick combined test (recommended)",
            "  2. Run individual action tests",
        )
        for line in menu:
            print(line)

        selection = input("\nEnter choice (1 or 2): ").strip()

        # Only an exact "2" selects the per-action suite.
        entry_point = run_all_tests if selection == "2" else run_quick_combined_test
        asyncio.run(entry_point())

    except Exception as exc:
        print(f"Error running test: {exc}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

@vercel
Copy link

vercel bot commented Dec 18, 2025

@sarinali is attempting to deploy a commit to the Cua Team on Vercel.

A member of the Team first needs to authorize it.

@sentry
Copy link

sentry bot commented Dec 18, 2025

Codecov Report

❌ Patch coverage is 3.35196% with 173 lines in your changes missing coverage. Please review.

Files with missing lines Patch % Lines
libs/python/agent/agent/loops/gemini.py 2.94% 165 Missing ⚠️
.../python/computer-server/computer_server/browser.py 0.00% 6 Missing ⚠️
libs/python/agent/agent/computers/cua.py 33.33% 2 Missing ⚠️

📢 Thoughts on this report? Let us know!

@vercel
Copy link

vercel bot commented Dec 18, 2025

The latest updates on your projects. Learn more about Vercel for GitHub.

Project Deployment Review Updated (UTC)
docs Ready Ready Preview, Comment Dec 18, 2025 5:40pm

Updated API key retrieval to allow for kwargs input.
@ddupont808 ddupont808 merged commit 60d8881 into trycua:main Dec 18, 2025
8 of 10 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants