dapr · yaron2 · Sep 17, 2025 · Sep 4, 2025 · Sep 4, 2025 · Sep 4, 2025
@@ -88,9 +88,10 @@ class AgentBase(BaseModel, ABC):
     max_iterations: int = Field(
         default=10, description="Max iterations for conversation cycles."
     )
+    # TODO(@Sicoyle): Rename this field to make its purpose clearer
     memory: MemoryBase = Field(
         default_factory=ConversationListMemory,
-        description="Handles conversation history and context storage.",
+        description="Handles long-term conversation history (for all workflow instance-ids within the same session) and context storage.",
     )
     # TODO: we should have a system_template, prompt_template, and response_template, or better separation here.
     # If we have something like a customer service agent, we want diff templates for different types of interactions.

@@ -1,5 +1,5 @@
 from pydantic import BaseModel, Field
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Any
 from dapr_agents.types import MessageContent, ToolExecutionRecord
 from datetime import datetime
 import uuid
@@ -30,7 +30,7 @@ class DurableAgentWorkflowEntry(BaseModel):
         description="Timestamp when the workflow was started",
     )
     end_time: Optional[datetime] = Field(
-        default_factory=datetime.now,
+        default=None,
         description="Timestamp when the workflow was completed or failed",
     )
     messages: List[DurableAgentMessage] = Field(
@@ -44,9 +44,21 @@ class DurableAgentWorkflowEntry(BaseModel):
         default_factory=list, description="Tool message exchanged during the workflow"
     )
     source: Optional[str] = Field(None, description="Entity that initiated the task.")
-    source_workflow_instance_id: Optional[str] = Field(
+    workflow_instance_id: Optional[str] = Field(
+        default=None,
+        description="The agent's own workflow instance ID.",
+    )
+    triggering_workflow_instance_id: Optional[str] = Field(
+        default=None,
+        description="The workflow instance ID of the entity that triggered this agent (for multi-agent communication).",
+    )
+    workflow_name: Optional[str] = Field(
+        default=None,
+        description="The name of the workflow.",
+    )
+    trace_context: Optional[Dict[str, Any]] = Field(
         default=None,
-        description="The workflow instance ID associated with the original request.",
+        description="OpenTelemetry trace context for workflow resumption.",
     )
 
 

@@ -223,5 +223,10 @@ def generate(
                 stream=stream,
             )
         except Exception as e:
-            logger.error("ChatCompletion API error", exc_info=True)
-            raise ValueError("Failed to process chat completion") from e
+            error_type = type(e).__name__
+            error_msg = str(e)
+
+            logger.error(f"OpenAI ChatCompletion API error: {error_type} - {error_msg}")
+            logger.error("Full error details:", exc_info=True)
+
+            raise ValueError(f"OpenAI API error ({error_type}): {error_msg}") from e
@@ -191,11 +191,19 @@ def create_child_span_with_context(
         Span context manager that can be used in 'with' statements for proper
         span lifecycle management with restored parent-child relationships
     """
+    # Try to restore context from W3C format first
     parent_ctx = restore_otel_context(otel_context)
 
     if parent_ctx:
         return tracer.start_as_current_span(
             span_name, context=parent_ctx, attributes=attributes
         )
     else:
-        return tracer.start_as_current_span(span_name, attributes=attributes)
+        # Fallback: try to use current active span as parent
+        current_span = trace.get_current_span()
+        if current_span and current_span.is_recording():
+            # Use current span as parent by creating a child span
+            return tracer.start_as_current_span(span_name, attributes=attributes)
+        else:
+            # Last resort: create root span
+            return tracer.start_as_current_span(span_name, attributes=attributes)
@@ -95,6 +95,78 @@ def get_context(self, instance_id: str) -> Optional[Dict[str, Any]]:
                 logger.warning(f"⚠️ No context found for instance {instance_id}")
             return context
 
+    def create_resumed_workflow_context(
+        self,
+        instance_id: str,
+        agent_name: Optional[str] = None,
+        stored_trace_context: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Create and store a new trace context for a workflow resumed after restart.
+
+        When an app restarts, the in-memory context storage is lost, so a new
+        trace is started here; it is not connected to the original trace.
+
+        Args:
+            instance_id (str): Unique workflow instance ID
+            agent_name (Optional[str]): Agent name; span name uses "DurableAgent" if unset
+            stored_trace_context (Optional[Dict[str, Any]]): Not read by this method
+
+        Returns:
+            Dict[str, Any]: New W3C context data; on failure "traceparent" and
+                "tracestate" are None and "error" holds the exception text
+        """
+        try:
+            from opentelemetry import trace
+            from opentelemetry.trace.propagation.tracecontext import (
+                TraceContextTextMapPropagator,
+            )
+            from .constants import OPENINFERENCE_SPAN_KIND
+
+            tracer = trace.get_tracer(__name__)
+
+            # Create AGENT span with proper agent name for resumed workflow
+            agent_display_name = agent_name or "DurableAgent"
+            span_name = f"{agent_display_name}.ToolCallingWorkflow"
+            with tracer.start_as_current_span(span_name) as span:
+                span.set_attribute(OPENINFERENCE_SPAN_KIND, "AGENT")
+                span.set_attribute("workflow.instance_id", instance_id)
+                span.set_attribute("workflow.resumed", True)
+                # None is not a valid span attribute value, so only set a real name
+                if agent_name is not None:
+                    span.set_attribute("agent.name", agent_name)
+
+                # Serialize the current span into W3C trace-context headers
+                carrier: Dict[str, str] = {}
+                TraceContextTextMapPropagator().inject(carrier)
+
+                context_data = {
+                    "traceparent": carrier.get("traceparent"),
+                    "tracestate": carrier.get("tracestate"),
+                    "instance_id": instance_id,
+                    "resumed": True,
+                    "debug_info": f"New trace created for resumed workflow {instance_id}",
+                }
+
+                self.store_context(instance_id, context_data)
+                logger.info(
+                    f"Created new trace context for resumed workflow {instance_id}"
+                )
+                return context_data
+
+        except Exception as e:
+            logger.error(
+                f"Failed to create resumed workflow context for {instance_id}: {e}",
+                exc_info=True,
+            )
+            return {
+                "traceparent": None,
+                "tracestate": None,
+                "instance_id": instance_id,
+                "resumed": True,
+                "error": str(e),
+            }
+
     def cleanup_context(self, instance_id: str) -> None:
         """
         Clean up stored context for a completed workflow instance to prevent memory leaks.
@@ -168,6 +240,20 @@ def get_workflow_context(instance_id: str) -> Optional[Dict[str, Any]]:
     return _context_storage.get_context(instance_id)
 
 
+def get_all_workflow_contexts() -> Dict[str, Dict[str, Any]]:
+    """
+    Return a snapshot of every OpenTelemetry context held in the global storage.
+
+    Debugging/fallback aid for when instance-specific lookup fails (timing issues).
+
+    Returns:
+        Dict[str, Dict[str, Any]]: Shallow copy of the stored contexts by key
+    """
+    with _context_storage._lock:
+        snapshot = dict(_context_storage._storage)
+    return snapshot
+
+
 def cleanup_workflow_context(instance_id: str) -> None:
     """
     Clean up stored context for a completed workflow instance using the global storage.