Coverage for src/harnessutils/compaction/summarization.py: 97% (106 statements)

1"""Tier 3: LLM-powered conversation summarization.
3Uses LLM to semantically compress conversation when approaching limit.
4Cost: Expensive (~$0.10-0.50), Latency: ~3-5s.
5"""
7from dataclasses import dataclass
8from typing import Any
10from harnessutils.models.message import Message
11from harnessutils.models.usage import CacheUsage, Usage
12from harnessutils.types import LLMClient
SUMMARIZATION_PROMPT = """You are a helpful AI assistant tasked with summarizing conversations.

When asked to summarize, provide a detailed but concise summary of the conversation.
Focus on information that would be helpful for continuing the conversation, including:
- What was done
- What is currently being worked on
- Which files are being modified
- What needs to be done next
- Key user requests, constraints, or preferences that should persist
- Important technical decisions and why they were made

Your summary should be comprehensive enough to provide context but concise enough
to be quickly understood."""

DIFFERENTIAL_SUMMARIZATION_PROMPT = """You are a helpful AI assistant tasked with \
updating conversation summaries.

You will be given:
1. A previous summary of the conversation
2. New messages/activity since that summary

Your task is to create an UPDATED summary that:
- Preserves important context from the previous summary
- Incorporates new information from recent messages
- Maintains continuity while staying concise
- Focuses on what's relevant for continuing the conversation

The updated summary should feel like a natural evolution of the previous one,
not a disconnected list of old + new information."""

@dataclass
class SummarizationResult:
    """Result of summarization operation."""

    summary_message: Message
    tokens_used: Usage
    cost: float

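# A sketch of what a result looks like (hypothetical numbers; the `Usage` and
# `CacheUsage` constructors follow the calls used further down this module):
#
#     result = SummarizationResult(
#         summary_message=msg,  # a Message with summary=True
#         tokens_used=Usage(input=12_000, output=800, reasoning=0,
#                           cache=CacheUsage(read=10_000, write=0)),
#         cost=0.18,
#     )
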
def _find_last_summary(messages: list[Message]) -> Message | None:
    """Find the last summary message in the conversation.

    Args:
        messages: List of messages to search

    Returns:
        Last summary message or None if no summary exists
    """
    for msg in reversed(messages):
        if msg.summary:
            return msg
    return None

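# Behavior sketch (hypothetical messages; relies only on the truthiness of
# `Message.summary`). The reversed scan returns the most recent summary:
#
#     msgs = [first, summary_a, later, summary_b, newest]  # summary_a/b have summary=True
#     _find_last_summary(msgs) is summary_b   # True: scans from the end
#     _find_last_summary([first, later])      # -> None: no summary present
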
def _get_messages_since_summary(
    messages: list[Message], summary_msg: Message
) -> list[Message]:
    """Get all messages after the given summary message.

    Args:
        messages: Full list of messages
        summary_msg: The summary message to find messages after

    Returns:
        List of messages after the summary
    """
    summary_idx = None
    for i, msg in enumerate(messages):
        if msg.id == summary_msg.id:
            summary_idx = i
            break

    if summary_idx is None:
        return messages

    return messages[summary_idx + 1 :]

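# Sketch of the two paths (hypothetical ids). Matching is by `id`, and the
# slice excludes the summary itself; if the summary is not found, the full
# list is returned so differential mode degrades to full context rather than
# silently dropping messages:
#
#     _get_messages_since_summary([a, s, b, c], s)       # -> [b, c]
#     _get_messages_since_summary([a, b, c], missing_s)  # -> [a, b, c]
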
def _build_differential_prompt(
    last_summary: str, new_messages: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Build prompt for differential summarization.

    Args:
        last_summary: Content of the previous summary
        new_messages: New messages since the last summary

    Returns:
        List of messages formatted for differential summarization
    """
    prompt_messages = [
        {
            "role": "user",
            "content": f"""Previous summary:
{last_summary}

New activity since summary:""",
        }
    ]

    # Add new messages
    prompt_messages.extend(new_messages)

    # Add request for updated summary
    prompt_messages.append(
        {
            "role": "user",
            "content": (
                "Provide an UPDATED summary that incorporates both the "
                "previous context and new activity."
            ),
        }
    )

    return prompt_messages

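# The resulting list is framing + new activity + instruction, e.g. with
# hypothetical content:
#
#     _build_differential_prompt("Fixed the parser bug.",
#                                [{"role": "user", "content": "Now add tests."}])
#     # -> [
#     #   {"role": "user", "content": "Previous summary:\nFixed the parser bug.\n\nNew activity since summary:"},
#     #   {"role": "user", "content": "Now add tests."},
#     #   {"role": "user", "content": "Provide an UPDATED summary that incorporates both the previous context and new activity."},
#     # ]
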
def is_overflow(usage: Usage, context_limit: int, output_limit: int) -> bool:
    """Check if conversation has overflowed context window.

    The context window must fit both input tokens and output tokens.
    We reserve space for output, so usable input space is smaller.

    Args:
        usage: Token usage from last turn
        context_limit: Maximum context tokens for model
        output_limit: Maximum output tokens for model

    Returns:
        True if input tokens exceed available space after reserving for output
    """
    total_input = usage.input + usage.cache.read
    usable_input_space = context_limit - output_limit
    return total_input > usable_input_space

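# Worked example (hypothetical limits): with context_limit=200_000 and
# output_limit=8_192, usable input space is 191_808 tokens. A turn with
# input=150_000 and cache.read=45_000 totals 195_000 > 191_808, so overflow:
#
#     is_overflow(Usage(input=150_000, output=0, reasoning=0,
#                       cache=CacheUsage(read=45_000, write=0)),
#                 context_limit=200_000, output_limit=8_192)   # -> True
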
def summarize_conversation(
    messages: list[Message],
    llm_client: LLMClient,
    parent_message_id: str,
    message_id: str,
    model: str | None = None,
    auto_mode: bool = False,
    config: Any | None = None,
    force_full: bool = False,
) -> SummarizationResult:
    """Summarize conversation using LLM.

    Args:
        messages: Conversation messages to summarize
        llm_client: LLM client implementation (callback from app)
        parent_message_id: ID of message that triggered summarization
        message_id: ID for the summary message
        model: Optional model to use (cheaper/faster recommended)
        auto_mode: Whether this was auto-triggered
        config: Optional SummarizationConfig for differential mode
        force_full: Force full summarization even if differential is available

    Returns:
        SummarizationResult with summary message and metrics
    """
    # Check if differential summarization is possible
    use_differential = False
    last_summary = None
    new_messages_only = messages

    include_tool_outputs = getattr(config, "include_tool_outputs", True)
    tool_output_max_tokens = getattr(config, "tool_output_max_tokens", 300)

    if config and not force_full:
        last_summary = _find_last_summary(messages)
        if last_summary:
            new_messages_only = _get_messages_since_summary(messages, last_summary)

            # Only use differential if we have new messages and haven't exceeded limit
            max_since_summary = getattr(config, "max_messages_since_summary", 30)
            if new_messages_only and len(new_messages_only) <= max_since_summary:
                use_differential = True

    # Choose model and prompt based on mode
    if use_differential and last_summary:
        # Differential mode: cheaper/faster model
        selected_model = model or getattr(config, "differential_model", None)
        system_prompt = (
            getattr(config, "differential_prompt", None) or DIFFERENTIAL_SUMMARIZATION_PROMPT
        )

        # Get last summary content
        summary_content = ""
        for part in last_summary.parts:
            if part.type == "text":
                summary_content = getattr(part, "text", "")
                break

        # Convert only new messages
        new_model_messages = _convert_to_model_format(
            new_messages_only,
            include_tool_outputs=include_tool_outputs,
            tool_output_max_tokens=tool_output_max_tokens,
        )

        # Build differential prompt
        model_messages = _build_differential_prompt(summary_content, new_model_messages)
    else:
        # Full mode: more capable model
        selected_model = model or getattr(config, "full_model", None)
        system_prompt = getattr(config, "summarization_prompt", None) or SUMMARIZATION_PROMPT

        # Convert all messages
        model_messages = _convert_to_model_format(
            messages,
            include_tool_outputs=include_tool_outputs,
            tool_output_max_tokens=tool_output_max_tokens,
        )
        model_messages.append(
            {
                "role": "user",
                "content": "Provide a detailed summary for continuing our conversation.",
            }
        )

    response = llm_client.invoke(
        messages=model_messages,
        system=[system_prompt],
        model=selected_model,
    )

    usage_data = response.get("usage", {})
    cache_data = usage_data.get("cache", {})

    usage = Usage(
        input=usage_data.get("input", 0),
        output=usage_data.get("output", 0),
        reasoning=usage_data.get("reasoning", 0),
        cache=CacheUsage(
            read=cache_data.get("read", 0),
            write=cache_data.get("write", 0),
        ),
    )

    cost = response.get("cost", 0.0)

    from harnessutils.models.parts import TextPart

    summary_message = Message(
        id=message_id,
        role="assistant",
        parent_id=parent_message_id,
        summary=True,
        agent="summarization",
        model={"model": response.get("model", model or "unknown")},
        tokens=usage,
        cost=cost,
    )
    summary_message.add_part(TextPart(text=response.get("content", "")))

    return SummarizationResult(
        summary_message=summary_message,
        tokens_used=usage,
        cost=cost,
    )

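# Usage sketch (hypothetical client and ids). `LLMClient` is treated here as
# anything with an `invoke(messages=..., system=..., model=...)` method that
# returns the dict shape read above (`content`, `model`, `usage`, `cost`):
#
#     class StubClient:
#         def invoke(self, messages, system, model):
#             return {"content": "Summary...", "model": model or "stub",
#                     "usage": {"input": 1_000, "output": 120,
#                               "cache": {"read": 0, "write": 0}},
#                     "cost": 0.002}
#
#     result = summarize_conversation(messages, StubClient(),
#                                     parent_message_id="msg_41",
#                                     message_id="msg_42")
#     result.summary_message.summary   # -> True
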
def _convert_to_model_format(
    messages: list[Message],
    include_tool_outputs: bool = True,
    tool_output_max_tokens: int = 300,
) -> list[dict[str, Any]]:
    """Convert internal messages to model format for summarization.

    Args:
        messages: Internal message objects
        include_tool_outputs: Whether to include tool outputs in context
        tool_output_max_tokens: Max tokens per tool output

    Returns:
        List of messages in model format
    """
    from harnessutils.tokens.exact import count_tokens_fast

    model_messages: list[dict[str, Any]] = []

    for msg in messages:
        if len(msg.parts) == 0:
            continue

        if msg.role == "user":
            content_parts = []
            for part in msg.parts:
                if part.type == "text" and not getattr(part, "ignored", False):
                    content_parts.append(getattr(part, "text", ""))

            if content_parts:
                model_messages.append(
                    {
                        "role": "user",
                        "content": "\n".join(content_parts),
                    }
                )

        elif msg.role == "assistant":
            if msg.error and not msg.has_partial_output():
                continue

            content_parts = []
            for part in msg.parts:
                if part.type == "text":
                    content_parts.append(getattr(part, "text", ""))
                elif part.type == "reasoning":
                    content_parts.append(f"[Thinking: {getattr(part, 'text', '')}]")
                elif include_tool_outputs and part.type == "tool":
                    state = getattr(part, "state", None)
                    if state:
                        output = state.output or "[cleared]"
                        if output != "[cleared]":
                            tokens = count_tokens_fast(output)
                            if tokens > tool_output_max_tokens:
                                # Truncate by characters, assuming ~4 chars per token
                                output = output[: tool_output_max_tokens * 4] + "...(truncated)"
                        tool_name = getattr(state, "tool", None) or getattr(part, "tool", "unknown")
                        content_parts.append(f"[Tool: {tool_name}] {output}")

            if content_parts:
                model_messages.append(
                    {
                        "role": "assistant",
                        "content": "\n".join(content_parts),
                    }
                )

    return model_messages

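# Shape sketch (hypothetical parts): a user text part plus an assistant turn
# with reasoning and a tool call flatten to plain role/content dicts:
#
#     [
#         {"role": "user", "content": "Fix the failing test"},
#         {"role": "assistant",
#          "content": "[Thinking: the fixture is stale]\n[Tool: bash] pytest -x ..."},
#     ]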