Coverage for src/harnessutils/compaction/summarization.py: 97% (106 statements; coverage.py v7.13.2, created at 2026-02-18 09:07 -0600)

1"""Tier 3: LLM-powered conversation summarization. 

2 

3Uses LLM to semantically compress conversation when approaching limit. 

4Cost: Expensive (~$0.10-0.50), Latency: ~3-5s. 

5""" 

6 

7from dataclasses import dataclass 

8from typing import Any 

9 

10from harnessutils.models.message import Message 

11from harnessutils.models.usage import CacheUsage, Usage 

12from harnessutils.types import LLMClient 

13 

14SUMMARIZATION_PROMPT = """You are a helpful AI assistant tasked with summarizing conversations. 

15 

16When asked to summarize, provide a detailed but concise summary of the conversation. 

17Focus on information that would be helpful for continuing the conversation, including: 

18- What was done 

19- What is currently being worked on 

20- Which files are being modified 

21- What needs to be done next 

22- Key user requests, constraints, or preferences that should persist 

23- Important technical decisions and why they were made 

24 

25Your summary should be comprehensive enough to provide context but concise enough 

26to be quickly understood.""" 

27 

28DIFFERENTIAL_SUMMARIZATION_PROMPT = """You are a helpful AI assistant tasked with \ 

29updating conversation summaries. 

30 

31You will be given: 

321. A previous summary of the conversation 

332. New messages/activity since that summary 

34 

35Your task is to create an UPDATED summary that: 

36- Preserves important context from the previous summary 

37- Incorporates new information from recent messages 

38- Maintains continuity while staying concise 

39- Focuses on what's relevant for continuing the conversation 

40 

41The updated summary should feel like a natural evolution of the previous one, 

42not a disconnected list of old + new information.""" 

43 

44 

45@dataclass 

46class SummarizationResult: 

47 """Result of summarization operation.""" 

48 

49 summary_message: Message 

50 tokens_used: Usage 

51 cost: float 

52 

53 

54def _find_last_summary(messages: list[Message]) -> Message | None: 

55 """Find the last summary message in the conversation. 

56 

57 Args: 

58 messages: List of messages to search 

59 

60 Returns: 

61 Last summary message or None if no summary exists 

62 """ 

63 for msg in reversed(messages): 

64 if msg.summary: 

65 return msg 

66 return None 

67 

68 

69def _get_messages_since_summary( 

70 messages: list[Message], summary_msg: Message 

71) -> list[Message]: 

72 """Get all messages after the given summary message. 

73 

74 Args: 

75 messages: Full list of messages 

76 summary_msg: The summary message to find messages after 

77 

78 Returns: 

79 List of messages after the summary 

80 """ 

81 summary_idx = None 

82 for i, msg in enumerate(messages): 

83 if msg.id == summary_msg.id: 

84 summary_idx = i 

85 break 

86 

87 if summary_idx is None: 

88 return messages 

89 

90 return messages[summary_idx + 1 :] 
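

# A minimal usage sketch for the two helpers above (illustrative, not part of
# the original module). It assumes Message can be constructed from just ``id``
# and ``role`` (plus ``summary``), with ``summary`` defaulting to falsy on
# ordinary messages.
def _example_summary_helpers() -> None:
    history = [
        Message(id="m1", role="user"),
        Message(id="m2", role="assistant", summary=True),
        Message(id="m3", role="user"),
    ]
    last = _find_last_summary(history)
    assert last is history[1]
    # Everything after the summary message counts as new activity.
    assert _get_messages_since_summary(history, last) == [history[2]]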



def _build_differential_prompt(
    last_summary: str, new_messages: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Build the prompt for differential summarization.

    Args:
        last_summary: Content of the previous summary
        new_messages: New messages since the last summary

    Returns:
        List of messages formatted for differential summarization
    """
    prompt_messages = [
        {
            "role": "user",
            "content": f"""Previous summary:
{last_summary}

New activity since summary:""",
        }
    ]

    # Add new messages
    prompt_messages.extend(new_messages)

    # Add request for updated summary
    prompt_messages.append(
        {
            "role": "user",
            "content": (
                "Provide an UPDATED summary that incorporates both the "
                "previous context and new activity."
            ),
        }
    )

    return prompt_messages
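

# Illustrative sketch of the structure _build_differential_prompt returns (not
# part of the original module): the previous summary is framed in an opening
# user message, the new messages follow verbatim, and a closing user message
# requests the updated summary.
def _example_differential_prompt() -> None:
    prompt = _build_differential_prompt(
        "Built the parser.",
        [{"role": "assistant", "content": "Added error recovery."}],
    )
    assert prompt[0]["content"].startswith("Previous summary:")
    assert prompt[1] == {"role": "assistant", "content": "Added error recovery."}
    assert prompt[-1]["content"].startswith("Provide an UPDATED summary")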



def is_overflow(usage: Usage, context_limit: int, output_limit: int) -> bool:
    """Check whether the conversation has overflowed the context window.

    The context window must fit both input tokens and output tokens. Because
    space is reserved for output, the usable input space is smaller than the
    full context limit.

    Args:
        usage: Token usage from the last turn
        context_limit: Maximum context tokens for the model
        output_limit: Maximum output tokens for the model

    Returns:
        True if input tokens exceed the space left after reserving for output
    """
    total_input = usage.input + usage.cache.read
    usable_input_space = context_limit - output_limit

    return total_input > usable_input_space
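

# Worked example (illustrative, not part of the original module): with a
# 200_000-token context window and 16_000 tokens reserved for output, the
# usable input space is 184_000. 180_000 fresh input tokens plus 8_000
# cached-read tokens total 188_000 > 184_000, so the turn overflows. The
# Usage/CacheUsage fields mirror the constructors used in
# summarize_conversation below.
def _example_is_overflow() -> None:
    usage = Usage(
        input=180_000,
        output=0,
        reasoning=0,
        cache=CacheUsage(read=8_000, write=0),
    )
    assert is_overflow(usage, context_limit=200_000, output_limit=16_000)
    # Reserving less output space frees 8_000 tokens: 188_000 <= 192_000.
    assert not is_overflow(usage, context_limit=200_000, output_limit=8_000)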



def summarize_conversation(
    messages: list[Message],
    llm_client: LLMClient,
    parent_message_id: str,
    message_id: str,
    model: str | None = None,
    auto_mode: bool = False,
    config: Any | None = None,
    force_full: bool = False,
) -> SummarizationResult:
    """Summarize the conversation using an LLM.

    Args:
        messages: Conversation messages to summarize
        llm_client: LLM client implementation (callback from the app)
        parent_message_id: ID of the message that triggered summarization
        message_id: ID for the summary message
        model: Optional model to use (a cheaper/faster one is recommended)
        auto_mode: Whether this was auto-triggered
        config: Optional SummarizationConfig for differential mode
        force_full: Force full summarization even if differential is available

    Returns:
        SummarizationResult with the summary message and metrics
    """
    # Check whether differential summarization is possible
    use_differential = False
    last_summary = None
    new_messages_only = messages

    include_tool_outputs = getattr(config, "include_tool_outputs", True)
    tool_output_max_tokens = getattr(config, "tool_output_max_tokens", 300)

    if config and not force_full:
        last_summary = _find_last_summary(messages)
        if last_summary:
            new_messages_only = _get_messages_since_summary(messages, last_summary)

            # Only use differential mode if there are new messages and their
            # count has not exceeded the configured limit
            max_since_summary = getattr(config, "max_messages_since_summary", 30)
            if new_messages_only and len(new_messages_only) <= max_since_summary:
                use_differential = True

    # Choose model and prompt based on mode
    if use_differential and last_summary:
        # Differential mode: cheaper/faster model
        selected_model = model or getattr(config, "differential_model", None)
        system_prompt = (
            getattr(config, "differential_prompt", None) or DIFFERENTIAL_SUMMARIZATION_PROMPT
        )

        # Get the content of the last summary
        summary_content = ""
        for part in last_summary.parts:
            if part.type == "text":
                summary_content = getattr(part, "text", "")
                break

        # Convert only the new messages
        new_model_messages = _convert_to_model_format(
            new_messages_only,
            include_tool_outputs=include_tool_outputs,
            tool_output_max_tokens=tool_output_max_tokens,
        )

        # Build the differential prompt
        model_messages = _build_differential_prompt(summary_content, new_model_messages)
    else:
        # Full mode: more capable model
        selected_model = model or getattr(config, "full_model", None)
        system_prompt = getattr(config, "summarization_prompt", None) or SUMMARIZATION_PROMPT

        # Convert all messages
        model_messages = _convert_to_model_format(
            messages,
            include_tool_outputs=include_tool_outputs,
            tool_output_max_tokens=tool_output_max_tokens,
        )
        model_messages.append(
            {
                "role": "user",
                "content": "Provide a detailed summary for continuing our conversation.",
            }
        )

    response = llm_client.invoke(
        messages=model_messages,
        system=[system_prompt],
        model=selected_model,
    )

    usage_data = response.get("usage", {})
    cache_data = usage_data.get("cache", {})

    usage = Usage(
        input=usage_data.get("input", 0),
        output=usage_data.get("output", 0),
        reasoning=usage_data.get("reasoning", 0),
        cache=CacheUsage(
            read=cache_data.get("read", 0),
            write=cache_data.get("write", 0),
        ),
    )

    cost = response.get("cost", 0.0)

    from harnessutils.models.parts import TextPart

    summary_message = Message(
        id=message_id,
        role="assistant",
        parent_id=parent_message_id,
        summary=True,
        agent="summarization",
        model={"model": response.get("model", model or "unknown")},
        tokens=usage,
        cost=cost,
    )

    summary_message.add_part(TextPart(text=response.get("content", "")))

    return SummarizationResult(
        summary_message=summary_message,
        tokens_used=usage,
        cost=cost,
    )
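

# Usage sketch (illustrative, not part of the original module). Any object
# whose ``invoke`` accepts ``messages``, ``system``, and ``model`` keywords
# and returns a dict shaped like the ``response.get`` calls above can stand in
# for the LLMClient; the stub below fabricates a fixed response.
class _StubLLMClient:
    def invoke(self, messages, system, model):
        return {
            "content": "Summary of the conversation so far.",
            "model": model or "stub-model",
            "cost": 0.0,
            "usage": {
                "input": 120,
                "output": 40,
                "reasoning": 0,
                "cache": {"read": 0, "write": 0},
            },
        }


def _example_summarize_conversation() -> None:
    # With no config and no prior summary, full-mode summarization runs.
    result = summarize_conversation(
        messages=[],
        llm_client=_StubLLMClient(),
        parent_message_id="m-last",
        message_id="m-summary",
    )
    assert result.summary_message.summary is True
    assert result.tokens_used.output == 40
    assert result.cost == 0.0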



def _convert_to_model_format(
    messages: list[Message],
    include_tool_outputs: bool = True,
    tool_output_max_tokens: int = 300,
) -> list[dict[str, Any]]:
    """Convert internal messages to model format for summarization.

    Args:
        messages: Internal message objects
        include_tool_outputs: Whether to include tool outputs in the context
        tool_output_max_tokens: Maximum tokens per tool output

    Returns:
        List of messages in model format
    """
    from harnessutils.tokens.exact import count_tokens_fast

    model_messages: list[dict[str, Any]] = []

    for msg in messages:
        if len(msg.parts) == 0:
            continue

        if msg.role == "user":
            content_parts = []
            for part in msg.parts:
                if part.type == "text" and not getattr(part, "ignored", False):
                    content_parts.append(getattr(part, "text", ""))

            if content_parts:
                model_messages.append(
                    {
                        "role": "user",
                        "content": "\n".join(content_parts),
                    }
                )

        elif msg.role == "assistant":
            if msg.error and not msg.has_partial_output():
                continue

            content_parts = []
            for part in msg.parts:
                if part.type == "text":
                    content_parts.append(getattr(part, "text", ""))
                elif part.type == "reasoning":
                    content_parts.append(f"[Thinking: {getattr(part, 'text', '')}]")
                elif include_tool_outputs and part.type == "tool":
                    state = getattr(part, "state", None)
                    if state:
                        # Skip outputs that are empty or already marked "[cleared]"
                        output = state.output or "[cleared]"
                        if output != "[cleared]":
                            tokens = count_tokens_fast(output)
                            if tokens > tool_output_max_tokens:
                                # Truncate by characters, assuming ~4 characters per token
                                output = output[: tool_output_max_tokens * 4] + "...(truncated)"
                            tool_name = getattr(state, "tool", None) or getattr(part, "tool", "unknown")
                            content_parts.append(f"[Tool: {tool_name}] {output}")

            if content_parts:
                model_messages.append(
                    {
                        "role": "assistant",
                        "content": "\n".join(content_parts),
                    }
                )

    return model_messages
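

# Illustrative sketch of the flattening (not part of the original module). It
# assumes TextPart reports ``type == "text"`` and that Message and add_part
# behave as used in summarize_conversation above.
def _example_convert_to_model_format() -> None:
    from harnessutils.models.parts import TextPart

    msg = Message(id="u1", role="user")
    msg.add_part(TextPart(text="Refactor the tokenizer."))
    converted = _convert_to_model_format([msg])
    assert converted == [{"role": "user", "content": "Refactor the tokenizer."}]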