Coverage for src/harnessutils/compaction/summarization.py: 96%

93 statements  

coverage.py v7.13.2, created at 2026-02-12 22:41 -0600

1"""Tier 3: LLM-powered conversation summarization. 

2 

3Uses LLM to semantically compress conversation when approaching limit. 

4Cost: Expensive (~$0.10-0.50), Latency: ~3-5s. 

5""" 

6 

7from dataclasses import dataclass 

8from typing import Any 

9 

10from harnessutils.models.message import Message 

11from harnessutils.models.usage import CacheUsage, Usage 

12from harnessutils.types import LLMClient 

13 

SUMMARIZATION_PROMPT = """You are a helpful AI assistant tasked with summarizing conversations.

When asked to summarize, provide a detailed but concise summary of the conversation.
Focus on information that would be helpful for continuing the conversation, including:
- What was done
- What is currently being worked on
- Which files are being modified
- What needs to be done next
- Key user requests, constraints, or preferences that should persist
- Important technical decisions and why they were made

Your summary should be comprehensive enough to provide context but concise enough
to be quickly understood."""


DIFFERENTIAL_SUMMARIZATION_PROMPT = """You are a helpful AI assistant tasked with \
updating conversation summaries.

You will be given:
1. A previous summary of the conversation
2. New messages/activity since that summary

Your task is to create an UPDATED summary that:
- Preserves important context from the previous summary
- Incorporates new information from recent messages
- Maintains continuity while staying concise
- Focuses on what's relevant for continuing the conversation

The updated summary should feel like a natural evolution of the previous one,
not a disconnected list of old + new information."""


@dataclass
class SummarizationResult:
    """Result of summarization operation."""

    summary_message: Message
    tokens_used: Usage
    cost: float
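# A SummarizationResult bundles the summary Message with the metrics of
# producing it; callers typically persist `summary_message` into the history
# and fold `tokens_used` / `cost` into session totals (illustrative usage,
# not a policy the dataclass enforces).
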

def _find_last_summary(messages: list[Message]) -> Message | None:
    """Find the last summary message in the conversation.

    Args:
        messages: List of messages to search

    Returns:
        Last summary message or None if no summary exists
    """
    for msg in reversed(messages):
        if msg.summary:
            return msg
    return None


def _get_messages_since_summary(
    messages: list[Message], summary_msg: Message
) -> list[Message]:
    """Get all messages after the given summary message.

    Args:
        messages: Full list of messages
        summary_msg: The summary message to find messages after

    Returns:
        List of messages after the summary
    """
    summary_idx = None
    for i, msg in enumerate(messages):
        if msg.id == summary_msg.id:
            summary_idx = i
            break

    if summary_idx is None:
        return messages

    return messages[summary_idx + 1 :]
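
# Taken together, these helpers drive differential mode: for a history
# [m1, m2, s, m4, m5] in which only `s` has summary=True, _find_last_summary
# returns `s` and _get_messages_since_summary returns [m4, m5]. If the summary
# message is not present in the list, the full history is returned as a safe
# fallback. (Illustrative walk-through; m1..m5 and `s` are hypothetical
# messages.)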


def _build_differential_prompt(
    last_summary: str, new_messages: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Build prompt for differential summarization.

    Args:
        last_summary: Content of the previous summary
        new_messages: New messages since the last summary

    Returns:
        List of messages formatted for differential summarization
    """
    prompt_messages = [
        {
            "role": "user",
            "content": f"""Previous summary:
{last_summary}

New activity since summary:""",
        }
    ]

    # Add new messages
    prompt_messages.extend(new_messages)

    # Add request for updated summary
    prompt_messages.append(
        {
            "role": "user",
            "content": (
                "Provide an UPDATED summary that incorporates both the "
                "previous context and new activity."
            ),
        }
    )

    return prompt_messages
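
# Shape of the assembled prompt, with hypothetical content:
#
#     _build_differential_prompt(
#         "User is refactoring auth.py; tests pass.",
#         [{"role": "user", "content": "Now add rate limiting."}],
#     )
#
# yields a user message carrying the previous summary, the new messages
# verbatim in the middle, and a final user message requesting the update.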


def is_overflow(usage: Usage, context_limit: int, output_limit: int) -> bool:
    """Check if conversation has overflowed context window.

    The context window must fit both input tokens and output tokens.
    We reserve space for output, so usable input space is smaller.

    Args:
        usage: Token usage from last turn
        context_limit: Maximum context tokens for model
        output_limit: Maximum output tokens for model

    Returns:
        True if input tokens exceed available space after reserving for output
    """
    total_input = usage.input + usage.cache.read
    usable_input_space = context_limit - output_limit

    return total_input > usable_input_space
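
# Worked example with illustrative limits: a 200,000-token context window
# reserving 8,192 output tokens leaves 191,808 usable input tokens, so
# 190,000 fresh input tokens plus 5,000 cache-read tokens (195,000 total)
# count as overflow:
#
#     usage = Usage(
#         input=190_000,
#         output=0,
#         reasoning=0,
#         cache=CacheUsage(read=5_000, write=0),
#     )
#     is_overflow(usage, context_limit=200_000, output_limit=8_192)  # True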


def summarize_conversation(
    messages: list[Message],
    llm_client: LLMClient,
    parent_message_id: str,
    message_id: str,
    model: str | None = None,
    auto_mode: bool = False,
    config: Any | None = None,
    force_full: bool = False,
) -> SummarizationResult:
    """Summarize conversation using LLM.

    Args:
        messages: Conversation messages to summarize
        llm_client: LLM client implementation (callback from app)
        parent_message_id: ID of message that triggered summarization
        message_id: ID for the summary message
        model: Optional model to use (cheaper/faster recommended)
        auto_mode: Whether this was auto-triggered
        config: Optional SummarizationConfig for differential mode
        force_full: Force full summarization even if differential is available

    Returns:
        SummarizationResult with summary message and metrics
    """
    # Check if differential summarization is possible
    use_differential = False
    last_summary = None
    new_messages_only = messages

    if config and not force_full:
        last_summary = _find_last_summary(messages)
        if last_summary:
            new_messages_only = _get_messages_since_summary(messages, last_summary)

            # Only use differential if we have new messages and haven't exceeded limit
            max_since_summary = getattr(config, "max_messages_since_summary", 30)
            if new_messages_only and len(new_messages_only) <= max_since_summary:
                use_differential = True

    # Choose model and prompt based on mode
    if use_differential and last_summary:
        # Differential mode: cheaper/faster model
        selected_model = model or getattr(config, "differential_model", None)
        system_prompt = DIFFERENTIAL_SUMMARIZATION_PROMPT

        # Get last summary content
        summary_content = ""
        for part in last_summary.parts:
            if part.type == "text":
                summary_content = getattr(part, "text", "")
                break

        # Convert only new messages
        new_model_messages = _convert_to_model_format(new_messages_only)

        # Build differential prompt
        model_messages = _build_differential_prompt(summary_content, new_model_messages)
    else:
        # Full mode: more capable model
        selected_model = model or getattr(config, "full_model", None)
        system_prompt = SUMMARIZATION_PROMPT

        # Convert all messages
        model_messages = _convert_to_model_format(messages)
        model_messages.append(
            {
                "role": "user",
                "content": "Provide a detailed summary for continuing our conversation.",
            }
        )

    response = llm_client.invoke(
        messages=model_messages,
        system=[system_prompt],
        model=selected_model,
    )

    usage_data = response.get("usage", {})
    cache_data = usage_data.get("cache", {})

    usage = Usage(
        input=usage_data.get("input", 0),
        output=usage_data.get("output", 0),
        reasoning=usage_data.get("reasoning", 0),
        cache=CacheUsage(
            read=cache_data.get("read", 0),
            write=cache_data.get("write", 0),
        ),
    )

    cost = response.get("cost", 0.0)

    from harnessutils.models.parts import TextPart

    summary_message = Message(
        id=message_id,
        role="assistant",
        parent_id=parent_message_id,
        summary=True,
        agent="summarization",
        model={"model": response.get("model", model or "unknown")},
        tokens=usage,
        cost=cost,
    )

    summary_message.add_part(TextPart(text=response.get("content", "")))

    return SummarizationResult(
        summary_message=summary_message,
        tokens_used=usage,
        cost=cost,
    )
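
# Sketch of a call site (hypothetical IDs, client, and config; any LLMClient
# implementation supplied by the host app works here):
#
#     result = summarize_conversation(
#         messages=session_messages,
#         llm_client=client,
#         parent_message_id="msg_0042",
#         message_id="msg_0043",
#         config=summarization_config,  # enables differential mode
#     )
#
# The returned result.summary_message has summary=True, so a later call will
# find it via _find_last_summary and take the cheaper differential path.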


def _convert_to_model_format(messages: list[Message]) -> list[dict[str, Any]]:
    """Convert internal messages to model format for summarization.

    Args:
        messages: Internal message objects

    Returns:
        List of messages in model format
    """
    model_messages: list[dict[str, Any]] = []

    for msg in messages:
        if len(msg.parts) == 0:
            continue

        if msg.role == "user":
            content_parts = []
            for part in msg.parts:
                if part.type == "text" and not getattr(part, "ignored", False):
                    content_parts.append(getattr(part, "text", ""))

            if content_parts:
                model_messages.append(
                    {
                        "role": "user",
                        "content": "\n".join(content_parts),
                    }
                )

        elif msg.role == "assistant":
            if msg.error and not msg.has_partial_output():
                continue

            content_parts = []
            for part in msg.parts:
                if part.type == "text":
                    content_parts.append(getattr(part, "text", ""))
                elif part.type == "reasoning":
                    content_parts.append(f"[Thinking: {getattr(part, 'text', '')}]")

            if content_parts:
                model_messages.append(
                    {
                        "role": "assistant",
                        "content": "\n".join(content_parts),
                    }
                )

    return model_messages
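
# Conversion rules in brief: user text parts are joined with newlines
# (skipping parts flagged `ignored`), assistant reasoning parts are wrapped
# as "[Thinking: ...]", messages with no parts are dropped, and errored
# assistant messages are kept only if they produced partial output.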