Skip to content

Commit 4ab3c5e

Browse files
committed
feat(document): unify binary file parsing and add parseBytes #463
Refactor document parsing to always use parseBytes for binary files, removing legacy string fallback. Add default parseBytes implementation and platform-specific readFileAsBytes support.
1 parent b43a371 commit 4ab3c5e

File tree

8 files changed

+98
-27
lines changed

8 files changed

+98
-27
lines changed

mpp-core/src/commonMain/kotlin/cc/unitmesh/agent/document/DocumentAgent.kt

Lines changed: 54 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -202,9 +202,9 @@ class DocumentAgent(
202202
203203
## Response Workflow
204204
205-
1. Explain what you plan to query.
206-
2. Make **exactly one** DocQL call.
207-
3. After tool results, answer normally (no more tool use).
205+
1. **Plan**: Analyze the query and identify target documents from filename patterns
206+
2. **Query**: Make **exactly one** DocQL call with appropriate query and documentPath
207+
3. **Respond**: After tool results, synthesize answer naturally (no more tool use)
208208
209209
---
210210
@@ -257,11 +257,57 @@ class DocumentAgent(
257257
258258
## Best Practices
259259
260-
* Prefer `documentPath` when clear.
261-
* retry with broader DocQL queries before giving up.
262-
* Use TOC early.
263-
* Use `heading()` for sections; `chunks()` for full context.
264-
* Always retrieve; never speculate.
260+
✅ **DO:**
261+
- Always specify `documentPath` when filename clearly matches query keywords
262+
- Start with `heading()` for targeted searches, fall back to `chunks()` if empty
263+
- Expand keywords BEFORE first query: synonyms, morphology, translations
264+
- Try 2-3 different queries before concluding "no information found"
265+
- Query multiple related documents for cross-cutting topics
266+
267+
❌ **DON'T:**
268+
- Never guess or speculate without actually querying first
269+
- Don't use filesystem tools on registered documents
270+
- Don't give up after one failed query - retry with broader terms
271+
- Don't use `chunks()` as your first choice unless query is very broad
272+
273+
---
274+
275+
## Successful Query Examples
276+
277+
**Query: "What colors are used in the design system?"**
278+
```json
279+
{
280+
"query": "$.content.heading(\"color\")",
281+
"documentPath": "design-system-color.md"
282+
}
283+
```
284+
✅ Direct filename match + targeted heading search
285+
286+
**Query: "How do I use custom icons?"**
287+
First attempt:
288+
```json
289+
{
290+
"query": "$.content.heading(\"custom icons\")",
291+
"documentPath": "custom-icons-usage.md"
292+
}
293+
```
294+
If empty, retry with:
295+
```json
296+
{
297+
"query": "$.content.chunks(\"icons\")",
298+
"documentPath": "custom-icons-usage.md"
299+
}
300+
```
301+
✅ Good retry strategy: specific → broader
302+
303+
**Query: "Tell me about architecture" (ambiguous)**
304+
```json
305+
{
306+
"query": "$.toc[*]"
307+
}
308+
```
309+
Then identify relevant doc(s) from TOC and query specifically
310+
✅ Use TOC when multiple architecture-related files exist
265311
266312
""".trimIndent()
267313
}

mpp-core/src/commonMain/kotlin/cc/unitmesh/devins/document/DocumentParserService.kt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,16 @@ interface DocumentParserService {
1010
*/
1111
suspend fun parse(file: DocumentFile, content: String): DocumentTreeNode
1212

13+
/**
14+
* 解析二进制文档文件(用于 PDF, DOCX 等格式)
15+
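* Default implementation: decodes the bytes as UTF-8 text and delegates to parse. Malformed byte sequences are replaced during decoding, so this is only suitable for text-like content; parsers for true binary formats (PDF, DOCX) should override this method.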
16+
*/
17+
suspend fun parseBytes(file: DocumentFile, bytes: ByteArray): DocumentTreeNode {
18+
// Default: decode as UTF-8 and call parse
19+
val content = bytes.decodeToString()
20+
return parse(file, content)
21+
}
22+
1323
/**
1424
* HeadingQL: 根据关键字或标题文本查找最匹配的标题节点及其内容
1525
*/

mpp-core/src/iosMain/kotlin/cc/unitmesh/devins/filesystem/ProjectFileSystem.ios.kt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package cc.unitmesh.devins.filesystem
22

33
import kotlinx.cinterop.ExperimentalForeignApi
4+
import kotlinx.cinterop.get
5+
import kotlinx.cinterop.reinterpret
46
import kotlinx.cinterop.toKString
57
import platform.Foundation.*
68
import platform.posix.getcwd
@@ -29,6 +31,20 @@ actual class DefaultFileSystem actual constructor(
2931
}
3032
}
3133

34+
actual override fun readFileAsBytes(path: String): ByteArray? {
35+
val fullPath = resolvePath(path)
36+
return try {
37+
val data = NSData.dataWithContentsOfFile(fullPath) ?: return null
38+
val length = data.length.toInt()
39+
val bytes = data.bytes?.reinterpret<kotlinx.cinterop.ByteVar>() ?: return null
40+
ByteArray(length) { i ->
41+
bytes[i]
42+
}
43+
} catch (e: Exception) {
44+
null
45+
}
46+
}
47+
3248
actual override fun writeFile(path: String, content: String): Boolean {
3349
val fullPath = resolvePath(path)
3450
return try {

mpp-core/src/jsMain/kotlin/cc/unitmesh/devins/filesystem/DefaultFileSystem.js.kt

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
package cc.unitmesh.devins.filesystem
22

3+
import org.khronos.webgl.ArrayBuffer
4+
import org.khronos.webgl.Uint8Array
5+
36
/**
47
* JavaScript 平台的文件系统实现
58
* 基于 Node.js fs 模块的高性能实现
@@ -55,8 +58,12 @@ actual class DefaultFileSystem actual constructor(private val projectPath: Strin
5558
if (exists(resolvedPath) && !isDirectory(resolvedPath)) {
5659
val buffer = fs.readFileSync(resolvedPath)
5760
// Convert Node.js Buffer to ByteArray
58-
val array = Uint8Array(buffer.buffer as ArrayBuffer, buffer.byteOffset as Int, buffer.length as Int)
59-
ByteArray(array.length) { array[it] }
61+
// Node.js Buffer is already array-like, access elements directly
62+
val length = buffer.length.unsafeCast<Int>()
63+
ByteArray(length) { i ->
64+
val value: dynamic = buffer[i]
65+
value.unsafeCast<Byte>()
66+
}
6067
} else {
6168
null
6269
}

mpp-core/src/jvmMain/kotlin/cc/unitmesh/devins/document/TikaDocumentParser.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class TikaDocumentParser : DocumentParserService {
4848
* Parse document from ByteArray (preferred for binary files)
4949
* This method properly handles binary data without corruption
5050
*/
51-
suspend fun parseBytes(file: DocumentFile, bytes: ByteArray): DocumentTreeNode {
51+
override suspend fun parseBytes(file: DocumentFile, bytes: ByteArray): DocumentTreeNode {
5252
logger.info { "=== Starting Tika Parse (from Bytes) ===" }
5353
logger.info { "File: ${file.path}, Size: ${bytes.size} bytes" }
5454

mpp-ui/src/commonMain/kotlin/cc/unitmesh/devins/service/DocumentIndexService.kt

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -167,14 +167,8 @@ class DocumentIndexService(
167167

168168
// Parse the document based on format type
169169
val parsedDoc = if (isBinary && bytes != null) {
170-
// Use TikaDocumentParser's parseBytes for binary files
171-
if (parser is cc.unitmesh.devins.document.TikaDocumentParser) {
172-
parser.parseBytes(docFile, bytes)
173-
} else {
174-
// Fallback: convert bytes to ISO_8859_1 string
175-
val fallbackContent = bytes.toString(Charsets.ISO_8859_1)
176-
parser.parse(docFile, fallbackContent)
177-
}
170+
// Use parseBytes for binary files
171+
parser.parseBytes(docFile, bytes)
178172
} else if (content != null) {
179173
// Text format
180174
parser.parse(docFile, content)

mpp-ui/src/commonMain/kotlin/cc/unitmesh/devins/ui/compose/document/DocumentReaderViewModel.kt

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -310,14 +310,8 @@ class DocumentReaderViewModel(private val workspace: Workspace) {
310310

311311
documentContent = null // Binary files don't have text content to display
312312

313-
// Use TikaDocumentParser's parseBytes method for binary files
314-
if (parser is cc.unitmesh.devins.document.TikaDocumentParser) {
315-
parser.parseBytes(doc, bytes)
316-
} else {
317-
// Fallback: convert bytes to ISO_8859_1 string (legacy behavior)
318-
val content = bytes.toString(Charsets.ISO_8859_1)
319-
parser.parse(doc, content)
320-
}
313+
// Use parseBytes method for binary files
314+
parser.parseBytes(doc, bytes)
321315
} else {
322316
// Text formats (Markdown, TXT) - read as string
323317
val content = fileSystem.readFile(doc.path)

mpp-ui/src/commonTest/kotlin/cc/unitmesh/devins/service/DocumentIndexServiceTest.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ class DocumentIndexServiceTest {
2121

2222
override fun readFile(path: String): String? = files[path]
2323

24+
override fun readFileAsBytes(path: String): ByteArray? {
25+
return files[path]?.encodeToByteArray()
26+
}
27+
2428
override fun writeFile(path: String, content: String): Boolean {
2529
files[path] = content
2630
return true

0 commit comments

Comments
 (0)