Skip to content

Commit 07bb4c5

Browse files
committed
feat(document): add structured document cache serialization #463
Implement structured JSON serialization for document cache, storing extracted text, TOC, chunks, and metadata. Enables full parser state restoration from cache, improving performance and DocQL support.
1 parent 7f5fe6f commit 07bb4c5

File tree

5 files changed

+634
-1
lines changed

5 files changed

+634
-1
lines changed

mpp-core/build.gradle.kts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,9 @@ kotlin {
182182
// Apache Tika for document parsing (PDF, DOC, DOCX, PPT, etc.)
183183
implementation("org.apache.tika:tika-core:3.2.3")
184184
implementation("org.apache.tika:tika-parsers-standard-package:3.2.3")
185+
186+
// Jsoup for HTML document parsing
187+
implementation("org.jsoup:jsoup:1.21.2")
185188
}
186189
}
187190

mpp-core/src/commonMain/kotlin/cc/unitmesh/devins/document/DocumentModels.kt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ enum class DocumentFormatType {
139139
MARKDOWN,
140140
PDF,
141141
DOCX,
142+
HTML,
142143
PLAIN_TEXT
143144
}
144145

mpp-core/src/jvmMain/kotlin/cc/unitmesh/devins/document/DocumentRegistry.jvm.kt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ actual fun platformInitialize() {
2323
logger.debug { "Registered TikaDocumentParser for $format" }
2424
}
2525

26-
logger.info { "JVM parsers initialized: ${tikaFormats.size} formats supported via Tika" }
26+
// Register Jsoup parser for HTML
27+
DocumentParserFactory.registerParser(DocumentFormatType.HTML) { JsoupDocumentParser() }
28+
logger.debug { "Registered JsoupDocumentParser for HTML" }
29+
30+
logger.info { "JVM parsers initialized: ${tikaFormats.size + 1} formats supported (Tika + Jsoup)" }
2731
}
2832

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
package cc.unitmesh.devins.document
2+
3+
import io.github.oshai.kotlinlogging.KotlinLogging
4+
import org.jsoup.Jsoup
5+
import org.jsoup.nodes.Element
6+
import org.jsoup.select.Elements
7+
8+
// File-private logger used by JsoupDocumentParser for parse progress and failure reporting.
private val logger = KotlinLogging.logger {}
9+
10+
/**
11+
* Jsoup-based HTML document parser for JVM platform
12+
*
13+
* Parses HTML documents and extracts:
14+
* - Plain text content from the body
15+
* - Table of Contents from heading elements (h1-h6)
16+
* - Metadata from meta tags and title
17+
* - Document chunks based on sections
18+
*
19+
* This parser provides structured access to HTML content with position tracking.
20+
*/
21+
class JsoupDocumentParser : DocumentParserService {
22+
private var currentContent: String? = null
23+
private var currentChunks: List<DocumentChunk> = emptyList()
24+
private var currentDocument: org.jsoup.nodes.Document? = null
25+
26+
override fun getDocumentContent(): String? = currentContent
27+
28+
override suspend fun parse(file: DocumentFile, content: String): DocumentTreeNode {
29+
logger.info { "=== Starting Jsoup HTML Parse ===" }
30+
logger.info { "File: ${file.path}, Size: ${content.length} bytes" }
31+
32+
try {
33+
// Parse HTML document
34+
val doc = Jsoup.parse(content)
35+
currentDocument = doc
36+
37+
// Extract text content from body (excluding head)
38+
val extractedText = doc.body().text()
39+
currentContent = extractedText
40+
41+
logger.info { "Extracted ${extractedText.length} characters from HTML body" }
42+
43+
// Extract metadata
44+
val title = doc.title()
45+
logger.debug { "Document title: $title" }
46+
47+
// Build TOC from heading elements
48+
val toc = extractTOCFromHeadings(doc)
49+
logger.info { "Extracted ${toc.size} TOC items from headings" }
50+
51+
// Build document chunks based on sections
52+
currentChunks = buildChunksFromSections(doc, file.path)
53+
logger.info { "Created ${currentChunks.size} document chunks" }
54+
55+
logger.info { "=== HTML Parse Complete ===" }
56+
57+
return file.copy(
58+
toc = toc,
59+
metadata = file.metadata.copy(
60+
parseStatus = ParseStatus.PARSED,
61+
chapterCount = toc.size,
62+
mimeType = "text/html"
63+
)
64+
)
65+
} catch (e: Exception) {
66+
logger.error(e) { "Failed to parse HTML document: ${e.message}" }
67+
return file.copy(
68+
metadata = file.metadata.copy(
69+
parseStatus = ParseStatus.PARSE_FAILED
70+
)
71+
)
72+
}
73+
}
74+
75+
override suspend fun queryHeading(keyword: String): List<DocumentChunk> {
76+
return currentChunks.filter {
77+
it.chapterTitle?.contains(keyword, ignoreCase = true) == true ||
78+
it.content.contains(keyword, ignoreCase = true)
79+
}.sortedByDescending {
80+
// Relevance scoring: exact title match > title contains > content contains
81+
when {
82+
it.chapterTitle?.equals(keyword, ignoreCase = true) == true -> 10
83+
it.chapterTitle?.contains(keyword, ignoreCase = true) == true -> 5
84+
else -> 1
85+
}
86+
}
87+
}
88+
89+
override suspend fun queryChapter(chapterId: String): DocumentChunk? {
90+
return currentChunks.find {
91+
it.anchor == chapterId || it.anchor == "#$chapterId"
92+
}
93+
}
94+
95+
/**
96+
* Extract Table of Contents from HTML heading elements (h1-h6)
97+
*/
98+
private fun extractTOCFromHeadings(doc: org.jsoup.nodes.Document): List<TOCItem> {
99+
val toc = mutableListOf<TOCItem>()
100+
101+
// Select all heading elements
102+
val headings = doc.select("h1, h2, h3, h4, h5, h6")
103+
104+
headings.forEachIndexed { index, heading ->
105+
val level = heading.tagName().substring(1).toIntOrNull() ?: 1
106+
val title = heading.text()
107+
108+
// Generate anchor from id attribute or from title
109+
val anchor = if (heading.hasAttr("id")) {
110+
"#${heading.attr("id")}"
111+
} else {
112+
"#${title.lowercase().replace(Regex("[^a-z0-9]+"), "-")}"
113+
}
114+
115+
toc.add(TOCItem(
116+
level = level,
117+
title = title,
118+
anchor = anchor,
119+
lineNumber = index
120+
))
121+
}
122+
123+
return toc
124+
}
125+
126+
/**
127+
* Build document chunks based on sections defined by headings
128+
* Each chunk represents content between consecutive headings
129+
*/
130+
private fun buildChunksFromSections(
131+
doc: org.jsoup.nodes.Document,
132+
documentPath: String
133+
): List<DocumentChunk> {
134+
val chunks = mutableListOf<DocumentChunk>()
135+
136+
// Get all heading elements
137+
val headings = doc.select("h1, h2, h3, h4, h5, h6")
138+
139+
if (headings.isEmpty()) {
140+
// No headings found, create a single chunk with all body content
141+
val bodyText = doc.body().text()
142+
if (bodyText.isNotBlank()) {
143+
chunks.add(DocumentChunk(
144+
documentPath = documentPath,
145+
chapterTitle = null,
146+
content = bodyText,
147+
anchor = "#content",
148+
position = PositionMetadata(
149+
documentPath = documentPath,
150+
formatType = DocumentFormatType.HTML,
151+
position = DocumentPosition.LineRange(
152+
startLine = 0,
153+
endLine = 0
154+
)
155+
)
156+
))
157+
}
158+
return chunks
159+
}
160+
161+
// Process each heading and extract content until next heading
162+
headings.forEachIndexed { index, heading ->
163+
val title = heading.text()
164+
val anchor = if (heading.hasAttr("id")) {
165+
"#${heading.attr("id")}"
166+
} else {
167+
"#${title.lowercase().replace(Regex("[^a-z0-9]+"), "-")}"
168+
}
169+
170+
// Collect content between this heading and the next one
171+
val content = StringBuilder()
172+
var currentElement: Element? = heading.nextElementSibling()
173+
174+
while (currentElement != null) {
175+
// Stop if we encounter another heading
176+
if (currentElement.tagName().matches(Regex("h[1-6]"))) {
177+
break
178+
}
179+
180+
// Add element text to content
181+
val elementText = currentElement.text()
182+
if (elementText.isNotBlank()) {
183+
if (content.isNotEmpty()) {
184+
content.append("\n")
185+
}
186+
content.append(elementText)
187+
}
188+
189+
currentElement = currentElement.nextElementSibling()
190+
}
191+
192+
// Always create chunk for heading, even if there's no content
193+
// This ensures headings without direct content can still be found
194+
val chunkContent = content.toString().trim()
195+
chunks.add(DocumentChunk(
196+
documentPath = documentPath,
197+
chapterTitle = title,
198+
content = chunkContent,
199+
anchor = anchor,
200+
startLine = index,
201+
endLine = index,
202+
position = PositionMetadata(
203+
documentPath = documentPath,
204+
formatType = DocumentFormatType.HTML,
205+
position = DocumentPosition.LineRange(
206+
startLine = index,
207+
endLine = index
208+
)
209+
)
210+
))
211+
}
212+
213+
return chunks
214+
}
215+
}

0 commit comments

Comments
 (0)