Skip to content

Commit 05d2814

Browse files
committed
fix(cli): preserve spaces and newlines in treesitter chunking.
1 parent 7bfcedc commit 05d2814

File tree

2 files changed

+19
-7
lines changed

2 files changed

+19
-7
lines changed

src/vectorcode/chunking.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -150,7 +150,7 @@ def __chunk_node(
150150
f"Traversing at node {node.text.decode()} at position {node.byte_range}"
151151
)
152152
current_chunk: str = ""
153-
153+
prev_node = None
154154
current_start = None
155155

156156
for child in node.children:
@@ -184,10 +184,19 @@ def __chunk_node(
184184
current_start = Point(
185185
row=child.start_point.row + 1, column=child.start_point.column
186186
)
187+
prev_node = child
187188

188-
elif len(current_chunk) + child_length <= self.config.chunk_size:
189+
elif len(current_chunk) + child_length + 1 <= self.config.chunk_size:
189190
# Add to current chunk
191+
if prev_node:
192+
if prev_node.end_point.row != child.start_point.row:
193+
current_chunk += "\n"
194+
else:
195+
current_chunk += " " * (
196+
child.start_point.column - prev_node.end_point.column
197+
)
190198
current_chunk += child_bytes.decode()
199+
prev_node = child
191200

192201
else:
193202
# Yield current chunk and start new one

tests/test_chunking.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -250,7 +250,7 @@ def bar():
250250

251251

252252
def test_treesitter_chunker_filter_wildcard():
253-
chunker = TreeSitterChunker(Config(chunk_size=30, chunk_filters={"*": [".*foo.*"]}))
253+
chunker = TreeSitterChunker(Config(chunk_size=35, chunk_filters={"*": [".*foo.*"]}))
254254

255255
test_content = r"""
256256
def foo():
@@ -283,12 +283,12 @@ def bar():
283283
test_file = tmp_file.name
284284

285285
chunks = list(str(i) for i in chunker.chunk(test_file))
286-
assert chunks == ['functionbar()return "bar"end']
286+
assert chunks == ['function bar()\n return "bar"\nend']
287287
os.remove(test_file)
288288

289289

290290
def test_treesitter_chunker_lua():
291-
chunker = TreeSitterChunker(Config(chunk_size=30))
291+
chunker = TreeSitterChunker(Config(chunk_size=35))
292292
test_content = r"""
293293
function foo()
294294
return "foo"
@@ -304,7 +304,10 @@ def test_treesitter_chunker_lua():
304304
test_file = tmp_file.name
305305

306306
chunks = list(str(i) for i in chunker.chunk(test_file))
307-
assert chunks == ['functionfoo()return "foo"end', 'functionbar()return "bar"end']
307+
assert chunks == [
308+
'function foo()\n return "foo"\nend',
309+
'function bar()\n return "bar"\nend',
310+
]
308311

309312
os.remove(test_file)
310313

@@ -403,7 +406,7 @@ def bar():
403406
assert len(chunks) >= 2 # Should have at least 2 chunks
404407

405408
# First chunk should contain the function definition start
406-
assert "deffoo():" in chunks[0].text
409+
assert "def foo():" in chunks[0].text
407410
assert chunks[0].start == Point(1, 0)
408411

409412
# Last chunk should contain the final return statement

0 commit comments

Comments
 (0)