Skip to content

Commit b3b8077

Browse files
committed
Fix non-ASCII characters causing offset
1 parent 8b5b049 commit b3b8077

File tree

1 file changed

+7
-5
lines changed

1 file changed

+7
-5
lines changed

bigcode_eval/tasks/shadereval.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -175,13 +175,14 @@ def remove_last_block(self, code):
175175
### Find the first occasion where a chain of { } is closed??
176176
open_brackets = 1
177177
cut = False
178-
for i, c in enumerate(code):
178+
for i, c in enumerate(code.encode("utf-8")):
179+
c = chr(c)
179180
if c == '{':
180181
open_brackets += 1
181182
elif c == '}':
182183
open_brackets -= 1
183184
if open_brackets == 0:
184-
code = code[:i+1]
185+
code = code.encode("utf-8")[:i+1].decode("utf-8", "ignore")
185186
cut = True
186187
break
187188
if not cut:
@@ -209,9 +210,10 @@ def postprocess_generation(self, generation, idx):
209210
model_ctx = ref["model_ctx"]
210211
full_code = ref["full_code"]
211212
start, end = ref["func_range"]
212-
gen = self.remove_last_block(generation[len(model_ctx):]) #remove last block to avoid syntax errors
213-
214-
return full_code[:start] + model_ctx + gen + full_code[end:] #does this patch it together correctly?
213+
gen = self.remove_last_block(generation.encode("utf-8")[len(model_ctx.encode("utf-8")):].decode("utf-8")) #remove last block to avoid syntax errors
214+
before_gen = full_code.encode("utf-8")[:start].decode("utf-8")
215+
after_gen = full_code.encode("utf-8")[end:].decode("utf-8")
216+
return before_gen + model_ctx + gen + after_gen #does this patch it together correctly?
215217

216218
def process_results(self, generations, references):
217219
# TODO: define how the evaluation score is computed from list of \

0 commit comments

Comments
 (0)