@@ -3169,10 +3169,19 @@ PyObject *JM_fitz_config()
31693169//----------------------------------------------------------------------------
31703170PyObject *JM_BinFromBuffer(fz_context *ctx, fz_buffer *buffer)
31713171{
3172- if (!buffer) return NULL;
3172+ PyObject *bytes = NULL;
31733173 char *c = NULL;
3174- size_t len = fz_buffer_storage(gctx, buffer, &c);
3175- return PyBytes_FromStringAndSize(c, (Py_ssize_t) len);
3174+ if (buffer)
3175+ {
3176+ size_t len = fz_buffer_storage(gctx, buffer, &c);
3177+ bytes = PyBytes_FromStringAndSize(c, (Py_ssize_t) len);
3178+ }
3179+ else
3180+ {
3181+ bytes = PyBytes_FromString("");
3182+ }
3183+ Py_INCREF(bytes);
3184+ return bytes;
31763185}
31773186
31783187//----------------------------------------------------------------------------
@@ -3296,7 +3305,7 @@ void hexlify(int n, unsigned char *in, unsigned char *out)
32963305}
32973306
32983307//----------------------------------------------------------------------------
3299- // Turn Python a bytes or bytearray object into char* string
3308+ // Turn a bytes or bytearray object into char* string
33003309// using the "_AsString" functions. Returns string size or 0 on error.
33013310//----------------------------------------------------------------------------
33023311size_t JM_CharFromBytesOrArray(PyObject *stream, char **data)
@@ -3317,6 +3326,31 @@ size_t JM_CharFromBytesOrArray(PyObject *stream, char **data)
33173326 return len;
33183327}
33193328
3329+ //----------------------------------------------------------------------------
3330+ // Return fz_buffer from a PyBytes or PyByteArray object
3331+ //----------------------------------------------------------------------------
3332+ fz_buffer *JM_BufferFromBytes(fz_context *ctx, PyObject *stream)
3333+ {
3334+ if (!stream) return NULL;
3335+ char *c = NULL;
3336+ size_t len = JM_CharFromBytesOrArray(stream, &c);
3337+ if (!c) return NULL;
3338+ fz_buffer *res = NULL;
3339+ fz_var(res);
3340+ fz_try(ctx)
3341+ {
3342+ res = fz_new_buffer(ctx, len);
3343+ fz_append_data(ctx, res, c, len);
3344+ fz_terminate_buffer(ctx, res);
3345+ }
3346+ fz_catch(ctx)
3347+ {
3348+ fz_drop_buffer(ctx, res);
3349+ fz_rethrow(ctx);
3350+ }
3351+ return res;
3352+ }
3353+
33203354//----------------------------------------------------------------------------
33213355// Modified copy of SWIG_Python_str_AsChar
33223356// If Py3, the SWIG original v3.0.12 does *not* deliver NULL for a
@@ -4169,12 +4203,91 @@ JM_style_begin_dict(fz_context *ctx, PyObject *span, fz_font *font, float size,
41694203void
41704204JM_style_end_dict(fz_context *ctx, fz_buffer *buff, PyObject *span, PyObject *spanlist)
41714205{
4172- PyDict_SetItemString(span, "text", JM_StrFromBuffer(ctx, buff));
4206+ if (buff)
4207+ PyDict_SetItemString(span, "text", JM_StrFromBuffer(ctx, buff));
41734208 PyList_Append(spanlist, span);
41744209}
41754210
41764211PyObject *
4177- JM_extract_stext_textblock_as_dict(fz_context *ctx, fz_stext_block *block)
4212+ JM_extract_stext_textchar_as_dict(fz_context *ctx, fz_stext_char *ch)
4213+ {
4214+ PyObject *chardict = NULL;
4215+
4216+ chardict = PyDict_New();
4217+ PyDict_SetItemString(chardict, "c", Py_BuildValue("C", ch->c));
4218+ PyDict_SetItemString(chardict, "origin", Py_BuildValue("ff", ch->origin.x, ch->origin.y));
4219+ PyDict_SetItemString(chardict, "bbox", Py_BuildValue("ffff",
4220+ ch->bbox.x0, ch->bbox.y0,
4221+ ch->bbox.x1, ch->bbox.y1));
4222+ return chardict;
4223+ }
4224+
4225+ PyObject *
4226+ JM_extract_stext_textline_as_dict(fz_context *ctx, fz_stext_line *line)
4227+ {
4228+ fz_stext_char *ch;
4229+ fz_font *font = NULL;
4230+ fz_buffer *buff = NULL;
4231+ float size = 0;
4232+ int sup = 0;
4233+ PyObject *span=NULL, *spanlist = NULL, *linedict = NULL, *charlist;
4234+ PyObject *chardict;
4235+
4236+ linedict = PyDict_New();
4237+ fz_rect *linerect = JM_empty_rect();
4238+ PyDict_SetItemString(linedict, "wmode", Py_BuildValue("i", line->wmode));
4239+ PyDict_SetItemString(linedict, "dir", Py_BuildValue("ff", line->dir.x, line->dir.y));
4240+ spanlist = PyList_New(0);
4241+ font = NULL;
4242+ size = 0;
4243+
4244+ for (ch = line->first_char; ch; ch = ch->next)
4245+ {
4246+ JM_join_rect(linerect, &ch->bbox, ch->size);
4247+
4248+ int ch_sup = detect_super_script(line, ch);
4249+ if (ch->font != font || ch->size != size)
4250+ { // start new span
4251+ if (font) // must finish old span first
4252+ {
4253+ PyDict_SetItemString(span, "chars", charlist);
4254+ Py_CLEAR(charlist);
4255+ JM_style_end_dict(ctx, NULL, span, spanlist);
4256+ Py_CLEAR(span);
4257+ font = NULL;
4258+ }
4259+ font = ch->font;
4260+ size = ch->size;
4261+ sup = ch_sup;
4262+ charlist = PyList_New(0);
4263+ span = PyDict_New();
4264+ JM_style_begin_dict(ctx, span, font, size, sup);
4265+ }
4266+ chardict = JM_extract_stext_textchar_as_dict(ctx, ch);
4267+ PyList_Append(charlist, chardict);
4268+ Py_CLEAR(chardict);
4269+ }
4270+ if (font)
4271+ {
4272+ PyDict_SetItemString(span, "chars", charlist);
4273+ Py_CLEAR(charlist);
4274+ JM_style_end_dict(ctx, NULL, span, spanlist);
4275+ Py_CLEAR(span);
4276+ font = NULL;
4277+ }
4278+
4279+ PyDict_SetItemString(linedict, "spans", spanlist);
4280+ Py_CLEAR(spanlist);
4281+ PyDict_SetItemString(linedict, "bbox", Py_BuildValue("ffff",
4282+ linerect->x0, linerect->y0,
4283+ linerect->x1, linerect->y1));
4284+
4285+ free(linerect);
4286+ return linedict;
4287+ }
4288+
4289+ PyObject *
4290+ JM_extract_stext_textblock_as_dict(fz_context *ctx, fz_stext_block *block, int rawdict)
41784291{
41794292 fz_stext_line *line;
41804293 fz_stext_char *ch;
@@ -4190,6 +4303,15 @@ JM_extract_stext_textblock_as_dict(fz_context *ctx, fz_stext_block *block)
41904303
41914304 for (line = block->u.t.first_line; line; line = line->next)
41924305 {
4306+ if (rawdict != 0)
4307+ {
4308+ linedict = JM_extract_stext_textline_as_dict(ctx, line);
4309+ PyList_Append(linelist, linedict);
4310+ Py_CLEAR(linedict);
4311+ JM_join_rect(blockrect, &line->bbox, 0.0f);
4312+ continue;
4313+ }
4314+
41934315 linedict = PyDict_New();
41944316 fz_rect *linerect = JM_empty_rect();
41954317 PyDict_SetItemString(linedict, "wmode", Py_BuildValue("i", line->wmode));
@@ -4226,6 +4348,8 @@ JM_extract_stext_textblock_as_dict(fz_context *ctx, fz_stext_block *block)
42264348 {
42274349 JM_style_end_dict(ctx, buff, span, spanlist);
42284350 Py_CLEAR(span);
4351+ fz_drop_buffer(ctx, buff);
4352+ buff = NULL;
42294353 font = NULL;
42304354 }
42314355
@@ -4313,7 +4437,7 @@ JM_extract_stext_imageblock_as_dict(fz_context *ctx, fz_stext_block *block)
43134437}
43144438
43154439PyObject *
4316- JM_stext_page_as_dict(fz_context *ctx, fz_stext_page *page)
4440+ JM_stext_page_as_dict(fz_context *ctx, fz_stext_page *page, int rawdict )
43174441{
43184442 PyObject *dict = PyDict_New();
43194443 PyObject *blocklist = PyList_New(0);
@@ -4327,7 +4451,7 @@ JM_stext_page_as_dict(fz_context *ctx, fz_stext_page *page)
43274451 if (block->type == FZ_STEXT_BLOCK_IMAGE)
43284452 PyList_Append(blocklist, JM_extract_stext_imageblock_as_dict(ctx, block));
43294453 else
4330- PyList_Append(blocklist, JM_extract_stext_textblock_as_dict(ctx, block));
4454+ PyList_Append(blocklist, JM_extract_stext_textblock_as_dict(ctx, block, rawdict ));
43314455 }
43324456 PyDict_SetItemString(dict, "blocks", blocklist);
43334457 Py_CLEAR(blocklist);
@@ -7155,24 +7279,19 @@ SWIGINTERN PyObject *fz_document_s__updateStream(struct fz_document_s *self,int
71557279 fz_var(obj);
71567280 fz_buffer *res = NULL;
71577281 fz_var(res);
7158- size_t len = 0;
7159- char *c = NULL;
71607282 pdf_document *pdf = pdf_specifics(gctx, self); // get pdf doc
71617283 fz_try(gctx)
71627284 {
71637285 assert_PDF(pdf);
71647286 int xreflen = pdf_xref_len(gctx, pdf);
71657287 if (!INRANGE(xref, 1, xreflen-1))
71667288 THROWMSG("xref out of range");
7167- len = JM_CharFromBytesOrArray(stream, &c);
7168- if (!c) THROWMSG("stream must be bytes or bytearray");
71697289 // get the object
71707290 obj = pdf_new_indirect(gctx, pdf, xref, 0);
7171- if (new == 0 && !pdf_is_stream(gctx, obj))
7291+ if (! new && !pdf_is_stream(gctx, obj))
71727292 THROWMSG("xref not a stream object");
7173- res = fz_new_buffer(gctx, len);
7174- fz_append_data(gctx, res, c, len);
7175- fz_terminate_buffer(gctx, res);
7293+ res = JM_BufferFromBytes(gctx, stream);
7294+ if (!res) THROWMSG("stream must be bytes or bytearray");
71767295 JM_update_stream(gctx, pdf, obj, res);
71777296
71787297 }
@@ -9984,7 +10103,7 @@ SWIGINTERN PyObject *fz_stext_page_s__extractText(struct fz_stext_page_s *self,i
998410103 fz_print_stext_page_as_html(gctx, out, self);
998510104 break;
998610105 case(2):
9987- text = JM_stext_page_as_dict(gctx, self);
10106+ text = JM_stext_page_as_dict(gctx, self, 0 );
998810107 break;
998910108 case(3):
999010109 fz_print_stext_page_as_xml(gctx, out, self);
@@ -9993,7 +10112,10 @@ SWIGINTERN PyObject *fz_stext_page_s__extractText(struct fz_stext_page_s *self,i
999310112 fz_print_stext_page_as_xhtml(gctx, out, self);
999410113 break;
999510114 case(5):
9996- text = JM_stext_page_as_dict(gctx, self);
10115+ text = JM_stext_page_as_dict(gctx, self, 0);
10116+ break;
10117+ case(6):
10118+ text = JM_stext_page_as_dict(gctx, self, 1);
999710119 break;
999810120 default:
999910121 JM_print_stext_page_as_text(gctx, out, self);
0 commit comments